Code example #1
def run_clearcut(session, args):
    if args.clone_ids is not None:
        clones = session.query(Clone.id).filter(
            Clone.id.in_(args.clone_ids))
    elif args.subject_ids is not None:
        clones = session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))
    else:
        clones = session.query(Clone.id)

    if not args.force:
        clones = clones.filter(Clone.tree.is_(None))
    clones = [c.id for c in clones]
    mod_log.make_mod('clone_tree', session=session, commit=True,
                     info=vars(args))

    tasks = concurrent.TaskQueue()

    logger.info('Creating task queue for clones')
    for clone_id in clones:
        tasks.add_task(clone_id)

    for _ in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(LineageWorker(
            session, get_newick,
            args.min_mut_copies, args.min_mut_samples,
            args.min_seq_copies,
            args.min_seq_samples,
            args.exclude_stops,
            args.full_seq,
            post_tree_hook=minimize_tree))

    tasks.start()
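All of the examples on this page follow the same fan-out pattern: build a concurrent.TaskQueue, enqueue one task per clone, bucket, or sample, and then attach one worker per process, each holding its own database session from config.init_db. Below is a minimal sketch of that shared pattern, not code from immunedb itself: run_stage and make_worker are hypothetical names, and the import paths are assumptions about the package layout.

# Minimal sketch of the fan-out pattern shared by the examples on this page.
# Import paths are assumed; run_stage and make_worker are hypothetical.
from immunedb.common import config
from immunedb.util import concurrent


def run_stage(args, work_items, make_worker):
    tasks = concurrent.TaskQueue()
    for item in work_items:
        # One task per clone, bucket, or sample.
        tasks.add_task(item)

    # One worker per process, each with its own database session so that
    # workers never share a connection.
    for _ in range(min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(make_worker(worker_session))

    # Start the workers on the queued tasks.
    tasks.start()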
Code example #2
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'tcells': TCellClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
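The buckets queued above are simply the distinct combinations of subject, V gene, J gene, CDR3 length, and recorded insertions/deletions; each combination becomes one task for a clonal worker. For intuition only (this is not immunedb code), the same grouping over already-loaded sequence rows could be written with itertools.groupby:

# Illustrative in-memory equivalent of the group_by query above.
from itertools import groupby
from operator import attrgetter


def sequence_buckets(sequences):
    key = attrgetter('subject_id', 'v_gene', 'j_gene', 'cdr3_num_nts',
                     '_insertions', '_deletions')
    for bucket, _group in groupby(sorted(sequences, key=key), key=key):
        yield bucket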
Code example #3
File: sample_stats.py  Project: wangdi2014/immunedb
def run_sample_stats(session, args):
    np.seterr(all='raise')
    mod_log.make_mod('sample_stats',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.sample_ids is None:
        samples = [s.id for s in session.query(Sample.id)]
    else:
        samples = args.sample_ids

    if args.force:
        q = session.query(SampleStats).filter(
            SampleStats.sample_id.in_(samples))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for sample_id in samples:
        _queue_tasks(session, sample_id, args.force, tasks)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(SampleStatsWorker(session))

    tasks.start()
    session.commit()
Code example #4
File: baseline.py  Project: wangdi2014/immunedb
def run_selection_pressure(session, args):
    mod_log.make_mod('clone_pressure',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to calculate selection pressure for {} '
                'clones.'.format(len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(
            SelectionPressureWorker(session, args.baseline_path, args.temp,
                                    args.regen, args.thresholds))

    tasks.start()
Code example #5
File: collapse.py  Project: wangdi2014/immunedb
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    for subject in (args.subject_ids or
                    [s.id for s in session.query(Subject.id)]):
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed.  Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                Sample.subject_id == subject
            ).all()
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()

    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
Code example #6
def run_identify(session, args):
    mod_log.make_mod('identification',
                     session=session,
                     commit=True,
                     info=vars(args))
    session.close()
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # sample directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    with open(meta_fn) as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.sample_dir)
        except MetadataException as ex:
            logger.error(str(ex))
            return

    # Create the tasks for each file
    for sample_name in sorted(metadata.keys()):
        tasks.add_task({
            'path': os.path.join(args.sample_dir,
                                 metadata[sample_name]['file_name']),
            'meta': metadata[sample_name]
        })

    props = IdentificationProps(**args.__dict__)
    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 props, lock))

    tasks.start()
Code example #7
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info(
            'Generating subclone task queue for subject {}'.format(subject_id))
        buckets = session.query(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        ).filter(
            Clone.subject_id == subject_id
        ).group_by(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config)))
    tasks.start()
Code example #8
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.
    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
Code example #9
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if not args.skip_regen:
        logger.info('Deleting existing clones')
        q = session.query(Clone).filter(Clone.subject_id.in_(subject_ids))
        if args.gene:
            q = q.filter(Clone.v_gene.like(args.gene + '%'))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    all_buckets = []
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts
        )
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)
        all_buckets.extend(buckets)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'cluster': ClusteringClonalWorker
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](config.init_db(args.db_config),
                                      **args.__dict__)
        tasks.add_worker(worker)
    tasks.start()

    session.commit()
    if args.reduce_difference:
        buckets = session.query(Clone.subject_id, Clone.cdr3_num_nts).filter(
            Clone.subject_id.in_(subject_ids)).group_by(
                Clone.subject_id, Clone.cdr3_num_nts)
        collapse_similar_cdr3s(session, buckets, args.reduce_difference)
    else:
        logger.info('Skipping reduce since --reduce-difference set to 0')

    push_clone_ids(session)
    session.commit()

    if not args.skip_subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')
Code example #10
File: identify.py  Project: rasteh/immunedb
def run_identify(session, args):
    mod_log.make_mod('identification',
                     session=session,
                     commit=True,
                     info=vars(args))
    session.close()
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    sample_names = set([])
    fail = False
    for directory in args.sample_dirs:
        # If metadata is not specified, assume it is "metadata.json" in the
        # directory
        if args.metadata is None:
            meta_fn = os.path.join(directory, 'metadata.json')
        else:
            meta_fn = args.metadata

        # Verify the metadata file exists
        if not os.path.isfile(meta_fn):
            logger.error('Metadata file not found.')
            return

        with open(meta_fn) as fh:
            metadata = json.load(fh)

        # Create the tasks for each file
        for fn in sorted(metadata.keys()):
            if fn == 'all':
                continue
            meta = SampleMetadata(
                metadata[fn], metadata['all'] if 'all' in metadata else None)
            if session.query(Sample).filter(
                    Sample.name == meta.get('sample_name'),
                    exists().where(
                        Sequence.sample_id == Sample.id)).first() is not None:
                log_f = logger.warning if args.warn_existing else logger.error
                log_f('Sample {} already exists. {}'.format(
                    meta.get('sample_name'),
                    'Skipping.' if args.warn_existing else 'Cannot continue.'))
                fail = True
            elif meta.get('sample_name') in sample_names:
                logger.error(
                    'Sample {} exists more than once in metadata.'.format(
                        meta.get('sample_name')))
                return
            else:
                tasks.add_task({'path': directory, 'fn': fn, 'meta': meta})
                sample_names.add(meta.get('sample_name'))

        if fail and not args.warn_existing:
            logger.error('Encountered errors.  Not running any identification.'
                         ' To skip samples that are already in the database '
                         'use --warn-existing.')
            return

    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 args.trim_to, args.max_padding,
                                 args.max_vties,
                                 args.min_similarity / float(100), lock))

    tasks.start()