Example #1
def create(main_parser, args):
    if re.search(r'[^A-Za-z0-9_-]', args.db_name) is not None:
        main_parser.error('Database name must only contain letters, numbers, '
                          'dashes and underscores.')

    try:
        conn = _get_root_connection(args.db_host, args.admin_user,
                                    args.admin_pass)

        db_user = args.db_user or args.db_name
        if args.db_pass:
            db_pass = args.db_pass
        else:
            db_pass = ''.join(
                random.choice(string.ascii_uppercase + string.ascii_lowercase +
                              string.digits) for _ in range(10))

        with conn.cursor() as cursor:
            logger.info('Creating user "{}"'.format(db_user))
            existing_password = _create_user_if_not_exists(conn, '%', db_user,
                                                           db_pass)
            if existing_password is not None:
                if not args.db_pass:
                    logger.warning(
                        'User {} already exists.  To generate the '
                        'configuration file, you must enter its '
                        'password.'.format(db_user)
                    )
                    db_pass = _get_user_pass(conn, args.db_host, db_user,
                                             existing_password)
                else:
                    db_pass = args.db_pass

            logger.info('Creating database "{}"'.format(args.db_name))
            cursor.execute('CREATE DATABASE {}'.format(args.db_name))

            cursor.execute(
                'GRANT ALL PRIVILEGES ON {}.* TO \'{}\'@\'%\''.format(
                    args.db_name, db_user))

        config_path = os.path.join(args.config_dir, '{}.json'.format(
            args.db_name))
        logger.info('Creating config at {}'.format(config_path))
        with open(config_path, 'w+') as fh:
            json.dump({
                'host': args.db_host,
                'database': args.db_name,
                'username': db_user,
                'password': db_pass
            }, fh, sort_keys=True, indent=4, separators=(',', ': '))

        logger.info('Initializing tables')
        config.init_db(config_path)
        logger.info('Success!')
        return True
    except Exception as e:
        logger.error(e)
        return False
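The JSON file written by create() above is the same configuration that the remaining examples hand to config.init_db, either as a file path or as an already-loaded dict (as in the remote-database example further down this page). A minimal sketch of reopening a session from such a file; the path and database name are illustrative, not taken from the source:

import json

import immunedb.common.config as config

# Assumed example path; create() writes '<db_name>.json' into args.config_dir
config_path = 'configs/influenza.json'

# The file contains exactly the keys dumped above: host, database, username, password
with open(config_path) as fh:
    db_config = json.load(fh)

# init_db accepts the path directly...
session = config.init_db(config_path)
# ...or the loaded dict, as get_clones_from_remote_db() does later on this page
session = config.init_db(db_config)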
Example #2
def run_sample_stats(session, args):
    np.seterr(all='raise')
    mod_log.make_mod('sample_stats',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.sample_ids is None:
        samples = [s.id for s in session.query(Sample.id)]
    else:
        samples = args.sample_ids

    if args.force:
        q = session.query(SampleStats).filter(
            SampleStats.sample_id.in_(samples))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for sample_id in samples:
        _queue_tasks(session, sample_id, args.force, tasks)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(SampleStatsWorker(session))

    tasks.start()
    session.commit()
def run_sample_stats(session, args):
    np.seterr(all='raise')
    mod_log.make_mod('sample_stats', session=session, commit=True,
                     info=vars(args))

    if args.sample_ids is None:
        samples = [s.id for s in session.query(Sample.id)]
    else:
        samples = args.sample_ids

    if args.force:
        q = session.query(SampleStats).filter(
            SampleStats.sample_id.in_(samples))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for sample_id in samples:
        _queue_tasks(session, sample_id, args.force, tasks)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(SampleStatsWorker(session))

    tasks.start()
    session.commit()
    session.close()
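Both run_sample_stats variants above follow the per-worker session pattern that recurs throughout these examples: the caller's session is only used to enqueue work, and each worker process receives its own connection from config.init_db(args.db_config). A stripped-down sketch of that pattern, assuming concurrent.TaskQueue is imported as in the examples; the task values and the ExampleWorker class are placeholders, not part of ImmuneDB:

def run_stage(session, args):
    tasks = concurrent.TaskQueue()

    # Enqueue whatever IDs the stage should process (placeholders here)
    for item in (1, 2, 3):
        tasks.add_task(item)

    # One dedicated database session per worker process
    for _ in range(args.nproc):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(ExampleWorker(worker_session))  # hypothetical worker

    tasks.start()
    session.commit()
    session.close()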
def run_clearcut(session, args):
    if args.clone_ids is not None:
        clones = session.query(Clone.id).filter(
            Clone.id.in_(args.clone_ids))
    else:
        if args.subject_ids is not None:
            clones = session.query(Clone.id).filter(
                Clone.subject_id.in_(args.subject_ids))
        else:
            clones = session.query(Clone.id)

    if not args.force:
        clones = clones.filter(Clone.tree.is_(None))
    clones = [c.id for c in clones]
    mod_log.make_mod('clone_tree', session=session, commit=True,
                     info=vars(args))

    tasks = concurrent.TaskQueue()

    logger.info('Creating task queue for clones')
    for clone_id in clones:
        tasks.add_task(clone_id)

    for _ in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(LineageWorker(
            session, get_newick,
            args.min_mut_copies, args.min_mut_samples,
            args.min_seq_copies,
            args.min_seq_samples,
            args.exclude_stops,
            args.full_seq,
            post_tree_hook=minimize_tree))

    tasks.start()
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
Example #6
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'tcells': TCellClonalWorker,
        'lineage': LineageClonalWorker,
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')

    push_clone_ids(session)
    session.commit()
Example #7
def run_selection_pressure(session, args):
    mod_log.make_mod('clone_pressure',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to calculate selection pressure for {} '
                'clones.'.format(len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(
            SelectionPressureWorker(session, args.baseline_path, args.temp,
                                    args.regen, args.thresholds))

    tasks.start()
def _wrapper(*args, **kwargs):
    session = config.init_db(db_config)
    try:
        return f(session, *args, **kwargs)
    except Exception:
        raise
    finally:
        session.close()
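The _wrapper fragment above is only the inner function of what appears to be a decorator that injects a fresh session and always closes it. A plausible reconstruction of the enclosing decorator; the name with_session and the exact structure are assumptions, not taken from the source:

import functools

import immunedb.common.config as config


def with_session(db_config):
    # Hypothetical decorator: opens a session, passes it as the first
    # argument of the wrapped function, and closes it no matter what.
    def decorator(f):
        @functools.wraps(f)
        def _wrapper(*args, **kwargs):
            session = config.init_db(db_config)
            try:
                return f(session, *args, **kwargs)
            finally:
                session.close()
        return _wrapper
    return decorator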
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    subjects = (args.subject_ids or [e.id for e in session.query(Subject.id)])
    for subject in subjects:
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed.  Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                  Sample.subject_id == subject
            )
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
                sample.sample_stats = []
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()

    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
Example #10
def run_collapse(session, args):
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))
    subject_ids = []

    for subject in (args.subject_ids or
                    [e.id for e in session.query(Subject.id)]):
        if session.query(Sample).filter(
                Sample.subject_id == subject,
                ~exists().where(
                    SequenceCollapse.sample_id == Sample.id
                )).first() is None:
            logger.info('Subject {} already collapsed.  Skipping.'.format(
                subject))
        else:
            logger.info('Resetting collapse info for subject {}'.format(
                subject))
            samples = session.query(Sample).filter(
                  Sample.subject_id == subject
            ).all()
            for sample in samples:
                session.query(SequenceCollapse).filter(
                    SequenceCollapse.sample_id == sample.id
                ).delete(synchronize_session=False)
            logger.info('Resetting clone info for subject {}'.format(subject))
            session.query(Clone).filter(Clone.subject_id == subject).delete()
            subject_ids.append(subject)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(subject_ids)))

    tasks = concurrent.TaskQueue()

    for subject_id in subject_ids:
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions, Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
Example #11
def run_identify(session, args):
    mod_log.make_mod('identification',
                     session=session,
                     commit=True,
                     info=vars(args))
    session.close()
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    with open(meta_fn, 'r') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.sample_dir)
        except MetadataException as ex:
            logger.error(ex.message)
            return

    # Create the tasks for each file
    for sample_name in sorted(metadata.keys()):
        tasks.add_task({
            'path':
            os.path.join(args.sample_dir, metadata[sample_name]['file_name']),
            'meta':
            metadata[sample_name]
        })

    props = IdentificationProps(**args.__dict__)
    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 props, lock))

    tasks.start()
Example #12
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info(
            'Generating subclone task queue for subject {}'.format(subject_id))
        buckets = session.query(Clone.subject_id, Clone.v_gene, Clone.j_gene,
                                Clone.cdr3_num_nts).filter(
                                    Clone.subject_id == subject_id).group_by(
                                        Clone.subject_id, Clone.v_gene,
                                        Clone.j_gene, Clone.cdr3_num_nts)
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config)))
    tasks.start()
Example #14
def aggregate_collapse(aggregate_queue, db_config, sample_id, props):
    seqs_to_add = []
    session = config.init_db(db_config, create=False)
    sample = session.query(Sample).filter(Sample.id == sample_id).one()
    for i, alignment in enumerate(aggregate_queue):
        for seq in alignment:
            seqs_to_add.append(seq)
            if len(seqs_to_add) >= 1000:
                add_sequences(session, seqs_to_add, sample,
                              strip_alleles=not props.genotyping)
                seqs_to_add = []
                session.commit()
    if seqs_to_add:
        add_sequences(session, seqs_to_add, sample,
                      strip_alleles=not props.genotyping)
    logger.info('Finished aggregating sequences')
    session.commit()
    session.close()
Example #15
def run_subclones(session, subject_ids, args):
    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating subclone task queue for subject {}'.format(
            subject_id))
        buckets = session.query(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        ).filter(
            Clone.subject_id == subject_id
        ).group_by(
            Clone.subject_id, Clone.v_gene, Clone.j_gene, Clone.cdr3_num_nts
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(tasks.num_tasks()))
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config),
                                        args.similarity))
    tasks.start()
def get_clones_from_remote_db():
    session = config.init_db({
        'host': '35.241.233.255',
        'database': 'influenza',
        'username': '******',
        'password': '',
    })

    rows_final = []
    counter = 10

    for clone in session.query(Clone):
        row_final = np.append(clone.id, clone.tree)
        rows_final.append(row_final)
        #
        if counter > 0:
            counter = counter - 1
            print(clone.id)
        else:
            return np.asarray(rows_final)
Example #17
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.
    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats',
                     session=session,
                     commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.
    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    mod_log.make_mod('clone_stats', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        clones = args.clone_ids
    elif args.subject_ids is not None:
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids))]
    else:
        clones = [c.id for c in session.query(Clone.id)]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
Example #19
from argparse import Namespace  # noqa: E402

from immunedb.api.rest_service import run_rest_service  # noqa: E402
import immunedb.common.config as config  # noqa: E402

session = config.init_db('test_db.json', as_maker=True)
run_rest_service(
    session,
    Namespace(port=8891,
              debug=True,
              allow_shutdown=True,
              rollbar_token=None,
              rollbar_env=None,
              server='gunicorn'))
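With as_maker=True, init_db returns a session factory rather than an open session, which is what run_rest_service is handed here and what the test setUp examples near the end of this page also use. A minimal sketch of driving the factory directly; the path and the work inside the try block are illustrative:

import immunedb.common.config as config

session_maker = config.init_db('test_db.json', as_maker=True)

# Create sessions on demand, e.g. one per request or per worker
session = session_maker()
try:
    # ... query the ImmuneDB tables with `session` ...
    session.commit()
finally:
    session.close()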
Example #20
    return dict((k, v.dropna().to_dict())
                for k, v in data.items())


if __name__ == '__main__':
    parser = config.get_base_arg_parser()
    parser.add_argument('subject')
    parser.add_argument('feature')
    parser.add_argument('magnitude',
                        choices=['copies', 'instances'])
    parser.add_argument(
        '--limit-by', choices=['copies', 'instances'], default=None)
    parser.add_argument('--limit-val', type=int, default=None)

    args = parser.parse_args()
    session = config.init_db(args.db_config)

    subject_id = session.query(Subject.id).filter(
        Subject.identifier == args.subject).one().id
    features = {
        s.id: str(getattrd(s, args.feature))
        for s in session.query(Sample).filter(Sample.subject_id == subject_id)
    }
    if args.magnitude == 'instances' or args.limit_by == 'instances':
        instances = session.query(
            Sequence.clone_id,
            Sequence.sample_id,
            func.count(Sequence.seq_id).label('inst')
        ).filter(
            ~Sequence.clone_id.is_(None),
            Sequence.subject_id == subject_id
Example #22
def run_identify(session, args):
    mod_log.make_mod('identification',
                     session=session,
                     commit=True,
                     info=vars(args))
    session.close()
    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    sample_names = set([])
    fail = False
    for directory in args.sample_dirs:
        # If metadata is not specified, assume it is "metadata.json" in the
        # directory
        if args.metadata is None:
            meta_fn = os.path.join(directory, 'metadata.json')
        else:
            meta_fn = args.metadata

        # Verify the metadata file exists
        if not os.path.isfile(meta_fn):
            logger.error('Metadata file not found.')
            return

        with open(meta_fn) as fh:
            metadata = json.load(fh)

        # Create the tasks for each file
        for fn in sorted(metadata.keys()):
            if fn == 'all':
                continue
            meta = SampleMetadata(
                metadata[fn], metadata['all'] if 'all' in metadata else None)
            if session.query(Sample).filter(
                    Sample.name == meta.get('sample_name'),
                    exists().where(
                        Sequence.sample_id == Sample.id)).first() is not None:
                log_f = logger.warning if args.warn_existing else logger.error
                log_f('Sample {} already exists. {}'.format(
                    meta.get('sample_name'),
                    'Skipping.' if args.warn_existing else 'Cannot continue.'))
                fail = True
            elif meta.get('sample_name') in sample_names:
                logger.error(
                    'Sample {} exists more than once in metadata.'.format(
                        meta.get('sample_name')))
                return
            else:
                tasks.add_task({'path': directory, 'fn': fn, 'meta': meta})
                sample_names.add(meta.get('sample_name'))

        if fail and not args.warn_existing:
            logger.error('Encountered errors.  Not running any identification.'
                         ' To skip samples that are already in the database '
                         'use --warn-existing.')
            return

    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 args.trim_to, args.max_padding,
                                 args.max_vties,
                                 args.min_similarity / float(100), lock))

    tasks.start()
Example #23
def setUp(self):
    self.session = config.init_db(CONFIG_PATH, drop_all=True)
Example #24
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command

    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids
    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if not args.skip_regen:
        logger.info('Deleting existing clones')
        q = session.query(Clone).filter(Clone.subject_id.in_(subject_ids))
        if args.gene:
            q = q.filter(Clone.v_gene.like(args.gene + '%'))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    all_buckets = []
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(subject_id))
        buckets = session.query(Sequence.subject_id, Sequence.v_gene,
                                Sequence.j_gene, Sequence.cdr3_num_nts).filter(
                                    Sequence.subject_id == subject_id,
                                    Sequence.clone_id.is_(None)).group_by(
                                        Sequence.subject_id, Sequence.v_gene,
                                        Sequence.j_gene, Sequence.cdr3_num_nts)
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)
        all_buckets.extend(buckets)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'cluster': ClusteringClonalWorker
    }
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](config.init_db(args.db_config),
                                      **args.__dict__)
        tasks.add_worker(worker)
    tasks.start()

    session.commit()
    if args.reduce_difference:
        buckets = session.query(Clone.subject_id, Clone.cdr3_num_nts).filter(
            Clone.subject_id.in_(subject_ids)).group_by(
                Clone.subject_id, Clone.cdr3_num_nts)
        collapse_similar_cdr3s(session, buckets, args.reduce_difference)
    else:
        logger.info('Skipping reduce since --reduce-difference is set to 0')

    push_clone_ids(session)
    session.commit()

    if not args.skip_subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')
Example #25
def setUp(self):
    self.session_maker = config.init_db(CONFIG_PATH,
                                        drop_all=True,
                                        as_maker=True)
    self.session = self.session_maker()
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample, result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = (
            sum([v.v_length for v in alignments]) /
            len(alignments))
        avg_mut = (
            sum([v.v_mutation_fraction for v in alignments]) /
            len(alignments)
        )
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments),
                                       round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()
        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={'aligner': aligner, 'avg_len': avg_len, 'avg_mut':
                          avg_mut, 'props': props},
        )
        logger.info('Adding noresults')

        for result in funcs.periodic_commit(session, v_ties['noresult'], 100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we aren't copying everything between processes
        concurrent.process_data(
            [list(v) for v in v_ties['success']],
            process_collapse,
            aggregate_collapse,
            nproc,
            aggregate_args={'db_config': db_config, 'sample_id': sample.id,
                            'props': props}
        )
        session.expire_all()
        session.commit()

        identified = int(session.query(
            func.sum(Sequence.copy_number)
        ).filter(
            Sequence.sample == sample
        ).scalar() or 0)
        noresults = int(session.query(
            func.count(NoResult.pk)
        ).filter(
            NoResult.sample == sample
        ).scalar() or 0)
        if identified + noresults:
            frac = int(100 * identified / (identified + noresults))
        else:
            frac = 0
        logger.info(
            'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
                sample.name,
                round((time.time() - start) / 60., 1),
                identified,
                identified + noresults,
                frac
            )
        )
    session.close()
Example #27
def setUp(self):
    self.session = config.init_db(CONFIG_PATH)
Example #28
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample, result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = (sum([v.v_length for v in alignments]) / len(alignments))
        avg_mut = (sum([v.v_mutation_fraction
                        for v in alignments]) / len(alignments))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments), round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()
        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={
                'aligner': aligner,
                'avg_len': avg_len,
                'avg_mut': avg_mut,
                'props': props
            },
        )
        logger.info('Adding noresults')

        for result in funcs.periodic_commit(session, v_ties['noresult'], 100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we aren't copying everything between processes
        concurrent.process_data([list(v) for v in v_ties['success']],
                                process_collapse,
                                aggregate_collapse,
                                nproc,
                                aggregate_args={
                                    'db_config': db_config,
                                    'sample_id': sample.id,
                                    'props': props
                                })
        session.expire_all()
        session.commit()

        identified = int(
            session.query(func.sum(Sequence.copy_number)).filter(
                Sequence.sample == sample).scalar() or 0)
        noresults = int(
            session.query(func.count(
                NoResult.pk)).filter(NoResult.sample == sample).scalar() or 0)
        if identified + noresults:
            frac = int(100 * identified / (identified + noresults))
        else:
            frac = 0
        logger.info(
            'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
                sample.name, round((time.time() - start) / 60., 1), identified,
                identified + noresults, frac))
    session.close()
def setUp(self):
    self.session_maker = config.init_db(CONFIG_PATH, drop_all=True,
                                        as_maker=True)
    self.session = self.session_maker()