def run_clearcut(session, args):
    """Builds lineage trees (via clearcut/newick) for the selected clones.

    Clones are chosen by explicit ID, by subject, or globally; unless
    ``--force`` is given, clones that already have a tree are skipped.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    if args.clone_ids is not None:
        # Explicit IDs: take them as-is, even if a tree already exists.
        clone_query = session.query(Clone.id).filter(
            Clone.id.in_(args.clone_ids))
    else:
        clone_query = session.query(Clone.id)
        if args.subject_ids is not None:
            clone_query = clone_query.filter(
                Clone.subject_id.in_(args.subject_ids))
        if not args.force:
            # Only regenerate trees that are missing.
            clone_query = clone_query.filter(Clone.tree.is_(None))
    clone_ids = [row.id for row in clone_query]

    mod_log.make_mod('clone_tree', session=session, commit=True,
                     info=vars(args))

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue for clones')
    for clone_id in clone_ids:
        tasks.add_task(clone_id)

    # One worker (with its own DB session) per requested process.
    for _ in range(args.nproc):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(LineageWorker(
            worker_session, get_newick, args.min_mut_copies,
            args.min_mut_samples, args.min_seq_copies, args.min_seq_samples,
            args.exclude_stops, args.full_seq,
            post_tree_hook=minimize_tree))
    tasks.start()
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    if args.subject_ids is None:
        # Materialize into a list: the ids are used both in the regen
        # delete filter and the per-subject task loop, so a one-shot
        # ``map`` object (Python 3) would be exhausted after first use.
        subject_ids = [s.id for s in session.query(Subject.id).all()]
    else:
        subject_ids = args.subject_ids

    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if args.regen:
        logger.info('Deleting existing clones')
        session.query(Clone).filter(
            Clone.subject_id.in_(subject_ids)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        # One bucket per distinct (V, J, CDR3 length, indels) group of
        # not-yet-assigned sequences.
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'tcells': TCellClonalWorker,
        'lineage': LineageClonalWorker,
    }
    # Never spawn more workers than there are tasks.
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](
            config.init_db(args.db_config), **args.__dict__
        )
        tasks.add_worker(worker)
    tasks.start()

    if args.subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')
    push_clone_ids(session)
    session.commit()
def run_sample_stats(session, args):
    """Generates per-sample statistics for the selected samples.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    np.seterr(all='raise')
    mod_log.make_mod('sample_stats', session=session, commit=True,
                     info=vars(args))

    if args.sample_ids is None:
        # Materialize into a list: the ids are used both in the
        # force-delete filter and the task loop, so a one-shot ``map``
        # object (Python 3) would be exhausted after its first use.
        samples = [s.id for s in session.query(Sample.id).all()]
    else:
        samples = args.sample_ids

    if args.force:
        # Drop existing stats so they are regenerated from scratch.
        q = session.query(SampleStats).filter(
            SampleStats.sample_id.in_(samples))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for sample_id in samples:
        _queue_tasks(session, sample_id, args.force, tasks)

    for i in range(0, args.nproc):
        # Each worker gets its own DB session.
        session = config.init_db(args.db_config)
        tasks.add_worker(SampleStatsWorker(session))

    tasks.start()
    session.commit()
def run_selection_pressure(session, args):
    """Calculates selection pressure (BASELINe) for the selected clones.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    mod_log.make_mod('clone_pressure', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        # Copy so the in-place sort below cannot mutate the caller's list.
        clones = list(args.clone_ids)
    elif args.subject_ids is not None:
        # List, not map: Python 3 map objects have no .sort() and no
        # len(), both of which are needed below.
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids)).all()]
    else:
        clones = [c.id for c in session.query(Clone.id).all()]
    clones.sort()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to calculate selection pressure for {} '
                'clones.'.format(len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        # Each worker gets its own DB session.
        session = config.init_db(args.db_config)
        tasks.add_worker(
            SelectionPressureWorker(session, args.baseline_path, args.temp,
                                    args.regen, args.thresholds))

    tasks.start()
def run_collapse(session, args):
    """Collapses duplicate sequences across samples, per subject.

    Subjects whose samples are already fully collapsed are skipped; for
    the rest, prior collapse and clone information is wiped before the
    collapse workers are queued.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    mod_log.make_mod('collapse', session=session, commit=True,
                     info=vars(args))

    candidates = args.subject_ids or map(
        lambda e: e.id, session.query(Subject.id).all())
    to_collapse = []
    for subj in candidates:
        # A sample with no SequenceCollapse rows means the subject still
        # needs collapsing.
        pending = session.query(Sample).filter(
            Sample.subject_id == subj,
            ~exists().where(SequenceCollapse.sample_id == Sample.id)
        ).first()
        if pending is None:
            logger.info('Subject {} already collapsed. Skipping.'.format(
                subj))
            continue

        logger.info('Resetting collapse info for subject {}'.format(subj))
        subject_samples = session.query(Sample).filter(
            Sample.subject_id == subj
        ).all()
        for sample in subject_samples:
            session.query(SequenceCollapse).filter(
                SequenceCollapse.sample_id == sample.id
            ).delete(synchronize_session=False)

        logger.info('Resetting clone info for subject {}'.format(subj))
        session.query(Clone).filter(Clone.subject_id == subj).delete()
        to_collapse.append(subj)
    session.commit()

    logger.info('Creating task queue to collapse {} subjects.'.format(
        len(to_collapse)))
    tasks = concurrent.TaskQueue()
    for subject_id in to_collapse:
        # One bucket per distinct (V, J, CDR3 length, indels) group.
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        ).filter(
            Sequence.subject_id == subject_id
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts, Sequence._insertions,
            Sequence._deletions
        )
        for bucket in buckets:
            tasks.add_task(bucket)
    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    for _ in range(min(tasks.num_tasks(), args.nproc)):
        tasks.add_worker(CollapseWorker(config.init_db(args.db_config)))
    tasks.start()

    session.close()
def run_identify(session, args):
    """Runs V/J identification on every sample listed in the metadata file.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    session.close()

    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    # If metadata is not specified, assume it is "metadata." in the
    # directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    # 'r' instead of 'rU': the 'U' flag was a no-op on Python 3 and was
    # removed entirely in Python 3.11 (it raises ValueError there).
    with open(meta_fn, 'r') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.sample_dir)
        except MetadataException as ex:
            # str(ex) instead of ex.message: BaseException.message does
            # not exist on Python 3.
            logger.error(str(ex))
            return

    # Create the tasks for each file
    for sample_name in sorted(metadata.keys()):
        tasks.add_task({
            'path': os.path.join(args.sample_dir,
                                 metadata[sample_name]['file_name']),
            'meta': metadata[sample_name]
        })

    props = IdentificationProps(**args.__dict__)
    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        # Each worker gets its own DB session; the shared lock serializes
        # the workers' critical sections.
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 props, lock))

    tasks.start()
def run_subclones(session, subject_ids, args):
    """Queues subclone assignment over each subject's clone buckets.

    :param Session session: The database session
    :param list subject_ids: IDs of the subjects to process
    :param Namespace args: The arguments passed to the command
    """
    tasks = concurrent.TaskQueue()
    # One bucket per distinct (subject, V, J, CDR3 length) clone group.
    grouping = (Clone.subject_id, Clone.v_gene, Clone.j_gene,
                Clone.cdr3_num_nts)
    for subject_id in subject_ids:
        logger.info(
            'Generating subclone task queue for subject {}'.format(
                subject_id))
        buckets = session.query(*grouping).filter(
            Clone.subject_id == subject_id
        ).group_by(*grouping)
        for bucket in buckets:
            tasks.add_task(bucket)

    logger.info('Generated {} total subclone tasks'.format(
        tasks.num_tasks()))
    worker_count = min(tasks.num_tasks(), args.nproc)
    for _ in range(worker_count):
        tasks.add_worker(SubcloneWorker(config.init_db(args.db_config)))
    tasks.start()
def run_clone_stats(session, args):
    """Runs the clone statistics generation stage of the pipeline.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    mod_log.make_mod('clone_stats', session=session, commit=True,
                     info=vars(args))

    if args.clone_ids is not None:
        # Copy so the in-place sort below cannot mutate the caller's list.
        clones = list(args.clone_ids)
    elif args.subject_ids is not None:
        # List, not map: Python 3 map objects have no .sort() and no
        # len(), and are exhausted after one pass (the ids are used in
        # the regen delete filter AND the task loop).
        clones = [c.id for c in session.query(Clone.id).filter(
            Clone.subject_id.in_(args.subject_ids)).all()]
    else:
        clones = [c.id for c in session.query(Clone.id).all()]
    clones.sort()

    if args.regen:
        logger.info('Deleting old clone statistics for {} clones'.format(
            len(clones)))
        session.query(CloneStats).filter(
            CloneStats.clone_id.in_(clones)
        ).delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    logger.info('Creating task queue to generate stats for {} clones.'.format(
        len(clones)))
    for cid in clones:
        tasks.add_task(cid)

    for i in range(0, args.nproc):
        # Each worker gets its own DB session.
        session = config.init_db(args.db_config)
        tasks.add_worker(CloneStatsWorker(session))

    tasks.start()
def run_clones(session, args):
    """Runs the clone-assignment pipeline stage.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    if args.subject_ids is None:
        subject_ids = [s.id for s in session.query(Subject.id)]
    else:
        subject_ids = args.subject_ids

    mod_log.make_mod('clones', session=session, commit=True, info=vars(args))

    if not args.skip_regen:
        logger.info('Deleting existing clones')
        q = session.query(Clone).filter(Clone.subject_id.in_(subject_ids))
        if args.gene:
            q = q.filter(Clone.v_gene.like(args.gene + '%'))
        q.delete(synchronize_session=False)
        session.commit()

    tasks = concurrent.TaskQueue()
    for subject_id in subject_ids:
        logger.info('Generating task queue for subject {}'.format(
            subject_id))
        # One bucket per distinct (V, J, CDR3 length) group of sequences
        # not yet assigned to a clone.
        buckets = session.query(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts
        ).filter(
            Sequence.subject_id == subject_id,
            Sequence.clone_id.is_(None)
        ).group_by(
            Sequence.subject_id, Sequence.v_gene, Sequence.j_gene,
            Sequence.cdr3_num_nts
        )
        # NOTE: an unused ``all_buckets`` accumulator (which also
        # re-executed each bucket query a second time) has been removed.
        for bucket in buckets:
            if not args.gene or bucket.v_gene.startswith(args.gene):
                tasks.add_task(bucket)
    logger.info('Generated {} total tasks'.format(tasks.num_tasks()))

    methods = {
        'similarity': SimilarityClonalWorker,
        'cluster': ClusteringClonalWorker
    }
    # Never spawn more workers than there are tasks.
    for i in range(0, min(tasks.num_tasks(), args.nproc)):
        worker = methods[args.method](config.init_db(args.db_config),
                                      **args.__dict__)
        tasks.add_worker(worker)
    tasks.start()
    session.commit()

    if args.reduce_difference:
        buckets = session.query(
            Clone.subject_id, Clone.cdr3_num_nts
        ).filter(
            Clone.subject_id.in_(subject_ids)
        ).group_by(Clone.subject_id, Clone.cdr3_num_nts)
        collapse_similar_cdr3s(session, buckets, args.reduce_difference)
    else:
        # Fixed typo: "differece" -> "difference".
        logger.info('Skipping reduce since --reduce-difference set to 0')

    push_clone_ids(session)
    session.commit()

    if not args.skip_subclones:
        run_subclones(session, subject_ids, args)
    else:
        logger.info('Skipping subclones')
def run_identify(session, args):
    """Runs V/J identification for every sample directory's metadata.

    Reads a ``metadata.json`` per directory (or an explicit metadata
    file), validates that samples are not already loaded or duplicated,
    then fans the files out to identification workers.

    :param Session session: The database session
    :param Namespace args: The arguments passed to the command
    """
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    session.close()

    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    seen_names = set()
    encountered_error = False
    for directory in args.sample_dirs:
        # If metadata is not specified, assume it is "metadata.json" in
        # the directory
        meta_fn = (os.path.join(directory, 'metadata.json')
                   if args.metadata is None else args.metadata)

        # Verify the metadata file exists
        if not os.path.isfile(meta_fn):
            logger.error('Metadata file not found.')
            return

        with open(meta_fn) as fh:
            metadata = json.load(fh)

        # Create the tasks for each file; the 'all' key holds defaults,
        # not a file entry.
        for fn in sorted(metadata.keys()):
            if fn == 'all':
                continue
            meta = SampleMetadata(metadata[fn], metadata.get('all'))
            name = meta.get('sample_name')

            already_loaded = session.query(Sample).filter(
                Sample.name == name,
                exists().where(Sequence.sample_id == Sample.id)
            ).first() is not None

            if already_loaded:
                log_f = (logger.warning if args.warn_existing
                         else logger.error)
                log_f('Sample {} already exists. {}'.format(
                    name,
                    'Skipping.' if args.warn_existing
                    else 'Cannot continue.'))
                encountered_error = True
            elif name in seen_names:
                logger.error(
                    'Sample {} exists more than once in metadata.'.format(
                        name))
                return
            else:
                tasks.add_task({'path': directory, 'fn': fn, 'meta': meta})
                seen_names.add(name)

    if encountered_error and not args.warn_existing:
        logger.error('Encountered errors. Not running any identification.'
                     ' To skip samples that are already in the database '
                     'use --warn-existing.')
        return

    lock = mp.Lock()
    for _ in range(min(args.nproc, tasks.num_tasks())):
        # Each worker gets its own DB session; the shared lock serializes
        # the workers' critical sections.
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 args.trim_to, args.max_padding,
                                 args.max_vties,
                                 args.min_similarity / float(100), lock))
    tasks.start()