def aggregate_vties(aggregate_queue):
    bucketed_seqs = {'success': {}, 'noresult': []}
    for result in aggregate_queue:
        if result['status'] == 'success':
            alignment = result['alignment']
            # Bucket by V-ties, J-ties, and CDR3 length so only sequences
            # that could collapse together are grouped.
            bucket_key = (funcs.format_ties(alignment.v_gene),
                          funcs.format_ties(alignment.j_gene),
                          len(alignment.cdr3))
            bucket = bucketed_seqs['success'].setdefault(bucket_key, {})
            if alignment.sequence.sequence in bucket:
                bucket[alignment.sequence.sequence].sequence.copy_number += (
                    alignment.sequence.copy_number)
            else:
                bucket[alignment.sequence.sequence] = alignment
        elif result['status'] == 'noresult':
            bucketed_seqs['noresult'].append(result)
        elif result['status'] == 'error':
            # The failure reason is assumed to be carried in
            # result['reason'], as in aggregate_vdj.
            logger.error(
                'Unexpected error processing sequence {}\n\t{}'.format(
                    result['alignment'].sequence.seq_id, result['reason']))

    bucketed_seqs['success'] = [
        b.values() for b in bucketed_seqs['success'].values()
    ]
    return bucketed_seqs

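# A minimal usage sketch (hypothetical data): each queue item is assumed to
# be a dict with a 'status' key plus an 'alignment' (and, for errors, a
# 'reason'), as the branches above imply.
#
#   results = aggregate_vties(worker_output)
#   # results['success'] is a list of buckets, one per
#   # (V-ties, J-ties, CDR3 length) key, with copy numbers of identical
#   # sequences summed; results['noresult'] holds the failed records.
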
def import_alignments(session, args):
    parse_funcs = {
        'airr': (parse_airr, preprocess_airr),
    }

    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    with open(meta_fn, 'rU') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            logger.error(ex.message)
            return

    props = IdentificationProps(**args.__dict__)
    v_germlines = raw_germlines(args.v_germlines, 'v')
    j_germlines = raw_germlines(args.j_germlines, 'j')

    for sample_name in sorted(metadata.keys()):
        sample = create_sample(session, metadata[sample_name])
        if sample:
            path = os.path.join(
                args.sample_dir, metadata[sample_name]['file_name'])
            with open(path) as fh:
                parse_file(fh, sample, session, parse_funcs[args.format][0],
                           props, v_germlines, j_germlines, args.nproc,
                           preprocess_func=parse_funcs[args.format][1])

def run_import(session, args):
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len)

    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    with open(meta_fn, 'rU') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            logger.error(ex.message)
            return

    props = IdentificationProps(**args.__dict__)
    for sample_name in sorted(metadata.keys()):
        sample = create_sample(session, metadata[sample_name])
        if sample:
            path = os.path.join(args.sample_dir,
                                metadata[sample_name]['file_name'])
            with open(path) as fh:
                read_file(session, args.format, fh, sample, v_germlines,
                          j_germlines, props)

def create_sample(session, metadata):
    study, new = funcs.get_or_create(
        session, Study, name=metadata['study_name'])
    if new:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    sample, new = funcs.get_or_create(
        session, Sample, name=metadata['sample_name'], study=study)
    if new:
        logger.info('Created new sample "{}"'.format(sample.name))
        # Any non-required metadata fields are stored as key/value pairs.
        for key, value in metadata.items():
            if key not in REQUIRED_FIELDS:
                session.add(SampleMetadata(
                    sample=sample,
                    key=key,
                    value=value
                ))
        subject, new = funcs.get_or_create(
            session, Subject, study=study, identifier=metadata['subject'])
        sample.subject = subject
        session.commit()
    else:
        logger.error(
            'Sample "{}" already exists'.format(metadata['sample_name']))
        return

    return sample

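# For reference, a common SQLAlchemy idiom that funcs.get_or_create is
# assumed to follow (a sketch, not the actual implementation): fetch a row
# matching the given columns, or stage a new one, and report which happened.
def get_or_create(session, model, **kwargs):
    # Look for an existing row matching the given column values.
    instance = session.query(model).filter_by(**kwargs).first()
    if instance is not None:
        return instance, False
    # No match; stage a new row and flag it as newly created.
    instance = model(**kwargs)
    session.add(instance)
    return instance, True
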
def run_import(session, args, remaps=None):
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)

    study, new = funcs.get_or_create(session, Study, name=args.study_name)
    if new:
        logger.info('Created new study "{}"'.format(study.name))
        session.commit()

    sample, new = funcs.get_or_create(session, Sample,
                                      name=args.sample_name, study=study)
    if new:
        sample.date = args.date
        logger.info('Created new sample "{}"'.format(sample.name))
        for key in ('subset', 'tissue', 'disease', 'lab', 'experimenter',
                    'ig_class', 'v_primer', 'j_primer'):
            setattr(sample, key, vars(args).get(key, None))
        subject, new = funcs.get_or_create(session, Subject, study=study,
                                           identifier=args.subject)
        sample.subject = subject
        session.commit()
    else:
        logger.error('Sample "{}" already exists'.format(args.sample_name))
        return

    with open(args.input_file) as fh:
        read_file(session, fh, sample, v_germlines, j_germlines, args,
                  remaps)

def add_uniques(session, sample, vdjs, realign_len=None, realign_mut=None,
                min_similarity=0, max_vties=50, trim_to=None,
                max_padding=None):
    bucketed_seqs = OrderedDict()
    vdjs = sorted(vdjs, key=lambda v: v.ids[0])
    for vdj in funcs.periodic_commit(session, vdjs):
        try:
            if realign_len is not None:
                vdj.align_to_germline(realign_len, realign_mut, trim_to)
            if vdj.v_match / float(vdj.v_length) < min_similarity:
                raise AlignmentException('V-identity too low {} < {}'.format(
                    vdj.v_match / float(vdj.v_length), min_similarity))
            if len(vdj.v_gene) > max_vties:
                raise AlignmentException('Too many V-ties {} > {}'.format(
                    len(vdj.v_gene), max_vties))
            if max_padding is not None and vdj.pad_length > max_padding:
                raise AlignmentException(
                    'Too much padding {} (max {})'.format(
                        vdj.pad_length, max_padding))

            bucket_key = (funcs.format_ties(vdj.v_gene,
                                            vdj.v_germlines.prefix,
                                            strip_alleles=True),
                          funcs.format_ties(vdj.j_gene,
                                            vdj.j_germlines.prefix,
                                            strip_alleles=True),
                          len(vdj.cdr3))
            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if vdj.sequence in bucket:
                bucket[vdj.sequence].ids += vdj.ids
            else:
                bucket[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(vdj.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(
            session, bucketed_seqs.iteritems()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.ids), s.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.ids += smaller.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()

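# Sketch of the N-tolerant comparison dnautils.equal is assumed to perform
# (dnautils is a compiled extension; this pure-Python version is only
# illustrative): two same-length sequences collapse together if every
# position matches or either side is an ambiguous 'N'.
def equal_with_ns(seq_a, seq_b):
    if len(seq_a) != len(seq_b):
        return False
    return all(a == b or a == 'N' or b == 'N'
               for a, b in zip(seq_a, seq_b))
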
def create(main_parser, args):
    if re.search(r'[^A-Za-z0-9_-]', args.db_name) is not None:
        main_parser.error('Database name must only contain letters, '
                          'numbers, dashes and underscores.')

    try:
        conn = _get_root_connection(args.db_host, args.admin_user,
                                    args.admin_pass)

        db_user = args.db_user or args.db_name
        if args.db_pass:
            db_pass = args.db_pass
        else:
            # Generate a random 10-character alphanumeric password.
            db_pass = ''.join(
                random.choice(string.ascii_uppercase +
                              string.ascii_lowercase +
                              string.digits) for _ in range(10))

        with conn.cursor() as cursor:
            logger.info('Creating user "{}"'.format(db_user))
            existing_password = _create_user_if_not_exists(conn, '%',
                                                           db_user, db_pass)
            if existing_password is not None:
                if not args.db_pass:
                    logger.warning(
                        'User {} already exists. To generate the '
                        'configuration file, you must enter its '
                        'password.'.format(db_user))
                    db_pass = _get_user_pass(conn, args.db_host, db_user,
                                             existing_password)
                else:
                    db_pass = args.db_pass

            logger.info('Creating database "{}"'.format(args.db_name))
            cursor.execute('CREATE DATABASE {}'.format(args.db_name))
            cursor.execute(
                'GRANT ALL PRIVILEGES ON {}.* TO \'{}\'@\'%\''.format(
                    args.db_name, db_user))

        config_path = os.path.join(args.config_dir,
                                   '{}.json'.format(args.db_name))
        logger.info('Creating config at {}'.format(config_path))
        with open(config_path, 'w+') as fh:
            json.dump({
                'host': args.db_host,
                'database': args.db_name,
                'username': db_user,
                'password': db_pass
            }, fh, sort_keys=True, indent=4, separators=(',', ': '))

        logger.info('Initializing tables')
        config.init_db(config_path)
        logger.info('Success!')
        return True
    except Exception as e:
        logger.error(e)
        return False

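# The generated configuration file has this shape (values hypothetical):
#
#   {
#       "database": "my_db",
#       "host": "localhost",
#       "password": "aB3dE5fG7h",
#       "username": "my_db"
#   }
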
def _get_user_pass(conn, host, user, existing_password):
    with conn.cursor() as cursor:
        while True:
            db_pass = getpass.getpass()
            # Hash the entered password server-side and compare it to the
            # stored hash; parameters are passed as a tuple per DB-API 2.0.
            cursor.execute('SELECT PASSWORD(%s) as password', (db_pass,))
            if cursor.fetchone()['password'] != existing_password:
                logger.error('Password does not match.')
            else:
                logger.info('Correct password')
                return db_pass

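# For reference: MySQL's PASSWORD() (the pre-8.0 mysql_native_password
# scheme is assumed here) is '*' followed by the uppercase hex of a double
# SHA-1, so the comparison above could also be reproduced client-side:
import hashlib

def mysql_native_password(password):
    # PASSWORD('x') == CONCAT('*', UPPER(SHA1(UNHEX(SHA1('x')))))
    digest = hashlib.sha1(
        hashlib.sha1(password.encode()).digest()).hexdigest()
    return '*' + digest.upper()
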
def add_uniques(session, sample, alignments, props, aligner,
                realign_len=None, realign_mut=None):
    bucketed_seqs = OrderedDict()
    alignments = sorted(alignments, key=lambda v: v.sequence.ids[0])
    for alignment in funcs.periodic_commit(session, alignments):
        try:
            if realign_len is not None:
                aligner.align_to_germline(alignment, realign_len,
                                          realign_mut)
                if props.trim_to:
                    alignment.trim_to(props.trim_to)
            props.validate(alignment)

            bucket_key = (funcs.format_ties(alignment.v_gene),
                          funcs.format_ties(alignment.j_gene),
                          len(alignment.cdr3))
            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if alignment.sequence.sequence in bucket:
                bucket[alignment.sequence.sequence].sequence.ids += (
                    alignment.sequence.ids)
            else:
                bucket[alignment.sequence.sequence] = alignment
        except AlignmentException as e:
            add_as_noresult(session, alignment.sequence, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(alignment.sequence.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(
            session, bucketed_seqs.iteritems()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.sequence.ids),
                                          s.sequence.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]
                if dnautils.equal(larger.sequence.sequence,
                                  smaller.sequence.sequence):
                    larger.sequence.ids += smaller.sequence.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()

def parse_metadata(session, fh, warn_existing, path):
    reader = csv.DictReader(fh, delimiter='\t')
    provided_fields = set(reader.fieldnames)
    missing_fields = set(REQUIRED_FIELDS) - provided_fields
    if len(missing_fields) > 0:
        raise MetadataException(
            'Metadata is missing the following headers: {}'.format(
                ','.join(missing_fields)))

    unknown_fields = provided_fields - (set(REQUIRED_FIELDS).union(
        set(OPTIONAL_FIELDS)))
    if len(unknown_fields) > 0:
        logger.warning('Ignoring unknown headers in metadata: {}'.format(
            ','.join(unknown_fields)))

    metadata = {}
    for row in reader:
        # Drop empty cells so missing optional values are simply absent.
        row = {
            k: v for k, v in row.iteritems()
            if v is not None and len(v) > 0
        }
        if len(row) == 0:
            continue
        check_populated(row)
        # Check if the sample name is unique; a duplicate would silently
        # overwrite the earlier entry, so abort.
        if row['sample_name'] in metadata:
            raise MetadataException(
                'Duplicate sample name {} in metadata.'.format(
                    row['sample_name']))

        # Check if a sample with the same name is in the database
        sample_in_db = session.query(Sample).filter(
            Sample.name == row['sample_name'],
            exists().where(Sequence.sample_id == Sample.id)).first()
        if sample_in_db:
            message = 'Sample {} already exists. {}'.format(
                row['sample_name'],
                'Skipping.' if warn_existing else 'Cannot continue.')
            if warn_existing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        # Check if specified file exists
        if not os.path.isfile(os.path.join(path, row['file_name'])):
            raise MetadataException(
                'File {} for sample {} does not exist'.format(
                    row['file_name'], row['sample_name']))

        metadata[row['sample_name']] = row

    return metadata

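# Example of the expected tab-delimited metadata layout (column values are
# hypothetical; file_name, sample_name, study_name, and subject are the
# fields referenced above):
#
#   file_name    sample_name    study_name    subject
#   s1.fasta     sample1        MyStudy       S1
#   s2.fasta     sample2        MyStudy       S1
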
def combine_samples(session, args):
    groups = {}
    subjects = set()
    for meta in session.query(SampleMetadata).filter(
            SampleMetadata.key == args.combine_field):
        groups.setdefault(meta.value, set()).add(meta.sample)
        subjects.add(meta.sample.subject)

    for group_id, samples in groups.items():
        group_subs = set(s.subject for s in samples)
        if len(group_subs) > 1:
            logger.error('Cannot combine samples across subjects '
                         '(group "{}" has {} subjects)'.format(
                             group_id, len(group_subs)))
            sys.exit(1)

    for subject in subjects:
        subject.reset()

    for group_id, samples in groups.items():
        all_sample_ids = set(s.id for s in samples)
        final_sample_id = min(all_sample_ids)
        logger.info('Combining {} samples into new sample '
                    '"{}" (ID {})'.format(len(samples), group_id,
                                          final_sample_id))
        session.query(Sequence).filter(
            Sequence.sample_id.in_(all_sample_ids)).update({
                Sequence.sample_id: final_sample_id,
            }, synchronize_session=False)

        logger.info('Updating sample name and deleting empty samples')
        # collapse to one sample
        final_sample = session.query(Sample).get(final_sample_id)
        final_sample.name = group_id
        remove_duplicates(session, final_sample)

        logger.info('Moving noresults')
        session.query(NoResult).filter(
            NoResult.sample_id.in_(all_sample_ids)).update(
                {'sample_id': final_sample_id}, synchronize_session=False)

        # delete the now-empty samples
        session.query(Sample).filter(
            Sample.id.in_(all_sample_ids - set([final_sample_id]))
        ).delete(synchronize_session=False)
        session.commit()

    logger.info('Sequences successfully collapsed: please re-run '
                'immunedb_collapse and later pipeline steps.')

def run_rest_service(session_maker, args):
    if args.rollbar_token:
        if not ROLLBAR_SUPPORT:
            logger.error('Rollbar is not installed')
            return
        rbr = RollbarBottleReporter(access_token=args.rollbar_token,
                                    environment=args.rollbar_env)
        bottle.install(rbr)

    app.config['session_maker'] = session_maker
    app.config['allow_shutdown'] = args.allow_shutdown
    if args.debug:
        app.catchall = False
    app.run(host='0.0.0.0', port=args.port, server='gevent',
            debug=args.debug)

def run_identify(session, args):
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    session.close()

    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # sample directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        return

    with open(meta_fn, 'rU') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.sample_dir)
        except MetadataException as ex:
            logger.error(ex.message)
            return

    # Create the tasks for each file
    for sample_name in sorted(metadata.keys()):
        tasks.add_task({
            'path': os.path.join(args.sample_dir,
                                 metadata[sample_name]['file_name']),
            'meta': metadata[sample_name]
        })

    props = IdentificationProps(**args.__dict__)
    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 props, lock))

    tasks.start()

def delete(main_parser, args):
    try:
        with open(args.db_config) as fh:
            db_config = json.load(fh)
        conn = _get_root_connection(db_config['host'], args.admin_user,
                                    args.admin_pass)
        with conn.cursor() as cursor:
            logger.info('Deleting database {}'.format(
                db_config['database']))
            cursor.execute('DROP DATABASE `{}`'.format(
                db_config['database']))
            if args.delete_user:
                logger.info('Deleting user {}'.format(
                    db_config['username']))
                cursor.execute('DROP USER `{}`'.format(
                    db_config['username']))
        return True
    except Exception as e:
        logger.error(e)
        return False

def do_task(self, clone_id):
    clone_inst = self.session.query(Clone).filter(
        Clone.id == clone_id).first()
    if not clone_inst:
        return
    self.info('Running clone {}'.format(clone_inst.id))

    sequences = self.session.query(
        Sequence
    ).join(SequenceCollapse).filter(
        Sequence.clone_id == clone_id,
        SequenceCollapse.copy_number_in_subject >= self.min_seq_copies,
        SequenceCollapse.samples_in_subject >= self.min_seq_samples,
    )

    if self.exclude_stops:
        sequences = sequences.filter(Sequence.stop == 0)
    sequences = sequences.order_by(Sequence.v_length)

    try:
        tree = self.get_tree(clone_inst, sequences)
        if not tree:
            logger.warning('No sequences to make tree for clone {}'.format(
                clone_id))
            return
    except Exception as e:
        logger.error('Error running clone {}: {}'.format(clone_id, e))
        return

    for node_id, node in enumerate(tree.traverse()):
        node.add_feature('node_id', node_id)
    final = {
        'info': {
            'min_mut_copies': self.min_mut_copies,
            'min_mut_samples': self.min_mut_samples,
            'min_seq_copies': self.min_seq_copies,
            'min_seq_samples': self.min_seq_samples,
            'exclude_stops': self.exclude_stops,
            'full_seq': self.full_seq,
        },
        'tree': tree_as_dict(tree)
    }
    clone_inst.tree = json.dumps(final)
    self.session.add(clone_inst)
    self.session.commit()

def run_rest_service(session_maker, args):
    if args.rollbar_token:
        if not ROLLBAR_SUPPORT:
            logger.error('Rollbar is not installed')
            return
        rbr = RollbarBottleReporter(access_token=args.rollbar_token,
                                    environment=args.rollbar_env)
        bottle.install(rbr)

    app = create_app(session_maker, args.allow_shutdown)
    if args.debug:
        app.catchall = False
    app.run(host='0.0.0.0', port=args.port, server=args.server,
            debug=args.debug, worker_class='eventlet', timeout=0)

def aggregate_vdj(aggregate_queue):
    alignments = {'success': {}, 'noresult': []}
    for result in aggregate_queue:
        if result['status'] == 'success':
            alignment = result['alignment']
            seq_key = alignment.sequence.sequence
            if seq_key in alignments['success']:
                alignments['success'][seq_key].sequence.copy_number += (
                    alignment.sequence.copy_number)
            else:
                alignments['success'][seq_key] = alignment
        elif result['status'] == 'noresult':
            alignments['noresult'].append(result)
        elif result['status'] == 'error':
            logger.error(
                'Unexpected error processing sequence {}\n\t{}'.format(
                    result['vdj'].seq_id, result['reason']))

    alignments['success'] = alignments['success'].values()
    return alignments

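# A minimal usage sketch (hypothetical data), assuming worker results carry
# 'status' plus either an 'alignment' or, for failures, 'vdj' and 'reason':
#
#   results = aggregate_vdj(worker_output)
#   # results['success'] is the deduplicated alignments with copy numbers
#   # of identical sequences summed; results['noresult'] is the remainder.
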
def run_identify(session, args):
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))

    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines,
                             ties=args.ties and not args.genotyping)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len,
                             ties=args.ties and not args.genotyping)

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # sample directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        sys.exit(-1)

    with open(meta_fn, 'rU') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            logger.error(ex)
            sys.exit(-1)
    session.close()

    # Create the tasks for each file
    props = IdentificationProps(**args.__dict__)
    for sample_name in sorted(metadata.keys()):
        process_sample(
            args.db_config, v_germlines, j_germlines,
            os.path.join(args.sample_dir,
                         metadata[sample_name]['file_name']),
            metadata[sample_name], props, args.nproc
        )

def run_identify(session, args):
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))

    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines, no_ties=args.genotyping)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len,
                             no_ties=args.genotyping)

    # If metadata is not specified, assume it is "metadata.tsv" in the
    # sample directory
    meta_fn = args.metadata if args.metadata else os.path.join(
        args.sample_dir, 'metadata.tsv')

    # Verify the metadata file exists
    if not os.path.isfile(meta_fn):
        logger.error('Metadata file not found.')
        sys.exit(-1)

    with open(meta_fn, 'rU') as fh:
        try:
            metadata = parse_metadata(session, fh, args.warn_existing,
                                      args.warn_missing, args.sample_dir)
        except MetadataException as ex:
            logger.error(ex)
            sys.exit(-1)
    session.close()

    # Create the tasks for each file
    props = IdentificationProps(**args.__dict__)
    for sample_name in sorted(metadata.keys()):
        process_sample(
            args.db_config, v_germlines, j_germlines,
            os.path.join(args.sample_dir,
                         metadata[sample_name]['file_name']),
            metadata[sample_name], props, args.nproc
        )

def do_task(self, clone_id):
    clone_inst = self._session.query(Clone).filter(
        Clone.id == clone_id).first()
    if clone_inst is None:
        return
    self.info('Running clone {}'.format(clone_inst.id))

    sequences = self._session.query(
        Sequence
    ).join(SequenceCollapse).filter(
        Sequence.clone_id == clone_id,
        SequenceCollapse.copy_number_in_subject > self._min_seq_copies
    )

    if self._exclude_stops:
        sequences = sequences.filter(Sequence.stop == 0)
    sequences = sequences.order_by(Sequence.v_length)

    try:
        tree = PhylogeneticTree(clone_inst.consensus_germline, sequences)
        tree.run(self._session, self._tree_prog)
    except Exception as e:
        logger.error('Error running clone {}: {}'.format(clone_id, e))
        return

    final = {
        'info': {
            'min_count': self._min_count,
            'min_samples': self._min_samples,
            'min_seq_copies': self._min_seq_copies,
            'exclude_stops': self._exclude_stops
        },
        'tree': tree_as_dict(tree.tree)
    }
    clone_inst.tree = json.dumps(final)
    self._session.add(clone_inst)
    self._session.commit()

def combine_samples(session, args):
    groups = {}
    for meta in session.query(SampleMetadata).filter(
            SampleMetadata.key == args.combine_field):
        groups.setdefault(meta.value, set()).add(meta.sample_id)

    all_subjects = set()
    for group_id, samples in groups.items():
        group_subs = session.query(Sample.subject_id).filter(
            Sample.id.in_(samples)
        ).group_by(Sample.subject_id)
        group_subs = [s.subject_id for s in group_subs]
        all_subjects.update(set(group_subs))
        if len(group_subs) > 1:
            logger.error('Cannot combine samples across subjects '
                         '(group "{}" has {} subjects)'.format(
                             group_id, len(group_subs)))
            sys.exit(1)

    all_samples = [s.id for s in session.query(Sample.id).filter(
        Sample.subject_id.in_(all_subjects))]
    logger.info('Resetting information for {} subjects ({} samples)'.format(
        len(all_subjects), len(all_samples)))

    logger.info(' Resetting collapsing')
    session.query(SequenceCollapse).filter(
        SequenceCollapse.sample_id.in_(all_samples)
    ).delete(synchronize_session=False)

    logger.info(' Resetting clones')
    session.query(Clone).filter(
        Clone.subject_id.in_(all_subjects)
    ).delete(synchronize_session=False)

    logger.info(' Resetting sample statistics')
    session.query(SampleStats).filter(
        SampleStats.sample_id.in_(all_samples)
    ).delete(synchronize_session=False)

    for group_id, samples in groups.items():
        final_sample_id = min(samples)
        logger.info('Combining {} samples into new sample '
                    '"{}" (ID {})'.format(len(samples), group_id,
                                          final_sample_id))
        session.query(Sequence).filter(
            Sequence.sample_id.in_(samples)
        ).update({
            Sequence.sample_id: final_sample_id,
        }, synchronize_session=False)

        logger.info('Updating sample name and deleting empty samples')
        # collapse to one sample
        final_sample = session.query(Sample).get(final_sample_id)
        final_sample.name = group_id
        remove_duplicates(session, final_sample)

        logger.info('Moving noresults')
        session.query(NoResult).filter(
            NoResult.sample_id.in_(samples)
        ).update({
            'sample_id': final_sample_id
        }, synchronize_session=False)

        # delete the now-empty samples
        session.query(Sample).filter(
            Sample.id.in_(samples - set([final_sample_id]))
        ).delete(synchronize_session=False)
        session.commit()

    logger.info('Sequences successfully collapsed: please re-run '
                'immunedb_collapse and later pipeline steps.')

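# Illustration of the grouping above (hypothetical IDs): with
# --combine-field set to, say, 'tissue', samples sharing a metadata value
# are merged into the lowest sample ID in the group:
#
#   groups = {'spleen': {3, 5}, 'blood': {4}}
#   # sequences from samples 3 and 5 move to sample 3, which is renamed
#   # 'spleen'; sample 5 is then deleted.
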
def run_identify(session, args):
    mod_log.make_mod('identification', session=session, commit=True,
                     info=vars(args))
    session.close()

    # Load the germlines from files
    v_germlines = VGermlines(args.v_germlines)
    j_germlines = JGermlines(args.j_germlines, args.upstream_of_cdr3,
                             args.anchor_len, args.min_anchor_len)
    tasks = concurrent.TaskQueue()

    sample_names = set([])
    fail = False
    for directory in args.sample_dirs:
        # If metadata is not specified, assume it is "metadata.json" in the
        # directory
        if args.metadata is None:
            meta_fn = os.path.join(directory, 'metadata.json')
        else:
            meta_fn = args.metadata

        # Verify the metadata file exists
        if not os.path.isfile(meta_fn):
            logger.error('Metadata file not found.')
            return

        with open(meta_fn) as fh:
            metadata = json.load(fh)

        # Create the tasks for each file
        for fn in sorted(metadata.keys()):
            if fn == 'all':
                continue
            meta = SampleMetadata(
                metadata[fn],
                metadata['all'] if 'all' in metadata else None)
            if session.query(Sample).filter(
                    Sample.name == meta.get('sample_name'),
                    exists().where(
                        Sequence.sample_id == Sample.id)).first() is not None:
                log_f = (logger.warning if args.warn_existing
                         else logger.error)
                log_f('Sample {} already exists. {}'.format(
                    meta.get('sample_name'),
                    'Skipping.' if args.warn_existing
                    else 'Cannot continue.'))
                fail = True
            elif meta.get('sample_name') in sample_names:
                logger.error(
                    'Sample {} exists more than once in metadata.'.format(
                        meta.get('sample_name')))
                return
            else:
                tasks.add_task({'path': directory, 'fn': fn, 'meta': meta})
                sample_names.add(meta.get('sample_name'))

    if fail and not args.warn_existing:
        logger.error('Encountered errors. Not running any identification. '
                     'To skip samples that are already in the database '
                     'use --warn-existing.')
        return

    lock = mp.Lock()
    for i in range(0, min(args.nproc, tasks.num_tasks())):
        worker_session = config.init_db(args.db_config)
        tasks.add_worker(
            IdentificationWorker(worker_session, v_germlines, j_germlines,
                                 args.trim_to, args.max_padding,
                                 args.max_vties,
                                 args.min_similarity / float(100), lock))

    tasks.start()