def write_pooled_clones(session, out_format, sample_ids=None,
                        pool_on=('sample', ), zipped=False, **kwargs):
    # Samples and subjects can't be combined with other features
    exclusives = set(pool_on).intersection(set(('sample', 'subject')))
    if len(pool_on) > 1 and exclusives:
        pool_on = (list(exclusives)[0], )
        logger.warning('You specified pooling on {feat} which '
                       'cannot be combined with other features.'
                       ' Using only {feat}.'.format(feat=pool_on[0]))

    logger.info('Writing clones pooled by {} in {} format'.format(
        ','.join(pool_on), out_format))
    sample_ids = sample_ids or [s.id for s in session.query(Sample)]
    aggregated = get_pooled_samples(session, sample_ids, pool_on)

    output_func = {
        'immunedb': get_immunedb_output,
        'vdjtools': get_vdjtools_output
    }[out_format]
    with ExportWriter(zipped=zipped) as fh:
        for (subject, feature_value), clones in aggregated.items():
            logger.info('Pooling subject {} for feature(s) {}'.format(
                subject, ','.join(feature_value)))
            fh.set_filename(get_filename(subject, pool_on, feature_value))
            fh.write(output_func(session, clones))
        return fh.get_zip_value()

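# Usage sketch (hypothetical values): export clones pooled by subject as a
# zipped VDJtools-format archive. Assumes `session` is an open SQLAlchemy
# session bound to an ImmuneDB database; the sample IDs are illustrative.
# Valid `out_format` values are 'immunedb' and 'vdjtools', per the dispatch
# dict above.
def _example_write_pooled_clones(session):
    return write_pooled_clones(session, 'vdjtools', sample_ids=[1, 2, 3],
                               pool_on=('subject', ), zipped=True)
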
def parse_metadata(session, fh, warn_existing, warn_missing, path):
    reader = csv.DictReader(fh, delimiter='\t')
    provided_fields = set(reader.fieldnames)
    for field in provided_fields:
        if not re.match('^[a-z][a-z0-9_]*$', field):
            raise MetadataException(
                'Metadata headers must only contain numbers, letters, and '
                'underscores, and cannot start with a number: {}'.format(
                    field))

    missing_fields = set(REQUIRED_FIELDS) - provided_fields
    if len(missing_fields) > 0:
        raise MetadataException(
            'Metadata is missing the following headers: {}'.format(
                ','.join(missing_fields)))

    metadata = {}
    for row in reader:
        row = {
            k: v for k, v in row.items()
            if v is not None and len(v) > 0 and v.lower() not in NA_VALUES
        }
        if len(row) == 0:
            continue
        check_populated(row)

        # Check if the sample name is unique
        if row['sample_name'] in metadata:
            raise MetadataException(
                'Duplicate sample name {} in metadata.'.format(
                    row['sample_name']))

        # Check if a sample with the same name is in the database
        sample_in_db = session.query(Sample).filter(
            Sample.name == row['sample_name'],
            exists().where(Sequence.sample_id == Sample.id)).first()
        if sample_in_db:
            message = 'Sample {} already exists. {}'.format(
                row['sample_name'],
                'Skipping.' if warn_existing else 'Cannot continue.')
            if warn_existing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        # Check if specified file exists
        if not os.path.isfile(os.path.join(path, row['file_name'])):
            message = 'File {} for sample {} does not exist. {}'.format(
                row['file_name'], row['sample_name'],
                'Skipping.' if warn_missing else 'Cannot continue.')
            if warn_missing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        metadata[row['sample_name']] = row

    return metadata

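# Illustrative metadata TSV accepted by parse_metadata (tab-delimited).
# The exact required headers come from REQUIRED_FIELDS, which includes at
# least sample_name and file_name based on the checks above; extra headers
# (lowercase, letters/numbers/underscores) become custom metadata:
#
#   sample_name	file_name	subject	tissue
#   sample1	sample1.fasta	S1	blood
#   sample2	sample2.fasta	S1	spleen
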
def create(main_parser, args):
    if re.search(r'[^A-Za-z0-9_-]', args.db_name) is not None:
        main_parser.error('Database name must only contain letters, numbers, '
                          'dashes and underscores.')

    try:
        conn = _get_root_connection(args.db_host, args.admin_user,
                                    args.admin_pass)

        db_user = args.db_user or args.db_name
        if args.db_pass:
            db_pass = args.db_pass
        else:
            db_pass = ''.join(
                random.choice(string.ascii_uppercase +
                              string.ascii_lowercase + string.digits)
                for _ in range(10))

        with conn.cursor() as cursor:
            logger.info('Creating user "{}"'.format(db_user))
            existing_password = _create_user_if_not_exists(conn, '%', db_user,
                                                           db_pass)
            if existing_password is not None:
                if not args.db_pass:
                    logger.warning(
                        'User {} already exists. To generate the '
                        'configuration file, you must enter its '
                        'password.'.format(db_user))
                    db_pass = _get_user_pass(conn, args.db_host, db_user,
                                             existing_password)
                else:
                    db_pass = args.db_pass

            logger.info('Creating database "{}"'.format(args.db_name))
            cursor.execute('CREATE DATABASE {}'.format(args.db_name))
            cursor.execute(
                'GRANT ALL PRIVILEGES ON {}.* TO \'{}\'@\'%\''.format(
                    args.db_name, db_user))

        config_path = os.path.join(args.config_dir,
                                   '{}.json'.format(args.db_name))
        logger.info('Creating config at {}'.format(config_path))
        with open(config_path, 'w+') as fh:
            json.dump({
                'host': args.db_host,
                'database': args.db_name,
                'username': db_user,
                'password': db_pass
            }, fh, sort_keys=True, indent=4, separators=(',', ': '))

        logger.info('Initializing tables')
        config.init_db(config_path)
        logger.info('Success!')
        return True
    except Exception as e:
        logger.error(e)
        return False

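# The config file written above looks like the following (values are
# illustrative; keys match the json.dump call, sorted alphabetically):
#
#   {
#       "database": "my_db",
#       "host": "localhost",
#       "password": "XXXXXXXXXX",
#       "username": "my_db"
#   }
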
def parse_metadata(session, fh, warn_existing, path):
    reader = csv.DictReader(fh, delimiter='\t')
    provided_fields = set(reader.fieldnames)
    missing_fields = set(REQUIRED_FIELDS) - provided_fields
    if len(missing_fields) > 0:
        raise MetadataException(
            'Metadata is missing the following headers: {}'.format(
                ','.join(missing_fields)))
    unknown_fields = provided_fields - (set(REQUIRED_FIELDS).union(
        set(OPTIONAL_FIELDS)))
    if len(unknown_fields) > 0:
        logger.warning('Ignoring unknown headers in metadata: {}'.format(
            ','.join(unknown_fields)))

    metadata = {}
    for row in reader:
        row = {
            k: v for k, v in row.items() if v is not None and len(v) > 0
        }
        if len(row) == 0:
            continue
        check_populated(row)

        # Check if the sample name is unique
        if row['sample_name'] in metadata:
            raise MetadataException(
                'Duplicate sample name {} in metadata.'.format(
                    row['sample_name']))

        # Check if a sample with the same name is in the database
        sample_in_db = session.query(Sample).filter(
            Sample.name == row['sample_name'],
            exists().where(Sequence.sample_id == Sample.id)).first()
        if sample_in_db:
            message = 'Sample {} already exists. {}'.format(
                row['sample_name'],
                'Skipping.' if warn_existing else 'Cannot continue.')
            if warn_existing:
                logger.warning(message)
                continue
            else:
                raise MetadataException(message)

        # Check if specified file exists
        if not os.path.isfile(os.path.join(path, row['file_name'])):
            raise MetadataException(
                'File {} for sample {} does not exist'.format(
                    row['file_name'], row['sample_name']))

        metadata[row['sample_name']] = row

    return metadata

def restore(main_parser, args):
    with open(args.db_config) as fh:
        db_config = json.load(fh)

    with open(args.backup_path, 'r') as fh:
        cmd = shlex.split('mysql -h {} -u {} -p{} {}'.format(
            db_config['host'], db_config['username'], db_config['password'],
            db_config['database']))
        proc = subprocess.Popen(cmd, stdin=fh, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if stderr:
            logger.warning(stderr)

    return True

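# Usage sketch (hypothetical paths): `args` mirrors the CLI arguments the
# function reads. The backup is streamed to the `mysql` client on stdin, so
# `backup_path` should point to a plain SQL dump.
def _example_restore(main_parser):
    from types import SimpleNamespace
    args = SimpleNamespace(db_config='my_db.json', backup_path='my_db.sql')
    return restore(main_parser, args)
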
def write_clone_overlap(session, sample_ids=None, pool_on=('sample', ),
                        size_metric='copies', sim_func='cosine',
                        agg_func='median', zipped=False, **kwargs):
    samples = session.query(Sample)
    if sample_ids:
        samples = samples.filter(Sample.id.in_(sample_ids))
    sample_instances = {s.id: s for s in samples}

    with ExportWriter(zipped=zipped) as writer:
        for subject in set([s.subject for s in sample_instances.values()]):
            logger.info('Calculating overlap for {}'.format(
                subject.identifier))
            sub_samples = [
                s.id for s in sample_instances.values()
                if s.subject == subject
            ]
            sdf = get_sample_df(session, sub_samples, pool_on, size_metric,
                                getattr(distance, sim_func))
            if sdf.empty:
                logger.warning(
                    'Subject {} had no clones for calculation'.format(
                        subject.identifier))
                continue
            sdf = collapse_df_features(sdf, pool_on, sample_instances,
                                       getattr(np, agg_func))

            name = '{}.overlap'.format(subject.identifier)
            with writer.get_handle(name + '.tsv') as fh:
                sdf.to_csv(fh, sep='\t')

            title_fmt = 'Subject {}\npooled by={}, similarity metric={}'
            if 'sample' not in pool_on:
                title_fmt += ', aggregation function={}'
            fig, ax = plt.subplots(figsize=(20, 20))
            ax = sns.heatmap(sdf, annot=True, linewidths=.25, vmin=0, vmax=1)
            ax.set_title(title_fmt.format(subject.identifier,
                                          ' & '.join(pool_on), sim_func,
                                          agg_func))
            with writer.get_handle(name + '.pdf', 'wb+') as fh:
                plt.savefig(fh, bbox_inches='tight', format='pdf')
        return writer.get_zip_value()

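# Usage sketch (hypothetical): per-subject clone-overlap heatmaps pooled by
# subject using cosine similarity. `sim_func` is looked up on the imported
# `distance` module (presumably scipy.spatial.distance) and `agg_func` on
# numpy, so any function name available there should work.
def _example_write_clone_overlap(session):
    return write_clone_overlap(session, pool_on=('subject', ),
                               size_metric='copies', sim_func='cosine',
                               agg_func='median', zipped=True)
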
def do_task(self, clone_id):
    clone_inst = self.session.query(Clone).filter(
        Clone.id == clone_id).first()
    if not clone_inst:
        return
    self.info('Running clone {}'.format(clone_inst.id))

    sequences = self.session.query(Sequence).join(SequenceCollapse).filter(
        Sequence.clone_id == clone_id,
        SequenceCollapse.copy_number_in_subject >= self.min_seq_copies,
        SequenceCollapse.samples_in_subject >= self.min_seq_samples,
    )
    if self.exclude_stops:
        sequences = sequences.filter(Sequence.stop == 0)
    sequences = sequences.order_by(Sequence.v_length)

    try:
        tree = self.get_tree(clone_inst, sequences)
        if not tree:
            logger.warning('No sequences to make tree for clone {}'.format(
                clone_id))
            return
    except Exception as e:
        logger.error('Error running clone {}: {}'.format(clone_id, e))
        return

    for node_id, node in enumerate(tree.traverse()):
        node.add_feature('node_id', node_id)

    final = {
        'info': {
            'min_mut_copies': self.min_mut_copies,
            'min_mut_samples': self.min_mut_samples,
            'min_seq_copies': self.min_seq_copies,
            'min_seq_samples': self.min_seq_samples,
            'exclude_stops': self.exclude_stops,
            'full_seq': self.full_seq,
        },
        'tree': tree_as_dict(tree)
    }
    clone_inst.tree = json.dumps(final)
    self.session.add(clone_inst)
    self.session.commit()

def collapse_duplicate_alignments(bucket):
    uniques = []
    while bucket:
        alignment = bucket.pop()
        remaining = []
        for other_alignment in bucket:
            if (len(alignment.sequence.sequence) != len(
                    other_alignment.sequence.sequence)):
                logger.warning('Sequence lengths differ {} {}'.format(
                    alignment.sequence.seq_id,
                    other_alignment.sequence.seq_id))
                remaining.append(other_alignment)
                continue
            if dnautils.equal(alignment.sequence.sequence,
                              other_alignment.sequence.sequence):
                # Fold the duplicate's copies into the retained alignment
                alignment.sequence.copy_number += (
                    other_alignment.sequence.copy_number)
            else:
                remaining.append(other_alignment)
        bucket = remaining
        uniques.append(alignment)
    return uniques

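# Usage sketch: collapsing a bucket of equal-length alignments. Assumes the
# dnautils extension used above is importable; the stand-in objects carry only
# the attributes collapse_duplicate_alignments touches.
def _example_collapse_duplicates():
    from types import SimpleNamespace

    def make(seq_id, seq):
        return SimpleNamespace(
            sequence=SimpleNamespace(seq_id=seq_id, sequence=seq,
                                     copy_number=1))

    bucket = [make('s1', 'ACGT'), make('s2', 'ACGT'), make('s3', 'AAAA')]
    uniques = collapse_duplicate_alignments(bucket)
    # Two unique alignments remain; the 'ACGT' entry has copy_number == 2
    return uniques
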
def delete_samples(session, args):
    for sample_id in args.sample_ids:
        sample = session.query(Sample).get(sample_id)
        if not sample:
            logger.warning('Sample #{} does not exist'.format(sample_id))
            continue
        logger.info('Deleting sample #{}'.format(sample_id))
        sample.subject.reset()
        session.query(Sequence).filter(Sequence.sample == sample).delete()
        session.delete(sample)
    session.commit()

    for subject in session.query(Subject):
        if not subject.samples:
            logger.info('Deleting orphan subject "{}"'.format(
                subject.identifier))
            session.delete(subject)
    session.commit()

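# Usage sketch (hypothetical IDs): `args` mirrors the parsed CLI arguments;
# only the sample_ids attribute is read.
def _example_delete_samples(session):
    from types import SimpleNamespace
    delete_samples(session, SimpleNamespace(sample_ids=[4, 5]))
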
def update_metadata(session, args):
    SENTINEL = '__TEMP'  # Used to temporarily avoid duplicate name issues
    with open(args.new_metadata) as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        new_meta = {l['name']: l for l in reader}

    # Delete existing metadata
    sample_ids = {
        s.name: s.id
        for s in session.query(Sample).filter(Sample.name.in_(new_meta))
    }
    session.query(SampleMetadata).filter(
        SampleMetadata.sample_id.in_(
            sample_ids.values())).delete(synchronize_session='fetch')

    ignore_fields = ['name', 'new_name', 'subject', 'file_name']
    for sample_name, row in new_meta.items():
        if sample_name not in sample_ids:
            logger.warning(
                'No sample {} in database. Ignoring.'.format(sample_name))
            continue
        sample_id = sample_ids[sample_name]
        logger.info('Updating metadata for {}'.format(row['name']))
        session.add_all([
            SampleMetadata(sample_id=sample_id, key=k, value=v)
            for k, v in row.items()
            if k not in ignore_fields and v not in NA_VALUES
        ])

        if row['new_name'] != row['name']:
            logger.info(' Updating sample name to {}'.format(row['new_name']))
            session.query(Sample).filter(Sample.name == row['name']).update(
                {Sample.name: row['new_name'] + SENTINEL})

    logger.info('Verifying uniqueness')
    for sample in session.query(Sample).filter(
            Sample.name.like('%' + SENTINEL)):
        sample.name = sample.name[:-len(SENTINEL)]

    if session.query(Clone.id).filter(~Clone.tree.is_(None)).count() > 0:
        logger.warning('This database has at least one clonal lineage '
                       'constructed. All lineages will need to be updated '
                       'to reflect the modified metadata.')
    session.commit()

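# Illustrative TSV for update_metadata (tab-delimited). The name, new_name,
# subject, and file_name columns are handled specially; any other column
# becomes a SampleMetadata key/value pair:
#
#   name	new_name	subject	file_name	tissue
#   sample1	sample1_renamed	S1	sample1.fasta	blood
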
def _queue_tasks(session, sample_id, force, tasks):
    logger.info('Creating task queue to generate stats for sample {}.'.format(
        sample_id))
    existing_seq = session.query(Sequence).filter(
        Sequence.sample_id == sample_id)
    existing_nores = session.query(NoResult).filter(
        NoResult.sample_id == sample_id)
    if existing_seq.first() is None and existing_nores.first() is None:
        logger.warning('\tSKIPPING since there are no sequences in the '
                       'sample')
        return

    existing = session.query(SampleStats.sample_id).filter(
        SampleStats.sample_id == sample_id).first() is not None
    if force and existing:
        logger.warning('\tFORCING regeneration of stats')
    elif not force and existing:
        logger.warning('\tSKIPPING stats since they already exist and the '
                       '--force flag was not specified.')
        return

    min_cdr3, max_cdr3 = _get_cdr3_bounds(session, sample_id)
    for include_outliers in [True, False]:
        for only_full_reads in [True, False]:
            tasks.add_task({
                'func': 'seq',
                'sample_id': sample_id,
                'min_cdr3': min_cdr3,
                'max_cdr3': max_cdr3,
                'include_outliers': include_outliers,
                'only_full_reads': only_full_reads
            })
            tasks.add_task({
                'func': 'clone',
                'sample_id': sample_id,
                'min_cdr3': min_cdr3,
                'max_cdr3': max_cdr3,
                'include_outliers': include_outliers,
                'only_full_reads': only_full_reads
            })

def shutdown():
    if app.config['allow_shutdown']:
        logger.warning('Shutting down from remote request')
        sys.exit()
    return create_response(code=404)

def process_sample(session, sample, indexes, temp, v_germlines, j_germlines,
                   nproc):
    indels = session.query(
        Sequence.ai, Sequence.seq_id, Sequence.sample_id, Sequence.sequence
    ).filter(
        Sequence.sample_id == sample.id,
        Sequence.probable_indel_or_misalign == 1
    ).order_by(Sequence.seq_id)
    # Get the sequences that were not identifiable
    noresults = session.query(NoResult).filter(
        NoResult.sample_id == sample.id).order_by(NoResult.seq_id)

    if indels.count() == 0 and noresults.count() == 0:
        logger.info('Sample {} has no indels or noresults'.format(sample.id))
        return
    logger.info('Sample {} has {} indels and {} noresults'.format(
        sample.id, indels.count(), noresults.count()))

    mut_bucket = v_germlines.mut_bucket(sample.v_ties_mutations)
    len_bucket = v_germlines.length_bucket(sample.v_ties_len)
    bucket = '{}_{}'.format(str(mut_bucket).replace('.', ''), len_bucket)
    sample_v_germlines = get_formatted_ties(v_germlines.all_ties(
        sample.v_ties_len, sample.v_ties_mutations))
    sample_j_germlines = get_formatted_ties(j_germlines.all_ties(
        sample.v_ties_len, sample.v_ties_mutations))
    if bucket not in indexes:
        indexes.add(bucket)
        v_path = os.path.join(temp, 'v_genes_{}'.format(bucket))
        j_path = os.path.join(temp, 'j_genes_{}'.format(bucket))
        logger.info('Creating index for V-ties at {} length, {} '
                    'mutation'.format(len_bucket, mut_bucket))
        build_index(sample_v_germlines, v_path)
        build_index(sample_j_germlines, j_path)

    seq_path = os.path.join(temp, 'll_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        fh.write(get_fasta({
            'tp=Sequence|ai={}|sample_id={}|seq_id={}'.format(
                r.ai, r.sample_id, r.seq_id): r.sequence for r in indels}))
        fh.write(get_fasta({
            'tp=NoResult|pk={}|sample_id={}|seq_id={}'.format(
                r.pk, r.sample_id, r.seq_id): r.sequence for r in noresults}))

    alignments = {}
    logger.info('Running bowtie2 for V-gene sequences')
    for line in get_reader(align_reference(temp, 'v_genes_{}'.format(bucket),
                                           seq_path, nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        try:
            ref, seq, rem_seqs = create_seqs(
                ref_seq=sample_v_germlines[ref_gene].replace('-', ''),
                min_size=CDR3_OFFSET, **line)
        except KeyError as e:
            logger.warning('bowtie got invalid V: ' + str(e))
            continue
        if len(rem_seqs) == 0:
            continue

        ref, seq, seq_start = add_imgt_gaps(sample_v_germlines[ref_gene],
                                            ref, seq, line['ref_offset'])
        if len(ref) < CDR3_OFFSET:
            continue
        alignments[line['seq_id']] = {
            'v_germline': ref,
            'v_gene': line['reference'],
            'seq_start': seq_start,
            'v_sequence': seq,
            'v_rem_seq': rem_seqs[-1],
            'cdr3_start': len(ref)
        }

    seq_path = os.path.join(temp, 'll_j_{}.fasta'.format(sample.id))
    with open(seq_path, 'w+') as fh:
        seqs = {k: v['v_rem_seq'] for k, v in alignments.items()
                if len(v['v_rem_seq']) > 0}
        fh.write(get_fasta(seqs))

    tasks = []
    logger.info('Running bowtie2 for J-gene sequences')
    for line in get_reader(align_reference(temp, 'j_genes_{}'.format(bucket),
                                           seq_path, nproc)):
        line['ref_offset'] = int(line['ref_offset']) - 1
        ref_gene = line['reference']
        ref, seq, rem_seqs = create_seqs(
            ref_seq=sample_j_germlines[ref_gene].replace('-', ''),
            min_size=j_germlines.upstream_of_cdr3, **line)
        alignments[line['seq_id']]['j_gene'] = line['reference']

        full_seq = (alignments[line['seq_id']]['v_sequence'] +
                    alignments[line['seq_id']]['v_rem_seq'])
        if len(rem_seqs) > 0:
            full_seq = full_seq[:-len(rem_seqs[-1])]

        cdr3_end = len(full_seq)
        if len(ref) < j_germlines.upstream_of_cdr3:
            continue
        for i in range(j_germlines.upstream_of_cdr3):
            if ref[-i] != '-':
                cdr3_end -= 1
        alignments[line['seq_id']]['cdr3_end'] = cdr3_end
        cdr3_length = cdr3_end - alignments[line['seq_id']]['cdr3_start']

        full_germ = (alignments[line['seq_id']]['v_germline'] +
                     (GAP_PLACEHOLDER * cdr3_length))
        j_length = len(full_seq) - len(full_germ)
        if j_length <= 0 or cdr3_length <= 0:
            continue
        full_germ += ref[-j_length:]

        r_type, pk, sample_id, seq_id = [
            v.split('=', 1)[1] for v in line['seq_id'].split('|', 3)
        ]
        insertions = gap_positions(full_germ)
        deletions = gap_positions(full_seq)

        alignment = VDJAlignment(
            VDJSequence(seq_id, full_seq.replace(GAP_PLACEHOLDER, '-')))
        alignment.germline = full_germ.replace(GAP_PLACEHOLDER, '-')
        if len(alignment.germline) != len(alignment.sequence.sequence):
            continue
        alignment.v_gene.add(GeneName(alignments[line['seq_id']]['v_gene']))
        alignment.j_gene.add(GeneName(alignments[line['seq_id']]['j_gene']))
        alignment.seq_offset = alignments[line['seq_id']]['seq_start']
        # TODO: This should really look for a streak like in anchoring
        alignment.germline_cdr3 = '-' * cdr3_length
        gaps_in_seq = alignment.sequence.sequence[
            alignment.seq_start:alignments[line['seq_id']]['cdr3_start']
        ].count('-')
        alignment.v_length = (alignments[line['seq_id']]['cdr3_start'] -
                              alignment.seq_offset) - gaps_in_seq
        alignment.j_length = j_length
        alignment.v_mutation_fraction = 1 - (alignment.v_match /
                                             alignment.v_length)
        alignment.cdr3_start = alignments[line['seq_id']]['cdr3_start']
        alignment.cdr3_num_nts = cdr3_length
        alignment.post_cdr3_length = j_length
        alignment.insertions = insertions
        alignment.deletions = deletions
        alignment.locally_aligned = True

        tasks.append({
            'r_type': r_type,
            'pk': int(pk),
            'sample_id': int(sample_id),
            'alignment': alignment
        })
    return tasks

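# The FASTA headers written in process_sample encode each record's origin so
# it can be recovered after alignment, e.g. (illustrative values):
#
#   tp=Sequence|ai=123|sample_id=4|seq_id=M01234:55:0001
#
# The J-gene loop splits this back apart on '|' and '=' into r_type
# ('Sequence' or 'NoResult'), pk (the ai or pk value), sample_id, and seq_id.
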
def cleanup(self):
    for fn in self.files:
        try:
            os.remove(fn)
        except OSError as e:
            logger.warning('Could not remove {}: {}'.format(fn, e))

def add_noresults_for_vdj(session, vdj, sample, reason):
    try:
        session.add(get_noresult_from_vdj(session, vdj, sample, reason))
    except ValueError:
        logger.warning('Unable to add noresult')

def shutdown():
    if allow_shutdown:
        logger.warning('Shutting down from remote request')
        os.kill(os.getppid(), signal.SIGINT)
    return create_response(code=404)

def update_metadata(session, args):
    SENTINEL = '__TEMP'  # Used to temporarily avoid duplicate name issues
    IGNORE_FIELDS = ['id', 'name', 'subject']
    with open(args.new_metadata) as fh:
        reader = csv.DictReader(fh, delimiter='\t')
        new_meta = {int(l['id']): l for l in reader}

    # Delete existing metadata
    session.query(SampleMetadata).filter(
        SampleMetadata.sample_id.in_(
            new_meta.keys())).delete(synchronize_session='fetch')

    old_subjects = set()
    for sample_id, row in new_meta.items():
        logger.info('Updating metadata for #{id}: {name}'.format(**row))
        sample = session.query(Sample).get(sample_id)

        # Update subject
        if sample.subject.identifier != row['subject']:
            logger.info(
                'Subject for sample "{}" changed from {} -> {}'.format(
                    sample.name, sample.subject.identifier, row['subject']))
            old_subjects.add(sample.subject)
            new_subject = session.query(Subject).filter(
                Subject.study == sample.study,
                Subject.identifier == row['subject']).first()
            if not new_subject:
                new_subject = Subject(study=sample.study,
                                      identifier=row['subject'])
                session.add(new_subject)
                session.flush()
                logger.info('\tNew subject found')
            else:
                old_subjects.add(new_subject)
            sample.subject = new_subject
            assert new_subject.id is not None
            session.query(Sequence).filter(Sequence.sample == sample).update(
                {'subject_id': new_subject.id})

            for subject in session.query(Subject):
                if not subject.samples:
                    logger.info('Deleting orphan subject "{}"'.format(
                        subject.identifier))
                    session.delete(subject)
                elif subject in old_subjects:
                    logger.info('Resetting subject "{}"'.format(
                        subject.identifier))
                    subject.reset()

        # Update metadata
        session.add_all([
            SampleMetadata(sample=sample, key=k, value=v)
            for k, v in row.items()
            if k not in IGNORE_FIELDS and v not in NA_VALUES
        ])

        # Update name
        sample.name = row['name'] + SENTINEL

    session.commit()
    for sample in session.query(Sample).filter(
            Sample.name.like('%' + SENTINEL)):
        sample.name = sample.name[:-len(SENTINEL)]

    if session.query(Clone.id).filter(~Clone.tree.is_(None)).count() > 0:
        logger.warning('This database has at least one clonal lineage '
                       'constructed. All lineages will need to be updated '
                       'to reflect the modified metadata.')
    session.commit()
