def add_uniques(session, sample, vdjs, realign_len=None, realign_mut=None,
                min_similarity=0, max_vties=50, trim_to=None,
                max_padding=None):
    bucketed_seqs = OrderedDict()
    vdjs = sorted(vdjs, key=lambda v: v.ids[0])
    for vdj in funcs.periodic_commit(session, vdjs):
        try:
            if realign_len is not None:
                vdj.align_to_germline(realign_len, realign_mut, trim_to)
            if vdj.v_match / float(vdj.v_length) < min_similarity:
                raise AlignmentException(
                    'V-identity too low {} < {}'.format(
                        vdj.v_match / float(vdj.v_length), min_similarity))
            if len(vdj.v_gene) > max_vties:
                raise AlignmentException('Too many V-ties {} > {}'.format(
                    len(vdj.v_gene), max_vties))
            if max_padding is not None and vdj.pad_length > max_padding:
                raise AlignmentException(
                    'Too much padding {} (max {})'.format(
                        vdj.pad_length, max_padding))
            bucket_key = (
                funcs.format_ties(vdj.v_gene, vdj.v_germlines.prefix,
                                  strip_alleles=True),
                funcs.format_ties(vdj.j_gene, vdj.j_germlines.prefix,
                                  strip_alleles=True),
                len(vdj.cdr3)
            )
            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if vdj.sequence in bucket:
                bucket[vdj.sequence].ids += vdj.ids
            else:
                bucket[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
        except Exception:
            # Catch Exception rather than using a bare except, which would
            # also swallow KeyboardInterrupt and SystemExit.
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(vdj.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(
            session, bucketed_seqs.iteritems()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.ids), s.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            # Absorb every smaller sequence that matches except at Ns.
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.ids += smaller.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()

def add_uniques(session, sample, alignments, props, aligner,
                realign_len=None, realign_mut=None):
    bucketed_seqs = OrderedDict()
    alignments = sorted(alignments, key=lambda v: v.sequence.ids[0])
    for alignment in funcs.periodic_commit(session, alignments):
        try:
            if realign_len is not None:
                aligner.align_to_germline(alignment, realign_len,
                                          realign_mut)
                if props.trim_to:
                    alignment.trim_to(props.trim_to)
            props.validate(alignment)
            bucket_key = (
                funcs.format_ties(alignment.v_gene),
                funcs.format_ties(alignment.j_gene),
                len(alignment.cdr3)
            )

            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if alignment.sequence.sequence in bucket:
                bucket[alignment.sequence.sequence].sequence.ids += (
                    alignment.sequence.ids)
            else:
                bucket[alignment.sequence.sequence] = alignment
        except AlignmentException as e:
            add_as_noresult(session, alignment.sequence, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(alignment.sequence.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(
            session, bucketed_seqs.iteritems()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.sequence.ids),
                                          s.sequence.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]
                if dnautils.equal(larger.sequence.sequence,
                                  smaller.sequence.sequence):
                    larger.sequence.ids += smaller.sequence.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()

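Both versions of add_uniques delegate the base-by-base comparison in the
collapse step to dnautils.equal. A minimal pure-Python sketch of the behavior
that step relies on, assuming dnautils.equal treats 'N' as a wildcard that
matches any base (the helper name equal_with_ns is illustrative, not part of
the library):

def equal_with_ns(seq_a, seq_b):
    # Two sequences collapse together when every position either matches
    # exactly or at least one side is the ambiguous base 'N'.
    if len(seq_a) != len(seq_b):
        return False
    return all(a == b or a == 'N' or b == 'N'
               for a, b in zip(seq_a, seq_b))

assert equal_with_ns('ACGT', 'ACNT')
assert not equal_with_ns('ACGT', 'ACCT')

Because each bucket is sorted by copy count before collapsing, a read's IDs
always accrete onto the most abundant compatible sequence.
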
def generate_consensus(session, clone_ids):
    """Generates consensus CDR3s for clones.

    :param Session session: The database session
    :param list clone_ids: The list of clone IDs to assign to groups

    """
    if len(clone_ids) == 0:
        return
    for clone in funcs.periodic_commit(
            session,
            session.query(Clone).filter(Clone.id.in_(clone_ids)),
            interval=1000):
        seqs = session.query(
            Sequence
        ).join(SequenceCollapse).filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0
        ).all()
        clone.cdr3_nt = funcs.consensus([s.cdr3_nt for s in seqs])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)
        clone.germline = generate_germline(session, seqs, clone)

    session.commit()

def generate_consensus(session, clone_ids):
    """Generates consensus CDR3s for clones.

    :param Session session: The database session
    :param list clone_ids: The list of clone IDs to assign to groups

    """
    if not clone_ids:
        return
    for clone in funcs.periodic_commit(
            session,
            session.query(Clone).filter(Clone.id.in_(clone_ids)),
            interval=1000):
        seqs = session.query(Sequence).join(SequenceCollapse).filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0
        ).all()
        clone.cdr3_nt = funcs.consensus([s.cdr3_nt for s in seqs])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)
        clone.germline = generate_germline(session, seqs, clone)
        clone.overall_total_cnt = sum([s.copy_number for s in seqs])
        clone.functional = (
            clone.cdr3_num_nts % 3 == 0 and
            '*' not in clone.cdr3_aa and
            not lookups.has_stop(clone.germline)
        )
    session.commit()

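Each generate_consensus variant reduces a clone's member CDR3s to one
nucleotide string before translating it. A per-position majority vote is one
plausible reading of what funcs.consensus does here; the implementation below
is an illustrative assumption, not the library's code. Equal input lengths
are safe to assume since sequences are bucketed by CDR3 length upstream.

from collections import Counter

def consensus(strings):
    # Take the most common character at each position across all inputs;
    # assumes every string has the same length.
    return ''.join(
        Counter(chars).most_common(1)[0][0]
        for chars in zip(*strings)
    )

print(consensus(['TGTGCA', 'TGTGCC', 'TGTGCA']))  # -> 'TGTGCA'
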
def add_sequences_from_sample(session, sample, sequences, props):
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue
            if sequence['r_type'] == 'NoResult':
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']
                ).delete(synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,
                    'probable_indel_or_misalign':
                        alignment.has_possible_indel,
                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),
                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,
                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,
                    'removed_prefix':
                        alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                        alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,
                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,
                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,
                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,
                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),
                    'germline': alignment.germline
                }
                # This doesn't actually add anything to the DB; it's just
                # to validate the fields before the bulk update below.
                Sequence(**fields)

                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']
                ).update(fields, synchronize_session=False)
        except ValueError:
            continue

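The throwaway Sequence(**fields) construction above exists only to let the
ORM's validators run before the bulk update(), which writes SQL directly and
bypasses model-level checks. A self-contained sketch of that pattern, with a
hypothetical model invented for illustration (assumes SQLAlchemy 1.4+):

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base, validates

Base = declarative_base()

class Item(Base):
    __tablename__ = 'items'
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)

    @validates('name')
    def check_name(self, key, value):
        if not value:
            raise ValueError('name must be non-empty')
        return value

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Item(id=1, name='old'))
    session.commit()

    fields = {'name': 'new'}
    Item(**fields)  # runs @validates hooks; raises ValueError on bad input
    # Bulk update skips the ORM's validators, hence the check above.
    session.query(Item).filter(Item.id == 1).update(
        fields, synchronize_session=False)
    session.commit()
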
def add_results(uniques, sample, session):
    metrics = {'muts': [], 'lens': []}
    for unique in funcs.periodic_commit(session, itertools.chain(*uniques),
                                        interval=1000):
        try:
            add_sequences(session, [unique], sample)
            metrics['lens'].append(unique.v_length)
            metrics['muts'].append(unique.v_mutation_fraction)
        except AlignmentException as e:
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))
    if metrics['lens']:
        sample.v_ties_len = sum(metrics['lens']) / len(metrics['lens'])
        sample.v_ties_mutations = sum(metrics['muts']) / len(metrics['muts'])
    session.commit()

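funcs.periodic_commit appears in nearly every loop in this section. Judging
by the call sites (periodic_commit(session, iterable) with an optional
interval, positional or keyword), it wraps an iterable and commits the
session every interval items so a long run never accumulates one enormous
transaction. A minimal sketch under that assumption; the body below is
illustrative, not the library's implementation:

def periodic_commit(session, iterable, interval=10000):
    # Yield items unchanged, committing the session every `interval`
    # items, plus a final commit once the iterable is exhausted.
    for i, item in enumerate(iterable, start=1):
        yield item
        if i % interval == 0:
            session.commit()
    session.commit()
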
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample,
                              result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = sum([v.v_length for v in alignments]) / len(alignments)
        avg_mut = (sum([v.v_mutation_fraction for v in alignments]) /
                   len(alignments))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments), round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()

        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={'aligner': aligner, 'avg_len': avg_len,
                          'avg_mut': avg_mut, 'props': props},
        )
        logger.info('Adding noresults')
        for result in funcs.periodic_commit(session, v_ties['noresult'],
                                            100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we aren't copying everything between
        # processes
        concurrent.process_data(
            [list(v) for v in v_ties['success']],
            process_collapse,
            aggregate_collapse,
            nproc,
            aggregate_args={'db_config': db_config, 'sample_id': sample.id,
                            'props': props}
        )

        session.expire_all()
        session.commit()

    identified = int(session.query(
        func.sum(Sequence.copy_number)
    ).filter(
        Sequence.sample == sample
    ).scalar() or 0)
    noresults = int(session.query(
        func.count(NoResult.pk)
    ).filter(
        NoResult.sample == sample
    ).scalar() or 0)
    if identified + noresults:
        frac = int(100 * identified / (identified + noresults))
    else:
        frac = 0
    logger.info(
        'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
            sample.name, round((time.time() - start) / 60., 1), identified,
            identified + noresults, frac))
    session.close()

def do_task(self, args):
    meta = args['meta']
    self.info('Starting sample {}'.format(meta['sample_name']))
    study, sample = self._setup_sample(meta)

    vdjs = {}
    parser = SeqIO.parse(
        args['path'],
        'fasta' if args['path'].endswith('.fasta') else 'fastq')

    # Collapse identical sequences
    self.info('\tCollapsing identical sequences')
    for record in parser:
        try:
            seq = str(record.seq)
            if seq not in vdjs:
                vdjs[seq] = VDJSequence(
                    ids=[],
                    sequence=seq,
                    quality=funcs.ord_to_quality(
                        record.letter_annotations.get('phred_quality')))
            vdjs[seq].ids.append(record.description)
        except ValueError:
            continue

    alignments = {}
    aligner = AnchorAligner(self._v_germlines, self._j_germlines)
    self.info('\tAligning {} unique sequences'.format(len(vdjs)))
    # Attempt to align all unique sequences
    for sequence in funcs.periodic_commit(self._session,
                                          sorted(vdjs.keys())):
        vdj = vdjs[sequence]
        del vdjs[sequence]
        try:
            # Attempt the alignment. If the aligned sequence already
            # exists, append the seq_ids; otherwise add it as a new
            # unique sequence.
            alignment = aligner.get_alignment(vdj)
            seq_key = alignment.sequence.sequence
            if seq_key in alignments:
                alignments[seq_key].sequence.ids.extend(
                    alignment.sequence.ids)
            else:
                alignments[seq_key] = alignment
        except AlignmentException as e:
            add_as_noresult(self._session, vdj, sample, str(e))
        except Exception:
            self.error(
                '\tUnexpected error processing sequence {}\n\t{}'.format(
                    vdj.ids[0], traceback.format_exc()))

    if len(alignments) > 0:
        avg_len = (sum([v.v_length for v in alignments.values()]) /
                   float(len(alignments)))
        avg_mut = (sum([v.v_mutation_fraction
                        for v in alignments.values()]) /
                   float(len(alignments)))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                  'Length={}'.format(len(alignments), round(avg_mut, 2),
                                     round(avg_len, 2)))

        add_uniques(self._session, sample, alignments.values(),
                    self._props, aligner, avg_len, avg_mut)

    self._session.commit()
    self.info('Completed sample {}'.format(sample.name))

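funcs.ord_to_quality converts the list of Phred scores Biopython exposes
under letter_annotations['phred_quality'] back into a quality string. The
sketch below assumes the standard Sanger/Illumina 1.8+ offset of 33 and
None-handling for FASTA records (which carry no quality); both details are
assumptions, not confirmed library behavior:

def ord_to_quality(quality):
    # FASTA records have no quality scores; .get() returns None and the
    # sequence is stored without a quality string.
    if quality is None:
        return None
    # Phred score q -> ASCII chr(q + 33) (Sanger encoding).
    return ''.join(chr(q + 33) for q in quality)

assert ord_to_quality([0, 20, 40]) == '!5I'
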
def do_task(self, args):
    meta = args['meta']
    self.info('Starting sample {}'.format(meta.get('sample_name')))
    study, sample = self._setup_sample(meta)

    vdjs = {}
    parser = SeqIO.parse(
        os.path.join(args['path'], args['fn']),
        'fasta' if args['fn'].endswith('.fasta') else 'fastq')

    # Collapse identical sequences
    self.info('\tCollapsing identical sequences')
    for record in parser:
        seq = str(record.seq)
        if seq not in vdjs:
            vdjs[seq] = VDJSequence(
                ids=[],
                seq=seq,
                v_germlines=self._v_germlines,
                j_germlines=self._j_germlines,
                quality=funcs.ord_to_quality(
                    record.letter_annotations.get('phred_quality')))
        vdjs[seq].ids.append(record.description)

    self.info('\tAligning {} unique sequences'.format(len(vdjs)))
    # Attempt to align all unique sequences
    for sequence in funcs.periodic_commit(self._session,
                                          sorted(vdjs.keys())):
        vdj = vdjs[sequence]
        del vdjs[sequence]
        try:
            # Attempt the alignment. If the aligned sequence already
            # exists, append the seq_ids; otherwise add it as a new
            # unique sequence.
            vdj.analyze()
            if vdj.sequence in vdjs:
                vdjs[vdj.sequence].ids += vdj.ids
            else:
                vdjs[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(self._session, vdj, sample, str(e))
        except Exception:
            # Catch Exception rather than using a bare except, which
            # would also swallow KeyboardInterrupt and SystemExit.
            self.error(
                '\tUnexpected error processing sequence {}\n\t{}'.format(
                    vdj.ids[0], traceback.format_exc()))

    if len(vdjs) > 0:
        avg_len = (sum(map(lambda vdj: vdj.v_length, vdjs.values())) /
                   float(len(vdjs)))
        avg_mut = (sum(map(lambda vdj: vdj.mutation_fraction,
                           vdjs.values())) /
                   float(len(vdjs)))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        self.info('\tRe-aligning {} sequences to V-ties, Mutations={}, '
                  'Length={}'.format(len(vdjs), round(avg_mut, 2),
                                     round(avg_len, 2)))

        add_uniques(self._session, sample, vdjs.values(), avg_len,
                    avg_mut, self._min_similarity, self._max_vties,
                    self._trim_to, self._max_padding)

    self._session.commit()
    self.info('Completed sample {}'.format(sample.name))