def generate_consensus(session, clone_ids):
    """Compute and store consensus CDR3 information for each clone.

    For every clone in ``clone_ids`` this derives the consensus CDR3
    nucleotide/amino-acid strings, the clone germline, the overall copy
    count, and the functionality flag, committing periodically.

    :param Session session: The database session
    :param list clone_ids: IDs of the clones to update
    """
    if not clone_ids:
        return

    clone_query = session.query(Clone).filter(Clone.id.in_(clone_ids))
    for clone in funcs.periodic_commit(session, clone_query, interval=1000):
        # Only sequences that survived subject-level collapsing contribute
        # to the consensus.
        members = session.query(Sequence).join(SequenceCollapse).filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0
        ).all()

        clone.cdr3_nt = funcs.consensus([m.cdr3_nt for m in members])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)
        clone.germline = generate_germline(session, members, clone)
        clone.overall_total_cnt = sum(m.copy_number for m in members)

        # Functional: CDR3 is in frame and neither the CDR3 translation
        # nor the germline contains a stop codon.
        in_frame = clone.cdr3_num_nts % 3 == 0
        has_stop = ('*' in clone.cdr3_aa or
                    lookups.has_stop(clone.germline))
        clone.functional = in_frame and not has_stop
    session.commit()
def generate_consensus(session, clone_ids):
    """Generates consensus CDR3s for clones.

    :param Session session: The database session
    :param list clone_ids: The list of clone IDs to assign to groups

    """
    # Idiom fix: test emptiness via truthiness instead of len(...) == 0.
    if not clone_ids:
        return
    for clone in funcs.periodic_commit(
            session,
            session.query(Clone).filter(Clone.id.in_(clone_ids)),
            interval=1000):
        # Only sequences still present after subject-level collapsing
        # contribute to the consensus.
        seqs = session.query(
            Sequence
        ).join(SequenceCollapse).filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0
        ).all()
        clone.cdr3_nt = funcs.consensus([s.cdr3_nt for s in seqs])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)
        clone.germline = generate_germline(session, seqs, clone)
    session.commit()
def generate_consensus(session, clone_ids):
    """Generates consensus CDR3s for clones.

    :param Session session: The database session
    :param list clone_ids: The list of clone IDs to assign to groups

    """
    # Idiom fix: test emptiness via truthiness instead of len(...) == 0.
    if not clone_ids:
        return
    for clone in funcs.periodic_commit(
            session,
            session.query(Clone).filter(Clone.id.in_(clone_ids)),
            interval=1000):
        # Only sequences still present after subject-level collapsing
        # contribute to the consensus.
        seqs = session.query(
            Sequence
        ).join(SequenceCollapse).filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0
        ).all()
        clone.cdr3_nt = consensus([s.cdr3_nt for s in seqs])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)
        clone.germline = generate_germline(session, seqs, clone)
    session.commit()
def add_sequences_from_sample(session, sample, sequences, props):
    """Apply corrected alignments back onto an existing sample.

    Each entry in *sequences* is a dict with keys ``alignment``, ``r_type``
    and ``pk``.  NoResult rows that now align are promoted to Sequence rows
    (and the NoResult deleted); existing Sequence rows are updated in place.
    Alignments that fail ``props.validate`` or raise ValueError are skipped.

    :param Session session: The database session
    :param Sample sample: The sample the corrected sequences belong to
    :param list sequences: Dicts describing each corrected alignment
    :param props: Validator with a ``validate(alignment)`` method that
        raises AlignmentException on rejection
    """
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                # Rejected alignments are simply skipped.
                props.validate(alignment)
            except AlignmentException:
                continue
            if sequence['r_type'] == 'NoResult':
                # Previously unidentifiable read now aligns: insert it as a
                # real Sequence and remove the stale NoResult row.
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']).delete(
                    synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,
                    'probable_indel_or_misalign':
                        alignment.has_possible_indel,
                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),
                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,
                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,
                    'removed_prefix':
                        alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                        alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,
                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,
                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,
                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,
                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),
                    'germline': alignment.germline
                }
                # This doesn't actually add anything to the DB; constructing
                # the model is only a way to validate the field values.
                Sequence(**fields)
                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']).update(
                    fields, synchronize_session=False)
        except ValueError:
            # Field validation failed: leave the row untouched and move on.
            continue
def get_sample_vdjtools(session, sample, min_clone_size, clone_features):
    """Yield VDJtools-format TSV rows for a single sample's clones.

    Copy counts are aggregated per (v, j, cdr3_nt) feature tuple; rows
    are emitted in descending copy-count order, skipping features below
    *min_clone_size*.

    :param Session session: The database session
    :param Sample sample: The sample to export
    :param int min_clone_size: Minimum total copies for a row to appear
    :param dict clone_features: Maps clone ID -> (v, j, cdr3_nt)
    """
    tsv = StreamingTSV(['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j'])

    # Aggregate copy counts per feature tuple across the sample's clones.
    copies_by_feature = Counter()
    clone_stats = session.query(
        CloneStats.clone_id, CloneStats.total_cnt
    ).filter(CloneStats.sample_id == sample.id)
    for row in clone_stats:
        copies_by_feature[clone_features[row.clone_id]] += row.total_cnt
    grand_total = sum(copies_by_feature.values())

    yield tsv.writeheader()
    ordered = sorted(copies_by_feature, key=copies_by_feature.get,
                     reverse=True)
    for feature in ordered:
        copies = copies_by_feature[feature]
        if copies < min_clone_size:
            continue
        v_gene, j_gene, cdr3_nt = feature
        yield tsv.writerow({
            'count': copies,
            'freq': copies / grand_total,
            'cdr3nt': cdr3_nt,
            'cdr3aa': aas_from_nts(cdr3_nt),
            'v': v_gene,
            'd': '.',
            'j': j_gene,
        })
def add_as_sequence(session, alignment, sample, error_action='discard'):
    """Persist *alignment* as a Sequence row belonging to *sample*.

    Read IDs beyond the first are stored as DuplicateSequence rows that
    reference the new Sequence.

    :param Session session: The database session
    :param alignment: The alignment to store
    :param Sample sample: The owning sample
    :param str error_action: 'discard' records a NoResult and returns
        None on validation failure; 'raise' re-raises the ValueError

    :returns: The new Sequence, or None on discarded failure
    """
    try:
        seq = Sequence(
            seq_id=alignment.sequence.ids[0],
            sample_id=sample.id,
            subject_id=sample.subject.id,
            partial=alignment.partial,
            probable_indel_or_misalign=alignment.has_possible_indel,
            v_gene=funcs.format_ties(alignment.v_gene),
            j_gene=funcs.format_ties(alignment.j_gene),
            num_gaps=alignment.num_gaps,
            seq_start=alignment.seq_start,
            v_match=alignment.v_match,
            v_length=alignment.v_length,
            j_match=alignment.j_match,
            j_length=alignment.j_length,
            removed_prefix=alignment.sequence.removed_prefix_sequence,
            removed_prefix_qual=alignment.sequence.removed_prefix_quality,
            v_mutation_fraction=alignment.v_mutation_fraction,
            pre_cdr3_length=alignment.pre_cdr3_length,
            pre_cdr3_match=alignment.pre_cdr3_match,
            post_cdr3_length=alignment.post_cdr3_length,
            post_cdr3_match=alignment.post_cdr3_match,
            in_frame=alignment.in_frame,
            functional=alignment.functional,
            stop=alignment.stop,
            copy_number=len(alignment.sequence.ids),
            cdr3_nt=alignment.cdr3,
            cdr3_num_nts=len(alignment.cdr3),
            cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
            sequence=str(alignment.sequence.sequence),
            quality=alignment.sequence.quality,
            locally_aligned=alignment.locally_aligned,
            insertions=alignment.insertions,
            deletions=alignment.deletions,
            germline=alignment.germline)
        session.add(seq)
        # Flush so seq.ai (the generated key) is available for the
        # duplicate rows below.
        session.flush()

        # Add duplicate sequences; a ValueError here is deliberately
        # best-effort and must not discard the primary sequence.
        try:
            session.bulk_save_objects([
                DuplicateSequence(sample_id=sample.id,
                                  seq_id=seq_id,
                                  duplicate_seq_ai=seq.ai)
                for seq_id in alignment.sequence.ids[1:]
            ])
        except ValueError:  # fixed: dropped unused `as e` binding
            pass
        return seq
    except ValueError as e:
        if error_action == 'discard':
            add_as_noresult(session, alignment.sequence, sample, str(e))
            return None
        elif error_action == 'raise':
            # Bare raise preserves the original traceback (was `raise e`).
            raise
def _calculate_clone_stats(self, sample_id, min_cdr3, max_cdr3,
                           include_outliers, only_full_reads):
    """Aggregate per-clone statistics for one sample and record them.

    Builds one CloneContextStats per context in ``_clone_contexts``,
    runs a grouped query producing per-clone averages/counts, classifies
    each clone (in-frame / stop / functional) from its CDR3, feeds every
    clone to each context, then stores the results via ``_add_stat``.

    :param int sample_id: The sample whose clones are aggregated
    :param min_cdr3: CDR3-length bound; not used in this body — TODO
        confirm it is consumed by the contexts or callers
    :param max_cdr3: CDR3-length bound; same note as ``min_cdr3``
    :param bool include_outliers: Forwarded to ``_add_stat``
    :param bool only_full_reads: Restrict aggregation to non-partial reads
    """
    clone_statistics = {}
    for name, stat in _clone_contexts.items():
        clone_statistics[name] = CloneContextStats(seqs=None, **stat)

    # TODO: This should be automatically generated from _dist_fields
    # One row per clone: averaged alignment metrics plus selection
    # pressure (outer join, so clones without pressure data still appear).
    query = self._session.query(
        Sequence.clone_id,
        func.round(func.avg(Sequence.v_match)).label('v_match'),
        func.round(func.avg(Sequence.j_match)).label('j_match'),
        func.round(func.avg(Sequence.j_length)).label('j_length'),
        Sequence.v_gene, Sequence.j_gene,
        func.count(Sequence.seq_id).label('copy_number'),
        func.round(
            func.avg(Sequence.v_length + Sequence.num_gaps)
        ).label('v_length'),
        func.round(
            func.avg(100 * Sequence.v_match / Sequence.v_length)
        ).label('v_identity'),
        Sequence.cdr3_num_nts.label('cdr3_length'),
        SelectionPressure.sigma_fwr.label('sp_fwr'),
        SelectionPressure.sigma_cdr.label('sp_cdr'),
    ).join(
        SelectionPressure,
        and_(
            SelectionPressure.clone_id == Sequence.clone_id,
            SelectionPressure.sample_id == Sequence.sample_id
        ),
        isouter=True
    ).filter(
        Sequence.sample_id == sample_id,
        ~Sequence.clone_id.is_(None)
    )
    if only_full_reads:
        query = query.filter(Sequence.partial == 0)
    query = query.group_by(Sequence.clone_id)

    for clone in query:
        clone_info = self._session.query(Clone.cdr3_nt).filter(
            Clone.id == clone.clone_id).first()
        # Classify the clone from its consensus CDR3.
        in_frame = len(clone_info.cdr3_nt) % 3 == 0
        stop = '*' in lookups.aas_from_nts(clone_info.cdr3_nt)
        functional = in_frame and not stop
        # Each context decides for itself whether this clone matches.
        for name, stat in clone_statistics.items():
            stat.add_if_match(clone, in_frame, stop, functional)

    self._add_stat(clone_statistics, sample_id, include_outliers,
                   only_full_reads)
def add_as_sequence(session, vdj, sample):
    """Persist the aligned read *vdj* as a Sequence row for *sample*.

    Read IDs beyond the first become DuplicateSequence rows referencing
    the new Sequence.  If constructing the Sequence raises ValueError the
    read is recorded as a NoResult instead.

    :param Session session: The database session
    :param vdj: The aligned read to store
    :param Sample sample: The owning sample
    """
    try:
        seq = Sequence(seq_id=vdj.ids[0],
                       sample_id=sample.id,
                       subject_id=sample.subject.id,
                       partial=vdj.partial,
                       probable_indel_or_misalign=vdj.has_possible_indel,
                       v_gene=funcs.format_ties(vdj.v_gene,
                                                vdj.v_germlines.prefix,
                                                strip_alleles=True),
                       j_gene=funcs.format_ties(vdj.j_gene,
                                                vdj.j_germlines.prefix,
                                                strip_alleles=True),
                       num_gaps=vdj.num_gaps,
                       pad_length=vdj.pad_length,
                       v_match=vdj.v_match,
                       v_length=vdj.v_length,
                       j_match=vdj.j_match,
                       j_length=vdj.j_length,
                       removed_prefix=vdj.removed_prefix,
                       removed_prefix_qual=vdj.removed_prefix_qual,
                       v_mutation_fraction=vdj.mutation_fraction,
                       pre_cdr3_length=vdj.pre_cdr3_length,
                       pre_cdr3_match=vdj.pre_cdr3_match,
                       post_cdr3_length=vdj.post_cdr3_length,
                       post_cdr3_match=vdj.post_cdr3_match,
                       in_frame=vdj.in_frame,
                       functional=vdj.functional,
                       stop=vdj.stop,
                       copy_number=len(vdj.ids),
                       cdr3_nt=vdj.cdr3,
                       cdr3_num_nts=len(vdj.cdr3),
                       cdr3_aa=lookups.aas_from_nts(vdj.cdr3),
                       sequence=str(vdj.sequence),
                       quality=vdj.quality,
                       germline=vdj.germline)
        session.add(seq)
        # Flush so seq.ai (the generated key) exists for the duplicates.
        session.flush()

        # Add duplicate sequences
        try:
            session.bulk_save_objects([
                DuplicateSequence(sample_id=sample.id,
                                  seq_id=seq_id,
                                  duplicate_seq_ai=seq.ai)
                for seq_id in vdj.ids[1:]
            ])
        except ValueError:
            # Best effort: duplicate bookkeeping failures do not discard
            # the primary sequence.
            pass
    except ValueError as e:
        # Validation failed: keep the read as a NoResult with the reason.
        add_as_noresult(session, vdj, sample, str(e))
def get_seq_from_alignment(session, alignment, sample, strip_alleles=True):
    """Build the Sequence row(s) for *alignment* without adding them.

    :param Session session: The database session (used only for the
        NoResult fallback)
    :param alignment: The alignment to convert
    :param Sample sample: The owning sample
    :param bool strip_alleles: Forwarded positionally to
        ``funcs.format_ties`` — NOTE(review): verify it maps onto the
        intended parameter of ``format_ties``

    :returns: A one-element list containing the Sequence; on ValueError a
        one-element list with a NoResult; an empty list if even the
        NoResult construction fails
    """
    try:
        return [
            Sequence(
                seq_id=alignment.sequence.seq_id,
                sample_id=sample.id,
                subject_id=sample.subject.id,
                partial=alignment.partial,
                rev_comp=alignment.sequence.rev_comp,
                probable_indel_or_misalign=alignment.has_possible_indel,
                v_gene=funcs.format_ties(alignment.v_gene, strip_alleles),
                j_gene=funcs.format_ties(alignment.j_gene, strip_alleles),
                num_gaps=alignment.num_gaps,
                seq_start=alignment.seq_start,
                v_match=alignment.v_match,
                v_length=alignment.v_length,
                j_match=alignment.j_match,
                j_length=alignment.j_length,
                removed_prefix=alignment.sequence.removed_prefix_sequence,
                removed_prefix_qual=alignment.sequence.removed_prefix_quality,
                v_mutation_fraction=alignment.v_mutation_fraction,
                pre_cdr3_length=alignment.pre_cdr3_length,
                pre_cdr3_match=alignment.pre_cdr3_match,
                post_cdr3_length=alignment.post_cdr3_length,
                post_cdr3_match=alignment.post_cdr3_match,
                in_frame=alignment.in_frame,
                functional=alignment.functional,
                stop=alignment.stop,
                copy_number=alignment.sequence.copy_number,
                cdr3_nt=alignment.cdr3,
                cdr3_num_nts=len(alignment.cdr3),
                cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
                sequence=str(alignment.sequence.sequence),
                quality=alignment.sequence.quality,
                locally_aligned=alignment.locally_aligned,
                insertions=alignment.insertions,
                deletions=alignment.deletions,
                germline=alignment.germline)
        ]
    except ValueError as e:
        try:
            # Fall back to recording the read as unresolvable, keeping
            # the validation error message as the reason.
            return [
                get_noresult_from_vdj(session, alignment.sequence,
                                      sample, str(e))
            ]
        except ValueError:
            return []
def export_vdjtools(session, args):
    """Export per-sample clone tables in VDJtools format.

    Writes one tab-separated ``<sample name>.sample.txt`` file per sample,
    with one row per (v, j, cdr3_nt) clone feature, ordered by descending
    total copy count.

    :param Session session: The database session
    :param args: Parsed arguments providing ``include_uniques`` and
        ``min_clone_size``
    """
    fieldnames = ['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j']
    if args.include_uniques:
        fieldnames.append('unique')

    clone_features = {
        c.id: (c.v_gene, c.j_gene, c.cdr3_nt)
        for c in session.query(Clone.id, Clone.v_gene, Clone.j_gene,
                               Clone.cdr3_nt)
    }

    for sample in session.query(Sample).order_by(Sample.id):
        logger.info('Exporting sample {}'.format(sample.name))
        # Aggregate total/unique counts per (v, j, cdr3_nt) feature.
        sample_clones = {}
        stats = session.query(
            CloneStats.clone_id, CloneStats.total_cnt,
            CloneStats.unique_cnt).filter(CloneStats.sample_id == sample.id)
        for stat in stats:
            key = clone_features[stat.clone_id]
            sample_clones.setdefault(key,
                                     Counter())['total'] += stat.total_cnt
            sample_clones[key]['unique'] += stat.unique_cnt

        total = float(sum(c['total'] for c in sample_clones.values()))
        # BUG FIX: close the output file deterministically; the original
        # passed open(...) straight to DictWriter and leaked the handle.
        with open('{}.sample.txt'.format(sample.name), 'w+') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=fieldnames,
                                    delimiter='\t', extrasaction='ignore')
            writer.writeheader()
            # BUG FIX: sort explicitly on the total copy count.  The
            # original sorted on the Counter values themselves, which
            # raises TypeError before Python 3.10 and uses multiset
            # (partial) ordering afterwards.
            ordered = sorted(sample_clones,
                             key=lambda k: sample_clones[k]['total'],
                             reverse=True)
            for key in ordered:
                counts = sample_clones[key]
                if counts['total'] < args.min_clone_size:
                    continue
                v, j, cdr3_nt = key
                writer.writerow({
                    'count': counts['total'],
                    'freq': counts['total'] / total,
                    'cdr3nt': cdr3_nt,
                    'cdr3aa': aas_from_nts(cdr3_nt),
                    'v': v,
                    'd': '.',
                    'j': j,
                    'unique': counts['unique']
                })
def get_vdjtools_output(session, clones):
    """Yield VDJtools-format TSV rows aggregated over *clones*.

    Copy counts are summed per (v_gene, j_gene, cdr3_nt) feature tuple
    and rows are emitted in descending copy-count order.

    :param Session session: The database session (unused in this body)
    :param dict clones: Maps clone -> aggregate dict with
        ``['counts']['copies']``
    """
    writer = StreamingTSV(['count', 'freq', 'cdr3nt', 'cdr3aa',
                           'v', 'd', 'j'])

    counts = Counter()
    total_copies = 0
    for clone, agg in clones.items():
        key = (clone.v_gene, clone.j_gene, clone.cdr3_nt)
        copies = agg['counts']['copies']
        counts[key] += copies
        # BUG FIX: add this clone's own copies.  The original added the
        # running per-key total (counts[key]), double-counting earlier
        # clones that share the same feature tuple.
        total_copies += copies

    yield writer.writeheader()
    for key in sorted(counts, key=counts.get, reverse=True):
        count = counts[key]
        v, j, cdr3_nt = key
        yield writer.writerow({
            'count': count,
            'freq': count / total_copies,
            'cdr3nt': cdr3_nt,
            'cdr3aa': aas_from_nts(cdr3_nt),
            'v': v,
            'd': '.',
            'j': j,
        })
def add_sequences_from_sample(session, sample, sequences, props):
    """Apply corrected alignments back onto an existing sample.

    Each entry in *sequences* is a dict with keys ``alignment``, ``r_type``
    and ``pk``.  NoResult rows that now align are promoted to Sequence rows
    (and the NoResult deleted); existing Sequence rows are updated in place.
    Alignments that fail ``props.validate`` or raise ValueError are skipped.

    :param Session session: The database session
    :param Sample sample: The sample the corrected sequences belong to
    :param list sequences: Dicts describing each corrected alignment
    :param props: Validator with a ``validate(alignment)`` method that
        raises AlignmentException on rejection
    """
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                # Rejected alignments are simply skipped.
                props.validate(alignment)
            except AlignmentException:
                continue
            if sequence['r_type'] == 'NoResult':
                # Previously unidentifiable read now aligns: insert it as a
                # real Sequence and remove the stale NoResult row.
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']
                ).delete(synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,
                    'probable_indel_or_misalign':
                        alignment.has_possible_indel,
                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),
                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,
                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,
                    'removed_prefix':
                        alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                        alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,
                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,
                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,
                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,
                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),
                    'germline': alignment.germline
                }
                # This doesn't actually add anything to the DB; constructing
                # the model is only a way to validate the field values.
                Sequence(**fields)
                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']
                ).update(fields, synchronize_session=False)
        except ValueError:
            # Field validation failed: leave the row untouched and move on.
            continue