def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    seqs = session.query(Sequence).filter(
        Sequence.locally_aligned.is_(True),
        Sequence.sample_id == sample.id).order_by(Sequence.ai)

    for seq in seqs:
        potential_collapse = session.query(Sequence).filter(
            Sequence.sample_id == sample.id,
            Sequence.v_gene == seq.v_gene,
            Sequence.j_gene == seq.j_gene,
            Sequence.cdr3_num_nts == seq.cdr3_num_nts,
        ).order_by(desc(Sequence.copy_number), Sequence.ai)

        for other_seq in potential_collapse:
            if (other_seq.seq_id == seq.seq_id or
                    len(other_seq.sequence) != len(seq.sequence)):
                continue

            if dnautils.equal(other_seq.sequence, seq.sequence):
                other_seq.copy_number += seq.copy_number
                session.delete(seq)
                break
    session.commit()

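# Every collapse step in this module funnels through dnautils.equal, a
# compiled extension that is opaque at this level. A rough, unofficial
# sketch of the semantics the callers appear to rely on ("the same except
# for Ns", per the comments further down): a same-length comparison in
# which 'N' acts as a wildcard. The exact wildcard set is an assumption;
# this stand-in is for illustration only, not the real implementation.
def _equal_sketch(seq1, seq2, wildcards=frozenset('N')):
    if len(seq1) != len(seq2):
        return False
    return all(a == b or a in wildcards or b in wildcards
               for a, b in zip(seq1, seq2))
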
def get_cigar(ref, qry):
    assert len(ref) == len(qry)
    cnt = 0
    current_op = None
    cigar = []
    for r, q in zip(ref, qry):
        if r == q == '-':
            continue
        elif dnautils.equal(r, q):
            op = '='
        elif r == '-':
            op = 'I'
        elif q == '-':
            op = 'D'
        else:
            op = 'X'

        if op != current_op:
            if current_op:
                cigar.append('{}{}'.format(cnt, current_op))
            current_op = op
            cnt = 1
        else:
            cnt += 1

    # Guard against empty or all-gap input, which would otherwise append
    # the literal string '0None'.
    if current_op:
        cigar.append('{}{}'.format(cnt, current_op))
    return ''.join(cigar)

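# Worked example for get_cigar (hand-checked against the code above):
#   get_cigar('ACGT-ACG', 'AC-TAACG')  ->  '2=1D1=1I3='
# Columns where both strings are '-' are skipped entirely, 'D' marks
# columns where only the query has '-', and 'I' marks columns where only
# the reference has '-'.
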
def add_uniques(session, sample, vdjs, realign_len=None, realign_mut=None,
                min_similarity=0, max_vties=50, trim_to=None,
                max_padding=None):
    bucketed_seqs = OrderedDict()
    vdjs = sorted(vdjs, key=lambda v: v.ids[0])
    for vdj in funcs.periodic_commit(session, vdjs):
        try:
            if realign_len is not None:
                vdj.align_to_germline(realign_len, realign_mut, trim_to)
            if vdj.v_match / float(vdj.v_length) < min_similarity:
                raise AlignmentException('V-identity too low {} < {}'.format(
                    vdj.v_match / float(vdj.v_length), min_similarity))
            if len(vdj.v_gene) > max_vties:
                raise AlignmentException('Too many V-ties {} > {}'.format(
                    len(vdj.v_gene), max_vties))
            if max_padding is not None and vdj.pad_length > max_padding:
                raise AlignmentException('Too much padding {} (max {})'.format(
                    vdj.pad_length, max_padding))

            bucket_key = (funcs.format_ties(vdj.v_gene,
                                            vdj.v_germlines.prefix,
                                            strip_alleles=True),
                          funcs.format_ties(vdj.j_gene,
                                            vdj.j_germlines.prefix,
                                            strip_alleles=True),
                          len(vdj.cdr3))

            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if vdj.sequence in bucket:
                bucket[vdj.sequence].ids += vdj.ids
            else:
                bucket[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
        except Exception:
            # A bare ``except:`` would also swallow KeyboardInterrupt and
            # SystemExit; catch Exception instead.
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(vdj.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(
            session, bucketed_seqs.iteritems()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.ids), s.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.ids += smaller.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()

def add_uniques(session, sample, alignments, props, aligner,
                realign_len=None, realign_mut=None):
    bucketed_seqs = OrderedDict()
    alignments = sorted(alignments, key=lambda v: v.sequence.ids[0])
    for alignment in funcs.periodic_commit(session, alignments):
        try:
            if realign_len is not None:
                aligner.align_to_germline(alignment, realign_len,
                                          realign_mut)
                if props.trim_to:
                    alignment.trim_to(props.trim_to)
            props.validate(alignment)

            bucket_key = (funcs.format_ties(alignment.v_gene),
                          funcs.format_ties(alignment.j_gene),
                          len(alignment.cdr3))

            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if alignment.sequence.sequence in bucket:
                bucket[alignment.sequence.sequence].sequence.ids += (
                    alignment.sequence.ids)
            else:
                bucket[alignment.sequence.sequence] = alignment
        except AlignmentException as e:
            add_as_noresult(session, alignment.sequence, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(alignment.sequence.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(
            session, bucketed_seqs.iteritems()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.sequence.ids),
                                          s.sequence.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]
                if dnautils.equal(larger.sequence.sequence,
                                  smaller.sequence.sequence):
                    larger.sequence.ids += smaller.sequence.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()

def read_file(session, fmt, handle, sample, v_germlines, j_germlines,
              props):
    reader = csv.DictReader(handle, delimiter='\t')

    uniques = {}
    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue

        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines,
                                         j_germlines)
            for other in uniques.setdefault(
                    len(alignment.sequence.sequence), []):
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                uniques[len(alignment.sequence.sequence)].append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            # Report the failing sequence itself; ``seq`` here would be a
            # stale reference to the last line read above.
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if len(lens) > 0:
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))
    session.commit()

def process_collapse(sequences):
    sequences = sorted(sequences,
                       key=lambda s: (s.sequence.copy_number,
                                      s.sequence.seq_id),
                       reverse=True)
    uniques = []
    while len(sequences) > 0:
        larger = sequences.pop(0)
        for i in reversed(range(len(sequences))):
            smaller = sequences[i]
            if dnautils.equal(larger.sequence.sequence,
                              smaller.sequence.sequence):
                larger.sequence.copy_number += smaller.sequence.copy_number
                del sequences[i]
        uniques.append(larger)
    return uniques

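# Hypothetical walk-through of process_collapse using hand-made stand-ins
# (these classes are for demonstration only; real callers pass alignment
# objects backed by the database).
class _FakeSeq(object):
    def __init__(self, seq_id, sequence, copy_number):
        self.seq_id = seq_id
        self.sequence = sequence
        self.copy_number = copy_number


class _FakeAlignment(object):
    def __init__(self, seq_id, sequence, copy_number):
        self.sequence = _FakeSeq(seq_id, sequence, copy_number)


# Under the N-as-wildcard semantics sketched near the top of this file,
#   process_collapse([_FakeAlignment('a', 'ACGT', 10),
#                     _FakeAlignment('b', 'ACNT', 3),
#                     _FakeAlignment('c', 'TTTT', 2)])
# yields two uniques: 'a' with copy_number 13 (absorbing 'b') and 'c'
# unchanged with copy_number 2.
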
def collapse_duplicates(bucket):
    uniques = []
    while bucket:
        alignment = bucket.pop()
        # Iterate by index in reverse so removing an element does not skip
        # the entry that shifts into its place (popping inside a forward
        # ``enumerate`` loop would do exactly that).
        for i in reversed(range(len(bucket))):
            other_alignment = bucket[i]
            if (len(alignment.sequence.sequence) !=
                    len(other_alignment.sequence.sequence)):
                logger.warning('Sequence lengths differ {} {}'.format(
                    alignment.sequence.seq_id,
                    other_alignment.sequence.seq_id))
                continue
            if dnautils.equal(alignment.sequence.sequence,
                              other_alignment.sequence.sequence):
                alignment.sequence.copy_number += (
                    other_alignment.sequence.copy_number)
                bucket.pop(i)
        uniques.append(alignment)
    return uniques

def tcell_clones(self, bucket):
    updates = []
    clones = OrderedDict()
    consensus_needed = set([])

    for seq in self.get_query(bucket, False):
        key = (seq.v_gene, seq.j_gene, seq.cdr3_nt)
        if key in clones:
            clone = clones[key]
        else:
            # A CDR3 containing Ns can be dnautils-equal to an existing
            # clone's CDR3 while hashing to a different key, so fall back
            # to a linear scan before creating a new clone.
            for test_clone in clones.values():
                same_bin = (test_clone.v_gene == key[0] and
                            test_clone.j_gene == key[1] and
                            test_clone.cdr3_num_nts == len(key[2]))
                if same_bin and dnautils.equal(test_clone.cdr3_nt, key[2]):
                    clone = test_clone
                    break
            else:
                new_clone = Clone(subject_id=seq.subject_id,
                                  v_gene=seq.v_gene,
                                  j_gene=seq.j_gene,
                                  cdr3_nt=seq.cdr3_nt,
                                  cdr3_num_nts=seq.cdr3_num_nts,
                                  _insertions=seq._insertions,
                                  _deletions=seq._deletions)
                clones[key] = new_clone
                self._session.add(new_clone)
                self._session.flush()
                clone = new_clone
                consensus_needed.add(new_clone.id)
        updates.append({
            'sample_id': seq.sample_id,
            'ai': seq.ai,
            'clone_id': clone.id
        })

    if len(updates) > 0:
        self._session.bulk_update_mappings(Sequence, updates)
    generate_consensus(self._session, consensus_needed)

def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    all_seqs = session.query(Sequence).filter(
        Sequence.sample == sample).order_by(Sequence.copy_number.desc())

    buckets = {}
    for seq in all_seqs:
        key = (seq.v_gene, seq.j_gene, seq.cdr3_num_nts, seq._insertions,
               seq._deletions)
        buckets.setdefault(key, []).append(seq)

    for bucket in buckets.values():
        while len(bucket) > 0:
            larger = bucket.pop(0)
            for i in reversed(range(len(bucket))):
                smaller = bucket[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.copy_number += smaller.copy_number
                    session.delete(smaller)
                    del bucket[i]
    session.commit()

def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    seqs = session.query(
        Sequence.ai, Sequence.seq_id, Sequence.v_gene, Sequence.j_gene,
        Sequence.cdr3_num_nts, Sequence.copy_number, Sequence.sequence
    ).filter(
        Sequence.locally_aligned.is_(True),
        Sequence.sample_id == sample.id
    ).order_by(Sequence.ai)

    for seq in seqs:
        potential_collapse = session.query(
            Sequence.ai, Sequence.sequence, Sequence.copy_number
        ).filter(
            Sequence.sample_id == sample.id,
            Sequence.v_gene == seq.v_gene,
            Sequence.j_gene == seq.j_gene,
            Sequence.cdr3_num_nts == seq.cdr3_num_nts,
        ).order_by(desc(Sequence.copy_number), Sequence.ai)

        for other_seq in potential_collapse:
            if (other_seq.ai == seq.ai or
                    len(other_seq.sequence) != len(seq.sequence)):
                continue

            if dnautils.equal(other_seq.sequence, seq.sequence):
                # Re-point any duplicates of the removed sequence at the
                # surviving one, then record the removed sequence itself
                # as a duplicate of the survivor.
                session.query(DuplicateSequence).filter(
                    DuplicateSequence.duplicate_seq_ai == seq.ai
                ).update({
                    'duplicate_seq_ai': other_seq.ai,
                }, synchronize_session=False)
                session.add(DuplicateSequence(
                    seq_id=seq.seq_id,
                    duplicate_seq_ai=other_seq.ai,
                    sample_id=sample.id))
                session.query(Sequence).filter(
                    Sequence.ai == seq.ai).delete()
                break
    session.commit()

def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    all_seqs = session.query(Sequence).filter(
        Sequence.sample == sample).order_by(Sequence.copy_number.desc())

    buckets = {}
    for seq in all_seqs:
        key = (seq.v_gene, seq.j_gene, seq.cdr3_num_nts)
        buckets.setdefault(key, []).append(seq)

    for bucket in buckets.values():
        while len(bucket) > 0:
            larger = bucket.pop(0)
            for i in reversed(range(len(bucket))):
                smaller = bucket[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.copy_number += smaller.copy_number
                    session.delete(smaller)
                    del bucket[i]
    session.commit()

def do_task(self, bucket):
    seqs = self._session.query(
        Sequence.sample_id, Sequence.ai, Sequence.seq_id,
        Sequence.sequence, Sequence.copy_number
    ).filter(
        Sequence.subject_id == bucket.subject_id,
        Sequence.v_gene == bucket.v_gene,
        Sequence.j_gene == bucket.j_gene,
        Sequence.cdr3_num_nts == bucket.cdr3_num_nts,
        Sequence._insertions == bucket._insertions,
        Sequence._deletions == bucket._deletions)

    to_process = sorted([{
        'sample_id': s.sample_id,
        'ai': s.ai,
        'seq_id': s.seq_id,
        'sequence': s.sequence,
        'cn': s.copy_number
    } for s in seqs], key=lambda e: -e['cn'])

    while len(to_process) > 0:
        # Get the largest sequence in the list
        larger = to_process.pop(0)
        # Iterate over all smaller sequences to find matches
        instances = 1
        samples = set([larger['sample_id']])
        for i in reversed(range(len(to_process))):
            smaller = to_process[i]
            if len(larger['sequence']) != len(smaller['sequence']):
                self.warning('Tried to collapse sequences of different '
                             'lengths. AIs are {} {}'.format(
                                 larger['ai'], smaller['ai']))
            elif dnautils.equal(larger['sequence'], smaller['sequence']):
                # Add the smaller sequence's copy number to the larger
                larger['cn'] += smaller['cn']
                # If the smaller sequence matches the larger, collapse it
                # to the larger
                self._session.add(SequenceCollapse(**{
                    'sample_id': smaller['sample_id'],
                    'seq_ai': smaller['ai'],
                    'collapse_to_subject_seq_ai': larger['ai'],
                    'collapse_to_subject_sample_id': larger['sample_id'],
                    'collapse_to_subject_seq_id': larger['seq_id'],
                    'instances_in_subject': 0,
                    'copy_number_in_subject': 0,
                    'samples_in_subject': 0,
                }))
                instances += 1
                samples.add(smaller['sample_id'])
                # Delete the smaller sequence from the list to process
                # since it's been collapsed
                del to_process[i]

        # Update the larger sequence's copy number and "collapse" to itself
        self._session.add(SequenceCollapse(**{
            'sample_id': larger['sample_id'],
            'seq_ai': larger['ai'],
            'collapse_to_subject_sample_id': larger['sample_id'],
            'collapse_to_subject_seq_id': larger['seq_id'],
            'collapse_to_subject_seq_ai': larger['ai'],
            'instances_in_subject': instances,
            'copy_number_in_subject': larger['cn'],
            'samples_in_subject': len(samples),
        }))

    self._session.commit()
    self._tasks += 1
    if self._tasks > 0 and self._tasks % 100 == 0:
        self.info('Collapsed {} buckets'.format(self._tasks))

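# Worked example of the bookkeeping above (hypothetical values). Given a
# bucket containing, sorted by copy number descending:
#   ai 10  sample 1  cn 5  'ACGT'
#   ai 20  sample 2  cn 2  'ACNT'
#   ai 30  sample 1  cn 1  'GGGG'
# ai 20 is dnautils-equal to ai 10 (N wildcard), so ai 20 gets a
# SequenceCollapse row with zeroed subject totals pointing at ai 10, while
# ai 10 collapses to itself with instances_in_subject=2,
# copy_number_in_subject=7, and samples_in_subject=2. ai 30 matches
# nothing and collapses to itself with instances 1, copies 1, samples 1.
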
def _check_j_with_missing(self, sequence, match):
    # ``+ 1`` keeps the final window, flush against the end of
    # ``sequence``, reachable; without it an exact match at the last
    # possible offset is missed.
    for pos in range(len(sequence) - len(match) + 1):
        ss = sequence[pos:pos + len(match)]
        if dnautils.equal(ss, match):
            return pos
    return -1

def can_subclone(sub_clone, parent_clone):
    return dnautils.equal(parent_clone.cdr3_aa.replace('X', '-'),
                          sub_clone.cdr3_aa.replace('X', '-'))

def sliding_window_match(sequence, match):
    # As in _check_j_with_missing above, ``+ 1`` makes the final window
    # reachable.
    for pos in range(len(sequence) - len(match) + 1):
        ss = sequence[pos:pos + len(match)]
        if dnautils.equal(ss, match):
            return pos
    return -1

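# Hypothetical calls (assuming the N-as-wildcard semantics sketched at the
# top of this file):
#   sliding_window_match('AACGTT', 'CGT')  ->  2
#   sliding_window_match('AANGT', 'CGT')   ->  2   # 'N' matches 'C'
#   sliding_window_match('AAAA', 'CG')     ->  -1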