Example 1
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    seqs = session.query(Sequence).filter(
        Sequence.locally_aligned.is_(True),
        Sequence.sample_id == sample.id).order_by(Sequence.ai)

    for seq in seqs:
        potential_collapse = session.query(Sequence).filter(
            Sequence.sample_id == sample.id,
            Sequence.v_gene == seq.v_gene,
            Sequence.j_gene == seq.j_gene,
            Sequence.cdr3_num_nts == seq.cdr3_num_nts,
        ).order_by(desc(Sequence.copy_number), Sequence.ai)

        for other_seq in potential_collapse:
            if (other_seq.seq_id == seq.seq_id
                    or len(other_seq.sequence) != len(seq.sequence)):
                continue

            if dnautils.equal(other_seq.sequence, seq.sequence):
                other_seq.copy_number += seq.copy_number
                session.delete(seq)
                break

    session.commit()
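
Every example in this listing hinges on dnautils.equal, a helper from the surrounding project that compares two equal-length sequences; the "same except for Ns" comments further down suggest it treats N as a wildcard base. As a rough mental model only (the wildcard rule is an assumption, not the real implementation), a pure-Python stand-in might look like:

# Hypothetical stand-in for dnautils.equal: equal-length strings match
# position by position, with 'N' matching any base (assumed behavior).
def equal(seq1, seq2):
    assert len(seq1) == len(seq2)
    return all(a == b or 'N' in (a, b) for a, b in zip(seq1, seq2))

assert equal('ACGT', 'ACNT')        # N acts as a wildcard
assert not equal('ACGT', 'ACCT')    # a real mismatch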
Example 2
def get_cigar(ref, qry):
    assert len(ref) == len(qry)
    cnt = 0
    current_op = None
    cigar = []
    for r, q in zip(ref, qry):
        if r == q == '-':
            continue
        elif dnautils.equal(r, q):
            op = '='
        elif r == '-':
            op = 'I'
        elif q == '-':
            op = 'D'
        else:
            op = 'X'
        if op != current_op:
            if current_op:
                cigar.append('{}{}'.format(cnt, current_op))
            current_op = op
            cnt = 1
        else:
            cnt += 1
    cigar.append('{}{}'.format(cnt, current_op))
    return ''.join(cigar)
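
As a quick sanity check of get_cigar above (assuming dnautils.equal on single bases reduces to plain equality when no Ns are involved): columns where both strings hold '-' are skipped, and runs of the same operation are run-length encoded.

# ref: A C G T - A        '=' match, 'D' gap in query (deletion),
# qry: A C - T T A        'I' gap in reference (insertion)
# ops: = = D = I =
# Expected: get_cigar('ACGT-A', 'AC-TTA') == '2=1D1=1I1='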
Example 4
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    seqs = session.query(
        Sequence
    ).filter(
        Sequence.locally_aligned.is_(True),
        Sequence.sample_id == sample.id
    ).order_by(Sequence.ai)

    for seq in seqs:
        potential_collapse = session.query(
            Sequence
        ).filter(
            Sequence.sample_id == sample.id,
            Sequence.v_gene == seq.v_gene,
            Sequence.j_gene == seq.j_gene,
            Sequence.cdr3_num_nts == seq.cdr3_num_nts,
        ).order_by(desc(Sequence.copy_number), Sequence.ai)

        for other_seq in potential_collapse:
            if (other_seq.seq_id == seq.seq_id or
                    len(other_seq.sequence) != len(seq.sequence)):
                continue

            if dnautils.equal(other_seq.sequence, seq.sequence):
                other_seq.copy_number += seq.copy_number
                session.delete(seq)
                break

    session.commit()
Example 5
def add_uniques(session,
                sample,
                vdjs,
                realign_len=None,
                realign_mut=None,
                min_similarity=0,
                max_vties=50,
                trim_to=None,
                max_padding=None):
    bucketed_seqs = OrderedDict()
    vdjs = sorted(vdjs, key=lambda v: v.ids[0])
    for vdj in funcs.periodic_commit(session, vdjs):
        try:
            if realign_len is not None:
                vdj.align_to_germline(realign_len, realign_mut, trim_to)
            if vdj.v_match / float(vdj.v_length) < min_similarity:
                raise AlignmentException('V-identity too low {} < {}'.format(
                    vdj.v_match / float(vdj.v_length), min_similarity))
            if len(vdj.v_gene) > max_vties:
                raise AlignmentException('Too many V-ties {} > {}'.format(
                    len(vdj.v_gene), max_vties))
            if max_padding is not None and vdj.pad_length > max_padding:
                raise AlignmentException('Too much padding {} (max {})'.format(
                    vdj.pad_length, max_padding))
            bucket_key = (funcs.format_ties(vdj.v_gene,
                                            vdj.v_germlines.prefix,
                                            strip_alleles=True),
                          funcs.format_ties(vdj.j_gene,
                                            vdj.j_germlines.prefix,
                                            strip_alleles=True), len(vdj.cdr3))
            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if vdj.sequence in bucket:
                bucket[vdj.sequence].ids += vdj.ids
            else:
                bucket[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(vdj.ids[0], traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(session,
                                                   bucketed_seqs.items()):
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.ids), s.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]

                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.ids += smaller.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()
Example 6
def add_uniques(session,
                sample,
                alignments,
                props,
                aligner,
                realign_len=None,
                realign_mut=None):
    bucketed_seqs = OrderedDict()
    alignments = sorted(alignments, key=lambda v: v.sequence.ids[0])
    for alignment in funcs.periodic_commit(session, alignments):
        try:
            if realign_len is not None:
                aligner.align_to_germline(alignment, realign_len, realign_mut)
                if props.trim_to:
                    alignment.trim_to(props.trim_to)

            props.validate(alignment)
            bucket_key = (funcs.format_ties(alignment.v_gene),
                          funcs.format_ties(alignment.j_gene),
                          len(alignment.cdr3))

            if bucket_key not in bucketed_seqs:
                bucketed_seqs[bucket_key] = {}
            bucket = bucketed_seqs[bucket_key]

            if alignment.sequence.sequence in bucket:
                bucket[alignment.sequence.sequence].sequence.ids += (
                    alignment.sequence.ids)
            else:
                bucket[alignment.sequence.sequence] = alignment
        except AlignmentException as e:
            add_as_noresult(session, alignment.sequence, sample, str(e))
        except Exception:
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(alignment.sequence.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    for bucket, sequences in funcs.periodic_commit(session,
                                                   bucketed_seqs.items()):
        sequences = sorted(sequences.values(),
                           key=lambda s:
                           (len(s.sequence.ids), s.sequence.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]

                if dnautils.equal(larger.sequence.sequence,
                                  smaller.sequence.sequence):
                    larger.sequence.ids += smaller.sequence.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()
Example 7
def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    reader = csv.DictReader(handle, delimiter='\t')
    uniques = {}

    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue
        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            for other in uniques.setdefault(
                    len(alignment.sequence.sequence), []):
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                uniques[len(alignment.sequence.sequence)].append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if len(lens) > 0:
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))

    session.commit()
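
One detail worth calling out in read_file above: the inner loop over uniques uses Python's for/else. The else clause runs only when the loop completes without a break, i.e. when no previously seen sequence matched, and only then is the alignment appended as new. The same idiom in isolation:

def dedupe(items):
    uniques = []
    for item in items:
        for kept in uniques:
            if kept == item:
                break           # duplicate found; skips the else below
        else:                   # no break: nothing matched, so it is new
            uniques.append(item)
    return uniques

assert dedupe([1, 2, 1, 3]) == [1, 2, 3]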
Example 8
def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    reader = csv.DictReader(handle, delimiter='\t')
    uniques = {}

    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue
        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            for other in uniques.setdefault(len(alignment.sequence.sequence),
                                            []):
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                uniques[len(alignment.sequence.sequence)].append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if len(lens) > 0:
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))

    session.commit()
Example 9
def process_collapse(sequences):
    sequences = sorted(sequences,
                       key=lambda s:
                       (s.sequence.copy_number, s.sequence.seq_id),
                       reverse=True)
    uniques = []
    while len(sequences) > 0:
        larger = sequences.pop(0)
        for i in reversed(range(len(sequences))):
            smaller = sequences[i]
            if dnautils.equal(larger.sequence.sequence,
                              smaller.sequence.sequence):
                larger.sequence.copy_number += smaller.sequence.copy_number
                del sequences[i]
        uniques.append(larger)
    return uniques
Example 10
def process_collapse(sequences):
    sequences = sorted(
        sequences,
        key=lambda s: (s.sequence.copy_number, s.sequence.seq_id),
        reverse=True
    )
    uniques = []
    while len(sequences) > 0:
        larger = sequences.pop(0)
        for i in reversed(range(len(sequences))):
            smaller = sequences[i]
            if dnautils.equal(larger.sequence.sequence,
                              smaller.sequence.sequence):
                larger.sequence.copy_number += smaller.sequence.copy_number
                del sequences[i]
        uniques.append(larger)
    return uniques
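
Because process_collapse sorts descending by (copy_number, seq_id), the record kept for each group of equal sequences is always the highest-copy one, with ties broken deterministically by seq_id. A hypothetical round trip (make is an illustration-only stub; the real inputs are project objects with a nested .sequence attribute):

from types import SimpleNamespace

def make(seq_id, seq, cn):
    return SimpleNamespace(sequence=SimpleNamespace(
        seq_id=seq_id, sequence=seq, copy_number=cn))

reads = [make('a', 'ACGT', 5), make('b', 'ACNT', 2), make('c', 'TTTT', 1)]
# With N as a wildcard, 'ACNT' collapses into 'ACGT', so
# process_collapse(reads) would keep 'a' (copy_number 5 + 2 = 7) and 'c'.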
Example 11
def collapse_duplicate_alignments(bucket):
    uniques = []
    while bucket:
        alignment = bucket.pop()
        for i, other_alignment in enumerate(bucket):
            if (len(alignment.sequence.sequence) != len(
                    other_alignment.sequence.sequence)):
                logger.warning('Sequence lengths differ {} {}'.format(
                    alignment.sequence.seq_id,
                    other_alignment.sequence.seq_id))
                continue
            if dnautils.equal(alignment.sequence.sequence,
                              other_alignment.sequence.sequence):
                alignment.sequence.copy_number += (
                    other_alignment.sequence.copy_number)
                bucket.pop(i)
        uniques.append(alignment)
    return uniques
Example 12
def collapse_duplicates(bucket):
    uniques = []
    while bucket:
        alignment = bucket.pop()
        for i, other_alignment in enumerate(bucket):
            if (len(alignment.sequence.sequence) !=
                    len(other_alignment.sequence.sequence)):
                logger.warning('Sequence lengths differ {} {}'.format(
                    alignment.sequence.seq_id,
                    other_alignment.sequence.seq_id)
                )
                continue
            if dnautils.equal(alignment.sequence.sequence,
                              other_alignment.sequence.sequence):
                alignment.sequence.copy_number += (
                    other_alignment.sequence.copy_number
                )
                bucket.pop(i)
        uniques.append(alignment)
    return uniques
Example 13
    def tcell_clones(self, bucket):
        updates = []
        clones = OrderedDict()
        consensus_needed = set([])

        for seq in self.get_query(bucket, False):
            key = (seq.v_gene, seq.j_gene, seq.cdr3_nt)
            if key in clones:
                clone = clones[key]
            else:
                for test_clone in clones.values():
                    same_bin = (test_clone.v_gene == key[0] and
                                test_clone.j_gene == key[1] and
                                test_clone.cdr3_num_nts == len(key[2]))
                    if same_bin and dnautils.equal(test_clone.cdr3_nt, key[2]):
                        clone = test_clone
                        break
                else:
                    new_clone = Clone(subject_id=seq.subject_id,
                                      v_gene=seq.v_gene,
                                      j_gene=seq.j_gene,
                                      cdr3_nt=seq.cdr3_nt,
                                      cdr3_num_nts=seq.cdr3_num_nts,
                                      _insertions=seq._insertions,
                                      _deletions=seq._deletions)
                    clones[key] = new_clone
                    self._session.add(new_clone)
                    self._session.flush()
                    clone = new_clone
                    consensus_needed.add(new_clone.id)
            updates.append({
                'sample_id': seq.sample_id,
                'ai': seq.ai,
                'clone_id': clone.id
            })

        if len(updates) > 0:
            self._session.bulk_update_mappings(Sequence, updates)
        generate_consensus(self._session, consensus_needed)
Example 14
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    all_seqs = session.query(Sequence).filter(
        Sequence.sample == sample).order_by(Sequence.copy_number.desc())

    buckets = {}
    for seq in all_seqs:
        key = (seq.v_gene, seq.j_gene, seq.cdr3_num_nts, seq._insertions,
               seq._deletions)
        buckets.setdefault(key, []).append(seq)

    for bucket in buckets.values():
        while len(bucket) > 0:
            larger = bucket.pop(0)
            for i in reversed(range(len(bucket))):
                smaller = bucket[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.copy_number += smaller.copy_number
                    session.delete(smaller)
                    del bucket[i]

    session.commit()
Example 15
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    seqs = session.query(
        Sequence.ai, Sequence.seq_id, Sequence.v_gene, Sequence.j_gene,
        Sequence.cdr3_num_nts, Sequence.copy_number, Sequence.sequence).filter(
            Sequence.locally_aligned.is_(True),
            Sequence.sample_id == sample.id).order_by(Sequence.ai)

    for seq in seqs:
        potential_collapse = session.query(
            Sequence.ai, Sequence.sequence, Sequence.copy_number).filter(
                Sequence.sample_id == sample.id,
                Sequence.v_gene == seq.v_gene,
                Sequence.j_gene == seq.j_gene,
                Sequence.cdr3_num_nts == seq.cdr3_num_nts,
            ).order_by(desc(Sequence.copy_number), Sequence.ai)

        for other_seq in potential_collapse:
            if (other_seq.ai == seq.ai
                    or len(other_seq.sequence) != len(seq.sequence)):
                continue

            if dnautils.equal(other_seq.sequence, seq.sequence):
                session.query(DuplicateSequence).filter(
                    DuplicateSequence.duplicate_seq_ai == seq.ai).update(
                        {
                            'duplicate_seq_ai': other_seq.ai,
                        },
                        synchronize_session=False)
                session.add(
                    DuplicateSequence(seq_id=seq.seq_id,
                                      duplicate_seq_ai=other_seq.ai,
                                      sample_id=sample.id))
                session.query(Sequence).filter(Sequence.ai == seq.ai).delete()
                break

    session.commit()
Example 16
def remove_duplicates(session, sample):
    logger.info('Removing duplicates from sample {}'.format(sample.id))
    all_seqs = session.query(Sequence).filter(
        Sequence.sample == sample
    ).order_by(
        Sequence.copy_number.desc()
    )

    buckets = {}
    for seq in all_seqs:
        key = (seq.v_gene, seq.j_gene, seq.cdr3_num_nts)
        buckets.setdefault(key, []).append(seq)

    for bucket in buckets.values():
        while len(bucket) > 0:
            larger = bucket.pop(0)
            for i in reversed(range(len(bucket))):
                smaller = bucket[i]
                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.copy_number += smaller.copy_number
                    session.delete(smaller)
                    del bucket[i]

    session.commit()
Example 17
    def do_task(self, bucket):
        seqs = self._session.query(
            Sequence.sample_id, Sequence.ai, Sequence.seq_id,
            Sequence.sequence, Sequence.copy_number).filter(
                Sequence.subject_id == bucket.subject_id,
                Sequence.v_gene == bucket.v_gene,
                Sequence.j_gene == bucket.j_gene,
                Sequence.cdr3_num_nts == bucket.cdr3_num_nts,
                Sequence._insertions == bucket._insertions,
                Sequence._deletions == bucket._deletions)

        to_process = sorted([{
            'sample_id': s.sample_id,
            'ai': s.ai,
            'seq_id': s.seq_id,
            'sequence': s.sequence,
            'cn': s.copy_number
        } for s in seqs], key=lambda e: -e['cn'])

        while len(to_process) > 0:
            # Get the largest sequence in the list
            larger = to_process.pop(0)
            # Iterate over all smaller sequences to find matches
            instances = 1
            samples = set([larger['sample_id']])
            for i in reversed(range(len(to_process))):
                smaller = to_process[i]
                if len(larger['sequence']) != len(smaller['sequence']):
                    self.warning('Tried to collapse sequences of different '
                                 'lengths.  AIs are {} {}'.format(
                                     larger['ai'], smaller['ai']))
                elif dnautils.equal(larger['sequence'], smaller['sequence']):
                    # Add the smaller sequence's copy number to the larger
                    larger['cn'] += smaller['cn']
                    # If the smaller sequence matches the larger, collapse it
                    # to the larger
                    self._session.add(SequenceCollapse(**{
                        'sample_id': smaller['sample_id'],
                        'seq_ai': smaller['ai'],
                        'collapse_to_subject_seq_ai': larger['ai'],
                        'collapse_to_subject_sample_id': larger['sample_id'],
                        'collapse_to_subject_seq_id': larger['seq_id'],
                        'instances_in_subject': 0,
                        'copy_number_in_subject': 0,
                        'samples_in_subject': 0,
                    }))
                    instances += 1
                    samples.add(smaller['sample_id'])
                    # Delete the smaller sequence from the list to process
                    # since it's been collapsed
                    del to_process[i]

            # Update the larger sequence's copy number and "collapse" to itself
            self._session.add(
                SequenceCollapse(
                    **{
                        'sample_id': larger['sample_id'],
                        'seq_ai': larger['ai'],
                        'collapse_to_subject_sample_id': larger['sample_id'],
                        'collapse_to_subject_seq_id': larger['seq_id'],
                        'collapse_to_subject_seq_ai': larger['ai'],
                        'instances_in_subject': instances,
                        'copy_number_in_subject': larger['cn'],
                        'samples_in_subject': len(samples),
                    }))

        self._session.commit()
        self._tasks += 1
        if self._tasks > 0 and self._tasks % 100 == 0:
            self.info('Collapsed {} buckets'.format(self._tasks))
Example 18
    def _check_j_with_missing(self, sequence, match):
        # + 1 makes the final window position inclusive, so a match ending
        # flush with the end of `sequence` is still checked.
        for pos in range(len(sequence) - len(match) + 1):
            ss = sequence[pos:pos + len(match)]
            if dnautils.equal(ss, match):
                return pos
        return -1
Example 19
    def do_task(self, bucket):
        seqs = self._session.query(
            Sequence.sample_id, Sequence.ai, Sequence.seq_id,
            Sequence.sequence, Sequence.copy_number
        ).filter(
            Sequence.subject_id == bucket.subject_id,
            Sequence.v_gene == bucket.v_gene,
            Sequence.j_gene == bucket.j_gene,
            Sequence.cdr3_num_nts == bucket.cdr3_num_nts,
            Sequence._insertions == bucket._insertions,
            Sequence._deletions == bucket._deletions
        )

        to_process = sorted([{
            'sample_id': s.sample_id,
            'ai': s.ai,
            'seq_id': s.seq_id,
            'sequence': s.sequence,
            'cn': s.copy_number
        } for s in seqs], key=lambda e: -e['cn'])

        while len(to_process) > 0:
            # Get the largest sequence in the list
            larger = to_process.pop(0)
            # Iterate over all smaller sequences to find matches
            instances = 1
            samples = set([larger['sample_id']])
            for i in reversed(range(len(to_process))):
                smaller = to_process[i]
                if len(larger['sequence']) != len(smaller['sequence']):
                    self.warning('Tried to collapse sequences of different '
                                 'lengths.  AIs are {} {}'.format(
                                     larger['ai'], smaller['ai']))
                elif dnautils.equal(larger['sequence'], smaller['sequence']):
                    # Add the smaller sequence's copy number to the larger
                    larger['cn'] += smaller['cn']
                    # If the smaller sequence matches the larger, collapse it
                    # to the larger
                    self._session.add(SequenceCollapse(**{
                        'sample_id': smaller['sample_id'],
                        'seq_ai': smaller['ai'],
                        'collapse_to_subject_seq_ai': larger['ai'],
                        'collapse_to_subject_sample_id': larger['sample_id'],
                        'collapse_to_subject_seq_id': larger['seq_id'],
                        'instances_in_subject': 0,
                        'copy_number_in_subject': 0,
                        'samples_in_subject': 0,
                    }))
                    instances += 1
                    samples.add(smaller['sample_id'])
                    # Delete the smaller sequence from the list to process
                    # since it's been collapsed
                    del to_process[i]

            # Update the larger sequence's copy number and "collapse" to itself
            self._session.add(SequenceCollapse(**{
                'sample_id': larger['sample_id'],
                'seq_ai': larger['ai'],
                'collapse_to_subject_sample_id': larger['sample_id'],
                'collapse_to_subject_seq_id': larger['seq_id'],
                'collapse_to_subject_seq_ai': larger['ai'],
                'instances_in_subject': instances,
                'copy_number_in_subject': larger['cn'],
                'samples_in_subject': len(samples),
            }))

        self._session.commit()
        self._tasks += 1
        if self._tasks > 0 and self._tasks % 100 == 0:
            self.info('Collapsed {} buckets'.format(self._tasks))
Example 20
def can_subclone(sub_clone, parent_clone):
    return dnautils.equal(parent_clone.cdr3_aa.replace('X', '-'),
                          sub_clone.cdr3_aa.replace('X', '-'))
Example 21
def sliding_window_match(sequence, match):
    # + 1 makes the final window position inclusive, so a match ending
    # flush with the end of `sequence` is still checked.
    for pos in range(len(sequence) - len(match) + 1):
        ss = sequence[pos:pos + len(match)]
        if dnautils.equal(ss, match):
            return pos
    return -1
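
A quick check of the window bound (the + 1 above makes the final position inclusive, so a match ending flush with the end of the sequence is still found):

# Expected: sliding_window_match('AACGT', 'CGT') == 2   (0-based start)
# Expected: sliding_window_match('AACGT', 'GGG') == -1  (no match)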