def aggregate_vties(aggregate_queue):
    """Group worker results by V/J tie set and CDR3 length.

    Successful alignments are bucketed on ``(V ties, J ties, CDR3 length)``.
    Inside a bucket, alignments sharing the same sequence are merged by adding
    the newcomer's copy number onto the first alignment seen.  Results with
    status ``'noresult'`` are passed through untouched and results with status
    ``'error'`` are only logged.

    :param aggregate_queue: iterable of result dicts; each has a ``'status'``
        key and, for ``'success'`` and ``'error'``, an ``'alignment'`` key
    :returns: dict with ``'success'`` (one list of alignments per bucket) and
        ``'noresult'`` (list of the original result dicts)
    """
    buckets = {}
    noresults = []
    for result in aggregate_queue:
        status = result['status']
        if status == 'success':
            alignment = result['alignment']
            bucket_key = (
                funcs.format_ties(alignment.v_gene),
                funcs.format_ties(alignment.j_gene),
                len(alignment.cdr3)
            )
            bucket = buckets.setdefault(bucket_key, {})
            raw_seq = alignment.sequence.sequence
            if raw_seq in bucket:
                # Same read seen again: fold its copies into the first one.
                bucket[raw_seq].sequence.copy_number += (
                    alignment.sequence.copy_number
                )
            else:
                bucket[raw_seq] = alignment
        elif status == 'noresult':
            noresults.append(result)
        elif status == 'error':
            # The message has two placeholders; supplying only the sequence
            # id made str.format raise IndexError instead of logging.
            # NOTE(review): 'reason' is assumed to be where the producer puts
            # the error detail -- confirm; a fallback text is used otherwise.
            logger.error(
                'Unexpected error processing sequence {}\n\t{}'.format(
                    result['alignment'].sequence.seq_id,
                    result.get('reason', 'no further details available')))

    return {
        # Materialize each bucket so callers get real lists rather than
        # dict views tied to the internal bucket dicts.
        'success': [list(b.values()) for b in buckets.values()],
        'noresult': noresults,
    }
Ejemplo n.º 2
0
def aggregate_vties(aggregate_queue):
    """Bucket alignment results and collapse identical sequences.

    Each ``'success'`` result is filed under the key
    ``(formatted V ties, formatted J ties, len(cdr3))``.  When a bucket
    already holds an alignment with the same sequence, only the copy number
    of the stored alignment is increased.  ``'noresult'`` results are
    collected as-is; ``'error'`` results are logged and dropped.

    :param aggregate_queue: iterable of result dicts keyed by ``'status'``
    :returns: ``{'success': [[alignment, ...], ...], 'noresult': [...]}``
    """
    bucketed_seqs = {'success': {}, 'noresult': []}
    for result in aggregate_queue:
        if result['status'] == 'success':
            alignment = result['alignment']
            bucket_key = (funcs.format_ties(alignment.v_gene),
                          funcs.format_ties(alignment.j_gene),
                          len(alignment.cdr3))

            bucket = bucketed_seqs['success'].setdefault(bucket_key, {})
            if alignment.sequence.sequence in bucket:
                bucket[alignment.sequence.sequence].sequence.copy_number += (
                    alignment.sequence.copy_number)
            else:
                bucket[alignment.sequence.sequence] = alignment
        elif result['status'] == 'noresult':
            bucketed_seqs['noresult'].append(result)
        elif result['status'] == 'error':
            # Two placeholders need two arguments; with only the sequence id
            # str.format raised IndexError and the error was never logged.
            # NOTE(review): assumes the failure detail, when present, is
            # stored under 'reason' -- verify against the producer.
            logger.error(
                'Unexpected error processing sequence {}\n\t{}'.format(
                    result['alignment'].sequence.seq_id,
                    result.get('reason', 'no further details available')))

    # Turn every bucket into a plain list so the result does not expose
    # dict views of the temporary bucket dicts.
    bucketed_seqs['success'] = [
        list(b.values()) for b in bucketed_seqs['success'].values()
    ]
    return bucketed_seqs
Ejemplo n.º 3
0
def _corrected_sequence_fields(alignment):
    """Return the column values written back for a re-aligned Sequence row."""
    return {
        'partial': alignment.partial,
        'probable_indel_or_misalign': alignment.has_possible_indel,
        'v_gene': format_ties(alignment.v_gene),
        'j_gene': format_ties(alignment.j_gene),
        'num_gaps': alignment.num_gaps,
        'seq_start': alignment.seq_start,
        'v_match': alignment.v_match,
        'v_length': alignment.v_length,
        'j_match': alignment.j_match,
        'j_length': alignment.j_length,
        'removed_prefix': alignment.sequence.removed_prefix_sequence,
        'removed_prefix_qual': alignment.sequence.removed_prefix_quality,
        'v_mutation_fraction': alignment.v_mutation_fraction,
        'pre_cdr3_length': alignment.pre_cdr3_length,
        'pre_cdr3_match': alignment.pre_cdr3_match,
        'post_cdr3_length': alignment.post_cdr3_length,
        'post_cdr3_match': alignment.post_cdr3_match,
        'in_frame': alignment.in_frame,
        'functional': alignment.functional,
        'stop': alignment.stop,
        'cdr3_nt': alignment.cdr3,
        'cdr3_num_nts': len(alignment.cdr3),
        'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
        'sequence': str(alignment.sequence.sequence),
        'quality': alignment.sequence.quality,
        'locally_aligned': alignment.locally_aligned,
        '_insertions': serialize_gaps(alignment.insertions),
        '_deletions': serialize_gaps(alignment.deletions),
        'germline': alignment.germline,
    }


def add_sequences_from_sample(session, sample, sequences, props):
    """Write corrected alignments for ``sample`` back to the database.

    Every entry of ``sequences`` is a mapping with ``'alignment'``,
    ``'r_type'`` and ``'pk'``.  Entries whose alignment fails
    ``props.validate`` (``AlignmentException``) are skipped.  For a
    ``'NoResult'`` entry the alignment is added through ``add_sequences`` and
    the ``NoResult`` row with that ``pk`` is deleted; for a ``'Sequence'``
    entry the ``Sequence`` row whose ``ai`` equals ``pk`` is updated in place.
    A ``ValueError`` raised while handling an entry skips that entry.
    """
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))

    for entry in periodic_commit(session, sequences):
        alignment = entry['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue

            r_type = entry['r_type']
            if r_type == 'NoResult':
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                stale = session.query(NoResult).filter(
                    NoResult.pk == entry['pk'])
                stale.delete(synchronize_session=False)
            elif r_type == 'Sequence':
                fields = _corrected_sequence_fields(alignment)
                # Instantiating the model persists nothing; it only runs the
                # model's field validation before the bulk UPDATE below.
                Sequence(**fields)

                target = session.query(Sequence).filter(
                    Sequence.ai == entry['pk'])
                target.update(fields, synchronize_session=False)
        except ValueError:
            continue
Ejemplo n.º 4
0
def add_uniques(session,
                sample,
                vdjs,
                realign_len=None,
                realign_mut=None,
                min_similarity=0,
                max_vties=50,
                trim_to=None,
                max_padding=None):
    """Filter, de-duplicate and store the VDJ alignments of one sample.

    The alignments are processed in order of their first id.  Each one is
    optionally re-aligned (when ``realign_len`` is given) and then rejected
    with an ``AlignmentException`` -- and stored through ``add_as_noresult``
    -- if its V identity is below ``min_similarity``, it has more than
    ``max_vties`` V ties, or its padding exceeds ``max_padding``.  Survivors
    are bucketed on ``(V ties, J ties, CDR3 length)`` and exact duplicates
    inside a bucket have their ids merged.  Finally, sequences in a bucket
    that ``dnautils.equal`` treats as equal are collapsed into the one with
    the most ids and stored through ``add_as_sequence``.

    :param session: database session; committed at the end
    :param sample: sample the sequences belong to
    :param vdjs: iterable of alignment objects
    :param realign_len: forwarded to ``align_to_germline``; ``None`` skips
        re-alignment
    :param realign_mut: forwarded to ``align_to_germline``
    :param min_similarity: minimum ``v_match / v_length`` ratio
    :param max_vties: maximum number of tied V genes
    :param trim_to: forwarded to ``align_to_germline``
    :param max_padding: maximum ``pad_length``; ``None`` disables the check
    """
    bucketed_seqs = OrderedDict()
    vdjs = sorted(vdjs, key=lambda v: v.ids[0])
    for vdj in funcs.periodic_commit(session, vdjs):
        try:
            if realign_len is not None:
                vdj.align_to_germline(realign_len, realign_mut, trim_to)

            v_identity = vdj.v_match / float(vdj.v_length)
            if v_identity < min_similarity:
                raise AlignmentException('V-identity too low {} < {}'.format(
                    v_identity, min_similarity))
            if len(vdj.v_gene) > max_vties:
                raise AlignmentException('Too many V-ties {} > {}'.format(
                    len(vdj.v_gene), max_vties))
            if max_padding is not None and vdj.pad_length > max_padding:
                raise AlignmentException('Too much padding {} (max {})'.format(
                    vdj.pad_length, max_padding))

            bucket_key = (funcs.format_ties(vdj.v_gene,
                                            vdj.v_germlines.prefix,
                                            strip_alleles=True),
                          funcs.format_ties(vdj.j_gene,
                                            vdj.j_germlines.prefix,
                                            strip_alleles=True), len(vdj.cdr3))
            bucket = bucketed_seqs.setdefault(bucket_key, {})

            if vdj.sequence in bucket:
                bucket[vdj.sequence].ids += vdj.ids
            else:
                bucket[vdj.sequence] = vdj
        except AlignmentException as e:
            add_as_noresult(session, vdj, sample, str(e))
        except Exception:
            # Deliberately broad so one bad read cannot abort the sample, but
            # no longer a bare ``except:`` -- KeyboardInterrupt and SystemExit
            # must still propagate.
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(vdj.ids[0], traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    # ``items()`` instead of ``iteritems()`` so this runs on Python 2 and 3.
    for _, sequences in funcs.periodic_commit(session,
                                              bucketed_seqs.items()):
        # Most-supported sequence first; ties broken by first id.
        sequences = sorted(sequences.values(),
                           key=lambda s: (len(s.ids), s.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            # Walk backwards so deleting index i never shifts an index that
            # is still to be visited.
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]

                if dnautils.equal(larger.sequence, smaller.sequence):
                    larger.ids += smaller.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()
Ejemplo n.º 5
0
def aggregate_results(results, session, sample):
    """Group successful alignments and persist the unalignable reads.

    Successful results are grouped on ``(V ties, J ties, cdr3_num_nts,
    insertions, deletions)``.  Each ``'noresult'`` result is stored once per
    copy through ``add_noresults_for_vdj``, with the copy index appended to
    the original sequence id (``<id>_<n>``).

    :param results: sized collection of result dicts with a ``'status'`` key
    :param session: database session; committed periodically and at the end
    :param sample: sample the reads belong to
    :returns: dict mapping each grouping key to its list of alignments
    """
    commit_every = 1000

    success = [r for r in results if r['status'] == 'success']
    noresults = [r for r in results if r['status'] == 'noresult']
    logger.info('{} total sequences ({} alignments, {} noresults)'.format(
        len(results), len(success), len(noresults)))

    alignments = {}
    for result in success:
        alignment = result['alignment']
        key = (funcs.format_ties(alignment.v_gene),
               funcs.format_ties(alignment.j_gene), alignment.cdr3_num_nts,
               tuple(alignment.insertions), tuple(alignment.deletions))
        alignments.setdefault(key, []).append(alignment)

    # Copies written since the last commit.  The previous
    # ``copies % 1000 == 0`` test only fired when the running total landed
    # exactly on a multiple of 1000, which reads with copy_number > 1 can
    # step over, and which is also true while the total is still 0.
    pending = 0
    for result in noresults:
        vdj = result['vdj']
        orig_id = vdj.seq_id
        for copy_idx in range(vdj.copy_number):
            # As before, seq_id is left at the last suffixed value afterwards.
            vdj.seq_id = '{}_{}'.format(orig_id, copy_idx)
            add_noresults_for_vdj(session, vdj, sample, result['reason'])
        pending += vdj.copy_number
        if pending >= commit_every:
            session.commit()
            pending = 0

    session.commit()
    return alignments
Ejemplo n.º 6
0
def add_as_sequence(session, alignment, sample, error_action='discard'):
    """Store ``alignment`` as a ``Sequence`` row plus its duplicate ids.

    The first id of the read becomes the ``Sequence``; every further id is
    saved as a ``DuplicateSequence`` pointing at it.  A ``ValueError`` while
    saving the duplicates is ignored.

    :param error_action: what to do when building or flushing the
        ``Sequence`` raises ``ValueError``: ``'discard'`` records the read via
        ``add_as_noresult``, ``'raise'`` re-raises the error
    :returns: the new ``Sequence``, or ``None`` when it was not created
    """
    try:
        read = alignment.sequence
        columns = {
            'seq_id': read.ids[0],
            'sample_id': sample.id,
            'subject_id': sample.subject.id,
            'partial': alignment.partial,
            'probable_indel_or_misalign': alignment.has_possible_indel,
            'v_gene': funcs.format_ties(alignment.v_gene),
            'j_gene': funcs.format_ties(alignment.j_gene),
            'num_gaps': alignment.num_gaps,
            'seq_start': alignment.seq_start,
            'v_match': alignment.v_match,
            'v_length': alignment.v_length,
            'j_match': alignment.j_match,
            'j_length': alignment.j_length,
            'removed_prefix': read.removed_prefix_sequence,
            'removed_prefix_qual': read.removed_prefix_quality,
            'v_mutation_fraction': alignment.v_mutation_fraction,
            'pre_cdr3_length': alignment.pre_cdr3_length,
            'pre_cdr3_match': alignment.pre_cdr3_match,
            'post_cdr3_length': alignment.post_cdr3_length,
            'post_cdr3_match': alignment.post_cdr3_match,
            'in_frame': alignment.in_frame,
            'functional': alignment.functional,
            'stop': alignment.stop,
            'copy_number': len(read.ids),
            'cdr3_nt': alignment.cdr3,
            'cdr3_num_nts': len(alignment.cdr3),
            'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
            'sequence': str(read.sequence),
            'quality': read.quality,
            'locally_aligned': alignment.locally_aligned,
            'insertions': alignment.insertions,
            'deletions': alignment.deletions,
            'germline': alignment.germline,
        }
        seq = Sequence(**columns)
        session.add(seq)
        # Flush so seq.ai is available for the duplicate rows below.
        session.flush()

        # Add duplicate sequences
        try:
            duplicates = [
                DuplicateSequence(sample_id=sample.id,
                                  seq_id=dup_id,
                                  duplicate_seq_ai=seq.ai)
                for dup_id in alignment.sequence.ids[1:]
            ]
            session.bulk_save_objects(duplicates)
        except ValueError:
            pass
        return seq
    except ValueError as e:
        if error_action == 'raise':
            raise e
        if error_action == 'discard':
            add_as_noresult(session, alignment.sequence, sample, str(e))
        return None
Ejemplo n.º 7
0
def add_uniques(session,
                sample,
                alignments,
                props,
                aligner,
                realign_len=None,
                realign_mut=None):
    """Validate, de-duplicate and store the alignments of one sample.

    Alignments are processed in order of their first sequence id.  When
    ``realign_len`` is given each one is re-aligned with ``aligner`` and, if
    ``props.trim_to`` is set, trimmed.  ``props.validate`` then decides
    whether it is kept; an ``AlignmentException`` stores the read through
    ``add_as_noresult``.  Kept alignments are bucketed on ``(V ties, J ties,
    CDR3 length)`` and exact duplicates have their ids merged.  Finally,
    sequences in a bucket that ``dnautils.equal`` treats as equal are
    collapsed into the one with the most ids and stored through
    ``add_as_sequence``.

    :param session: database session; committed at the end
    :param sample: sample the alignments belong to
    :param alignments: iterable of alignment objects
    :param props: object providing ``validate`` and ``trim_to``
    :param aligner: object providing ``align_to_germline``
    :param realign_len: forwarded to ``align_to_germline``; ``None`` skips
        re-alignment
    :param realign_mut: forwarded to ``align_to_germline``
    """
    bucketed_seqs = OrderedDict()
    alignments = sorted(alignments, key=lambda v: v.sequence.ids[0])
    for alignment in funcs.periodic_commit(session, alignments):
        try:
            if realign_len is not None:
                aligner.align_to_germline(alignment, realign_len, realign_mut)
                if props.trim_to:
                    alignment.trim_to(props.trim_to)

            props.validate(alignment)
            bucket_key = (funcs.format_ties(alignment.v_gene),
                          funcs.format_ties(alignment.j_gene),
                          len(alignment.cdr3))
            bucket = bucketed_seqs.setdefault(bucket_key, {})

            if alignment.sequence.sequence in bucket:
                bucket[alignment.sequence.sequence].sequence.ids += (
                    alignment.sequence.ids)
            else:
                bucket[alignment.sequence.sequence] = alignment
        except AlignmentException as e:
            add_as_noresult(session, alignment.sequence, sample, str(e))
        except Exception:
            # Broad on purpose: one bad read must not abort the whole sample.
            logger.error('\tUnexpected error processing sequence '
                         '{}\n\t{}'.format(alignment.sequence.ids[0],
                                           traceback.format_exc()))

    # Collapse sequences that are the same except for Ns
    # ``items()`` instead of ``iteritems()`` so this runs on Python 2 and 3.
    for _, sequences in funcs.periodic_commit(session,
                                              bucketed_seqs.items()):
        # Most-supported sequence first; ties broken by first id.
        sequences = sorted(sequences.values(),
                           key=lambda s:
                           (len(s.sequence.ids), s.sequence.ids[0]),
                           reverse=True)
        while len(sequences) > 0:
            larger = sequences.pop(0)
            # Walk backwards so deleting index i never shifts an index that
            # is still to be visited.
            for i in reversed(range(len(sequences))):
                smaller = sequences[i]

                if dnautils.equal(larger.sequence.sequence,
                                  smaller.sequence.sequence):
                    larger.sequence.ids += smaller.sequence.ids
                    del sequences[i]
            add_as_sequence(session, larger, sample)
    session.commit()
Ejemplo n.º 8
0
def add_as_sequence(session, vdj, sample):
    """Store ``vdj`` as a ``Sequence`` row and its extra ids as duplicates.

    The first id becomes the ``Sequence``; the remaining ids are saved as
    ``DuplicateSequence`` rows referencing it.  A ``ValueError`` while saving
    the duplicates is ignored; a ``ValueError`` while creating or flushing
    the ``Sequence`` records the read through ``add_as_noresult`` instead.
    """
    try:
        ids = vdj.ids
        seq = Sequence(
            seq_id=ids[0],
            sample_id=sample.id,
            subject_id=sample.subject.id,
            partial=vdj.partial,
            probable_indel_or_misalign=vdj.has_possible_indel,
            v_gene=funcs.format_ties(
                vdj.v_gene, vdj.v_germlines.prefix, strip_alleles=True),
            j_gene=funcs.format_ties(
                vdj.j_gene, vdj.j_germlines.prefix, strip_alleles=True),
            num_gaps=vdj.num_gaps,
            pad_length=vdj.pad_length,
            v_match=vdj.v_match,
            v_length=vdj.v_length,
            j_match=vdj.j_match,
            j_length=vdj.j_length,
            removed_prefix=vdj.removed_prefix,
            removed_prefix_qual=vdj.removed_prefix_qual,
            v_mutation_fraction=vdj.mutation_fraction,
            pre_cdr3_length=vdj.pre_cdr3_length,
            pre_cdr3_match=vdj.pre_cdr3_match,
            post_cdr3_length=vdj.post_cdr3_length,
            post_cdr3_match=vdj.post_cdr3_match,
            in_frame=vdj.in_frame,
            functional=vdj.functional,
            stop=vdj.stop,
            copy_number=len(vdj.ids),
            cdr3_nt=vdj.cdr3,
            cdr3_num_nts=len(vdj.cdr3),
            cdr3_aa=lookups.aas_from_nts(vdj.cdr3),
            sequence=str(vdj.sequence),
            quality=vdj.quality,
            germline=vdj.germline,
        )
        session.add(seq)
        # Flush so seq.ai is available for the duplicate rows below.
        session.flush()

        # Add duplicate sequences
        try:
            extra_rows = [
                DuplicateSequence(sample_id=sample.id,
                                  seq_id=dup_id,
                                  duplicate_seq_ai=seq.ai)
                for dup_id in vdj.ids[1:]
            ]
            session.bulk_save_objects(extra_rows)
        except ValueError:
            pass
    except ValueError as err:
        add_as_noresult(session, vdj, sample, str(err))
Ejemplo n.º 9
0
def get_seq_from_alignment(session, alignment, sample, strip_alleles=True):
    """Build the database record for ``alignment`` without adding it.

    :returns: a one-element list holding a new ``Sequence``; if constructing
        it raises ``ValueError``, a one-element list holding the result of
        ``get_noresult_from_vdj`` for the read; if that raises ``ValueError``
        as well, an empty list
    """
    try:
        read = alignment.sequence
        record = Sequence(
            seq_id=read.seq_id,
            sample_id=sample.id,
            subject_id=sample.subject.id,
            partial=alignment.partial,
            rev_comp=read.rev_comp,
            probable_indel_or_misalign=alignment.has_possible_indel,
            v_gene=funcs.format_ties(alignment.v_gene, strip_alleles),
            j_gene=funcs.format_ties(alignment.j_gene, strip_alleles),
            num_gaps=alignment.num_gaps,
            seq_start=alignment.seq_start,
            v_match=alignment.v_match,
            v_length=alignment.v_length,
            j_match=alignment.j_match,
            j_length=alignment.j_length,
            removed_prefix=read.removed_prefix_sequence,
            removed_prefix_qual=read.removed_prefix_quality,
            v_mutation_fraction=alignment.v_mutation_fraction,
            pre_cdr3_length=alignment.pre_cdr3_length,
            pre_cdr3_match=alignment.pre_cdr3_match,
            post_cdr3_length=alignment.post_cdr3_length,
            post_cdr3_match=alignment.post_cdr3_match,
            in_frame=alignment.in_frame,
            functional=alignment.functional,
            stop=alignment.stop,
            copy_number=read.copy_number,
            cdr3_nt=alignment.cdr3,
            cdr3_num_nts=len(alignment.cdr3),
            cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
            sequence=str(read.sequence),
            quality=read.quality,
            locally_aligned=alignment.locally_aligned,
            insertions=alignment.insertions,
            deletions=alignment.deletions,
            germline=alignment.germline,
        )
    except ValueError as err:
        # The Sequence could not be built; fall back to a no-result record
        # carrying the error text.
        try:
            fallback = get_noresult_from_vdj(session, alignment.sequence,
                                             sample, str(err))
        except ValueError:
            return []
        return [fallback]
    return [record]
Ejemplo n.º 10
0
def aggregate_results(results, session, sample):
    """Group successful alignments and store the reads without a result.

    ``'success'`` results are grouped on ``(V ties, J ties, cdr3_num_nts,
    insertions, deletions)``; ``'noresult'`` results are handed to
    ``add_noresults_for_vdj``; any other status is ignored.  The session is
    committed once at the end.

    :returns: dict mapping each grouping key to its list of alignments
    """
    grouped = {}
    for res in results:
        status = res['status']
        if status == 'noresult':
            add_noresults_for_vdj(session, res['vdj'], sample, res['reason'])
            continue
        if status != 'success':
            continue

        aln = res['alignment']
        group_key = (funcs.format_ties(aln.v_gene),
                     funcs.format_ties(aln.j_gene),
                     aln.cdr3_num_nts,
                     tuple(aln.insertions),
                     tuple(aln.deletions))
        grouped.setdefault(group_key, []).append(aln)

    session.commit()
    return grouped
Ejemplo n.º 11
0
def get_formatted_ties(genes):
    """Return a copy of ``genes`` re-keyed by ``format_ties(key)``.

    :param genes: dict mapping a tie collection to a sequence
    :returns: dict mapping each formatted tie string to the same value; when
        two keys format to the same string the one iterated last wins
    """
    res = {}
    # ``items()`` rather than ``iteritems()``: the latter does not exist on
    # Python 3 dicts, while ``items()`` works on both Python 2 and 3.
    for ties, seq in genes.items():
        res[format_ties(ties)] = seq
    return res
Ejemplo n.º 12
0
def add_sequences_from_sample(session, sample, sequences, props):
    """Apply corrected alignments to the stored rows of ``sample``.

    Each record in ``sequences`` provides ``'alignment'``, ``'r_type'`` and
    ``'pk'``.  Records failing ``props.validate`` (``AlignmentException``)
    are skipped.  A ``'NoResult'`` record is added through ``add_sequences``
    and its ``NoResult`` row is deleted; a ``'Sequence'`` record has its
    ``Sequence`` row (matched on ``ai == pk``) updated with the new values.
    A ``ValueError`` raised while handling a record skips that record.
    """
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for record in periodic_commit(session, sequences):
        alignment = record['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                continue

            kind = record['r_type']
            if kind == 'NoResult':
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == record['pk']
                ).delete(synchronize_session=False)
                continue
            if kind != 'Sequence':
                continue

            fields = dict(
                partial=alignment.partial,
                probable_indel_or_misalign=alignment.has_possible_indel,
                v_gene=format_ties(alignment.v_gene),
                j_gene=format_ties(alignment.j_gene),
                num_gaps=alignment.num_gaps,
                seq_start=alignment.seq_start,
                v_match=alignment.v_match,
                v_length=alignment.v_length,
                j_match=alignment.j_match,
                j_length=alignment.j_length,
                removed_prefix=alignment.sequence.removed_prefix_sequence,
                removed_prefix_qual=alignment.sequence.removed_prefix_quality,
                v_mutation_fraction=alignment.v_mutation_fraction,
                pre_cdr3_length=alignment.pre_cdr3_length,
                pre_cdr3_match=alignment.pre_cdr3_match,
                post_cdr3_length=alignment.post_cdr3_length,
                post_cdr3_match=alignment.post_cdr3_match,
                in_frame=alignment.in_frame,
                functional=alignment.functional,
                stop=alignment.stop,
                cdr3_nt=alignment.cdr3,
                cdr3_num_nts=len(alignment.cdr3),
                cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
                sequence=str(alignment.sequence.sequence),
                quality=alignment.sequence.quality,
                locally_aligned=alignment.locally_aligned,
                _insertions=serialize_gaps(alignment.insertions),
                _deletions=serialize_gaps(alignment.deletions),
                germline=alignment.germline,
            )
            # Nothing is written by this constructor call; it exists only so
            # the model validates the values before the UPDATE is issued.
            Sequence(**fields)

            session.query(Sequence).filter(
                Sequence.ai == record['pk']
            ).update(fields, synchronize_session=False)
        except ValueError:
            continue
Ejemplo n.º 13
0
def get_formatted_ties(genes):
    """Return a new dict with every key of ``genes`` passed through
    ``format_ties``; values are kept as they are.
    """
    return {format_ties(ties): seq for ties, seq in genes.items()}