コード例 #1
0
def generate_consensus(session, clone_ids):
    """Generates consensus CDR3s for clones.

    :param Session session: The database session
    :param list clone_ids: The list of clone IDs to assign to groups

    """

    if not clone_ids:
        return
    # Commit periodically so very large clone sets do not accumulate in
    # one giant transaction.
    for clone in funcs.periodic_commit(session,
                                       session.query(Clone).filter(
                                           Clone.id.in_(clone_ids)),
                                       interval=1000):
        # Only sequences that survived subject-level collapsing
        # (positive copy number) contribute to the consensus.
        seqs = session.query(Sequence).join(SequenceCollapse).filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0).all()
        clone.cdr3_nt = funcs.consensus([s.cdr3_nt for s in seqs])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)

        clone.germline = generate_germline(session, seqs, clone)

        # Generator expression avoids building an intermediate list just
        # to sum it.
        clone.overall_total_cnt = sum(s.copy_number for s in seqs)

        # Functional when the CDR3 is in-frame and neither the CDR3 nor
        # the germline contains a stop codon.
        clone.functional = (clone.cdr3_num_nts % 3 == 0
                            and '*' not in clone.cdr3_aa
                            and not lookups.has_stop(clone.germline))

    session.commit()
コード例 #2
0
ファイル: clones.py プロジェクト: arosenfeld/immunedb
def generate_consensus(session, clone_ids):
    """Generates consensus CDR3s for clones.

    :param Session session: The database session
    :param list clone_ids: The list of clone IDs to assign to groups

    """

    # Truthiness check is the Pythonic form and also handles None.
    if not clone_ids:
        return
    # Commit periodically to bound transaction size for large inputs.
    for clone in funcs.periodic_commit(
            session,
            session.query(Clone).filter(Clone.id.in_(clone_ids)),
            interval=1000):
        # Only sequences that survived subject-level collapsing
        # (positive copy number) contribute to the consensus.
        seqs = session.query(
            Sequence
        ).join(SequenceCollapse).filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0
        ).all()
        clone.cdr3_nt = funcs.consensus([s.cdr3_nt for s in seqs])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)

        clone.germline = generate_germline(session, seqs, clone)

    session.commit()
コード例 #3
0
def generate_consensus(session, clone_ids):
    """Generates consensus CDR3s for clones.

    :param Session session: The database session
    :param list clone_ids: The list of clone IDs to assign to groups

    """

    # Truthiness check is the Pythonic form and also handles None.
    if not clone_ids:
        return
    # Commit periodically to bound transaction size for large inputs.
    for clone in funcs.periodic_commit(
            session,
            session.query(Clone).filter(Clone.id.in_(clone_ids)),
            interval=1000):
        # Only sequences that survived subject-level collapsing
        # (positive copy number) contribute to the consensus.
        seqs = session.query(
            Sequence
        ).join(SequenceCollapse).filter(
            Sequence.clone_id == clone.id,
            SequenceCollapse.copy_number_in_subject > 0
        ).all()
        clone.cdr3_nt = consensus([s.cdr3_nt for s in seqs])
        clone.cdr3_aa = lookups.aas_from_nts(clone.cdr3_nt)

        clone.germline = generate_germline(session, seqs, clone)

    session.commit()
コード例 #4
0
def add_sequences_from_sample(session, sample, sequences, props):
    """Writes corrected (re-aligned) sequences back to the database.

    :param Session session: The database session
    :param Sample sample: The sample the corrected sequences belong to
    :param list sequences: Dicts with ``alignment``, ``r_type`` and
        ``pk`` keys describing each corrected sequence
    :param props: Validation settings; ``props.validate`` raises
        AlignmentException for alignments that should be skipped

    """
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                # Alignment fails the configured filters; skip it.
                continue
            if sequence['r_type'] == 'NoResult':
                # A previously unidentifiable read now aligns: insert it
                # as a Sequence and delete the stale NoResult row.
                add_sequences(session, [alignment],
                              sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']).delete(
                        synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,
                    'probable_indel_or_misalign': alignment.has_possible_indel,
                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),
                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,
                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,
                    'removed_prefix':
                    alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                    alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,
                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,
                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,
                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),
                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,
                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),
                    'germline': alignment.germline
                }
                # Constructing (but not adding) a Sequence validates the
                # fields before the raw UPDATE below.
                Sequence(**fields)

                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']).update(
                        fields, synchronize_session=False)
        except ValueError:
            # Field validation failed; leave the existing row untouched.
            continue
コード例 #5
0
def get_sample_vdjtools(session, sample, min_clone_size, clone_features):
    """Yields VDJtools-format TSV lines for one sample's clones.

    Clones are aggregated by their (V gene, J gene, CDR3 nucleotide)
    feature tuple and emitted from most to least abundant, skipping
    entries with fewer than ``min_clone_size`` copies.
    """
    tsv = StreamingTSV(['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j'])

    # Accumulate copy counts per feature tuple across the sample's clones.
    copies_by_feature = Counter()
    clone_counts = session.query(
        CloneStats.clone_id,
        CloneStats.total_cnt).filter(CloneStats.sample_id == sample.id)
    for row in clone_counts:
        copies_by_feature[clone_features[row.clone_id]] += row.total_cnt

    grand_total = sum(copies_by_feature.values())
    yield tsv.writeheader()
    # most_common() iterates features from highest to lowest copy count
    # (stable for ties), matching a reverse sort on the counts.
    for feature, copies in copies_by_feature.most_common():
        if copies < min_clone_size:
            continue
        v_gene, j_gene, cdr3_nt = feature
        yield tsv.writerow({
            'count': copies,
            'freq': copies / grand_total,
            'cdr3nt': cdr3_nt,
            'cdr3aa': aas_from_nts(cdr3_nt),
            'v': v_gene,
            'd': '.',
            'j': j_gene,
        })
コード例 #6
0
ファイル: __init__.py プロジェクト: wangdi2014/immunedb
def add_as_sequence(session, alignment, sample, error_action='discard'):
    """Persists an alignment as a Sequence row plus its duplicate reads.

    :param Session session: The database session
    :param alignment: The alignment to persist
    :param Sample sample: The sample the sequence belongs to
    :param str error_action: What to do when Sequence validation fails:
        'discard' records a NoResult and returns None; 'raise' re-raises

    :returns: The new Sequence, or None when discarded

    """
    try:
        seq = Sequence(
            seq_id=alignment.sequence.ids[0],
            sample_id=sample.id,
            subject_id=sample.subject.id,
            partial=alignment.partial,
            probable_indel_or_misalign=alignment.has_possible_indel,
            v_gene=funcs.format_ties(alignment.v_gene),
            j_gene=funcs.format_ties(alignment.j_gene),
            num_gaps=alignment.num_gaps,
            seq_start=alignment.seq_start,
            v_match=alignment.v_match,
            v_length=alignment.v_length,
            j_match=alignment.j_match,
            j_length=alignment.j_length,
            removed_prefix=alignment.sequence.removed_prefix_sequence,
            removed_prefix_qual=alignment.sequence.removed_prefix_quality,
            v_mutation_fraction=alignment.v_mutation_fraction,
            pre_cdr3_length=alignment.pre_cdr3_length,
            pre_cdr3_match=alignment.pre_cdr3_match,
            post_cdr3_length=alignment.post_cdr3_length,
            post_cdr3_match=alignment.post_cdr3_match,
            in_frame=alignment.in_frame,
            functional=alignment.functional,
            stop=alignment.stop,
            copy_number=len(alignment.sequence.ids),
            cdr3_nt=alignment.cdr3,
            cdr3_num_nts=len(alignment.cdr3),
            cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
            sequence=str(alignment.sequence.sequence),
            quality=alignment.sequence.quality,
            locally_aligned=alignment.locally_aligned,
            insertions=alignment.insertions,
            deletions=alignment.deletions,
            germline=alignment.germline)
        session.add(seq)
        session.flush()

        # Add duplicate sequences; best-effort so a validation error on a
        # duplicate never aborts the primary insert.  (The unused `as e`
        # binding on this handler was removed.)
        try:
            session.bulk_save_objects([
                DuplicateSequence(sample_id=sample.id,
                                  seq_id=seq_id,
                                  duplicate_seq_ai=seq.ai)
                for seq_id in alignment.sequence.ids[1:]
            ])
        except ValueError:
            pass
        return seq
    except ValueError as e:
        if error_action == 'discard':
            add_as_noresult(session, alignment.sequence, sample, str(e))
            return None
        elif error_action == 'raise':
            raise e
コード例 #7
0
    def _calculate_clone_stats(self, sample_id, min_cdr3, max_cdr3,
                               include_outliers, only_full_reads):
        """Computes per-clone aggregate statistics for one sample.

        Averages V/J match metrics per clone, left-joins selection
        pressure, and records each clone into every matching context.

        NOTE(review): ``min_cdr3`` and ``max_cdr3`` are not referenced in
        this body; presumably consumed by a related method — confirm.
        """
        clone_statistics = {}
        for name, stat in _clone_contexts.items():
            clone_statistics[name] = CloneContextStats(seqs=None, **stat)

        # TODO: This should be automatically generated from _dist_fields
        query = self._session.query(
            Sequence.clone_id,
            func.round(func.avg(Sequence.v_match)).label('v_match'),
            func.round(func.avg(Sequence.j_match)).label('j_match'),
            func.round(func.avg(Sequence.j_length)).label('j_length'),
            Sequence.v_gene,
            Sequence.j_gene,
            func.count(Sequence.seq_id).label('copy_number'),
            func.round(
                func.avg(Sequence.v_length + Sequence.num_gaps)
            ).label('v_length'),
            func.round(
                func.avg(100 * Sequence.v_match / Sequence.v_length)
            ).label('v_identity'),
            Sequence.cdr3_num_nts.label('cdr3_length'),
            SelectionPressure.sigma_fwr.label('sp_fwr'),
            SelectionPressure.sigma_cdr.label('sp_cdr'),
        ).join(
            # LEFT OUTER JOIN: clones without selection-pressure rows
            # still appear, with NULL sp_fwr/sp_cdr.
            SelectionPressure,
            and_(
                SelectionPressure.clone_id == Sequence.clone_id,
                SelectionPressure.sample_id == Sequence.sample_id
            ),
            isouter=True
        ).filter(
            Sequence.sample_id == sample_id,
            ~Sequence.clone_id.is_(None)
        )

        if only_full_reads:
            query = query.filter(Sequence.partial == 0)
        query = query.group_by(Sequence.clone_id)

        for clone in query:
            # Frame/stop/functional flags are derived from the clone's
            # consensus CDR3, not from individual sequences.
            clone_info = self._session.query(Clone.cdr3_nt).filter(
                Clone.id == clone.clone_id).first()
            in_frame = len(clone_info.cdr3_nt) % 3 == 0
            stop = '*' in lookups.aas_from_nts(clone_info.cdr3_nt)
            functional = in_frame and not stop
            for name, stat in clone_statistics.items():
                stat.add_if_match(clone, in_frame, stop, functional)

        self._add_stat(clone_statistics, sample_id, include_outliers,
                       only_full_reads)
コード例 #8
0
    def _calculate_clone_stats(self, sample_id, min_cdr3, max_cdr3,
                               include_outliers, only_full_reads):
        """Computes per-clone aggregate statistics for one sample.

        Averages V/J match metrics per clone, left-joins selection
        pressure, and records each clone into every matching context.

        NOTE(review): ``min_cdr3`` and ``max_cdr3`` are not referenced in
        this body; presumably consumed by a related method — confirm.
        """
        clone_statistics = {}
        for name, stat in _clone_contexts.items():
            clone_statistics[name] = CloneContextStats(seqs=None, **stat)

        # TODO: This should be automatically generated from _dist_fields
        query = self._session.query(
            Sequence.clone_id,
            func.round(func.avg(Sequence.v_match)).label('v_match'),
            func.round(func.avg(Sequence.j_match)).label('j_match'),
            func.round(func.avg(Sequence.j_length)).label('j_length'),
            Sequence.v_gene,
            Sequence.j_gene,
            func.count(Sequence.seq_id).label('copy_number'),
            func.round(
                func.avg(Sequence.v_length + Sequence.num_gaps)
            ).label('v_length'),
            func.round(
                func.avg(100 * Sequence.v_match / Sequence.v_length)
            ).label('v_identity'),
            Sequence.cdr3_num_nts.label('cdr3_length'),
            SelectionPressure.sigma_fwr.label('sp_fwr'),
            SelectionPressure.sigma_cdr.label('sp_cdr'),
        ).join(
            # LEFT OUTER JOIN: clones without selection-pressure rows
            # still appear, with NULL sp_fwr/sp_cdr.
            SelectionPressure,
            and_(
                SelectionPressure.clone_id == Sequence.clone_id,
                SelectionPressure.sample_id == Sequence.sample_id
            ),
            isouter=True
        ).filter(
            Sequence.sample_id == sample_id,
            ~Sequence.clone_id.is_(None)
        )

        if only_full_reads:
            query = query.filter(Sequence.partial == 0)
        query = query.group_by(Sequence.clone_id)

        for clone in query:
            # Frame/stop/functional flags are derived from the clone's
            # consensus CDR3, not from individual sequences.
            clone_info = self._session.query(Clone.cdr3_nt).filter(
                Clone.id == clone.clone_id).first()
            in_frame = len(clone_info.cdr3_nt) % 3 == 0
            stop = '*' in lookups.aas_from_nts(clone_info.cdr3_nt)
            functional = in_frame and not stop
            for name, stat in clone_statistics.items():
                stat.add_if_match(clone, in_frame, stop, functional)

        self._add_stat(clone_statistics, sample_id, include_outliers,
                       only_full_reads)
コード例 #9
0
def add_as_sequence(session, vdj, sample):
    """Persists *vdj* as a Sequence row, recording a NoResult on failure.

    Duplicate read IDs (all but the first) are stored as
    DuplicateSequence rows pointing at the new Sequence.
    """
    try:
        # Build the column mapping first, then validate it by
        # constructing the Sequence.  Any ValueError raised while
        # reading/formatting the fields is handled exactly as before.
        fields = dict(
            seq_id=vdj.ids[0],
            sample_id=sample.id,
            subject_id=sample.subject.id,
            partial=vdj.partial,
            probable_indel_or_misalign=vdj.has_possible_indel,
            v_gene=funcs.format_ties(vdj.v_gene,
                                     vdj.v_germlines.prefix,
                                     strip_alleles=True),
            j_gene=funcs.format_ties(vdj.j_gene,
                                     vdj.j_germlines.prefix,
                                     strip_alleles=True),
            num_gaps=vdj.num_gaps,
            pad_length=vdj.pad_length,
            v_match=vdj.v_match,
            v_length=vdj.v_length,
            j_match=vdj.j_match,
            j_length=vdj.j_length,
            removed_prefix=vdj.removed_prefix,
            removed_prefix_qual=vdj.removed_prefix_qual,
            v_mutation_fraction=vdj.mutation_fraction,
            pre_cdr3_length=vdj.pre_cdr3_length,
            pre_cdr3_match=vdj.pre_cdr3_match,
            post_cdr3_length=vdj.post_cdr3_length,
            post_cdr3_match=vdj.post_cdr3_match,
            in_frame=vdj.in_frame,
            functional=vdj.functional,
            stop=vdj.stop,
            copy_number=len(vdj.ids),
            cdr3_nt=vdj.cdr3,
            cdr3_num_nts=len(vdj.cdr3),
            cdr3_aa=lookups.aas_from_nts(vdj.cdr3),
            sequence=str(vdj.sequence),
            quality=vdj.quality,
            germline=vdj.germline)
        new_seq = Sequence(**fields)
        session.add(new_seq)
        session.flush()

        # Best-effort insert of duplicate reads: a validation error here
        # must not abort the primary sequence.
        try:
            session.bulk_save_objects([
                DuplicateSequence(sample_id=sample.id,
                                  seq_id=dup_id,
                                  duplicate_seq_ai=new_seq.ai)
                for dup_id in vdj.ids[1:]
            ])
        except ValueError:
            pass
    except ValueError as e:
        add_as_noresult(session, vdj, sample, str(e))
コード例 #10
0
def get_seq_from_alignment(session, alignment, sample, strip_alleles=True):
    """Builds a Sequence row from *alignment* for *sample*.

    :returns: a one-element list containing the Sequence; or, when the
        Sequence fields fail validation, a one-element list with a
        NoResult (via ``get_noresult_from_vdj``) carrying the error
        message; or an empty list when even the NoResult fails.
    """
    try:
        return [
            Sequence(
                seq_id=alignment.sequence.seq_id,
                sample_id=sample.id,
                subject_id=sample.subject.id,
                partial=alignment.partial,
                rev_comp=alignment.sequence.rev_comp,
                probable_indel_or_misalign=alignment.has_possible_indel,
                # NOTE(review): strip_alleles is passed positionally here,
                # unlike the keyword form used elsewhere — confirm the
                # format_ties signature this module imports.
                v_gene=funcs.format_ties(alignment.v_gene, strip_alleles),
                j_gene=funcs.format_ties(alignment.j_gene, strip_alleles),
                num_gaps=alignment.num_gaps,
                seq_start=alignment.seq_start,
                v_match=alignment.v_match,
                v_length=alignment.v_length,
                j_match=alignment.j_match,
                j_length=alignment.j_length,
                removed_prefix=alignment.sequence.removed_prefix_sequence,
                removed_prefix_qual=alignment.sequence.removed_prefix_quality,
                v_mutation_fraction=alignment.v_mutation_fraction,
                pre_cdr3_length=alignment.pre_cdr3_length,
                pre_cdr3_match=alignment.pre_cdr3_match,
                post_cdr3_length=alignment.post_cdr3_length,
                post_cdr3_match=alignment.post_cdr3_match,
                in_frame=alignment.in_frame,
                functional=alignment.functional,
                stop=alignment.stop,
                copy_number=alignment.sequence.copy_number,
                cdr3_nt=alignment.cdr3,
                cdr3_num_nts=len(alignment.cdr3),
                cdr3_aa=lookups.aas_from_nts(alignment.cdr3),
                sequence=str(alignment.sequence.sequence),
                quality=alignment.sequence.quality,
                locally_aligned=alignment.locally_aligned,
                insertions=alignment.insertions,
                deletions=alignment.deletions,
                germline=alignment.germline)
        ]
    except ValueError as e:
        # Sequence validation failed; fall back to recording the read as
        # a NoResult with the error message.
        try:
            return [
                get_noresult_from_vdj(session, alignment.sequence, sample,
                                      str(e))
            ]
        except ValueError:
            # Not even a NoResult could be built; drop the read.
            return []
コード例 #11
0
def export_vdjtools(session, args):
    """Writes one VDJtools-format TSV file per sample.

    :param Session session: The database session
    :param args: Parsed CLI arguments; uses ``include_uniques`` and
        ``min_clone_size``

    Clones are aggregated per sample by their (V gene, J gene, CDR3 nt)
    feature tuple and written from most to least abundant.
    """
    fieldnames = ['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j']
    if args.include_uniques:
        fieldnames.append('unique')

    clone_features = {
        c.id: (c.v_gene, c.j_gene, c.cdr3_nt)
        for c in session.query(Clone.id, Clone.v_gene, Clone.j_gene,
                               Clone.cdr3_nt)
    }
    for sample in session.query(Sample).order_by(Sample.id):
        logger.info('Exporting sample {}'.format(sample.name))
        sample_clones = {}
        stats = session.query(
            CloneStats.clone_id, CloneStats.total_cnt,
            CloneStats.unique_cnt).filter(CloneStats.sample_id == sample.id)
        for stat in stats:
            key = clone_features[stat.clone_id]
            sample_clones.setdefault(key, Counter())['total'] += stat.total_cnt
            sample_clones[key]['unique'] += stat.unique_cnt

        # `with` guarantees the output file is flushed and closed; the
        # previous version leaked the handle returned by open().
        with open('{}.sample.txt'.format(sample.name), 'w+') as out_fh:
            writer = csv.DictWriter(out_fh,
                                    fieldnames=fieldnames,
                                    delimiter='\t',
                                    extrasaction='ignore')
            total = float(sum(c['total'] for c in sample_clones.values()))
            writer.writeheader()
            # Sort by total copy count.  The previous key
            # (sample_clones.get) compared the Counter objects themselves,
            # which raises TypeError before Python 3.10 and uses multiset
            # partial order afterwards — neither is a count ordering.
            for key in sorted(sample_clones,
                              key=lambda k: sample_clones[k]['total'],
                              reverse=True):
                counts = sample_clones[key]
                if counts['total'] < args.min_clone_size:
                    continue
                v, j, cdr3_nt = key
                writer.writerow({
                    'count': counts['total'],
                    'freq': counts['total'] / total,
                    'cdr3nt': cdr3_nt,
                    'cdr3aa': aas_from_nts(cdr3_nt),
                    'v': v,
                    'd': '.',
                    'j': j,
                    'unique': counts['unique']
                })
コード例 #12
0
def get_vdjtools_output(session, clones):
    """Yields VDJtools-format TSV lines aggregating *clones* by feature.

    :param Session session: The database session (not read in this body;
        kept for interface consistency with the other exporters)
    :param dict clones: Maps clone records to aggregate dicts containing
        ``agg['counts']['copies']``

    """
    writer = StreamingTSV(['count', 'freq', 'cdr3nt', 'cdr3aa', 'v', 'd', 'j'])
    counts = Counter()
    for clone, agg in clones.items():
        key = (clone.v_gene, clone.j_gene, clone.cdr3_nt)
        counts[key] += agg['counts']['copies']
    # Sum once after aggregation.  The old in-loop
    # `total_copies += counts[key]` added the *running* per-key total,
    # double-counting copies whenever two clones shared a feature key.
    total_copies = sum(counts.values())

    yield writer.writeheader()
    for key in sorted(counts, key=counts.get, reverse=True):
        count = counts[key]
        v, j, cdr3_nt = key
        yield writer.writerow({
            'count': count,
            'freq': count / total_copies,
            'cdr3nt': cdr3_nt,
            'cdr3aa': aas_from_nts(cdr3_nt),
            'v': v,
            'd': '.',
            'j': j,
        })
コード例 #13
0
ファイル: local_align.py プロジェクト: arosenfeld/immunedb
def add_sequences_from_sample(session, sample, sequences, props):
    """Writes corrected (locally re-aligned) sequences back to the database.

    :param Session session: The database session
    :param Sample sample: The sample the corrected sequences belong to
    :param list sequences: Dicts with ``alignment``, ``r_type`` and
        ``pk`` keys describing each corrected sequence
    :param props: Validation settings; ``props.validate`` raises
        AlignmentException for alignments that should be skipped

    """
    logger.info('Adding {} corrected sequences to sample {}'.format(
        len(sequences), sample.id))
    for sequence in periodic_commit(session, sequences):
        alignment = sequence['alignment']
        try:
            try:
                props.validate(alignment)
            except AlignmentException:
                # Alignment fails the configured filters; skip it.
                continue
            if sequence['r_type'] == 'NoResult':
                # A previously unidentifiable read now aligns: insert it
                # as a Sequence and delete the stale NoResult row.
                add_sequences(session, [alignment], sample,
                              error_action='raise')
                session.query(NoResult).filter(
                    NoResult.pk == sequence['pk']
                ).delete(synchronize_session=False)
            elif sequence['r_type'] == 'Sequence':
                fields = {
                    'partial': alignment.partial,

                    'probable_indel_or_misalign':
                        alignment.has_possible_indel,

                    'v_gene': format_ties(alignment.v_gene),
                    'j_gene': format_ties(alignment.j_gene),

                    'num_gaps': alignment.num_gaps,
                    'seq_start': alignment.seq_start,

                    'v_match': alignment.v_match,
                    'v_length': alignment.v_length,
                    'j_match': alignment.j_match,
                    'j_length': alignment.j_length,

                    'removed_prefix':
                        alignment.sequence.removed_prefix_sequence,
                    'removed_prefix_qual':
                        alignment.sequence.removed_prefix_quality,
                    'v_mutation_fraction': alignment.v_mutation_fraction,

                    'pre_cdr3_length': alignment.pre_cdr3_length,
                    'pre_cdr3_match': alignment.pre_cdr3_match,
                    'post_cdr3_length': alignment.post_cdr3_length,
                    'post_cdr3_match': alignment.post_cdr3_match,

                    'in_frame': alignment.in_frame,
                    'functional': alignment.functional,
                    'stop': alignment.stop,

                    'cdr3_nt': alignment.cdr3,
                    'cdr3_num_nts': len(alignment.cdr3),
                    'cdr3_aa': lookups.aas_from_nts(alignment.cdr3),

                    'sequence': str(alignment.sequence.sequence),
                    'quality': alignment.sequence.quality,

                    'locally_aligned': alignment.locally_aligned,
                    '_insertions': serialize_gaps(alignment.insertions),
                    '_deletions': serialize_gaps(alignment.deletions),

                    'germline': alignment.germline
                }
                # Constructing (but not adding) a Sequence validates the
                # fields before the raw UPDATE below.
                Sequence(**fields)

                session.query(Sequence).filter(
                    Sequence.ai == sequence['pk']
                ).update(fields, synchronize_session=False)
        except ValueError:
            # Field validation failed; leave the existing row untouched.
            continue