Exemple #1
0
def aggregate_results(results, session, sample):
    alignments = {}
    success = [r for r in results if r['status'] == 'success']
    noresults = [r for r in results if r['status'] == 'noresult']
    logger.info('{} total sequences ({} alignments, {} noresults)'.format(
        len(results), len(success), len(noresults)))

    for result in success:
        alignment = result['alignment']
        key = (funcs.format_ties(alignment.v_gene),
               funcs.format_ties(alignment.j_gene), alignment.cdr3_num_nts,
               tuple(alignment.insertions), tuple(alignment.deletions))
        alignments.setdefault(key, []).append(alignment)

    copies = 0
    for i, result in enumerate(noresults):
        orig_id = result['vdj'].seq_id
        copies += result['vdj'].copy_number
        for i in range(result['vdj'].copy_number):
            result['vdj'].seq_id = '{}_{}'.format(orig_id, i)
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])
        if copies % 1000 == 0:
            session.commit()

    session.commit()
    return alignments
Exemple #2
0
def add_results(uniques, sample, session):
    metrics = {'muts': [], 'lens': []}
    for unique in itertools.chain(*uniques):
        try:
            add_sequences(session, [unique], sample)
            metrics['lens'].append(unique.v_length)
            metrics['muts'].append(unique.v_mutation_fraction)
        except AlignmentException as e:
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if metrics['lens']:
        sample.v_ties_len = sum(metrics['lens']) / len(metrics['lens'])
        sample.v_ties_mutations = sum(metrics['muts']) / len(metrics['muts'])
    session.commit()
Exemple #3
0
def aggregate_results(results, session, sample):
    alignments = {}
    for result in results:
        if result['status'] == 'success':
            alignment = result['alignment']
            key = (
                funcs.format_ties(alignment.v_gene),
                funcs.format_ties(alignment.j_gene),
                alignment.cdr3_num_nts,
                tuple(alignment.insertions),
                tuple(alignment.deletions)
            )
            alignments.setdefault(key, []).append(alignment)
        elif result['status'] == 'noresult':
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])
    session.commit()
    return alignments
def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    reader = csv.DictReader(handle, delimiter='\t')
    uniques = {}

    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue
        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            for other in uniques.setdefault(
                    len(alignment.sequence.sequence), []):
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                uniques[len(alignment.sequence.sequence)].append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    if len(lens) > 0:
        sample.v_ties_len = sum(lens) / len(lens)
        sample.v_ties_mutations = sum(muts) / len(muts)

    session.commit()
Exemple #5
0
def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    reader = csv.DictReader(handle, delimiter='\t')
    uniques = {}

    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue
        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            for other in uniques.setdefault(len(alignment.sequence.sequence),
                                            []):
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                uniques[len(alignment.sequence.sequence)].append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    if len(lens) > 0:
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))

    session.commit()
Exemple #6
0
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample, result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = (sum([v.v_length for v in alignments]) / len(alignments))
        avg_mut = (sum([v.v_mutation_fraction
                        for v in alignments]) / len(alignments))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments), round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()
        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={
                'aligner': aligner,
                'avg_len': avg_len,
                'avg_mut': avg_mut,
                'props': props
            },
        )
        logger.info('Adding noresults')

        for result in funcs.periodic_commit(session, v_ties['noresult'], 100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we arent copying everything between processes
        concurrent.process_data([list(v) for v in v_ties['success']],
                                process_collapse,
                                aggregate_collapse,
                                nproc,
                                aggregate_args={
                                    'db_config': db_config,
                                    'sample_id': sample.id,
                                    'props': props
                                })
        session.expire_all()
        session.commit()

        identified = int(
            session.query(func.sum(Sequence.copy_number)).filter(
                Sequence.sample == sample).scalar() or 0)
        noresults = int(
            session.query(func.count(
                NoResult.pk)).filter(NoResult.sample == sample).scalar() or 0)
        if identified + noresults:
            frac = int(100 * identified / (identified + noresults))
        else:
            frac = 0
        logger.info(
            'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
                sample.name, round((time.time() - start) / 60., 1), identified,
                identified + noresults, frac))
    session.close()
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)

    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input,
        process_vdj,
        aggregate_vdj,
        nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample, result['reason'])

    alignments = alignments['success']
    if alignments:
        avg_len = (
            sum([v.v_length for v in alignments]) /
            len(alignments))
        avg_mut = (
            sum([v.v_mutation_fraction for v in alignments]) /
            len(alignments)
        )
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments),
                                       round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()
        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments,
            process_vties,
            aggregate_vties,
            nproc,
            process_args={'aligner': aligner, 'avg_len': avg_len, 'avg_mut':
                          avg_mut, 'props': props},
        )
        logger.info('Adding noresults')

        for result in funcs.periodic_commit(session, v_ties['noresult'], 100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])

        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we arent copying everything between processes
        concurrent.process_data(
            [list(v) for v in v_ties['success']],
            process_collapse,
            aggregate_collapse,
            nproc,
            aggregate_args={'db_config': db_config, 'sample_id': sample.id,
                            'props': props}
        )
        session.expire_all()
        session.commit()

        identified = int(session.query(
            func.sum(Sequence.copy_number)
        ).filter(
            Sequence.sample == sample
        ).scalar() or 0)
        noresults = int(session.query(
            func.count(NoResult.pk)
        ).filter(
            NoResult.sample == sample
        ).scalar() or 0)
        if identified + noresults:
            frac = int(100 * identified / (identified + noresults))
        else:
            frac = 0
        logger.info(
            'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
                sample.name,
                round((time.time() - start) / 60., 1),
                identified,
                identified + noresults,
                frac
            )
        )
    session.close()