def aggregate_results(results, session, sample):
    """Aggregate worker results into per-bucket alignment lists.

    Successful results are grouped by a key of (V-tie string, J-tie
    string, CDR3 nucleotide length, insertions, deletions).  No-result
    records are expanded per copy (each copy gets a ``_<n>`` suffixed
    sequence ID) and persisted via ``add_noresults_for_vdj``.

    :param results: iterable of dicts with a ``'status'`` of
        ``'success'`` (carrying ``'alignment'``) or ``'noresult'``
        (carrying ``'vdj'`` and ``'reason'``)
    :param session: DB session used to persist no-results
    :param sample: sample the records belong to
    :return: dict mapping bucket key -> list of alignments
    """
    alignments = {}
    success = [r for r in results if r['status'] == 'success']
    noresults = [r for r in results if r['status'] == 'noresult']
    logger.info('{} total sequences ({} alignments, {} noresults)'.format(
        len(results), len(success), len(noresults)))

    for result in success:
        alignment = result['alignment']
        key = (funcs.format_ties(alignment.v_gene),
               funcs.format_ties(alignment.j_gene),
               alignment.cdr3_num_nts,
               tuple(alignment.insertions),
               tuple(alignment.deletions))
        alignments.setdefault(key, []).append(alignment)

    copies = 0
    last_commit = 0
    for result in noresults:
        orig_id = result['vdj'].seq_id
        copies += result['vdj'].copy_number
        for copy_idx in range(result['vdj'].copy_number):
            result['vdj'].seq_id = '{}_{}'.format(orig_id, copy_idx)
            add_noresults_for_vdj(session, result['vdj'], sample,
                                  result['reason'])
        # Commit once at least 1000 copies have accumulated since the last
        # commit.  The previous ``copies % 1000 == 0`` test could skip every
        # multiple of 1000 when copy_number > 1, deferring all commits to
        # the end.
        if copies - last_commit >= 1000:
            session.commit()
            last_commit = copies
    session.commit()
    return alignments
def add_results(uniques, sample, session):
    """Persist unique alignments for ``sample`` and record V-tie stats.

    Flattens ``uniques`` (an iterable of iterables of alignments), adds
    each via ``add_sequences``; alignments that raise
    ``AlignmentException`` are recorded as no-results instead.  On
    success the sample's average V length and V mutation fraction are
    stored, then the session is committed.

    :param uniques: iterable of iterables of unique alignments
    :param sample: sample receiving the sequences / statistics
    :param session: DB session to add records to and commit
    """
    metrics = {'muts': [], 'lens': []}
    for unique in itertools.chain(*uniques):
        try:
            add_sequences(session, [unique], sample)
            metrics['lens'].append(unique.v_length)
            metrics['muts'].append(unique.v_mutation_fraction)
        except AlignmentException as e:
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if metrics['lens']:
        # float() keeps the averages exact under Python 2 integer
        # division, consistent with the float division used elsewhere in
        # this file.
        sample.v_ties_len = (
            sum(metrics['lens']) / float(len(metrics['lens'])))
        sample.v_ties_mutations = (
            sum(metrics['muts']) / float(len(metrics['muts'])))
    session.commit()
def aggregate_results(results, session, sample):
    """Bucket successful alignments and persist no-result records.

    Each ``'success'`` result is filed under a key built from its V/J
    gene ties, CDR3 nucleotide length, insertions, and deletions; each
    ``'noresult'`` is written out via ``add_noresults_for_vdj``.  The
    session is committed once at the end.

    :return: dict mapping bucket key -> list of alignments
    """
    buckets = {}
    for entry in results:
        status = entry['status']
        if status == 'success':
            hit = entry['alignment']
            bucket_key = (
                funcs.format_ties(hit.v_gene),
                funcs.format_ties(hit.j_gene),
                hit.cdr3_num_nts,
                tuple(hit.insertions),
                tuple(hit.deletions),
            )
            if bucket_key not in buckets:
                buckets[bucket_key] = []
            buckets[bucket_key].append(hit)
        elif status == 'noresult':
            add_noresults_for_vdj(session, entry['vdj'], sample,
                                  entry['reason'])
    session.commit()
    return buckets
def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    """Import pre-aligned sequences for ``sample`` from a TSV handle.

    Parses ``handle`` with ``csv.DictReader``; for ``fmt ==
    'adaptive'`` each row is first converted via
    ``extract_adaptive_sequence``.  Identical sequences are collapsed
    (copy numbers summed), validated, and added; failures are recorded
    as no-results.  Finally the sample's average V length and mutation
    fraction are stored and the session committed.

    :param session: DB session for inserts and the final commit
    :param fmt: input format; ``'adaptive'`` triggers conversion
    :param handle: open file-like object with tab-delimited rows
    :param sample: sample receiving the sequences
    :param v_germlines: V germline set used for alignment
    :param j_germlines: J germline set used for alignment
    :param props: validator with a ``validate(alignment)`` method
    """
    reader = csv.DictReader(handle, delimiter='\t')
    uniques = {}
    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue
        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            # Bucket by sequence length so duplicate detection only
            # compares equal-length sequences.
            for other in uniques.setdefault(
                    len(alignment.sequence.sequence), []):
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                uniques[len(alignment.sequence.sequence)].append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            # Bug fix: report the failing unique's own sequence; the old
            # code used the stale ``seq`` left over from the parsing loop.
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if lens:
        # float() guards against Python 2 integer division (matches the
        # other read_file variant in this file).
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))
    session.commit()
def read_file(session, fmt, handle, sample, v_germlines, j_germlines, props):
    """Import pre-aligned sequences for ``sample`` from a TSV handle.

    Parses ``handle`` with ``csv.DictReader``; for ``fmt ==
    'adaptive'`` each row is first converted via
    ``extract_adaptive_sequence``.  Identical sequences are collapsed
    (copy numbers summed), validated, and added; failures are recorded
    as no-results.  Finally the sample's average V length and mutation
    fraction are stored and the session committed.

    :param session: DB session for inserts and the final commit
    :param fmt: input format; ``'adaptive'`` triggers conversion
    :param handle: open file-like object with tab-delimited rows
    :param sample: sample receiving the sequences
    :param v_germlines: V germline set used for alignment
    :param j_germlines: J germline set used for alignment
    :param props: validator with a ``validate(alignment)`` method
    """
    reader = csv.DictReader(handle, delimiter='\t')
    uniques = {}
    for i, line in enumerate(reader):
        if fmt == 'adaptive':
            try:
                line = extract_adaptive_sequence(i, line, v_germlines,
                                                 j_germlines)
            except (AlignmentException, KeyError) as e:
                seq = VDJSequence('seq_{}'.format(i), '')
                add_noresults_for_vdj(session, seq, sample, str(e))
                continue
        seq = VDJSequence(line['SEQUENCE_ID'],
                          line['SEQUENCE_IMGT'].replace('.', '-'))
        if 'DUPCOUNT' in line:
            seq.copy_number = int(line['DUPCOUNT'])
        try:
            alignment = create_alignment(seq, line, v_germlines, j_germlines)
            # Bucket by sequence length so duplicate detection only
            # compares equal-length sequences.
            for other in uniques.setdefault(
                    len(alignment.sequence.sequence), []):
                if dnautils.equal(other.sequence.sequence,
                                  alignment.sequence.sequence):
                    other.sequence.copy_number += (
                        alignment.sequence.copy_number)
                    break
            else:
                uniques[len(alignment.sequence.sequence)].append(alignment)
        except AlignmentException as e:
            add_noresults_for_vdj(session, seq, sample, str(e))

    uniques = [s for k in sorted(uniques.keys()) for s in uniques[k]]
    lens = []
    muts = []
    for unique in uniques:
        try:
            props.validate(unique)
            add_sequences(session, [unique], sample)
            lens.append(unique.v_length)
            muts.append(unique.v_mutation_fraction)
        except AlignmentException as e:
            # Bug fix: report the failing unique's own sequence; the old
            # code used the stale ``seq`` left over from the parsing loop.
            add_noresults_for_vdj(session, unique.sequence, sample, str(e))

    if lens:
        # float() keeps the averages exact under Python 2 integer division.
        sample.v_ties_len = sum(lens) / float(len(lens))
        sample.v_ties_mutations = sum(muts) / float(len(muts))
    session.commit()
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    """Run the full identification pipeline for a single sample.

    Stages: initial V/J anchor alignment, realignment to V-ties using
    the sample-wide average V length and mutation fraction, then
    duplicate collapsing -- each fanned out over ``nproc`` worker
    processes via ``concurrent.process_data``.  Results and no-result
    records are persisted through a fresh DB session, and a summary line
    (identified copies vs. no-results) is logged at the end.

    :param db_config: database configuration passed to ``config.init_db``
    :param v_germlines: V germline set for the aligner
    :param j_germlines: J germline set for the aligner
    :param path: input path handed to the read stage
    :param meta: sample metadata dict (must contain ``'sample_name'``)
    :param props: properties object forwarded to the V-ties/collapse stages
    :param nproc: number of worker processes
    """
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)
    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input, process_vdj, aggregate_vdj, nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample,
                              result['reason'])
    alignments = alignments['success']

    if alignments:
        # Sample-wide averages drive the V-ties realignment below.
        # NOTE(review): plain / truncates under Python 2; other functions
        # in this file use float() division -- confirm Python 3 target.
        avg_len = (sum([v.v_length for v in alignments]) / len(alignments))
        avg_mut = (sum([v.v_mutation_fraction for v in alignments]) /
                   len(alignments))
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments), round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()

        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments, process_vties, aggregate_vties, nproc,
            process_args={
                'aligner': aligner,
                'avg_len': avg_len,
                'avg_mut': avg_mut,
                'props': props
            },
        )
        logger.info('Adding noresults')
        # presumably commits every 100 records; verify funcs.periodic_commit
        for result in funcs.periodic_commit(session, v_ties['noresult'],
                                            100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])
        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we arent copying everything between processes
        concurrent.process_data([list(v) for v in v_ties['success']],
                                process_collapse, aggregate_collapse, nproc,
                                aggregate_args={
                                    'db_config': db_config,
                                    'sample_id': sample.id,
                                    'props': props
                                })
        # Drop cached ORM state so the summary queries see fresh rows.
        session.expire_all()
        session.commit()

    # Summary: total identified copies vs. no-result records.
    identified = int(
        session.query(func.sum(Sequence.copy_number)).filter(
            Sequence.sample == sample).scalar() or 0)
    noresults = int(
        session.query(func.count(
            NoResult.pk)).filter(NoResult.sample == sample).scalar() or 0)
    if identified + noresults:
        frac = int(100 * identified / (identified + noresults))
    else:
        frac = 0
    logger.info(
        'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
            sample.name, round((time.time() - start) / 60., 1), identified,
            identified + noresults, frac))
    session.close()
def process_sample(db_config, v_germlines, j_germlines, path, meta, props,
                   nproc):
    """Identify all sequences in one sample end-to-end.

    Pipeline stages, each parallelized across ``nproc`` processes with
    ``concurrent.process_data``: (1) initial V/J anchor alignment,
    (2) realignment to V-ties based on the sample's average V length and
    mutation fraction, (3) duplicate collapsing.  No-results are stored
    after stages 1 and 2, and a completion summary is logged.

    :param db_config: database configuration for ``config.init_db``
    :param v_germlines: V germline set for the aligner
    :param j_germlines: J germline set for the aligner
    :param path: input path handed to the read stage
    :param meta: sample metadata dict (must contain ``'sample_name'``)
    :param props: properties forwarded to the V-ties/collapse stages
    :param nproc: number of worker processes
    """
    session = config.init_db(db_config)
    start = time.time()
    logger.info('Starting sample {}'.format(meta['sample_name']))
    sample = setup_sample(session, meta)
    aligner = AnchorAligner(v_germlines, j_germlines)

    # Initial VJ assignment
    alignments = concurrent.process_data(
        read_input, process_vdj, aggregate_vdj, nproc,
        process_args={'aligner': aligner},
        generate_args={'path': path},
    )
    logger.info('Adding noresults')
    for result in alignments['noresult']:
        add_noresults_for_vdj(session, result['vdj'], sample,
                              result['reason'])
    alignments = alignments['success']

    if alignments:
        # Averages below parameterize the V-ties realignment.
        # NOTE(review): plain / truncates on Python 2 -- confirm the
        # intended interpreter (sibling code uses float() division).
        avg_len = (
            sum([v.v_length for v in alignments]) / len(alignments))
        avg_mut = (
            sum([v.v_mutation_fraction for v in alignments]) /
            len(alignments)
        )
        sample.v_ties_mutations = avg_mut
        sample.v_ties_len = avg_len
        logger.info('Re-aligning {} sequences to V-ties: Mutations={}, '
                    'Length={}'.format(len(alignments), round(avg_mut, 2),
                                       round(avg_len, 2)))
        session.commit()

        # Realign to V-ties
        v_ties = concurrent.process_data(
            alignments, process_vties, aggregate_vties, nproc,
            process_args={'aligner': aligner, 'avg_len': avg_len,
                          'avg_mut': avg_mut, 'props': props},
        )
        logger.info('Adding noresults')
        # presumably commits every 100 records; verify funcs.periodic_commit
        for result in funcs.periodic_commit(session, v_ties['noresult'],
                                            100):
            add_noresults_for_vdj(session, result['alignment'].sequence,
                                  sample, result['reason'])
        logger.info('Collapsing {} buckets'.format(len(v_ties['success'])))
        session.commit()

        # TODO: Change this so we arent copying everything between processes
        concurrent.process_data(
            [list(v) for v in v_ties['success']], process_collapse,
            aggregate_collapse, nproc,
            aggregate_args={'db_config': db_config, 'sample_id': sample.id,
                            'props': props}
        )
        # Expire cached ORM objects so the summary queries hit the DB.
        session.expire_all()
        session.commit()

    # Summary: identified copy count vs. stored no-result rows.
    identified = int(session.query(
        func.sum(Sequence.copy_number)
    ).filter(
        Sequence.sample == sample
    ).scalar() or 0)
    noresults = int(session.query(
        func.count(NoResult.pk)
    ).filter(
        NoResult.sample == sample
    ).scalar() or 0)
    if identified + noresults:
        frac = int(100 * identified / (identified + noresults))
    else:
        frac = 0
    logger.info(
        'Completed sample {} in {}m - {}/{} ({}%) identified'.format(
            sample.name, round((time.time() - start) / 60., 1), identified,
            identified + noresults, frac
        )
    )
    session.close()