def remap70(fastq1_filename, do_counts=False):
    """ Run the mapping pipeline on one sample pair using temp70 files.

    All intermediate files are written next to the FASTQ input with a
    ``temp70.`` prefix; the reverse-read file name is derived from the
    forward one.

    :param fastq1_filename: path to the forward-read FASTQ file
    :param do_counts: when False, stop after remapping and report the
        maximum mapped counts; when True, run the full pipeline and
        report the E2 gap level.
    :return: result of get_max_mapped_counts() or get_gap_level(),
        depending on do_counts.
    """
    work_folder = os.path.dirname(fastq1_filename)
    fastq2_filename = get_reverse_filename(fastq1_filename)

    def work_file(name):
        # Every scratch file lives beside the input FASTQ.
        return os.path.join(work_folder, name)

    prelim_filename = work_file('temp70.prelim.csv')
    remap_filename = work_file('temp70.remap.csv')
    remap_counts_filename = work_file('temp70.remap_counts.csv')
    aligned_filename = work_file('temp70.aligned.csv')
    nuc_filename = work_file('temp70.nuc.csv')
    amino_filename = work_file('temp70.amino.csv')
    failed_align_filename = work_file('temp70.failed_align.csv')
    with open(prelim_filename, 'w+') as prelim_csv, \
            open(remap_filename, 'w+') as remap_csv, \
            open(remap_counts_filename, 'w+') as remap_counts_csv, \
            open(aligned_filename, 'w+') as aligned_csv, \
            open(nuc_filename, 'w+') as nuc_csv, \
            open(amino_filename, 'w+') as amino_csv, \
            open(failed_align_filename, 'w+') as failed_align_csv, \
            open(os.devnull, 'w+') as real_devnull:
        devnull = DevNullWrapper(real_devnull)
        prelim_map(fastq1_filename, fastq2_filename, prelim_csv)
        prelim_csv.seek(0)  # rewind so remap can read what was just written
        remap(fastq1_filename,
              fastq2_filename,
              prelim_csv,
              remap_csv,
              remap_counts_csv,
              devnull,
              devnull,
              devnull)
        if not do_counts:
            # Close to flush the counts before rereading the file by name.
            remap_counts_csv.close()
            return get_max_mapped_counts(remap_counts_filename)
        remap_csv.seek(0)
        sam2aln(remap_csv, aligned_csv, devnull, failed_align_csv)
        aligned_csv.seek(0)
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   devnull,
                   devnull,
                   devnull,
                   devnull)
        return get_gap_level(amino_filename, 'E2', 14)
def test_insertion(self):
    """ A 6-base insertion shared by both mates is removed from the merged
    sequence and reported once per read direction in the insertion CSV.
    """
    sam_lines = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,V3LOOP,1,44,12M6I14M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,V3LOOP,1,44,12M6I14M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_aligned_csv = """\
refname,qcut,rank,count,offset,seq
V3LOOP,15,0,1,0,TGTACAAGACCCAATACAAGAAAAAG
"""
    expected_insert_csv = """\
qname,fwd_rev,refname,pos,insert,qual
Example_read_1,F,V3LOOP,12,AACAAC,AAAAAA
Example_read_1,R,V3LOOP,12,AACAAC,AAAAAA
"""
    aligned_report = StringIO()
    insert_report = StringIO()

    sam2aln(sam_lines, aligned_report, insert_report)

    self.assertMultiLineEqual(expected_aligned_csv, aligned_report.getvalue())
    self.assertMultiLineEqual(expected_insert_csv, insert_report.getvalue())
def test_low_mapq(self):
    """ We no longer fail reads because of low mapq.

    When we use more than one reference, reads can receive low mapq if
    they are in a conserved region that matches more than one reference.
    """
    sam_lines = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,V3LOOP,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,V3LOOP,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_2,99,INT,1,8,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_2,147,INT,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_aligned_csv = """\
refname,qcut,rank,count,offset,seq
INT,15,0,1,0,TGTACAAGACCCAACAACAATACAAGAAAAAG
V3LOOP,15,0,1,0,TGTACAAGACCCAACAACAATACAAGAAAAAG
"""
    # Header only: nothing should be reported as failed.
    expected_failed_csv = """\
qname,cause
"""
    aligned_report = StringIO()
    failed_report = StringIO()

    sam2aln(sam_lines, aligned_report, failed_csv=failed_report)

    self.assertMultiLineEqual(expected_aligned_csv, aligned_report.getvalue())
    self.assertMultiLineEqual(expected_failed_csv, failed_report.getvalue())
def test_low_read_quality(self):
    """ A pair whose bases are mostly censored to N is rejected as 'manyNs',
    while a clean pair on another reference still aligns.
    """
    sam_lines = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,V3LOOP,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,000000000000000000AAAAAAAAAAAAAA
Example_read_1,147,V3LOOP,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,000000000000000000AAAAAAAAAAAAAA
Example_read_2,99,INT,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_2,147,INT,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_aligned_csv = """\
refname,qcut,rank,count,offset,seq
INT,15,0,1,0,TGTACAAGACCCAACAACAATACAAGAAAAAG
"""
    expected_failed_csv = """\
qname,cause
Example_read_1,manyNs
"""
    aligned_report = StringIO()
    failed_report = StringIO()

    sam2aln(sam_lines, aligned_report, failed_csv=failed_report)

    self.assertMultiLineEqual(expected_aligned_csv, aligned_report.getvalue())
    self.assertMultiLineEqual(expected_failed_csv, failed_report.getvalue())
def test_escaping(self):
    """ CSV-quoted quality strings (embedded comma and doubled quote) are
    parsed correctly; the low-quality middle base is masked to N.
    """
    sam_lines = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,V3LOOP,1,44,3M,=,1,3,TGT,"A,A"
Example_read_1,147,V3LOOP,1,44,3M,=,1,-3,TGT,"A""A"
""")
    expected_aligned_csv = """\
refname,qcut,rank,count,offset,seq
V3LOOP,15,0,1,0,TNT
"""
    aligned_report = StringIO()

    sam2aln(sam_lines, aligned_report)

    self.assertMultiLineEqual(expected_aligned_csv, aligned_report.getvalue())
def test_soft_clipping(self):
    """ Soft-clipped bases are dropped from the aligned sequence, and every
    reference position they would have covered is counted in clipping.csv.
    """
    sam_lines = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,V3LOOP,18,44,7S10M9S,=,1,-32,TGTACAAGACCCAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,V3LOOP,18,44,3S10M13S,=,1,-32,CAAGACCCAATACAAGAAAAAGCAAC,AAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_aligned_csv = """\
refname,qcut,rank,count,offset,seq
V3LOOP,15,0,1,17,GACCCAATAC
"""
    expected_clipping_csv = """\
refname,pos,count
V3LOOP,11,1
V3LOOP,12,1
V3LOOP,13,1
V3LOOP,14,1
V3LOOP,15,1
V3LOOP,16,1
V3LOOP,17,1
V3LOOP,28,1
V3LOOP,29,1
V3LOOP,30,1
V3LOOP,31,1
V3LOOP,32,1
V3LOOP,33,1
V3LOOP,34,1
V3LOOP,35,1
V3LOOP,36,1
V3LOOP,37,1
V3LOOP,38,1
V3LOOP,39,1
V3LOOP,40,1
"""
    aligned_report = StringIO()
    clipping_report = StringIO()

    sam2aln(sam_lines, aligned_report, clipping_csv=clipping_report)

    self.assertMultiLineEqual(expected_aligned_csv, aligned_report.getvalue())
    self.assertMultiLineEqual(expected_clipping_csv, clipping_report.getvalue())
def test_unmatched_read(self):
    """ A forward read with no mate produces no aligned output and is
    reported as 'unmatched'.
    """
    sam_lines = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,V3LOOP,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_aligned_csv = """\
refname,qcut,rank,count,offset,seq
"""
    expected_failed_csv = """\
qname,cause
Example_read_1,unmatched
"""
    aligned_report = StringIO()
    failed_report = StringIO()

    sam2aln(sam_lines, aligned_report, StringIO(), failed_report)

    self.assertMultiLineEqual(expected_aligned_csv, aligned_report.getvalue())
    self.assertMultiLineEqual(expected_failed_csv, failed_report.getvalue())
def test_different_references(self):
    """ Mates mapped to two different references cannot be merged; the
    pair is rejected with cause '2refs'.
    """
    sam_lines = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,V3LOOP,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,GP41,1,44,32M,=,1,-32,TGTACAAGACCCAACAACAATACAAGAAAAAG,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_aligned_csv = """\
refname,qcut,rank,count,offset,seq
"""
    expected_failed_csv = """\
qname,cause
Example_read_1,2refs
"""
    aligned_report = StringIO()
    failed_report = StringIO()

    sam2aln(sam_lines, aligned_report, failed_csv=failed_report)

    self.assertMultiLineEqual(expected_aligned_csv, aligned_report.getvalue())
    self.assertMultiLineEqual(expected_failed_csv, failed_report.getvalue())
def process_sample(sample_index, run_info, data_path, pssm):
    """ Process a single sample.

    Finds the sample's R1/R2 FASTQ files, censors bad cycles, then runs
    the mapping pipeline (prelim_map, remap, sam2aln, aln2counts), plots
    coverage, and runs G2P scoring when V3LOOP coverage scores well.

    :param sample_index: which sample to process from the session JSON
    :param run_info: run parameters loaded from the session JSON
    :param str data_path: the root folder for all BaseSpace data
    :param pssm: the pssm library for running G2P analysis
    """
    scratch_path = os.path.join(data_path, 'scratch')
    sample_info = run_info.samples[sample_index]
    sample_id = sample_info['Id']
    sample_name = sample_info['Name']
    sample_count = len(run_info.samples)
    sample_dir = os.path.join(data_path,
                              'input',
                              'samples',
                              sample_id,
                              'Data',
                              'Intensities',
                              'BaseCalls')
    if not os.path.exists(sample_dir):
        # Some runs keep the FASTQ files directly under the sample folder.
        sample_dir = os.path.join(data_path, 'input', 'samples', sample_id)
    sample_path = None
    for root, _dirs, files in os.walk(sample_dir):
        forward_reads = fnmatch.filter(files, '*_R1_*')
        if forward_reads:
            sample_path = os.path.join(root, forward_reads[0])
            break
    if sample_path is None:
        raise RuntimeError(
            'No R1 file found for sample id {}.'.format(sample_id))
    sample_path2 = sample_path.replace('_R1_', '_R2_')
    if not os.path.exists(sample_path2):
        raise RuntimeError('R2 file missing for sample id {}: {!r}.'.format(
            sample_id,
            sample_path2))
    logger.info('Processing sample %s (%d of %d): %s (%s).',
                sample_id,
                sample_index + 1,
                sample_count,
                sample_name,
                sample_path)
    sample_out_path = create_app_result(data_path,
                                        run_info,
                                        sample_info,
                                        description='Mapping results',
                                        suffix='_QC')
    sample_scratch_path = os.path.join(scratch_path, sample_name)
    makedirs(sample_scratch_path)

    def scratch_file(name):
        # Intermediate files that are not uploaded as app results.
        return os.path.join(sample_scratch_path, name)

    def result_file(name):
        # Files included in the QC app result.
        return os.path.join(sample_out_path, name)

    bad_cycles_path = os.path.join(scratch_path, 'bad_cycles.csv')
    censored_path1 = scratch_file('censored1.fastq')
    censor_sample(sample_path,
                  bad_cycles_path,
                  censored_path1,
                  scratch_file('read1_summary.csv'))
    censored_path2 = scratch_file('censored2.fastq')
    censor_sample(sample_path2,
                  bad_cycles_path,
                  censored_path2,
                  scratch_file('read2_summary.csv'))
    logger.info('Running prelim_map (%d of %d).',
                sample_index + 1,
                sample_count)
    with open(scratch_file('prelim.csv'), 'wb') as prelim_csv:
        prelim_map(censored_path1, censored_path2, prelim_csv)
    logger.info('Running remap (%d of %d).', sample_index + 1, sample_count)
    with open(scratch_file('prelim.csv'), 'rU') as prelim_csv, \
            open(scratch_file('remap.csv'), 'wb') as remap_csv, \
            open(result_file('remap_counts.csv'), 'wb') as counts_csv, \
            open(result_file('remap_conseq.csv'), 'wb') as conseq_csv, \
            open(result_file('unmapped1.fastq'), 'w') as unmapped1, \
            open(result_file('unmapped2.fastq'), 'w') as unmapped2:
        remap(censored_path1,
              censored_path2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              sample_scratch_path,
              nthreads=1)
    logger.info('Running sam2aln (%d of %d).', sample_index + 1, sample_count)
    with open(scratch_file('remap.csv'), 'rU') as remap_csv, \
            open(scratch_file('aligned.csv'), 'wb') as aligned_csv, \
            open(result_file('conseq_ins.csv'), 'wb') as insert_csv, \
            open(result_file('failed_read.csv'), 'wb') as failed_csv:
        sam2aln(remap_csv, aligned_csv, insert_csv, failed_csv)
    logger.info('Running aln2counts (%d of %d).',
                sample_index + 1,
                sample_count)
    with open(scratch_file('aligned.csv'), 'rU') as aligned_csv, \
            open(result_file('nuc.csv'), 'wb') as nuc_csv, \
            open(result_file('amino.csv'), 'wb') as amino_csv, \
            open(result_file('coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(result_file('conseq.csv'), 'wb') as conseq_csv, \
            open(result_file('failed_align.csv'), 'wb') as failed_align_csv, \
            open(result_file('nuc_variants.csv'), 'wb') as nuc_variants_csv, \
            open(scratch_file('coverage_summary.csv'), 'wb') as coverage_summary_csv:
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   coverage_summary_csv=coverage_summary_csv)
    logger.info('Running coverage_plots (%d of %d).',
                sample_index + 1,
                sample_count)
    coverage_path = result_file('coverage')
    with open(result_file('amino.csv'), 'rU') as amino_csv, \
            open(result_file('coverage_scores.csv'), 'w') as coverage_scores_csv:
        coverage_plot(amino_csv, coverage_scores_csv, path_prefix=coverage_path)
    # G2P scoring is only worthwhile when V3LOOP coverage scored well.
    with open(result_file('coverage_scores.csv'), 'rU') as coverage_scores_csv:
        reader = csv.DictReader(coverage_scores_csv)
        is_v3loop_good = False
        for row in reader:
            if row['region'] == 'V3LOOP':
                is_v3loop_good = row['on.score'] == '4'
                break
    if is_v3loop_good:
        logger.info('Running sam_g2p (%d of %d).',
                    sample_index + 1,
                    sample_count)
        g2p_path = create_app_result(data_path,
                                     run_info,
                                     sample_info,
                                     description='Geno To Pheno results',
                                     suffix='_G2P')
        with open(scratch_file('remap.csv'), 'rU') as remap_csv, \
                open(result_file('nuc.csv'), 'rU') as nuc_csv, \
                open(os.path.join(g2p_path, 'g2p.csv'), 'wb') as g2p_csv, \
                open(os.path.join(g2p_path, 'g2p_summary.csv'), 'wb') as g2p_summary_csv:
            sam_g2p(pssm=pssm,
                    remap_csv=remap_csv,
                    nuc_csv=nuc_csv,
                    g2p_csv=g2p_csv,
                    g2p_summary_csv=g2p_summary_csv,
                    min_count=DEFAULT_MIN_COUNT)
def process_sample(sample_index, run_info, data_path, pssm):
    """ Process a single sample.

    Locates the R1/R2 FASTQ pair for the sample, censors bad cycles, and
    drives the pipeline stages in order: prelim_map, remap, sam2aln,
    aln2counts, coverage plots, and (if V3LOOP coverage is good) sam_g2p.

    :param sample_index: which sample to process from the session JSON
    :param run_info: run parameters loaded from the session JSON
    :param str data_path: the root folder for all BaseSpace data
    :param pssm: the pssm library for running G2P analysis
    """
    scratch_path = os.path.join(data_path, 'scratch')
    sample_info = run_info.samples[sample_index]
    sample_id = sample_info['Id']
    sample_name = sample_info['Name']
    total_samples = len(run_info.samples)
    sample_dir = os.path.join(data_path,
                              'input',
                              'samples',
                              sample_id,
                              'Data',
                              'Intensities',
                              'BaseCalls')
    if not os.path.exists(sample_dir):
        # Fall back to the flat layout without the BaseCalls subtree.
        sample_dir = os.path.join(data_path, 'input', 'samples', sample_id)
    sample_path = None
    for root, _dirs, files in os.walk(sample_dir):
        r1_names = fnmatch.filter(files, '*_R1_*')
        if r1_names:
            sample_path = os.path.join(root, r1_names[0])
            break
    if sample_path is None:
        raise RuntimeError('No R1 file found for sample id {}.'.format(sample_id))
    sample_path2 = sample_path.replace('_R1_', '_R2_')
    if not os.path.exists(sample_path2):
        raise RuntimeError('R2 file missing for sample id {}: {!r}.'.format(
            sample_id, sample_path2))
    logger.info('Processing sample %s (%d of %d): %s (%s).',
                sample_id,
                sample_index+1,
                total_samples,
                sample_name,
                sample_path)
    sample_out_path = create_app_result(data_path,
                                        run_info,
                                        sample_info,
                                        description='Mapping results',
                                        suffix='_QC')
    sample_scratch_path = os.path.join(scratch_path, sample_name)
    makedirs(sample_scratch_path)

    def in_scratch(name):
        # Path for a private intermediate file.
        return os.path.join(sample_scratch_path, name)

    def in_results(name):
        # Path for a file published with the QC app result.
        return os.path.join(sample_out_path, name)

    bad_cycles = os.path.join(scratch_path, 'bad_cycles.csv')
    censored_path1 = in_scratch('censored1.fastq')
    censor_sample(sample_path,
                  bad_cycles,
                  censored_path1,
                  in_scratch('read1_summary.csv'))
    censored_path2 = in_scratch('censored2.fastq')
    censor_sample(sample_path2,
                  bad_cycles,
                  censored_path2,
                  in_scratch('read2_summary.csv'))
    logger.info('Running prelim_map (%d of %d).', sample_index+1, total_samples)
    with open(in_scratch('prelim.csv'), 'wb') as prelim_csv:
        prelim_map(censored_path1, censored_path2, prelim_csv)
    logger.info('Running remap (%d of %d).', sample_index+1, total_samples)
    with open(in_scratch('prelim.csv'), 'rU') as prelim_csv, \
            open(in_scratch('remap.csv'), 'wb') as remap_csv, \
            open(in_results('remap_counts.csv'), 'wb') as counts_csv, \
            open(in_results('remap_conseq.csv'), 'wb') as conseq_csv, \
            open(in_results('unmapped1.fastq'), 'w') as unmapped1, \
            open(in_results('unmapped2.fastq'), 'w') as unmapped2:
        remap(censored_path1,
              censored_path2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              sample_scratch_path,
              nthreads=1)
    logger.info('Running sam2aln (%d of %d).', sample_index+1, total_samples)
    with open(in_scratch('remap.csv'), 'rU') as remap_csv, \
            open(in_scratch('aligned.csv'), 'wb') as aligned_csv, \
            open(in_results('conseq_ins.csv'), 'wb') as insert_csv, \
            open(in_results('failed_read.csv'), 'wb') as failed_csv:
        sam2aln(remap_csv, aligned_csv, insert_csv, failed_csv)
    logger.info('Running aln2counts (%d of %d).', sample_index+1, total_samples)
    with open(in_scratch('aligned.csv'), 'rU') as aligned_csv, \
            open(in_results('nuc.csv'), 'wb') as nuc_csv, \
            open(in_results('amino.csv'), 'wb') as amino_csv, \
            open(in_results('coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(in_results('conseq.csv'), 'wb') as conseq_csv, \
            open(in_results('failed_align.csv'), 'wb') as failed_align_csv, \
            open(in_results('nuc_variants.csv'), 'wb') as nuc_variants_csv, \
            open(in_scratch('coverage_summary.csv'), 'wb') as coverage_summary_csv:
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   coverage_summary_csv=coverage_summary_csv)
    logger.info('Running coverage_plots (%d of %d).', sample_index+1, total_samples)
    coverage_path = in_results('coverage')
    with open(in_results('amino.csv'), 'rU') as amino_csv, \
            open(in_results('coverage_scores.csv'), 'w') as coverage_scores_csv:
        coverage_plot(amino_csv, coverage_scores_csv, path_prefix=coverage_path)
    # Only run G2P scoring when the V3LOOP region scored a 4.
    with open(in_results('coverage_scores.csv'), 'rU') as coverage_scores_csv:
        reader = csv.DictReader(coverage_scores_csv)
        is_v3loop_good = False
        for row in reader:
            if row['region'] == 'V3LOOP':
                is_v3loop_good = row['on.score'] == '4'
                break
    if is_v3loop_good:
        logger.info('Running sam_g2p (%d of %d).', sample_index+1, total_samples)
        g2p_path = create_app_result(data_path,
                                     run_info,
                                     sample_info,
                                     description='Geno To Pheno results',
                                     suffix='_G2P')
        with open(in_scratch('remap.csv'), 'rU') as remap_csv, \
                open(in_results('nuc.csv'), 'rU') as nuc_csv, \
                open(os.path.join(g2p_path, 'g2p.csv'), 'wb') as g2p_csv, \
                open(os.path.join(g2p_path, 'g2p_summary.csv'), 'wb') as g2p_summary_csv:
            sam_g2p(pssm=pssm,
                    remap_csv=remap_csv,
                    nuc_csv=nuc_csv,
                    g2p_csv=g2p_csv,
                    g2p_summary_csv=g2p_summary_csv,
                    min_count=DEFAULT_MIN_COUNT)
def process_sample(sample_index, run_info, args, pssm):
    """ Process a single sample.

    Runs the full pipeline for one sample: trim, fastq_g2p, prelim_map,
    remap, sam2aln, aln2counts, coverage plots, hivdb resistance calls,
    a PDF resistance report, and a cascade report.

    :param sample_index: which sample to process from the session JSON
    :param run_info: run parameters loaded from the session JSON
    :param args: the command-line arguments
    :param pssm: the pssm library for running G2P analysis
    """
    scratch_path = os.path.join(args.data_path, 'scratch')
    sample_info = run_info.samples[sample_index]
    sample_id = sample_info['Id']
    sample_name = sample_info['Name']
    sample_count = len(run_info.samples)
    sample_dir = os.path.join(args.data_path,
                              'input',
                              'samples',
                              sample_id,
                              'Data',
                              'Intensities',
                              'BaseCalls')
    if not os.path.exists(sample_dir):
        # Fall back to the flat layout without the BaseCalls subtree.
        sample_dir = os.path.join(args.data_path, 'input', 'samples', sample_id)
    sample_path = None
    for root, _dirs, files in os.walk(sample_dir):
        forward_reads = fnmatch.filter(files, '*_R1_*')
        if forward_reads:
            sample_path = os.path.join(root, forward_reads[0])
            break
    if sample_path is None:
        raise RuntimeError(
            'No R1 file found for sample id {}.'.format(sample_id))
    sample_path2 = sample_path.replace('_R1_', '_R2_')
    if not os.path.exists(sample_path2):
        raise RuntimeError('R2 file missing for sample id {}: {!r}.'.format(
            sample_id,
            sample_path2))
    logger.info('Processing sample %s (%d of %d): %s (%s).',
                sample_id,
                sample_index + 1,
                sample_count,
                sample_name,
                sample_path)
    sample_qc_path = os.path.join(args.qc_path, sample_name)
    makedirs(sample_qc_path)
    sample_scratch_path = os.path.join(scratch_path, sample_name)
    makedirs(sample_scratch_path)

    def scratch_file(name):
        # All intermediate files for this sample live in its scratch folder.
        return os.path.join(sample_scratch_path, name)

    bad_cycles_path = os.path.join(scratch_path, 'bad_cycles.csv')
    trimmed_path1 = scratch_file('trimmed1.fastq')
    trimmed_path2 = scratch_file('trimmed2.fastq')
    with open(scratch_file('read_summary.csv'), 'w') as read_summary:
        trim((sample_path, sample_path2),
             bad_cycles_path,
             (trimmed_path1, trimmed_path2),
             summary_file=read_summary,
             use_gzip=sample_path.endswith('.gz'))
    logger.info('Running fastq_g2p (%d of %d).',
                sample_index + 1,
                sample_count)
    g2p_unmapped1_path = scratch_file('g2p_unmapped1.fastq')
    g2p_unmapped2_path = scratch_file('g2p_unmapped2.fastq')
    with open(scratch_file('trimmed1.fastq'), 'r') as fastq1, \
            open(scratch_file('trimmed2.fastq'), 'r') as fastq2, \
            open(scratch_file('g2p.csv'), 'w') as g2p_csv, \
            open(scratch_file('g2p_summary.csv'), 'w') as g2p_summary_csv, \
            open(g2p_unmapped1_path, 'w') as g2p_unmapped1, \
            open(g2p_unmapped2_path, 'w') as g2p_unmapped2, \
            open(scratch_file('g2p_aligned.csv'), 'w') as g2p_aligned_csv:
        fastq_g2p(pssm=pssm,
                  fastq1=fastq1,
                  fastq2=fastq2,
                  g2p_csv=g2p_csv,
                  g2p_summary_csv=g2p_summary_csv,
                  unmapped1=g2p_unmapped1,
                  unmapped2=g2p_unmapped2,
                  aligned_csv=g2p_aligned_csv,
                  min_count=DEFAULT_MIN_COUNT,
                  min_valid=MIN_VALID,
                  min_valid_percent=MIN_VALID_PERCENT)
    logger.info('Running prelim_map (%d of %d).',
                sample_index + 1,
                sample_count)
    excluded_seeds = [] if args.all_projects else EXCLUDED_SEEDS
    with open(scratch_file('prelim.csv'), 'w') as prelim_csv:
        prelim_map(g2p_unmapped1_path,
                   g2p_unmapped2_path,
                   prelim_csv,
                   work_path=sample_scratch_path,
                   excluded_seeds=excluded_seeds)
    logger.info('Running remap (%d of %d).', sample_index + 1, sample_count)
    if args.debug_remap:
        debug_file_prefix = scratch_file('debug')
    else:
        debug_file_prefix = None
    with open(scratch_file('prelim.csv'), 'r') as prelim_csv, \
            open(scratch_file('remap.csv'), 'w') as remap_csv, \
            open(scratch_file('remap_counts.csv'), 'w') as counts_csv, \
            open(scratch_file('remap_conseq.csv'), 'w') as conseq_csv, \
            open(os.path.join(sample_qc_path, 'unmapped1.fastq'), 'w') as unmapped1, \
            open(os.path.join(sample_qc_path, 'unmapped2.fastq'), 'w') as unmapped2:
        remap(g2p_unmapped1_path,
              g2p_unmapped2_path,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              sample_scratch_path,
              debug_file_prefix=debug_file_prefix)
    logger.info('Running sam2aln (%d of %d).', sample_index + 1, sample_count)
    with open(scratch_file('remap.csv'), 'r') as remap_csv, \
            open(scratch_file('aligned.csv'), 'w') as aligned_csv, \
            open(scratch_file('conseq_ins.csv'), 'w') as conseq_ins_csv, \
            open(scratch_file('failed_read.csv'), 'w') as failed_csv, \
            open(scratch_file('clipping.csv'), 'w') as clipping_csv:
        sam2aln(remap_csv,
                aligned_csv,
                conseq_ins_csv,
                failed_csv,
                clipping_csv=clipping_csv)
    logger.info('Running aln2counts (%d of %d).',
                sample_index + 1,
                sample_count)
    with open(scratch_file('aligned.csv'), 'r') as aligned_csv, \
            open(scratch_file('g2p_aligned.csv'), 'r') as g2p_aligned_csv, \
            open(scratch_file('clipping.csv'), 'r') as clipping_csv, \
            open(scratch_file('conseq_ins.csv'), 'r') as conseq_ins_csv, \
            open(scratch_file('remap_conseq.csv'), 'r') as remap_conseq_csv, \
            open(scratch_file('nuc.csv'), 'w') as nuc_csv, \
            open(scratch_file('amino.csv'), 'w') as amino_csv, \
            open(scratch_file('coord_ins.csv'), 'w') as coord_ins_csv, \
            open(scratch_file('conseq.csv'), 'w') as conseq_csv, \
            open(scratch_file('failed_align.csv'), 'w') as failed_align_csv, \
            open(scratch_file('coverage_summary.csv'), 'w') as coverage_summary_csv:
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   coverage_summary_csv=coverage_summary_csv,
                   clipping_csv=clipping_csv,
                   conseq_ins_csv=conseq_ins_csv,
                   g2p_aligned_csv=g2p_aligned_csv,
                   remap_conseq_csv=remap_conseq_csv)
    logger.info('Running coverage_plots (%d of %d).',
                sample_index + 1,
                sample_count)
    coverage_maps_path = os.path.join(args.qc_path, 'coverage_maps')
    makedirs(coverage_maps_path)
    excluded_projects = [] if args.all_projects else EXCLUDED_PROJECTS
    with open(scratch_file('amino.csv'), 'r') as amino_csv, \
            open(scratch_file('coverage_scores.csv'), 'w') as coverage_scores_csv:
        coverage_plot(amino_csv,
                      coverage_scores_csv,
                      coverage_maps_path=coverage_maps_path,
                      coverage_maps_prefix=sample_name,
                      excluded_projects=excluded_projects)
    logger.info('Running hivdb (%d of %d).', sample_index + 1, sample_count)
    with open(scratch_file('amino.csv')) as amino_csv, \
            open(scratch_file('coverage_scores.csv')) as coverage_scores_csv, \
            open(scratch_file('resistance.csv'), 'w') as resistance_csv, \
            open(scratch_file('mutations.csv'), 'w') as mutations_csv:
        hivdb(amino_csv,
              coverage_scores_csv,
              resistance_csv,
              mutations_csv,
              run_info.reports)
    logger.info('Running resistance report (%d of %d).',
                sample_index + 1,
                sample_count)
    # Stamp the report with the release version, or a dev marker when
    # version.txt has not been generated.
    version_filename = os.path.join(os.path.dirname(__file__), 'version.txt')
    if not os.path.exists(version_filename):
        git_version = 'v0-dev'
    else:
        with open(version_filename) as version_file:
            git_version = version_file.read().strip()
    reports_path = os.path.join(args.qc_path, 'resistance_reports')
    makedirs(reports_path)
    report_filename = os.path.join(reports_path,
                                   sample_name + '_resistance.pdf')
    with open(scratch_file('resistance.csv')) as resistance_csv, \
            open(scratch_file('mutations.csv')) as mutations_csv, \
            open(report_filename, 'wb') as report_pdf:
        gen_report(resistance_csv,
                   mutations_csv,
                   report_pdf,
                   sample_name,
                   git_version=git_version)
    logger.info('Running cascade_report (%d of %d).',
                sample_index + 1,
                sample_count)
    with open(scratch_file('g2p_summary.csv'), 'r') as g2p_summary_csv, \
            open(scratch_file('remap_counts.csv'), 'r') as remap_counts_csv, \
            open(scratch_file('aligned.csv'), 'r') as aligned_csv, \
            open(scratch_file('cascade.csv'), 'w') as cascade_csv:
        cascade_report = CascadeReport(cascade_csv)
        cascade_report.g2p_summary_csv = g2p_summary_csv
        cascade_report.remap_counts_csv = remap_counts_csv
        cascade_report.aligned_csv = aligned_csv
        cascade_report.generate()
    logger.info('Finished sample (%d of %d).',
                sample_index + 1,
                sample_count)
def process_sample(self, fastq1, progress, prefixes, image_paths, error_log):
    """ Run the full pipeline on one FASTQ pair from the GUI worker.

    :param fastq1: path to the forward-read FASTQ; the reverse file and
        the output prefix are derived from it.
    :param progress: progress text shown to the user.
    :param prefixes: list that the sample's prefix is appended to.
    :param image_paths: list that coverage-plot paths are appended to.
    :param error_log: open file that child processes write stderr to.
    :raises IOError: if the matching R2 file does not exist.
    """
    fastq2 = fastq1.replace('_R1_001', '_R2_001').replace('censored1',
                                                          'censored2')
    if not os.path.exists(fastq2):
        raise IOError('ERROR: Missing R2 file for {}'.format(fastq1))
    prefix = os.path.basename(fastq1).replace('_L001_R1_001.fastq',
                                              '').replace('.censored1.fastq',
                                                          '')
    prefixes.append(prefix)
    output_csv = prefix + '.prelim.csv'
    self.write('Processing sample {} ({})\n'.format(prefix, progress))
    with open(output_csv, 'wb') as handle:
        prelim_map(fastq1,
                   fastq2,
                   handle,
                   nthreads=self.nthreads,
                   callback=self.callback,
                   stderr=error_log)

    def work_file(suffix):
        # Per-sample file in the working directory, named after the prefix.
        return os.path.join(self.workdir, prefix + suffix)

    # prepare file handles for remap stage
    with open(output_csv, 'rU') as prelim_csv, \
            open(work_file('.remap.csv'), 'wb') as remap_csv, \
            open(work_file('.remap_counts.csv'), 'wb') as counts_csv, \
            open(work_file('.remap_conseq.csv'), 'wb') as conseq_csv, \
            open(work_file('.unmapped1.fastq'), 'w') as unmapped1, \
            open(work_file('.unmapped2.fastq'), 'w') as unmapped2:
        self.write('... remapping\n')
        self.parent.update()
        self.progress_bar['value'] = 0
        remap(fastq1,
              fastq2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              self.workdir,
              nthreads=self.nthreads,
              callback=self.callback,
              stderr=error_log)
    # prepare file handles for conversion from SAM format to alignment
    with open(work_file('.remap.csv'), 'rU') as remap_csv, \
            open(work_file('.aligned.csv'), 'wb') as aligned_csv, \
            open(work_file('.insert.csv'), 'wb') as insert_csv, \
            open(work_file('.failed.csv'), 'wb') as failed_csv:
        self.write('... converting into alignment\n')
        self.parent.update()
        sam2aln(remap_csv,
                aligned_csv,
                insert_csv,
                failed_csv,
                nthreads=self.nthreads)
    with open(work_file('.aligned.csv'), 'rU') as aligned_csv, \
            open(work_file('.nuc.csv'), 'wb') as nuc_csv, \
            open(work_file('.amino.csv'), 'wb') as amino_csv, \
            open(work_file('.coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(work_file('.conseq.csv'), 'wb') as conseq_csv, \
            open(work_file('.failed_align.csv'), 'wb') as failed_align_csv, \
            open(work_file('.nuc_variants.csv'), 'wb') as nuc_variants_csv:
        self.parent.update()
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   callback=self.callback)
    self.write('... generating coverage plots\n')
    self.parent.update()
    with open(work_file('.amino.csv'), 'rU') as amino_csv:
        image_paths += coverage_plot(amino_csv)
    self.write('... performing g2p scoring on samples covering HIV-1 V3\n')
    self.parent.update()
    with open(work_file('.remap.csv'), 'rU') as remap_csv, \
            open(work_file('.nuc.csv'), 'rU') as nuc_csv, \
            open(work_file('.g2p.csv'), 'wb') as g2p_csv:
        sam_g2p(pssm=self.pssm,
                remap_csv=remap_csv,
                nuc_csv=nuc_csv,
                g2p_csv=g2p_csv)
def process_sample(self, fastq1, progress, prefixes, image_paths, error_log):
    """Run every pipeline stage for one sample's paired FASTQ files.

    The stages run in sequence — preliminary mapping, remapping,
    SAM-to-alignment conversion, counting, coverage plotting, and G2P
    scoring — with each stage reading the previous stage's CSV output.

    :param fastq1: path to the forward (R1) FASTQ file; the R2 path is
        derived from it by name substitution.
    :param progress: progress text shown in the status message.
    :param prefixes: list that this sample's name prefix is appended to.
    :param image_paths: list extended with coverage plot image paths.
    :param error_log: open file passed as ``stderr`` to the mapping steps.
    :raises IOError: if the derived R2 file does not exist.
    """
    fastq2 = fastq1.replace('_R1_001', '_R2_001').replace('censored1', 'censored2')
    if not os.path.exists(fastq2):
        raise IOError('ERROR: Missing R2 file for {}'.format(fastq1))

    # Strip the standard FASTQ suffixes to get the sample's name prefix.
    sample_prefix = os.path.basename(fastq1)
    sample_prefix = sample_prefix.replace('_L001_R1_001.fastq', '')
    sample_prefix = sample_prefix.replace('.censored1.fastq', '')
    prefixes.append(sample_prefix)
    output_csv = sample_prefix + '.prelim.csv'
    self.write('Processing sample {} ({})\n'.format(sample_prefix, progress))

    def work_path(suffix):
        # Every intermediate file for this sample lives in the work folder.
        return os.path.join(self.workdir, sample_prefix + suffix)

    # Stage 1: preliminary mapping of the read pair.
    with open(output_csv, 'wb') as prelim_out:
        prelim_map(fastq1,
                   fastq2,
                   prelim_out,
                   nthreads=self.nthreads,
                   callback=self.callback,
                   stderr=error_log)

    # Stage 2: remap reads against the consensus of the preliminary map.
    with open(output_csv, 'rU') as prelim_csv, \
            open(work_path('.remap.csv'), 'wb') as remap_csv, \
            open(work_path('.remap_counts.csv'), 'wb') as counts_csv, \
            open(work_path('.remap_conseq.csv'), 'wb') as conseq_csv, \
            open(work_path('.unmapped1.fastq'), 'w') as unmapped1, \
            open(work_path('.unmapped2.fastq'), 'w') as unmapped2:
        self.write('... remapping\n')
        self.parent.update()
        self.progress_bar['value'] = 0
        remap(fastq1,
              fastq2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              self.workdir,
              nthreads=self.nthreads,
              callback=self.callback,
              stderr=error_log)

    # Stage 3: convert remapped SAM rows into aligned sequences.
    with open(work_path('.remap.csv'), 'rU') as remap_csv, \
            open(work_path('.aligned.csv'), 'wb') as aligned_csv, \
            open(work_path('.insert.csv'), 'wb') as insert_csv, \
            open(work_path('.failed.csv'), 'wb') as failed_csv:
        self.write('... converting into alignment\n')
        self.parent.update()
        sam2aln(remap_csv,
                aligned_csv,
                insert_csv,
                failed_csv,
                nthreads=self.nthreads)

    # Stage 4: count nucleotides and amino acids from the alignment.
    with open(work_path('.aligned.csv'), 'rU') as aligned_csv, \
            open(work_path('.nuc.csv'), 'wb') as nuc_csv, \
            open(work_path('.amino.csv'), 'wb') as amino_csv, \
            open(work_path('.coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(work_path('.conseq.csv'), 'wb') as conseq_csv, \
            open(work_path('.failed_align.csv'), 'wb') as failed_align_csv, \
            open(work_path('.nuc_variants.csv'), 'wb') as nuc_variants_csv:
        self.parent.update()
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   callback=self.callback)

    # Stage 5: coverage plots from the amino counts.
    self.write('... generating coverage plots\n')
    self.parent.update()
    with open(work_path('.amino.csv'), 'rU') as amino_csv:
        image_paths += coverage_plot(amino_csv)

    # Stage 6: G2P scoring.
    self.write('... performing g2p scoring on samples covering HIV-1 V3\n')
    self.parent.update()
    with open(work_path('.remap.csv'), 'rU') as remap_csv, \
            open(work_path('.nuc.csv'), 'rU') as nuc_csv, \
            open(work_path('.g2p.csv'), 'wb') as g2p_csv:
        sam_g2p(pssm=self.pssm,
                remap_csv=remap_csv,
                nuc_csv=nuc_csv,
                g2p_csv=g2p_csv)
def process(self,
            pssm,
            excluded_seeds=(),
            excluded_projects=(),
            force_gzip=False):
    """ Process a single sample.

    Runs the full pipeline in order: trim, fastq_g2p, prelim_map, remap,
    sam2aln, aln2counts, coverage plots, and the cascade report. Each
    stage's outputs are written to the paths held on ``self`` and re-read
    from disk by the following stage.

    :param pssm: the pssm library for running G2P analysis
    :param excluded_seeds: seeds to exclude from mapping
    :param excluded_projects: project codes to exclude from reporting
    :param bool force_gzip: treat FASTQ files as gzipped, even when they
        don't end in .gz
    """
    logger.info('Processing %s (%r).', self, self.fastq1)
    # Scratch directory is the folder that will hold the prelim CSV.
    # NOTE(review): os.mkdir raises if the directory already exists or its
    # parent is missing — presumably a fresh run guarantees neither; verify.
    scratch_path = os.path.dirname(self.prelim_csv)
    os.mkdir(scratch_path)
    use_gzip = force_gzip or self.fastq1.endswith('.gz')
    # Trim/censor the raw reads before any mapping.
    with open(self.read_summary_csv, 'w') as read_summary:
        trim((self.fastq1, self.fastq2),
             self.bad_cycles_csv,
             (self.trimmed1_fastq, self.trimmed2_fastq),
             summary_file=read_summary,
             use_gzip=use_gzip)

    logger.info('Running fastq_g2p on %s.', self)
    with open(self.trimmed1_fastq) as fastq1, \
            open(self.trimmed2_fastq) as fastq2, \
            open(self.g2p_csv, 'w') as g2p_csv, \
            open(self.g2p_summary_csv, 'w') as g2p_summary_csv, \
            open(self.g2p_unmapped1_fastq, 'w') as g2p_unmapped1, \
            open(self.g2p_unmapped2_fastq, 'w') as g2p_unmapped2, \
            open(self.g2p_aligned_csv, 'w') as g2p_aligned_csv:
        fastq_g2p(pssm=pssm,
                  fastq1=fastq1,
                  fastq2=fastq2,
                  g2p_csv=g2p_csv,
                  g2p_summary_csv=g2p_summary_csv,
                  unmapped1=g2p_unmapped1,
                  unmapped2=g2p_unmapped2,
                  aligned_csv=g2p_aligned_csv,
                  min_count=DEFAULT_MIN_COUNT,
                  min_valid=MIN_VALID,
                  min_valid_percent=MIN_VALID_PERCENT)

    logger.info('Running prelim_map on %s.', self)
    # Only the reads fastq_g2p left unmapped are fed to the mapper.
    with open(self.prelim_csv, 'w') as prelim_csv:
        prelim_map(self.g2p_unmapped1_fastq,
                   self.g2p_unmapped2_fastq,
                   prelim_csv,
                   work_path=scratch_path,
                   excluded_seeds=excluded_seeds)

    logger.info('Running remap on %s.', self)
    # Optional debug dumps from the remap stage go into the scratch folder.
    if self.debug_remap:
        debug_file_prefix = os.path.join(scratch_path, 'debug')
    else:
        debug_file_prefix = None
    with open(self.prelim_csv) as prelim_csv, \
            open(self.remap_csv, 'w') as remap_csv, \
            open(self.remap_counts_csv, 'w') as counts_csv, \
            open(self.remap_conseq_csv, 'w') as conseq_csv, \
            open(self.unmapped1_fastq, 'w') as unmapped1, \
            open(self.unmapped2_fastq, 'w') as unmapped2:
        remap(self.g2p_unmapped1_fastq,
              self.g2p_unmapped2_fastq,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              scratch_path,
              debug_file_prefix=debug_file_prefix)

    logger.info('Running sam2aln on %s.', self)
    with open(self.remap_csv) as remap_csv, \
            open(self.aligned_csv, 'w') as aligned_csv, \
            open(self.conseq_ins_csv, 'w') as conseq_ins_csv, \
            open(self.failed_csv, 'w') as failed_csv, \
            open(self.clipping_csv, 'w') as clipping_csv:
        sam2aln(remap_csv,
                aligned_csv,
                conseq_ins_csv,
                failed_csv,
                clipping_csv=clipping_csv)

    logger.info('Running aln2counts on %s.', self)
    with open(self.aligned_csv) as aligned_csv, \
            open(self.g2p_aligned_csv) as g2p_aligned_csv, \
            open(self.clipping_csv) as clipping_csv, \
            open(self.conseq_ins_csv) as conseq_ins_csv, \
            open(self.remap_conseq_csv) as remap_conseq_csv, \
            open(self.nuc_csv, 'w') as nuc_csv, \
            open(self.amino_csv, 'w') as amino_csv, \
            open(self.coord_ins_csv, 'w') as coord_ins_csv, \
            open(self.conseq_csv, 'w') as conseq_csv, \
            open(self.conseq_region_csv, 'w') as conseq_region_csv, \
            open(self.failed_align_csv, 'w') as failed_align_csv, \
            open(self.coverage_summary_csv, 'w') as coverage_summary_csv:
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   coverage_summary_csv=coverage_summary_csv,
                   clipping_csv=clipping_csv,
                   conseq_ins_csv=conseq_ins_csv,
                   g2p_aligned_csv=g2p_aligned_csv,
                   remap_conseq_csv=remap_conseq_csv,
                   conseq_region_csv=conseq_region_csv)

    logger.info('Running coverage_plots on %s.', self)
    os.makedirs(self.coverage_maps)
    with open(self.amino_csv) as amino_csv, \
            open(self.coverage_scores_csv, 'w') as coverage_scores_csv:
        coverage_plot(amino_csv,
                      coverage_scores_csv,
                      coverage_maps_path=self.coverage_maps,
                      coverage_maps_prefix=self.name,
                      excluded_projects=excluded_projects)

    logger.info('Running cascade_report on %s.', self)
    # The cascade report aggregates outputs of the earlier stages; its
    # inputs are assigned as attributes before generate() is called.
    with open(self.g2p_summary_csv) as g2p_summary_csv, \
            open(self.remap_counts_csv) as remap_counts_csv, \
            open(self.aligned_csv) as aligned_csv, \
            open(self.cascade_csv, 'w') as cascade_csv:
        cascade_report = CascadeReport(cascade_csv)
        cascade_report.g2p_summary_csv = g2p_summary_csv
        cascade_report.remap_counts_csv = remap_counts_csv
        cascade_report.aligned_csv = aligned_csv
        cascade_report.generate()
    logger.info('Finished sample %s.', self)
def process(self,
            pssm,
            excluded_seeds=(),
            excluded_projects=(),
            force_gzip=False,
            use_denovo=False):
    """ Process a single sample.

    Runs the full pipeline: trim, (optionally) read-merge entropy analysis,
    fastq_g2p, then either de novo assembly or bowtie2 mapping, followed by
    sam2aln, aln2counts, plotting, and the cascade report. Stages exchange
    data through the file paths held on ``self``.

    :param pssm: the pssm library for running G2P analysis
    :param excluded_seeds: seeds to exclude from mapping
    :param excluded_projects: project codes to exclude from reporting
    :param bool force_gzip: treat FASTQ files as gzipped, even when they
        don't end in .gz
    :param bool use_denovo: True if de novo assembly should be used,
        instead of bowtie2 mapping against references.
    """
    logger.info('Processing %s (%r).', self, self.fastq1)
    scratch_path = self.get_scratch_path()
    makedirs(scratch_path)
    use_gzip = force_gzip or self.fastq1.endswith('.gz')
    sample_info = self.load_sample_info()
    # Trim/censor the raw reads before any downstream analysis.
    with open(self.read_summary_csv, 'w') as read_summary:
        trim((self.fastq1, self.fastq2),
             self.bad_cycles_csv,
             (self.trimmed1_fastq, self.trimmed2_fastq),
             summary_file=read_summary,
             use_gzip=use_gzip,
             skip=self.skip,
             project_code=sample_info.get('project'))

    if use_denovo:
        # De novo runs also report merged-read length entropy up front.
        logger.info('Running merge_for_entropy on %s.', self)
        with open(self.read_entropy_csv, 'w') as read_entropy_csv:
            merge_for_entropy(self.trimmed1_fastq,
                              self.trimmed2_fastq,
                              read_entropy_csv,
                              scratch_path)

        write_merge_lengths_plot(self.read_entropy_csv,
                                 self.merge_lengths_svg)

    logger.info('Running fastq_g2p on %s.', self)
    with open(self.trimmed1_fastq) as fastq1, \
            open(self.trimmed2_fastq) as fastq2, \
            open(self.g2p_csv, 'w') as g2p_csv, \
            open(self.g2p_summary_csv, 'w') as g2p_summary_csv, \
            open(self.g2p_unmapped1_fastq, 'w') as g2p_unmapped1, \
            open(self.g2p_unmapped2_fastq, 'w') as g2p_unmapped2, \
            open(self.g2p_aligned_csv, 'w') as g2p_aligned_csv, \
            open(self.merged_contigs_csv, 'w') as merged_contigs_csv:
        fastq_g2p(pssm=pssm,
                  fastq1=fastq1,
                  fastq2=fastq2,
                  g2p_csv=g2p_csv,
                  g2p_summary_csv=g2p_summary_csv,
                  unmapped1=g2p_unmapped1,
                  unmapped2=g2p_unmapped2,
                  aligned_csv=g2p_aligned_csv,
                  min_count=DEFAULT_MIN_COUNT,
                  min_valid=MIN_VALID,
                  min_valid_percent=MIN_VALID_PERCENT,
                  merged_contigs_csv=merged_contigs_csv)

    # Mapping step: assembly-based or reference-based, chosen by flag.
    if use_denovo:
        self.run_denovo(excluded_seeds)
    else:
        self.run_mapping(excluded_seeds)

    logger.info('Running sam2aln on %s.', self)
    with open(self.remap_csv) as remap_csv, \
            open(self.aligned_csv, 'w') as aligned_csv, \
            open(self.conseq_ins_csv, 'w') as conseq_ins_csv, \
            open(self.failed_csv, 'w') as failed_csv, \
            open(self.clipping_csv, 'w') as clipping_csv:
        sam2aln(remap_csv,
                aligned_csv,
                conseq_ins_csv,
                failed_csv,
                clipping_csv=clipping_csv)

    logger.info('Running aln2counts on %s.', self)
    # Contigs only exist for de novo runs; otherwise read from the null
    # device so the open() below still succeeds with empty content.
    if use_denovo:
        contigs_path = self.contigs_csv
    else:
        contigs_path = os.devnull
    with open(self.aligned_csv) as aligned_csv, \
            open(self.g2p_aligned_csv) as g2p_aligned_csv, \
            open(self.clipping_csv) as clipping_csv, \
            open(self.conseq_ins_csv) as conseq_ins_csv, \
            open(self.remap_conseq_csv) as remap_conseq_csv, \
            open(contigs_path) as contigs_csv, \
            open(self.nuc_csv, 'w') as nuc_csv, \
            open(self.nuc_detail_csv, 'w') as nuc_detail_csv, \
            open(self.amino_csv, 'w') as amino_csv, \
            open(self.amino_detail_csv, 'w') as amino_detail_csv, \
            open(self.coord_ins_csv, 'w') as coord_ins_csv, \
            open(self.conseq_csv, 'w') as conseq_csv, \
            open(self.conseq_region_csv, 'w') as conseq_region_csv, \
            open(self.failed_align_csv, 'w') as failed_align_csv, \
            open(self.coverage_summary_csv, 'w') as coverage_summary_csv, \
            open(self.genome_coverage_csv, 'w') as genome_coverage_csv, \
            open(self.conseq_all_csv, "w") as conseq_all_csv, \
            open(self.minimap_hits_csv, "w") as minimap_hits_csv:
        if not use_denovo:
            # The detail reports only apply to de novo assembly: close and
            # delete the files just created, then pass None so aln2counts
            # skips them.
            for f in (amino_detail_csv, nuc_detail_csv):
                f.close()
                os.remove(f.name)
            amino_detail_csv = nuc_detail_csv = None
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   coverage_summary_csv=coverage_summary_csv,
                   clipping_csv=clipping_csv,
                   conseq_ins_csv=conseq_ins_csv,
                   g2p_aligned_csv=g2p_aligned_csv,
                   remap_conseq_csv=remap_conseq_csv,
                   conseq_region_csv=conseq_region_csv,
                   amino_detail_csv=amino_detail_csv,
                   nuc_detail_csv=nuc_detail_csv,
                   genome_coverage_csv=genome_coverage_csv,
                   contigs_csv=contigs_csv,
                   conseq_all_csv=conseq_all_csv,
                   minimap_hits_csv=minimap_hits_csv)

    logger.info('Running coverage_plots on %s.', self)
    os.makedirs(self.coverage_maps)
    with open(self.amino_csv) as amino_csv, \
            open(self.coverage_scores_csv, 'w') as coverage_scores_csv:
        coverage_plot(amino_csv,
                      coverage_scores_csv,
                      coverage_maps_path=self.coverage_maps,
                      coverage_maps_prefix=self.name,
                      excluded_projects=excluded_projects)

    with open(self.genome_coverage_csv) as genome_coverage_csv, \
            open(self.minimap_hits_csv) as minimap_hits_csv:
        # Minimap hits are only meaningful for de novo runs; pass None to
        # the plot otherwise (the file handle is still closed by the with).
        if not use_denovo:
            minimap_hits_csv = None
        plot_genome_coverage(genome_coverage_csv,
                             minimap_hits_csv,
                             self.genome_coverage_svg)

    logger.info('Running cascade_report on %s.', self)
    # The cascade report aggregates earlier stage outputs; inputs are
    # assigned as attributes before generate() is called.
    with open(self.g2p_summary_csv) as g2p_summary_csv, \
            open(self.remap_counts_csv) as remap_counts_csv, \
            open(self.aligned_csv) as aligned_csv, \
            open(self.cascade_csv, 'w') as cascade_csv:
        cascade_report = CascadeReport(cascade_csv)
        cascade_report.g2p_summary_csv = g2p_summary_csv
        cascade_report.remap_counts_csv = remap_counts_csv
        cascade_report.aligned_csv = aligned_csv
        cascade_report.generate()
    logger.info('Finished sample %s.', self)