def testSummaryX4(self):
    """Two X4 variants out of three valid reads make the final call X4."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_2,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTATGAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_2,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACGAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_3,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTATGAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_3,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACGAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,2,0.454349263704,2.6,X4,CMRPNNNTRKSIHIGPGRAFYATGEIIGDIRRAHC,CMRPN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RRAHC,,
2,1,0.0677537070158,42.3,R5,CTRPNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC,CTRPN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RQAHC,,
"""
    expected_summary = """\
mapped,valid,X4calls,X4pct,final
3,3,2,66.67,X4
"""

    sam_g2p(self.pssm,
            remap,
            self.nuc_csv,
            self.g2p_csv,
            self.g2p_summary_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
    self.assertEqual(expected_summary, self.g2p_summary_csv.getvalue())
def testPartialCodon(self):
    """A sequence whose length is not a multiple of 3 reports notdiv3."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,8M,=,877,8,TGTACAGG,AAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,8M,=,877,-8,TGTACAGG,AAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,notdiv3,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testLowQuality(self):
    """Reads dominated by low-quality bases are rejected as low quality."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TNTNNNGGN,A#A###AA#
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TNTNNNGGN,A#A###AA#
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,low quality,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testOverlap(self):
    """Overlapping mates merge, but the short result fails the cysteine check."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,12M,=,886,12,TGTACAAGACCC,AAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,886,44,9M,=,877,-9,CCCAACAAC,AAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPNN,,cysteines,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testDeletionAtStart(self):
    """A deletion near the start leaves a gap and fails the cysteine check."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,874,44,3M3D6M,=,874,9,TGTGGGTGT,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,874,44,3M3D6M,=,874,-9,TGTGGGTGT,AAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,-GC,,cysteines,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testLengthMinimum(self):
    """A 32-amino-acid V3 sequence is just long enough to be scored."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,51M,=,925,51,TGTGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,925,44,48M,=,877,-48,AAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,0.806326707173,1.5,X4,CGGGGGGGGGGGGGGGKGGGGGGGGGGGGGGC,---CG-GGG--GGGGGG---GGGG---GKGGG----GGGGGGG--GGGGC,,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testLengthTooShort(self):
    """One amino acid below the minimum length is rejected with 'length'."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,51M,=,925,51,TGTGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,925,44,45M,=,877,-45,AAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CGGGGGGGGGGGGGGGKGGGGGGGGGGGGGC,,length,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testStopCodon(self):
    """A TAG stop codon in the sequence is rejected with 'stop codons'."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTTAGTGT,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTTAGTGT,AAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,C*C,,stop codons,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testAmbiguousAtTwoPositions(self):
    """Low quality at codons 9 and 18 gives two ambiguous positions - rejected."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAA#AAAAAAAAAAAAAAAAAAAAAAAAAA#AA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAA#AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPNNNTXKSIHIGPGXAFYATGEIIGDIRQAHC,,> 2 ambiguous,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testAllClipped(self):
    """Reads mapping entirely outside the clipping region leave zerolength."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,868,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,868,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,,,zerolength,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testAmbiguousMixture(self):
    """One low-quality base makes codon 3 an S/R mixture - still scored."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAA#AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,0.0663051848427,43.0,R5,CT[RS]PNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC,CT[RS]PN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RQAHC,,ambiguous
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testAmbiguousMixtureThreeChoices(self):
    """A codon with three possible translations (L, S, or *) is rejected."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,21M,=,877,56,TGTACAAGACCCTTAAACTGT,AAAAAAAAAAAAA#AAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,21M,=,877,56,TGTACAAGACCCTTAAACTGT,AAAAAAAAAAAAA#AAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTRPXNC,,> 2 ambiguous,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def test(remap_lines, temp_prefix, pssm, ruby_script, delete_results=True):
    """
    Calculate G2P scores using ruby_script and Python, then compare.

    @return: 'PASS' if the results match or it's a difference we're not
    interested in, 'FAIL' otherwise
    """
    with NamedTemporaryFile(suffix=".csv",
                            prefix=temp_prefix,
                            delete=True) as remap_file:
        # Both implementations need a real file on disk, so copy the remap
        # rows into the temp file and rewind it for the Python side.
        for line in remap_lines:
            remap_file.write(line)
        remap_file.flush()
        remap_file.seek(0)
        # Strip two extensions so a name like "prefix12345.csv" becomes a
        # root that output file names can be derived from.
        filename_root = os.path.splitext(os.path.splitext(
            remap_file.name)[0])[0]
        nuc_filename = filename_root + ".nuc.csv"
        ruby_out_filename = filename_root + "_rbg2p.csv"
        python_out_filename = filename_root + "_pyg2p.csv"
        ruby_path = os.path.dirname(ruby_script)
        try:
            # Run the reference Ruby implementation first; it is invoked from
            # its own directory so relative resources resolve.
            check_call([
                ruby_script, remap_file.name, nuc_filename, ruby_out_filename
            ],
                       cwd=ruby_path)
            # Now the Python implementation, reading the same remap file and
            # the nuc file produced above.
            with open(nuc_filename, 'rU') as nuc_csv, \
                    open(python_out_filename, 'wb') as g2p_csv:
                sam_g2p(pssm, remap_file, nuc_csv, g2p_csv)
            # diff returns non-zero when the two outputs differ; its own
            # output is discarded, only the exit status matters.
            with open(os.devnull, 'w') as devnull:
                is_diff = call(
                    ['diff', '-q', ruby_out_filename, python_out_filename],
                    stdout=devnull)
            result = 'FAIL' if is_diff else 'PASS'
            logger.info('{} lines: {}'.format(len(remap_lines), result))
            return result
        finally:
            # NamedTemporaryFile cleans up the input; remove the two
            # comparison outputs unless the caller asked to keep them.
            if delete_results:
                if os.path.exists(ruby_out_filename):
                    os.remove(ruby_out_filename)
                if os.path.exists(python_out_filename):
                    os.remove(python_out_filename)
def testVariants(self):
    """Distinct variants are ranked by read count in the g2p output."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
Example_read_2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
Example_read_3,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
Example_read_3,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,2,,,,CTR,,cysteines,
2,1,,,,CTG,,cysteines,
"""

    sam_g2p(self.pssm, remap, self.nuc_csv, self.g2p_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
def testSummaryFailed(self):
    """With no valid reads, the summary leaves X4pct and final blank."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
Example_read_1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,,,,CTR,,cysteines,
"""
    expected_summary = """\
mapped,valid,X4calls,X4pct,final
1,0,0,,
"""

    sam_g2p(self.pssm,
            remap,
            self.nuc_csv,
            self.g2p_csv,
            self.g2p_summary_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
    self.assertEqual(expected_summary, self.g2p_summary_csv.getvalue())
def testSummarySuccess(self):
    """A single valid R5 read produces an R5 final call in the summary."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
Example_read_1,99,HIV1B-env-seed,877,44,56M,=,926,56,TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGC,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
Example_read_1,147,HIV1B-env-seed,926,44,56M,=,877,-56,GGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,1,0.0677537070158,42.3,R5,CTRPNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC,CTRPN-NNT--RKSIHI---GPGR---AFYAT----GEIIGDI--RQAHC,,
"""
    expected_summary = """\
mapped,valid,X4calls,X4pct,final
1,1,0,0.00,R5
"""

    sam_g2p(self.pssm,
            remap,
            self.nuc_csv,
            self.g2p_csv,
            self.g2p_summary_csv)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
    self.assertEqual(expected_summary, self.g2p_summary_csv.getvalue())
def test(remap_lines, temp_prefix, pssm, ruby_script, delete_results=True):
    """
    Calculate G2P scores using ruby_script and Python, then compare.

    @return: 'PASS' if the results match or it's a difference we're not
    interested in, 'FAIL' otherwise
    """
    with NamedTemporaryFile(suffix=".csv",
                            prefix=temp_prefix,
                            delete=True) as remap_file:
        # Materialize the remap rows on disk so the Ruby script can read
        # them, then rewind for the Python implementation.
        for line in remap_lines:
            remap_file.write(line)
        remap_file.flush()
        remap_file.seek(0)
        # Double splitext strips ".csv" plus any inner extension, giving a
        # root to derive the output file names from.
        filename_root = os.path.splitext(os.path.splitext(remap_file.name)[0])[0]
        nuc_filename = filename_root + ".nuc.csv"
        ruby_out_filename = filename_root + "_rbg2p.csv"
        python_out_filename = filename_root + "_pyg2p.csv"
        ruby_path = os.path.dirname(ruby_script)
        try:
            # Reference implementation runs first, from its own directory.
            check_call([ruby_script,
                        remap_file.name,
                        nuc_filename,
                        ruby_out_filename],
                       cwd=ruby_path)
            # Python implementation reads the same inputs and writes its own
            # output file for comparison.
            with open(nuc_filename, 'rU') as nuc_csv, \
                    open(python_out_filename, 'wb') as g2p_csv:
                sam_g2p(pssm, remap_file, nuc_csv, g2p_csv)
            # Only the exit status of diff matters; non-zero means the two
            # outputs differ.
            with open(os.devnull, 'w') as devnull:
                is_diff = call(['diff',
                                '-q',
                                ruby_out_filename,
                                python_out_filename],
                               stdout=devnull)
            result = 'FAIL' if is_diff else 'PASS'
            logger.info('{} lines: {}'.format(len(remap_lines), result))
            return result
        finally:
            # Clean up both output files unless the caller wants to inspect
            # them; the temp input file deletes itself.
            if delete_results:
                if os.path.exists(ruby_out_filename):
                    os.remove(ruby_out_filename)
                if os.path.exists(python_out_filename):
                    os.remove(python_out_filename)
def testMinCount(self):
    """Variants below min_count are pooled into a single 'count < 3' row."""
    remap = StringIO("""\
qname,flag,rname,pos,mapq,cigar,rnext,pnext,tlen,seq,qual
variant1_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant1_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant1_read3,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAAGA,AAAAAAAAA
variant1_read3,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAAGA,AAAAAAAAA
variant2_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
variant2_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
variant2_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGGG,AAAAAAAAA
variant2_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGGG,AAAAAAAAA
variant3_read1,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGAA,AAAAAAAAA
variant3_read1,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGAA,AAAAAAAAA
variant3_read2,99,HIV1B-env-seed,877,44,9M,=,877,9,TGTACAGAA,AAAAAAAAA
variant3_read2,147,HIV1B-env-seed,877,44,9M,=,877,-9,TGTACAGAA,AAAAAAAAA
""")
    expected_g2p = """\
rank,count,g2p,fpr,call,seq,aligned,error,comment
1,3,,,,CTR,,cysteines,
2,4,,,,,,count < 3,
"""
    expected_summary = """\
mapped,valid,X4calls,X4pct,final
7,0,0,,
"""

    sam_g2p(self.pssm,
            remap,
            self.nuc_csv,
            self.g2p_csv,
            self.g2p_summary_csv,
            min_count=3)

    self.assertEqual(expected_g2p, self.g2p_csv.getvalue())
    self.assertEqual(expected_summary, self.g2p_summary_csv.getvalue())
def process_sample(self, fastq1, progress, prefixes, image_paths, error_log):
    """
    Run the full pipeline on one sample's paired FASTQ files.

    Stages: prelim_map -> remap -> sam2aln -> aln2counts -> coverage plots
    -> G2P scoring, with each stage's output written next to the sample's
    other files in self.workdir.

    :param fastq1: path to the forward-read FASTQ; the reverse-read path is
        derived from it.
    :param progress: text describing overall progress, shown to the user.
    :param prefixes: list that this sample's name prefix is appended to.
    :param image_paths: list extended with the coverage plot image paths.
    :param error_log: open file that receives stderr from the mapping steps.
    """
    # Derive the matching reverse-read file name from the forward one.
    fastq2 = fastq1.replace('_R1_001', '_R2_001').replace('censored1',
                                                          'censored2')
    if not os.path.exists(fastq2):
        raise IOError('ERROR: Missing R2 file for {}'.format(fastq1))
    # Sample prefix is the file name with either naming convention stripped.
    prefix = os.path.basename(fastq1).replace('_L001_R1_001.fastq',
                                              '').replace('.censored1.fastq',
                                                          '')
    prefixes.append(prefix)
    output_csv = prefix + '.prelim.csv'
    self.write('Processing sample {} ({})\n'.format(prefix, progress))
    with open(output_csv, 'wb') as handle:
        prelim_map(fastq1,
                   fastq2,
                   handle,
                   nthreads=self.nthreads,
                   callback=self.callback,
                   stderr=error_log)

    # prepare file handles for remap stage
    with open(output_csv, 'rU') as prelim_csv, \
            open(os.path.join(self.workdir, prefix + '.remap.csv'), 'wb') as remap_csv, \
            open(os.path.join(self.workdir, prefix + '.remap_counts.csv'), 'wb') as counts_csv, \
            open(os.path.join(self.workdir, prefix + '.remap_conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(self.workdir, prefix + '.unmapped1.fastq'), 'w') as unmapped1, \
            open(os.path.join(self.workdir, prefix + '.unmapped2.fastq'), 'w') as unmapped2:
        self.write('... remapping\n')
        self.parent.update()
        self.progress_bar['value'] = 0
        remap(fastq1,
              fastq2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              self.workdir,
              nthreads=self.nthreads,
              callback=self.callback,
              stderr=error_log)

    # prepare file handles for conversion from SAM format to alignment
    with open(os.path.join(self.workdir, prefix + '.remap.csv'), 'rU') as remap_csv, \
            open(os.path.join(self.workdir, prefix + '.aligned.csv'), 'wb') as aligned_csv, \
            open(os.path.join(self.workdir, prefix + '.insert.csv'), 'wb') as insert_csv, \
            open(os.path.join(self.workdir, prefix + '.failed.csv'), 'wb') as failed_csv:
        self.write('... converting into alignment\n')
        self.parent.update()
        sam2aln(remap_csv,
                aligned_csv,
                insert_csv,
                failed_csv,
                nthreads=self.nthreads)

    # Count nucleotides/amino acids from the aligned reads.
    with open(os.path.join(self.workdir, prefix + '.aligned.csv'), 'rU') as aligned_csv, \
            open(os.path.join(self.workdir, prefix + '.nuc.csv'), 'wb') as nuc_csv, \
            open(os.path.join(self.workdir, prefix + '.amino.csv'), 'wb') as amino_csv, \
            open(os.path.join(self.workdir, prefix + '.coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(os.path.join(self.workdir, prefix + '.conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(self.workdir, prefix + '.failed_align.csv'), 'wb') as failed_align_csv, \
            open(os.path.join(self.workdir, prefix + '.nuc_variants.csv'), 'wb') as nuc_variants_csv:
        self.parent.update()
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   callback=self.callback)

    self.write('... generating coverage plots\n')
    self.parent.update()
    with open(os.path.join(self.workdir, prefix + '.amino.csv'), 'rU') as amino_csv:
        image_paths += coverage_plot(amino_csv)

    self.write('... performing g2p scoring on samples covering HIV-1 V3\n')
    self.parent.update()
    with open(os.path.join(self.workdir, prefix + '.remap.csv'), 'rU') as remap_csv, \
            open(os.path.join(self.workdir, prefix + '.nuc.csv'), 'rU') as nuc_csv, \
            open(os.path.join(self.workdir, prefix + '.g2p.csv'), 'wb') as g2p_csv:
        sam_g2p(pssm=self.pssm,
                remap_csv=remap_csv,
                nuc_csv=nuc_csv,
                g2p_csv=g2p_csv)
def process_sample(self, fastq1, progress, prefixes, image_paths, error_log):
    """
    Process one sample end to end: map, remap, align, count, plot, score.

    :param fastq1: path to the forward-read FASTQ; the R2 path is derived
        from it by name substitution.
    :param progress: progress text included in the status message.
    :param prefixes: list that receives this sample's name prefix.
    :param image_paths: list extended with generated coverage plot paths.
    :param error_log: open file handle receiving stderr from mapping steps.
    """
    # The reverse-read file shares the forward file's name pattern.
    fastq2 = fastq1.replace('_R1_001', '_R2_001').replace('censored1',
                                                          'censored2')
    if not os.path.exists(fastq2):
        raise IOError('ERROR: Missing R2 file for {}'.format(fastq1))
    # Strip either naming convention to get the bare sample prefix.
    prefix = os.path.basename(fastq1).replace('_L001_R1_001.fastq',
                                              '').replace(
        '.censored1.fastq', '')
    prefixes.append(prefix)
    output_csv = prefix + '.prelim.csv'
    self.write('Processing sample {} ({})\n'.format(prefix, progress))
    with open(output_csv, 'wb') as handle:
        prelim_map(fastq1,
                   fastq2,
                   handle,
                   nthreads=self.nthreads,
                   callback=self.callback,
                   stderr=error_log)

    # prepare file handles for remap stage
    with open(output_csv, 'rU') as prelim_csv, \
            open(os.path.join(self.workdir, prefix + '.remap.csv'), 'wb') as remap_csv, \
            open(os.path.join(self.workdir, prefix + '.remap_counts.csv'), 'wb') as counts_csv, \
            open(os.path.join(self.workdir, prefix + '.remap_conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(self.workdir, prefix + '.unmapped1.fastq'), 'w') as unmapped1, \
            open(os.path.join(self.workdir, prefix + '.unmapped2.fastq'), 'w') as unmapped2:
        self.write('... remapping\n')
        self.parent.update()
        self.progress_bar['value'] = 0
        remap(fastq1,
              fastq2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              self.workdir,
              nthreads=self.nthreads,
              callback=self.callback,
              stderr=error_log)

    # prepare file handles for conversion from SAM format to alignment
    with open(os.path.join(self.workdir, prefix + '.remap.csv'), 'rU') as remap_csv, \
            open(os.path.join(self.workdir, prefix + '.aligned.csv'), 'wb') as aligned_csv, \
            open(os.path.join(self.workdir, prefix + '.insert.csv'), 'wb') as insert_csv, \
            open(os.path.join(self.workdir, prefix + '.failed.csv'), 'wb') as failed_csv:
        self.write('... converting into alignment\n')
        self.parent.update()
        sam2aln(remap_csv,
                aligned_csv,
                insert_csv,
                failed_csv,
                nthreads=self.nthreads)

    # Tally nucleotide and amino acid counts from the alignment.
    with open(os.path.join(self.workdir, prefix + '.aligned.csv'), 'rU') as aligned_csv, \
            open(os.path.join(self.workdir, prefix + '.nuc.csv'), 'wb') as nuc_csv, \
            open(os.path.join(self.workdir, prefix + '.amino.csv'), 'wb') as amino_csv, \
            open(os.path.join(self.workdir, prefix + '.coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(os.path.join(self.workdir, prefix + '.conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(self.workdir, prefix + '.failed_align.csv'), 'wb') as failed_align_csv, \
            open(os.path.join(self.workdir, prefix + '.nuc_variants.csv'), 'wb') as nuc_variants_csv:
        self.parent.update()
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   callback=self.callback)

    self.write('... generating coverage plots\n')
    self.parent.update()
    with open(os.path.join(self.workdir, prefix + '.amino.csv'), 'rU') as amino_csv:
        image_paths += coverage_plot(amino_csv)

    self.write('... performing g2p scoring on samples covering HIV-1 V3\n')
    self.parent.update()
    with open(os.path.join(self.workdir, prefix + '.remap.csv'), 'rU') as remap_csv, \
            open(os.path.join(self.workdir, prefix + '.nuc.csv'), 'rU') as nuc_csv, \
            open(os.path.join(self.workdir, prefix + '.g2p.csv'), 'wb') as g2p_csv:
        sam_g2p(pssm=self.pssm,
                remap_csv=remap_csv,
                nuc_csv=nuc_csv,
                g2p_csv=g2p_csv)
def process_sample(sample_index, run_info, data_path, pssm):
    """ Process a single sample.

    :param sample_index: which sample to process from the session JSON
    :param run_info: run parameters loaded from the session JSON
    :param str data_path: the root folder for all BaseSpace data
    :param pssm: the pssm library for running G2P analysis
    """
    scratch_path = os.path.join(data_path, 'scratch')
    sample_info = run_info.samples[sample_index]
    sample_id = sample_info['Id']
    sample_name = sample_info['Name']
    # Try the MiSeq-style nested layout first, then a flat sample folder.
    sample_dir = os.path.join(data_path,
                              'input',
                              'samples',
                              sample_id,
                              'Data',
                              'Intensities',
                              'BaseCalls')
    if not os.path.exists(sample_dir):
        sample_dir = os.path.join(data_path,
                                  'input',
                                  'samples',
                                  sample_id)
    # Locate the first forward-read FASTQ anywhere under the sample folder.
    sample_path = None
    for root, _dirs, files in os.walk(sample_dir):
        sample_paths = fnmatch.filter(files, '*_R1_*')
        if sample_paths:
            sample_path = os.path.join(root, sample_paths[0])
            break
    if sample_path is None:
        raise RuntimeError('No R1 file found for sample id {}.'.format(sample_id))
    sample_path2 = sample_path.replace('_R1_', '_R2_')
    if not os.path.exists(sample_path2):
        raise RuntimeError('R2 file missing for sample id {}: {!r}.'.format(
            sample_id,
            sample_path2))
    logger.info('Processing sample %s (%d of %d): %s (%s).',
                sample_id,
                sample_index+1,
                len(run_info.samples),
                sample_name,
                sample_path)

    sample_out_path = create_app_result(data_path,
                                        run_info,
                                        sample_info,
                                        description='Mapping results',
                                        suffix='_QC')

    sample_scratch_path = os.path.join(scratch_path, sample_name)
    makedirs(sample_scratch_path)

    # Censor bad cycles out of both reads before any mapping happens.
    censored_path1 = os.path.join(sample_scratch_path, 'censored1.fastq')
    read_summary_path1 = os.path.join(sample_scratch_path, 'read1_summary.csv')
    censor_sample(sample_path,
                  os.path.join(scratch_path, 'bad_cycles.csv'),
                  censored_path1,
                  read_summary_path1)
    censored_path2 = os.path.join(sample_scratch_path, 'censored2.fastq')
    read_summary_path2 = os.path.join(sample_scratch_path, 'read2_summary.csv')
    censor_sample(sample_path2,
                  os.path.join(scratch_path, 'bad_cycles.csv'),
                  censored_path2,
                  read_summary_path2)

    logger.info('Running prelim_map (%d of %d).',
                sample_index+1,
                len(run_info.samples))
    with open(os.path.join(sample_scratch_path, 'prelim.csv'), 'wb') as prelim_csv:
        prelim_map(censored_path1, censored_path2, prelim_csv)

    logger.info('Running remap (%d of %d).',
                sample_index+1,
                len(run_info.samples))
    with open(os.path.join(sample_scratch_path, 'prelim.csv'), 'rU') as prelim_csv, \
            open(os.path.join(sample_scratch_path, 'remap.csv'), 'wb') as remap_csv, \
            open(os.path.join(sample_out_path, 'remap_counts.csv'), 'wb') as counts_csv, \
            open(os.path.join(sample_out_path, 'remap_conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(sample_out_path, 'unmapped1.fastq'), 'w') as unmapped1, \
            open(os.path.join(sample_out_path, 'unmapped2.fastq'), 'w') as unmapped2:
        remap(censored_path1,
              censored_path2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              sample_scratch_path,
              nthreads=1)

    logger.info('Running sam2aln (%d of %d).',
                sample_index+1,
                len(run_info.samples))
    with open(os.path.join(sample_scratch_path, 'remap.csv'), 'rU') as remap_csv, \
            open(os.path.join(sample_scratch_path, 'aligned.csv'), 'wb') as aligned_csv, \
            open(os.path.join(sample_out_path, 'conseq_ins.csv'), 'wb') as insert_csv, \
            open(os.path.join(sample_out_path, 'failed_read.csv'), 'wb') as failed_csv:
        sam2aln(remap_csv, aligned_csv, insert_csv, failed_csv)

    logger.info('Running aln2counts (%d of %d).',
                sample_index+1,
                len(run_info.samples))
    with open(os.path.join(sample_scratch_path, 'aligned.csv'), 'rU') as aligned_csv, \
            open(os.path.join(sample_out_path, 'nuc.csv'), 'wb') as nuc_csv, \
            open(os.path.join(sample_out_path, 'amino.csv'), 'wb') as amino_csv, \
            open(os.path.join(sample_out_path, 'coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(os.path.join(sample_out_path, 'conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(sample_out_path, 'failed_align.csv'), 'wb') as failed_align_csv, \
            open(os.path.join(sample_out_path, 'nuc_variants.csv'), 'wb') as nuc_variants_csv, \
            open(os.path.join(sample_scratch_path, 'coverage_summary.csv'), 'wb') as coverage_summary_csv:
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   coverage_summary_csv=coverage_summary_csv)

    logger.info('Running coverage_plots (%d of %d).',
                sample_index+1,
                len(run_info.samples))
    coverage_path = os.path.join(sample_out_path, 'coverage')
    with open(os.path.join(sample_out_path, 'amino.csv'), 'rU') as amino_csv, \
            open(os.path.join(sample_out_path, 'coverage_scores.csv'), 'w') as coverage_scores_csv:
        coverage_plot(amino_csv, coverage_scores_csv, path_prefix=coverage_path)

    # G2P only runs when the V3LOOP region earned the top coverage score.
    with open(os.path.join(sample_out_path, 'coverage_scores.csv'), 'rU') as coverage_scores_csv:
        reader = csv.DictReader(coverage_scores_csv)
        is_v3loop_good = False
        for row in reader:
            if row['region'] == 'V3LOOP':
                is_v3loop_good = row['on.score'] == '4'
                break

    if is_v3loop_good:
        logger.info('Running sam_g2p (%d of %d).',
                    sample_index+1,
                    len(run_info.samples))
        g2p_path = create_app_result(data_path,
                                     run_info,
                                     sample_info,
                                     description='Geno To Pheno results',
                                     suffix='_G2P')
        with open(os.path.join(sample_scratch_path, 'remap.csv'), 'rU') as remap_csv, \
                open(os.path.join(sample_out_path, 'nuc.csv'), 'rU') as nuc_csv, \
                open(os.path.join(g2p_path, 'g2p.csv'), 'wb') as g2p_csv, \
                open(os.path.join(g2p_path, 'g2p_summary.csv'), 'wb') as g2p_summary_csv:
            sam_g2p(pssm=pssm,
                    remap_csv=remap_csv,
                    nuc_csv=nuc_csv,
                    g2p_csv=g2p_csv,
                    g2p_summary_csv=g2p_summary_csv,
                    min_count=DEFAULT_MIN_COUNT)
def process_sample(sample_index, run_info, data_path, pssm):
    """ Process a single sample.

    :param sample_index: which sample to process from the session JSON
    :param run_info: run parameters loaded from the session JSON
    :param str data_path: the root folder for all BaseSpace data
    :param pssm: the pssm library for running G2P analysis
    """
    scratch_path = os.path.join(data_path, 'scratch')
    sample_info = run_info.samples[sample_index]
    sample_id = sample_info['Id']
    sample_name = sample_info['Name']
    # Nested MiSeq layout first; fall back to the flat sample folder.
    sample_dir = os.path.join(data_path,
                              'input',
                              'samples',
                              sample_id,
                              'Data',
                              'Intensities',
                              'BaseCalls')
    if not os.path.exists(sample_dir):
        sample_dir = os.path.join(data_path,
                                  'input',
                                  'samples',
                                  sample_id)
    # Walk the folder and take the first forward-read FASTQ found.
    sample_path = None
    for root, _dirs, files in os.walk(sample_dir):
        sample_paths = fnmatch.filter(files, '*_R1_*')
        if sample_paths:
            sample_path = os.path.join(root, sample_paths[0])
            break
    if sample_path is None:
        raise RuntimeError(
            'No R1 file found for sample id {}.'.format(sample_id))
    sample_path2 = sample_path.replace('_R1_', '_R2_')
    if not os.path.exists(sample_path2):
        raise RuntimeError('R2 file missing for sample id {}: {!r}.'.format(
            sample_id,
            sample_path2))
    logger.info('Processing sample %s (%d of %d): %s (%s).',
                sample_id,
                sample_index + 1,
                len(run_info.samples),
                sample_name,
                sample_path)

    sample_out_path = create_app_result(data_path,
                                        run_info,
                                        sample_info,
                                        description='Mapping results',
                                        suffix='_QC')

    sample_scratch_path = os.path.join(scratch_path, sample_name)
    makedirs(sample_scratch_path)

    # Remove bad-cycle bases from both reads before mapping.
    censored_path1 = os.path.join(sample_scratch_path, 'censored1.fastq')
    read_summary_path1 = os.path.join(sample_scratch_path, 'read1_summary.csv')
    censor_sample(sample_path,
                  os.path.join(scratch_path, 'bad_cycles.csv'),
                  censored_path1,
                  read_summary_path1)
    censored_path2 = os.path.join(sample_scratch_path, 'censored2.fastq')
    read_summary_path2 = os.path.join(sample_scratch_path, 'read2_summary.csv')
    censor_sample(sample_path2,
                  os.path.join(scratch_path, 'bad_cycles.csv'),
                  censored_path2,
                  read_summary_path2)

    logger.info('Running prelim_map (%d of %d).',
                sample_index + 1,
                len(run_info.samples))
    with open(os.path.join(sample_scratch_path, 'prelim.csv'), 'wb') as prelim_csv:
        prelim_map(censored_path1, censored_path2, prelim_csv)

    logger.info('Running remap (%d of %d).',
                sample_index + 1,
                len(run_info.samples))
    with open(os.path.join(sample_scratch_path, 'prelim.csv'), 'rU') as prelim_csv, \
            open(os.path.join(sample_scratch_path, 'remap.csv'), 'wb') as remap_csv, \
            open(os.path.join(sample_out_path, 'remap_counts.csv'), 'wb') as counts_csv, \
            open(os.path.join(sample_out_path, 'remap_conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(sample_out_path, 'unmapped1.fastq'), 'w') as unmapped1, \
            open(os.path.join(sample_out_path, 'unmapped2.fastq'), 'w') as unmapped2:
        remap(censored_path1,
              censored_path2,
              prelim_csv,
              remap_csv,
              counts_csv,
              conseq_csv,
              unmapped1,
              unmapped2,
              sample_scratch_path,
              nthreads=1)

    logger.info('Running sam2aln (%d of %d).',
                sample_index + 1,
                len(run_info.samples))
    with open(os.path.join(sample_scratch_path, 'remap.csv'), 'rU') as remap_csv, \
            open(os.path.join(sample_scratch_path, 'aligned.csv'), 'wb') as aligned_csv, \
            open(os.path.join(sample_out_path, 'conseq_ins.csv'), 'wb') as insert_csv, \
            open(os.path.join(sample_out_path, 'failed_read.csv'), 'wb') as failed_csv:
        sam2aln(remap_csv, aligned_csv, insert_csv, failed_csv)

    logger.info('Running aln2counts (%d of %d).',
                sample_index + 1,
                len(run_info.samples))
    with open(os.path.join(sample_scratch_path, 'aligned.csv'), 'rU') as aligned_csv, \
            open(os.path.join(sample_out_path, 'nuc.csv'), 'wb') as nuc_csv, \
            open(os.path.join(sample_out_path, 'amino.csv'), 'wb') as amino_csv, \
            open(os.path.join(sample_out_path, 'coord_ins.csv'), 'wb') as coord_ins_csv, \
            open(os.path.join(sample_out_path, 'conseq.csv'), 'wb') as conseq_csv, \
            open(os.path.join(sample_out_path, 'failed_align.csv'), 'wb') as failed_align_csv, \
            open(os.path.join(sample_out_path, 'nuc_variants.csv'), 'wb') as nuc_variants_csv, \
            open(os.path.join(sample_scratch_path, 'coverage_summary.csv'), 'wb') as coverage_summary_csv:
        aln2counts(aligned_csv,
                   nuc_csv,
                   amino_csv,
                   coord_ins_csv,
                   conseq_csv,
                   failed_align_csv,
                   nuc_variants_csv,
                   coverage_summary_csv=coverage_summary_csv)

    logger.info('Running coverage_plots (%d of %d).',
                sample_index + 1,
                len(run_info.samples))
    coverage_path = os.path.join(sample_out_path, 'coverage')
    with open(os.path.join(sample_out_path, 'amino.csv'), 'rU') as amino_csv, \
            open(os.path.join(sample_out_path, 'coverage_scores.csv'), 'w') as coverage_scores_csv:
        coverage_plot(amino_csv, coverage_scores_csv, path_prefix=coverage_path)

    # Only proceed to G2P when V3LOOP coverage earned the top score of 4.
    with open(os.path.join(sample_out_path, 'coverage_scores.csv'), 'rU') as coverage_scores_csv:
        reader = csv.DictReader(coverage_scores_csv)
        is_v3loop_good = False
        for row in reader:
            if row['region'] == 'V3LOOP':
                is_v3loop_good = row['on.score'] == '4'
                break

    if is_v3loop_good:
        logger.info('Running sam_g2p (%d of %d).',
                    sample_index + 1,
                    len(run_info.samples))
        g2p_path = create_app_result(data_path,
                                     run_info,
                                     sample_info,
                                     description='Geno To Pheno results',
                                     suffix='_G2P')
        with open(os.path.join(sample_scratch_path, 'remap.csv'), 'rU') as remap_csv, \
                open(os.path.join(sample_out_path, 'nuc.csv'), 'rU') as nuc_csv, \
                open(os.path.join(g2p_path, 'g2p.csv'), 'wb') as g2p_csv, \
                open(os.path.join(g2p_path, 'g2p_summary.csv'), 'wb') as g2p_summary_csv:
            sam_g2p(pssm=pssm,
                    remap_csv=remap_csv,
                    nuc_csv=nuc_csv,
                    g2p_csv=g2p_csv,
                    g2p_summary_csv=g2p_summary_csv,
                    min_count=DEFAULT_MIN_COUNT)