def test_merge_with_extract(self):
    """
    Test streaming repack with intermediate extraction to disk.

    Merges ``self.input_tgz_files`` into one tarball while passing
    ``extract_to_disk_path``, unpacks the merged tarball, and checks that
    both the unpacked files and the extracted files match the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")
    out_extracted_path = os.path.join(temp_dir, "extracted")

    file_utils.merge_tarballs(
        out_tarball_file,
        self.input_tgz_files,
        extract_to_disk_path=out_extracted_path)

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    # Inspect the merged output first, then the files extracted to disk.
    for out_dir in (temp_dir, out_extracted_path):
        for raw_file in self.raw_files:
            inf = os.path.join(self.input_dir, "raw-input", raw_file)
            outf = os.path.join(out_dir, raw_file)
            assert_equal_contents(self, inf, outf)
def test_deplete_blastn(self):
    """Build two blast databases, run deplete_blastn, and compare the output FASTQ to the expected one."""
    work_dir = tempfile.mkdtemp()
    input_dir = util.file.get_test_input_path(self)

    # Make blast databases: symlink each reference into the work dir and index it there
    makeblastdb = tools.blast.MakeblastdbTool().install_and_get_path()
    db_paths = []
    for fasta_name in ['humanChr1Subset.fa', 'humanChr9Subset.fa']:
        db_path = os.path.join(work_dir, fasta_name)
        os.symlink(os.path.join(input_dir, fasta_name), db_path)
        db_paths.append(db_path)
        subprocess.check_call([makeblastdb, '-dbtype', 'nucl', '-in', db_path])

    # Run deplete_blastn
    out_fastq = os.path.join(work_dir, 'out.fastq')
    parser = taxon_filter.parser_deplete_blastn(argparse.ArgumentParser())
    parsed = parser.parse_args([
        os.path.join(input_dir, 'in.fastq'),
        out_fastq,
        db_paths[0],
        db_paths[1],
    ])
    parsed.func_main(parsed)

    # Compare to expected
    expected_fastq = os.path.join(input_dir, 'expected.fastq')
    assert_equal_contents(self, out_fastq, expected_fastq)
def test_blastn_db_build(self):
    """Build a blastn database from ebola.fasta and compare the .nhr/.nsq files to the expected ones."""
    common_dir = util.file.get_test_input_path()
    ref_fasta = os.path.join(common_dir, 'ebola.fasta')
    input_dir = util.file.get_test_input_path(self)
    out_dir = tempfile.mkdtemp()
    prefix = self.__class__.__name__

    parser = taxon_filter.parser_blastn_build_db(argparse.ArgumentParser())
    # positional arguments: input fasta, output directory
    parsed = parser.parse_args([ref_fasta, out_dir, "--outputFilePrefix", prefix])
    parsed.func_main(parsed)

    # nhr=header, nin=index, nsq=sequence; ".nin" can change, so it is not compared
    for ext in [".nhr", ".nsq"]:
        produced = os.path.join(out_dir, prefix + ext)
        expected = os.path.join(input_dir, "expected", prefix + ext)
        assert_equal_contents(self, produced, expected)
def test_bmtagger_db_build(self):
    """Build a bmtagger database from ebola.fasta and check every produced file against its expected counterpart."""
    common_dir = util.file.get_test_input_path()
    ref_fasta = os.path.join(common_dir, 'ebola.fasta')
    input_dir = util.file.get_test_input_path(self)
    out_dir = tempfile.mkdtemp()
    prefix = self.__class__.__name__

    parser = taxon_filter.parser_bmtagger_build_db(argparse.ArgumentParser())
    # positional arguments: input fasta, output directory
    parsed = parser.parse_args([ref_fasta, out_dir, "--outputFilePrefix", prefix])
    parsed.func_main(parsed)

    # These outputs are compared by content.
    content_checked = [
        ".bitmask", ".srprism.amp", ".srprism.idx", ".srprism.imp",
        ".srprism.pmp", ".srprism.rmp", ".srprism.ss", ".srprism.ssa",
        ".srprism.ssd"
    ]
    for ext in content_checked:
        produced = os.path.join(out_dir, prefix + ext)
        expected = os.path.join(input_dir, "expected", prefix + ext)
        assert_equal_contents(self, produced, expected)

    # The .map file is checked against a stored md5 line instead.
    for ext in [".srprism.map"]:
        produced = os.path.join(out_dir, prefix + ext)
        md5_file = os.path.join(input_dir, "expected", prefix + ext + ".md5")
        assert_md5_equal_to_line_in_file(self, produced, md5_file)
def test_merge_with_extract_repack_from_disk(self):
    """
    Test with repack from disk source after extraction.

    Calls ``util.file.repack_tarballs`` with ``avoid_disk_roundtrip=False``
    and an ``extract_to_disk_path``, unpacks the merged tarball, and checks
    that both the unpacked files and the extracted files match the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")
    out_extracted_path = os.path.join(temp_dir, "extracted")

    util.file.repack_tarballs(
        out_tarball_file,
        self.input_tgz_files,
        extract_to_disk_path=out_extracted_path,
        avoid_disk_roundtrip=False)

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    # Inspect the merged output first, then the files extracted to disk.
    for out_dir in (temp_dir, out_extracted_path):
        for raw_file in self.raw_files:
            inf = os.path.join(self.input_dir, "raw-input", raw_file)
            outf = os.path.join(out_dir, raw_file)
            assert_equal_contents(self, inf, outf)
def test_deplete_blastn(self):
    """Build two blast databases, run deplete_blastn, and compare the output FASTQ to the expected one."""
    work_dir = tempfile.mkdtemp()
    input_dir = util.file.get_test_input_path(self)

    # Make blast databases: symlink each reference into the work dir and index it there
    makeblastdb = tools.blast.MakeblastdbTool().install_and_get_path()
    db_paths = []
    for fasta_name in ['humanChr1Subset.fa', 'humanChr9Subset.fa']:
        db_path = os.path.join(work_dir, fasta_name)
        os.symlink(os.path.join(input_dir, fasta_name), db_path)
        db_paths.append(db_path)
        build_cmd = [makeblastdb, '-dbtype', 'nucl', '-in', db_path]
        util.misc.run_and_print(build_cmd, check=True)

    # Run deplete_blastn
    out_fastq = os.path.join(work_dir, 'out.fastq')
    parser = taxon_filter.parser_deplete_blastn(argparse.ArgumentParser())
    parsed = parser.parse_args([
        os.path.join(input_dir, 'in.fastq'),
        out_fastq,
        db_paths[0],
        db_paths[1],
    ])
    parsed.func_main(parsed)

    # Compare to expected
    expected_fastq = os.path.join(input_dir, 'expected.fastq')
    assert_equal_contents(self, out_fastq, expected_fastq)
def test_blastn_db_build(self):
    """Build a blastn database from ebola.fasta and compare the .nhr/.nsq files to the expected ones."""
    common_dir = util.file.get_test_input_path()
    ref_fasta = os.path.join(common_dir, 'ebola.fasta')
    input_dir = util.file.get_test_input_path(self)
    out_dir = tempfile.mkdtemp()
    prefix = self.__class__.__name__

    parser = taxon_filter.parser_blastn_build_db(argparse.ArgumentParser())
    # positional arguments: input fasta, output directory
    cli_args = [ref_fasta, out_dir, "--outputFilePrefix", prefix]
    parsed = parser.parse_args(cli_args)
    parsed.func_main(parsed)

    # nhr=header, nin=index, nsq=sequence; ".nin" can change, so it is not compared
    for ext in [".nhr", ".nsq"]:
        file_name = prefix + ext
        assert_equal_contents(
            self,
            os.path.join(out_dir, file_name),
            os.path.join(input_dir, "expected", file_name))
def test_merge_with_extract(self):
    """
    Test streaming repack with intermediate extraction to disk.

    Merges ``self.input_tgz_files`` into one tarball while passing
    ``extract_to_disk_path``, unpacks the merged tarball, and checks that
    both the unpacked files and the extracted files match the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")
    out_extracted_path = os.path.join(temp_dir, "extracted")

    file_utils.merge_tarballs(
        out_tarball_file,
        self.input_tgz_files,
        extract_to_disk_path=out_extracted_path)

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    # Inspect the merged output first, then the files extracted to disk.
    for out_dir in (temp_dir, out_extracted_path):
        for raw_file in self.raw_files:
            inf = os.path.join(self.input_dir, "raw-input", raw_file)
            outf = os.path.join(out_dir, raw_file)
            assert_equal_contents(self, inf, outf)
def test_merge_piped_in_and_out(self):
    """
    Test with streamed input and output.

    Feeds the concatenated input tarballs to ``merge_tarballs`` through a
    patched ``sys.stdin``, captures its output through a patched
    ``sys.stdout`` that writes to a file, then unpacks that file and
    compares the contents to the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")

    # Pass argv as a list so input paths containing whitespace are not split apart.
    ps = subprocess.Popen(["cat"] + list(self.input_tgz_files), stdout=subprocess.PIPE)
    try:
        with patch('sys.stdin', ps.stdout):
            with open(out_tarball_file, "wb", 0) as out_stream:
                # temporarily disable pytest's capture of sys.stdout
                with self.capsys.disabled():
                    with patch('sys.stdout', out_stream):
                        file_utils.merge_tarballs(
                            "-", ["-"], pipe_hint_out="gz", pipe_hint_in="gz")
    finally:
        # Release the pipe and reap the child even if the merge raised.
        ps.stdout.close()
        ps.wait()

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    for raw_file in self.raw_files:
        inf = os.path.join(self.input_dir, "raw-input", raw_file)
        outf = os.path.join(temp_dir, raw_file)
        assert_equal_contents(self, inf, outf)
def test_bmtagger_db_build(self):
    """Build a bmtagger database from ebola.fasta and check every produced file against its expected counterpart."""
    common_dir = util.file.get_test_input_path()
    ref_fasta = os.path.join(common_dir, 'ebola.fasta')
    input_dir = util.file.get_test_input_path(self)
    out_dir = tempfile.mkdtemp()
    prefix = self.__class__.__name__

    parser = taxon_filter.parser_bmtagger_build_db(argparse.ArgumentParser())
    # positional arguments: input fasta, output directory
    cli_args = [ref_fasta, out_dir, "--outputFilePrefix", prefix]
    parsed = parser.parse_args(cli_args)
    parsed.func_main(parsed)

    # These outputs are compared by content.
    content_checked = [
        ".bitmask", ".srprism.amp", ".srprism.idx", ".srprism.imp",
        ".srprism.pmp", ".srprism.rmp", ".srprism.ss", ".srprism.ssa",
        ".srprism.ssd"
    ]
    for ext in content_checked:
        file_name = prefix + ext
        assert_equal_contents(
            self,
            os.path.join(out_dir, file_name),
            os.path.join(input_dir, "expected", file_name))

    # The .map file is checked against a stored md5 line instead.
    for ext in [".srprism.map"]:
        assert_md5_equal_to_line_in_file(
            self,
            os.path.join(out_dir, prefix + ext),
            os.path.join(input_dir, "expected", prefix + ext + ".md5"))
def test_lastal_db_build(self):
    """Build a lastal database from ebola.fasta and compare each produced file to the expected one."""
    common_dir = util.file.get_test_input_path()
    ref_fasta = os.path.join(common_dir, 'ebola.fasta')
    input_dir = util.file.get_test_input_path(self)
    out_dir = tempfile.mkdtemp()
    prefix = self.__class__.__name__

    parser = taxon_filter.parser_lastal_build_db(argparse.ArgumentParser())
    # positional arguments: input fasta, output directory
    cli_args = [ref_fasta, out_dir, "--outputFilePrefix", prefix]
    parsed = parser.parse_args(cli_args)
    parsed.func_main(parsed)

    for ext in [".bck", ".des", ".prj", ".sds", ".ssp", ".suf", ".tis"]:
        file_name = prefix + ext
        assert_equal_contents(
            self,
            os.path.join(out_dir, file_name),
            os.path.join(input_dir, "expected", file_name))
def test_merge_piped_in_and_out(self):
    """
    Test with streamed input and output.

    Feeds the concatenated input tarballs to ``merge_tarballs`` through a
    patched ``sys.stdin``, captures its output through a patched
    ``sys.stdout`` that writes to a file, then unpacks that file and
    compares the contents to the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")

    # Pass argv as a list so input paths containing whitespace are not split apart.
    ps = subprocess.Popen(["cat"] + list(self.input_tgz_files), stdout=subprocess.PIPE)
    try:
        with patch('sys.stdin', ps.stdout):
            with open(out_tarball_file, "wb", 0) as out_stream:
                # temporarily disable pytest's capture of sys.stdout
                with self.capsys.disabled():
                    with patch('sys.stdout', out_stream):
                        file_utils.merge_tarballs(
                            "-", ["-"], pipe_hint_out="gz", pipe_hint_in="gz")
    finally:
        # Release the pipe and reap the child even if the merge raised.
        ps.stdout.close()
        ps.wait()

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    for raw_file in self.raw_files:
        inf = os.path.join(self.input_dir, "raw-input", raw_file)
        outf = os.path.join(temp_dir, raw_file)
        assert_equal_contents(self, inf, outf)
def test_deplete_blastn(self):
    """
    Build two blast databases, run deplete_blastn, and compare the result.

    Each reference FASTA is symlinked into a fresh temp directory and
    indexed with makeblastdb; ``main_deplete_blastn`` is then run on
    ``in.fastq`` and its output compared to ``expected.fastq``.
    """
    # Imported locally so this block does not depend on the module's import list.
    import subprocess

    tempDir = tempfile.mkdtemp()
    myInputDir = util.file.get_test_input_path(self)

    # Make blast databases
    makeblastdbPath = tools.blast.MakeblastdbTool().install_and_get_path()
    dbnames = ['humanChr1Subset.fa', 'humanChr9Subset.fa']
    refDbs = []
    for dbname in dbnames:
        refDb = os.path.join(tempDir, dbname)
        os.symlink(os.path.join(myInputDir, dbname), refDb)
        refDbs.append(refDb)
        # An argv list avoids the shell, so paths with spaces or shell
        # metacharacters are passed through intact; a non-zero exit raises.
        subprocess.check_call(
            [makeblastdbPath, '-dbtype', 'nucl', '-in', refDb])

    # Run deplete_blastn
    outFile = os.path.join(tempDir, 'out.fastq')
    args = taxon_filter.parser_deplete_blastn().parse_args(
        [os.path.join(myInputDir, 'in.fastq'), outFile, refDbs[0], refDbs[1]])
    taxon_filter.main_deplete_blastn(args)

    # Compare to expected
    assert_equal_contents(self, outFile,
                          os.path.join(myInputDir, 'expected.fastq'))
def test_snpeff(self):
    """
    Run align_mafft -> merge_to_vcf -> snpEff -> iSNV_table on the test inputs.

    The MSA is compared by content to the expected file. Both VCF outputs
    are read in full but not compared (that assertion is disabled). The
    final tab file is compared cell by cell: cells that both parse as
    floats are compared approximately, all others as order-insensitive
    comma-separated lists.
    """
    temp_dir = tempfile.gettempdir()
    input_dir = util.file.get_test_input_path(self)

    ref_fasta = os.path.join(input_dir, "ref-rabies-JQ685920.fasta")
    assembly_fasta = os.path.join(input_dir, "RBV16.fasta")
    isnv_calls = os.path.join(input_dir, "vphaser2.RBV16.mapped.txt.gz")

    # align sample to reference to create MSA
    msa_fasta = util.file.mkstempfname('.fasta')
    expected_msa_fasta = os.path.join(input_dir, "msa.fasta")
    args = [ref_fasta, assembly_fasta, msa_fasta, "--localpair", "--preservecase"]
    args = interhost.parser_align_mafft(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)
    test.assert_equal_contents(self, msa_fasta, expected_msa_fasta)

    # merge (one) VCF to merged vcf
    merged_vcf = os.path.join(temp_dir, "merged.vcf.gz")
    expected_merged_vcf = os.path.join(input_dir, "merged.vcf.gz")
    args = [ref_fasta, merged_vcf, "--isnvs", isnv_calls, "--alignments", msa_fasta,
            "--strip_chr_version", "--parse_accession"]
    args = intrahost.parser_merge_to_vcf(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)
    # Both files are read end to end; the row-for-row comparison is disabled.
    vcf = util.vcf.VcfReader(merged_vcf)
    expected_vcf = util.vcf.VcfReader(expected_merged_vcf)
    rows = list(vcf.get())
    expected_rows = list(expected_vcf.get())
    # TODO: re-enable once outputs are stable: self.assertEqual(rows, expected_rows)

    # run snpEff against merged VCF to predict SNP effects
    eff_vcf = os.path.join(temp_dir, "ann_eff.vcf.gz")
    expected_eff_vcf = os.path.join(input_dir, "ann_eff.vcf.gz")
    # NOTE(review): the last argument looks like an obfuscated e-mail placeholder - confirm.
    args = [merged_vcf, "JQ685920", eff_vcf, "[email protected]"]
    with self.capsys.disabled():
        args = interhost.parser_snpEff(argparse.ArgumentParser()).parse_args(args)
        args.func_main(args)
    # Both files are read end to end; the row-for-row comparison is disabled.
    vcf = util.vcf.VcfReader(eff_vcf)
    expected_vcf = util.vcf.VcfReader(expected_eff_vcf)
    rows = list(vcf.get())
    expected_rows = list(expected_vcf.get())
    # TODO: re-enable once outputs are stable: self.assertEqual(rows, expected_rows)

    # create tabular iSNV output
    eff_txt = os.path.join(temp_dir, "ann_eff.txt.gz")
    expected_eff_txt = os.path.join(input_dir, "ann_eff.txt.gz")
    args = [eff_vcf, eff_txt]
    args = intrahost.parser_iSNV_table(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)

    for outrow, expectedrow in zip(util.file.read_tabfile(eff_txt),
                                   util.file.read_tabfile(expected_eff_txt)):
        for colout, colexpected in zip(outrow, expectedrow):
            # if both cells cast to float, perform approx comparison
            try:
                f1 = float(colout)
                f2 = float(colexpected)
            except ValueError:
                # non-numeric cells: compare as comma-separated lists, ignoring order
                self.assertEqual(sorted(colout.split(",")),
                                 sorted(colexpected.split(",")))
            else:
                # compare actual against expected (previously f1 was compared with itself)
                self.assertAlmostEqual(f1, f2)
def test_snpeff(self):
    """
    Run align_mafft -> merge_to_vcf -> snpEff -> iSNV_table on the test inputs.

    The MSA is compared by content to the expected file. Both VCF outputs
    are read in full but not compared (that assertion is disabled). The
    final tab file is compared cell by cell: cells that both parse as
    floats are compared approximately, all others as order-insensitive
    comma-separated lists.
    """
    temp_dir = tempfile.gettempdir()
    input_dir = util.file.get_test_input_path(self)

    ref_fasta = os.path.join(input_dir, "ref-rabies-JQ685920.fasta")
    assembly_fasta = os.path.join(input_dir, "RBV16.fasta")
    isnv_calls = os.path.join(input_dir, "vphaser2.RBV16.mapped.txt.gz")

    # align sample to reference to create MSA
    msa_fasta = util.file.mkstempfname('.fasta')
    expected_msa_fasta = os.path.join(input_dir, "msa.fasta")
    args = [ref_fasta, assembly_fasta, msa_fasta, "--localpair", "--preservecase"]
    args = interhost.parser_align_mafft(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)
    test.assert_equal_contents(self, msa_fasta, expected_msa_fasta)

    # merge (one) VCF to merged vcf
    merged_vcf = os.path.join(temp_dir, "merged.vcf.gz")
    expected_merged_vcf = os.path.join(input_dir, "merged.vcf.gz")
    args = [ref_fasta, merged_vcf, "--isnvs", isnv_calls, "--alignments", msa_fasta,
            "--strip_chr_version", "--parse_accession"]
    args = intrahost.parser_merge_to_vcf(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)
    # Both files are read end to end; the row-for-row comparison is disabled.
    vcf = util.vcf.VcfReader(merged_vcf)
    expected_vcf = util.vcf.VcfReader(expected_merged_vcf)
    rows = list(vcf.get())
    expected_rows = list(expected_vcf.get())
    # TODO: re-enable once outputs are stable: self.assertEqual(rows, expected_rows)

    # run snpEff against merged VCF to predict SNP effects
    eff_vcf = os.path.join(temp_dir, "ann_eff.vcf.gz")
    expected_eff_vcf = os.path.join(input_dir, "ann_eff.vcf.gz")
    # NOTE(review): the last argument looks like an obfuscated e-mail placeholder - confirm.
    args = [merged_vcf, "JQ685920", eff_vcf, "[email protected]"]
    args = interhost.parser_snpEff(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)
    # Both files are read end to end; the row-for-row comparison is disabled.
    vcf = util.vcf.VcfReader(eff_vcf)
    expected_vcf = util.vcf.VcfReader(expected_eff_vcf)
    rows = list(vcf.get())
    expected_rows = list(expected_vcf.get())
    # TODO: re-enable once outputs are stable: self.assertEqual(rows, expected_rows)

    # create tabular iSNV output
    eff_txt = os.path.join(temp_dir, "ann_eff.txt.gz")
    expected_eff_txt = os.path.join(input_dir, "ann_eff.txt.gz")
    args = [eff_vcf, eff_txt]
    args = intrahost.parser_iSNV_table(argparse.ArgumentParser()).parse_args(args)
    args.func_main(args)

    for outrow, expectedrow in zip(util.file.read_tabfile(eff_txt),
                                   util.file.read_tabfile(expected_eff_txt)):
        for colout, colexpected in zip(outrow, expectedrow):
            # if both cells cast to float, perform approx comparison
            try:
                f1 = float(colout)
                f2 = float(colexpected)
            except ValueError:
                # non-numeric cells: compare as comma-separated lists, ignoring order
                self.assertEqual(sorted(colout.split(",")),
                                 sorted(colexpected.split(",")))
            else:
                # compare actual against expected (previously f1 was compared with itself)
                self.assertAlmostEqual(f1, f2)
def test_deplete_bmtagger(self):
    """Run partition_bmtagger with only --outNoMatch and compare both depleted FASTQs to the expected ones."""
    input_dir = util.file.get_test_input_path(self)
    work_dir = self.tempDir

    parser = taxon_filter.parser_partition_bmtagger(argparse.ArgumentParser())
    cli_args = [
        os.path.join(input_dir, 'in1.fastq'),
        os.path.join(input_dir, 'in2.fastq'),
        os.path.join(work_dir, 'humanChr1Subset'),
        os.path.join(work_dir, 'humanChr9Subset'),
        '--outNoMatch',
        os.path.join(work_dir, 'deplete.1.fastq'),
        os.path.join(work_dir, 'deplete.2.fastq'),
    ]
    parsed = parser.parse_args(cli_args)
    parsed.func_main(parsed)

    # Compare to expected
    for mate in ['1', '2']:
        produced = os.path.join(work_dir, 'deplete.' + mate + '.fastq')
        expected = os.path.join(input_dir, 'expected.NoMatch.' + mate + '.fastq')
        assert_equal_contents(self, produced, expected)
def test_lasv_oob_clip(self):
    """Transfer the LASV feature tables onto both prealigned FASTAs with oob_clip=True and compare every output table."""
    input_dir = os.path.join(self.input_dir, "lasv", "input")
    expected_dir = os.path.join(self.input_dir, "lasv", "expected")
    temp_dir = tempfile.gettempdir()

    aligned_fasta_names = [
        "align_mafft-ref-lasv-ISTH2376_1.fasta",
        "align_mafft-ref-lasv-ISTH2376_2.fasta",
    ]
    infastas = [os.path.join(input_dir, name) for name in aligned_fasta_names]
    intables = [
        os.path.join(input_dir, name)
        for name in ["KM821997.1.tbl", "KM821998.1.tbl"]
    ]
    ref_fasta = os.path.join(input_dir, "ref-lasv-ISTH2376.fasta")

    # Names of the tables expected in temp_dir after the transfers.
    out_table_names = [
        "LASV_NGA_2018_0026-1.tbl", "LASV_NGA_2018_0026-2.tbl",
        "LASV_NGA_2018_0097-1.tbl", "LASV_NGA_2018_0097-2.tbl",
        "LASV_NGA_2018_0541-1.tbl", "LASV_NGA_2018_0541-2.tbl",
        "LASV_NGA_2018_0611-1.tbl", "LASV_NGA_2018_0611-2.tbl",
        "LASV_NGA_2018_0664-1.tbl", "LASV_NGA_2018_0664-2.tbl",
        "LASV_NGA_2018_0959-1.tbl", "LASV_NGA_2018_0959-2.tbl",
        "LASV_NGA_2018_0998-1.tbl", "LASV_NGA_2018_0998-2.tbl",
        "LASV_NGA_2018_1024-1.tbl", "LASV_NGA_2018_1024-2.tbl",
        "LASV_NGA_2018_1079-1.tbl", "LASV_NGA_2018_1079-2.tbl",
        "LASV_NGA_2018_1177-1.tbl", "LASV_NGA_2018_1177-2.tbl",
        "LASV_NGA_2018_1375-1.tbl", "LASV_NGA_2018_1375-2.tbl",
        "LASV_NGA_2018_1381-1.tbl", "LASV_NGA_2018_1381-2.tbl",
        "LASV_NGA_2018_1392-1.tbl", "LASV_NGA_2018_1392-2.tbl",
        "LASV_NGA_2018_1643-1.tbl", "LASV_NGA_2018_1643-2.tbl",
    ]

    for aligned_fasta in infastas:
        ncbi.tbl_transfer_prealigned(
            aligned_fasta, ref_fasta, intables, temp_dir, oob_clip=True)

    for table_name in out_table_names:
        produced = os.path.join(temp_dir, table_name)
        expected = os.path.join(expected_dir, table_name)
        assert_equal_contents(self, produced, expected)
def test_partition_bmtagger(self):
    """Run partition_bmtagger with both --outMatch and --outNoMatch and compare all four outputs to the expected files."""
    work_dir = self.tempDir
    match_paths = []
    no_match_paths = []
    for mate in '12':
        match_paths.append(os.path.join(work_dir, 'outMatch.{}.fastq'.format(mate)))
    for mate in '12':
        no_match_paths.append(os.path.join(work_dir, 'outNoMatch.{}.fastq'.format(mate)))
    input_dir = util.file.get_test_input_path(self)

    parser = taxon_filter.parser_partition_bmtagger(argparse.ArgumentParser())
    cli_args = [
        os.path.join(input_dir, 'in1.fastq'),
        os.path.join(input_dir, 'in2.fastq'),
        os.path.join(work_dir, 'humanChr1Subset'),
        os.path.join(work_dir, 'humanChr9Subset'),
        '--outMatch', match_paths[0], match_paths[1],
        '--outNoMatch', no_match_paths[0], no_match_paths[1],
    ]
    parsed = parser.parse_args(cli_args)
    parsed.func_main(parsed)

    # Compare to expected
    for label in ['Match.1', 'Match.2', 'NoMatch.1', 'NoMatch.2']:
        produced = os.path.join(work_dir, 'out' + label + '.fastq')
        expected = os.path.join(input_dir, 'expected.' + label + '.fastq')
        assert_equal_contents(self, produced, expected)
def test_synthetic_feature_table(self):
    """Transfer the synthetic reference table onto aligned_1.fasta and compare sample.tbl to mapped.tbl."""
    synthetic_root = os.path.join(self.input_dir, "synthetic")
    input_dir = os.path.join(synthetic_root, "input")
    expected_dir = os.path.join(synthetic_root, "expected")
    temp_dir = tempfile.gettempdir()

    aligned_fasta = os.path.join(input_dir, "aligned_1.fasta")
    ref_fasta = os.path.join(input_dir, "ref.fasta")
    ref_tables = [os.path.join(input_dir, "ref.tbl")]

    ncbi.tbl_transfer_prealigned(aligned_fasta, ref_fasta, ref_tables, temp_dir)

    produced = os.path.join(temp_dir, "sample.tbl")
    expected = os.path.join(expected_dir, "mapped.tbl")
    assert_equal_contents(self, produced, expected)
def test_trimmomatic_paired_maxinfo(self):
    """Run Trimmomatic on paired reads with the maxinfo settings and compare both outputs to the expected maxinfo FASTQs."""
    input_dir = util.file.get_test_input_path(self)
    in_fastq1 = os.path.join(input_dir, 'in1.fastq')
    in_fastq2 = os.path.join(input_dir, 'in2.fastq')
    clip_fasta = os.path.join(input_dir, 'clip.fasta')

    out_suffixes = ('.out1.fastq', '.out2.fastq')
    with util.file.tempfnames(out_suffixes) as (out_fastq1, out_fastq2):
        trimmer = tools.trimmomatic.TrimmomaticTool()
        trimmer.execute(
            in_fastq1,
            in_fastq2,
            out_fastq1,
            out_fastq2,
            clip_fasta,
            maxinfo_target_length=30,
            maxinfo_strictness=.3)

        # Check that results match expected (inside the with, where the temp names are bound)
        expected1 = os.path.join(input_dir, 'expected1.maxinfo.fastq')
        expected2 = os.path.join(input_dir, 'expected2.maxinfo.fastq')
        assert_equal_contents(self, out_fastq1, expected1)
        assert_equal_contents(self, out_fastq2, expected2)
def test_trimmomatic_paired(self):
    """Run Trimmomatic on paired reads with default settings and compare both outputs to the expected FASTQs."""
    input_dir = util.file.get_test_input_path(self)
    in_fastq1 = os.path.join(input_dir, 'in1.fastq')
    in_fastq2 = os.path.join(input_dir, 'in2.fastq')
    out_fastq1 = util.file.mkstempfname('.out1.fastq')
    out_fastq2 = util.file.mkstempfname('.out2.fastq')
    clip_fasta = os.path.join(input_dir, 'clip.fasta')

    trimmer = tools.trimmomatic.TrimmomaticTool()
    trimmer.execute(in_fastq1, in_fastq2, out_fastq1, out_fastq2, clip_fasta)

    # Check that results match expected
    expected1 = os.path.join(input_dir, 'expected1.fastq')
    expected2 = os.path.join(input_dir, 'expected2.fastq')
    assert_equal_contents(self, out_fastq1, expected1)
    assert_equal_contents(self, out_fastq2, expected2)
def test_synthetic_feature_table(self):
    """Transfer the synthetic reference table onto aligned_1.fasta and compare sample.tbl to mapped.tbl."""
    synthetic_root = os.path.join(self.input_dir, "synthetic")
    input_dir = os.path.join(synthetic_root, "input")
    expected_dir = os.path.join(synthetic_root, "expected")
    temp_dir = tempfile.gettempdir()

    aligned_fasta = os.path.join(input_dir, "aligned_1.fasta")
    ref_fasta = os.path.join(input_dir, "ref.fasta")
    ref_tables = [os.path.join(input_dir, "ref.tbl")]

    ncbi.tbl_transfer_prealigned(aligned_fasta, ref_fasta, ref_tables, temp_dir)

    produced = os.path.join(temp_dir, "sample.tbl")
    expected = os.path.join(expected_dir, "mapped.tbl")
    assert_equal_contents(self, produced, expected)
def test_trimmomatic(self):
    """Run trim_trimmomatic through its argparse entry point and compare both outputs to the expected FASTQs."""
    input_dir = util.file.get_test_input_path(self)
    in_fastq1 = os.path.join(input_dir, 'in1.fastq')
    in_fastq2 = os.path.join(input_dir, 'in2.fastq')
    out_fastq1 = util.file.mkstempfname()
    out_fastq2 = util.file.mkstempfname()
    clip_fasta = os.path.join(input_dir, 'clip.fasta')

    cli_parser = taxon_filter.parser_trim_trimmomatic(argparse.ArgumentParser())
    cli_args = [in_fastq1, in_fastq2, out_fastq1, out_fastq2, clip_fasta]
    parsed = cli_parser.parse_args(cli_args)
    parsed.func_main(parsed)

    # Check that results match expected
    expected1 = os.path.join(input_dir, 'expected1.fastq')
    expected2 = os.path.join(input_dir, 'expected2.fastq')
    assert_equal_contents(self, out_fastq1, expected1)
    assert_equal_contents(self, out_fastq2, expected2)
def test_deplete_blastn_bam(self):
    """Convert the input FASTQs to a BAM, run deplete_blastn_bam against two blast databases, and compare the SAM view to expected.sam."""
    work_dir = tempfile.mkdtemp()
    input_dir = util.file.get_test_input_path(self)

    # Make blast databases: symlink each reference into the work dir and index it there
    makeblastdb = tools.blast.MakeblastdbTool().install_and_get_path()
    db_paths = []
    for fasta_name in ['humanChr1Subset.fa', 'humanChr9Subset.fa']:
        db_path = os.path.join(work_dir, fasta_name)
        os.symlink(os.path.join(input_dir, fasta_name), db_path)
        db_paths.append(db_path)
        subprocess.check_call([makeblastdb, '-dbtype', 'nucl', '-in', db_path])

    # convert the input fastq's to a bam
    in_fastq1 = os.path.join(input_dir, "in1.fastq")
    in_fastq2 = os.path.join(input_dir, "in2.fastq")
    in_bam = os.path.join(work_dir, 'in.bam')
    to_bam_parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser())
    to_bam_args = to_bam_parser.parse_args([
        in_fastq1, in_fastq2, in_bam,
        '--sampleName', 'FreeSample',
        '--JVMmemory', '1g',
        '--picardOptions',
        'LIBRARY_NAME=Alexandria',
        'PLATFORM=9.75',
        'SEQUENCING_CENTER=KareemAbdul-Jabbar',
    ])
    to_bam_args.func_main(to_bam_args)

    # Run deplete_blastn_bam
    out_bam = os.path.join(work_dir, 'out.bam')
    deplete_parser = taxon_filter.parser_deplete_blastn_bam(argparse.ArgumentParser())
    deplete_args = deplete_parser.parse_args(
        [in_bam, db_paths[0], db_paths[1], out_bam, "--chunkSize", "1"])
    deplete_args.func_main(deplete_args)

    # samtools view for out.sam and compare to expected
    out_sam = os.path.join(work_dir, 'out.sam')
    tools.samtools.SamtoolsTool().view(['-h'], out_bam, out_sam)
    expected_sam = os.path.join(input_dir, 'expected.sam')
    assert_equal_contents(self, out_sam, expected_sam)
def test_simple_merge(self):
    """
    Simple repack test.

    Merges ``self.input_tgz_files`` into one tarball, unpacks it, and
    checks that every raw file matches its original.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")

    file_utils.merge_tarballs(out_tarball_file, self.input_tgz_files)

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    for raw_file in self.raw_files:
        inf = os.path.join(self.input_dir, "raw-input", raw_file)
        outf = os.path.join(temp_dir, raw_file)
        assert_equal_contents(self, inf, outf)
def test_filter_lastal(self):
    """Build a lastdb from ebola.fasta, run filter_lastal on in.fastq, and compare the output to expected.fastq."""
    # Create refDbs
    common_dir = util.file.get_test_input_path()
    input_dir = util.file.get_test_input_path(self)
    ref_fasta = os.path.join(common_dir, 'ebola.fasta')
    db_dir = tempfile.mkdtemp()
    db_prefix = os.path.join(db_dir, 'ebola')
    lastdb = tools.last.Lastdb().install_and_get_path()
    build_cmd = [lastdb, db_prefix, ref_fasta]
    subprocess.check_call(build_cmd)

    # Call main_filter_lastal
    in_fastq = os.path.join(input_dir, 'in.fastq')
    out_fastq = util.file.mkstempfname('.fastq')
    parser = taxon_filter.parser_filter_lastal(argparse.ArgumentParser())
    parsed = parser.parse_args([in_fastq, db_prefix, out_fastq])
    parsed.func_main(parsed)

    # Check that results match expected
    expected_fastq = os.path.join(input_dir, 'expected.fastq')
    assert_equal_contents(self, out_fastq, expected_fastq)
def test_simple_merge(self):
    """
    Simple repack test.

    Merges ``self.input_tgz_files`` into one tarball, unpacks it, and
    checks that every raw file matches its original.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")

    file_utils.merge_tarballs(out_tarball_file, self.input_tgz_files)

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    for raw_file in self.raw_files:
        inf = os.path.join(self.input_dir, "raw-input", raw_file)
        outf = os.path.join(temp_dir, raw_file)
        assert_equal_contents(self, inf, outf)
def test_filter_lastal(self):
    """
    Build a lastdb from ebola.fasta, run filter_lastal, and compare the result.

    The database is created with the installed ``lastdb`` binary;
    ``main_filter_lastal`` is then run on ``in.fastq`` and its output
    compared to ``expected.fastq``.
    """
    # Imported locally so this block does not depend on the module's import list.
    import subprocess

    # Create refDbs
    commonInputDir = util.file.get_test_input_path()
    myInputDir = util.file.get_test_input_path(self)
    refFasta = os.path.join(commonInputDir, 'ebola.fasta')
    dbsDir = tempfile.mkdtemp()
    refDbs = os.path.join(dbsDir, 'ebola')
    lastdbPath = tools.last.Lastdb().install_and_get_path()
    # An argv list avoids the shell, so paths with spaces or shell
    # metacharacters are passed through intact; a non-zero exit raises.
    subprocess.check_call([lastdbPath, refDbs, refFasta])

    # Call main_filter_lastal
    inFastq = os.path.join(myInputDir, 'in.fastq')
    outFastq = util.file.mkstempfname('.fastq')
    args = taxon_filter.parser_filter_lastal().parse_args(
        [inFastq, refDbs, outFastq])
    taxon_filter.main_filter_lastal(args)

    # Check that results match expected
    expectedFastq = os.path.join(myInputDir, 'expected.fastq')
    assert_equal_contents(self, outFastq, expectedFastq)
def test_piped_in_merge(self):
    """
    Test with streamed input.

    Feeds the concatenated input tarballs to ``merge_tarballs`` through a
    patched ``sys.stdin``, then unpacks the merged tarball and compares
    the contents to the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")

    # Pass argv as a list so input paths containing whitespace are not split apart.
    ps = subprocess.Popen(["cat"] + list(self.input_tgz_files), stdout=subprocess.PIPE)
    try:
        with patch('sys.stdin', ps.stdout):
            file_utils.merge_tarballs(out_tarball_file, ["-"], pipe_hint_in="gz")
    finally:
        # Release the pipe and reap the child even if the merge raised.
        ps.stdout.close()
        ps.wait()

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    for raw_file in self.raw_files:
        inf = os.path.join(self.input_dir, "raw-input", raw_file)
        outf = os.path.join(temp_dir, raw_file)
        assert_equal_contents(self, inf, outf)
def test_piped_out_merge(self):
    """
    Test with streamed output.

    Runs ``merge_tarballs`` with ``"-"`` as the output while ``sys.stdout``
    is patched to an unbuffered binary file, then unpacks that file and
    compares the contents to the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")

    with open(out_tarball_file, "wb", 0) as out_stream:
        # temporarily disable pytest's capture of sys.stdout
        with self.capsys.disabled():
            with patch('sys.stdout', out_stream):
                file_utils.merge_tarballs(
                    "-", self.input_tgz_files, pipe_hint_out="gz")

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    for raw_file in self.raw_files:
        inf = os.path.join(self.input_dir, "raw-input", raw_file)
        outf = os.path.join(temp_dir, raw_file)
        assert_equal_contents(self, inf, outf)
def test_piped_out_merge(self):
    """
    Test with streamed output.

    Runs ``merge_tarballs`` with ``"-"`` as the output while ``sys.stdout``
    is patched to an unbuffered binary file, then unpacks that file and
    compares the contents to the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")

    with open(out_tarball_file, "wb", 0) as out_stream:
        # temporarily disable pytest's capture of sys.stdout
        with self.capsys.disabled():
            with patch('sys.stdout', out_stream):
                file_utils.merge_tarballs(
                    "-", self.input_tgz_files, pipe_hint_out="gz")

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    for raw_file in self.raw_files:
        inf = os.path.join(self.input_dir, "raw-input", raw_file)
        outf = os.path.join(temp_dir, raw_file)
        assert_equal_contents(self, inf, outf)
def test_piped_in_merge(self):
    """
    Test with streamed input.

    Feeds the concatenated input tarballs to ``merge_tarballs`` through a
    patched ``sys.stdin``, then unpacks the merged tarball and compares
    the contents to the raw inputs.
    """
    temp_dir = tempfile.gettempdir()
    out_tarball_file = os.path.join(temp_dir, "out.tar.gz")

    # Pass argv as a list so input paths containing whitespace are not split apart.
    ps = subprocess.Popen(["cat"] + list(self.input_tgz_files), stdout=subprocess.PIPE)
    try:
        with patch('sys.stdin', ps.stdout):
            file_utils.merge_tarballs(out_tarball_file, ["-"], pipe_hint_in="gz")
    finally:
        # Release the pipe and reap the child even if the merge raised.
        ps.stdout.close()
        ps.wait()

    # Use a context manager so the archive handle is closed after unpacking.
    with tarfile.open(out_tarball_file) as tb:
        tb.extractall(path=temp_dir)

    for raw_file in self.raw_files:
        inf = os.path.join(self.input_dir, "raw-input", raw_file)
        outf = os.path.join(temp_dir, raw_file)
        assert_equal_contents(self, inf, outf)
def test_trimmomatic_paired_maxinfo(self):
    """Run Trimmomatic on paired reads with the maxinfo settings and compare both outputs to the expected maxinfo FASTQs."""
    input_dir = util.file.get_test_input_path(self)
    in_fastq1 = os.path.join(input_dir, 'in1.fastq')
    in_fastq2 = os.path.join(input_dir, 'in2.fastq')
    clip_fasta = os.path.join(input_dir, 'clip.fasta')

    out_suffixes = ('.out1.fastq', '.out2.fastq')
    with util.file.tempfnames(out_suffixes) as (out_fastq1, out_fastq2):
        trimmer = tools.trimmomatic.TrimmomaticTool()
        trimmer.execute(
            in_fastq1,
            in_fastq2,
            out_fastq1,
            out_fastq2,
            clip_fasta,
            maxinfo_target_length=30,
            maxinfo_strictness=.3)

        # Check that results match expected (inside the with, where the temp names are bound)
        expected1 = os.path.join(input_dir, 'expected1.maxinfo.fastq')
        expected2 = os.path.join(input_dir, 'expected2.maxinfo.fastq')
        assert_equal_contents(self, out_fastq1, expected1)
        assert_equal_contents(self, out_fastq2, expected2)
def test_lastal_db_build(self):
    """Build a lastal database from ebola.fasta and compare each produced file to the expected one."""
    common_dir = util.file.get_test_input_path()
    ref_fasta = os.path.join(common_dir, 'ebola.fasta')
    input_dir = util.file.get_test_input_path(self)
    out_dir = tempfile.mkdtemp()
    prefix = self.__class__.__name__

    parser = taxon_filter.parser_lastal_build_db(argparse.ArgumentParser())
    # positional arguments: input fasta, output directory
    parsed = parser.parse_args([ref_fasta, out_dir, "--outputFilePrefix", prefix])
    parsed.func_main(parsed)

    for ext in [".bck", ".des", ".prj", ".sds", ".ssp", ".suf", ".tis"]:
        produced = os.path.join(out_dir, prefix + ext)
        expected = os.path.join(input_dir, "expected", prefix + ext)
        assert_equal_contents(self, produced, expected)
def test_trimmomatic_single(self):
    """Run Trimmomatic with only the first mate: both paired outputs must equal empty.fastq and the first unpaired output must equal expected1.fastq."""
    input_dir = util.file.get_test_input_path(self)
    in_fastq = os.path.join(input_dir, 'in1.fastq')
    paired_out1 = util.file.mkstempfname('.out1.fastq')
    paired_out2 = util.file.mkstempfname('.out2.fastq')
    unpaired_out1 = util.file.mkstempfname('.out3.fastq')
    unpaired_out2 = util.file.mkstempfname('.out4.fastq')
    clip_fasta = os.path.join(input_dir, 'clip.fasta')

    trimmer = tools.trimmomatic.TrimmomaticTool()
    # None is passed in place of the second input FASTQ.
    trimmer.execute(
        in_fastq,
        None,
        paired_out1,
        paired_out2,
        clip_fasta,
        unpairedOutFastq1=unpaired_out1,
        unpairedOutFastq2=unpaired_out2)

    # Check that results match expected; the second unpaired output is not compared.
    empty_fastq = os.path.join(input_dir, 'empty.fastq')
    expected_fastq = os.path.join(input_dir, 'expected1.fastq')
    assert_equal_contents(self, paired_out1, empty_fastq)
    assert_equal_contents(self, paired_out2, empty_fastq)
    assert_equal_contents(self, unpaired_out1, expected_fastq)
def test_lasv_oob_clip(self):
    """Transfer the LASV feature tables onto both prealigned FASTAs with oob_clip=True and compare every output table."""
    input_dir = os.path.join(self.input_dir, "lasv", "input")
    expected_dir = os.path.join(self.input_dir, "lasv", "expected")
    temp_dir = tempfile.gettempdir()

    aligned_fasta_names = [
        "align_mafft-ref-lasv-ISTH2376_1.fasta",
        "align_mafft-ref-lasv-ISTH2376_2.fasta",
    ]
    infastas = [os.path.join(input_dir, name) for name in aligned_fasta_names]
    intables = [
        os.path.join(input_dir, name)
        for name in ["KM821997.1.tbl", "KM821998.1.tbl"]
    ]
    ref_fasta = os.path.join(input_dir, "ref-lasv-ISTH2376.fasta")

    # Names of the tables expected in temp_dir after the transfers.
    out_table_names = [
        "LASV_NGA_2018_0026-1.tbl", "LASV_NGA_2018_0026-2.tbl",
        "LASV_NGA_2018_0097-1.tbl", "LASV_NGA_2018_0097-2.tbl",
        "LASV_NGA_2018_0541-1.tbl", "LASV_NGA_2018_0541-2.tbl",
        "LASV_NGA_2018_0611-1.tbl", "LASV_NGA_2018_0611-2.tbl",
        "LASV_NGA_2018_0664-1.tbl", "LASV_NGA_2018_0664-2.tbl",
        "LASV_NGA_2018_0959-1.tbl", "LASV_NGA_2018_0959-2.tbl",
        "LASV_NGA_2018_0998-1.tbl", "LASV_NGA_2018_0998-2.tbl",
        "LASV_NGA_2018_1024-1.tbl", "LASV_NGA_2018_1024-2.tbl",
        "LASV_NGA_2018_1079-1.tbl", "LASV_NGA_2018_1079-2.tbl",
        "LASV_NGA_2018_1177-1.tbl", "LASV_NGA_2018_1177-2.tbl",
        "LASV_NGA_2018_1375-1.tbl", "LASV_NGA_2018_1375-2.tbl",
        "LASV_NGA_2018_1381-1.tbl", "LASV_NGA_2018_1381-2.tbl",
        "LASV_NGA_2018_1392-1.tbl", "LASV_NGA_2018_1392-2.tbl",
        "LASV_NGA_2018_1643-1.tbl", "LASV_NGA_2018_1643-2.tbl",
    ]

    for aligned_fasta in infastas:
        ncbi.tbl_transfer_prealigned(
            aligned_fasta, ref_fasta, intables, temp_dir, oob_clip=True)

    for table_name in out_table_names:
        produced = os.path.join(temp_dir, table_name)
        expected = os.path.join(expected_dir, table_name)
        assert_equal_contents(self, produced, expected)