def test_more_samples(method):
    """Convert several FASTQ samples with ``method`` and check the FASTA output.

    Both the reference FASTA and the produced FASTA are passed through
    ``Fastq2Fasta.unwrap_fasta`` (with ``strip_comment=True``) before their
    md5 sums are compared, because some methods may output multi-line FASTA.
    """
    # GATB long headers bug: these samples are skipped for that method.
    skipped_for_gatb = ["sample_v3", "sample_v2"]

    for sample_name in ["sample_v2", "sample_v3", "sample_v4"]:
        if method == "GATB" and sample_name in skipped_for_gatb:
            continue

        fastq_path = bioconvert_data("{}.fastq".format(sample_name))
        reference_fasta = bioconvert_data("{}.fasta".format(sample_name))

        # Checksum of the unwrapped reference file.
        with TempFile(suffix=".fasta") as reference_unwrapped:
            Fastq2Fasta.unwrap_fasta(
                reference_fasta, reference_unwrapped.name, strip_comment=True)
            reference_md5 = md5(reference_unwrapped.name)

        # One temporary file receives the converter output, the other its
        # unwrapped version; only the unwrapped version is checksummed.
        with TempFile(suffix=".fasta") as produced:
            with TempFile(suffix=".fasta") as produced_unwrapped:
                converter = Fastq2Fasta(fastq_path, produced.name)
                converter(method=method)
                Fastq2Fasta.unwrap_fasta(
                    produced.name, produced_unwrapped.name,
                    strip_comment=True)
                produced_md5 = md5(produced_unwrapped.name)
                assert produced_md5 == reference_md5, \
                    "{} failed for {}".format(method, sample_name)
def test_conv(method):
    """Convert the tabulated XLSX sample to CSV with ``method`` and compare md5 sums."""
    source = bioconvert_data("test_tabulated.xlsx")
    reference = bioconvert_data("test_tabulated.csv")
    with TempFile(suffix=".csv") as tmp:
        XLSX2CSV(source, tmp.name)(method=method)
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_conv():
    """Convert the VCF sample to BED with the ``awk`` method and compare md5 sums."""
    source = bioconvert_data("test_vcf2bcf_v1.vcf")
    reference = bioconvert_data("test_vcf2bed_v1.bed")
    with TempFile(suffix=".bed") as tmp:
        VCF2BED(source, tmp.name)(method="awk")
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_conv(method):
    """Check FASTA2FASTQ with ``method``, first without and then with a quality file.

    Each run writes to its own temporary FASTQ file, whose md5 sum is compared
    with the md5 sum of the matching reference file.
    """
    fasta_path = bioconvert_data("test_fastq2fasta_v1.fasta")
    quality_path = bioconvert_data("test_fasta2fastq.qual")
    md5_without_quality = md5(bioconvert_data("test_fasta2fastq.fastq"))
    md5_with_quality = md5(bioconvert_data("test_fastq2fasta_v1.fastq"))

    # (extra keyword arguments for the conversion, expected checksum)
    scenarios = (
        ({}, md5_without_quality),
        ({"quality_file": quality_path}, md5_with_quality),
    )
    for extra_kwargs, expected_md5 in scenarios:
        with TempFile(suffix=".fastq") as produced:
            converter = FASTA2FASTQ(fasta_path, produced.name)
            converter(method=method, **extra_kwargs)
            assert md5(produced.name) == expected_md5, \
                "{} failed".format(method)
def test_methods(method):
    """Run BAM2FASTA with ``method`` on two BAM samples.

    For ``test_measles.sorted.bam`` the md5 sums of the two files named by
    replacing ``.`` with ``_1.`` / ``_2.`` in the temporary file name are
    checked. All other conversions (compressed outputs, and every run on
    ``test_measles_unpaired.sorted.bam``) are only executed, not checked.
    """
    compressions = ['gz', 'bz2']

    first_bam = bioconvert_data("test_measles.sorted.bam")
    with TempFile(suffix=".fa") as out:
        BAM2FASTA(first_bam, out.name)(method=method)
        # samtools 1.6 / hstlib 1.6 gives different results on travis and
        # locally
        first_output = out.name.replace(".", "_1.")
        second_output = out.name.replace(".", "_2.")
        assert md5(first_output) in ["9242d127969a089ddeedbc2002c62686"]
        assert md5(second_output) in ["b753ad368c9614130884acb29861bd23"]

    # Test compression: no check, just running.
    for ext in compressions:
        with TempFile(suffix=".fasta.{}".format(ext)) as out:
            BAM2FASTA(first_bam, out.name)(method=method)

    second_bam = bioconvert_data("test_measles_unpaired.sorted.bam")
    with TempFile(suffix=".fa") as out:
        # No checksum here: samtools 1.6 / hstlib 1.6 gives different
        # results on travis and locally.
        BAM2FASTA(second_bam, out.name)(method=method)

    # Test compression: no check, just running.
    for ext in compressions:
        with TempFile(suffix=".fasta.{}".format(ext)) as out:
            BAM2FASTA(second_bam, out.name)(method=method)
def _test_conv():
    """Convert the GenBank sample to GFF with the ``biocode`` method and compare md5 sums.

    NOTE(review): the leading underscore means pytest's default naming rules
    will not collect this function -- presumably disabled on purpose.
    """
    source = bioconvert_data("biocode.gb")
    reference = bioconvert_data("biocode.gff")
    with TempFile(suffix=".gff") as tmp:
        GENBANK2GFF3(source, tmp.name)(method="biocode")
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_phy2nx_biopython(method):
    """Convert ``<method>.phylip`` to NEXUS with ``method`` and compare md5 sums.

    The sample and reference file names are derived from the method name.
    """
    source = bioconvert_data(method + ".phylip")
    reference = bioconvert_data(method + ".nexus")
    with TempFile(suffix=".nexus") as tmp:
        PHYLIP2NEXUS(source, tmp.name)(method=method)

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_sra2fastq_single(method):
    """Convert the ``SRR6477205`` identifier to FASTQ with ``method`` and compare md5 sums."""
    accession = "SRR6477205"
    reference = bioconvert_data("SRR6477205.fastq")
    with TempFile(suffix=".fastq") as tmp:
        # NOTE(review): the meaning of the third positional argument (True)
        # is not visible from this test -- see SRA2FASTQ's constructor.
        sra_converter = SRA2FASTQ(accession, tmp.name, True)
        sra_converter(method=method)

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_nexus2clustal_biopython():
    """Convert the NEXUS sample to CLUSTAL with ``biopython`` and compare md5 sums."""
    infile = bioconvert_data("nexus2clustal_biopython.nexus")
    outfile = bioconvert_data("nexus2clustal_biopython.clustal")
    # The converter writes CLUSTAL, so the temporary output file carries the
    # ".clustal" suffix (it previously used the input format's ".nexus").
    with TempFile(suffix=".clustal") as tempfile:
        converter = NEXUS2CLUSTAL(infile, tempfile.name)
        converter(method='biopython')

        # Check that the output is correct with a checksum
        assert md5(tempfile.name) == md5(outfile)
def test_clustal2stockholm_squizz():
    """Convert the CLUSTAL sample to Stockholm with ``squizz`` and compare md5 sums."""
    source = bioconvert_data("squizz.clustal")
    reference = bioconvert_data("squizz.stockholm")
    with TempFile(suffix=".stockholm") as tmp:
        CLUSTAL2STOCKHOLM(source, tmp.name)(method='squizz')

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_stockholm2clustal_biopython():
    """Convert the Stockholm sample to CLUSTAL with ``biopython`` and compare md5 sums."""
    source = bioconvert_data("biopython.stockholm")
    reference = bioconvert_data("biopython.clustal")
    with TempFile(suffix=".clustal") as tmp:
        STOCKHOLM2CLUSTAL(source, tmp.name)(method='biopython')

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_clustal2fasta_biopython():
    """Convert the CLUSTAL sample to FASTA with ``biopython`` and compare md5 sums."""
    source = bioconvert_data("biopython.clustal")
    reference = bioconvert_data("biopython.fasta")
    with TempFile(suffix=".fasta") as tmp:
        CLUSTAL2FASTA(source, tmp.name)(method='biopython')

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_stockholm2phylip_squizz():
    """Convert the Stockholm sample to PHYLIP with ``squizz`` and compare md5 sums."""
    source = bioconvert_data("squizz.stockholm")
    reference = bioconvert_data("squizz.phylip")
    with TempFile(suffix=".phylip") as tmp:
        STOCKHOLM2PHYLIP(source, tmp.name)(method='squizz')

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_nx2xml_biopython(method):
    """Convert the WIG sample to BED with ``method`` and compare md5 sums.

    NOTE(review): despite its name, this test exercises ``WIG2BED``; the name
    is kept so that existing test selections keep working.
    """
    infile = bioconvert_data("test_wig2bed.wig")
    outfile = bioconvert_data("test_wig2bed.bed")
    # The converter writes BED, so the temporary output file carries the
    # ".bed" suffix (it previously used an unrelated ".phyloxml" suffix).
    with TempFile(suffix=".bed") as tempfile:
        converter = WIG2BED(infile, tempfile.name)
        converter(method=method)

        # Check that the output is correct with a checksum
        assert md5(tempfile.name) == md5(outfile)
def test_twobit2fasta_ucsc(method):
    """Convert the 2bit sample to FASTA with ``method`` and compare md5 sums."""
    source = bioconvert_data("ucsc.2bit")
    reference = bioconvert_data("ucsc.fasta")
    with TempFile(suffix=".fasta") as tmp:
        TWOBIT2FASTA(source, tmp.name)(method=method)

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_clustal2nexus_goalign():
    """Convert the CLUSTAL sample to NEXUS with ``goalign`` and compare md5 sums."""
    source = bioconvert_data("goalign.clustal")
    reference = bioconvert_data("goalign.nexus")
    with TempFile(suffix=".nexus") as tmp:
        CLUSTAL2NEXUS(source, tmp.name)(method='goalign')

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_phylip2clustal_squizz():
    """Convert the PHYLIP sample to CLUSTAL with ``squizz`` and compare md5 sums."""
    source = bioconvert_data("squizz.phylip")
    reference = bioconvert_data("squizz.clustal")
    with TempFile(suffix=".clustal") as tmp:
        PHYLIP2CLUSTAL(source, tmp.name)(method='squizz')

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_nx2fa_biopython(method):
    """Convert ``<method>.nexus`` to FASTA with ``method`` and compare md5 sums.

    The sample and reference file names are derived from the method name.
    """
    source = bioconvert_data(method + ".nexus")
    reference = bioconvert_data(method + ".fasta")
    with TempFile(suffix=".fasta") as tmp:
        NEXUS2FASTA(source, tmp.name)(method=method)

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_bigwig2bedgraph_ucsc(method):
    """Convert the bigWig sample to bedGraph with ``method`` and compare md5 sums."""
    source = bioconvert_data("ucsc.bigwig")
    reference = bioconvert_data("ucsc.bedgraph")
    with TempFile(suffix=".bedgraph") as tmp:
        BIGWIG2BEDGRAPH(source, tmp.name)(method=method)

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_conv():
    """Convert the JSON sample to YAML with the default method and compare md5 sums."""
    source = bioconvert_data("test_v1.json")
    reference = bioconvert_data("test_v1_nocomments.yaml")
    with TempFile(suffix=".yaml") as tmp:
        JSON2YAML(source, tmp.name)()

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_fa2phy_squizz():
    """Convert the FASTA sample to PHYLIP with ``squizz`` and compare md5 sums."""
    source = bioconvert_data("squizz.fasta")
    reference = bioconvert_data("squizz.phylip")
    with TempFile(suffix=".phylip") as tmp:
        FASTA2PHYLIP(source, tmp.name)(method='squizz')

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_nw2xml_biopython(method):
    """Convert ``<method>.newick`` to PhyloXML with ``method`` and compare md5 sums.

    The sample and reference file names are derived from the method name.
    """
    source = bioconvert_data(method + ".newick")
    reference = bioconvert_data(method + ".xml")
    with TempFile(suffix=".xml") as tmp:
        NEWICK2PHYLOXML(source, tmp.name)(method=method)

        # The produced file must have the same checksum as the reference.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference)
def test_phylip2stockholm_biopython():
    """Convert the PHYLIP sample to Stockholm with ``biopython`` and compare md5 sums."""
    infile = bioconvert_data("biopython.phylip")
    outfile = bioconvert_data("biopython.stockholm")
    # The converter writes Stockholm, so the temporary output file carries
    # the ".stockholm" suffix (it previously used an unrelated ".fasta").
    with TempFile(suffix=".stockholm") as tempfile:
        converter = PHYLIP2STOCKHOLM(infile, tempfile.name)
        converter(method='biopython')

        # Check that the output is correct with a checksum
        assert md5(tempfile.name) == md5(outfile)
def test_conv(method):
    """Convert the VCF sample to wiggle with ``method`` and compare md5 sums."""
    source = bioconvert_data("test_vcf2bcf_v1.vcf")
    reference_md5 = md5(bioconvert_data("test_vcf2wiggle.wiggle"))

    with TempFile(suffix=".wiggle") as tmp:
        VCF2WIGGLE(source, tmp.name)(method=method)
        produced_md5 = md5(tmp.name)
        assert produced_md5 == reference_md5, "{} failed".format(method)
def test_conv(method):
    """Convert the tabulated ODS sample to CSV with ``method``.

    The output is accepted when its md5 sum matches either of two reference
    CSV files (one of them has three more blank lines, per its file name).
    """
    source = bioconvert_data("test_tabulated.ods")
    accepted_references = (
        bioconvert_data("test_tabulated.csv"),
        bioconvert_data("test_tabulated_with_3_more_blank_lines.csv"),
    )
    with TempFile(suffix=".csv") as tmp:
        ODS2CSV(source, tmp.name)(method=method)
        produced_md5 = md5(tmp.name)
        accepted_md5s = [md5(path) for path in accepted_references]
        assert produced_md5 in accepted_md5s
def _test_conv(method):
    """Convert the bigWig sample to wiggle with ``method`` and compare md5 sums.

    NOTE(review): the leading underscore means pytest's default naming rules
    will not collect this function -- presumably disabled on purpose.
    """
    source = bioconvert_data("test_measles.bigwig")
    reference_md5 = md5(bioconvert_data("test_bigwig2wiggle.wiggle"))

    with TempFile(suffix=".wiggle") as tmp:
        BIGWIG2WIGGLE(source, tmp.name)(method=method)
        produced_md5 = md5(tmp.name)
        assert produced_md5 == reference_md5, "{} failed".format(method)
def test_conv():
    """Convert the SAM sample to PAF and check both the output and the skip count.

    The produced file must match the reference md5 sum, and the converter's
    ``skipped`` attribute must equal 17 after the run.
    """
    # `where` is not defined in this function; it is expected to come from
    # the enclosing module.
    source = bioconvert_data("test_sam2paf_v1.sam", where)
    reference_md5 = md5(bioconvert_data("test_sam2paf_v1.paf", where))

    with TempFile(suffix=".paf") as tmp:
        sam_converter = SAM2PAF(source, tmp.name)
        sam_converter()
        produced_md5 = md5(tmp.name)
        assert reference_md5 == produced_md5
        assert sam_converter.skipped == 17
def test_converter_without_converter():
    """Run ``bioconvert`` without naming a converter, via the shell and in-process.

    The same FASTQ input is converted twice -- once through the ``bioconvert``
    command line in a subprocess, once by calling ``converter.main()`` with a
    patched ``sys.argv`` -- and the two outputs must have the same md5 sum.
    """
    import sys

    infile = bioconvert_data("test_fastq2fasta_v1.fastq")
    with TempFile(suffix=".fasta") as tempfile1, \
            TempFile(suffix=".fasta") as tempfile2:
        cmd = "bioconvert {} {} --force".format(infile, tempfile1.name)
        p = subprocess.Popen(cmd, shell=True)
        assert p.wait() == 0

        # Restore sys.argv afterwards so the patched command line does not
        # leak into tests that run later in the same process.
        saved_argv = sys.argv
        sys.argv = ["bioconvert", infile, tempfile2.name, "--force"]
        try:
            converter.main()
        finally:
            sys.argv = saved_argv

        assert md5(tempfile1.name) == md5(tempfile2.name)
def test_conv(method):
    """Round-trip the tabulated CSV sample through XLS and back with ``method``.

    An XLS file may contain formatting (bold, borders, ...), so instead of
    checksumming the XLS itself it is converted back to CSV, and that CSV is
    compared with the original sample.
    """
    source_csv = bioconvert_data("test_tabulated.csv")
    reference_csv = bioconvert_data("test_tabulated.csv")
    with TempFile(suffix=".csv") as temp_csv:
        with TempFile(suffix=".xls") as temp_xls:
            # CSV -> XLS with the default method, then XLS -> CSV with the
            # method under test.
            CSV2XLS(source_csv, temp_xls.name)()
            XLS2CSV(temp_xls.name, temp_csv.name)(method=method)
            produced_md5 = md5(temp_csv.name)
            assert produced_md5 == md5(reference_csv)
def test_conv(method):
    """Convert the bigBed sample to wiggle with ``method`` and compare md5 sums."""
    source = bioconvert_data("ucsc.bigbed")
    reference_md5 = md5(bioconvert_data("ucsc.wiggle"))

    with TempFile(suffix=".wiggle") as tmp:
        BIGBED2WIGGLE(source, tmp.name)(method=method)
        produced_md5 = md5(tmp.name)
        assert produced_md5 == reference_md5, "{} failed".format(method)
def _download_minikraken(self, verbose=True):
    """Download ``minikraken.tgz`` under the sequana config path unless already cached.

    The archive is fetched only when it is missing or when its md5 sum differs
    from the expected one. ``verbose`` is accepted but not used here.
    """
    tools = DevTools()

    # NOTE(review): an empty name is appended here, so `base` ends with a
    # separator and the paths below contain a doubled one -- confirm intended.
    base = sequana_config_path + os.sep + ""
    taxondir = os.sep.join([base, "taxonomy"])
    tools.mkdir(base)
    tools.mkdir(taxondir)

    logger.info("Downloading minikraken (4Gb)")

    archive = os.sep.join([base, "minikraken.tgz"])
    cached = (os.path.exists(archive)
              and md5(archive) == "30eab12118158d0b31718106785195e2")
    if not cached:
        wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", archive)
    else:
        logger.warning("%s already present" % archive)
def _download_sequana_db1(self, verbose=True):
    """Download the ``sequana_db1`` kraken database into the sequana config path.

    Each file is fetched from Synapse only when it is missing locally or when
    its md5 sum differs from the expected one. The annotations CSV is then
    downloaded from GitHub unconditionally. ``verbose`` is accepted but not
    used here.

    The cache check now looks for each file inside the directory it is
    downloaded to. Previously the paths were built without a separator and
    always relative to the database directory, so the check never matched and
    every call re-downloaded everything.
    """
    from os.path import exists, join

    from easydev import md5

    dbname = "sequana_db1"
    dir1 = sequana_config_path + os.sep + dbname
    dir2 = dir1 + os.sep + "taxonomy"
    self.dv.mkdir(dir1)
    self.dv.mkdir(dir2)

    logger.info("Downloading about 8Gb of data (if not already downloaded) from"
        " Synapse into %s" % dir1)

    # (target directory, file name, expected md5, Synapse id)
    # NOTE(review): assumes each Synapse entity is stored under the file name
    # listed here -- confirm against the Synapse project.
    targets = [
        (dir1, "ena_list.txt", "a9cc6268f3338d1632c4712a412593f2",
         'syn6171700'),
        (dir1, "database.idx", "2fa4a99a4f52f2f04c5a965adb1534ac",
         'syn6171017'),
        # database.kdb ; this one is large (8Gb)
        (dir1, "database.kdb", "ff698696bfc88fe83bc201937cd9cbdf",
         'syn6171107'),
        # Then, the taxonomy directory
        (dir2, "names.dmp", "10bc7a63c579de02112d125a51fd65d0",
         'syn6171286'),
        (dir2, "nodes.dmp", "a68af5a60434e2067c4a0a16df873980",
         'syn6171289'),
        (dir2, "taxons.txt", "e78fbb43b3b41cbf4511d6af16c0287f",
         'syn6171290'),
    ]
    for directory, basename, checksum, synapse_id in targets:
        filename = join(directory, basename)
        if exists(filename) and md5(filename) == checksum:
            # Already present with the expected content: nothing to do.
            continue
        self._download_from_synapse(synapse_id, directory)

    logger.info('done. You should have a kraken DB in %s' % dir1)

    # The annotations
    wget("https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
        dir1 + os.sep + "annotations.csv")
def _download_kraken_toydb(self, verbose=True):
    """Fetch the kraken toy database from sequana_data into .config/sequana.

    About 32Mb of data. A file is downloaded only when it is missing locally
    or when its md5 checksum differs from the expected one. ``verbose`` is
    accepted but not used here.
    """
    tools = DevTools()
    base = sequana_config_path + os.sep + "kraken_toydb"
    taxondir = os.sep.join([base, "taxonomy"])
    tools.mkdir(base)
    tools.mkdir(taxondir)

    baseurl = "https://github.com/sequana/data/raw/master/"

    # download only if required
    logger.info("Downloading the database into %s" % base)

    # (path relative to the database directory, expected md5 checksum)
    expected_files = [
        ("database.idx", "28661f8baf0514105b0c6957bec0fc6e"),
        ("database.kdb", "97a39d44ed86cadea470352d6f69748d"),
        ("taxonomy/names.dmp", "d91a0fcbbc0f4bbac918755b6400dea6"),
        ("taxonomy/nodes.dmp", "c8bae69565af2170ece194925b5fdeb9"),
    ]
    for relative_name, md5sum in expected_files:
        url = baseurl + "kraken_toydb/%s" % relative_name
        local_path = os.sep.join([base, relative_name])
        up_to_date = os.path.exists(local_path) and md5(local_path) == md5sum
        if not up_to_date:
            logger.info("Downloading %s" % url)
            wget(url, local_path)
            continue
        logger.warning("%s already present" % local_path)