Esempio n. 1
0
def test_more_samples(method):
    """Convert several FASTQ samples to FASTA with ``method`` and compare md5 sums.

    Both the reference FASTA and the converter output go through
    ``Fastq2Fasta.unwrap_fasta`` (with ``strip_comment=True``) before being
    hashed, because some methods may output multi-line FASTA.
    """
    # Samples skipped for GATB because of its long-headers bug.
    gatb_skipped = ["sample_v3", "sample_v2"]

    for sample_name in ["sample_v2", "sample_v3", "sample_v4"]:
        if method == "GATB" and sample_name in gatb_skipped:
            continue

        infile = bioconvert_data("{}.fastq".format(sample_name))
        reference = bioconvert_data("{}.fasta".format(sample_name))

        # Checksum of the unwrapped reference file.
        with TempFile(suffix=".fasta") as reference_unwrapped:
            Fastq2Fasta.unwrap_fasta(
                reference, reference_unwrapped.name, strip_comment=True)
            md5out = md5(reference_unwrapped.name)

        # ``produced`` receives the raw converter output and
        # ``produced_unwrapped`` its unwrapped form, which is the one hashed.
        with TempFile(suffix=".fasta") as produced:
            with TempFile(suffix=".fasta") as produced_unwrapped:
                converter = Fastq2Fasta(infile, produced.name)
                converter(method=method)
                Fastq2Fasta.unwrap_fasta(
                    produced.name, produced_unwrapped.name,
                    strip_comment=True)
                checksum = md5(produced_unwrapped.name)
                assert checksum == md5out, \
                    "{} failed for {}".format(method, sample_name)
Esempio n. 2
0
def test_conv(method):
    """Convert the tabulated XLSX test file to CSV with ``method``.

    The md5 sum of the produced file must equal that of the reference CSV.
    """
    xlsx_path = bioconvert_data("test_tabulated.xlsx")
    reference_csv = bioconvert_data("test_tabulated.csv")
    with TempFile(suffix=".csv") as tmp:
        converter = XLSX2CSV(xlsx_path, tmp.name)
        converter(method=method)
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_csv)
Esempio n. 3
0
def test_conv():
    """Convert the VCF test file to BED with the ``awk`` method.

    The md5 sum of the produced file must equal that of the reference BED.
    """
    vcf_path = bioconvert_data("test_vcf2bcf_v1.vcf")
    reference_bed = bioconvert_data("test_vcf2bed_v1.bed")
    with TempFile(suffix=".bed") as tmp:
        converter = VCF2BED(vcf_path, tmp.name)
        converter(method="awk")
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_bed)
Esempio n. 4
0
def test_conv(method):
    """Convert a FASTA file to FASTQ with ``method``, without and with qualities.

    Two conversions are run on the same input: one with no extra argument and
    one with ``quality_file`` set. Each output is compared by md5 sum with its
    own reference FASTQ file.
    """
    fasta_path = bioconvert_data("test_fastq2fasta_v1.fasta")
    qual_path = bioconvert_data("test_fasta2fastq.qual")

    # Reference checksum for the run without a quality file.
    reference_no_qual = bioconvert_data("test_fasta2fastq.fastq")
    md5_no_qual = md5(reference_no_qual)

    # Reference checksum for the run with a quality file.
    reference_qual = bioconvert_data("test_fastq2fasta_v1.fastq")
    md5_qual = md5(reference_qual)

    # (extra keyword arguments for the call, expected checksum)
    runs = (
        ({}, md5_no_qual),
        ({"quality_file": qual_path}, md5_qual),
    )
    for extra_kwargs, expected_md5 in runs:
        with TempFile(suffix=".fastq") as tmp:
            converter = FASTA2FASTQ(fasta_path, tmp.name)
            converter(method=method, **extra_kwargs)
            assert md5(tmp.name) == expected_md5, \
                "{} failed".format(method)
Esempio n. 5
0
def test_methods(method):
    """Run BAM2FASTA with ``method`` on two sorted BAM test files.

    For ``test_measles.sorted.bam`` the md5 sums of the files named after the
    temporary output (with every ``.`` replaced by ``_1.`` / ``_2.``) are
    checked. For ``test_measles_unpaired.sorted.bam`` the conversion is only
    run. For both inputs the conversion is also run with ``.fasta.gz`` and
    ``.fasta.bz2`` output names, without any check.
    """
    # (input file, [(replacement for ".", accepted md5 sums), ...])
    cases = [
        ("test_measles.sorted.bam",
         [("_1.", ["9242d127969a089ddeedbc2002c62686"]),
          ("_2.", ["b753ad368c9614130884acb29861bd23"])]),
        ("test_measles_unpaired.sorted.bam", []),
    ]

    for bam_name, checks in cases:
        infile = bioconvert_data(bam_name)

        with TempFile(suffix=".fa") as tmp:
            converter = BAM2FASTA(infile, tmp.name)
            converter(method=method)
            # samtools 1.6 / htslib 1.6 gives different results on travis
            # and locally, hence the lists of accepted checksums.
            for marker, accepted in checks:
                assert md5(tmp.name.replace(".", marker)) in accepted

        # Compressed outputs: no check, just running.
        for ext in ['gz', 'bz2']:
            with TempFile(suffix=".fasta.{}".format(ext)) as tmp:
                converter = BAM2FASTA(infile, tmp.name)
                converter(method=method)
Esempio n. 6
0
def _test_conv():
    """Convert ``biocode.gb`` to GFF3 with the ``biocode`` method and compare md5 sums.

    NOTE(review): the leading underscore presumably keeps this test from
    being collected -- confirm before renaming.
    """
    genbank_path = bioconvert_data("biocode.gb")
    reference_gff = bioconvert_data("biocode.gff")

    with TempFile(suffix=".gff") as tmp:
        GENBANK2GFF3(genbank_path, tmp.name)(method="biocode")
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_gff)
def test_phy2nx_biopython(method):
    """Convert ``<method>.phylip`` to NEXUS with ``method`` and compare md5 sums."""
    phylip_path = bioconvert_data(method + ".phylip")
    reference_nexus = bioconvert_data(method + ".nexus")
    with TempFile(suffix=".nexus") as tmp:
        convert = PHYLIP2NEXUS(phylip_path, tmp.name)
        convert(method=method)

        # The produced file must hash to the same value as the reference.
        produced_md5 = md5(tmp.name)
        expected_md5 = md5(reference_nexus)
        assert produced_md5 == expected_md5
Esempio n. 8
0
def test_sra2fastq_single(method):
    """Convert the ``SRR6477205`` identifier to FASTQ with ``method``.

    The md5 sum of the produced file must equal that of the reference
    ``SRR6477205.fastq``.
    """
    accession = "SRR6477205"
    reference_fastq = bioconvert_data("SRR6477205.fastq")
    with TempFile(suffix=".fastq") as tmp:
        # NOTE(review): the meaning of the third positional argument (True)
        # is not visible here -- see the SRA2FASTQ constructor.
        convert = SRA2FASTQ(accession, tmp.name, True)
        convert(method=method)

        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_fastq)
def test_nexus2clustal_biopython():
    """Convert a NEXUS file to Clustal with the ``biopython`` method.

    The md5 sum of the produced file must equal that of the reference
    ``nexus2clustal_biopython.clustal``.
    """
    infile = bioconvert_data("nexus2clustal_biopython.nexus")
    outfile = bioconvert_data("nexus2clustal_biopython.clustal")
    # The temporary output holds Clustal data, so it carries a ".clustal"
    # suffix (it used to be ".nexus", the input format).
    with TempFile(suffix=".clustal") as tempfile:
        converter = NEXUS2CLUSTAL(infile, tempfile.name)
        converter(method='biopython')

        # Check that the output is correct with a checksum
        assert md5(tempfile.name) == md5(outfile)
Esempio n. 10
0
def test_clustal2stockholm_squizz():
    """Convert ``squizz.clustal`` to Stockholm with ``squizz`` and compare md5 sums."""
    clustal_path = bioconvert_data("squizz.clustal")
    reference_stockholm = bioconvert_data("squizz.stockholm")
    with TempFile(suffix=".stockholm") as tmp:
        CLUSTAL2STOCKHOLM(clustal_path, tmp.name)(method='squizz')

        # Compare the produced file with the reference through their hashes.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_stockholm)
def test_stockholm2clustal_biopython():
    """Convert ``biopython.stockholm`` to Clustal with ``biopython`` and compare md5 sums."""
    stockholm_path = bioconvert_data("biopython.stockholm")
    reference_clustal = bioconvert_data("biopython.clustal")
    with TempFile(suffix=".clustal") as tmp:
        convert = STOCKHOLM2CLUSTAL(stockholm_path, tmp.name)
        convert(method='biopython')

        produced_md5 = md5(tmp.name)
        expected_md5 = md5(reference_clustal)
        assert produced_md5 == expected_md5
def test_clustal2fasta_biopython():
    """Convert ``biopython.clustal`` to FASTA with ``biopython`` and compare md5 sums."""
    clustal_path = bioconvert_data("biopython.clustal")
    reference_fasta = bioconvert_data("biopython.fasta")
    with TempFile(suffix=".fasta") as tmp:
        CLUSTAL2FASTA(clustal_path, tmp.name)(method='biopython')

        # The conversion is correct when both files hash identically.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_fasta)
def test_stockholm2phylip_squizz():
    """Convert ``squizz.stockholm`` to PHYLIP with ``squizz`` and compare md5 sums."""
    stockholm_path = bioconvert_data("squizz.stockholm")
    reference_phylip = bioconvert_data("squizz.phylip")
    with TempFile(suffix=".phylip") as tmp:
        convert = STOCKHOLM2PHYLIP(stockholm_path, tmp.name)
        convert(method='squizz')

        produced_md5 = md5(tmp.name)
        expected_md5 = md5(reference_phylip)
        assert produced_md5 == expected_md5
Esempio n. 14
0
def test_nx2xml_biopython(method):
    """Convert ``test_wig2bed.wig`` to BED with ``method`` and compare md5 sums.

    NOTE(review): despite its name, this test exercises ``WIG2BED``; the name
    is kept unchanged so that existing test selections keep working.
    """
    infile = bioconvert_data("test_wig2bed.wig")
    outfile = bioconvert_data("test_wig2bed.bed")
    # The temporary output holds BED data, so it carries a ".bed" suffix
    # (it used to be ".phyloxml", left over from another test).
    with TempFile(suffix=".bed") as tempfile:
        converter = WIG2BED(infile, tempfile.name)
        converter(method=method)

        # Check that the output is correct with a checksum
        assert md5(tempfile.name) == md5(outfile)
Esempio n. 15
0
def test_twobit2fasta_ucsc(method):
    """Convert ``ucsc.2bit`` to FASTA with ``method`` and compare md5 sums."""
    twobit_path = bioconvert_data("ucsc.2bit")
    reference_fasta = bioconvert_data("ucsc.fasta")
    with TempFile(suffix=".fasta") as tmp:
        TWOBIT2FASTA(twobit_path, tmp.name)(method=method)

        # Compare the produced file with the reference through their hashes.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_fasta)
Esempio n. 16
0
def test_clustal2nexus_goalign():
    """Convert ``goalign.clustal`` to NEXUS with ``goalign`` and compare md5 sums."""
    clustal_path = bioconvert_data("goalign.clustal")
    reference_nexus = bioconvert_data("goalign.nexus")
    with TempFile(suffix=".nexus") as tmp:
        convert = CLUSTAL2NEXUS(clustal_path, tmp.name)
        convert(method='goalign')

        produced_md5 = md5(tmp.name)
        expected_md5 = md5(reference_nexus)
        assert produced_md5 == expected_md5
Esempio n. 17
0
def test_phylip2clustal_squizz():
    """Convert ``squizz.phylip`` to Clustal with ``squizz`` and compare md5 sums."""
    phylip_path = bioconvert_data("squizz.phylip")
    reference_clustal = bioconvert_data("squizz.clustal")
    with TempFile(suffix=".clustal") as tmp:
        PHYLIP2CLUSTAL(phylip_path, tmp.name)(method='squizz')

        # The conversion is correct when both files hash identically.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_clustal)
Esempio n. 18
0
def test_nx2fa_biopython(method):
    """Convert ``<method>.nexus`` to FASTA with ``method`` and compare md5 sums."""
    nexus_path = bioconvert_data(method + ".nexus")
    reference_fasta = bioconvert_data(method + ".fasta")
    with TempFile(suffix=".fasta") as tmp:
        convert = NEXUS2FASTA(nexus_path, tmp.name)
        convert(method=method)

        produced_md5 = md5(tmp.name)
        expected_md5 = md5(reference_fasta)
        assert produced_md5 == expected_md5
Esempio n. 19
0
def test_bigwig2bedgraph_ucsc(method):
    """Convert ``ucsc.bigwig`` to bedGraph with ``method`` and compare md5 sums."""
    bigwig_path = bioconvert_data("ucsc.bigwig")
    reference_bedgraph = bioconvert_data("ucsc.bedgraph")
    with TempFile(suffix=".bedgraph") as tmp:
        BIGWIG2BEDGRAPH(bigwig_path, tmp.name)(method=method)

        # Compare the produced file with the reference through their hashes.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_bedgraph)
Esempio n. 20
0
def test_conv():
    """Convert ``test_v1.json`` to YAML with the default method.

    The md5 sum of the produced file must equal that of
    ``test_v1_nocomments.yaml``.
    """
    json_path = bioconvert_data("test_v1.json")
    reference_yaml = bioconvert_data("test_v1_nocomments.yaml")
    with TempFile(suffix=".yaml") as tmp:
        converter = JSON2YAML(json_path, tmp.name)
        converter()

        produced_md5 = md5(tmp.name)
        expected_md5 = md5(reference_yaml)
        assert produced_md5 == expected_md5
def test_fa2phy_squizz():
    """Convert ``squizz.fasta`` to PHYLIP with ``squizz`` and compare md5 sums."""
    fasta_path = bioconvert_data("squizz.fasta")
    reference_phylip = bioconvert_data("squizz.phylip")
    with TempFile(suffix=".phylip") as tmp:
        FASTA2PHYLIP(fasta_path, tmp.name)(method='squizz')

        # The conversion is correct when both files hash identically.
        produced_md5 = md5(tmp.name)
        assert produced_md5 == md5(reference_phylip)
Esempio n. 22
0
def test_nw2xml_biopython(method):
    """Convert ``<method>.newick`` to PhyloXML with ``method`` and compare md5 sums."""
    newick_path = bioconvert_data(method + ".newick")
    reference_xml = bioconvert_data(method + ".xml")
    with TempFile(suffix=".xml") as tmp:
        convert = NEWICK2PHYLOXML(newick_path, tmp.name)
        convert(method=method)

        produced_md5 = md5(tmp.name)
        expected_md5 = md5(reference_xml)
        assert produced_md5 == expected_md5
def test_phylip2stockholm_biopython():
    """Convert ``biopython.phylip`` to Stockholm with the ``biopython`` method.

    The md5 sum of the produced file must equal that of the reference
    ``biopython.stockholm``.
    """
    infile = bioconvert_data("biopython.phylip")
    outfile = bioconvert_data("biopython.stockholm")
    # The temporary output holds Stockholm data, so it carries a ".stockholm"
    # suffix (it used to be ".fasta", left over from another test).
    with TempFile(suffix=".stockholm") as tempfile:
        converter = PHYLIP2STOCKHOLM(infile, tempfile.name)
        converter(method='biopython')

        # Check that the output is correct with a checksum
        assert md5(tempfile.name) == md5(outfile)
Esempio n. 24
0
def test_conv(method):
    """Convert the VCF test file to wiggle with ``method`` and compare md5 sums."""
    vcf_path = bioconvert_data("test_vcf2bcf_v1.vcf")
    reference_wiggle = bioconvert_data("test_vcf2wiggle.wiggle")
    # The reference checksum is computed up front, before the conversion.
    expected_md5 = md5(reference_wiggle)

    with TempFile(suffix=".wiggle") as tmp:
        VCF2WIGGLE(vcf_path, tmp.name)(method=method)

        produced_md5 = md5(tmp.name)
        assert produced_md5 == expected_md5, "{} failed".format(method)
Esempio n. 25
0
def test_conv(method):
    """Convert the tabulated ODS test file to CSV with ``method``.

    The md5 sum of the produced file must match one of two accepted
    references: the plain CSV or the variant with 3 more blank lines.
    """
    ods_path = bioconvert_data("test_tabulated.ods")
    accepted_references = [
        bioconvert_data(name)
        for name in ("test_tabulated.csv",
                     "test_tabulated_with_3_more_blank_lines.csv")
    ]
    with TempFile(suffix=".csv") as tmp:
        converter = ODS2CSV(ods_path, tmp.name)
        converter(method=method)
        produced_md5 = md5(tmp.name)
        accepted_md5 = [md5(path) for path in accepted_references]
        assert produced_md5 in accepted_md5
Esempio n. 26
0
def _test_conv(method):
    """Convert ``test_measles.bigwig`` to wiggle with ``method`` and compare md5 sums.

    NOTE(review): the leading underscore presumably keeps this test from
    being collected -- confirm before renaming.
    """
    bigwig_path = bioconvert_data("test_measles.bigwig")
    reference_wiggle = bioconvert_data("test_bigwig2wiggle.wiggle")
    # The reference checksum is computed up front, before the conversion.
    expected_md5 = md5(reference_wiggle)

    with TempFile(suffix=".wiggle") as tmp:
        BIGWIG2WIGGLE(bigwig_path, tmp.name)(method=method)

        produced_md5 = md5(tmp.name)
        assert produced_md5 == expected_md5, "{} failed".format(method)
Esempio n. 27
0
def test_conv():
    """Convert the SAM test file to PAF with the default method.

    Checks the md5 sum of the output against the reference PAF file and that
    the converter reports 17 skipped records through ``skipped``.
    """
    # ``where`` is not defined in this function; it must come from the
    # enclosing module.
    sam_path = bioconvert_data("test_sam2paf_v1.sam", where)
    reference_paf = bioconvert_data("test_sam2paf_v1.paf", where)
    expected_md5 = md5(reference_paf)

    with TempFile(suffix=".paf") as tmp:
        converter = SAM2PAF(sam_path, tmp.name)
        converter()
        produced_md5 = md5(tmp.name)
        assert expected_md5 == produced_md5
        assert converter.skipped == 17
Esempio n. 28
0
def test_converter_without_converter():
    """Check that the ``bioconvert`` command and ``converter.main()`` agree.

    The same FASTQ input is converted twice without naming a converter: once
    through the ``bioconvert`` executable in a subprocess and once in-process
    through ``converter.main()``. The md5 sums of the two outputs must be
    identical.
    """
    import sys

    infile = bioconvert_data("test_fastq2fasta_v1.fastq")
    with TempFile(suffix=".fasta") as tempfile1, \
            TempFile(suffix=".fasta") as tempfile2:
        # Pass an argument list instead of a shell string so that paths
        # containing spaces or shell metacharacters are handed over intact.
        cmd = ["bioconvert", infile, tempfile1.name, "--force"]
        assert subprocess.call(cmd) == 0

        # converter.main() is called without arguments, so it is driven
        # through sys.argv. Restore the original value afterwards so the
        # patched command line does not leak into other tests.
        saved_argv = sys.argv
        sys.argv = ["bioconvert", infile, tempfile2.name, "--force"]
        try:
            converter.main()
        finally:
            sys.argv = saved_argv

        assert md5(tempfile1.name) == md5(tempfile2.name)
Esempio n. 29
0
def test_conv(method):
    """Round-trip a CSV file through XLS and back to CSV with ``method``.

    An XLS file may contain bold text, borders and so on, so instead of
    hashing the XLS directly it is converted back to CSV and that result is
    compared with the original CSV.
    """
    csv_path = bioconvert_data("test_tabulated.csv")
    reference_csv = bioconvert_data("test_tabulated.csv")
    with TempFile(suffix=".csv") as csv_tmp:
        with TempFile(suffix=".xls") as xls_tmp:
            # CSV -> XLS with the default method.
            to_xls = CSV2XLS(csv_path, xls_tmp.name)
            to_xls()
            # XLS -> CSV with the method under test.
            to_csv = XLS2CSV(xls_tmp.name, csv_tmp.name)
            to_csv(method=method)
            produced_md5 = md5(csv_tmp.name)
            assert produced_md5 == md5(reference_csv)
Esempio n. 30
0
def test_conv(method):
    """Convert ``ucsc.bigbed`` to wiggle with ``method`` and compare md5 sums."""
    bigbed_path = bioconvert_data("ucsc.bigbed")
    reference_wiggle = bioconvert_data("ucsc.wiggle")
    # The reference checksum is computed up front, before the conversion.
    expected_md5 = md5(reference_wiggle)

    with TempFile(suffix=".wiggle") as tmp:
        BIGBED2WIGGLE(bigbed_path, tmp.name)(method=method)

        produced_md5 = md5(tmp.name)
        assert produced_md5 == expected_md5, "{} failed".format(method)
Esempio n. 31
0
    def _download_minikraken(self, verbose=True):
        """Fetch ``minikraken.tgz`` into the sequana configuration directory.

        The configuration directory and its ``taxonomy`` sub-directory are
        created first. The archive is downloaded unless a local copy with the
        expected md5 sum is already present. ``verbose`` is accepted but not
        used here.
        """
        devtools = DevTools()
        root = sequana_config_path + os.sep + ""
        devtools.mkdir(root)
        devtools.mkdir(root + os.sep + "taxonomy")

        logger.info("Downloading minikraken (4Gb)")

        archive = root + os.sep + "minikraken.tgz"
        expected_md5 = "30eab12118158d0b31718106785195e2"
        # The checksum is only computed when the file exists.
        if not os.path.exists(archive) or md5(archive) != expected_md5:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", archive)
        else:
            logger.warning("%s already present" % archive)
Esempio n. 32
0
    def _download_sequana_db1(self, verbose=True):
        """Download the ``sequana_db1`` kraken database into the sequana config directory.

        The database directory and its ``taxonomy`` sub-directory are created,
        then each database file is fetched from Synapse unless a local copy
        with the expected md5 sum already exists in its destination directory.
        Finally ``annotations.csv`` is downloaded (unconditionally) next to
        the database files.

        :param verbose: accepted for interface compatibility; not used here.
        """
        from easydev import md5

        dbname = "sequana_db1"
        dir1 = sequana_config_path + os.sep + dbname
        dir2 = dir1 + os.sep + "taxonomy"
        self.dv.mkdir(dir1)
        self.dv.mkdir(dir2)

        logger.info("Downloading about 8Gb of data (if not already downloaded) from"
            " Synapse into %s" % dir1)

        # (file name, expected md5 sum, Synapse id, destination directory).
        # database.kdb is the large one (8Gb); the last three entries belong
        # to the taxonomy directory.
        targets = [
            ("ena_list.txt", "a9cc6268f3338d1632c4712a412593f2",
             'syn6171700', dir1),
            ("database.idx", "2fa4a99a4f52f2f04c5a965adb1534ac",
             'syn6171017', dir1),
            ("database.kdb", "ff698696bfc88fe83bc201937cd9cbdf",
             'syn6171107', dir1),
            ("names.dmp", "10bc7a63c579de02112d125a51fd65d0",
             'syn6171286', dir2),
            ("nodes.dmp", "a68af5a60434e2067c4a0a16df873980",
             'syn6171289', dir2),
            ("taxons.txt", "e78fbb43b3b41cbf4511d6af16c0287f",
             'syn6171290', dir2),
        ]

        for basename, md5sum, synapse_id, destination in targets:
            # Look for the file inside the directory it is downloaded to.
            # Previously the path was built as ``dir1 + name`` (no separator,
            # and dir1 even for taxonomy files), so the check never matched
            # and everything was downloaded again on every call.
            # NOTE(review): assumes _download_from_synapse stores the file
            # under this name in ``destination`` -- confirm.
            filename = os.path.join(destination, basename)
            if os.path.exists(filename) and md5(filename) == md5sum:
                continue
            self._download_from_synapse(synapse_id, destination)

        logger.info('done. You should have a kraken DB in %s' % dir1)

        # The annotations
        wget("https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
            dir1 + os.sep + "annotations.csv")
Esempio n. 33
0
    def _download_kraken_toydb(self, verbose=True):
        """Fetch the kraken toy database from the sequana data repository.

        The files are stored below ``kraken_toydb`` in the sequana
        configuration directory (about 32Mb of data). A file is downloaded
        unless a local copy with the expected md5 sum is already present.
        ``verbose`` is accepted but not used here.
        """
        devtools = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        devtools.mkdir(base)
        devtools.mkdir(base + os.sep + "taxonomy")

        baseurl = "https://github.com/sequana/data/raw/master/"

        logger.info("Downloading the database into %s" % base)

        # (path relative to the database directory, expected md5 sum)
        expected = (
            ("database.idx", "28661f8baf0514105b0c6957bec0fc6e"),
            ("database.kdb", "97a39d44ed86cadea470352d6f69748d"),
            ("taxonomy/names.dmp", "d91a0fcbbc0f4bbac918755b6400dea6"),
            ("taxonomy/nodes.dmp", "c8bae69565af2170ece194925b5fdeb9"),
        )

        for relative_name, md5sum in expected:
            url = baseurl + "kraken_toydb/%s" % relative_name
            local_path = base + os.sep + relative_name
            # Download only if required; the checksum is computed only when
            # the file exists.
            if not os.path.exists(local_path) or md5(local_path) != md5sum:
                logger.info("Downloading %s" % url)
                wget(url, local_path)
            else:
                logger.warning("%s already present" % local_path)