Example #1
    def download(self, name, base_url, cookies, downloader=None):
        """Download the file if it has a URL. Otherwise, recursively search the children.

        See also:
        <https://genome.jgi.doe.gov/portal/help/download.jsf>

        Args:
            name (str): Name of the file to download.
            base_url (str): Link to the file on the internet.
            cookies (str): Path to the cookies file.
            downloader (str, optional): Use a given downloader. One of
                wget|curl|powershell|insecure. Defaults to None.
        """
        if self.name == name and base_url and self.url:
            url = urljoin(base_url, self.url)
            download(url,
                     filename=name,
                     debug=True,
                     cookies=cookies,
                     downloader=downloader)
        else:
            for child_name, child in self.items():
                if child_name == name:
                    child.download(name,
                                   base_url,
                                   cookies,
                                   downloader=downloader)
        return name
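The recursive pattern above is easier to see in isolation. Below is a minimal, self-contained sketch of the same idea; the Node class, file name, and base URL are hypothetical, and urljoin is aliased to os.path.join the way jcvi does elsewhere in this listing (see Example #15).

from os.path import join as urljoin  # same aliasing trick as in Example #15

class Node(dict):
    """Toy stand-in for a JGI directory entry; children are stored in the dict itself."""

    def __init__(self, name, url=None):
        super().__init__()
        self.name, self.url = name, url

    def download(self, name, base_url):
        # Fetch this node if it matches and carries a URL ...
        if self.name == name and base_url and self.url:
            print("would fetch", urljoin(base_url, self.url))
        else:
            # ... otherwise delegate to the child with the requested name
            for child_name, child in self.items():
                if child_name == name:
                    child.download(name, base_url)
        return name

root = Node("root")
root["genome.fa.gz"] = Node("genome.fa.gz", url="portal/genome.fa.gz")
root.download("genome.fa.gz", "https://genome.jgi.doe.gov")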
Example #2
def sra(args):
    """
    %prog sra term

    Given an SRA run ID, fetch the corresponding .sra file
    from the sra-instant FTP
    """
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    p = OptionParser(sra.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term,
                           "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    download(download_url)
Example #3
def sra(args):
    """
    %prog sra term

    Given an SRA run ID, fetch the corresponding .sra file
    from the sra-instant FTP
    """
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    p = OptionParser(sra.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term, "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    download(download_url)
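Both sra() variants above build the same path; because several components are handed to urljoin at once, that urljoin is jcvi's os.path.join alias (see Example #15) rather than urllib's two-argument function. A standard-library-only sketch of just the URL construction:

import os.path
import re

SRA_BASE_URL = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
SRA_RUN_ID_RE = re.compile(r"^([DES]RR)(\d{3})(\d{3,4})$")

def sra_download_url(term):
    """Return the sra-instant URL for a run ID such as SRR1001901."""
    m = SRA_RUN_ID_RE.match(term)
    if m is None:
        raise ValueError("invalid SRA run ID: {0}".format(term))
    prefix, subprefix = m.group(1), m.group(1) + m.group(2)
    # Plain path joining reproduces jcvi's urljoin-as-os.path.join behavior
    return os.path.join(SRA_BASE_URL, prefix, subprefix, term, term + ".sra")

print(sra_download_url("SRR1001901"))
# ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR100/SRR1001901/SRR1001901.sra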
Example #4
def download_species_ensembl(species, valid_species, url):
    assert species in valid_species, "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    ann_url = urljoin(url, "gtf/{0}".format(species))
    cds_url = urljoin(url, "fasta/{0}/cds".format(species))

    for u in (ann_url, cds_url):
        valid_files = [x for x in ls_ftp(u) if x.endswith(".gz")]
        for f in valid_files:
            f = urljoin(u, f)
            download(f)
Example #5
def download_species_ensembl(species, valid_species, url):
    assert species in valid_species, \
            "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    ann_url = urljoin(url, "gtf/{0}".format(species))
    cds_url = urljoin(url, "fasta/{0}/cds".format(species))

    for u in (ann_url, cds_url):
        valid_files = [x for x in ls_ftp(u) if x.endswith(".gz")]
        for f in valid_files:
            f = urljoin(u, f)
            download(f)
Example #6
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
Example #7
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option(
        "--img",
        default=False,
        action="store_true",
        help="Extract <img> tags [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (url, ) = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = "img" if img else "a"
    src = "src" if img else "href"
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
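For comparison, a self-contained sketch of the same link extraction that relies only on the standard library plus BeautifulSoup 4 (the example URL is a placeholder):

from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup  # assumes beautifulsoup4 is installed

def extract_links(url, img=False):
    """Return absolute link targets from <a href> (or <img src> when img=True)."""
    page = urlopen(url).read()
    soup = BeautifulSoup(page, "html.parser")
    tag, src = ("img", "src") if img else ("a", "href")
    # Resolve relative links against the page URL, as links() does above
    return [urljoin(url, a.get(src)) for a in soup.find_all(tag) if a.get(src)]

for link in extract_links("https://www.example.com"):
    print(link)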
Example #8
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core and
    Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`. Can
    perform FASTA tidy if requested.
    """
    p = OptionParser(mask.__doc__)
    p.add_option(
        "--db", help="Contaminant db other than Ecoli K12 [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    assert op.exists(fastafile)

    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"
    vecbedfile = blast([fastafile])
    ecoliurl = "ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__DH10B_uid58979/NC_010473.fna"
    ecolifile = opts.db or download(ecoliurl, filename="Ecoli.fasta")
    assert op.exists(ecolifile)
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    cmd = "cat {0} {1}".format(vecbedfile, ecolibedfile)
    cmd += " | mergeBed -nms -d 100 -i stdin"
    cmd += " | maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(fastafile, outfastafile)
    sh(cmd)

    return tidy([outfastafile])
Example #9
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core and
    Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`. Can
    perform FASTA tidy if requested.
    """
    p = OptionParser(mask.__doc__)
    p.add_option("--db", help="Contaminant db other than Ecoli K12 [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    assert op.exists(fastafile)

    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"
    vecbedfile = blast([fastafile])
    ecoliurl = "ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Escherichia_coli_K_12_substr__DH10B_uid58979/NC_010473.fna"
    ecolifile = opts.db or download(ecoliurl, filename="Ecoli.fasta")
    assert op.exists(ecolifile)
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    cmd = "cat {0} {1}".format(vecbedfile, ecolibedfile)
    cmd += " | mergeBed -nms -d 100 -i stdin"
    cmd += " | maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(fastafile, outfastafile)
    sh(cmd)

    return tidy([outfastafile])
Example #10
    def download(self, name, base_url, cookies):
        """Download the file if it has a URL. Otherwise, recursively search the children.

        See also:
        <https://genome.jgi.doe.gov/portal/help/download.jsf>

        Args:
            name (str): Name of the file to download.
            base_url (str): Link to the file on the internet.
            cookies (str): Path to the cookies file.
        """
        if self.name == name and base_url and self.url:
            url = urljoin(base_url, self.url)
            download(url, filename=name, debug=True, cookies=cookies)
        else:
            for child_name, child in self.items():
                if child_name == name:
                    child.download(name, base_url, cookies)
        return name
Example #11
def download_species_phytozome9(species, valid_species, base_url, assembly=False):
    assert species in valid_species, "{} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(base_url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))
    res = {}
    if assembly:
        res["asm"] = download(asm_url)
    res["gff"] = download(ann_url)
    res["cds"] = download(cds_url)
    return res
Example #12
def test_download():
    from jcvi.apps.base import download
    from jcvi.apps.vecscreen import ECOLI_URL, UNIVEC_URL

    ret = download("http://www.google.com")
    assert ret == "index.html"
    remove_if_exists(ret)

    ret = download(ECOLI_URL, filename="ecoli.fa.gz")
    assert ret == "ecoli.fa.gz"
    remove_if_exists(ret)

    ret = download(UNIVEC_URL, filename="univec.fa.gz")
    assert ret == "univec.fa.gz"
    remove_if_exists(ret)

    ret = download(UNIVEC_URL)
    assert ret == "UniVec_Core"
    remove_if_exists(ret)
Example #13
def load_GODag():
    """
    OBO file retrieved from http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo
    """
    from jcvi.apps.base import download

    so_file_url = "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo"
    so_file = download(so_file_url, debug=False)

    return GODag(so_file)
Example #14
def load_GODag():
    """
    OBO file retrieved from http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo
    """
    from jcvi.apps.base import download

    so_file_url = "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo"
    so_file = download(so_file_url, debug=False)

    return GODag(so_file)
Example #15
def download_species_phytozome(species, valid_species, url, assembly=False):
    from os.path import join as urljoin

    assert species in valid_species, \
            "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))
    if assembly:
        download(asm_url)
    for u in (ann_url, cds_url):
        download(u)
Example #16
def download_species_phytozome(species, valid_species, url, assembly=False):
    from os.path import join as urljoin

    assert species in valid_species, \
            "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))
    if assembly:
        download(asm_url)
    for u in (ann_url, cds_url):
        download(u)
Example #17
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_aligner(aligner="bwa")
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    contigs = args[0]
    libtxt = write_libraries(args[1:], aligner=opts.aligner)
    # Requires getopts.pl which may be missing
    download("http://web.vims.edu/bridge/bridge2/aw/lib/getopts.pl")

    cmd = "perl " + op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    cmd += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    runsh = "run.sh"
    write_file(runsh, cmd)
Example #18
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_aligner(aligner="bwa")
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    contigs = args[0]
    libtxt = write_libraries(args[1:], aligner=opts.aligner)
    # Requires getopts.pl which may be missing
    download("http://mflib.org/xampp/perl/lib/getopts.pl")

    cmd = "perl " + op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    cmd += " -l {0} -s {1} -T {2}".format(libtxt, contigs, opts.cpus)
    runsh = "run.sh"
    write_file(runsh, cmd)
Example #19
def blast(args):
    """
    %prog blast fastafile 

    Run BLASTN against database (default is UniVec_Core).  Output .bed format
    on the vector/contaminant ranges.
    """
    p = OptionParser(blast.__doc__)
    p.add_option("--dist",
                 dest="dist",
                 default=100,
                 type="int",
                 help="Merge adjacent HSPs separated by [default: %default]")
    p.add_option("--db",
                 dest="db",
                 default=None,
                 help="Use a different database rather than UniVec_Core")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastaprefix = fastafile.split(".", 1)[0]

    univec = opts.db or download(
        "ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core")
    uniprefix = univec.split(".", 1)[0]

    fastablast = fastaprefix + ".{0}.blast".format(uniprefix)

    prog = run_megablast if opts.db else run_vecscreen
    prog(infile=fastafile, outfile=fastablast, db=univec, pctid=95, hitlen=50)

    fp = open(fastablast)
    ranges = []
    for row in fp:
        b = BlastLine(row)
        ranges.append((b.query, b.qstart, b.qstop))

    merged_ranges = range_merge(ranges, dist=opts.dist)
    bedfile = fastaprefix + ".{0}.bed".format(uniprefix)
    fw = must_open(bedfile, "w")
    for seqid, start, end in merged_ranges:
        print("\t".join(str(x) for x in (seqid, start - 1, end, uniprefix)), file=fw)

    return bedfile
Example #20
def download_srr_term(term):
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term, "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    return download(download_url)
Example #21
def download_srr_term(term):
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term, "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    return download(download_url)
Example #22
def load_GODag(obo_url: str, prt: Optional[IO] = None) -> (GODag, str):
    """
    Load given obo url and returns GODag object.

    Args:
        obo_url (str): URL to the remote OBO file.
        prt (Optional[IO]): IO stream to print verbose information.

    Returns:
        (GODag, str): GODag object that contains the dict, and path to the downloaded OBO file.
    """

    from jcvi.apps.base import download

    so_file = download(obo_url, debug=False)

    return GODag(so_file, prt=prt), so_file
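A hedged usage sketch of the function above; the import path is an assumption and the OBO URL is simply the go-basic.obo file used in Example #26, so adjust both to your setup.

import sys

# Import path is an assumption; load_GODag() is the function defined above.
from jcvi.formats.obo import load_GODag

go_dag, obo_path = load_GODag("http://geneontology.org/ontology/go-basic.obo", prt=sys.stderr)
print("{0} terms loaded from {1}".format(len(go_dag), obo_path))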
Example #23
def validate_term(term):
    """
    Validate an SO term against so.obo
    OBO file retrieved from 'http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo'
    """
    from jcvi.formats.obo import GODag
    from jcvi.apps.base import download

    so_file_url = "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo"
    so_file = download(so_file_url)

    so = GODag(so_file)
    valid_names = so.valid_names
    if term not in valid_names:
        logging.error("Term `{0}` does not exist. Please refer to `{1}`".format(term, so_file_url))
        sys.exit()

    return True
Example #24
def validate_term(term):
    """
    Validate an SO term against so.obo
    OBO file retrieved from 'http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo'
    """
    from jcvi.formats.obo import GODag
    from jcvi.apps.base import download

    so_file_url = "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo"
    so_file = download(so_file_url)

    so = GODag(so_file)
    valid_names = so.valid_names
    if term not in valid_names:
        logging.error(
            "Term `{0}` does not exist. Please refer to `{1}`".format(
                term, so_file_url))
        sys.exit()

    return True
Example #25
def blast(args):
    """
    %prog blast fastafile

    Run BLASTN against database (default is UniVec_Core).  Output .bed format
    on the vector/contaminant ranges.
    """
    p = OptionParser(blast.__doc__)
    p.add_option("--dist", default=100, type="int",
            help="Merge adjacent HSPs separated by [default: %default]")
    p.add_option("--db",
            help="Use a different database rather than UniVec_Core")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastaprefix = fastafile.split(".", 1)[0]

    univec = opts.db or download("ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core")
    uniprefix = univec.split(".", 1)[0]

    fastablast = fastaprefix + ".{0}.blast".format(uniprefix)

    prog = run_megablast if opts.db else run_vecscreen
    prog(infile=fastafile, outfile=fastablast, db=univec, pctid=95, hitlen=50)

    fp = open(fastablast)
    ranges = []
    for row in fp:
        b = BlastLine(row)
        ranges.append((b.query, b.qstart, b.qstop))

    merged_ranges = range_merge(ranges, dist=opts.dist)
    bedfile = fastaprefix + ".{0}.bed".format(uniprefix)
    fw = must_open(bedfile, "w")
    for seqid, start, end in merged_ranges:
        print("\t".join(str(x) for x in (seqid, start - 1, end, uniprefix)), file=fw)

    return bedfile
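range_merge() and BlastLine come from jcvi; the snippet below is a self-contained sketch of only the merge-and-write step, with a simplified merge, to make explicit why blast() subtracts 1 from the start when emitting BED (BED coordinates are 0-based, half-open):

def merge_ranges(ranges, dist=100):
    """Merge (seqid, start, end) hits on the same seqid that lie within `dist` bp."""
    merged = []
    for seqid, start, end in sorted(ranges):
        if merged and merged[-1][0] == seqid and start - merged[-1][2] <= dist:
            merged[-1][2] = max(merged[-1][2], end)
        else:
            merged.append([seqid, start, end])
    return merged

hits = [("ctg1", 10, 60), ("ctg1", 120, 180), ("ctg2", 5, 40)]
for seqid, start, end in merge_ranges(hits):
    # 1-based inclusive BLAST coordinates -> 0-based half-open BED coordinates
    print("\t".join(str(x) for x in (seqid, start - 1, end, "UniVec_Core")))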
Example #26
def test_oboreader():
    import os
    from jcvi.apps.base import download
    from jcvi.formats.obo import GODag

    obo_file = download("http://geneontology.org/ontology/go-basic.obo")

    go = GODag(obo_file, prt=None)
    r1, r2, r3 = [
        rec for i, rec in enumerate(
            sorted(set(go.values()), key=lambda x: x.item_id)) if i < 3
    ]
    assert r1.item_id == "GO:0000001"
    assert r1.name == "mitochondrion inheritance"
    assert r2.item_id == "GO:0000002"
    assert r2.namespace == "biological_process"
    assert r3.item_id == "GO:0000003"
    assert tuple(sorted(r3.alt_ids)) == ("GO:0019952", "GO:0050876")

    if os.path.exists(obo_file):
        os.remove(obo_file)
Example #27
def mask(args):
    """
    %prog mask fastafile

    Mask the contaminants. By default, this will compare against UniVec_Core and
    Ecoli.fasta. Merge the contaminant results, and use `maskFastaFromBed`. Can
    perform FASTA tidy if requested.
    """
    p = OptionParser(mask.__doc__)
    p.add_option(
        "--db",
        default=ECOLI_URL,
        help=
        "Contaminant db other than Ecoli K12, will download if file starts with http://, https://, or ftp://",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    db = opts.db
    assert op.exists(fastafile)

    outfastafile = fastafile.rsplit(".", 1)[0] + ".masked.fasta"
    vecbedfile = blast([fastafile])
    ecolifile = (download(db, filename="Ecoli.fasta", handle_gzip=True)
                 if is_internet_file(db) else db)
    assert op.exists(ecolifile)
    ecolibedfile = blast([fastafile, "--db={0}".format(ecolifile)])

    cmd = "cat {0} {1}".format(vecbedfile, ecolibedfile)
    cmd += " | sort -k1,1 -k2,2n"
    cmd += " | mergeBed -c 4 -o distinct -d 100 -i stdin"
    cmd += " | maskFastaFromBed -fi {0} -bed stdin -fo {1}".format(
        fastafile, outfastafile)
    sh(cmd)

    return tidy([outfastafile])
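A hedged invocation sketch; the import path follows Example #12 (which imports from jcvi.apps.vecscreen), the FASTA names are placeholders, and BLAST plus bedtools (mergeBed/maskFastaFromBed) must be on $PATH for the pipeline above to run.

# Import path and file names are assumptions, not confirmed by the listing above.
from jcvi.apps.vecscreen import mask

# Screen against UniVec_Core and the default E. coli contaminant database ...
mask(["contigs.fasta"])
# ... or supply a custom contaminant FASTA (or http/ftp URL) via --db
mask(["contigs.fasta", "--db=my_contaminants.fasta"])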
Example #28
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
            help="Path to trimmomatic jar file [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
            help="Phred score offset [default: guess]")
    p.add_option("--nofrags", default=False, action="store_true",
            help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
            help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
            help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
            help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
            help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
            help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = ("http://www.usadellab.org/cms/uploads/supplementary/"
           "Trimmomatic/Trimmomatic-{0}.zip").format(tv)

    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), "Couldn't find Trimmomatic jar file at `{0}`".format(path)

    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)
    threadsflag = " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        trimlog = " -trimlog {0}".format(opts.trimlog)

    cmd = "java -Xmx4g -jar {0}".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]
    if len(args) == 1:
        cmd += " SE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        frags1 = prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        cmd += " PE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                pairs1, frags1, pairs2, frags2)))

    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)

    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)

    cmd += " MINLEN:{0}".format(opts.minlen)

    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd)
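A hedged invocation sketch for the paired-end branch; the module path and fastq names are placeholders, and a Java runtime (plus network access to fetch the Trimmomatic zip on first use) is assumed.

# Module path and file names are assumptions; adjust the import to wherever
# trim() lives in your jcvi checkout.
from jcvi.apps.trim import trim

# Passing two fastq files selects the "PE" branch shown above.
trim(["sample_1.fastq.gz", "sample_2.fastq.gz", "--phred=33", "--minlen=50", "--nofrags"])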
Example #29
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome using Globus API. Available
    species listed below. Use comma to give a list of species to download. For
    example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum

    The downloader will prompt you to enter Phytozome user name and password
    during downloading. Please register for a login at:
    https://phytozome.jgi.doe.gov/pz/portal.html.
    """
    from jcvi.apps.biomart import GlobusXMLParser

    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="12",
        choices=("9", "10", "11", "12", "12_unrestricted", "13"),
        help="Phytozome version",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    p.set_downloader()
    opts, args = p.parse_args(args)

    downloader = opts.downloader
    directory_listing = ".phytozome_directory_V{}.xml".format(opts.version)
    # Get directory listing
    base_url = "http://genome.jgi.doe.gov"
    dlist = "{}/ext-api/downloads/get-directory?organism=PhytozomeV{}".format(
        base_url, opts.version
    )

    # Make sure we have a valid cookies
    cookies = get_cookies()
    if cookies is None:
        logging.error("Error fetching cookies ... cleaning up")
        FileShredder([directory_listing])
        sys.exit(1)

    # Proceed to use the cookies and download the species list
    try:
        download(
            dlist,
            filename=directory_listing,
            cookies=cookies,
            downloader=downloader,
        )
        g = GlobusXMLParser(directory_listing)
    except:
        logging.error("Error downloading directory listing ... cleaning up")
        FileShredder([directory_listing, cookies])
        sys.exit(1)

    genomes = g.get_genomes()
    valid_species = genomes.keys()
    species_tile = tile(valid_species)
    p.set_usage("\n".join((phytozome.__doc__, species_tile)))

    if len(args) != 1:
        sys.exit(not p.print_help())

    (species,) = args
    if species == "all":
        species = ",".join(valid_species)

    species = species.split(",")
    for s in species:
        res = download_species_phytozome(
            genomes,
            s,
            valid_species,
            base_url,
            cookies,
            assembly=opts.assembly,
            downloader=downloader,
        )
        if not res:
            logging.error("No files downloaded")
        gff, fa = res.get("gff"), res.get("cds")
        if opts.format:
            format_bed_and_cds(s, gff, fa)
Example #30
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    TrimVersion = tv = "0.20"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
            help="Path to trimmomatic [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
            help="Phred score offset {0} [default: guess]".format(phdchoices))
    p.add_option("--nofrags", default=False, action="store_true",
            help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=10, type="int",
            help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=30, type="int",
            help="Minimum length after trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
            help="Do not write to gzipped files [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = ("http://www.usadellab.org/cms/uploads/supplementary/"
           "Trimmomatic/Trimmomatic-{0}.zip").format(tv)

    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path)

    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]
    if len(args) == 1:
        cmd += ".TrimmomaticSE"
        cmd += phredflag
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        frags1 = prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        cmd += ".TrimmomaticPE"
        cmd += phredflag
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                pairs1, frags1, pairs2, frags2)))

    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)
Example #31
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
            help="Path to trimmomatic jar file [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
            help="Phred score offset [default: guess]")
    p.add_option("--nofrags", default=False, action="store_true",
            help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
            help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
            help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
            help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
            help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
            help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = ("http://www.usadellab.org/cms/uploads/supplementary/"
           "Trimmomatic/Trimmomatic-{0}.zip").format(tv)

    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), "Couldn't find Trimmomatic jar file at `{0}`".format(path)

    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)
    threadsflag = " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        trimlog = " -trimlog {0}".format(opts.trimlog)

    cmd = "java -Xmx4g -jar {0}".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]
    if len(args) == 1:
        cmd += " SE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        frags1 = prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        cmd += " PE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                pairs1, frags1, pairs2, frags2)))

    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)

    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)

    cmd += " MINLEN:{0}".format(opts.minlen)

    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd)
Example #32
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    TrimVersion = tv = "0.20"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path",
                 default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic [default: %default]")
    p.add_option(
        "--phred",
        default=None,
        choices=phdchoices,
        help="Phred score offset {0} [default: guess]".format(phdchoices))
    p.add_option("--nofrags",
                 default=False,
                 action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv",
                 default=10,
                 type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen",
                 default=30,
                 type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--nogz",
                 default=False,
                 action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = ("http://www.usadellab.org/cms/uploads/supplementary/"
           "Trimmomatic/Trimmomatic-{0}.zip").format(tv)

    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path)

    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), "Please place the illumina adapter sequence in `{0}`".format(adaptersfile)

    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]
    if len(args) == 1:
        cmd += ".TrimmomaticSE"
        cmd += phredflag
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        frags1 = prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        cmd += ".TrimmomaticPE"
        cmd += phredflag
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                pairs1, frags1, pairs2, frags2)))

    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)