Example #1
0
File: fetch.py Project: yangjl/jcvi
def batch_entrez(list_of_terms, db="nuccore", retmax=1, rettype="fasta",
            batchsize=1, email=myEmail):
    """
    Search each term in turn and lazily fetch the records it matches.

    Every non-empty term is searched in `db` with at most `retmax` hits. The
    ids found are split into groups of `batchsize` and each group is fetched
    as `rettype`. Failed requests are logged and retried after a 5 second
    pause, with no upper bound on the number of attempts.

    Yields (comma-joined ids, number of ids, term, fetch handle) per group.
    """
    for term in list_of_terms:
        logging.debug("Search term %s" % term)
        if not term:
            continue

        ids = None
        searched = False
        while not searched:
            try:
                rec = Entrez.read(
                    Entrez.esearch(db=db, retmax=retmax, term=term))
                # The flag is raised before the lookup below, so a reply
                # without "IdList" is logged once and the term is abandoned
                # instead of being searched again.
                searched = True
                ids = rec["IdList"]
            except (urllib2.HTTPError, urllib2.URLError,
                    RuntimeError, KeyError) as e:
                logging.error(e)
                logging.debug("wait 5 seconds to reconnect...")
                time.sleep(5)

        if not ids:
            logging.error("term {0} not found".format(term))
            continue

        nids = len(ids)
        if nids > 1:
            logging.debug("A total of {0} results found.".format(nids))
        if batchsize != 1:
            logging.debug("Use a batch size of {0}.".format(batchsize))

        for group in list(grouper(ids, batchsize)):
            # grouper pads the last group; keep only the real ids
            members = [x for x in group if x]
            joined = ",".join(members)

            while True:
                try:
                    fetch_handle = Entrez.efetch(db=db, id=joined,
                                                 rettype=rettype, email=email)
                    break
                except (urllib2.HTTPError, urllib2.URLError,
                        RuntimeError) as e:
                    logging.error(e)
                    logging.debug("wait 5 seconds to reconnect...")
                    time.sleep(5)

            yield joined, len(members), term, fetch_handle
Example #2
0
def batch_entrez(list_of_terms, db="nuccore", retmax=1, rettype="fasta",
            batchsize=1, email=myEmail):
    """
    Run one search per term and stream back the fetched record batches.

    Empty terms are skipped. For the others, up to `retmax` ids are looked up
    in `db`, chunked into groups of `batchsize`, and each chunk is fetched as
    `rettype`. Any failed request is logged and retried after 5 seconds; the
    number of retries is not capped.

    Yields (comma-joined ids, number of ids, term, fetch handle) per chunk.
    """
    for term in list_of_terms:
        logging.debug("Search term %s" % term)
        if not term:
            continue

        ids = None
        done = False
        while not done:
            try:
                search_handle = Entrez.esearch(db=db, retmax=retmax,
                                               term=term)
                rec = Entrez.read(search_handle)
                # Marked done before reading "IdList": a reply without that
                # key is logged once and then treated as "not found".
                done = True
                ids = rec["IdList"]
            except (urllib2.HTTPError, urllib2.URLError,
                    RuntimeError, KeyError) as e:
                logging.error(e)
                logging.debug("wait 5 seconds to reconnect...")
                time.sleep(5)

        if not ids:
            logging.error("term {0} not found".format(term))
            continue

        nids = len(ids)
        if nids > 1:
            logging.debug("A total of {0} results found.".format(nids))
        if batchsize != 1:
            logging.debug("Use a batch size of {0}.".format(batchsize))

        for chunk in list(grouper(ids, batchsize)):
            # Drop the padding grouper adds to an incomplete last chunk
            real_ids = [x for x in chunk if x]
            id_string = ",".join(real_ids)

            while True:
                try:
                    fetch_handle = Entrez.efetch(db=db, id=id_string,
                                                 rettype=rettype, email=email)
                    break
                except (urllib2.HTTPError, urllib2.URLError,
                        RuntimeError) as e:
                    logging.error(e)
                    logging.debug("wait 5 seconds to reconnect...")
                    time.sleep(5)

            yield id_string, len(real_ids), term, fetch_handle
Example #3
0
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
            help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
            help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One scaffold file followed by complete (blast, sizes, bed) trios
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    # Only parse the highlight regions when --highlights was supplied
    hlbed = Bed(highlights) if highlights else None

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
            thousands(scafsize)))

        # Write a one-line sizes file describing just this scaffold
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Bind on every iteration: this name used to be assigned only under
        # --highlights, so running without the option raised
        # UnboundLocalError at the call below.
        # NOTE(review): assumes plot_one_scaffold treats highlights=None as
        # "nothing to highlight" -- confirm against its definition.
        if highlights:
            subhighlights = list(hlbed.sub_bed(scaffoldID))
        else:
            subhighlights = None

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
Example #4
0
def iter_project(folder, pattern, n=2):
    """
    Yield groups of `n` files matched in `folder` together with their prefix.

    Files returned by iglob(folder, pattern) are taken `n` at a time. Each
    complete group is yielded as (list of paths, value of pairspf() on the
    basenames). A trailing incomplete group is skipped.
    """
    # Check for paired reads and extract project id
    filelist = list(iglob(folder, pattern))
    for p in grouper(filelist, n):
        # grouper pads the last group with None when the number of files is
        # not a multiple of n; such a group is not a full set of reads and
        # would otherwise break op.basename().
        if len(p) != n or None in p:
            continue

        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
Example #5
0
def iter_project(folder, n=2):
    """
    Yield groups of `n` read files found in `folder` together with their prefix.

    Files directly under `folder` whose last extension is fq, fastq, txt or gz
    are taken `n` at a time. Each complete group is yielded as (list of paths,
    value of pairspf() on the basenames). A trailing incomplete group is
    skipped.
    """
    # Check for paired reads and extract project id
    suffixes = ("fq", "fastq", "txt", "gz")
    filelist = [x for x in glob(folder + "/*.*")
                if x.rsplit(".", 1)[-1] in suffixes]
    for p in grouper(filelist, n):
        # grouper pads the last group with None when the number of files is
        # not a multiple of n; such a group is not a full set of reads and
        # would otherwise break op.basename().
        if len(p) != n or None in p:
            continue

        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
Example #6
0
def iter_project(folder, pattern="*.fq,*.fq.gz,*.fastq,*.fastq.gz", n=2):
    """
    Walk the files matching `pattern` in `folder`, `n` at a time.

    Yields (list of paths, value of pairspf() on their basenames) for every
    complete group; groups that are short or padded with None are skipped.
    """
    # Check for paired reads and extract project id
    matches = list(iglob(folder, pattern))
    for group in grouper(matches, n):
        if len(group) == n and None not in group:
            basenames = [op.basename(path) for path in group]
            prefix = pairspf(basenames)
            yield list(group), prefix
Example #7
0
def blasr(args):
    """
    %prog blasr ref.fasta fofn

    Run blasr on a set of PacBio reads. This is based on a divide-and-conquer
    strategy described below.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import MakeManager
    from jcvi.utils.iter import grouper

    p = OptionParser(blasr.__doc__)
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, fofn = args
    # Close the file-of-filenames as soon as it has been read
    with open(fofn) as fp:
        flist = sorted(x.strip() for x in fp)
    h5list = []
    mm = MakeManager()
    # Align the inputs in chunks of three files each
    for i, fl in enumerate(grouper(flist, 3)):
        # grouper pads the last chunk with None when the number of files is
        # not a multiple of 3; joining that padding raised TypeError.
        fl = [x for x in fl if x is not None]
        chunkname = "chunk{0:03d}".format(i)
        fn = chunkname + ".fofn"
        h5 = chunkname + ".cmp.h5"
        with open(fn, "w") as fw:
            fw.write("\n".join(fl) + "\n")

        cmd = "pbalign {0} {1} {2}".format(fn, reffasta, h5)
        cmd += " --nproc {0} --forQuiver --tmpDir .".format(opts.cpus)
        mm.add((fn, reffasta), h5, cmd)
        h5list.append(h5)

    # Merge h5, sort and repack
    allh5 = "all.cmp.h5"
    tmph5 = "tmp.cmp.h5"
    cmd_merge = "cmph5tools.py merge --outFile {0}".format(allh5)
    cmd_merge += " " + " ".join(h5list)
    cmd_sort = "cmph5tools.py sort --deep {0} --tmpDir .".format(allh5)
    cmd_repack = "h5repack -f GZIP=1 {0} {1}".format(allh5, tmph5)
    cmd_repack += " && mv {0} {1}".format(tmph5, allh5)
    mm.add(h5list, allh5, [cmd_merge, cmd_sort, cmd_repack])

    # Quiver
    pf = reffasta.rsplit(".", 1)[0]
    variantsgff = pf + ".variants.gff"
    consensusfasta = pf + ".consensus.fasta"
    cmd_faidx = "samtools faidx {0}".format(reffasta)
    cmd = "quiver -j 32 {0}".format(allh5)
    cmd += " -r {0} -o {1} -o {2}".format(reffasta, variantsgff,
                                          consensusfasta)
    mm.add(allh5, consensusfasta, [cmd_faidx, cmd])

    mm.write()
Example #8
0
def blasr(args):
    """
    %prog blasr ref.fasta fofn

    Run blasr on a set of PacBio reads. This is based on a divide-and-conquer
    strategy described below.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.grid import MakeManager
    from jcvi.utils.iter import grouper

    p = OptionParser(blasr.__doc__)
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, fofn = args
    # Close the file-of-filenames as soon as it has been read
    with open(fofn) as fp:
        flist = sorted(x.strip() for x in fp)
    h5list = []
    mm = MakeManager()
    # Align the inputs in chunks of three files each
    for i, fl in enumerate(grouper(flist, 3)):
        # grouper pads the last chunk with None when the number of files is
        # not a multiple of 3; joining that padding raised TypeError.
        fl = [x for x in fl if x is not None]
        chunkname = "chunk{0:03d}".format(i)
        fn = chunkname + ".fofn"
        h5 = chunkname + ".cmp.h5"
        with open(fn, "w") as fw:
            fw.write("\n".join(fl) + "\n")

        cmd = "pbalign {0} {1} {2}".format(fn, reffasta, h5)
        cmd += " --nproc {0} --forQuiver --tmpDir .".format(opts.cpus)
        mm.add((fn, reffasta), h5, cmd)
        h5list.append(h5)

    # Merge h5, sort and repack
    allh5 = "all.cmp.h5"
    tmph5 = "tmp.cmp.h5"
    cmd_merge = "cmph5tools.py merge --outFile {0}".format(allh5)
    cmd_merge += " " + " ".join(h5list)
    cmd_sort = "cmph5tools.py sort --deep {0} --tmpDir .".format(allh5)
    cmd_repack = "h5repack -f GZIP=1 {0} {1}".format(allh5, tmph5)
    cmd_repack += " && mv {0} {1}".format(tmph5, allh5)
    mm.add(h5list, allh5, [cmd_merge, cmd_sort, cmd_repack])

    # Quiver
    pf = reffasta.rsplit(".", 1)[0]
    variantsgff = pf + ".variants.gff"
    consensusfasta = pf + ".consensus.fasta"
    cmd_faidx = "samtools faidx {0}".format(reffasta)
    cmd = "quiver -j 32 {0}".format(allh5)
    cmd += " -r {0} -o {1} -o {2}".format(reffasta, variantsgff, consensusfasta)
    mm.add(allh5, consensusfasta, [cmd_faidx, cmd])

    mm.write()
Example #9
0
def iter_project(folder, pattern="*.fq,*.fq.gz,*.fastq,*.fastq.gz", n=2,
                 commonprefix=True):
    """
    Walk the files matching `pattern` in `folder`, `n` at a time.

    Yields (sorted paths, value of pairspf() on their basenames) for every
    complete group; `commonprefix` is passed through to pairspf(). Groups
    that are short or padded with None are skipped.
    """
    # Check for paired reads and extract project id
    matches = list(iglob(folder, pattern))
    for group in grouper(matches, n):
        if len(group) == n and None not in group:
            basenames = [op.basename(path) for path in group]
            prefix = pairspf(basenames, commonprefix=commonprefix)
            yield sorted(group), prefix
Example #10
0
def iter_project(folder, n=2):
    """
    Yield groups of `n` read files found in `folder` together with their prefix.

    Files directly under `folder` whose last extension is fq, fastq, txt or gz
    are taken `n` at a time. Each complete group is yielded as (list of paths,
    value of pairspf() on the basenames). A trailing incomplete group is
    skipped.
    """
    # Check for paired reads and extract project id
    suffixes = ("fq", "fastq", "txt", "gz")
    filelist = [x for x in glob(folder + "/*.*")
                if x.rsplit(".", 1)[-1] in suffixes]
    for p in grouper(filelist, n):
        # grouper pads the last group with None when the number of files is
        # not a multiple of n; such a group is not a full set of reads and
        # would otherwise break op.basename().
        if len(p) != n or None in p:
            continue

        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
Example #11
0
def tile(lt, width=70, gap=1):
    """
    Pretty print list of items.

    Every item is right-justified to the length of the longest item plus
    `gap`, then the items are laid out row by row with as many columns as fit
    in `width` characters (always at least one).

    Returns the rows joined by newlines, or "" when `lt` is empty.
    """
    # max() below would raise on an empty sequence
    if not lt:
        return ""

    max_len = max(len(x) for x in lt) + gap
    # Floor division keeps the column count an integer on Python 3 as well
    items_per_line = max(width // max_len, 1)
    cells = [x.rjust(max_len) for x in lt]
    rows = (cells[i:i + items_per_line]
            for i in range(0, len(cells), items_per_line))

    return "\n".join("".join(row) for row in rows)
Example #12
0
def tile(lt, width=70, gap=1):
    """
    Pretty print list of items.

    Every item is right-justified to the length of the longest item plus
    `gap`, then the items are laid out row by row with as many columns as fit
    in `width` characters (always at least one).

    Returns the rows joined by newlines, or "" when `lt` is empty.
    """
    # max() below would raise on an empty sequence
    if not lt:
        return ""

    max_len = max(len(x) for x in lt) + gap
    # Floor division keeps the column count an integer on Python 3 as well
    items_per_line = max(width // max_len, 1)
    cells = [x.rjust(max_len) for x in lt]
    rows = (cells[i:i + items_per_line]
            for i in range(0, len(cells), items_per_line))

    return "\n".join("".join(row) for row in rows)
Example #13
0
 def __iter__(self):
     """
     Yield one Clust per run of lines between separator lines.

     Lines of self.filename starting with '/' act as separators. The lines
     in between are read two at a time as (name, seq) and stored, stripped,
     together with getsize(name). Progress is logged every 10000 stacks.
     """
     stream = must_open(self.filename)
     parsed = 0
     for is_separator, block in groupby(stream, lambda row: row[0] == '/'):
         if not is_separator:
             cluster = Clust()
             for header, sequence in grouper(block, 2):
                 header, sequence = header.strip(), sequence.strip()
                 cluster.append((header, sequence, getsize(header)))
             yield cluster

             parsed += 1
             if parsed % 10000 == 0:
                 logging.debug("{0} stacks parsed".format(parsed))
Example #14
0
def gallery(args):
    """
    %prog gallery folder link_prefix

    Convert a folder of figures to a HTML table. For example:

    $ python -m jcvi.formats.html gallery Paper-figures/
    https://dl.dropboxusercontent.com/u/15937715/Data/Paper-figures/

    Maps the images from local to remote.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.base import iglob
    from jcvi.utils.iter import grouper

    p = OptionParser(gallery.__doc__)
    p.add_option("--columns", default=3, type="int",
                 help="How many cells per row")
    p.add_option("--width", default=200, type="int", help="Image width")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, link_prefix = args
    width = opts.width
    # Loop-invariant pieces of the markup
    base_url = link_prefix.rstrip("/")
    row_open = '<tr height="{0}" valign="top">'.format(width + 5)
    cell = '<td>{0}<br><a href="{1}"><img src="{1}" width="{2}"></a></td>'

    print("<table>")
    for row in grouper(iglob(folder, "*.jpg,*.JPG,*.png"), opts.columns):
        print(row_open)
        # grouper pads the last row; empty slots produce no cell
        for path in (x for x in row if x):
            filename = op.basename(path)
            # Caption is the name up to the first dot, '_' shown as '-'
            caption = filename.split('.')[0].replace('_', '-')
            print(cell.format(caption, base_url + "/" + filename, width))
        print("</tr>")
    print("</table>")
Example #15
0
def gallery(args):
    """
    %prog gallery folder link_prefix

    Convert a folder of figures to a HTML table. For example:

    $ python -m jcvi.formats.html gallery Paper-figures/
    https://dl.dropboxusercontent.com/u/15937715/Data/Paper-figures/

    Maps the images from local to remote.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.apps.base import iglob
    from jcvi.utils.iter import grouper

    p = OptionParser(gallery.__doc__)
    p.add_option("--columns", default=3, type="int",
                 help="How many cells per row")
    p.add_option("--width", default=200, type="int", help="Image width")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, link_prefix = args
    width = opts.width
    # Loop-invariant pieces of the markup
    base_url = link_prefix.rstrip("/")
    row_open = '<tr height="{0}" valign="top">'.format(width + 5)
    cell = '<td>{0}<br><a href="{1}"><img src="{1}" width="{2}"></a></td>'

    print("<table>")
    for row in grouper(iglob(folder, "*.jpg,*.JPG,*.png"), opts.columns):
        print(row_open)
        # grouper pads the last row; empty slots produce no cell
        for path in (x for x in row if x):
            filename = op.basename(path)
            # Caption is the name up to the first dot, '_' shown as '-'
            caption = filename.split('.')[0].replace('_', '-')
            print(cell.format(caption, base_url + "/" + filename, width))
        print("</tr>")
    print("</table>")
Example #16
0
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    # NOTE: the docstring above doubles as the command-line usage text.

    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--conf",
                 help="BAMBUS configuration file [default: %default]")
    p.add_option("--prefix", default=False, action="store_true",
                 help="Only keep links between IDs with same prefix [default: %default]")
    opts, args = p.parse_args(args)

    # One contig file followed by complete (reads, mapping) pairs
    nargs = len(args)
    if nargs < 3 or nargs % 2 == 0:
        sys.exit(not p.print_help())

    rclip = opts.rclip
    ctgfasta = args[0]
    trios = []
    for fastafile, bedfile in list(grouper(args[1:], 2)):
        stem = bedfile.rsplit(".", 1)[0]
        matefile = stem + ".mates"
        matebedfile = matefile + ".bed"
        # Regenerate the mates files only when they are out of date
        if need_update(bedfile, [matefile, matebedfile]):
            # NOTE(review): no --cutoff option is registered in this
            # function; presumably set_rclip() or the parser provides
            # opts.cutoff -- confirm.
            matesopt = [bedfile, "--lib", "--nointra",
                        "--rclip={0}".format(rclip),
                        "--cutoff={0}".format(opts.cutoff)]
            if opts.prefix:
                matesopt += ["--prefix"]
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    merged = ("bambus.reads.fasta", "bambus.bed", "bambus.mates")
    bbfasta, bbbed, bbmate = merged
    for files, outfile in zip(zip(*trios), merged):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    # Split the contigs into those listed in idsfile and the remainder
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    extract = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(extract + inputfasta)
    sh(extract + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    bambus_cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate,
                                                        prefix)
    if opts.conf:
        bambus_cmd += " -C {0}".format(opts.conf)
    sh(bambus_cmd)

    untangle_cmd = \
        "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".\
            format(prefix)
    sh(untangle_cmd)

    final = "final"
    print_cmd = "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib " \
                "-merge -detail -oo -sum -o {1}".format(prefix, final)
    sh(print_cmd)

    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])
Example #17
0
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    # NOTE: the docstring above doubles as the command-line usage text.

    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--conf",
                 help="BAMBUS configuration file [default: %default]")
    p.add_option("--prefix", default=False, action="store_true",
                 help="Only keep links between IDs with same prefix [default: %default]")
    opts, args = p.parse_args(args)

    # One contig file followed by complete (reads, mapping) pairs
    nargs = len(args)
    if nargs < 3 or nargs % 2 == 0:
        sys.exit(not p.print_help())

    ctgfasta = args[0]
    trios = []
    for fastafile, bedfile in list(grouper(2, args[1:])):
        stem = bedfile.rsplit(".", 1)[0]
        matefile = stem + ".mates"
        matebedfile = matefile + ".bed"
        # Regenerate the mates files only when they are out of date
        if need_update(bedfile, [matefile, matebedfile]):
            matesopt = [bedfile, "--lib", "--nointra"]
            if opts.prefix:
                matesopt += ["--prefix"]
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    merged = ("bambus.reads.fasta", "bambus.bed", "bambus.mates")
    bbfasta, bbbed, bbmate = merged
    for files, outfile in zip(zip(*trios), merged):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    # Split the contigs into those listed in idsfile and the remainder
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    extract = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(extract + inputfasta)
    sh(extract + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    bambus_cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate,
                                                        prefix)
    if opts.conf:
        bambus_cmd += " -C {0}".format(opts.conf)
    sh(bambus_cmd)

    untangle_cmd = \
        "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".\
            format(prefix)
    sh(untangle_cmd)

    final = "final"
    print_cmd = "printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib " \
                "-merge -detail -oo -sum -o {1}".format(prefix, final)
    sh(print_cmd)

    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])