Example 1
    def __init__(
        self,
        data: TSPDataModel,
        work_dir=Work_dir,
        clean=True,
        verbose=False,
        precision=0,
        seed=666,
    ):
        """Run concorde on TSP instance

        Args:
            data (TSPDataModel): TSP instance with edge weights
            work_dir (str, optional): Path to the work dir. Defaults to Work_dir.
            clean (bool, optional): Clean up intermediate results. Defaults to True.
            verbose (bool, optional): Show verbose messages. Defaults to False.
            precision (int, optional): Float precision of distance. Defaults to 0.
            seed (int, optional): Random seed. Defaults to 666.
        """
        self.data = data
        self.work_dir = work_dir
        self.clean = clean
        self.verbose = verbose

        mkdir(work_dir)
        tspfile = op.join(work_dir, "data.tsp")
        self.print_to_tsplib(tspfile, precision=precision)
        _, outfile = self.run_concorde(tspfile, seed=seed)
        self.tour = self.parse_output(outfile)

        if clean:
            shutil.rmtree(work_dir)
            residual_output = ["data.sol", "data.res", "Odata.res"]
            FileShredder(residual_output, verbose=False)
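Every example on this page cleans up with jcvi.formats.base.FileShredder, called with a list of paths and an optional verbose flag. As a mental model only (a sketch inferred from the call sites here, not jcvi's actual implementation), its behavior amounts to:

import logging
import os


class FileShredder:
    """Sketch: remove each listed file, silently skipping missing paths."""

    def __init__(self, filelist, verbose=True):
        for f in filelist:
            if f and os.path.exists(f):
                os.remove(f)
                if verbose:
                    logging.debug("Remove file `{0}`.".format(f))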
Example 2
def mergeclean(args):
    """
    %prog mergeclean [*.bam|*.count]

    Clean redundant merged bam/count files. This usually happens after running
    formats.sam.merge() several times.
    """
    from itertools import groupby
    from jcvi.formats.base import FileShredder

    p = OptionParser(mergeclean.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    files = sorted(args)
    sep = opts.sep
    key = lambda x: x.split(sep)[0]
    mtime = lambda x: os.stat(x).st_mtime
    for pf, fs in groupby(files, key=key):
        fs = list(fs)
        if len(fs) == 1:
            continue
        newest_f = max(fs, key=mtime)
        print("|".join(fs), "=>", newest_f, file=sys.stderr)
        fs.remove(newest_f)
        FileShredder(fs)
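mergeclean works because the inputs are sorted before itertools.groupby runs, so every file sharing a prefix (the part before the separator) lands in one group and only the newest member survives. A toy run of just the grouping step, on hypothetical file names:

from itertools import groupby

files = sorted(["sample_1.bam", "sample_2.bam", "other.bam"])
for pf, fs in groupby(files, key=lambda x: x.split("_")[0]):
    print(pf, "=>", list(fs))
# other.bam => ['other.bam']
# sample => ['sample_1.bam', 'sample_2.bam']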
Example 3
def download(url, filename=None, debug=True, cookies=None):
    from urllib.parse import urlsplit
    from subprocess import CalledProcessError
    from jcvi.formats.base import FileShredder

    scheme, netloc, path, query, fragment = urlsplit(url)
    filename = filename or op.basename(path)
    filename = filename.strip()

    if not filename:
        filename = "index.html"

    if op.exists(filename):
        if debug:
            msg = "File `{0}` exists. Download skipped.".format(filename)
            logging.error(msg)
    else:
        from jcvi.utils.ez_setup import get_best_downloader

        downloader = get_best_downloader()
        try:
            downloader(url, filename, cookies=cookies)
        except (CalledProcessError, KeyboardInterrupt) as e:
            print(e, file=sys.stderr)
            FileShredder([filename])

    return filename
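The function is deliberately idempotent: an existing file short-circuits the transfer, and a failed or interrupted download is shredded so no truncated file lingers. A hypothetical call (the URL is made up):

filename = download("https://example.com/data/genome.fasta.gz")
# fetches into `genome.fasta.gz` unless that file already exists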
Example 4
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for three modes of BWA - mem (default), aln, bwasw (long reads).
    """
    valid_modes = ("bwasw", "aln", "mem")
    p = OptionParser(align.__doc__)
    p.add_option("--mode",
                 default="mem",
                 choices=valid_modes,
                 help="BWA mode [default: %default]")
    p.add_option("--readtype",
                 choices=("pacbio", "pbread"),
                 help="Read type in bwa-mem")
    p.set_cutoff(cutoff=800)
    p.set_sam_options()

    opts, args = p.parse_args(args)
    mode = opts.mode
    nargs = len(args)

    if nargs not in (2, 3):
        sys.exit(not p.print_help())

    tag = "bwa-{0}: ".format(mode)
    c = mem
    if nargs == 2:
        tag += "Single-end alignment"
        if mode == "bwasw":
            c = bwasw
        elif mode == "aln":
            c = samse
    else:
        assert mode != "bwasw", "Cannot use bwasw with paired-end mode"
        tag += "Paired-end alignment"
        if mode == "aln":
            c = sampe

    logging.debug(tag)
    args[0] = get_abs_path(args[0])
    cmd, samfile = c(args, opts)
    if cmd:
        cmd = output_bam(cmd, samfile)

    bam = opts.bam
    unmapped = opts.unmapped

    sh(cmd)
    if unmapped:
        dbfile, readfile = args[:2]
        mopts = [samfile, "--unmapped"]
        if not bam:
            mopts += ["--sam"]
        mapped(mopts)
        FileShredder([samfile])

    return samfile, None
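The dispatch in align() reduces to a small table: mem handles both single- and paired-end reads, aln splits into samse/sampe post-processing, and bwasw is single-end only. Restated as a standalone helper (a sketch that reuses the module's own mem, samse, sampe, and bwasw functions, so it only makes sense inside that module):

def pick_aligner(mode, paired):
    # bwasw has no paired-end mode; aln needs samse/sampe post-processing
    if mode == "bwasw":
        assert not paired, "Cannot use bwasw with paired-end mode"
        return bwasw
    if mode == "aln":
        return sampe if paired else samse
    return mem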
Example 5
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.
    """
    p = OptionParser(refine.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    ncols = len(next(open(breakpointsbed)).split())
    logging.debug("File {0} contains {1} columns.".format(
        breakpointsbed, ncols))
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    fp = open(ingapsbed)
    data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    nogapsfw = open(nogapsbed, "w")
    largestgapsfw = open(largestgapsbed, "w")
    for b, gaps in groupby(data, key=lambda x: x[:ncols]):
        gaps = list(gaps)
        gap = gaps[0]
        if len(gaps) == 1 and gap[-1] == "0":
            assert gap[-2] == "."
            print("\t".join(b), file=nogapsfw)
            continue

        gaps = [(int(x[-1]), x) for x in gaps]
        maxgap = max(gaps)[1]
        print("\t".join(maxgap), file=largestgapsfw)

    nogapsfw.close()
    largestgapsfw.close()

    closestgapsbed = pf + ".closestgaps.bed"
    cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
    sh(cmd, outfile=closestgapsbed)

    refinedbed = pf + ".refined.bed"
    FileMerger([largestgapsbed, closestgapsbed], outfile=refinedbed).merge()

    # Clean-up
    toclean = [nogapsbed, largestgapsbed, closestgapsbed]
    FileShredder(toclean)

    return refinedbed
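The largest-gap pick pairs each intersectBed row with its -wao overlap count and lets Python's tuple ordering do the comparison, so max() returns the row with the biggest overlap. With two hypothetical rows:

gaps = [(120, ["chr1", "100", "220", "gapA"]),
        (450, ["chr1", "300", "750", "gapB"])]
maxgap = max(gaps)[1]  # tuples compare on the overlap count first
print(maxgap)          # ['chr1', '300', '750', 'gapB']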
Example 6
def _needle(fa, fb, needlefile, a, b, results):
    """
    Run single needle job
    """
    from Bio.Emboss.Applications import NeedleCommandline

    needle_cline = NeedleCommandline(asequence=fa,
                                     bsequence=fb,
                                     gapopen=10,
                                     gapextend=0.5,
                                     outfile=needlefile)
    stdout, stderr = needle_cline()
    nh = NeedleHeader(needlefile)
    FileShredder([fa, fb, needlefile], verbose=False)
    r = ["\t".join((a, b, nh.identity, nh.score))]

    results.extend(r)
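_needle is shaped for fan-out: results must be a container that tolerates concurrent writers. One plausible driver, assuming multiprocessing and made-up input files (jcvi's actual batching code may differ):

from multiprocessing import Manager, Pool

manager = Manager()
results = manager.list()  # shared list collecting one TSV row per pair
jobs = [("a1.fa", "b1.fa", "ab1.needle", "geneA", "geneB", results)]
with Pool(4) as pool:
    pool.starmap(_needle, jobs)
print("\n".join(results))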
Example 7
    def __init__(self, edges, work_dir=Work_dir, clean=True, verbose=False,
                 precision=0, seed=666):

        self.work_dir = work_dir
        self.clean = clean
        self.verbose = verbose

        mkdir(work_dir)
        tspfile = op.join(work_dir, "data.tsp")
        self.print_to_tsplib(edges, tspfile, precision=precision)
        retcode, outfile = self.run_concorde(tspfile, seed=seed)
        self.tour = self.parse_output(outfile)

        if clean:
            shutil.rmtree(work_dir)
            residual_output = ["data.sol", "data.res", "Odata.res"]
            FileShredder(residual_output, verbose=False)
Example 8
    def write_genes(self, output="gbout", individual=False, pep=True):
        if not individual:
            fwbed = must_open(output+".bed", "w")
            fwcds = must_open(output+".cds", "w")
            fwpep = must_open(output+".pep", "w")

        for recid, rec in self.items():
            if individual:
                mkdir(output)
                fwbed = must_open(op.join(output, recid+".bed"), "w")
                fwcds = must_open(op.join(output, recid+".cds"), "w")
                fwpep = must_open(op.join(output, recid+".pep"), "w")

            GenBank.write_genes_bed(rec, fwbed)
            GenBank.write_genes_fasta(rec, fwcds, fwpep)

        if not pep:
            FileShredder([fwpep.name])
Example 9
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)
    fp = open(pairsfile)
    for row in fp:
        fa = open(pairsfile + "_a.fasta", "w")
        fb = open(pairsfile + "_b.fasta", "w")
        a, b = row.split()
        a = afasta[a]
        b = bfasta[b]
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()
        needlefile = pairsfile + "_ab.needle"
        needle_cline = NeedleCommandline(asequence=fa.name,
                                         bsequence=fb.name,
                                         gapopen=10,
                                         gapextend=0.5,
                                         outfile=needlefile)
        stdout, stderr = needle_cline()
        print(stdout + stderr, file=sys.stderr)
        #align = AlignIO.read(needlefile, "emboss")
        nh = NeedleHeader(needlefile)
        print "\t".join((a.id, b.id, nh.identity, nh.score))
        FileShredder([fa.name, fb.name, needlefile])
Example 10
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome using Globus API. Available
    species listed below. Use comma to give a list of species to download. For
    example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum

    The downloader will prompt you to enter Phytozome user name and password
    during downloading. Please register for a login at:
    https://phytozome.jgi.doe.gov/pz/portal.html.
    """
    from jcvi.apps.biomart import GlobusXMLParser

    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="12",
        choices=("9", "10", "11", "12", "12_unrestricted", "13"),
        help="Phytozome version",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    p.set_downloader()
    opts, args = p.parse_args(args)

    downloader = opts.downloader
    directory_listing = ".phytozome_directory_V{}.xml".format(opts.version)
    # Get directory listing
    base_url = "http://genome.jgi.doe.gov"
    dlist = "{}/ext-api/downloads/get-directory?organism=PhytozomeV{}".format(
        base_url, opts.version
    )

    # Make sure we have valid cookies
    cookies = get_cookies()
    if cookies is None:
        logging.error("Error fetching cookies ... cleaning up")
        FileShredder([directory_listing])
        sys.exit(1)

    # Proceed to use the cookies and download the species list
    try:
        download(
            dlist,
            filename=directory_listing,
            cookies=cookies,
            downloader=downloader,
        )
        g = GlobusXMLParser(directory_listing)
    except Exception:
        logging.error("Error downloading directory listing ... cleaning up")
        FileShredder([directory_listing, cookies])
        sys.exit(1)

    genomes = g.get_genomes()
    valid_species = genomes.keys()
    species_tile = tile(valid_species)
    p.set_usage("\n".join((phytozome.__doc__, species_tile)))

    if len(args) != 1:
        sys.exit(not p.print_help())

    (species,) = args
    if species == "all":
        species = ",".join(valid_species)

    species = species.split(",")
    for s in species:
        res = download_species_phytozome(
            genomes,
            s,
            valid_species,
            base_url,
            cookies,
            assembly=opts.assembly,
            downloader=downloader,
        )
        if not res:
            logging.error("No files downloaded")
            continue
        gff, fa = res.get("gff"), res.get("cds")
        if opts.format:
            format_bed_and_cds(s, gff, fa)
Example 11
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.

    For breakpoint regions with no gaps, there are two options:
    - Break in the middle of the region
    - Break at the closest gap (--closest)
    """
    p = OptionParser(refine.__doc__)
    p.add_option(
        "--closest",
        default=False,
        action="store_true",
        help="In case of no gaps, use closest",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    ncols = len(next(open(breakpointsbed)).split())
    logging.debug("File {0} contains {1} columns.".format(breakpointsbed, ncols))
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    fp = open(ingapsbed)
    data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    nogapsfw = open(nogapsbed, "w")
    largestgapsfw = open(largestgapsbed, "w")
    for b, gaps in groupby(data, key=lambda x: x[:ncols]):
        gaps = list(gaps)
        gap = gaps[0]
        if len(gaps) == 1 and gap[-1] == "0":
            assert gap[-3] == "."
            print("\t".join(b), file=nogapsfw)
            continue

        gaps = [(int(x[-1]), x) for x in gaps]
        maxgap = max(gaps)[1]
        print("\t".join(maxgap), file=largestgapsfw)

    nogapsfw.close()
    largestgapsfw.close()
    beds = [largestgapsbed]
    toclean = [nogapsbed, largestgapsbed]

    if opts.closest:
        closestgapsbed = pf + ".closestgaps.bed"
        cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
        sh(cmd, outfile=closestgapsbed)
        beds += [closestgapsbed]
        toclean += [closestgapsbed]
    else:
        pointbed = pf + ".point.bed"
        pbed = Bed()
        bed = Bed(nogapsbed)
        for b in bed:
            pos = (b.start + b.end) // 2
            b.start, b.end = pos, pos
            pbed.append(b)
        pbed.print_to_file(pointbed)
        beds += [pointbed]
        toclean += [pointbed]

    refinedbed = pf + ".refined.bed"
    FileMerger(beds, outfile=refinedbed).merge()

    # Clean-up
    FileShredder(toclean)

    return refinedbed
Example 12
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    fp = open(refinedbed)
    gap_to_scf = defaultdict(list)
    seen = set()
    for row in fp:
        atoms = row.split()
        if len(atoms) <= 6:
            continue
        unplaced = atoms[3]
        strand = atoms[5]
        gapid = atoms[9]
        if gapid not in seen:
            seen.add(gapid)
            gi, gb = gorder[gapid]
            gpbed.append(gb)
            gappositions[(gb.seqid, gb.start, gb.end)] = gapid
        gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            finalbed.add("{0}\n".format(b))
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add("\t".join(str(x) for x in (scf, 0, size, sid, 1000, strand)))

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
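One detail worth noting in insert(): scaffolds that share a gap are sorted on corder[...].start + corder[...].end, which orders them by the midpoint of their candidate placement while staying in integer arithmetic. A tiny demonstration with made-up intervals:

ivals = [(100, 300), (50, 450), (120, 160)]
print(sorted(ivals, key=lambda iv: iv[0] + iv[1]))
# [(120, 160), (100, 300), (50, 450)]  -- midpoints 140, 200, 250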
Example 13
def reindex(args):
    """
    %prog reindex gffile pep.fasta ref.pep.fasta

    Reindex the splice isoforms (mRNA) in input GFF file, preferably
    generated after PASA annotation update

    In the input GFF file, there can be several types of mRNA within a locus:
    * CDS matches reference, UTR extended, inherits reference mRNA ID
    * CDS (slightly) different from reference, inherits reference mRNA ID
    * Novel isoform added by PASA, have IDs like "LOCUS.1.1", "LOCUS.1.2"
    * Multiple mRNA collapsed due to shared structure, have IDs like "LOCUS.1-LOCUS.1.1"

    In the case of multiple mRNA which have inherited the same reference mRNA ID,
    break ties by comparing the new protein with the reference protein using
    EMBOSS `needle` to decide which mRNA retains ID and which is assigned a new ID.

    All mRNA identifiers should follow the AGI naming conventions.

    When reindexing the isoform identifiers, order mRNA based on:
    * decreasing transcript length
    * decreasing support from multiple input datasets used to run pasa.consolidate()
    """
    from jcvi.formats.gff import make_index
    from jcvi.formats.fasta import Fasta
    from jcvi.apps.emboss import needle
    from jcvi.formats.base import FileShredder
    from tempfile import mkstemp

    p = OptionParser(reindex.__doc__)
    p.add_option("--scores", type="str", \
        help="read from existing EMBOSS `needle` scores file")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, pep, refpep = args
    gffdb = make_index(gffile)
    reffasta = Fasta(refpep)

    if not opts.scores:
        fh, pairsfile = mkstemp(prefix='pairs', suffix=".txt", dir=".")
        fw = must_open(pairsfile, "w")

    conflict, novel = AutoVivification(), {}
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        geneid = atg_name(gene.id, retval='locus')
        novel[geneid] = []
        updated_mrna, hybrid_mrna = [], []
        for mrna in gffdb.children(gene, featuretype='mRNA', order_by=('seqid', 'start')):
            if re.match(atg_name_pat, mrna.id) is not None and "_" not in mrna.id:
                pf, mrnaid = parse_prefix(mrna.id)
                mlen = gffdb.children_bp(mrna, child_featuretype='exon')
                if "-" in mrna.id:
                    hybrid_mrna.append((mrna.id, mrna.start, mlen, len(pf)))
                else:
                    updated_mrna.append((mrna.id, mrna.start, mlen, len(pf)))

        for mrna in sorted(updated_mrna, key=lambda k:(k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            iso = atg_name(mrnaid, retval='iso')
            newiso = "{0}{1}".format(iso, re.sub(atg_name_pat, "", mrnaid))
            if iso == newiso:
                if iso not in conflict[geneid]:
                    conflict[geneid][iso] = []
                conflict[geneid][iso].append((mrna[0], iso, newiso, \
                    mstart, mlen, len(pf)))
            else:
                novel[geneid].append((mrna[0], None, newiso, \
                    mstart, mlen, len(pf)))

        for mrna in sorted(hybrid_mrna, key=lambda k:(k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            _iso, _newiso = [], []
            for id in sorted(mrnaid.split("-")):
                a = atg_name(id, retval='iso')
                b = "{0}{1}".format(a, re.sub(atg_name_pat, "", id))
                _iso.append(a)
                _newiso.append(b)

            _novel = None
            newiso = "-".join(str(x) for x in set(_newiso))
            for iso, niso in zip(_iso, _newiso):
                if iso == niso:
                    if iso not in conflict[geneid]:
                        conflict[geneid][iso] = \
                            [(mrna[0], iso, newiso, mstart, mlen, len(pf))]
                        _novel = None
                        break

                _novel = True

            if _novel is not None:
                novel[geneid].append((mrna[0], None, newiso, \
                    mstart, mlen, len(pf)))

        if not opts.scores:
            for isoform in sorted(conflict[geneid]):
                mrnaid = "{0}.{1}".format(geneid, isoform)
                if mrnaid in reffasta.keys():
                    for mrna in conflict[geneid][isoform]:
                        print >> fw, "\t".join(str(x) for x in (mrnaid, mrna[0]))

    scoresfile = None
    if not opts.scores:
        fw.close()
        needle([pairsfile, refpep, pep])
        FileShredder([pairsfile], verbose=False)
        scoresfile = "{0}.scores".format(pairsfile.rsplit(".")[0])
    else:
        scoresfile = opts.scores

    scores = read_scores(scoresfile, sort=True, trimsuffix=False)

    primary = {}
    for geneid in conflict:
        primary[geneid] = []
        for iso in sorted(conflict[geneid]):
            conflict[geneid][iso].sort(key=lambda k:(k[3], -k[4], -k[5]))
            _iso = "{0}.{1}".format(geneid, iso)
            if _iso not in scores:
                novel[geneid].extend(conflict[geneid][iso])
                continue
            top_score = scores[_iso][0][1]
            result = next((i for i, v in enumerate(conflict[geneid][iso]) if v[0] == top_score), None)
            if result is not None:
                primary[geneid].append(conflict[geneid][iso][result])
                del conflict[geneid][iso][result]
                if geneid not in novel:
                    novel[geneid] = []
                novel[geneid].extend(conflict[geneid][iso])
        novel[geneid].sort(key=lambda k:(k[3], -k[4], -k[5]))

    fw = must_open(opts.outfile, 'w')
    for gene in gffdb.features_of_type('gene', order_by=('seqid', 'start')):
        geneid = gene.id
        print(gene, file=fw)
        seen = []
        if geneid in primary:
            all_mrna = primary[geneid]
            all_mrna.extend(novel[geneid])
            for iso, mrna in enumerate(all_mrna):
                _mrna = gffdb[mrna[0]]
                _iso = mrna[1]
                if mrna not in novel[geneid]:
                    seen.append(int(mrna[1]))
                else:
                    mseen = 0 if len(seen) == 0 else max(seen)
                    _iso = (mseen + iso + 1) - len(seen)

                _mrnaid = "{0}.{1}".format(geneid, _iso)
                _mrna['ID'], _mrna['_old_ID'] = [_mrnaid], [_mrna.id]

                print(_mrna, file=fw)
                for c in gffdb.children(_mrna, order_by=('start')):
                    c['Parent'] = [_mrnaid]
                    print(c, file=fw)
        else:
            for feat in gffdb.children(gene, order_by=('seqid', 'start')):
                print(feat, file=fw)

    fw.close()
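The AGI naming convention mentioned in the docstring packs chromosome, locus, and isoform into one identifier: AT1G01010.2 is isoform 2 of locus AT1G01010. A hypothetical stand-in for the atg_name_pat used above (jcvi defines its own pattern and atg_name helper, which differ from this):

import re

atg_name_pat = re.compile(r"(AT\dG\d{5})\.(\d+)")  # hypothetical pattern
m = atg_name_pat.match("AT1G01010.2")
print(m.group(1), m.group(2))  # AT1G01010 2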
Example 14
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
                              "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                    format(len(recs), annotatedfasta))

    return annotatedfasta
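The read budget in expand() is the expected assembly span times the target depth divided by the read length, rounded up to the next thousand. With made-up numbers:

import math

size, rl, depth = 1000, 100, 200     # template span, read length, target depth
expected_size = size + 2 * rl        # 1200
nreads = expected_size * depth / rl  # 2400.0
print(int(math.ceil(nreads / 1000.0)) * 1000)  # 3000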
Example 15
def maker(args):
    """
    %prog maker maker.gff3 genome.fasta

    Prepare EVM inputs by separating tracks from MAKER.
    """
    from jcvi.formats.base import SetFile, FileShredder

    A, T, P = "ABINITIO_PREDICTION", "TRANSCRIPT", "PROTEIN"
    # Stores default weights and types
    Registry = {
        "maker": (A, 5),
        "augustus_masked": (A, 1),
        "snap_masked": (A, 1),
        "genemark": (A, 1),
        "est2genome": (T, 5),
        "est_gff": (T, 5),
        "protein2genome": (P, 5),
        "blastx": (P, 1)
    }

    p = OptionParser(maker.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, fastafile = args

    types = "type.ids"
    if need_update(gffile, types):
        cmd = "cut -f2 -s {0} | sort -u".format(gffile)
        sh(cmd, outfile=types)

    types = SetFile(types)
    reg = defaultdict(list)
    weightsfile = "weights.txt"
    contents = []
    for s in types:
        rs = s.split(":")[0]
        if rs not in Registry:
            continue

        type, weight = Registry[rs]
        reg[type].append(s)
        contents.append("\t".join(str(x) for x in (type, s, weight)))

    contents = "\n".join(sorted(contents))
    write_file(weightsfile, contents, meta="weights file")

    evs = [x + ".gff" for x in (A, T, P)]
    FileShredder(evs)

    for type, tracks in reg.items():
        for t in tracks:
            cmd = "grep '\t{0}' {1} | grep -v '_match\t' >> {2}.gff".format(t, gffile, type)
            sh(cmd)

    partition(evs)
    runfile = "run.sh"
    contents = EVMRUN.format(*evs)
    write_file(runfile, contents, meta="run script")
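To trace the weights-file step: if type.ids held the sources maker, est2genome:run1, and blastx (hypothetical inputs), the Registry lookups above would produce a sorted, tab-separated weights.txt like:

ABINITIO_PREDICTION	maker	5
PROTEIN	blastx	1
TRANSCRIPT	est2genome:run1	5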