Example 1
def fromaligns(args):
    """
    %prog fromaligns out.aligns

    Convert aligns file (old MCscan output) to anchors file.
    """
    p = OptionParser(fromaligns.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    alignsfile, = args
    fp = must_open(alignsfile)
    fw = must_open(opts.outfile, "w")
    for row in fp:
        if row.startswith("## Alignment"):
            print >> fw, "###"
            continue
        if row[0] == '#' or not row.strip():
            continue
        atoms = row.split(':')[-1].split()
        print >> fw, "\t".join(atoms[:2])
    fw.close()
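Each data row of an .aligns file carries a numbered prefix before a colon, and only the first two whitespace-separated fields after it (the gene pair) survive into the anchors file. A minimal sketch of that parse on a made-up row (the exact .aligns layout is an assumption inferred from the split logic):

row = " 1:\tOs01g0100100\tBradi2g00200\t2e-100"
atoms = row.split(':')[-1].split()  # drop the "N:" prefix, then tokenize
print("\t".join(atoms[:2]))         # Os01g0100100 <tab> Bradi2g00200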
Example 2
def silicosoma(args):
    """
    %prog silicosoma in.silico > out.soma

    Convert .silico to .soma file.

    Format of .silico
        A text file containing in-silico digested contigs. This file contains pairs
    of lines. The first line in each pair contains an identifier, the contig
    length in bp, and the number of restriction sites, separated by white space.
    The second line contains a white space delimited list of the restriction
    site positions.

    Format of .soma
        Each line of the text file contains two decimal numbers: The size of the
    fragment and the standard deviation (both in kb), separated by white space.
    The standard deviation is ignored.
    """
    p = OptionParser(silicosoma.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    silicofile, = args
    fp = must_open(silicofile)
    fw = must_open(opts.outfile, "w")
    fp.next()
    positions = [int(x) for x in fp.next().split()]
    for a, b in pairwise(positions):
        assert a <= b
        fragsize = int(round((b - a) / 1000.))  # kb
        if fragsize:
            print >> fw, fragsize, 0
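The heart of silicosoma is the pairwise fragment-size arithmetic: adjacent restriction-site positions become fragment lengths, rounded to kb. A self-contained sketch with a stand-in pairwise helper and a toy position list (the real positions come from the .silico file):

from itertools import tee

def pairwise(iterable):
    # consecutive pairs: (p0, p1), (p1, p2), ...
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

positions = [0, 1500, 1800, 9800]  # toy restriction-site positions in bp
for start, end in pairwise(positions):
    fragsize = int(round((end - start) / 1000.))  # fragment size in kb
    if fragsize:  # fragments that round to 0 kb are dropped
        print(fragsize, 0)  # second column is the (ignored) standard deviation

This prints "2 0" and "8 0"; the 300 bp fragment rounds to 0 kb and is skipped.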
Example 3
    def __init__(self, filename, source="JCVI"):
        super(MultiGenBank, self).__init__(filename)
        assert op.exists(filename)

        pf = filename.rsplit(".", 1)[0]
        fastafile, gfffile = pf + ".fasta", pf + ".gff"
        fasta_fw = must_open(fastafile, "w")
        gff_fw = must_open(gfffile, "w")

        self.source = source
        self.counter = defaultdict(list)

        nrecs, nfeats = 0, 0
        for rec in SeqIO.parse(filename, "gb"):
            seqid = rec.name
            rec.id = seqid
            SeqIO.write([rec], fasta_fw, "fasta")
            rf = rec.features
            for f in rf:
                self.print_gffline(gff_fw, f, seqid)
                nfeats += 1

                for sf in f.sub_features:
                    self.print_gffline(gff_fw, sf, seqid, parent=f)
                    nfeats += 1

            nrecs += 1

        logging.debug("A total of {0} records written to `{1}`.".\
                        format(nrecs, fastafile))
        fasta_fw.close()

        logging.debug("A total of {0} features written to `{1}`.".\
                        format(nfeats, gfffile))
        gff_fw.close()
Example 4
def bed_to_bedpe(bedfile, bedpefile, pairsbedfile=None, matesfile=None, ca=False):
    """
    This converts the bedfile to bedpefile, assuming the reads are from CA.
    """
    fp = must_open(bedfile)
    fw = must_open(bedpefile, "w")
    if pairsbedfile:
        fwpairs = must_open(pairsbedfile, "w")

    clones = defaultdict(list)
    for row in fp:
        b = BedLine(row)
        name = b.accn
        clonename = clone_name(name, ca=ca)
        clones[clonename].append(b)

    if matesfile:
        fp = open(matesfile)
        libraryline = fp.next()
        # 'library bes     37896   126916'
        lib, name, smin, smax = libraryline.split()
        assert lib == "library"
        smin, smax = int(smin), int(smax)
        logging.debug("Happy mates for lib {0} fall between {1} - {2}".\
                      format(name, smin, smax))

    nbedpe = 0
    nspan = 0
    for clonename, blines in clones.items():
        if len(blines) == 2:
            a, b = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = b.seqid, b.start, b.end
            print >> fw, "\t".join(str(x) for x in (aseqid, astart - 1, aend,
                bseqid, bstart - 1, bend, clonename))
            nbedpe += 1
        else:
            a, = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = 0, 0, 0

        if pairsbedfile:
            start = min(astart, bstart) if bstart > 0 else astart
            end = max(aend, bend) if bend > 0 else aend
            if aseqid != bseqid:
                continue

            span = end - start + 1
            if (not matesfile) or (smin <= span <= smax):
                print >> fwpairs, "\t".join(str(x) for x in \
                        (aseqid, start - 1, end, clonename))
                nspan += 1

    fw.close()
    logging.debug("A total of {0} bedpe written to `{1}`.".\
                  format(nbedpe, bedpefile))
    if pairsbedfile:
        fwpairs.close()
        logging.debug("A total of {0} spans written to `{1}`.".\
                      format(nspan, pairsbedfile))
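The pairing step hinges on collapsing mate reads onto a shared clone name before emitting BEDPE (note the 0-based starts: astart - 1). A small sketch of the grouping, with a hypothetical clone_name() that strips a /1 or /2 mate suffix (the real helper also understands Celera Assembler naming):

from collections import defaultdict

def clone_name(read_name):
    # hypothetical: strip a trailing /1 or /2 mate suffix
    return read_name.rsplit("/", 1)[0]

clones = defaultdict(list)
for read in ["cloneA/1", "cloneA/2", "cloneB/1"]:
    clones[clone_name(read)].append(read)

for name, mates in clones.items():
    # two mates -> a BEDPE line; a singleton only feeds the pairs bed
    print(name, len(mates))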
Example 5
File: ks.py Project: ascendo/jcvi
def prepare(args):
    """
    %prog prepare pairsfile cdsfile [pepfile] -o paired.cds.fasta

    Pick sequences from cdsfile to form pairs, ready for Ks calculation. The
    pairsfile can be generated from formats.blast.cscore(). The first two
    columns contain the pair.
    """
    from jcvi.formats.fasta import Fasta

    p = OptionParser(prepare.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)
    outfile = opts.outfile

    if len(args) == 2:
        pairsfile, cdsfile = args
        pepfile = None
    elif len(args) == 3:
        pairsfile, cdsfile, pepfile = args
    else:
        sys.exit(not p.print_help())

    f = Fasta(cdsfile)
    fp = open(pairsfile)
    fw = must_open(outfile, "w")
    if pepfile:
        assert outfile != "stdout", "Please specify outfile name."
        f2 = Fasta(pepfile)
        fw2 = must_open(outfile + ".pep", "w")
    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()[:2]
        if a == b:
            logging.debug("Self pairs found: {0} - {1}. Ignored".format(a, b))
            continue

        if a not in f:
            a = find_first_isoform(a, f)
            assert a, a
        if b not in f:
            b = find_first_isoform(b, f)
            assert b, b

        acds = f[a]
        bcds = f[b]
        SeqIO.write((acds, bcds), fw, "fasta")
        if pepfile:
            apep = f2[a]
            bpep = f2[b]
            SeqIO.write((apep, bpep), fw2, "fasta")
    fw.close()
    if pepfile:
        fw2.close()
Example 6
    def write_fasta(self, output="gbfasta", individual=False):
        if not individual:
            fw = must_open(output+".fasta", "w")

        for recid, rec in self.iteritems():
            if individual:
                mkdir(output)
                fw = must_open(op.join(output, recid+".fasta"), "w")

            seqid = rec.id.split(".")[0]
            if not seqid:
                seqid = rec.name.split(".")[0]
            seq = rec.seq
            fw.write(">{0}\n{1}\n".format(seqid, seq))
Example 7
def main(blastfile, p, opts):

    sqlite = opts.sqlite
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)
    filtered_blast = read_blast(blastfile, qorder, sorder, \
                                is_self=is_self, ostrip=opts.strip_names)
    all_data = [(b.qi, b.si) for b in filtered_blast]

    c = None
    if sqlite:
        conn = sqlite3.connect(sqlite)
        c = conn.cursor()
        c.execute("drop table if exists synteny")
        c.execute("create table synteny (query text, anchor text, "
                "gray varchar(1), score integer, dr integer, "
                "orientation varchar(1), qnote text, snote text)")
        fw = None
    else:
        fw = must_open(opts.outfile, "w")

    batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=False)
    if qbed.filename == sbed.filename:
        logging.debug("Self comparisons, mirror ignored")
    else:
        batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=True)

    if sqlite:
        c.execute("create index q on synteny (query)")
        conn.commit()
        c.close()
    else:
        fw.close()
Example 8
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc.) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_option("--hang", default=500, type="int",
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = must_open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
Example 9
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print >> fw, ",".join(sorted(g))
    fw.close()

    return outfile
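Grouper is jcvi's disjoint-set helper: each join(a, b) merges the groups containing a and b, so anchor pairs chain into ortho-groups. A minimal union-find sketch of the same idea (path-halving find; not the actual Grouper implementation):

from collections import defaultdict

parent = {}

def find(x):
    parent.setdefault(x, x)
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path halving
        x = parent[x]
    return x

def join(a, b):
    parent[find(a)] = find(b)

for a, b in [("g1", "g2"), ("g2", "g3"), ("g4", "g5")]:
    join(a, b)

groups = defaultdict(list)
for x in parent:
    groups[find(x)].append(x)
print(sorted(sorted(g) for g in groups.values()))
# [['g1', 'g2', 'g3'], ['g4', 'g5']]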
Example 10
def get_info():
    infofiles = glob("*.info")
    info = {}
    for row in must_open(infofiles):
        a = row.split()[0]
        info[a] = row.rstrip()
    return info
Example 11
File: vcf.py Project: Hensonmw/jcvi
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    from urlparse import parse_qs

    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        if row[0] == '#':
            print row.strip()
            continue
        v = VcfLine(row)
        data.append(v)

    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print vv[0]
            continue
        bestv = max(vv, key=lambda x: float(parse_qs(x.info)["R2"][0]))
        print bestv
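itertools.groupby only groups consecutive records, so this relies on the VCF being position-sorted. A toy sketch of the same pick-best-R2 logic on (pos, INFO) tuples; parse_qs lives in urllib.parse on Python 3 (urlparse in the Python 2 original):

from itertools import groupby
from urllib.parse import parse_qs

records = [(100, "R2=0.31"), (100, "R2=0.87"), (200, "R2=0.55")]
for pos, vv in groupby(records, key=lambda x: x[0]):
    vv = list(vv)
    bestv = max(vv, key=lambda x: float(parse_qs(x[1])["R2"][0]))
    print(pos, bestv[1])
# 100 R2=0.87
# 200 R2=0.55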
Example 12
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
Example 13
def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    lhome = opts.lobstr_home
    store = opts.output_path

    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]

    vcffiles = [x for x in vcffiles if ".filtered." not in x]

    run_args = [(x, lhome, x.startswith("s3://") and store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    for res in p.map_async(run_filter, run_args).get():
        continue
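The fan-out at the end is plain multiprocessing: one (vcffile, lobstr_home, store) tuple per worker. A self-contained sketch of the pattern with a hypothetical run_filter worker (the real one shells out to lobSTR's filtering script):

from multiprocessing import Pool

def run_filter(arg):
    # hypothetical worker; the real run_filter invokes lobSTR's filter script
    vcffile, lhome, store = arg
    return vcffile

if __name__ == "__main__":
    run_args = [("a.vcf", "/opt/lobSTR", False), ("b.vcf", "/opt/lobSTR", False)]
    cpus = min(2, len(run_args))
    with Pool(processes=cpus) as pool:
        for res in pool.map_async(run_filter, run_args).get():
            print(res)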
Example 14
    def __init__(self, filename):
        super(OVL, self).__init__(filename)
        fp = must_open(filename)
        contained = set()
        for row in fp:
            o = OVLLine(row)
            self.append(o)
            if o.tag == "a in b":
                contained.add(o.a)
            elif o.tag == "b in a":
                contained.add(o.b)
        logging.debug("Imported {} links. Contained tigs: {}".\
                        format(len(self), len(contained)))
        self.contained = contained

        self.graph = BiGraph()
        for o in self:
            if o.tag == "a->b":
                a, b = o.a, o.b
            elif o.tag == "b->a":
                a, b = o.b, o.a
            else:  # containment rows would otherwise reuse stale a, b
                continue
            if a in contained or b in contained:
                continue
            bstrand = '<' if o.bstrand == '-' else '>'
            self.graph.add_edge(a, b, '>', bstrand, length=o.score)
Example 15
def read_scores(scoresfile, opts=None, sort=False, trimsuffix=True):
    scores = {}
    _pid, _score, resolve = (0.0, 0.0, 'alignment') if opts is None \
            else (opts.pid, opts.score, opts.resolve)

    fp = must_open(scoresfile)
    logging.debug("Load scores file `{0}`".format(scoresfile))
    for row in fp:
        (new, old, identity, score) = row.strip().split("\t")
        if trimsuffix:
            old = re.sub('\.\d+$', '', old)
        if resolve == "alignment":
            match = re.search("\d+\/\d+\s+\(\s*(\d+\.\d+)%\)", identity)
            pid = match.group(1)
            if float(pid) < _pid or float(score) < _score:
                continue
        else:
            pid = identity

        if new not in scores:
            scores[new] = []

        scores[new].append((new, old, float(pid), float(score)))

    if sort:
        for new in scores:
            scores[new].sort(key=lambda k: (-k[2], -k[3]))

    return scores
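The "alignment" branch extracts the percent identity from an aligner-style identity field with a regex. A one-off demonstration on a representative string (the exact field layout is an assumption read off the pattern):

import re

identity = "1234/1300 ( 94.9%)"  # assumed form: "matches/length ( pid%)"
match = re.search(r"\d+/\d+\s+\(\s*(\d+\.\d+)%\)", identity)
print(match.group(1))  # -> 94.9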
Example 16
def sizes(args):
    """
    %prog sizes bedfile

    Infer the sizes for each seqid. Useful before dot plots.
    """
    p = OptionParser(sizes.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    assert op.exists(bedfile)

    sizesfile = bedfile.rsplit(".", 1)[0] + ".sizes"

    fw = must_open(sizesfile, "w", checkexists=True, skipcheck=True)
    if fw:
        b = Bed(bedfile)
        for s, sbeds in b.sub_beds():
            print >> fw, "{0}\t{1}".format(\
                         s, max(x.end for x in sbeds))
        logging.debug("Sizes file written to `{0}`.".format(sizesfile))

    return sizesfile
Example 17
File: table.py Project: rrane/jcvi
def write_csv(header, contents, sep=",", filename="stdout", thousands=False, tee=False):
    """
    Write csv that are aligned with the column headers.

    >>> header = ["x_value", "y_value"]
    >>> contents = [(1, 100), (2, 200)]
    >>> write_csv(header, contents)
    x_value, y_value
          1,     100
          2,     200
    """
    from jcvi.formats.base import must_open, is_number
    from jcvi.utils.cbook import thousands as th

    fw = must_open(filename, "w")
    allcontents = [header] + contents if header else contents
    cols = len(contents[0])
    for content in allcontents:
        assert len(content) == cols

    # Stringify the contents
    for i, content in enumerate(allcontents):
        if thousands:
            content = [int(x) if is_number(x, cast=int) else x for x in content]
            content = [th(x) if (is_number(x, cast=int) and x >= 1000) else x for x in content]
        allcontents[i] = [str(x) for x in content]

    colwidths = [max(len(x[i]) for x in allcontents) for i in xrange(cols)]
    sep += " "
    for content in allcontents:
        rjusted = [x.rjust(cw) for x, cw in zip(content, colwidths)]
        formatted = sep.join(rjusted)
        print >> fw, formatted
        if tee and filename != "stdout":
            print formatted
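The alignment trick is simple: stringify every cell, take the maximum width per column, then right-justify. A minimal sketch that reproduces the doctest output without the jcvi helpers:

rows = [["x_value", "y_value"], ["1", "100"], ["2", "200"]]
ncols = len(rows[0])
colwidths = [max(len(row[i]) for row in rows) for i in range(ncols)]
for row in rows:
    print(", ".join(cell.rjust(w) for cell, w in zip(row, colwidths)))
# x_value, y_value
#       1,     100
#       2,     200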
Example 18
File: ks.py Project: bennyyu/jcvi
def get_mixture(data, components):
    """
    probs = [.476, .509]
    mus = [.69069, -.15038]
    variances = [.468982e-1, .959052e-1]
    """
    from jcvi.apps.base import popen

    probs, mus, sigmas = [], [], []
    fw = must_open("tmp", "w")
    log_data = [log(x) for x in data if x > .05]
    data = "\n".join(["%.4f" % x for x in log_data]).replace("inf\n", "")
    fw.write(data)
    fw.close()

    cmd = "gmm-bic {0} {1} {2}".format(components, len(log_data), fw.name)
    pipe = popen(cmd)

    for row in pipe:
        if row[0] != '#':
            continue

        atoms = row.split(",")
        a, b, c = atoms[1:4]
        a = float(a)
        b = float(b)
        c = float(c)

        mus.append(a)
        sigmas.append(b)
        probs.append(c)

    os.remove(fw.name)
    return probs, mus, sigmas
Example 19
def bed(args):
    """
    %prog bed blastfile

    Print out bed file based on coordinates in BLAST report. By default, write
    out subject positions. Use --swap to write query positions.
    """
    p = OptionParser(bed.__doc__)
    p.add_option("--swap", default=False, action="store_true",
                 help="Write query positions [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    blastfile, = args
    swap = opts.swap

    fp = must_open(blastfile)
    bedfile = blastfile.rsplit(".", 1)[0] + ".bed"
    fw = open(bedfile, "w")
    for row in fp:
        b = BlastLine(row)
        if swap:
            b = b.swapped
        print >> fw, b.bedline

    logging.debug("File written to `{0}`.".format(bedfile))

    return bedfile
Example 20
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI

    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    enzymes = enzymes.split(",")
    enzymes = [x for x in AllEnzymes if str(x) in enzymes]
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")

    header = ["Contig", "Length"] + [str(x) for x in enzymes]
    print("\t".join(header), file=fw)
    for name, rec in f.iteritems_ordered():
        row = [name, len(rec)]
        for e in enzymes:
            pos = e.search(rec.seq)
            pos = "na" if not pos else "|".join(str(x) for x in pos)
            row.append(pos)
        print("\t".join(str(x) for x in row), file=fw)
Example 21
    def blast(self, blastfile=None, outfile=None):
        """
        convert anchor file to 12 col blast file
        """
        from jcvi.formats.blast import BlastSlow, BlastLineByConversion

        if not outfile:
            outfile = self.filename + ".blast"

        if blastfile is not None:
            blasts = BlastSlow(blastfile).to_dict()
        else:
            blasts = None

        fw = must_open(outfile, "w", checkexists=True)
        nlines = 0
        for a, b, id in self.iter_pairs():
            if blasts and (a, b) in blasts:
                bline = blasts[(a, b)]
            elif blasts and (b, a) in blasts:
                bline = blasts[(b, a)]
            else:
                line = "\t".join((a, b))
                bline = BlastLineByConversion(line, mode="110000000000")

            print >> fw, bline
            nlines += 1
        fw.close()

        logging.debug("A total of {0} BLAST lines written to `{1}`."\
                        .format(nlines, outfile))

        return outfile
Example 22
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best", default=1, type="int", help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    fp = open(m4file)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    retained = total = 0
    for row in fp:
        r = M4Line(row)
        total += 1
        if total % 100000 == 0:
            logging.debug("Retained {0} lines".\
                            format(percentage(retained, total)))
        if seen.get(r.query, 0) < best:
            fw.write(row)
            seen[r.query] += 1
            retained += 1
    fw.close()
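The retention logic assumes blasr lists each query's hits best-first, so keeping the first N rows seen per query keeps the top N. The counter pattern in isolation:

from collections import defaultdict

best = 1
seen = defaultdict(int)
rows = [("q1", 0.99), ("q1", 0.90), ("q2", 0.95)]  # assumed pre-sorted best-first
kept = []
for query, score in rows:
    if seen[query] < best:
        kept.append((query, score))
        seen[query] += 1
print(kept)  # [('q1', 0.99), ('q2', 0.95)]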
Example 23
    def __init__(self, filename, defaultcolor='#fb8072', header=False):
        super(BlockFile, self).__init__(filename)
        fp = must_open(filename)
        hd = fp.next().rstrip().split("\t")
        ncols = len(hd)
        if header:
            self.header = hd
        else:
            fp.seek(0)
            self.header = range(ncols)

        data = []
        highlight = []
        for row in fp:
            hl = ("*" in row)
            # r* highlights the block in red color
            if hl:
                hl, row = row.split("*", 1)
                hl = hl or defaultcolor
            atoms = row.rstrip().split("\t")
            atoms = [x.strip() for x in atoms]
            atoms = ["." if x == "" else x for x in atoms]
            if len(atoms) > ncols:
                atoms = atoms[:ncols]
            elif len(atoms) < ncols:
                atoms = atoms + ["."] * (ncols - len(atoms))
            data.append(atoms)
            highlight.append(hl)

        self.data = data
        self.highlight = highlight
        self.columns = zip(*data)
        self.ncols = ncols
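The highlight syntax packs an optional color in front of a "*" marker, e.g. "r*geneA<TAB>geneB" paints that row red. The split in isolation:

defaultcolor = '#fb8072'
row = "r*geneA\tgeneB\n"
hl = "*" in row
if hl:
    hl, row = row.split("*", 1)  # color prefix (if any) before the marker
    hl = hl or defaultcolor      # a bare "*" falls back to the default color
print(hl, row.rstrip().split("\t"))  # r ['geneA', 'geneB']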
Example 24
def smart_reroot(treefile, outgroupfile, outfile, format=0):
    """
    simple function to reroot Newick format tree using ete2

    Tree reading format options see here:
    http://packages.python.org/ete2/tutorial/tutorial_trees.html#reading-newick-trees
    """
    tree = Tree(treefile, format=format)
    leaves = [t.name for t in tree.get_leaves()][::-1]
    outgroup = []
    for o in must_open(outgroupfile):
        o = o.strip()
        for leaf in leaves:
            if leaf[:len(o)] == o:
                outgroup.append(leaf)
        if outgroup:
            break

    if not outgroup:
        print("Outgroup not found. Tree {0} cannot be rerooted.".format(treefile), file=sys.stderr)
        return treefile

    try:
        tree.set_outgroup(tree.get_common_ancestor(*outgroup))
    except ValueError:
        assert type(outgroup) == list
        outgroup = outgroup[0]
        tree.set_outgroup(outgroup)
    tree.write(outfile=outfile, format=format)

    logging.debug("Rerooted tree printed to {0}".format(outfile))
    return outfile
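Outgroup labels are matched as prefixes of leaf names, so a bare species tag in the outgroup file catches every leaf from that species. The matching in isolation:

leaves = ["Osa_LOC_Os01g01010", "Ath_AT1G01010", "Ath_AT1G01020"]
o = "Ath"  # one stripped line from the outgroup file
outgroup = [leaf for leaf in leaves if leaf[:len(o)] == o]
print(outgroup)  # ['Ath_AT1G01010', 'Ath_AT1G01020']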
Example 25
def SH_raxml(reftree, querytree, phy_file, shout="SH_out.txt"):
    """
    SH test using RAxML

    querytree can be a single tree or a bunch of trees (e.g. from bootstrapping)
    """
    assert op.isfile(reftree)
    shout = must_open(shout, "a")

    raxml_work = op.abspath(op.join(op.dirname(phy_file), "raxml_work"))
    mkdir(raxml_work)
    raxml_cl = RaxmlCommandline(cmd=RAXML_BIN("raxmlHPC"), \
    sequences=phy_file, algorithm="h", model="GTRGAMMA", \
    name="SH", starting_tree=reftree, bipartition_filename=querytree, \
    working_dir=raxml_work)

    logging.debug("Running SH test in RAxML: %s" % raxml_cl)
    o, stderr = raxml_cl()
    # hard coded
    try:
        pval = re.search('(Significantly.*:.*)', o).group(0)
    except:
        print("SH test failed.", file=sys.stderr)
    else:
        pval = pval.strip().replace("\t"," ").replace("%","\%")
        print("{0}\t{1}".format(op.basename(querytree), pval), file=shout)
        logging.debug("SH p-value appended to %s" % shout.name)

    shout.close()
    return shout.name
Example 26
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    fp = open(binfile)
    a = bitarray()
    a.fromfile(fp)
    f = Sizes(fastafile)
    tsize = 0
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        ksize = seqlen - K + 1
        b = a[tsize : tsize + ksize]
        bcount = b.count()
        print >> fw, "\t".join(str(x) for x in (name, bcount))
        tsize += ksize
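The bin file is one global bit vector with a bit per k-mer, laid out contig by contig; a contig of length L owns the next L - K + 1 bits. A sketch of just the offset bookkeeping:

K = 23
sizes = [("ctg1", 100), ("ctg2", 50)]
tsize = 0
for name, seqlen in sizes:
    ksize = seqlen - K + 1  # number of K-mers in this contig
    print(name, "owns bits", tsize, "to", tsize + ksize - 1)
    tsize += ksize
# ctg1 owns bits 0 to 77
# ctg2 owns bits 78 to 105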
Example 27
File: btab.py Project: rrane/jcvi
    def __init__(self, filename, aat_dialect=False):
        super(Btab, self).__init__(filename)

        for line in must_open(filename):
            if line[0] == "#":
                continue
            self.append(BtabLine(line, aat_dialect=aat_dialect))
Example 28
def augustus(args):
    """
    %prog augustus augustus.gff3 > reformatted.gff3

    AUGUSTUS does generate a gff3 (--gff3=on) but it needs some refinement.
    """
    from jcvi.formats.gff import Gff

    p = OptionParser(augustus.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ingff3, = args
    gff = Gff(ingff3)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    for g in gff:
        if g.type not in ("gene", "transcript", "CDS"):
            continue

        if g.type == "transcript":
            g.type = "mRNA"

        prefix = g.seqid + "_"
        pid = prefix + g.id
        newid = "{0}-{1}".format(pid, seen[pid]) if pid in seen else pid
        seen[pid] += 1
        g.attributes["ID"] = [newid]
        g.attributes["Parent"] = [(prefix + x) for x in g.attributes["Parent"]]
        g.update_attributes()
        print >> fw, g
    fw.close()
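The renaming uses a counting dict so a repeated ID gains a -1, -2, ... suffix while the first occurrence keeps the bare name. The pattern in isolation:

from collections import defaultdict

seen = defaultdict(int)
for pid in ["chr1_g1", "chr1_g1", "chr1_g2"]:
    newid = "{0}-{1}".format(pid, seen[pid]) if pid in seen else pid
    seen[pid] += 1
    print(newid)
# chr1_g1
# chr1_g1-1
# chr1_g2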
Example 29
    def print_to_bed(self, filename="stdout"):
        fw = must_open(filename, "w")
        for lg, markers in sorted(self.items()):
            for marker, pos in markers:
                print >> fw, "\t".join(str(x) for x in \
                        (lg, pos, pos + 1, marker))
        fw.close()
Example 30
    def __init__(self, filename, sorted=False):
        super(BlastSlow, self).__init__(filename)
        fp = must_open(filename)
        for row in fp:
            self.append(BlastLine(row))
        if not sorted:
            self.sort(key=lambda x: x.query)
Example 31
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search, or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed up the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein")

    p.add_option("--noversion",
                 dest="noversion",
                 default=False,
                 action="store_true",
                 help="Remove trailing accession versions")
    p.add_option("--format",
                 default="fasta",
                 choices=valid_formats,
                 help="download format [default: %default]")
    p.add_option("--database",
                 default="nuccore",
                 choices=valid_databases,
                 help="search database [default: %default]")
    p.add_option("--retmax",
                 default=1000000,
                 type="int",
                 help="how many results to return [default: %default]")
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence [default: %default]")
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up [default: %default]")
    p.add_option("--outdir",
                 default=None,
                 help="output directory, with accession number as filename")
    p.add_option("--outprefix",
                 default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".\
        format(fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must be >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If --skipcheck, will not prompt to check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, \
                skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(list_of_terms, retmax=opts.retmax, \
                                 rettype=fmt, db=database, batchsize=batchsize, \
                                 email=opts.email):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True, \
                    skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print >> fw, rec
        print >> fw

        seen.add(id)

    if seen:
        print >> sys.stderr, "A total of {0} {1} records downloaded.".\
                format(totalsize, fmt.upper())

    return outfile
Example 32
File: blast.py Project: wangwl/jcvi
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov",
                 default=False,
                 action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap",
                 action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option(
        "--iterby",
        dest="iterby",
        default="query",
        choices=allowed_iterby,
        help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches +
                                    this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                     this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
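range_union counts covered bases as the length of the union of hit intervals, so overlapping HSPs are not double-counted. A minimal sketch of that union on plain (start, stop) pairs (jcvi's range_union takes (seqid, start, stop) triples):

def union_length(intervals):
    # intervals are 1-based inclusive (start, stop) pairs
    total = 0
    cur_start = cur_stop = None
    for start, stop in sorted(intervals):
        if cur_stop is None or start > cur_stop + 1:
            if cur_stop is not None:
                total += cur_stop - cur_start + 1
            cur_start, cur_stop = start, stop
        else:
            cur_stop = max(cur_stop, stop)
    if cur_stop is not None:
        total += cur_stop - cur_start + 1
    return total

print(union_length([(1, 100), (50, 150), (200, 250)]))  # 150 + 51 = 201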
Example 33
def calibrate(args):
    """
    %prog calibrate calibrate.JPG boxsize

    Calibrate pixel-inch ratio and color adjustment.
    - `calibrate.JPG` is the photo containing a colorchecker
    - `boxsize` is the measured size for the boxes on printed colorchecker, in
      squared centimeter (cm2) units
    """
    xargs = args[2:]
    p = OptionParser(calibrate.__doc__)
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    imagefile, boxsize = args
    boxsize = float(boxsize)

    # Read in color checker
    colorcheckerfile = op.join(datadir, "colorchecker.txt")
    colorchecker = []
    expected = 0
    for row in open(colorcheckerfile):
        boxes = row.split()
        colorchecker.append(boxes)
        expected += len(boxes)

    folder = op.split(imagefile)[0]
    objects = seeds([imagefile, "--outdir={0}".format(folder)] + xargs)
    nseeds = len(objects)
    logging.debug("Found {0} boxes (expected={1})".format(nseeds, expected))
    assert (
        expected - 4 <= nseeds <= expected + 4
    ), "Number of boxes drastically different from {0}".format(expected)

    # Calculate pixel-cm ratio
    boxes = [t.area for t in objects]
    reject = reject_outliers(boxes)
    retained_boxes = [b for r, b in zip(reject, boxes) if not r]
    mbox = np.median(retained_boxes)  # in pixels
    pixel_cm_ratio = (mbox / boxsize) ** 0.5
    logging.debug(
        "Median box size: {0} pixels. Measured box size: {1} cm2".format(mbox, boxsize)
    )
    logging.debug("Pixel-cm ratio: {0}".format(pixel_cm_ratio))

    xs = [t.x for t in objects]
    ys = [t.y for t in objects]
    idx_xs = get_kmeans(xs, 6)
    idx_ys = get_kmeans(ys, 4)
    for xi, yi, s in zip(idx_xs, idx_ys, objects):
        s.rank = (yi, xi)

    objects.sort(key=lambda x: x.rank)

    colormap = []
    for s in objects:
        x, y = s.rank
        observed, expected = s.rgb, rgb_to_triplet(colorchecker[x][y])
        colormap.append((np.array(observed), np.array(expected)))

    # Color transfer
    tr0 = np.eye(3).flatten()
    print("Initial distance:", total_error(tr0, colormap), file=sys.stderr)
    tr = fmin(total_error, tr0, args=(colormap,))
    tr.resize((3, 3))
    print("RGB linear transform:\n", tr, file=sys.stderr)
    calib = {"PixelCMratio": pixel_cm_ratio, "RGBtransform": tr.tolist()}

    jsonfile = op.join(folder, "calibrate.json")
    fw = must_open(jsonfile, "w")
    print(json.dumps(calib, indent=4), file=fw)
    fw.close()
    logging.debug("Calibration specs written to `{0}`.".format(jsonfile))

    return jsonfile
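The calibration turns a median box area in pixels plus the known printed box area in cm2 into a linear pixels-per-cm scale via a square root. The arithmetic in isolation (toy numbers):

import numpy as np

boxsize = 4.0  # measured box area on the printed colorchecker, in cm2
box_areas = [39800, 40200, 40000]  # box areas in pixels, outliers already rejected
mbox = np.median(box_areas)  # 40000
pixel_cm_ratio = (mbox / boxsize) ** 0.5  # area ratio -> linear ratio
print(pixel_cm_ratio)  # 100.0 pixels per cm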
Example 34
def ystr(args):
    """
    %prog ystr chrY.vcf

    Print out Y-STR info given VCF. Marker name extracted from tabfile.
    """
    from jcvi.utils.table import write_csv

    p = OptionParser(ystr.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    si = STRFile(opts.lobstr_home, db="hg38-named")
    register = si.register

    header = "Marker|Reads|Ref|Genotype|Motif".split("|")
    contents = []
    fp = must_open(vcffile)
    reader = vcf.Reader(fp)
    simple_register = {}
    for record in reader:
        name = register[(record.CHROM, record.POS)]
        info = record.INFO
        ref = int(float(info["REF"]))
        rpa = info.get("RPA", ref)
        if isinstance(rpa, list):
            rpa = "|".join(str(int(float(x))) for x in rpa)
        ru = info["RU"]
        simple_register[name] = rpa
        for sample in record.samples:
            contents.append((name, sample["ALLREADS"], ref, rpa, ru))

    # Multi-part markers
    a, b, c = "DYS389I", "DYS389B.1", "DYS389B"
    if a in simple_register and b in simple_register:
        simple_register[c] = int(simple_register[a]) + int(simple_register[b])

    # Multi-copy markers
    mm = ["DYS385", "DYS413", "YCAII"]
    for m in mm:
        ma, mb = m + 'a', m + 'b'
        if ma not in simple_register or mb not in simple_register:
            simple_register[ma] = simple_register[mb] = None
            del simple_register[ma]
            del simple_register[mb]
            continue
        if simple_register[ma] > simple_register[mb]:
            simple_register[ma], simple_register[mb] = \
                    simple_register[mb], simple_register[ma]

    write_csv(header, contents, sep=" ")
    print "[YSEARCH]"
    build_ysearch_link(simple_register)
    print "[YFILER]"
    build_yhrd_link(simple_register, panel=YHRD_YFILER)
    print "[YFILERPLUS]"
    build_yhrd_link(simple_register, panel=YHRD_YFILERPLUS)
    print "[YSTR-ALL]"
    build_yhrd_link(simple_register, panel=USYSTR_ALL)
Example 35
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
            "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    cmds = []
    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print >> aafw, alignAssembly_conf.format("{0}_pasa".format(pasa_db), \
            pctcov, pctid, bpsplice)
    aafw.close()

    symlink(genome, gfasta)

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    aacmd += " -t {0}.clean -T -u {0}".format(transcripts) if clean else \
             " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
            opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)

        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
Example 36
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through each locus (shared locus name or overlapping CDS)
    and identify same/different isoforms (shared splicing structure)
    across the input datasets.

    If `slop` is enabled, consolidation will collapse any variation
    in terminal UTR lengths, keeping the longest as representative.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    supported_modes = ["name", "coords"]
    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.add_option("--inferUTR", default=False, action="store_true",
            help="infer presence of UTRs from exon coordinates")
    p.add_option("--mode", default="name", choices=supported_modes,
            help="method used to determine overlapping loci")
    p.add_option("--summary", default=False, action="store_true",
            help="Generate summary table of consolidation process")
    p.add_option("--clusters", default=False, action="store_true",
            help="Generate table of cluster members after consolidation")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop
    inferUTR = opts.inferUTR
    mode = opts.mode

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)

    loci = Grouper()
    for dbn in gffdbx:
        odbns = [odbn for odbn in gffdbx if dbn != odbn]
        for gene in gffdbx[dbn].features_of_type('gene', order_by=('seqid', 'start')):
            if mode == "name":
                loci.join(gene.id, (gene.id, dbn))
            else:
                if (gene.id, dbn) not in loci:
                    loci.join((gene.id, dbn))
                    gene_cds = list(gffdbx[dbn].children(gene, \
                        featuretype='CDS', order_by=('start')))
                    gene_cds_start, gene_cds_stop = gene_cds[0].start, \
                        gene_cds[-1].stop
                    for odbn in odbns:
                        for ogene_cds in gffdbx[odbn].region(seqid=gene.seqid, \
                                start=gene_cds_start, end=gene_cds_stop, \
                                strand=gene.strand, featuretype='CDS'):
                            for ogene in gffdbx[odbn].parents(ogene_cds, featuretype='gene'):
                                loci.join((gene.id, dbn), (ogene.id, odbn))

    gfeats = {}
    mrna = AutoVivification()
    for i, locus in enumerate(loci):
        gene = "gene_{0:0{pad}}".format(i, pad=6) \
                if mode == "coords" else None

        for elem in locus:
            if type(elem) == tuple:
                _gene, dbn = elem
                if gene is None: gene = _gene

                g = gffdbx[dbn][_gene]
                if gene not in gfeats:
                    gfeats[gene] = g
                    gfeats[gene].attributes['ID'] = [gene]
                else:
                    if g.start < gfeats[gene].start:
                        gfeats[gene].start = g.start
                    if g.stop > gfeats[gene].stop:
                        gfeats[gene].stop = g.stop

                c = list(gffdbx[dbn].children(_gene, featuretype='mRNA', order_by='start'))
                if len(c) > 0:
                    mrna[gene][dbn] = c

    fw = must_open(opts.outfile, "w")
    print >> fw, "##gff-version	3"
    seen = {}
    if opts.summary:
        summaryfile = "{0}.summary.txt".format(opts.outfile.rsplit(".")[0])
        sfw = must_open(summaryfile, "w")
        summary = ["id"]
        summary.extend(gffdbx.keys())
        print >> sfw, "\t".join(str(x) for x in summary)
    if opts.clusters:
        clustersfile = "{0}.clusters.txt".format(opts.outfile.rsplit(".")[0])
        cfw = must_open(clustersfile, "w")
        clusters = ["id", "dbns", "members", "trlens"]
        print >> cfw, "\t".join(str(x) for x in clusters)
    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]):
                    mrna1s, mrna2s = mrna1.stop - mrna1.start + 1, \
                            mrna2.stop - mrna2.start + 1
                    g.join((dbn1, mrna1.id, mrna1s))
                    g.join((dbn2, mrna2.id, mrna2s))

                    if match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype='CDS'):
                        res = []
                        ftypes = ['exon'] if inferUTR else ['five_prime_UTR', 'three_prime_UTR']
                        for ftype in ftypes:
                            res.append(match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype=ftype, slop=slop))

                        if all(r == True for r in res):
                            g.join((dbn1, mrna1.id, mrna1s), (dbn2, mrna2.id, mrna2s))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id, mrna1.stop - mrna1.start + 1))

        print >> fw, gfeats[gene]

        for group in g:
            group.sort(key=lambda x: x[2], reverse=True)
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            dbid, _mrnaid = "|".join(str(x) for x in set(dbs)), []
            for x in mrnas:
                if x not in _mrnaid: _mrnaid.append(x)
            mrnaid = "{0}|{1}".format(dbid, "-".join(_mrnaid))
            if mrnaid not in seen:
                seen[mrnaid] = 0
            else:
                seen[mrnaid] += 1
                mrnaid = "{0}-{1}".format(mrnaid, seen[mrnaid])

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            _mrna.attributes['Parent'] = [gene]
            children = gffdbx[d].children(m, order_by='start')
            print >> fw, _mrna
            for child in children:
                child.attributes['ID'] = ["{0}|{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print >> fw, child

            if opts.summary:
                summary = [mrnaid]
                summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
                print >> sfw, "\t".join(str(x) for x in summary)

            if opts.clusters:
                clusters = [mrnaid]
                clusters.append(",".join(str(el[0]) for el in group))
                clusters.append(",".join(str(el[1]) for el in group))
                clusters.append(",".join(str(el[2]) for el in group))
                print >> cfw, "\t".join(str(x) for x in clusters)

    fw.close()
    if opts.summary: sfw.close()
    if opts.clusters: cfw.close()
Example 37
def compare(args):
    """
    %prog compare pasa_db_name [--annots_gff3=annotation.gff3]

    Run the PASA annotation comparison pipeline

    This assumes that PASA alignment assembly has already been completed and
    the run directory contains `genome.fasta` and `transcript.fasta` files.

    If `--annots_gff3` is specified, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
            help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    pasa_db, = args

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    annots_gff3 = opts.annots_gff3
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "", append=True, skipcheck=True)  # initialize run script

    acfw = must_open(acconf, "w")
    print >> acfw, annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons)
    acfw.close()

    if not op.exists(gfasta):
        sys.exit("Genome fasta file `{0}` does not exist".format(gfasta))

    transcripts = tfasta
    if not op.exists(transcripts):
        sys.exit("Transcript fasta file `{0}` does not exist".format(transcripts))

    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, gfasta, transcripts, opts.genetic_code)

    if annots_gff3:
        if not op.exists(annots_gff3):
            sys.exit("Annotation gff3 file `{0}` does not exist".format(annots_gff3))
        symlink(annots_gff3, annotation)
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
Example 38
def mcscan(args):
    """
    %prog mcscan bedfile anchorfile [options]

    Stack synteny blocks on a reference bed, MCSCAN style. The first column in
    the output is the reference order, given in the bedfile. The columns next
    to it are separate 'tracks'.

    If --mergetandem=tandem_file is specified, tandem_file should have each
    tandem cluster as one line, tab separated.
    """
    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter",
                 default=100,
                 type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option(
        "--ascii",
        default=False,
        action="store_true",
        help="Output symbols rather than gene names [default: %default]")
    p.add_option(
        "--Nm",
        default=10,
        type="int",
        help="Clip block ends to allow slight overlaps [default: %default]")
    p.add_option("--trackids",
                 action="store_true",
                 help="Track block IDs in separate file [default: %default]")
    p.add_option("--mergetandem", default=None,
                 help="merge tandems genes in output acoording to PATH-TO-TANDEM_FILE, "\
                 "cannot be used with --ascii")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    ascii = opts.ascii
    clip = opts.Nm
    trackids = opts.trackids
    ofile = opts.outfile
    mergetandem = opts.mergetandem
    bed = Bed(bedfile)
    order = bed.order

    if trackids:
        olog = ofile + ".tracks"
        fwlog = must_open(olog, "w")

    if mergetandem:
        assert not ascii
        tandems = {}
        for row in open(mergetandem):
            row = row.split()
            s = ";".join(row)
            for atom in row:
                tandems[atom] = s

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = defaultdict(dict)
    blocks = ac.blocks
    for i, ib in enumerate(blocks):
        q, s, t = zip(*ib)
        if q[0] not in order:
            q, s = s, q

        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

        assert q[0] in order
        if s[0] not in order:
            continue

        # is_self comparison
        q, s = s, q
        r = get_range(q, s, t, i, order, block_pairs, clip=clip)
        ranges.append(r)

    fw = must_open(ofile, "w")

    tracks = []
    print >> sys.stderr, "Chain started: {0} blocks".format(len(ranges))
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        if trackids:
            print >> fwlog, ",".join(str(x) for x in sorted(selected))

        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        print >> sys.stderr, msg
        iteration += 1

    mbed = []
    for b in bed:
        id = b.accn
        atoms = []
        for track in tracks:
            track_ids = [x.id for x in track]
            for tid in track_ids:
                pairs = block_pairs[tid]
                anchor = pairs.get(id, ".")
                if anchor != ".":
                    break
            if ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)
        mbed.append((id, atoms))

    for id, atoms in mbed:
        sep = "" if ascii else "\t"
        if mergetandem:
            for i, atom in enumerate(atoms):
                atoms[i] = tandems.get(atom, atom)
        print >> fw, "\t".join((id, sep.join(atoms)))

    logging.debug("MCscan blocks written to `{0}`.".format(ofile))
    if trackids:
        logging.debug("Block IDs written to `{0}`.".format(olog))
Example 39
File: blast.py Project: wangwl/jcvi
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file, filtered by the following criteria:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - ids: valid ids

    Use --inverse to obtain the complementary records for the criteria above.

    - noself: remove self-self hits
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--score",
                 dest="score",
                 default=0,
                 type="int",
                 help="Score cutoff")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--noself",
                 default=False,
                 action="store_true",
                 help="Remove self-self hits")
    p.add_option("--ids", help="Path to file with ids to retain")
    p.add_option("--inverse",
                 default=False,
                 action="store_true",
                 help="Similar to grep -v, inverse")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, noself = \
            opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.noself
    newblastfile = blastfile + ".P{0}L{1}".format(int(pctid), hitlen) if \
                    outfile is None else outfile
    if inverse:
        newblastfile += ".inverse"
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            if c.query in ids and c.subject in ids:
                noids = False
            else:
                noids = True
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            noids

        if inverse:
            remove = not remove

        remove = remove or (noself and c.query == c.subject)

        if not remove:
            print >> fw, row.rstrip()

    fw.close()

    return newblastfile
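A hedged usage sketch (the file name is a placeholder):

# Keep hits with >= 95% identity and >= 100 bp, dropping self-self hits:
# filter(["test.blast", "--noself"])
# With the defaults set above (pctid=95, hitlen=100), this writes
# `test.blast.P95L100` and returns that file name.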
Example 40
File: blast.py Project: wangwl/jcvi
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See the supplementary info of the sea anemone genome paper for the C-score formula:

        cscore(A,B) = score(A,B) /
             max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff",
                 default=.9999,
                 type="float",
                 help="Minimum C-score to report [default: %default]")
    p.add_option("--pct",
                 default=False,
                 action="store_true",
                 help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast",
                 default=False,
                 action="store_true",
                 help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        s = score / max(best_score[query], best_score[subject])
        if s > cutoff:
            pair = (query, subject)
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    if writeblast:
        fwb = must_open(outfile + ".filtered.blast", "w")
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        args = [query, subject, "{0:.2f}".format(s)]
        if pct:
            args.append("{0:.1f}".format(pctid))
        print >> fw, "\t".join(args)
        if writeblast:
            print >> fwb, b
    fw.close()
    if writeblast:
        fwb.close()
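A minimal worked example of the C-score formula with toy scores (gene names and values are made up):

best_score = {"geneA": 200.0, "geneB": 180.0}  # best BLAST score per gene
score_AB = 180.0                               # score of the A-B hit itself
cscore = score_AB / max(best_score["geneA"], best_score["geneB"])
# cscore = 180 / 200 = 0.9; with the default cutoff of .9999, only
# (near-)reciprocal best hits are reported.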
Example 41
    def write(self, filename="stdout"):

        fw = must_open(filename, "w")
        for e in self.edges.values():
            print >> fw, e
        logging.debug("Graph written to `{0}`.".format(filename))
Example 42
File: blast.py Project: wangwl/jcvi
 def __init__(self, filename):
     super(Blast, self).__init__(filename)
     self.fp = must_open(filename)
Example 43
    def write_AGP(self, filename, orientationguide={}):
        """
        For each component, we have two overlaps: North and South.

        =======
           ||||             South
           ====(=================)  Current BAC
           North             ||||
                             ===============

        For the case that says "Non-terminal", the overlap will not be
        considered. North-South would suggest a '+' orientation, South-North
        would suggest a '-' orientation. In most cases, unless the overlap
        involves a phase 1 BAC, the selected range will be the bracketed span
        above - exclude the North overlap, and include the South overlap (the
        "left-greedy" rule).
        """
        fw = must_open(filename, "w")
        for aid, bb in groupby(self.lines, key=lambda x: x.aid):
            bb = list(bb)
            north, south = bb
            aid = north.aid
            assert aid == south.aid

            aphase = north.aphase
            chr = north.chr
            size = north.asize
            ar = [chr, 0, 0, 0]

            northline = southline = None
            northrange = southrange = None

            # Warn if adjacent components do not have valid overlaps
            if south.is_no_overlap:
                print >> sys.stderr, south

            # Most gaps, except telomeres, occur twice, so only do the "North"
            if north.is_gap:
                bar = ar + self.get_agp_gap(north.bid)
                northline = "\t".join(str(x) for x in bar)
            else:
                if north.isTerminal:
                    northrange = north.astart, north.astop

            if south.is_gap:
                if south.bid == "telomere":
                    bar = ar + self.get_agp_gap(south.bid)
                    southline = "\t".join(str(x) for x in bar)
            else:
                if south.isTerminal:
                    southrange = south.astart, south.astop
                else:
                    bar = ar + self.get_agp_gap("fragment")
                    southline = "\t".join(str(x) for x in bar)

            # Determine the orientation and clear range for the current BAC
            clr = [1, size]
            orientation = sorientation = None
            if northrange:
                start, stop = northrange
                Lhang = start - 1
                Rhang = size - stop

                orientation = '+' if Lhang < Rhang else '-'
                if north.bphase == 1 and north.bphase < aphase:
                    if Lhang < Rhang:  # North overlap at 5'
                        clr[0] = start
                    else:
                        clr[1] = stop
                # Override left-greedy (also see below)
                else:
                    if Lhang < Rhang:
                        clr[0] = stop + 1
                    else:
                        clr[1] = start - 1

            if southrange:
                start, stop = southrange
                Lhang = start - 1
                Rhang = size - stop

                sorientation = '+' if Lhang > Rhang else '-'
                # Override left-greedy (also see above)
                if aphase == 1 and aphase < south.bphase:
                    if Lhang < Rhang:  # South overlap at 5'
                        clr[0] = stop + 1
                    else:
                        clr[1] = start - 1
                else:
                    if Lhang < Rhang:
                        clr[0] = start
                    else:
                        clr[1] = stop

            if orientation:
                if sorientation:
                    try:
                        assert orientation == sorientation, \
                                "Orientation conflicts:\n{0}\n{1}".format(north, south)
                    except AssertionError as e:
                        logging.debug(e)
            else:
                if sorientation:
                    orientation = sorientation
                else:  # Both overlaps fail to define orientation
                    orientation = orientationguide.get(aid, "+")

            component_type = "D" if aphase in (1, 2) else "F"
            bar = ar + [component_type, aid, clr[0], clr[1], orientation]
            cline = "\t".join(str(x) for x in bar)

            if northline:
                print >> fw, northline
            print >> fw, cline
            if southline:
                print >> fw, southline

        fw.close()

        reindex([filename, "--inplace"])
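The orientation call reduces to comparing the two overhangs; a minimal sketch with made-up coordinates:

# Hypothetical North overlap near the 5' end of a 100 kb BAC:
size = 100000
start, stop = 200, 40000           # overlap start-stop on the current BAC
Lhang, Rhang = start - 1, size - stop
orientation = '+' if Lhang < Rhang else '-'    # small left hang -> '+'
# Per the left-greedy rule, the North overlap is excluded from the clear
# range (clr[0] = stop + 1), unless a phase 1 BAC is involved.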
Example 44
def dotplot(args):
    """
    %prog dotplot map.csv ref.fasta

    Make dotplot between chromosomes and linkage maps.
    The input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    """
    from jcvi.assembly.allmaps import CSVMapLine
    from jcvi.formats.sizes import Sizes
    from jcvi.utils.natsort import natsorted
    from jcvi.graphics.base import shorten
    from jcvi.graphics.dotplot import (
        plt,
        savefig,
        markup,
        normalize_axes,
        downsample,
        plot_breaks_and_labels,
        thousands,
    )

    p = OptionParser(dotplot.__doc__)
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args,
                                            figsize="8x8",
                                            style="dark",
                                            dpi=90,
                                            cmap="copper")

    if len(args) != 2:
        sys.exit(not p.print_help())

    csvfile, fastafile = args
    sizes = natsorted(Sizes(fastafile).mapping.items())
    seen = set()
    raw_data = []

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])  # the dot plot

    fp = must_open(csvfile)
    for row in fp:
        m = CSVMapLine(row)
        seen.add(m.seqid)
        raw_data.append(m)

    # X-axis is the genome assembly
    ctgs, ctg_sizes = zip(*sizes)
    xsize = sum(ctg_sizes)
    qb = list(np.cumsum(ctg_sizes))
    qbreaks = list(zip(ctgs, [0] + qb, qb))
    qstarts = dict(zip(ctgs, [0] + qb))

    # Y-axis is the map
    key = lambda x: x.lg
    raw_data.sort(key=key)
    ssizes = {}
    for lg, d in groupby(raw_data, key=key):
        ssizes[lg] = max([x.cm for x in d])
    ssizes = natsorted(ssizes.items())
    lgs, lg_sizes = zip(*ssizes)
    ysize = sum(lg_sizes)
    sb = list(np.cumsum(lg_sizes))
    sbreaks = list(zip([("LG" + x) for x in lgs], [0] + sb, sb))
    sstarts = dict(zip(lgs, [0] + sb))

    # Re-code all the scatter dots
    data = [(qstarts[x.seqid] + x.pos, sstarts[x.lg] + x.cm, "g")
            for x in raw_data if (x.seqid in qstarts)]
    npairs = downsample(data)

    x, y, c = zip(*data)
    ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)

    # Flip X-Y label
    gy, gx = op.basename(csvfile).split(".")[:2]
    gx, gy = shorten(gx, maxchar=30), shorten(gy, maxchar=30)
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                                        qbreaks, sbreaks)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    title = "Alignment: {} vs {}".format(gx, gy)
    title += " ({} markers)".format(thousands(npairs))
    root.set_title(markup(title), x=0.5, y=0.96, color="k")
    logging.debug(title)
    normalize_axes(root)

    image_name = opts.outfile or (csvfile.rsplit(".", 1)[0] + "." +
                                  iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
Example 45
 def print_to_anchors(self, outfile):
     fw = must_open(outfile, "w")
     for row in self:
         print(row.anchorline, file=fw)
     fw.close()
Example 46
def bed_to_bedpe(bedfile,
                 bedpefile,
                 pairsbedfile=None,
                 matesfile=None,
                 ca=False):
    """
    This converts the bedfile to bedpefile, assuming the reads are from CA.
    """
    fp = must_open(bedfile)
    fw = must_open(bedpefile, "w")
    if pairsbedfile:
        fwpairs = must_open(pairsbedfile, "w")

    clones = defaultdict(list)
    for row in fp:
        b = BedLine(row)
        name = b.accn
        clonename = clone_name(name, ca=ca)
        clones[clonename].append(b)

    if matesfile:
        fp = open(matesfile)
        libraryline = fp.next()
        # 'library bes     37896   126916'
        lib, name, smin, smax = libraryline.split()
        assert lib == "library"
        smin, smax = int(smin), int(smax)
        logging.debug("Happy mates for lib {0} fall between {1} - {2}".\
                      format(name, smin, smax))

    nbedpe = 0
    nspan = 0
    for clonename, blines in clones.items():
        if len(blines) == 2:
            a, b = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = b.seqid, b.start, b.end
            print >> fw, "\t".join(
                str(x) for x in (aseqid, astart - 1, aend, bseqid, bstart - 1,
                                 bend, clonename))
            nbedpe += 1
        else:
            a, = blines
            aseqid, astart, aend = a.seqid, a.start, a.end
            bseqid, bstart, bend = 0, 0, 0

        if pairsbedfile:
            start = min(astart, bstart) if bstart > 0 else astart
            end = max(aend, bend) if bend > 0 else aend
            if aseqid != bseqid:
                continue

            span = end - start + 1
            if (not matesfile) or (smin <= span <= smax):
                print >> fwpairs, "\t".join(str(x) for x in \
                        (aseqid, start - 1, end, clonename))
                nspan += 1

    fw.close()
    logging.debug("A total of {0} bedpe written to `{1}`.".\
                  format(nbedpe, bedpefile))
    if pairsbedfile:
        fwpairs.close()
        logging.debug("A total of {0} spans written to `{1}`.".\
                      format(nspan, pairsbedfile))
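A hedged usage sketch (file names are placeholders):

# bed_to_bedpe("reads.bed", "reads.bedpe",
#              pairsbedfile="clones.bed", matesfile="reads.mates", ca=True)
# Each clone with both ends mapped emits one BEDPE line; a span line is
# written only when both ends land on the same seqid (and, if a matesfile
# is given, the span falls within the library's happy range).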
Example 47
File: ca.py Project: zjwang6/jcvi
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads in a FASTA file to a CA frg file. If a .qual file is found,
    use it; otherwise make a fake qual file. Mates are assumed to be adjacent
    sequence records (i.e. /1, /2, /1, /2 ...) unless a matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--clean",
        default=False,
        action="store_true",
        help="Clean up irregular chars in seq",
    )
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen",
                 default=262143,
                 type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen",
                 default=1000,
                 type="int",
                 help="Minimum read length allowed")
    p.add_option(
        "--sequential",
        default=False,
        action="store_true",
        help="Overwrite read name (e.g. long Pacbio name)",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        split = False
        f = Fasta(fastafile, lazy=True)
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".
                    format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        if mated:
            cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    fw = must_open(frgfile, "w")
    print(headerTemplate.format(libID=libname), file=fw)

    sequential = opts.sequential
    i = j = 0
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug(
        "A total of {0} fragments written to `{1}` ({2} discarded).".format(
            i, frgfile, j))
Example 48
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
                        homologs.joined(bed[i-x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i - x].accn]
                        if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                            continue
                        g.join(bed[i - x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print >> fw, ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))

    return families
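Grouper behaves like a disjoint-set over gene names: join() merges groups and iterating yields the groups. Below is a minimal sketch of the idea, not the actual jcvi implementation:

class MiniGrouper(object):
    """Toy single-linkage grouper: join() merges, iteration yields groups."""

    def __init__(self):
        self.mapping = {}

    def join(self, a, b=None):
        ga = self.mapping.setdefault(a, set([a]))
        if b is None:
            return
        gb = self.mapping.setdefault(b, set([b]))
        if ga is not gb:                 # merge gb into ga
            ga.update(gb)
            for member in gb:
                self.mapping[member] = ga

    def __iter__(self):
        seen = []
        for group in self.mapping.values():
            if not any(group is s for s in seen):  # dedupe by identity
                seen.append(group)
                yield sorted(group)

# Usage: g = MiniGrouper(); g.join("a", "b"); g.join("b", "c")
# -> iterating g yields one family: ["a", "b", "c"]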
Example 49
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastq files are given, paired-end
    mode is invoked. See the manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
            help="Path to trimmomatic jar file [default: %default]")
    p.set_phred()
    p.add_option("--nofrags", default=False, action="store_true",
            help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
            help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
            help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
            help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
            help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
            help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
    "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
    .format(tv)

    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), \
        "Couldn't find Trimmomatic jar file at `{0}`".\
        format(path)

    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    phredflag = " -phred{0}".format(offset)
    threadsflag = " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        trimlog = " -trimlog {0}".format(opts.trimlog)

    cmd = "java -Xmx4g -jar {0}".format(path)
    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    get_prefix = lambda x: op.basename(x).replace(".gz", "").rsplit(".", 1)[0]
    get_dirname = lambda x: "{0}/".format(op.dirname(x)) if op.dirname(x) else ''
    if len(args) == 1:
        cmd += " SE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile, = args
        prefix = get_prefix(fastqfile)
        dirname = get_dirname(fastqfile)
        frags1 = dirname + prefix + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        cmd += " PE"
        cmd += phredflag
        cmd += threadsflag
        if opts.trimlog:
            cmd += trimlog
        fastqfile1, fastqfile2 = args
        prefix1 = get_prefix(fastqfile1)
        dirname1 = get_dirname(fastqfile1)
        prefix2 = get_prefix(fastqfile2)
        dirname2 = get_dirname(fastqfile2)
        pairs1 = dirname1 + prefix1 + pairs
        pairs2 = dirname2 + prefix2 + pairs
        frags1 = dirname1 + prefix1 + frags
        frags2 = dirname2 + prefix2 + frags
        if opts.nofrags:
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        cmd += " {0}".format(" ".join((fastqfile1, fastqfile2, \
                pairs1, frags1, pairs2, frags2)))

    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)

    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)

    cmd += " MINLEN:{0}".format(opts.minlen)

    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd)
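For a paired-end run with the defaults above, the assembled command resembles the following; the file names and the phred offset are illustrative assumptions:

# java -Xmx4g -jar ~/bin/trimmomatic-0.32.jar PE -phred33 -threads 4 \
#     a_1.fastq.gz a_2.fastq.gz \
#     a_1.pairs.fastq.gz a_1.frags.fastq.gz \
#     a_2.pairs.fastq.gz a_2.frags.fastq.gz \
#     ILLUMINACLIP:adapters.fasta:2:30:10 LEADING:3 TRAILING:3 \
#     SLIDINGWINDOW:4:15 MINLEN:36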
Example 50
def shred(args):
    """
    %prog shred fastafile

    Similar to the `shredContig` method in the runCA script. The contigs are
    shredded into pseudo-reads of a given length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen",
                 default=1000,
                 type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen",
                 default=0,
                 type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option(
        "--shift",
        default=50,
        type="int",
        help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta",
        default=False,
        action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print >> fw, headerTemplate.format(libID=libID)
    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth / shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in xrange(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in xrange(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
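A worked example of the shredding arithmetic under Python 2 integer division (all numbers are made up):

seqlen, depth, readlen, shift = 3000, 2, 1000, 50
shredlen = min(seqlen - shift, readlen)                         # 1000
numreads = max(seqlen * depth / shredlen, 1)                    # 6
center_range_width = seqlen - shredlen                          # 2000
center_increments = center_range_width * 1. / (numreads - 1)    # 400.0
# Read i covers [int(400 * i), int(400 * i) + 1000), giving starts
# 0, 400, 800, 1200, 1600, 2000 and roughly 2x coverage of the contig.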
Example 51
def certificate(args):
    """
    %prog certificate tpffile certificatefile

    Generate certificate file for all overlaps in tpffile. tpffile can be
    generated by jcvi.formats.agp.tpf().

    North  chr1  2  0  AC229737.8  telomere     58443
    South  chr1  2  1  AC229737.8  AC202463.29  58443  37835  58443  + Non-terminal

    Each line describes a relationship between the current BAC and the
    north/south BAC. First the "North/South" tag, then the chromosome, the
    phases of the two BACs, the IDs of the two BACs, the size and overlap
    start-stop of the CURRENT BAC, and the orientation. Each BAC has two
    lines in the certificate file.
    """
    p = OptionParser(certificate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    tpffile, certificatefile = args
    fastadir = "fasta"

    tpf = TPF(tpffile)

    data = check_certificate(certificatefile)
    fw = must_open(certificatefile, "w")
    for i, a in enumerate(tpf):
        if a.is_gap:
            continue

        aid = a.component_id

        af = op.join(fastadir, aid + ".fasta")
        if not op.exists(af):  # Check to avoid redownload
            entrez([aid, "--skipcheck", "--outdir=" + fastadir])

        north, south = tpf.getNorthSouthClone(i)
        aphase, asize = phase(aid)

        for tag, p in (("North", north), ("South", south)):
            if not p:  # end of the chromosome
                ov = "telomere\t{0}".format(asize)
            elif p.isCloneGap:
                bphase = "0"
                ov = "{0}\t{1}".format(p.gap_type, asize)
            else:
                bid = p.component_id
                bphase, bsize = phase(bid)
                key = (tag, aid, bid)
                if key in data:
                    print >> fw, data[key]
                    continue

                ar = [aid, bid, "--dir=" + fastadir]
                o = overlap(ar)
                ov = o.certificateline if o \
                        else "{0}\t{1}\tNone".format(bid, asize)

            print >> fw, "\t".join(str(x) for x in \
                    (tag, a.object, aphase, bphase, aid, ov))
            fw.flush()
Example 52
def subset(args):
    """
    %prog subset pairsfile ksfile1 ksfile2 ... -o pairs.ks

    Subset pre-calculated Ks/Ka values (in ksfile) according to pairs
    in a tab-delimited pairsfile/anchorfile.
    """
    p = OptionParser(subset.__doc__)
    p.add_option("--noheader", action="store_true",
                 help="don't write ksfile header line [default: %default]")
    p.add_option("--block", action="store_true",
                 help="preserve block structure in input [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, ksfiles = args[0], args[1:]
    noheader = opts.noheader
    block = opts.block
    if block:
        noheader = True
    outfile = opts.outfile

    ksvals = {}
    for ksfile in ksfiles:
        ksvals.update(dict((line.name, line) for line in \
                        KsFile(ksfile, strip_names=opts.strip_names)))

    fp = open(pairsfile)
    fw = must_open(outfile, "w")

    if not noheader:
        print >>fw, fields

    i = j = 0
    for row in fp:
        if row[0] == '#':
            if block:
                print >>fw, row.strip()
            continue
        a, b = row.split()[:2]
        name = ";".join((a, b))
        if name not in ksvals:
            name = ";".join((b, a))
            if name not in ksvals:
                j += 1
                print >>fw, "\t".join((a, b, ".", "."))
                continue
        ksline = ksvals[name]
        if block:
            print >>fw, "\t".join(str(x) for x in (a, b, ksline.ks))
        else:
            ksline.name = ";".join((a, b))
            print >>fw, ksline
        i += 1
    fw.close()

    logging.debug("{0} pairs not found in ksfiles".format(j))
    logging.debug("{0} ks records written to `{1}`".format(i, outfile))
    return outfile
Example 53
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ with a BLAST-like interface, generating -m8 tabular format
    """
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a",
                 "-A",
                 dest="cpus",
                 default=1,
                 type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path",
                 dest="lastz_path",
                 default=None,
                 help="specify LASTZ path")
    p.add_option(
        "--mask",
        dest="mask",
        default=False,
        action="store_true",
        help="treat lower-case letters as mask info [default: %default]")
    p.add_option(
        "--similar",
        default=False,
        action="store_true",
        help="Use options tuned for close comparison [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        sys.exit(p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        print >> sys.stderr, "Running jobs on JCVI grid"

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith(
        "lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    format = opts.format
    blastline = (format == "BLASTN-")

    # The axt, maf, etc. formats can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then processed in parallel,
    # as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for id in bids:
            bfasta = "/".join((bfasta_2bit, id))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, id, format))
            args.append((bfasta, afasta_fn, outfile, \
                         lastz_bin, extra, mask, format, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in args]
            g = Grid(cmds)
            g.run()
            g.writestatus()

        p = Pool(cpus)
        p.map(lastz_2bit, args)

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, \
                lock, lastz_bin, extra, mask, grid) for k in xrange(cpus)]
        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()

    else:
        args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, lock, lastz_bin,
                 extra, mask) for k in xrange(cpus)]
        g = Jobs(target=lastz, args=args)
        g.run()
Example 54
 def print_to_anchors(self, outfile):
     fw = must_open(outfile, "w")
     for row in self:
         print >> fw, row.anchorline
     fw.close()
Example 55
def path(args):
    """
    %prog path input.bed scaffolds.fasta

    Construct golden path given a set of genetic maps. The respective weight for
    each map is given in file `weights.txt`. The map with the highest weight is
    considered the pivot map. The final output is an AGP file that contains
    ordered scaffolds.
    """
    oargs = args
    p = OptionParser(path.__doc__)
    p.add_option("-b", "--bedfile", help=SUPPRESS_HELP)
    p.add_option("-s", "--fastafile", help=SUPPRESS_HELP)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Use weights from file")
    p.add_option("--distance", default="rank", choices=distance_choices,
                 help="Distance function when building initial consensus")
    p.add_option("--linkage", default="double", choices=linkage_choices,
                 help="Linkage function when building initial consensus")
    p.add_option("--gapsize", default=100, type="int",
                 help="Insert gaps of size between scaffolds")
    p.add_option("--ngen", default=500, type="int",
                 help="Iterations in GA, more ~ slower")
    p.add_option("--npop", default=100, type="int",
                 help="Population size in GA, more ~ slower")
    p.add_option("--seqid", help="Only run partition with this seqid")
    p.add_option("--links", default=10, type="int",
                 help="Only plot matchings more than")
    p.add_option("--noplot", default=False, action="store_true",
                 help="Do not visualize the alignments")
    p.set_cpus(cpus=16)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, fastafile = args
    inputbed = opts.bedfile or inputbed
    fastafile = opts.fastafile or fastafile
    pf = inputbed.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    weightsfile = opts.weightsfile
    gapsize = opts.gapsize
    ngen = opts.ngen
    npop = opts.npop
    cpus = opts.cpus
    if sys.version_info[:2] < (2, 7):
        logging.debug("Python version: {0}. CPUs set to 1.".\
                        format(sys.version.splitlines()[0].strip()))
        cpus = 1

    function = get_function(opts.distance)
    cc = Map(bedfile, function)
    mapnames = cc.mapnames
    allseqids = cc.seqids
    weights = Weights(weightsfile, mapnames)
    pivot = weights.pivot
    ref = weights.ref
    linkage = opts.linkage
    oseqid = opts.seqid
    logging.debug("Linkage function: {0}-linkage".format(linkage))
    linkage = {"single": min, "double": double_linkage, "complete": max,
               "average": np.mean, "median": np.median}[linkage]

    # Partition the linkage groups into consensus clusters
    C = Grouper()
    # Initialize the partitions
    for mlg in cc.mlgs:
        C.join(mlg)

    logging.debug("Partition LGs based on {0}".format(ref))
    for mapname in mapnames:
        if mapname == ref:
            continue
        # Compute co-occurrence between LG pairs
        G = defaultdict(int)
        for s in allseqids:
            s = Scaffold(s, cc)
            s.add_LG_pairs(G, (ref, mapname))
        # Convert edge list to adj list
        nodes = defaultdict(list)
        for (a, b), w in G.items():
            nodes[a].append((b, w))
        # Find the best ref LG every non-ref LG matches to
        for n, neighbors in nodes.items():
            if n.split("-")[0] == ref:
                continue
            neighbors = dict(neighbors)
            best_neighbor, best_value = best_no_ambiguous(neighbors, n)
            if best_neighbor is None:
                continue
            C.join(n, best_neighbor)

    partitions = defaultdict(list)
    # Partition the scaffolds and assign them to one consensus
    for s in allseqids:
        s = Scaffold(s, cc)
        seqid = s.seqid
        counts = {}
        for mlg, count in s.mlg_counts.items():
            consensus = C[mlg]
            mapname = mlg.split("-")[0]
            mw = weights[mapname]
            if consensus not in counts:
                counts[consensus] = 0
            counts[consensus] += count * mw
        best_consensus, best_value = best_no_ambiguous(counts, seqid)
        if best_consensus is None:
            continue
        partitions[best_consensus].append(seqid)

    # Perform OO within each partition
    agpfile = pf + ".chr.agp"
    tourfile = pf + ".tour"
    sizes = Sizes(fastafile).mapping
    fwagp = must_open(agpfile, "w")
    fwtour = must_open(tourfile, "w")
    solutions = []
    for lgs, scaffolds in sorted(partitions.items()):
        if oseqid and oseqid not in lgs:
            continue
        tag = "|".join(lgs)
        lgs_maps = set(x.split("-")[0] for x in lgs)
        if pivot not in lgs_maps:
            logging.debug("Skipping {0} ...".format(tag))
            continue
        logging.debug("Working on {0} ...".format(tag))
        s = ScaffoldOO(lgs, scaffolds, cc, pivot, weights, sizes,
                       function=function, linkage=linkage,
                       ngen=ngen, npop=npop, cpus=cpus)

        for fw in (sys.stderr, fwtour):
            print >> fw, ">{0} ({1})".format(s.object, tag)
            print >> fw, " ".join("".join(x) for x in s.tour)
        solutions.append(s)
    fwtour.close()

    # meta-data about the run parameters
    command = "# COMMAND: python -m jcvi.assembly.allmaps path {0}".\
                     format(" ".join(oargs))
    comment = "Generated by ALLMAPS v{0} ({1})\n{2}".\
                     format(version, get_today(), command)
    AGP.print_header(fwagp, comment=comment)

    for s in sorted(solutions, key=lambda x: x.object):
        order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize,
                     gaptype="map")
    fwagp.close()

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    logging.debug("Tour file written to `{0}`.".format(tourfile))

    build([inputbed, fastafile])

    summaryfile = pf + ".summary.txt"
    summary([inputbed, fastafile, "--outfile={0}".format(summaryfile)])

    if not opts.noplot:
        plotall([inputbed, "--links={0}".format(opts.links)])
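The body of best_no_ambiguous is not shown in this snippet; below is a plausible sketch, under the assumption that a tied top score counts as ambiguous and is rejected (the name best_no_ambiguous_sketch and this behavior are assumptions, not the jcvi source):

def best_no_ambiguous_sketch(counts, label):
    # Hypothetical stand-in: return (key, value) for the single best-scoring
    # entry, or (None, None) when `counts` is empty or the top score is tied.
    if not counts:
        return None, None
    best_value = max(counts.values())
    winners = [k for k, v in counts.items() if v == best_value]
    if len(winners) > 1:
        return None, None      # ambiguous: `label` is left unassigned
    return winners[0], best_value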
Example 56
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    The protein file is optional. If only one file is given, it is assumed to
    contain CDS sequences in the correct frame (frame 0). Results are written
    to stdout. Both the protein and nucleotide files are assumed to be in
    Fasta format, with adjacent records forming the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetch a protein pair.
        2. Align the protein pair with clustalw (default) or muscle.
        3. Convert the output to Fasta format.
        4. Use this alignment to align the gene sequences with PAL2NAL.
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "\
                      "e.g. ESTs [default: %default]")
    p.add_option("--msa", default="clustalw", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print >>sys.stderr, "Incorrect arguments"
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print >> output_h, fields
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
            zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

        print >>sys.stderr, "--------", p_rec_1.name, p_rec_2.name
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if mrtrans_fasta:
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                    find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write("%s\n" % (",".join(str(x) for x in (pair_name,
                        ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng))))
                output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Example 57
def seeds(args):
    """
    %prog seeds [pngfile|jpgfile]

    Extract seed metrics from [pngfile|jpgfile]. Use --rows and --cols to crop image.
    """
    p = OptionParser(seeds.__doc__)
    p.set_outfile()
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (pngfile,) = args
    pf = opts.prefix or op.basename(pngfile).rsplit(".", 1)[0]
    sigma, kernel = opts.sigma, opts.kernel
    rows, cols = opts.rows, opts.cols
    labelrows, labelcols = opts.labelrows, opts.labelcols
    ff = opts.filter
    calib = opts.calibrate
    outdir = opts.outdir
    if outdir != ".":
        mkdir(outdir)
    if calib:
        calib = json.load(must_open(calib))
        pixel_cm_ratio, tr = calib["PixelCMratio"], calib["RGBtransform"]
        tr = np.array(tr)
    nbcolor = opts.changeBackground
    pngfile = convert_background(pngfile, nbcolor)
    resizefile, mainfile, labelfile, exif = convert_image(
        pngfile,
        pf,
        outdir=outdir,
        rotate=opts.rotate,
        rows=rows,
        cols=cols,
        labelrows=labelrows,
        labelcols=labelcols,
    )
    oimg = load_image(resizefile)
    img = load_image(mainfile)

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(
        ncols=4, nrows=1, figsize=(iopts.w, iopts.h)
    )
    # Edge detection
    img_gray = rgb2gray(img)
    logging.debug("Running {0} edge detection ...".format(ff))
    if ff == "canny":
        edges = canny(img_gray, sigma=opts.sigma)
    elif ff == "roberts":
        edges = roberts(img_gray)
    elif ff == "sobel":
        edges = sobel(img_gray)
    edges = clear_border(edges, buffer_size=opts.border)
    selem = disk(kernel)
    closed = closing(edges, selem) if kernel else edges
    filled = binary_fill_holes(closed)

    # Watershed algorithm
    if opts.watershed:
        distance = distance_transform_edt(filled)
        local_maxi = peak_local_max(distance, threshold_rel=0.05, indices=False)
        coordinates = peak_local_max(distance, threshold_rel=0.05)
        markers, nmarkers = label(local_maxi, return_num=True)
        logging.debug("Identified {0} watershed markers".format(nmarkers))
        labels = watershed(closed, markers, mask=filled)
    else:
        labels = label(filled)

    # Object size filtering
    w, h = img_gray.shape
    canvas_size = w * h
    min_size = int(round(canvas_size * opts.minsize / 100))
    max_size = int(round(canvas_size * opts.maxsize / 100))
    logging.debug(
        "Find objects with pixels between {0} ({1}%) and {2} ({3}%)".format(
            min_size, opts.minsize, max_size, opts.maxsize
        )
    )

    # Plotting
    ax1.set_title("Original picture")
    ax1.imshow(oimg)

    params = "{0}, $\sigma$={1}, $k$={2}".format(ff, sigma, kernel)
    if opts.watershed:
        params += ", watershed"
    ax2.set_title("Edge detection\n({0})".format(params))
    closed = gray2rgb(closed)
    ax2_img = labels
    if opts.edges:
        ax2_img = closed
    elif opts.watershed:
        ax2.plot(coordinates[:, 1], coordinates[:, 0], "g.")
    ax2.imshow(ax2_img, cmap=iopts.cmap)

    ax3.set_title("Object detection")
    ax3.imshow(img)

    filename = op.basename(pngfile)
    if labelfile:
        accession = extract_label(labelfile)
    else:
        accession = pf

    # Calculate region properties
    rp = regionprops(labels)
    rp = [x for x in rp if min_size <= x.area <= max_size]
    nb_labels = len(rp)
    logging.debug("A total of {0} objects identified.".format(nb_labels))
    objects = []
    for i, props in enumerate(rp):
        i += 1
        if i > opts.count:
            break

        y0, x0 = props.centroid
        orientation = props.orientation
        major, minor = props.major_axis_length, props.minor_axis_length
        major_dx = cos(orientation) * major / 2
        major_dy = sin(orientation) * major / 2
        minor_dx = sin(orientation) * minor / 2
        minor_dy = cos(orientation) * minor / 2
        ax2.plot((x0 - major_dx, x0 + major_dx), (y0 + major_dy, y0 - major_dy), "r-")
        ax2.plot((x0 - minor_dx, x0 + minor_dx), (y0 - minor_dy, y0 + minor_dy), "r-")

        npixels = int(props.area)
        # Sample the center of the blob for color
        d = min(int(round(minor / 2 * 0.35)) + 1, 50)
        x0d, y0d = int(round(x0)), int(round(y0))
        square = img[(y0d - d) : (y0d + d), (x0d - d) : (x0d + d)]
        pixels = []
        for row in square:
            pixels.extend(row)
        logging.debug(
            "Seed #{0}: {1} pixels ({2} sampled) - {3:.2f}%".format(
                i, npixels, len(pixels), 100.0 * npixels / canvas_size
            )
        )

        rgb = pixel_stats(pixels)
        objects.append(Seed(filename, accession, i, rgb, props, exif))
        minr, minc, maxr, maxc = props.bbox
        rect = Rectangle(
            (minc, minr), maxc - minc, maxr - minr, fill=False, ec="w", lw=1
        )
        ax3.add_patch(rect)
        mc, mr = (minc + maxc) / 2, (minr + maxr) / 2
        ax3.text(mc, mr, "{0}".format(i), color="w", ha="center", va="center", size=6)

    for ax in (ax2, ax3):
        ax.set_xlim(0, h)
        ax.set_ylim(w, 0)

    # Output identified seed stats
    ax4.text(0.1, 0.92, "File: {0}".format(latex(filename)), color="g")
    ax4.text(0.1, 0.86, "Label: {0}".format(latex(accession)), color="m")
    yy = 0.8
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        print(Seed.header(calibrate=calib), file=fw)
    for o in objects:
        if calib:
            o.calibrate(pixel_cm_ratio, tr)
        print(o, file=fw)
        i = o.seedno
        if i > 7:
            continue
        ax4.text(0.01, yy, str(i), va="center", bbox=dict(fc="none", ec="k"))
        ax4.text(0.1, yy, o.pixeltag, va="center")
        yy -= 0.04
        ax4.add_patch(
            Rectangle((0.1, yy - 0.025), 0.12, 0.05, lw=0, fc=rgb_to_hex(o.rgb))
        )
        ax4.text(0.27, yy, o.hashtag, va="center")
        yy -= 0.06
    ax4.text(
        0.1,
        yy,
        "(A total of {0} objects displayed)".format(nb_labels),
        color="darkslategray",
    )
    normalize_axes(ax4)

    for ax in (ax1, ax2, ax3):
        xticklabels = [int(x) for x in ax.get_xticks()]
        yticklabels = [int(x) for x in ax.get_yticks()]
        ax.set_xticklabels(xticklabels, family="Helvetica", size=8)
        ax.set_yticklabels(yticklabels, family="Helvetica", size=8)

    image_name = op.join(outdir, pf + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    return objects
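The axis-endpoint math and center-square color sampling above can be exercised in isolation. Below is a minimal sketch, assuming scikit-image's regionprops conventions (orientation in radians, measured against the vertical axis) and using a per-channel median as a stand-in for the project's pixel_stats helper, which is an assumption about its intent, not its actual implementation.

from math import cos, sin

import numpy as np
from skimage.measure import label, regionprops

# Toy image with one rectangular "seed" on a black background
img = np.zeros((60, 80, 3), dtype=np.uint8)
img[20:40, 10:60] = (180, 140, 60)
mask = img[..., 0] > 0

props = regionprops(label(mask))[0]
y0, x0 = props.centroid
t, major = props.orientation, props.major_axis_length
# Endpoints of the major axis, mirroring the ax2.plot() calls above
p1 = (x0 + cos(t) * major / 2, y0 - sin(t) * major / 2)
p2 = (x0 - cos(t) * major / 2, y0 + sin(t) * major / 2)

# Sample a small square at the centroid, as done for each blob above;
# median per channel is a stand-in for the pixel_stats project helper
d = max(int(round(props.minor_axis_length / 2 * 0.35)), 1)
yc, xc = int(round(y0)), int(round(x0))
square = img[yc - d : yc + d, xc - d : xc + d]
rgb = tuple(int(np.median(square[..., c])) for c in range(3))
print(p1, p2, rgb)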
Example n. 58
def query(args):
    """
    %prog query "SELECT feat_name FROM asm_feature WHERE feat_type = \\"{0}\\" AND end5 <= \\"{1}\\" AND end3 >= \\"{2}\\"" ::: datafile1 ....

    Takes data from tab-delimited datafile(s) and substitutes it for the
    placeholders in the query, which is then executed. Depending on the type of
    query, results are either printed out (when running `select`) or not (when
    running `insert`, `update` or `delete`).

    If the query contains quotes around field values, these need to be escaped with \\
    """
    p = OptionParser(query.__doc__)
    p.set_db_opts()
    p.add_option(
        "--dryrun",
        default=False,
        action="store_true",
        help="Don't commit to database. Just print queries [default: %default]"
    )
    p.set_sep(help="Specify output field separator")
    p.set_verbose(help="Print out all the queries")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    fieldsep = opts.sep

    sep = ":::"
    files = None
    if sep in args:
        sepidx = args.index(sep)
        files = args[sepidx + 1:]
        args = args[:sepidx]
        if not files:
            files = [""]

    qrys = []
    qry = " ".join(args)
    if ";" in qry:
        for q in qry.split(";"):
            if len(q.strip()) > 0:
                qrys.append(q)
    else:
        qrys.append(qry)

    queries = set()
    if files:
        for datafile in files:
            datafile = datafile.strip()
            fp = must_open(datafile)
            for row in fp:
                for qry in qrys:
                    qry = qry.strip()
                    m = re.findall(r"\{\d+\}", qry)
                    if m:
                        mi = [int(x.strip("{}")) for x in m]
                        atoms = row.strip().split("\t")
                        assert max(mi) < len(atoms), \
                                "Placeholder index ({0}) out of range".format(max(mi)) + \
                                " for datafile with {0} columns".format(len(atoms))
                        natoms = [atoms[x] for x in mi]
                        for match, atom in zip(m, natoms):
                            qry = qry.replace(match, atom)
                    queries.add(qry)
    else:
        for qry in qrys:
            if re.search(r"\{\d+\}", qry):
                logging.error(
                    "Query `{0}` contains placeholders, but no datafile(s) specified"
                    .format(qry))
                sys.exit()
            queries.add(qry)

    if not opts.dryrun:
        fw = must_open(opts.outfile, "w")
        dbh, cur = connect(opts.dbname, connector=opts.dbconn, hostname=opts.hostname, \
                username=opts.username, password=opts.password, port=opts.port)
    cflag = None
    for qry in queries:
        if opts.dryrun or opts.verbose:
            print(qry)
        if not opts.dryrun:
            if to_commit(qry):
                execute(cur, qry)
                cflag = True
            else:
                results = fetchall(cur, qry, connector=opts.dbconn)
                for result in results:
                    print(fieldsep.join([str(x) for x in result]), file=fw)
    if not opts.dryrun and cflag:
        commit(dbh)
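The placeholder mechanism is the heart of query(): every {N} token in the template is replaced by the N-th column of the current datafile row. A self-contained toy run of that substitution (the template and row below are illustrative, not from the original script):

import re

template = 'SELECT feat_name FROM asm_feature WHERE end5 <= "{0}" AND end3 >= "{1}"'
row = "12345\t67890\tchr1"

atoms = row.split("\t")
for match in re.findall(r"\{\d+\}", template):
    idx = int(match.strip("{}"))
    assert idx < len(atoms), "placeholder index out of range"
    template = template.replace(match, atoms[idx])
print(template)
# SELECT feat_name FROM asm_feature WHERE end5 <= "12345" AND end3 >= "67890"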
Example n. 59
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% observed for autosomes + X, >= 25% for chrY)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option("--cutoff",
                 default=.5,
                 type="float",
                 help="Percent observed required (chrY half cutoff)")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            print("\t".join((locus, af, remove)), file=fw)
        fw.close()

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        #enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    TREDS, df = read_treds()

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))
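alleles_to_counts, counts_to_af and counts_filter are project helpers not shown here, but the per-locus bookkeeping they implement can be sketched with the standard library. A stand-in that assumes missing calls are coded as negative values (the coding is an assumption, not from the source):

from collections import Counter

def allele_freq(calls):
    # Drop missing data (assumed to be coded as negative values)
    counts = Counter(x for x in calls if x >= 0)
    total = sum(counts.values())
    return {allele: n / total for allele, n in counts.items()} if total else {}

print(allele_freq([12, 12, 13, -1, 12, 14, -1]))
# {12: 0.6, 13: 0.2, 14: 0.2}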
Example n. 60
def estimategaps(args):
    """
    %prog estimategaps input.bed

    Estimate sizes of inter-scaffold gaps. The AGP file generated by path()
    command has unknown gap sizes with a generic number of Ns (often 100 Ns).
    The AGP file `input.chr.agp` will be modified in-place.
    """
    p = OptionParser(estimategaps.__doc__)
    p.add_option("--minsize", default=100, type="int",
                 help="Minimum gap size")
    p.add_option("--maxsize", default=500000, type="int",
                 help="Maximum gap size")
    p.add_option("--links", default=10, type="int",
                 help="Only use linkage grounds with matchings more than")
    p.set_verbose(help="Print details for each gap calculation")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    inputbed, = args
    pf = inputbed.rsplit(".", 1)[0]
    agpfile = pf + ".chr.agp"
    bedfile = pf + ".lifted.bed"

    cc = Map(bedfile, scaffold_info=True)
    agp = AGP(agpfile)
    minsize, maxsize = opts.minsize, opts.maxsize
    links = opts.links
    verbose = opts.verbose

    outagpfile = pf + ".estimategaps.agp"
    fw = must_open(outagpfile, "w")

    for ob, components in agp.iter_object():
        components = list(components)
        s = Scaffold(ob, cc)
        mlg_counts = s.mlg_counts
        gaps = [x for x in components if x.is_gap]
        gapsizes = [None] * len(gaps)   # master
        for mlg, count in mlg_counts.items():
            if count < links:
                continue
            g = GapEstimator(cc, agp, ob, mlg)
            g.compute_all_gaps(minsize=minsize, maxsize=maxsize, \
                               verbose=verbose)
            # Merge evidence from this mlg into master
            assert len(g.gapsizes) == len(gaps)
            for i, gs in enumerate(gapsizes):
                gg = g.gapsizes[i]
                if gs is None:
                    gapsizes[i] = gg
                elif gg:
                    gapsizes[i] = min(gs, gg)

        print(gapsizes)  # merged gap size estimates for this object
        # Modify AGP
        i = 0
        for x in components:
            if x.is_gap:
                x.gap_length = gapsizes[i] or minsize
                x.component_type = 'U' if x.gap_length == 100 else 'N'
                i += 1
            print(x, file=fw)

    fw.close()
    reindex([outagpfile, "--inplace"])
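The evidence-merging loop above keeps, for each gap, the smallest defined estimate seen across linkage groups. The same rule in isolation (a sketch of the logic only, not part of the jcvi API):

def merge_gap_estimates(master, new):
    # Keep the smallest non-None estimate per gap position
    merged = []
    for gs, gg in zip(master, new):
        if gs is None:
            merged.append(gg)
        elif gg:
            merged.append(min(gs, gg))
        else:
            merged.append(gs)
    return merged

print(merge_gap_estimates([None, 5000, 300], [1200, 4000, None]))
# [1200, 4000, 300]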