Python SetFile Examples, jcvi.formats.base.SetFile Python Examples

Example #1

0

Show file

File: aws.py Project: xupengjack/jcvi

def ls(args):
    """
    %prog ls "s3://hli-mv-data-science/htang/str/*.vcf.gz"

    List files with support for wildcards.
    """
    p = OptionParser(ls.__doc__)
    p.add_option("--keys", help="List of keys to include")
    p.add_option("--recursive",
                 default=False,
                 action="store_true",
                 help="Recursive search")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    store, = args
    keys = opts.keys
    if keys:
        keys = SetFile(keys)
    print("\n".join(glob_s3(store, keys=keys, recursive=opts.recursive)))

Example #2

0

Show file

File: unitig.py Project: zjwang6/jcvi

def cut(args):
    """
    %prog cut unitigfile fragID

    Cut the unitig at a given fragment. Run `%prog trace unitigfile` first to
    see which fragment breaks the unitig.
    """
    from jcvi.formats.base import SetFile

    p = OptionParser(cut.__doc__)
    p.add_option("-s", dest="shredafter", default=False, action="store_true",
                 help="Shred fragments after the given fragID [default: %default]")
    p.add_option("--notest", default=False, action="store_true",
                 help="Do not test the unitigfile after edits [default: %default]")
    p.add_option("--blacklist",
                 help="File that contains blacklisted fragments to be popped "
                      "[default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    s, fragID = args
    u = UnitigLayout(s)
    blacklist = opts.blacklist
    black = SetFile(blacklist) if blacklist else None

    if opts.shredafter:
        u.shredafter(fragID)
    elif black:
        assert fragID == "0", "Must set fragID to 0 when --blacklist is on"
        u.pop(black)
    else:
        u.cut(fragID)
    u.print_to_file(inplace=True)

    if not opts.notest:
        test([s])

Example #3

0

Show file

def screen(args):
    """
    %prog screen anchorfile newanchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Extract subset of blocks from anchorfile. Provide several options:

    1. Option --ids: a file with IDs, 0-based, comma separated, all in one line.
    2. Option --seqids: only allow seqids in this file.
    3. Option --seqpairs: only allow seqpairs in this file, one per line, e.g. "Chr01,Chr05".
    4. Option --minspan: remove blocks with less span than this.
    5. Option --minsize: remove blocks with less number of anchors than this.
    """
    p = OptionParser(screen.__doc__)
    p.set_beds()

    p.add_option("--ids",
                 help="File with block IDs (0-based) [default: %default]")
    p.add_option("--seqids", help="File with seqids [default: %default]")
    p.add_option("--seqpairs", help="File with seqpairs [default: %default]")
    p.add_option("--nointra",
                 action="store_true",
                 help="Remove intra-chromosomal blocks [default: %default]")
    p.add_option("--minspan",
                 default=0,
                 type="int",
                 help="Only blocks with span >= [default: %default]")
    p.add_option("--minsize",
                 default=0,
                 type="int",
                 help="Only blocks with anchors >= [default: %default]")
    p.add_option(
        "--simple",
        action="store_true",
        help="Write simple anchorfile with block ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    anchorfile, newanchorfile = args
    ac = AnchorFile(anchorfile)
    idsfile = opts.ids
    seqidsfile = opts.seqids
    seqpairsfile = opts.seqpairs
    minspan = opts.minspan
    minsize = opts.minsize
    osimple = opts.simple
    nointra = opts.nointra
    ids, seqids, seqpairs = None, None, None

    if idsfile:
        ids = SetFile(idsfile, delimiter=',')
        ids = set(int(x) for x in ids)
    if seqidsfile:
        seqids = SetFile(seqidsfile, delimiter=',')
    if seqpairsfile:
        fp = open(seqpairsfile)
        seqpairs = set()
        for row in fp:
            a, b = row.strip().split(",")
            seqpairs.add((a, b))
            seqpairs.add((b, a))

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    blocks = ac.blocks
    selected = 0
    fw = open(newanchorfile, "w")

    for i, block in enumerate(blocks):
        if ids and i not in ids:
            continue

        a, b, scores = zip(*block)
        a = [qorder[x] for x in a]
        b = [sorder[x] for x in b]
        ia, oa = zip(*a)
        ib, ob = zip(*b)
        aspan = max(ia) - min(ia) + 1
        bspan = max(ib) - min(ib) + 1
        aseqid = oa[0].seqid
        bseqid = ob[0].seqid

        if seqids:
            if (aseqid not in seqids) or (bseqid not in seqids):
                continue

        if seqpairs:
            if (aseqid, bseqid) not in seqpairs:
                continue

        if nointra and aseqid == bseqid:
            continue

        if minsize:
            if len(block) < minsize:
                continue

        if minspan:
            if aspan < minspan or bspan < minspan:
                continue

        selected += 1
        print >> fw, "###"
        for line in block:
            print >> fw, "\t".join(line)

    fw.close()

    if osimple:
        simple([newanchorfile, "--noheader", \
                "--qbed=" + qbed.filename, "--sbed=" + sbed.filename])

    logging.debug("Before: {0} blocks, After: {1} blocks".\
                  format(len(blocks), selected))

Example #4

0

Show file

def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov",
                 default=.01,
                 type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option(
        "--all",
        default=False,
        action="store_true",
        help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color",
                 default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout",
                 default=False,
                 action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    reffasta, queryfasta = open(deltafile).readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([
        deltafile, "--pctid={0}".format(pctid), "--hitlen={0}".format(hitlen)
    ])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r],
                                        qsizes,
                                        rsizes,
                                        deltafile,
                                        refcov,
                                        prefix=prefix,
                                        color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs,
                          qsizes,
                          rsizes,
                          deltafile,
                          refcov,
                          prefix=prefix,
                          color=color,
                          layout=layout)

Example #5

0

Show file

 def get_blacklist(self, filename):
     black = SetFile(filename)
     black = set(atg_name(x) for x in black)
     self.black.update(black)

Example #6

0

Show file

def adjgraph(args):
    """
    %prog adjgraph adjacency.txt subgraph.txt

    Construct adjacency graph for graphviz. The file may look like sample below.
    The lines with numbers are chromosomes with gene order information.

    genome 0
    chr 0
    -1 -13 -16 3 4 -6126 -5 17 -6 7 18 5357 8 -5358 5359 -9 -10 -11 5362 5360
    chr 1
    138 6133 -5387 144 -6132 -139 140 141 146 -147 6134 145 -170 -142 -143
    """
    import pygraphviz as pgv

    from jcvi.formats.base import SetFile

    p = OptionParser(adjgraph.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    infile, subgraph = args
    subgraph = SetFile(subgraph)
    subgraph = set(x.strip("-") for x in subgraph)

    G = pgv.AGraph(strict=False)  # allow multi-edge
    SG = pgv.AGraph(strict=False)

    palette = ("green", "magenta", "tomato", "peachpuff")
    fp = open(infile)
    genome_id = -1
    key = 0
    for row in fp:
        if row.strip() == "":
            continue

        atoms = row.split()
        tag = atoms[0]
        if tag in ("ChrNumber", "chr"):
            continue

        if tag == "genome":
            genome_id += 1
            gcolor = palette[genome_id]
            continue

        nodeseq = []
        for p in atoms:
            np = p.strip("-")
            nodeL, nodeR = np + "L", np + "R"
            if p[0] == "-":  # negative strand
                nodeseq += [nodeR, nodeL]
            else:
                nodeseq += [nodeL, nodeR]

        for a, b in pairwise(nodeseq):
            G.add_edge(a, b, key, color=gcolor)
            key += 1

            na, nb = a[:-1], b[:-1]
            if na not in subgraph and nb not in subgraph:
                continue

            SG.add_edge(a, b, key, color=gcolor)

    G.graph_attr.update(dpi="300")

    fw = open("graph.dot", "w")
    G.write(fw)
    fw.close()

    fw = open("subgraph.dot", "w")
    SG.write(fw)
    fw.close()

Example #7

0

Show file

File: reformat.py Project: zhaotao1987/jcvi

 def get_blacklist(self, filename):
     black = SetFile(filename)
     for x in black:
         chr, rank = atg_name(x)
         self.black.add((chr, rank))

Example #8

0

Show file

def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        help="File containing gene list for UTR trim back" +
        "based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile, ) = args
    gff = make_index(gffile)

    trim_both = False if (opts.trim5 or opts.trim3) else True
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        for tr in trf:
            assert (len(tr.split("\t")) == 3
                    ), "Must specify (start, stop) coordinate range"
            id, start, stop = tr.split("\t")
            trimrange[id] = (int(start), int(stop))
        trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(featuretype="gene",
                                          order_by=("seqid", "start"),
                                          level=1):
        for c in feat:
            cid, ctype, cparent = (
                c.id,
                c.featuretype,
                c.attributes.get("Parent", [None])[0],
            )
            t5, t3 = False, False
            if ctype == "gene":
                t5 = True if cid in trim5 else False
                t3 = True if cid in trim3 else False
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                if any(id in trim5 for id in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(id in trim3 for id in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)
                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes["Parent"]
                                          [0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c,
                                          refc,
                                          trim5=t5,
                                          trim3=t3,
                                          both=trim_both)
                                if t5:
                                    utr_types.append("five_prime_UTR")
                                if t3:
                                    utr_types.append("three_prime_UTR")
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                            refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(
                                                region=utr,
                                                featuretype="exon"):
                                            if exon.attributes["Parent"][
                                                    0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        pass
                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    if _ctype not in utr_types:
                        if _ctype != "CDS":
                            if _ctype == "exon":
                                eskip = [
                                    range_overlap(to_range(cc), to_range(x))
                                    for x in extras if x.featuretype == "exon"
                                ]
                                if any(eskip):
                                    continue
                            trim(cc,
                                 start,
                                 end,
                                 trim5=t5,
                                 trim3=t3,
                                 both=trim_both)
                            fprint(cc, fw)
                        else:
                            fprint(cc, fw)
                for x in extras:
                    fprint(x, fw)
    fw.close()

Example #9

0

Show file

def maker(args):
    """
    %prog maker maker.gff3 genome.fasta

    Prepare EVM inputs by separating tracks from MAKER.
    """
    from jcvi.formats.base import SetFile, FileShredder

    A, T, P = "ABINITIO_PREDICTION", "TRANSCRIPT", "PROTEIN"
    # Stores default weights and types
    Registry = {\
        "maker": (A, 5),
        "augustus_masked": (A, 1),
        "snap_masked": (A, 1),
        "genemark": (A, 1),
        "est2genome": (T, 5),
        "est_gff": (T, 5),
        "protein2genome": (P, 5),
        "blastx": (P, 1)
    }

    p = OptionParser(maker.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, fastafile = args

    types = "type.ids"
    if need_update(gffile, types):
        cmd = "cut -f2 -s {0} | sort -u".format(gffile)
        sh(cmd, outfile=types)

    types = SetFile(types)
    reg = defaultdict(list)
    weightsfile = "weights.txt"
    contents = []
    for s in types:
        rs = s.split(":")[0]
        if rs not in Registry:
            continue

        type, weight = Registry[rs]
        reg[type].append(s)
        contents.append("\t".join(str(x) for x in (type, s, weight)))

    contents = "\n".join(sorted(contents))
    write_file(weightsfile, contents, meta="weights file")

    evs = [x + ".gff" for x in (A, T, P)]
    FileShredder(evs)

    for type, tracks in reg.items():
        for t in tracks:
            cmd = "grep '\t{0}' {1} | grep -v '_match\t' >> {2}.gff".format(t, gffile, type)
            sh(cmd)

    partition(evs)
    runfile = "run.sh"
    contents = EVMRUN.format(*evs)
    write_file(runfile, contents, meta="run script")

Example #10

0

Show file

File: fractionation.py Project: zhaotao1987/jcvi

def segment(args):
    """
    %prog segment loss.ids bedfile

    Merge adjacent gene loss into segmental loss.

    Then based on the segmental loss, estimate amount of DNA loss in base pairs.
    Two estimates can be given:
    - conservative: just within the start and end of a single gene
    - aggressive: extend the deletion track to the next gene

    The real deletion size is within these estimates.
    """
    from jcvi.formats.base import SetFile

    p = OptionParser(segment.__doc__)
    p.add_option("--chain",
                 default=1,
                 type="int",
                 help="Allow next N genes to be chained [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    idsfile, bedfile = args
    bed = Bed(bedfile)
    order = bed.order
    ids = SetFile(idsfile)
    losses = Grouper()
    skip = opts.chain
    for i, a in enumerate(bed):
        a = a.accn
        for j in xrange(i + 1, i + 1 + skip):
            if j >= len(bed):
                break
            b = bed[j].accn
            if a in ids:
                losses.join(a, a)
            if a in ids and b in ids:
                losses.join(a, b)

    losses = list(losses)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm, nt = len(singletons), len(segments), len(losses)
    assert ns + nm == nt

    # Summary for all segments
    for x in sorted(singletons) + sorted(segments):
        print "\t".join(
            str(x) for x in ("|".join(sorted(x)), len(x),
                             estimate_size(x, bed, order)))

    # Find longest segment stretch
    if segments:
        mx, maxsegment = max([(len(x), x) for x in segments])
        print >> sys.stderr, "Longest stretch: run of {0} genes".format(mx)
        print >> sys.stderr, "  {0}".format("|".join(sorted(maxsegment)))
        seg_asize = sum(estimate_size(x, bed, order) for x in segments)
        seg_bsize = sum(estimate_size(x, bed, order, conservative=False) \
                             for x in segments)
    else:
        seg_asize = seg_bsize = 0

    sing_asize = sum(estimate_size(x, bed, order) for x in singletons)
    sing_bsize = sum(estimate_size(x, bed, order, conservative=False) \
                           for x in singletons)
    total_asize = sing_asize + seg_asize
    total_bsize = sing_bsize + seg_bsize
    print >> sys.stderr, "Singleton ({0}): {1} - {2} bp".\
                         format(ns, sing_asize, sing_bsize)
    print >> sys.stderr, "Segment ({0}): {1} - {2} bp".\
                         format(nm, seg_asize, seg_bsize)
    print >> sys.stderr, "Total ({0}): {1} - {2} bp".\
                         format(nt, total_asize, total_bsize)
    print >> sys.stderr, "Average ({0}): {1} bp".\
                         format(nt, (total_asize + total_bsize) / 2)

Example #11

0

Show file

File: qc.py Project: arvin580/jcvi

def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str", \
        help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str", \
        help="File containing gene list for 3' UTR trimming")
    p.add_option("--trimrange", default=None, type="str", \
        help="File containing gene list for UTR trim back" + \
             "based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str", \
        help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    trim_both = False if (opts.trim5 or opts.trim3) else True
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        for tr in trf:
            assert len(tr.split("\t")) == 3, \
                "Must specify (start, stop) coordinate range"
            id, start, stop = tr.split("\t")
            trimrange[id] = (int(start), int(stop))
        trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(featuretype="gene", order_by=("seqid", "start"), level=1):
        for c in feat:
            cid, ctype, cparent = c.id, c.featuretype, \
                c.attributes.get('Parent', [None])[0]
            t5, t3 = False, False
            if ctype == "gene":
                t5 = True if cid in trim5 else False
                t3 = True if cid in trim3 else False
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                if any(id in trim5 for id in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(id in trim3 for id in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)
                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes['Parent'][0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3, both=trim_both)
                                if t5: utr_types.append('five_prime_UTR')
                                if t3: utr_types.append('three_prime_UTR')
                                for utr_type in utr_types:
                                    for utr in refgff.children(refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(region=utr, featuretype="exon"):
                                            if exon.attributes['Parent'][0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        pass
                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
                for cc in gff.children(cid, order_by=("start")):
                    _ctype = cc.featuretype
                    if _ctype not in utr_types:
                        if _ctype != "CDS":
                            if _ctype == "exon":
                                eskip = [range_overlap(to_range(cc), to_range(x)) \
                                    for x in extras if x.featuretype == 'exon']
                                if any(skip for skip in eskip): continue
                            trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                            fprint(cc, fw)
                        else:
                            fprint(cc, fw)
                for x in extras:
                    fprint(x, fw)
    fw.close()