Example #1
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize",
                 default=10,
                 type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
Example #2
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best",
                 default=1,
                 type="int",
                 help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    fp = open(m4file)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    retained = total = 0
    for row in fp:
        r = M4Line(row)
        total += 1
        if total % 100000 == 0:
            logging.debug("Retained {0} lines".\
                            format(percentage(retained, total)))
        if seen.get(r.query, 0) < best:
            fw.write(row)
            seen[r.query] += 1
            retained += 1
    fw.close()
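
M4Line is defined elsewhere in jcvi; for orientation, a minimal stand-in that parses just the fields used above might look like this (a sketch assuming blasr's -m 4 column order of qname, tname, score, ...; not the jcvi implementation):

class M4Line(object):
    def __init__(self, row):
        atoms = row.split()
        self.query = atoms[0]       # read (query) name, used for best-N screening
        self.subject = atoms[1]     # target name
        self.score = int(atoms[2])  # alignment score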
Example #3
File: age.py Project: xuanblo/jcvi
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        d1 = df.iloc[0].to_dict()  # first data row as a dict

        # ccn
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        js = json.load(open(filename))
        d1.update(js)
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
Example #4
File: qc.py Project: arvin580/jcvi
def nmd(args):
    """
    %prog nmd gffile

    Identify transcript variants which might be candidates for nonsense
    mediated decay (NMD)

    A transcript is considered to be a candidate for NMD when the CDS stop
    codon is located more than 50nt upstream of terminal splice site donor

    References:
    http://www.nature.com/horizon/rna/highlights/figures/s2_spec1_f3.html
    http://www.biomedcentral.com/1741-7007/7/23/figure/F1
    """
    from jcvi.utils.cbook import enumerate_reversed

    p = OptionParser(nmd.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    fw = must_open(opts.outfile, "w")
    for gene in gff.features_of_type('gene', order_by=('seqid', 'start')):
        _enumerate = enumerate if gene.strand == "-" else enumerate_reversed
        for mrna in gff.children(gene, featuretype='mRNA', order_by=('start')):
            tracker = dict()
            tracker['exon'] = list(gff.children(mrna, featuretype='exon', order_by=('start')))
            tracker['cds'] = [None] * len(tracker['exon'])

            tcds_pos = None
            for i, exon in _enumerate(tracker['exon']):
                for cds in gff.region(region=exon, featuretype='CDS', completely_within=True):
                    if mrna.id in cds['Parent']:
                        tracker['cds'][i] = cds
                        tcds_pos = i
                        break
                if tcds_pos is not None:
                    break  # found the CDS-bearing exon; 0 is a valid index

            NMD, distance = False, 0
            if tcds_pos is not None and (
                    (mrna.strand == "+" and tcds_pos + 1 < len(tracker['exon']))
                    or (mrna.strand == "-" and tcds_pos - 1 >= 0)):
                tcds = tracker['cds'][tcds_pos]
                texon = tracker['exon'][tcds_pos]

                PTC = tcds.end if mrna.strand == '+' else tcds.start
                TDSS = texon.end if mrna.strand == '+' else texon.start
                distance = abs(TDSS - PTC)
                NMD = distance > 50

            print("\t".join(str(x) for x in (gene.id, mrna.id,
                  gff.children_bp(mrna, child_featuretype='CDS'), distance, NMD)),
                  file=fw)

    fw.close()
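
The 50-nt rule above reduces to simple coordinate arithmetic; a toy plus-strand illustration with made-up numbers:

# CDS stop codon (PTC) at 1200, end of its exon (splice donor, TDSS) at 1300
PTC, TDSS = 1200, 1300
distance = abs(TDSS - PTC)  # 100 nt
NMD = distance > 50         # True: flagged as an NMD candidate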
Example #5
def mstmap(args):
    """
    %prog mstmap LMD50.snps.genotype.txt

    Convert LMDs to MSTMAP input.
    """
    from jcvi.assembly.geneticmap import MSTMatrix

    p = OptionParser(mstmap.__doc__)
    p.add_option("--population_type", default="RIL6",
                 help="Type of population, possible values are DH and RILd")
    p.add_option("--missing_threshold", default=.5,
                 help="Missing threshold, .25 excludes any marker with >25% missing")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    lmd, = args
    fp = open(lmd)
    next(fp)  # Header
    table = {"0": "-", "1": "A", "2": "B", "3": "X"}
    mh = ["locus_name"] + next(fp).split()[4:]
    genotypes = []
    for row in fp:
        atoms = row.split()
        chr, pos, ref, alt = atoms[:4]
        locus_name = ".".join((chr, pos))
        codes = [table[x] for x in atoms[4:]]
        genotypes.append([locus_name] + codes)

    mm = MSTMatrix(genotypes, mh, opts.population_type, opts.missing_threshold)
    mm.write(opts.outfile, header=True)
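
For illustration, here is how one hypothetical input row would be recoded under the table above:

# Input row:   chr1  12345  A  G  1  2  0  3
# locus_name:  "chr1.12345"
# codes:       ["A", "B", "-", "X"]  (1->A, 2->B, 0->missing, 3->X)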
Example #6
def mergemat(args):
    """
    %prog mergemat *.npy

    Combine counts from multiple .npy data files.
    """
    p = OptionParser(mergemat.__doc__)
    p.set_outfile(outfile="out")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    npyfiles = args
    A = np.load(npyfiles[0])
    logging.debug("Load `{}`: matrix of shape {}:; sum={}"
                  .format(npyfiles[0], A.shape, A.sum()))
    for npyfile in npyfiles[1:]:
        B = np.load(npyfile)
        A += B
        logging.debug("Load `{}`: sum={}"
                      .format(npyfiles[0], A.sum()))

    pf = opts.outfile
    np.save(pf, A)
    logging.debug("Combined {} files into `{}.npy`".format(len(npyfiles), pf))
Example #7
File: ahrd.py Project: zjwang6/jcvi
def fix(args):
    """
    %prog fix ahrd.csv > ahrd.fixed.csv

    Fix ugly names from Uniprot.
    """
    p = OptionParser(fix.__doc__)
    p.add_option("--ignore_sym_pat", default=False, action="store_true",
        help="Do not fix names matching symbol patterns i.e." + \
        " names beginning or ending with gene symbols or a series of numbers." + \
        " e.g. `ARM repeat superfamily protein`, `beta-hexosaminidase 3`," + \
        " `CYCLIN A3;4`, `WALL ASSOCIATED KINASE (WAK)-LIKE 10`")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    csvfile, = args
    fp = open(csvfile)
    fw = must_open(opts.outfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        if row.strip() == "":
            continue
        atoms = row.rstrip("\r\n").split("\t")
        name, hit, ahrd_code, desc = atoms[:4] \
                if len(atoms) > 2 else \
                (atoms[0], None, None, atoms[-1])

        newdesc = fix_text(desc, ignore_sym_pat=opts.ignore_sym_pat)
        if hit and hit.strip() != "" and newdesc == Hypothetical:
            newdesc = "conserved " + newdesc
        print("\t".join(atoms[:4] + [newdesc] + atoms[4:]), file=fw)
Example #8
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    from jcvi.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    nlines = N * 4
    fastqfiles = args[1:]
    fastqfile = fastqfiles[0]
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    gz = fastqfile.endswith(".gz")
    for fastqfile in fastqfiles:
        if gz:
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)

        sh(cmd, outfile=opts.outfile, append=True)
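
nlines is N * 4 because each FASTQ record spans exactly four lines. A pure-Python sketch of the same head operation (illustrative, not the jcvi code path):

from itertools import islice
with open("sample.fastq") as fp, open("first.fastq", "w") as fw:
    fw.writelines(islice(fp, 100 * 4))  # first 100 reads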
Example #9
def freq(args):
    """
    %prog freq fastafile bamfile

    Call SNP frequencies and generate GFF file.
    """
    p = OptionParser(freq.__doc__)
    p.add_option("--mindepth",
                 default=3,
                 type="int",
                 help="Minimum depth [default: %default]")
    p.add_option("--minqual",
                 default=20,
                 type="int",
                 help="Minimum quality [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    cmd = "freebayes -f {0} --pooled-continuous {1}".format(fastafile, bamfile)
    cmd += " -F 0 -C {0}".format(opts.mindepth)
    cmd += ' | vcffilter -f "QUAL > {0}"'.format(opts.minqual)
    cmd += " | vcfkeepinfo - AO RO TYPE"
    sh(cmd, outfile=opts.outfile)
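
With the default options (--mindepth=3, --minqual=20), the assembled pipeline is equivalent to this shell command (illustrative file names):

# freebayes -f ref.fasta --pooled-continuous aln.bam -F 0 -C 3 \
#     | vcffilter -f "QUAL > 20" | vcfkeepinfo - AO RO TYPE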
Example #10
def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to MAC genome. Output a bedfile with
    'lesions' (stack of broken reads) in the MAC genome.
    """
    p = OptionParser(insertion.__doc__)
    p.add_option("--mindepth", default=6, type="int",
                 help="Minimum depth to call an insertion")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    bed = Bed(bedfile)
    fw = must_open(opts.outfile, "w")
    for seqid, feats in bed.sub_beds():
        left_ends = Counter([x.start for x in feats])
        right_ends = Counter([x.end for x in feats])
        selected = []
        for le, count in left_ends.items():
            if count >= mindepth:
                selected.append((seqid, le, "LE-{0}".format(le), count))
        for re, count in right_ends.items():
            if count >= mindepth:
                selected.append((seqid, re, "RE-{0}".format(re), count))
        selected.sort()
        for seqid, pos, label, count in selected:
            label = "{0}-r{1}".format(label, count)
            print("\t".join((seqid, str(pos - 1), str(pos), label)), file=fw)
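
The lesion calling is a simple tally of identical breakpoints; a toy Counter run with made-up coordinates:

from collections import Counter
left_ends = Counter([100, 100, 100, 250])  # hypothetical read start positions
# left_ends[100] == 3, so with --mindepth 3 a left-end lesion is called
# at position 100; position 250 (count 1) is ignored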
Example #11
def fix(args):
    """
    %prog fix ahrd.csv > ahrd.fixed.csv

    Fix ugly names from Uniprot.
    """
    p = OptionParser(fix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    csvfile, = args
    fp = open(csvfile)
    fw = must_open(opts.outfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        if row.strip() == "":
            continue
        atoms = row.rstrip("\r\n").split("\t")
        name, hit, ahrd_code, desc = atoms[:4] \
                if len(atoms) > 2 else \
                (atoms[0], None, None, atoms[-1])
        newdesc = fix_text(desc)
        if hit and hit.strip() != "" and newdesc == Hypothetical:
            newdesc = "conserved " + newdesc
        print("\t".join(atoms[:4] + [newdesc] + atoms[4:]), file=fw)
Example #12
def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    p = OptionParser(merge.__doc__)
    p.set_outfile(outfile="merged_results.delta")
    opts, args = p.parse_args(args)

    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, query = args[:2]
    deltafiles = args[2:]
    outfile = opts.outfile

    ref = get_abs_path(ref)
    query = get_abs_path(query)
    fw = must_open(outfile, "w")
    print(" ".join((ref, query)), file=fw)
    print("NUCMER", file=fw)
    fw.close()

    for d in deltafiles:
        # NR > 2 skips each delta file's own two-line header; the braces are
        # doubled because the string goes through str.format()
        cmd = "awk 'NR > 2 {{print $0}}' {0}".format(d)
        sh(cmd, outfile=outfile, append=True)
Example #13
def mergecsv(args):
    """
    %prog mergecsv *.tsv

    Merge a set of tsv files.
    """
    p = OptionParser(mergecsv.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    tsvfiles = args
    outfile = opts.outfile

    if op.exists(outfile):
        os.remove(outfile)

    fw = must_open(opts.outfile, "w")
    for i, tsvfile in enumerate(tsvfiles):
        fp = open(tsvfile)
        if i > 0:
            next(fp)  # all files share a header; keep only the first file's
        for row in fp:
            fw.write(row)
    fw.close()
Example #14
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    p = OptionParser(agp.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())
    anchored = set()

    for ofile in orderingfiles:
        co = ContigOrdering(ofile)
        anchored |= set([x.contig_name for x in co])
        obj = op.basename(ofile).split('.')[0]
        co.write_agp(obj, sizes, fwagp)

    singletons = contigs - anchored
    logging.debug('Anchored: {}, Singletons: {}'.\
                  format(len(anchored), len(singletons)))

    for s in natsorted(singletons):
        order_to_agp(s, [(s, "?")], sizes, fwagp)
Example #15
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.
    """
    p = OptionParser(trimUTR.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    g = make_index(gffile)
    gff = Gff(gffile)
    mRNA_register = {}
    fw = must_open(opts.outfile, "w")
    for c in gff:
        cid, ctype = c.accn, c.type
        if ctype == "gene":
            start, end = get_cds_minmax(g, cid)
            trim(c, start, end)
        elif ctype == "mRNA":
            start, end = get_cds_minmax(g, cid, level=1)
            trim(c, start, end)
            mRNA_register[cid] = (start, end)
        elif ctype != "CDS":
            start, end = mRNA_register[c.parent]
            trim(c, start, end)
        if c.start > c.end:
            print(cid, "destroyed [{0} > {1}]".format(c.start, c.end),
                  file=sys.stderr)
        else:
            print(c, file=fw)
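
get_cds_minmax is defined elsewhere in jcvi; a sketch of the idea, assuming a gffutils FeatureDB g (hypothetical helper, for illustration only):

def get_cds_minmax(g, cid, level=2):
    # span of all CDS children: trimming the parent to it removes the UTRs
    cds = list(g.children(cid, featuretype="CDS", level=level))
    return min(c.start for c in cds), max(c.end for c in cds)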
Example #16
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Multiple anchor files may be given.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print(",".join(sorted(g)), file=fw)
    fw.close()

    return outfile
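
Grouper (from jcvi.utils.grouper) is a disjoint-set helper: joins are transitive, so anchor pairs (a, b) and (b, c) collapse into a single ortho-group. A sketch with hypothetical members:

g = Grouper()
g.join("a", "b")
g.join("b", "c")
# iterating g now yields a single group containing "a", "b" and "c"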
Example #17
def silicosoma(args):
    """
    %prog silicosoma in.silico > out.soma

    Convert .silico to .soma file.

    Format of .silico
        A text file containing in-silico digested contigs. This file contains pairs
    of lines. The first line in each pair contains an identifier, the contig
    length in bp, and the number of restriction sites, separated by white space.
    The second line contains a white space delimited list of the restriction
    site positions.

    Format of .soma
        Each line of the text file contains two decimal numbers: The size of the
    fragment and the standard deviation (both in kb), separated by white space.
    The standard deviation is ignored.
    """
    p = OptionParser(silicosoma.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (silicofile,) = args
    fp = must_open(silicofile)
    fw = must_open(opts.outfile, "w")
    next(fp)
    positions = [int(x) for x in next(fp).split()]
    for a, b in pairwise(positions):
        assert a <= b
        fragsize = int(round((b - a) / 1000.0))  # kb
        if fragsize:
            print(fragsize, 0, file=fw)
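
A hypothetical .silico record and the .soma lines it would produce:

# .silico input (identifier, length in bp, site count; then positions):
#   contig_1 50000 3
#   10000 22000 47000
# pairwise gaps are 12000 and 25000 bp, so the output is:
#   12 0
#   25 0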
Example #18
def dump(args):
    """
    %prog dump fastafile

    Convert FASTA sequences to list of K-mers.
    """
    p = OptionParser(dump.__doc__)
    p.add_option("-K",
                 default=23,
                 type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    K = opts.K
    fw = must_open(opts.outfile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print("\n".join(kmers), file=fw)
    fw.close()
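
make_kmers is defined elsewhere in the module; a minimal sketch of the sliding-window idea (illustrative only):

def make_kmers(seq, K):
    seq = str(seq)
    for i in range(len(seq) - K + 1):
        yield seq[i:i + K]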
Example #19
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy number based on CANVAS results.
    """
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed = args[0]
    canvasvcfs = args[1:]
    tsvfile = opts.outfile
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    set_tempdir(tmpdir)

    df = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    for suffix in (".avgcn", ".medcn"):
        df_to_tsv(df, tsvfile, suffix)
Example #20
File: bed.py Project: radaniba/jcvi
def index(args):
    """
    %prog index bedfile

    Compress the sorted bedfile and index it using `tabix`.
    """
    p = OptionParser(index.__doc__)
    p.add_option("--query", help="Chromosome location [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    gzfile = bedfile + ".gz"

    if need_update(bedfile, gzfile):
        bedfile = sort([bedfile])
        cmd = "bgzip -c {0}".format(bedfile)
        sh(cmd, outfile=gzfile)

    tbifile = gzfile + ".tbi"

    if need_update(gzfile, tbifile):
        cmd = "tabix -p bed {0}".format(gzfile)
        sh(cmd)

    query = opts.query
    if not query:
        return

    cmd = "tabix {0} {1}".format(gzfile, query)
    sh(cmd, outfile=opts.outfile)
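
The equivalent shell session, for orientation (hypothetical file names):

# bgzip -c sorted.bed > sorted.bed.gz
# tabix -p bed sorted.bed.gz
# tabix sorted.bed.gz chr1:100-200   # what --query runs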
Example #21
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    fp = open(binfile)
    a = bitarray()
    a.fromfile(fp)
    f = Sizes(fastafile)
    tsize = 0
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        ksize = seqlen - K + 1  # one bit per k-mer start position
        b = a[tsize:tsize + ksize]  # this sequence's slice of the bit array
        bcount = b.count()
        print("\t".join(str(x) for x in (name, bcount)), file=fw)
        tsize += ksize
Example #22
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only the first instance of duplicate reads. A duplicate is defined
    as a read having the same name as one already seen.
    """
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()
    for rec in iter_fastq(fastqfile):
        nreads += 1
        if rec is None:
            break
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        print(rec, file=fw)
    logging.debug("Removed duplicate reads: {}".\
                  format(percentage(nduplicates, nreads)))
Example #23
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    from jcvi.utils.cbook import percentage

    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        nreads += 1
        if rec is None:
            break
        if rec.seq.endswith(sf):
            print(rec, file=fw)
            nselected += 1
    logging.debug("Selected reads with suffix {0}: {1}".\
                  format(sf, percentage(nselected, nreads)))
Example #24
def fromaligns(args):
    """
    %prog fromaligns out.aligns

    Convert aligns file (old MCscan output) to anchors file.
    """
    p = OptionParser(fromaligns.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    alignsfile, = args
    fp = must_open(alignsfile)
    fw = must_open(opts.outfile, "w")
    for row in fp:
        if row.startswith("## Alignment"):
            print("###", file=fw)
            continue
        if row[0] == '#' or not row.strip():
            continue
        atoms = row.split(':')[-1].split()
        print("\t".join(atoms[:2]), file=fw)
    fw.close()
Example #25
def mini(args):
    """
    %prog mini bamfile minibamfile

    Prepare mini-BAMs that contain only the STR loci.
    """
    p = OptionParser(mini.__doc__)
    p.add_option("--pad",
                 default=20000,
                 type="int",
                 help="Add padding to the STR reigons")
    p.add_option("--treds",
                 default=None,
                 help="Extract specific treds, use comma to separate")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, minibam = args
    treds = opts.treds.split(",") if opts.treds else None
    pad = opts.pad
    bedfile = make_STR_bed(pad=pad, treds=treds)

    get_minibam_bed(bamfile, bedfile, minibam)
    logging.debug("Mini-BAM written to `{}`".format(minibam))
Example #26
def cat(args):
    """
    %prog cat *.pdf -o output.pdf

    Concatenate pages from pdf files into a single pdf file.

    Page ranges refer to the previously-named file.
    A file not followed by a page range means all the pages of the file.

    PAGE RANGES are like Python slices.
            {page_range_help}
    EXAMPLES
        pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1
            Concatenate all of head.pdf, all but page seven of content.pdf,
            and the last page of tail.pdf, producing output.pdf.

        pdfcat chapter*.pdf >book.pdf
            You can specify the output file by redirection.

        pdfcat chapter?.pdf chapter10.pdf >book.pdf
            In case you don't want chapter 10 before chapter 2.
    """
    p = OptionParser(cat.__doc__.format(page_range_help=PAGE_RANGE_HELP))
    p.add_option("--nosort",
                 default=False,
                 action="store_true",
                 help="Do not sort file names")
    p.set_outfile()
    p.set_verbose(help="Show page ranges as they are being read")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    outfile = opts.outfile
    if outfile in args:
        args.remove(outfile)

    if not opts.nosort:
        args = natsorted(args)

    filename_page_ranges = parse_filename_page_ranges(args)
    verbose = opts.verbose
    fw = must_open(outfile, "wb")

    merger = PdfFileMerger()
    in_fs = {}
    try:
        for (filename, page_range) in filename_page_ranges:
            if verbose:
                print(filename, page_range, file=sys.stderr)
            if filename not in in_fs:
                in_fs[filename] = open(filename, "rb")
            merger.append(in_fs[filename], pages=page_range)
    except:
        print(traceback.format_exc(), file=sys.stderr)
        print("Error while reading " + filename, file=sys.stderr)
        sys.exit(1)
    merger.write(fw)
    fw.close()
Example #27
def augustus(args):
    """
    %prog augustus augustus.gff3 > reformatted.gff3

    AUGUSTUS does generate a GFF3 file (--gff3=on), but the output needs some refinement.
    """
    from jcvi.formats.gff import Gff

    p = OptionParser(augustus.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ingff3, = args
    gff = Gff(ingff3)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    for g in gff:
        if g.type not in ("gene", "transcript", "CDS"):
            continue

        if g.type == "transcript":
            g.type = "mRNA"

        prefix = g.seqid + "_"
        pid = prefix + g.id
        newid = "{0}-{1}".format(pid, seen[pid]) if pid in seen else pid
        seen[pid] += 1
        g.attributes["ID"] = [newid]
        g.attributes["Parent"] = [(prefix + x) for x in g.attributes["Parent"]]
        g.update_attributes()
        print(g, file=fw)
    fw.close()
Example #28
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI

    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    enzymes = enzymes.split(",")
    enzymes = [x for x in AllEnzymes if str(x) in enzymes]
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")

    header = ["Contig", "Length"] + [str(x) for x in enzymes]
    print("\t".join(header), file=fw)
    for name, rec in f.iteritems_ordered():
        row = [name, len(rec)]
        for e in enzymes:
            pos = e.search(rec.seq)
            pos = "na" if not pos else "|".join(str(x) for x in pos)
            row.append(pos)
        print("\t".join(str(x) for x in row), file=fw)
Example #29
def gff(args):
    """
    %prog gff pslfile

    Convert to gff format.
    """
    p = OptionParser(gff.__doc__)
    p.add_option("--source", default="GMAP",
                 help="specify GFF source [default: %default]")
    p.add_option("--type", default="EST_match",
                help="specify GFF feature type [default: %default]")
    p.add_option("--suffix", default=".match",
                 help="match ID suffix [default: \"%default\"]")
    p.add_option("--swap", default=False, action="store_true",
                 help="swap query and target features [default: %default]")
    p.add_option("--simple_score", default=False, action="store_true",
                 help="calculate a simple percent score [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pslfile, = args
    fw = must_open(opts.outfile, "w")

    print("##gff-version 3", file=fw)
    psl = Psl(pslfile)
    for p in psl:
        if opts.swap:
            p.swap

        psl.trackMatches(p.qName)
        # switch from 0-origin to 1-origin
        p.qStart += 1
        p.tStart += 1

        print(p.gffline(source=opts.source, type=opts.type, suffix=opts.suffix, \
                primary_tag="ID", alt_score=opts.simple_score, \
                count=psl.getMatchCount(p.qName)), file=fw)

        # create an empty PslLine() object and load only
        # the targetName, queryName and strand info
        part = PslLine("\t".join(str(x) for x in [0] * p.nargs))
        part.tName, part.qName, part.strand = p.tName, p.qName, p.strand

        nparts = len(p.qStarts)
        for n in range(nparts):
            part.qStart, part.tStart, aLen = p.qStarts[n] + 1, p.tStarts[n] + 1, p.blockSizes[n]
            part.qEnd = part.qStart + aLen - 1
            part.tEnd = part.tStart + aLen - 1

            if part.strand == "-":
                part.qStart = p.qSize - (p.qStarts[n] + p.blockSizes[n]) + 1
                part.qEnd = p.qSize - p.qStarts[n]

            print(part.gffline(source=opts.source, suffix=opts.suffix, \
                    count=psl.getMatchCount(part.qName)), file=fw)
Example #30
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    p = OptionParser(coverage.__doc__)
    p.add_option(
        "--format",
        default="bigwig",
        choices=("bedgraph", "bigwig", "coverage"),
        help="Output format",
    )
    p.add_option("--nosort",
                 default=False,
                 action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    format = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if format in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if format == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".format(bedgraphfile, sizesfile,
                                                    bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
Example #31
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend, send = send, qend
                qaccn, saccn = saccn, qaccn
            if scale:
                sstart /= scale
            try:
                newsseqid = get_number(sseqid)
            except ValueError:
                raise ValueError("`{0}` is on `{1}` with no number to extract"
                                 .format(saccn, sseqid))
            bedline = "\t".join(str(x) for x in (qseqid, qstart - 1, qend,
                                "{0}:{1}".format(newsseqid, sstart)))
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
Example #32
File: ks.py Project: ascendo/jcvi
def prepare(args):
    """
    %prog prepare pairsfile cdsfile [pepfile] -o paired.cds.fasta

    Pick sequences from cdsfile to form pairs, ready for Ks calculation. The
    pairsfile can be generated from formats.blast.cscore(). The first two
    columns contain the pair.
    """
    from jcvi.formats.fasta import Fasta

    p = OptionParser(prepare.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)
    outfile = opts.outfile

    if len(args) == 2:
        pairsfile, cdsfile = args
        pepfile = None
    elif len(args) == 3:
        pairsfile, cdsfile, pepfile = args
    else:
        sys.exit(not p.print_help())

    f = Fasta(cdsfile)
    fp = open(pairsfile)
    fw = must_open(outfile, "w")
    if pepfile:
        assert outfile != "stdout", "Please specify outfile name."
        f2 = Fasta(pepfile)
        fw2 = must_open(outfile + ".pep", "w")
    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()[:2]
        if a == b:
            logging.debug("Self pairs found: {0} - {1}. Ignored".format(a, b))
            continue

        if a not in f:
            a = find_first_isoform(a, f)
            assert a, a
        if b not in f:
            b = find_first_isoform(b, f)
            assert b, b

        acds = f[a]
        bcds = f[b]
        SeqIO.write((acds, bcds), fw, "fasta")
        if pepfile:
            apep = f2[a]
            bpep = f2[b]
            SeqIO.write((apep, bpep), fw2, "fasta")
    fw.close()
    if pepfile:
        fw2.close()
Example #33
File: bed.py Project: yangjl/jcvi
def flanking(args):
    """
    %prog flanking bedfile [options]

    Get up to n features (upstream or downstream or both) flanking a given position.
    """
    from numpy import array, argsort

    p = OptionParser(flanking.__doc__)
    p.add_option("--chrom", default=None, type="string",
            help="chrom name of the position in query. Make sure it matches bedfile.")
    p.add_option("--coord", default=None, type="int",
            help="coordinate of the position in query.")
    p.add_option("-n", default=10, type="int",
            help="number of flanking features to get [default: %default]")
    p.add_option("--side", default="both", choices=("upstream", "downstream", "both"),
            help="which side to get flanking features [default: %default]")
    p.add_option("--max_d", default=None, type="int",
            help="features <= max_d away from position [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if any([len(args) != 1, opts.chrom is None, opts.coord is None]):
        sys.exit(not p.print_help())

    bedfile, = args
    position = (opts.chrom, opts.coord)
    n, side, maxd = opts.n, opts.side, opts.max_d

    chrombed = Bed(bedfile).sub_bed(position[0])

    if side == "upstream":
        data = [(abs(f.start-position[1]), f) for f in chrombed \
            if f.start <= position[1]]
    elif side == "downstream":
        data = [(abs(f.start-position[1]), f) for f in chrombed \
            if f.start >= position[1]]
    else:
        data = [(abs(f.start-position[1]), f) for f in chrombed]

    if maxd:
        data = [f for f in data if f[0]<=maxd]

    n += 1 # not counting self
    n = min(n, len(data))
    distances, subbed = zip(*data)
    distances = array(distances)
    idx = argsort(distances)[:n]
    flankingbed = [f for (i, f) in enumerate(subbed) if i in idx]

    fw = must_open(opts.outfile, "w")
    for atom in flankingbed:
        print(str(atom), file=fw)

    return (position, flankingbed)
Example #34
File: ks.py Project: ascendo/jcvi
def gc3(args):
    """
    %prog gc3 ksfile cdsfile [cdsfile2] -o newksfile

    Filter the Ks results to remove high GC3 genes. High GC3 genes are
    problematic in Ks calculation - see Tang et al. 2010 PNAS. Specifically, the
    two calculation methods produce drastically different results for these
    pairs. Therefore we advise removing these high GC3 genes. This is often
    the case when studying cereal genes.

    If 2 genomes are involved, the cdsfile of the 2nd genome can be provided
    concatenated or separated.
    """
    p = OptionParser(gc3.__doc__)
    p.add_option("--plot", default=False, action="store_true",
                 help="Also plot the GC3 histogram [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)

    outfile = opts.outfile
    plot = opts.plot

    if not 1 < len(args) < 4:
        sys.exit(not p.print_help())

    ks_file, cdsfile = args[:2]
    GC3 = get_GC3(cdsfile)
    if plot:
        plot_GC3(GC3, cdsfile, fill="green")

    if len(args) == 3:
        cdsfile2 = args[2]
        GC3_2 = get_GC3(cdsfile2)
        GC3.update(GC3_2)
        if plot:
            plot_GC3(GC3_2, cdsfile2, fill="lightgreen")

    data = KsFile(ks_file)
    noriginals = len(data)

    fw = must_open(outfile, "w")
    writer = csv.writer(fw)
    writer.writerow(fields.split(","))
    nlines = 0
    cutoff = .75
    for d in data:
        a, b = d.name.split(";")
        aratio, bratio = GC3[a], GC3[b]
        if (aratio + bratio) / 2 > cutoff:
            continue
        writer.writerow(d)
        nlines += 1
    logging.debug("{0} records written (from {1}).".format(nlines, noriginals))
Example #35
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    sep = opts.sep
    align = opts.align
    cc = Map(mapbed)
    mapnames = cc.mapnames
    s = Sizes(scaffolds)
    total, l50, n50 = s.summary
    r = {}
    maps = []

    fw = must_open(opts.outfile, "w")
    print >> fw, "*** Summary for each individual map ***"
    for mapname in mapnames:
        markers = [x for x in cc if x.mapname == mapname]
        ms = MapSummary(markers, l50, s)
        r["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(r, mapname, total)
        maps.append(ms)
    print(tabulate(r, sep=sep, align=align), file=fw)

    r = {}
    agp = AGP(chr_agp)
    print >> fw, "*** Summary for consensus map ***"
    consensus_scaffolds = set(x.component_id for x in agp if not x.is_gap)
    oriented_scaffolds = set(x.component_id for x in agp \
                            if (not x.is_gap) and x.orientation != '?')
    unplaced_scaffolds = set(s.mapping.keys()) - consensus_scaffolds

    for mapname, sc in (("Anchored", consensus_scaffolds),
                    ("Oriented", oriented_scaffolds),
                    ("Unplaced", unplaced_scaffolds)):
        markers = [x for x in cc if x.seqid in sc]
        ms = MapSummary(markers, l50, s, scaffolds=sc)
        ms.export_table(r, mapname, total)
    print(tabulate(r, sep=sep, align=align), file=fw)