コード例 #1
0
ファイル: stats.py プロジェクト: zhaotao1987/jcvi
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring above doubles as the usage text handed to OptionParser,
    # so it is kept verbatim; implementation notes live in comments instead.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # `not p.print_help()` is truthy when print_help() returns None (as
        # optparse's does), so a usage error exits with a non-zero status.
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    # One list of sequences per category; they feed both the size statistics
    # and the % GC computation below.
    geneseqs, exonseqs, intronseqs = [], [], []
    for f in g.features_of_type("gene"):
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)

        # The set collapses exons that show up more than once with identical
        # coordinates among the children returned for this gene.
        exons = list(set((c.chrom, c.start, c.stop)
                         for c in g.children(f.id, 2)
                         if c.featuretype == "exon"))
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)

        # Introns are the intervals range_interleave() reports for the exons.
        for chrom, start, stop in range_interleave(exons):
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report, keyed by (category, statistic name)
    for t, tseqs in zip(("Gene", "Exon", "Intron"),
                        (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum,
                                               precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum,
                                         s.totalsize,
                                         precision=0,
                                         mode=-1)
        r[t, "% GC"] = gc(tseqs)

    # Write through the stream instead of using the Python 2 print statement:
    # under Python 3 `print >> sys.stderr, x` is an expression that raises
    # TypeError, whereas write() behaves the same on both versions.
    sys.stderr.write(str(tabulate(r)) + "\n")
コード例 #2
0
ファイル: stats.py プロジェクト: tanghaibao/jcvi
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # Usage text comes from the docstring above, hence it is left untouched.
    parser = OptionParser(summary.__doc__)
    _opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    gff_file, ref = args
    genome = Fasta(ref)
    index = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Look up one interval of the reference through Fasta.sequence().
        return genome.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences gathered per category; used for sizes and for % GC.
    gene_seqs, exon_seqs, intron_seqs = [], [], []
    for gene in index.features_of_type("gene"):
        gene_id = gene.id
        gene_seqs.append(fetch(gene.chrom, gene.start, gene.stop))

        # Unique exon coordinates among the children returned for this gene.
        exon_coords = list(set(
            (child.chrom, child.start, child.stop)
            for child in index.children(gene_id, 2)
            if child.featuretype == "exon"
        ))
        exon_seqs.extend(
            fetch(chrom, start, stop) for chrom, start, stop in exon_coords
        )
        # Intron intervals are whatever range_interleave() yields for them.
        intron_seqs.extend(
            fetch(chrom, start, stop)
            for chrom, start, stop in range_interleave(exon_coords)
        )

    # Report table, keyed by (category, statistic name).
    report = {}
    categories = (
        ("Gene", gene_seqs),
        ("Exon", exon_seqs),
        ("Intron", intron_seqs),
    )
    for label, seqs in categories:
        stats = SummaryStats([len(seq) for seq in seqs], dtype="int")
        report.update({
            (label, "Number"): stats.size,
            (label, "Average size (bp)"): stats.mean,
            (label, "Median size (bp)"): stats.median,
            (label, "Total length (Mb)"): human_size(
                stats.sum, precision=0, target="Mb"
            ),
            (label, "% of genome"): percentage(
                stats.sum, genome.totalsize, precision=0, mode=-1
            ),
            (label, "% GC"): gc(seqs),
        })

    print(tabulate(report), file=sys.stderr)
コード例 #3
0
ファイル: agp.py プロジェクト: bennyyu/jcvi
    opts, args = p.parse_args(args)

    try:
        agpfile, componentfasta, targetfasta = args
    except Exception, e:
        sys.exit(p.print_help())

    agp = AGP(agpfile)
    build = Fasta(targetfasta)
    bacs = Fasta(componentfasta, index=False)

    # go through this line by line
    for aline in agp:
        try:
            build_seq = build.sequence(dict(chr=aline.object,
                start=aline.object_beg, stop=aline.object_end))

            if aline.is_gap:
                assert build_seq.upper() == aline.gap_length * 'N', \
                    "gap mismatch: %s" % aline
            else:
                bac_seq = bacs.sequence(dict(chr=aline.component_id,
                    start=aline.component_beg, stop=aline.component_end,
                    strand=aline.orientation))

                assert build_seq.upper() == bac_seq.upper(), \
                        "sequence mismatch: %s" % aline

            logging.debug("%s:%d-%d verified" % (aline.object,
                aline.object_beg, aline.object_end))
コード例 #4
0
ファイル: snp.py プロジェクト: Hensonmw/jcvi
def frommaf(args):
    """
    %prog frommaf maffile

    Convert to four-column tabular format from MAF.
    """
    # The docstring above is passed to OptionParser as the usage text, so it
    # is kept verbatim. What the code does: read whitespace-separated rows
    # whose first four fields are (chromosome, position, ref, alt) and write
    # them as VCF records to `<maffile minus extension>.vcf`. With --validate
    # the written file is re-read and each REF is checked against a FASTA.
    p = OptionParser(frommaf.__doc__)
    p.add_option("--validate",
                 help="Validate coordinates against FASTA [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    maf, = args
    snpfile = maf.rsplit(".", 1)[0] + ".vcf"

    # Constant values for the VCF columns the input does not supply. Named
    # with a prefix so the builtins `id` and `filter` are not shadowed.
    vcf_id = "."
    vcf_qual = 20
    vcf_filter = "PASS"
    vcf_info = "DP=20"

    total = 0
    # Context managers guarantee both handles are closed, also on error.
    with open(maf) as fp, open(snpfile, "w") as fw:
        fw.write("##fileformat=VCFv4.0\n")
        fw.write("#CHROM POS ID REF ALT QUAL FILTER INFO".replace(" ", "\t")
                 + "\n")
        for row in fp:
            atoms = row.split()
            if len(atoms) < 4:
                # Blank or truncated line: skip it like any other non-SNP row
                # instead of failing on the tuple unpacking below.
                continue
            c, pos, ref, alt = atoms[:4]
            try:
                c = int(c)
            except ValueError:
                # Rows whose first field is not an integer are skipped.
                continue
            c = "chr{0:02d}".format(c)
            pos = int(pos)
            record = (c, pos, vcf_id, ref, alt, vcf_qual, vcf_filter, vcf_info)
            fw.write("\t".join(str(x) for x in record) + "\n")
            total += 1

    validate = opts.validate
    if not validate:
        return

    from jcvi.utils.cbook import percentage

    f = Fasta(validate)
    nsnps = 0
    with open(snpfile) as fp:
        for row in fp:
            if row[0] == '#':
                continue

            c, pos, _id, ref, alt, _qual, _filter, _info = row.split("\t")
            pos = int(pos)
            feat = dict(chr=c, start=pos, stop=pos)
            s = str(f.sequence(feat))
            if s != ref:
                # Raised explicitly (rather than via `assert`) so the check is
                # not stripped under `python -O`; exception type is unchanged.
                raise AssertionError(
                    "Validation error: {0} is {1} (expect: {2})".
                    format(feat, s, ref))
            nsnps += 1
            if nsnps % 50000 == 0:
                logging.debug("SNPs parsed: {0}".
                              format(percentage(nsnps, total)))
    logging.debug("A total of {0} SNPs validated and written to `{1}`.".
                  format(nsnps, snpfile))
コード例 #5
0
ファイル: snp.py プロジェクト: zhaotao1987/jcvi
def frommaf(args):
    """
    %prog frommaf maffile

    Convert to four-column tabular format from MAF.
    """
    # The docstring above is passed to OptionParser as the usage text, so it
    # is kept verbatim. What the code does: read whitespace-separated rows
    # whose first four fields are (chromosome, position, ref, alt) and write
    # them as VCF records to `<maffile minus extension>.vcf`. With --validate
    # the written file is re-read and each REF is checked against a FASTA.
    p = OptionParser(frommaf.__doc__)
    p.add_option("--validate",
                 help="Validate coordinates against FASTA [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    maf, = args
    snpfile = maf.rsplit(".", 1)[0] + ".vcf"

    # Constant values for the VCF columns the input does not supply. Named
    # with a prefix so the builtins `id` and `filter` are not shadowed.
    vcf_id = "."
    vcf_qual = 20
    vcf_filter = "PASS"
    vcf_info = "DP=20"

    total = 0
    # Context managers guarantee both handles are closed, also on error.
    with open(maf) as fp, open(snpfile, "w") as fw:
        fw.write("##fileformat=VCFv4.0\n")
        fw.write("#CHROM POS ID REF ALT QUAL FILTER INFO".replace(" ", "\t")
                 + "\n")
        for row in fp:
            atoms = row.split()
            if len(atoms) < 4:
                # Blank or truncated line: skip it like any other non-SNP row
                # instead of failing on the tuple unpacking below.
                continue
            c, pos, ref, alt = atoms[:4]
            try:
                c = int(c)
            except ValueError:
                # Rows whose first field is not an integer are skipped.
                continue
            c = "chr{0:02d}".format(c)
            pos = int(pos)
            record = (c, pos, vcf_id, ref, alt, vcf_qual, vcf_filter, vcf_info)
            fw.write("\t".join(str(x) for x in record) + "\n")
            total += 1

    validate = opts.validate
    if not validate:
        return

    from jcvi.utils.cbook import percentage

    f = Fasta(validate)
    nsnps = 0
    with open(snpfile) as fp:
        for row in fp:
            if row[0] == '#':
                continue

            c, pos, _id, ref, alt, _qual, _filter, _info = row.split("\t")
            pos = int(pos)
            feat = dict(chr=c, start=pos, stop=pos)
            s = str(f.sequence(feat))
            if s != ref:
                # Raised explicitly (rather than via `assert`) so the check is
                # not stripped under `python -O`; exception type is unchanged.
                raise AssertionError(
                    "Validation error: {0} is {1} (expect: {2})".
                    format(feat, s, ref))
            nsnps += 1
            if nsnps % 50000 == 0:
                logging.debug("SNPs parsed: {0}".
                              format(percentage(nsnps, total)))
    logging.debug("A total of {0} SNPs validated and written to `{1}`.".
                  format(nsnps, snpfile))
コード例 #6
0
ファイル: gff.py プロジェクト: linlifeng/jcvi
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # The docstring above is handed to OptionParser as the usage text, so it
    # is kept verbatim; implementation notes live in comments instead.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option("--parents", dest="parents", default="mRNA",
            help="list of features to extract, use comma to separate (e.g."
            "'gene,mRNA') [default: %default]")
    p.add_option("--children", dest="children", default="CDS",
            help="list of features to extract, use comma to separate (e.g."
            "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
            help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so passing it straight to sys.exit()
        # reported success (status 0) on a usage error. Negating it exits
        # non-zero, matching the other commands' `not p.print_help()` idiom.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # (sequence, feature) pairs for the direct children of a wanted type.
        children = []
        for c in g.children(feat.id, 1):
            if c.featuretype not in children_list:
                continue
            child = f.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop,
                strand=c.strand))
            children.append((child, c))

        if not children:
            # Written through the stream rather than with the Python 2 print
            # statement, which raises TypeError when run under Python 3.
            warning = "[warning] %s has no children with type %s" \
                                    % (feat.id, ','.join(children_list))
            sys.stderr.write(warning + "\n")
            continue

        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(seq for seq, _ in children)

        # Optional description: the requested attribute's values joined by
        # commas, with double quotes stripped; empty when absent or unset.
        if attr and attr in feat.attributes:
            description = ",".join(feat.attributes[attr])
        else:
            description = ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
コード例 #7
0
    opts, args = p.parse_args(args)

    try:
        agpfile, componentfasta, targetfasta = args
    except Exception, e:
        sys.exit(p.print_help())

    agp = AGP(agpfile)
    build = Fasta(targetfasta)
    bacs = Fasta(componentfasta, index=False)

    # go through this line by line
    for aline in agp:
        try:
            build_seq = build.sequence(
                dict(chr=aline.object,
                     start=aline.object_beg,
                     stop=aline.object_end))

            if aline.is_gap:
                assert build_seq.upper() == aline.gap_length * 'N', \
                    "gap mismatch: %s" % aline
            else:
                bac_seq = bacs.sequence(
                    dict(chr=aline.component_id,
                         start=aline.component_beg,
                         stop=aline.component_end,
                         strand=aline.orientation))

                assert build_seq.upper() == bac_seq.upper(), \
                        "sequence mismatch: %s" % aline
コード例 #8
0
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # The docstring above is handed to OptionParser as the usage text, so it
    # is kept verbatim; implementation notes live in comments instead.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option(
        "--parents",
        dest="parents",
        default="mRNA",
        help="list of features to extract, use comma to separate (e.g."
        "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children",
        dest="children",
        default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so passing it straight to sys.exit()
        # reported success (status 0) on a usage error. Negating it makes
        # the command exit with a non-zero status instead.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # (sequence, feature) pairs for the direct children of a wanted type.
        children = []
        for c in g.children(feat.id, 1):
            if c.featuretype not in children_list:
                continue
            child = f.sequence(
                dict(chr=c.chrom, start=c.start, stop=c.stop, strand=c.strand))
            children.append((child, c))

        if not children:
            # Written through the stream rather than with the Python 2 print
            # statement, which raises TypeError when run under Python 3.
            warning = "[warning] %s has no children with type %s" \
                                    % (feat.id, ','.join(children_list))
            sys.stderr.write(warning + "\n")
            continue

        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(seq for seq, _ in children)

        # Optional description: the requested attribute's values joined by
        # commas, with double quotes stripped; empty when absent or unset.
        if attr and attr in feat.attributes:
            description = ",".join(feat.attributes[attr])
        else:
            description = ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()