Beispiel #1
0
def index(args):
    """
    %prog index bedfile

    Compress frgscffile.sorted and index it using `tabix`.
    """
    # The docstring above is handed to OptionParser, so it is kept verbatim.
    parser = OptionParser(index.__doc__)
    parser.add_option("--query", help="Chromosome location [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile = args[0]
    gzfile = bedfile + ".gz"
    tbifile = gzfile + ".tbi"

    # Step 1: sort + bgzip, only when need_update() asks for a rebuild
    if need_update(bedfile, gzfile):
        sortedbed = sort([bedfile])
        sh("bgzip -c {0}".format(sortedbed), outfile=gzfile)

    # Step 2: build the tabix index next to the compressed file
    if need_update(gzfile, tbifile):
        sh("tabix -p bed {0}".format(gzfile))

    # Step 3 (optional): run the region query and send it to the outfile
    if opts.query:
        sh("tabix {0} {1}".format(gzfile, opts.query), outfile=opts.outfile)
Beispiel #2
0
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # The docstring above doubles as the OptionParser usage text.
    p = OptionParser(calc.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print >> sys.stderr, "Incorrect arguments"
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")
    work_dir = op.join(os.getcwd(), "syn_analysis")
    mkdir(work_dir)

    # No protein file supplied: derive one from the CDS file
    if not protein_file:
        protein_file = translate_dna(dna_file)

    # Keep explicit handles so both input files are closed even when one of
    # the external steps raises (the original leaked them).
    prot_handle = open(protein_file)
    dna_handle = open(dna_file)
    try:
        prot_iterator = SeqIO.parse(prot_handle, "fasta")
        dna_iterator = SeqIO.parse(dna_handle, "fasta")
        # Each iterator appears twice in zip(), so every tuple carries two
        # consecutive protein records and two consecutive DNA records.
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
                zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

            print >> sys.stderr, "--------", p_rec_1.name, p_rec_2.name
            align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2,
                                        work_dir)
            if not mrtrans_fasta:
                continue

            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                    find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is None:
                continue

            pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
            output_h.write("%s\n" % (",".join(
                str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn,
                                 ds_subs_ng, dn_subs_ng))))
            # Flush per pair so partial results survive a later failure
            output_h.flush()
    finally:
        prot_handle.close()
        dna_handle.close()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Beispiel #3
0
def index(args):
    """
    %prog index bedfile

    Compress frgscffile.sorted and index it using `tabix`.
    """
    # The docstring above is handed to OptionParser, so it is kept verbatim.
    parser = OptionParser(index.__doc__)
    parser.add_option("--query", help="Chromosome location [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile = args[0]
    gzfile = bedfile + ".gz"
    tbifile = gzfile + ".tbi"

    # Step 1: sort + bgzip, only when need_update() asks for a rebuild
    if need_update(bedfile, gzfile):
        sortedbed = sort([bedfile])
        sh("bgzip -c {0}".format(sortedbed), outfile=gzfile)

    # Step 2: build the tabix index next to the compressed file
    if need_update(gzfile, tbifile):
        sh("tabix -p bed {0}".format(gzfile))

    # Step 3 (optional): run the region query and send it to the outfile
    if opts.query:
        sh("tabix {0} {1}".format(gzfile, opts.query), outfile=opts.outfile)
Beispiel #4
0
def bcf(args):
    """
    %prog bcf fastafile bamfiles > bcffile

    Run mpileup on bam files.
    """
    from jcvi.apps.grid import Jobs

    # The docstring above is used as the parser's usage text.
    parser = OptionParser(bcf.__doc__)
    set_outfile(parser)
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    fastafile, bamfiles = args[0], args[1:]

    # One `index` job per bam file whose name does not contain ".sorted."
    jargs = [[[bam, "--unique"]] for bam in bamfiles if ".sorted." not in bam]
    Jobs(index, args=jargs).run()

    # Rewrite every name to its ".sorted.bam" form; the suffix is stripped
    # first so names that already carry it are not doubled.
    sortedbams = [
        bam.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")
        for bam in bamfiles
    ]

    pieces = [
        "samtools mpileup -P ILLUMINA -E -ugDf",
        " {0} {1}".format(fastafile, " ".join(sortedbams)),
        " | bcftools view -bcvg -",
    ]
    sh("".join(pieces), outfile=opts.outfile)
Beispiel #5
0
def bcf(args):
    """
    %prog bcf fastafile bamfiles > bcffile

    Run mpileup on bam files.
    """
    from jcvi.apps.grid import Jobs

    # The docstring above is used as the parser's usage text.
    parser = OptionParser(bcf.__doc__)
    set_outfile(parser)
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    fastafile, bamfiles = args[0], args[1:]

    # One `index` job per bam file whose name does not contain ".sorted."
    jargs = [[[bam, "--unique"]] for bam in bamfiles if ".sorted." not in bam]
    Jobs(index, args=jargs).run()

    # Rewrite every name to its ".sorted.bam" form; the suffix is stripped
    # first so names that already carry it are not doubled.
    sortedbams = [
        bam.replace(".sorted.bam", ".bam").replace(".bam", ".sorted.bam")
        for bam in bamfiles
    ]

    pieces = [
        "samtools mpileup -P ILLUMINA -E -ugDf",
        " {0} {1}".format(fastafile, " ".join(sortedbams)),
        " | bcftools view -bcvg -",
    ]
    sh("".join(pieces), outfile=opts.outfile)
Beispiel #6
0
def extract(args):
    """
    %prog extract gffile

    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    # The docstring above doubles as the OptionParser usage text.
    p = OptionParser(extract.__doc__)
    p.add_option("--contigs",
                help="Extract features from certain contigs [default: %default]")
    p.add_option("--names",
                help="Extract features with certain names [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    # Both filters are optional; None means "do not filter on this"
    contigID = set(opts.contigs.split(",")) if opts.contigs else None
    if opts.names:
        with open(opts.names) as namesfp:
            names = set(x.strip() for x in namesfp)
    else:
        names = None

    fw = must_open(opts.outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if len(atoms) == 0:
                continue
            tag = atoms[0]
            if row[0] == "#":
                # Echo every directive except region lines that name a contig
                # outside the requested set
                if not (tag == RegionTag and contigID
                        and atoms[1] not in contigID):
                    print >> fw, row.rstrip()
                # Feature lines end at the FASTA directive
                if tag == FastaTag:
                    break
                continue

            b = GffLine(row)
            is_right_contig = (contigID and tag in contigID) or (not contigID)
            is_right_names = (names and b.attributes["Name"][0] in names) or \
                             (not names)

            if is_right_contig and is_right_names:
                print >> fw, row.rstrip()

    if not opts.fasta:
        return

    # --fasta without --contigs used to iterate over None and crash with a
    # TypeError; with no contig selection there is nothing to pick.
    if not contigID:
        return

    f = Fasta(gffile)
    for s in contigID:
        if s in f:
            SeqIO.write([f[s]], fw, "fasta")
Beispiel #7
0
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    # The docstring above doubles as the OptionParser usage text.
    p = OptionParser(calc.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print >> sys.stderr, "Incorrect arguments"
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    output_h.write("name,dS-yn,dN-yn,dS-ng,dN-ng\n")
    work_dir = op.join(os.getcwd(), "syn_analysis")
    mkdir(work_dir)

    # No protein file supplied: derive one from the CDS file
    if not protein_file:
        protein_file = translate_dna(dna_file)

    # Keep explicit handles so both input files are closed even when one of
    # the external steps raises (the original leaked them).
    prot_handle = open(protein_file)
    dna_handle = open(dna_file)
    try:
        prot_iterator = SeqIO.parse(prot_handle, "fasta")
        dna_iterator = SeqIO.parse(dna_handle, "fasta")
        # Each iterator appears twice in zip(), so every tuple carries two
        # consecutive protein records and two consecutive DNA records.
        for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
                zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

            print >> sys.stderr, "--------", p_rec_1.name, p_rec_2.name
            align_fasta = clustal_align_protein(p_rec_1, p_rec_2, work_dir)
            mrtrans_fasta = run_mrtrans(align_fasta, n_rec_1, n_rec_2,
                                        work_dir)
            if not mrtrans_fasta:
                continue

            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                    find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is None:
                continue

            pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
            output_h.write("%s\n" % (",".join(
                str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn,
                                 ds_subs_ng, dn_subs_ng))))
            # Flush per pair so partial results survive a later failure
            output_h.flush()
    finally:
        prot_handle.close()
        dna_handle.close()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Beispiel #8
0
def join(args):
    """
    %prog join file1.txt file2.txt ..

    Join tabular files based on common column. --column specifies the column
    index to pivot on. Use comma to separate multiple values if the pivot column
    is different in each file. Maintain the order in the first file.
    """
    from jcvi.utils.iter import flatten

    # The docstring above is passed to the parser as its usage text.
    parser = OptionParser(join.__doc__)
    parser.add_option("--column", default="0",
                      help="0-based column id, multiple values allowed [default: %default]")
    parser.add_option("--noheader", default=False, action="store_true",
                      help="Do not print header [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    nargs = len(args)

    if nargs < 2:
        sys.exit(not parser.print_help())

    # A single value applies to every file; a comma list gives one per file
    column = opts.column
    if "," in column:
        keycols = [int(x) for x in column.split(",")]
    else:
        keycols = [int(column)] * nargs

    assert len(keycols) == nargs, "Column index number != File number"

    # One keyed lookup table per input file, each keyed on its own column
    tables = [DictFile(fname, keypos=keycol, valuepos=None, delimiter="\t")
              for fname, keycol in zip(args, keycols)]
    header = "\t".join(flatten([op.basename(t.filename)] * t.ncols
                               for t in tables))

    # The first file drives the row order; the others are looked up by key
    fp = open(args[0])
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        fw.write(header + "\n")

    pivotcol = keycols[0]
    for row in fp:
        fields = row.rstrip().split("\t")
        key = fields[pivotcol]
        for table in tables[1:]:
            # Keys missing from a file are padded with "na" cells
            fields += table.get(key, ["na"] * table.ncols)
        fw.write("\t".join(fields) + "\n")
Beispiel #9
0
def join(args):
    """
    %prog join file1.txt file2.txt ..

    Join tabular files based on common column. --column specifies the column
    index to pivot on. Use comma to separate multiple values if the pivot column
    is different in each file. Maintain the order in the first file.
    """
    from jcvi.utils.iter import flatten

    # The docstring above is passed to the parser as its usage text.
    parser = OptionParser(join.__doc__)
    parser.add_option("--column", default="0",
                      help="0-based column id, multiple values allowed [default: %default]")
    parser.add_option("--noheader", default=False, action="store_true",
                      help="Do not print header [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    nargs = len(args)

    if nargs < 2:
        sys.exit(not parser.print_help())

    # A single value applies to every file; a comma list gives one per file
    column = opts.column
    if "," in column:
        keycols = [int(x) for x in column.split(",")]
    else:
        keycols = [int(column)] * nargs

    assert len(keycols) == nargs, "Column index number != File number"

    # One keyed lookup table per input file, each keyed on its own column
    tables = [DictFile(fname, keypos=keycol, valuepos=None, delimiter="\t")
              for fname, keycol in zip(args, keycols)]
    header = "\t".join(flatten([op.basename(t.filename)] * t.ncols
                               for t in tables))

    # The first file drives the row order; the others are looked up by key
    fp = open(args[0])
    fw = must_open(opts.outfile, "w")
    if not opts.noheader:
        fw.write(header + "\n")

    pivotcol = keycols[0]
    for row in fp:
        fields = row.rstrip().split("\t")
        key = fields[pivotcol]
        for table in tables[1:]:
            # Keys missing from a file are padded with "na" cells
            fields += table.get(key, ["na"] * table.ncols)
        fw.write("\t".join(fields) + "\n")
Beispiel #10
0
def asn(args):
    """
    %prog asn asnfiles

    Mainly to get this block, and extract `str` field:

        general {
          db "TIGR" ,
          tag
            str "mtg2_12952" } ,
        genbank {
          accession "AC148996" ,
    """
    from jcvi.formats.base import must_open

    # The docstring above is used as the parser's usage text.
    parser = OptionParser(asn.__doc__)
    set_outfile(parser)
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    fw = must_open(opts.outfile, "w")
    for asnfile in args:
        in_general = in_genbank = False
        gb = name = None
        for row in open(asnfile):
            fields = row.split()
            if not fields:  # whitespace-only line
                continue

            # A line has exactly one leading token, so the cases below are
            # mutually exclusive.
            tag = fields[0]
            if tag == "general":
                in_general = True
            elif tag == "str" and in_general:
                # Only the first `str` seen in a general block is kept
                if name is None:
                    name = row.split("\"")[1]
                in_general = False
            elif tag == "genbank":
                in_genbank = True
            elif tag == "accession" and in_genbank:
                # Likewise only the first accession is kept
                if gb is None:
                    gb = row.split("\"")[1]
                in_genbank = False

        assert gb and name
        fw.write("{0}\t{1}".format(gb, name) + "\n")
Beispiel #11
0
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    # The docstring above doubles as the OptionParser usage text.
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        # Ignore blank lines so they are not mistaken for empty file names
        with open(listfile) as fp:
            gffiles = [x.strip() for x in fp if x.strip()]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()  # '#' lines already written, to avoid repeating them
    fw = must_open(outfile, "w")
    fastarecs = {}
    for gffile in gffiles:
        with open(gffile) as fp:
            for row in fp:
                row = row.rstrip()
                # Blank lines used to crash on row[0]; skip them instead
                if not row:
                    continue
                if row[0] == '#':
                    # Stop at the FASTA directive; it is re-emitted once below
                    if row == FastaTag:
                        break
                    if row in deflines:
                        continue
                    deflines.add(row)

                print >> fw, row

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            # First occurrence of a sequence id wins; direct dict membership
            # avoids building a key list per lookup
            if key not in fastarecs:
                fastarecs[key] = rec

    print >> fw, FastaTag
    SeqIO.write(fastarecs.values(), fw, "fasta")
Beispiel #12
0
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    # The docstring above doubles as the OptionParser usage text.
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        # Ignore blank lines so they are not mistaken for empty file names
        with open(listfile) as fp:
            gffiles = [x.strip() for x in fp if x.strip()]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()  # '#' lines already written, to avoid repeating them
    fw = must_open(outfile, "w")
    fastarecs = {}
    for gffile in gffiles:
        with open(gffile) as fp:
            for row in fp:
                row = row.rstrip()
                # Blank lines used to crash on row[0]; skip them instead
                if not row:
                    continue
                if row[0] == '#':
                    # Stop at the FASTA directive; it is re-emitted once below
                    if row == FastaTag:
                        break
                    if row in deflines:
                        continue
                    deflines.add(row)

                print >> fw, row

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            # First occurrence of a sequence id wins; direct dict membership
            # avoids building a key list per lookup
            if key not in fastarecs:
                fastarecs[key] = rec

    print >> fw, FastaTag
    SeqIO.write(fastarecs.values(), fw, "fasta")
Beispiel #13
0
def asn(args):
    """
    %prog asn asnfiles

    Mainly to get this block, and extract `str` field:

        general {
          db "TIGR" ,
          tag
            str "mtg2_12952" } ,
        genbank {
          accession "AC148996" ,
    """
    from jcvi.formats.base import must_open

    # The docstring above is used as the parser's usage text.
    parser = OptionParser(asn.__doc__)
    set_outfile(parser)
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    fw = must_open(opts.outfile, "w")
    for asnfile in args:
        in_general = in_genbank = False
        gb = name = None
        for row in open(asnfile):
            fields = row.split()
            if not fields:  # whitespace-only line
                continue

            # A line has exactly one leading token, so the cases below are
            # mutually exclusive.
            tag = fields[0]
            if tag == "general":
                in_general = True
            elif tag == "str" and in_general:
                # Only the first `str` seen in a general block is kept
                if name is None:
                    name = row.split("\"")[1]
                in_general = False
            elif tag == "genbank":
                in_genbank = True
            elif tag == "accession" and in_genbank:
                # Likewise only the first accession is kept
                if gb is None:
                    gb = row.split("\"")[1]
                in_genbank = False

        assert gb and name
        fw.write("{0}\t{1}".format(gb, name) + "\n")
Beispiel #14
0
def last(args):
    """
    %prog last old.fasta new.fasta

    Generate psl file using last. Calles apps.last() but with special
    parameters: -r5 -q95 -a0 -b95 -e500, which only reports alignments larger
    than 100 bp and >=95 % identity.
    """
    from jcvi.apps.last import main as lastapp

    # The docstring above is passed to the parser as usage text (kept verbatim).
    parser = OptionParser(last.__doc__)
    parser.add_option("--distant", default=False, action="store_true",
                      help="Assume distant relations")
    parser.add_option("--minscore", default=100, type="int",
                      help="Filter alignments by how many bases match [default: %default]")
    parser.add_option("--minid", default=95, type="int",
                      help="Minimum sequence identity [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    oldfasta, newfasta = args
    lastargs = [oldfasta, newfasta, "--format=maf",
                "--outfile={0}".format(opts.outfile)]

    # The custom scoring parameters are only added when --distant is off
    if not opts.distant:
        r_value = 100 - opts.minid
        q_value = opts.minid
        e_value = opts.minscore * r_value
        lastargs.append("--params=-r{0} -q{1} -a0 -b{1} -e{2}".format(
            r_value, q_value, e_value))

    lastapp(lastargs)
Beispiel #15
0
def bed(args):
    '''
    %prog bed gff_file [--options]

    Parses the start, stop locations of the selected features out of GFF and
    generate a bed file
    '''
    # The docstring above is used as the parser's usage text.
    parser = OptionParser(bed.__doc__)
    parser.add_option("--type", dest="type", default="gene",
                      help="Feature type to extract, use comma for multiple [default: %default]")
    parser.add_option("--key", dest="key", default="ID",
                      help="Key in the attributes to extract [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    gffile = args[0]
    # The literal string "None" on the command line means "no key"
    key = None if opts.key == "None" else opts.key
    wanted_types = set(x.strip() for x in opts.type.split(","))

    gff = Gff(gffile, key=key)
    b = Bed()

    # Keep only the requested feature types
    for g in gff:
        if g.type in wanted_types:
            b.append(g.bedline)

    b.sort(key=b.key)
    b.print_to_file(opts.outfile)
Beispiel #16
0
def last(args):
    """
    %prog last old.fasta new.fasta

    Generate psl file using last. Calles apps.last() but with special
    parameters: -r5 -q95 -a0 -b95 -e500, which only reports alignments larger
    than 100 bp and >=95 % identity.
    """
    from jcvi.apps.last import main as lastapp

    # The docstring above is passed to the parser as usage text (kept verbatim).
    parser = OptionParser(last.__doc__)
    parser.add_option("--distant", default=False, action="store_true",
                      help="Assume distant relations")
    parser.add_option("--minscore", default=100, type="int",
                      help="Filter alignments by how many bases match [default: %default]")
    parser.add_option("--minid", default=95, type="int",
                      help="Minimum sequence identity [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    oldfasta, newfasta = args
    lastargs = [oldfasta, newfasta, "--format=maf",
                "--outfile={0}".format(opts.outfile)]

    # The custom scoring parameters are only added when --distant is off
    if not opts.distant:
        r_value = 100 - opts.minid
        q_value = opts.minid
        e_value = opts.minscore * r_value
        lastargs.append("--params=-r{0} -q{1} -a0 -b{1} -e{2}".format(
            r_value, q_value, e_value))

    lastapp(lastargs)
Beispiel #17
0
def phase(args):
    """
    %prog phase genbankfiles

    Input has to be gb file. Search the `KEYWORDS` section to look for PHASE.
    Also look for "chromosome" and "clone" in the definition line.
    """
    # The docstring above is used as the parser's usage text.
    parser = OptionParser(phase.__doc__)
    set_outfile(parser)

    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    fw = must_open(opts.outfile, "w")
    for gbfile in args:
        for rec in SeqIO.parse(gbfile, "gb"):
            bac_phase, keywords = get_phase(rec)
            chrom, clone = get_clone(rec)
            # One tab-separated line per record:
            # id, phase, ';'-joined keywords, chromosome, clone
            fields = [rec.id, str(bac_phase), ";".join(keywords), chrom, clone]
            fw.write("\t".join(fields) + "\n")
Beispiel #18
0
def phase(args):
    """
    %prog phase genbankfiles

    Input has to be gb file. Search the `KEYWORDS` section to look for PHASE.
    Also look for "chromosome" and "clone" in the definition line.
    """
    # The docstring above is used as the parser's usage text.
    parser = OptionParser(phase.__doc__)
    set_outfile(parser)

    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    fw = must_open(opts.outfile, "w")
    for gbfile in args:
        for rec in SeqIO.parse(gbfile, "gb"):
            bac_phase, keywords = get_phase(rec)
            chrom, clone = get_clone(rec)
            # One tab-separated line per record:
            # id, phase, ';'-joined keywords, chromosome, clone
            fields = [rec.id, str(bac_phase), ";".join(keywords), chrom, clone]
            fw.write("\t".join(fields) + "\n")
Beispiel #19
0
def bed(args):
    '''
    %prog bed gff_file [--options]

    Parses the start, stop locations of the selected features out of GFF and
    generate a bed file
    '''
    # The docstring above is used as the parser's usage text.
    parser = OptionParser(bed.__doc__)
    parser.add_option("--type", dest="type", default="gene",
                      help="Feature type to extract, use comma for multiple [default: %default]")
    parser.add_option("--key", dest="key", default="ID",
                      help="Key in the attributes to extract [default: %default]")
    set_outfile(parser)

    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    gffile = args[0]
    # The literal string "None" on the command line means "no key"
    key = None if opts.key == "None" else opts.key
    wanted_types = set(x.strip() for x in opts.type.split(","))

    gff = Gff(gffile, key=key)
    b = Bed()

    # Keep only the requested feature types
    for g in gff:
        if g.type in wanted_types:
            b.append(g.bedline)

    b.sort(key=b.key)
    b.print_to_file(opts.outfile)
Beispiel #20
0
def extendbed(args):
    """
    %prog extend agpfile componentfasta

    Extend the components to fill the component range. For example, a bed/gff3 file
    that was converted from the agp will contain only the BAC sequence intervals
    that are 'represented' - sometimes leaving the 5` and 3` out (those that
    overlap with adjacent sequences. This script fill up those ranges,
    potentially to make graphics for tiling path.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is left untouched.
    from jcvi.formats.sizes import Sizes

    p = OptionParser(extendbed.__doc__)
    p.add_option("--nogaps", default=False, action="store_true",
            help="Do not print bed lines for gaps [default: %default]")
    p.add_option("--bed12", default=False, action="store_true",
            help="Produce bed12 formatted output [default: %default]")
    p.add_option("--gff", default=False, action="store_true",
            help="Produce gff3 formatted output. By default, ignores " +\
                 " AGP gap lines. [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # If output format is GFF3, ignore AGP gap lines.
    if opts.gff:
        opts.nogaps = True

    agpfile, fastafile = args
    agp = AGP(agpfile)
    fw = must_open(opts.outfile, "w")
    if opts.gff:
        print >> fw, "##gff-version 3"

    # component_id -> its non-gap AGP lines, in file order
    ranges = defaultdict(list)
    thickCoords = []  # These are the coordinates before modify ranges
    # Make the first pass to record all the component ranges
    for a in agp:
        # Saved for every line (gaps included) so it lines up with `agp`
        # in the zip() of the output pass below
        thickCoords.append((a.object_beg, a.object_end))
        if a.is_gap:
            continue
        ranges[a.component_id].append(a)

    # Modify the ranges
    sizes = Sizes(fastafile).mapping
    for accn, rr in ranges.items():
        alen = sizes[accn]

        # First line of the component: push object_beg left by the part of
        # the component lying before the used interval (which end that is
        # depends on the orientation)
        a = rr[0]
        if a.orientation == "+":
            hang = a.component_beg - 1
        else:
            hang = alen - a.component_end
        a.object_beg -= hang

        # Last line of the component: push object_end right by the part of
        # the component lying after the used interval
        a = rr[-1]
        if a.orientation == "+":
            hang = alen - a.component_end
        else:
            hang = a.component_beg - 1
        a.object_end += hang

    # Output pass. NOTE(review): relies on `agp` yielding the same objects
    # that were modified in place above - confirm against the AGP class.
    for a, (ts, te) in zip(agp, thickCoords):
        if opts.nogaps and a.is_gap:
            continue
        if opts.bed12:
            # Order matters: bedline is read with the extended coordinates,
            # then the saved pre-extension coordinates are put back before
            # bedextra is read
            line = a.bedline
            a.object_beg, a.object_end = ts, te
            line += "\t" + a.bedextra
            print >> fw, line
        elif opts.gff:
            print >> fw, a.gffline()
        else:
            print >> fw, a.bedline
Beispiel #21
0
def bed(args):
    """
    %prog bed agpfile

    print out the tiling paths in bed/gff3 format
    """
    # The docstring above is used as the parser's usage text.
    parser = OptionParser(bed.__doc__)
    parser.add_option("--gaps", default=False, action="store_true",
                      help="Only print bed lines for gaps [default: %default]")
    parser.add_option("--nogaps", default=False, action="store_true",
                      help="Do not print bed lines for gaps [default: %default]")
    parser.add_option("--bed12", default=False, action="store_true",
                      help="Produce bed12 formatted output [default: %default]")
    set_outfile(parser)

    gffgroup = OptionGroup(parser, "GFF specific parameters",
                           "Note: If not specified, output will be in `bed` format")
    gffgroup.add_option("--gff", default=False, action="store_true",
                        help="Produce gff3 formatted output. By default, ignores "
                             "AGP gap lines. [default: %default]")
    gffgroup.add_option("--source", default="MGSC",
                        help="Specify a gff3 source [default: `%default`]")
    gffgroup.add_option("--feature", default="golden_path_fragment",
                        help="Specify a gff3 feature type [default: `%default`]")
    gffgroup.add_option("--verifySO", default=False, action="store_true",
                        help="Verify gff3 feature type againt SO for validity. "
                             "Looks for `so.obo` in current folder. If not exists, "
                             "it downloads the obo file. [default: %default]")
    parser.add_option_group(gffgroup)

    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    # gff3 output implies --nogaps; --verifySO is only honoured together
    # with --gff
    if opts.gff:
        opts.nogaps = True
        if opts.verifySO:
            validate_term(opts.feature)

    agpfile = args[0]
    agp = AGP(agpfile)
    fw = must_open(opts.outfile, "w")
    if opts.gff:
        fw.write("##gff-version 3\n")

    for a in agp:
        # --nogaps drops gap lines, --gaps drops everything else
        skip = (opts.nogaps and a.is_gap) or (opts.gaps and not a.is_gap)
        if skip:
            continue

        # --bed12 takes precedence over --gff; plain bed is the fallback
        if opts.bed12:
            line = a.bed12line
        elif opts.gff:
            line = a.gffline(gff_source=opts.source,
                             gff_feat_type=opts.feature)
        else:
            line = a.bedline
        fw.write(str(line) + "\n")

    fw.close()
    return fw.name
Beispiel #22
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # The docstring above doubles as the OptionParser usage text.
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid",
                 dest="pctid",
                 default=90,
                 type="int",
                 help="Percentage identity cutoff [default: %default]")
    # The help text used to say "identity" here (copy-paste slip)
    p.add_option("--pctcov",
                 dest="pctcov",
                 default=50,
                 type="int",
                 help="Percentage coverage cutoff [default: %default]")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping

    # All statistics are computed on the query supermap, which is built on
    # demand when the file is not there yet
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()  # every query with at least one hit
    valid = set()    # queries passing both cutoffs
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            # Inclusive span length. The previous abs(qstart - qstop + 1)
            # was only right when qstart > qstop and came out two bases
            # short for the usual qstart < qstop case.
            this_covered += abs(b.qstart - b.qstop) + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches +
                                this_gaps) * 100. / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            sys.stdout.write("{0}\t{1:.1f}\t{2:.1f}\n".format(
                query, this_identity, this_coverage))

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    # Summary goes to stderr so it never mixes with --list output on stdout
    report = sys.stderr.write
    report("Identity: {0} mismatches, {1} gaps, {2} alignlen\n".
           format(mismatches, gaps, alignlen))
    total = len(sizes)
    report("Total mapped: {0} ({1:.1f}% of {2})\n".
           format(mapped_count, mapped_count * 100. / total, total))
    report("Total valid {0}: {1} ({2:.1f}% of {3})\n".
           format(cutoff_message, valid_count,
                  valid_count * 100. / total, total))
    report("Average id = {0:.2f}%\n".
           format(100 - (mismatches + gaps) * 100. / alignlen))

    queries_combined = sum(sizes[x] for x in queries)
    report("Coverage: {0} covered, {1} total\n".
           format(covered, queries_combined))
    report("Average coverage = {0:.2f}%\n".
           format(covered * 100. / queries_combined))

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for qid in valid:
            fw.write("{0}\n".format(qid))
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Write out the supermap hits that belong to the valid queries. The
    # unused (and never closed) `fp = open(blastfile)` handle was removed.
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            fw.write("{0}\n".format(b))
Beispiel #23
0
def bed12(args):
    """
    %prog bed12 gffile > bedfile

    Produce bed12 file for coding features. The exons will be converted to blocks.
    The CDS range will be shown between thickStart to thickEnd. For reference,
    bed format consists of the following fields:

    1. chrom
    2. chromStart
    3. chromEnd
    4. name
    5. score
    6. strand
    7. thickStart
    8. thickEnd
    9. itemRgb
    10. blockCount
    11. blockSizes
    12. blockStarts
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is deliberately left untouched.
    p = OptionParser(bed12.__doc__)
    p.add_option("--parent",
                 default="mRNA",
                 help="Top feature type [default: %default]")
    p.add_option("--block",
                 default="exon",
                 help="Feature type for regular blocks [default: %default]")
    p.add_option("--thick",
                 default="CDS",
                 help="Feature type for thick blocks [default: %default]")
    set_outfile(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    parent, block, thick = opts.parent, opts.block, opts.thick
    outfile = opts.outfile

    g = make_index(gffile)
    fw = must_open(outfile, "w")

    for f in g.features_of_type(parent):

        chrom = f.chrom
        # Start is shifted down by one, end is kept as is.
        chromStart = f.start - 1
        chromEnd = f.stop
        name = f.id
        score = 0
        strand = f.strand
        # None means "no thick child seen yet"; this avoids leaking a float
        # sentinel (1e15) into the output when a feature has no thick children.
        thickStart = thickEnd = None
        blocks = []

        for c in g.children(name, 1):

            cstart, cend = c.start - 1, c.stop

            if c.featuretype == block:
                # Block starts are relative to the start of the parent.
                blocks.append((cstart - chromStart, cend - cstart))

            elif c.featuretype == thick:
                thickStart = cstart if thickStart is None \
                        else min(thickStart, cstart)
                thickEnd = cend if thickEnd is None \
                        else max(thickEnd, cend)

        if not blocks:
            # zip(*blocks) below cannot be unpacked for an empty list, so
            # report the feature and move on instead of crashing.
            print >> sys.stderr, \
                "[warning] {0} has no children with type {1}, skipped".\
                format(name, block)
            continue

        if thickStart is None:
            # No thick part: collapse the thick range onto chromStart.
            thickStart = thickEnd = chromStart

        blocks.sort()
        blockStarts, blockSizes = zip(*blocks)
        blockCount = len(blocks)
        # Both lists are written comma-separated with a trailing comma.
        blockSizes = ",".join(str(x) for x in blockSizes) + ","
        blockStarts = ",".join(str(x) for x in blockStarts) + ","
        itemRgb = 0

        print >> fw, "\t".join(str(x) for x in (chrom, chromStart, chromEnd, \
                name, score, strand, thickStart, thickEnd, itemRgb,
                blockCount, blockSizes, blockStarts))
Beispiel #24
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so negate it to exit with a non-zero
        # status on a usage error (same idiom as the other commands).
        sys.exit(not p.print_help())

    # Database comes first on the command line, query second.
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn), "File `{0}` not found".format(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        print >>sys.stderr, "Running jobs on JCVI grid"

    extra = opts.extra
    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    outfmt = opts.format
    blastline = (outfmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        jobs = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format, grid
        for bid in bids:
            # One job per database record, addressed as <2bit file>/<record id>
            bfasta = "/".join((bfasta_2bit, bid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, bid, outfmt))
            jobs.append((bfasta, afasta_fn, outfile, \
                         lastz_bin, extra, mask, outfmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in jobs]
            g = Grid(cmds)
            g.run()
            g.writestatus()
        else:
            # Only run locally when the jobs were not handed to the grid;
            # previously the same job list was mapped again after submission.
            pool = Pool(cpus)
            pool.map(lastz_2bit, jobs)
            pool.close()
            pool.join()

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, \
                lock, lastz_bin, extra, mask, grid) for k in xrange(cpus)]
        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()

    else:
        # One worker per cpu; each receives its 1-based index and the total.
        args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
                lock, lastz_bin, extra, mask) for k in xrange(cpus)]
        g = Jobs(target=lastz, args=args)
        g.run()
Beispiel #25
0
def bed(args):
    """
    %prog bed agpfile

    print out the tiling paths in bed/gff3 format
    """
    # The docstring above is reused verbatim as the command-line usage text.
    p = OptionParser(bed.__doc__)
    p.add_option("--gaps", default=False, action="store_true",
                 help="Only print bed lines for gaps [default: %default]")
    p.add_option("--nogaps", default=False, action="store_true",
                 help="Do not print bed lines for gaps [default: %default]")
    p.add_option("--bed12", default=False, action="store_true",
                 help="Produce bed12 formatted output [default: %default]")
    set_outfile(p)

    gff_group = OptionGroup(
        p, "GFF specific parameters",
        "Note: If not specified, output will be in `bed` format")
    gff_group.add_option(
        "--gff", default=False, action="store_true",
        help="Produce gff3 formatted output. By default, ignores "
             "AGP gap lines. [default: %default]")
    gff_group.add_option(
        "--source", default="MGSC",
        help="Specify a gff3 source [default: `%default`]")
    gff_group.add_option(
        "--feature", default="golden_path_fragment",
        help="Specify a gff3 feature type [default: `%default`]")
    gff_group.add_option(
        "--verifySO", default=False, action="store_true",
        help="Verify gff3 feature type againt SO for validity. "
             "Looks for `so.obo` in current folder. If not exists, "
             "it downloads the obo file. [default: %default]")
    p.add_option_group(gff_group)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    want_gff = opts.gff
    if want_gff:
        # GFF3 output never carries AGP gap lines.
        opts.nogaps = True
    if want_gff and opts.verifySO:
        # Check the requested feature type before any output is produced.
        validate_term(opts.feature)

    agpfile = args[0]
    agp = AGP(agpfile)
    fw = must_open(opts.outfile, "w")
    if want_gff:
        print >> fw, "##gff-version 3"

    for record in agp:
        # Apply the two gap filters; either one can reject the record.
        if (opts.nogaps and record.is_gap) or \
                (opts.gaps and not record.is_gap):
            continue

        # --bed12 takes precedence over --gff; plain bed is the fallback.
        if opts.bed12:
            line = record.bed12line
        elif opts.gff:
            line = record.gffline(gff_source=opts.source,
                                  gff_feat_type=opts.feature)
        else:
            line = record.bedline
        print >> fw, line

    fw.close()

    # Name of the handle that was written to.
    return fw.name
Beispiel #26
0
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # NOTE: the docstring above is also the usage text handed to OptionParser.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option("--parents", dest="parents", default="mRNA",
            help="list of features to extract, use comma to separate (e.g."
            "'gene,mRNA') [default: %default]")
    p.add_option("--children", dest="children", default="CDS",
            help="list of features to extract, use comma to separate (e.g."
            "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
            help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so negate it to exit with a non-zero
        # status on a usage error (same idiom as the other commands).
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Collect (sequence, child feature) pairs for the wanted child types.
        children = []
        for c in g.children(feat.id, 1):

            if c.featuretype not in children_list:
                continue
            child = f.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop,
                strand=c.strand))
            children.append((child, c))

        if not children:
            print >>sys.stderr, "[warning] %s has no children with type %s" \
                                    % (feat.id, ','.join(children_list))
            continue
        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # The description is the requested attribute (comma-joined) when the
        # feature carries it, otherwise empty; double quotes are stripped.
        description = ",".join(feat.attributes[attr]) \
                if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
Beispiel #27
0
def load(args):
    '''
    %prog load gff_file fasta_file [--options]

    Parses the selected features out of GFF, with subfeatures concatenated.
    For example, to get the CDS sequences, do this::

    $ %prog load athaliana.gff athaliana.fa --parents mRNA --children CDS
    '''
    # NOTE: the docstring above is also the usage text handed to OptionParser.
    from jcvi.formats.fasta import Seq, SeqRecord

    p = OptionParser(load.__doc__)
    p.add_option(
        "--parents",
        dest="parents",
        default="mRNA",
        help="list of features to extract, use comma to separate (e.g."
        "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children",
        dest="children",
        default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    p.add_option("--attribute",
                 help="The attribute field to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None; negating it makes a usage error exit
        # with a non-zero status, like the sibling commands do.
        sys.exit(not p.print_help())

    gff_file, fasta_file = args

    g = make_index(gff_file)
    f = Fasta(fasta_file, index=False)
    fw = must_open(opts.outfile, "w")

    parents = set(opts.parents.split(','))
    children_list = set(opts.children.split(','))
    attr = opts.attribute

    for feat in get_parents(gff_file, parents):

        # Gather (sequence, child feature) pairs for the requested child types.
        children = []
        for c in g.children(feat.id, 1):

            if c.featuretype not in children_list:
                continue
            child = f.sequence(
                dict(chr=c.chrom, start=c.start, stop=c.stop, strand=c.strand))
            children.append((child, c))

        if not children:
            print >>sys.stderr, "[warning] %s has no children with type %s" \
                                    % (feat.id, ','.join(children_list))
            continue
        # sort children in incremental position
        children.sort(key=lambda x: x[1].start)
        # reverse children if negative strand
        if feat.strand == '-':
            children.reverse()
        feat_seq = ''.join(x[0] for x in children)

        # Use the requested attribute (comma-joined) as the description when
        # the feature has it, otherwise leave it empty; strip double quotes.
        description = ",".join(feat.attributes[attr]) \
                if attr and attr in feat.attributes else ""
        description = description.replace("\"", "")

        rec = SeqRecord(Seq(feat_seq), id=feat.id, description=description)
        SeqIO.write([rec], fw, "fasta")
        fw.flush()
Beispiel #28
0
def extract(args):
    """
    %prog extract gffile

    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser.
    p = OptionParser(extract.__doc__)
    p.add_option(
        "--contigs",
        help="Extract features from certain contigs [default: %default]")
    p.add_option(
        "--names",
        help="Extract features with certain names [default: %default]")
    p.add_option("--fasta",
                 default=False,
                 action="store_true",
                 help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    contigID = opts.contigs
    namesfile = opts.names

    # Both filters are optional; None means "do not filter on this".
    contigID = set(contigID.split(",")) if contigID else None
    names = None
    if namesfile:
        with open(namesfile) as names_fh:
            names = set(x.strip() for x in names_fh)

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if len(atoms) == 0:
                continue
            tag = atoms[0]
            if row[0] == "#":
                # Directive/comment lines are echoed, except region
                # directives that name a contig outside the selection.
                if not (tag == RegionTag and contigID
                        and atoms[1] not in contigID):
                    print >> fw, row.rstrip()
                # Stop reading at the FASTA directive.
                if tag == FastaTag:
                    break
                continue

            b = GffLine(row)
            is_right_contig = (not contigID) or tag in contigID
            # NOTE(review): assumes every feature has a "Name" attribute when
            # --names is used; a feature without one would raise here.
            is_right_names = (not names) or \
                             b.attributes["Name"][0] in names

            if is_right_contig and is_right_names:
                print >> fw, row.rstrip()

    if not opts.fasta:
        return

    if not contigID:
        # Sequences are looked up by contig ID; without --contigs there is
        # nothing to look up (this used to fail by iterating over None).
        print >> sys.stderr, \
            "[warning] --fasta needs --contigs, no FASTA records written"
        return

    f = Fasta(gffile)
    for s in contigID:
        if s in f:
            SeqIO.write([f[s]], fw, "fasta")
Beispiel #29
0
def bed12(args):
    """
    %prog bed12 gffile > bedfile

    Produce bed12 file for coding features. The exons will be converted to blocks.
    The CDS range will be shown between thickStart to thickEnd. For reference,
    bed format consists of the following fields:

    1. chrom
    2. chromStart
    3. chromEnd
    4. name
    5. score
    6. strand
    7. thickStart
    8. thickEnd
    9. itemRgb
    10. blockCount
    11. blockSizes
    12. blockStarts
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser,
    # so it is deliberately left untouched.
    p = OptionParser(bed12.__doc__)
    p.add_option("--parent", default="mRNA",
            help="Top feature type [default: %default]")
    p.add_option("--block", default="exon",
            help="Feature type for regular blocks [default: %default]")
    p.add_option("--thick", default="CDS",
            help="Feature type for thick blocks [default: %default]")
    set_outfile(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    parent, block, thick = opts.parent, opts.block, opts.thick
    outfile = opts.outfile

    g = make_index(gffile)
    fw = must_open(outfile, "w")

    for f in g.features_of_type(parent):

        chrom = f.chrom
        # Start is shifted down by one, end is kept as is.
        chromStart = f.start - 1
        chromEnd = f.stop
        name = f.id
        score = 0
        strand = f.strand
        # None marks "no thick child seen"; the old 1e15 sentinel ended up in
        # the output as a float when a feature had no thick children.
        thickStart = thickEnd = None
        blocks = []

        for c in g.children(name, 1):

            cstart, cend = c.start - 1, c.stop

            if c.featuretype == block:
                # Block starts are relative to the start of the parent.
                blockStart = cstart - chromStart
                blockSize = cend - cstart
                blocks.append((blockStart, blockSize))

            elif c.featuretype == thick:
                if thickStart is None:
                    thickStart, thickEnd = cstart, cend
                else:
                    thickStart = min(thickStart, cstart)
                    thickEnd = max(thickEnd, cend)

        if not blocks:
            # An empty block list cannot be unpacked by zip(*blocks) below;
            # warn and skip the feature rather than abort the whole run.
            print >> sys.stderr, \
                "[warning] {0} has no children with type {1}, skipped".\
                format(name, block)
            continue

        if thickStart is None:
            # No thick part: collapse the thick range onto chromStart.
            thickStart = thickEnd = chromStart

        blocks.sort()
        blockStarts, blockSizes = zip(*blocks)
        blockCount = len(blocks)
        # Both lists are written comma-separated with a trailing comma.
        blockSizes = ",".join(str(x) for x in blockSizes) + ","
        blockStarts = ",".join(str(x) for x in blockStarts) + ","
        itemRgb = 0

        print >> fw, "\t".join(str(x) for x in (chrom, chromStart, chromEnd, \
                name, score, strand, thickStart, thickEnd, itemRgb,
                blockCount, blockSizes, blockStarts))
Beispiel #30
0
def extendbed(args):
    """
    %prog extend agpfile componentfasta

    Extend the components to fill the component range. For example, a bed/gff3 file
    that was converted from the agp will contain only the BAC sequence intervals
    that are 'represented' - sometimes leaving the 5` and 3` out (those that
    overlap with adjacent sequences. This script fill up those ranges,
    potentially to make graphics for tiling path.
    """
    # The docstring above is reused verbatim as the command-line usage text.
    from jcvi.formats.sizes import Sizes

    p = OptionParser(extendbed.__doc__)
    p.add_option("--nogaps", default=False, action="store_true",
                 help="Do not print bed lines for gaps [default: %default]")
    p.add_option("--bed12", default=False, action="store_true",
                 help="Produce bed12 formatted output [default: %default]")
    p.add_option("--gff", default=False, action="store_true",
                 help="Produce gff3 formatted output. By default, ignores "
                      " AGP gap lines. [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    want_gff = opts.gff
    if want_gff:
        # GFF3 output never carries AGP gap lines.
        opts.nogaps = True

    agpfile, fastafile = args
    agp = AGP(agpfile)
    fw = must_open(opts.outfile, "w")
    if want_gff:
        print >> fw, "##gff-version 3"

    # First pass: remember every line's original object span and group the
    # non-gap lines by component.
    by_component = defaultdict(list)
    original_spans = []
    for line in agp:
        original_spans.append((line.object_beg, line.object_end))
        if not line.is_gap:
            by_component[line.component_id].append(line)

    # Second pass: widen the first and last line of each component by the
    # part of the component that lies outside the recorded component range.
    sizes = Sizes(fastafile).mapping
    for accn, members in by_component.items():
        complen = sizes[accn]

        head = members[0]
        head.object_beg -= (head.component_beg - 1
                            if head.orientation == "+"
                            else complen - head.component_end)

        tail = members[-1]
        tail.object_end += (complen - tail.component_end
                            if tail.orientation == "+"
                            else tail.component_beg - 1)

    for line, (ts, te) in zip(agp, original_spans):
        if opts.nogaps and line.is_gap:
            continue

        if opts.bed12:
            # Read the bed line while the widened span is in place, then put
            # the original span back before reading the extra columns.
            widened = line.bedline
            line.object_beg, line.object_end = ts, te
            out = widened + "\t" + line.bedextra
        elif opts.gff:
            out = line.gffline()
        else:
            out = line.bedline
        print >> fw, out
Beispiel #31
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid", dest="pctid", default=90, type="int",
            help="Percentage identity cutoff [default: %default]")
    p.add_option("--pctcov", dest="pctcov", default=50, type="int",
            help="Percentage identity cutoff [default: %default]")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            this_covered += abs(b.qstart - b.qstop + 1)
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    print >> sys.stderr, "Identity: {0} mismatches, {1} gaps, {2} alignlen".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    print >> sys.stderr, "Total mapped: {0} ({1:.1f}% of {2})".\
            format(mapped_count, mapped_count * 100. / total, total)
    print >> sys.stderr, "Total valid {0}: {1} ({2:.1f}% of {3})".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    print >> sys.stderr, "Average id = {0:.2f}%".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sum(sizes[x] for x in queries)
    print >> sys.stderr, "Coverage: {0} covered, {1} total".\
            format(covered, queries_combined)
    print >> sys.stderr, "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fp = open(blastfile)
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            print >> fw, b
Beispiel #32
0
def main(args):
    """
    %prog database.fasta query.fasta


    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser.

    # Order matters: the position of the chosen format in this tuple is what
    # gets passed to lastal as its `-f` value below.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("-a", "-A", dest="cpus", default=1, type="int",
            help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--path", help="specify LAST path")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format, one of {0} [default: %default]".\
                      format("|".join(supported_formats)))
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")

    set_params(p)
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        raise Exception("Option --eval cannot work with multiple threads")

    # Resolve the LAST executables relative to --path when it is given.
    path = opts.path
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    # The database name is the subject file name minus its last extension.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    if opts.format == "maf":
        cmd = 'echo "##maf version=1"'
        sh(cmd)

    cmd = "{0} -u 0".format(lastal_bin)
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        # Pass the resolved lastdb path here too, so --path is honoured for
        # the query database as well as the subject database.
        run_lastdb(infile=query, outfile=querydb + ".prj",
                   lastdb_bin=lastdb_bin)

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w")
    lock = Lock()

    # One worker per cpu; each receives its 1-based index and the total.
    args = [(k + 1, cpus, out_fh, cmd, query, lock) \
                    for k in xrange(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
Beispiel #33
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("-a",
                 "-A",
                 dest="cpus",
                 default=1,
                 type="int",
                 help="parallelize job to multiple cpus [default: %default]")
    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="output format, one of {0} [default: %default]".\
                 format("|".join(supported_formats)))
    p.add_option("--path",
                 dest="lastz_path",
                 default=None,
                 help="specify LASTZ path")
    p.add_option(
        "--mask",
        dest="mask",
        default=False,
        action="store_true",
        help="treat lower-case letters as mask info [default: %default]")
    p.add_option(
        "--similar",
        default=False,
        action="store_true",
        help="Use options tuned for close comparison [default: %default]")

    set_params(p)
    set_outfile(p)
    set_grid(p)

    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so negate it to exit with a non-zero
        # status on a usage error (same idiom as the other commands).
        sys.exit(not p.print_help())

    # Database comes first on the command line, query second.
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn), "File `{0}` not found".format(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    grid = opts.grid
    if grid:
        print >> sys.stderr, "Running jobs on JCVI grid"

    extra = opts.extra
    if opts.similar:
        # NOTE(review): assumes opts.extra defaults to a string (set by
        # set_params); a None default would fail here -- confirm.
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith(
        "lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    outfmt = opts.format
    blastline = (outfmt == "BLASTN-")

    # The axt, maf, etc. format can only be run on splitted database (i.e. one
    # FASTA record per file). The splitted files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        jobs = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format, grid
        for bid in bids:
            # One job per database record, addressed as <2bit file>/<record id>
            bfasta = "/".join((bfasta_2bit, bid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, bid, outfmt))
            jobs.append((bfasta, afasta_fn, outfile, \
                         lastz_bin, extra, mask, outfmt, grid))

        if grid:
            cmds = [lastz_2bit(x) for x in jobs]
            g = Grid(cmds)
            g.run()
            g.writestatus()
        else:
            # Only run locally when the jobs were not handed to the grid;
            # previously the same job list was mapped again after submission.
            pool = Pool(cpus)
            pool.map(lastz_2bit, jobs)
            pool.close()
            pool.join()

        return

    lock = Lock()

    if grid:
        cmds = [lastz(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, \
                lock, lastz_bin, extra, mask, grid) for k in xrange(cpus)]
        mkdir(outdir)
        g = Grid(cmds, outfiles=[op.join(outdir, "out.{0}.lastz").\
                format(i) for i in range(len(cmds))])
        g.run()
        g.writestatus()

    else:
        # One worker per cpu; each receives its 1-based index and the total.
        args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh, lock, lastz_bin,
                 extra, mask) for k in xrange(cpus)]
        g = Jobs(target=lastz, args=args)
        g.run()