Example #1
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh",
                 action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid",
                 default=0,
                 type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore",
                 default=90,
                 type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    fp = open(ploidy)
    genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))
    fp.close()

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(
        len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    fp = open(cscorefile)
    fw = open(weightsfile, "w")
    npairs = 0
    for row in fp:
        a, b, c, pct = row.split()
        c, pct = float(c), float(pct)
        c = int(c * 100)
        if (a, b) not in pairs:
            if norbh:
                continue
            if c < cs:
                continue
            if pct < pctid:
                continue
            c //= 10  # This severely penalizes RBH against synteny

        print("\t".join((a, b, str(c))), file=fw)
        npairs += 1
    fw.close()

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
Example #2
def gatk(args):
    """
    %prog gatk bamfile reference.fasta

    Call SNPs based on GATK best practices.
    """
    p = OptionParser(gatk.__doc__)
    p.add_option(
        "--indelrealign",
        default=False,
        action="store_true",
        help="Perform indel realignment",
    )
    p.set_home("gatk")
    p.set_home("picard")
    p.set_phred()
    p.set_cpus(cpus=24)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, ref = args
    pf = bamfile.rsplit(".", 1)[0]
    mm = MakeManager()
    picard = "java -Xmx32g -jar {0}/picard.jar".format(opts.picard_home)
    tk = "java -Xmx32g -jar {0}/GenomeAnalysisTK.jar".format(opts.gatk_home)
    tk += " -R {0}".format(ref)

    # Step 0 - build reference
    dictfile = ref.rsplit(".", 1)[0] + ".dict"
    cmd1 = picard + " CreateSequenceDictionary"
    cmd1 += " R={0} O={1}".format(ref, dictfile)
    cmd2 = "samtools faidx {0}".format(ref)
    mm.add(ref, dictfile, (cmd1, cmd2))

    # Step 1 - sort bam
    sortedbamfile = pf + ".sorted.bam"
    cmd = picard + " SortSam"
    cmd += " INPUT={0} OUTPUT={1}".format(bamfile, sortedbamfile)
    cmd += " SORT_ORDER=coordinate CREATE_INDEX=true"
    mm.add(bamfile, sortedbamfile, cmd)

    # Step 2 - mark duplicates
    dedupbamfile = pf + ".dedup.bam"
    cmd = picard + " MarkDuplicates"
    cmd += " INPUT={0} OUTPUT={1}".format(sortedbamfile, dedupbamfile)
    cmd += " METRICS_FILE=dedup.log CREATE_INDEX=true"
    mm.add(sortedbamfile, dedupbamfile, cmd)

    if opts.indelrealign:
        # Step 3 - create indel realignment targets
        intervals = pf + ".intervals"
        cmd = tk + " -T RealignerTargetCreator"
        cmd += " -I {0} -o {1}".format(dedupbamfile, intervals)
        mm.add(dedupbamfile, intervals, cmd)

        # Step 4 - indel realignment
        realignedbamfile = pf + ".realigned.bam"
        cmd = tk + " -T IndelRealigner"
        cmd += " -targetIntervals {0}".format(intervals)
        cmd += " -I {0} -o {1}".format(dedupbamfile, realignedbamfile)
        mm.add((dictfile, intervals), realignedbamfile, cmd)
    else:
        realignedbamfile = dedupbamfile

    # Step 5 - SNP calling
    vcf = pf + ".vcf"
    cmd = tk + " -T HaplotypeCaller"
    cmd += " -I {0}".format(realignedbamfile)
    cmd += " --genotyping_mode DISCOVERY"
    cmd += " -stand_emit_conf 10 -stand_call_conf 30"
    cmd += " -nct {0}".format(opts.cpus)
    cmd += " -o {0}".format(vcf)
    if opts.phred == "64":
        cmd += " --fix_misencoded_quality_scores"
    mm.add(realignedbamfile, vcf, cmd)

    # Step 6 - SNP filtering
    filtered_vcf = pf + ".filtered.vcf"
    cmd = tk + " -T VariantFiltration"
    cmd += " -V {0}".format(vcf)
    cmd += ' --filterExpression "DP < 10 || DP > 300 || QD < 2.0 || FS > 60.0 || MQ < 40.0"'
    cmd += ' --filterName "LOWQUAL"'
    cmd += ' --genotypeFilterExpression "isHomVar == 1"'
    cmd += ' --genotypeFilterName "HOMOVAR"'
    cmd += ' --genotypeFilterExpression "isHet == 1"'
    cmd += ' --genotypeFilterName "HET"'
    cmd += " -o {0}".format(filtered_vcf)
    mm.add(vcf, filtered_vcf, cmd)

    mm.write()
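
Every mm.add(source, target, cmd) call above records a dependency edge; a minimal stand-in for jcvi's MakeManager (ToyMakeManager sketches the contract, it is not the real class) would render each call as one Makefile rule:

class ToyMakeManager:
    """Collects (sources, target, commands) triples and writes a Makefile."""

    def __init__(self):
        self.rules = []

    def add(self, source, target, cmd):
        # Both source and cmd may be a single string or a sequence
        sources = [source] if isinstance(source, str) else list(source)
        cmds = [cmd] if isinstance(cmd, str) else list(cmd)
        self.rules.append((sources, target, cmds))

    def write(self, filename="Makefile"):
        with open(filename, "w") as fw:
            fw.write("all: {0}\n\n".format(
                " ".join(t for _, t, _ in self.rules)))
            for sources, target, cmds in self.rules:
                fw.write("{0}: {1}\n".format(target, " ".join(sources)))
                for cmd in cmds:
                    fw.write("\t{0}\n".format(cmd))
                fw.write("\n")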
Example #3
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    p = OptionParser(embed.__doc__)
    p.set_mingap(default=10)
    p.add_option("--min_length",
                 default=200,
                 type="int",
                 help="Minimum length to consider [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length
    splitfasta, oagp, cagp = gaps(
        [scaffolds, "--split", "--mingap={0}".format(opts.mingap)])

    agp = AGP(cagp)
    p = agp.graph

    ef = EvidenceFile(evidencefile, contigs)
    sizes = ef.sz
    q = ef.graph

    logging.debug("Reference graph: {0}".format(p))
    logging.debug("Patch graph: {0}".format(q))

    newagp = deepcopy(agp)

    seen = set()
    deleted = set()
    for a in agp:
        if a.is_gap:
            continue

        name = a.component_id
        object = a.object
        if name in deleted:
            print >> sys.stderr, "* Skip {0}, already embedded".format(name)
            continue

        seen.add(name)

        target_name, tag = get_target(p, name)
        path = q.get_path(name, target_name, tag=tag)
        path_size = sum([sizes[x.v] for x, t in path]) if path else None
        status = NO_UPDATE

        # Heuristic, the patch must not be too long
        if path and path_size > min_length and len(path) > 3:
            path = None

        if not path:
            print(name, target_name, path, path_size, status, file=sys.stderr)
            continue

        backward = False
        for x, t in path:
            if x.v in seen:
                print >> sys.stderr, "* Does not allow backward" \
                                     " patch on {0}".format(x.v)
                backward = True
                break

        if backward:
            continue

        # Build the path plus the ends
        vv = q.get_node(name)
        path.appendleft((vv, tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            target = q.get_node(target_name)
            path.append((target, tag))
            status = INSERT_BETWEEN

        print(name, target_name, path, path_size, status, file=sys.stderr)

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(q, path, object, sizes, status)
        if status == INSERT_BEFORE:
            lines = lines[:-1]
            td = newagp.insert_lines(name, lines, delete=True, verbose=True)
        elif status == INSERT_AFTER:
            lines = lines[1:]
            td = newagp.insert_lines(name, lines, after=True,
                                     delete=True, verbose=True)
        else:
            lines = lines[1:-1]
            td = newagp.update_between(name, target_name, lines,
                                       delete=True, verbose=True)
        deleted |= td
        seen |= td

    # Recruit big singleton contigs
    CUTOFF = opts.min_length
    for ctg, size in sizes.items():
        if ctg in seen:
            continue
        if size < CUTOFF:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])
Example #4
def ace(args):
    """
    %prog ace bamfile fastafile

    Convert BAM format to ACE format. This often allows the remapping to be
    assessed as a de novo assembly format. The BAM file needs to be indexed.
    Also creates a .mates file to be used in amos/bambus, and a .astat file to
    mark whether each contig is unique or repetitive, based on the A-statistics
    in Celera assembler.
    """
    p = OptionParser(ace.__doc__)
    p.add_option("--splitdir", dest="splitdir", default="outRoot",
            help="split the ace per contig to dir [default: %default]")
    p.add_option("--unpaired", dest="unpaired", default=False,
            help="remove read pairs on the same contig [default: %default]")
    p.add_option("--minreadno", dest="minreadno", default=3, type="int",
            help="minimum read numbers per contig [default: %default]")
    p.add_option("--minctgsize", dest="minctgsize", default=100, type="int",
            help="minimum contig size per contig [default: %default]")
    p.add_option("--astat", default=False, action="store_true",
            help="create .astat to list repetitiveness [default: %default]")
    p.add_option("--readids", default=False, action="store_true",
            help="create file of mapped and unmapped ids [default: %default]")

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs,
        genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    if astat:
        astatfw = open(astatfile, "w")
    if readids:
        readsfw = open(readsfile, "w")

    print >> fw, "AS {0} {1}".format(ncontigs, totalreads)
    print >> fw

    for i, contig in enumerate(s.references):
        cseq = f[contig]
        nbases = len(cseq)

        mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
        nreads = len(mapped_reads)

        nsegments = 0
        print >> fw, "CO {0} {1} {2} {3} U".format(contig, nbases, nreads,
                nsegments)
        print >> fw, fill(str(cseq.seq))
        print >> fw

        if astat:
            # Avoid clobbering the `astat` flag with the Astat object
            a_stat = Astat(nbases, nreads, genomesize, totalreads)
            print("{0}\t{1:.1f}".format(contig, a_stat), file=astatfw)

        text = fill([qual] * nbases, delimiter=" ", width=30)
        print >> fw, "BQ\n{0}".format(text)
        print >> fw

        rnames = []
        for a in mapped_reads:
            readname = a.qname
            rname = readname

            if readids:
                print(readname, file=readsfw)
            rnames.append(rname)

            strand = "C" if a.is_reverse else "U"
            paddedstart = a.pos + 1  # 0-based to 1-based
            af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
            print(af, file=fw)

        print(file=fw)

        for a, rname in zip(mapped_reads, rnames):
            aseq, npadded = cigar_to_seq(a)
            if aseq is None:
                continue

            ninfos = 0
            ntags = 0
            alen = len(aseq)
            rd = "RD {0} {1} {2} {3}\n{4}".format(rname, alen, ninfos, ntags,
                    fill(aseq))
            qs = "QA 1 {0} 1 {0}".format(alen)

            print(rd, file=fw)
            print(file=fw)
            print(qs, file=fw)
            print(file=fw)
Example #5
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    p = OptionParser(main.__doc__)

    p.add_option("--format", default=" \'6 qseqid sseqid pident length " \
            "mismatch gapopen qstart qend sstart send evalue bitscore\' ",
            help="0-11, learn more with \"blastp -help\". [default: %default]")
    p.add_option("--path", dest="blast_path", default=None,
            help="specify BLAST+ path including the program name")
    p.add_option("--prog", dest="blast_program", default="blastp",
            help="specify BLAST+ program to use. See complete list here: " \
            "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
            " [default: %default]")
    p.set_align(evalue=.01)
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.set_cpus()
    p.add_option("--nprocs", default=1, type="int",
            help="number of BLAST processes to run in parallel. " + \
            "split query.fa into `nprocs` chunks, " + \
            "each chunk uses -num_threads=`cpus`")
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    blast_path = opts.blast_path
    blast_program = opts.blast_program

    blast_bin = blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs, cpus = opts.nprocs, opts.cpus
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes" % nprocs)
        outdir = "outdir"
        fs = split([afasta_fn, outdir, str(nprocs)])
        queries = fs.names
    else:
        queries = [afasta_fn]

    dbtype = "prot" if op.basename(blast_bin) in ("blastp", "blastx") \
        else "nucl"

    db = bfasta_fn
    if dbtype == "prot":
        nin = db + ".pin"
    else:
        nin00 = db + ".00.nin"
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    lock = Lock()

    blastplus_template = "{0} -db {1} -outfmt {2}"
    blast_cmd = blastplus_template.format(blast_bin, bfasta_fn, opts.format)
    blast_cmd += " -evalue {0} -max_target_seqs {1}".\
        format(opts.evalue, opts.best)
    blast_cmd += " -num_threads {0}".format(cpus)
    if extra:
        blast_cmd += " " + extra.strip()

    args = [(out_fh, blast_cmd, query, lock) for query in queries]
    g = Jobs(target=blastplus, args=args)
    g.run()
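
Each entry in args above is handed to a blastplus worker. A hedged sketch of such a worker (illustrative; the real function lives elsewhere in jcvi) showing why the shared Lock is needed when several processes append to one output handle:

import subprocess

def blastplus_worker(out_fh, blast_cmd, query, lock):
    """Run one query chunk and append its output under the lock so that
    chunks from parallel processes do not interleave mid-line."""
    out = subprocess.check_output("{0} -query {1}".format(blast_cmd, query),
                                  shell=True)
    with lock:
        out_fh.write(out.decode())
        out_fh.flush()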
Example #6
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file and filter based on:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - ids: valid ids

    Use --inverse to obtain the complementary records for the criteria above.
    --noself removes self-self hits and is applied regardless of --inverse.
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--score",
                 dest="score",
                 default=0,
                 type="int",
                 help="Score cutoff")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--noself",
                 default=False,
                 action="store_true",
                 help="Remove self-self hits")
    p.add_option("--ids", help="Path to file with ids to retain")
    p.add_option("--inverse",
                 default=False,
                 action="store_true",
                 help="Similar to grep -v, inverse")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, noself = (
        opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.noself)
    newblastfile = (blastfile + ".P{0}L{1}".format(int(pctid), hitlen)
                    if outfile is None else outfile)
    if inverse:
        newblastfile += ".inverse"
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            if c.query in ids and c.subject in ids:
                noids = False
            else:
                noids = True
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            noids

        if inverse:
            remove = not remove

        remove = remove or (noself and c.query == c.subject)

        if not remove:
            print(row.rstrip(), file=fw)

    fw.close()

    return newblastfile
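
The keep/remove logic above compresses to a single predicate; the sketch below restates it (Hit is a stand-in for BlastLine) and demonstrates that --noself applies after --inverse, so self hits are dropped either way:

from collections import namedtuple

Hit = namedtuple("Hit", "query subject score pctid hitlen evalue")

def keep(c, score=0, pctid=95, hitlen=100, evalue=.01,
         noids=False, noself=False, inverse=False):
    """True if the hit survives the filters above."""
    remove = (c.score < score or c.pctid < pctid or
              c.hitlen < hitlen or c.evalue > evalue or bool(noids))
    if inverse:
        remove = not remove
    # self-self removal is applied after the inversion, so it always holds
    remove = remove or (noself and c.query == c.subject)
    return not remove

h = Hit("geneA", "geneA", 200, 98.0, 150, 1e-20)
assert keep(h)                   # passes the default cutoffs
assert not keep(h, noself=True)  # removed as a self-self hit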
Example #7
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See supplementary info for sea anemone genome paper, C-score formula:

        cscore(A,B) = score(A,B) /
             max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff",
                 default=.9999,
                 type="float",
                 help="Minimum C-score to report [default: %default]")
    p.add_option("--pct",
                 default=False,
                 action="store_true",
                 help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast",
                 default=False,
                 action="store_true",
                 help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        s = score / max(best_score[query], best_score[subject])
        if s > cutoff:
            pair = (query, subject)
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    if writeblast:
        fwb = must_open(outfile + ".filtered.blast", "w")
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        args = [query, subject, "{0:.2f}".format(s)]
        if pct:
            args.append("{0:.1f}".format(pctid))
        print("\t".join(args), file=fw)
        if writeblast:
            print(b, file=fwb)
    fw.close()
    if writeblast:
        fwb.close()
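
A quick numeric check of the formula: if score(A,B) = 95 while A's best hit anywhere scores 100 and B's best scores 95, then cscore(A,B) = 95/100 = 0.95; it reaches 1.0 exactly when A and B are each other's best hits.

best_score = {"A": 100.0, "B": 95.0}  # best BLAST score seen for each gene

def c_score(score_ab, a, b):
    return score_ab / max(best_score[a], best_score[b])

print(c_score(95.0, "A", "B"))  # 0.95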
Example #8
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby", default="conf_class",
                 help="Print separate stats groupby")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    gb = opts.groupby
    g = make_index(gff_file)

    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        fw = open(tf, "w")
        for feat in g.features_of_type("mRNA"):
            fid = feat.id
            conf_class = feat.attributes.get(gb, "all")
            tsize = sum((c.stop - c.start + 1) for c in g.children(fid, 1) \
                             if c.featuretype == "exon")
            print("\t".join((fid, str(tsize), conf_class)), file=fw)
        fw.close()

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1) \
                         if c.featuretype == "mRNA"]
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2) \
                         if c.featuretype == "exon")
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    r = {}  # Report
    distinct_groups = set(conf_classes.values())
    for g in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = max_transcripts = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != g:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            if gs.num_transcripts > max_transcripts:
                max_transcripts = gs.num_transcripts
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        mean_num_exons = num_exons * 1. / num_genes
        mean_num_transcripts = num_transcripts * 1. / num_genes
        mean_locus_size = cum_locus_size * 1. / num_genes
        mean_transcript_size = cum_transcript_size * 1. / num_transcripts
        mean_exon_size = cum_exon_size * 1. / num_exons

        r[("Number of genes", g)] = num_genes
        r[("Number of single-exon genes", g)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", g)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", g)] = num_exons
        r[("Number of genes with alternative transcript variants", g)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", g)] = num_transcripts
        r[("Mean number of distinct exons per gene", g)] = mean_num_exons
        r[("Mean number of transcripts per gene", g)] = mean_num_transcripts
        r[("Max number of transcripts per gene", g)] = max_transcripts
        r[("Mean gene locus size (first to last exon)", g)] = mean_locus_size
        r[("Mean transcript size (UTR, CDS)", g)] = mean_transcript_size
        r[("Mean exon size", g)] = mean_exon_size

    fw = must_open(opts.outfile, "w")
    print(tabulate(r), file=fw)
    fw.close()
Example #9
def report(args):
    '''
    %prog report ksfile

    Generate a report given a Ks result file (as produced by
    synonymous_calc.py). Describe the median Ks, Ka values, as well as the
    distribution in a stem-leaf plot.
    '''
    from jcvi.utils.cbook import SummaryStats
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(report.__doc__)
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Generate graphic output for the histogram [default: %default]")
    p.add_option(
        "--components",
        default=1,
        type="int",
        help="Number of components to decompose peaks [default: %default]")
    add_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="5x5")

    if len(args) != 1:
        sys.exit(not p.print_help())

    ks_file, = args
    data = KsFile(ks_file)
    ks_min = opts.vmin
    ks_max = opts.vmax
    bins = opts.bins

    for f in fields.split(",")[1:]:
        columndata = [getattr(x, f) for x in data]
        ks = ("ks" in f)
        if not ks:
            continue

        columndata = [x for x in columndata if ks_min <= x <= ks_max]

        st = SummaryStats(columndata)
        title = "{0} ({1}): ".format(descriptions[f], ks_file)
        title += "Median:{0:.3f} (1Q:{1:.3f}|3Q:{2:.3f}||".\
                format(st.median, st.firstq, st.thirdq)
        title += "Mean:{0:.3f}|Std:{1:.3f}||N:{2})".\
                format(st.mean, st.sd, st.size)

        tbins = (0, ks_max, bins) if ks else (0, .6, 10)
        digit = 2 if (ks_max * 1. / bins) < .1 else 1
        stem_leaf_plot(columndata, *tbins, digit=digit, title=title)

    if not opts.pdf:
        return

    components = opts.components
    data = [x.ng_ks for x in data]
    data = [x for x in data if ks_min <= x <= ks_max]

    fig = plt.figure(1, (iopts.w, iopts.h))
    ax = fig.add_axes([.12, .1, .8, .8])
    kp = KsPlot(ax, ks_max, opts.bins, legendp=opts.legendp)
    kp.add_data(data, components, fill=opts.fill, fitted=opts.fit)
    kp.draw(title=opts.title)
Example #10
def dotplot(args):
    """
    %prog dotplot map.csv ref.fasta

    Make dotplot between chromosomes and linkage maps.
    The input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    """
    from jcvi.assembly.allmaps import CSVMapLine
    from jcvi.formats.sizes import Sizes
    from jcvi.utils.natsort import natsorted
    from jcvi.graphics.base import shorten
    from jcvi.graphics.dotplot import plt, savefig, markup, normalize_axes, \
                    downsample, plot_breaks_and_labels, thousands

    p = OptionParser(dotplot.__doc__)
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args,
                                            figsize="8x8",
                                            style="dark",
                                            dpi=90,
                                            cmap="copper")

    if len(args) != 2:
        sys.exit(not p.print_help())

    csvfile, fastafile = args
    sizes = natsorted(Sizes(fastafile).mapping.items())
    seen = set()
    raw_data = []

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    fp = must_open(csvfile)
    for row in fp:
        m = CSVMapLine(row)
        seen.add(m.seqid)
        raw_data.append(m)

    # X-axis is the genome assembly
    ctgs, ctg_sizes = zip(*sizes)
    xsize = sum(ctg_sizes)
    qb = list(np.cumsum(ctg_sizes))
    qbreaks = list(zip(ctgs, [0] + qb, qb))
    qstarts = dict(zip(ctgs, [0] + qb))

    # Y-axis is the map
    key = lambda x: x.lg
    raw_data.sort(key=key)
    ssizes = {}
    for lg, d in groupby(raw_data, key=key):
        ssizes[lg] = max([x.cm for x in d])
    ssizes = natsorted(ssizes.items())
    lgs, lg_sizes = zip(*ssizes)
    ysize = sum(lg_sizes)
    sb = list(np.cumsum(lg_sizes))
    sbreaks = list(zip([("LG" + x) for x in lgs], [0] + sb, sb))
    sstarts = dict(zip(lgs, [0] + sb))

    # Re-code all the scatter dots
    data = [(qstarts[x.seqid] + x.pos, sstarts[x.lg] + x.cm, 'g') \
                for x in raw_data if (x.seqid in qstarts)]
    npairs = downsample(data)

    x, y, c = zip(*data)
    ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)

    # Flip X-Y label
    gy, gx = op.basename(csvfile).split(".")[:2]
    gx, gy = shorten(gx, maxchar=30), shorten(gy, maxchar=30)
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                                        qbreaks, sbreaks)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    title = "Alignment: {} vs {}".format(gx, gy)
    title += " ({} markers)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)
    normalize_axes(root)

    image_name = opts.outfile or \
                (csvfile.rsplit(".", 1)[0] + "." + iopts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
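
The recoding step converts each (seqid, pos) pair into a single global axis coordinate by offsetting with cumulative contig sizes; the same arithmetic in isolation:

import numpy as np

ctgs, ctg_sizes = ("chr1", "chr2", "chr3"), (100, 50, 80)
qb = list(np.cumsum(ctg_sizes))      # [100, 150, 230]
qstarts = dict(zip(ctgs, [0] + qb))  # {'chr1': 0, 'chr2': 100, 'chr3': 150}
assert qstarts["chr2"] + 25 == 125   # chr2:25 lands at global x = 125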
Example #11
def ld(args):
    """
    %prog ld map

    Calculate pairwise linkage disequilibrium given MSTmap.
    """
    import numpy as np
    from random import sample

    from jcvi.algorithms.matrix import symmetrize

    p = OptionParser(ld.__doc__)
    p.add_option("--subsample",
                 default=1000,
                 type="int",
                 help="Subsample markers to speed up [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    mstmap, = args
    subsample = opts.subsample
    data = MSTMap(mstmap)

    markerbedfile = mstmap + ".subsample.bed"
    ldmatrix = mstmap + ".subsample.matrix"
    # Take random subsample while keeping marker order
    if subsample < data.nmarkers:
        data = [data[x] for x in
                sorted(sample(range(len(data)), subsample))]
    else:
        logging.debug("Use all markers, --subsample ignored")

    nmarkers = len(data)
    if need_update(mstmap, (ldmatrix, markerbedfile)):
        fw = open(markerbedfile, "w")
        print("\n".join(x.bedline for x in data), file=fw)
        logging.debug("Write marker set of size {0} to file `{1}`."\
                        .format(nmarkers, markerbedfile))
        fw.close()

        M = np.zeros((nmarkers, nmarkers), dtype=float)
        for i, j in combinations(range(nmarkers), 2):
            a = data[i]
            b = data[j]
            M[i, j] = calc_ldscore(a.genotype, b.genotype)

        M = symmetrize(M)

        logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix))
        M.tofile(ldmatrix)
    else:
        nmarkers = len(Bed(markerbedfile))
        M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
        logging.debug("LD matrix `{0}` exists ({1}x{1})."\
                        .format(ldmatrix, nmarkers))

    from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap

    plt.rcParams["axes.linewidth"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax = fig.add_axes([.1, .1, .8, .8])  # the heatmap

    ax.matshow(M, cmap=iopts.cmap)

    # Plot chromosomes breaks
    bed = Bed(markerbedfile)
    xsize = len(bed)
    extent = (0, nmarkers)
    chr_labels = []
    ignore_size = 20

    for (seqid, beg, end) in bed.get_breaks():
        ignore = abs(end - beg) < ignore_size
        pos = (beg + end) / 2
        chr_labels.append((seqid, pos, ignore))
        if ignore:
            continue
        ax.plot((end, end), extent, "w-", lw=1)
        ax.plot(extent, (end, end), "w-", lw=1)

    # Plot chromosome labels
    for label, pos, ignore in chr_labels:
        pos = .1 + pos * .8 / xsize
        if not ignore:
            root.text(pos,
                      .91,
                      label,
                      ha="center",
                      va="bottom",
                      rotation=45,
                      color="grey")
            root.text(.09, pos, label, ha="right", va="center", color="grey")

    ax.set_xlim(extent)
    ax.set_ylim(extent)
    ax.set_axis_off()

    draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=iopts.cmap)

    root.add_patch(Rectangle((.1, .1), .8, .8, fill=False, ec="k", lw=2))
    m = mstmap.split(".")[0]
    root.text(.5,
              .06,
              "Linkage Disequilibrium between {0} markers".format(m),
              ha="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = m + ".subsample" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
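
calc_ldscore is the pairwise statistic here. A sketch under the assumption of MSTmap-style biallelic "A"/"B" calls per individual (the real encoding and function may differ), using the textbook r^2 = D^2 / (pA(1-pA) * pB(1-pB)) with D = pAB - pA*pB:

def ld_rsq(g1, g2, alleles="AB"):
    """r^2 between two markers given per-individual calls; individuals
    missing in either marker are skipped."""
    a0 = alleles[0]
    pairs = [(x, y) for x, y in zip(g1, g2) if x in alleles and y in alleles]
    n = len(pairs)
    if n == 0:
        return 0.0
    pa = sum(x == a0 for x, y in pairs) / n
    pb = sum(y == a0 for x, y in pairs) / n
    pab = sum(x == a0 and y == a0 for x, y in pairs) / n
    denom = pa * (1 - pa) * pb * (1 - pb)
    if denom == 0:
        return 0.0
    d = pab - pa * pb
    return d * d / denom

assert ld_rsq("AABB", "AABB") == 1.0  # perfectly linked markers
assert ld_rsq("AABB", "ABAB") == 0.0  # unlinked markers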
Example #12
def prepare(args):
    """
    %prog prepare *.fastq

    Scan input fastq files (see below) and write SOAP config files based
    on inputfiles. Use "--scaffold contigs.fasta" to perform scaffolding.
    """
    from jcvi.formats.base import write_file

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("-K", default=45, type="int", help="K-mer size")
    p.add_option(
        "--assemble_1st_rank_only",
        default=False,
        action="store_true",
        help="Assemble the first rank only, other libs asm_flags=2",
    )
    p.add_option("--scaffold", help="Only perform scaffolding")
    p.add_option("--gapclose", help="Only perform gap closure")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    K = opts.K
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    a1st = opts.assemble_1st_rank_only

    cfgfile = "soap.config"
    gc_cfgfile = "soap.gc.config"
    fw = open(cfgfile, "w")
    fw_gc = open(gc_cfgfile, "w")

    libs = get_libs(fnames)
    rank = 0
    max_rd_len = max(readlen([f]) for f in fnames)

    block = "max_rd_len={0}\n".format(max_rd_len)
    for stream in (sys.stderr, fw, fw_gc):
        print(block, file=stream)

    # Collect singletons first
    singletons = []
    for lib, fs in libs:
        if lib.size == 0:
            singletons += fs
            continue

    for lib, fs in libs:
        size = lib.size
        if size == 0:
            continue

        rank += 1
        block = "[LIB]\n"
        block += "avg_ins={0}\n".format(size)
        block += "reverse_seq={0}\n".format(lib.reverse_seq)
        asm_flags = 2 if (rank > 1 and a1st) else lib.asm_flags
        block += "asm_flags={0}\n".format(asm_flags)
        block += "rank={0}\n".format(rank)
        if lib.reverse_seq:
            pair_num_cutoff = 3
            block += "pair_num_cutoff={0}\n".format(pair_num_cutoff)
        block += "map_len=35\n"

        for f in fs:
            if ".1." in f:
                tag = "q1"
            elif ".2." in f:
                tag = "q2"
            else:
                continue  # matches neither .1. nor .2. naming; skip
            block += "{0}={1}\n".format(tag, f)

        if rank == 1:
            for s in singletons:
                tag = "q" if is_fastq(s) else "f"
                block += tag + "={0}\n".format(s)

        print(block, file=sys.stderr)
        print(block, file=fw)

        if asm_flags > 2:
            print(block, file=fw_gc)

    runfile = "run.sh"
    scaffold = opts.scaffold
    bb = 63 if K <= 63 else 127
    binary = "SOAPdenovo-{0}mer".format(bb)
    header = SOAPHEADER.format(opts.cpus, K, binary)
    if opts.gapclose:
        gapclose = opts.gapclose
        outfile = gapclose.rsplit(".", 1)[0] + ".closed.fasta"
        template = header + GCRUNG.format(gapclose, outfile)
    else:
        template = header + (SCFRUN % scaffold if scaffold else SOAPRUN)

    write_file(runfile, template)
    fw.close()
    fw_gc.close()
Example #13
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generate -m8 tabular format.
    """
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="Ooutput format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
            help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2:
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    format = opts.format
    blastline = (format == "BLASTN-")

    # The axt, maf, etc. formats can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for id in bids:
            bfasta = "/".join((bfasta_2bit, id))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, id, format))
            args.append((bfasta, afasta_fn, outfile, \
                         lastz_bin, extra, mask, format))

        p = Pool(cpus)
        p.map(lastz_2bit, args)

        return

    lock = Lock()

    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
            lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
Example #14
def ortholog(args):
    """
    %prog ortholog species_a species_b

    Run a sensitive pipeline to find orthologs between two species a and b.
    The pipeline runs LAST and generates .lifted.anchors.

    `--full` mode assumes 1-to-1 quota synteny blocks as the backbone of
    such predictions. Extra orthologs will be recruited from reciprocal best
    match (RBH).
    """
    from jcvi.apps.align import last as last_main
    from jcvi.compara.blastfilter import main as blastfilter_main
    from jcvi.compara.quota import main as quota_main
    from jcvi.compara.synteny import scan, mcscan, liftover
    from jcvi.formats.blast import cscore, filter

    p = OptionParser(ortholog.__doc__)
    p.add_option("--dbtype",
                 default="nucl",
                 choices=("nucl", "prot"),
                 help="Molecule type of subject database")
    p.add_option("--full",
                 default=False,
                 action="store_true",
                 help="Run in full mode, including blocks and RBH")
    p.add_option("--cscore",
                 default=0.7,
                 type="float",
                 help="C-score cutoff [default: %default]")
    p.add_option("--dist",
                 default=20,
                 type="int",
                 help="Extent of flanking regions to search")
    p.add_option("--quota", help="Quota align parameter")
    p.add_option("--nostdpf",
                 default=False,
                 action="store_true",
                 help="Do not standardize contig names")
    p.add_option("--no_strip_names",
                 default=False,
                 action="store_true",
                 help="Do not strip alternative splicing "
                 "(e.g. At5g06540.1 -> At5g06540)")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    a, b = args
    dbtype = opts.dbtype
    suffix = ".cds" if dbtype == "nucl" else ".pep"
    abed, afasta = a + ".bed", a + suffix
    bbed, bfasta = b + ".bed", b + suffix
    ccscore = opts.cscore
    quota = opts.quota
    dist = "--dist={0}".format(opts.dist)

    aprefix = afasta.split(".")[0]
    bprefix = bfasta.split(".")[0]
    pprefix = ".".join((aprefix, bprefix))
    qprefix = ".".join((bprefix, aprefix))
    last = pprefix + ".last"
    if need_update((afasta, bfasta), last):
        last_main([bfasta, afasta], dbtype)

    if a == b:
        lastself = last + ".P98L0.inverse"
        if need_update(last, lastself):
            filter([last, "--hitlen=0", "--pctid=98", "--inverse", "--noself"])
        last = lastself

    filtered_last = last + ".filtered"
    if need_update(last, filtered_last):
        if opts.no_strip_names:
            blastfilter_main(
                [last, "--cscore={0}".format(ccscore), "--no_strip_names"])
        else:
            blastfilter_main([last, "--cscore={0}".format(ccscore)])

    anchors = pprefix + ".anchors"
    lifted_anchors = pprefix + ".lifted.anchors"
    pdf = pprefix + ".pdf"
    if not opts.full:
        if need_update(filtered_last, lifted_anchors):
            if opts.no_strip_names:
                scan([
                    filtered_last, anchors, dist,
                    "--liftover={0}".format(last), "--no_strip_names"
                ])
            else:
                scan([
                    filtered_last, anchors, dist, "--liftover={0}".format(last)
                ])
        if quota:
            quota_main(
                [lifted_anchors, "--quota={0}".format(quota), "--screen"])
        if need_update(anchors, pdf):
            from jcvi.graphics.dotplot import dotplot_main
            dargs = [anchors]
            if opts.nostdpf:
                dargs += ["--nostdpf", "--skipempty"]
            dotplot_main(dargs)
        return

    if need_update(filtered_last, anchors):
        if opts.no_strip_names:
            scan([filtered_last, anchors, dist, "--no_strip_names"])
        else:
            scan([filtered_last, anchors, dist])

    ooanchors = pprefix + ".1x1.anchors"
    if need_update(anchors, ooanchors):
        quota_main([anchors, "--quota=1:1", "--screen"])

    lifted_anchors = pprefix + ".1x1.lifted.anchors"
    if need_update((last, ooanchors), lifted_anchors):
        if opts.no_strip_names:
            liftover([last, ooanchors, dist, "--no_strip_names"])
        else:
            liftover([last, ooanchors, dist])

    pblocks = pprefix + ".1x1.blocks"
    qblocks = qprefix + ".1x1.blocks"
    if need_update(lifted_anchors, [pblocks, qblocks]):
        mcscan([abed, lifted_anchors, "--iter=1", "-o", pblocks])
        mcscan([bbed, lifted_anchors, "--iter=1", "-o", qblocks])

    rbh = pprefix + ".rbh"
    if need_update(last, rbh):
        cscore([last, "-o", rbh])

    portho = pprefix + ".ortholog"
    qortho = qprefix + ".ortholog"
    if need_update([pblocks, qblocks, rbh], [portho, qortho]):
        make_ortholog(pblocks, rbh, portho)
        make_ortholog(qblocks, rbh, qortho)
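
Every stage of this pipeline is wrapped in need_update(source, target); in jcvi this returns True when a target is missing or stale relative to its sources. A simplified sketch of that contract (toy_need_update is illustrative, not the real implementation):

import os.path as op

def toy_need_update(source, target):
    """True if any target is absent or older than the newest source."""
    sources = [source] if isinstance(source, str) else list(source)
    targets = [target] if isinstance(target, str) else list(target)
    if not all(op.exists(t) for t in targets):
        return True
    return max(op.getmtime(s) for s in sources) > \
        min(op.getmtime(t) for t in targets)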
Example #15
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    p.set_beds()
    p.add_option(
        "--cutoff",
        default=0.3,
        type="float",
        help="The clustering cutoff to call similar",
    )

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update(dict((x, i) for x in part))
    for i, part in enumerate(sparts):
        sid.update(dict((x, i) for x in part))

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape
    pvalue_cutoff = 1e-30
    cutoff = -log(pvalue_cutoff)

    significant = []
    for i in range(m):
        for j in range(n):
            score = logmp[i, j]
            if score < cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print("|".join(a), "|".join(b), score)

    logging.debug(
        "Collected {0} PAR comparisons significant at (P < {1}).".format(
            len(significant), pvalue_cutoff))

    return significant
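
The score threshold is just the p-value cutoff moved to a natural-log scale, matching the -log(pvalue_cutoff) line above:

from math import log

pvalue_cutoff = 1e-30
cutoff = -log(pvalue_cutoff)
print(round(cutoff, 2))  # 69.08; log-scores above this pass as significant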
Example #16
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Converts the output to Fasta format.
        4. Uses this alignment info to align gene sequences using PAL2NAL.
        5. Runs PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "\
                      "e.g. ESTs [default: %default]")
    p.add_option(
        "--msa",
        default="clustalw",
        choices=("clustalw", "muscle"),
        help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print >> sys.stderr, "Incorrect arguments"
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in \
            zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator):

        print >> sys.stderr, "--------", p_rec_1.name, p_rec_2.name
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if mrtrans_fasta:
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
                    find_synonymous(mrtrans_fasta, work_dir)
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write("%s\n" % (",".join(
                    str(x) for x in (pair_name, ds_subs_yn, dn_subs_yn,
                                     ds_subs_ng, dn_subs_ng))))
                output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Example #17
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # `setup` is reused when building run.sh below, so define it regardless
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment
    try:
        import networkx
        version = networkx.version
    except (ImportError, AttributeError):
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = px not in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))
    cwd = os.getcwd()

    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Make sure we have the patched version of Extraction.py
    # See discussion <http://seqanswers.com/forums/showthread.php?t=27599>
    # This check has been removed

    # Build the pipeline
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))

    #pcmds = """find assembly -name "ref*" -exec echo \\
    #    "Assembly.py {} \\
    #    > {}/assembly.out 2> {}/assembly.err" \; > commands.list"""
    #runsh.append(pcmds)

    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents, meta="run script")
Example #18
def subset(args):
    """
    %prog subset pairsfile ksfile1 ksfile2 ... -o pairs.ks

    Subset some pre-calculated ks ka values (in ksfile) according to pairs
    in tab delimited pairsfile/anchorfile.
    """
    p = OptionParser(subset.__doc__)
    p.add_option("--noheader",
                 action="store_true",
                 help="don't write ksfile header line [default: %default]")
    p.add_option("--block",
                 action="store_true",
                 help="preserve block structure in input [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    pairsfile, ksfiles = args[0], args[1:]
    noheader = opts.noheader
    block = opts.block
    if block:
        noheader = True
    outfile = opts.outfile

    ksvals = {}
    for ksfile in ksfiles:
        ksvals.update(dict((line.name, line) for line in \
                        KsFile(ksfile, strip_names=opts.strip_names)))

    fp = open(pairsfile)
    fw = must_open(outfile, "w")

    if not noheader:
        print(fields, file=fw)

    i = j = 0
    for row in fp:
        if row[0] == '#':
            if block:
                print(row.strip(), file=fw)
            continue
        a, b = row.split()[:2]
        name = ";".join((a, b))
        if name not in ksvals:
            name = ";".join((b, a))
            if name not in ksvals:
                j += 1
                print >> fw, "\t".join((a, b, ".", "."))
                continue
        ksline = ksvals[name]
        if block:
            print("\t".join(str(x) for x in (a, b, ksline.ks)), file=fw)
        else:
            ksline.name = ";".join((a, b))
            print(ksline, file=fw)
        i += 1
    fw.close()

    logging.debug("{0} pairs not found in ksfiles".format(j))
    logging.debug("{0} ks records written to `{1}`".format(i, outfile))
    return outfile
Example #19
def completeness(args):
    """
    %prog completeness blastfile ref.fasta > outfile

    Print statistics for each gene, the coverage of the alignment onto the best hit,
    as an indicator for completeness of the gene model. For example, one might
    BLAST sugarcane ESTs against sorghum annotations as reference, to find
    full-length transcripts.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(completeness.__doc__)
    p.add_option(
        "--ids",
        help="Save ids that are over 50% complete [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    idsfile = opts.ids
    f = Sizes(fastafile).mapping

    b = BlastSlow(blastfile)
    valid = []
    data = []
    cutoff = 50
    for query, blines in groupby(b, key=lambda x: x.query):
        blines = list(blines)
        ranges = [(x.sstart, x.sstop) for x in blines]
        b = blines[0]
        query, subject = b.query, b.subject

        rmin, rmax = range_minmax(ranges)
        subject_len = f[subject]

        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        covered = (rmax - rmin + 1) * 100 / subject_len
        if covered > cutoff:
            valid.append(query)

        data.append((nterminal_dist, cterminal_dist, covered))
        print "\t".join(
            str(x)
            for x in (query, subject, nterminal_dist, cterminal_dist, covered))

    nd, cd, cv = zip(*data)
    m = "Total: {0}, Coverage > {1}%: {2}\n".\
           format(len(data), cutoff, len(valid))
    m += "N-terminal: {0}\n".format(SummaryStats(nd))
    m += "C-terminal: {0}\n".format(SummaryStats(cd))
    m += "Coverage: {0}".format(SummaryStats(cv))
    print(m, file=sys.stderr)

    if idsfile:
        fw = open(idsfile, "w")
        print >> fw, "\n".join(valid)
        logging.debug("A total of {0} ids (cov > {1} %) written to `{2}`.".\
                      format(len(valid), cutoff, idsfile))
        fw.close()
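completeness() collapses all HSPs of a query against its best hit into a single min..max span and reports that span as a percentage of the subject length. A self-contained sketch of the same arithmetic (range_minmax is re-implemented here for illustration; coordinates are invented):

def range_minmax(ranges):
    # Collapse (start, stop) ranges into the overall (min, max) span.
    starts, stops = zip(*ranges)
    return min(starts), max(stops)

ranges = [(1, 300), (250, 480), (500, 650)]   # HSP subject coordinates
subject_len = 700
rmin, rmax = range_minmax(ranges)
covered = (rmax - rmin + 1) * 100 / subject_len
print(covered)   # ~92.9 -> would pass the 50% completeness cutoff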
Example #20
0
def prepare(args):
    """
    %prog prepare *.fastq

    Generate run.sh script to run clc_novo_assemble.
    """
    from itertools import groupby

    from jcvi.assembly.base import FastqNamings, Library

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fnames = args
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    library_name = lambda x: "-".join(\
                op.basename(x).split(".")[0].split("-")[:2])
    libs = [(Library(x), sorted(fs)) for x, fs in \
                groupby(fnames, key=library_name)]

    libs.sort(key=lambda x: x[0].size)
    singletons = []
    pairs = []

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    for lib, fs in libs:
        size = lib.size
        stddev = lib.stddev

        if size == 0:
            singletons += fs
            continue

        for f in fs:

            reverse_seq = 0 if ".corr." in f else lib.reverse_seq
            fb = "bf" if reverse_seq else "fb"
            minsize, maxsize = size - 2 * stddev, size + 2 * stddev
            pair_opt = "-p {0} ss {1} {2} ".format(fb, minsize, maxsize)

            if ".1." in f:
                f = f.replace(".1.", ".?.")
                pairs.append(pair_opt + "-i {0}".format(f))
            elif ".2." in f:
                continue
            else:
                pairs.append(pair_opt + f)

    cmd = "clc_novo_assemble --cpus {0} -o contigs.fasta \\\n".format(
        opts.cpus)
    cmd += "\t-q {0} \\\n".format(" ".join(singletons))
    cmd += "\n".join("\t{0} \\".format(x) for x in pairs)

    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
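prepare() derives the library name from the first two dash-separated tokens of the basename and groups files with itertools.groupby, which only merges adjacent items, so the inputs are assumed to arrive pre-grouped. A hedged sketch with invented filenames, sorting by the key first to be safe:

import os.path as op
from itertools import groupby

fnames = ["PE-200-1.1.fastq", "PE-200-1.2.fastq", "MP-5000-1.1.fastq"]
library_name = lambda x: "-".join(op.basename(x).split(".")[0].split("-")[:2])

# groupby only merges adjacent items, so sort by the same key first
for lib, fs in groupby(sorted(fnames, key=library_name), key=library_name):
    print(lib, sorted(fs))
# MP-5000 ['MP-5000-1.1.fastq']
# PE-200 ['PE-200-1.1.fastq', 'PE-200-1.2.fastq']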
Example #21
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    The fastafile is used to get the sizes of the queries. Two filters can be
    applied: the id% and the cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov",
                 default=False,
                 action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap",
                 action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option(
        "--iterby",
        dest="iterby",
        default="query",
        choices=allowed_iterby,
        help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches +
                                    this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                     this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print(m, file=f)
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print(id, file=fw)
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print(b, file=fw)
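Without --supermap, covfilter computes non-redundant coverage via range_union, merging overlapping HSP intervals before summing so overlaps are not double-counted. A minimal re-implementation of the idea (the real helper is jcvi.utils.range.range_union; data invented):

def range_union(ranges):
    # Merge overlapping (seqid, start, stop) intervals; return total length.
    ranges = sorted(ranges)
    total = 0
    prev_id, prev_end = None, -1
    for seqid, start, stop in ranges:
        if seqid != prev_id:
            prev_id, prev_end = seqid, stop
            total += stop - start + 1
            continue
        start = max(start, prev_end + 1)   # skip the already-counted prefix
        if stop > prev_end:
            total += stop - start + 1
            prev_end = stop
    return total

print(range_union([("1", 1, 100), ("1", 50, 150)]))   # 150, not 201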
Example #22
0
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    p = OptionParser(align.__doc__)
    p.add_option("-o",
                 dest="outfile",
                 default=None,
                 help="Output prefix.cas file [default: %default]")
    p.add_option("-s",
                 dest="size",
                 default=0,
                 type="int",
                 help="Use paired end mapping with insert [default: %default]")
    p.add_option(
        "--short",
        default=False,
        action="store_true",
        help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations",
                 default="fb",
                 help="The reads have the orientations [default: %default]")
    p.add_option(
        "--fraction",
        default=0.5,
        help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity",
                 default=0.95,
                 help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        stddev = size // 4  # integer bounds for the ss insert-size window
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub,
                                                   fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:

        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
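In paired-end mode, align() derives the accepted insert range from a fixed +/-25% window: stddev is taken as size / 4 and the bounds as size +/- stddev. A tiny worked example of the arithmetic:

size = 400                      # expected insert size from -s
stddev = size // 4              # heuristic: 25% of the insert size
lb, ub = size - stddev, size + stddev
print(lb, ub)                   # 300 500 -> "-p fb ss 300 500"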
Example #23
0
def minimac(args):
    """
    %prog minimac input.txt

    Use Minimac3 to impute a VCF across all chromosomes.
    """
    p = OptionParser(minimac.__doc__)
    p.set_home("shapeit")
    p.set_home("minimac")
    p.set_outfile()
    p.set_chr()
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (txtfile, ) = args
    ref = opts.ref
    mm = MakeManager()
    pf = txtfile.split(".")[0]
    allrawvcf = []
    alloutvcf = []
    chrs = opts.chr.split(",")
    for x in chrs:
        px = CM[x]
        chrvcf = pf + ".{0}.vcf".format(px)
        if txtfile.endswith(".vcf"):
            cmd = "vcftools --vcf {0} --chr {1}".format(txtfile, x)
            cmd += " --out {0}.{1} --recode".format(pf, px)
            cmd += " && mv {0}.{1}.recode.vcf {2}".format(pf, px, chrvcf)
        else:  # 23andme
            cmd = "python -m jcvi.formats.vcf from23andme {0} {1}".format(
                txtfile, x)
            cmd += " --ref {0}".format(ref)
        mm.add(txtfile, chrvcf, cmd)

        chrvcf_hg38 = pf + ".{0}.23andme.hg38.vcf".format(px)
        minimac_liftover(mm, chrvcf, chrvcf_hg38, opts)
        allrawvcf.append(chrvcf_hg38)

        minimacvcf = "{0}.{1}.minimac.dose.vcf".format(pf, px)
        if x == "X":
            minimac_X(mm, x, chrvcf, opts)
        elif x in ["Y", "MT"]:
            cmd = "python -m jcvi.variation.impute passthrough"
            cmd += " {0} {1}".format(chrvcf, minimacvcf)
            mm.add(chrvcf, minimacvcf, cmd)
        else:
            minimac_autosome(mm, x, chrvcf, opts)

        # keep the best line for multi-allelic markers
        uniqvcf = "{0}.{1}.minimac.uniq.vcf".format(pf, px)
        cmd = "python -m jcvi.formats.vcf uniq {0} > {1}".format(
            minimacvcf, uniqvcf)
        mm.add(minimacvcf, uniqvcf, cmd)

        minimacvcf_hg38 = "{0}.{1}.minimac.hg38.vcf".format(pf, px)
        minimac_liftover(mm, uniqvcf, minimacvcf_hg38, opts)
        alloutvcf.append(minimacvcf_hg38)

    if len(allrawvcf) > 1:
        rawhg38vcfgz = pf + ".all.23andme.hg38.vcf.gz"
        cmd = "vcf-concat {0} | bgzip > {1}".format(" ".join(allrawvcf),
                                                    rawhg38vcfgz)
        mm.add(allrawvcf, rawhg38vcfgz, cmd)

    if len(alloutvcf) > 1:
        outhg38vcfgz = pf + ".all.minimac.hg38.vcf.gz"
        cmd = "vcf-concat {0} | bgzip > {1}".format(" ".join(alloutvcf),
                                                    outhg38vcfgz)
        mm.add(alloutvcf, outhg38vcfgz, cmd)

    mm.write()
Example #24
0
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump a histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size
    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K",
                 default=23,
                 type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage",
                 default=40,
                 type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix",
                 default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist",
                 default=False,
                 action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    gzip = fq.endswith(".gz")

    hashsize = totalfilesize // coverage  # jellyfish -s expects an integer
    logging.debug("Total file size: {0}, hashsize (-s): {1}".\
                    format(human_size(totalfilesize,
                           a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    fastqfiles = " ".join(fastqfiles)

    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
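The hash size passed to jellyfish count -s is estimated as total input bytes divided by the expected coverage, a rough proxy for the number of distinct k-mers. A worked example of the arithmetic (numbers invented):

totalfilesize = 12 * 10 ** 9    # 12 GB of FASTQ across all input files
coverage = 40                   # expected sequencing depth
hashsize = totalfilesize // coverage
print(hashsize)                 # 300000000 -> "jellyfish count -s 300000000"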
Example #25
0
def frommaf(args):
    """
    %prog frommaf maffile

    Convert four-column tabular format (from MAF) to VCF.
    """
    p = OptionParser(frommaf.__doc__)
    p.add_option("--validate", help="Validate coordinates against FASTA")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (maf,) = args
    snpfile = maf.rsplit(".", 1)[0] + ".vcf"
    fp = open(maf)
    fw = open(snpfile, "w")
    total = 0
    id = "."
    qual = 20
    filter = "PASS"
    info = "DP=20"
    print("##fileformat=VCFv4.0", file=fw)
    print("#CHROM POS ID REF ALT QUAL FILTER INFO".replace(" ", "\t"), file=fw)
    for row in fp:
        atoms = row.split()
        c, pos, ref, alt = atoms[:4]
        if is_number(c, int):
            c = int(c)
        else:
            continue
        c = "chr{0:02d}".format(c)
        pos = int(pos)
        print(
            "\t".join(str(x) for x in (c, pos, id, ref, alt, qual, filter, info)),
            file=fw,
        )
        total += 1
    fw.close()

    validate = opts.validate
    if not validate:
        return

    from jcvi.utils.cbook import percentage

    f = Fasta(validate)
    fp = open(snpfile)
    nsnps = 0
    for row in fp:
        if row[0] == "#":
            continue

        c, pos, id, ref, alt, qual, filter, info = row.split("\t")
        pos = int(pos)
        feat = dict(chr=c, start=pos, stop=pos)
        s = f.sequence(feat)
        s = str(s)
        assert s == ref, "Validation error: {0} is {1} (expect: {2})".format(
            feat, s, ref
        )
        nsnps += 1
        if nsnps % 50000 == 0:
            logging.debug("SNPs parsed: {0}".format(percentage(nsnps, total)))
    logging.debug(
        "A total of {0} SNPs validated and written to `{1}`.".format(nsnps, snpfile)
    )
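frommaf writes a minimal VCFv4.0 file with fixed QUAL/FILTER/INFO placeholders and chromosome names zero-padded to two digits. A standalone sketch of the per-row conversion above (the input row is invented; names avoid shadowing builtins):

vid, qual, vfilter, vinfo = ".", 20, "PASS", "DP=20"  # fixed placeholders
row = "1 12345 A T"                                   # chrom, pos, ref, alt
c, pos, ref, alt = row.split()[:4]
c = "chr{0:02d}".format(int(c))                       # 1 -> chr01
print("\t".join(str(x)
                for x in (c, int(pos), vid, ref, alt, qual, vfilter, vinfo)))
# chr01   12345   .   A   T   20   PASS   DP=20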
Example #26
0
def histogram(args):
    """
    %prog histogram meryl.histogram species N

    Plot the histogram based on the meryl K-mer distribution; species and N
    are only used to annotate the graphic. Find totalKmers by running
    kmer.meryl().
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    fp = open(histfile)
    hist = {}
    totalKmers = 0

    # Guess the format of the Kmer histogram
    for row in fp:
        if row.startswith("# 1:"):
            kformat = KALLPATHS
            break
        if len(row.split()) == 1:
            kformat = KSOAP
            break
    fp.seek(0)

    logging.debug("Guessed format: {0}".format(kformats[kformat]))

    data = []
    for rowno, row in enumerate(fp):
        if row[0] == '#':
            continue
        if kformat == KSOAP:
            K = rowno + 1
            counts = int(row.strip())
        else:  # meryl histogram
            K, counts = row.split()[:2]
            K, counts = int(K), int(counts)

        Kcounts = K * counts
        totalKmers += Kcounts
        hist[K] = Kcounts
        data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = Total_Kmers * 1. / Kmer_coverage / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    counts = sorted((a, b) for a, b in hist.items() \
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(x, y) for x, y in counts if x in t]
    x, y = zip(*tcounts)
    plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)
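The genome size estimate above is simply total k-mers divided by the k-mer coverage (the main peak ks.max2, unless overridden with --coverage). A worked example with invented numbers:

total_kmers = 2500000000       # sum of K * count over the histogram
kmer_coverage = 50             # x-position of the main peak
genome_size_mb = total_kmers / kmer_coverage / 1e6
print("{0:.1f}Mb".format(genome_size_mb))   # 50.0Mb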
Example #27
0
def adjgraph(args):
    """
    %prog adjgraph adjacency.txt subgraph.txt

    Construct an adjacency graph for Graphviz. The file may look like the
    sample below. The lines of numbers are chromosomes with gene order
    information.

    genome 0
    chr 0
    -1 -13 -16 3 4 -6126 -5 17 -6 7 18 5357 8 -5358 5359 -9 -10 -11 5362 5360
    chr 1
    138 6133 -5387 144 -6132 -139 140 141 146 -147 6134 145 -170 -142 -143
    """
    import pygraphviz as pgv
    from jcvi.utils.iter import pairwise
    from jcvi.formats.base import SetFile

    p = OptionParser(adjgraph.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    infile, subgraph = args
    subgraph = SetFile(subgraph)
    subgraph = set(x.strip("-") for x in subgraph)

    G = pgv.AGraph(strict=False)  # allow multi-edge
    SG = pgv.AGraph(strict=False)

    palette = ("green", "magenta", "tomato", "peachpuff")
    fp = open(infile)
    genome_id = -1
    key = 0
    for row in fp:
        if row.strip() == "":
            continue

        atoms = row.split()
        tag = atoms[0]
        if tag in ("ChrNumber", "chr"):
            continue

        if tag == "genome":
            genome_id += 1
            gcolor = palette[genome_id]
            continue

        nodeseq = []
        for p in atoms:
            np = p.strip("-")
            nodeL, nodeR = np + "L", np + "R"
            if p[0] == "-":  # negative strand
                nodeseq += [nodeR, nodeL]
            else:
                nodeseq += [nodeL, nodeR]

        for a, b in pairwise(nodeseq):
            G.add_edge(a, b, key, color=gcolor)
            key += 1

            na, nb = a[:-1], b[:-1]
            if na not in subgraph and nb not in subgraph:
                continue

            SG.add_edge(a, b, key, color=gcolor)

    G.graph_attr.update(dpi="300")

    fw = open("graph.dot", "w")
    G.write(fw)
    fw.close()

    fw = open("subgraph.dot", "w")
    SG.write(fw)
    fw.close()
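adjgraph() encodes each gene as two nodes, an L and an R end, so orientation survives in the graph: a forward gene contributes L then R, a reversed gene R then L, and consecutive ends become edges. A standalone sketch of the encoding (pairwise is re-implemented here since jcvi.utils.iter may not be available; gene order invented):

from itertools import tee

def pairwise(iterable):
    # consecutive pairs: (a, b), (b, c), ...
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

chromosome = ["-1", "3", "-5"]          # signed gene order on one chromosome
nodeseq = []
for gene in chromosome:
    name = gene.strip("-")
    nodeL, nodeR = name + "L", name + "R"
    nodeseq += [nodeR, nodeL] if gene[0] == "-" else [nodeL, nodeR]

print(list(pairwise(nodeseq)))
# [('1R', '1L'), ('1L', '3L'), ('3L', '3R'), ('3R', '5R'), ('5R', '5L')]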
Example #28
0
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize",
                 default=10,
                 type="int",
                 help="Only segment using blocks >= size")
    p.add_option("--path",
                 default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    id = 0
    for block in ac.iter_blocks(minsize=minsize):
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        id += 1

        qr = Range("0", minq, maxq, maxq - minq, id)
        sr = Range("0", mins, maxs, maxs - mins, id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    header = ["o"] + spadnames
    print("\t".join(header), file=fw)
    for i in range(m):
        row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
        print("\t".join(row), file=fw)

    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
Example #29
0
def segment(args):
    """
    %prog segment loss.ids bedfile

    Merge adjacent gene losses into segmental losses.

    Then, based on the segmental losses, estimate the amount of DNA lost in
    base pairs. Two estimates are given:
    - conservative: just within the start and end of a single gene
    - aggressive: extend the deletion track to the next gene

    The real deletion size lies between these two estimates.
    """
    from jcvi.formats.base import SetFile

    p = OptionParser(segment.__doc__)
    p.add_option(
        "--chain",
        default=1,
        type="int",
        help="Allow next N genes to be chained",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    idsfile, bedfile = args
    bed = Bed(bedfile)
    order = bed.order
    ids = SetFile(idsfile)
    losses = Grouper()
    skip = opts.chain
    for i, a in enumerate(bed):
        a = a.accn
        for j in range(i + 1, i + 1 + skip):
            if j >= len(bed):
                break
            b = bed[j].accn
            if a in ids:
                losses.join(a, a)
            if a in ids and b in ids:
                losses.join(a, b)

    losses = list(losses)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm, nt = len(singletons), len(segments), len(losses)
    assert ns + nm == nt

    # Summary for all segments
    for x in sorted(singletons) + sorted(segments):
        print(
            "\t".join(
                str(x)
                for x in ("|".join(sorted(x)), len(x), estimate_size(x, bed, order))
            )
        )

    # Find longest segment stretch
    if segments:
        mx, maxsegment = max([(len(x), x) for x in segments])
        print("Longest stretch: run of {0} genes".format(mx), file=sys.stderr)
        print("  {0}".format("|".join(sorted(maxsegment))), file=sys.stderr)
        seg_asize = sum(estimate_size(x, bed, order) for x in segments)
        seg_bsize = sum(
            estimate_size(x, bed, order, conservative=False) for x in segments
        )
    else:
        seg_asize = seg_bsize = 0

    sing_asize = sum(estimate_size(x, bed, order) for x in singletons)
    sing_bsize = sum(
        estimate_size(x, bed, order, conservative=False) for x in singletons
    )
    total_asize = sing_asize + seg_asize
    total_bsize = sing_bsize + seg_bsize
    print(
        "Singleton ({0}): {1} - {2} bp".format(ns, sing_asize, sing_bsize),
        file=sys.stderr,
    )
    print(
        "Segment ({0}): {1} - {2} bp".format(nm, seg_asize, seg_bsize), file=sys.stderr
    )
    print(
        "Total ({0}): {1} - {2} bp".format(nt, total_asize, total_bsize),
        file=sys.stderr,
    )
    print(
        "Average ({0}): {1} bp".format(nt, (total_asize + total_bsize) / 2),
        file=sys.stderr,
    )
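segment() chains lost genes with a Grouper (union-find): each lost gene joins itself and merges with any lost gene within --chain positions downstream in bed order. A simplified linear scan that reproduces the same grouping on invented data:

bed_order = ["g1", "g2", "g3", "g4", "g5", "g6"]   # genes in bed order
ids = {"g1", "g2", "g4", "g6"}                     # lost genes
chain = 1                                          # allow next N genes to chain

segments, current, last_idx = [], [], None
for i, g in enumerate(bed_order):
    if g not in ids:
        continue
    if last_idx is not None and i - last_idx <= chain:
        current.append(g)               # within reach: extend the segment
    else:
        if current:
            segments.append(current)
        current = [g]                   # too far: start a new segment
    last_idx = i
if current:
    segments.append(current)

print(segments)   # [['g1', 'g2'], ['g4'], ['g6']]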
Example #30
0
def enrich(args):
    """
    %prog enrich omgfile groups ntaxa > enriched.omg

    Enrich OMG output by pulling in genes missed by OMG.
    """
    p = OptionParser(enrich.__doc__)
    p.add_option("--ghost",
                 default=False,
                 action="store_true",
                 help="Add ghost homologs already used [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    omgfile, groupsfile, ntaxa = args
    ntaxa = int(ntaxa)
    ghost = opts.ghost

    # Get gene pair => weight mapping
    weights = get_edges()
    info = get_info()
    # Get gene => taxon mapping
    info = dict((k, v.split()[5]) for k, v in info.items())

    groups = Grouper()

    fp = open(groupsfile)
    for row in fp:
        members = row.strip().split(",")
        groups.join(*members)

    logging.debug("Imported {0} families with {1} members.".\
                    format(len(groups), groups.num_members))

    seen = set()
    omggroups = Grouper()
    fp = open(omgfile)
    for row in fp:
        genes, idxs = row.split()
        genes = genes.split(",")
        seen.update(genes)
        omggroups.join(*genes)

    nmembers = omggroups.num_members
    logging.debug("Imported {0} OMG families with {1} members.".\
                    format(len(omggroups), nmembers))
    assert nmembers == len(seen)

    alltaxa = set(str(x) for x in range(ntaxa))
    recruited = []
    fp = open(omgfile)
    for row in fp:
        genes, idxs = row.split()
        genes = genes.split(",")
        a = genes[0]

        idxs = set(idxs.split(","))
        missing_taxa = alltaxa - idxs
        if not missing_taxa:
            print(row.rstrip())
            continue

        leftover = groups[a]
        if not ghost:
            leftover = set(leftover) - seen

        if not leftover:
            print(row.rstrip())
            continue

        leftover_sorted_by_taxa = dict(
            (k, [x for x in leftover if info[x] == k]) for k in missing_taxa)

        #print genes, leftover
        #print leftover_sorted_by_taxa
        solutions = []
        for solution in product(*leftover_sorted_by_taxa.values()):
            score = sum(
                weights.get((a, b), 0) for a in solution for b in genes)
            if score == 0:
                continue
            score += sum(
                weights.get((a, b), 0) for a, b in combinations(solution, 2))
            solutions.append((score, solution))
            #print solution, score

        best_solution = max(solutions) if solutions else None
        if best_solution is None:
            print(row.rstrip())
            continue

        #print "best ==>", best_solution
        best_score, best_addition = best_solution
        genes.extend(best_addition)
        recruited.extend(best_addition)

        genes = sorted([(info[x], x) for x in genes])
        idxs, genes = zip(*genes)

        if ghost:  # decorate additions so it's clear that they were added
            pgenes = []
            for g in genes:
                if g in recruited and g in seen:
                    pgenes.append("|{0}|".format(g))
                else:
                    pgenes.append(g)
            genes = pgenes

        print("\t".join((",".join(genes), ",".join(idxs))))
        if not ghost:
            seen.update(best_addition)

    logging.debug("Recruited {0} new genes.".format(len(recruited)))