Example #1
def readlen(args):
    """
    %prog readlen fastqfile

    Calculate read length, using only the first N reads. Output min, max,
    mean, and median for each file.
    """
    p = OptionParser(readlen.__doc__)
    p.set_firstN()
    p.add_option("--silent", default=False, action="store_true", help="Do not print read length stats")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    f, = args
    if not is_fastq(f):
        logging.debug("File `{0}` does not endswith .fastq or .fq")
        return 0

    s = calc_readlen(f, opts.firstN)
    if not opts.silent:
        print "\t".join(str(x) for x in (f, s.min, s.max, s.mean, s.median))

    return int(s.max)
Example #2
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from a web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
Example #3
def unitigs(args):
    """
    %prog unitigs best.edges

    Read Celera Assembler's "best.edges" and extract all unitigs.
    """
    p = OptionParser(unitigs.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr, directed=True)
    H = nx.Graph()
    intconv = lambda x: int(x.split("-")[0])
    for k, v in G.iteritems():
        if k == G.get(v, None):
            H.add_edge(intconv(k), intconv(v))

    nunitigs = nreads = 0
    for h in nx.connected_component_subgraphs(H, copy=False):
        st = [x for x in h if h.degree(x) == 1]
        if len(st) != 2:
            continue
        src, target = st
        path = list(nx.all_simple_paths(h, src, target))
        assert len(path) == 1
        path, = path
        print "|".join(str(x) for x in path)
        nunitigs += 1
        nreads += len(path)
    logging.debug("A total of {0} unitigs built from {1} reads.".format(nunitigs, nreads))
Example #4
def trim(args):
    """
    %prog trim fastqfile

    Wraps `fastx_trimmer` to trim from the beginning or end of reads.
    """
    p = OptionParser(trim.__doc__)
    p.add_option("-f", dest="first", default=0, type="int", help="First base to keep. Default is 1.")
    p.add_option("-l", dest="last", default=0, type="int", help="Last base to keep. Default is entire read.")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    obfastqfile = op.basename(fastqfile)
    fq = obfastqfile.rsplit(".", 1)[0] + ".ntrimmed.fastq"
    if fastqfile.endswith(".gz"):
        fq = obfastqfile.rsplit(".", 2)[0] + ".ntrimmed.fastq.gz"

    cmd = "fastx_trimmer -Q33 "
    if opts.first:
        cmd += "-f {0.first} ".format(opts)
    if opts.last:
        cmd += "-l {0.last} ".format(opts)

    sh(cmd, infile=fastqfile, outfile=fq)
Example #5
def histogram(args):
    """
    %prog histogram *.gff

    Plot gene statistics based on output of stats. For each gff file, look to
    see if the metrics folder (e.g. Exon_Length) contains the data and plot
    them.
    """
    from jcvi.graphics.histogram import histogram_multiple

    p = OptionParser(histogram.__doc__)
    p.add_option("--bins", dest="bins", default=40, type="int",
            help="number of bins to plot in the histogram [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    metrics = ("Exon_Length", "Intron_Length", "Gene_Length", "Exon_Count")
    colors = ("red", "green", "blue", "black")
    vmaxes = (1000, 1000, 4000, 20)
    xlabels = ("bp", "bp", "bp", "number")
    for metric, color, vmax, xlabel in zip(metrics, colors, vmaxes, xlabels):
        logging.debug("Parsing files in `{0}`..".format(metric))
        numberfiles = [op.join(metric, op.basename(x).split(".")[0] + ".txt") \
                        for x in gff_files]

        histogram_multiple(numberfiles, 0, vmax, xlabel, metric,
                       bins=opts.bins, facet=True, fill=color,
                       prefix=metric + ".")
Example #6
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
Example #7
def ids(args):
    """
    %prog ids cdhit.clstr

    Get the representative ids from clstr file.
    """
    p = OptionParser(ids.__doc__)
    p.add_option("--prefix", type="int",
                 help="Find rep id for prefix of len [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    cf = ClstrFile(clstrfile)
    prefix = opts.prefix
    if prefix:
        reads = list(cf.iter_reps_prefix(prefix=prefix))
    else:
        reads = list(cf.iter_reps())

    nreads = len(reads)
    idsfile = clstrfile.replace(".clstr", ".ids")
    fw = open(idsfile, "w")
    for i, name in reads:
        print("\t".join(str(x) for x in (i, name)), file=fw)

    logging.debug("A total of {0} unique reads written to `{1}`.".\
            format(nreads, idsfile))
    fw.close()

    return idsfile
Example #8
def contamination(args):
    """
    %prog contamination folder Ecoli.fasta

    Remove contaminated reads. The FASTQ files in the folder will be
    automatically paired and filtered against Ecoli.fasta to remove
    contaminants using BOWTIE2.
    """
    from jcvi.apps.bowtie import align

    p = OptionParser(contamination.__doc__)
    p.add_option("--mapped", default=False, action="store_true",
                 help="Retain contaminated reads instead [default: %default]")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, ecoli = args
    ecoli = get_abs_path(ecoli)
    tag = "--mapped" if opts.mapped else "--unmapped"
    for p, pf in iter_project(folder, 2):
        align_opts = [ecoli] + p + [tag]
        align_opts += ["--cutoff={0}".format(opts.cutoff), "--null"]
        if opts.mateorientation:
            align_opts += ["--mateorientation={0}".format(opts.mateorientation)]
        samfile, logfile = align(align_opts)
Example #9
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)

    oldtwobit, newtwobit = twobitfiles
    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
                format(opts.minscore, opts.minid)
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
Example #10
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    goodfasta, = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    fw = open(extbed, "w")
    for bac, size in sizes.iter_sizes():
        print >> fw, "\t".join(str(x) for x in \
                               (bac, 0, min(flank, size), bac + "L"))
        print >> fw, "\t".join(str(x) for x in \
                               (bac, max(size - flank, 0), size, bac + "R"))
    fw.close()

    fastaFromBed(extbed, goodfasta, name=True)
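
A worked example of the two flank intervals written per BAC above, with a
hypothetical name and size; for sequences shorter than 2 * flank the two
intervals overlap, as they do here:

bac, size, flank = "BAC001", 8000, 5000
left = (bac, 0, min(flank, size), bac + "L")          # ('BAC001', 0, 5000, 'BAC001L')
right = (bac, max(size - flank, 0), size, bac + "R")  # ('BAC001', 3000, 8000, 'BAC001R')
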
Example #11
def paste(args):
    """
    %prog paste flanks.bed flanks_vs_assembly.blast backbone.fasta

    Paste in good sequences in the final assembly.
    """
    from jcvi.formats.bed import uniq

    p = OptionParser(paste.__doc__)
    p.add_option("--maxsize", default=300000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.set_rclip(rclip=1)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pbed, blastfile, bbfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    order = Bed(pbed).order

    beforebed, afterbed = blast_to_twobeds(blastfile, order, log=True,
                                           rclip=opts.rclip, maxsize=maxsize,
                                           flipbeds=True)
    beforebed = uniq([beforebed])

    afbed = Bed(beforebed)
    bfbed = Bed(afterbed)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
Example #12
def batchcn(args):
    """
    %prog batchcn workdir samples.csv

    Run CNV segmentation caller in batch mode. Scans a workdir.
    """
    p = OptionParser(batchcn.__doc__)
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    workdir, samples = args
    upload = opts.upload
    store = upload + "/{}/*.seg".format(workdir)
    computed = [op.basename(x).split(".")[0] for x in glob_s3(store)]
    computed = set(computed)

    # Generate a bunch of cn commands
    fp = open(samples)
    nskipped = ntotal = 0
    cmd = "python -m jcvi.variation.cnv cn --hmm --cleanup {}".format(workdir)
    for row in fp:
        samplekey, path = row.strip().split(",")
        ntotal += 1
        if samplekey in computed:
            nskipped += 1
            continue
        print " ".join((cmd, samplekey, path))

    logging.debug("Skipped: {}".format(percentage(nskipped, ntotal)))
Example #13
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--checkvalid", default=False, action="store_true",
                help="Check minscore, period and length")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    fp = open(refbed)
    for i, row in enumerate(fp):
        s = STRLine(row)
        seq = genome[s.seqid][s.start - 1: s.end].upper()
        s.motif = get_motif(seq, len(s.motif))
        s.fix_counts(seq)
        if opts.checkvalid and not s.is_valid():
            continue
        edits.append(s)
        if i % 10000 == 0:
            print >> sys.stderr, i, "lines read"

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        print str(e)
Example #14
def getgenes(args):
    """
    %prog getgenes [--options]

    Read GenBank file, or retrieve from web.
    Output bed, cds files, and pep file (can turn off with --nopep).
    Either --gb_dir or --id/--simple should be provided.
    """
    p = OptionParser(getgenes.__doc__)
    p.add_option("--prefix", default="gbout",
            help="prefix of output files [default: %default]")
    p.add_option("--nopep", default=False, action="store_true",
            help="Only get cds and bed, no pep [default: %default]")
    filenames, accessions, idfile, opts, args = preparegb(p, args)
    prefix = opts.prefix

    GenBank(filenames=filenames, accessions=accessions, idfile=idfile).\
        write_genes(output=prefix, individual=opts.individual, \
        pep=(not opts.nopep))

    if opts.individual:
        logging.debug("Output written dir {0}".format(prefix))
    elif opts.nopep:
        logging.debug("Output written to {0}.bed, {0}.cds".format(prefix,))
    else:
        logging.debug("Output written to {0}.bed, {0}.cds, {0}.pep".format(prefix,))
Example #15
def cib(args):
    """
    %prog cib bamfile samplekey

    Convert BAM to CIB (a binary storage of int8 per base).
    """
    p = OptionParser(cib.__doc__)
    p.add_option("--prefix", help="Report seqids with this prefix only")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, samplekey = args
    mkdir(samplekey)
    bam = pysam.AlignmentFile(bamfile, "rb")
    refs = [x for x in bam.header["SQ"]]
    prefix = opts.prefix
    if prefix:
        refs = [x for x in refs if x["SN"].startswith(prefix)]

    task_args = []
    for r in refs:
        task_args.append((bamfile, r, samplekey))
    cpus = min(opts.cpus, len(task_args))
    logging.debug("Use {} cpus".format(cpus))

    p = Pool(processes=cpus)
    for res in p.imap(bam_to_cib, task_args):
        continue
Example #16
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    p = OptionParser(count.__doc__)
    p.add_option("--type", default="exon",
                 help="Only count feature type")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    cpus = opts.cpus
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    if not need_update(bamfile, countfile):
        return

    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        cmd = "samtools sort -@ {0} -n {1} {2}".format(cpus, bamfile, nsorted)
        sh(cmd)
        cmd = "samtools view -@ {0} -h {1}".format(cpus, nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " -t {0}".format(opts.type)
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
Example #17
def consensus(args):
    """
    %prog consensus fastafile bamfile

    Convert bam alignments to consensus FASTQ/FASTA.
    """
    p = OptionParser(consensus.__doc__)
    p.add_option("--fasta", default=False, action="store_true",
            help="Generate consensus FASTA sequences [default: %default]")
    p.add_option("--mask", default=0, type="int",
            help="Mask bases with quality lower than")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    fasta = opts.fasta
    suffix = "fasta" if fasta else "fastq"
    pf = bamfile.rsplit(".", 1)[0]
    cnsfile = pf + ".cns.{0}".format(suffix)
    vcfgzfile = pf + ".vcf.gz"
    vcf([fastafile, bamfile, "-o", vcfgzfile])
    cmd += "zcat {0} | vcfutils.pl vcf2fq".format(vcfgzfile)
    if fasta:
        cmd += " | seqtk seq -q {0} -A -".format(opts.mask)

    sh(cmd, outfile=cnsfile)
Example #18
def bed(args):
    """
    %prog bed xmlfile

    Print summary of optical map alignment in BED format.
    """
    from jcvi.formats.bed import sort

    p = OptionParser(bed.__doc__)
    p.add_option("--blockonly", default=False, action="store_true",
                 help="Only print out large blocks, not fragments [default: %default]")
    p.add_option("--point", default=False, action="store_true",
                 help="Print accesssion as single point instead of interval")
    p.add_option("--scale", type="float",
                 help="Scale the OM distance by factor")
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort bed [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    xmlfile, = args
    bedfile = xmlfile.rsplit(".", 1)[0] + ".bed"

    om = OpticalMap(xmlfile)
    om.write_bed(bedfile, point=opts.point, scale=opts.scale,
                          blockonly=opts.blockonly, switch=opts.switch)

    if not opts.nosort:
        sort([bedfile, "--inplace"])
Example #19
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
Example #20
def chain(args):
    """
    %prog chain blastfile

    Chain adjacent HSPs together to form larger HSPs. The adjacent HSPs have to
    share the same orientation.
    """
    p = OptionParser(chain.__doc__)
    p.add_option("--dist", dest="dist",
            default=100, type="int",
            help="extent of flanking regions to search [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    dist = opts.dist
    assert dist > 0

    blast = BlastSlow(blastfile)
    chained_hsps = chain_HSPs(blast, xdist=dist, ydist=dist)

    for b in chained_hsps:
        print b
Example #21
def top10(args):
    """
    %prog top10 blastfile.best

    Count the most frequent 10 hits. Usually the BLASTFILE needs to be screened
    to get the best match. You can also provide an .ids file to query the ids.
    For example, the ids file can contain the seqid to species mapping.

    The ids file is two-column, and can sometimes be generated by
    `jcvi.formats.fasta ids --description`.
    """
    from jcvi.formats.base import DictFile

    p = OptionParser(top10.__doc__)
    p.add_option("--top", default=10, type="int",
                help="Top N taxa to extract [default: %default]")
    p.add_option("--ids", default=None,
                help="Two column ids file to query seqid [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    mapping = DictFile(opts.ids, delimiter="\t") if opts.ids else {}

    cmd = "cut -f2 {0}".format(blastfile)
    cmd += " | sort | uniq -c | sort -k1,1nr | head -n {0}".format(opts.top)
    fp = popen(cmd)
    for row in fp:
        count, seqid = row.split()
        nseqid = mapping.get(seqid, seqid)
        print "\t".join((count, nseqid))
Example #22
def annotation(args):
    """
    %prog annotation blastfile > annotations

    Create simple two-column files from the first two columns in blastfile. Use
    --queryids and --subjectids to switch IDs or descriptions.
    """
    from jcvi.formats.base import DictFile

    p = OptionParser(annotation.__doc__)
    p.add_option("--queryids", help="Query IDS file to switch [default: %default]")
    p.add_option("--subjectids", help="Subject IDS file to switch [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    d = "\t"
    qids = DictFile(opts.queryids, delimiter=d) if opts.queryids else None
    sids = DictFile(opts.subjectids, delimiter=d) if opts.subjectids else None
    blast = Blast(blastfile)
    for b in blast:
        query, subject = b.query, b.subject
        if qids:
            query = qids[query]
        if sids:
            subject = sids[subject]
        print "\t".join((query, subject))
Example #23
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_option("--hang", default=500, type="int",
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print "{0}\t{1}".format(b, Overlap_types[ov.otype])
Example #24
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    p = OptionParser(phytozome.__doc__)
    p.add_option("--version", default="9.0",
                 help="Phytozome version [default: %default]")
    p.add_option("--assembly", default=False, action="store_true",
                 help="Download assembly [default: %default]")
    opts, args = p.parse_args(args)

    url = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/".\
                    format(opts.version)
    valid_species = [x for x in ls_ftp(url) if "." not in x]

    doc = "\n".join((phytozome.__doc__, tile(valid_species)))
    p.set_usage(doc)

    if len(args) != 1:
        sys.exit(not p.print_help())

    species, = args
    species = species.split(",")
    for s in species:
        download_species_phytozome(s, valid_species, url, assembly=opts.assembly)
Example #25
def bed(args):
    """
    %prog bed blastfile

    Print out bed file based on coordinates in BLAST report. By default, write
    out subject positions. Use --swap to write query positions.
    """
    p = OptionParser(bed.__doc__)
    p.add_option("--swap", default=False, action="store_true",
                 help="Write query positions [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    swap = opts.swap

    fp = must_open(blastfile)
    bedfile = blastfile.rsplit(".", 1)[0] + ".bed"
    fw = open(bedfile, "w")
    for row in fp:
        b = BlastLine(row)
        if swap:
            b = b.swapped
        print >> fw, b.bedline

    logging.debug("File written to `{0}`.".format(bedfile))

    return bedfile
Example #26
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best", default=1, type="int", help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    fp = open(m4file)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    retained = total = 0
    for row in fp:
        r = M4Line(row)
        total += 1
        if total % 100000 == 0:
            logging.debug("Retained {0} lines".\
                            format(percentage(retained, total)))
        if seen.get(r.query, 0) < best:
            fw.write(row)
            seen[r.query] += 1
            retained += 1
    fw.close()
Example #27
def ensembl(args):
    """
    %prog ensembl species

    Retrieve genomes and annotations from ensembl FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog ensembl danio_rerio,gasterosteus_aculeatus
    """
    p = OptionParser(ensembl.__doc__)
    p.add_option("--version", default="75",
                 help="Ensembl version [default: %default]")
    opts, args = p.parse_args(args)

    version = opts.version
    url = "ftp://ftp.ensembl.org/pub/release-{0}/".format(version)
    fasta_url = url + "fasta/"

    valid_species = [x for x in ls_ftp(fasta_url) if "." not in x]
    doc = "\n".join((ensembl.__doc__, tile(valid_species)))
    p.set_usage(doc)

    if len(args) != 1:
        sys.exit(not p.print_help())

    species, = args
    species = species.split(",")
    for s in species:
        download_species_ensembl(s, valid_species, url)
Example #28
def link(args):
    """
    %prog link metafile

    Link source to target based on a tabular file.
    """
    from jcvi.apps.base import mkdir

    p = OptionParser(link.__doc__)
    p.add_option("--dir",
                 help="Place links in a subdirectory [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    meta, = args
    d = opts.dir
    if d:
        mkdir(d)

    fp = open(meta)
    for row in fp:
        source, target = row.split()
        source = get_abs_path(source)
        if d:
            target = op.join(d, target)
        lnsf(source, target, log=True)
Example #29
def rebuild(args):
    """
    %prog rebuild blocksfile blastfile

    Rebuild anchors file from pre-built blocks file.
    """
    p = OptionParser(rebuild.__doc__)
    p.add_option("--header", default=False, action="store_true",
                 help="First line is header [default: %default]")
    p.add_option("--write_blast", default=False, action="store_true",
                 help="Get blast records of rebuilt anchors [default: %default]")
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blocksfile, blastfile = args
    bk = BlockFile(blocksfile, header=opts.header)
    fw = open("pairs", "w")
    for a, b, h in bk.iter_all_pairs():
        print >> fw, "\t".join((a, b))
    fw.close()

    if opts.write_blast:
        AnchorFile("pairs").blast(blastfile, "pairs.blast")

    fw = open("tracks", "w")
    for g, col in bk.iter_gene_col():
        print >> fw, "\t".join(str(x) for x in (g, col))
    fw.close()
Example #30
def summary(args):
    """
    %prog summary anchorfile

    Provide statistics for pairwise blocks.
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(summary.__doc__)
    p.add_option("--prefix", help="Generate per block stats [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    ac = AnchorFile(anchorfile)
    clusters = ac.blocks

    nclusters = len(clusters)
    nanchors = [len(c) for c in clusters]
    nranchors = [_score(c) for c in clusters]  # non-redundant anchors
    print >> sys.stderr, "A total of {0} (NR:{1}) anchors found in {2} clusters.".\
                  format(sum(nanchors), sum(nranchors), nclusters)
    print >> sys.stderr, "Stats:", SummaryStats(nanchors)
    print >> sys.stderr, "NR stats:", SummaryStats(nranchors)

    prefix = opts.prefix
    if prefix:
        pad = len(str(nclusters))
        for i, c in enumerate(clusters):
            block_id = "{0}{1:0{2}d}".format(prefix, i + 1, pad)
            print "\t".join((block_id, str(len(c))))
Example #31
def ld(args):
    """
    %prog ld map

    Calculate pairwise linkage disequilibrium given MSTmap.
    """
    import numpy as np
    from random import sample

    from jcvi.algorithms.matrix import symmetrize

    p = OptionParser(ld.__doc__)
    p.add_option(
        "--subsample",
        default=1000,
        type="int",
        help="Subsample markers to speed up",
    )
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (mstmap, ) = args
    subsample = opts.subsample
    data = MSTMap(mstmap)

    markerbedfile = mstmap + ".subsample.bed"
    ldmatrix = mstmap + ".subsample.matrix"
    # Take random subsample while keeping marker order
    if subsample < data.nmarkers:
        data = [data[x] for x in sorted(sample(range(len(data)), subsample))]
    else:
        logging.debug("Use all markers, --subsample ignored")

    nmarkers = len(data)
    if need_update(mstmap, (ldmatrix, markerbedfile)):
        fw = open(markerbedfile, "w")
        print("\n".join(x.bedline for x in data), file=fw)
        logging.debug("Write marker set of size {0} to file `{1}`.".format(
            nmarkers, markerbedfile))
        fw.close()

        M = np.zeros((nmarkers, nmarkers), dtype=float)
        for i, j in combinations(range(nmarkers), 2):
            a = data[i]
            b = data[j]
            M[i, j] = calc_ldscore(a.genotype, b.genotype)

        M = symmetrize(M)

        logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix))
        M.tofile(ldmatrix)
    else:
        nmarkers = len(Bed(markerbedfile))
        M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
        logging.debug("LD matrix `{0}` exists ({1}x{1}).".format(
            ldmatrix, nmarkers))

    from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap

    plt.rcParams["axes.linewidth"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])  # the heatmap

    ax.matshow(M, cmap=iopts.cmap)

    # Plot chromosomes breaks
    bed = Bed(markerbedfile)
    xsize = len(bed)
    extent = (0, nmarkers)
    chr_labels = []
    ignore_size = 20

    for (seqid, beg, end) in bed.get_breaks():
        ignore = abs(end - beg) < ignore_size
        pos = (beg + end) / 2
        chr_labels.append((seqid, pos, ignore))
        if ignore:
            continue
        ax.plot((end, end), extent, "w-", lw=1)
        ax.plot(extent, (end, end), "w-", lw=1)

    # Plot chromosome labels
    for label, pos, ignore in chr_labels:
        pos = 0.1 + pos * 0.8 / xsize
        if not ignore:
            root.text(pos,
                      0.91,
                      label,
                      ha="center",
                      va="bottom",
                      rotation=45,
                      color="grey")
            root.text(0.09, pos, label, ha="right", va="center", color="grey")

    ax.set_xlim(extent)
    ax.set_ylim(extent)
    ax.set_axis_off()

    draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=iopts.cmap)

    root.add_patch(Rectangle((0.1, 0.1), 0.8, 0.8, fill=False, ec="k", lw=2))
    m = mstmap.split(".")[0]
    root.text(0.5,
              0.06,
              "Linkage Disequilibrium between {0} markers".format(m),
              ha="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = m + ".subsample" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
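
The LD matrix above is cached as a raw binary dump; numpy's tofile/fromfile
store no shape header, which is why the marker bed file is re-read to recover
the dimension. A minimal round-trip sketch (file name is hypothetical):

import numpy as np

M = np.zeros((3, 3))
M[0, 1] = 0.5
M.tofile("demo.matrix")  # raw float64 bytes, no shape stored
M2 = np.fromfile("demo.matrix", dtype="float").reshape(3, 3)
assert (M == M2).all()
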
Example #32
def filter(args):
    """
    %prog filter test.blast

    Produce a new blast file and filter based on:
    - score: >= cutoff
    - pctid: >= cutoff
    - hitlen: >= cutoff
    - evalue: <= cutoff
    - ids: valid ids
    - noself: remove self-self hits

    Use --inverse to obtain the complementary records for the criteria above.
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--score",
                 dest="score",
                 default=0,
                 type="int",
                 help="Score cutoff")
    p.set_align(pctid=95, hitlen=100, evalue=.01)
    p.add_option("--noself",
                 default=False,
                 action="store_true",
                 help="Remove self-self hits")
    p.add_option("--ids", help="Path to file with ids to retain")
    p.add_option("--inverse",
                 default=False,
                 action="store_true",
                 help="Similar to grep -v, inverse")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    if opts.ids:
        ids = set()
        for row in must_open(opts.ids):
            if row[0] == "#":
                continue
            row = row.replace(",", "\t")
            ids.update(row.split())
    else:
        ids = None

    blastfile, = args
    inverse = opts.inverse
    outfile = opts.outfile
    fp = must_open(blastfile)

    score, pctid, hitlen, evalue, noself = \
            opts.score, opts.pctid, opts.hitlen, opts.evalue, opts.noself
    newblastfile = blastfile + ".P{0}L{1}".format(int(pctid), hitlen) if \
                    outfile is None else outfile
    if inverse:
        newblastfile += ".inverse"
    fw = must_open(newblastfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        c = BlastLine(row)

        if ids:
            if c.query in ids and c.subject in ids:
                noids = False
            else:
                noids = True
        else:
            noids = None

        remove = c.score < score or \
            c.pctid < pctid or \
            c.hitlen < hitlen or \
            c.evalue > evalue or \
            noids

        if inverse:
            remove = not remove

        remove = remove or (noself and c.query == c.subject)

        if not remove:
            print >> fw, row.rstrip()

    fw.close()

    return newblastfile
Example #33
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied: the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov",
                 default=False,
                 action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap",
                 action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option(
        "--iterby",
        dest="iterby",
        default="query",
        choices=allowed_iterby,
        help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches +
                                    this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                     this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
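
With --supermap off (the default), per-gene coverage is computed as a range
union, so bases hit by overlapping HSPs are counted once. A worked example,
assuming range_union returns the total length of the merged intervals
(consistent with its use above):

ranges = [("1", 1, 100), ("1", 51, 150)]  # two HSPs with a 50 bp overlap
# range_union(ranges) -> 150, not 200
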
Example #34
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See supplementary info for sea anemone genome paper, C-score formula:

        cscore(A,B) = score(A,B) /
             max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff",
                 default=.9999,
                 type="float",
                 help="Minimum C-score to report [default: %default]")
    p.add_option("--pct",
                 default=False,
                 action="store_true",
                 help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast",
                 default=False,
                 action="store_true",
                 help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        s = score / max(best_score[query], best_score[subject])
        if s > cutoff:
            pair = (query, subject)
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    if writeblast:
        fwb = must_open(outfile + ".filtered.blast", "w")
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        args = [query, subject, "{0:.2f}".format(s)]
        if pct:
            args.append("{0:.1f}".format(pctid))
        print >> fw, "\t".join(args)
        if writeblast:
            print >> fwb, b
    fw.close()
    if writeblast:
        fwb.close()
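
A worked example of the C-score formula, with hypothetical scores:

score_AB = 180.0
best_A, best_B = 200.0, 190.0            # best BLAST scores involving A and B
cscore = score_AB / max(best_A, best_B)  # 180 / 200 = 0.9
# Reported only when above --cutoff; the default of .9999 effectively
# restricts the output to reciprocal best hits.
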
Example #35
def completeness(args):
    """
    %prog completeness blastfile ref.fasta > outfile

    Print statistics for each gene: the coverage of the alignment onto the best
    hit, as an indicator of the completeness of the gene model. For example, one
    might BLAST sugarcane ESTs against sorghum annotations as reference, to find
    full-length transcripts.
    """
    from jcvi.utils.range import range_minmax
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(completeness.__doc__)
    p.add_option(
        "--ids",
        help="Save ids that are over 50% complete [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    idsfile = opts.ids
    f = Sizes(fastafile).mapping

    b = BlastSlow(blastfile)
    valid = []
    data = []
    cutoff = 50
    for query, blines in groupby(b, key=lambda x: x.query):
        blines = list(blines)
        ranges = [(x.sstart, x.sstop) for x in blines]
        b = blines[0]
        query, subject = b.query, b.subject

        rmin, rmax = range_minmax(ranges)
        subject_len = f[subject]

        nterminal_dist = rmin - 1
        cterminal_dist = subject_len - rmax
        covered = (rmax - rmin + 1) * 100 / subject_len
        if covered > cutoff:
            valid.append(query)

        data.append((nterminal_dist, cterminal_dist, covered))
        print "\t".join(
            str(x)
            for x in (query, subject, nterminal_dist, cterminal_dist, covered))

    nd, cd, cv = zip(*data)
    m = "Total: {0}, Coverage > {1}%: {2}\n".\
           format(len(data), cutoff, len(valid))
    m += "N-terminal: {0}\n".format(SummaryStats(nd))
    m += "C-terminal: {0}\n".format(SummaryStats(cd))
    m += "Coverage: {0}".format(SummaryStats(cv))
    print >> sys.stderr, m

    if idsfile:
        fw = open(idsfile, "w")
        print >> fw, "\n".join(valid)
        logging.debug("A total of {0} ids (cov > {1} %) written to `{2}`.".\
                      format(len(valid), cutoff, idsfile))
        fw.close()
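
A worked example of the per-gene statistics above, using hypothetical
alignment ranges on a 1000 bp subject:

ranges = [(101, 400), (351, 900)]        # HSP coordinates on the subject
rmin, rmax = 101, 900                    # range_minmax(ranges)
subject_len = 1000
nterminal_dist = rmin - 1                # 100 bp missing at the N-terminus
cterminal_dist = subject_len - rmax      # 100 bp missing at the C-terminus
covered = (rmax - rmin + 1) * 100 / subject_len  # 80 -> valid (> 50 cutoff)
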
Example #36
def astat(args):
    """
    %prog astat coverage.log

    Create coverage-rho scatter plot.
    """
    p = OptionParser(astat.__doc__)
    p.add_option("--cutoff",
                 default=1000,
                 type="int",
                 help="Length cutoff [default: %default]")
    p.add_option("--genome",
                 default="",
                 help="Genome name [default: %default]")
    p.add_option("--arrDist",
                 default=False,
                 action="store_true",
                 help="Use arrDist instead [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    covfile, = args
    cutoff = opts.cutoff
    genome = opts.genome
    plot_arrDist = opts.arrDist

    suffix = ".{0}".format(cutoff)
    small_covfile = covfile + suffix
    update_covfile = need_update(covfile, small_covfile)
    if update_covfile:
        fw = open(small_covfile, "w")
    else:
        logging.debug("Found `{0}`, will use this one".format(small_covfile))
        covfile = small_covfile

    fp = open(covfile)
    header = fp.next()
    if update_covfile:
        fw.write(header)

    data = []
    msg = "{0} tigs scanned ..."
    for row in fp:
        tigID, rho, covStat, arrDist = row.split()
        tigID = int(tigID)
        if tigID % 1000000 == 0:
            sys.stderr.write(msg.format(tigID) + "\r")

        rho, covStat, arrDist = [float(x) for x in (rho, covStat, arrDist)]
        if rho < cutoff:
            continue

        if update_covfile:
            fw.write(row)
        data.append((tigID, rho, covStat, arrDist))

    print >> sys.stderr, msg.format(tigID)

    from jcvi.graphics.base import plt, savefig

    logging.debug("Plotting {0} data points.".format(len(data)))
    tigID, rho, covStat, arrDist = zip(*data)

    y = arrDist if plot_arrDist else covStat
    ytag = "arrDist" if plot_arrDist else "covStat"

    fig = plt.figure(1, (7, 7))
    ax = fig.add_axes([.12, .1, .8, .8])
    ax.plot(rho, y, ".", color="lightslategrey")

    xtag = "rho"
    info = (genome, xtag, ytag)
    title = "{0} {1} vs. {2}".format(*info)
    ax.set_title(title)
    ax.set_xlabel(xtag)
    ax.set_ylabel(ytag)

    if plot_arrDist:
        ax.set_yscale('log')

    imagename = "{0}.png".format(".".join(info))
    savefig(imagename, dpi=150)
Example #37
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    p = OptionParser(cn.__doc__)
    p.add_option("--binsize",
                 default=1000,
                 type="int",
                 help="Window size along chromosome")
    p.add_option(
        "--cleanup",
        default=False,
        action="store_true",
        help="Clean up downloaded s3 folder",
    )
    p.add_option(
        "--hmm",
        default=False,
        action="store_true",
        help="Run HMM caller after computing CN",
    )
    p.add_option(
        "--upload",
        default="s3://hli-mv-data-science/htang/ccn",
        help="Upload cn and seg results to s3",
    )
    p.add_option("--rebuildgc",
                 help="Rebuild GC directory rather than pulling from S3")
    opts, args = p.parse_args(args)

    if len(args) == 2:
        workdir, sample_key = args
        s3dir = None
    elif len(args) == 3:
        workdir, sample_key, s3dir = args
    else:
        sys.exit(not p.print_help())

    n = opts.binsize
    rebuildgc = opts.rebuildgc
    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)

    assert op.exists(sampledir), "Directory {} doesn't exist!".format(
        sampledir)

    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    gcdir = "gc"
    if rebuildgc:
        build_gc_array(fastafile=rebuildgc, n=n, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc", target_dir=gcdir)

    # Build GC correction table
    gc_bin = defaultdict(list)
    gc_med = {}
    coverage = []

    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc = np.fromfile(gcfile, dtype=np.uint8)
        cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
        cib = load_cib(cibfile)
        print(seqid, gc.shape[0], cib.shape[0], file=sys.stderr)
        if seqid in autosomes:
            for gci, k in zip(gc, cib):
                gc_bin[gci].append(k)
        coverage.append((seqid, gc, cib))

    for gci, k in gc_bin.items():
        nonzero_k = [x for x in k if x]
        gc_med[gci] = med = np.median(nonzero_k) / 2
        print(gci, len(nonzero_k), med, file=sys.stderr)

    mkdir(cndir)
    apply_fun = np.vectorize(gc_med.get)
    # Apply the GC correction over coverage
    for seqid, gc, cib in coverage:
        nitems = cib.shape[0]
        beta = apply_fun(gc[:nitems])
        beta_cn = cib / beta
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        beta_cn.tofile(cnfile)

    # Run HMM caller if asked
    segfile = hmm([workdir, sample_key]) if opts.hmm else None

    upload = opts.upload
    if upload:
        push_to_s3(upload, cndir)
        if segfile:
            push_to_s3(upload, segfile)

    if opts.cleanup:
        import shutil

        shutil.rmtree(sampledir)
        shutil.rmtree(cndir)
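
A sketch of the GC correction above, with hypothetical values: per GC bin, the
median autosomal coverage (halved, so a normal diploid window maps to copy
number ~2) becomes the divisor for every window in that bin.

import numpy as np

gc_med = {40: 25.0, 41: 26.0}                 # per-bin medians, already halved
gc = np.array([40, 41, 40])                   # GC bin of each 1 Kb window
cib = np.array([50.0, 52.0, 25.0])            # raw coverage per window
beta_cn = cib / np.vectorize(gc_med.get)(gc)  # -> [2., 2., 1.]
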
Example #38
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen",
                 default=1000,
                 type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen",
                 default=0,
                 type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option(
        "--shift",
        default=50,
        type="int",
        help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta",
        default=False,
        action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print >> fw, headerTemplate.format(libID=libID)
    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth / shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in xrange(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in xrange(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
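
A worked example of the shredding arithmetic above, for a hypothetical contig:

seqlen, depth, readlen, shift = 10000, 2, 1000, 50
shredlen = min(seqlen - shift, readlen)        # 1000
numreads = max(seqlen * depth // shredlen, 1)  # 20 pseudo-reads
center_range_width = seqlen - shredlen         # 9000
center_increments = center_range_width * 1. / (numreads - 1)  # ~473.7 bp step
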
Example #39
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed up the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein")

    p.add_option("--noversion",
                 dest="noversion",
                 default=False,
                 action="store_true",
                 help="Remove trailing accession versions")
    p.add_option("--format",
                 default="fasta",
                 choices=valid_formats,
                 help="download format [default: %default]")
    p.add_option("--database",
                 default="nuccore",
                 choices=valid_databases,
                 help="search database [default: %default]")
    p.add_option("--retmax",
                 default=1000000,
                 type="int",
                 help="how many results to return [default: %default]")
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence [default: %default]")
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up [default: %default]")
    p.add_option("--outdir",
                 default=None,
                 help="output directory, with accession number as filename")
    p.add_option("--outprefix",
                 default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".\
        format(fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must be >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, \
                skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(list_of_terms, retmax=opts.retmax, \
                                 rettype=fmt, db=database, batchsize=batchsize, \
                                 email=opts.email):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True, \
                    skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(id))
            continue

        totalsize += size
        print >> fw, rec
        print >> fw

        seen.add(id)

    if seen:
        print >> sys.stderr, "A total of {0} {1} records downloaded.".\
                format(totalsize, fmt.upper())

    return outfile
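
For reference, htg() in Exemple #43 below drives this function programmatically. A minimal sketch along the same lines, assuming a file "ids.txt" (hypothetical) that lists one accession per line:

from jcvi.apps.fetch import entrez  # import path as used in htg() below

entrez(["ids.txt", "--format=gb", "--outdir=gb", "--batchsize=100"])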
Exemple #40
0
def ace(args):
    """
    %prog ace bamfile fastafile

    Convert BAM format to ACE format. This often allows the remapping to be
    assessed as a de novo assembly. The BAM file needs to be indexed. Also
    creates a .mates file to be used in amos/bambus, and a .astat file to mark
    whether the contig is unique or repetitive based on A-statistics in Celera
    Assembler.
    """
    p = OptionParser(ace.__doc__)
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    if astat:
        astatfw = open(astatfile, "w")
    if readids:
        readsfw = open(readsfile, "w")

    print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
    print(file=fw)

    for i, contig in enumerate(s.references):
        cseq = f[contig]
        nbases = len(cseq)

        mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
        nreads = len(mapped_reads)

        nsegments = 0
        print("CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments), file=fw)
        print(fill(str(cseq.seq)), file=fw)
        print(file=fw)

        if astat:
            astat_val = Astat(nbases, nreads, genomesize, totalreads)
            print("{0}\t{1:.1f}".format(contig, astat_val), file=astatfw)

        text = fill([qual] * nbases, delimiter=" ", width=30)
        print("BQ\n{0}".format(text), file=fw)
        print(file=fw)

        rnames = []
        for a in mapped_reads:
            readname = a.qname
            rname = readname

            if readids:
                print(readname, file=readsfw)
            rnames.append(rname)

            strand = "C" if a.is_reverse else "U"
            paddedstart = a.pos + 1  # 0-based to 1-based
            af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
            print(af, file=fw)

        print(file=fw)

        for a, rname in zip(mapped_reads, rnames):
            aseq, npadded = cigar_to_seq(a)
            if aseq is None:
                continue

            ninfos = 0
            ntags = 0
            alen = len(aseq)
            rd = "RD {0} {1} {2} {3}\n{4}".format(
                rname, alen, ninfos, ntags, fill(aseq)
            )
            qs = "QA 1 {0} 1 {0}".format(alen)

            print(rd, file=fw)
            print(file=fw)
            print(qs, file=fw)
            print(file=fw)
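
The --astat option relies on Celera Assembler's A-statistic to call a contig unique or repetitive. A minimal sketch of the statistic, assuming the Astat class implements the standard arrival-rate formula (the actual implementation may differ):

import math

def astat_sketch(nbases, nreads, genomesize, totalreads):
    # rho is the genome-wide read arrival rate: the expected number of
    # reads starting at any given base if coverage were uniform
    rho = totalreads / float(genomesize)
    # Log-odds of single-copy vs two-copy placement; large positive
    # values (conventionally > ~17) suggest a unique contig
    return rho * nbases - nreads * math.log(2)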
Exemple #41
0
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads in FASTA format to a Celera Assembler (CA) frg file. If a
    .qual file is found, use it; otherwise make a fake qual file. Mates are
    assumed to be adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("-m", dest="matefile", default=None, help="matepairs file")
    p.add_option("--maxreadlen",
                 default=32000,
                 type="int",
                 help="Maximum read length allowed [default: %default]")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    f = Fasta(fastafile, lazy=True)
    if maxreadlen > 0:
        split = False
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".\
                                format(id, size, maxreadlen))
                split = True
                break

        if split:
            for sf in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([sf, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = "SangerFrags-" + plate

    frgfile = libname + ".frg"

    cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
    if need_update(fastafile, cleanfasta):
        clean([fastafile, "--canonical", "-o", cleanfasta])
    fastafile = cleanfasta

    qualfile = make_qual(fastafile, score=21)
    if mated:
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

    cmd = "convert-fasta-to-v2.pl"
    cmd += " -l {0} -s {1} -q {2} ".\
            format(libname, fastafile, qualfile)
    if mated:
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

    sh(cmd, outfile=frgfile)
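
A minimal invocation sketch (filename and insert size hypothetical, assuming set_size() exposes the --size option consumed by get_mean_sv() above):

fasta(["plate1.fasta", "--size=4000"])  # mated Sanger reads, 4-kb inserts
fasta(["plate1.fasta"])                 # unmated fragments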
Exemple #42
0
def simulate(args):
    """
    %prog simulate

    Run simulation on female restitution.
    """
    import seaborn as sns

    sns.set_style("darkgrid")

    p = OptionParser(simulate.__doc__)
    p.add_option(
        "--verbose",
        default=False,
        action="store_true",
        help="Verbose logging during simulation",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x10")
    if len(args) != 0:
        sys.exit(not p.print_help())

    # Construct a composite figure with 6 tracks
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    rows = 7
    ypad = 0.05
    yinterval = (1 - 2 * ypad) / (rows + 1)
    yy = 1 - ypad
    xpad = 0.18
    xwidth = 0.6

    # Axes are vertically stacked, and share x-axis
    axes = []
    yy_positions = []  # Save yy positions so we can show details to the right later
    for idx in range(rows):
        yy_positions.append(yy)
        yy -= yinterval
        ax = fig.add_axes([xpad, yy, xwidth, yinterval * 0.85])
        if idx != rows - 1:
            plt.setp(ax.get_xticklabels(), visible=False)
        axes.append(ax)
    ax1, ax2, ax3, ax4, ax5, ax6, ax7 = axes

    # Prepare the simulated data
    # Simulate two parents
    SS = Genome("SS", "SS", 10, 8)
    SO = Genome("SO", "SO", 8, 10)

    verbose = opts.verbose
    all_F1s = [simulate_F1(SO, SS, verbose=verbose) for _ in range(1000)]
    all_F2s = [simulate_F2(SO, SS, verbose=verbose) for _ in range(1000)]
    all_F1intercrosses = [simulate_F1intercross(SO, SS, verbose) for _ in range(1000)]
    all_BC1s = [simulate_BCn(1, SO, SS, verbose=verbose) for _ in range(1000)]
    all_BC2s = [simulate_BCn(2, SO, SS, verbose=verbose) for _ in range(1000)]
    all_BC3s = [simulate_BCn(3, SO, SS, verbose=verbose) for _ in range(1000)]
    all_BC4s = [simulate_BCn(4, SO, SS, verbose=verbose) for _ in range(1000)]

    # Plotting
    f1s = plot_summary(ax1, all_F1s)
    f2s = plot_summary(ax2, all_F2s)
    f1is = plot_summary(ax3, all_F1intercrosses)
    bc1s = plot_summary(ax4, all_BC1s)
    bc2s = plot_summary(ax5, all_BC2s)
    bc3s = plot_summary(ax6, all_BC3s)
    bc4s = plot_summary(ax7, all_BC4s)

    # Show title to the left
    xx = xpad / 2
    for (title, subtitle), yy in zip(
        (
            ("F1", None),
            ("F2", "via selfing"),
            ("F2", "via intercross"),
            ("BC1", None),
            ("BC2", None),
            ("BC3", None),
            ("BC4", None),
        ),
        yy_positions,
    ):
        if subtitle:
            yy -= 0.06
        else:
            yy -= 0.07
        root.text(xx, yy, title, color="darkslategray", ha="center", va="center")
        if subtitle:
            yy -= 0.02
            root.text(
                xx, yy, subtitle, color="lightslategray", ha="center", va="center"
            )

    # Show summary stats to the right
    xx = 1 - (1 - xpad - xwidth) / 2
    for summary, yy in zip((f1s, f2s, f1is, bc1s, bc2s, bc3s, bc4s), yy_positions):
        yy -= 0.04
        root.text(
            xx, yy, summary.SO_summary, color=SoColor, ha="center", va="center",
        )
        yy -= 0.02
        root.text(
            xx, yy, summary.SS_summary, color=SsColor, ha="center", va="center",
        )
        yy -= 0.02
        root.text(
            xx, yy, summary.percent_SO_summary, color=SoColor, ha="center", va="center",
        )

    ax7.set_xlabel("Number of unique chromosomes")
    adjust_spines(ax7, ["bottom"], outward=True)
    normalize_axes(root)

    savefig("plotter.pdf", dpi=120)

    outdir = "simulations"
    mkdir(outdir)
    # Write chromosomes to disk
    for genomes, filename in (
        (all_F1s, "all_F1s"),
        (all_F2s, "all_F2s"),
        (all_F1intercrosses, "all_F1intercrosses"),
        (all_BC1s, "all_BC1s"),
        (all_BC2s, "all_BC2s"),
        (all_BC3s, "all_BC3s"),
        (all_BC4s, "all_BC4s"),
    ):
        write_chromosomes(genomes, op.join(outdir, filename))
Exemple #43
0
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and downloads records from Genbank. Below are the implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. A record with only a single contig will
    be labeled as Phase-3 regardless of the info in the `phasefile`. The
    template file is the Genbank sbt template. See jcvi.formats.sbt for
    generation of such files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads the genbank and asn.1
    formats, and generates the phase file and the names file (using
    formats.agp.phase() and apps.gbsubmit.asn(), respectively). These steps run
    automatically.

    However, use --phases if the genbank files contain outdated information,
    for example when the clone name changes or the phase is upgraded. In this
    case, run formats.agp.phase() manually, modify the phasefile, and use
    --phases to override.
    """
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.fetch import entrez

    p = OptionParser(htg.__doc__)
    p.add_option("--phases",
                 default=None,
                 help="Use another phasefile to override [default: %default]")
    p.add_option("--comment",
                 default="",
                 help="Comments for this update [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    asndir = "asn.1"
    mkdir(asndir)
    entrez([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + \
            ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        gbdir = "gb"
        mkdir(gbdir)
        entrez([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) + \
                ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    newphasefw = open(newphasefile, "w")
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    names = DictFile(namesfile)
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "{qualifiers}"'
    acmd += ' -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    for row in open(phasefile):
        atoms = row.rstrip().split("\t")
        # see formats.agp.phase() for column contents
        accession, phase, clone = atoms[0], atoms[1], atoms[-1]
        fafile = op.join(fastadir, accession + ".fa")
        accession_nv = accession.split(".", 1)[0]

        newid = names[accession_nv]
        newidopt = "--newid={0}".format(newid)
        cloneopt = "--clone={0}".format(clone)
        splitfile, gaps = sequin([fafile, newidopt, cloneopt])
        splitfile = op.basename(splitfile)
        phase = int(phase)
        assert phase in (1, 2, 3)

        oldphase = phase
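        # Reconcile the recorded phase with the observed gap count: a gapless
        # record is finished (phase 3); a record with gaps cannot be phase 3.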
        if gaps == 0 and phase != 3:
            phase = 3

        if gaps != 0 and phase == 3:
            phase = 2

        print >> newphasefw, "{0}\t{1}\t{2}".\
                format(accession_nv, oldphase, phase)
        newph.append(phase)

        qualifiers = qq.format(phase=phase)
        if ";" in clone:
            qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

        cmd = acmd.format(accession=accession,
                          accession_nv=accession_nv,
                          sqndir=sqndir,
                          sbtfile=sbtfile,
                          splitfile=splitfile,
                          qualifiers=qualifiers,
                          comment=comment)
        sh(cmd)

        verify_sqn(sqndir, accession)
        nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print >> sys.stderr, "A total of {0} records updated.".format(nupdated)
Exemple #44
0
def summary(args):
    """
    %prog summary diploid.napus.fractionation gmap.status

    Provide summary of fractionation. `fractionation` file is generated with
    loss(). `gmap.status` is generated with genestatus().
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import percentage, Registry

    p = OptionParser(summary.__doc__)
    p.add_option("--extra", help="Cross with extra tsv file")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    frfile, statusfile = args
    status = DictFile(statusfile)
    fp = open(frfile)
    registry = Registry()  # keeps all the tags for any given gene
    for row in fp:
        seqid, gene, tag = row.split()
        if tag == ".":
            registry[gene].append("outside")
        else:
            registry[gene].append("inside")
            if tag[0] == "[":
                registry[gene].append("no_syntenic_model")
                if tag.startswith("[S]"):
                    registry[gene].append("[S]")
                    gstatus = status.get(gene, None)
                    if gstatus == "complete":
                        registry[gene].append("complete")
                    elif gstatus == "pseudogene":
                        registry[gene].append("pseudogene")
                    elif gstatus == "partial":
                        registry[gene].append("partial")
                    else:
                        registry[gene].append("gmap_fail")
                elif tag.startswith("[NS]"):
                    registry[gene].append("[NS]")
                    if "random" in tag or "Scaffold" in tag:
                        registry[gene].append("random")
                    else:
                        registry[gene].append("real_ns")
                elif tag.startswith("[NF]"):
                    registry[gene].append("[NF]")
            else:
                registry[gene].append("syntenic_model")

    inside = registry.count("inside")
    outside = registry.count("outside")
    syntenic = registry.count("syntenic_model")
    non_syntenic = registry.count("no_syntenic_model")
    s = registry.count("[S]")
    ns = registry.count("[NS]")
    nf = registry.count("[NF]")
    complete = registry.count("complete")
    pseudogene = registry.count("pseudogene")
    partial = registry.count("partial")
    gmap_fail = registry.count("gmap_fail")
    random = registry.count("random")
    real_ns = registry.count("real_ns")

    complete_models = registry.get_tag("complete")
    pseudogenes = registry.get_tag("pseudogene")
    partial_deletions = registry.get_tag("partial")

    m = "{0} inside synteny blocks\n".format(inside)
    m += "{0} outside synteny blocks\n".format(outside)
    m += "{0} has syntenic gene\n".format(syntenic)
    m += "{0} lack syntenic gene\n".format(non_syntenic)
    m += "{0} has sequence match in syntenic location\n".format(s)
    m += "{0} has sequence match in non-syntenic location\n".format(ns)
    m += "{0} has sequence match in un-ordered scaffolds\n".format(random)
    m += "{0} has sequence match in real non-syntenic location\n".format(
        real_ns)
    m += "{0} has no sequence match\n".format(nf)
    m += "{0} syntenic sequence - complete model\n".format(
        percentage(complete, s))
    m += "{0} syntenic sequence - partial model\n".format(
        percentage(partial, s))
    m += "{0} syntenic sequence - pseudogene\n".format(
        percentage(pseudogene, s))
    m += "{0} syntenic sequence - gmap fail\n".format(percentage(gmap_fail, s))
    print(m, file=sys.stderr)

    aa = ["complete_models", "partial_deletions", "pseudogenes"]
    bb = [complete_models, partial_deletions, pseudogenes]
    for a, b in zip(aa, bb):
        fw = open(a, "w")
        print("\n".join(b), file=fw)
        fw.close()

    extra = opts.extra
    if extra:
        registry.update_from(extra)

    fp.seek(0)
    fw = open("registry", "w")
    for row in fp:
        seqid, gene, tag = row.split()
        ts = registry[gene]
        print("\t".join((seqid, gene, tag, "-".join(ts))), file=fw)
    fw.close()

    logging.debug("Registry written.")
Exemple #45
0
def index(args):
    """
    %prog index samfile/bamfile

    If SAM file, convert to BAM, sort and then index, using SAMTOOLS
    """
    p = OptionParser(index.__doc__)
    p.add_option(
        "--fasta", dest="fasta", default=None, help="add @SQ header to the BAM file"
    )
    p.add_option(
        "--unique",
        default=False,
        action="store_true",
        help="only retain uniquely mapped reads",
    )
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samfile,) = args
    cpus = opts.cpus
    fastafile = opts.fasta
    if fastafile:
        assert op.exists(fastafile)

    bamfile = samfile.replace(".sam", ".bam")
    if fastafile:
        faifile = fastafile + ".fai"
        if need_update(fastafile, faifile):
            sh("samtools faidx {0}".format(fastafile))
        cmd = "samtools view -bt {0} {1} -o {2}".format(faifile, samfile, bamfile)
    else:
        cmd = "samtools view -bS {0} -o {1}".format(samfile, bamfile)

    cmd += " -@ {0}".format(cpus)
    if opts.unique:
        cmd += " -q 1"

    if samfile.endswith(".sam") and need_update(samfile, bamfile):
        sh(cmd)

    # Already sorted?
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        prefix = bamfile.replace(".bam", "")
        sortedbamfile = prefix + ".sorted.bam"

    if need_update(bamfile, sortedbamfile):
        cmd = "samtools sort {0} -o {1}".format(bamfile, sortedbamfile)
        cmd += " -@ {0}".format(cpus)
        sh(cmd)

    baifile = sortedbamfile + ".bai"
    if need_update(sortedbamfile, baifile):
        sh("samtools index {0}".format(sortedbamfile))

    return sortedbamfile
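
A minimal invocation sketch (filenames hypothetical); the function converts, sorts, and indexes, then returns the sorted BAM path:

sortedbam = index(["sample.sam", "--fasta=ref.fasta"])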
Exemple #46
0
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of args.
    """
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full",
                 default=False,
                 action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder",
                 default=False,
                 action="store_true",
                 help="Keep the input read order [default: %default]")
    p.add_option("--null",
                 default=False,
                 action="store_true",
                 help="Do not write to SAM/BAM output")
    p.add_option("--fasta",
                 default=False,
                 action="store_true",
                 help="Query reads are FASTA")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)

    opts, args = p.parse_args(args)
    extra = opts.extra
    mo = opts.mateorientation
    if mo == '+-':
        extra += ""
    elif mo == '-+':
        extra += " --rf"
    else:
        extra += " --ff"

    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    fasta = opts.fasta
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(readfile,
                                            dbfile,
                                            bowtie=True,
                                            mapped=mapped,
                                            unmapped=unmapped,
                                            bam=opts.bam)
    logfile = prefix + ".log"
    if not fasta:
        offset = guessoffset([readfile])

    if not need_update(dbfile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    if fasta:
        cmd += " -f"
    else:
        cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    if opts.null:
        samfile = "/dev/null"

    cmd = output_bam(cmd, samfile)
    sh(cmd)
    print(open(logfile).read(), file=sys.stderr)

    return samfile, logfile
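
Minimal invocation sketches (filenames hypothetical); the function returns the SAM/BAM path and the bowtie2 log:

# Single-end, local alignment by default
samfile, logfile = align(["ref.fasta", "reads.fq"])
# Paired-end, enforcing end-to-end alignment
samfile, logfile = align(["ref.fasta", "r1.fq", "r2.fq", "--full"])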
Exemple #47
0
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    p = OptionParser(loss.__doc__)
    p.add_option(
        "--bed",
        default=False,
        action="store_true",
        help="Genomic BLAST is in bed format",
    )
    p.add_option("--gdist", default=20, type="int", help="Gene distance")
    p.add_option(
        "--bdist",
        default=20000,
        type="int",
        help="Base pair distance",
    )
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = len(args) == 1
    if emptyblast:
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)
    blocks = []
    fp = open(blocksfile)
    genetrack = {}
    proxytrack = {}
    for row in fp:
        a, b = row.split()
        genetrack[a] = b
        blocks.append((a, b))

    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        if i == 0 or i == imax:
            continue
        if key != ".":
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

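        # The missing gene is anchored between its syntenic neighbors; pad by
        # bdist on each side to form a proxy region, later probed with BLAST.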
        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        print("\t".join((b.seqid, accn, target_region)))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
Exemple #48
0
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be an S3 location, and --lobstr_home can be an S3
    location (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/).
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid",
                 default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation",
                 default=False,
                 action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr",
               default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:  # Simulation mode
        cmd, vcf_file = allelotype_on_chr(bamfile,
                                          "chr4",
                                          "/mnt/software/lobSTR/",
                                          "TREDs",
                                          haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".", 1)[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (range(1, 23) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile,
                                             chr,
                                             lhome,
                                             lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
Exemple #49
0
def segment(args):
    """
    %prog segment loss.ids bedfile

    Merge adjacent gene loss into segmental loss.

    Then, based on the segmental loss, estimate the amount of DNA loss in base pairs.
    Two estimates can be given:
    - conservative: just within the start and end of a single gene
    - aggressive: extend the deletion track to the next gene

    The real deletion size is within these estimates.
    """
    from jcvi.formats.base import SetFile

    p = OptionParser(segment.__doc__)
    p.add_option(
        "--chain",
        default=1,
        type="int",
        help="Allow next N genes to be chained",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    idsfile, bedfile = args
    bed = Bed(bedfile)
    order = bed.order
    ids = SetFile(idsfile)
    losses = Grouper()
    skip = opts.chain
    for i, a in enumerate(bed):
        a = a.accn
        for j in range(i + 1, i + 1 + skip):
            if j >= len(bed):
                break
            b = bed[j].accn
            if a in ids:
                losses.join(a, a)
            if a in ids and b in ids:
                losses.join(a, b)

    losses = list(losses)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm, nt = len(singletons), len(segments), len(losses)
    assert ns + nm == nt

    # Summary for all segments
    for x in sorted(singletons) + sorted(segments):
        print("\t".join(
            str(x) for x in ("|".join(sorted(x)), len(x),
                             estimate_size(x, bed, order))))

    # Find longest segment stretch
    if segments:
        mx, maxsegment = max([(len(x), x) for x in segments])
        print("Longest stretch: run of {0} genes".format(mx), file=sys.stderr)
        print("  {0}".format("|".join(sorted(maxsegment))), file=sys.stderr)
        seg_asize = sum(estimate_size(x, bed, order) for x in segments)
        seg_bsize = sum(
            estimate_size(x, bed, order, conservative=False) for x in segments)
    else:
        seg_asize = seg_bsize = 0

    sing_asize = sum(estimate_size(x, bed, order) for x in singletons)
    sing_bsize = sum(
        estimate_size(x, bed, order, conservative=False) for x in singletons)
    total_asize = sing_asize + seg_asize
    total_bsize = sing_bsize + seg_bsize
    print(
        "Singleton ({0}): {1} - {2} bp".format(ns, sing_asize, sing_bsize),
        file=sys.stderr,
    )
    print("Segment ({0}): {1} - {2} bp".format(nm, seg_asize, seg_bsize),
          file=sys.stderr)
    print(
        "Total ({0}): {1} - {2} bp".format(nt, total_asize, total_bsize),
        file=sys.stderr,
    )
    print(
        "Average ({0}): {1} bp".format(nt, (total_asize + total_bsize) / 2),
        file=sys.stderr,
    )
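
A toy illustration of the two bounds (coordinates assumed, not from the source): for a lost run of genes spanning 1,000-2,000 and 3,000-4,000, with flanking genes ending at 500 and starting at 4,800:

conservative = 4000 - 1000  # within the lost genes only: 3,000 bp
aggressive = 4800 - 500     # extended to the flanking genes: 4,300 bp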
Exemple #50
0
def split(args):
    """
    %prog split split.bed evidences.bed predictor1.gff predictor2.gff fastafile

    Split MAKER models by checking against predictors (such as AUGUSTUS and
    FGENESH). For each region covered by a working model, find the combination
    of predictors that gives the best accuracy against evidences (such as
    PASA).

    `split.bed` can be generated by pulling out subset from a list of ids
    $ python -m jcvi.formats.base join split.ids working.bed
        --column=0,3 --noheader | cut -f2-7 > split.bed
    """
    from jcvi.formats.bed import Bed

    p = OptionParser(split.__doc__)
    p.add_option(
        "--key",
        default="Name",
        help=
        "Key in the attributes to extract predictor.gff [default: %default]")
    p.add_option(
        "--parents",
        default="match",
        help="list of features to extract, use comma to separate (e.g."
        "'gene,mRNA') [default: %default]")
    p.add_option(
        "--children",
        default="match_part",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    split_bed, evidences_bed, p1_gff, p2_gff, fastafile = args
    parents = opts.parents
    children = opts.children
    key = opts.key

    bed = Bed(split_bed)

    s1 = get_splits(split_bed, p1_gff, parents, key)
    s2 = get_splits(split_bed, p2_gff, parents, key)

    for b in bed:
        query = "{0}:{1}-{2}".format(b.seqid, b.start, b.end)
        b1 = get_accuracy(query, p1_gff, evidences_bed, fastafile, children,
                          key)
        b2 = get_accuracy(query, p2_gff, evidences_bed, fastafile, children,
                          key)
        accn = b.accn
        c1 = "|".join(s1[accn])
        c2 = "|".join(s2[accn])
        ac1 = b1.accuracy
        ac2 = b2.accuracy
        tag = p1_gff if ac1 >= ac2 else p2_gff
        tag = tag.split(".")[0]

        ac1 = "{0:.3f}".format(ac1)
        ac2 = "{0:.3f}".format(ac2)

        print "\t".join((accn, tag, ac1, ac2, c1, c2))
Exemple #51
0
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option("--cutoff",
                 default=.5,
                 type="float",
                 help="Percent observed required (chrY half cutoff)")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            print("\t".join((locus, af, remove)), file=fw)
        fw.close()

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        #enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    TREDS, df = read_treds()

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))
Exemple #52
0
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
            help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    fwphase = open(phasefile, "w")
    newagpfile = pf + ".new.agp"
    fwagp = open(newagpfile, "w")

    scaffoldbuckets = defaultdict(list)

    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand) for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    for bname, scaffolds in sorted(scaffoldbuckets.items()):
        ctgorder = []
        singletons = set()
        for scaf in sorted(scaffolds):
            for node, orientation in scaf:
                ctgorder.append((node, orientation))
            if len(scaf) == 1:
                singletons.add(node)
        nscaffolds = len(scaffolds)
        nsingletons = len(singletons)
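        # Intended phase mapping: a bucket with just one singleton -> phase 3;
        # exactly one (multi-contig) scaffold -> phase 2; otherwise -> phase 1.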
        if nsingletons == 1 and nscaffolds == 0:
            phase = 3
        elif nsingletons == 0 and nscaffolds == 1:
            phase = 2
        else:
            phase = 1

        msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
            format(bname, nscaffolds, nsingletons, phase)
        print >> sys.stderr, msg
        print >> fwphase, "\t".join((bname, str(phase)))

        order_to_agp(bname, ctgorder, sizes, fwagp)

    fwagp.close()
    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
Exemple #53
0
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob
    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch",
                 default=31,
                 type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore",
                 default=MINSCORE,
                 type="int",
                 help="Minimum score to report")
    p.add_option("--period",
                 default=6,
                 type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr",
                 default=False,
                 action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres",
                 default=False,
                 action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres",
                 default=False,
                 action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    minlength = opts.minscore / 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

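    # TRF positional parameters: Match Mismatch Delta PM PI Minscore MaxPeriod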
    params = "2 {0} {0} 80 10 {1} {2}".\
            format(opts.mismatch, opts.minscore, opts.period).split()
    if opts.centromeres:
        params = cparams.split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).rsplit(".", 1)[0]
        # The leading '-' tells make to ignore the trf command's exit status
        cmd1 = "-trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".\
                    format(minlength, READLEN - minlength)
        else:
            cmd2 += " | awk '($8 >= 0)'"
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()