Example #1
def FastqPairedIterator(read1, read2):
    if read1 == read2:
        p1fp = p2fp = must_open(read1)
    else:
        p1fp = must_open(read1)
        p2fp = must_open(read2)

    return p1fp, p2fp
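
Every example on this page funnels I/O through the must_open() helper (imported from maize.formats.base, as Example #51 below shows). Its implementation is not reproduced here; a minimal sketch consistent with how the examples use it (plain paths, .gz files, the literal names "stdin"/"stdout", and a write mode) might look like:

import gzip
import sys

def must_open(filename, mode="r"):
    # sketch only, not the real implementation
    if filename == "stdin":
        return sys.stdin
    if filename == "stdout":
        return sys.stdout
    if filename.endswith(".gz"):
        return gzip.open(filename, mode + "t")  # text mode, to match open()
    return open(filename, mode)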
Example #2
def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    """
    p = OptionParser(splitread.__doc__)
    p.add_argument("-n",
                   dest="n",
                   default=76,
                   type=int,
                   help="Split at N-th base position [default: %(default)s]")
    p.add_argument("--rc",
                   default=False,
                   action="store_true",
                   help="Reverse complement second read [default: %(default)s]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pairsfastq, = args

    base = op.basename(pairsfastq).split(".")[0]
    fq1 = base + ".1.fastq"
    fq2 = base + ".2.fastq"
    fw1 = must_open(fq1, "w")
    fw2 = must_open(fq2, "w")

    fp = must_open(pairsfastq)
    n = opts.n
    minsize = n * 8 // 5

    for name, seq, qual in FastqGeneralIterator(fp):
        if len(seq) < minsize:
            logging.error("Skipping read {0}, length={1}".format(
                name, len(seq)))
            continue

        name = "@" + name
        rec1 = FastqLite(name, seq[:n], qual[:n])
        rec2 = FastqLite(name, seq[n:], qual[n:])
        if opts.rc:
            rec2.rc()

        print(rec1, file=fw1)
        print(rec2, file=fw2)

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    fw1.close()
    fw2.close()
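
splitread() emits records through a FastqLite class that is not reproduced on this page. Judging from its use above (constructed from name/seq/qual, rc() mutates the record in place, printing yields a four-line FASTQ record), a plausible minimal version is:

class FastqLite(object):
    # sketch inferred from splitread() above
    def __init__(self, name, seq, qual):
        self.name, self.seq, self.qual = name, seq, qual

    def __str__(self):
        return "\n".join((self.name, self.seq, "+", self.qual))

    def rc(self):
        # reverse-complement the sequence and reverse the quality string
        comp = str.maketrans("ACGTNacgtn", "TGCANtgcan")
        self.seq = self.seq.translate(comp)[::-1]
        self.qual = self.qual[::-1]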
Example #3
def vcf_filter(args):
    sites = set()
    if args.exclude:
        for line in must_open(args.exclude):
            seqid, pos = line.strip().split("\t")
            locus = "%s_%s" % (seqid, pos)
            sites.add(locus)
    
    vcfr = vcf.Reader(must_open(args.fi))
    #vcfw = vcf.Writer(fho, vcfr)
    for rcd in vcfr:
        sms = rcd.samples
        locus = "%s_%s" % (rcd.CHROM, rcd.POS)
        if locus in sites:
            continue
        sm = sms[0]
        gt = sm.gt_type
        qd = rcd.INFO.get('QD')
        fs = rcd.INFO.get('FS')
        mq = rcd.INFO.get('MQ')
        mqrs = rcd.INFO.get('MQRankSum')
        rprs = rcd.INFO.get('ReadPosRankSum')
        sor = rcd.INFO.get('SOR')
        flagpass = (gt is None or gt == 2) and (
                (rcd.is_snp and
                    (qd is None or qd >= 2) and
                    (fs is None or fs <= 60) and
                    (mq is None or mq >= 40) and
                    (mqrs is None or mqrs >= -12.5) and
                    (rprs is None or rprs >= -8) and
                    (sor is None or sor <= 4)
                ) or
                (rcd.is_indel and
                    (qd is None or qd >= 2) and
                    (fs is None or fs <= 200) and
                    (rprs is None or rprs >= -20) and
                    (sor is None or sor <= 10)
                )
        )
        alt = str(rcd.ALT[0])
        if flagpass:
            #vcfw.write_record(rcd)
            if rcd.is_snp:
                print("%s\t%s\t%s\t%d\t%s" % (locus, 'single', rcd.CHROM, rcd.POS-1, alt))
            elif rcd.is_deletion:
                print("%s\t%s\t%s\t%d\t%d" % (locus, 'deletion', rcd.CHROM, rcd.POS-1, len(rcd.REF)-1))
            else:
                print("%s\t%s\t%s\t%d\t%s" % (locus, 'insertion', rcd.CHROM, rcd.POS-1, alt[1:]))
Example #4
def translate(args):
    fh = must_open(args.fi)
    for rcd in SeqIO.parse(fh, "fasta"):
        sid = rcd.id
        aa = rcd.seq.translate(to_stop = True)
        nrcd = SeqRecord(aa, id = sid, description = "")
        SeqIO.write(nrcd, sys.stdout, "fasta")
Example #5
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    from urllib.parse import parse_qs

    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        if row[0] == '#':
            print(row.strip())
            continue
        v = VcfLine(row)
        data.append(v)

    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print(vv[0])
            continue
        # INFO is ';'-separated; current Pythons need the separator spelled out
        bestv = max(vv, key=lambda x: float(parse_qs(x.info, separator=";")["R2"][0]))
        print(bestv)
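
uniq() picks the best record at each position by treating the semicolon-separated INFO column as a query string. A quick illustration of that assumption (the explicit separator is required on current Python releases, which no longer split on ';' by default):

>>> from urllib.parse import parse_qs
>>> parse_qs("AR2=0.91;DR2=0.93;AF=0.327;R2=0.95", separator=";")
{'AR2': ['0.91'], 'DR2': ['0.93'], 'AF': ['0.327'], 'R2': ['0.95']}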
Example #6
def parseEff(args):
    fhi = must_open(args.fi)
    for line in fhi:
        if line.startswith("#"):
            continue
        row = line.strip("\n").split("\t")
        chrom, pos, vid, ref, alt, qual, filt, info = row[:8]
        pos = int(pos)
        refl, altl = len(ref), len(alt)
        vnttype = ''
        if refl == 1 and altl == 1:
            vnttype = 'snp'
        elif refl == 1 and altl > 1:
            vnttype = 'ins'
        elif refl > 1 and altl == 1:
            vnttype = 'del'
        else:
            assert refl > 1 and altl > 1, "error: %s" % line
            vnttype = 'mix'
        if info == '.':
            continue
        ps = info.replace("ANN=",'').split("|")
        allele, anno, impact, gname, gid, ttype, tid = ps[:7]
        print("\t".join([chrom, str(pos), str(refl), str(altl), vnttype, 
            anno, impact, gid, tid]))
Example #7
def bam_stat(args):
    bam = pysam.AlignmentFile(args.fi, 'r')

    if not args.bychr:
        s = BamStat()
        for aln in bam:
            count_read(aln, s)

        if len(s.rdic) > 0:
            logging.debug("%d 'paired' reads don't have a mate" % len(s.rdic))

        if args.isize:
            fho = must_open(args.isize, "w")
            print("\t".join(('insert_size','count')), file=fho)
            for ins, cnt in s.idic.items():
                print("%d\t%d" % (ins, cnt), file=fho)
            fho.close()

        print(s)
    else:
        ss = dict()
        for ist in bam.get_index_statistics():
            ss[ist.contig] = BamStat()
        for aln in bam:
            if aln.is_unmapped: continue
            chrom = aln.reference_name
            count_read(aln, ss[chrom])
        for chrom, s in ss.items():
            for k in s.stats:
                print("\t".join((chrom, k, str(getattr(s, k)))))
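
bam_stat() leans on a BamStat accumulator and a count_read() function defined elsewhere. Only three attributes are visible from the call sites above (rdic, idic, stats); a skeleton consistent with that usage, with a purely hypothetical counter list, would be:

class BamStat(object):
    # skeleton inferred from bam_stat() above; the counter names are guesses
    stats = ['mapped', 'unmapped', 'paired', 'duplicate']

    def __init__(self):
        for k in self.stats:
            setattr(self, k, 0)   # per-category read counters
        self.rdic = dict()        # 'paired' reads whose mate was not seen
        self.idic = dict()        # insert size -> count

    def __str__(self):
        return "\n".join("%s\t%d" % (k, getattr(self, k)) for k in self.stats)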
Example #8
def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    p = OptionParser(merge.__doc__)
    p.set_outfile(outfile="merged_results.delta")
    opts, args = p.parse_args(args)

    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, query = args[:2]
    deltafiles = args[2:]
    outfile = opts.outfile

    ref = get_abs_path(ref)
    query = get_abs_path(query)
    fw = must_open(outfile, "w")
    print(" ".join((ref, query)), file=fw)
    print("NUCMER", file=fw)
    fw.close()

    for d in deltafiles:
        cmd = "awk 'NR > 2 {{print $0}}' {0}".format(d)
        sh(cmd, outfile=outfile, append=True)
Example #9
def gtb2tsv(args):
    fhi = must_open(args.fi)
    print("\t".join("gid tid ttype etype chrom start end srd fam note".split()))
    for line in fhi:
        line = line.strip("\n")
        if line.startswith("#") or line.startswith("id"):
            continue
        ary = line.split("\t")
        if len(ary) < 18:
            print("less than 18 columns:\n%s" % line)
            continue
        tid, gid, seqid, tbeg, tend, srd, \
                locES, locIS, locCS, loc5S, loc3S, phase, \
                src, conf, cat1, cat2, cat3, note = ary
        tbeg, tend = int(tbeg), int(tend)
        if cat1 == 'mRNA':
            assert locCS, "no CDS for %s" % tid
        else:
            assert locES, "no exon for %s" % tid
        ldic = { 'exon': locES, 'cds': locCS, \
                'utr5': loc5S, 'utr3': loc3S, 'intron':locIS }
        for etype, locS in ldic.items():
            if not locS:
                continue
            for rbeg, rend in locStr2Ary(locS):
                beg, end = 0, 0
                if srd == "-":
                    beg, end = tend - rend + 1, tend - rbeg + 1
                else:
                    assert srd == '+', "unknown strand: %s for %s" % (srd, tid)
                    beg, end = tbeg + rbeg - 1, tbeg + rend - 1
                fields = [gid, tid, cat1, etype, seqid, str(beg), str(end), srd, cat3, note]
                print("\t".join(fields)) 
    fhi.close()
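
gtb2tsv() expands location strings with a locStr2Ary() helper. Assuming the GTB format stores locations as comma-separated beg-end runs (e.g. "1-120,201-350"), the helper reduces to:

def locStr2Ary(locS):
    # sketch: parse "1-120,201-350" into [(1, 120), (201, 350)]
    return [tuple(int(x) for x in piece.split("-")) for piece in locS.split(",")]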
Example #10
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_argument("--hang", default=500, type=int,
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = must_open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
Example #11
def catread(args):
    """
    %prog catread fastqfile1 fastqfile2

    Concatenate paired end reads into one. Useful for example to do single-end
    mapping and perform filtering on the whole read pair level.
    """
    p = OptionParser(catread.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    r1, r2 = args
    p1fp, p2fp = FastqPairedIterator(r1, r2)
    outfile = pairspf((r1, r2)) + ".cat.fastq"
    fw = must_open(outfile, "w")
    while True:
        a = list(islice(p1fp, 4))
        if not a:
            break
        atitle, aseq, _, aqual = a
        btitle, bseq, _, bqual = list(islice(p2fp, 4))
        print("\n".join((atitle.strip(), aseq.strip() + bseq.strip(),
                         "+", aqual.strip() + bqual.strip())), file=fw)
Example #12
def tile(args):
    from maize.utils.location import maketile

    fhi = must_open(args.fi)
    winstep, winsize = args.step, args.size
    for rcd in SeqIO.parse(fhi, "fasta") :
        size = len(rcd.seq)
        sid, beg, end = rcd.id, 1, size
        ary = rcd.id.split("-")
        if len(ary) >= 3:
            sid, beg, end = ary[0], int(ary[1]), int(ary[2])
            assert size == end - beg + 1, "size error: %s not %d" % (rcd.id, size)
        elif len(ary) == 2:
            sid, beg = ary[0], int(ary[1])
            end = beg + size - 1
       
        wins = maketile(1, size, winsize, winstep)
        rcds = []
        seq = str(rcd.seq)
        for rbeg, rend in wins:
            abeg, aend = beg + rbeg - 1, beg + rend - 1
            ssid = "%s-%d-%d" % (sid, abeg, aend)
            seqstr = seq[rbeg-1:rend]
            rcds.append(SeqRecord(Seq(seqstr), id = ssid, description = ''))
        SeqIO.write(rcds, sys.stdout, "fasta")
    fhi.close()
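
tile() slices each sequence with maketile() from maize.utils.location. From the call above it takes a 1-based inclusive interval plus a window size and step, and returns (beg, end) pairs; a sketch under that assumption:

def maketile(beg, end, size, step):
    # sketch: 1-based, inclusive sliding windows over [beg, end]
    wins = []
    while beg <= end:
        wins.append((beg, min(beg + size - 1, end)))
        beg += step
    return wins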
Example #13
def bed2chain(args):
    from maize.formats.sizes import Sizes
    tdic = Sizes(args.tsize)
    qdic = Sizes(args.qsize)
    
    firstline = True
    cid0, tName0, qName0, srd0, locs = '', '', '', '', []
    for line in must_open(args.fi):
        line = line.rstrip("\n")
        if not line:
            continue
        tName, tStart, tEnd, srd, qName, qStart, qEnd, cid = line.split()[:8]
        tStart, tEnd, qStart, qEnd = int(tStart), int(tEnd), int(qStart), int(qEnd)
        if firstline:
            cid0, tName0, qName0, srd0 = cid, tName, qName, srd
            locs.append([tStart, tEnd, qStart, qEnd])
            firstline = False
        elif cid0 == cid:
            assert tName == tName0 and qName == qName0 and srd == srd0, "inconsistent info in chain"
            locs.append([tStart, tEnd, qStart, qEnd])
        else:
            print_chain(cid0, tName0, qName0, srd0, tdic.get_size(tName0), qdic.get_size(qName0), locs)
            cid0, tName0, qName0, srd0 = cid, tName, qName, srd
            locs = [[tStart, tEnd, qStart, qEnd]]
    print_chain(cid0, tName0, qName0, srd0, tdic.get_size(tName0), qdic.get_size(qName0), locs)
Example #14
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        nreads += 1
        if rec is None:
            break
        if rec.seq.endswith(sf):
            print(rec, file=fw)
            nselected += 1
    logging.debug("Selected reads with suffix {0}: {1}".\
                  format(sf, percentage(nselected, nreads)))
Example #15
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()
    for rec in iter_fastq(fastqfile):
        nreads += 1
        if rec is None:
            break
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        print(rec, file=fw)
    logging.debug("Removed duplicate reads: {}".\
                  format(percentage(nduplicates, nreads)))
Example #16
def merge(args):
    cfg = args.cfg
    for line in must_open(cfg):
        line = line.strip(" \t\n\r")
        if line == "":
            continue
        (pre, fseq) = line.split(",")
        if not os.access(fseq, os.R_OK):
            eprint("no access to input file: %s" % fseq)
            sys.exit(1)

        fh = must_open(fseq)
        seq_it = SeqIO.parse(fh, "fasta")
        seqs = [SeqRecord(rcd.seq, id = pre + "|" + rcd.id,
            description = '') for rcd in seq_it]
        SeqIO.write(seqs, sys.stdout, "fasta")
        fh.close()
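
The cfg file consumed by merge() holds one prefix,fasta-path pair per line, and each record ID is rewritten as prefix|id. A hypothetical two-genome cfg:

B73,B73.fasta
Mo17,Mo17.fasta

With this cfg, a record >chr1 from Mo17.fasta is emitted as >Mo17|chr1.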
Example #17
 def __init__(self, filename, sorted=False):
     super(BlastSlow, self).__init__(filename)
     fp = must_open(filename)
     for row in fp:
         self.append(BlastLine(row))
     self.sorted = sorted
     if not sorted:
         self.sort(key=lambda x: x.query)
Example #18
def cleanid(args):
    fh = must_open(args.fi)
    for line in fh:
        line = line.strip()
        if line.startswith(">"):
            print(line.rstrip(":."))
        else:
            print(line)
    fh.close()
Example #19
def clean(args):
    reg = re.compile("[^ATCGN]")
    fh = must_open(args.fi)
    alns = AlignIO.read(fh, "phylip-relaxed")
    for rcd in alns:
        # wrap in Seq so AlignIO.write() accepts the edited record
        rcd.seq = Seq(reg.subn("N", str(rcd.seq).upper())[0])
    AlignIO.write(alns, sys.stdout, "phylip-relaxed")
    fh.close()
Example #20
def desc(args):
    fh = must_open(args.fi)
    if args.header:
        print("seqid\tdesc")
    for rcd in SeqIO.parse(fh, "fasta"):
        sid, desc = rcd.id, rcd.description
        if sid == desc:
            desc = ''
        print("%s\t%s" % (sid, desc))
Example #21
def shuffle(args):
    """
    %prog shuffle p1.fastq p2.fastq

    Shuffle pairs into interleaved format.
    """
    p = OptionParser(shuffle.__doc__)
    p.set_tag()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    p1, p2 = args
    pairsfastq = pairspf((p1, p2)) + ".fastq"
    tag = opts.tag

    p1fp = must_open(p1)
    p2fp = must_open(p2)
    pairsfw = must_open(pairsfastq, "w")
    nreads = 0
    while True:
        a = list(islice(p1fp, 4))
        if not a:
            break

        b = list(islice(p2fp, 4))
        if tag:
            name = a[0].rstrip()
            a[0] = name + "/1\n"
            b[0] = name + "/2\n"

        pairsfw.writelines(a)
        pairsfw.writelines(b)
        nreads += 2

    pairsfw.close()
    extra = nreads * 2 if tag else 0
    checkShuffleSizes(p1, p2, pairsfastq, extra=extra)

    logging.debug("File `{0}` verified after writing {1} reads.".\
                     format(pairsfastq, nreads))
    return pairsfastq
Example #22
def gaps(args):
    import re
    reg = re.compile("N+")
    fh = must_open(args.fi)
    for rcd in SeqIO.parse(fh, "fasta"):
        sid, seq = rcd.id, str(rcd.seq).upper()
        for res in reg.finditer(seq):
            beg, end = res.start(0), res.end(0)
            if end - beg >= args.gap:
                print("%s\t%d\t%d" % (sid, beg, end))
Example #23
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    p = OptionParser(coverage.__doc__)
    p.add_argument("--format",
                   default="bigwig",
                   choices=("bedgraph", "bigwig", "coverage"),
                   help="Output format")
    p.add_argument("--nosort",
                   default=False,
                   action="store_true",
                   help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    format = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if format in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if format == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".\
                    format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
Example #24
def filter(args):
    """
    %prog filter <deltafile|coordsfile>

    Produce a new delta/coords file and filter based on id% or cov%.
    Use `delta-filter` for .delta file.
    """
    p = OptionParser(filter.__doc__)
    p.set_align(pctid=0, hitlen=0)
    p.add_argument("--overlap", default=False, action="store_true",
            help="Print overlap status (e.g. terminal, contained)")

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    pctid = opts.pctid
    hitlen = opts.hitlen

    filename, = args
    if pctid == 0 and hitlen == 0:
        return filename

    pf, suffix = filename.rsplit(".", 1)
    outfile = "".join((pf, ".P{0}L{1}.".format(int(pctid), int(hitlen)), suffix))
    if not need_update(filename, outfile):
        return outfile

    if suffix == "delta":
        cmd = "delta-filter -i {0} -l {1} {2}".format(pctid, hitlen, filename)
        sh(cmd, outfile=outfile)
        return outfile

    fp = open(filename)
    fw = must_open(outfile, "w")
    for row in fp:
        try:
            c = CoordsLine(row)
        except AssertionError:
            continue

        if c.identity < pctid:
            continue
        if c.len2 < hitlen:
            continue
        if opts.overlap and not c.overlap:
            continue

        outrow = row.rstrip()
        if opts.overlap:
            ov = Overlap_types[c.overlap]
            outrow += "\t" + ov
        print(outrow, file=fw)

    return outfile
Example #25
def subset(args):
    """
    %prog subset blastfile qbedfile sbedfile

    Extract blast hits between given query and subject chrs.

    If --qchrs or --schrs is not given, then all chrs from q/s genome will
    be included. However one of --qchrs and --schrs must be specified.
    Otherwise the script will do nothing.
    """
    p = OptionParser(subset.__doc__)
    p.add_argument("--qchrs", default=None,
                help="query chrs to extract, comma sep [default: %(default)s]")
    p.add_argument("--schrs", default=None,
                help="subject chrs to extract, comma sep [default: %(default)s]")
    p.add_argument("--convert", default=False, action="store_true",
            help="convert accns to chr_rank [default: %(default)s]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, qbedfile, sbedfile = args
    qchrs = opts.qchrs
    schrs = opts.schrs
    assert qchrs or schrs, p.print_help()
    convert = opts.convert

    outfile = blastfile + "."
    if qchrs:
        outfile += qchrs + "."
        qchrs = set(qchrs.split(","))
    else:
        qchrs = set(Bed(qbedfile).seqids)
    if schrs:
        schrs = set(schrs.split(","))
        if qbedfile != sbedfile or qchrs != schrs:
            outfile += ",".join(schrs) + "."
    else:
        schrs = set(Bed(sbedfile).seqids)
    outfile += "blast"

    qo = Bed(qbedfile).order
    so = Bed(sbedfile).order

    fw = must_open(outfile, "w")
    for b in Blast(blastfile):
        q, s = b.query, b.subject
        if qo[q][1].seqid in qchrs and so[s][1].seqid in schrs:
            if convert:
                b.query = qo[q][1].seqid + "_" + "{0:05d}".format(qo[q][0])
                b.subject = so[s][1].seqid + "_" + "{0:05d}".format(so[s][0])
            print(b, file=fw)
    fw.close()
    logging.debug("Subset blastfile written to `{0}`".format(outfile))
Example #26
def stat(args):
    fhi = must_open(args.fi)
    print("\t".join(["chr", "pos", "nalt", "rsize", "asize", "nsam", "aaf", "nucdiv"]))

    vcf_reader = vcf.Reader(fhi)
    for rcd in vcf_reader:
        num_chroms = float(2.0 * rcd.num_called)
        nucl_diversity = float(num_chroms / (num_chroms - 1.0)) * rcd.heterozygosity
        print("\t".join(map(str, [rcd.CHROM, rcd.POS, \
            len(rcd.ALT), len(rcd.REF), len(rcd.ALT[0]), \
            rcd.num_called, rcd.aaf[0], nucl_diversity])))
Example #27
def rmdot(args):
    fh = must_open(args.fi)
    tt = str.maketrans(".", "-")
    for line in fh:
        line = line.strip()
        if line.startswith('>'):
            print(line)
        else:
            print(line.translate(tt))
    fh.close()
Example #28
def clean(args):
    import re
    reg = re.compile("[^ATCGN]")
    fh = must_open(args.fi)
    cnt = 0
    for rcd in SeqIO.parse(fh, "fasta"):
        sid, seq = rcd.id, str(rcd.seq).upper()
        newseq, ncnt = reg.subn("N", seq)
        cnt += ncnt
        nrcd = SeqRecord(Seq(newseq), id = sid, description = "")
        SeqIO.write(nrcd, sys.stdout, "fasta")
    logging.debug("Total bad char: %d" % cnt)
Example #29
def mcl2tsv(args):
    fhi = must_open(args.mcl)
    print("grp\tgid")
    grp = 1
    for line in fhi:
        line = line.strip("\n")
        gids = line.split("\t")
        if len(gids) < 5:
            continue
        for gid in gids:
            print("%d\t%s" % (grp, gid))
        grp += 1
Example #30
def iter_fastq(filename, offset=0, key=None):
    if isinstance(filename, str):
        logging.debug("Read file `{0}`".format(filename))
        fh = must_open(filename)
    else:
        fh = filename

    while True:
        rec = FastqRecord(fh, offset=offset, key=key)
        if not rec.name:
            break
        yield rec
    yield None  # sentinel
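
Note the trailing "yield None" sentinel: it is why callers such as suffix() and uniq() above test "rec is None" inside the loop instead of relying on normal generator exhaustion. Typical usage:

for rec in iter_fastq("reads.fastq"):   # "reads.fastq" is a placeholder path
    if rec is None:                     # sentinel marks end of input
        break
    process(rec)                        # stand-in for per-read work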
Example #31
    def __init__(self, filename=None):
        super(Psl, self).__init__(filename)

        import re

        self.mCounts = {}   # dict to hold match counts
        if not filename:
            return

        for line in must_open(filename):
            if not re.match(r'\d+', line[0]):
                continue
            self.append(PslLine(line))
Example #32
def swap(args):
    """
    %prog swap blastfile

    Print out a new blast file with query and subject swapped.
    """
    p = OptionParser(swap.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    swappedblastfile = blastfile + ".swapped"
    fp = must_open(blastfile)
    fw = must_open(swappedblastfile, "w")
    for row in fp:
        b = BlastLine(row)
        print(b.swapped, file=fw)

    fw.close()
    sort([swappedblastfile])
Example #33
def UMIcount(args):
    """
    %prog UMIcount fastqfile

    Report number of occurrences of each unique UMI
    """
    if args.fi.endswith(".gz"):
        fhi = gzip.open(args.fi, "rt")  # text mode; binary "r" breaks the string splits below
    else:
        fhi = must_open(args.fi)

    ud = dict()
    for (seqid, seq, qual) in read_fastq(args.fi, fhi):
        umi = seqid.split(" ")[1].split("+")[1]
        if umi in ud:
            ud[umi] += 1
        else:
            ud[umi] = 1

    fho = must_open(args.fo, 'w')
    for umi, cnt in ud.items():
        fho.write("%s\t%s\n" % (umi, cnt))

    logging.debug("{} UMIs detected".format(len(ud)))
Example #34
def one2tsv(args):
    fhi = must_open(args.mcl)
    print("grp\tgid")
    grp = 1
    for line in fhi:
        ps = line.strip("\n").split(",")
        if ps[0] == 'Cluster':
            continue
        mid, size, density, iwt, ewt, quality, pval, gidstr = ps
        gids = gidstr.replace("\"", "").split(" ")
        if float(pval) >= args.maxp or len(gids) < 5:
            continue
        for gid in gids:
            print("%d\t%s" % (grp, gid))
        grp += 1
Example #35
def vcf2tsv(args):
    vcf_reader = vcf.Reader(fsock=must_open(args.fi))
    #vcf_reader.fetch("B02", 50001, 100000)
    #print(str(vcf_reader))
    #sys.exit()
    lst1 = ["DP", "QD", "FS", "MQ", "MQRankSum", "ReadPosRankSum", "SOR"]
    lsth = ['chr', 'pos', 'ref', 'alt', 'IS_SNP', 'PASS', 'QUAL'] + lst1
    lst2 = ["AD", "DP", "GQ"]
    lstb = ['GT'] + lst2
    head = 1
    for rcd in vcf_reader:
        n_sm = len(rcd.samples)
        if head == 1:
            head = 0
            if n_sm == 1:
                print("\t".join(lsth + lstb))
            else:
                sm_names = [x.sample for x in rcd.samples]
                lstbe = []
                for sm_name in sm_names:
                    lstbe1 = [i+'_'+j for i,j in zip([sm_name]*len(lstb), lstb)]
                    lstbe += lstbe1
                print("\t".join(lsth + lstbe))
        alts = ",".join(map(str, rcd.ALT))
        alt = rcd.ALT[0]
        filt = rcd.FILTER
        flagpass = 0
        if filt is None or len(filt) == 0:
            flagpass = 1
        val1, val2 = [], []
        for k in lst1:
            v = ''
            if k in rcd.INFO:
                v = rcd.INFO[k]
            val1.append(v)
        valh = [rcd.CHROM, rcd.POS, rcd.REF, alts, int(rcd.is_snp), 
                flagpass, rcd.QUAL] + val1
        sms = rcd.samples
        valb = []
        for sm in sms: # need to address multiple alleles
            val2 = [getattr(sm.data, k, '') for k in lst2]
            if val2[0] is not None and val2[0] != '':
                val2[0] = val2[0][1]
            valb1 = [sm.gt_type] + val2
            valb += ['' if x is None else str(x) for x in valb1] 
        print("\t".join(map(str, valh + valb)))
Example #36
def breakread(args):
    fhi = must_open(args.fi)
    fo, readlen = args.fo, args.readlen  # assumed CLI arguments; fo and readlen were undefined here
    fo1 = "%s_1.fq.gz" % fo
    fo2 = "%s_2.fq.gz" % fo
    fho1 = gzip.open(fo1, "wb")
    fho2 = gzip.open(fo2, "wb")

    for (seqid, seq, qual) in read_fastq(args.fi, fhi):
        assert len(seq) == readlen * 2 and len(qual) == readlen * 2, \
                "%s: seq[%d] qual[%d] not %d" % \
                (seqid, len(seq), len(qual), readlen)
        eles = seqid.split(" ")
        if len(eles) > 2: seqid = " ".join(eles[0:2])
        seq1, seq2 = seq[0:readlen], seq[readlen:readlen * 2]
        qual1, qual2 = qual[0:readlen], qual[readlen:readlen * 2]
        fho1.write(("@%s\n%s\n+\n%s\n" % (seqid, seq1, qual1)).encode('utf8'))
        fho2.write(("@%s\n%s\n+\n%s\n" % (seqid, seq2, qual2)).encode('utf8'))
Example #37
def coordT(args):
    sizes = Sizes(args.fs)
    for line in must_open(args.fi):
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        tnames = p.tName.split("-")
        if len(tnames) == 3:
            p.tName, tosStart, tosEnd = tnames[0], int(tnames[1]), int(tnames[2])
            assert tosEnd-tosStart+1 == p.tSize
            cSize = sizes.get_size(p.tName)
            p.tStart += tosStart - 1
            p.tEnd += tosStart - 1
            p.tStarts = [x + tosStart - 1 for x in p.tStarts]
            p.tSize = cSize
        print(str(p))
Example #38
 def write(self):
     for subjob in self.subjobs:
         subjob.write()
     source = Template(self.__stanza__)
     cmd_str = "\n".join([x.strip() for x in self.cmds])
     params = {
         "queue": self.queue,
         "node": self.node,
         "ppn": self.ppn,
         "walltime": self.walltime,
         "memstr": self.mem,
         "email": self.email,
         "cmds": cmd_str + "\n"
     }
     fhj = must_open(self.fname, "w")
     fhj.write(source.substitute(params))
     fhj.close()
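
write() fills a string.Template kept in self.__stanza__ with the params dict above. The real template is not shown; given the substitution keys (queue, node, ppn, walltime, memstr, email, cmds), a plausible PBS stanza would be something like:

__stanza__ = """#!/bin/bash -l
#PBS -q $queue
#PBS -l nodes=$node:ppn=$ppn,walltime=$walltime,mem=$memstr
#PBS -M $email

$cmds
"""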
Example #39
def psl2tsv(args):
    sMatch, sMisMatch, sGapOpen, sGapExtend = 2, -3, -5, -2
    print("\t".join('''qName qStart qEnd qSize strand
    tName tStart tEnd tSize
    alnLen match misMatch baseN qNumIns tNumIns qBaseIns tBaseIns ident score
    qLoc tLoc'''.split()))
    for line in must_open(args.fi):
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        qName, qStart, qEnd, qSize, strand = p.qName, p.qStart, p.qEnd, p.qSize, p.qstrand
        tName, tStart, tEnd, tSize = p.tName, p.tStart, p.tEnd, p.tSize
        match, misMatch, baseN, qNumIns, tNumIns, qBaseIns, tBaseIns = \
                p.matches, p.misMatches, p.nCount, p.qNumInsert, p.tNumInsert, \
                p.qBaseInsert, p.tBaseInsert
       
        assert p.blockCount==len(p.tStarts), "unequal pieces"
        assert p.blockCount==len(p.qStarts), "unequal pieces"
        assert p.blockCount==len(p.blockSizes), "unequal pieces"
        match += p.repMatches
        alnLen = match + misMatch + baseN
        assert alnLen==sum(p.blockSizes), "block size error: %s %d %d" % (qName, alnLen, sum(p.blockSizes))
        assert alnLen+qBaseIns==qEnd-qStart, "%s: qLen error" % qName
        assert alnLen+tBaseIns==tEnd-tStart, "%s: tLen error" % qName

        qLocs, tLocs = [], []
        for i in range(p.blockCount):
            rtb, rte = p.tStarts[i]-tStart, p.tStarts[i]-tStart+p.blockSizes[i]
            rqb, rqe = 0, 0
            if strand == '-':
                rqb, rqe = p.qStarts[i]-(qSize-qEnd), p.qStarts[i]-(qSize-qEnd)+p.blockSizes[i]
            else:
                rqb, rqe = p.qStarts[i]-qStart, p.qStarts[i]+p.blockSizes[i]-qStart
            qLocs.append([rqb+1,rqe])
            tLocs.append([rtb+1,rte])

        score = match * sMatch + misMatch * sMisMatch
        numIns = qNumIns + tNumIns
        if numIns >= 1:
            score += sGapOpen + (numIns - 1) * sGapExtend
        ident = "%.03f" % (float(match)/(match+misMatch))
        print("\t".join(str(x) for x in [qName, qStart+1, qEnd, qSize, strand,
            tName, tStart+1, tEnd, tSize, alnLen, match, misMatch, baseN,
            qNumIns, tNumIns, qBaseIns, tBaseIns, ident, score,
            locAry2Str(qLocs), locAry2Str(tLocs)]))
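
The qLoc/tLoc columns are serialized with locAry2Str(), effectively the inverse of the locStr2Ary() helper sketched earlier:

def locAry2Str(locs):
    # sketch: render [(1, 50), (81, 120)] as "1-50,81-120"
    return ",".join("%d-%d" % (b, e) for b, e in locs)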
Example #40
def create_job_chain(fjs, fo):
    cmds = ['#!/bin/bash']
    jobs = []
    for i in range(len(fjs)):
        fj = fjs[i]
        assert op.isfile(fj), "cannot read %s" % fj
        job = "job%d" % (i + 1)
        if i == 0:
            cmds.append("%s=$(qsub %s)" % (job, fj))
            cmds.append("echo $%s" % job)
        else:
            pjob = jobs[i - 1]
            cmds.append("%s=$(qsub -W depend=afterok:$%s %s)" %
                        (job, pjob, fj))
            cmds.append("echo $%s" % job)
        jobs.append(job)
    fho = must_open(fo, "w")
    fho.write("\n".join(cmds) + "\n")
Example #41
def fastp(args):
    """
    %prog fastp jsonfile

    Convert fastp json to tsv file.
    """
    jsons = args.json
    logging.info("reading %s files..." % len(jsons))
    keys = """passed_filter_reads
        low_quality_reads
        too_many_N_reads
        too_short_reads
        too_long_reads""".split()
    print('\t'.join(['sid'] + keys))
    for fi in jsons:
        sid = op.basename(op.splitext(fi)[0])
        fhi = must_open(fi)
        js = json.load(fhi)
        print("\t".join([sid] + [str(js['filtering_result'][x]) for x in keys]))
Example #42
def bbduk(args):
    """
    %prog bbduk jsonfile

    Convert bbduk json to tsv file.
    """
    jsons = args.json
    skip = args.skip
    logging.info("reading %s files..." % len(jsons))
    keys = "readsIn readsRemoved readsOut ".split()
    print('\t'.join(['sid'] + keys))
    for fi in jsons:
        sid = op.basename(op.splitext(fi)[0])
        fhi = must_open(fi)
        if skip >= 1:
            for i in range(skip):
                next(fhi)
        js = json.load(fhi)
        print("\t".join([sid] + [str(js[x]) for x in keys]))
Example #43
def psl2bed(args):
    for line in must_open(args.fi):
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        for i in range(p.blockCount):
            tbeg, tend = p.tStarts[i], p.tStarts[i] + p.blockSizes[i]
            qbeg, qend = 0, 0
            if p.qstrand == '-':
                qbeg, qend = p.qSize - p.qStarts[i] - p.blockSizes[i], p.qSize - p.qStarts[i]
            else:
                qbeg, qend = p.qStarts[i], p.qStarts[i] + p.blockSizes[i]
            tstr = "%s:%d-%d" % (p.tName, tbeg, tend)
            qstr = "%s:%d-%d" % (p.qName, qbeg, qend)
            if args.qry:
                print("%s\t%d\t%d\t%s\t%s" % 
                    (p.qName, qbeg, qend, p.qstrand, tstr))
            else:
                print("%s\t%d\t%d\t%s\t%s" %
                    (p.tName, tbeg, tend, p.qstrand, qstr))
Example #44
def rmgap(args):
    firstLine = True
    pid, locs = '', []
    for line in must_open(args.fi):
        line = line.rstrip("\n")
        if not line:
            continue
        ps = line.split()
        assert len(ps) == 12, "not 12 fields: %s" % line
        cid = "\t".join(ps[0:8])
        oStart, oEnd, oSize = int(ps[9]), int(ps[10]), int(ps[11])
        if firstLine:
            pid = cid
            locs.append([oStart, oEnd, oSize])
            firstLine = False
        elif pid == cid:
            locs.append([oStart, oEnd, oSize])
        else:
            rm1gap(pid, locs)
            pid = cid
            locs = [[oStart, oEnd, oSize]]
    rm1gap(pid, locs)
Example #45
def filter(args):
    fhi = must_open(args.fi)
    line = fhi.readline()
    print(line.strip("\n"))
    pqid = ''
    pscore = 0
    lines = []
    for line in fhi:
        line = line.strip("\n")
        qName, qStart, qEnd, qSrd, qSize,\
        tName, tStart, tEnd, tSrd, tSize,\
        alnLen, match, misMatch, baseN,\
        qNumIns, tNumIns, qBaseIns, tBaseIns,\
        ident, score, qLoc, tLoc = line.split("\t")
        #print(line)
        #print(qSize)
        if float(ident) < args.ident: continue
        if int(alnLen) / int(qSize) < args.cov: continue
        if int(match) < args.match: continue
        score = float(score)  # compare scores numerically, not as strings
        if pqid == '':
            pqid = qName
            pscore = score
            lines.append(line)
        elif qName != pqid:
            print("\n".join(lines))
            pqid = qName
            pscore = score
            lines = [line]
        else:
            if args.best:
                if score > pscore:
                    lines = [line]
                    pscore = score
                elif score == pscore:
                    lines.append(line)
            else:
                lines.append(line)
    print("\n".join(lines))
Example #46
def write_csv(header, contents, sep=",", filename="stdout", thousands=False,
              tee=False, align=True, comment=False):
    """
    Write csv that are aligned with the column headers.

    >>> header = ["x_value", "y_value"]
    >>> contents = [(1, 100), (2, 200)]
    >>> write_csv(header, contents)
    x_value, y_value
          1,     100
          2,     200
    """
    from maize.formats.base import must_open

    formatted = load_csv(header, contents,
                         sep=sep, thousands=thousands, align=align)
    if comment:
        formatted[0] = '#' + formatted[0][1:]
    formatted = "\n".join(formatted)
    fw = must_open(filename, "w")
    fw.write(formatted + "\n")
    if tee and filename != "stdout":
        print(formatted)
Example #47
def validate(args):
    """
    %prog validate input.vcf genome.fasta

    Fasta validation of vcf file.
    """
    import pyfasta

    p = OptionParser(validate.__doc__)
    p.add_option("--prefix", help="Add prefix to seqid")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, fastafile = args
    pf = opts.prefix
    genome = pyfasta.Fasta(fastafile, record_class=pyfasta.MemoryRecord)
    fp = must_open(vcffile)
    match_ref = match_alt = total = 0
    for row in fp:
        if row[0] == '#':
            continue
        seqid, pos, id, ref, alt = row.split()[:5]
        total += 1
        if pf:
            seqid = pf + seqid
        pos = int(pos)
        if seqid not in genome:
            continue
        true_ref = genome[seqid][pos - 1]
        if total % 100000 == 0:
            print(total, "sites parsed", file=sys.stderr)
        if ref == true_ref:
            match_ref += 1
        elif alt == true_ref:
            match_alt += 1

    logging.debug("Match REF: {}".format(percentage(match_ref, total)))
    logging.debug("Match ALT: {}".format(percentage(match_alt, total)))
Example #48
def bed(args):
    """
    %prog bed blastfile

    Print out bed file based on coordinates in BLAST report. By default, write
    out subject positions. Use --swap to write query positions.
    """
    from jcvi.formats.bed import sort as bed_sort

    p = OptionParser(bed.__doc__)
    p.add_argument("--swap", default=False, action="store_true",
                 help="Write query positions [default: %(default)s]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    swap = opts.swap

    fp = must_open(blastfile)
    bedfile = "{0}.bed".format(blastfile.rsplit(".", 1)[0]) \
            if blastfile.endswith(".blast") \
            else "{0}.bed".format(blastfile)
    fw = open(bedfile, "w")
    for row in fp:
        b = BlastLine(row)
        if swap:
            b = b.swapped
        print(b.bedline, file=fw)

    logging.debug("File written to `{0}`.".format(bedfile))
    fw.close()
    bed_sort([bedfile, "-i"])

    return bedfile