def phase(accession):
    """
    Look up the GenBank record for `accession` and return (phase, length).

    The record is read from gb/<accession>.gb; when that file does not
    exist yet it is first downloaded via entrez().  `phase` is whatever
    get_phase() extracts from the record, `length` is the sequence length.
    """
    gbdir = "gb"
    gbfile = op.join(gbdir, accession + ".gb")
    if not op.exists(gbfile):
        entrez([accession, "--skipcheck", "--outdir=" + gbdir, "--format=gb"])
    # BUGFIX: `.next()` is Python-2-only; use the builtin next() so this
    # runs on Python 3 as well (matches the sibling implementation).
    rec = next(SeqIO.parse(gbfile, "gb"))
    ph, keywords = get_phase(rec)
    return ph, len(rec)
def phase(accession):
    """
    Return (phase, sequence_length) for a GenBank accession.

    The record is loaded from gb/<accession>.gb, downloading it with
    entrez first whenever the local copy is missing.
    """
    gb_dir = "gb"
    gb_path = op.join(gb_dir, accession + ".gb")
    if not op.exists(gb_path):
        entrez([accession, "--skipcheck", "--outdir=" + gb_dir, "--format=gb"])
    record = next(SeqIO.parse(gb_path, "gb"))
    phase_code, _keywords = get_phase(record)
    return phase_code, len(record)
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.

    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    p.add_option("--prefix", help="Prefix of the new object")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(
        blastfile, order, rclip=rclip, maxsize=maxsize
    )

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")

    def count_Ns(rec):
        # Count both cases; FASTA sequences may be mixed-case.
        return rec.seq.count("n") + rec.seq.count("N")

    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            # Strict mode: keep only gap-free replacements.
            if bn == 0:
                continue
        elif bn < an:
            # Relaxed mode: keep anything that reduces N content.
            continue
        # Renamed from `id` to avoid shadowing the builtin.
        exclude.add(arec.id)

    # Lazy %-style args so the message is only formatted when DEBUG is on.
    logging.debug("Ignore %d updates because of decreasing quality.", len(exclude))

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.

    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]",
    )
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # largest patch size we are willing to install
    rclip = opts.rclip

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(
        blastfile, order, rclip=rclip, maxsize=maxsize
    )

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Drop replacements whose N content did not improve over the original.
    before_recs = SeqIO.parse(beforefasta, "fasta")
    after_recs = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for old_rec, new_rec in zip(before_recs, after_recs):
        old_n = count_Ns(old_rec)
        new_n = count_Ns(new_rec)
        if opts.strict:
            if new_n == 0:
                continue
        elif new_n < old_n:
            continue
        exclude.add(old_rec.id)

    logging.debug(
        "Ignore {0} updates because of decreasing quality.".format(len(exclude))
    )

    kept_before = [x for x in Bed(beforebed, sorted=False) if x.accn not in exclude]
    kept_after = [x for x in Bed(afterbed, sorted=False) if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(kept_before)
    bfbed = Bed()
    bfbed.extend(kept_after)
    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from jcvi.utils.progressbar import ProgressBar, Percentage, Bar, ETA

    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int", help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int", help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastqfile,) = args
    N = opts.N
    nreads = opts.nreads

    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    # Map base -> row index of the L/R count matrices.  Renamed from `p`,
    # which silently shadowed the OptionParser above.
    base_index = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()

    widgets = [
        'Parse reads: ',
        Percentage(),
        ' ',
        Bar(marker='>', left='[', right=']'),
        ' ',
        ETA(),
    ]
    pr = ProgressBar(maxval=nreads, term_width=60, widgets=widgets).start()

    k = 0
    fw_L = open("L.fasta", "w")
    fw_R = open("R.fasta", "w")
    fastq = fastqfile.endswith(".fastq")
    it = iter_fastq(fastqfile) if fastq else SeqIO.parse(must_open(fastqfile), "fasta")
    for rec in it:
        k += 1
        if k % 1000 == 0:
            pr.update(k)
        if k > nreads:
            break
        if rec is None:
            break
        s = str(rec.seq)
        for i, a in enumerate(s[:N]):
            if a in base_index:
                L[base_index[a]][i] += 1
        for j, a in enumerate(s[-N:][::-1]):
            if a in base_index:
                R[base_index[a]][N - 1 - j] += 1
        l4, r3 = s[:4], s[-3:]
        L4[l4] += 1
        R3[r3] += 1
        # BUGFIX: `print >> fh` is Python-2-only syntax (a SyntaxError on
        # Python 3); use the print() function with file=.
        print(">{0}\n{1}".format(k, s[:N]), file=fw_L)
        print(">{0}\n{1}".format(k, s[-N:]), file=fw_R)

    fw_L.close()
    fw_R.close()

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    sh(cmd.format('L', "First_10_bases"))
    sh(cmd.format('R', "Last_10_bases"))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=',', fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=',', fmt="%d")

    fw = open("L4.common", "w")
    for seq, c in L4.most_common(N):
        print("\t".join((seq, str(c))), file=fw)
    fw.close()

    fw = open("R3.common", "w")
    for seq, c in R3.most_common(N):
        print("\t".join((seq, str(c))), file=fw)
    fw.close()
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.

    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    from jcvi.apps.base import blast
    from jcvi.formats.blast import BlastSlow
    from jcvi.formats.fasta import SeqIO
    from jcvi.utils.iter import roundrobin

    p = OptionParser(install.__doc__)
    p.add_option(
        "--rclip",
        default=1,
        type="int",
        help="Pair ID is derived from rstrip N chars [default: %default]")
    p.add_option(
        "--maxsize",
        default=1000000,
        type="int",
        help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix", help="Prefix of the new object [default: %default]")
    p.add_option(
        "--strict",
        default=False,
        action="store_true",
        help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    Max = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip
    prefix = opts.prefix

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order

    beforebed, afterbed = "before.bed", "after.bed"
    fwa = open(beforebed, "w")
    fwb = open(afterbed, "w")

    # BUGFIX: the original `key2 = lambda x: x.query[:-rclip] if rclip else key1`
    # parsed the conditional INSIDE the lambda body, so with --rclip=0 the key
    # function returned another function instead of the query name.  Choose
    # between the two key functions up front instead.
    key = (lambda x: x.query[:-rclip]) if rclip else (lambda x: x.query)

    data = BlastSlow(blastfile)
    for pe, lines in groupby(data, key=key):
        lines = list(lines)
        # Each patcher should hit as an L/R pair on the same subject.
        if len(lines) != 2:
            continue
        a, b = lines
        aquery, bquery = a.query, b.query
        asubject, bsubject = a.subject, b.subject
        if asubject != bsubject:
            continue

        astrand, bstrand = a.orientation, b.orientation
        assert aquery[-1] == 'L' and bquery[-1] == 'R', str((aquery, bquery))

        ai, ax = order[aquery]
        bi, bx = order[bquery]
        qstart, qstop = ax.start + a.qstart - 1, bx.start + b.qstop - 1

        # Both flanks must be on the same strand for a consistent patch.
        if astrand == '+' and bstrand == '+':
            sstart, sstop = a.sstart, b.sstop
        elif astrand == '-' and bstrand == '-':
            sstart, sstop = b.sstart, a.sstop
        else:
            continue

        if sstart > sstop:
            continue
        if sstop > sstart + Max:
            continue

        name = aquery[:-1] + "LR"
        # BUGFIX: `print >> fh` is Python-2-only syntax; use print(..., file=).
        print(
            "\t".join(str(x) for x in (ax.seqid, qstart - 1, qstop, name, 1000, "+")),
            file=fwa,
        )
        print(
            "\t".join(str(x) for x in (asubject, sstart - 1, sstop, name, 1000, astrand)),
            file=fwb,
        )

    fwa.close()
    fwb.close()

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    # Exclude the replacements that contain more Ns than before
    ah = SeqIO.parse(beforefasta, "fasta")
    bh = SeqIO.parse(afterfasta, "fasta")
    count_Ns = lambda x: x.seq.count('n') + x.seq.count('N')
    exclude = set()
    for arec, brec in zip(ah, bh):
        an = count_Ns(arec)
        bn = count_Ns(brec)
        if opts.strict:
            if bn == 0:
                continue
        elif bn < an:
            continue
        # Renamed from `id` to avoid shadowing the builtin.
        exclude.add(arec.id)

    logging.debug(
        "Ignore {0} updates because of decreasing quality.".format(len(exclude))
    )

    abed = Bed(beforebed, sorted=False)
    bbed = Bed(afterbed, sorted=False)
    abed = [x for x in abed if x.accn not in exclude]
    bbed = [x for x in bbed if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(abed)
    bfbed = Bed()
    bfbed.extend(bbed)
    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    # Shuffle the two bedfiles together
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    # Renamed from `all` to avoid shadowing the builtin.
    all_beds = []
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    import math

    pad = int(math.log10(totalids)) + 1
    cj = 0
    seen = set()
    accn = lambda x: "{0}{1:0{2}d}".format(prefix, x, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds, beds = [], [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size})
        # Distinct loop variable so the outer `seqid` is not clobbered
        # before seen.add(seqid) below.
        for cseqid, start, end in cranges:
            bedline = "\t".join(str(x) for x in (cseqid, start - 1, end))
            abeds.append(BedLine(bedline))

        for a in aa:
            gapid = a.accn
            bi, b = border[gapid]
            bbeds.append(b)

        a = abeds[0] if abeds else []
        assert abs(len(abeds) - len(bbeds)) <= 1

        # Decide which list leads the interleave based on where coverage starts.
        if (not a) or a.start > 1:
            abeds, bbeds = bbeds, abeds

        beds = list(roundrobin(abeds, bbeds))
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        all_beds.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue
        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)
        cj += 1
        if prefix:
            b.accn = accn(cj)

        all_beds.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(all_beds)
    shuffledbed.print_to_file(shuffled)
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from rich.progress import Progress

    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int", help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int", help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastqfile,) = args
    N = opts.N
    nreads = opts.nreads

    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    # Map base -> row index of the L/R count matrices.  Renamed from `p`,
    # which silently shadowed the OptionParser above.
    base_index = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()

    k = 0
    fw_L = open("L.fasta", "w")
    fw_R = open("R.fasta", "w")
    fastq = fastqfile.endswith(".fastq")
    it = iter_fastq(fastqfile) if fastq else SeqIO.parse(must_open(fastqfile), "fasta")

    with Progress() as progress:
        # BUGFIX: the task handle was discarded (and created with start=False),
        # so the bar never moved.  Keep the handle and advance it per read.
        task = progress.add_task("[green] Processing ...", total=nreads)
        for rec in it:
            k += 1
            if k > nreads:
                break
            if rec is None:
                break
            progress.update(task, advance=1)
            s = str(rec.seq)
            for i, a in enumerate(s[:N]):
                if a in base_index:
                    L[base_index[a]][i] += 1
            for j, a in enumerate(s[-N:][::-1]):
                if a in base_index:
                    R[base_index[a]][N - 1 - j] += 1
            l4, r3 = s[:4], s[-3:]
            L4[l4] += 1
            R3[r3] += 1
            print(">{0}\n{1}".format(k, s[:N]), file=fw_L)
            print(">{0}\n{1}".format(k, s[-N:]), file=fw_R)

    fw_L.close()
    fw_R.close()

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    sh(cmd.format("L", "First_10_bases"))
    sh(cmd.format("R", "Last_10_bases"))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=",", fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=",", fmt="%d")

    fw = open("L4.common", "w")
    for seq, c in L4.most_common(N):
        print("\t".join((seq, str(c))), file=fw)
    fw.close()

    fw = open("R3.common", "w")
    for seq, c in R3.most_common(N):
        print("\t".join((seq, str(c))), file=fw)
    fw.close()