def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to MAC genome. Output a bedfile with
    'lesions' (stack of broken reads) in the MAC genome.
    """
    p = OptionParser(insertion.__doc__)
    p.add_option("--mindepth", default=6, type="int",
                 help="Minimum depth to call an insertion")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    bed = Bed(bedfile)
    fw = must_open(opts.outfile, "w")
    for seqid, feats in bed.sub_beds():
        left_ends = Counter([x.start for x in feats])
        right_ends = Counter([x.end for x in feats])
        selected = []
        for le, count in left_ends.items():
            if count >= mindepth:
                selected.append((seqid, le, "LE-{0}".format(le), count))
        for re, count in right_ends.items():
            if count >= mindepth:
                selected.append((seqid, re, "RE-{0}".format(re), count))
        selected.sort()
        for seqid, pos, label, count in selected:
            label = "{0}-r{1}".format(label, count)
            print("\t".join((seqid, str(pos - 1), str(pos), label)), file=fw)
def __init__(self, haplotype_set, maf=.1):
    self.haplotype_set = haplotype_set
    self.nind = len(haplotype_set)
    self.notmissing = sum(1 for x in haplotype_set if x)
    counter = Counter()
    for haplotypes in haplotype_set:
        counter.update(Counter(haplotypes))
    self.counter = {}
    for h, c in counter.items():
        if c >= self.notmissing * maf:
            self.counter[h] = c
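# An illustrative, stdlib-only sketch of the maf filter applied in the
# __init__ above (assumed to belong to the HaplotypeResolver class used by
# resolve() below): haplotypes seen in fewer than maf * (non-missing
# individuals) are dropped from the counter.
def _demo_haplotype_filter(maf=.5):
    from collections import Counter
    haplotype_set = [{"ACGT"}, {"ACGT", "ACGA"}, {"ACGT"}, set()]
    notmissing = sum(1 for x in haplotype_set if x)   # 3 non-missing individuals
    counter = Counter()
    for haplotypes in haplotype_set:
        counter.update(Counter(haplotypes))
    # With maf=.5 the cutoff is 1.5: "ACGT" (seen 3x) is kept, "ACGA" (1x) dropped
    return {h: c for h, c in counter.items() if c >= notmissing * maf}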
def calc_ldscore(a, b):
    assert len(a) == len(b), "{0}\n{1}".format(a, b)
    # Assumes markers as A/B
    c = Counter(zip(a, b))
    c_aa = c[("A", "A")]
    c_ab = c[("A", "B")]
    c_ba = c[("B", "A")]
    c_bb = c[("B", "B")]
    n = c_aa + c_ab + c_ba + c_bb
    if n == 0:
        return 0

    f = 1.0 / n
    x_aa = c_aa * f
    x_ab = c_ab * f
    x_ba = c_ba * f
    x_bb = c_bb * f

    p_a = x_aa + x_ab
    p_b = x_ba + x_bb
    q_a = x_aa + x_ba
    q_b = x_ab + x_bb
    D = x_aa - p_a * q_a
    denominator = p_a * p_b * q_a * q_b
    if denominator == 0:
        return 0

    r2 = D * D / denominator
    return r2
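# A minimal usage sketch for calc_ldscore() above, with markers encoded as A/B
# strings as the function assumes: two perfectly co-segregating markers give
# r2 = 1, while markers whose alleles pair up independently give r2 = 0.
# Non-A/B calls simply drop out of the 2x2 haplotype table.
def _demo_calc_ldscore():
    assert abs(calc_ldscore("AABBAABB", "AABBAABB") - 1.0) < 1e-9  # complete LD
    assert abs(calc_ldscore("AABB", "ABAB") - 0.0) < 1e-9          # no LD
    return calc_ldscore("AABB-", "ABAB-")                          # missing call ignored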
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate current folder after MAKER run and check for failures. Failed
    batch will be written to a directory for additional work.
    """
    from jcvi.utils.counter import Counter

    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    dslogs = [dsfile.format(x, suffix) for x in fsnames]
    all_failed = []
    for f, d in zip(fsnames, dslogs):
        dslog = DatastoreIndexFile(d)
        counter.update(dslog.scaffold_status.values())
        all_failed.extend([(f, x) for x in dslog.failed])

    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    fw = open(failed, "w")
    print("\n".join(["\t".join((f, x)) for f, x in all_failed]), file=fw)
    fw.close()

    nlines = sum(1 for x in open(failed))
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {0} {1} {2}".format(genome, failed_ids, failed_fasta)
        sh(cmd)
def contrast_stores(bed1_store_r, bed2_store, minreads=10, minpct=.1, prefix="AB"):
    for target, reads in bed1_store_r.items():
        nreads = len(reads)
        if nreads < minreads:
            continue
        good_mapping = max(minreads / 2, minpct * nreads)
        bed2_targets = Counter(bed2_store.get(r) for r in reads)
        c = dict((k, v) for (k, v) in bed2_targets.items() if v >= good_mapping)
        ctag = "|".join("{0}({1})".format(k, v) for (k, v) in c.items())
        print(prefix, target, nreads, ctag, len(set(c.keys()) - set([None])))
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity", choices=("breakpoint", "variant"),
                 default="variant", help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            bed = Bed(filename)
            for b in bed:
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    fw = open(novelbedfile, "w")
    for b in bed:
        accns = b.accn.split(',')
        pfs_accns = [x.split("-")[0] for x in accns]
        pfs_counts = Counter(pfs_accns)
        if len(pfs_counts) != 3:
            print(b, file=fw)
            continue

        valid += 1
        total_counts += pfs_counts
        F1_counts.append(pfs_counts[F1])

        # Collect breakpoint positions between P1 and F1
        P1_accns = [x for x in accns if x.split("-")[0] == P1]
        F1_accns = [x for x in accns if x.split("-")[0] == F1]
        if len(P1_accns) != 1:
            continue

        ri, ref = neworder[P1_accns[0]]
        F1_lines = [neworder[x][-1] for x in F1_accns]
        bp_diff.extend(x.start - ref.start for x in F1_lines)
        bp_diff.extend(x.end - ref.end for x in F1_lines)
    fw.close()

    print("A total of {0} sites show consistent deletions across samples."
          .format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site"
              .format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    fig = plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for l, h in zip(left, height):
            print("{0:>9} variants: {1}".format(l, h), file=sys.stderr)
            plt.text(l, h + 5, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)
        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        for l, h in list(zip(left, height))[:21]:
            plt.text(l, h + 50, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)
        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")

        # Serialize the data to a file
        fw = open("Breakpoint-offset-histogram.csv", "w")
        for k, v in sorted(bp_diff.items()):
            print("{0},{1}".format(k, v), file=fw)
        fw.close()

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20])
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in
                                (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the deletions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        for (seqid, start, end), count in registry.items():
            if count < opts.mindepth:
                continue
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([sortedbedfile, countbedfile,
               "--outfile={0}".format(depthbedfile)])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        logging.debug("Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}"
                      .format(lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that have sequencing gaps on their flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([validbedfile, intersectidsfile, "-v",
              "--outfile={0}".format(selectedbedfile)])

    # Find the best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        branges = [Range(x.seqid, x.start, x.end, int(x.accn.rsplit("r")[-1]), i)
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)"
                      .format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()
def pixel_stats(img):
    img = [(p_round(r), p_round(g), p_round(b)) for r, g, b in img]
    c = Counter(img)
    imgx, count = c.most_common(1)[0]
    return imgx
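# Usage sketch for pixel_stats() above: each RGB channel is bucketed with
# p_round() and the most common bucketed color wins.  p_round() is defined
# elsewhere in the module; the stand-in below (round to the nearest multiple
# of 5) is an assumption for illustration only.
def _demo_pixel_stats():
    from collections import Counter

    def p_round(x, precision=5):  # hypothetical stand-in
        return int(round(x / float(precision))) * precision

    img = [(201, 152, 99), (199, 151, 102), (17, 18, 19)]
    # The two near-tan pixels both collapse to (200, 150, 100), which wins
    c = Counter((p_round(r), p_round(g), p_round(b)) for r, g, b in img)
    dominant, count = c.most_common(1)[0]
    return dominant                                  # (200, 150, 100)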
def tally_markers(self, markers):
    counter = Counter([x.seqid for x in markers])
    self.scaffold_1m = len([x for x in counter.values() if x == 1])
    self.scaffold_2m = len([x for x in counter.values() if x == 2])
    self.scaffold_3m = len([x for x in counter.values() if x == 3])
    self.scaffold_4m = len([x for x in counter.values() if x >= 4])
def mlg_counts(self):
    return Counter([x.mlg for x in self.markers])
def resolve(args):
    """
    %prog resolve matrixfile fastafile bamfolder

    Separate repeats along collapsed contigs. First scan the matrixfile for
    largely heterozygous sites. For each heterozygous site, we scan each bam to
    retrieve distinct haplotypes. The frequency of each haplotype is then
    computed, and the haplotype with the highest frequency, assumed to be
    paralogous, is removed.
    """
    import pysam
    from collections import defaultdict
    from itertools import groupby

    p = OptionParser(resolve.__doc__)
    p.add_option("--missing", default=.5, type="float",
                 help="Maximum level of missing data")
    p.add_option("--het", default=.5, type="float",
                 help="Maximum level of heterozygous calls")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    matrixfile, fastafile, bamfolder = args
    #f = Fasta(fastafile)
    fp = open(matrixfile)
    for row in fp:
        if row[0] != '#':
            break
    header = row.split()
    ngenotypes = len(header) - 4
    nmissing = int(round(opts.missing * ngenotypes))
    logging.debug("A total of {0} individuals scanned".format(ngenotypes))
    logging.debug("Look for markers with < {0} missing and > {1} het"
                  .format(opts.missing, opts.het))
    bamfiles = iglob(bamfolder, "*.bam")
    logging.debug("Folder `{0}` contained {1} bam files"
                  .format(bamfolder, len(bamfiles)))

    data = []
    for row in fp:
        if row[0] == '#':
            continue
        atoms = row.split()
        seqid, pos, ref, alt = atoms[:4]
        genotypes = atoms[4:]
        c = Counter(genotypes)
        c0 = c.get('0', 0)
        c3 = c.get('3', 0)
        if c0 >= nmissing:
            continue
        hetratio = c3 * 1. / (ngenotypes - c0)
        if hetratio <= opts.het:
            continue
        pos = int(pos)
        data.append((seqid, pos, ref, alt, c, hetratio))

    data.sort()
    logging.debug("A total of {0} target markers in {1} contigs."
                  .format(len(data), len(set(x[0] for x in data))))

    samfiles = [pysam.AlignmentFile(x, "rb") for x in bamfiles]
    samfiles = [(op.basename(x.filename).split(".")[0], x) for x in samfiles]
    samfiles.sort(key=lambda x: x[0])
    logging.debug("BAM files grouped to {0} individuals"
                  .format(len(set(x[0] for x in samfiles))))

    fw = must_open(opts.outfile, "w")
    for seqid, d in groupby(data, lambda x: x[0]):
        d = list(d)
        nmarkers = len(d)
        logging.debug("Process contig {0} ({1} markers)".format(seqid, nmarkers))

        haplotype_set = []
        for pf, sf in groupby(samfiles, key=lambda x: x[0]):
            haplotypes = []
            for pfi, samfile in sf:
                reads = defaultdict(list)
                positions = []
                for s, pos, ref, alt, c, hetratio in d:
                    for col in samfile.pileup(seqid):
                        if col.reference_pos != pos - 1:
                            continue
                        for r in col.pileups:
                            rname = r.alignment.query_name
                            rbase = r.alignment.query_sequence[r.query_position]
                            reads[rname].append((pos, rbase))
                    positions.append(pos)
                for read in reads.values():
                    hap = ['-'] * nmarkers
                    for p, rbase in read:
                        hap[positions.index(p)] = rbase
                    hap = "".join(hap)
                    if "-" in hap:
                        continue
                    haplotypes.append(hap)
            haplotypes = set(haplotypes)
            haplotype_set.append(haplotypes)

        hr = HaplotypeResolver(haplotype_set)
        print(seqid, hr, file=fw)
        hr.solve(fw)
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph.
    Mutual best edges are represented as thicker edges.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    p = OptionParser(graph.__doc__)
    p.add_option("--query", default=-1, type="int",
                 help="Search from node, -1 to select random node, 0 to disable")
    p.add_option("--contig", help="Search from contigs, use comma to separate")
    p.add_option("--largest", default=0, type="int",
                 help="Only show largest components")
    p.add_option("--maxsize", default=500, type="int", help="Max graph size")
    p.add_option("--nomutualbest", default=False, action="store_true",
                 help="Do not plot mutual best edges as heavy")
    add_graph_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    query = opts.query
    contig = opts.contig
    largest = opts.largest
    frgctg = opts.frgctg
    edgeweight = not opts.nomutualbest
    G = read_graph(bestedges, maxerr=opts.maxerr)

    if largest:
        # Emulate nx.connected_component_subgraphs(), removed in newer networkx
        components = sorted(nx.connected_components(G), key=len, reverse=True)
        c = min(len(components), largest)
        logging.debug("{0} components found, {1} retained"
                      .format(len(components), c))
        H = nx.Graph()
        for x in components[:c]:
            H.add_edges_from(G.subgraph(x).edges())
        G = H

    if query:
        if query == -1:
            query = choice(list(G.nodes()))
        reads_to_ctgs = parse_ctgs(bestedges, frgctg)
        if contig:
            contigs = set(contig.split(","))
            core = [k for k, v in reads_to_ctgs.items() if v in contigs]
        else:
            ctg = reads_to_ctgs.get(query)
            core = [k for k, v in reads_to_ctgs.items() if v == ctg]
            logging.debug("Reads ({0}) extended from the same contig {1}"
                          .format(len(core), ctg))

        # Extract a local neighborhood
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=core, maxsize=opts.maxsize)
        SG.add_edges_from(H.edges(data=edgeweight))
        G = SG

        seen = []
        for n, attrib in G.nodes(data=True):
            contig = reads_to_ctgs.get(n, "na")
            attrib['label'] = contig
            seen.append(contig)
        c = Counter(seen)
        cc = ["{0}({1})".format(k, v) for k, v in c.most_common()]
        print("Contigs: {0}".format(" ".join(cc)), file=sys.stderr)

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug("Graph written to `{0}` (|V|={1}, |E|={2})"
                  .format(gexf, len(G), G.size()))
def read_graph(bestedges, maxerr=100, directed=False):
    logging.debug("Max error = {0}%".format(maxerr))
    tag = "dir." if directed else ""
    bestgraph = bestedges.split(".")[0] + ".err{0}.{1}graph".format(maxerr, tag)
    if need_update(bestedges, bestgraph):
        G = {} if directed else nx.Graph()
        fp = open(bestedges)
        best_store = {}
        for row in fp:
            if row[0] == '#':
                continue
            id1, lib_id, best5, o5, best3, o3, j1, j2 = row.split()
            id1, best5, best3 = int(id1), int(best5), int(best3)
            j1, j2 = float(j1), float(j2)
            if j1 <= maxerr or j2 <= maxerr:
                if not directed:
                    G.add_node(id1)
                id1p5, id1p3 = "{0}-5'".format(id1), "{0}-3'".format(id1)
                best5o5 = "{0}-{1}".format(best5, o5)
                best3o3 = "{0}-{1}".format(best3, o3)
                best_store[id1p5] = best5o5
                best_store[id1p3] = best3o3
                if best5 and j1 <= maxerr:
                    if directed:
                        G[id1p5] = best5o5
                    else:
                        G.add_edge(best5, id1, weight=10)
                if best3 and j2 <= maxerr:
                    if directed:
                        G[id1p3] = best3o3
                    else:
                        G.add_edge(id1, best3, weight=10)

        # Annotate mutual best links; their edge weight (11) is kept close to
        # the default (10) to minimize impact on the layout (Yifan Hu's
        # multilevel)
        nmutuals = 0
        for k, v in best_store.items():
            if best_store.get(v) == k and k < v:
                nmutuals += 1
                if not directed:
                    a, b = int(k.split("-")[0]), int(v.split("-")[0])
                    if G.has_edge(a, b):
                        G[a][b]["weight"] = 11
        logging.debug("Mutual best edges: {0}".format(nmutuals))

        if directed:
            fw = open(bestgraph, "w")
            dump(G, fw)
            fw.close()
        else:
            nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

        if not directed:
            # Compute node degree histogram and save as a (degree, counts) tab file
            degrees = dict(G.degree())
            degree_counter = Counter(degrees.values())
            degreesfile = "degrees.txt"
            fw = open(degreesfile, "w")
            for degree, count in sorted(degree_counter.items()):
                print("{0}\t{1}".format(degree, count), file=fw)
            fw.close()
            logging.debug("Node degree distribution saved to `{0}`"
                          .format(degreesfile))

            # Save high degree (top 0.1%) nodes in a (node, degree) tab file
            percentile = sorted(degrees.values(),
                                reverse=True)[len(degrees) // 1000]
            logging.debug("Top 0.1% has degree of at least {0}".format(percentile))
            hubs = [(k, v) for k, v in degrees.items() if v >= percentile]
            hubs.sort(key=lambda x: x[1], reverse=True)  # degrees, descending
            hubsfile = "hubs.txt"
            fw = open(hubsfile, "w")
            for node, degree in hubs:
                print("{0}\t{1}".format(node, degree), file=fw)
            fw.close()
            logging.debug("Hubs saved to `{0}`".format(hubsfile))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    if directed:
        G = load(open(bestgraph))
    else:
        G = nx.read_gpickle(bestgraph)
    graph_stats(G)
    return G
def prune(args):
    """
    %prog prune best.edges

    Prune the overlap graph.
    """
    from collections import defaultdict

    p = OptionParser(prune.__doc__)
    add_graph_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr)
    reads_to_ctgs = parse_ctgs(bestedges, opts.frgctg)
    edges = defaultdict(int)
    r = defaultdict(int)
    for a, b, d in G.edges(data=True):
        ua, ub = reads_to_ctgs.get(a), reads_to_ctgs.get(b)
        nn = (ua, ub).count(None)
        if nn == 0:
            if ua == ub:
                r["Same tigs"] += 1
            else:
                r["Diff tigs"] += 1
                if ua > ub:
                    ua, ub = ub, ua
                edges[(ua, ub)] += 1
        elif nn == 1:
            r["One null"] += 1
        else:
            assert nn == 2
            r["Two nulls"] += 1

    U = nx.Graph()
    difftigs = "diff_tigs.txt"
    neighbors = defaultdict(list)
    fw = open(difftigs, "w")
    for (ua, ub), count in edges.items():
        print("\t".join((ua, ub, str(count))), file=fw)
        U.add_edge(ua, ub, weight=count)
        neighbors[ua].append((ub, count))
        neighbors[ub].append((ua, count))
    fw.close()

    print("[Unitig edge property]", file=sys.stderr)
    for k, v in r.items():
        print(": ".join((k, str(v))), file=sys.stderr)
    print("Total: {0}".format(sum(r.values())), file=sys.stderr)

    print("[Unitig degree distribution]", file=sys.stderr)
    degrees = dict(U.degree())
    degree_counter = Counter(degrees.values())
    for degree, count in sorted(degree_counter.items()):
        print("{0}\t{1}".format(degree, count), file=sys.stderr)

    # To find associative contigs, look for a contig that is connected to
    # exactly one other contig - and do that recursively until no more
    # contigs can be found
    associative = {}
    for ua, ubs in neighbors.items():
        if len(ubs) == 1:  # Only one neighbor
            ub, count = ubs[0]
            if count >= 2:  # Bubble
                associative[ua] = (ub, count)
    print("A total of {0} associative contigs found"
          .format(len(associative)), file=sys.stderr)

    # Keep only one of each mutually associative pair
    for ua, (ub, count) in list(associative.items()):
        if ub in associative and ua < ub:
            print(ua, "mutually associative with", ub, file=sys.stderr)
            del associative[ub]
    print("A total of {0} associative contigs retained"
          .format(len(associative)), file=sys.stderr)

    assids = "associative.ids"
    fw = open(assids, "w")
    for ua, (ub, count) in sorted(associative.items(),
                                  key=lambda x: (x[1], x[0])):
        print("\t".join((ua, ub, str(count))), file=fw)
    fw.close()
    logging.debug("Associative contigs written to `{0}`".format(assids))
def weblogo(args):
    """
    %prog weblogo [fastafile|fastqfile]

    Extract base composition for reads
    """
    import numpy as np
    from jcvi.utils.progressbar import ProgressBar, Percentage, Bar, ETA

    p = OptionParser(weblogo.__doc__)
    p.add_option("-N", default=10, type="int",
                 help="Count the first and last N bases")
    p.add_option("--nreads", default=1000000, type="int",
                 help="Parse first N reads")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    N = opts.N
    nreads = opts.nreads

    pat = "ATCG"
    L = np.zeros((4, N), dtype="int32")
    R = np.zeros((4, N), dtype="int32")
    base_index = dict((a, i) for (i, a) in enumerate(pat))
    L4, R3 = Counter(), Counter()
    widgets = ['Parse reads: ', Percentage(), ' ',
               Bar(marker='>', left='[', right=']'), ' ', ETA()]
    pr = ProgressBar(maxval=nreads, term_width=60, widgets=widgets).start()
    k = 0
    fw_L = open("L.fasta", "w")
    fw_R = open("R.fasta", "w")
    fastq = fastqfile.endswith(".fastq")
    it = iter_fastq(fastqfile) if fastq else \
         SeqIO.parse(must_open(fastqfile), "fasta")
    for rec in it:
        k += 1
        if k % 1000 == 0:
            pr.update(k)
        if k > nreads:
            break
        if rec is None:
            break
        s = str(rec.seq)
        for i, a in enumerate(s[:N]):
            if a in base_index:
                L[base_index[a]][i] += 1
        for j, a in enumerate(s[-N:][::-1]):
            if a in base_index:
                R[base_index[a]][N - 1 - j] += 1
        l4, r3 = s[:4], s[-3:]
        L4[l4] += 1
        R3[r3] += 1
        print(">{0}\n{1}".format(k, s[:N]), file=fw_L)
        print(">{0}\n{1}".format(k, s[-N:]), file=fw_R)

    fw_L.close()
    fw_R.close()

    cmd = "weblogo -F png -s large -f {0}.fasta -o {0}.png"
    cmd += " --color-scheme classic --composition none -U probability"
    cmd += " --title {1}"
    sh(cmd.format('L', "First_10_bases"))
    sh(cmd.format('R', "Last_10_bases"))

    np.savetxt("L.{0}.csv".format(pat), L, delimiter=',', fmt="%d")
    np.savetxt("R.{0}.csv".format(pat), R, delimiter=',', fmt="%d")

    fw = open("L4.common", "w")
    for p, c in L4.most_common(N):
        print("\t".join((p, str(c))), file=fw)
    fw.close()

    fw = open("R3.common", "w")
    for p, c in R3.most_common(N):
        print("\t".join((p, str(c))), file=fw)
    fw.close()