def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write a report.

    Reads ``args.outdir``, ``args.ncpus``, ``args.inputfile`` and
    ``args.threshold``. The output directory is created when it does not
    exist yet. An input holding exactly one motif is not clustered: that
    motif becomes the only cluster, with the motif list as its members.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    ncpus = args.ncpus

    # Stays None when clustering is skipped; without this default the report
    # call below raised UnboundLocalError for a single-motif input.
    # NOTE(review): _write_report is assumed not to dereference the tree for
    # a lone single-member cluster -- confirm against its implementation.
    tree = None

    motifs = read_motifs(args.inputfile)
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            args.inputfile,
            "total",
            "wic",
            "mean",
            True,
            threshold=args.threshold,
            include_bg=True,
            ncpus=ncpus,
        )
        clusters = tree.getResult()

    ids = _create_images(outdir, clusters)
    _write_report(outdir, ids, tree, clusters)
def test1_cluster_motifs(self):
    """Cluster a pwm file with motifs and check the cluster sizes."""
    # Run clustering
    tree = cluster_motifs(
        self.pwm, "total", "wic", "mean", True, threshold=0.95, include_bg=True
    )
    clusters = tree.getResult()

    self.assertEqual(2, len(clusters))

    # Compare member counts (entry[1]) largest first, so the check does not
    # depend on the order in which the clusters are returned. The previous
    # sort used the Python 2-only ``cmp`` argument and compared len(entry)
    # rather than the member count.
    sizes = sorted((len(c[1]) for c in clusters), reverse=True)
    self.assertEqual([3, 2], sizes)
def test1_cluster_motifs(self):
    """Cluster a pwm file with motifs and check the cluster sizes."""
    # Run clustering
    tree = cluster_motifs(
        self.pwm, "total", "wic", "mean", True, threshold=0.95, include_bg=True
    )
    clusters = tree.getResult()

    self.assertEqual(2, len(clusters))

    # The old key, len(entry), measured the result entry itself instead of
    # its member list (entry[1]), so the assertion relied on the order the
    # clusters came back in. Sort the member counts, largest first.
    member_counts = sorted((len(c[1]) for c in clusters), reverse=True)
    self.assertEqual([3, 2], member_counts)
def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write a report.

    Reads ``args.outdir``, ``args.ncpus``, ``args.inputfile`` and
    ``args.threshold``. The output directory is created when it does not
    exist yet. An input holding exactly one motif is not clustered: that
    motif becomes the only cluster, with the motif list as its members.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    ncpus = args.ncpus

    # Default for the single-motif path, where no tree is built; previously
    # the name was unbound there and the report call raised
    # UnboundLocalError.
    # NOTE(review): _write_report is assumed not to dereference the tree for
    # a lone single-member cluster -- confirm against its implementation.
    tree = None

    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            args.inputfile,
            "total",
            "wic",
            "mean",
            True,
            threshold=args.threshold,
            include_bg=True,
            ncpus=ncpus,
        )
        clusters = tree.getResult()

    ids = _create_images(outdir, clusters)
    _write_report(outdir, ids, tree, clusters)
def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write an HTML report.

    Reads ``args.outdir``, ``args.inputfile`` and ``args.threshold``. The
    output directory is created when missing and receives:

    * one PNG per cluster motif and one per cluster member,
    * ``cluster_report.html`` rendered from ``cluster_template.jinja.html``,
    * ``cluster_key.txt`` with one ``cluster<TAB>member,member,...`` line
      per cluster,
    * ``clustered_motifs.pwm`` with the clustered motifs.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2

    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        # Nothing to cluster: the motif is its own (only) cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            args.inputfile,
            "total",
            "wic",
            "mean",
            True,
            threshold=args.threshold,
            include_bg=True,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster_motif, members in clusters:
        cluster_motif.trim(trim_ic)
        cluster_motif.to_img(
            os.path.join(outdir, "%s.png" % cluster_motif.id), format="PNG"
        )
        ids.append([cluster_motif.id, {"src": "%s.png" % cluster_motif.id}, []])

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    cluster_motif, motif, "total", "wic", "mean", pval=True
                )

            # Smallest match position over all members; every member image
            # is padded on the left relative to it. min() replaces the
            # Python 2-only sorted(..., cmp=...)[0][1].
            add_pos = min(hit[1] for hit in scores.values())

            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement, but under the member's id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc

                motif.to_img(
                    os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG",
                    add_left=add,
                )

        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; member entries are recorded for every cluster, including
        # single-member ones -- confirm against the original file.
        ids[-1][2] = [
            {
                "src": "%s.png" % motif.id.replace(" ", "_"),
                "alt": motif.id.replace(" ", "_"),
            }
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()])
    )
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    # The report is written as UTF-8 bytes, so the file is opened in binary
    # mode; writing bytes to a text-mode handle fails on Python 3.
    with open(os.path.join(outdir, "cluster_report.html"), "wb") as f:
        f.write(result.encode('utf-8'))

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for entry in ids:
            f.write("%s\t%s\n" % (entry[0], ",".join([x["alt"] for x in entry[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            # Single unclustered motif: no tree exists in this case.
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
    """Cluster the motifs in ``pfm_file`` and write the cluster report.

    Cluster and member images go to ``self.imgdir``, the HTML report to
    ``self.cluster_report`` and the clustered motifs to ``cluster_pwm``.

    :param pfm_file: path of the motif file to cluster
    :param cluster_pwm: path the clustered motifs are written to
    :param dir: not used by this method; kept for the existing signature
    :param threshold: clustering threshold, converted with ``float()``
    :return: list of ``[cluster_motif, members]`` pairs
    """
    self.logger.info("clustering significant motifs.")

    trim_ic = 0.2

    # Close the input handle deterministically instead of leaking it.
    with open(pfm_file) as handle:
        motifs = read_motifs(handle, fmt="pwm")

    if len(motifs) == 1:
        # Nothing to cluster: the motif is its own (only) cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            pfm_file,
            "total",
            "wic",
            "mean",
            True,
            threshold=float(threshold),
            include_bg=True,
            progress=False,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    cluster, motif, "total", "wic", "mean", pval=True
                )

            # Smallest match position over all members; replaces the
            # Python 2-only sorted(..., cmp=...)[0][1].
            add_pos = min(hit[1] for hit in scores.values())

            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement, but under the member's id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc

                motif.to_img(
                    os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG",
                    add_left=add,
                )

        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; member entries are recorded for every cluster, including
        # single-member ones -- confirm against the original file.
        ids[-1][2] = [
            {
                "src": "images/%s.png" % motif.id.replace(" ", "_"),
                "alt": motif.id.replace(" ", "_"),
            }
            for motif in members
        ]

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([self.config.get_template_dir()])
    )
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
        expname=self.basename,
        motifs=ids,
        inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=GM_VERSION,
    )

    # The report is written as UTF-8 bytes, so open the file in binary mode;
    # writing bytes to a text-mode handle fails on Python 3.
    with open(self.cluster_report, "wb") as f:
        f.write(result.encode('utf-8'))

    with open(cluster_pwm, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            # Single unclustered motif: no tree exists in this case.
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    self.logger.debug("Clustering done. See the result in %s", self.cluster_report)
    return clusters
def nmer_predict(fastafile):
    """Predict motifs from n-mers that are enriched around the sequence centre.

    Steps, as implemented below:

    1. Record the start positions of every 6-, 8-, 10- and 12-mer in the
       FASTA file.
    2. Keep n-mers that occur often enough and whose positions pile up in
       the central histogram bins, and write each as a count matrix to a
       temporary file.
    3. Cluster those matrices, refine the cluster motifs by scanning the
       sequences, cluster the refined motifs again and refine once more.

    Returns a 3-tuple ``(refined_motifs, "", "")``.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source -- confirm against the original file.
    NOTE(review): ``string.maketrans`` and writing ``str`` to a default-mode
    ``NamedTemporaryFile`` look Python 2 specific -- confirm target version.
    """
    from tempfile import NamedTemporaryFile,mkdtemp
    from gimmemotifs.fasta import Fasta
    from numpy import sum,histogram
    from subprocess import Popen,PIPE
    from gimmemotifs.motif import Motif,motif_from_align
    from gimmemotifs.cluster import cluster_motifs
    from string import maketrans

    def rc(seq):
        # Reverse the sequence and swap A<->T, C<->G (upper case only).
        t = maketrans("ATCG", "TAGC")
        return seq[::-1].translate(t)

    f = Fasta(fastafile)
    nmer = {}
    # n-mer length -> weight applied to the flanking bins in the test below.
    N = {6:4, 8:3,10:2,12:1}
    tmp = NamedTemporaryFile()
    # An n-mer must occur more often than 2% of the number of sequences.
    abs_cutoff = len(f.items()) / 100.0 * 2
    for check_n,cutoff in N.items():
        for id,seq in f.items():
            # Collect the start position of every n-mer (upper-cased).
            for i in range(len(seq) - check_n):
                n = seq[i: i + check_n]
                nmer.setdefault(n.upper(), []).append(i)

    for n,pos in nmer.items():
        if len(pos) > abs_cutoff:
            # 9 equal bins over positions 0-200.
            hist = histogram(pos, bins=9, range=(0,200))[0]
            # Bins 3-5 must outweigh bins 0-2 and bins 7-8, each scaled by
            # the length-dependent weight (bin 6 is not used).
            if sum(hist[3:6]) > sum(hist[0:3] * N[len(n)]) and sum(hist[3:6]) > sum(hist[7:]) * N[len(n)]:
                tmp.write(">%s\n" % n)
                # One row per position, columns A, C, G, T; the matching
                # base gets the occurrence count, the others 0.
                for char in n:
                    w = []
                    for x in ["A", "C", "G", "T"]:
                        if x == char:
                            w.append(len(pos))
                        else:
                            w.append(0)
                    tmp.write("\t".join([str(x) for x in w]) + "\n")
    # Flush so cluster_motifs can read the file by name.
    tmp.flush()
    tmpname = tmp.name
    tree = cluster_motifs(tmpname, "subtotal", "ed", "mean", False, threshold=-0.1, include_bg=False)
    clusters = tree.getResult()

    def refine_by_scanning(motifs, fastafile):
        # Scan fastafile with the given motifs via the external pwmscan.py
        # and rebuild a motif from the matched instances of each input motif.
        tmp_gff = NamedTemporaryFile()
        file_in = NamedTemporaryFile()
        for m in motifs:
            file_in.write("%s\n" % m.to_pfm())
        file_in.flush()
        # NOTE(review): the command is run through the shell with the paths
        # interpolated unquoted -- confirm fastafile is always trusted.
        cmd = "pwmscan.py -i %s -p %s -c 0.8 > %s" % (fastafile, file_in.name, tmp_gff.name)
        p = Popen(cmd, shell=True)
        stdout,stderr = p.communicate()
        aligns = {}
        for line in open(tmp_gff.name):
            vals = line.strip().split("\t")
            # Column 9 holds two ' ; '-separated fields; the second word of
            # each (quotes removed) is taken as motif name and instance.
            motif,instance = [x.split(" ")[1].replace('"', "") for x in vals[8].split(" ; ")]
            # Column 7 is the strand; other-strand hits are reverse-complemented.
            if vals[6] == "+":
                aligns.setdefault(motif,[]).append(instance.upper())
            else:
                aligns.setdefault(motif,[]).append(rc(instance.upper()))
        tmp_out = NamedTemporaryFile()
        refined_motifs = []
        for id,align in aligns.items():
            # Only motifs with more than 10 matched instances are rebuilt.
            if len(align) > 10:
                motif = motif_from_align(align)
                refined_motifs.append(motif)
        return refined_motifs
    # First refinement: scan with the cluster motif of every n-mer cluster.
    motifs = refine_by_scanning([x[0] for x in clusters], fastafile)
    tmp4 = NamedTemporaryFile()
    for m in motifs:
        tmp4.write("%s\n" % m.to_pfm())
    tmp4.flush()
    motifs = []
    # Second clustering pass over the refined motifs.
    tree = cluster_motifs(tmp4.name, "total", "wic", "mean", True, threshold=0.95, include_bg=True)
    clusters = tree.getResult()
    for i, (cluster,members) in enumerate(clusters):
        cluster.id = "Nmer_%s" % (i + 1)
        motifs.append(cluster)
    # Final refinement; the results are numbered from 1.
    refined_motifs = refine_by_scanning(motifs, fastafile)
    for i,m in enumerate(refined_motifs):
        m.id = "WannaMotif_%s" % (i + 1)
    return refined_motifs, "", ""
def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
    """Cluster the motifs in ``pfm_file`` and write the cluster report.

    Writes cluster and member images to ``self.imgdir``, the rendered HTML
    report to ``self.cluster_report`` and the clustered motifs to
    ``cluster_pwm``.

    :param pfm_file: path of the motif file to cluster
    :param cluster_pwm: output path for the clustered motifs
    :param dir: unused here; retained so existing callers keep working
    :param threshold: clustering threshold, passed through ``float()``
    :return: list of ``[cluster_motif, members]`` pairs
    """
    self.logger.info("clustering significant motifs.")

    trim_ic = 0.2

    # Read through a context manager so the input handle is always closed.
    with open(pfm_file) as motif_handle:
        motifs = read_motifs(motif_handle, fmt="pwm")

    if len(motifs) == 1:
        # A lone motif forms its own cluster; no tree is built.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            pfm_file,
            "total",
            "wic",
            "mean",
            True,
            threshold=float(threshold),
            include_bg=True,
            progress=False,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    cluster, motif, "total", "wic", "mean", pval=True
                )

            # Leftmost match position among the members. The former
            # sorted(..., cmp=...) call only works on Python 2.
            add_pos = min(hit[1] for hit in scores.values())

            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Plot the reverse complement while keeping the member id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc

                motif.to_img(
                    os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG",
                    add_left=add,
                )

        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; member entries are recorded for every cluster, including
        # single-member ones -- confirm against the original file.
        ids[-1][2] = [
            {
                "src": "images/%s.png" % motif.id.replace(" ", "_"),
                "alt": motif.id.replace(" ", "_"),
            }
            for motif in members
        ]

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([self.config.get_template_dir()])
    )
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
        expname=self.basename,
        motifs=ids,
        inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=GM_VERSION,
    )

    # Encoded bytes need a binary-mode handle; text mode rejects bytes on
    # Python 3.
    with open(self.cluster_report, "wb") as report:
        report.write(result.encode('utf-8'))

    with open(cluster_pwm, "w") as out:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            # Lone motif: write it directly, the tree does not exist.
            out.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                out.write("%s\n" % motif.to_pwm())

    self.logger.debug("Clustering done. See the result in %s", self.cluster_report)
    return clusters
def nmer_predict(fastafile):
    """Predict motifs from n-mers that are enriched around the sequence centre.

    Steps, as implemented below:

    1. Record the start positions of every 6-, 8-, 10- and 12-mer in the
       FASTA file.
    2. Keep n-mers that occur often enough and whose positions pile up in
       the central histogram bins, and write each as a count matrix to a
       temporary file.
    3. Cluster those matrices, refine the cluster motifs by scanning the
       sequences, cluster the refined motifs again and refine once more.

    Returns a 3-tuple ``(refined_motifs, "", "")``.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source -- confirm against the original file.
    NOTE(review): ``string.maketrans`` and writing ``str`` to a default-mode
    ``NamedTemporaryFile`` look Python 2 specific -- confirm target version.
    """
    from tempfile import NamedTemporaryFile, mkdtemp
    from gimmemotifs.fasta import Fasta
    from numpy import sum, histogram
    from subprocess import Popen, PIPE
    from gimmemotifs.motif import Motif, motif_from_align
    from gimmemotifs.cluster import cluster_motifs
    from string import maketrans

    def rc(seq):
        # Reverse the sequence and swap A<->T, C<->G (upper case only).
        t = maketrans("ATCG", "TAGC")
        return seq[::-1].translate(t)

    f = Fasta(fastafile)
    nmer = {}
    # n-mer length -> weight applied to the flanking bins in the test below.
    N = {6: 4, 8: 3, 10: 2, 12: 1}
    tmp = NamedTemporaryFile()
    # An n-mer must occur more often than 2% of the number of sequences.
    abs_cutoff = len(f.items()) / 100.0 * 2
    for check_n, cutoff in N.items():
        for id, seq in f.items():
            # Collect the start position of every n-mer (upper-cased).
            for i in range(len(seq) - check_n):
                n = seq[i:i + check_n]
                nmer.setdefault(n.upper(), []).append(i)

    for n, pos in nmer.items():
        if len(pos) > abs_cutoff:
            # 9 equal bins over positions 0-200.
            hist = histogram(pos, bins=9, range=(0, 200))[0]
            # Bins 3-5 must outweigh bins 0-2 and bins 7-8, each scaled by
            # the length-dependent weight (bin 6 is not used).
            if sum(hist[3:6]) > sum(hist[0:3] * N[len(n)]) and sum(
                    hist[3:6]) > sum(hist[7:]) * N[len(n)]:
                tmp.write(">%s\n" % n)
                # One row per position, columns A, C, G, T; the matching
                # base gets the occurrence count, the others 0.
                for char in n:
                    w = []
                    for x in ["A", "C", "G", "T"]:
                        if x == char:
                            w.append(len(pos))
                        else:
                            w.append(0)
                    tmp.write("\t".join([str(x) for x in w]) + "\n")
    # Flush so cluster_motifs can read the file by name.
    tmp.flush()
    tmpname = tmp.name
    tree = cluster_motifs(tmpname, "subtotal", "ed", "mean", False, threshold=-0.1, include_bg=False)
    clusters = tree.getResult()

    def refine_by_scanning(motifs, fastafile):
        # Scan fastafile with the given motifs via the external pwmscan.py
        # and rebuild a motif from the matched instances of each input motif.
        tmp_gff = NamedTemporaryFile()
        file_in = NamedTemporaryFile()
        for m in motifs:
            file_in.write("%s\n" % m.to_pfm())
        file_in.flush()
        # NOTE(review): the command is run through the shell with the paths
        # interpolated unquoted -- confirm fastafile is always trusted.
        cmd = "pwmscan.py -i %s -p %s -c 0.8 > %s" % (fastafile, file_in.name, tmp_gff.name)
        p = Popen(cmd, shell=True)
        stdout, stderr = p.communicate()
        aligns = {}
        for line in open(tmp_gff.name):
            vals = line.strip().split("\t")
            # Column 9 holds two ' ; '-separated fields; the second word of
            # each (quotes removed) is taken as motif name and instance.
            motif, instance = [
                x.split(" ")[1].replace('"', "") for x in vals[8].split(" ; ")
            ]
            # Column 7 is the strand; other-strand hits are reverse-complemented.
            if vals[6] == "+":
                aligns.setdefault(motif, []).append(instance.upper())
            else:
                aligns.setdefault(motif, []).append(rc(instance.upper()))
        tmp_out = NamedTemporaryFile()
        refined_motifs = []
        for id, align in aligns.items():
            # Only motifs with more than 10 matched instances are rebuilt.
            if len(align) > 10:
                motif = motif_from_align(align)
                refined_motifs.append(motif)
        return refined_motifs
    # First refinement: scan with the cluster motif of every n-mer cluster.
    motifs = refine_by_scanning([x[0] for x in clusters], fastafile)
    tmp4 = NamedTemporaryFile()
    for m in motifs:
        tmp4.write("%s\n" % m.to_pfm())
    tmp4.flush()
    motifs = []
    # Second clustering pass over the refined motifs.
    tree = cluster_motifs(tmp4.name, "total", "wic", "mean", True, threshold=0.95, include_bg=True)
    clusters = tree.getResult()
    for i, (cluster, members) in enumerate(clusters):
        cluster.id = "Nmer_%s" % (i + 1)
        motifs.append(cluster)
    # Final refinement; the results are numbered from 1.
    refined_motifs = refine_by_scanning(motifs, fastafile)
    for i, m in enumerate(refined_motifs):
        m.id = "WannaMotif_%s" % (i + 1)
    return refined_motifs, "", ""