Ejemplo n.º 1
0
def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write the results to ``args.outdir``.

    The output directory is created when it does not exist yet. Image
    creation and report writing are delegated to ``_create_images`` and
    ``_write_report``.

    Args:
        args: object providing the attributes ``outdir``, ``inputfile``,
            ``threshold`` and ``ncpus``.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    ncpus = args.ncpus

    motifs = read_motifs(args.inputfile)

    # A single motif is not clustered, so no tree exists for it. Binding the
    # name up front keeps the _write_report() call below from raising
    # UnboundLocalError in that case.
    # NOTE(review): confirm _write_report() accepts tree=None for
    # single-motif input.
    tree = None
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            args.inputfile,
            "total",
            "wic",
            "mean",
            True,
            threshold=args.threshold,
            include_bg=True,
            ncpus=ncpus,
        )
        clusters = tree.getResult()

    ids = _create_images(outdir, clusters)
    _write_report(outdir, ids, tree, clusters)
Ejemplo n.º 2
0
    def test1_cluster_motifs(self):
        """ cluster a pwm file with motifs """
        # Run clustering
        tree = cluster_motifs(self.pwm,
                              "total",
                              "wic",
                              "mean",
                              True,
                              threshold=0.95,
                              include_bg=True)

        clusters = tree.getResult()

        self.assertEqual(2, len(clusters))
        # The previous sort compared len() of each result entry, which is the
        # same for every entry, so the check silently depended on the order
        # returned by getResult(). Sort the member counts themselves instead
        # (largest first) so the assertion is order-independent.
        member_counts = sorted((len(entry[1]) for entry in clusters),
                               reverse=True)
        self.assertEqual([3, 2], member_counts)
Ejemplo n.º 3
0
    def test1_cluster_motifs(self):
        """ cluster a pwm file with motifs """
        # Run clustering
        tree = cluster_motifs(self.pwm,
                              "total",
                              "wic",
                              "mean",
                              True,
                              threshold=0.95,
                              include_bg=True)

        clusters = tree.getResult()

        self.assertEqual(2, len(clusters))
        # Sorting on len(entry) was a no-op because every result entry has the
        # same length; the expected [3, 2] only held for one particular result
        # order. Compare the member counts sorted largest-first instead.
        member_counts = sorted((len(entry[1]) for entry in clusters),
                               reverse=True)
        self.assertEqual([3, 2], member_counts)
Ejemplo n.º 4
0
def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write the results to ``args.outdir``.

    The output directory is created when it does not exist yet. Image
    creation and report writing are delegated to ``_create_images`` and
    ``_write_report``.

    Args:
        args: object providing the attributes ``outdir``, ``inputfile``,
            ``threshold`` and ``ncpus``.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    ncpus = args.ncpus

    motifs = pwmfile_to_motifs(args.inputfile)

    # With a single motif nothing is clustered and no tree is built. Bind the
    # name anyway so the _write_report() call below does not raise
    # UnboundLocalError.
    # NOTE(review): confirm _write_report() accepts tree=None for
    # single-motif input.
    tree = None
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            args.inputfile,
            "total",
            "wic",
            "mean",
            True,
            threshold=args.threshold,
            include_bg=True,
            ncpus=ncpus,
        )
        clusters = tree.getResult()

    ids = _create_images(outdir, clusters)
    _write_report(outdir, ids, tree, clusters)
Ejemplo n.º 5
0
def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write images and reports.

    Files written to the output directory:

    * one PNG per cluster, plus one PNG per member of every cluster that has
      more than one member,
    * ``cluster_report.html`` rendered from ``cluster_template.jinja.html``,
    * ``cluster_key.txt`` with one line per cluster: cluster id, a tab, and
      the comma-separated member ids (spaces replaced by underscores),
    * ``clustered_motifs.pwm`` with the clustered motifs.

    Args:
        args: object providing the attributes ``outdir``, ``inputfile`` and
            ``threshold``.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Passed to cluster.trim() below.
    # NOTE(review): presumably an information-content cutoff -- confirm.
    trim_ic = 0.2

    motifs = pwmfile_to_motifs(args.inputfile)
    tree = None
    if len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(args.inputfile, "total", "wic", "mean", True,
                              threshold=args.threshold, include_bg=True)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(outdir, "%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src": "%s.png" % cluster.id}, []])

        if len(members) > 1:
            # compare_motifs() results are unpacked as (score, pos, strand).
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)

            # Smallest position over all members; min() replaces the
            # Python-2-only sorted(cmp=...) and yields the same value.
            add_pos = min(result[1] for result in scores.values())

            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement, keeping the original id so
                    # the image file name is unchanged.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc

                motif.to_img(
                    os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG",
                    add_left=add,
                )

        ids[-1][2] = [
            {
                "src": "%s.png" % member.id.replace(" ", "_"),
                "alt": member.id.replace(" ", "_"),
            }
            for member in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    # The report is encoded explicitly, so the file is opened in binary mode;
    # writing bytes to a text-mode file fails on Python 3.
    with open(os.path.join(outdir, "cluster_report.html"), "wb") as f:
        f.write(result.encode('utf-8'))

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for entry in ids:
            f.write("%s\t%s\n" % (entry[0], ",".join([x["alt"] for x in entry[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
Ejemplo n.º 6
0
    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        """Cluster the motifs in ``pfm_file`` and write images, report and motifs.

        Cluster and member images go to ``self.imgdir``, the HTML report to
        ``self.cluster_report`` and the clustered motifs to ``cluster_pwm``.

        Args:
            pfm_file: path of the motif file to cluster.
            cluster_pwm: path of the output file for the clustered motifs.
            dir: not used by this method; kept for interface compatibility.
            threshold: clustering threshold; converted with ``float()``.

        Returns:
            The list of ``[cluster, members]`` pairs.
        """
        self.logger.info("clustering significant motifs.")

        # Passed to cluster.trim() below.
        # NOTE(review): presumably an information-content cutoff -- confirm.
        trim_ic = 0.2

        # Close the input handle deterministically instead of leaking it.
        with open(pfm_file) as handle:
            motifs = read_motifs(handle, fmt="pwm")

        tree = None
        if len(motifs) == 1:
            # Nothing to cluster: the single motif is its own cluster.
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(pfm_file,
                                  "total",
                                  "wic",
                                  "mean",
                                  True,
                                  threshold=float(threshold),
                                  include_bg=True,
                                  progress=False)
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for cluster, members in clusters:
            cluster.trim(trim_ic)
            cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id),
                           format="PNG")
            ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])

            if len(members) > 1:
                # compare_motifs() results are unpacked as
                # (score, pos, strand).
                scores = {}
                for motif in members:
                    scores[motif] = mc.compare_motifs(cluster,
                                                      motif,
                                                      "total",
                                                      "wic",
                                                      "mean",
                                                      pval=True)

                # Smallest position over all members; min() replaces the
                # Python-2-only sorted(cmp=...) and yields the same value.
                add_pos = min(result[1] for result in scores.values())

                for motif in members:
                    score, pos, strand = scores[motif]
                    add = pos - add_pos

                    if strand not in [1, "+"]:
                        # Draw the reverse complement, keeping the original
                        # id so the image file name is unchanged.
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc

                    motif.to_img(os.path.join(
                        self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                                 format="PNG",
                                 add_left=add)

            ids[-1][2] = [
                {
                    "src": "images/%s.png" % member.id.replace(" ", "_"),
                    "alt": member.id.replace(" ", "_"),
                }
                for member in members
            ]

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(expname=self.basename,
                                 motifs=ids,
                                 inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"),
                                 version=GM_VERSION)

        # The report is encoded explicitly, so the file is opened in binary
        # mode; writing bytes to a text-mode file fails on Python 3.
        with open(self.cluster_report, "wb") as f:
            f.write(result.encode('utf-8'))

        with open(cluster_pwm, "w") as f:
            if len(clusters) == 1 and len(clusters[0][1]) == 1:
                f.write("%s\n" % clusters[0][0].to_pwm())
            else:
                for motif in tree.get_clustered_motifs():
                    f.write("%s\n" % motif.to_pwm())

        self.logger.debug("Clustering done. See the result in %s",
                          self.cluster_report)
        return clusters
Ejemplo n.º 7
0
def nmer_predict(fastafile):
    """Predict motifs from n-mers whose start positions pile up mid-sequence.

    Steps, as implemented below:

    1. Collect the start positions of every 6-, 8-, 10- and 12-mer in the
       sequences of ``fastafile``.
    2. Keep n-mers that occur often enough and whose position histogram
       (9 bins over 0-200) is higher in bins 3-5 than in the flanking bins,
       and write each as a count matrix.
    3. Cluster those matrices, refine the clusters by scanning the input
       with ``pwmscan.py``, cluster again, and refine once more.

    Args:
        fastafile: path of the FASTA file to analyse.

    Returns:
        A 3-tuple ``(refined_motifs, "", "")``; the motifs carry ids of the
        form ``WannaMotif_<n>``.
    """
    from tempfile import NamedTemporaryFile
    from gimmemotifs.fasta import Fasta
    from numpy import sum, histogram
    from subprocess import Popen
    from gimmemotifs.motif import motif_from_align
    from gimmemotifs.cluster import cluster_motifs
    try:
        from string import maketrans  # Python 2
    except ImportError:
        maketrans = str.maketrans  # Python 3

    def rc(seq):
        """Return the reverse complement of an upper-case A/C/G/T string."""
        return seq[::-1].translate(maketrans("ATCG", "TAGC"))

    def refine_by_scanning(motifs, fastafile):
        """Scan ``fastafile`` with ``motifs`` and rebuild motifs from the hits.

        Only motifs with more than 10 matching instances are rebuilt.
        """
        aligns = {}
        # mode="w": text is written to these files, which the default binary
        # mode rejects on Python 3.
        with NamedTemporaryFile(mode="w") as file_in, \
                NamedTemporaryFile(mode="w") as tmp_gff:
            for m in motifs:
                file_in.write("%s\n" % m.to_pfm())
            file_in.flush()

            # Argument list plus stdout redirection instead of a shell
            # string: a file name containing spaces or shell metacharacters
            # can no longer break (or inject into) the command.
            cmd = ["pwmscan.py", "-i", fastafile, "-p", file_in.name,
                   "-c", "0.8"]
            Popen(cmd, stdout=tmp_gff).communicate()

            with open(tmp_gff.name) as gff:
                for line in gff:
                    vals = line.strip().split("\t")
                    # Column 9 holds two ' ; '-separated 'key "value"'
                    # fields: the motif name and the matched instance.
                    motif, instance = [
                        x.split(" ")[1].replace('"', "")
                        for x in vals[8].split(" ; ")
                    ]

                    if vals[6] == "+":
                        aligns.setdefault(motif, []).append(instance.upper())
                    else:
                        aligns.setdefault(motif, []).append(
                            rc(instance.upper()))

        return [
            motif_from_align(align)
            for align in aligns.values()
            if len(align) > 10
        ]

    f = Fasta(fastafile)
    sequences = list(f.items())

    # n-mer length -> factor applied to the flanking histogram counts.
    N = {6: 4, 8: 3, 10: 2, 12: 1}
    # An n-mer needs more occurrences than 2% of the number of sequences.
    abs_cutoff = len(sequences) / 100.0 * 2

    nmer = {}
    for check_n in N:
        for _, seq in sequences:
            # "+ 1" so the window ending on the last base is counted too;
            # the previous bound silently skipped it.
            for i in range(len(seq) - check_n + 1):
                nmer.setdefault(seq[i:i + check_n].upper(), []).append(i)

    with NamedTemporaryFile(mode="w") as tmp:
        for n, pos in nmer.items():
            if len(pos) > abs_cutoff:
                hist = histogram(pos, bins=9, range=(0, 200))[0]
                factor = N[len(n)]
                middle = sum(hist[3:6])
                # NOTE(review): hist[7:] leaves bin 6 out of the right
                # flank; hist[6:] may have been intended -- confirm.
                if middle > sum(hist[0:3] * factor) and \
                        middle > sum(hist[7:]) * factor:
                    tmp.write(">%s\n" % n)
                    for char in n:
                        # One row per position: all occurrences are assigned
                        # to the observed base, 0 to the others.
                        w = [len(pos) if x == char else 0
                             for x in ["A", "C", "G", "T"]]
                        tmp.write("\t".join([str(x) for x in w]) + "\n")
        tmp.flush()

        tree = cluster_motifs(tmp.name, "subtotal", "ed", "mean", False,
                              threshold=-0.1, include_bg=False)
        clusters = tree.getResult()

    motifs = refine_by_scanning([x[0] for x in clusters], fastafile)

    with NamedTemporaryFile(mode="w") as tmp4:
        for m in motifs:
            tmp4.write("%s\n" % m.to_pfm())
        tmp4.flush()

        tree = cluster_motifs(tmp4.name, "total", "wic", "mean", True,
                              threshold=0.95, include_bg=True)
        clusters = tree.getResult()

    motifs = []
    for i, (cluster, members) in enumerate(clusters):
        cluster.id = "Nmer_%s" % (i + 1)
        motifs.append(cluster)

    refined_motifs = refine_by_scanning(motifs, fastafile)
    for i, m in enumerate(refined_motifs):
        m.id = "WannaMotif_%s" % (i + 1)

    return refined_motifs, "", ""
Ejemplo n.º 8
0
    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        """Cluster the motifs in ``pfm_file`` and write images, report and motifs.

        Cluster and member images go to ``self.imgdir``, the HTML report to
        ``self.cluster_report`` and the clustered motifs to ``cluster_pwm``.

        Args:
            pfm_file: path of the motif file to cluster.
            cluster_pwm: path of the output file for the clustered motifs.
            dir: not used by this method; kept for interface compatibility.
            threshold: clustering threshold; converted with ``float()``.

        Returns:
            The list of ``[cluster, members]`` pairs.
        """
        self.logger.info("clustering significant motifs.")

        # Passed to cluster.trim() below.
        # NOTE(review): presumably an information-content cutoff -- confirm.
        trim_ic = 0.2

        # Close the input handle deterministically instead of leaking it.
        with open(pfm_file) as handle:
            motifs = read_motifs(handle, fmt="pwm")

        tree = None
        if len(motifs) == 1:
            # Nothing to cluster: the single motif is its own cluster.
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(
                    pfm_file,
                    "total",
                    "wic",
                    "mean",
                    True,
                    threshold=float(threshold),
                    include_bg=True,
                    progress=False
                    )
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for cluster, members in clusters:
            cluster.trim(trim_ic)
            cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id), format="PNG")
            ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])

            if len(members) > 1:
                # compare_motifs() results are unpacked as
                # (score, pos, strand).
                scores = {}
                for motif in members:
                    scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)

                # Smallest position over all members; min() replaces the
                # Python-2-only sorted(cmp=...) and yields the same value.
                add_pos = min(result[1] for result in scores.values())

                for motif in members:
                    score, pos, strand = scores[motif]
                    add = pos - add_pos

                    if strand not in [1, "+"]:
                        # Draw the reverse complement, keeping the original
                        # id so the image file name is unchanged.
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc

                    motif.to_img(
                        os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                        format="PNG",
                        add_left=add,
                    )

            ids[-1][2] = [
                {
                    "src": "images/%s.png" % member.id.replace(" ", "_"),
                    "alt": member.id.replace(" ", "_"),
                }
                for member in members
            ]

        env = jinja2.Environment(loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(
            expname=self.basename,
            motifs=ids,
            inputfile=self.inputfile,
            date=datetime.today().strftime("%d/%m/%Y"),
            version=GM_VERSION,
        )

        # The report is encoded explicitly, so the file is opened in binary
        # mode; writing bytes to a text-mode file fails on Python 3.
        with open(self.cluster_report, "wb") as f:
            f.write(result.encode('utf-8'))

        with open(cluster_pwm, "w") as f:
            if len(clusters) == 1 and len(clusters[0][1]) == 1:
                f.write("%s\n" % clusters[0][0].to_pwm())
            else:
                for motif in tree.get_clustered_motifs():
                    f.write("%s\n" % motif.to_pwm())

        self.logger.debug("Clustering done. See the result in %s",
                self.cluster_report)
        return clusters
Ejemplo n.º 9
0
def nmer_predict(fastafile):
    """Predict motifs from n-mers whose start positions pile up mid-sequence.

    Steps, as implemented below:

    1. Collect the start positions of every 6-, 8-, 10- and 12-mer in the
       sequences of ``fastafile``.
    2. Keep n-mers that occur often enough and whose position histogram
       (9 bins over 0-200) is higher in bins 3-5 than in the flanking bins,
       and write each as a count matrix.
    3. Cluster those matrices, refine the clusters by scanning the input
       with ``pwmscan.py``, cluster again, and refine once more.

    Args:
        fastafile: path of the FASTA file to analyse.

    Returns:
        A 3-tuple ``(refined_motifs, "", "")``; the motifs carry ids of the
        form ``WannaMotif_<n>``.
    """
    from tempfile import NamedTemporaryFile
    from gimmemotifs.fasta import Fasta
    from numpy import sum, histogram
    from subprocess import Popen
    from gimmemotifs.motif import motif_from_align
    from gimmemotifs.cluster import cluster_motifs
    try:
        from string import maketrans  # Python 2
    except ImportError:
        maketrans = str.maketrans  # Python 3

    def rc(seq):
        """Return the reverse complement of an upper-case A/C/G/T string."""
        return seq[::-1].translate(maketrans("ATCG", "TAGC"))

    def refine_by_scanning(motifs, fastafile):
        """Scan ``fastafile`` with ``motifs`` and rebuild motifs from the hits.

        Only motifs with more than 10 matching instances are rebuilt.
        """
        aligns = {}
        # mode="w": text is written to these files, which the default binary
        # mode rejects on Python 3.
        with NamedTemporaryFile(mode="w") as file_in, \
                NamedTemporaryFile(mode="w") as tmp_gff:
            for m in motifs:
                file_in.write("%s\n" % m.to_pfm())
            file_in.flush()

            # Argument list plus stdout redirection instead of a shell
            # string: a file name containing spaces or shell metacharacters
            # can no longer break (or inject into) the command.
            cmd = ["pwmscan.py", "-i", fastafile, "-p", file_in.name,
                   "-c", "0.8"]
            Popen(cmd, stdout=tmp_gff).communicate()

            with open(tmp_gff.name) as gff:
                for line in gff:
                    vals = line.strip().split("\t")
                    # Column 9 holds two ' ; '-separated 'key "value"'
                    # fields: the motif name and the matched instance.
                    motif, instance = [
                        x.split(" ")[1].replace('"', "")
                        for x in vals[8].split(" ; ")
                    ]

                    if vals[6] == "+":
                        aligns.setdefault(motif, []).append(instance.upper())
                    else:
                        aligns.setdefault(motif, []).append(
                            rc(instance.upper()))

        return [
            motif_from_align(align)
            for align in aligns.values()
            if len(align) > 10
        ]

    f = Fasta(fastafile)
    sequences = list(f.items())

    # n-mer length -> factor applied to the flanking histogram counts.
    N = {6: 4, 8: 3, 10: 2, 12: 1}
    # An n-mer needs more occurrences than 2% of the number of sequences.
    abs_cutoff = len(sequences) / 100.0 * 2

    nmer = {}
    for check_n in N:
        for _, seq in sequences:
            # "+ 1" so the window ending on the last base is counted too;
            # the previous bound silently skipped it.
            for i in range(len(seq) - check_n + 1):
                nmer.setdefault(seq[i:i + check_n].upper(), []).append(i)

    with NamedTemporaryFile(mode="w") as tmp:
        for n, pos in nmer.items():
            if len(pos) > abs_cutoff:
                hist = histogram(pos, bins=9, range=(0, 200))[0]
                factor = N[len(n)]
                middle = sum(hist[3:6])
                # NOTE(review): hist[7:] leaves bin 6 out of the right
                # flank; hist[6:] may have been intended -- confirm.
                if middle > sum(hist[0:3] * factor) and \
                        middle > sum(hist[7:]) * factor:
                    tmp.write(">%s\n" % n)
                    for char in n:
                        # One row per position: all occurrences are assigned
                        # to the observed base, 0 to the others.
                        w = [len(pos) if x == char else 0
                             for x in ["A", "C", "G", "T"]]
                        tmp.write("\t".join([str(x) for x in w]) + "\n")
        tmp.flush()

        tree = cluster_motifs(tmp.name,
                              "subtotal",
                              "ed",
                              "mean",
                              False,
                              threshold=-0.1,
                              include_bg=False)
        clusters = tree.getResult()

    motifs = refine_by_scanning([x[0] for x in clusters], fastafile)

    with NamedTemporaryFile(mode="w") as tmp4:
        for m in motifs:
            tmp4.write("%s\n" % m.to_pfm())
        tmp4.flush()

        tree = cluster_motifs(tmp4.name,
                              "total",
                              "wic",
                              "mean",
                              True,
                              threshold=0.95,
                              include_bg=True)
        clusters = tree.getResult()

    motifs = []
    for i, (cluster, members) in enumerate(clusters):
        cluster.id = "Nmer_%s" % (i + 1)
        motifs.append(cluster)

    refined_motifs = refine_by_scanning(motifs, fastafile)
    for i, m in enumerate(refined_motifs):
        m.id = "WannaMotif_%s" % (i + 1)

    return refined_motifs, "", ""