Ejemplo n.º 1
0
    def determine_closest_match(self, motifs):
        """Find the closest database motif for each of the given motifs.

        The database file name is taken from the ``motif_db`` default
        parameter and resolved relative to the configured motif directory.
        Files ending in ``pwm``/``pfm`` are read as ``pwm``; files ending in
        ``transfac`` are read as ``transfac``.

        Args:
            motifs: Motif objects (each exposing ``id``) to look up.

        Returns:
            dict mapping each motif id to ``[closest_db_motif, pval]``.
        """
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)
        db_motifs = []
        if db.endswith(("pwm", "pfm")):
            # Close the database file deterministically instead of leaving
            # the handle open until garbage collection.
            with open(db) as handle:
                db_motifs = read_motifs(handle, fmt="pwm")
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        mc = MotifComparer()
        db_motif_lookup = {m.id: m for m in db_motifs}
        match = mc.get_closest_match(motifs,
                                     db_motifs,
                                     "partial",
                                     "wic",
                                     "mean",
                                     parallel=False)

        closest_match = {}
        for motif in motifs:
            # Resolve the best hit once; it is needed for both the p-value
            # calculation and the returned entry.
            best_hit = db_motif_lookup[match[motif.id][0]]
            # Calculate p-value
            pval, _pos, _orient = mc.compare_motifs(motif,
                                                    best_hit,
                                                    "partial",
                                                    "wic",
                                                    "mean",
                                                    pval=True)
            closest_match[motif.id] = [best_hit, pval]
        return closest_match
Ejemplo n.º 2
0
def match(args):
    """Print the closest database match for every sample motif.

    Reads the sample motifs from ``args.pwmfile`` and the database motifs
    from ``args.dbpwmfile``, prints one tab-separated line per sample motif
    (motif, match, score, p-value) and, when ``args.img`` is set, writes a
    plot of the aligned motif pairs to that file.

    Args:
        args: namespace providing ``pwmfile``, ``dbpwmfile`` and ``img``.
    """
    sample = dict([(m.id, m) for m in pwmfile_to_motifs(args.pwmfile)])
    db = dict([(m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile)])

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    plotdata = []
    # Single-argument print(...) behaves the same as a statement on Python 2
    # and as a function call on Python 3.
    print("Motif\tMatch\tScore\tP-value")
    for query, hit in result.items():
        motif = sample[query]
        dbmotif = db[hit[0]]
        # One comparison per pair serves both the table and the plot.
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, hit[0], hit[1][0], pval))

        if not args.img:
            continue

        # Show the database motif in the orientation of the match, keeping its id.
        if orient == -1:
            tmp = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = tmp

        # Left-pad whichever motif starts later with uniform columns so the
        # two logos line up.
        if pos < 0:
            tmp = motif.id
            motif = Motif([[0.25,0.25,0.25,0.25]] * -pos + motif.pwm)
            motif.id = tmp
        elif pos > 0:
            tmp = dbmotif.id
            dbmotif = Motif([[0.25,0.25,0.25,0.25]] * pos + dbmotif.pwm)
            dbmotif.id = tmp

        plotdata.append((motif, dbmotif, pval))

    # Draw the figure once, after all pairs are collected, instead of
    # re-rendering it on every loop iteration.
    if plotdata:
        match_plot(plotdata, args.img)
Ejemplo n.º 3
0
	def cluster(self, threshold=0.5, metric = "pcc", clust_method = "average"):
		"""
		Group the motifs of this list by hierarchical clustering of their pairwise scores.

		Stores the score matrix in self.similarity_matrix and the linkage
		result in self.linkage_mat.

		Parameters:
		----------
		threshold : float
			Cut-off handed to fcluster (criterion="distance") when flattening the tree.
		metric : str
			Metric handed to MotifComparer.get_all_scores; can be:
			seqcor, pcc, ed, distance, wic, chisq, akl or ssd.
		clust_method : str
			Method handed to linkage().

		Returns:
		----------
		dict
			Keys are cluster names ("Cluster_<label>"), values are MotifList objects.
		"""

		#gimmemotifs is needed for the comparison
		from gimmemotifs.motif import Motif
		from gimmemotifs.comparison import MotifComparer
		sns.set_style("ticks")	#the gimmemotifs import sets the style globally, so switch back to ticks

		#One gimmemotif object per motif of this list
		gimme_objects = [entry.get_gimmemotif().gimme_obj for entry in self]

		#All-against-all scores
		comparer = MotifComparer()
		pairwise_scores = comparer.get_all_scores(gimme_objects, gimme_objects, match = "total", metric = metric, combine = "mean")
		self.similarity_matrix = generate_similarity_matrix(pairwise_scores)

		#Build the linkage from the squareform() of the matrix
		condensed = ssd.squareform(self.similarity_matrix.to_numpy())
		self.linkage_mat = linkage(condensed, method=clust_method)

		#One flat cluster label per motif
		flat_labels = fcluster(self.linkage_mat, threshold, criterion="distance")

		#Collect the motifs of every cluster under its formatted name
		cluster_dict = {}
		for idx, label in enumerate(flat_labels):
			cluster_name = "Cluster_{0}".format(label)
			if cluster_name not in cluster_dict:
				cluster_dict[cluster_name] = MotifList()
			cluster_dict[cluster_name].append(self[idx])

		return cluster_dict
Ejemplo n.º 4
0
    def create_consensus(self):
        """Merge all motifs of this MotifList into one consensus motif.

        While more than one motif is left, the pair chosen by find_best_pair
        is merged and the merged motif is scored against the remaining ones.
        The pwm of the final motif is rounded to 5 decimals.

        Returns:
            The consensus converted to a OneMotif object, named after the
            first three motif names of this list ("(...)" is appended when
            there are more than three).
        """

        remaining = [entry.gimme_obj for entry in self]  #gimmemotif objects still to merge

        if len(remaining) > 1:
            comparer = MotifComparer()

            #Scores between all starting motifs
            scores = comparer.get_all_scores(remaining,
                                             remaining,
                                             match="total",
                                             metric="pcc",
                                             combine="mean")

            while len(remaining) > 1:

                #Indices of the pair to merge, lowest index first
                pair = sorted(find_best_pair(remaining, scores))
                keep_idx = pair[0]
                drop_idx = pair[1]

                merged = merge_motifs(remaining[keep_idx], remaining[drop_idx])

                #The merged motif takes the slot of the lower index; removing
                #the higher index first leaves the lower one in place
                del remaining[drop_idx]
                remaining[keep_idx] = merged

                if len(remaining) > 1:
                    #Score the merged motif against everything that is left
                    scores.setdefault(merged.id, {})

                    for other in remaining:
                        scores[merged.id][other.id] = comparer.compare_motifs(
                            merged, other, metric="pcc")
                        scores[other.id][merged.id] = comparer.compare_motifs(
                            other, merged, metric="pcc")

        #Round pwm values
        consensus_gimme = remaining[0]
        consensus_gimme.pwm = [[round(value, 5) for value in row]
                               for row in consensus_gimme.pwm]

        #Convert back to OneMotif obj
        consensus = gimmemotif_to_onemotif(consensus_gimme)
        consensus.gimme_obj = consensus_gimme

        #Name the consensus after (at most) the first three members
        member_names = [entry.name for entry in self]
        suffix = "(...)" if len(member_names) > 3 else ""
        consensus.name = ",".join(member_names[:3])
        consensus.name += suffix

        return consensus
Ejemplo n.º 5
0
def match(args):
    """Print the closest database match for every sample motif.

    Reads the sample motifs from ``args.pwmfile`` and the database motifs
    from ``args.dbpwmfile``, prints one tab-separated line per sample motif
    (motif, match, score, p-value) and, when ``args.img`` is set, writes a
    plot of the aligned motif pairs to that file.

    Args:
        args: namespace providing ``pwmfile``, ``dbpwmfile`` and ``img``.
    """
    sample = dict([(m.id, m) for m in pwmfile_to_motifs(args.pwmfile)])
    db = dict([(m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile)])

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for query, hit in result.items():
        motif = sample[query]
        dbmotif = db[hit[0]]
        # One comparison per pair serves both the table and the plot.
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, hit[0], hit[1][0], pval))

        if not args.img:
            continue

        # Show the database motif in the orientation of the match, keeping its id.
        if orient == -1:
            tmp = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = tmp

        # Left-pad whichever motif starts later with uniform columns so the
        # two logos line up.
        if pos < 0:
            tmp = motif.id
            motif = Motif([[0.25,0.25,0.25,0.25]] * -pos + motif.pwm)
            motif.id = tmp
        elif pos > 0:
            tmp = dbmotif.id
            dbmotif = Motif([[0.25,0.25,0.25,0.25]] * pos + dbmotif.pwm)
            dbmotif.id = tmp

        plotdata.append((motif, dbmotif, pval))

    # Draw the figure once, after all pairs are collected, instead of
    # re-rendering it on every loop iteration.
    if plotdata:
        match_plot(plotdata, args.img)
Ejemplo n.º 6
0
def _create_images(outdir, clusters):
    """Write PNG logos for the clusters and the members of multi-member clusters.

    Every cluster motif is trimmed (IC threshold 0.2) and written to
    ``<outdir>/<cluster id>.png``. For clusters with more than one member,
    each member is compared to the cluster motif, reverse-complemented when
    the reported strand is not ``1``/``"+"``, and written with a left offset
    relative to the smallest reported position.

    Args:
        outdir: directory that receives the PNG files.
        clusters: iterable of ``(cluster_motif, members)`` pairs.

    Returns:
        list with one ``[cluster_id, {"src": ...}, member_entries]`` item per
        cluster, where every member entry is a dict with ``src`` and ``alt``.
    """
    comparer = MotifComparer()
    min_ic = 0.2
    image_index = []

    sys.stderr.write("Creating images\n")
    for consensus, members in clusters:
        consensus.trim(min_ic)
        consensus.to_img(os.path.join(outdir, "%s.png" % consensus.id), fmt="PNG")
        entry = [consensus.id, {"src": "%s.png" % consensus.id}, []]
        image_index.append(entry)

        if len(members) > 1:
            # compare_motifs result of every member against the cluster motif
            alignment = {}
            for member in members:
                alignment[member] = comparer.compare_motifs(consensus, member, "total", "wic", "mean", pval=True)

            # Smallest position over all members; offsets are relative to it
            leftmost = min(aligned[1] for aligned in alignment.values())

            for member in members:
                _, pos, strand = alignment[member]
                if strand not in [1, "+"]:
                    # Draw the reverse complement under the member's own id
                    flipped = member.rc()
                    flipped.id = member.id
                    member = flipped
                png_name = "%s.png" % member.id.replace(" ", "_")
                member.to_img(os.path.join(outdir, png_name), fmt="PNG", add_left=pos - leftmost)

        entry[2] = [
            {"src": "%s.png" % m.id.replace(" ", "_"), "alt": m.id.replace(" ", "_")}
            for m in members
        ]
    return image_index
Ejemplo n.º 7
0
 def ap1_included(self, motifs):
     """Return True if any of the motifs closely matches the consensus TGASTCA.

     Each motif is compared on its own; the search stops at the first one
     whose match value at index [1][3] is below 1e-6.
     """
     comparer = MotifComparer()
     ap1 = motif_from_consensus("TGASTCA")
     # Index [1][3] is presumably the p-value of the match -- TODO confirm
     return any(
         comparer.get_closest_match(ap1, candidate)["TGASTCA"][1][3] < 1e-6
         for candidate in motifs
     )
Ejemplo n.º 8
0
def ap1_included(motifs):
    """Return True if the best seqcor match of TGASTCA among motifs scores >= 0.75.

    The raw match dictionary is printed as a side effect.
    """
    comparer = MotifComparer()
    ap1 = motif_from_consensus("TGASTCA")
    closest = comparer.get_closest_match(ap1, motifs, metric="seqcor")
    print(closest)
    # Index [1][0] of the match entry is the value compared to the threshold
    best_score = closest["TGASTCA"][1][0]
    return bool(best_score >= 0.75)
Ejemplo n.º 9
0
 def ap1_included(self, motifs):
     """Return True if the best seqcor match of TGASTCA among motifs scores >= 0.8.

     The raw match dictionary is printed as a side effect.
     """
     # NOTE: an early "return False" for an empty motifs list exists in the
     # original only as commented-out code; it is not active.
     comparer = MotifComparer()
     ap1 = motif_from_consensus("TGASTCA")
     closest = comparer.get_closest_match(ap1, motifs, metric="seqcor")
     print(closest)
     # Index [1][0] of the match entry is the value compared to the threshold
     best_score = closest["TGASTCA"][1][0]
     return bool(best_score >= 0.8)
Ejemplo n.º 10
0
def create_denovo_motif_report(inputfile,
                               pfmfile,
                               fgfa,
                               background,
                               locfa,
                               outdir,
                               params,
                               stats=None):
    """Write the text and graphical (.html) reports for de novo motifs.

    Creates the ROC plots, looks up the closest database match of every
    motif, computes per-background statistics when none are supplied, draws
    one localization histogram per motif and finally writes both reports.

    Args:
        inputfile: passed through to the report writers.
        pfmfile: motif file, read with fmt="pwm".
        fgfa: foreground FASTA used for the ROC plots and statistics.
        background: dict mapping a background name to its FASTA file.
        locfa: FASTA file used for the localization plots.
        outdir: output directory.
        params: dict of run parameters; "genome" and "cutoff_fpr" are read.
        stats: optional precomputed statistics, keyed by motif and then by
            background name.
    """
    logger.info("creating de novo reports")

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    # NOTE(review): params is indexed here, before the "not params" fallback
    # below, so an empty or None params fails on this line -- confirm intent.
    create_roc_plots(pfmfile, fgfa, background, outdir, params["genome"])

    # Closest match in database
    closest_match = MotifComparer().get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg_name, bg_fasta in background.items():
            per_motif = calc_stats(fg_file=fgfa, bg_file=bg_fasta, motifs=motifs)
            for motif_key, values in per_motif.items():
                stats.setdefault(motif_key, {})[bg_name] = values

    stats = add_star(stats)

    if not params:
        params = {}
    cutoff_fpr = params.get("cutoff_fpr", 0.9)
    # Median sequence length of the localization FASTA
    seq_lengths = [len(seq) for seq in Fasta(locfa).seqs]
    lsize = np.median(seq_lengths)

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        histogram = os.path.join(outdir,
                                 "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, histogram, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pfmfile, background, closest_match,
                             outdir, stats)
Ejemplo n.º 11
0
    def test1_closest_match(self):
        """get_closest_match on the TATA PWM file returns the expected hit."""
        comparer = MotifComparer()
        result = comparer.get_closest_match("test/data/pwmscan/TATA.pwm")

        self.assertIn("TATA-box", result)

        # Entry layout used below: [0] is the matched id, [1] the value tuple
        best = result["TATA-box"]
        self.assertEqual("GM.5.0.TBP.0001", best[0])

        details = best[1]
        self.assertAlmostEqual(-0.1041, details[0], 4)
        self.assertEqual(0, details[1])
        self.assertEqual(1, details[2])
        # NOTE(review): with the default places=7 this accepts any value
        # within about 5e-8 of the expectation -- consider a tighter check.
        self.assertAlmostEqual(3.1666e-8, details[3])
Ejemplo n.º 12
0
    def test1_closest_match(self):
        """get_closest_match on the TATA PWM file returns the expected hit."""
        comparer = MotifComparer()
        result = comparer.get_closest_match("test/data/pwmscan/TATA.pwm")

        self.assertIn('TATA-box', result)

        # Entry layout used below: [0] is the matched id, [1] the value tuple
        best = result['TATA-box']
        self.assertEqual('GM.5.0.TBP.0001', best[0])

        details = best[1]
        self.assertAlmostEqual(-0.1041, details[0], 4)
        self.assertEqual(0, details[1])
        self.assertEqual(1, details[2])
        # NOTE(review): with the default places=7 this accepts any value
        # within about 5e-8 of the expectation -- consider a tighter check.
        self.assertAlmostEqual(3.1666e-8, details[3])
Ejemplo n.º 13
0
    def test1_closest_match(self):
        """get_closest_match on the TATA PWM file returns the expected hit."""
        comparer = MotifComparer()
        result = comparer.get_closest_match("test/data/pwmscan/TATA.pwm")

        self.assertIn('TATA-box', result)

        # Entry layout used below: [0] is the matched id, [1] the value tuple
        best = result['TATA-box']
        self.assertEqual('TBP_Average_1', best[0])

        details = best[1]
        self.assertAlmostEqual(-0.3276, details[0], 4)
        self.assertEqual(-1, details[1])
        self.assertEqual(1, details[2])
        # NOTE(review): with the default places=7 this accepts any value
        # within about 5e-8 of the expectation -- consider a tighter check.
        self.assertAlmostEqual(4.134e-7, details[3])
Ejemplo n.º 14
0
    def test1_denovo(self):
        """Run de novo prediction and check the log, the outputs and the AP1 motif."""
        run_params = {
            "tools": "BioProspector,Homer,MDmodule",
            "fraction": 0.5,
            "background": "random",
            "genome": "test/data/background/genome.fa",
        }
        gimme_motifs(
            "test/data/denovo/input.fa",
            self.outdir,
            params=run_params,
            filter_significant=True,
            cluster=True,
        )

        # The log has to mention the clustering step
        log_path = os.path.join(self.outdir, "gimmemotifs.log")
        with open(log_path) as log_handle:
            log_text = log_handle.read()
        self.assertIn("clustering", log_text)

        # Check if all output files are there
        expected_files = (
            "gimme.denovo.pfm",
            "gimme.denovo.html",
            "gimme.clustereds.html",
            "params.txt",
            "stats.random.txt",
        )
        for expected in expected_files:
            self.assertTrue(os.path.exists(os.path.join(self.outdir, expected)))

        # Check if correct motif is predicted
        pfm_path = os.path.join(self.outdir, "gimme.denovo.pfm")
        with open(pfm_path) as pfm_handle:
            predicted_motifs = read_motifs(pfm_handle)
        ap1 = motif_from_consensus("TGASTCA")

        comparer = MotifComparer()
        # Stops at the first predicted motif whose [1][3] value is below 1e-5
        ap1_predicted = any(
            comparer.get_closest_match(ap1, candidate)["TGASTCA"][1][3] < 1e-5
            for candidate in predicted_motifs
        )

        self.assertTrue(ap1_predicted)
Ejemplo n.º 15
0
def match(args):
    """Print the best database matches for every sample motif.

    Reads the sample motifs from ``args.pfmfile`` and the database motifs
    from ``args.dbpfmfile``, prints one tab-separated line per
    (motif, match) pair and, when ``args.img`` is set, writes a plot of the
    aligned pairs to that file.

    Args:
        args: namespace providing ``pfmfile``, ``dbpfmfile``, ``nmatches``
            and ``img``.
    """
    sample = {m.id: m for m in read_motifs(args.pfmfile)}
    db = {m.id: m for m in read_motifs(args.dbpfmfile)}

    mc = MotifComparer()
    result = mc.get_best_matches(
        sample.values(), args.nmatches, db.values(), "partial", "seqcor", "mean"
    )

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for motif_name, matches in result.items():
        for hit in matches:
            motif = sample[motif_name]
            dbmotif = db[hit[0]]

            pval, pos, orient = mc.compare_motifs(
                motif, dbmotif, "partial", "seqcor", "mean", pval=True
            )
            print("%s\t%s\t%0.2f\t%0.3e" % (motif_name, hit[0], hit[1][0], pval))

            if not args.img:
                continue

            # Show the database motif in the orientation of the match.
            if orient == -1:
                tmp = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = tmp
            # Left-pad whichever motif starts later with uniform columns.
            if pos < 0:
                tmp = motif.id
                motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
                motif.id = tmp
            elif pos > 0:
                tmp = dbmotif.id
                dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
                dbmotif.id = tmp

            # Right-pad the shorter motif so both have the same length.
            # The id is carried over to the padded copy, as in the steps
            # above, and nothing is rebuilt when the lengths already agree.
            diff = len(motif) - len(dbmotif)
            if diff > 0:
                tmp = dbmotif.id
                dbmotif = Motif(dbmotif.pwm + [[0.25, 0.25, 0.25, 0.25]] * diff)
                dbmotif.id = tmp
            elif diff < 0:
                tmp = motif.id
                motif = Motif(motif.pwm + [[0.25, 0.25, 0.25, 0.25]] * -diff)
                motif.id = tmp

            plotdata.append((motif, dbmotif, pval))
    if args.img:
        match_plot(plotdata, args.img)
Ejemplo n.º 16
0
    def determine_closest_match(self, motifs):
        """Find the closest database motif for each of the given motifs.

        The database file name is taken from the ``motif_db`` default
        parameter and resolved relative to the configured motif directory.
        Files ending in ``pwm``/``pfm`` are read as ``pwm``; files ending in
        ``transfac`` are read as ``transfac``.

        Args:
            motifs: Motif objects (each exposing ``id``) to look up.

        Returns:
            dict mapping each motif id to ``[closest_db_motif, pval]``.
        """
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)
        db_motifs = []
        if db.endswith(("pwm", "pfm")):
            # Close the database file deterministically instead of leaving
            # the handle open until garbage collection.
            with open(db) as handle:
                db_motifs = read_motifs(handle, fmt="pwm")
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        mc = MotifComparer()
        db_motif_lookup = {m.id: m for m in db_motifs}
        match = mc.get_closest_match(motifs, db_motifs, "partial", "wic", "mean", parallel=False)

        closest_match = {}
        for motif in motifs:
            # Resolve the best hit once; it is needed for both the p-value
            # calculation and the returned entry.
            best_hit = db_motif_lookup[match[motif.id][0]]
            # Calculate p-value
            pval, _pos, _orient = mc.compare_motifs(motif, best_hit, "partial", "wic", "mean", pval=True)
            closest_match[motif.id] = [best_hit, pval]
        return closest_match
Ejemplo n.º 17
0
def merge_motifs(motif_1, motif_2):
	"""Build the consensus of two gimmemotif objects.

	The relative position and orientation of the two motifs are taken from
	MotifComparer.compare_motifs (metric "pcc") and handed to
	motif_1.average_motifs().

	Parameter:
	----------
	motif_1 : Object of class Motif
		First gimmemotif object of the pair.
	motif_2 : Object of class Motif
		Second gimmemotif object of the pair.
	Returns:
	--------
	consensus : Object of class Motif
		Averaged motif whose id is "<motif_1.id>+<motif_2.id>".
	"""
	from gimmemotifs.comparison import MotifComparer

	comparer = MotifComparer()
	#Only the position and orientation of the comparison are used
	_score, offset, strand = comparer.compare_motifs(motif_1, motif_2, metric= "pcc")

	merged = motif_1.average_motifs(motif_2, pos = offset, orientation = strand)
	merged.id = motif_1.id + "+" + motif_2.id
	return merged
Ejemplo n.º 18
0
    def test1_denovo(self):
        """Run de novo prediction and check the log, the outputs and the AP1 motif."""
        run_params = {
            "tools": "BioProspector,Homer,MDmodule",
            "fraction": 0.5,
            "background": "random",
        }
        gimme_motifs("test/data/denovo/input.fa",
                     self.outdir,
                     params=run_params,
                     filter_significant=True,
                     cluster=True)

        # The log has to mention the clustering step
        log_path = os.path.join(self.outdir, 'gimmemotifs.log')
        with open(log_path) as log_handle:
            log_text = log_handle.read()
        self.assertIn("clustering", log_text)

        # Check if all output files are there
        expected_files = ("motifs.pwm", "motif_report.html",
                          "cluster_report.html", "params.txt",
                          "stats.random.txt")
        for expected in expected_files:
            self.assertTrue(os.path.exists(os.path.join(self.outdir, expected)))

        # Check if correct motif is predicted
        pwm_path = os.path.join(self.outdir, "motifs.pwm")
        with open(pwm_path) as pwm_handle:
            predicted_motifs = read_motifs(pwm_handle)
        ap1 = motif_from_consensus("TGASTCA")

        comparer = MotifComparer()
        # Stops at the first predicted motif whose [1][3] value is below 1e-5
        ap1_predicted = any(
            comparer.get_closest_match(ap1, candidate)["TGASTCA"][1][3] < 1e-5
            for candidate in predicted_motifs
        )

        self.assertTrue(ap1_predicted)
Ejemplo n.º 19
0
def create_denovo_motif_report(inputfile, pwmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Write the text and graphical (.html) reports for de novo motifs.

    Creates the ROC plots, looks up the closest database match of every
    motif, computes per-background statistics when none are supplied, draws
    one localization histogram per motif and finally writes both reports.

    Args:
        inputfile: passed through to the report writers.
        pwmfile: motif file, read with fmt="pwm".
        fgfa: foreground FASTA used for the ROC plots and statistics.
        background: dict mapping a background name to its FASTA file.
        locfa: FASTA file used for the localization plots.
        outdir: output directory.
        params: dict of run parameters (may be empty or None); only
            "cutoff_fpr" is read.
        stats: optional precomputed statistics, keyed by motif and then by
            background name.
    """
    logger.info("creating reports")

    motifs = read_motifs(pwmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pwmfile, fgfa, background, outdir)

    # Closest match in database
    closest_match = MotifComparer().get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg_name, bg_fasta in background.items():
            per_motif = calc_stats(motifs, fgfa, bg_fasta)
            for motif_key, values in per_motif.items():
                stats.setdefault(motif_key, {})[bg_name] = values

    stats = add_star(stats)

    if not params:
        params = {}
    cutoff_fpr = params.get('cutoff_fpr', 0.9)
    # Median sequence length of the localization FASTA
    seq_lengths = [len(seq) for seq in Fasta(locfa).seqs]
    lwidth = np.median(seq_lengths)

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        histogram = os.path.join(outdir, "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lwidth, histogram, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pwmfile, background, closest_match, outdir, stats)
Ejemplo n.º 20
0
def motifs(args):
    """Calculate ROC AUC and other metrics for known and/or de novo motifs.

    Steps, in order:
      1. create the output directories and normalise the input to a BED
         file where needed;
      2. use the given background file or generate one;
      3. optionally run de novo prediction and combine its motifs with the
         known ones (motif file and motif2factors map);
      4. scan input and background, write the score tables and the
         per-motif statistics report;
      5. select non-redundant motifs and write their match BED files;
      6. optionally write the HTML reports.

    Args:
        args: parsed command-line namespace. Attributes read here: outdir,
            sample, size, genome, background, pfmfile, known, denovo,
            tools, analysis, ncpus and report. ``zscore`` and ``gc`` are
            set to False on it when no genome is given.

    Raises:
        ValueError: if ``args.outdir`` is None.
    """
    if args.outdir is None:
        raise ValueError("an output directory is required!")
    scan_dir = os.path.join(args.outdir, "motif_scan_results")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.outdir, exist_ok=True)
    os.makedirs(scan_dir, exist_ok=True)

    file_type = determine_file_type(args.sample)
    outfile = os.path.join(args.outdir, f"input.w{args.size}.bed")
    sample = args.sample
    if file_type == "narrowpeak":
        narrowpeak_to_bed(args.sample, outfile, size=args.size)
        sample = outfile
    elif args.size and args.size > 0:
        if file_type == "fasta":
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning("size parameter will be ignored for FASTA input")
        elif file_type == "bed":
            write_equalsize_bedfile(args.sample, args.size, outfile)
            sample = outfile

    genome = args.genome
    if genome is None:
        # NOTE(review): these flags are switched off here, yet the
        # scan_to_file calls below pass zscore=True / gcnorm=True
        # unconditionally -- confirm which behaviour is intended.
        args.zscore = False
        args.gc = False

    bgfile = None
    bg = args.background
    if bg is None:
        if genome is None:
            bg = "random"
        else:
            bg = "gc"

    if os.path.isfile(bg):
        bgfile = bg
        bg = "custom"
    else:
        # create background if not provided
        bgfile = os.path.join(args.outdir,
                              "generated_background.{}.fa".format(bg))
        # A missing, zero or negative size means "no fixed size". Testing
        # for None first keeps this consistent with the size check above
        # and avoids comparing None with an int.
        size = args.size if args.size and args.size > 0 else None
        if bg == "gc":
            logger.info("creating background (matched GC%)")
        else:
            logger.info("creating background (random)")

        create_background_file(
            bgfile,
            bg,
            fmt="fasta",
            genome=genome,
            inputfile=sample,
            size=size,
            number=10000,
        )

    pfmfile = args.pfmfile

    motifs = []
    if args.known:
        motifs = read_motifs(pfmfile, fmt="pfm")

    if args.denovo:
        gimme_motifs(
            sample,
            args.outdir,
            params={
                "tools": args.tools,
                "analysis": args.analysis,
                "background": bg,
                "custom_background": bgfile,
                "genome": args.genome,
                "size": args.size,
            },
        )
        denovo = read_motifs(os.path.join(args.outdir, "gimme.denovo.pfm"))
        mc = MotifComparer()
        result = mc.get_closest_match(denovo,
                                      dbmotifs=pfmfile,
                                      metric="seqcor")
        match_motifs = read_motifs(pfmfile, as_dict=True)
        new_map_file = os.path.join(args.outdir, "combined.motif2factors.txt")
        base = os.path.splitext(pfmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            shutil.copyfile(map_file, new_map_file)

        # From here on the combined (known + de novo) motif file is used.
        motifs += denovo
        pfmfile = os.path.join(args.outdir, "combined.pfm")
        with open(pfmfile, "w") as f:
            for m in motifs:
                print(m.to_pwm(), file=f)

        # Append the de novo motifs, and the direct factors of their
        # closest known match, to the combined map file.
        with open(new_map_file, "a") as f:
            for m in denovo:
                print("{}\t{}\t{}\t{}".format(m.id, "de novo", "GimmeMotifs",
                                              "Y"),
                      file=f)
                closest_id = result[m.id][0]
                if closest_id in match_motifs:
                    for factor in match_motifs[closest_id].factors["direct"]:
                        print(
                            "{}\t{}\t{}\t{}".format(m.id, factor,
                                                    "inferred (GimmeMotifs)",
                                                    "N"),
                            file=f,
                        )
    else:
        logger.info("skipping de novo")

    stats = [
        "phyper_at_fpr",
        "roc_auc",
        "pr_auc",
        "enr_at_fpr",
        "recall_at_fdr",
        "roc_values",
        "matches_at_fpr",
    ]

    report_file = args.outdir + "/gimme.roc.report.txt"

    logger.info("creating motif scan tables")

    # Create a table with the best score per motif for all motifs.
    # This has three reasons:
    # * Can be used to calculate statistics;
    # * Can be used to select a set of non-redundant motifs;
    # * These files are included in the output and can be used for further analyis.
    score_table = os.path.join(scan_dir, "input.motif.score.txt")
    bg_score_table = os.path.join(scan_dir, "background.motif.score.txt")
    for infile, outfile in [(sample, score_table), (bgfile, bg_score_table)]:
        scan_to_file(
            infile,
            pfmfile,
            filepath_or_buffer=outfile,
            score_table=True,
            genome=args.genome,
            zscore=True,
            gcnorm=True,
        )

    n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0]
    n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0]

    logger.info("calculating stats")
    # The context manager closes the report even when the statistics
    # calculation raises.
    with open(report_file, "w") as f_out:
        # Print the metrics
        f_out.write(
            "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
        )
        for motif_stats in calc_stats_iterator(
                motifs=pfmfile,
                fg_table=score_table,
                bg_table=bg_score_table,
                stats=stats,
                ncpus=args.ncpus,
        ):
            for motif in motifs:
                key = str(motif)
                if key not in motif_stats:
                    continue
                values = motif_stats[key]
                p_value = values["phyper_at_fpr"]
                # A p-value of 0 has no finite -log10; report infinity.
                log_pvalue = -np.log10(p_value) if p_value > 0 else np.inf
                n_fg = values["matches_at_fpr"][0]
                n_bg = values["matches_at_fpr"][1]
                f_out.write(
                    "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n"
                    .format(
                        motif.id,
                        n_fg,
                        n_fg / n_input * 100,
                        n_bg,
                        n_bg / n_background * 100,
                        p_value,
                        log_pvalue,
                        values["roc_auc"],
                        values["pr_auc"],
                        values["enr_at_fpr"],
                        values["recall_at_fdr"],
                    ))

    # Select a set of "non-redundant" motifs.
    # Using Recursive Feature Elimination, a set of motifs is selected that
    # best explains the peaks in comparison to the background sequences.
    nr_motifs = select_nonredundant_motifs(
        report_file,
        pfmfile,
        score_table,
        bg_score_table,
        tolerance=0.001,
    )

    # Provide BED files with motif scan results for the non-redundant motifs
    # At the moment this is not ideal, as scanning is now performed twice
    # for this set of non-redundant motifs.
    motif_dict = {m.id: m for m in motifs}
    for motif in nr_motifs:
        with NamedTemporaryFile(mode="w") as f:
            print(motif_dict[motif].to_pwm(), file=f)
            # Make sure the motif is on disk before the scanner reads it.
            f.flush()
            safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
            scan_to_file(
                sample,
                f.name,
                filepath_or_buffer=os.path.join(scan_dir,
                                                f"{safe_name}.matches.bed"),
                bed=True,
                fpr=0.01,
                genome=args.genome,
                zscore=True,
                gcnorm=True,
            )

    if args.report:
        logger.info("creating statistics report")
        roc_html_report(
            args.outdir,
            report_file,
            pfmfile,
            threshold=0.01,
            outname="gimme.motifs.redundant.html",
            link_matches=False,
        )
        roc_html_report(
            args.outdir,
            report_file,
            pfmfile,
            threshold=0.01,
            use_motifs=nr_motifs,
            link_matches=True,
        )
        logger.info(
            f"gimme motifs final report: {os.path.join(args.outdir, 'gimme.motifs.html')}"
        )
Ejemplo n.º 21
0
def cluster(args):
    """Cluster the motifs of ``args.inputfile`` and write the cluster report.

    Written to ``args.outdir`` (created when missing): one PNG per cluster
    motif, one PNG per member of every multi-member cluster,
    ``cluster_report.html``, ``cluster_key.txt`` and
    ``clustered_motifs.pwm``.

    Args:
        args: namespace providing ``inputfile``, ``outdir`` and
            ``threshold``.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2
    clusters = []
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        # A single motif is its own cluster; no tree is built.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(args.inputfile, "total", "wic", "mean", True, threshold=args.threshold, include_bg=True)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(outdir, "%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src": "%s.png" % cluster.id}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)
            # Smallest reported position. A key-based min replaces the
            # cmp-based sort, which only exists on Python 2.
            add_pos = min(scores.values(), key=lambda x: x[1])[1]
            for motif in members:
                _score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the member's own id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")), format="PNG", add_left=add)
        ids[-1][2] = [dict([("src", "%s.png" % m.id.replace(" ", "_")), ("alt", m.id.replace(" ", "_"))]) for m in members]

    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    # The rendered report is encoded explicitly, so the file is opened in
    # binary mode; this works on Python 2 and Python 3 alike.
    with open(os.path.join(outdir, "cluster_report.html"), "wb") as f:
        f.write(result.encode('utf-8'))

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for entry in ids:
            f.write("%s\t%s\n" % (entry[0], ",".join([x["alt"] for x in entry[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
import sys
from gimmemotifs.motif import read_motifs
from gimmemotifs.comparison import seqcor, MotifComparer, _get_all_scores

# Command-line arguments: <pwmfile> <outfile> <chunksize> <chunk> <metric>
pwmfile = sys.argv[1]
outfile = sys.argv[2]
chunksize = int(sys.argv[3])
chunk = int(sys.argv[4])  # 1-based: chunk 1 starts at motif index 0
metric = sys.argv[5]

if metric not in ["wic", "seqcor", "pcc", "ed"]:
    raise ValueError("invalid metric {}".format(metric))

# Close the input handle once the motifs are parsed instead of leaking it.
with open(pwmfile) as pwm_handle:
    all_motifs = read_motifs(pwm_handle)
chunk_motifs = all_motifs[(chunk - 1) * chunksize:chunk * chunksize]

# "pcc" is scored on the partial alignment, every other metric on the
# total alignment; everything else about the call is identical.
match_mode = "partial" if metric == "pcc" else "total"
mc = MotifComparer()
dists = mc.get_all_scores(chunk_motifs, all_motifs, match_mode, metric,
                          "mean", False)

if not dists:
    # Without at least one row there are no column names to write.
    raise ValueError(
        "no scores computed for chunk {}; is the chunk out of range?".format(
            chunk))

# The keys of the first row define the column order for every row.
cols = list(next(iter(dists.values())))
with open(outfile, "w") as f:
    f.write("\t{}\n".format("\t".join(cols)))
    for row_id, row in dists.items():
        # Only the first element of each result (the score) is written.
        values = ["{:.6f}".format(row[c][0]) for c in cols]
        f.write("{}\t{}\n".format(row_id, "\t".join(values)))
Ejemplo n.º 23
0
    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        """Cluster the motifs in pfm_file and write the cluster report.

        An image is drawn for every cluster and, for clusters with more
        than one member, for every member motif. The template
        cluster_template.jinja.html is rendered to self.cluster_report and
        the cluster motifs are written to cluster_pwm.

        pfm_file    -- path of the motif file to cluster
        cluster_pwm -- path of the output file for the cluster motifs
        dir         -- not used in this method; kept so callers do not break
        threshold   -- cutoff, converted with float() and passed on to
                       cluster_motifs()

        Returns the list of (cluster motif, member motifs) pairs.
        """
        self.logger.info("clustering significant motifs.")

        trim_ic = 0.2
        # Close the handle as soon as the motifs have been parsed.
        with open(pfm_file) as handle:
            motifs = read_motifs(handle, fmt="pwm")

        if len(motifs) == 1:
            # A single motif is its own cluster; no tree is built.
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(pfm_file,
                                  "total",
                                  "wic",
                                  "mean",
                                  True,
                                  threshold=float(threshold),
                                  include_bg=True,
                                  progress=False)
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for cluster, members in clusters:
            cluster.trim(trim_ic)
            cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id),
                           format="PNG")
            ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])

            if len(members) > 1:
                scores = {}
                for motif in members:
                    scores[motif] = mc.compare_motifs(cluster,
                                                      motif,
                                                      "total",
                                                      "wic",
                                                      "mean",
                                                      pval=True)
                # Smallest alignment position over all members. min() with a
                # generator replaces sorted(cmp=...), which only exists in
                # Python 2.
                add_pos = min(hit[1] for hit in scores.values())
                for motif in members:
                    _, pos, strand = scores[motif]
                    # Offset relative to the smallest position, so add_left
                    # is never negative.
                    add = pos - add_pos

                    if strand not in [1, "+"]:
                        # Draw the reverse complement under the original id.
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc
                    img_name = "%s.png" % motif.id.replace(" ", "_")
                    motif.to_img(os.path.join(self.imgdir, img_name),
                                 format="PNG",
                                 add_left=add)

            ids[-1][2] = [
                {
                    "src": "images/%s.png" % motif.id.replace(" ", "_"),
                    "alt": motif.id.replace(" ", "_"),
                }
                for motif in members
            ]

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(expname=self.basename,
                                 motifs=ids,
                                 inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"),
                                 version=GM_VERSION)

        # The rendered report is encoded to bytes, so write it in binary mode.
        with open(self.cluster_report, "wb") as f:
            f.write(result.encode('utf-8'))

        with open(cluster_pwm, "w") as f:
            if len(clusters) == 1 and len(clusters[0][1]) == 1:
                f.write("%s\n" % clusters[0][0].to_pwm())
            else:
                for motif in tree.get_clustered_motifs():
                    f.write("%s\n" % motif.to_pwm())

        self.logger.debug("Clustering done. See the result in %s",
                          self.cluster_report)
        return clusters
Ejemplo n.º 24
0
def cluster_motifs(
    motifs,
    match="total",
    metric="wic",
    combine="mean",
    pval=True,
    threshold=0.95,
    trim_edges=False,
    edge_ic_cutoff=0.2,
    include_bg=True,
    progress=True,
):
    """
    Clusters a set of sequence motifs. Required arg 'motifs' is a file containing
    positional frequency matrices or an array with motifs.

    Optional args:

    'match', 'metric' and 'combine' specify the method used to compare and score
    the motifs. By default the WIC score is used (metric='wic'), using the
    score over the whole alignment (match='total'), with the total motif score
    calculated as the mean score of all positions (combine='mean').
    'match' can be either 'total' for the total alignment or 'subtotal' for the
    maximum scoring subsequence of the alignment.
    'metric' can be any metric defined in MotifComparer, currently: 'pcc', 'ed',
    'distance', 'wic' or 'chisq'
    'combine' determines how the total score is calculated from the score of
    individual positions and can be either 'sum' or 'mean'

    'pval' can be True or False and determines if the score should be converted to
    an empirical p-value

    'threshold' determines the score (or p-value) cutoff

    If 'trim_edges' is set to True, all motif edges with an IC below
    'edge_ic_cutoff' will be removed before clustering

    When computing the average of two motifs 'include_bg' determines if, at a
    position only present in one motif, the information in that motif should
    be kept, or if it should be averaged with background frequencies. Should
    probably be left set to True.

    If 'progress' is True, progress information is written to stderr.

    Returns the root MotifTree node (the last node that was created).
    """

    # Read the motifs from file unless a list of motifs was passed in
    if not isinstance(motifs, list):
        with open(motifs) as handle:
            motifs = read_motifs(handle, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = {n.motif.id: n for n in nodes}
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(motifs, motifs, match, metric, combine, pval, parallel=True)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # Score of a motif against itself
                motif_nodes[m1].maxscore = 1 - score[0] if pval else score[0]
            else:
                if pval:
                    # Store 1 - p so that a higher value is always better
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1

    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Take the best scoring pair in which both nodes are still unmerged
        ranked = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = ranked[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked[i]

        (score, pos, orientation) = scores[(n1, n2)]
        ave_motif = n1.motif.average_motifs(n2.motif, pos, orientation, include_bg=include_bg)

        ave_motif.trim(edge_ic_cutoff)
        ave_motif.id = "Average_%s" % ave_count
        ave_count += 1

        new_node = MotifTree(ave_motif)
        self_score = mc.compare_motifs(new_node.motif, new_node.motif, match, metric, combine, pval)[0]
        new_node.maxscore = 1 - self_score if pval else self_score

        new_node.mergescore = score
        n1.parent = new_node
        n2.parent = new_node
        new_node.left = n1
        new_node.right = n2

        # new_node is not in 'nodes' yet, so this holds only the other
        # unmerged nodes
        cmp_nodes = {node.motif: node for node in nodes if not node.parent}

        if progress:
            # Use a separate name so the 'progress' flag is not overwritten,
            # and floor division so the bar width stays an int on Python 3.
            pct = int((1 - len(cmp_nodes) / float(total)) * 100)
            filled = pct // 10
            sys.stderr.write(
                "\rClustering [{0}{1}] {2}%".format("#" * filled, " " * (10 - filled), pct)
            )

        # Pass a real list; on Python 3 dict.keys() is only a view
        result = mc.get_all_scores([new_node.motif], list(cmp_nodes), match, metric, combine, pval, parallel=True)

        for motif, n in cmp_nodes.items():
            x = result[new_node.motif.id][motif.id]
            if pval:
                x = [1 - x[0]] + x[1:]
            scores[(new_node, n)] = x

        nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")
    root = nodes[-1]
    # Collect the leaves first, then let each leaf's parent check its merge
    leaves = [node for node in nodes if not node.left]
    for leaf in leaves:
        leaf.parent.checkMerge(root, threshold)

    return root
Ejemplo n.º 25
0
def cluster_motifs(motifs, match="total", metric="wic", combine="mean", pval=True, threshold=0.95, trim_edges=False, edge_ic_cutoff=0.2, include_bg=True, progress=True):
    """
    Clusters a set of sequence motifs. Required arg 'motifs' is a file containing
    positional frequency matrices or an array with motifs.

    Optional args:

    'match', 'metric' and 'combine' specify the method used to compare and score
    the motifs. By default the WIC score is used (metric='wic'), using the
    score over the whole alignment (match='total'), with the total motif score
    calculated as the mean score of all positions (combine='mean').
    'match' can be either 'total' for the total alignment or 'subtotal' for the
    maximum scoring subsequence of the alignment.
    'metric' can be any metric defined in MotifComparer, currently: 'pcc', 'ed',
    'distance', 'wic' or 'chisq'
    'combine' determines how the total score is calculated from the score of
    individual positions and can be either 'sum' or 'mean'

    'pval' can be True or False and determines if the score should be converted to
    an empirical p-value

    'threshold' determines the score (or p-value) cutoff

    If 'trim_edges' is set to True, all motif edges with an IC below
    'edge_ic_cutoff' will be removed before clustering

    When computing the average of two motifs 'include_bg' determines if, at a
    position only present in one motif, the information in that motif should
    be kept, or if it should be averaged with background frequencies. Should
    probably be left set to True.

    If 'progress' is True, progress information is written to stderr.

    Returns the root MotifTree node (the last node that was created).
    """

    # Read the motifs from file unless a list of motifs was passed in
    if not isinstance(motifs, list):
        with open(motifs) as handle:
            motifs = read_motifs(handle, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = {n.motif.id: n for n in nodes}
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(motifs, motifs, match, metric, combine, pval, parallel=True)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # Score of a motif against itself
                motif_nodes[m1].maxscore = 1 - score[0] if pval else score[0]
            else:
                if pval:
                    # Store 1 - p so that a higher value is always better
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1

    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Take the best scoring pair in which both nodes are still unmerged
        ranked = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = ranked[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked[i]

        (score, pos, orientation) = scores[(n1, n2)]
        ave_motif = n1.motif.average_motifs(n2.motif, pos, orientation, include_bg=include_bg)

        ave_motif.trim(edge_ic_cutoff)
        ave_motif.id = "Average_%s" % ave_count
        ave_count += 1

        new_node = MotifTree(ave_motif)
        self_score = mc.compare_motifs(new_node.motif, new_node.motif, match, metric, combine, pval)[0]
        new_node.maxscore = 1 - self_score if pval else self_score

        new_node.mergescore = score
        n1.parent = new_node
        n2.parent = new_node
        new_node.left = n1
        new_node.right = n2

        # new_node is not in 'nodes' yet, so this holds only the other
        # unmerged nodes
        cmp_nodes = {node.motif: node for node in nodes if not node.parent}

        if progress:
            # Use a separate name so the 'progress' flag is not overwritten,
            # and floor division so the bar width stays an int on Python 3.
            pct = int((1 - len(cmp_nodes) / float(total)) * 100)
            filled = pct // 10
            sys.stderr.write('\rClustering [{0}{1}] {2}%'.format(
                '#' * filled,
                " " * (10 - filled),
                pct))

        # Pass a real list; on Python 3 dict.keys() is only a view
        result = mc.get_all_scores(
                [new_node.motif],
                list(cmp_nodes),
                match,
                metric,
                combine,
                pval,
                parallel=True)

        for motif, n in cmp_nodes.items():
            x = result[new_node.motif.id][motif.id]
            if pval:
                x = [1 - x[0]] + x[1:]
            scores[(new_node, n)] = x

        nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")
    root = nodes[-1]
    # Collect the leaves first, then let each leaf's parent check its merge
    leaves = [node for node in nodes if not node.left]
    for leaf in leaves:
        leaf.parent.checkMerge(root, threshold)

    return root
Ejemplo n.º 26
0
        for line in f:
            vals = line.strip().split("\t")
            if len(vals) == 4:
                m2f[vals[0]] = m2f.get(vals[0], []) + [vals[1:]]
#print(m2f)

# Read factor to family mapping from the CIS-BP database
anno = pd.read_table(tf_info)
anno = anno[["TF_Name", "Family_Name"]].drop_duplicates().set_index("TF_Name")

# read motifs
motifs = dict([(m.id, m) for m in read_motifs(open(pfmfile))])
df_cluster = pd.read_table(clusterfile)

ic_cutoff = 5
mc = MotifComparer()
id_count = {}
df = df_cluster.loc[k]
sys.stderr.write(str(k) + "\n")
seen_line = {}
with open("{}.pfm".format(outname), "w") as out:
    with open("{}.motif2factors.txt".format(outname), "w") as m2f_out:
        print("Motif\tFactor\tEvidence\tCurated", file=m2f_out)
        for cluster in range(k):
            if cluster % 10 == 0:
                sys.stderr.write("{}\n".format(cluster))
                out.flush()
            motif_ids = df[df == cluster].index
            motif = get_clustered_motifs(motif_ids)
            if motif.information_content() >= ic_cutoff:
                factors = []
Ejemplo n.º 27
0
def cluster_motifs_with_report(infile, outfile, outdir, threshold, title=None):
    """Cluster the motifs in infile and write an HTML report plus images.

    An image is drawn for every cluster and, for clusters with more than
    one member, for every member motif, all under <outdir>/images. The
    template cluster_template.jinja.html is rendered to
    <outdir>/cluster_report.html and the cluster motifs are written to
    outfile.

    infile    -- motif file to cluster
    outfile   -- path of the output file for the cluster motifs
    outdir    -- directory for the report and the images; created if missing
    threshold -- cutoff, converted with float() and passed on to
                 cluster_motifs()
    title     -- passed to the template as 'inputfile'; defaults to infile

    Returns the list of (cluster motif, member motifs) pairs, or an empty
    list (without writing anything) when infile contains no motifs.
    """
    if title is None:
        title = infile

    motifs = read_motifs(infile, fmt="pwm")

    trim_ic = 0.2
    if len(motifs) == 0:
        return []
    if len(motifs) == 1:
        # A single motif is its own cluster; no tree is built.
        clusters = [[motifs[0], motifs]]
    else:
        logger.info("clustering %d motifs.", len(motifs))
        tree = cluster_motifs(infile,
                              "total",
                              "wic",
                              "mean",
                              True,
                              threshold=float(threshold),
                              include_bg=True,
                              progress=False)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    img_dir = os.path.join(outdir, "images")

    # makedirs also creates outdir itself when it does not exist yet.
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        png = "images/{}.png".format(cluster.id)
        cluster.to_img(os.path.join(outdir, png), fmt="PNG")
        ids.append([cluster.id, {"src": png}, []])

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster,
                                                  motif,
                                                  "total",
                                                  "wic",
                                                  "mean",
                                                  pval=True)
            # Smallest alignment position over all members.
            add_pos = min(hit[1] for hit in scores.values())
            for motif in members:
                _, pos, strand = scores[motif]
                # Offset relative to the smallest position, so add_left is
                # never negative.
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the original id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                png = "images/{}.png".format(motif.id.replace(" ", "_"))
                motif.to_img(os.path.join(outdir, png),
                             fmt="PNG",
                             add_left=add)

        ids[-1][2] = [
            {
                "src": "images/{}.png".format(motif.id.replace(" ", "_")),
                "alt": motif.id.replace(" ", "_"),
            }
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids,
                             inputfile=title,
                             date=datetime.today().strftime("%d/%m/%Y"),
                             version=__version__)

    cluster_report = os.path.join(outdir, "cluster_report.html")
    with open(cluster_report, "wb") as f:
        f.write(result.encode('utf-8'))

    # 'with' closes the file even if to_pwm() raises.
    with open(outfile, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    logger.debug("Clustering done. See the result in %s", cluster_report)
    return clusters
Ejemplo n.º 28
0
    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        """Cluster the motifs in pfm_file and write the cluster report.

        An image is drawn for every cluster and, for clusters with more
        than one member, for every member motif. The template
        cluster_template.jinja.html is rendered to self.cluster_report and
        the cluster motifs are written to cluster_pwm.

        pfm_file    -- path of the motif file to cluster
        cluster_pwm -- path of the output file for the cluster motifs
        dir         -- not used in this method; kept so callers do not break
        threshold   -- cutoff, converted with float() and passed on to
                       cluster_motifs()

        Returns the list of (cluster motif, member motifs) pairs.
        """
        self.logger.info("clustering significant motifs.")

        trim_ic = 0.2
        # Close the handle as soon as the motifs have been parsed.
        with open(pfm_file) as handle:
            motifs = read_motifs(handle, fmt="pwm")

        if len(motifs) == 1:
            # A single motif is its own cluster; no tree is built.
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(
                    pfm_file,
                    "total",
                    "wic",
                    "mean",
                    True,
                    threshold=float(threshold),
                    include_bg=True,
                    progress=False
                    )
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for cluster, members in clusters:
            cluster.trim(trim_ic)
            cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id), format="PNG")
            ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])

            if len(members) > 1:
                scores = {}
                for motif in members:
                    scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)
                # Smallest alignment position over all members. min() with a
                # generator replaces sorted(cmp=...), which only exists in
                # Python 2.
                add_pos = min(hit[1] for hit in scores.values())
                for motif in members:
                    _, pos, strand = scores[motif]
                    # Offset relative to the smallest position, so add_left
                    # is never negative.
                    add = pos - add_pos

                    if strand not in [1, "+"]:
                        # Draw the reverse complement under the original id.
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc
                    img_name = "%s.png" % motif.id.replace(" ", "_")
                    motif.to_img(os.path.join(self.imgdir, img_name), format="PNG", add_left=add)

            ids[-1][2] = [
                {
                    "src": "images/%s.png" % motif.id.replace(" ", "_"),
                    "alt": motif.id.replace(" ", "_"),
                }
                for motif in members
            ]

        env = jinja2.Environment(loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(
            expname=self.basename,
            motifs=ids,
            inputfile=self.inputfile,
            date=datetime.today().strftime("%d/%m/%Y"),
            version=GM_VERSION)

        # The rendered report is encoded to bytes, so write it in binary mode.
        with open(self.cluster_report, "wb") as f:
            f.write(result.encode('utf-8'))

        with open(cluster_pwm, "w") as f:
            if len(clusters) == 1 and len(clusters[0][1]) == 1:
                f.write("%s\n" % clusters[0][0].to_pwm())
            else:
                for motif in tree.get_clustered_motifs():
                    f.write("%s\n" % motif.to_pwm())

        self.logger.debug("Clustering done. See the result in %s",
                self.cluster_report)
        return clusters
Ejemplo n.º 29
0
def cluster_motifs_with_report(infile, outfile, outdir, threshold, title=None):
    """Cluster the motifs in infile and write an HTML report plus images.

    An image is drawn for every cluster and, for clusters with more than
    one member, for every member motif, all under <outdir>/images. The
    template cluster_template.jinja.html is rendered to
    <outdir>/cluster_report.html and the cluster motifs are written to
    outfile.

    infile    -- motif file to cluster
    outfile   -- path of the output file for the cluster motifs
    outdir    -- directory for the report and the images; created if missing
    threshold -- cutoff, converted with float() and passed on to
                 cluster_motifs()
    title     -- passed to the template as 'inputfile'; defaults to infile

    Returns the list of (cluster motif, member motifs) pairs, or an empty
    list (without writing anything) when infile contains no motifs.
    """
    if title is None:
        title = infile

    motifs = read_motifs(infile, fmt="pwm")

    trim_ic = 0.2
    if len(motifs) == 0:
        return []
    if len(motifs) == 1:
        # A single motif is its own cluster; no tree is built.
        clusters = [[motifs[0], motifs]]
    else:
        logger.info("clustering %d motifs.", len(motifs))
        tree = cluster_motifs(
                infile,
                "total",
                "wic",
                "mean",
                True,
                threshold=float(threshold),
                include_bg=True,
                progress=False
                )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    img_dir = os.path.join(outdir, "images")

    # makedirs also creates outdir itself when it does not exist yet.
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        png = "images/{}.png".format(cluster.id)
        cluster.to_img(os.path.join(outdir, png), fmt="PNG")
        ids.append([cluster.id, {"src": png}, []])

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)
            # Smallest alignment position over all members.
            add_pos = min(hit[1] for hit in scores.values())
            for motif in members:
                _, pos, strand = scores[motif]
                # Offset relative to the smallest position, so add_left is
                # never negative.
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the original id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                png = "images/{}.png".format(motif.id.replace(" ", "_"))
                motif.to_img(os.path.join(outdir, png), fmt="PNG", add_left=add)

        ids[-1][2] = [
            {
                "src": "images/{}.png".format(motif.id.replace(" ", "_")),
                "alt": motif.id.replace(" ", "_"),
            }
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
                motifs=ids,
                inputfile=title,
                date=datetime.today().strftime("%d/%m/%Y"),
                version=__version__)

    cluster_report = os.path.join(outdir, "cluster_report.html")
    with open(cluster_report, "wb") as f:
        f.write(result.encode('utf-8'))

    # 'with' closes the file even if to_pwm() raises.
    with open(outfile, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    logger.debug("Clustering done. See the result in %s",
            cluster_report)
    return clusters