Example 1
    def download(self, outdir=DEFAULT_OUT):
        tmpdir = mkdtemp()
        file_tmp = urlretrieve(self.URL, filename=None)[0]
        
        with zipfile.ZipFile(file_tmp,"r") as zip_ref:
            zip_ref.extractall(tmpdir)
        
        motifs = []
        for fname in glob.glob(os.path.join(tmpdir, "pwms/*")):
            m_id = os.path.splitext(os.path.basename(fname))[0]
            for m in read_motifs(fname, fmt="transfac"):
                if len(m) > 0:
                    m.id = m_id
                    motifs.append(m)
        outfile = os.path.join(outdir, self.NAME)
        with open(outfile, "w") as f:
            print("# CIS-BP motif database (v{})".format(self.VERSION), file=f)
            print("# Retrieved from: {}".format(self.URL), file=f)
            print("# Date: {}".format(self.date), file=f)
            for motif in motifs:
                print(motif.to_pwm(), file=f)
        
        shutil.rmtree(tmpdir)

        motifs = read_motifs(outfile)
        anno = self.annotate_factors(motifs)
        self.create_annotation(os.path.join(outdir, self.NAME), anno)  
Example 2
def command_scan(inputfile, pwmfile, nreport=1, fpr=0.01, cutoff=None, 
        bed=False, scan_rc=True, table=False, score_table=False, moods=False, 
        pvalue=None, bgfile=None, genome=None, ncpus=None, normalize=False):
    motifs = read_motifs(pwmfile)
    
    fa = as_fasta(inputfile, genome)
    
    # initialize scanner
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    
    if genome:
        s.set_genome(genome=genome)

    if genome or bgfile:
        s.set_background(genome=genome, fname=bgfile, length=fa.median_length())

    if not score_table:
        s.set_threshold(fpr=fpr, threshold=cutoff)
    
    if table:
        it = scan_table(s, inputfile, fa, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, moods)
    elif score_table:
        it = scan_score_table(s, fa, motifs, scan_rc, normalize=normalize) 
    else:
        it = scan_normal(s, inputfile, fa, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, moods, bed, normalize=normalize)
    
    for row in it:
        yield row
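
A minimal usage sketch for the generator defined above (input file names are hypothetical); rows are yielded lazily, so results can be written out as they arrive:

# Hypothetical input files; command_scan is the generator defined above.
for row in command_scan("peaks.fa", "motifs.pwm", fpr=0.01, bed=True):
    print(row)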
Example 3
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    threshold = check_threshold(data_dir, genome, scoring)
    
    config = MotifConfig()
    
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)
    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    scores = []
    if scoring == "count":
        for row in s.count(regions, cutoff=threshold):
            scores.append(row)
    else:
        for row in s.best_score(regions):
            scores.append(row)
   
    motif_names = [m.id for m in read_motifs(open(pwmfile))]
    return pd.DataFrame(scores, index=df.index, columns=motif_names)
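
A hedged usage sketch for scan_to_table as defined above; the table path, genome name and data directory are assumptions:

# Hypothetical inputs: a tab-separated table with regions as index, a genome name,
# a data directory for the threshold check, and "score" or "count" as scoring mode.
df = scan_to_table("regions.txt", "hg38", "gimme_data", "score")
df.to_csv("motif_scores.tsv", sep="\t")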
Example 4
 def set_motifs(self, motifs):
     self.motifs = motifs
     self.motif_ids = [m.id for m in read_motifs(open(motifs))]
     self.checksum = {}
     if self.use_cache:
         chksum = CityHash64("\n".join(sorted(self.motif_ids)))
         self.checksum[self.motifs] = chksum
Example 5
 def download(self, outdir=DEFAULT_OUT):
     ### JASPAR ###
     for group in self.GROUPS:
         if group != "":
             group = "_" + group
         outfile = os.path.join(outdir, self.NAME.format(group))
         url = self.URL.format(group)
         with open(outfile, "w") as f:
             with urlopen(url) as response:
                 for line in response:
                     line = line.decode().strip()
                     if line.startswith(">"):
                         line = "_".join(line.split("\t")[:2])
                     print(line, file=f)
     
         motifs = read_motifs(outfile, fmt="jaspar")
         with open(outfile, "w") as f:
             print("# JASPAR2018{} motif database".format(group), file=f)
             print("# Retrieved from: {}".format(url), file=f)
             print("# Date: {}".format(self.date), file=f)
             for motif in motifs:
                 print(motif.to_pwm(), file=f)
                
         #if group == "_vertebrates":
         anno = self.annotate_factors(motifs)
         self.create_annotation(os.path.join(outdir, self.NAME.format(group)), anno)  
Example 6
def create_roc_plots(pwmfile, fgfa, background, outdir):
    """Make ROC plots for all motifs."""
    motifs = read_motifs(pwmfile, fmt="pwm", as_dict=True)
    ncpus = int(MotifConfig().get_default_params()['ncpus'])
    pool = Pool(processes=ncpus)
    jobs = {}
    for bg,fname in background.items():
        for m_id, m in motifs.items():

            k = "{}_{}".format(str(m), bg)
            jobs[k] = pool.apply_async(
                get_roc_values, (motifs[m_id], fgfa, fname)
            )
    imgdir = os.path.join(outdir, "images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)
    
    roc_img_file = os.path.join(outdir, "images", "{}_roc.{}.png")

    for motif in motifs.values():
        for bg in background:
            k = "{}_{}".format(str(motif), bg)
            error, x, y = jobs[k].get()
            if error:
                logger.error("Error in thread: %s", error)
                logger.error("Motif: %s", motif)
                sys.exit(1)
            roc_plot(roc_img_file.format(motif.id, bg), x, y)
Example 7
    def _calc_report_values(self, pwm, background):
        self.logger.debug("Calculating final statistics for report")
        self.p = dict([(b,{}) for b in background])
        self.e = dict([(b,{}) for b in background])

        e_files = dict([(bg, self.bg_file["cluster_enrichment"][bg]) for bg in background])

        for bg in self.p.keys():
            for line in open(e_files[bg]).readlines():
                if not (line.startswith("#") or line.startswith("Motif\tSig")):
                    vals = line.strip().split("\t")
                    self.p[bg][vals[0]] = float(vals[2])
                    self.e[bg][vals[0]] = float(vals[5])

        self.auc = dict([(b,{}) for b in background])
        self.mncp = dict([(b,{}) for b in background])


        rocs = dict([(bg, [self.bg_file["fa"][bg], self.bg_file["roc"][bg]]) for bg in background])

        for bg in self.auc.keys():
            bg_fasta_file, roc_file = rocs[bg]
            self.auc[bg], self.mncp[bg] = self._roc_metrics(pwm, self.validation_fa, bg_fasta_file, roc_file)

        motifs = read_motifs(open(pwm), fmt="pwm")
        self.closest_match = self.determine_closest_match(motifs)
Example 8
def load_motifs(motif_file, cutoff=0.95):
    motifs = read_motifs(open(motif_file))
    d = parse_cutoff(motifs, cutoff)
    cutoffs = []
    for m in motifs:
        c = m.pwm_min_score() + (m.pwm_max_score() - m.pwm_min_score()) * d[m.id]
        cutoffs.append(c)
    
    return zip(motifs, cutoffs)
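
Note that zip() returns a one-shot iterator in Python 3, so the result should be consumed once or materialized with list(). A minimal usage sketch (the file name is hypothetical):

# Pair each motif with the absolute score cutoff derived from the relative cutoff.
for motif, score_cutoff in load_motifs("motifs.pwm", cutoff=0.95):
    print(motif.id, score_cutoff)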
Example 9
    def determine_closest_match(self, motifs):
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)
        db_motifs = []
        if db.endswith("pwm") or db.endswith("pfm"):
            db_motifs = read_motifs(open(db), fmt="pwm")
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        closest_match = {}
        mc = MotifComparer()
        db_motif_lookup = dict([(m.id, m) for m in db_motifs])
        match = mc.get_closest_match(motifs, db_motifs, "partial", "wic", "mean", parallel=False)
        for motif in motifs:
            # Calculate p-value
            pval, pos, orient = mc.compare_motifs(motif, db_motif_lookup[match[motif.id][0]], "partial", "wic", "mean", pval=True)
            closest_match[motif.id] = [db_motif_lookup[match[motif.id][0]], pval]
        return closest_match
Example 10
    def _run_program(self, bin, fastafile, savedir="", params=None):
        
        default_params = {"single":False, "background":None}
        if params is not None: 
            default_params.update(params)
        
        trawler = bin
        
        fastafile = os.path.abspath(fastafile)
        if not default_params["background"]:
            print "Background file needed!"
            sys.exit()
        bgfile = os.path.abspath(default_params["background"])
        savedir = os.path.abspath(savedir)
        
        #savedir = "/tmp/trawler/"

        tmp = NamedTemporaryFile(dir=self.tmpdir, delete=False)
        shutil.copy(fastafile, tmp.name)
        fastafile = tmp.name
    
        current_path = os.getcwd()
        os.chdir(self.dir())
        
        stdout = ""
        stderr = ""
        strand = "double"
        if default_params["single"]:
            strand = "single"
        cmd = "%s -sample %s -background %s -directory %s -strand %s" % (trawler, fastafile, bgfile, self.tmpdir, strand)
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) 
        out, err = p.communicate()
        stdout += out.decode()
        stderr += err.decode()
        
        os.chdir(current_path)
        motifs = []
        out_name = [dir for dir in os.listdir(self.tmpdir) if dir.startswith("tmp")][-1]
        out_file = os.path.join(self.tmpdir, out_name, "result", "%s.pwm" % out_name)
        if os.path.exists(out_file):
            motifs = read_motifs(open(out_file), fmt="pwm")
        
        # remove temporary files
        if os.path.exists(tmp.name):
            os.unlink(tmp.name)
        
        for motif in motifs:
            motif.id = "%s_%s" % (self.name, motif.id)
        
        return motifs, stdout, stderr
Example 11
    def download(self, outdir=DEFAULT_OUT):
        outfile = os.path.join(outdir, self.NAME)
        with open(outfile, "w") as f:
            with urlopen(self.URL) as response:
                for line in response:
                    line = line.decode().strip()
                    print(line, file=f)
    
        motifs = read_motifs(outfile, fmt="transfac")
        with open(outfile, "w") as f:
            print("# SwissRegulon motif database (hg19:FANTOM5)", file=f)
            print("# Retrieved from: {}".format(self.URL), file=f)
            print("# Date: {}".format(self.date), file=f)
            for motif in motifs:
                if len(motif) > 0:
                    print(motif.to_pwm(), file=f)
 
        motifs = read_motifs(outfile)
        anno = self.annotate_factors(motifs)
        self.create_annotation(os.path.join(outdir, self.NAME), anno)  
Example 12
 def download(self, outdir=DEFAULT_OUT):
     # Factorbook is only supplied in non-redundant form as a supplemental pdf
     # For now, use the non-redundant version included with GimmeMotifs
     infile = "data/motif_databases/factorbook.pfm"
     outfile = os.path.join(outdir, self.NAME)
     motifs = read_motifs(infile)
     with open(outfile, "w") as f:
         for motif in motifs:
             print(motif.to_pwm(), file=f)
     anno = self.annotate_factors(motifs)
     self.create_annotation(os.path.join(outdir, self.NAME), anno)  
Example 13
 def test2_stats_single_motif(self):
     """ Calculate motif statistics """
     
     m_id = "p53_Average_8_CATGyCnGGrCATGy"
     
     with open(self.motifs) as f:
         motifs = read_motifs(f)
     motif = [m for m in motifs if str(m) == m_id][0]
     
     stats = calc_stats(motif, self.fg_fa, self.bg_fa, stats=["roc_auc"])
     self.assertGreater(stats[m_id]["roc_auc"], 0.9)
Example 14
    def test1_prediction_result(self):
        """ Calculates statistics of motifs """
        tmp = tempfile.NamedTemporaryFile().name
        
        p = PredictionResult(tmp, fg_file=self.fg_fa, background={"random":self.bg_fa})

        with open(self.motifs) as f:
            motifs = read_motifs(f)
        p.add_motifs((0, (motifs, "", "")))
        p.wait_for_stats()        
        self.assertEqual(2, len(p.stats))
Example 15
def roc(args):
    """ Calculate ROC_AUC and other metrics and optionally plot ROC curve.
    """
    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"
    
    motifs = read_motifs(open(pwmfile), fmt="pwm")

    s = Scanner()
    s.set_motifs(pwmfile)
    
    ids = []
    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [m.id for m in motifs]

    fg_total = dict([(m.id, []) for m in motifs])
    for scores in s.best_score(fg_file):
        for motif,score in zip(motifs, scores):
            fg_total[motif.id].append(score)
    
    bg_total = dict([(m.id, []) for m in motifs])
    for scores in s.best_score(bg_file):
        for motif,score in zip(motifs, scores):
            bg_total[motif.id].append(score)
   
    plot_x = []
    plot_y = []
    # Print the metrics
    print "Motif\tROC AUC\tMNCP\tEnr. at 5% FDR\tMax enr.\tRecall at 10% FDR"
    for motif_id in ids:
        fg_vals = fg_total[motif_id] 
        bg_vals = bg_total[motif_id]    
        (x, y) = ROC_values(fg_vals, bg_vals) 
        plot_x.append(x)
        plot_y.append(y)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        enr_fdr = enr_at_fdr(fg_vals, bg_vals)
        max_enr,score = max_enrichment(fg_vals, bg_vals)
        recall = recall_at_fdr(fg_vals, bg_vals, 0.1)
        print "%s\t%0.3f\t%03f\t%0.2f\t%0.2f\t%0.4f" % (
                motif_id, auc, mncp, enr_fdr, max_enr, recall)
    
    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=ids)
Example 16
    def download(self, outdir=DEFAULT_OUT):
        tmpdir = mkdtemp()
        file_tmp = urlretrieve(self.URL, filename=None)[0]
        tar = tarfile.open(file_tmp)
        fname = "IMAGE/utils/Collection.motif"
        members = [tar.getmember(fname)]
        tar.extractall(tmpdir, members=members)        
        outfile = os.path.join(outdir, self.NAME)
    
        motifs = read_motifs(os.path.join(tmpdir,fname))
        with open(outfile, "w") as f:
            print("# IMAGE motif database (v1.1)", file=f)
            print("# Retrieved from: {}".format(self.URL), file=f)
            print("# Date: {}".format(self.date), file=f)
            for motif in motifs:
                print(motif.to_pwm(), file=f)
        shutil.rmtree(tmpdir)
 
        motifs = read_motifs(outfile)
        anno = self.annotate_factors(motifs)
        self.create_annotation(os.path.join(outdir, self.NAME), anno)  
Example 17
    def _run_program(self, bin, fastafile, savedir="", params=None):
        
        default_params = {"single":False, "background":None, "analysis":"medium", "number":5, "width":10}
        if params is not None: 
            default_params.update(params)
        
        homer = bin
        
        fastafile = os.path.abspath(fastafile)
        
        # Background file is essential!
        if not default_params["background"]:
            print "Background file needed!"
            sys.exit()
        
        bgfile = os.path.abspath(default_params["background"])
        
        outfile = NamedTemporaryFile(
                dir=self.tmpdir, 
                prefix= "homer_w{}.".format(default_params["width"])
                ).name
        
        stderr = ""
        
        strand = ""
        if default_params["single"]:
            strand = " -strand + "

        cmd = "%s denovo -i %s -b %s -len %s -S %s %s -o %s -p 8" % (
            homer,
            fastafile,
            bgfile,
            default_params["width"],
            default_params["number"],
            strand,
            outfile)

        stdout = "Running command:\n{}\n".format(cmd)
        
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, cwd=self.tmpdir) 
        out, err = p.communicate()
        stdout += out.decode()
        stderr += err.decode()
        
        motifs = []
        
        if os.path.exists(outfile):
            motifs = read_motifs(open(outfile), fmt="pwm")
            for i, m in enumerate(motifs):
                m.id = "{}_{}_{}".format(self.name, default_params["width"], i + 1)
        
        return motifs, stdout, stderr
Example 18
    def _run_program(self, bin, fastafile, savedir="", params=None):
        if params is None:
            params = {}
        
        default_params = {"single":False, "background":None, "analysis":"medium", "number":5, "width":10}
        default_params.update(params)
        
        cmd = bin
        
        fastafile = os.path.abspath(fastafile)
        
        bgfile = default_params["background"]
        background = ""
        if bgfile:
            bgfile = os.path.abspath(bgfile)
            background = " --negSet {0} ".format(bgfile)

        outfile = os.path.join(self.tmpdir, os.path.basename(fastafile.replace(".fa", ".pwm")))
        
        stdout = ""
        stderr = ""
        
        strand = ""
        if not default_params["single"]:
            strand = " --revcomp "

        cmd = "%s %s %s --localization --batch --no-graphics %s %s" % (
            cmd,
            self.tmpdir, 
            fastafile,
            background,
            strand
            )

        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) 
        out, err = p.communicate()
        stdout += out.decode()
        stderr += err.decode()
        
        motifs = []
        
        if os.path.exists(outfile):
            motifs = read_motifs(open(outfile), fmt="xxmotifs")
            for m in motifs:
                m.id = "{0}_{1}".format(self.name, m.id)
        else:
            stdout += "\nMotif file {0} not found!\n".format(outfile)
            stderr += "\nMotif file {0} not found!\n".format(outfile)
        
        return motifs, stdout, stderr
Example 19
 def download(self, outdir=DEFAULT_OUT):
     outfile = os.path.join(outdir, self.NAME)
     with open(outfile, "w") as f:
         print("# ENCODE motif database", file=f)
         print("# Retrieved from: {}".format(self.URL), file=f)
         print("# Date: Dec. 2013", file=f)
         with urlopen(self.URL) as response:
             for line in response:
                 line = line.decode().strip()
                 if line.startswith(">"):
                     line = line.replace("\t", " ")
                 print(line, file=f)
     motifs = read_motifs(outfile)
     anno = self.annotate_factors(motifs)
     self.create_annotation(os.path.join(outdir, self.NAME), anno)  
Example 20
    def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
        motifs = dict([(m.id, m) for m in read_motifs(open(pwm_file), fmt="pwm")])

        jobs = {}
        for id,m in motifs.items():
            jobs[id] = self.job_server().apply_async(get_roc_values, (motifs[id],fg_fasta,bg_fasta,))

        roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

        for id in motifs.keys():
            error, x, y = jobs[id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)

            roc_plot(roc_img_file % (id,name), x, y)
Example 21
    def download(self, outdir=DEFAULT_OUT):
        ### Homer ###
        pfm_out = os.path.join(outdir, self.NAME)
        with open(pfm_out, "w") as f:
            print("# Homer motif database (v4.10)", file=f)
            print("# Retrieved from: {}".format(self.URL), file=f)
            print("# Date: {}".format(self.date), file=f)
            with urlopen(self.URL) as response:
                for line in response:
                    line = line.decode().strip()
                    if line.startswith(">"):
                        line = "_".join(line.split("\t")[:2])
                    print(line, file=f)

        motifs = read_motifs(pfm_out)
        anno = self.annotate_factors(motifs)
        self.create_annotation(os.path.join(outdir, self.NAME), anno)  
Example 22
 def download(self, outdir=DEFAULT_OUT):
     for group in ["HUMAN", "MOUSE"]:
         outfile = os.path.join(outdir, self.NAME.format(group))
         url = self.URL.format(group)
         with open(outfile, "w") as f:
             print("# HOCOMOCOv10_{} motif database".format(group), file=f)
             print("# Retrieved from: {}".format(url), file=f)
             print("# Date: {}".format(self.date), file=f)
             with urlopen(url) as response:
                 for line in response:
                     line = line.decode().strip()
                     if line.startswith(">"):
                         line = "_".join(line.split("\t")[:2])
                     print(line, file=f)
         motifs = read_motifs(outfile)
         anno = self.annotate_factors(motifs, self.ANNO_URL.format(group))
         self.create_annotation(os.path.join(outdir, self.NAME.format(group)), anno)  
Example 23
    def set_motifs(self, motifs):
        try:
            # Check if motifs is a list of Motif instances
            motifs[0].to_pwm()
            tmp = NamedTemporaryFile(mode="w", delete=False)
            for m in motifs:
                tmp.write("{}\n".format(m.to_pwm()))
            tmp.close()
            motif_file = tmp.name
        except AttributeError:
            motif_file = motifs

        self.motifs = motif_file
        self.motif_ids = [m.id for m in read_motifs(motif_file)]
        self.checksum = {}
        if self.use_cache:
            chksum = xxhash.xxh64("\n".join(sorted(self.motif_ids))).digest()
            self.checksum[self.motifs] = chksum
Example 24
    def _create_text_report(self, pwm, background):
        self.logger.debug("Creating text report")
        motifs = read_motifs(open(pwm), fmt="pwm")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        f = open(self.text_report, "w")
        header = "ID\tconsensus\tBest match db\tp-value best match\t" + "\t".join("Enrichment (%s)\tp-value (%s)\tROC AUC (%s)\tMNCP (%s)" % (b,b,b,b) for b in background)
        #print header
        f.write("%s\n" % header)
        for motif in sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True):
            vals = [motif.id, motif.to_consensus(), self.closest_match[motif.id][0].id, self.closest_match[motif.id][1]]
            for bg in background:
                vals += [self.e[bg][motif.id], self.p[bg][motif.id], self.auc[bg][motif.id], self.mncp[bg][motif.id]]
            f.write("%s\n" % "\t".join([str(x) for x in vals]))
            #print "%s\n" % "\t".join([str(x) for x in vals])
        f.close()
Example 25
def scan_to_best_match(fname, motifs, ncpus=None, genome=None, score=False):
    """Scan a FASTA file with motifs.

    Scan a FASTA file and return a dictionary with the best match per motif.

    Parameters
    ----------
    fname : str
        Filename of a sequence file in FASTA format.

    motifs : list
        List of motif instances.

    Returns
    -------
    result : dict
        Dictionary with motif scanning results.
    """
    # Initialize scanner
    s = Scanner(ncpus=ncpus)
    s.set_motifs(motifs)
    s.set_threshold(threshold=0.0)
    if genome:
        s.set_genome(genome)

    if isinstance(motifs, six.string_types):
        motifs = read_motifs(motifs)

    logger.debug("scanning %s...", fname)
    result = dict([(m.id, []) for m in motifs])
    if score:
        it = s.best_score(fname)
    else:
        it = s.best_match(fname)
    for scores in it:
        for motif, score in zip(motifs, scores):
            result[motif.id].append(score)

    # Close the pool and reclaim memory
    del s

    return result
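
A usage sketch for scan_to_best_match (file names and genome are assumptions); with score=True each motif id maps to its best score per sequence, otherwise to best-match tuples:

# Hypothetical inputs; motifs can be a list of Motif instances or a motif file name.
result = scan_to_best_match("peaks.fa", "motifs.pfm", genome="hg38", score=True)
for motif_id, scores in result.items():
    print(motif_id, max(scores))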
Example 26
def create_denovo_motif_report(
    inputfile, pfmfile, fgfa, background, locfa, outdir, params, stats=None
):
    """Create text and graphical (.html) motif reports."""
    logger.info("creating de novo reports")

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pfmfile, fgfa, background, outdir, params["genome"])

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            for m, s in calc_stats(fg_file=fgfa, bg_file=bgfa, motifs=motifs).items():
                if m not in stats:
                    stats[m] = {}
                stats[m][bg] = s

    stats = add_star(stats)

    if not params:
        params = {}
    cutoff_fpr = params.get("cutoff_fpr", 0.9)
    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir, "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(
        inputfile, pfmfile, background, closest_match, outdir, stats
    )
Example 27
    def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
        # Simple case, only one threshold
        if np.all(self.threshold.nunique(axis=0) == 1):
            return self.threshold.iloc[0].to_dict()

        if motifs is None:
            motifs = read_motifs(self.motifs)
        seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs]

        gc_bin_count = Counter(seq_gc_bins)

        _threshold = self.threshold
        if zscore:
            grouped = _threshold.groupby(_threshold.index).apply(scale, axis=0)
            _threshold = pd.DataFrame(
                np.vstack(grouped.values),
                index=_threshold.index,
                columns=_threshold.columns,
            )

        nseqs = int(20000 / np.sum(list(gc_bin_count.values())))
        t = {}
        maxt = pd.Series([m.pwm_max_score() for m in motifs],
                         index=_threshold.columns)
        # We do this in a loop as the DataFrame will get too big to fit in memory
        # when the difference between the number of sequences per gc_bin is very
        # high.
        _threshold = _threshold.reset_index()
        idx = np.hstack([
            _threshold[_threshold[_threshold.columns[0]] == gc_bin].sample(
                nseqs * count, replace=True, random_state=42).index.values
            for gc_bin, count in gc_bin_count.items()
        ])
        for motif in _threshold.columns[1:]:
            val = _threshold.loc[idx, motif].quantile(0.99,
                                                      interpolation="higher")
            if val < maxt.loc[motif]:
                t[motif] = val
            else:
                t[motif] = None
        return t
Example 28
def logo(args):
    if args.pfmfile is None and args.ids is None:
        name = os.path.splitext(os.path.split(pfmfile_location(None))[-1])[0]
        print(
            "Use the -i argument to specify which motif ids you want to use for logos."
        )
        print("If you really want to create logos for all of the motifs in the default")
        print("PFM file use the following command:")
        print(f"gimme logo -p {name}")
        sys.exit(1)
    inputfile = args.pfmfile

    motifs = read_motifs(inputfile)
    if args.ids:
        ids = args.ids.split(",")
        motifs = [m for m in motifs if m.id in ids]

    for motif in motifs:
        motif.plot_logo(
            fname="{}.png".format(motif.id), kind=args.kind, title=args.title
        )
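
The handler above corresponds to command-line calls along the lines of `gimme logo -p motifs.pfm -i <comma-separated motif ids>`, matching the -p and -i arguments referenced in its help text; without -i (and without an explicit PFM file) it exits rather than rendering logos for the whole default database.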
Example 29
    def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
        motifs = dict([(m.id, m)
                       for m in read_motifs(open(pwm_file), fmt="pwm")])

        jobs = {}
        for id, m in motifs.items():
            jobs[id] = self.job_server().apply_async(get_roc_values, (
                motifs[id],
                fg_fasta,
                bg_fasta,
            ))

        roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

        for id in motifs.keys():
            error, x, y = jobs[id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)

            roc_plot(roc_img_file % (id, name), x, y)
Example 30
 def download(self, outdir=DEFAULT_OUT):
     for tax in ["insects", "plants", "vertebrates"]:
         tax_ = tax
         if not tax.endswith("es"):
             tax_ = tax[:-1]
         url = self.URL.format(tax.capitalize(), tax_)
         print(url)
         name = self.NAME.format(tax)
         
         file_tmp = urlretrieve(url, filename=None)[0]
         motifs = read_motifs(file_tmp, fmt="transfac")
         outfile = os.path.join(outdir, name)
         with open(outfile, "w") as f:
             print("# RSAT non-redundant {} motif database".format(tax), file=f)
             print("# Retrieved from: {}".format(url), file=f)
             print("# Date: {}".format(self.date), file=f)
             for motif in motifs:
                 print(motif.to_pwm(), file=f)
         
         anno = self.annotate_factors(motifs)
         self.create_annotation(os.path.join(outdir, self.NAME.format(tax)), anno)  
Example 31
def threshold(args):
    """Calculate motif score threshold for a given FPR."""
    if args.fpr < 0 or args.fpr > 1:
        print("Please specify a FPR between 0 and 1")
        sys.exit(1)

    motifs = read_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)
    s.set_threshold(args.fpr, filename=args.inputfile)

    print("Motif\tScore\tCutoff")
    for motif in motifs:
        min_score = motif.pwm_min_score()
        max_score = motif.pwm_max_score()
        opt_score = s.threshold[motif.id]
        if opt_score is None:
            opt_score = motif.pwm_max_score()
        threshold = (opt_score - min_score) / (max_score - min_score)
        print("{0}\t{1}\t{2}".format(motif.id, opt_score, threshold))
Example 32
    def _load_factor2motifs(self, pfmfile=None, indirect=True, factors=None):
        motifs = read_motifs(pfmfile, as_dict=True)
        f2m = {}

        if self.is_human_genome():
            valid_factors = self._load_human_factors()

        for name, motif in motifs.items():
            for factor in get_motif_factors(motif, indirect=indirect):
                if factors is not None and factor not in factors:
                    continue

                # TODO: this is temporary, while the motif database we use
                # not very clean...
                if self.is_human_genome():
                    factor = factor.upper()

                if self.is_human_genome() and factor not in valid_factors:
                    continue

                f2m.setdefault(factor, []).append(name)
        return f2m
Example 33
def threshold(args):
    """Calculate motif score threshold for a given FPR."""
    if args.fpr < 0 or args.fpr > 1:
        print("Please specify a FPR between 0 and 1")
        sys.exit(1)

    motifs = read_motifs(args.pwmfile)
    
    s = Scanner()
    s.set_motifs(args.pwmfile)
    s.set_threshold(args.fpr, filename=args.inputfile)

    print("Motif\tScore\tCutoff")
    for motif in motifs:
        min_score = motif.pwm_min_score()
        max_score = motif.pwm_max_score()
        opt_score = s.threshold[motif.id]
        if opt_score is None:
            opt_score = motif.pwm_max_score()
        threshold = (opt_score - min_score) / (max_score - min_score)
        print("{0}\t{1}\t{2}".format(
                motif.id, opt_score, threshold))
Example 34
    def test1_denovo(self):
        """ de novo motif prediction """
       
        
        gimme_motifs("test/data/denovo/input.fa", self.outdir,
            params={
                "tools":"BioProspector,Homer,MDmodule",
                "fraction":0.5,
                "background":"random"
                },
            filter_significant=True,
            cluster=True)
       
        fnames = ["motifs.pwm", "motif_report.html", "cluster_report.html",
                    "params.txt", "stats.random.txt"]
        
    
        with open(os.path.join(self.outdir, 'gimmemotifs.log')) as f:
            log = f.read()
        self.assertIn("clustering", log)
    
        # Check if all output files are there
        for fname in fnames:
            self.assertTrue(os.path.exists(os.path.join(self.outdir, fname)))   
  
        # Check if correct motif is predicted
        with open(os.path.join(self.outdir, "motifs.pwm")) as f:
            predicted_motifs = read_motifs(f)
        ap1 = motif_from_consensus("TGASTCA")

        mc = MotifComparer()
        ap1_predicted = False
        for motif in predicted_motifs:
            match = mc.get_closest_match(ap1, motif)
            if match["TGASTCA"][1][3] < 1e-5:
                ap1_predicted = True
                break

        self.assertTrue(ap1_predicted)
Example 35
def create_denovo_motif_report(inputfile, pwmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Create text and graphical (.html) motif reports."""
    logger.info("creating reports")

    motifs = read_motifs(pwmfile, fmt="pwm")
    
    # ROC plots
    create_roc_plots(pwmfile, fgfa, background, outdir)
    
    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)
    
    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            for m, s in calc_stats(motifs, fgfa, bgfa).items():
                if m not in stats:
                    stats[m] = {}
                stats[m][bg] = s

    stats = add_star(stats)

    if not params:
        params = {}
    cutoff_fpr = params.get('cutoff_fpr', 0.9)
    lwidth = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir, "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lwidth, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pwmfile, background, closest_match, outdir, stats)
Example 36
def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="logos"):
    if motifs is None:
        motifs = read_motifs(pfmfile, as_dict=True)

    if not os.path.exists(outdir):
        os.makedirs(outdir)
    if not os.path.exists(os.path.join(outdir, subdir)):
        os.makedirs(os.path.join(outdir, subdir))

    img_series = []
    for motif in series:
        if motif not in motifs:
            raise ValueError(f"Motif {motif} does not occur in motif database")
        fname = subdir + "/{}.png".format(re.sub(r"[^a-zA-Z0-9\-]+", "_", motif))
        if not os.path.exists(fname):
            motifs[motif].plot_logo(fname=os.path.join(outdir, fname))
        img_series.append(fname)

    if isinstance(series, pd.Index):
        index = series
    else:
        index = series.index
    return pd.Series(data=img_series, index=index)
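
A usage sketch (motif ids, PFM file and output directory are hypothetical); the returned Series holds the logo image paths relative to outdir:

ids = pd.Series(["GM.5.0.Sox.0001", "GM.5.0.p53.0001"])  # hypothetical motif ids
logo_paths = motif_to_img_series(ids, pfmfile="motifs.pfm", outdir="report")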
Example 37
    def _roc_metrics(self, pwm, sample_fa, bg_fa, roc_file):
        motifs = dict([(m.id, m) for m in read_motifs(open(pwm), fmt="pwm")])

        jobs = {}
        for id,m in motifs.items():
            jobs[id] = self.job_server().apply_async(get_scores, (motifs[id],sample_fa,bg_fa,))

        all_auc = {}
        all_mncp = {}
        f = open(roc_file, "w")
        f.write("Motif\tROC AUC\tMNCP\tMax f-measure\tSens @ max f-measure\n")
        for id in motifs.keys():
            error, auc, mncp, max_f, y = jobs[id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)
            f.write("%s\t%s\t%s\t%s\t%s\n" % (id,auc,mncp,max_f,y))
            all_auc[id] = auc
            all_mncp[id] = mncp

        f.close()

        return all_auc,all_mncp
Example 38
def load_motifs(motifs_name):
    """

    Load motifs from celloracle motif database

    Args:
        motifs_name (str) : Name of motifs.

    Returns:
        list : List of gimmemotifs.motif object.


    """

    if motifs_name not in MOTIFS_LIST:
        raise ValueError(
            "The motifs name was not in the list. Available motifs: ",
            MOTIFS_LIST)

    path = MOTIFS_PATH_DICT[motifs_name]
    motifs = read_motifs(path)

    return motifs
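
A hedged usage sketch; the collection name below is an assumption and must be one of the keys listed in MOTIFS_LIST:

motifs = load_motifs("CisBP_ver2_Homo_sapiens.pfm")  # hypothetical collection name
print(len(motifs), motifs[0].id)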
Example 39
    def download(self, outdir=DEFAULT_OUT):
        for tax in ["insects", "plants", "vertebrates"]:
            tax_ = tax
            if not tax.endswith("es"):
                tax_ = tax[:-1]
            url = self.URL.format(tax.capitalize(), tax_)
            print(url)
            name = self.NAME.format(tax)

            file_tmp = urlretrieve(url, filename=None)[0]
            motifs = read_motifs(file_tmp, fmt="transfac")
            outfile = os.path.join(outdir, name)
            with open(outfile, "w") as f:
                print("# RSAT non-redundant {} motif database".format(tax),
                      file=f)
                print("# Retrieved from: {}".format(url), file=f)
                print("# Date: {}".format(self.date), file=f)
                for motif in motifs:
                    print(motif.to_pwm(), file=f)

            anno = self.annotate_factors(motifs)
            self.create_annotation(os.path.join(outdir, self.NAME.format(tax)),
                                   anno)
Example 40
def location(args):
    """
    Creates a histogram of motif locations.

    Parameters
    ----------
    args : argparse object
        Command line arguments.
    """
    fastafile = args.fastafile
    pfmfile = args.pfmfile

    lsize = args.size
    if not lsize:
        f = Fasta(fastafile)
        lsize = len(f.items()[0][1])
        f = None

    jobs = []
    motifs = read_motifs(pfmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    for motif in motifs:
        if motif.id in ids:
            outfile = os.path.join("%s_histogram" % motif.id)
            jobs.append(
                pool.apply_async(
                    motif_localization,
                    (fastafile, motif, lsize, outfile, args.cutoff)))

    for job in jobs:
        job.get()
Example 41
def maelstrom_html_report(outdir, infile, pwmfile=None, threshold=2):
    df = pd.read_table(infile, index_col=0)
    df = df[np.any(abs(df) >= threshold, 1)]
    M = max(abs(df.min().min()), df.max().max())
    m = -M

    motifs = read_motifs(pwmfile)

    df.index.name = None
    cols = df.columns
    
    idx = [motif.id for motif in motifs]
    direct = [",".join(sorted(set([x.upper() for x in motif.factors[DIRECT_NAME]]))) for motif in motifs]
    indirect = [",".join(sorted(set([x.upper() for x in motif.factors[INDIRECT_NAME]]))) for motif in motifs]
    m2f = pd.DataFrame({DIRECT_NAME:direct, INDIRECT_NAME:indirect}, index=idx)

    factor_cols = [DIRECT_NAME, INDIRECT_NAME]
    if True:
        for factor_col in factor_cols:
            f = m2f[factor_col].str.len() > 30
            m2f[factor_col] = '<div title="' + m2f[factor_col] + '">' + m2f[factor_col].str.slice(0,30)
            m2f.loc[f, factor_col] += '(...)'
            m2f[factor_col] += '</div>'
        df = df.join(m2f)

    df["logo"] = ['<img src="logos/{}.png" height=40/>'.format(re.sub('[()/]', '_', x)) for x in list(df.index)]

    if not os.path.exists(outdir + "/logos"):
        os.makedirs(outdir + "/logos")
    for motif in motifs:
        if motif.id in df.index:
            motif.to_img(outdir + "/logos/{}.png".format(re.sub('[()/]', '_',motif.id)), fmt="PNG")

    template_dir = MotifConfig().get_template_dir()
    js = open(os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8").read()
    css = open(os.path.join(template_dir, "sortable/sortable-theme-slick.css"), encoding="utf-8").read()
    cm = sns.diverging_palette(240, 10, as_cmap=True)
    df = df[factor_cols + ["logo"] + list(cols)]
    
    df_styled = df.style
    absmax = np.max((abs(df[cols].max().max()), abs(df[cols].min().min())))
    target = absmax * 1.75

    for col in cols:
        smin = df[col].min()
        smax = df[col].max()
        diff = smax - smin
        low = abs((-target - smin) / diff)
        high = (target - smax) / diff
        df_styled = df_styled.background_gradient(cmap='RdBu_r', low=low, high=high, subset=[col])
    
    df_styled = df_styled.set_precision(3)
    df_styled = df_styled.set_table_attributes("data-sortable")
    df_styled = df_styled.render()
    df_styled = df_styled.replace("data-sortable", 'class="sortable-theme-slick" data-sortable')

    with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        f.write(df_styled)
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
Example 42
    def _create_report(self, pwm, background, stats=None, best_id=None):
        if stats is None:
            stats = {}
        if best_id is None:
            best_id = {}

        self.logger.debug("Creating graphical report")

        class ReportMotif:
            pass

        motifs = read_motifs(open(pwm), fmt="pwm")
        for m, match in self.closest_match.items():
            match[0].to_img(os.path.join(self.imgdir, "%s.png" % match[0].id),
                            format="PNG")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        roc_img_file = "%s_%s_roc"
        report_motifs = []
        sorted_motifs = sorted(motifs,
                               key=lambda m: self.mncp[sort_key][m.id],
                               reverse=True)

        for motif in sorted_motifs:
            rm = ReportMotif()
            rm.id = motif.id
            rm.id_href = {"href": "#%s" % motif.id}
            rm.id_name = {"name": motif.id}
            rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}

            rm.best = best_id[motif.id]

            rm.consensus = motif.to_consensus()
            rm.stars = stats["%s_%s" %
                             (motif.id, motif.to_consensus())]["stars"]

            rm.bg = {}
            for bg in background:
                rm.bg[bg] = {}
                rm.bg[bg]["e"] = "%0.2f" % self.e[bg].setdefault(motif.id, 0.0)
                rm.bg[bg]["p"] = "%0.2f" % self.p[bg].setdefault(motif.id, 1.0)
                rm.bg[bg]["auc"] = "%0.3f" % self.auc[bg][motif.id]
                rm.bg[bg]["mncp"] = "%0.3f" % self.mncp[bg][motif.id]
                rm.bg[bg]["roc_img"] = {
                    "src":
                    "images/" + os.path.basename(roc_img_file %
                                                 (motif.id, bg)) + ".png"
                }
                rm.bg[bg]["roc_img_link"] = {
                    "href":
                    "images/" + os.path.basename(roc_img_file %
                                                 (motif.id, bg)) + ".png"
                }

            rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
            rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}
            rm.match_img = {
                "src": "images/%s.png" % self.closest_match[motif.id][0].id
            }
            rm.match_id = self.closest_match[motif.id][0].id
            rm.match_pval = "%0.2e" % self.closest_match[motif.id][1]

            report_motifs.append(rm)

        total_report = self.motif_report

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("report_template.jinja.html")
        result = template.render(expname=self.basename,
                                 motifs=report_motifs,
                                 inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"),
                                 version=GM_VERSION)

        f = open(total_report, "w")
        f.write(result.encode('utf-8'))
        f.close()
Example 43
def _create_graphical_report(inputfile, pwm, background, closest_match, outdir, stats, best_id=None):
    """Create main gimme_motifs output html report."""
    if best_id is None:
        best_id = {}

    logger.debug("Creating graphical report")
    
    class ReportMotif(object):
        """Placeholder for motif stats."""
        pass

    config = MotifConfig()
    
    imgdir = os.path.join(outdir, "images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)
    
    motifs = read_motifs(pwm, fmt="pwm")
    
    roc_img_file = "%s_roc.%s"

    dbpwm = config.get_default_params()["motif_db"]
    pwmdir = config.get_motif_dir()

    dbmotifs = read_motifs(os.path.join(pwmdir, dbpwm), as_dict=True)
    
    report_motifs = []
    for motif in motifs:
        
        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src":  os.path.join("images", "%s.png" % motif.id)}
        motif.to_img(os.path.join(outdir, "images/{}.png".format(motif.id)), fmt="PNG")
        
        # TODO: fix best ID
        rm.best = "Gimme"#best_id[motif.id]

        rm.consensus = motif.to_consensus()
        rm.stars = int(np.mean(
                [stats[str(motif)][bg].get("stars", 0) for bg in background]
                ) + 0.5)

        rm.bg = {}
        for bg in background:
            rm.bg[bg] = {}
            this_stats = stats.get(str(motif), {}).get(bg)
            # TODO: fix these stats
            rm.bg[bg]["e"] = "%0.2f" % this_stats.get("enr_at_fpr", 1.0)
            rm.bg[bg]["p"] = "%0.2f" % this_stats.get("phyper_at_fpr", 1.0)
            rm.bg[bg]["auc"] = "%0.3f" % this_stats.get("roc_auc", 0.5)
            rm.bg[bg]["mncp"] = "%0.3f" % this_stats.get("mncp", 1.0)
            rm.bg[bg]["roc_img"] = {"src": "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"}
            rm.bg[bg][u"roc_img_link"] = {u"href": "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"}

        rm.histogram_img = {"data":"images/%s_histogram.svg" % motif.id}
        rm.histogram_link= {"href":"images/%s_histogram.svg" % motif.id}
        
        match_id = closest_match[motif.id][0]
        dbmotifs[match_id].to_img(os.path.join(outdir, "images/{}.png".format(match_id)), fmt="PNG")
    
        rm.match_img = {"src":  "images/{}.png".format(match_id)}
        rm.match_id = closest_match[motif.id][0]
        rm.match_pval = "%0.2e" % closest_match[motif.id][1][-1]

        report_motifs.append(rm)
    
    total_report = os.path.join(outdir, "motif_report.html")

    star_img = os.path.join(config.get_template_dir(), "star.png")
    shutil.copyfile(star_img, os.path.join(outdir, "images", "star.png"))

    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("report_template.jinja.html")
    # TODO: title
    result = template.render(
                    motifs=report_motifs, 
                    inputfile=inputfile, 
                    date=datetime.today().strftime("%d/%m/%Y"), 
                    version=__version__,
                    bg_types=list(background.keys()))

    with open(total_report, "wb") as f:
        f.write(result.encode('utf-8'))
Example 44
    def set_threshold(self, fpr=None, threshold=None, gc=False):
        """Set motif scanning threshold based on background sequences.

        Parameters
        ----------
        fpr : float, optional
            Desired FPR, between 0.0 and 1.0.

        threshold : float or str, optional
            Desired motif threshold, expressed as the fraction of the
            difference between minimum and maximum score of the PWM.
            Should either be a float between 0.0 and 1.0 or a filename
            with thresholds as created by 'gimme threshold'.

        """
        if threshold and fpr:
            raise ValueError("Need either fpr or threshold.")

        if fpr:
            fpr = float(fpr)
            if not (0.0 < fpr < 1.0):
                raise ValueError("Parameter fpr should be between 0 and 1")

        if not self.motifs:
            raise ValueError("please run set_motifs() first")

        thresholds = {}
        motifs = read_motifs(self.motifs)

        if threshold is not None:
            self.threshold = parse_threshold_values(self.motifs, threshold)
            return

        if not self.background:
            try:
                self.set_background(gc=gc)
            except Exception:
                raise ValueError("please run set_background() first")

        seqs = self.background.seqs

        lock.acquire()
        with Cache(CACHE_DIR) as cache:
            scan_motifs = []
            for motif in motifs:
                k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash,
                                          fpr)

                threshold = cache.get(k)
                if threshold is None:
                    scan_motifs.append(motif)
                else:
                    if np.isclose(threshold, motif.pwm_max_score()):
                        thresholds[motif.id] = None
                    elif np.isclose(threshold, motif.pwm_min_score()):
                        thresholds[motif.id] = 0.0
                    else:
                        thresholds[motif.id] = threshold

            if len(scan_motifs) > 0:
                logger.info("determining FPR-based threshold")
                for motif, threshold in self._threshold_from_seqs(
                        scan_motifs, seqs, fpr):
                    k = "{}|{}|{:.4f}".format(motif.hash(),
                                              self.background_hash, fpr)
                    cache.set(k, threshold)
                    if np.isclose(threshold, motif.pwm_max_score()):
                        thresholds[motif.id] = None
                    elif np.isclose(threshold, motif.pwm_min_score()):
                        thresholds[motif.id] = 0.0
                    else:
                        thresholds[motif.id] = threshold
        lock.release()
        self.threshold_str = "{}_{}_{}".format(fpr, threshold,
                                               self.background_hash)
        self.threshold = thresholds
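
A sketch of the call order implied by the checks above: motifs first, then a background, then the FPR-based threshold. The genome name and FPR are assumptions, and set_background is assumed to accept a genome keyword as in the scanning example earlier:

s = Scanner()
s.set_motifs("motifs.pfm")
s.set_background(genome="hg38")
s.set_threshold(fpr=0.01)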
Example 45
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pfmfile : str, optional
        Specify a PFM file for scanning.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    config = MotifConfig()

    if pfmfile is None:
        pfmfile = config.get_default_params().get("motif_db", None)
        if pfmfile is not None:
            pfmfile = os.path.join(config.get_motif_dir(), pfmfile)

    if pfmfile is None:
        raise ValueError("no pfmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    if len(regions) >= 1000:
        check_regions = np.random.choice(regions, size=1000, replace=False)
    else:
        check_regions = regions

    size = int(
        np.median([len(seq) for seq in as_fasta(check_regions, genome=genome).seqs])
    )
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pfmfile)
    s.set_genome(genome)
    s.set_background(genome=genome, gc=gc, size=size)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        msg = "creating score table"
        if zscore:
            msg += " (z-score"
            if gc:
                msg += ", GC%"
            msg += ")"
        else:
            msg += " (logodds)"
        logger.info(msg)
        for row in s.best_score(regions, zscore=zscore, gc=gc):
            scores.append(row)
        logger.info("done")

    motif_names = [m.id for m in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
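
A usage sketch for this variant (input table and genome name are assumptions); with scoring="count" the values are hit counts at the configured FPR, with "score" they are best scores, optionally as GC-normalized z-scores:

table = scan_to_table("peaks.txt", "hg38", "score", zscore=True, gc=True)
print(table.head())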
Example 46
    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        self.logger.info("clustering significant motifs.")

        trim_ic = 0.2
        clusters = []
        motifs = read_motifs(open(pfm_file), fmt="pwm")
        if len(motifs) == 1:
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(pfm_file,
                                  "total",
                                  "wic",
                                  "mean",
                                  True,
                                  threshold=float(threshold),
                                  include_bg=True,
                                  progress=False)
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for cluster, members in clusters:
            cluster.trim(trim_ic)
            cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id),
                           format="PNG")
            ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])
            if len(members) > 1:
                scores = {}
                for motif in members:
                    scores[motif] = mc.compare_motifs(cluster,
                                                      motif,
                                                      "total",
                                                      "wic",
                                                      "mean",
                                                      pval=True)
                add_pos = sorted(scores.values(), key=lambda x: x[1])[0][1]
                for motif in members:
                    score, pos, strand = scores[motif]
                    add = pos - add_pos

                    if strand in [1, "+"]:
                        pass
                    else:
                        #print "RC %s" % motif.id
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc
                    #print "%s\t%s" % (motif.id, add)
                    motif.to_img(os.path.join(
                        self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                                 format="PNG",
                                 add_left=add)
            ids[-1][2] = [
                dict([("src", "images/%s.png" % motif.id.replace(" ", "_")),
                      ("alt", motif.id.replace(" ", "_"))])
                for motif in members
            ]

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(expname=self.basename,
                                 motifs=ids,
                                 inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"),
                                 version=GM_VERSION)

        with open(self.cluster_report, "wb") as f:
            f.write(result.encode("utf-8"))

        f = open(cluster_pwm, "w")
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
        f.close()

        self.logger.debug("Clustering done. See the result in %s",
                          self.cluster_report)
        return clusters
Example n. 47
def select_nonredundant_motifs(roc_report,
                               pfmfile,
                               fg_table,
                               bg_table,
                               tolerance=0.001):
    pfmfile = pfmfile_location(pfmfile)
    motifs = read_motifs(pfmfile)
    motif_dict = read_motifs(pfmfile, as_dict=True)

    mc = MotifComparer()

    df = pd.read_csv(roc_report, sep="\t", index_col=0)
    df = df[df["Enr. at 1% FPR"] >= 2]
    motifs = [m for m in motifs if m.id in df.index]

    cols = ["ROC AUC", "PR AUC", "Enr. at 1% FPR", "Recall at 10% FDR"]
    rank = df[cols].rank().mean(1).sort_values(ascending=False)

    redundant_motifs = []
    keep = []
    while df[~df.index.isin(redundant_motifs)].shape[0] > 0:
        motif = rank[~rank.index.isin(redundant_motifs)].head(1).index[0]
        keep.append(motif)

        result = mc.get_all_scores(
            [motif_dict[motif]],
            [m for m in motifs if m.id not in redundant_motifs],
            "partial",
            "seqcor",
            "mean",
        )
        result = result[motif]
        redundant_motifs += [m for m in result.keys() if result[m][0] >= 0.7]
    logger.debug(f"Selected {len(keep)} motifs for feature elimination")

    # Read motif scan results
    fg_table = pd.read_csv(fg_table, index_col=0, comment="#", sep="\t")
    bg_table = pd.read_csv(bg_table, index_col=0, comment="#", sep="\t")

    X = pd.concat((fg_table, bg_table), axis=0)
    y = np.hstack((np.ones(fg_table.shape[0]), np.zeros(bg_table.shape[0])))

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.4,
        random_state=2,
        shuffle=True,
    )

    X_bla = X_train[keep]
    model = LogisticRegression(solver="liblinear", max_iter=500, penalty="l1")
    # = RandomForestClassifier(n_estimators=100)
    max_score = np.mean(
        cross_val_score(model,
                        X_bla,
                        y_train,
                        cv=5,
                        scoring="average_precision"))
    mean_scores = []
    step = 1

    logger.info("selecting non-redundant motifs")
    n_features = 1
    for i in range(1, X_bla.shape[1], step):
        rfe = RFE(model, n_features_to_select=i)
        fit = rfe.fit(X_bla, y_train)
        mean_score = np.mean(
            cross_val_score(
                model,
                X_bla.loc[:, fit.support_],
                y_train,
                cv=5,
                scoring="average_precision",
            ))
        if i > 1 and mean_score - mean_scores[-1] < (max_score * tolerance):
            n_features = i - 1
            break
        mean_scores.append(mean_score)

    rfe = RFE(model, n_features_to_select=n_features)
    fit = rfe.fit(X_bla, y_train)

    selected_features = X_bla.columns[fit.support_]
    model.fit(X_train.loc[:, selected_features], y_train)
    y_pred = model.predict_proba(X_test.loc[:, selected_features])[:, 1]
    pr_auc = average_precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    logger.info(
        f"selected {len(selected_features)} non-redundant motifs: ROC AUC {roc_auc:.3f}, PR AUC {pr_auc:.3f}"
    )
    return selected_features
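
A hedged usage sketch of select_nonredundant_motifs as defined above; all paths are hypothetical placeholders pointing at an existing gimme motifs output directory, and the pfm file name is the (assumed) default motif database.

nr_motifs = select_nonredundant_motifs(
    "outdir/gimme.roc.report.txt",                            # per-motif ROC/PR statistics
    "gimme.vertebrate.v5.0.pfm",                              # motif database (pfmfile)
    "outdir/motif_scan_results/input.motif.score.txt",        # foreground score table
    "outdir/motif_scan_results/background.motif.score.txt",   # background score table
    tolerance=0.001,  # stop RFE once the gain drops below 0.1% of the full-model score
)
print(f"kept {len(nr_motifs)} non-redundant motifs")
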
Example n. 48
def _create_graphical_report(inputfile,
                             pwm,
                             background,
                             closest_match,
                             outdir,
                             stats,
                             best_id=None):
    """Create main gimme_motifs output html report."""
    if best_id is None:
        best_id = {}

    logger.debug("Creating graphical report")

    class ReportMotif(object):
        """Placeholder for motif stats."""

        pass

    config = MotifConfig()

    imgdir = os.path.join(outdir, "images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)

    motifs = read_motifs(pwm, fmt="pfm")

    roc_img_file = "%s_roc.%s"

    dbpwm = config.get_default_params()["motif_db"]
    pwmdir = config.get_motif_dir()

    dbmotifs = read_motifs(os.path.join(pwmdir, dbpwm), as_dict=True)

    report_motifs = []
    for motif in motifs:

        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}
        motif.plot_logo(
            fname=os.path.join(outdir, "images/{}.png".format(motif.id)))

        # TODO: fix best ID
        rm.best = "Gimme"  # best_id[motif.id]

        rm.consensus = motif.to_consensus()
        rm.stars = int(
            np.mean(
                [stats[str(motif)][bg].get("stars", 0)
                 for bg in background]) + 0.5)

        rm.bg = {}
        for bg in background:
            rm.bg[bg] = {}
            this_stats = stats.get(str(motif), {}).get(bg, {})
            # TODO: fix these stats
            rm.bg[bg]["e"] = "%0.2f" % this_stats.get("enr_at_fpr", 1.0)
            rm.bg[bg]["p"] = "%0.2f" % this_stats.get("phyper_at_fpr", 1.0)
            rm.bg[bg]["auc"] = "%0.3f" % this_stats.get("roc_auc", 0.5)
            rm.bg[bg]["mncp"] = "%0.3f" % this_stats.get("mncp", 1.0)
            rm.bg[bg]["roc_img"] = {
                "src":
                "images/" + os.path.basename(roc_img_file % (motif.id, bg)) +
                ".png"
            }
            rm.bg[bg][u"roc_img_link"] = {
                u"href":
                "images/" + os.path.basename(roc_img_file % (motif.id, bg)) +
                ".png"
            }

        rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
        rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}

        match_id = closest_match[motif.id][0]
        dbmotifs[match_id].plot_logo(
            fname=os.path.join(outdir, "images/{}.png".format(match_id)))

        rm.match_img = {"src": "images/{}.png".format(match_id)}
        rm.match_id = closest_match[motif.id][0]
        rm.match_pval = "%0.2e" % closest_match[motif.id][1][-1]

        report_motifs.append(rm)

    total_report = os.path.join(outdir, "gimme.denovo.html")

    star_img = os.path.join(config.get_template_dir(), "star.png")
    shutil.copyfile(star_img, os.path.join(outdir, "images", "star.png"))

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("report_template.jinja.html")
    # TODO: title
    result = template.render(
        motifs=report_motifs,
        inputfile=inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=__version__,
        bg_types=list(background.keys()),
    )

    with open(total_report, "wb") as f:
        f.write(result.encode("utf-8"))
Example n. 49
def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=2):
    df = pd.read_table(infile, index_col=0)
    df = df[np.any(abs(df) >= threshold, 1)]

    motifs = read_motifs(pfmfile)

    df.index.name = None
    cols = df.columns

    idx = [motif.id for motif in motifs]
    direct = [
        ",".join(sorted(set([x.upper() for x in motif.factors[DIRECT_NAME]])))
        for motif in motifs
    ]
    indirect = [
        ",".join(sorted(set([x.upper()
                             for x in motif.factors[INDIRECT_NAME]])))
        for motif in motifs
    ]
    m2f = pd.DataFrame({
        DIRECT_NAME: direct,
        INDIRECT_NAME: indirect
    },
                       index=idx)

    factor_cols = [DIRECT_NAME, INDIRECT_NAME]
    for factor_col in factor_cols:
        f = m2f[factor_col].str.len() > 30
        m2f[factor_col] = ('<div title="' + m2f[factor_col] + '">' +
                           m2f[factor_col].str.slice(0, 30))
        m2f.loc[f, factor_col] += "(...)"
        m2f[factor_col] += "</div>"
    df = df.join(m2f)

    df["logo"] = [
        '<img src="logos/{}.png" height=40/>'.format(re.sub("[()/]", "_", x))
        for x in list(df.index)
    ]

    if not os.path.exists(outdir + "/logos"):
        os.makedirs(outdir + "/logos")
    for motif in motifs:
        if motif.id in df.index:
            motif.plot_logo(
                fname=outdir +
                "/logos/{}.png".format(re.sub("[()/]", "_", motif.id)))

    template_dir = MotifConfig().get_template_dir()
    js = open(os.path.join(template_dir, "sortable/sortable.min.js"),
              encoding="utf-8").read()
    css = open(
        os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
        encoding="utf-8",
    ).read()
    df = df[factor_cols + ["logo"] + list(cols)]

    df_styled = df.style
    absmax = np.max((abs(df[cols].max().max()), abs(df[cols].min().min())))
    target = absmax * 1.75

    for col in cols:
        smin = df[col].min()
        smax = df[col].max()
        diff = smax - smin
        low = abs((-target - smin) / diff)
        high = (target - smax) / diff
        df_styled = df_styled.background_gradient(cmap="RdBu_r",
                                                  low=low,
                                                  high=high,
                                                  subset=[col])

    df_styled = df_styled.set_precision(3)
    df_styled = df_styled.set_table_attributes("data-sortable")
    df_styled = df_styled.render()
    df_styled = df_styled.replace(
        "data-sortable", 'class="sortable-theme-slick" data-sortable')

    with open(outdir + "/gimme.maelstrom.report.html", "w",
              encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        f.write(df_styled)
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
Example n. 50
def roc_html_report(
    outdir,
    infile,
    pfmfile,
    outname="gimme.motifs.html",
    threshold=0.01,
    use_motifs=None,
    link_matches=False,
):
    df = pd.read_table(infile, index_col=0)
    df.index.name = None
    df["corrected P-value"] = multipletests(df["P-value"], method="fdr_bh")[1]

    cols = [
        "logo",
        "# matches",
        "# matches background",
        "P-value",
        "log10 P-value",
        "corrected P-value",
        "ROC AUC",
        "PR AUC",
        "Enr. at 1% FPR",
        "Recall at 10% FDR",
    ]

    motifs = read_motifs(pfmfile)
    if use_motifs is not None:
        motifs = [m for m in motifs if m.id in use_motifs]

    idx = [motif.id for motif in motifs]
    df = df.loc[idx]
    direct = [",".join(motif.factors[DIRECT_NAME]) for motif in motifs]
    indirect = [",".join(motif.factors[INDIRECT_NAME]) for motif in motifs]
    m2f = pd.DataFrame({
        DIRECT_NAME: direct,
        INDIRECT_NAME: indirect
    },
                       index=idx)

    factor_cols = [DIRECT_NAME, INDIRECT_NAME]
    for factor_col in factor_cols:
        f = m2f[factor_col].str.len() > 30
        m2f[factor_col] = ('<div title="' + m2f[factor_col] + '">' +
                           m2f[factor_col].str.slice(0, 30))
        m2f.loc[f, factor_col] += "(...)"
        m2f[factor_col] += "</div>"
    df = df.join(m2f)
    cols = factor_cols + cols

    df = df[df["corrected P-value"] <= threshold]

    if link_matches:
        df["# matches"] = ("<a href=motif_scan_results/" +
                           df.index.to_series() + ".matches.bed>" +
                           df["# matches"].astype(str) + "</a>")

    df["logo"] = [
        '<img src="logos/{}.png" height=40/>'.format(
            re.sub(r"[^-_\w]+", "_", x)) for x in list(df.index)
    ]

    df = df[cols]
    if not os.path.exists(outdir + "/logos"):
        os.makedirs(outdir + "/logos")
    for motif in motifs:
        if motif.id in df.index:
            motif.plot_logo(
                fname=outdir +
                "/logos/{}.png".format(re.sub(r"[^-_\w]+", "_", motif.id)))

    bar_cols = [
        "log10 P-value",
        "ROC AUC",
        "PR AUC",
        "Enr. at 1% FPR",
        "Recall at 10% FDR",
    ]
    template_dir = MotifConfig().get_template_dir()
    js = open(os.path.join(template_dir, "sortable/sortable.min.js"),
              encoding="utf-8").read()
    css = open(
        os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
        encoding="utf-8",
    ).read()
    with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        if df.shape[0] > 0:
            f.write(
                df.sort_values(
                    "ROC AUC",
                    ascending=False).style.bar(bar_cols).set_precision(3).
                set_table_attributes("data-sortable").render().replace(
                    "data-sortable",
                    'class="sortable-theme-slick" data-sortable'))
        else:
            f.write("No enriched motifs found.")
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
Example n. 51
def cluster_motifs_with_report(infile, outfile, outdir, threshold, title=None):
    # Cluster significant motifs

    if title is None:
        title = infile

    motifs = read_motifs(infile, fmt="pwm")

    trim_ic = 0.2
    clusters = []
    if len(motifs) == 0:
        return []
    elif len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        logger.info("clustering %d motifs.", len(motifs))
        tree = cluster_motifs(infile,
                              "total",
                              "wic",
                              "mean",
                              True,
                              threshold=float(threshold),
                              include_bg=True,
                              progress=False)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    img_dir = os.path.join(outdir, "images")

    if not os.path.exists(img_dir):
        os.mkdir(img_dir)

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        png = "images/{}.png".format(cluster.id)
        cluster.to_img(os.path.join(outdir, png), fmt="PNG")
        ids.append([cluster.id, {"src": png}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster,
                                                  motif,
                                                  "total",
                                                  "wic",
                                                  "mean",
                                                  pval=True)
            add_pos = sorted(scores.values(), key=lambda x: x[1])[0][1]
            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand in [1, "+"]:
                    pass
                else:
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                #print "%s\t%s" % (motif.id, add)
                png = "images/{}.png".format(motif.id.replace(" ", "_"))
                motif.to_img(os.path.join(outdir, png),
                             fmt="PNG",
                             add_left=add)
        ids[-1][2] = [
            dict([("src", "images/{}.png".format(motif.id.replace(" ", "_"))),
                  ("alt", motif.id.replace(" ", "_"))]) for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids,
                             inputfile=title,
                             date=datetime.today().strftime("%d/%m/%Y"),
                             version=__version__)

    cluster_report = os.path.join(outdir, "cluster_report.html")
    with open(cluster_report, "wb") as f:
        f.write(result.encode('utf-8'))

    f = open(outfile, "w")
    if len(clusters) == 1 and len(clusters[0][1]) == 1:
        f.write("%s\n" % clusters[0][0].to_pwm())
    else:
        for motif in tree.get_clustered_motifs():
            f.write("%s\n" % motif.to_pwm())
    f.close()

    logger.debug("Clustering done. See the result in %s", cluster_report)
    return clusters
Example n. 52
def command_scan(
    inputfile,
    pfmfile,
    nreport=1,
    fpr=0.01,
    cutoff=None,
    bed=False,
    scan_rc=True,
    table=False,
    score_table=False,
    moods=False,
    pvalue=None,
    bgfile=None,
    genome=None,
    ncpus=None,
    zscore=False,
    gcnorm=False,
):
    motifs = read_motifs(pfmfile)

    fa = as_fasta(inputfile, genome)

    # initialize scanner
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pfmfile)

    if genome:
        s.set_genome(genome=genome)

    if genome:
        s.set_background(genome=genome,
                         fname=bgfile,
                         size=fa.median_length(),
                         gc=gcnorm)
    if bgfile:
        s.set_background(genome=genome, fname=bgfile, size=fa.median_length())

    if not score_table:
        s.set_threshold(fpr=fpr, threshold=cutoff)

    if table:
        it = scan_table(s, inputfile, fa, motifs, cutoff, bgfile, nreport,
                        scan_rc, pvalue, moods)
    elif score_table:
        it = scan_score_table(s,
                              fa,
                              motifs,
                              scan_rc,
                              zscore=zscore,
                              gcnorm=gcnorm)
    else:
        it = scan_normal(
            s,
            inputfile,
            fa,
            motifs,
            cutoff,
            bgfile,
            nreport,
            scan_rc,
            pvalue,
            moods,
            bed,
            zscore=zscore,
            gcnorm=gcnorm,
        )

    for row in it:
        yield row
Example n. 53
m2f = {}
fnames = glob.glob(os.path.join(m2f_dir, "*.motif2factors.txt"))
for fname in fnames:
    with open(fname) as f:
        for line in f:
            vals = line.strip().split("\t")
            if len(vals) == 4:
                m2f[vals[0]] = m2f.get(vals[0], []) + [vals[1:]]
#print(m2f)

# Read factor-to-family mapping from the CIS-BP database
anno = pd.read_table(tf_info)
anno = anno[["TF_Name", "Family_Name"]].drop_duplicates().set_index("TF_Name")

# read motifs
motifs = dict([(m.id, m) for m in read_motifs(open(pfmfile))])
df_cluster = pd.read_table(clusterfile)

ic_cutoff = 5
mc = MotifComparer()
id_count = {}
df = df_cluster.loc[k]
sys.stderr.write(str(k) + "\n")
seen_line = {}
with open("{}.pfm".format(outname), "w") as out:
    with open("{}.motif2factors.txt".format(outname), "w") as m2f_out:
        print("Motif\tFactor\tEvidence\tCurated", file=m2f_out)
        for cluster in range(k):
            if cluster % 10 == 0:
                sys.stderr.write("{}\n".format(cluster))
                out.flush()
Example n. 54
def motifs(args):
    """ Calculate ROC_AUC and other metrics and optionally plot ROC curve."""
    if args.outdir is None:
        raise ValueError("an output directory is required!")
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    scan_dir = os.path.join(args.outdir, "motif_scan_results")
    if not os.path.exists(scan_dir):
        os.makedirs(scan_dir)

    file_type = determine_file_type(args.sample)
    outfile = os.path.join(args.outdir, f"input.w{args.size}.bed")
    sample = args.sample
    if file_type == "narrowpeak":
        narrowpeak_to_bed(args.sample, outfile, size=args.size)
        sample = outfile
    elif args.size and args.size > 0:
        if file_type == "fasta":
            logger.warn("size parameter will be ignored for FASTA input")
        elif file_type == "bed":
            write_equalsize_bedfile(args.sample, args.size, outfile)
            sample = outfile

    genome = args.genome
    if genome is None:
        args.zscore = False
        args.gc = False

    bgfile = None
    bg = args.background
    if bg is None:
        if genome is None:
            bg = "random"
        else:
            bg = "gc"

    if os.path.isfile(bg):
        bgfile = bg
        bg = "custom"
    else:
        # create background if not provided
        bgfile = os.path.join(args.outdir,
                              "generated_background.{}.fa".format(bg))
        size = args.size
        if size <= 0:
            size = None
        if bg == "gc":
            logger.info("creating background (matched GC%)")
        else:
            logger.info("creating background (random)")

        create_background_file(
            bgfile,
            bg,
            fmt="fasta",
            genome=genome,
            inputfile=sample,
            size=size,
            number=10000,
        )

    pfmfile = args.pfmfile

    motifs = []
    if args.known:
        motifs = read_motifs(pfmfile, fmt="pfm")

    if args.denovo:
        gimme_motifs(
            sample,
            args.outdir,
            params={
                "tools": args.tools,
                "analysis": args.analysis,
                "background": bg,
                "custom_background": bgfile,
                "genome": args.genome,
                "size": args.size,
            },
        )
        denovo = read_motifs(os.path.join(args.outdir, "gimme.denovo.pfm"))
        mc = MotifComparer()
        result = mc.get_closest_match(denovo,
                                      dbmotifs=pfmfile,
                                      metric="seqcor")
        match_motifs = read_motifs(pfmfile, as_dict=True)
        new_map_file = os.path.join(args.outdir, "combined.motif2factors.txt")
        base = os.path.splitext(pfmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            shutil.copyfile(map_file, new_map_file)

        motifs += denovo
        pfmfile = os.path.join(args.outdir, "combined.pfm")
        with open(pfmfile, "w") as f:
            for m in motifs:
                print(m.to_pwm(), file=f)

        with open(new_map_file, "a") as f:
            for m in denovo:
                print("{}\t{}\t{}\t{}".format(m.id, "de novo", "GimmeMotifs",
                                              "Y"),
                      file=f)
                if result[m.id][0] in match_motifs:
                    for factor in match_motifs[result[m.id]
                                               [0]].factors["direct"]:
                        print(
                            "{}\t{}\t{}\t{}".format(m.id, factor,
                                                    "inferred (GimmeMotifs)",
                                                    "N"),
                            file=f,
                        )
    else:
        logger.info("skipping de novo")

    stats = [
        "phyper_at_fpr",
        "roc_auc",
        "pr_auc",
        "enr_at_fpr",
        "recall_at_fdr",
        "roc_values",
        "matches_at_fpr",
    ]

    f_out = sys.stdout
    if args.outdir:
        f_out = open(args.outdir + "/gimme.roc.report.txt", "w")

    # Print the metrics
    f_out.write(
        "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
    )

    logger.info("creating motif scan tables")
    # ftype = determine_file_type(args.sample)
    # sample = args.sample
    # delete_sample = False
    # if ftype == "narrowpeak":
    #    f = NamedTemporaryFile(delete=False)
    #    logger.debug("Using {} as temporary BED file".format(f.name))
    #    narrowpeak_to_bed(args.sample, f.name, size=args.size)
    #    sample = f.name
    #    delete_sample = True

    # Create a table with the best score per motif for all motifs.
    # This has three reasons:
    # * Can be used to calculate statistics;
    # * Can be used to select a set of non-redundant motifs;
    # * These files are included in the output and can be used for further analysis.
    # (A standalone sketch of this scan_to_file call follows after this function.)
    score_table = os.path.join(scan_dir, "input.motif.score.txt")
    bg_score_table = os.path.join(scan_dir, "background.motif.score.txt")
    for infile, outfile in [(sample, score_table), (bgfile, bg_score_table)]:
        scan_to_file(
            infile,
            pfmfile,
            filepath_or_buffer=outfile,
            score_table=True,
            genome=args.genome,
            zscore=True,
            gcnorm=True,
        )

    n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0]
    n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0]

    logger.info("calculating stats")
    for motif_stats in calc_stats_iterator(
            motifs=pfmfile,
            fg_table=score_table,
            bg_table=bg_score_table,
            stats=stats,
            ncpus=args.ncpus,
    ):
        for motif in motifs:
            if str(motif) in motif_stats:
                log_pvalue = np.inf
                if motif_stats[str(motif)]["phyper_at_fpr"] > 0:
                    log_pvalue = -np.log10(
                        motif_stats[str(motif)]["phyper_at_fpr"])
                f_out.write(
                    "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n"
                    .format(
                        motif.id,
                        motif_stats[str(motif)]["matches_at_fpr"][0],
                        motif_stats[str(motif)]["matches_at_fpr"][0] /
                        n_input * 100,
                        motif_stats[str(motif)]["matches_at_fpr"][1],
                        motif_stats[str(motif)]["matches_at_fpr"][1] /
                        n_background * 100,
                        motif_stats[str(motif)]["phyper_at_fpr"],
                        log_pvalue,
                        motif_stats[str(motif)]["roc_auc"],
                        motif_stats[str(motif)]["pr_auc"],
                        motif_stats[str(motif)]["enr_at_fpr"],
                        motif_stats[str(motif)]["recall_at_fdr"],
                    ))
    f_out.close()

    # Select a set of "non-redundant" motifs.
    # Using Recursive Feature Elimination, a set of motifs is selected that
    # best explains the peaks in comparison to the background sequences.
    nr_motifs = select_nonredundant_motifs(
        args.outdir + "/gimme.roc.report.txt",
        pfmfile,
        score_table,
        bg_score_table,
        tolerance=0.001,
    )

    # Provide BED files with motif scan results for the non-redundant motifs
    # At the moment this is not ideal, as scanning is now performed twice
    # for this set of non-redundant motifs.
    motif_dict = dict([(m.id, m) for m in motifs])
    for motif in nr_motifs:
        with NamedTemporaryFile(mode="w") as f:
            print(motif_dict[motif].to_pwm(), file=f)
            f.flush()
            safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
            scan_to_file(
                sample,
                f.name,
                filepath_or_buffer=os.path.join(scan_dir,
                                                f"{safe_name}.matches.bed"),
                bed=True,
                fpr=0.01,
                genome=args.genome,
                zscore=True,
                gcnorm=True,
            )

    if args.report:
        logger.info("creating statistics report")
        if args.outdir:
            roc_html_report(
                args.outdir,
                args.outdir + "/gimme.roc.report.txt",
                pfmfile,
                threshold=0.01,
                outname="gimme.motifs.redundant.html",
                link_matches=False,
            )
            roc_html_report(
                args.outdir,
                args.outdir + "/gimme.roc.report.txt",
                pfmfile,
                threshold=0.01,
                use_motifs=nr_motifs,
                link_matches=True,
            )
            logger.info(
                f"gimme motifs final report: {os.path.join(args.outdir, 'gimme.motifs.html')}"
            )
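
As referenced in the comment above, this is a standalone, hedged sketch of the scan_to_file call used to build the best-score-per-motif tables; the import path, region file and genome name are assumptions rather than part of the snippet.

from gimmemotifs.scanner import scan_to_file  # assumed import path

scan_to_file(
    "peaks.bed",                      # hypothetical input regions (BED or FASTA)
    "gimme.vertebrate.v5.0.pfm",      # assumed default motif database
    filepath_or_buffer="input.motif.score.txt",
    score_table=True,                 # one best score per motif per region
    genome="hg38",                    # hypothetical genome name
    zscore=True,                      # z-score normalized motif scores
    gcnorm=True,                      # normalization within GC% bins
)
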
Example n. 55
def best_motif_in_cluster(
        single_pwm,
        clus_pwm,
        clusters,
        fg_fa,
        background,
        genome,
        stats=None,
        metrics=("roc_auc", "recall_at_fdr"),
):
    """Return the best motif per cluster for a clustering results.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.

    clus_pwm : str
        Filename of motifs.

    clusters :
        Motif clustering result.

    fg_fa : str
        Filename of FASTA file.

    background : dict
        Dictionary for background file names.

    genome : str
        Genome name.

    stats : dict, optional
        If statistics are not supplied they will be computed.

    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    # combine original and clustered motifs
    motifs = read_motifs(single_pwm) + read_motifs(clus_pwm)
    motifs = dict([(str(m), m) for m in motifs])

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        for m, s in calc_stats(fg_file=fg_fa,
                               bg_file=bg_fa,
                               motifs=clustered_motifs,
                               genome=genome).items():
            if m not in new_stats:
                new_stats[m] = {}
            new_stats[m][bg] = s
    stats.update(new_stats)

    rank = rank_motifs(stats, metrics)

    # rank the motifs
    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            eval_motifs = singles
            if clus not in motifs:
                eval_motifs.append(clus)
            eval_motifs = [motifs[str(e)] for e in eval_motifs]
            best_motif = sorted(eval_motifs, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)
    return best_motifs
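
A hedged usage sketch of best_motif_in_cluster; the clustering call and import path are assumptions, and every file name is a hypothetical placeholder for files produced earlier in a gimme motifs run.

from gimmemotifs.cluster import cluster_motifs  # assumed import path

tree = cluster_motifs("significant_motifs.pfm", threshold=0.95)
clusters = tree.getResult()  # [(cluster_motif, member_motifs), ...]

background = {
    "gc": "bg.gc.fa",          # background FASTA files, keyed by background type
    "random": "bg.random.fa",
}
stats = {}                     # filled in for motifs that do not have statistics yet
best = best_motif_in_cluster(
    "significant_motifs.pfm",  # single (unclustered) motifs
    "clustered_motifs.pfm",    # cluster average motifs
    clusters,
    "validation.fa",           # foreground sequences
    background,
    "hg38",                    # hypothetical genome name
    stats=stats,
)
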
Example n. 56
    def run_full_analysis(self, inputfile, user_params=None):
        """ Full analysis: from bed-file to motifs (including clustering, ROC-curves, location plots and html report) """
        self.logger.info("starting full motif analysis")
        self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

        if user_params is None:
            user_params = {}
        params = self.config.get_default_params()
        params.update(user_params)

        if params["torque"]:
            from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
            self.logger.debug("Using torque")
        else:
            from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
            self.logger.debug("Using multiprocessing")

        self.params = params
        #self.weird = params["weird_option"]

        background = [x.strip() for x in params["background"].split(",")]

        self.logger.debug("Parameters:")
        for param, value in params.items():
            self.logger.debug("  %s: %s", param, value)

        # Checking input
        self.input_type = "BED"
        # If we can load it as fasta then it is a fasta, yeh?
        try:
            Fasta(inputfile)
            self.logger.debug("Inputfile is a FASTA file")
            self.input_type = "FASTA"
        except Exception:
            # Leave it to BED
            pass

        index_msg = ("No index found for genome {}! "
                     "Has GimmeMotifs been configured correctly and is the "
                     "genome indexed?").format(params["genome"])
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])

        if self.input_type == "FASTA":
            for bg in background:
                if not bg in FA_VALID_BGS:
                    self.logger.info(
                        "Input type is FASTA, can't use background type '%s'",
                        bg)
                if bg == "genomic":
                    if not os.path.exists(index_dir):
                        self.logger.error(index_msg)
                        sys.exit(1)
            background = [bg for bg in background if bg in FA_VALID_BGS]

        elif self.input_type == "BED":
            # Does the index_dir exist?  #bed-specific
            if not os.path.exists(index_dir):
                self.logger.error(index_msg)
                sys.exit(1)

            # is it a valid bed-file etc.
            self._check_input(inputfile)  # bed-specific

            # Check for valid background
            for bg in background:
                if not bg in BED_VALID_BGS:
                    self.logger.info(
                        "Input type is BED, can't use background type '%s'",
                        bg)
            background = [bg for bg in background if bg in BED_VALID_BGS]

        if len(background) == 0:
            self.logger.error("No valid backgrounds specified!")
            sys.exit(1)

        self.max_time = None
        max_time = None
        # Maximum time?
        if params["max_time"]:
            try:
                max_time = float(params["max_time"])
            except Exception:
                self.logger.debug(
                    "Could not parse max_time value, setting to no limit")
                self.max_time = None

            if max_time is not None and max_time > 0:
                self.logger.debug(
                    "Time limit for motif prediction: %0.2f hours" % max_time)
                max_time = 3600 * max_time
                self.max_time = max_time
                self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
            else:
                self.logger.debug(
                    "Invalid time limit for motif prediction, setting to no limit"
                )
                self.max_time = None
        else:
            self.logger.debug("No time limit for motif prediction")

        if "random" in background:
            self.markov_model = params["markov_model"]

        # Create the necessary files for motif prediction and validation
        if self.input_type == "BED":
            self.prepare_input_bed(inputfile, params["genome"],
                                   params["width"], params["fraction"],
                                   params["abs_max"], params["use_strand"])

            # Create file for location plots
            index_dir = os.path.join(self.config.get_index_dir(),
                                     params["genome"])
            lwidth = int(params["lwidth"])
            width = int(params["width"])
            extend = (lwidth - width) // 2
            genome_index.track2fasta(index_dir,
                                     self.validation_bed,
                                     self.location_fa,
                                     extend_up=extend,
                                     extend_down=extend,
                                     use_strand=params["use_strand"],
                                     ignore_missing=True)

        elif self.input_type == "FASTA":
            self.prepare_input_fa(inputfile, params["width"],
                                  params["fraction"], params["abs_max"])

            # File for location plots
            self.location_fa = self.validation_fa
            fa = Fasta(self.location_fa)
            seqs = fa.seqs
            lwidth = len(seqs[0])
            all_same_width = all(len(seq) == lwidth for seq in seqs)
            if not all_same_width:
                self.logger.warn(
                    "PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!"
                )

        else:
            self.logger.error("Unknown input type, shouldn't happen")
            sys.exit(1)

        tools = dict([(x.strip(), x
                       in [y.strip() for y in params["tools"].split(",")])
                      for x in params["available_tools"].split(",")])

        self.create_background(background, params["genome"], params["width"])

        # Predict the motifs
        analysis = params["analysis"]
        """ Predict motifs, input is a FASTA-file"""
        self.logger.info("starting motif prediction (%s)", analysis)
        self.logger.info("tools: %s",
                         ", ".join([x for x in tools.keys() if tools[x]]))

        bg_file = self.bg_file["fa"][sorted(
            background, key=lambda x: BG_RANK[x])[0]]
        self.logger.debug("Using bg_file %s for significance" % bg_file)
        result = pp_predict_motifs(self.prediction_fa,
                                   self.predicted_pfm,
                                   analysis,
                                   params["genome"],
                                   params["use_strand"],
                                   self.prediction_bg,
                                   tools,
                                   self.job_server(),
                                   logger=self.logger,
                                   max_time=self.max_time,
                                   fg_file=self.validation_fa,
                                   bg_file=bg_file)

        motifs = result.motifs
        self.logger.info("predicted %s motifs", len(motifs))
        self.logger.debug("written to %s", self.predicted_pfm)

        if len(motifs) == 0:
            self.logger.info("no motifs found")
            sys.exit()

        # Write stats output to file
        f = open(self.stats_file, "w")
        stat_keys = list(next(iter(result.stats.values())).keys())
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

        self.logger.debug(result.stats)

        # iterate over a copy, as motifs without stats are removed from the list
        for motif in list(motifs):
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write(
                    "%s\t%s\n" %
                    (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error(
                    "No stats for motif {0}, skipping this motif!".format(
                        motif.id))
                motifs.remove(motif)
        f.close()

        self.motifs_with_stats = motifs

        f = open(self.ranks_file, "w")
        tools = list(dict((m.id.split("_")[0], 1) for m in motifs).keys())
        f.write("Metric\tType\t%s\n" % ("\t".join(tools)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" %
                                   (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = list(best_motif.keys())
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tools]

            f.write("%s\t%s\t%s\n" %
                    (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" %
                    (stat, "rank", "\t".join([str(rank[i]) for i in ind])))
        f.close()

        #self.logger.debug("RANK: %s" % stat)
        #self.logger.debug("\t".join([str(x) for x in names]))
        #self.logger.debug("\t".join([str(x) for x in vals]))
        #self.logger.debug("\t".join([str(x) for x in rank]))

        # Determine significant motifs
        nsig = 0
        f = open(self.significant_pfm, "w")
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats[
                    'enr_fdr'] >= 2:
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
        f.close()
        self.logger.info("%s motifs are significant", nsig)
        self.logger.debug("written to %s", self.significant_pfm)

        if nsig == 0:
            self.logger.info("no significant motifs found")
            return

        # ROC metrics of significant motifs
        for bg in background:
            self._roc_metrics(self.significant_pfm, self.validation_fa,
                              self.bg_file["fa"][bg], self.bg_file["roc"][bg])

        # Cluster significant motifs
        clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm,
                                        self.outdir,
                                        params["cluster_threshold"])

        # Determine best motif in cluster

        num_cluster, best_id = self._determine_best_motif_in_cluster(
            clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

        ### Enable parallel and modular evaluation of results
        # Scan (multiple) files with motifs
        # Define callback functions once scanning is finished:
        #    - ROC plot
        #     - Statistics
        #    - Location plots (histogram)
        #     -

        # Stars
        tmp = NamedTemporaryFile(dir=mytmpdir()).name
        p = PredictionResult(tmp,
                             logger=self.logger,
                             job_server=self.server,
                             fg_file=self.validation_fa,
                             bg_file=bg_file,
                             do_counter=False)
        p.add_motifs(
            ("clustering", (read_motifs(open(self.final_pwm)), "", "")))
        while len(p.stats.keys()) < len(p.motifs):
            sleep(5)

        #print "p.stats"
        #print p.stats
        #print "num_cluster"
        #print num_cluster
        for mid, num in num_cluster.items():
            p.stats[mid]["numcluster"] = num

        all_stats = {
            "mncp": [2, 5, 8],
            "roc_auc": [0.6, 0.75, 0.9],
            "maxenr": [10, 20, 30],
            "enr_fdr": [4, 8, 12],
            "fraction": [0.4, 0.6, 0.8],
            "ks_sig": [4, 7, 10],
            "numcluster": [3, 6, 9],
        }

        self.logger.info("creating report")

        # ROC plots
        for bg in background:
            self.create_roc_plots(self.final_pwm, self.validation_fa,
                                  self.bg_file["fa"][bg], bg)

        # Location plots
        self.logger.debug("Creating localization plots")
        motifs = read_motifs(open(self.final_pwm), fmt="pwm")
        for motif in motifs:
            m = "%s_%s" % (motif.id, motif.to_consensus())
            s = p.stats[m]
            outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
            motif_localization(self.location_fa,
                               motif,
                               lwidth,
                               outfile,
                               cutoff=s["cutoff_fdr"])

            s["stars"] = int(
                mean([star(s[x], all_stats[x])
                      for x in all_stats.keys()]) + 0.5)
            self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

        # Calculate enrichment of final, clustered motifs
        self.calculate_cluster_enrichment(self.final_pwm, background)

        # Create report
        self.print_params()
        self._calc_report_values(self.final_pwm, background)
        self._create_report(self.final_pwm,
                            background,
                            stats=p.stats,
                            best_id=best_id)
        self._create_text_report(self.final_pwm, background)

        self.logger.info("finished")
        self.logger.info("output dir: %s", os.path.split(self.motif_report)[0])
        self.logger.info("report: %s", os.path.split(self.motif_report)[-1])
        #self.logger.info("Open %s in your browser to see your results." % (self.motif_report))

        if not (params["keep_intermediate"]):

            self.logger.debug(
                "Deleting intermediate files. Please specify the -k option if you want to keep these files."
            )
            shutil.rmtree(self.tmpdir)

        self.logger.debug("Done")

        return self.motif_report
Example n. 57
def cluster_motifs(motifs, match="total", metric="wic", combine="mean", pval=True, threshold=0.95, trim_edges=False, edge_ic_cutoff=0.2, include_bg=True, progress=True):
    """ 
    Clusters a set of sequence motifs. Required arg 'motifs' is a file containing
    positional frequency matrices or an array with motifs.

    Optional args:

    'match', 'metric' and 'combine' specify the method used to compare and score
    the motifs. By default the WIC score is used (metric='wic'), using the
    score over the whole alignment (match='total'), with the total motif score
    calculated as the mean score of all positions (combine='mean').
    'match' can be either 'total' for the total alignment or 'subtotal' for the 
    maximum scoring subsequence of the alignment.
    'metric' can be any metric defined in MotifComparer, currently: 'pcc', 'ed',
    'distance', 'wic' or 'chisq' 
    'combine' determines how the total score is calculated from the score of 
    individual positions and can be either 'sum' or 'mean'
    
    'pval' can be True or False and determines if the score should be converted to 
    an empirical p-value

    'threshold' determines the score (or p-value) cutoff

    If 'trim_edges' is set to True, all motif edges with an IC below 
    'edge_ic_cutoff' will be removed before clustering

    When computing the average of two motifs 'include_bg' determines if, at a 
    position only present in one motif, the information in that motif should
    be kept, or if it should be averaged with background frequencies. Should
    probably be left set to True.

    """

    
    # First read pfm or pfm formatted motiffile
    if not isinstance(motifs, list):
        motifs = read_motifs(open(motifs), fmt="pwm")
    
    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)
    
    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]
    
    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = dict([(n.motif.id,n) for n in nodes])
    motifs = [n.motif for n in nodes]
    
    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(motifs, motifs, match, metric, combine, pval, parallel=True)
    
    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                if pval:
                    motif_nodes[m1].maxscore = 1 - score[0]
                else:
                    motif_nodes[m1].maxscore = score[0]
            else:
                if pval:
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1],motif_nodes[m2])] = score
               
    cluster_nodes = [node for node in nodes]
    ave_count = 1
    
    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        l = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = l[i]
        while not n1 in cluster_nodes or not n2 in cluster_nodes:
            i -= 1
            (n1,n2) = l[i]
        
        (score, pos, orientation) = scores[(n1,n2)]
        ave_motif = n1.motif.average_motifs(n2.motif, pos, orientation, include_bg=include_bg)
        
        ave_motif.trim(edge_ic_cutoff)
        ave_motif.id = "Average_%s" % ave_count
        ave_count += 1
        
        new_node = MotifTree(ave_motif)
        if pval:
            new_node.maxscore = 1 - mc.compare_motifs(new_node.motif, new_node.motif, match, metric, combine, pval)[0]
        else:
            new_node.maxscore = mc.compare_motifs(new_node.motif, new_node.motif, match, metric, combine, pval)[0]
            
        new_node.mergescore = score
        #print "%s + %s = %s with score %s" % (n1.motif.id, n2.motif.id, ave_motif.id, score)
        n1.parent = new_node
        n2.parent = new_node
        new_node.left = n1
        new_node.right = n2
        
        cmp_nodes = dict([(node.motif, node) for node in nodes if not node.parent])
        
        if progress:
            progress = (1 - len(cmp_nodes) / float(total)) * 100
            sys.stderr.write('\rClustering [{0}{1}] {2}%'.format(
                '#' * (int(progress) // 10),
                " " * (10 - int(progress) // 10),
                int(progress)))
        
        result = mc.get_all_scores(
                [new_node.motif], 
                cmp_nodes.keys(), 
                match, 
                metric, 
                combine, 
                pval, 
                parallel=True)
        
        for motif, n in cmp_nodes.items():
            x = result[new_node.motif.id][motif.id]
            if pval:
                x = [1 - x[0]] + x[1:]
            scores[(new_node, n)] = x
        
        nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]
     
    if progress:
        sys.stderr.write("\n") 
    root = nodes[-1]
    for node in [node for node in nodes if not node.left]:
        node.parent.checkMerge(root, threshold)
    
    return root
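
A hedged usage sketch of cluster_motifs as defined above; "motifs.pfm" is a hypothetical file with positional frequency matrices, and the keyword values simply restate the defaults described in the docstring.

tree = cluster_motifs(
    "motifs.pfm",
    match="total",    # score the full alignment
    metric="wic",     # weighted information content
    combine="mean",   # motif score = mean over positions
    pval=True,        # convert scores to empirical p-values
    threshold=0.95,   # merge cutoff
    progress=False,
)

# getResult() is how the tree is consumed in the other examples:
for cluster, members in tree.getResult():
    print(cluster.id, [m.id for m in members])
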
Example n. 58
def moap(
    inputfile,
    method="hypergeom",
    scoring=None,
    outfile=None,
    motiffile=None,
    pfmfile=None,
    genome=None,
    fpr=0.01,
    ncpus=None,
    subsample=None,
    zscore=True,
    gc=True,
):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'bayesianridge',
        'rf', 'xgboost'. Default is 'hypergeom'.

    scoring:  str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.

    pfmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    fpr : float, optional
        FPR for motif scanning

    ncpus : int, optional
        Number of threads to use. Default is the number specified in the config.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool, optional
        Use GC% bins for z-score.

    Returns
    -------
    pandas DataFrame with motif activity
    """

    if scoring and scoring not in ["score", "count"]:
        raise ValueError("valid values are 'score' and 'count'")

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype("object") in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        pfmfile = pfmfile_location(pfmfile)
        try:
            motifs = read_motifs(pfmfile)
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pfmfile))
            raise

        # scan for motifs
        motif_names = [m.id for m in read_motifs(pfmfile)]
        scores = []
        if method == "classic" or scoring == "count":
            logger.info("motif scanning (scores)")
            scores = scan_regionfile_to_table(
                inputfile,
                genome,
                "count",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
        else:
            logger.info("motif scanning (scores)")
            scores = scan_regionfile_to_table(
                inputfile,
                genome,
                "score",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)

    elif isinstance(motiffile, pd.DataFrame):
        motifs = motiffile
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            logger.warn("%s output already exists... skipping", method)
            return out

    if subsample is not None:
        n = int(subsample * df.shape[0])
        logger.debug("Subsampling %d regions", n)
        df = df.sample(n)

    motifs = motifs.loc[df.index]

    clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(__version__))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if isinstance(motiffile, str):
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))

        with open(outfile, "a") as f:
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
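
A hedged usage sketch of moap as defined above; file and genome names are hypothetical placeholders. With a classification-type method such as "hypergeom", the input file must have the region in the first column and a single cluster label in the second.

activities = moap(
    "clusters.txt",       # region<TAB>cluster_label, one region per row
    method="hypergeom",
    scoring="count",
    genome="hg38",        # hypothetical genome name
    pfmfile=None,         # None -> default motif database
    outfile="activities.txt",
)
print(activities.head())  # motifs x clusters table of motif activities
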
Example n. 59
    def set_threshold(self,
                      fpr=None,
                      threshold=None,
                      genome=None,
                      length=200,
                      filename=None):
        """Set motif scanning threshold based on background sequences.

        Parameters
        ----------
        fpr : float, optional
            Desired FPR, between 0.0 and 1.0.

        threshold : float or str, optional
            Desired motif threshold, expressed as the fraction of the
            difference between minimum and maximum score of the PWM.
            Should either be a float between 0.0 and 1.0 or a filename
            with thresholds as created by 'gimme threshold'.

        genome : str, optional
            Genome name; background sequences are sampled from this genome.

        length : int, optional
            Length of the background sequences (default 200).

        filename : str, optional
            FASTA file to use as background sequences.

        """
        if threshold:
            if fpr:
                raise ValueError("Need either fpr or threshold.")
            if genome:
                sys.stderr.write(
                    "Parameter genome ignored when threshold is specified.\n"
                    "Did you want to use fpr?\n")
            if filename:
                sys.stderr.write(
                    "Parameter filename ignored when threshold is specified.\n"
                    "Did you want to use fpr?\n")

        if genome and filename:
            raise ValueError("Need either genome or filename.")

        if fpr:
            fpr = float(fpr)
            if not (0.0 < fpr < 1.0):
                raise ValueError("Parameter fpr should be between 0 and 1")

        if not self.motifs:
            raise ValueError("please run set_motifs() first")

        thresholds = {}
        with open(self.motifs) as f:
            motifs = read_motifs(f)

        if threshold is not None:
            self.threshold = parse_threshold_values(self.motifs, threshold)
            return

        if filename:
            if not os.path.exists(filename):
                raise IOError("File {} does not exist.".format(filename))

            bg_hash = file_checksum(filename)
            seqs = Fasta(filename).seqs
        elif genome:
            bg_hash = "{}\{}".format(genome, int(length))
        else:
            raise ValueError("Need genome or filename")

        with Cache(CACHE_DIR) as cache:
            scan_motifs = []
            for motif in motifs:
                k = "{}|{}|{:.4f}".format(motif.hash(), bg_hash, fpr)

                threshold = cache.get(k)
                if threshold is None:
                    scan_motifs.append(motif)
                else:
                    if np.isclose(threshold, motif.pwm_max_score()):
                        thresholds[motif.id] = None
                    else:
                        thresholds[motif.id] = threshold

            if len(scan_motifs) > 0:
                if genome:
                    Genome(genome)
                    sys.stderr.write(
                        "Determining threshold for fpr {} and length {} based on {}\n"
                        .format(fpr, int(length), genome))
                    fa = RandomGenomicFasta(genome, length, 10000)
                    seqs = fa.seqs
                else:
                    sys.stderr.write(
                        "Determining threshold for fpr {} based on {}\n".
                        format(fpr, filename))
                for motif, threshold in self._threshold_from_seqs(
                        scan_motifs, seqs, fpr):
                    k = "{}|{}|{:.4f}".format(motif.hash(), bg_hash, fpr)
                    cache.set(k, threshold)
                    if np.isclose(threshold, motif.pwm_max_score()):
                        thresholds[motif.id] = None
                    else:
                        thresholds[motif.id] = threshold

        self.threshold_str = "{}_{}_{}_{}_{}".format(fpr, threshold, genome,
                                                     length, filename)
        self.threshold = thresholds
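
A hedged sketch of the typical call order around set_threshold, mirroring the Scanner usage in the earlier examples; the motif file and genome name are hypothetical placeholders.

s = Scanner(ncpus=4)
s.set_motifs("gimme.vertebrate.v5.0.pfm")  # assumed default motif database
s.set_genome("hg38")
# thresholds for a 1% FPR, estimated from random genomic sequences of length 200
s.set_threshold(fpr=0.01, genome="hg38", length=200)
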