Esempio n. 1
0
def match(args):
    """Print the closest database match for each query motif; optionally plot them.

    Reads query motifs from ``args.pwmfile`` and database motifs from
    ``args.dbpwmfile``, finds the closest database motif for every query
    ("partial" / "wic" / "mean" comparison) and prints a tab-separated table
    with the columns Motif, Match, Score and P-value to stdout.

    When ``args.img`` is truthy, the matched pairs are also passed to
    ``match_plot`` together with ``args.img``. Before plotting, the database
    motif is reverse-complemented when the reported orientation is -1, and
    whichever motif needs it is left-padded with uniform (0.25) columns
    according to the reported position offset.
    """
    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Motif\tMatch\tScore\tP-value")

    plotdata = []
    for query, best in result.items():
        motif = sample[query]
        dbmotif = db[best[0]]
        # Compare each pair once and reuse the result for both table and plot.
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, best[0], best[1][0], pval))

        if not args.img:
            continue

        if orient == -1:
            # rc() returns a new motif; carry the original id over to it.
            db_id = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = db_id

        if pos < 0:
            query_id = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = query_id
        elif pos > 0:
            db_id = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = db_id

        plotdata.append((motif, dbmotif, pval))

    # Render once, after all pairs are collected, instead of re-rendering the
    # growing list on every iteration. Nothing is plotted without matches.
    if plotdata:
        match_plot(plotdata, args.img)
Esempio n. 2
0
    def determine_closest_match(self, motifs):
        """Find the closest database motif, with p-value, for each given motif.

        The database file is ``motif_db`` from the default parameters of
        ``self.config``, located in the configured motif directory. Files
        ending in "pwm"/"pfm" are read as pwm format, files ending in
        "transfac" as transfac format; any other extension leaves the
        database empty.

        Returns a dict mapping each motif id to ``[db_motif, pval]``.
        """
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)
        db_motifs = []
        if db.endswith(("pwm", "pfm")):
            # Read everything while the handle is open so it is always closed.
            with open(db) as handle:
                db_motifs = list(read_motifs(handle, fmt="pwm"))
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        mc = MotifComparer()
        db_motif_lookup = {m.id: m for m in db_motifs}
        match = mc.get_closest_match(motifs,
                                     db_motifs,
                                     "partial",
                                     "wic",
                                     "mean",
                                     parallel=False)

        closest_match = {}
        for motif in motifs:
            best = db_motif_lookup[match[motif.id][0]]
            # Calculate p-value; position and orientation are not needed here.
            pval, _pos, _orient = mc.compare_motifs(
                motif,
                best,
                "partial",
                "wic",
                "mean",
                pval=True)
            closest_match[motif.id] = [best, pval]
        return closest_match
Esempio n. 3
0
def match(args):
    """Print the closest database match for each query motif; optionally plot them.

    Reads query motifs from ``args.pwmfile`` and database motifs from
    ``args.dbpwmfile``, finds the closest database motif for every query
    ("partial" / "wic" / "mean" comparison) and prints a tab-separated table
    with the columns Motif, Match, Score and P-value to stdout.

    When ``args.img`` is truthy, the matched pairs are also passed to
    ``match_plot`` together with ``args.img``. Before plotting, the database
    motif is reverse-complemented when the reported orientation is -1, and
    whichever motif needs it is left-padded with uniform (0.25) columns
    according to the reported position offset.
    """
    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    print("Motif\tMatch\tScore\tP-value")

    plotdata = []
    for query, best in result.items():
        motif = sample[query]
        dbmotif = db[best[0]]
        # Compare each pair once and reuse the result for both table and plot.
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, best[0], best[1][0], pval))

        if not args.img:
            continue

        if orient == -1:
            # rc() returns a new motif; carry the original id over to it.
            db_id = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = db_id

        if pos < 0:
            query_id = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = query_id
        elif pos > 0:
            db_id = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = db_id

        plotdata.append((motif, dbmotif, pval))

    # Render once, after all pairs are collected, instead of re-rendering the
    # growing list on every iteration. Nothing is plotted without matches.
    if plotdata:
        match_plot(plotdata, args.img)
Esempio n. 4
0
def ap1_included(motifs):
    """Return True when the closest match to the TGASTCA consensus scores >= 0.75.

    A motif built from the consensus "TGASTCA" is compared against `motifs`
    with the "seqcor" metric; the raw match dict is printed to stdout and the
    first score of the best hit is tested against the threshold.
    """
    consensus_motif = motif_from_consensus("TGASTCA")
    comparer = MotifComparer()
    closest = comparer.get_closest_match(consensus_motif, motifs, metric="seqcor")
    print(closest)
    best_score = closest["TGASTCA"][1][0]
    return True if best_score >= 0.75 else False
Esempio n. 5
0
 def ap1_included(self, motifs):
     """Return True as soon as one motif matches the TGASTCA consensus.

     Each motif is compared individually against a motif built from the
     consensus "TGASTCA"; a motif counts as a hit when the fourth value of
     the match scores (presumably the p-value -- TODO confirm) is below 1e-6.
     """
     reference = motif_from_consensus("TGASTCA")
     comparer = MotifComparer()
     return any(
         comparer.get_closest_match(reference, candidate)["TGASTCA"][1][3] < 1e-6
         for candidate in motifs
     )
Esempio n. 6
0
 def ap1_included(self, motifs):
     """Return True when the closest match to the TGASTCA consensus scores >= 0.8.

     A motif built from the consensus "TGASTCA" is compared against `motifs`
     with the "seqcor" metric; the raw match dict is printed to stdout and the
     first score of the best hit is tested against the threshold.
     """
     # NOTE(review): an early "return False" for empty `motifs` used to sit
     # here but was disabled; behaviour for empty input is whatever
     # get_closest_match does with it -- confirm.
     consensus_motif = motif_from_consensus("TGASTCA")
     comparer = MotifComparer()
     closest = comparer.get_closest_match(consensus_motif, motifs, metric="seqcor")
     print(closest)
     best_score = closest["TGASTCA"][1][0]
     return True if best_score >= 0.8 else False
Esempio n. 7
0
def create_denovo_motif_report(inputfile,
                               pfmfile,
                               fgfa,
                               background,
                               locfa,
                               outdir,
                               params,
                               stats=None):
    """Create text and graphical (.html) motif reports.

    Parameters
    ----------
    inputfile : passed through to the text and graphical report writers.
    pfmfile : motif file, read in "pwm" format.
    fgfa : foreground FASTA file used for statistics.
    background : dict mapping a background name to its FASTA file.
    locfa : FASTA file used for the localization histograms.
    outdir : output directory; histograms go to ``images/`` inside it.
    params : dict of parameters or a falsy value. ``genome`` and
        ``cutoff_fpr`` (default 0.9) are read from it.
    stats : optional precomputed statistics. When None, they are calculated
        per background with ``calc_stats``.
    """
    logger.info("creating de novo reports")

    # Normalise params before its first use. Previously params["genome"] was
    # subscripted ahead of this guard, so a None/empty params crashed.
    if not params:
        params = {}

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    # NOTE(review): a missing "genome" is now forwarded as None instead of
    # raising KeyError -- confirm create_roc_plots accepts None.
    create_roc_plots(pfmfile, fgfa, background, outdir, params.get("genome"))

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            bg_stats = calc_stats(fg_file=fgfa, bg_file=bgfa, motifs=motifs)
            for m, s in bg_stats.items():
                stats.setdefault(m, {})[bg] = s

    stats = add_star(stats)

    cutoff_fpr = params.get("cutoff_fpr", 0.9)
    # Median sequence length of the localization FASTA.
    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir,
                               "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pfmfile, background, closest_match,
                             outdir, stats)
Esempio n. 8
0
    def test1_closest_match(self):
        """Closest match for the TATA pwm file: expected hit id and scores."""
        comparer = MotifComparer()
        closest = comparer.get_closest_match("test/data/pwmscan/TATA.pwm")

        self.assertIn("TATA-box", closest)

        # Each entry holds the id of the best hit followed by its scores.
        hit = closest["TATA-box"]
        self.assertEqual("GM.5.0.TBP.0001", hit[0])

        hit_scores = hit[1]
        self.assertAlmostEqual(-0.1041, hit_scores[0], 4)
        self.assertEqual(0, hit_scores[1])
        self.assertEqual(1, hit_scores[2])
        # NOTE(review): with the default places=7 the tolerance (~5e-8) is
        # larger than the expected value itself; consider a delta= instead.
        self.assertAlmostEqual(3.1666e-8, hit_scores[3])
Esempio n. 9
0
    def test1_closest_match(self):
        """Closest match for the TATA pwm file: expected hit id and scores."""
        comparer = MotifComparer()
        closest = comparer.get_closest_match("test/data/pwmscan/TATA.pwm")

        self.assertIn("TATA-box", closest)

        # Each entry holds the id of the best hit followed by its scores.
        hit = closest["TATA-box"]
        self.assertEqual("GM.5.0.TBP.0001", hit[0])

        hit_scores = hit[1]
        self.assertAlmostEqual(-0.1041, hit_scores[0], 4)
        self.assertEqual(0, hit_scores[1])
        self.assertEqual(1, hit_scores[2])
        # NOTE(review): with the default places=7 the tolerance (~5e-8) is
        # larger than the expected value itself; consider a delta= instead.
        self.assertAlmostEqual(3.1666e-8, hit_scores[3])
Esempio n. 10
0
    def test1_closest_match(self):
        """Closest match for the TATA pwm file: expected hit id and scores."""
        comparer = MotifComparer()
        closest = comparer.get_closest_match("test/data/pwmscan/TATA.pwm")

        self.assertIn("TATA-box", closest)

        # Each entry holds the id of the best hit followed by its scores.
        hit = closest["TATA-box"]
        self.assertEqual("TBP_Average_1", hit[0])

        hit_scores = hit[1]
        self.assertAlmostEqual(-0.3276, hit_scores[0], 4)
        self.assertEqual(-1, hit_scores[1])
        self.assertEqual(1, hit_scores[2])
        # NOTE(review): the default places=7 allows a deviation of ~5e-8,
        # i.e. more than 10% of this expected value; consider a delta=.
        self.assertAlmostEqual(4.134e-7, hit_scores[3])
Esempio n. 11
0
    def test1_denovo(self):
        """Run de novo prediction and check the log, output files and AP1 motif."""
        run_params = {
            "tools": "BioProspector,Homer,MDmodule",
            "fraction": 0.5,
            "background": "random",
            "genome": "test/data/background/genome.fa",
        }
        gimme_motifs(
            "test/data/denovo/input.fa",
            self.outdir,
            params=run_params,
            filter_significant=True,
            cluster=True,
        )

        expected_files = [
            "gimme.denovo.pfm",
            "gimme.denovo.html",
            "gimme.clustereds.html",
            "params.txt",
            "stats.random.txt",
        ]

        # The log must mention the clustering step.
        log_path = os.path.join(self.outdir, "gimmemotifs.log")
        with open(log_path) as handle:
            log_text = handle.read()
        self.assertIn("clustering", log_text)

        # Every expected output file must exist.
        for name in expected_files:
            target = os.path.join(self.outdir, name)
            self.assertTrue(os.path.exists(target))

        # At least one predicted motif must match the AP1 consensus.
        pfm_path = os.path.join(self.outdir, "gimme.denovo.pfm")
        with open(pfm_path) as handle:
            predicted = read_motifs(handle)
        ap1 = motif_from_consensus("TGASTCA")

        comparer = MotifComparer()
        found_ap1 = any(
            comparer.get_closest_match(ap1, candidate)["TGASTCA"][1][3] < 1e-5
            for candidate in predicted
        )

        self.assertTrue(found_ap1)
Esempio n. 12
0
    def determine_closest_match(self, motifs):
        """Find the closest database motif, with p-value, for each given motif.

        The database file is ``motif_db`` from the default parameters of
        ``self.config``, located in the configured motif directory. Files
        ending in "pwm"/"pfm" are read as pwm format, files ending in
        "transfac" as transfac format; any other extension leaves the
        database empty.

        Returns a dict mapping each motif id to ``[db_motif, pval]``.
        """
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)
        db_motifs = []
        if db.endswith(("pwm", "pfm")):
            # Read everything while the handle is open so it is always closed.
            with open(db) as handle:
                db_motifs = list(read_motifs(handle, fmt="pwm"))
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        mc = MotifComparer()
        db_motif_lookup = {m.id: m for m in db_motifs}
        match = mc.get_closest_match(motifs, db_motifs, "partial", "wic", "mean", parallel=False)

        closest_match = {}
        for motif in motifs:
            best = db_motif_lookup[match[motif.id][0]]
            # Calculate p-value; position and orientation are not needed here.
            pval, _pos, _orient = mc.compare_motifs(motif, best, "partial", "wic", "mean", pval=True)
            closest_match[motif.id] = [best, pval]
        return closest_match
Esempio n. 13
0
    def test1_denovo(self):
        """Run de novo prediction and check the log, output files and AP1 motif."""
        run_params = {
            "tools": "BioProspector,Homer,MDmodule",
            "fraction": 0.5,
            "background": "random",
        }
        gimme_motifs(
            "test/data/denovo/input.fa",
            self.outdir,
            params=run_params,
            filter_significant=True,
            cluster=True,
        )

        expected_files = [
            "motifs.pwm",
            "motif_report.html",
            "cluster_report.html",
            "params.txt",
            "stats.random.txt",
        ]

        # The log must mention the clustering step.
        log_path = os.path.join(self.outdir, 'gimmemotifs.log')
        with open(log_path) as handle:
            log_text = handle.read()
        self.assertIn("clustering", log_text)

        # Every expected output file must exist.
        for name in expected_files:
            target = os.path.join(self.outdir, name)
            self.assertTrue(os.path.exists(target))

        # At least one predicted motif must match the AP1 consensus.
        pwm_path = os.path.join(self.outdir, "motifs.pwm")
        with open(pwm_path) as handle:
            predicted = read_motifs(handle)
        ap1 = motif_from_consensus("TGASTCA")

        comparer = MotifComparer()
        found_ap1 = any(
            comparer.get_closest_match(ap1, candidate)["TGASTCA"][1][3] < 1e-5
            for candidate in predicted
        )

        self.assertTrue(found_ap1)
Esempio n. 14
0
def create_denovo_motif_report(inputfile, pwmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Write the text and graphical (.html) reports for a set of motifs.

    Motifs are read from `pwmfile`. ROC plots are created, the closest match
    of every motif is looked up, per-background statistics are calculated
    when `stats` is None, and one localization histogram per motif is written
    to ``images/`` inside `outdir`. `params` may be falsy; only
    ``cutoff_fpr`` (default 0.9) is read from it.
    """
    logger.info("creating reports")

    motifs = read_motifs(pwmfile, fmt="pwm")

    # ROC plots for the motif file.
    create_roc_plots(pwmfile, fgfa, background, outdir)

    # Closest match for every motif.
    closest_match = MotifComparer().get_closest_match(motifs)

    if stats is None:
        # Collect statistics as stats[motif][background_name].
        stats = {}
        for bg_name, bg_fasta in background.items():
            per_motif = calc_stats(motifs, fgfa, bg_fasta)
            for motif_key, motif_stats in per_motif.items():
                stats.setdefault(motif_key, {})[bg_name] = motif_stats

    stats = add_star(stats)

    params = params if params else {}
    cutoff_fpr = params.get('cutoff_fpr', 0.9)
    seq_lengths = [len(seq) for seq in Fasta(locfa).seqs]
    median_length = np.median(seq_lengths)

    # One localization histogram per motif.
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        histogram = os.path.join(outdir, "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, median_length, histogram, cutoff=cutoff_fpr)

    # Text report first, then the graphical one.
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pwmfile, background, closest_match, outdir, stats)
Esempio n. 15
0
def motifs(args):
    """Calculate ROC_AUC and other metrics and optionally create HTML reports.

    Steps, all writing into ``args.outdir``:

    1. Normalise the input (``args.sample``) to a BED file when it is a
       narrowPeak file, or a BED file with a positive ``args.size``.
    2. Use ``args.background`` when it is an existing file, otherwise
       generate a background ("gc" when a genome is given, else "random").
    3. Read known motifs from ``args.pfmfile`` when ``args.known`` is set and
       run de novo prediction when ``args.denovo`` is set; de novo motifs are
       appended and written to ``combined.pfm`` plus a motif2factors map.
    4. Scan input and background, write ``gimme.roc.report.txt``, select
       non-redundant motifs and write their matches as BED files.
    5. When ``args.report`` is set, write the HTML reports.

    Attributes read from ``args``: outdir, sample, size, genome, background,
    pfmfile, known, denovo, tools, analysis, ncpus, report. When no genome is
    given, ``args.zscore`` and ``args.gc`` are set to False.

    Raises
    ------
    ValueError
        If ``args.outdir`` is None.
    """
    if args.outdir is None:
        raise ValueError("an output directory is required!")
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    scan_dir = os.path.join(args.outdir, "motif_scan_results")
    if not os.path.exists(scan_dir):
        os.makedirs(scan_dir)

    file_type = determine_file_type(args.sample)
    outfile = os.path.join(args.outdir, f"input.w{args.size}.bed")
    sample = args.sample
    if file_type == "narrowpeak":
        narrowpeak_to_bed(args.sample, outfile, size=args.size)
        sample = outfile
    elif args.size and args.size > 0:
        if file_type == "fasta":
            logger.warning("size parameter will be ignored for FASTA input")
        elif file_type == "bed":
            write_equalsize_bedfile(args.sample, args.size, outfile)
            sample = outfile

    genome = args.genome
    if genome is None:
        # NOTE(review): these flags are cleared here, but the scans below
        # still pass zscore=True / gcnorm=True -- confirm that is intended.
        args.zscore = False
        args.gc = False

    bg = args.background
    if bg is None:
        bg = "random" if genome is None else "gc"

    if os.path.isfile(bg):
        bgfile = bg
        bg = "custom"
    else:
        # create background if not provided
        bgfile = os.path.join(args.outdir,
                              "generated_background.{}.fa".format(bg))
        size = args.size
        # size is optional (see the check above); comparing None with 0
        # would raise TypeError, so only non-positive numbers are reset.
        if size is not None and size <= 0:
            size = None
        if bg == "gc":
            logger.info("creating background (matched GC%)")
        else:
            logger.info("creating background (random)")

        create_background_file(
            bgfile,
            bg,
            fmt="fasta",
            genome=genome,
            inputfile=sample,
            size=size,
            number=10000,
        )

    pfmfile = args.pfmfile

    # Named all_motifs so the local does not shadow this function's name.
    all_motifs = []
    if args.known:
        all_motifs = read_motifs(pfmfile, fmt="pfm")

    if args.denovo:
        gimme_motifs(
            sample,
            args.outdir,
            params={
                "tools": args.tools,
                "analysis": args.analysis,
                "background": bg,
                "custom_background": bgfile,
                "genome": args.genome,
                "size": args.size,
            },
        )
        denovo = read_motifs(os.path.join(args.outdir, "gimme.denovo.pfm"))
        mc = MotifComparer()
        result = mc.get_closest_match(denovo,
                                      dbmotifs=pfmfile,
                                      metric="seqcor")
        match_motifs = read_motifs(pfmfile, as_dict=True)
        new_map_file = os.path.join(args.outdir, "combined.motif2factors.txt")
        base = os.path.splitext(pfmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            shutil.copyfile(map_file, new_map_file)

        all_motifs += denovo
        pfmfile = os.path.join(args.outdir, "combined.pfm")
        with open(pfmfile, "w") as f:
            for m in all_motifs:
                print(m.to_pwm(), file=f)

        # Append the de novo motifs, and the factors of their closest known
        # match, to the (possibly copied) motif2factors map.
        with open(new_map_file, "a") as f:
            for m in denovo:
                print("{}\t{}\t{}\t{}".format(m.id, "de novo", "GimmeMotifs",
                                              "Y"),
                      file=f)
                closest_id = result[m.id][0]
                if closest_id in match_motifs:
                    for factor in match_motifs[closest_id].factors["direct"]:
                        print(
                            "{}\t{}\t{}\t{}".format(m.id, factor,
                                                    "inferred (GimmeMotifs)",
                                                    "N"),
                            file=f,
                        )
    else:
        logger.info("skipping de novo")

    stats = [
        "phyper_at_fpr",
        "roc_auc",
        "pr_auc",
        "enr_at_fpr",
        "recall_at_fdr",
        "roc_values",
        "matches_at_fpr",
    ]

    report_file = args.outdir + "/gimme.roc.report.txt"

    logger.info("creating motif scan tables")

    # Create a table with the best score per motif for all motifs.
    # This has three reasons:
    # * Can be used to calculate statistics;
    # * Can be used to select a set of non-redundant motifs;
    # * These files are included in the output and can be used for further analysis.
    score_table = os.path.join(scan_dir, "input.motif.score.txt")
    bg_score_table = os.path.join(scan_dir, "background.motif.score.txt")
    for scan_input, table_path in [(sample, score_table),
                                   (bgfile, bg_score_table)]:
        scan_to_file(
            scan_input,
            pfmfile,
            filepath_or_buffer=table_path,
            score_table=True,
            genome=args.genome,
            zscore=True,
            gcnorm=True,
        )

    n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0]
    n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0]

    logger.info("calculating stats")
    # The context manager guarantees the report is flushed and closed even
    # when the statistics calculation raises.
    with open(report_file, "w") as f_out:
        # Print the metrics
        f_out.write(
            "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
        )
        for motif_stats in calc_stats_iterator(
                motifs=pfmfile,
                fg_table=score_table,
                bg_table=bg_score_table,
                stats=stats,
                ncpus=args.ncpus,
        ):
            for motif in all_motifs:
                key = str(motif)
                if key not in motif_stats:
                    continue
                m_stats = motif_stats[key]
                p_value = m_stats["phyper_at_fpr"]
                # A p-value of 0 has no finite log; report infinity instead.
                log_pvalue = -np.log10(p_value) if p_value > 0 else np.inf
                matches = m_stats["matches_at_fpr"]
                f_out.write(
                    "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n"
                    .format(
                        motif.id,
                        matches[0],
                        matches[0] / n_input * 100,
                        matches[1],
                        matches[1] / n_background * 100,
                        p_value,
                        log_pvalue,
                        m_stats["roc_auc"],
                        m_stats["pr_auc"],
                        m_stats["enr_at_fpr"],
                        m_stats["recall_at_fdr"],
                    ))

    # Select a set of "non-redundant" motifs.
    # Using Recursive Feature Elimination, a set of motifs is selected that
    # best explains the peaks in comparison to the background sequences.
    nr_motifs = select_nonredundant_motifs(
        report_file,
        pfmfile,
        score_table,
        bg_score_table,
        tolerance=0.001,
    )

    # Provide BED files with motif scan results for the non-redundant motifs
    # At the moment this is not ideal, as scanning is now performed twice
    # for this set of non-redundant motifs.
    motif_dict = {m.id: m for m in all_motifs}
    for motif in nr_motifs:
        with NamedTemporaryFile(mode="w") as f:
            print(motif_dict[motif].to_pwm(), file=f)
            # Flush so the scanner sees the motif when it opens f.name.
            f.flush()
            safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
            scan_to_file(
                sample,
                f.name,
                filepath_or_buffer=os.path.join(scan_dir,
                                                f"{safe_name}.matches.bed"),
                bed=True,
                fpr=0.01,
                genome=args.genome,
                zscore=True,
                gcnorm=True,
            )

    if args.report:
        logger.info("creating statistics report")
        # args.outdir was validated at the top, so no further check is needed.
        roc_html_report(
            args.outdir,
            report_file,
            pfmfile,
            threshold=0.01,
            outname="gimme.motifs.redundant.html",
            link_matches=False,
        )
        roc_html_report(
            args.outdir,
            report_file,
            pfmfile,
            threshold=0.01,
            use_motifs=nr_motifs,
            link_matches=True,
        )
        logger.info(
            f"gimme motifs final report: {os.path.join(args.outdir, 'gimme.motifs.html')}"
        )