Example #1
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    threshold = check_threshold(data_dir, genome, scoring)
    
    config = MotifConfig()
    
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)
    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    scores = []
    if scoring == "count":
        for row in s.count(regions, cutoff=threshold):
            scores.append(row)
    else:
        for row in s.best_score(regions):
            scores.append(row)
   
    motif_names = [m.id for m in read_motifs(open(pwmfile))]
    return pd.DataFrame(scores, index=df.index, columns=motif_names)
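A minimal, self-contained sketch of the Scanner pattern used above, assuming the usual gimmemotifs import paths; the PFM file, regions and genome name are placeholders:

# Minimal sketch of the pattern in Example #1; names are illustrative.
import pandas as pd
from gimmemotifs.scanner import Scanner
from gimmemotifs.motif import read_motifs

pwmfile = "my_motifs.pfm"                    # hypothetical PFM file
regions = ["chr1:100-300", "chr1:500-700"]   # hypothetical regions

s = Scanner()
s.set_motifs(pwmfile)
s.set_genome("hg38")                         # any genomepy-installed genome

scores = [row for row in s.best_score(regions)]
motif_names = [m.id for m in read_motifs(pwmfile)]
print(pd.DataFrame(scores, index=regions, columns=motif_names).head())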
Example #2
def threshold(args):
    if args.fdr < 0 or args.fdr > 1:
        print "Please specify a FDR between 0 and 1"
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)

    score_table = []
    for scores in s.best_score(args.inputfile):
        score_table.append(scores)

    print "Motif\tScore\tCutoff"
    for i, scores in enumerate(np.array(score_table).transpose()):
        motif = motifs[i]
        pwm = motif.pwm
        min_score = motif.pwm_min_score()
        if len(scores) > 0:
            opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
            cutoff = (opt_score - min_score) / (motif.pwm_max_score() -
                                                min_score)
            print "{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff)
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
Example #3
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)
    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    scores = []
    if scoring == "count":
        for row in s.count(regions, cutoff=threshold):
            scores.append(row)
    else:
        for row in s.best_score(regions):
            scores.append(row)

    motif_names = [m.id for m in read_motifs(open(pwmfile))]
    return pd.DataFrame(scores, index=df.index, columns=motif_names)
Example #4
def threshold(args):
    if args.fdr < 0 or args.fdr > 1:
        print "Please specify a FDR between 0 and 1"
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)
    
    s = Scanner()
    s.set_motifs(args.pwmfile)
    
    score_table = []
    for scores in s.best_score(args.inputfile):
        score_table.append(scores)

    print "Motif\tScore\tCutoff"
    for i,scores in enumerate(np.array(score_table).transpose()):
        motif = motifs[i]
        pwm = motif.pwm
        min_score = motif.pwm_min_score()
        if len(scores) > 0:
            opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
            cutoff = (opt_score - min_score) / (
                    motif.pwm_max_score() - min_score)
            print "{0}\t{1}\t{2}".format(
                    motif.id, opt_score , cutoff)
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
Example #5
def command_scan(inputfile, pwmfile, nreport=1, fpr=0.01, cutoff=None,
        bed=False, scan_rc=True, table=False, score_table=False, moods=False, 
        pvalue=None, bgfile=None, genome=None, ncpus=None, normalize=False):
    motifs = read_motifs(pwmfile)
    
    fa = as_fasta(inputfile, genome)
    
    # initialize scanner
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    
    if genome:
        s.set_genome(genome=genome)

    if genome or bgfile:
        s.set_background(genome=genome, fname=bgfile, length=fa.median_length())

    if not score_table:
        s.set_threshold(fpr=fpr, threshold=cutoff)
    
    if table:
        it = scan_table(s, inputfile, fa, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, moods)
    elif score_table:
        it = scan_score_table(s, fa, motifs, scan_rc, normalize=normalize) 
    else:
        it = scan_normal(s, inputfile, fa, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, moods, bed, normalize=normalize)
    
    for row in it:
        yield row
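A hypothetical driver for the generator above, assuming each yielded row is a formatted output line (as in the older variant in Example #18); the file names are placeholders:

# Hypothetical driver; paths are placeholders.
with open("matches.bed", "w") as out:
    for line in command_scan("peaks.bed", "my_motifs.pfm",
                             fpr=0.01, bed=True, genome="hg38"):
        out.write(line + "\n")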
Example #6
def roc(args):
    """ Calculate ROC_AUC and other metrics and optionally plot ROC curve.
    """
    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"
    
    motifs = read_motifs(open(pwmfile), fmt="pwm")

    s = Scanner()
    s.set_motifs(pwmfile)
    
    ids = []
    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [m.id for m in motifs]

    fg_total = dict([(m.id, []) for m in motifs])
    for scores in s.best_score(fg_file):
        for motif, score in zip(motifs, scores):
            fg_total[motif.id].append(score)
    
    bg_total = dict([(m.id, []) for m in motifs])
    for scores in s.best_score(bg_file):
        for motif, score in zip(motifs, scores):
            bg_total[motif.id].append(score)
   
    plot_x = []
    plot_y = []
    # Print the metrics
    print "Motif\tROC AUC\tMNCP\tEnr. at 5% FDR\tMax enr.\tRecall at 10% FDR"
    for motif_id in ids:
        fg_vals = fg_total[motif_id] 
        bg_vals = bg_total[motif_id]    
        (x, y) = ROC_values(fg_vals, bg_vals) 
        plot_x.append(x)
        plot_y.append(y)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        enr_fdr = enr_at_fdr(fg_vals, bg_vals)
        max_enr,score = max_enrichment(fg_vals, bg_vals)
        recall = recall_at_fdr(fg_vals, bg_vals, 0.1)
        print "%s\t%0.3f\t%03f\t%0.2f\t%0.2f\t%0.4f" % (
                motif_id, auc, mncp, enr_fdr, max_enr, recall)
    
    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=ids)
Example #7
def roc(args):
    """ Calculate ROC_AUC and other metrics and optionally plot ROC curve.
    """
    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"

    motifs = read_motifs(open(pwmfile), fmt="pwm")

    s = Scanner()
    s.set_motifs(pwmfile)

    ids = []
    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [m.id for m in motifs]

    fg_total = dict([(m.id, []) for m in motifs])
    for scores in s.best_score(fg_file):
        for motif, score in zip(motifs, scores):
            fg_total[motif.id].append(score)

    bg_total = dict([(m.id, []) for m in motifs])
    for scores in s.best_score(bg_file):
        for motif, score in zip(motifs, scores):
            bg_total[motif.id].append(score)

    plot_x = []
    plot_y = []
    # Print the metrics
    print "Motif\tROC AUC\tMNCP\tEnr. at 5% FDR\tMax enr."
    for motif_id in ids:
        fg_vals = fg_total[motif_id]
        bg_vals = bg_total[motif_id]
        (x, y) = ROC_values(fg_vals, bg_vals)
        plot_x.append(x)
        plot_y.append(y)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        enr_fdr = enr_at_fdr(fg_vals, bg_vals)
        max_enr, score = max_enrichment(fg_vals, bg_vals)
        print "%s\t%0.3f\t%03f\t%0.2f\t%0.2f" % (motif_id, auc, mncp, enr_fdr,
                                                 max_enr)

    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=ids)
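ROC_AUC, MNCP and the other helpers used above presumably come from gimmemotifs' own rocmetrics module (see Example #21). If only the AUC is needed, the same number can be computed from the pooled scores with scikit-learn; a stand-in sketch with made-up scores:

# Stand-in AUC calculation; fg_vals/bg_vals play the role of the
# per-motif score lists built above (values are made up).
import numpy as np
from sklearn.metrics import roc_auc_score

fg_vals = [8.1, 7.3, 9.0, 6.5]       # best scores in positive sequences
bg_vals = [3.2, 4.1, 2.8, 5.0, 3.9]  # best scores in background sequences

y_true = np.concatenate([np.ones(len(fg_vals)), np.zeros(len(bg_vals))])
y_score = np.concatenate([fg_vals, bg_vals])
print(roc_auc_score(y_true, y_score))  # 1.0 for this toy data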
Example #8
    def get_PWMScore(self, fin_regions_fa):
        """ Scan motif in every peak.

        Arguments:
            fin_regions_fa {[type]} -- [input fasta file]

        Returns:
            [type] -- [pfmscorefile]
        """
        pfmscorefile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
        seqs = [s.split(" ")[0] for s in as_fasta(fin_regions_fa, genome=self.genome).ids]

        s = Scanner(ncpus=self.ncore)
        s.set_motifs(self.pfmfile)
        s.set_threshold(threshold=0.0)
        s.set_genome(self.genome)

        with open(self.pfmfile) as f:
            motifs = read_motifs(f)

        chunksize = 10000
        # Process 10,000 peaks at a time.

        with tqdm(total=len(seqs)) as pbar:
            for chunk in range(0, len(seqs), chunksize):
                chunk_seqs = seqs[chunk : chunk + chunksize]
                # print(chunk, "-", chunk + chunksize, "enhancers")
                pfm_score = []
                it = s.best_score(chunk_seqs, zscore=True, gc=True)
                # Use GC normalization for the motif scan because many sequences are GC-enriched.
                # The GimmeMotifs develop branch already includes a GC-normalization option.
                for seq, scores in zip(chunk_seqs, it):
                    for motif, score in zip(motifs, scores):
                        pfm_score.append([motif.id, seq, score])
                    pbar.update(1)
                pfm_score = pd.DataFrame(pfm_score, columns=["motif", "enhancer", "zscore"])
                pfm_score = pfm_score.set_index("motif")

                # print("\tCombine")
                pfm_score["zscoreRank"] = minmax_scale(rankdata(pfm_score["zscore"]))
                # When we built model, rank and minmax normalization was used.
                cols = ["enhancer", "zscore", "zscoreRank"]
                write_header = False
                if chunk == 0:
                    write_header = True
                pfm_score[cols].to_csv(pfmscorefile, sep="\t", header=write_header)
                # pbar.update(chunk + chunksize)

        return pfmscorefile.name
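The zscoreRank column above is the z-score passed through rankdata and then squeezed into [0, 1] with minmax_scale; a toy illustration of that transform:

# Toy illustration of the zscoreRank transform used above.
from scipy.stats import rankdata
from sklearn.preprocessing import minmax_scale

zscores = [-1.2, 0.3, 2.5, 0.3, -0.7]
zscore_rank = minmax_scale(rankdata(zscores))
print(zscore_rank)  # lowest z-score -> 0.0, highest -> 1.0, ties share a rank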
Example #9
def scan_to_table(input_table,
                  genome,
                  data_dir,
                  scoring,
                  pwmfile=None,
                  ncpus=None):
    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)
    nregions = len(regions)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR, genome=genome)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        for row in s.best_score(regions):
            scores.append(row)
        logger.info("done")

    motif_names = [m.id for m in read_motifs(open(pwmfile))]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
Example #10
def get_motif_scores(fa, motifs):
    s = Scanner()
    s.set_motifs(motifs)
    s.set_threshold(threshold=0.0)
    seqs = Fasta(fa.seqfn)
    for i, result in enumerate(s.scan(seqs, nreport=1)):
        intron_id = seqs.ids[i]
        for m, matches in enumerate(result):
            motif = motifs[m]
            for score, pos, strand in matches:
                if score < 0:
                    score_rescaled = rescale(score,
                                             orig_range=[motif.min_score, 0],
                                             new_range=[0, 50])
                else:
                    score_rescaled = rescale(score,
                                             orig_range=[0, motif.max_score],
                                             new_range=[50, 100])
                yield (intron_id, motif.id, score_rescaled)
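The rescale helper is not defined in this snippet; assuming it maps orig_range onto new_range linearly, the piecewise call above sends negative log-odds scores to [0, 50] and positive ones to [50, 100]. A hypothetical implementation:

# Plausible linear rescale, assuming rescale() maps orig_range onto
# new_range linearly; reproduces the piecewise [0, 100] scheme above.
def rescale(value, orig_range, new_range):
    lo, hi = orig_range
    new_lo, new_hi = new_range
    return new_lo + (value - lo) * (new_hi - new_lo) / (hi - lo)

print(rescale(-4.0, orig_range=[-8.0, 0], new_range=[0, 50]))  # 25.0
print(rescale(3.0, orig_range=[0, 6.0], new_range=[50, 100]))  # 75.0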
Example #11
def threshold(args):
    """Calculate motif score threshold for a given FPR."""
    if args.fpr < 0 or args.fpr > 1:
        print("Please specify a FPR between 0 and 1")
        sys.exit(1)

    motifs = read_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)
    s.set_threshold(args.fpr, filename=args.inputfile)

    print("Motif\tScore\tCutoff")
    for motif in motifs:
        min_score = motif.pwm_min_score()
        max_score = motif.pwm_max_score()
        opt_score = s.threshold[motif.id]
        if opt_score is None:
            opt_score = motif.pwm_max_score()
        threshold = (opt_score - min_score) / (max_score - min_score)
        print("{0}\t{1}\t{2}".format(motif.id, opt_score, threshold))
Example #12
    def scan(self, background_length=200, fpr=0.02, n_cpus=-1, verbose=True):
        """
        Scan DNA sequences searching for TF binding motifs.

        Args:
           background_length (int): background length. This is used for the calculation of the binding score.

           fpr (float): False positive rate for motif identification.

           n_cpus (int): number of CPUs for parallel calculation.

           verbose (bool): Whether to show a progress bar.

        """

        self.fpr = fpr
        self.background_length = background_length
        print("initiating scanner ...")
        ## 1. initialize scanner ##
        # load motif
        motifs = default_motifs()

        # initialize scanner
        s = Scanner(ncpus=n_cpus)

        # set parameters
        s.set_motifs(motifs)
        s.set_background(genome=self.ref_genome, length=background_length)
        #s.set_background(genome="mm9", length=400)
        s.set_threshold(fpr=fpr)

        ## 2. motif scan ##
        print("getting DNA sequences ...")
        target_sequences = peak2fasta(self.all_peaks, self.ref_genome)
        print("scanning motifs ...")
        self.scanned_df = scan_dna_for_motifs(s, motifs, target_sequences,
                                              verbose)

        self.__addLog("scanMotifs")
Example #13
def threshold(args):
    """Calculate motif score threshold for a given FPR."""
    if args.fpr < 0 or args.fpr > 1:
        print("Please specify a FPR between 0 and 1")
        sys.exit(1)

    motifs = read_motifs(args.pwmfile)
    
    s = Scanner()
    s.set_motifs(args.pwmfile)
    s.set_threshold(args.fpr, filename=args.inputfile)

    print("Motif\tScore\tCutoff")
    for motif in motifs:
        min_score = motif.pwm_min_score()
        max_score = motif.pwm_max_score()
        opt_score = s.threshold[motif.id]
        if opt_score is None:
            opt_score = motif.pwm_max_score()
        threshold = (opt_score - min_score) / (max_score - min_score)
        print("{0}\t{1}\t{2}".format(
                motif.id, opt_score, threshold))
Example #14
def scan_to_table(input_table, genome, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.
    
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a 
        genomepy genome name.
    
    scoring : str
        "count" or "score"
    
    pwmfile : str, optional
        Specify a PFM file for scanning.
    
    ncpus : int, optional
        If defined this specifies the number of cores to use.
    
    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    config = MotifConfig()
    
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:,0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index
    
    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)
    s.set_background(genome=genome)
    
    nregions = len(regions)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        for row in s.best_score(regions, normalize=True):
            scores.append(row)
        logger.info("done")
   
    motif_names = [m.id for m in read_motifs(pwmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
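A hedged usage sketch for the function documented above; FPR is assumed to be a module-level constant (it is referenced but not defined in this snippet) and the file and genome names are placeholders:

# Hypothetical usage; "clusters.txt" is a tab-separated table whose first
# column holds regions such as chr1:100-300.
counts = scan_to_table("clusters.txt", "hg38", "count",
                       pwmfile="my_motifs.pfm", ncpus=4)
print(counts.shape)  # (n_regions, n_motifs)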
Example #15
def moap(
    inputfile,
    method="hypergeom",
    scoring=None,
    outfile=None,
    motiffile=None,
    pfmfile=None,
    genome=None,
    fpr=0.01,
    ncpus=None,
    subsample=None,
    zscore=True,
    gc=True,
):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'lightningclassification', 'lightningregressor', 'bayesianridge',
        'rf', 'xgboost'. Default is 'hypergeom'.

    scoring:  str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.

    pfmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    fpr : float, optional
        FPR for motif scanning

    ncpus : int, optional
        Number of threads to use. Default is the number specified in the config.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool, optional
        Use GC% bins for z-score.

    Returns
    -------
    pandas DataFrame with motif activity
    """

    if scoring and scoring not in ["score", "count"]:
        raise ValueError("valid values are 'score' and 'count'")

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype("object") in set(df.dtypes):
            raise ValueError("columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        pfmfile = pfmfile_location(pfmfile)
        try:
            motifs = read_motifs(pfmfile)
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pfmfile))
            raise

        # initialize scanner
        s = Scanner(ncpus=ncpus)
        s.set_motifs(pfmfile)
        s.set_genome(genome)
        s.set_background(genome=genome)

        # scan for motifs
        motif_names = [m.id for m in read_motifs(pfmfile)]
        scores = []
        if method == "classic" or scoring == "count":
            logger.info("motif scanning (scores)")
            scores = scan_to_table(
                inputfile,
                genome,
                "count",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
        else:
            logger.info("motif scanning (scores)")
            scores = scan_to_table(
                inputfile,
                genome,
                "score",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)

    elif isinstance(motiffile, pd.DataFrame):
        motifs = motiffile
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            logger.warn("%s output already exists... skipping", method)
            return out

    if subsample is not None:
        n = int(subsample * df.shape[0])
        logger.debug("Subsampling %d regions", n)
        df = df.sample(n)

    motifs = motifs.loc[df.index]

    if method == "lightningregressor":
        outdir = os.path.dirname(outfile)
        tmpname = os.path.join(outdir, ".lightning.tmp")
        clf.fit(motifs, df, tmpdir=tmpname)
        shutil.rmtree(tmpname)
    else:
        clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(__version__))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if isinstance(motiffile, str):
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))

        with open(outfile, "a") as f:
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
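A hedged usage sketch for moap(); the input table, genome name and output path are placeholders:

# Hypothetical run of the activity prediction above.
act = moap("clusters.txt",
           method="hypergeom",
           scoring="count",
           genome="hg38",
           outfile="activity.out.txt",
           ncpus=4)
print(act.head())  # fitted motif activity table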
Example #16
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pfmfile : str, optional
        Specify a PFM file for scanning.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    config = MotifConfig()

    if pfmfile is None:
        pfmfile = config.get_default_params().get("motif_db", None)
        if pfmfile is not None:
            pfmfile = os.path.join(config.get_motif_dir(), pfmfile)

    if pfmfile is None:
        raise ValueError("no pfmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    if len(regions) >= 1000:
        check_regions = np.random.choice(regions, size=1000, replace=False)
    else:
        check_regions = regions

    size = int(
        np.median([len(seq) for seq in as_fasta(check_regions, genome=genome).seqs])
    )
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pfmfile)
    s.set_genome(genome)
    s.set_background(genome=genome, gc=gc, size=size)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        msg = "creating score table"
        if zscore:
            msg += " (z-score"
            if gc:
                msg += ", GC%"
            msg += ")"
        else:
            msg += " (logodds)"
        logger.info(msg)
        for row in s.best_score(regions, zscore=zscore, gc=gc):
            scores.append(row)
        logger.info("done")

    motif_names = [m.id for m in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
Example #17
def diff(args):

    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    tmpdir = mkdtemp()

    # Retrieve FASTA clusters from BED file
    if len(infiles) == 1 and infiles[0].endswith("bed"):
        if not args.genome:
            sys.stderr.write("Can't convert BED file without genome!\n")
            sys.exit(1)

        clusters = {}
        for line in open(infiles[0]):
            vals = line.strip().split("\t")
            clusters.setdefault(vals[4], []).append(vals[:3])

        infiles = []

        for cluster, regions in clusters.items():
            sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
            inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
            outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
            with open(inbed, "w") as f:
                for vals in regions:
                    f.write("{0}\t{1}\t{2}\n".format(*vals))
            Genome(genome).track2fasta(inbed, outfa)
            infiles.append(outfa)

    pwms = dict([(m.id, m) for m in pwmfile_to_motifs(pwmfile)])
    motifs = [m for m in pwms.keys()]
    names = [os.path.basename(os.path.splitext(fname)[0]) for fname in infiles]

    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_threshold(threshold=cutoff)

    # Get background frequencies
    nbg = float(len(Fasta(bgfile).seqs))

    bgcounts = s.total_count(bgfile, nreport=1)
    bgfreq = [(c + 0.01) / nbg for c in bgcounts]

    # Get frequences in input files
    freq = {}
    counts = {}
    for fname in infiles:
        mcounts = s.total_count(fname, nreport=1)
        n = float(len(Fasta(fname).seqs))
        counts[fname] = mcounts
        freq[fname] = [(c + 0.01) / n for c in mcounts]

    freq = np.array([freq[fname] for fname in infiles]).transpose()
    counts = np.array([counts[fname] for fname in infiles]).transpose()

    #for row in freq:
    #    print freq

    diff_plot(motifs,
              pwms,
              names,
              freq,
              counts,
              bgfreq,
              bgcounts,
              outfile,
              minenr=minenr,
              minfreq=minfreq)

    shutil.rmtree(tmpdir)
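The +0.01 added to every count above is a pseudocount, so motifs absent from a cluster or from the background still get a finite frequency (and hence a finite enrichment downstream); a toy illustration:

# Toy pseudocount frequencies, mirroring the (count + 0.01) / n step above.
bg_n, cluster_n = 5000.0, 800.0
bg_count, cluster_count = 40, 0                    # motif absent from cluster

bg_freq = (bg_count + 0.01) / bg_n                 # ~0.008
cluster_freq = (cluster_count + 0.01) / cluster_n  # 1.25e-05, not zero
print(cluster_freq / bg_freq)                      # finite enrichment ratio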
Example #18
def command_scan(inputfile, pwmfile, nreport=1, cutoff=0.9, bed=False, 
        scan_rc=True, table=False, score_table=False, moods=False, 
        pvalue=None, bgfile=None, genome=None):
    motifs = pwmfile_to_motifs(pwmfile)
    
    index_dir = None
    if genome is not None:
        index_dir = os.path.join(MotifConfig().get_index_dir(), genome) 
    
    # initialize scanner
    s = Scanner()
    s.set_motifs(pwmfile)
    
    fa = as_fasta(inputfile, index_dir)
    
    if moods:
        result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, table)
    else:
        result_it = s.scan(fa, nreport, scan_rc, cutoff)

    
    if table:
        # header
        yield "\t{}".format("\t".join([m.id for m in motifs]))
        
        if moods:
            result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile,  nreport, scan_rc, pvalue, table)
            for seq_id, counts in result_it:
                yield "{}\t{}".format(seq_id, "\t".join([str(x) for x in counts]))
        else:
            # get iterator
            result_it = s.count(fa, nreport, scan_rc, cutoff)
            # counts table
            for i, counts in enumerate(result_it):
                yield "{}\t{}".format(
                        fa.ids[i], 
                        "\t".join([str(x) for x in counts])
                        )

    elif score_table:
        # get iterator
        result_it = s.best_score(fa, scan_rc)
        # header
        yield "\t{}".format("\t".join([m.id for m in motifs]))
        # score table
        for i,scores in enumerate(result_it):
            yield "{}\t{}".format(
                    fa.ids[i], 
                    "\t".join([str(x) for x in scores])
                    )

    else:
        if moods:
            for motif, d in result_it:
                for seq_id,matches in d.items():
                    for pos,score,strand in matches:
                        yield format_line(fa, seq_id, motif,
                                score, pos, strand, bed=bed)
        else:
            for i, result in enumerate(result_it):
                seq_id = fa.ids[i]
                for motif, matches in zip(motifs, result):
                    for (score, pos, strand) in matches:
                        yield format_line(fa, seq_id, motif, 
                                   score, pos, strand, bed=bed)
Example #19
def moap(inputfile, method="classic", scoring="score", outfile=None, motiffile=None, pwmfile=None, genome=None, cutoff=0.95):
    """ Run a single motif activity prediction algorithm.
    
    Parameters
    ----------
    
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster 
        name in second column or a table with values.
    
    method : str, optional
        Motif activity method to use. Any of 'classic', 'ks', 'lasso', 
        'lightning', 'mara', 'rf'. Default is 'classic'. 
    
    scoring:  str, optional
        Either 'score' or 'count'
    
    outfile : str, optional
        Name of outputfile to save the fitted activity values.
    
    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.
    
    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not 
        supplied.
    
    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied
    
    cutoff : float, optional
        Cutoff for motif scanning
    
    Returns
    -------
    
    pandas DataFrame with motif activity
    """

    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")
    
    config = MotifConfig()

    m2f = None
    
    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                    "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")
        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
        
        if pwmfile is None:
            raise ValueError("no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        try:
            motifs = read_motifs(open(pwmfile))
        except:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        base = os.path.splitext(pwmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            m2f = pd.read_table(map_file, index_col=0)

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        motif_names = [m.id for m in read_motifs(open(pwmfile))]
        scores = []
        if method == 'classic' or scoring == "count":
            for row in s.count(list(df.index), cutoff=cutoff):
                scores.append(row)
        else:
            for row in s.best_score(list(df.index)):
                scores.append(row)

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)   

    motifs = motifs.loc[df.index]
    
    clf = None
    if method == "ks":
        clf = KSMoap()
    if method == "mwu":
        clf = MWMoap()
    if method == "rf":
        clf = RFMoap()
    if method == "lasso":
        clf = LassoMoap()
    if method == "lightning":
        clf = LightningMoap()
    if method == "mara":
        clf = MaraMoap()
    if method == "more":
        clf = MoreMoap()
    if method == "classic":
        clf = ClassicMoap()

    clf.fit(motifs, df)
    
    if outfile:
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
        
        with open(outfile, "a") as f:
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
Example #20
def diff(args):

    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    tmpdir = mkdtemp()
    
    # Retrieve FASTA clusters from BED file
    if len(infiles) == 1 and infiles[0].endswith("bed"):
        if not args.genome:
            sys.stderr.write("Can't convert BED file without genome!\n")
            sys.exit(1)

        clusters = {}
        for line in open(infiles[0]):
            vals = line.strip().split("\t")
            clusters.setdefault(vals[4], []).append(vals[:3])
        
        infiles = []
        
        for cluster,regions in clusters.items():
            sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
            inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
            outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
            with open(inbed, "w") as f:
                for vals in regions:
                    f.write("{0}\t{1}\t{2}\n".format(*vals))
            Genome(genome).track2fasta(inbed, outfa)
            infiles.append(outfa)
    
    pwms = dict([(m.id, m) for m in pwmfile_to_motifs(pwmfile)])
    motifs = [m for m in pwms.keys()]
    names = [os.path.basename(os.path.splitext(fname)[0]) for fname in infiles]
   
    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_threshold(threshold=cutoff)

    # Get background frequencies
    nbg = float(len(Fasta(bgfile).seqs))
    
    bgcounts = s.total_count(bgfile, nreport=1) 
    bgfreq = [(c + 0.01) / nbg for c in bgcounts]
    
    # Get frequences in input files
    freq = {}
    counts = {}
    for fname in infiles:
        mcounts = s.total_count(fname, nreport=1) 
        n = float(len(Fasta(fname).seqs))
        counts[fname] = mcounts
        freq[fname] = [(c + 0.01) / n for c in mcounts]
    
    freq = np.array([freq[fname] for fname in infiles]).transpose()
    counts = np.array([counts[fname] for fname in infiles]).transpose()
    
    #for row in freq:
    #    print freq

    diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, minenr=minenr, minfreq=minfreq)

    shutil.rmtree(tmpdir)
Example #21
def calc_stats_iterator(
    fg_file=None,
    bg_file=None,
    fg_table=None,
    bg_table=None,
    motifs=None,
    stats=None,
    genome=None,
    zscore=True,
    gc=True,
    ncpus=None,
):
    """Calculate motif enrichment metrics.

    Parameters
    ----------
    fg_file : str, optional
        Filename of a FASTA, BED or region file with positive sequences.

    bg_file : str, optional
        Filename of a FASTA, BED or region file with negative sequences.

    fg_table : str, optional
        Filename of a table with motif scan results of positive sequences.

    bg_table : str, optional
        Filename of a table with motif scan results of negative sequences.

    motifs : str, list or Motif instance, optional
        A file with motifs in pfm format, a list of Motif instances or a
        single Motif instance. If motifs is `None`, the default motif
        database is used.

    genome : str, optional
        Genome or index directory in case of BED/regions.

    stats : list, optional
        Names of metrics to calculate. See gimmemotifs.rocmetrics.__all__
        for available metrics.

    ncpus : int, optional
        Number of cores to use.

    Returns
    -------
    result : dict
        Dictionary with results where keys are motif ids and the values are
        dictionary with metric name and value pairs.
    """
    if not stats:
        stats = rocmetrics.__all__

    if fg_table is None:
        if fg_file is None:
            raise ValueError("Need either fg_table or fg_file argument")
    elif fg_file is not None:
        raise ValueError("Need either fg_table or fg_file argument, not both")

    if bg_table is None:
        if bg_file is None:
            raise ValueError("Need either bg_table or bg_file argument")
    elif bg_file is not None:
        raise ValueError("Need either bg_table or bg_file argument, not both")

    if fg_table is not None or bg_table is not None:
        remove_stats = []
        for s in stats:
            func = getattr(rocmetrics, s)
            if func.input_type == "pos":
                remove_stats.append(s)
        if len(remove_stats) != 0:
            logger.warn(
                "Cannot calculate stats that require position from table of motif scores."
            )
            logger.warn(f"Skipping the following statistics: {', '.join(remove_stats)}")
            stats = [s for s in stats if s not in remove_stats]

    if isinstance(motifs, Motif):
        all_motifs = [motifs]
    else:
        if isinstance(motifs, list):
            all_motifs = motifs
        else:
            motifs = pfmfile_location(motifs)
            all_motifs = read_motifs(motifs, fmt="pwm")
    if fg_table is not None or bg_table is not None:
        filtered_motifs = pd.read_csv(
            fg_table, sep="\t", index_col=0, nrows=1, comment="#"
        ).columns
        filtered_motifs = filtered_motifs.intersection(
            pd.read_csv(bg_table, sep="\t", index_col=0, nrows=1, comment="#").columns
        )
        all_motifs = [m for m in all_motifs if m.id in filtered_motifs]

    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params()["ncpus"])

    if fg_file is not None or bg_file is not None:
        if zscore or gc:
            # Precalculate mean and stddev for z-score calculation
            s = Scanner(ncpus=ncpus)
            s.set_motifs(all_motifs)
            s.set_genome(genome)
            s.set_meanstd(gc=gc)

    chunksize = 240
    for i in range(0, len(all_motifs), chunksize):
        result = {}
        logger.debug(
            "chunk %s of %s", (i / chunksize) + 1, len(all_motifs) // chunksize + 1
        )
        motifs = all_motifs[i : i + chunksize]

        if fg_table is None:
            fg_total = scan_to_best_match(
                fg_file, motifs, ncpus=ncpus, genome=genome, zscore=zscore, gc=gc
            )
        else:
            fg_total = pd.read_csv(
                fg_table, sep="\t", usecols=[m.id for m in motifs], comment="#"
            ).to_dict(orient="list")
            for m in fg_total:
                fg_total[m] = [(x, None) for x in fg_total[m]]

        if bg_table is None:
            bg_total = scan_to_best_match(
                bg_file, motifs, ncpus=ncpus, genome=genome, zscore=zscore, gc=gc
            )
        else:
            bg_total = pd.read_csv(
                bg_table, sep="\t", usecols=[m.id for m in motifs], comment="#"
            ).to_dict(orient="list")
            for m in bg_total:
                bg_total[m] = [(x, None) for x in bg_total[m]]

        logger.debug("calculating statistics")

        if ncpus == 1:
            it = _single_stats(motifs, stats, fg_total, bg_total)
        else:
            it = _mp_stats(motifs, stats, fg_total, bg_total, ncpus)

        for motif_id, s, ret in it:
            if motif_id not in result:
                result[motif_id] = {}
            result[motif_id][s] = ret
        yield result
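calc_stats_iterator() yields one result dictionary per chunk of up to 240 motifs; a hedged consumption sketch that merges all chunks (file and genome names are placeholders):

# Hypothetical consumption of the iterator; default stats are used.
stats = {}
for chunk in calc_stats_iterator(fg_file="peaks.fa",
                                 bg_file="background.fa",
                                 genome="hg38"):
    stats.update(chunk)

for motif_id, metrics in stats.items():
    print(motif_id, metrics)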
Example #22
def moap(inputfile,
         method="hypergeom",
         scoring=None,
         outfile=None,
         motiffile=None,
         pwmfile=None,
         genome=None,
         fpr=0.01,
         ncpus=None):
    """Run a single motif activity prediction algorithm.
    
    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.
    
    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso', 
        'lightningclassification', 'lightningregressor', 'bayesianridge', 
        'rf', 'xgboost'. Default is 'hypergeom'. 
    
    scoring:  str, optional
        Either 'score' or 'count'
    
    outfile : str, optional
        Name of outputfile to save the fitted activity values.
    
    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.
    
    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not 
        supplied.
    
    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied
    
    fpr : float, optional
        FPR for motif scanning
    
    ncpus : int, optional
        Number of threads to use. Default is the number specified in the config.
    
    Returns
    -------
    pandas DataFrame with motif activity
    """

    if scoring and scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    m2f = None

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")
        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError(
                "no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        try:
            motifs = read_motifs(open(pwmfile))
        except:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        base = os.path.splitext(pwmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            m2f = pd.read_table(map_file, index_col=0, comment="#")

        # initialize scanner
        s = Scanner(ncpus=ncpus)
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        motif_names = [m.id for m in read_motifs(open(pwmfile))]
        scores = []
        if method == 'classic' or scoring == "count":
            s.set_threshold(fpr=fpr)
            for row in s.count(list(df.index)):
                scores.append(row)
        else:
            for row in s.best_score(list(df.index)):
                scores.append(row)

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            logger.warn("%s output already exists... skipping", method)
            return out

    motifs = motifs.loc[df.index]

    if method == "lightningregressor":
        outdir = os.path.dirname(outfile)
        tmpname = os.path.join(outdir, ".lightning.tmp")
        clf.fit(motifs, df, tmpdir=tmpname)
        shutil.rmtree(tmpname)
    else:
        clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))

        with open(outfile, "a") as f:
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
Example #23
def moap(inputfile,
         method="classic",
         scoring="score",
         outfile=None,
         motiffile=None,
         pwmfile=None,
         genome=None,
         cutoff=0.95):
    """ Run a single motif activity prediction algorithm.
    
    Parameters
    ----------
    
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster 
        name in second column or a table with values.
    
    method : str, optional
        Motif activity method to use. Any of 'classic', 'ks', 'lasso', 
        'lightning', 'mara', 'rf'. Default is 'classic'. 
    
    scoring:  str, optional
        Either 'score' or 'count'
    
    outfile : str, optional
        Name of outputfile to save the fitted activity values.
    
    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.
    
    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not 
        supplied.
    
    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied
    
    cutoff : float, optional
        Cutoff for motif scanning
    
    Returns
    -------
    
    pandas DataFrame with motif activity
    """

    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    m2f = None

    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")
        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError(
                "no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        try:
            motifs = read_motifs(open(pwmfile))
        except:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        base = os.path.splitext(pwmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            m2f = pd.read_table(map_file, index_col=0)

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        motif_names = [m.id for m in read_motifs(open(pwmfile))]
        scores = []
        if method == 'classic' or scoring == "count":
            for row in s.count(list(df.index), cutoff=cutoff):
                scores.append(row)
        else:
            for row in s.best_score(list(df.index)):
                scores.append(row)

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)

    clf = None
    if method == "ks":
        clf = KSMoap()
    if method == "mwu":
        clf = MWMoap()
    if method == "rf":
        clf = RFMoap()
    if method == "lasso":
        clf = LassoMoap()
    if method == "lightning":
        clf = LightningMoap()
    if method == "mara":
        clf = MaraMoap()
    if method == "more":
        clf = MoreMoap()
    if method == "classic":
        clf = ClassicMoap()

    clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))

        with open(outfile, "a") as f:
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
Example #24
    def scan(self,
             background_length=200,
             fpr=0.02,
             n_cpus=-1,
             verbose=True,
             motifs=None,
             TF_evidence_level="direct_and_indirect",
             TF_formatting="auto"):
        """
        Scan DNA sequences searching for TF binding motifs.

        Args:
           background_length (int): background length. This is used for the calculation of the binding score.

           fpr (float): False positive rate for motif identification.

           n_cpus (int): number of CPUs for parallel calculation.

           verbose (bool): Whether to show a progress bar.

           motifs (list): a list of gimmemotifs motifs, will revert to default_motifs() if None

           TF_evidence_level (str): One of ["direct", "direct_and_indirect"]. If "direct" is selected, only TFs with direct binding evidence are used.
               If "direct_and_indirect" is selected, TFs with binding evidence and inferred TFs are used.
               For more information, please read the explanation of the Motif class in the gimmemotifs documentation (https://gimmemotifs.readthedocs.io/en/master/index.html).

        """

        self.fpr = fpr
        self.background_length = background_length

        ## 1. initialize scanner ##
        # load motif
        if motifs is None:
            if verbose:
                print(
                    "No motif data entered. Loading default motifs for your species ..."
                )

            if self.species in [
                    "Mouse", "Human", "Rat"
            ]:  # If species is vertebrate, we use gimmemotif default motifs as a default.
                motifs = default_motifs()
                self.motif_db_name = "gimme.vertebrate.v5.0"
                self.TF_formatting = True
                if verbose:
                    print(
                        " Default motif for vertebrate: gimme.vertebrate.v5.0. \n For more information, please go https://gimmemotifs.readthedocs.io/en/master/overview.html \n"
                    )

            elif self.species in [
                    "Zebrafish"
            ]:  # If species is Zebrafish, we use CisBP database.
                self.motif_db_name = 'CisBP_ver2_Danio_rerio.pfm'
                motifs = load_motifs(self.motif_db_name)
                self.TF_formatting = False
                if verbose:
                    print(
                        f" Default motif for {self.species}: {self.motif_db_name}. \n For more information, please go celloracle documentation. \n"
                    )

            elif self.species in [
                    "S.cerevisiae"
            ]:  # If species is S.cerevisiae, we use CisBP database.
                self.motif_db_name = 'CisBP_ver2_Saccharomyces_cerevisiae.pfm'
                motifs = load_motifs(self.motif_db_name)
                self.TF_formatting = False
                if verbose:
                    print(
                        f" Default motif for {self.species}: {self.motif_db_name}. \n For more information, please go celloracle documentation. \n"
                    )

            elif self.species in [
                    "Xenopus"
            ]:  # If species is Xenopus, we use the CisBP database.
                self.motif_db_name = 'CisBP_ver2_Xenopus_tropicalis_and_Xenopus_laevis.pfm'
                motifs = load_motifs(self.motif_db_name)
                self.TF_formatting = False
                if verbose:
                    print(
                        f" Default motif for {self.species}: {self.motif_db_name}. \n For more information, please go celloracle documentation. \n"
                    )

            elif self.species in [
                    "Drosophila"
            ]:  # If species is Drosophila, we use the CisBP database.
                self.motif_db_name = 'CisBP_ver2_Drosophila_mix.pfm'
                motifs = load_motifs(self.motif_db_name)
                self.TF_formatting = False
                if verbose:
                    print(
                        f" Default motif for {self.species}: {self.motif_db_name}. \n For more information, please go celloracle documentation. \n"
                    )

            elif self.species in [
                    "C.elegans"
            ]:  # If species is C.elegans, we use the CisBP database.
                self.motif_db_name = 'CisBP_ver2_Caenorhabditis_elegans.pfm'
                motifs = load_motifs(self.motif_db_name)
                self.TF_formatting = False
                if verbose:
                    print(
                        f" Default motif for {self.species}: {self.motif_db_name}. \n For more information, please go celloracle documentation. \n"
                    )

            elif self.species in [
                    "Arabidopsis"
            ]:  # If species is Arabidopsis, we use the CisBP database.
                self.motif_db_name = 'CisBP_ver2_Arabidopsis_thaliana.pfm'
                motifs = load_motifs(self.motif_db_name)
                self.TF_formatting = False
                if verbose:
                    print(
                        f" Default motif for {self.species}: {self.motif_db_name}. \n For more information, please go celloracle documentation. \n"
                    )

            else:
                raise ValueError(
                    f"We have no default motifs for your species, {self.species}. Please set motifs."
                )

        else:
            # Check format
            if isinstance(motifs, list):
                if isinstance(motifs[0], Motif):
                    if verbose:
                        print(
                            "Checking your motifs... Motifs format looks good. \n"
                        )
                else:
                    raise ValueError(f"Motif data type was invalid.")
            else:
                raise ValueError(
                    f"motifs should be a list of Motif object in gimmemotifs.")

            self.motif_db_name = "custom_motifs"
            if TF_formatting == "auto":
                self.TF_formatting = False
            else:
                self.TF_formatting = TF_formatting

        self.motifs = motifs

        self.dic_motif2TFs = _get_dic_motif2TFs(
            species=self.species,
            motifs=motifs,
            TF_evidence_level=TF_evidence_level,
            formatting=self.TF_formatting)
        self.TF_evidence_level = TF_evidence_level

        # initialize scanner
        if verbose:
            print("Initiating scanner... \n")
        s = Scanner(ncpus=n_cpus)

        # set parameters
        s.set_motifs(motifs)
        try:
            s.set_background(
                genome=self.ref_genome,
                size=background_length)  # For gimmemotifs ver 14.4
        except:
            s.set_background(
                genome=self.ref_genome,
                length=background_length)  # For old gimmemotifs ver 13

        #s.set_background(genome="mm9", length=400)
        if verbose:
            print(
                "Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. \n"
            )
        s.set_threshold(fpr=fpr)

        ## 2. motif scan ##
        print("Convert peak info into DNA sequences ... \n")
        # Get DNA sequences
        target_sequences = peak2fasta(self.all_peaks, self.ref_genome)
        # Remove DNA sequence with zero length
        target_sequences = remove_zero_seq(fasta_object=target_sequences)

        print(
            "Scanning motifs ... It may take several hours if you proccess many peaks. \n"
        )
        self.scanned_df = scan_dna_for_motifs(s, motifs, target_sequences,
                                              verbose)

        self.__addLog("scanMotifs")