def __init__(
    self,
    ncore=1,
    genome="hg38",
    gene_bed=None,
    pfmfile=None,
    include_notfs=False,
    rm_curated=True,
    etype="hg38H3K27ac",
    tffile=None,
):
    self.ncore = ncore
    self.genome = genome

    # dream_model.txt is the logistic regression model.
    package_dir = os.path.dirname(ananse.__file__)
    self.etype = etype

    if self.genome == "hg38" and self.etype == "hg38H3K27ac":
        self.model = os.path.join(package_dir, "db", "dream_model_h3k27ac.txt")
    elif self.etype == "p300" or self.etype == "ATAC":
        self.model = os.path.join(package_dir, "db", "dream_model_p300.txt")
    else:
        raise TypeError(
            "The enhancer data type should be hg38H3K27ac, p300 or ATAC. "
            "Setting -e to hg38H3K27ac is only possible if the genome is hg38. "
            "Please provide an enhancer type with the -e argument. "
            "The default is hg38H3K27ac."
        )

    # filter TFs?
    self.include_notfs = include_notfs
    # remove curated?
    self.rm_curated = rm_curated

    # load real TFs
    self.tffile = tffile
    if self.tffile is None:
        self.tffile = os.path.join(package_dir, "db", "tfs.txt")

    # Motif information file
    self.pfmfile = pfmfile_location(pfmfile)
    self.motifs2factors = self.pfmfile.replace(".pfm", ".motif2factors.txt")
    self.filtermotifs2factors = clear_tfs(
        self.motifs2factors, self.tffile, self.include_notfs, self.rm_curated
    )
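A minimal usage sketch for this constructor. Only `__init__` is shown above, so the class name and import path below are assumptions for illustration:

# Sketch: class name and import path are assumed, not taken from the snippet.
from ananse.network import Network  # hypothetical location of this class

net = Network(ncore=4, genome="hg38", etype="hg38H3K27ac")
print(net.model)                 # .../db/dream_model_h3k27ac.txt
print(net.filtermotifs2factors)  # motif-to-TF mapping after filtering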
def __init__(
    self,
    peak_weights,
    motif_weights,
    pfmfile=None,
    model=None,
    curation_filter=None,
    tf_list=None,
    whitelist=True,
    ncore=1,
    verbose=True,
):
    self.peak_weights = peak_weights  # output from ScorePeaks
    self.motif_weights = motif_weights  # output from ScoreMotifs

    self.motifs2factors_file = pfmfile_location(pfmfile).replace(
        ".pfm", ".motif2factors.txt"
    )
    self.motifs2factors = self.filter_transcription_factors(
        curation_filter, tf_list, whitelist
    )

    self.model = model
    if self.model is None:
        # dream_model.txt is a 2D logistic regression model.
        package_dir = os.path.dirname(__file__)
        self.model = os.path.join(package_dir, "db", "dream_model_p300.pickle")

    self.ncore = ncore
    self.verbose = verbose
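The refactored constructor falls back to a packaged model when `model=None`; a sketch with a hypothetical class name and placeholder inputs:

# Sketch: class name is assumed; the weights are outputs of ScorePeaks/ScoreMotifs.
binding = Binding(
    peak_weights="scored_peaks.tsv",    # placeholder path
    motif_weights="scored_motifs.tsv",  # placeholder path
    model=None,  # resolves to db/dream_model_p300.pickle next to the module
    ncore=4,
)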
def logo(args):
    if args.pfmfile is None and args.ids is None:
        name = os.path.splitext(os.path.split(pfmfile_location(None))[-1])[0]
        print(
            "Use the -i argument to specify which motif ids you want to use for logos."
        )
        print("If you really want to create logos for all of the motifs in the default")
        print("PFM file use the following command:")
        print(f"gimme logo -p {name}")
        sys.exit(1)

    inputfile = args.pfmfile
    motifs = read_motifs(inputfile)
    if args.ids:
        ids = args.ids.split(",")
        motifs = [m for m in motifs if m.id in ids]

    for motif in motifs:
        motif.plot_logo(
            fname="{}.png".format(motif.id), kind=args.kind, title=args.title
        )
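`logo` expects an argparse-style namespace, so it can also be driven directly; a sketch using `types.SimpleNamespace` with attribute names taken from the function body:

from types import SimpleNamespace

args = SimpleNamespace(
    pfmfile="custom.pfm",   # any PFM file readable by read_motifs()
    ids="GM.5.0.Sox.0001",  # example id; comma-separated list, or None for all
    kind="information",     # logo style forwarded to Motif.plot_logo()
    title=True,
)
logo(args)  # writes one <motif_id>.png per selected motif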
def moap(
    inputfile,
    method="hypergeom",
    scoring=None,
    outfile=None,
    motiffile=None,
    pfmfile=None,
    genome=None,
    fpr=0.01,
    ncpus=None,
    subsample=None,
    zscore=True,
    gc=True,
):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.
    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'bayesianridge', 'rf', 'xgboost'. Default is 'hypergeom'.
    scoring : str, optional
        Either 'score' or 'count'.
    outfile : str, optional
        Name of output file to save the fitted activity values.
    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.
    pfmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not supplied.
    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not supplied.
    fpr : float, optional
        FPR for motif scanning.
    ncpus : int, optional
        Number of threads to use. Default is the number specified in the config.
    subsample : float, optional
        Fraction of regions to use (randomly sampled).
    zscore : bool, optional
        Use z-score normalized motif scores.
    gc : bool, optional
        Use GC% bins for z-score.

    Returns
    -------
    pandas DataFrame with motif activity
    """
    if scoring and scoring not in ["score", "count"]:
        raise ValueError("valid values are 'score' and 'count'")

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype("object") in set(df.dtypes):
            raise ValueError("columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        pfmfile = pfmfile_location(pfmfile)
        try:
            motifs = read_motifs(pfmfile)
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pfmfile))
            raise

        # scan for motifs
        motif_names = [m.id for m in read_motifs(pfmfile)]
        if method == "classic" or scoring == "count":
            logger.info("motif scanning (counts)")
            scores = scan_regionfile_to_table(
                inputfile,
                genome,
                "count",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
        else:
            logger.info("motif scanning (scores)")
            scores = scan_regionfile_to_table(
                inputfile,
                genome,
                "score",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    elif isinstance(motiffile, pd.DataFrame):
        motifs = motiffile
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            ncols = len(df.iloc[:, 0].unique())
        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            logger.warning("%s output already exists... skipping", method)
            return out

    if subsample is not None:
        n = int(subsample * df.shape[0])
        logger.debug("Subsampling %d regions", n)
        df = df.sample(n)

    motifs = motifs.loc[df.index]

    clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(__version__))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if isinstance(motiffile, str):
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))

        with open(outfile, "a") as f:
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def infer_motifs(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    pfm: Optional[str] = None,
    min_annotated: Optional[int] = 50,
    num_enhancers: Optional[int] = 10000,
    maelstrom: Optional[bool] = False,
    indirect: Optional[bool] = True,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
) -> None:
    """Infer motif activity for single cell RNA-seq data.

    The adata object is modified with the following fields.

    **X_cell_types** : `adata.obsm` field
        Cell type coefficients.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    dataset : `str`
        Name of reference data set or directory with reference data.
    cluster : `str`, optional (default: "louvain")
        Name of the clustering, can be either louvain or leiden.
    n_top_genes : `int`, optional (default: 1000)
        Number of variable genes that is used. If `n_top_genes` is greater than
        the number of hypervariable genes in `adata` then all variable genes
        are used.
    max_cell_types : `int`, optional (default: 50)
        Maximum number of cell types to select.
    pfm : `str`, optional (default: None)
        Name of motif file in PFM format. The GimmeMotifs default is used if
        this parameter is not specified. This can be a filename, or a pfm name
        supported by GimmeMotifs such as `JASPAR2018_vertebrates`. If a custom
        PFM file is specified, there should also be an associated
        `.motif2factors.txt` file.
    min_annotated : `int`, optional (default: 50)
        Cells that are annotated with cell types less than this number will be
        annotated as "other".
    num_enhancers : `int`, optional (default: 10000)
        Number of enhancers to use for motif activity analysis.
    maelstrom : `boolean`, optional (default: False)
        Use maelstrom instead of ridge regression for motif activity analysis.
    indirect : `bool`, optional (default: True)
        Passed to `correlate_tf_motifs()`.
    n_sketch : `int`, optional (default: 2500)
        Passed to `correlate_tf_motifs()`.
    n_permutations : `int`, optional (default: 100000)
        Passed to `correlate_tf_motifs()`.
    """
    use_name = True

    validate_adata(adata)

    data = ScepiaDataset(dataset)

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    # Annotate each cell with H3K27ac reference
    if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs:
        annotate_cells(
            adata,
            dataset=dataset,
            cluster=cluster,
            n_top_genes=n_top_genes,
            min_annotated=min_annotated,
            max_cell_types=max_cell_types,
        )

    logger.info("Linking variable genes to differential enhancers.")
    gene_map_file = data.gene_mapping
    link_file = data.link_file

    link = pd.read_feather(link_file)
    if use_name:
        ens2name = pd.read_csv(
            gene_map_file, sep="\t", index_col=0, names=["identifier", "name"]
        )
        link = link.join(ens2name, on="gene").dropna()
        link = link.set_index("name")

    link.index = link.index.str.upper()
    enh_genes = adata.var_names[
        adata.var_names.str.upper().isin(link.index)
    ].str.upper()
    var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique()

    enhancer_df = data.load_reference_data(reftype="enhancer")
    enhancer_df.index = change_region_size(enhancer_df.index)
    enhancer_df = enhancer_df.loc[var_enhancers, adata.uns["scepia"]["cell_types"]]
    enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean()
    enhancer_df.loc[:, :] = scale(enhancer_df)

    # Select the most variable enhancers
    enhancer_df = enhancer_df.loc[
        enhancer_df.var(1).sort_values().tail(num_enhancers).index
    ]

    # Center by the mean of the most important cell types.
    # Here we use the majority cell type per cluster.
    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    mean_value = enhancer_df[cluster_cell_types].mean(1)
    enhancer_df = enhancer_df.sub(mean_value, axis=0)

    fname = NamedTemporaryFile(delete=False).name
    enhancer_df.to_csv(fname, sep="\t")
    logger.info("inferring motif activity")

    pfm = pfmfile_location(pfm)

    if maelstrom:
        with TemporaryDirectory() as tmpdir:
            run_maelstrom(
                fname,
                data.genome,
                tmpdir,
                center=False,
                filter_redundant=True,
            )

            motif_act = pd.read_csv(
                os.path.join(tmpdir, "final.out.txt"),
                sep="\t",
                comment="#",
                index_col=0,
            )
            motif_act.columns = motif_act.columns.str.replace(r"z-score\s+", "")
            pfm = pfmfile_location(os.path.join(tmpdir, "nonredundant.motifs.pfm"))
    else:
        logger.info(f"Activity based on genome {data.genome}")
        motif_act = moap(
            fname,
            scoring="score",
            genome=data.genome,
            method="bayesianridge",
            pfmfile=pfm,
            ncpus=12,
        )
    adata.uns["scepia"]["pfm"] = pfm

    adata.uns["scepia"]["motif_activity"] = motif_act[
        adata.uns["scepia"]["cell_types"]
    ]

    logger.info("calculating cell-specific motif activity")
    cell_motif_activity = (
        adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T
    ).T
    cell_motif_activity.index = adata.obs_names
    adata.obs = adata.obs.drop(
        columns=cell_motif_activity.columns.intersection(adata.obs.columns)
    )
    adata.obs = adata.obs.join(cell_motif_activity)

    correlate_tf_motifs(
        adata, indirect=indirect, n_sketch=n_sketch, n_permutations=n_permutations
    )

    add_activity(adata)

    logger.info("Done with motif inference.")
def select_nonredundant_motifs(roc_report, pfmfile, fg_table, bg_table, tolerance=0.001):
    pfmfile = pfmfile_location(pfmfile)
    motifs = read_motifs(pfmfile)
    motif_dict = read_motifs(pfmfile, as_dict=True)

    mc = MotifComparer()

    df = pd.read_csv(roc_report, sep="\t", index_col=0)
    df = df[df["Enr. at 1% FPR"] >= 2]
    motifs = [m for m in motifs if m.id in df.index]

    cols = ["ROC AUC", "PR AUC", "Enr. at 1% FPR", "Recall at 10% FDR"]
    rank = df[cols].rank().mean(1).sort_values(ascending=False)

    # Greedily keep the best-ranked motif; mark highly similar motifs as redundant
    redundant_motifs = []
    keep = []
    while df[~df.index.isin(redundant_motifs)].shape[0] > 0:
        motif = rank[~rank.index.isin(redundant_motifs)].head(1).index[0]
        keep.append(motif)

        result = mc.get_all_scores(
            [motif_dict[motif]],
            [m for m in motifs if m.id not in redundant_motifs],
            "partial",
            "seqcor",
            "mean",
        )
        result = result[motif]
        redundant_motifs += [m for m in result.keys() if result[m][0] >= 0.7]

    logger.debug(f"Selected {len(keep)} motifs for feature elimination")

    # Read motif scan results
    fg_table = pd.read_csv(fg_table, index_col=0, comment="#", sep="\t")
    bg_table = pd.read_csv(bg_table, index_col=0, comment="#", sep="\t")

    X = pd.concat((fg_table, bg_table), axis=0)
    y = np.hstack((np.ones(fg_table.shape[0]), np.zeros(bg_table.shape[0])))

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.4,
        random_state=2,
        shuffle=True,
    )

    # Training data restricted to the non-redundant candidate motifs
    X_bla = X_train[keep]
    model = LogisticRegression(solver="liblinear", max_iter=500, penalty="l1")
    max_score = np.mean(
        cross_val_score(model, X_bla, y_train, cv=5, scoring="average_precision")
    )
    mean_scores = []

    step = 1
    logger.info("selecting non-redundant motifs")
    n_features = 1
    for i in range(1, X_bla.shape[1], step):
        rfe = RFE(model, i)
        fit = rfe.fit(X_bla, y_train)
        mean_score = np.mean(
            cross_val_score(
                model,
                X_bla.loc[:, fit.support_],
                y_train,
                cv=5,
                scoring="average_precision",
            )
        )
        # Stop as soon as adding a feature no longer improves the score enough
        if i > 1 and mean_score - mean_scores[-1] < (max_score * tolerance):
            n_features = i - 1
            break
        mean_scores.append(mean_score)

    rfe = RFE(model, n_features)
    fit = rfe.fit(X_bla, y_train)
    selected_features = X_bla.columns[fit.support_]
    model.fit(X_train.loc[:, selected_features], y_train)
    y_pred = model.predict_proba(X_test.loc[:, selected_features])[:, 1]

    pr_auc = average_precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    logger.info(
        f"selected {len(selected_features)} non-redundant motifs: "
        f"ROC AUC {roc_auc:.3f}, PR AUC {pr_auc:.3f}"
    )
    return selected_features
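A sketch of running the selection with outputs from a motif scan and ROC report (all file names are assumptions):

selected = select_nonredundant_motifs(
    "gimme.roc.report.txt",  # assumed: table with ROC/PR metrics per motif
    "denovo.pfm",            # assumed: PFM file with candidate motifs
    "fg.motif.score.txt",    # assumed: motif scores in positive regions
    "bg.motif.score.txt",    # assumed: motif scores in background regions
    tolerance=0.001,
)
print(list(selected))  # motif ids that survive recursive feature elimination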
def run_maelstrom(
    infile,
    genome,
    outdir,
    pfmfile=None,
    plot=True,
    cluster=False,
    score_table=None,
    count_table=None,
    methods=None,
    ncpus=None,
    zscore=True,
    gc=True,
):
    """Run maelstrom on an input table.

    Parameters
    ----------
    infile : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.
    outdir : str
        Output directory for all results.
    pfmfile : str, optional
        Specify a PFM file for scanning.
    plot : bool, optional
        Create heatmaps.
    cluster : bool, optional
        If True and if the input table has more than one column, the data is
        clustered and the cluster activity methods are also run. Not well-tested.
    score_table : str, optional
        Filename of pre-calculated table with motif scores.
    count_table : str, optional
        Filename of pre-calculated table with motif counts.
    methods : list, optional
        Activity methods to use. By default all are used.
    ncpus : int, optional
        If defined this specifies the number of cores to use.
    zscore : bool, optional
        Use z-score normalized motif scores.
    gc : bool, optional
        Use GC% bins to normalize motif scores.
    """
    logger.info("Starting maelstrom")
    if infile.endswith("feather"):
        df = pd.read_feather(infile)
        df = df.set_index(df.columns[0])
    else:
        df = pd.read_table(infile, index_col=0, comment="#")

    # Check for duplicates
    if df.index.duplicated(keep=False).any():
        logger.warning("Input file contains duplicate regions!")
        logger.warning("These will be removed.")
        df = df.iloc[~df.index.duplicated(keep=False)]

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if methods is None:
        methods = Moap.list_predictors()
    methods = [m.lower() for m in methods]

    df.to_csv(os.path.join(outdir, "input.table.txt"), sep="\t")
    infile = os.path.join(outdir, "input.table.txt")

    # Copy the motif information
    pfmfile = pfmfile_location(pfmfile)
    if pfmfile:
        shutil.copy2(pfmfile, outdir)
        mapfile = re.sub(".p[fw]m$", ".motif2factors.txt", pfmfile)
        if os.path.exists(mapfile):
            shutil.copy2(mapfile, outdir)

    # Create a file with the number of motif matches
    if count_table is None:
        count_table = os.path.join(outdir, "motif.count.txt.gz")
        if not os.path.exists(count_table):
            logger.info("motif scanning (counts)")
            counts = scan_to_table(
                infile,
                genome,
                "count",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
            counts.to_csv(count_table, sep="\t", compression="gzip")
    else:
        logger.info("Counts, using: %s", count_table)

    # Create a file with the score of the best motif match
    if score_table is None:
        score_table = os.path.join(outdir, "motif.score.txt.gz")
        if not os.path.exists(score_table):
            logger.info("motif scanning (scores)")
            scores = scan_to_table(
                infile,
                genome,
                "score",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
            scores.to_csv(
                score_table, sep="\t", float_format="%.3f", compression="gzip"
            )
    else:
        logger.info("Scores, using: %s", score_table)

    if cluster:
        cluster = False
        for method in methods:
            m = Moap.create(method, ncpus=ncpus)
            if m.ptype == "classification":
                cluster = True
                break
        if not cluster:
            logger.info("Skipping clustering, no classification methods")

    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        for method in Moap.list_regression_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, infile])
                logger.debug("Adding %s", method)

        if cluster:
            clusterfile = os.path.join(
                outdir, os.path.basename(infile) + ".cluster.txt"
            )

            df[:] = scale(df, axis=0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            for name in names:
                df_changed.loc[
                    (df[name] - df.loc[:, df.columns != name].max(1)) > 0.5, "cluster"
                ] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")

    if df.shape[1] == 1 or cluster:
        for method in Moap.list_classification_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, clusterfile])

    if len(exps) == 0:
        logger.error("No method to run.")
        sys.exit(1)

    for method, scoring, fname in exps:
        try:
            if scoring == "count" and count_table is not None:
                moap_with_table(
                    fname, count_table, outdir, method, scoring, ncpus=ncpus
                )
            elif scoring == "score" and score_table is not None:
                moap_with_table(
                    fname, score_table, outdir, method, scoring, ncpus=ncpus
                )
            else:
                moap_with_bg(
                    fname, genome, outdir, method, scoring, pfmfile=pfmfile, ncpus=ncpus
                )
        except Exception as e:
            logger.warning("Method %s with scoring %s failed", method, scoring)
            logger.warning(e)
            logger.warning("Skipping")
            raise

    dfs = {}
    for method, scoring, fname in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(outdir, "activity.{}.{}.out.txt".format(method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except FileNotFoundError:
            logger.warning("Activity file for {} not found!\n".format(t))

    if len(methods) > 1:
        logger.info("Rank aggregation")
        df_p = df_rank_aggregation(df, dfs, exps)
        df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t")

    # Write motif frequency table
    if df.shape[1] == 1:
        mcount = df.join(pd.read_table(count_table, index_col=0, comment="#"))
        m_group = mcount.groupby(df.columns[0])
        freq = m_group.sum() / m_group.count()
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot and len(methods) > 1:
        logger.info("html report")
        maelstrom_html_report(outdir, os.path.join(outdir, "final.out.txt"), pfmfile)
        logger.info(os.path.join(outdir, "gimme.maelstrom.report.html"))
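A sketch of a typical invocation of this version; the input table name is an assumption:

from gimmemotifs.maelstrom import run_maelstrom

run_maelstrom(
    "peaks.clusters.txt",  # assumed: regions plus cluster labels or signal columns
    "hg38",
    "maelstrom.out",       # output directory, created if missing
    methods=["hypergeom", "bayesianridge"],
    ncpus=4,
)
# Aggregated activities are written to maelstrom.out/final.out.txt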
def calc_stats_iterator(
    fg_file=None,
    bg_file=None,
    fg_table=None,
    bg_table=None,
    motifs=None,
    stats=None,
    genome=None,
    zscore=True,
    gc=True,
    ncpus=None,
):
    """Calculate motif enrichment metrics.

    Parameters
    ----------
    fg_file : str, optional
        Filename of a FASTA, BED or region file with positive sequences.
    bg_file : str, optional
        Filename of a FASTA, BED or region file with negative sequences.
    fg_table : str, optional
        Filename of a table with motif scan results of positive sequences.
    bg_table : str, optional
        Filename of a table with motif scan results of negative sequences.
    motifs : str, list or Motif instance, optional
        A file with motifs in pfm format, a list of Motif instances or a
        single Motif instance. If motifs is `None`, the default motif
        database is used.
    stats : list, optional
        Names of metrics to calculate. See gimmemotifs.rocmetrics.__all__
        for available metrics.
    genome : str, optional
        Genome or index directory in case of BED/regions.
    zscore : bool, optional
        Use z-score normalized motif scores.
    gc : bool, optional
        Use GC% bins for z-score.
    ncpus : int, optional
        Number of cores to use.

    Yields
    ------
    result : dict
        Dictionary with results where keys are motif ids and the values are
        dictionaries with metric name and value pairs.
    """
    if not stats:
        stats = rocmetrics.__all__

    if fg_table is None:
        if fg_file is None:
            raise ValueError("Need either fg_table or fg_file argument")
    elif fg_file is not None:
        raise ValueError("Need either fg_table or fg_file argument, not both")

    if bg_table is None:
        if bg_file is None:
            raise ValueError("Need either bg_table or bg_file argument")
    elif bg_file is not None:
        raise ValueError("Need either bg_table or bg_file argument, not both")

    if fg_table is not None or bg_table is not None:
        remove_stats = []
        for s in stats:
            func = getattr(rocmetrics, s)
            if func.input_type == "pos":
                remove_stats.append(s)
        if len(remove_stats) != 0:
            logger.warning(
                "Cannot calculate stats that require position from table of motif scores."
            )
            logger.warning(
                f"Skipping the following statistics: {', '.join(remove_stats)}"
            )
            stats = [s for s in stats if s not in remove_stats]

    if isinstance(motifs, Motif):
        all_motifs = [motifs]
    elif isinstance(motifs, list):
        all_motifs = motifs
    else:
        motifs = pfmfile_location(motifs)
        all_motifs = read_motifs(motifs, fmt="pwm")

    if fg_table is not None or bg_table is not None:
        # Restrict to motifs present in both scan tables
        filtered_motifs = pd.read_csv(
            fg_table, sep="\t", index_col=0, nrows=1, comment="#"
        ).columns
        filtered_motifs = filtered_motifs.intersection(
            pd.read_csv(bg_table, sep="\t", index_col=0, nrows=1, comment="#").columns
        )
        all_motifs = [m for m in all_motifs if m.id in filtered_motifs]

    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params()["ncpus"])

    if fg_file is not None or bg_file is not None:
        if zscore or gc:
            # Precalculate mean and stddev for z-score calculation
            s = Scanner(ncpus=ncpus)
            s.set_motifs(all_motifs)
            s.set_genome(genome)
            s.set_meanstd(gc=gc)

    chunksize = 240
    for i in range(0, len(all_motifs), chunksize):
        result = {}
        logger.debug(
            "chunk %s of %s",
            (i / chunksize) + 1,
            len(all_motifs) // chunksize + 1,
        )
        motifs = all_motifs[i : i + chunksize]

        if fg_table is None:
            fg_total = scan_to_best_match(
                fg_file, motifs, ncpus=ncpus, genome=genome, zscore=zscore, gc=gc
            )
        else:
            fg_total = pd.read_csv(
                fg_table, sep="\t", usecols=[m.id for m in motifs], comment="#"
            ).to_dict(orient="list")
            for m in fg_total:
                fg_total[m] = [(x, None) for x in fg_total[m]]

        if bg_table is None:
            bg_total = scan_to_best_match(
                bg_file, motifs, ncpus=ncpus, genome=genome, zscore=zscore, gc=gc
            )
        else:
            bg_total = pd.read_csv(
                bg_table, sep="\t", usecols=[m.id for m in motifs], comment="#"
            ).to_dict(orient="list")
            for m in bg_total:
                bg_total[m] = [(x, None) for x in bg_total[m]]

        logger.debug("calculating statistics")

        if ncpus == 1:
            it = _single_stats(motifs, stats, fg_total, bg_total)
        else:
            it = _mp_stats(motifs, stats, fg_total, bg_total, ncpus)

        for motif_id, s, ret in it:
            if motif_id not in result:
                result[motif_id] = {}
            result[motif_id][s] = ret

        yield result
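Since `calc_stats_iterator` is a generator that yields one dict per chunk of motifs, results are typically merged across iterations; a sketch with assumed FASTA inputs:

from gimmemotifs.stats import calc_stats_iterator

final = {}
for batch in calc_stats_iterator(
    fg_file="peaks.fa",           # assumed positive sequences
    bg_file="background.fa",      # assumed negative sequences
    stats=["roc_auc", "pr_auc"],  # metric names from gimmemotifs.rocmetrics
    genome="hg38",
    ncpus=4,
):
    final.update(batch)  # motif id -> {metric: value}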
def __init__(self, genome, bed, pfmfile=None, ncore=1, verbose=True):
    self.genome = genome
    # putative enhancer regions in format chr:start-end (in column 0 with header)
    self.bed = bed
    self.pfm_file = pfmfile_location(pfmfile)
    self.ncore = ncore
    self.verbose = verbose
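Only `__init__` is shown, so the class name below is hypothetical; a sketch of constructing it:

# Sketch: class name is assumed for illustration.
scanner = ScoreMotifs(
    genome="hg38",
    bed="enhancers.bed",  # putative enhancer regions, chr:start-end in column 0
    ncore=4,
)
print(scanner.pfm_file)  # resolved (default) PFM location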
def run_maelstrom(
    infile,
    genome,
    outdir,
    pfmfile=None,
    filter_redundant=True,
    filter_cutoff=0.8,
    plot=True,
    cluster=False,
    score_table=None,
    count_table=None,
    methods=None,
    ncpus=None,
    zscore=True,
    gc=True,
    center=False,
    aggregation="int_stouffer",
):
    """Run maelstrom on an input table.

    Parameters
    ----------
    infile : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.
    outdir : str
        Output directory for all results.
    pfmfile : str, optional
        Specify a PFM file for scanning.
    filter_redundant : bool, optional
        Create a non-redundant set of motifs based on correlation of motif
        scores in the input data.
    filter_cutoff : float, optional
        Cutoff to use for non-redundant motif selection. Default is 0.8.
    plot : bool, optional
        Create heatmaps.
    cluster : bool, optional
        If True and if the input table has more than one column, the data is
        clustered and the cluster activity methods are also run. Not well-tested.
    score_table : str, optional
        Filename of pre-calculated table with motif scores.
    count_table : str, optional
        Filename of pre-calculated table with motif counts.
    methods : list, optional
        Activity methods to use. By default all are used.
    ncpus : int, optional
        If defined this specifies the number of cores to use.
    zscore : bool, optional
        Use z-score normalized motif scores.
    gc : bool, optional
        Use GC% bins to normalize motif scores.
    center : bool, optional
        Mean-center the input table.
    aggregation : str, optional
        How to combine scores of the predictors. The default is "int_stouffer",
        for inverse normal transform followed by Stouffer's method to combine
        z-scores. Alternatively, "stuart" performs rank aggregation and reports
        the -log10 of the rank aggregation p-value.
    """
    logger.info("Starting maelstrom")
    if infile.endswith("feather"):
        df = pd.read_feather(infile)
        df = df.set_index(df.columns[0])
    else:
        df = pd.read_table(infile, index_col=0, comment="#")

    # Check if the input is mean-centered
    if df.shape[1] > 1 and not np.allclose(df.mean(1), 0):
        if center:
            logger.info(
                "Input is not mean-centered, setting the mean of all rows to 0."
            )
            logger.info(
                "Use --nocenter if you know what you're doing and want to change this behavior."
            )
            logger.info(
                "Note that if you use count data (ChIP-seq, ATAC-seq) we recommend to "
                "first transform your data, for instance using log2(), and to normalize "
                "between samples. To create a table suitable for maelstrom you can use the "
                "coverage_table script included with GimmeMotifs."
            )
            df = df.sub(df.mean(axis=1), axis=0)
        else:
            logger.info("Input is not mean-centered, but --nocenter was specified.")
            logger.info(
                "Leaving the data as-is, but make sure this is what you really want."
            )

    # Check for duplicates
    if df.index.duplicated(keep=False).any():
        logger.warning("Input file contains duplicate regions!")
        logger.warning("These will be removed.")
        df = df.iloc[~df.index.duplicated(keep=False)]

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if methods is None:
        methods = Moap.list_predictors()
    methods = [m.lower() for m in methods]

    df.to_csv(os.path.join(outdir, "input.table.txt"), sep="\t")
    infile = os.path.join(outdir, "input.table.txt")

    # Copy the motif information
    pfmfile = pfmfile_location(pfmfile)
    if pfmfile:
        shutil.copy2(pfmfile, outdir)
        mapfile = re.sub(".p[fw]m$", ".motif2factors.txt", pfmfile)
        if os.path.exists(mapfile):
            shutil.copy2(mapfile, outdir)

    # Create a file with the number of motif matches
    if count_table is None:
        count_table = os.path.join(outdir, "motif.count.txt.gz")
        if not os.path.exists(count_table):
            logger.info("motif scanning (counts)")
            counts = scan_regionfile_to_table(
                infile,
                genome,
                "count",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
            counts.to_csv(count_table, sep="\t", compression="gzip")
    else:
        logger.info("Counts, using: %s", count_table)

    # Create a file with the score of the best motif match
    if score_table is None:
        score_table = os.path.join(outdir, "motif.score.txt.gz")
        if not os.path.exists(score_table):
            logger.info("motif scanning (scores)")
            scores = scan_regionfile_to_table(
                infile,
                genome,
                "score",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
            scores.to_csv(
                score_table, sep="\t", float_format="%.3f", compression="gzip"
            )
    else:
        logger.info("Scores, using: %s", score_table)

    counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t")
    scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t")

    if filter_redundant:
        logger.info("Selecting non-redundant motifs")
        fa = FeatureAgglomeration(
            distance_threshold=filter_cutoff,
            n_clusters=None,
            affinity="correlation",
            linkage="complete",
            compute_full_tree=True,
        )
        fa.fit(scores)
        X_cluster = pd.DataFrame({"motif": scores.columns, "label": fa.labels_})
        X_cluster = X_cluster.join(scores.var().to_frame(name="var"), on="motif")
        # Per cluster, keep the motif with the highest score variance
        selected_motifs = (
            X_cluster.sort_values("var")
            .drop_duplicates(subset=["label"], keep="last")["motif"]
            .values
        )

        nr_motif = (
            X_cluster.sort_values("var")
            .drop_duplicates(subset=["label"], keep="last")[["label", "motif"]]
            .set_index("label")
        )
        X_cluster = X_cluster.join(nr_motif, rsuffix="_nr", on="label")
        motif_map = X_cluster[["motif", "motif_nr"]].set_index("motif")

        scores = scores[selected_motifs]
        counts = counts[selected_motifs]
        score_table = os.path.join(outdir, "motif.nr.score.txt.gz")
        scores.to_csv(score_table, sep="\t", compression="gzip")
        count_table = os.path.join(outdir, "motif.nr.count.txt.gz")
        counts.to_csv(count_table, sep="\t", compression="gzip")

        m2f = pd.read_table(os.path.join(outdir, mapfile), comment="#")
        m2f = m2f.join(motif_map, on="Motif")
        m2f.loc[m2f["Motif"] != m2f["motif_nr"], "Curated"] = "N"
        m2f["Motif"] = m2f["motif_nr"]
        m2f = m2f.drop(columns=["motif_nr"])

        motifs = read_motifs(pfmfile)
        pfmfile = os.path.join(outdir, "nonredundant.motifs.pfm")
        with open(pfmfile, "w") as f:
            for motif in motifs:
                f.write(f"{motif.to_pfm()}\n")

        mapfile = pfmfile.replace(".pfm", ".motif2factors.txt")
        with open(mapfile, "w") as f:
            f.write(
                "# Note: this mapping is specifically created for this non-redundant set of motifs.\n"
            )
            f.write(
                "# It also includes factors for motifs that were similar, but this can be\n"
            )
            f.write("# specific to this analysis.\n")
        with open(mapfile, "a") as f:
            m2f.to_csv(f, index=False, sep="\t")

        logger.info(f"Selected {len(selected_motifs)} motifs")
        logger.info(f"Motifs: {pfmfile}")
        logger.info(f"Factor mappings: {mapfile}")

    if cluster:
        cluster = False
        for method in methods:
            m = Moap.create(method, ncpus=ncpus)
            if m.ptype == "classification":
                cluster = True
                break
        if not cluster:
            logger.info("Skipping clustering, no classification methods")

    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        for method in Moap.list_regression_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, infile])
                logger.debug("Adding %s", method)

        if cluster:
            clusterfile = os.path.join(
                outdir, os.path.basename(infile) + ".cluster.txt"
            )

            df[:] = scale(df, axis=0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            for name in names:
                df_changed.loc[
                    (df[name] - df.loc[:, df.columns != name].max(1)) > 0.5, "cluster"
                ] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")

    if df.shape[1] == 1 or cluster:
        for method in Moap.list_classification_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, clusterfile])

    if len(exps) == 0:
        logger.error("No method to run.")
        sys.exit(1)

    for method, scoring, fname in exps:
        try:
            if scoring == "count":
                moap_with_table(
                    fname, count_table, outdir, method, scoring, ncpus=ncpus
                )
            elif scoring == "score":
                moap_with_table(
                    fname, score_table, outdir, method, scoring, ncpus=ncpus
                )
        except Exception as e:
            logger.warning("Method %s with scoring %s failed", method, scoring)
            logger.warning(e)
            logger.warning("Skipping")
            raise

    dfs = {}
    for method, scoring, fname in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(outdir, "activity.{}.{}.out.txt".format(method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except FileNotFoundError:
            logger.warning("Activity file for {} not found!\n".format(t))

    if len(methods) > 1:
        logger.info("Rank aggregation")
        df_p = df_rank_aggregation(df, dfs, exps, method=aggregation)

        # Add percentage of input sequences with motif
        if df.shape[1] > 1:
            df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100
        else:
            bla = counts.join(df).groupby(df.columns[0]).mean() * 100
            bla = bla.T
            bla = bla.rename(
                columns={col: f"{col} % with motif" for col in bla.columns}
            )
            df_p = df_p.join(bla)

        if df.shape[1] > 1:
            # Add correlation between motif score and signal
            logger.info("Correlation")
            for col in df.columns:
                df_p[f"corr {col}"] = 0
                for motif in df_p.index:
                    df_p.loc[motif, f"corr {col}"] = pearsonr(
                        df[col], scores[motif]
                    )[0]

        df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t")

    # Write motif frequency table
    if df.shape[1] == 1:
        mcount = df.join(counts)
        m_group = mcount.groupby(df.columns[0])
        freq = m_group.sum() / m_group.count()
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot and len(methods) > 1:
        logger.info("html report")
        maelstrom_html_report(outdir, os.path.join(outdir, "final.out.txt"), pfmfile)
        logger.info(os.path.join(outdir, "gimme.maelstrom.report.html"))