def _rank_agg_column(exps, dfs, e):
    """Aggregate motif ranks for column *e* across all experiment results.

    Builds two rank tables — one from descending and one from ascending
    sorts of each activity table — aggregates each with ``rankagg`` and
    returns the combined signed -log10 score.
    """
    ranked = [pd.DataFrame(), pd.DataFrame()]
    for idx, ascending in enumerate([False, True]):
        for method, scoring, fname in exps:
            key = "{}.{}".format(method, scoring)
            if key not in dfs:
                continue
            ranked[idx][key] = dfs[key].sort_values(
                e, ascending=ascending).index.values
    return -np.log10(rankagg(ranked[0])) + np.log10(rankagg(ranked[1]))
def _rank_agg_column(exps, dfs, e):
    """Combine aggregated ranks of column *e* over all activity tables.

    One rank table is built from descending sorts and one from ascending
    sorts; the result is -log10 of the first aggregation plus +log10 of
    the second.
    """
    down = pd.DataFrame()
    up = pd.DataFrame()
    for ascending, table in ((False, down), (True, up)):
        for method, scoring, fname in exps:
            key = "{}.{}".format(method, scoring)
            if key in dfs:
                table[key] = dfs[key].sort_values(
                    e, ascending=ascending).index.values
    return -np.log10(rankagg(down)) + np.log10(rankagg(up))
def _rank_agg_column(exps, dfs, e):
    """Aggregate motif ranks of column *e* over all available tables.

    Returns -log10 of the aggregated descending ranks plus +log10 of the
    aggregated ascending ranks.
    """
    tables = {False: pd.DataFrame(), True: pd.DataFrame()}
    for ascending, table in tables.items():
        for method, scoring, _ in exps:
            key = "{}.{}".format(method, scoring)
            if key not in dfs:
                continue
            # Shuffle rows before sorting so that ties end up in a random
            # order rather than the inherent order of the motif dataframe.
            shuffled = dfs[key].sample(frac=1)
            table[key] = shuffled.sort_values(
                e, ascending=ascending).index.values
    return -np.log10(rankagg(tables[False])) + np.log10(rankagg(tables[True]))
def test2_rankagg(self):
    """Check the Python rank aggregation against the reference output."""
    observed = rankagg(pd.read_table(self.rank_in, index_col=0)).values
    expected = pd.read_table(
        self.rank_out, index_col=0)["p.adjust"].values
    for ref_val, res_val in zip(expected, observed):
        self.assertAlmostEqual(ref_val, res_val)
def test2_rankagg(self):
    """Check the Python rank aggregation against the reference scores."""
    observed = rankagg(
        pd.read_csv(self.rank_in, index_col=0, sep="\t"))["score"].values
    expected = pd.read_csv(
        self.rank_out, index_col=0, sep="\t")["score"].values
    for ref_val, res_val in zip(expected, observed):
        self.assertAlmostEqual(ref_val, res_val)
def run_maelstrom(infile, genome, outdir, cluster=True, score_table=None, count_table=None): if not os.path.exists(outdir): os.mkdir(outdir) if not count_table: counts = scan_to_table(infile, genome, outdir, "count") count_table = os.path.join(outdir, "motif.count.txt") counts.to_csv(count_table, sep="\t") if not score_table: scores = scan_to_table(infile, genome, outdir, "score") score_table = os.path.join(outdir, "motif.score.txt") scores.to_csv(score_table, sep="\t") df = pd.read_table(infile, index_col=0) # Drop duplicate indices, doesn't work very well downstream df = df.loc[df.index.drop_duplicates(keep=False)] exps = [] clusterfile = infile if df.shape[1] != 1: # More than one column exps += [ ("mara", "count", infile), ("lasso", "score", infile), ] if cluster: clusterfile = os.path.join(outdir, os.path.basename(infile) + ".cluster.txt") df = df.apply(scale, 0) names = df.columns df_changed = pd.DataFrame(index=df.index) df_changed["cluster"] = np.nan for name in names: df_changed.loc[(df[name] - df.loc[:,df.columns != name].max(1)) > 0.5, "cluster"] = name df_changed.dropna().to_csv(clusterfile, sep="\t") if df.shape[1] == 1 or cluster: exps += [ ("rf", "score", clusterfile), ("classic", "count", clusterfile), ("mwu", "score", clusterfile), ("lightning", "score", clusterfile), ] for method, scoring, fname in exps: try: sys.stderr.write("Running {} with {}\n".format(method,scoring)) if scoring == "count" and count_table: moap_with_table(fname, count_table, outdir, method, scoring) elif scoring == "score" and score_table: moap_with_table(fname, score_table, outdir, method, scoring) else: moap_with_bg(fname, genome, outdir, method, scoring) except Exception as e: sys.stderr.write( "Method {} with scoring {} failed\n{}\nSkipping\n".format( method, scoring, e) ) dfs = {} for method, scoring,fname in exps: t = "{}.{}".format(method,scoring) fname = os.path.join(outdir, "activity.{}.{}.out.txt".format( method, scoring)) try: dfs[t] = pd.read_table(fname, 
index_col=0, comment="#") except: sys.stderr.write("Activity file for {} not found!\n".format(t)) df_p = pd.DataFrame(index=dfs.values()[0].index) names = dfs.values()[0].columns for e in names: df_tmp = pd.DataFrame() for method,scoring,fname in exps: k = "{}.{}".format(method, scoring) v = dfs[k] df_tmp[k] = v.sort_values(e, ascending=False).index.values df_p[e] = rankagg(df_tmp) df_p[names] = -np.log10(df_p[names]) df_p.to_csv(os.path.join(outdir, "final.out.csv"), sep="\t")
def test1_rankagg(self):
    """ Test rank aggregation """
    df = pd.read_table(self.fname, index_col=0)
    result = rankagg(df)
    # BUG FIX: assertEquals is a deprecated alias (removed in
    # Python 3.12); use assertEqual instead.
    self.assertEqual("AP2", result.sort_values().index[0])
def test1_rankagg(self):
    """Rank aggregation should rank AP2 as the top motif."""
    aggregated = rankagg(pd.read_table(self.fname, index_col=0))
    top_motif = aggregated.sort_values().index[0]
    self.assertEqual("AP2", top_motif)
def run_maelstrom(infile, genome, outdir, pwmfile=None, plot=True,
                  cluster=True, score_table=None, count_table=None):
    """Run motif activity (maelstrom) analysis on the input file.

    Scans the input regions for motifs (unless precomputed count/score
    tables are given), runs a set of motif activity methods and
    aggregates their rankings into one final output table.

    Parameters
    ----------
    infile : str
        Input file with regions as index (label column or value matrix).
    genome : str
        Genome name, passed to the motif scanner.
    outdir : str
        Output directory; created if it does not exist.
    pwmfile : str, optional
        Motif/PWM file to scan with.
    plot : bool, optional
        Create a visualization of the results, default True.
    cluster : bool, optional
        Derive a hard clustering from a multi-column input, default True.
    score_table : str, optional
        Precomputed motif score table; computed if not given.
    count_table : str, optional
        Precomputed motif count table; computed if not given.

    Raises
    ------
    ValueError
        If the input file contains duplicate regions.
    """
    df = pd.read_table(infile, index_col=0)

    # Check for duplicates
    if df.index.duplicated(keep=False).any():
        raise ValueError("Input file contains duplicate regions! "
                         "Please remove them.")

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Create a file with the number of motif matches
    if not count_table:
        counts = scan_to_table(infile, genome, outdir, "count",
                               pwmfile=pwmfile)
        count_table = os.path.join(outdir, "motif.count.txt.gz")
        counts.to_csv(count_table, sep="\t", compression="gzip")

    # Create a file with the score of the best motif match
    if not score_table:
        scores = scan_to_table(infile, genome, outdir, "score",
                               pwmfile=pwmfile)
        score_table = os.path.join(outdir, "motif.score.txt.gz")
        scores.to_csv(score_table, sep="\t", float_format="%.3f",
                      compression="gzip")

    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column: add regression-based methods on raw input
        exps += [
            ("mara", "count", infile),
            ("lasso", "score", infile),
        ]

        if cluster:
            # Hard clustering: a region is assigned to a column when its
            # scaled value exceeds the best other column by more than 0.5
            clusterfile = os.path.join(
                outdir, os.path.basename(infile) + ".cluster.txt")
            df = df.apply(scale, 0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            for name in names:
                df_changed.loc[
                    (df[name] - df.loc[:, df.columns != name].max(1)) > 0.5,
                    "cluster"] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")

    if df.shape[1] == 1 or cluster:
        exps += [
            ("rf", "score", clusterfile),
            ("classic", "count", clusterfile),
            ("mwu", "score", clusterfile),
            ("lightning", "score", clusterfile),
        ]

    # Run all methods; a failing method is logged and skipped
    for method, scoring, fname in exps:
        try:
            sys.stderr.write("Running {} with {}\n".format(method, scoring))
            if scoring == "count" and count_table:
                moap_with_table(fname, count_table, outdir, method, scoring)
            elif scoring == "score" and score_table:
                moap_with_table(fname, score_table, outdir, method, scoring)
            else:
                moap_with_bg(fname, genome, outdir, method, scoring,
                             pwmfile=pwmfile)
        except Exception as e:
            sys.stderr.write(
                "Method {} with scoring {} failed\n{}\nSkipping\n".format(
                    method, scoring, e))

    # Collect the activity tables that were actually produced
    dfs = {}
    for method, scoring, fname in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(
            outdir, "activity.{}.{}.out.txt".format(method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except Exception:  # was a bare except; narrowed, behavior unchanged
            sys.stderr.write("Activity file for {} not found!\n".format(t))

    # BUG FIX: dict.values() is a view in Python 3 and is not subscriptable
    first = list(dfs.values())[0]
    df_p = pd.DataFrame(index=first.index)
    names = first.columns
    for e in names:
        df_tmp = pd.DataFrame()
        for method, scoring, fname in exps:
            k = "{}.{}".format(method, scoring)
            if k in dfs:
                v = dfs[k]
                df_tmp[k] = v.sort_values(e, ascending=False).index.values
        df_p[e] = rankagg(df_tmp)
    df_p[names] = -np.log10(df_p[names])
    df_p.to_csv(os.path.join(outdir, "final.out.csv"), sep="\t")

    # Write motif frequency table (only meaningful for a single label column)
    if df.shape[1] == 1:
        mcount = df.join(pd.read_table(count_table, index_col=0))
        m_group = mcount.groupby(df.columns[0])
        freq = m_group.sum() / m_group.count()
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot:
        visualize_maelstrom(outdir, pwmfile=pwmfile)
def test1_rankagg(self):
    """Rank aggregation should put AP2 at the top of the score ranking."""
    aggregated = rankagg(pd.read_csv(self.fname, index_col=0, sep="\t"))
    top_motif = aggregated.sort_values("score").index[0]
    self.assertEqual("AP2", top_motif)
def run_maelstrom(infile, genome, outdir, plot=True, cluster=True,
                  score_table=None, count_table=None):
    """Run motif activity (maelstrom) analysis on the input file.

    Scans the input regions for motifs (unless precomputed count/score
    tables are given), runs a set of motif activity methods and
    aggregates their rankings into one final output table.

    Parameters
    ----------
    infile : str
        Input file with regions as index (label column or value matrix).
    genome : str
        Genome name, passed to the motif scanner.
    outdir : str
        Output directory; created if it does not exist.
    plot : bool, optional
        Create a visualization of the results, default True.
    cluster : bool, optional
        Derive a hard clustering from a multi-column input, default True.
    score_table : str, optional
        Precomputed motif score table; computed if not given.
    count_table : str, optional
        Precomputed motif count table; computed if not given.
    """
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Create a motif count table unless one was supplied
    if not count_table:
        counts = scan_to_table(infile, genome, outdir, "count")
        count_table = os.path.join(outdir, "motif.count.txt")
        counts.to_csv(count_table, sep="\t")

    # Create a motif score table unless one was supplied
    if not score_table:
        scores = scan_to_table(infile, genome, outdir, "score")
        score_table = os.path.join(outdir, "motif.score.txt")
        scores.to_csv(score_table, sep="\t", float_format="%.3f")

    df = pd.read_table(infile, index_col=0)
    # Drop duplicate indices, doesn't work very well downstream
    df = df.loc[df.index.drop_duplicates(keep=False)]

    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column: add regression-based methods on raw input
        exps += [
            ("mara", "count", infile),
            ("lasso", "score", infile),
        ]

        if cluster:
            # Hard clustering: a region is assigned to a column when its
            # scaled value exceeds the best other column by more than 0.5
            clusterfile = os.path.join(
                outdir, os.path.basename(infile) + ".cluster.txt")
            df = df.apply(scale, 0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            for name in names:
                df_changed.loc[
                    (df[name] - df.loc[:, df.columns != name].max(1)) > 0.5,
                    "cluster"] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")

    if df.shape[1] == 1 or cluster:
        exps += [
            ("rf", "score", clusterfile),
            ("classic", "count", clusterfile),
            ("mwu", "score", clusterfile),
            ("lightning", "score", clusterfile),
        ]

    # Run all methods; a failing method is logged and skipped
    for method, scoring, fname in exps:
        try:
            sys.stderr.write("Running {} with {}\n".format(method, scoring))
            if scoring == "count" and count_table:
                moap_with_table(fname, count_table, outdir, method, scoring)
            elif scoring == "score" and score_table:
                moap_with_table(fname, score_table, outdir, method, scoring)
            else:
                moap_with_bg(fname, genome, outdir, method, scoring)
        except Exception as e:
            sys.stderr.write(
                "Method {} with scoring {} failed\n{}\nSkipping\n".format(
                    method, scoring, e))

    # Collect the activity tables that were actually produced
    dfs = {}
    for method, scoring, fname in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(
            outdir, "activity.{}.{}.out.txt".format(method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except Exception:  # was a bare except; narrowed, behavior unchanged
            sys.stderr.write("Activity file for {} not found!\n".format(t))

    # BUG FIX: dict.values() is a view in Python 3 and is not subscriptable
    first = list(dfs.values())[0]
    df_p = pd.DataFrame(index=first.index)
    names = first.columns
    for e in names:
        df_tmp = pd.DataFrame()
        for method, scoring, fname in exps:
            k = "{}.{}".format(method, scoring)
            # BUG FIX: skip methods whose activity file failed to load
            # (failures are explicitly tolerated above) instead of
            # raising a KeyError here
            if k in dfs:
                v = dfs[k]
                df_tmp[k] = v.sort_values(e, ascending=False).index.values
        df_p[e] = rankagg(df_tmp)
    df_p[names] = -np.log10(df_p[names])
    df_p.to_csv(os.path.join(outdir, "final.out.csv"), sep="\t")

    # Write motif frequency table.
    # BUG FIX: the original grouped by the literal column name "cluster",
    # which is only present when the input happens to use that name (and
    # never present for a scaled multi-column input) — group by the single
    # label column instead, and only when the input has exactly one column.
    if df.shape[1] == 1:
        mcount = df.join(pd.read_table(count_table, index_col=0))
        m_group = mcount.groupby(df.columns[0])
        freq = m_group.sum() / m_group.count()
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot:
        visualize_maelstrom(outdir)