Esempio n. 1
0
def _rank_agg_column(exps, dfs, e):
    """Aggregate the per-method rankings of column ``e`` in both directions.

    For every ``(method, scoring, fname)`` entry of ``exps`` whose
    ``"method.scoring"`` key is present in ``dfs``, the index labels of that
    table are ordered by column ``e`` -- once descending, once ascending.
    Each collection of orderings is handed to ``rankagg``.

    Returns ``-log10(rankagg(descending)) + log10(rankagg(ascending))``.
    """
    def _orderings(ascending):
        # One column per available method, holding the index labels of that
        # method's table sorted by column ``e``.
        ordered = pd.DataFrame()
        for method, scoring, _ in exps:
            key = "{}.{}".format(method, scoring)
            if key not in dfs:
                continue
            table = dfs[key]
            ordered[key] = table.sort_values(
                e, ascending=ascending).index.values
        return ordered

    descending = _orderings(False)
    ascending = _orderings(True)
    return -np.log10(rankagg(descending)) + np.log10(rankagg(ascending))
Esempio n. 2
0
def _rank_agg_column(exps, dfs, e):
    """Combine descending and ascending rank aggregation for column ``e``.

    ``exps`` yields ``(method, scoring, fname)`` tuples; only entries whose
    ``"method.scoring"`` key exists in ``dfs`` contribute. The result is
    ``-log10`` of the aggregation over the descending orderings plus
    ``log10`` of the aggregation over the ascending orderings.
    """
    # Keyed by the ``ascending`` flag passed to sort_values.
    by_direction = {}
    for ascending in (False, True):
        frame = pd.DataFrame()
        for method, scoring, _fname in exps:
            key = "{}.{}".format(method, scoring)
            if key in dfs:
                ranked = dfs[key].sort_values(e, ascending=ascending)
                frame[key] = ranked.index.values
        by_direction[ascending] = frame

    high_first = rankagg(by_direction[False])
    low_first = rankagg(by_direction[True])
    return -np.log10(high_first) + np.log10(low_first)
Esempio n. 3
0
def _rank_agg_column(exps, dfs, e):
    """Rank-aggregate column ``e`` over all available methods.

    Each table in ``dfs`` that matches a ``"method.scoring"`` key derived
    from ``exps`` is shuffled and then sorted by column ``e``, first in
    descending and then in ascending order. The two sets of orderings are
    aggregated with ``rankagg`` and combined as
    ``-log10(descending) + log10(ascending)``.
    """
    frames = []
    for ascending in (False, True):
        ranking = pd.DataFrame()
        for method, scoring, _ in exps:
            label = "{}.{}".format(method, scoring)
            if label not in dfs:
                continue
            # Shuffle the rows first: without it, tied values would keep the
            # inherent order of the motif dataframe instead of a random one.
            shuffled = dfs[label].sample(frac=1)
            ordered = shuffled.sort_values(e, ascending=ascending)
            ranking[label] = ordered.index.values
        frames.append(ranking)

    high_to_low, low_to_high = frames
    return -np.log10(rankagg(high_to_low)) + np.log10(rankagg(low_to_high))
Esempio n. 4
0
 def test2_rankagg(self):
     """Compare the Python rank aggregation with stored reference values."""
     ranks = pd.read_table(self.rank_in, index_col=0)
     observed = rankagg(ranks).values
     reference = pd.read_table(self.rank_out, index_col=0)
     expected = reference["p.adjust"].values
     # Values are compared pairwise, in file order.
     for want, got in zip(expected, observed):
         self.assertAlmostEqual(want, got)
Esempio n. 5
0
 def test2_rankagg(self):
     """The aggregated values must match the reference "p.adjust" column."""
     computed = rankagg(pd.read_table(self.rank_in, index_col=0)).values
     expected = pd.read_table(self.rank_out, index_col=0)["p.adjust"].values
     for pair in zip(expected, computed):
         # pair is (reference value, computed value)
         self.assertAlmostEqual(*pair)
Esempio n. 6
0
 def test2_rankagg(self):
     """Check the "score" column of rankagg against the reference file."""
     ranks = pd.read_csv(self.rank_in, index_col=0, sep="\t")
     aggregated = rankagg(ranks)
     observed = aggregated["score"].values
     reference = pd.read_csv(self.rank_out, index_col=0, sep="\t")
     expected = reference["score"].values
     # Values are compared pairwise, in file order.
     for want, got in zip(expected, observed):
         self.assertAlmostEqual(want, got)
Esempio n. 7
0
def run_maelstrom(infile, genome, outdir, cluster=True,
        score_table=None, count_table=None):
    """Run the motif activity methods on ``infile`` and aggregate their ranks.

    Parameters
    ----------
    infile : str
        Tab-separated input table; its first column is used as the index.
        Rows whose index label occurs more than once are dropped.
    genome : str
        Passed through to ``scan_to_table`` and ``moap_with_bg``.
    outdir : str
        Output directory; created when it does not exist.
    cluster : bool, optional
        For multi-column input, additionally derive a cluster file and run
        the cluster-based methods on it.
    score_table, count_table : str, optional
        Existing motif score / count tables. When not given they are
        generated with ``scan_to_table`` and written to ``outdir``.

    Raises
    ------
    ValueError
        If none of the activity files could be read.

    The aggregated result is written to ``final.out.csv`` in ``outdir``.
    """
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if not count_table:
        counts = scan_to_table(infile, genome, outdir, "count")
        count_table = os.path.join(outdir, "motif.count.txt")
        counts.to_csv(count_table, sep="\t")

    if not score_table:
        scores = scan_to_table(infile, genome, outdir, "score")
        score_table = os.path.join(outdir, "motif.score.txt")
        scores.to_csv(score_table, sep="\t")

    df = pd.read_table(infile, index_col=0)

    # Drop duplicate indices, doesn't work very well downstream
    df = df.loc[df.index.drop_duplicates(keep=False)]
    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        exps += [
                ("mara", "count", infile),
                ("lasso", "score", infile),
                ]

        if cluster:
            clusterfile = os.path.join(outdir,
                    os.path.basename(infile) + ".cluster.txt")
            df = df.apply(scale, 0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            # A row is assigned to a column when its scaled value exceeds
            # the maximum of all other columns by more than 0.5.
            for name in names:
                others_max = df.loc[:, df.columns != name].max(1)
                df_changed.loc[(df[name] - others_max) > 0.5, "cluster"] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")
    if df.shape[1] == 1 or cluster:
        exps += [
                ("rf", "score", clusterfile),
                ("classic", "count", clusterfile),
                ("mwu", "score", clusterfile),
                ("lightning", "score", clusterfile),
                ]

    for method, scoring, fname in exps:
        try:
            sys.stderr.write("Running {} with {}\n".format(method, scoring))
            if scoring == "count" and count_table:
                moap_with_table(fname, count_table, outdir, method, scoring)
            elif scoring == "score" and score_table:
                moap_with_table(fname, score_table, outdir, method, scoring)
            else:
                # Both tables are always set above, so this is only a
                # fallback.
                moap_with_bg(fname, genome, outdir, method, scoring)
        except Exception as e:
            # A failing method is reported and skipped, not fatal.
            sys.stderr.write(
                    "Method {} with scoring {} failed\n{}\nSkipping\n".format(
                        method, scoring, e)
                    )

    dfs = {}
    for method, scoring, _ in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(outdir, "activity.{}.{}.out.txt".format(
                           method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except Exception:
            # Best effort: a method that failed above left no readable file.
            sys.stderr.write("Activity file for {} not found!\n".format(t))

    if not dfs:
        raise ValueError(
            "No activity files could be read from {}".format(outdir))

    # dict views cannot be indexed on Python 3; take the first table instead.
    first = next(iter(dfs.values()))
    df_p = pd.DataFrame(index=first.index)
    names = first.columns
    for e in names:
        df_tmp = pd.DataFrame()
        for method, scoring, _ in exps:
            k = "{}.{}".format(method, scoring)
            # Methods without a readable activity file are skipped.
            if k in dfs:
                v = dfs[k]
                df_tmp[k] = v.sort_values(e, ascending=False).index.values

        df_p[e] = rankagg(df_tmp)
    df_p[names] = -np.log10(df_p[names])
    df_p.to_csv(os.path.join(outdir, "final.out.csv"), sep="\t")
Esempio n. 8
0
 def test1_rankagg(self):
     """ Test rank aggregation: AP2 must come first after an ascending sort """
     df = pd.read_table(self.fname, index_col=0)
     result = rankagg(df)
     # assertEquals is a deprecated alias that was removed in Python 3.12.
     self.assertEqual("AP2", result.sort_values().index[0])
Esempio n. 9
0
 def test1_rankagg(self):
     """Rank aggregation should place AP2 first after an ascending sort."""
     table = pd.read_table(self.fname, index_col=0)
     ordered = rankagg(table).sort_values()
     top_label = ordered.index[0]
     self.assertEqual("AP2", top_label)
Esempio n. 10
0
def run_maelstrom(infile, genome, outdir, pwmfile=None, plot=True, cluster=True,
        score_table=None, count_table=None):
    """Run the motif activity methods on ``infile`` and aggregate their ranks.

    Parameters
    ----------
    infile : str
        Tab-separated input table; its first column is used as the index.
    genome : str
        Passed through to ``scan_to_table`` and ``moap_with_bg``.
    outdir : str
        Output directory; created when it does not exist.
    pwmfile : str, optional
        Passed through to ``scan_to_table``, ``moap_with_bg`` and
        ``visualize_maelstrom``.
    plot : bool, optional
        Call ``visualize_maelstrom`` on ``outdir`` at the end.
    cluster : bool, optional
        For multi-column input, additionally derive a cluster file and run
        the cluster-based methods on it.
    score_table, count_table : str, optional
        Existing motif score / count tables. When not given they are
        generated with ``scan_to_table`` and written (gzipped) to ``outdir``.

    Raises
    ------
    ValueError
        If the input contains duplicate index labels, or if none of the
        activity files could be read.

    The aggregated result is written to ``final.out.csv`` in ``outdir``;
    for single-column input ``motif.freq.txt`` is written as well.
    """
    df = pd.read_table(infile, index_col=0)
    # Check for duplicates
    if df.index.duplicated(keep=False).any():
        raise ValueError("Input file contains duplicate regions! "
                         "Please remove them.")

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Create a file with the number of motif matches
    if not count_table:
        counts = scan_to_table(infile, genome, outdir, "count",
                pwmfile=pwmfile)
        count_table = os.path.join(outdir, "motif.count.txt.gz")
        counts.to_csv(count_table, sep="\t", compression="gzip")

    # Create a file with the score of the best motif match
    if not score_table:
        scores = scan_to_table(infile, genome, outdir, "score",
                pwmfile=pwmfile)
        score_table = os.path.join(outdir, "motif.score.txt.gz")
        scores.to_csv(score_table, sep="\t", float_format="%.3f",
                compression="gzip")

    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        exps += [
                ("mara", "count", infile),
                ("lasso", "score", infile),
                ]

        if cluster:
            clusterfile = os.path.join(outdir,
                    os.path.basename(infile) + ".cluster.txt")
            df = df.apply(scale, 0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            # A row is assigned to a column when its scaled value exceeds
            # the maximum of all other columns by more than 0.5.
            for name in names:
                others_max = df.loc[:, df.columns != name].max(1)
                df_changed.loc[(df[name] - others_max) > 0.5, "cluster"] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")
    if df.shape[1] == 1 or cluster:
        exps += [
                ("rf", "score", clusterfile),
                ("classic", "count", clusterfile),
                ("mwu", "score", clusterfile),
                ("lightning", "score", clusterfile),
                ]

    for method, scoring, fname in exps:
        try:
            sys.stderr.write("Running {} with {}\n".format(method, scoring))
            if scoring == "count" and count_table:
                moap_with_table(fname, count_table, outdir, method, scoring)
            elif scoring == "score" and score_table:
                moap_with_table(fname, score_table, outdir, method, scoring)
            else:
                # Both tables are always set above, so this is only a
                # fallback.
                moap_with_bg(fname, genome, outdir, method, scoring,
                             pwmfile=pwmfile)
        except Exception as e:
            # A failing method is reported and skipped, not fatal.
            sys.stderr.write(
                    "Method {} with scoring {} failed\n{}\nSkipping\n".format(
                        method, scoring, e)
                    )

    dfs = {}
    for method, scoring, _ in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(outdir, "activity.{}.{}.out.txt".format(
                           method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except Exception:
            # Best effort: a method that failed above left no readable file.
            sys.stderr.write("Activity file for {} not found!\n".format(t))

    if not dfs:
        raise ValueError(
            "No activity files could be read from {}".format(outdir))

    # dict views cannot be indexed on Python 3; take the first table instead.
    first = next(iter(dfs.values()))
    df_p = pd.DataFrame(index=first.index)
    names = first.columns
    for e in names:
        df_tmp = pd.DataFrame()
        for method, scoring, _ in exps:
            k = "{}.{}".format(method, scoring)
            # Methods without a readable activity file are skipped.
            if k in dfs:
                v = dfs[k]
                df_tmp[k] = v.sort_values(e, ascending=False).index.values

        df_p[e] = rankagg(df_tmp)
    df_p[names] = -np.log10(df_p[names])
    df_p.to_csv(os.path.join(outdir, "final.out.csv"), sep="\t")

    # Write motif frequency table
    if df.shape[1] == 1:
        mcount = df.join(pd.read_table(count_table, index_col=0))
        m_group = mcount.groupby(df.columns[0])
        freq = (m_group.sum() / m_group.count())
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot:
        visualize_maelstrom(outdir, pwmfile=pwmfile)
Esempio n. 11
0
 def test1_rankagg(self):
     """Rank aggregation should put AP2 first when sorted by "score"."""
     ranks = pd.read_csv(self.fname, index_col=0, sep="\t")
     by_score = rankagg(ranks).sort_values("score")
     best = by_score.index[0]
     self.assertEqual("AP2", best)
Esempio n. 12
0
def run_maelstrom(infile,
                  genome,
                  outdir,
                  plot=True,
                  cluster=True,
                  score_table=None,
                  count_table=None):
    """Run the motif activity methods on ``infile`` and aggregate their ranks.

    Parameters
    ----------
    infile : str
        Tab-separated input table; its first column is used as the index.
        Rows whose index label occurs more than once are dropped.
    genome : str
        Passed through to ``scan_to_table`` and ``moap_with_bg``.
    outdir : str
        Output directory; created when it does not exist.
    plot : bool, optional
        Call ``visualize_maelstrom`` on ``outdir`` at the end.
    cluster : bool, optional
        For multi-column input, additionally derive a cluster file and run
        the cluster-based methods on it.
    score_table, count_table : str, optional
        Existing motif score / count tables. When not given they are
        generated with ``scan_to_table`` and written to ``outdir``.

    Raises
    ------
    ValueError
        If none of the activity files could be read.

    The aggregated result is written to ``final.out.csv`` in ``outdir``.
    For single-column input the per-group motif frequency table
    ``motif.freq.txt`` is written as well.
    """
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if not count_table:
        counts = scan_to_table(infile, genome, outdir, "count")
        count_table = os.path.join(outdir, "motif.count.txt")
        counts.to_csv(count_table, sep="\t")

    if not score_table:
        scores = scan_to_table(infile, genome, outdir, "score")
        score_table = os.path.join(outdir, "motif.score.txt")
        scores.to_csv(score_table, sep="\t", float_format="%.3f")

    df = pd.read_table(infile, index_col=0)

    # Drop duplicate indices, doesn't work very well downstream
    df = df.loc[df.index.drop_duplicates(keep=False)]
    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        exps += [
            ("mara", "count", infile),
            ("lasso", "score", infile),
        ]

        if cluster:
            clusterfile = os.path.join(
                outdir,
                os.path.basename(infile) + ".cluster.txt")
            df = df.apply(scale, 0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            # A row is assigned to a column when its scaled value exceeds
            # the maximum of all other columns by more than 0.5.
            for name in names:
                others_max = df.loc[:, df.columns != name].max(1)
                df_changed.loc[(df[name] - others_max) > 0.5, "cluster"] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")
    if df.shape[1] == 1 or cluster:
        exps += [
            ("rf", "score", clusterfile),
            ("classic", "count", clusterfile),
            ("mwu", "score", clusterfile),
            ("lightning", "score", clusterfile),
        ]

    for method, scoring, fname in exps:
        try:
            sys.stderr.write("Running {} with {}\n".format(method, scoring))
            if scoring == "count" and count_table:
                moap_with_table(fname, count_table, outdir, method, scoring)
            elif scoring == "score" and score_table:
                moap_with_table(fname, score_table, outdir, method, scoring)
            else:
                # Both tables are always set above, so this is only a
                # fallback.
                moap_with_bg(fname, genome, outdir, method, scoring)

        except Exception as e:
            # A failing method is reported and skipped, not fatal.
            sys.stderr.write(
                "Method {} with scoring {} failed\n{}\nSkipping\n".format(
                    method, scoring, e))

    dfs = {}
    for method, scoring, _ in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(outdir,
                             "activity.{}.{}.out.txt".format(method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except Exception:
            # Best effort: a method that failed above left no readable file.
            sys.stderr.write("Activity file for {} not found!\n".format(t))

    if not dfs:
        raise ValueError(
            "No activity files could be read from {}".format(outdir))

    # dict views cannot be indexed on Python 3; take the first table instead.
    first = next(iter(dfs.values()))
    df_p = pd.DataFrame(index=first.index)
    names = first.columns
    for e in names:
        df_tmp = pd.DataFrame()
        for method, scoring, _ in exps:
            k = "{}.{}".format(method, scoring)
            # Methods without a readable activity file are skipped.
            if k in dfs:
                v = dfs[k]
                df_tmp[k] = v.sort_values(e, ascending=False).index.values

        df_p[e] = rankagg(df_tmp)
    df_p[names] = -np.log10(df_p[names])
    df_p.to_csv(os.path.join(outdir, "final.out.csv"), sep="\t")

    # Write motif frequency table. Only single-column input has a column to
    # group by; grouping on a hard-coded "cluster" column raised KeyError
    # for any table without a column of that name.
    if df.shape[1] == 1:
        mcount = df.join(pd.read_table(count_table, index_col=0))
        m_group = mcount.groupby(df.columns[0])
        freq = (m_group.sum() / m_group.count())
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot:
        visualize_maelstrom(outdir)