Example #1
def test_jitter():
    df1 = pd.DataFrame({'x': [1, 2, 1, 2],
                        'y': [1, 1, 2, 2]})
    p = (ggplot(df1, aes('x', 'y')) +
         geom_point(size=10) +
         geom_jitter(size=10, color='red', random_state=random_state) +
         geom_jitter(size=10, color='blue', width=0.1,
                     height=0.1, random_state=random_state))
    assert p + _theme == 'jitter'

    with pytest.raises(PlotnineError):
        geom_jitter(position=position_jitter(), width=0.1)
Example #2
def test_jitter():
    df1 = pd.DataFrame({'x': [1, 2, 1, 2], 'y': [1, 1, 2, 2]})
    p = (ggplot(df1, aes('x', 'y')) + geom_point(size=10) +
         geom_jitter(size=10, color='red', random_state=random_state) +
         geom_jitter(size=10,
                     color='blue',
                     width=0.1,
                     height=0.1,
                     random_state=random_state))
    assert p + _theme == 'jitter'

    with pytest.raises(PlotnineError):
        geom_jitter(position=position_jitter(), width=0.1)
def test_annotation_stripes_continuous_transformed():
    pdf = mtcars.assign(am=pd.Categorical(mtcars.am))
    p = (ggplot(pdf) +
         annotation_stripes(fills=["red", "green", "blue"], alpha=0.1) +
         geom_jitter(aes("hp", "wt", color="am"), random_state=5) +
         scale_x_continuous(trans='log2'))
    assert p == "annotation_stripes_continuous_transformed"
def mixed_linear_factors_plot(df, x_axis, factor):
    plotnine.options.figure_size = (10, 10)
    factor_steps = df[factor].unique()
    reg_lines = pd.DataFrame({
        factor: factor_steps,
        'intercept': np.zeros_like(factor_steps),
        'slope': np.zeros_like(factor_steps)
    })
    for i, step in enumerate(factor_steps):
        factored_df = df[df[factor] == step]
        md = smf.mixedlm('mse ~ %s' % x_axis,
                         factored_df,
                         groups=factored_df.index.values)
        mdf = md.fit()
        reg_lines.iloc[i] = [step, mdf.params['Intercept'], mdf.params[x_axis]]

    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)
    reg_lines[factor] = reg_lines[factor].round().astype(int)
    gg = (
        plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='method')) +
        plotnine.geom_jitter(width=2.5, show_legend=False) +
        plotnine.scale_color_manual(['#DB5F57'] * 4) +
        plotnine.facet_wrap(factor) + plotnine.geom_abline(
            plotnine.aes(intercept='intercept', slope='slope'),
            data=reg_lines) + plotnine.theme_classic(base_size=20))
    gg.save('%s_vs_%s_rmse.pdf' % (x_axis, factor))
def test_annotation_stripes_continuous():
    pdf = mtcars.assign(am=pd.Categorical(mtcars.am))
    p = (ggplot(pdf) + annotation_stripes(
        fills=["red", "green", "blue"], alpha=0.4, size=1, linetype="dashed") +
         geom_jitter(aes("gear", "wt", color="am"), random_state=5))

    assert p == "annotation_stripes_continuous"
Example #6
def make_jitter_plot(data, x, y):
    """
    Make a jittered scatter plot between two variables data[x] and data[y].
    """
    # Return the plot instead of building it and discarding the result.
    return (p9.ggplot(data=data,
                      mapping=p9.aes(x=x, y=y))
            + p9.geom_jitter(alpha=0.2)
            + p9.scales.scale_color_cmap(name='viridis'))
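For reference, a hypothetical call of the helper above (the dataframe and column names are made up, and it assumes the version that returns the plot):

import pandas as pd

demo = pd.DataFrame({"height": [1.2, 1.5, 1.7, 1.4, 1.6],
                     "weight": [50, 62, 71, 58, 66]})
print(make_jitter_plot(demo, "height", "weight"))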
Example #7
def _make_plots(df_plt, out_file_base, y='AUC', facet_grid='', h_line=''):
    len_x = len(np.unique(df_plt['resolution']))
    if 'sparsity_l1' in df_plt.columns:
        df_plt['Sparsity'] = df_plt['sparsity_l1']
        len_x2 = len(np.unique(df_plt['Sparsity']))
    else:
        len_x2 = 0
    if len_x2 > 1:
        gplt = plt9.ggplot(df_plt,
                           plt9.aes(
                               fill='Sparsity',
                               x='resolution',
                               y=y,
                           ))
        gplt = gplt + plt9.geom_boxplot(alpha=0.8, outlier_alpha=0)
        gplt = gplt + plt9.geom_jitter(
            plt9.aes(color='Sparsity'), alpha=0.25, width=0.2)
    else:
        gplt = plt9.ggplot(df_plt, plt9.aes(x='resolution', y=y))
        gplt = gplt + plt9.geom_boxplot(alpha=0.8, outlier_alpha=0)
        gplt = gplt + plt9.geom_jitter(alpha=0.25, width=0.2)
    gplt = gplt + plt9.theme_bw(base_size=12)
    if facet_grid != '':
        gplt = gplt + plt9.facet_grid('{} ~ .'.format(facet_grid))
    if y == 'f1-score':
        gplt = gplt + plt9.labs(x='Resolution', y='F1 score', title='')
    elif y in ['AUC', 'MCC']:
        gplt = gplt + plt9.labs(x='Resolution', y=y, title='')
    else:
        gplt = gplt + plt9.labs(
            x='Resolution', y=y.capitalize().replace('_', ' '), title='')
    gplt = gplt + plt9.theme(
        # legend_position='none',
        axis_text_x=plt9.element_text(angle=-45, hjust=0))
    if len_x2 != 0 and len_x2 < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    if h_line != '':
        gplt = gplt + plt9.geom_hline(plt9.aes(yintercept=h_line),
                                      linetype='dashdot')
    gplt.save('{}-resolution__{}.png'.format(out_file_base,
                                             y.replace('-', '_')),
              dpi=300,
              width=4 * ((len_x + len_x2) / 4),
              height=5,
              limitsize=False)
def plot_features(dat):
    import plotnine as p9

    # Parentheses (not braces) so the layered ggplot object is returned,
    # rather than a set containing it.
    p = (
        p9.ggplot(dat, p9.aes('grade', 'features', color='keywords')) +
        p9.geom_jitter(alpha=.5) +
        p9.facet_wrap('feature_id', scales='free_y')
    )

    return p
Example #9
def plot_compare(stats,
                 variant,
                 variant_baseline,
                 metric,
                 mode="identity",
                 jitter=0.01):
    assert mode in ["identity", "ratio", "difference"]
    plotdata = compare_stats(stats, variant, variant_baseline)
    bsw = bsw_table2(plotdata, metric=metric, reltol=1.0)
    display(bsw)
    baseline_name = f"{metric}_baseline"
    plotdata = plotdata[[metric, baseline_name, "dataset"]].assign(
        ratio=plotdata[metric] / plotdata[baseline_name],
        difference=plotdata[metric] - plotdata[baseline_name],
    )

    if mode == "identity":
        return (ggplot(data=plotdata) + geom_jitter(
            aes(x=f"{metric}_baseline", y=metric, fill="dataset"),
            width=jitter,
            height=jitter,
        ) + scale_x_log10() + scale_y_log10() +
                geom_abline(aes(slope=1, intercept=0)))
    elif mode == "ratio":
        return (
            ggplot(data=plotdata) + geom_jitter(
                aes(x=f"{metric}_baseline", y="ratio", fill="dataset"),
                width=jitter,
                height=jitter,
            ) + scale_x_log10() + scale_y_log10()
            ## ablines are drawn wrt the already log-transformed axes. hence 0 = log(1) in scale
            + geom_abline(aes(slope=0, intercept=0.0)) +
            geom_abline(aes(slope=-1, intercept=0.0))  # max
        )
    elif mode == "difference":
        return (ggplot(data=plotdata) + geom_jitter(
            aes(x=f"{metric}_baseline", y="difference", fill="dataset"),
            width=jitter,
            height=jitter,
        ) + scale_x_log10() + scale_y_log10() +
                geom_abline(aes(slope=0, intercept=0)))
    else:
        assert False, "unknown mode"
Example #10
def plot_policy(df):
    trunk = df.query('complete & test_nodes == 64 & test_c == 1/16')

    return (pn.ggplot(trunk, pn.aes(x='rel_elo', y='policy', group='run', color='factor(boardsize)'))
        + pn.geom_jitter(show_legend=False, size=.25, width=.02, shape='.')
        + pn.scale_y_continuous(trans='log10')
        + pn.scale_color_continuous(trans='log2')
        + pn.labs(
            x='Relative Elo (-1 random, 0 perfect play)',
            y='Policy Noise Scale')
        + plot.IEEE((5, 4)))
Example #11
def plot_replicate_correlation(
    df,
    batch,
    plate,
    facet_string=None,
    split_samples=False,
    output_file_base=None,
    output_file_extensions=[".png", ".pdf", ".svg"],
    dpi=500,
    height=4,
    width=5,
    return_plot=False,
):
    correlation_gg = (
        gg.ggplot(
            df,
            gg.aes(x="group_replicate", y="similarity_metric", fill="group_replicate"),
        )
        + gg.geom_boxplot(
            alpha=0.3, outlier_alpha=0, width=0.8, notchwidth=0.25, fatten=1.5
        )
        + gg.geom_jitter(shape=".", size=0.001, alpha=0.3, width=0.3, height=0)
        + gg.scale_fill_manual(
            name="Replicate",
            labels={"True": "True", "False": "False"},
            values=["#B99638", "#2DB898"],
        )
        + gg.xlab("Replicates")
        + gg.ylab("Pearson Correlation")
        + gg.ggtitle("{}: {}".format(batch, plate))
        + gg.theme_bw()
        + gg.theme(
            subplots_adjust={"wspace": 0.2},
            title=gg.element_text(size=5),
            axis_text=gg.element_text(size=4),
            axis_title=gg.element_text(size=5),
            legend_text=gg.element_text(size=4),
            legend_title=gg.element_text(size=5),
            strip_text=gg.element_text(size=4, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
    )

    if split_samples:
        assert facet_string, "To split samples, specify a facet_string"
        correlation_gg += gg.facet_wrap(facet_string)

    if output_file_base:
        save_figure(
            correlation_gg, output_file_base, output_file_extensions, dpi, height, width
        )
    if return_plot:
        return correlation_gg
def test_annotation_stripes_double():
    pdf = mtcars.assign(gear=pd.Categorical(mtcars.gear),
                        am=pd.Categorical(mtcars.am))
    p = (
        ggplot(pdf) + annotation_stripes(
            fills=["#0000FF", "#FF0000"], alpha=0.3, direction='vertical') +
        annotation_stripes(
            fills=["#AAAAAA", "#FFFFFF"], alpha=0.3, direction='horizontal') +
        geom_jitter(aes("gear", "wt", shape="gear", color="am"),
                    random_state=5) +
        scale_shape_discrete(guide=guide_legend(order=1))  # work around #229
    )
    assert p == "annotation_stripes_double"
def method_plot(df, baseline_rul, baseline_mse, method):
    plotnine.options.figure_size = (15, 8)

    jan = df[df['method'] == method]

    jan['percent_broken'] = jan['percent_broken'].round().astype(int)
    jan['percent_fail_runs'] = jan['percent_fail_runs'].round().astype(int)

    plotnine.ylim = (2, 10)
    gg = (plotnine.ggplot(
        jan, plotnine.aes(x='percent_broken', y='log_score', color='method')) +
          plotnine.facet_wrap('task', 2, 4) +
          plotnine.stat_boxplot(plotnine.aes(y='log_value', x=60),
                                data=baseline_rul,
                                width=80,
                                color='#14639e',
                                show_legend=False) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab('Grade of Degradation in %') +
          plotnine.ylab('Logarithmic RUL-Score') +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul.pdf' % method)

    plotnine.ylim = (90, 10)
    gg = (plotnine.ggplot(
        jan, plotnine.aes(x='percent_broken', y='mse', color='method')) +
          plotnine.facet_wrap('task', 2, 4) +
          plotnine.stat_boxplot(plotnine.aes(y='value', x=60),
                                data=baseline_mse,
                                width=80,
                                color='#14639e',
                                show_legend=False) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab('Grade of Degradation in %') + plotnine.ylab('RMSE') +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_rmse.pdf' % method)
def test_annotation_stripes_coord_flip():
    pdf = mtcars.assign(gear=pd.Categorical(mtcars.gear),
                        am=pd.Categorical(mtcars.am))
    p = (
        ggplot(pdf) + annotation_stripes(
            fills=["#AAAAAA", "#FFFFFF", "#7F7FFF"], alpha=0.3) + geom_jitter(
                aes("gear", "wt", shape="gear", color="am"), random_state=5) +
        geom_vline(xintercept=0.5, color="black") +
        geom_vline(xintercept=1.5, color="black") +
        geom_vline(xintercept=2.5, color="black") +
        geom_vline(xintercept=3.5, color="black") +
        scale_shape_discrete(guide=guide_legend(order=1))  # work around #229
        + coord_flip())
    assert p == "annotation_stripes_coord_flip"
Example #15
def makePlot(grdevices, plotName, samp_set1_vals, samp_set2_vals,
             image_file_type):

    samp_vector = ["set1" for i in range(len(samp_set1_vals))]
    samp_vector.extend(["set2" for i in range(len(samp_set2_vals))])

    data_vector = samp_set1_vals + samp_set2_vals

    dframe = pd.DataFrame(list(zip(samp_vector, data_vector)),
                          columns=["sample", "value"])

    # geom_jitter raises a PlotnineError if both 'position' and width/height
    # are given, so only width/height are passed here.
    gg = (pn.ggplot(dframe, pn.aes(x="sample", y="value"))
          + pn.geom_jitter(width=0.2, height=0.01)
          + pn.coord_cartesian(ylim=(0, 100))
          + pn.theme_bw())

    # TODO Just infer format from plotName
    gg.save(filename=plotName, format=image_file_type)
Example #16
def steps_violin_plotter(df_ar, testbed, run=0):
    df_estimate = testbed.estimate_distribution(1000)
    df_estimate = df_estimate.astype({"action": "int32"})
    df_ar = df_ar.loc[df_ar["run"] == run]
    df_ar = df_ar.astype({"action": "int32"})
    p = (
        p9.ggplot(
            p9.aes(
                x="reorder(factor(action), action)",
                y="reward",
            )
        )
        + p9.ggtitle(f"Action - Rewards across {df_ar.shape[0]} steps")
        + p9.xlab("k-arm")
        + p9.ylab("Reward")
        + p9.geom_violin(df_estimate, fill="#d0d3d4")
        + p9.geom_jitter(df_ar, p9.aes(color="step"))
        + p9.theme(figure_size=(20, 9))
    )
    fig = p.draw()

    return fig
Example #17
max_acceptable_range = r.return_prediction(
    best_line, age, linear_coeff, log10_coeff, ln_coeff) + zero_z_score

print("\n\nThe predicted acceptable range at age ", str(age), " is from ",
      str(min_acceptable_range), " to ", str(max_acceptable_range), "\n\n")

# save csv file
outlierfile = filename.replace('.csv', '_outliers.csv')

data_output.to_csv(outlierfile, index=False)

# plot overlay of IQR and mod-Z score outliers
p = (
    p9.ggplot(data=data_output,
              mapping=p9.aes(x='age_rounded', y='value', group='age_rounded'))
    # alpha is a layer parameter, not an aesthetic mapping
    + p9.geom_jitter(mapping=p9.aes(color='z_outlier'), alpha=0.1)
    + p9.geom_boxplot(outlier_size=0, outlier_stroke=0)
    + p9.ggtitle("Outliers detected via the IQR method (boxplot)\n"
                 "and modified z-score method (dotplot)")
    + p9.ylim(-10, 175))
print(p)
plotfile = filename.replace('.csv', '_outlierplot')
p9.ggsave(plot=p, filename=plotfile)

# plot regression
x = data_stats_regression['age_rounded']
y = data_stats_regression['median']
plt.plot(x, y, 'o')
plt.plot(x, r.func_linear(x, *linear_coeff))
plt.plot(x, r.func_log(x, *log10_coeff))
plt.plot(x, r.func_ln(x, *ln_coeff))
plt.title(
Example #18
def MDplot(Data,
           Names=None,
           Ordering='Default',
           Scaling=None,
           Fill='darkblue',
           RobustGaussian=True,
           GaussianColor='magenta',
           Gaussian_lwd=1.5,
           BoxPlot=False,
           BoxColor='darkred',
           MDscaling='width',
           LineColor='black',
           LineSize=0.01,
           QuantityThreshold=40,
           UniqueValuesThreshold=12,
           SampleSize=500000,
           SizeOfJitteredPoints=1,
           OnlyPlotOutput=True,
           ValueColumn=None,
           ClassColumn=None):
    """
    Plots a mirrored density plot for each numeric column
    
    Args:
        Data (dataframe): dataframe containing data. Each column is one 
                          variable (wide table format, for long table format 
                          see ValueColumn and ClassColumn)
        Names (list): list of column names (will be used if data is not a 
                      dataframe)
        Ordering (str): 'Default', 'Columnwise', 'Alphabetical' or 'Statistics'
        Scaling (str): scaling method, one of: Percentalize, CompleteRobust, 
                                               Robust, Log
        Fill (str): color of MD-Plot
        RobustGaussian (bool): draw a gaussian distribution if column is 
                               gaussian
        GaussianColor (str): color for gaussian distribution
        Gaussian_lwd (float): line width of gaussian distribution
        BoxPlot (bool): draw box-plot
        BoxColor (str): color for box-plots
        MDscaling (str): scale of ggplot violin
        LineSize (float): line width of ggplot violin
        QuantityThreshold (int): minimal number of rows
        UniqueValuesThreshold (int): minimal number of unique values per 
                                         column
        SampleSize (int): number of samples used if number of rows is larger 
                          than SampleSize
        OnlyPlotOutput (bool): if True, return only the ggplot object;
                               if False, return a dictionary containing the
                               ggplot object and additional info
        ValueColumn (str): name of the column of values to be plotted
                           (data in long table format)
        ClassColumn (str): name of the column with class identifiers for the 
                           value column (data in long table format)
        
    Returns:
        ggplot object or dictionary containing ggplot object and additional 
        infos
    """

    if not isinstance(Data, pd.DataFrame):
        try:
            if Names is not None:
                Data = pd.DataFrame(Data, columns=Names)
            else:
                Data = pd.DataFrame(Data)
                lstCols = list(Data.columns)
                dctCols = {}
                for strCol in lstCols:
                    dctCols[strCol] = "C_" + str(strCol)
                Data = Data.rename(columns=dctCols)
        except:
            raise Exception("Data cannot be converted into pandas dataframe")
    else:
        Data = Data.reset_index(drop=True)

    if ValueColumn is not None and ClassColumn is not None:
        lstCols = list(Data.columns)
        if ValueColumn not in lstCols:
            raise Exception("ValueColumn not contained in dataframe")
        if ClassColumn not in lstCols:
            raise Exception("ClassColumn not contained in dataframe")

        lstClasses = list(Data[ClassColumn].unique())
        DataWide = pd.DataFrame()
        for strClass in lstClasses:
            if len(DataWide) == 0:
                DataWide = Data[Data[ClassColumn] == strClass].copy()\
                .reset_index(drop=True)
                DataWide = DataWide.rename(columns={ValueColumn: strClass})
                DataWide = DataWide[[strClass]]
            else:
                dfTemp = Data[Data[ClassColumn] == strClass].copy()\
                .reset_index(drop=True)
                dfTemp = dfTemp.rename(columns={ValueColumn: strClass})
                dfTemp = dfTemp[[strClass]]
                DataWide = DataWide.join(dfTemp, how='outer')
        Data = DataWide.copy()

    lstCols = list(Data.columns)
    for strCol in lstCols:
        if not is_numeric_dtype(Data[strCol]):
            print("Deleting non numeric column: " + strCol)
            Data = Data.drop([strCol], axis=1)
        else:
            if abs(Data[strCol].sum()) == np.inf:
                print("Deleting infinite column: " + strCol)
                Data = Data.drop([strCol], axis=1)

    Data = Data.rename_axis("index", axis="index")\
    .rename_axis("variable", axis="columns")
    dvariables = Data.shape[1]
    nCases = Data.shape[0]

    if nCases > SampleSize:
        print('Data has more cases than "SampleSize". Drawing a sample for '
              'faster computation. You can omit this by setting '
              '"SampleSize=len(data)".')
        sampledIndex = np.sort(
            np.random.choice(list(Data.index), size=SampleSize, replace=False))
        Data = Data.loc[sampledIndex]

    nPerVar = Data.apply(lambda x: len(x.dropna()))
    nUniquePerVar = Data.apply(lambda x: len(list(x.dropna().unique())))

    # renaming columns to non-numeric names
    lstCols = list(Data.columns)
    dctCols = {}
    for strCol in lstCols:
        try:
            a = float(strCol)
            dctCols[strCol] = "C_" + str(strCol)
        except:
            dctCols[strCol] = str(strCol)
    Data = Data.rename(columns=dctCols)

    if Scaling == "Percentalize":
        Data = Data.apply(lambda x: 100 * (x - x.min()) / (x.max() - x.min()))
    if Scaling == "CompleteRobust":
        Data = robust_normalization(Data, centered=True, capped=True)
    if Scaling == "Robust":
        Data = robust_normalization(Data, centered=False, capped=False)
    if Scaling == "Log":
        Data = signed_log(Data, base="Ten")
        if RobustGaussian == True:
            RobustGaussian = False
            print("log with robust gaussian does not work, because mean and "
                  "variance is not valid description for log normal data")

#_______________________________________________Robust Gaussian and Statistics
    if RobustGaussian == True or Ordering == "Statistics":
        Data = Data.applymap(lambda x: np.nan if abs(x) == np.inf else x)

        if nCases < 50:
            warnings.warn("Sample is maybe too small for statistical testing")

        factor = pd.Series([0.25, 0.75]).apply(lambda x: abs(norm.ppf(x)))\
        .sum()
        std = Data.std()

        dfQuartile = Data.apply(
            lambda x: mquantiles(x, [0.25, 0.75], alphap=0.5, betap=0.5))
        dfQuartile = dfQuartile.append(dfQuartile.loc[1] - dfQuartile.loc[0],
                                       ignore_index=True)
        dfQuartile.index = ["low", "hi", "iqr"]
        dfMinMax = Data.apply(
            lambda x: mquantiles(x, [0.001, 0.999], alphap=0.5, betap=0.5))
        dfMinMax.index = ["min", "max"]

        shat = pd.Series()
        mhat = pd.Series()
        nonunimodal = pd.Series()
        skewed = pd.Series()
        bimodalprob = pd.Series()
        isuniformdist = pd.Series()
        nSample = max([10000, nCases])
        normaldist = np.empty((nSample, dvariables))
        normaldist[:] = np.nan
        normaldist = pd.DataFrame(normaldist, columns=lstCols)

        for strCol in lstCols:
            shat[strCol] = min(
                [std[strCol], dfQuartile[strCol].loc["iqr"] / factor])
            mhat[strCol] = trim_mean(Data[strCol].dropna(), 0.1)

            if nCases > 45000 and nPerVar[strCol] > 8:
                # statistical testing does not work with too many cases
                sampledIndex = np.sort(
                    np.random.choice(list(Data.index),
                                     size=45000,
                                     replace=False))
                vec = Data[strCol].loc[sampledIndex]
                if nUniquePerVar[strCol] > UniqueValuesThreshold:
                    nonunimodal[strCol] = dip.diptst(vec.dropna(), numt=100)[1]
                    skewed[strCol] = skewtest(vec)[1]
                    args = (dfMinMax[strCol].loc["min"],
                            dfMinMax[strCol].loc["max"] \
                            - dfMinMax[strCol].loc["min"])
                    isuniformdist[strCol] = kstest(vec, "uniform", args)[1]
                    bimodalprob[strCol] = bimodal(vec)["Bimodal"]
                else:
                    print("Not enough unique values for statistical testing, "
                          "thus output of testing is ignored.")
                    nonunimodal[strCol] = 1
                    skewed[strCol] = 1
                    isuniformdist[strCol] = 0
                    bimodalprob[strCol] = 0
            elif nPerVar[strCol] < 8:
                warnings.warn("Sample of finite values to small to calculate "
                              "agostino.test or dip.test for " + strCol)
                nonunimodal[strCol] = 1
                skewed[strCol] = 1
                isuniformdist[strCol] = 0
                bimodalprob[strCol] = 0
            else:
                if nUniquePerVar[strCol] > UniqueValuesThreshold:
                    nonunimodal[strCol] = dip.diptst(Data[strCol].dropna(),
                                                     numt=100)[1]
                    skewed[strCol] = skewtest(Data[strCol])[1]
                    args = (dfMinMax[strCol].loc["min"],
                            dfMinMax[strCol].loc["max"] \
                            - dfMinMax[strCol].loc["min"])
                    isuniformdist[strCol] = kstest(Data[strCol], "uniform",
                                                   args)[1]
                    bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"]
                else:
                    print("Not enough unique values for statistical testing, "
                          "thus output of testing is ignored.")
                    nonunimodal[strCol] = 1
                    skewed[strCol] = 1
                    isuniformdist[strCol] = 0
                    bimodalprob[strCol] = 0

            if isuniformdist[strCol] < 0.05 and nonunimodal[strCol] > 0.05 \
            and skewed[strCol] > 0.05 and bimodalprob[strCol] < 0.05 \
            and nPerVar[strCol] > QuantityThreshold \
            and nUniquePerVar[strCol] > UniqueValuesThreshold:
                normaldist[strCol] = np.random.normal(mhat[strCol],
                                                      shat[strCol], nSample)
                normaldist[strCol] = normaldist[strCol]\
                .apply(lambda x: np.nan if x < Data[strCol].min() \
                                 or x > Data[strCol].max() else x)
        nonunimodal[nonunimodal == 0] = 0.0000000001
        skewed[skewed == 0] = 0.0000000001
        effectStrength = (-10 * np.log(skewed) - 10 * np.log(nonunimodal)) / 2

#______________________________________________________________________Ordering
    if Ordering == "Default":
        bimodalprob = pd.Series()
        for strCol in lstCols:
            if nCases > 45000 and nPerVar[strCol] > 8:
                sampledIndex = np.sort(
                    np.random.choice(list(Data.index),
                                     size=45000,
                                     replace=False))
                vec = Data[strCol].loc[sampledIndex]
                bimodalprob[strCol] = bimodal(vec)["Bimodal"]
            elif nPerVar[strCol] < 8:
                bimodalprob[strCol] = 0
            else:
                bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"]
        if len(list(bimodalprob.unique())) < 2 and dvariables > 1 \
        and RobustGaussian == True:
            rangfolge = list(effectStrength.sort_values(ascending=False).index)
            print("Using statistics for ordering instead of default")
        else:
            rangfolge = list(bimodalprob.sort_values(ascending=False).index)

    if Ordering == "Columnwise":
        rangfolge = lstCols

    if Ordering == "Alphabetical":
        rangfolge = lstCols.copy()
        rangfolge.sort()

    if Ordering == "Statistics":
        rangfolge = list(effectStrength.sort_values(ascending=False).index)

#________________________________________________________________Data Reshaping
    if nPerVar.min() < QuantityThreshold \
    or nUniquePerVar.min() < UniqueValuesThreshold:
        warnings.warn("Some columns have less than " + str(QuantityThreshold) +
                      " data points or less than " +
                      str(UniqueValuesThreshold) +
                      " unique values. Changing from MD-plot to Jitter-Plot "
                      "for these columns.")
        dataDensity = Data.copy()
        mm = Data.median()
        for strCol in lstCols:
            if nPerVar[strCol] < QuantityThreshold \
            or nUniquePerVar[strCol] < UniqueValuesThreshold:
                if mm[strCol] != 0:
                    dataDensity[strCol] = mm[strCol] \
                    * np.random.uniform(-0.001, 0.001, nCases) + mm[strCol]
                else:
                    dataDensity[strCol] = np.random.uniform(
                        -0.001, 0.001, nCases)
        # Generate a scatter plot for the cases where the pdf cannot be estimated
        dataJitter = dataDensity.copy()
        # Delete all scatters for features where distributions can be estimated
        for strCol in lstCols:
            if nPerVar[strCol] >= QuantityThreshold \
            and nUniquePerVar[strCol] >= UniqueValuesThreshold:
                dataJitter[strCol] = np.nan
        #apply ordering
        dataframe = dataDensity[rangfolge].reset_index()\
        .melt(id_vars=["index"])
    else:
        dataframe = Data[rangfolge].reset_index().melt(id_vars=["index"])

    dctCols = {"index": "ID", "variable": "Variables", "value": "Values"}
    dataframe = dataframe.rename(columns=dctCols)

    #______________________________________________________________________Plotting
    plot = p9.ggplot(dataframe, p9.aes(x="Variables", group="Variables",
                                        y="Values")) \
                     + p9.scale_x_discrete(limits=rangfolge)

    plot = plot + p9.geom_violin(stat = stat_pde_density(scale=MDscaling),
                                 fill=Fill, colour=LineColor,
                                 size=LineSize, trim=True) \
                           + p9.theme(axis_text_x=p9.element_text(rotation=90))

    if nPerVar.min() < QuantityThreshold \
    or nUniquePerVar.min() < UniqueValuesThreshold:
        dataframejitter = dataJitter[rangfolge].reset_index()\
        .melt(id_vars=["index"])
        dataframejitter = dataframejitter.rename(columns=dctCols)
        plot = plot + p9.geom_jitter(
            size=SizeOfJitteredPoints,
            data=dataframejitter,
            colour=LineColor,
            mapping=p9.aes(x="Variables", group="Variables", y="Values"),
            position=p9.position_jitter(0.15))

    if RobustGaussian == True:
        dfTemp = normaldist[rangfolge].reset_index().melt(id_vars=["index"])
        dfTemp = dfTemp.rename(columns=dctCols)
        if dfTemp["Values"].isnull().all() == False:
            plot = plot + p9.geom_violin(
                data=dfTemp,
                mapping=p9.aes(x="Variables", group="Variables", y="Values"),
                colour=GaussianColor,
                alpha=0,
                scale=MDscaling,
                size=Gaussian_lwd,
                na_rm=True,
                trim=True,
                fill=None,
                position="identity",
                width=1)

    if BoxPlot == True:
        plot = plot + p9.stat_boxplot(geom = "errorbar", width = 0.5,
                                      color=BoxColor) \
                    + p9.geom_boxplot(width=1, outlier_colour = None, alpha=0,
                                      fill='#ffffff', color=BoxColor,
                                      position="identity")

    if OnlyPlotOutput == True:
        return plot
    else:
        print(plot)
        return {
            "Ordering": rangfolge,
            "DataOrdered": Data[rangfolge],
            "ggplotObj": plot
        }
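For reference, a minimal usage sketch for MDplot (not part of the original example; the data is made up and it assumes the function above and its dependencies are importable):

import numpy as np
import pandas as pd

# Hypothetical wide-format data: each column is one numeric variable.
rng = np.random.default_rng(0)
demo = pd.DataFrame({
    "gaussian": rng.normal(0, 1, 2000),
    "bimodal": np.concatenate([rng.normal(-2, 0.5, 1000),
                               rng.normal(2, 0.5, 1000)]),
})

# Returns a plotnine ggplot object; printing it renders the plot.
plot = MDplot(demo, Ordering="Default", BoxPlot=True)
print(plot)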
Example #19
                          y="cigarettes_per_day"))
    + p9.geom_boxplot()
    )

# change color of boxes and move aes to geom layer
(p9.ggplot(smoke_complete)
    + p9.geom_boxplot(p9.aes(x="vital_status",
                   y="cigarettes_per_day"), color="tomato")
    )

# adding colored points to black box and whisker plot
(p9.ggplot(smoke_complete,
           p9.aes(x="vital_status",
                          y="cigarettes_per_day"))
    + p9.geom_boxplot()
    + p9.geom_jitter(alpha=0.2, color="blue")
    )

## Challenge: visualize the same data as a violin plot in a color of your choice

#### Plotting time series data ####

# group and count vital status by year of birth
yearly_counts = birth_reduced.groupby(["year_of_birth", "vital_status"])["vital_status"].count()
yearly_counts # both year and vital status are row indexes
# reset the index to use both as column variables
yearly_counts = yearly_counts.reset_index(name="counts")
yearly_counts

# create line plot
(p9.ggplot(yearly_counts,
    ], axis='columns')
    df['feature_set'] = model
    cv_results_df = cv_results_df.append(df)
    
cv_results_summary = (cv_results_df
    .groupby(['classify__alpha', 'feature_set'])['mean_test_score']
    .max()
    .reset_index())


# In[17]:

(gg.ggplot(cv_results_summary, gg.aes(x='classify__alpha',
                                      y='mean_test_score',
                                      color='feature_set'))
 + gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05)
 + gg.scale_x_log10()
 + gg.labs(x='Regularization strength multiplier (log alpha)',
           y='CV AUROC')
 + gg.guides(fill=gg.guide_legend(title="Feature Set"))
 + gg.aes(ymin=min([0.5, cv_results_summary['mean_test_score'].min()]), ymax=1)
 + theme_cognoma()
)


# ## Use optimal hyperparameters to output ROC curve

# In[18]:

y_pred_dict = {
    model: {
Example #21
def plot_violinbox_plots_per_category(
        dataframe: pandas.DataFrame,
        plot_type: str,
        target_feature: str,
        label_column: str,
        colors: List[str],
        coloring_style: str,
        value_skip_list: List = [],
        jitter_alpha: float = 0.7,
        plot_alpha: float = 0.5,
        log_10_scale: bool = False,
        theme: str = 'gray',
        save_to_file: str = None,
        dpi: int = 150,
        show: bool = True
) -> p9.ggplot:
    """
        The :func:`plot_violinbox_plots_per_category` helps with providing the user with nicely plotted violin and
        box plots of the distribution of data points.

        Parameters
        ----------
        dataframe: `pandas.DataFrame`, required
            This is the main parameter that this method is supposed to work with, which is a dataframe that has
            a label column in which we have integer values starting from 0, and a float feature column the distribution
            of which we tend to monitor.
        plot_type: `str`, required
            This value, either `box` or `violin`, determines the type of plot.
        target_feature: `str`, required
            This parameter is the column name of the features that we want to monitor.
        label_column: `str`, required
            The input dataframe must have a label_column (preferably integer starting from 0), the name of that
            column should be input here.
        colors: `List[str]`, required
            Depending on whether or not our `coloring_style` is manual or automatic, this can either be a list of colors
            or a list of two colors indicating a range of color values.
        coloring_style: `str`, optional (default='manual')
            Either `manual` or `gradient` which helps assigning colors to clusters.
        value_skip_list: `List`, optional (default=[])
            If some values in the feature column are to be skipped, they should be put in here so that they
            are ignored in the plots. For example, if for some reason some values are -10000000, they can be taken care
            of in here.
        jitter_alpha: `float`, optional (default=0.7)
            The jitter value transparency is set in this parameter.
        plot_alpha: `float`, optional (default=0.5)
            The transparency intensity can be determined by setting this parameter.
        log_10_scale: `bool`, optional (default=False)
            If the monitored values should be shown on a log10 scale, this parameter should be set to True.
        theme: `str`, optional (default='gray')
            This is the `theme` type; the accepted values are ``['gray', 'dark', 'seaborn', 'light']``, consistent
            with the `plotnine` package's themes.
        save_to_file: `str`, optional (default=None)
            If the user intends to save the plot in a file, this parameter should have a value. The value must be a filepath.
        dpi: `int`, optional (default=150)
            The dpi for saving the plots indicating the image quality.
        show: `bool`, optional (default=True)
            Whether or not the plot is to be shown is set in this parameter.
        Returns
        ----------
        The output of this method is of `p9.ggplot` type.
        """
    # Default to the full dataframe so `df` is always defined, then drop skipped values.
    df = dataframe
    if len(value_skip_list) > 0:
        df = dataframe[~dataframe[target_feature].isin(value_skip_list)]

    if coloring_style == 'gradient':
        assert len(colors) == 2, "you have chosen gradient style coloring, for colors you have to provide a list with the \
            first element being the color for low and the second the color for high."
        pplot = p9.ggplot(data=df, mapping=p9.aes(x='factor(' + label_column + ')', y=target_feature, color=label_column))
        pplot += p9.scale_color_gradient(low=colors[0], high=colors[1])
    elif coloring_style == 'manual':
        assert len(colors) == len(df[label_column].unique()), "You have chosen per-category manual coloring, therefore you have to provide the same number of colors"
        pplot = p9.ggplot(data=df, mapping=p9.aes(x='factor(' + label_column + ')', y=target_feature, color='factor(' + label_column + ')'))
        pplot += p9.scale_color_manual(values=colors)

    pplot += p9.geom_jitter(alpha=jitter_alpha)

    if plot_type == 'box':
        pplot += p9.geom_boxplot(alpha=plot_alpha)
    elif plot_type == 'violin':
        pplot += p9.geom_violin(alpha=plot_alpha)
    else:
        raise Exception('unknown plot type, it must be violin or box.')

    if theme == 'gray':
        pplot += p9.theme_gray()
    elif theme == 'dark':
        pplot += p9.theme_dark()
    elif theme == 'seaborn':
        pplot += p9.theme_seaborn()
    elif theme == 'light':
        pplot += p9.theme_light()
    else:
        raise Exception('Theme type not supported, please add.')

    if log_10_scale:
        # The monitored feature is on the y axis, so the log10 scale is applied there.
        pplot += p9.scale_y_log10()

    if save_to_file is not None:
        save_directory, filename = separate_path_and_file(filepath=save_to_file)
        pplot.save(filename=filename, path=save_directory, dpi=dpi)

    if show:
        pplot.draw()

    return pplot
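A hypothetical call of the function above, with made-up cluster labels and scores (not from the original source):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame({
    "cluster": np.repeat([0, 1, 2], 50),  # integer labels starting at 0
    "score": np.concatenate([rng.normal(0.2, 0.05, 50),
                             rng.normal(0.5, 0.05, 50),
                             rng.normal(0.8, 0.05, 50)]),
})

p = plot_violinbox_plots_per_category(
    dataframe=demo,
    plot_type="violin",
    target_feature="score",
    label_column="cluster",
    colors=["#1b9e77", "#d95f02", "#7570b3"],  # one color per cluster
    coloring_style="manual",
    show=False,
)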
        pd.DataFrame(pipeline.cv_results_),
        pd.DataFrame.from_records(pipeline.cv_results_['params'])
    ],
                   axis='columns')
    df['feature_set'] = model
    cv_results_df = cv_results_df.append(df)

cv_results_summary = (cv_results_df.groupby(
    ['classify__alpha', 'feature_set'])['mean_test_score'].max().reset_index())

# In[17]:

(gg.ggplot(
    cv_results_summary,
    gg.aes(x='classify__alpha', y='mean_test_score', color='feature_set')) +
 gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05) + gg.scale_x_log10() +
 gg.labs(x='Regularization strength multiplier (log alpha)', y='CV AUROC') +
 gg.guides(fill=gg.guide_legend(title="Feature Set")) +
 gg.aes(ymin=min([0.5, cv_results_summary['mean_test_score'].min()]), ymax=1) +
 theme_cognoma())

# ## Use optimal hyperparameters to output ROC curve

# In[18]:

y_pred_dict = {
    model: {
        'train': pipeline.decision_function(X_train),
        'test': pipeline.decision_function(X_test)
    }
    for model, pipeline in cv_pipelines.items()
def mixed_linear_plots(df, x_axis, x_label):
    plotnine.options.figure_size = (8, 10)

    md = smf.mixedlm('log_score ~ percent_broken + percent_fail_runs',
                     df,
                     groups=df.index.values)
    mdf_rul = md.fit()

    print('#' * 18 + 'Log RUL' + '#' * 18)
    print(mdf_rul.summary())

    md = smf.mixedlm('mse ~ percent_broken + percent_fail_runs',
                     df,
                     groups=df.index.values)
    mdf_mse = md.fit()

    print('#' * 18 + 'RMSE' + '#' * 18)
    print(mdf_mse.summary())

    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)

    gg = (plotnine.ggplot(
        df, plotnine.aes(x=x_axis, y='log_score', color='method')) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.geom_abline(
              plotnine.aes(intercept=mdf_rul.params['Intercept'],
                           slope=mdf_rul.params[x_axis])) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab(x_label) + plotnine.ylab('Logarithmic RUL-Score') +
          plotnine.scale_color_discrete(name='Method', labels=['DAAN', 'JAN'])
          + plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul_by_method.pdf' % x_axis)

    gg = (plotnine.ggplot(
        df, plotnine.aes(x=x_axis, y='log_score', color='task')) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.geom_abline(
              plotnine.aes(intercept=mdf_rul.params['Intercept'],
                           slope=mdf_rul.params[x_axis])) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab(x_label) + plotnine.ylab('Logarithmic RUL-Score') +
          plotnine.scale_color_discrete(
              name='Task',
              labels=['4→3', '4→2', '1→3', '1→2', '3→4', '3→1', '2→4', '2→1'
                      ]) + plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul_by_task.pdf' % x_axis)

    gg = (
        plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='method')) +
        plotnine.geom_jitter(width=2.5) + plotnine.geom_abline(
            plotnine.aes(intercept=mdf_mse.params['Intercept'],
                         slope=mdf_mse.params[x_axis])) +
        plotnine.stat_smooth(method='gls') + plotnine.ylab('RMSE') +
        plotnine.xlab(x_label) +
        plotnine.scale_color_discrete(name='Method', labels=['DAAN', 'JAN']) +
        plotnine.theme_classic(base_size=20))
    gg.save('%s_mse_by_method.pdf' % x_axis)

    gg = (plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='task')) +
          plotnine.geom_jitter(width=2.5) + plotnine.geom_abline(
              plotnine.aes(intercept=mdf_mse.params['Intercept'],
                           slope=mdf_mse.params[x_axis])) +
          plotnine.stat_smooth(method='gls') + plotnine.ylab('RMSE') +
          plotnine.scale_color_discrete(
              name='Task',
              labels=['4→3', '4→2', '1→3', '1→2', '3→4', '3→1', '2→4', '2→1'
                      ]) + plotnine.theme_classic(base_size=20))
    gg.save('%s_mse_by_task.pdf' % x_axis)
Example #24
buffersize = np.logspace(0, 7, 15).astype(int)


def speed_test_buffer(buffersize):
    t1 = time.time()
    n_iter = 10_000_000
    gen = read_binary_bus(B1.bus_file, decode_seq=False, buffersize=buffersize)
    gen = toolz.take(n_iter, gen)
    for a in gen:
        pass
    t2 = time.time()
    return t2 - t1


results = [
    {'buffer': b, 'time': speed_test_buffer(b)}
    for _ in tqdm.trange(20) for b in buffersize
]

results = pd.DataFrame(results)
results.to_csv('/home/michi/ms_python_packages/pybustools/speed/results.csv')
%matplotlib
plt.scatter(results.buffer, results.time)
#     plt.plot(results.buffer, results.time, 'x-')
plt.xscale('log')
plt.savefig('/home/michi/ms_python_packages/pybustools/speed/buffer.png', dpi=300)
plt.show()

p = (pn.ggplot(results.query('buffer>=100'),
               pn.aes('factor(buffer)', 'time', color='factor(buffer)'))
     + pn.geom_boxplot()
     + pn.geom_jitter()
     + pn.labs(title='Buffersize vs Time', x='Buffersize', y='Time (sec)'))
p.save('/home/michi/ms_python_packages/pybustools/speed/buffer_vs_time.png', dpi=300)
Example #25
def n_es_genes(df: pd.DataFrame,
               annotation: pd.Series,
               figsize: tuple = None) -> p9.ggplot:
    """Plot distribution of number of ES genes per group
    
    Computes the number of ES genes per column, e.g. cell(-type) 
    and plots the distribution for the groups specified
    by the annotation.
    
    Parameters
    ----------
    df : DataFrame
        Dataframe containing positive ES weights, ideally use only ESmu.
    annotation : Series
        Annotation to group dataframe cell(-types) by in the violin plots.
    figsize : (float, float), optional (default: None)
        Specify width and height of plot.
    
    Returns
    -------
    p : ggplot
        A plotnine ggplot

    """

    ### Count number of non-zero values, i.e. ESw > 0
    df = df.astype(bool).sum(axis=0)

    ### Map column labels to annotation
    if type(annotation) is pd.DataFrame:
        annotation = annotation.iloc[:, 0]

    # remove duplicates
    annotation = annotation.loc[~annotation.index.duplicated(keep='first')]

    df.index = df.index.map(annotation, na_action="ignore").values.astype(str)

    # Constants, height and width of plot.
    if figsize is None:
        W = min((df.index.nunique(), 10))
        H = 6.4  # plotnine default height
    else:
        W, H = figsize

    ### Convert to tidy / long format
    # Org:
    #       ABC  ACBG  ACMB
    # POMC  0.0   0.5   0.9
    # AGRP  0.2   0.0   0.0
    # LEPR  0.1   0.1   0.4

    # Tidy:
    #   gene_name annotation    es_weight
    # 1 POMC      ABC           0.0
    # 2 AGRP      ABC           0.6
    # 3 LEPR      ABC           1.0
    df_tidy = df.copy()
    df_tidy.index.name = None
    df_tidy = pd.melt(df_tidy.reset_index(),
                      id_vars="index",
                      var_name="annotation",
                      value_name="count")

    ### Compute the mean count of ES genes
    mean_count = df_tidy["count"].mean(axis=0)

    ### Plot
    p = (
        ### data
        p9.ggplot(
            data=df_tidy,
            mapping=p9.aes(x="index", y="count", fill="index", label="index"),
        )

        ### theming
        + p9.theme_classic() + p9.theme(
            figure_size=(W, H), axis_text_x=p9.element_text(rotation=75)) +
        p9.labs(
            x="",  # e.g. "Cell-type"
            y="Number of ES genes",  # e.g. "ES weight"
        )

        ### viz
        + p9.geom_violin(scale="width", show_legend=False) +
        p9.geom_jitter(width=0.1, height=0, show_legend=False) +
        p9.geom_hline(yintercept=mean_count,
                      color="blue",
                      linetype="dashed",
                      show_legend=False))

    return p
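A hypothetical usage sketch for n_es_genes (toy ES-weight matrix and annotation, not from the original source):

import pandas as pd

# Rows are genes, columns are cells/cell-types; values are positive ES weights.
es_mu = pd.DataFrame(
    {"ABC": [0.0, 0.2, 0.1], "ACBG": [0.5, 0.0, 0.1], "ACMB": [0.9, 0.0, 0.4]},
    index=["POMC", "AGRP", "LEPR"],
)
# Maps each column label to the group shown on the x-axis.
groups = pd.Series({"ABC": "Neurons", "ACBG": "Neurons", "ACMB": "Glia"})

p = n_es_genes(es_mu, annotation=groups)
fig = p.draw()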
Example #26
def control_list(in_file=None,
                 out_dir=None,
                 reference_gene_file=None,
                 log2=False,
                 page_width=None,
                 page_height=None,
                 user_img_file=None,
                 page_format=None,
                 pseudo_count=1,
                 set_colors=None,
                 dpi=300,
                 rug=False,
                 jitter=False,
                 skip_first=False):
    # -------------------------------------------------------------------------
    #
    # Check in_file content
    #
    # -------------------------------------------------------------------------

    for p, line in enumerate(in_file):

        line = chomp(line)
        line = line.split("\t")

        if len(line) > 2:
            message("Need a two columns file.",
                    type="ERROR")
        if skip_first:
            if p == 0:
                continue
        try:
            fl = float(line[1])
        except ValueError:
            msg = "It seems that column 2 of input file"
            msg += " contains non numeric values. "
            msg += "Check that no header is present and that "
            msg += "columns are ordered properly. "
            msg += "Or use '--skip-first'. "
            message(msg, type="ERROR")

        if log2:
            fl = fl + pseudo_count
            if fl <= 0:
                message("Can not log transform negative/zero values. Add a pseudo-count.",
                        type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check colors
    #
    # -------------------------------------------------------------------------

    set_colors = set_colors.split(",")

    if len(set_colors) != 2:
        message("Need two colors. Please fix.", type="ERROR")

    mcolors_name = mcolors.cnames

    for i in set_colors:
        if i not in mcolors_name:
            if not is_hex_color(i):
                message(i + " is not a valid color. Please fix.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Preparing output files
    #
    # -------------------------------------------------------------------------

    # Preparing pdf file name
    file_out_list = make_outdir_and_file(out_dir, ["control_list.txt",
                                                   "reference_list.txt",
                                                   "diagnostic_diagrams." + page_format],
                                         force=True)

    control_file, reference_file_out, img_file = file_out_list

    if user_img_file is not None:

        os.unlink(img_file.name)
        img_file = user_img_file

        if not img_file.name.endswith(page_format):
            msg = "Image format should be: {f}. Please fix.".format(f=page_format)
            message(msg, type="ERROR")

        test_path = os.path.abspath(img_file.name)
        test_path = os.path.dirname(test_path)

        if not os.path.exists(test_path):
            os.makedirs(test_path)

    # -------------------------------------------------------------------------
    #
    # Read the reference list
    #
    # -------------------------------------------------------------------------

    try:
        reference_genes = pd.read_csv(reference_gene_file.name, sep="\t", header=None)
    except pd.errors.EmptyDataError:
        message("No genes in --reference-gene-file.", type="ERROR")

    reference_genes.rename(columns={reference_genes.columns.values[0]: 'gene'}, inplace=True)

    # -------------------------------------------------------------------------
    #
    # Delete duplicates
    #
    # -------------------------------------------------------------------------

    before = len(reference_genes)
    reference_genes = reference_genes.drop_duplicates(['gene'])
    after = len(reference_genes)

    msg = "%d duplicate lines have been deleted in reference file."
    message(msg % (before - after))

    # -------------------------------------------------------------------------
    #
    # Read expression data and add the pseudo_count
    #
    # -------------------------------------------------------------------------

    if skip_first:
        exp_data = pd.read_csv(in_file.name, sep="\t",
                               header=None, index_col=None,
                               skiprows=[0], names=['exprs'])
    else:

        exp_data = pd.read_csv(in_file.name, sep="\t", names=['exprs'], index_col=0)

    exp_data.exprs = exp_data.exprs.values + pseudo_count

    # -------------------------------------------------------------------------
    #
    # log transformation
    #
    # -------------------------------------------------------------------------

    ylabel = 'Expression'

    if log2:
        if len(exp_data.exprs.values[exp_data.exprs.values == 0]):
            message("Can't use log transformation on zero or negative values. Use -p.",
                    type="ERROR")
        else:
            exp_data.exprs = np.log2(exp_data.exprs.values)
            ylabel = 'log2(Expression)'

    # -------------------------------------------------------------------------
    #
    # Are reference gene found in control list
    #
    # -------------------------------------------------------------------------

    # Sort in increasing order
    exp_data = exp_data.sort_values('exprs')

    # Vector with positions indicating which entries in the
    # expression data list are found in reference_genes

    reference_genes_found = [x for x in reference_genes['gene'] if x in exp_data.index]

    msg = "Found %d genes of the reference in the provided signal file" % len(reference_genes_found)
    message(msg)

    not_found = [x for x in reference_genes['gene'] if x not in exp_data.index]

    if len(not_found):
        if len(not_found) == len(reference_genes):
            message("Genes from reference file where not found in signal file (n=%d)." % len(not_found), type="ERROR")
        else:
            message("List of reference genes not found :%s" % not_found)
    else:
        message("All reference genes were found.")

    # -------------------------------------------------------------------------
    #
    # Search for genes with matched signal
    #
    # -------------------------------------------------------------------------

    exp_data_save = exp_data.copy()

    control_list = list()

    nb_candidate_left = exp_data.shape[0] - len(reference_genes_found)

    message("Searching for genes with matched signal.")

    if nb_candidate_left < len(reference_genes_found):
        message("Not enough element to perform selection. Exiting", type="ERROR")

    for i in reference_genes_found:
        not_candidates = reference_genes_found + control_list
        not_candidates = list(set(not_candidates))

        diff = abs(exp_data.loc[i] - exp_data)
        control_list.extend(diff.loc[np.setdiff1d(diff.index, not_candidates)].idxmin(axis=0, skipna=True).tolist())

    # -------------------------------------------------------------------------
    #
    # Prepare a dataframe for plotting
    #
    # -------------------------------------------------------------------------

    message("Preparing a dataframe for plotting.")

    reference = exp_data_save.loc[reference_genes_found].sort_values('exprs')
    reference = reference.assign(genesets=['Reference'] * reference.shape[0])

    control = exp_data_save.loc[control_list].sort_values('exprs')
    control = control.assign(genesets=['Control'] * control.shape[0])

    data = pd.concat([reference, control])
    data['sets'] = pd.Series(['sets' for x in data.index.tolist()], index=data.index)
    data['genesets'] = Categorical(data['genesets'])

    # -------------------------------------------------------------------------
    #
    # Diagnostic plots
    #
    # -------------------------------------------------------------------------

    p = ggplot(data, aes(x='sets', y='exprs', fill='genesets'))

    p += scale_fill_manual(values=dict(zip(['Reference', 'Control'], set_colors)))

    p += geom_violin(color=None)

    p += xlab('Gene sets') + ylab(ylabel)

    p += facet_wrap('~genesets')

    if rug:
        p += geom_rug()

    if jitter:
        p += geom_jitter()

    p += theme_bw()
    p += theme(axis_text_x=element_blank())

    # -------------------------------------------------------------------------
    # Turn warnings off. Both pandas and plotnine use warnings for deprecated
    # functions. I need to turn them off, although I'm not really satisfied with
    # this solution...
    # -------------------------------------------------------------------------

    def fxn():
        warnings.warn("deprecated", DeprecationWarning)

    # -------------------------------------------------------------------------
    #
    # Saving
    #
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fxn()
        message("Saving diagram to file : " + img_file.name)
        message("Be patient. This may be long for large datasets.")

        try:
            p.save(filename=img_file.name, width=page_width, height=page_height, dpi=dpi, limitsize=False)
        except PlotnineError as err:
            message("Plotnine message: " + err.message)
            message("Plotnine encountered an error.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # write results
    #
    # -------------------------------------------------------------------------

    exp_data_save.loc[reference_genes_found].sort_values('exprs').to_csv(reference_file_out.name, sep="\t")
    exp_data_save.loc[control_list].sort_values('exprs').to_csv(control_file.name, sep="\t")
Example #27
def rel_plot(sbs, variant, jitter=0.01):
    plotdata = sbs[sbs.variant == variant]
    xcol = "base"
    ycol = "ratio"
    plotdata = plotdata.assign(x=plotdata[xcol], y=plotdata[ycol])
    plotdata = plotdata.assign(sbs_index=plotdata.index.values)
    session_text = (plotdata[["session_index", "base_session_index"]].apply(
        tuple, axis=1).map(lambda tup: f"{tup[0]} vs. {tup[1]}"))
    plotdata = plotdata.assign(session_text=session_text)

    x = np.geomspace(0.02, 1, num=5)
    y = 1 / x
    diag_df = pd.DataFrame({"x": x, "y": y})
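    # diag_df traces the curve y = 1/x, drawn below as a reference line on the
    # log-log plot.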

    scatterplot = (
        ggplot(plotdata) + geom_jitter(
            aes(x="x", y="y", fill="dataset", color="dataset"),
            width=jitter,
            height=jitter,
            alpha=0.6,
            size=1.0,
        )
        #                 shape=plotdata.dataset.map(lambda x : '.' if x in ['lvis','objectnet'] else 'o'),
        #                 size=plotdata.dataset.map(lambda x : 1. if x in ['lvis','objectnet'] else 2.))
        #  + geom_text(aes(x='base', y='delta', label='category', color='dataset'), va='bottom',
        #              data=plotdata1[plotdata1.ratio < .6],
        #              position=position_jitter(.05, .05), show_legend=False)
        + geom_line(aes(x="x", y="y"), data=diag_df)
        # + geom_text(aes(x='x', y='y', label='session_text'), va='top', data=plotdata[(plotdata.y < .4) | (plotdata.y > 3)])
        + ylab(ycol)
        #               + geom_area(aes(y2=1.1, y=.9), linetype='dashed', alpha=.7)
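        # Dashed guides: horizontal lines at ratio = 0.9 and 1.1, vertical
        # lines at base = 0.1 and 0.3.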
        + geom_hline(aes(yintercept=1.1), linetype="dashed", alpha=0.7) +
        geom_hline(aes(yintercept=0.9), linetype="dashed", alpha=0.7) +
        geom_vline(
            aes(xintercept=0.1, ),
            linetype="dashed",
            alpha=0.7,
        ) + geom_vline(
            aes(xintercept=0.3, ),
            linetype="dashed",
            alpha=0.7,
        )
        # + geom_abline()
        #    + geom_point(aes(x='recall', y='precision', color='variant'), size=1.)
        #     + facet_wrap(facets=['cat'], ncol=6, scales='free_x')
        + xlab(xcol)
        # +scale_color_discrete()
        + theme(
            figure_size=(8, 5),
            legend_position="top",
            subplots_adjust={"hspace": 0.5},
            legend_title=element_blank(),
            legend_box_margin=-1,
            legend_margin=0.0,
            axis_text=element_text(size=12, margin={
                "t": 0.2,
                "l": -0.3
            }),
            legend_text=element_text(size=11),
            axis_title=element_text(size=12,
                                    margin={
                                        "r": -0.2,
                                        "b": 0.0,
                                        "l": 0,
                                        "t": 0.0
                                    }),
        ) + scale_x_log10(labels=make_labeler(brief_format),
                          breaks=[0.01, 0.1, 0.3, 1.0]) +
        scale_y_log10(labels=make_labeler(brief_format),
                      breaks=[0.5, 0.9, 1.1, 2.0, 3.0, 6, 12]))

    return scatterplot
Beispiel #28
0
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
            Fits logistic regression to predict labels.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-h5',
        '--h5_anndata',
        action='store',
        dest='h5',
        required=True,
        help='H5 AnnData file where clusters have been saved to cluster slot.')

    # parser.add_argument(
    #     '-ncpu', '--number_cpu',
    #     action='store',
    #     dest='number_cpu',
    #     default=50,
    #     type=int,
    #     help='Number of CPUs to use. Since we are testing the dask backend,\
    #         this corresponds to the number of CPUs available across all of\
    #         the worker jobs we spin out.\
    #         (default: %(default)s)'
    # )

    parser.add_argument('-s',
                        '--sparsity_l1',
                        action='store',
                        dest='sparsity_l1',
                        default=0.0001,
                        type=float,
                        help='Smaller values specify stronger regularization.\
            (default: %(default)s)')

    parser.add_argument('-nepoch',
                        '--number_epoch',
                        action='store',
                        dest='number_epoch',
                        default=25,
                        type=int,
                        help='Number of epochs.\
            (default: %(default)s)')

    parser.add_argument(
        '-bs',
        '--batch_size',
        action='store',
        dest='batch_size',
        default=32,
        type=int,
        help='Batch size. Divides the dataset into n batches and updates the\
            weights at the end of each one.\
            (default: %(default)s)')

    parser.add_argument(
        '-tsc',
        '--train_size_cells',
        action='store',
        dest='train_size_cells',
        default=0,
        type=int,
        help='Number of cells to use for training set. If > 0 all\
            remaining cells not randomly selected for training will be used\
            for the test set. Overrides <train_size_fraction>.\
            (default: %(default)s)')

    parser.add_argument('-tsf',
                        '--train_size_fraction',
                        action='store',
                        dest='train_size_fraction',
                        default=0.67,
                        type=float,
                        help='Fraction of the data to use for training set.\
            (default: %(default)s)')

    parser.add_argument(
        '--dict_add',
        action='store',
        dest='dict_add',
        default='',
        type=str,
        help='Additional information to add to output model_report.\
            Format: key::value:::key2::value2.\
            Example: method::leiden:::resolution::3.0\
            (default: %(default)s)')

    parser.add_argument('--grid_search',
                        action='store_true',
                        dest='grid_search',
                        default=False,
                        help='Run a grid search of hyperparameters.\
            (default: %(default)s)')

    parser.add_argument('--memory_limit',
                        action='store',
                        dest='memory_limit',
                        default=50,
                        type=int,
                        help='Memory limit in Gb.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: keras_model-<params>)')
    options = parser.parse_args()

    verbose = True

    # Set GPU memory limits
    gpus = tf.config.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        # For TF v1
        # config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # session = tf.Session(config=config)

        # For TF v2
        try:
            # Method 1:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            # Method 2:
            # Restrict TensorFlow to only allocate 1GB of memory on the first
            # GPU
            # tf.config.experimental.set_virtual_device_configuration(
            #     gpus[0],
            #     [tf.config.experimental.VirtualDeviceConfiguration(
            #         memory_limit=options.memory_limit*1024
            #     )])
            # logical_gpus = tf.config.list_logical_devices('GPU')
            # print(
            #     len(gpus),
            #     "Physical GPUs,",
            #     len(logical_gpus),
            #     "Logical GPUs"
            # )
        except RuntimeError as e:
            # Virtual devices must be set before GPUs have been initialized
            print(e)
    else:
        raise Exception('ERROR: no GPUs detected.')

    # Get additional data we are going to append to the output model info
    dict_add = {}
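    # Parse 'key::value:::key2::value2' pairs passed via --dict_add.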
    if options.dict_add != '':
        for item in options.dict_add.split(':::'):
            _tmp = item.split('::')
            if len(_tmp) != 2:
                raise Exception('ERROR: check dict_add.')
            else:
                dict_add[_tmp[0]] = _tmp[1]
    print(dict_add)

    # Load the AnnData file.
    # This file should already have clusters identified and saved to the
    # clusters slot.
    adata = sc.read_h5ad(filename=options.h5)

    # Set X to cp10k
    # adata.X = np.expm1(adata.layers['log1p_cp10k'])
    # Set X to ln(cp10k+1)
    # NOTE: Testing with a 100k TI dataset, we were able to achieve higher
    # accuracy with log1p_cp10k, likely because of the better spread in the
    # distribution.
    adata.X = adata.layers['log1p_cp10k']
    # Set X to raw counts
    # adata.X = adata.layers['counts']

    # Add some info from adata to dict_add
    for key, value in adata.uns['neighbors']['params'].items():
        dict_add['neighbors__{}'.format(key)] = value
    for key, value in adata.uns['cluster']['params'].items():
        dict_add['cluster__{}'.format(key)] = value

    # If train_size_cells, override the fraction so that the total number of
    # cells in the training set will be equal to train_size_cells.
    train_size_fraction = options.train_size_fraction
    if options.train_size_cells > 0:
        if options.train_size_cells >= adata.n_obs:
            raise Exception('Invalid train_size_cells.')
        train_size_fraction = (
            1 - ((adata.n_obs - options.train_size_cells) / adata.n_obs))
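        # This simplifies to options.train_size_cells / adata.n_obs.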
        if verbose:
            print(
                'Set train_size_fraction to: {}.'.format(train_size_fraction))
    if verbose:
        print('Number cells training ({}) and testing ({}).'.format(
            int(train_size_fraction * adata.n_obs),
            int((1 - train_size_fraction) * adata.n_obs)))

    # Set X and y
    X = adata.X
    y = adata.obs['cluster'].values

    # Set other variables
    sparsity_l1 = options.sparsity_l1
    n_epochs = options.number_epoch
    batch_size = options.batch_size

    # Center and scale the data
    if sp.sparse.issparse(X):
        X = X.todense()
    X_std = X
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_std = scaler.fit_transform(X)
    if verbose:
        print('center={} scale={}'.format(True, True))

    # One hot encode y (the cell type classes)
    # encode class values as integers
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
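    # encoder.classes_ lists the cluster labels in the order of their integer codes.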
    print('Found {} clusters'.format(len(encoder.classes_)))

    # Define the model
    # NOTE: Defaults determined via grid search of 160k TI single cells
    def classification_model(optimizer='sgd',
                             activation='softmax',
                             loss='categorical_crossentropy',
                             sparsity_l1__activity=0.0001,
                             sparsity_l2__activity=0.0,
                             sparsity_l1__kernel=0.0,
                             sparsity_l2__kernel=0.0,
                             sparsity_l1__bias=0.0,
                             sparsity_l2__bias=0.0):
        # create model
        model = Sequential()
        # Use a “softmax” activation function in the output layer. This is to
        # ensure the output values are in the range of 0 and 1 and may be used
        # as predicted probabilities.
        #
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax
        # Softmax assigns decimal probabilities to each class in a multi-class
        # problem. Those decimal probabilities must add up to 1.0. This
        # additional constraint helps training converge more quickly than it
        # otherwise would. Softmax is implemented through a neural network
        # layer just before the output layer. The Softmax layer must have the
        # same number of nodes as the output layer.
        # Softmax assumes that each example is a member of exactly one class.
        #
        # Softmax should be used for multi-class prediction with single label
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/video-lecture
        # NOTE: input dimension = number of features your data has
        model.add(
            Dense(
                len(encoder.classes_),  # output dim is number of classes
                use_bias=True,  # intercept
                activation=activation,  # softmax, sigmoid
                activity_regularizer=L1L2(l1=sparsity_l1__activity,
                                          l2=sparsity_l2__activity),
                kernel_regularizer=L1L2(l1=sparsity_l1__kernel,
                                        l2=sparsity_l2__kernel),
                bias_regularizer=L1L2(l1=sparsity_l1__bias,
                                      l2=sparsity_l2__bias),
                input_dim=X.shape[1]))
        # Example of adding additional layers
        # model.add(Dense(8, input_dim=4, activation='relu'))
        # model.add(Dense(3, activation='softmax'))

        # Metrics to check out over training epochs
        mets = [
            # loss,
            keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
            # keras.metrics.TruePositives(name='tp'),
            # keras.metrics.FalsePositives(name='fp'),
            # keras.metrics.TrueNegatives(name='tn'),
            # keras.metrics.FalseNegatives(name='fn'),
            # keras.metrics.Precision(name='precision'),
            # keras.metrics.Recall(name='recall'),
            # keras.metrics.AUC(name='auc'),
            keras.metrics.BinaryAccuracy(name='accuracy')
        ]
        # Use Adam gradient descent optimization algorithm with a logarithmic
        # loss function, which is called “categorical_crossentropy” in Keras.
        # UPDATE: sgd works better empirically.
        model.compile(
            optimizer=optimizer,  # adam, sgd
            loss=loss,
            metrics=mets)

        return model

    # Now, either call a grid search or specific model fit
    if options.grid_search:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
        out_file_base = '{}-grid_search'.format(out_file_base)

        # Call grid search of various parameters
        grid_result, df_grid_result = keras_grid(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            n_epochs=n_epochs,
            batch_size=batch_size)

        # NOTE: This will fail because KerasClassifier can't be pickled. This
        # is fine though because the results are saved in tsv.gz format below.
        # Save the results
        # out_f = '{}-grid_result.gz'.format(out_file_base)
        # joblib.dump(
        #     grid_result,
        #     out_f,
        #     compress=('gzip', 3)
        # )
        # Load the model
        # lr = joblib.load(
        #     'test-lr_model.joblib.gz'
        # )
        # print(lr)

        # Save the results of our search to tsv
        out_f = '{}-grid_result.tsv.gz'.format(out_file_base)
        df_grid_result.to_csv(out_f,
                              sep='\t',
                              index=False,
                              quoting=csv.QUOTE_NONNUMERIC,
                              na_rep='',
                              compression=compression_opts)

        # Add a single columns that summarizes params
        param_columns = [
            col for col in df_grid_result.columns if 'param__' in col
        ]
        df_grid_result['params'] = df_grid_result[param_columns].astype(
            str).apply(lambda x: '-'.join(x), axis=1)
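        # 'params' concatenates the searched parameter values; it labels the
        # x-axis in the plots below.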

        # Plot the distribution of accuracy across folds
        split_columns = [
            col for col in df_grid_result.columns if 'split' in col
        ]
        split_columns = [col for col in split_columns if '_test_score' in col]
        df_plt = pd.melt(df_grid_result,
                         id_vars=['params'],
                         value_vars=split_columns)
        gplt = plt9.ggplot(df_plt, plt9.aes(x='params', y='value'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_boxplot(alpha=0.8)
        gplt = gplt + plt9.geom_jitter(alpha=0.75)
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0
            # limits=[0, 1]
        )
        gplt = gplt + plt9.labs(x='Parameters', y='Score', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-score.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

        # Plot the mean time and std err for fitting results
        gplt = plt9.ggplot(df_grid_result,
                           plt9.aes(x='params', y='mean_fit_time'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_point()
        gplt = gplt + plt9.geom_errorbar(plt9.aes(
            ymin='mean_fit_time-std_fit_time',
            ymax='mean_fit_time+std_fit_time'),
                                         width=0.2,
                                         position=plt9.position_dodge(0.05))
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
        gplt = gplt + plt9.labs(x='Parameters', y='Mean fit time', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-fit_time.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

    else:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
            # out_file_base = '{}-center={}-scale={}'.format(
            #     out_file_base,
            #     center,
            #     scale
            # )
            out_file_base = '{}-batch_size={}-epochs={}'.format(
                out_file_base, batch_size, n_epochs)
            out_file_base = '{}-sparsity_l1={}-train_size_fraction={}'.format(
                out_file_base,
                str(sparsity_l1).replace('.', 'pt'),
                str(train_size_fraction).replace('.', 'pt'))

        # Fit the specific model and save the results
        model, model_report, y_prob_df, history = fit_model_keras(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            sparsity_l1=sparsity_l1,
            sparsity_l2=0.0,
            n_epochs=n_epochs,
            batch_size=batch_size,
            train_size_fraction=train_size_fraction)

        # Save the model, weights (coefficients), and bias (intercept)
        model.save('{}.h5'.format(out_file_base),
                   overwrite=True,
                   include_optimizer=True)

        # Save the model and weights (coefficients) separately
        # open('{}.json'.format(out_file_base), 'w').write(model.to_json())
        open('{}.yml'.format(out_file_base), 'w').write(model.to_yaml())
        model.save_weights('{}-weights.h5'.format(out_file_base))
        # Example read functions
        # model = model_from_yaml(open('my_model_architecture.yaml').read())
        # model.load_weights('my_model_weights.h5')

        # Save the model report
        # Add column telling us if this is cluster or summary value
        is_cluster = []
        for i in model_report.index:
            if i in encoder.classes_:
                is_cluster.append(True)
            else:
                is_cluster.append(False)
        model_report['is_cluster'] = is_cluster
        # Add in extra data
        model_report['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                model_report[key] = value
        print(model_report)
        out_f = '{}-model_report.tsv.gz'.format(out_file_base)
        model_report.to_csv(out_f,
                            sep='\t',
                            index=True,
                            index_label='cell_label',
                            quoting=csv.QUOTE_NONNUMERIC,
                            na_rep='',
                            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Save the test results - each row is a cell and the columns are the
        # prob of that cell belonging to a particular class.
        # Add in extra data
        y_prob_df['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                y_prob_df[key] = value
        out_f = '{}-test_result.tsv.gz'.format(out_file_base)
        y_prob_df.to_csv(
            out_f,
            sep='\t',
            index=False,  # NOTE: Not adding the label to test_result index.
            # index_label='cell_label',
            quoting=csv.QUOTE_NONNUMERIC,
            na_rep='',
            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Make a matrix of weights per gene
        # Rows = genes tested and columns = cell type labels
        weight, bias = model.layers[-1].get_weights()
        # weight, bias = model.get_layer("output").get_weights()
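        # The Dense kernel has shape (n_genes, n_classes): one column of
        # coefficients per cluster.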
        df_weights = pd.DataFrame.from_records(
            weight,
            index=adata.var.index,  # index is gene
            columns=encoder.classes_)
        # Save the weights dataframe.
        out_f = '{}-weights.tsv.gz'.format(out_file_base)
        df_weights.to_csv(out_f,
                          sep='\t',
                          index=True,
                          index_label='ensembl_gene_id',
                          quoting=csv.QUOTE_NONNUMERIC,
                          na_rep='',
                          compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot the number of features with non-zero coefficients in each
        # cluster.
        out_f = '{}-n_features.png'.format(out_file_base)
        df_plt = pd.DataFrame({
            'classes': df_weights.columns,
            'features': (df_weights != 0).sum(axis=0)
        })
        df_plt = df_plt.set_index('classes')
        # print(df_plt)
        # Add in categories with no predictive model (e.g., because they were
        # too few in training).
        for i in adata.obs['cluster'].cat.categories:
            if i not in df_plt.index:
                df_plt = df_plt.append(
                    pd.Series([0], index=df_plt.columns, name=i))
        fig = plt.figure(figsize=(max(0.5 * len(df_plt.index), 5), 4))
        # plt.bar(lr.classes_, n_features)
        plt.bar(df_plt.index, df_plt['features'])
        plt.xlabel('Cluster')
        plt.ylabel('Features with coefficient != 0')
        plt.xticks(rotation=90)
        for i in df_plt.index:
            plt.annotate(str(df_plt.loc[i, 'features']),
                         xy=(i, df_plt.loc[i, 'features']))
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)

        # Plot ROC of the test and truth.
        out_f = '{}-roc.png'.format(out_file_base)
        fig = plt.figure()
        cell_label_true = y_prob_df.pop('cell_label_true')
        # Drop columns that are not cell type labels
        for i in y_prob_df.columns:
            if 'class__' not in i:
                del y_prob_df[i]
        plot_roc(y_prob_df.values, cell_label_true.values, y_prob_df.columns)
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot metrics vs cluster size to see if smaller clusters have poorer
        # metric measures.
        df_plt = model_report.fillna(0)
        for i in df_plt.index:
            if i not in encoder.classes_:
                df_plt = df_plt.drop(i)
        for i in ['AUC', 'f1-score', 'average_precision_score', 'MCC']:
            out_f = '{}-cluster_size_{}.png'.format(out_file_base, i)
            fig = plt.figure()
            plt.scatter(df_plt['n_cells_full_dataset'], df_plt[i], alpha=0.5)
            plt.xlabel('Number of cells in cluster (full dataset)')
            plt.ylabel(i)
            if i in ['AUC', 'f1-score', 'average_precision_score']:
                plt.ylim(0, 1)
            elif i == 'MCC':
                plt.ylim(-1, 1)
            # Add annotation of the cluster
            for index, row in df_plt.iterrows():
                if row['n_cells_full_dataset'] == 0:
                    print('ERROR: n_cells_full_dataset = 0 for {}.'.format(
                        index))
                plt.annotate(
                    index,  # this is the text
                    (row['n_cells_full_dataset'], row[i]),  # point to label
                    textcoords='offset points',  # how to position the text
                    xytext=(0, 10),  # distance from text to points (x,y)
                    ha='center'  # horiz alignment can be left, right, center
                )
            fig.savefig(out_f, dpi=300, bbox_inches='tight')
            plt.xscale('log', basex=10)
            fig.savefig('{}-cluster_size_{}_log10.png'.format(
                out_file_base, i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
            if verbose:
                print('Completed: save {}.'.format(out_f))

        # Plot history of metrics over epochs
        for dat_i in history.history.keys():
            fig = plt.figure()
            plt.plot(history.history[dat_i])
            plt.ylabel(dat_i)
            plt.xlabel('Epoch')
            fig.savefig('{}-model_iter_{}.png'.format(out_file_base, dat_i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
Beispiel #29
0
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import plotnine as p9


df = pd.read_csv('exercise.csv')

results = smf.ols('y ~ x1 + x2 + x1*x2', data=df).fit()
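# In the patsy formula, x1*x2 expands to x1 + x2 + x1:x2, so the model fits
# both main effects and their interaction.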

wyn = results.params

print(results.summary())


fig1 = (p9.ggplot(p9.aes(x='x1', y='y'), data=df)
        + p9.geom_jitter(width=0.1)
        + p9.geom_abline(p9.aes(intercept=wyn['Intercept'], slope=wyn['x1'])))


print(fig1)
Beispiel #30
0
import pandas as pd
import statsmodels.formula.api as smf
import plotnine as p9
from plotnine import save_as_pdf_pages

beauty = pd.read_csv("beauty.csv")

dane = pd.read_csv("beauty.csv")
print(len(dane))

figures = []

piekno = "btystdave"

results = smf.ols("courseevaluation" +"~btystdave", data=dane).fit()
wyn=results.params

fig1=(p9.ggplot(p9.aes(x="btystdave",y="courseevaluation"),data=dane)
      +p9.geom_jitter(width=0.1)
      +p9.geom_abline(p9.aes(intercept=wyn['Intercept'],slope=wyn["btystdave"])))
print(fig1)
figures.append(fig1)

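# Residual plot: residuals are the observed courseevaluation minus the
# model's fitted values.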
df2 = beauty
df2['y_pred'] = results.predict()
df2['residuals'] = df2['courseevaluation'] - df2['y_pred']
fig2_res = (p9.ggplot(p9.aes(x='btystdave', y='residuals'), data=beauty)
            + p9.geom_point())
print(fig2_res)
figures.append(fig2_res)

results = smf.ols("courseevaluation" +"~btystdavepos + btystdave", data=dane).fit()
wyn=results.params