def go_to_time_plot3(large_go_to_time_probs_new: list,
                     large_go_to_time_probs_old: list,
                     average_minutes_per_game_values: list):
    """ Plot go-to-time probability, old vs. new rules, no blowouts, 300 matches/round """

    large_time_prob_data = pd.DataFrame({
        'Average minutes per game':
        np.concatenate(
            [average_minutes_per_game_values,
             average_minutes_per_game_values]),
        'P(Go to time)':
        np.concatenate(
            [large_go_to_time_probs_new, large_go_to_time_probs_old]),
        'Rules':
        np.concatenate([
            np.repeat('New', len(average_minutes_per_game_values)),
            np.repeat('Old', len(average_minutes_per_game_values))
        ])
    })
    (plt.ggplot(
        large_time_prob_data,
        plt.aes(x='Average minutes per game', y='P(Go to time)',
                color='Rules')) + plt.geom_line() + plt.geom_point() +
     plt.ylim([0, 1]) + plt.theme_classic()).save(
         filename='figures/go_to_time_300_matches_prob_plot.png')
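A minimal usage sketch (hypothetical values; assumes plotnine is imported as plt, per this snippet's alias, and that a figures/ directory already exists):

import numpy as np
import pandas as pd
import plotnine as plt  # this example aliases plotnine, not matplotlib, as plt

minutes = [20, 25, 30, 35, 40]
probs_new = [0.05, 0.12, 0.25, 0.41, 0.60]  # hypothetical probabilities
probs_old = [0.10, 0.22, 0.40, 0.58, 0.75]  # hypothetical probabilities
go_to_time_plot3(probs_new, probs_old, minutes)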
Example #2
def plot_vs_discrete(data_table,
                     discrete_metric_name,
                     metric_name,
                     segment_name,
                     title,
                     ylim=None,
                     aggregate="mean"
                     ):
    data_filtered = \
        data_table.loc[((pd.notnull(data_table[metric_name])) & (pd.notnull(data_table[discrete_metric_name])))][
            [discrete_metric_name, metric_name, segment_name]]

    data_filtered[[metric_name]] = data_filtered[[metric_name]].astype(float)
    result = data_filtered.groupby([discrete_metric_name, segment_name]).agg({metric_name: aggregate}).reset_index()
    result[metric_name] = round(result[metric_name], 3)

    gg_result = plot.ggplot(result) + plot.aes(x=discrete_metric_name,
                                               y=metric_name,
                                               fill=segment_name,
                                               label=metric_name
                                               ) + \
                plot.geom_bar(stat="identity", position="dodge") + \
                plot.geom_text(position=plot.position_dodge(width=.9), size=8) + \
                plot.labs(x=discrete_metric_name, y=aggregate + "(" + metric_name + ")", title=title)

    if ylim is not None:  # pd.notnull on a tuple is elementwise and ambiguous in a bool context
        gg_result = gg_result + plot.ylim(ylim)

    return gg_result
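A hypothetical call for reference, assuming plotnine is imported as plot (per the snippet) and a table with one discrete column, one numeric metric, and one segment column (all column names below are invented):

import pandas as pd
import plotnine as plot

data = pd.DataFrame({
    'device': ['mobile', 'mobile', 'desktop', 'desktop'],
    'ctr': [0.11, 0.14, 0.09, 0.10],
    'country': ['US', 'DE', 'US', 'DE'],
})
fig = plot_vs_discrete(data, 'device', 'ctr', 'country',
                       title='CTR by device', ylim=(0, 1))
print(fig)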
Example #3
def go_to_time_plot2(go_to_time_probs_new: list, go_to_time_probs_old: list,
                     go_to_time_blowout_probs_new: list,
                     go_to_time_blowout_probs_old: list,
                     average_minutes_per_game_values: list):
    """ Plot go-to-time probability, new vs. old rules, blowouts vs. no blowouts, 85 matches/round """

    time_prob_blowout_data = pd.DataFrame({
        'Average minutes per game':
        np.concatenate([
            average_minutes_per_game_values, average_minutes_per_game_values,
            average_minutes_per_game_values, average_minutes_per_game_values
        ]),
        'P(Go to time)':
        np.concatenate([
            go_to_time_probs_new, go_to_time_probs_old,
            go_to_time_blowout_probs_new, go_to_time_blowout_probs_old
        ]),
        'Rules':
        np.concatenate([
            np.repeat('New, no blowouts',
                      len(average_minutes_per_game_values)),
            np.repeat('Old, no blowouts',
                      len(average_minutes_per_game_values)),
            np.repeat('New, blowouts', len(average_minutes_per_game_values)),
            np.repeat('Old, blowouts', len(average_minutes_per_game_values))
        ])
    })

    (plt.ggplot(
        time_prob_blowout_data,
        plt.aes(x='Average minutes per game', y='P(Go to time)',
                color='Rules')) + plt.geom_line() + plt.geom_point() +
     plt.ylim([0, 1]) + plt.theme_classic()).save(
         filename='figures/go_to_time_prob_with_blowouts_plot.png')
Example #4
def plot_ci_eval(df):
    molten = pd.melt(df,
                     id_vars=['sample_size'],
                     value_vars=['bootstrap', 'ztest', 'ttest'])

    return (ggplot(molten, aes(x='sample_size', y='value', color='variable')) +
            geom_line() + scale_x_log10() + ylim(0, 1))
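For context, a hedged sketch of the input frame this function expects: one row per sample size, with per-method coverage rates (values below are made up):

import pandas as pd
from plotnine import ggplot, aes, geom_line, scale_x_log10, ylim

df = pd.DataFrame({
    'sample_size': [10, 100, 1000],
    'bootstrap': [0.89, 0.93, 0.95],  # hypothetical CI coverage rates
    'ztest': [0.84, 0.93, 0.95],
    'ttest': [0.91, 0.94, 0.95],
})
print(plot_ci_eval(df))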
Example #5
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    mean_len_df = len_df.groupby(['Task', 'Method']).mean().reset_index()
    mean_len_df[' '] = 'Mean Length'

    plt = (ggplot(len_df) + aes(x='x', fill='Method', y='..density..') +
           geom_histogram(binwidth=2, position='identity', alpha=.6) +
           geom_text(aes(x='x', y=.22, label='x', color='Method'),
                     mean_len_df,
                     inherit_aes=False,
                     format_string='{:.1f}',
                     show_legend=False) +
           geom_segment(aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
                        mean_len_df,
                        inherit_aes=False,
                        color='black') + scale_linetype_manual(['dashed']) +
           facet_wrap('Task') + xlim(0, 20) + ylim(0, .23) +
           xlab('Example Length') + ylab('Frequency') +
           scale_color_manual(values=COLORS) +
           scale_fill_manual(values=COLORS) + theme_fs() + theme(
               aspect_ratio=1,
               legend_title=element_blank(),
               legend_position=legend_position,
               legend_box=legend_box,
           ))

    return plt
Example #6
def plot_fees(fees, title, y_axis, years, filename):
    p = pn.ggplot(fees, pn.aes('year', y_axis, color = 'conference', shape = 'conference')) + \
        pn.geom_point() + \
        pn.geom_line() + \
        pn.labs(title = title, x = 'Year', y = 'Fee (€)') + \
        pn.ylim(0, 1000) + \
        pn.theme_light() + \
        pn.scale_x_continuous(breaks = years) + \
        pn.scale_colour_discrete(name = 'Conference') + \
        pn.scale_shape_discrete(name = 'Conference')

    p.save(filename, width=6, height=3, dpi=300)
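A hypothetical invocation (fee values invented; pn is plotnine, as in the snippet):

import pandas as pd
import plotnine as pn

fees = pd.DataFrame({
    'year': [2019, 2020, 2019, 2020],
    'fee': [450, 500, 700, 750],  # hypothetical fees in EUR
    'conference': ['ConfA', 'ConfA', 'ConfB', 'ConfB'],
})
plot_fees(fees, 'Registration fees over time', 'fee',
          years=[2019, 2020], filename='fees.png')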
Example #7
def plot_action_proportion(df_agent):
    """Plot the action proportion for the sub-dataframe for a single agent."""
    n_action = np.max(df_agent.action) + 1
    plt_data = []
    for i in range(n_action):
        probs = (df_agent.groupby('t').agg({
            'action': lambda x: np.mean(x == i)
        }).rename(columns={'action': 'action_' + str(i)}))
        plt_data.append(probs)
    plt_df = pd.concat(plt_data, axis=1).reset_index()
    p = (gg.ggplot(pd.melt(plt_df, id_vars='t')) +
         gg.aes('t', 'value', colour='variable', group='variable') +
         gg.geom_line(size=1.25, alpha=0.75) + gg.xlab('Timestep (t)') +
         gg.ylab('Action probability') + gg.ylim(0, 1) +
         gg.scale_colour_brewer(name='Variable', type='qual', palette='Set1'))
    return p
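A self-contained sketch with synthetic data, assuming gg is plotnine (df_agent needs integer action codes and a timestep column t):

import numpy as np
import pandas as pd
import plotnine as gg

rng = np.random.default_rng(0)
df_agent = pd.DataFrame({
    't': np.repeat(np.arange(100), 10),       # 100 timesteps, 10 runs each
    'action': rng.integers(0, 3, size=1000),  # 3 possible actions
})
print(plot_action_proportion(df_agent))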
Example #8
def make_plot(name):
    df = pd.read_csv(f'small_n/results/{name}.csv')

    molten = pd.melt(
        df,
        id_vars=['sample_size'],
        value_vars=['bootstrap', 'ztest', 'ttest'],
        var_name='method',
        value_name='success',
    )

    (ggplot(molten, aes(x='sample_size', y='success', color='method')) +
     geom_line(size=1) + scale_x_log10() + ylim(0, 1) + geom_hline(
         yintercept=0.95, linetype='dotted', color='#FF5500', size=3)).save(
             f'slides/static/plots/{name}.png',
             height=7.0,
             width=10,
             units='in')
Example #9
def grid_search_models(X, y):

    # get only exons 4-12

    X2 = X[:,3:12]
    X_train, X_test, y_train, y_test = train_test_split(X2,y,test_size=0.3)

    #SVM

    svc = SVC()
    param_grid = {'C':[0.5,1,2,3,5,6,7,8,9,10],'kernel':['rbf','linear','poly','sigmoid'],'degree':[2,3,4,5,6]}
    grid_search_svc = GridSearchCV(svc, param_grid,
                               scoring='accuracy')
    grid_search_svc.fit(X_train, y_train)

    #logistic regression

    lr = LogisticRegression(solver='liblinear')  # liblinear supports both 'l1' and 'l2' penalties
    param_grid = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2, 3, 4, 5, 8, 10]}
    grid_search_lr = GridSearchCV(lr, param_grid,
                               scoring='accuracy')
    grid_search_lr.fit(X_train, y_train)

    #decision tree

    dt = DecisionTreeClassifier()
    param_grid = {'max_depth': [3, 10, 20, 30], 'max_leaf_nodes': [2, 4, 6, 8],'min_samples_leaf':[1,2,3],'min_samples_split':[2,4,6]}
    grid_search_dt = RandomizedSearchCV(dt, param_grid, cv=10,
                               scoring='accuracy')
    grid_search_dt.fit(X_train, y_train)

    # plot performances

    data = {
        'Model':['SVM']*10 + ['LogisticRegression']*10 + ['DecisionTree']*10,
        'Accuracy':list(cross_val_score(grid_search_svc.best_estimator_,X_train,y_train,cv=10)) + \
        list(cross_val_score(grid_search_lr.best_estimator_,X_train,y_train,cv=10)) + \
        list(cross_val_score(grid_search_dt.best_estimator_,X_train,y_train,cv=10))
    }
    data = pd.DataFrame(data)
    data['Model'] = pd.Categorical(data['Model'], categories=['SVM','LogisticRegression','DecisionTree'], ordered=True)

    p = pn.ggplot(data,pn.aes('Model','Accuracy')) + pn.geom_boxplot() + pn.ylim(0,1)
    p.save('./plots/tumor_genotype_prediction/accuracy-model.png')
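The function above uses several names it never imports; a plausible header, assuming standard scikit-learn, pandas, and plotnine (aliased pn):

import pandas as pd
import plotnine as pn
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     cross_val_score, train_test_split)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier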
Example #10
def gene_log_HR_plot(inFile, pcaFile=None, model=None):
    # get logHRs
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1  # halve: logHR stacks tumor and non-tumor blocks
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    t_logHR = par["means"]["logHR"][components, 0]
    tf_logHR = par["means"]["logHR"][tf_components, 0]

    t_logHR_sd = par["stds"]["logHR"][components, 0]
    tf_logHR_sd = par["stds"]["logHR"][tf_components, 0]

    # get pca
    if pcaFile is None:
        pcaFile = inFile.replace("_params.hdf5", "_pca.pkl")
    with open(pcaFile, "rb") as buff:
        pca = pickle.load(buff)

    # prep dataframe
    n_genes = pca.components_.shape[1]
    if model is None:
        logHR_df = pd.DataFrame(index=[f"{i+1}" for i in range(n_genes)])
    else:
        logHR_df = pd.DataFrame(index=model.counts.index)
    logHR_df["tumor logHR"] = pca.inverse_transform(t_logHR)
    logHR_df["non-tumor logHR"] = pca.inverse_transform(tf_logHR)
    logHR_df["tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * t_logHR_sd[:, None])**2, axis=0))
    logHR_df["non-tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * tf_logHR_sd[:, None])**2, axis=0))
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["tumor logHR sd"])
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2

    # make plot
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(pn.aes("non-tumor logHR", "tumor logHR"), logHR_df) +
          pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.theme_minimal() +
          pn.geom_point(alpha=0.3, color="red") + pn.geom_abline())
    return pl, logHR_df
Example #11
def plot_cor(df):
    # drop missing correlations
    out = df[~df['corr'].isnull()]
    # add pair column
    out = out.assign(pair=out.col_1 + '&' + out.col_2)
    # add a sign column
    sign = ((out['corr'] > 0).astype('int')).to_list()
    sign = [['Negative', 'Positive'][i] for i in sign]
    out['sign'] = sign
    #out  = out.sort_values('pair', ascending = False).reset_index(drop = True)
    # add ind column
    out['ind'] = [out.shape[0] - i for i in range(out.shape[0])]
    # plot using bands
    ggplt = p9.ggplot(data = out, mapping = p9.aes(x = 'pair', y = 'corr')) \
        + p9.geom_hline(
            yintercept = 0,
            linetype = "dashed",
            color = "#c2c6cc"
            ) \
        + p9.geom_rect(
            alpha = 0.4,
            xmin = out.ind.values - 0.4,
            xmax = out.ind.values + 0.4,
            ymin = out.lower.values,
            ymax = out.upper.values,
            fill = [['b', '#abaeb3'][int(x > 0.05)] for x in out.p_value]  # grey out non-significant bands
          ) \
        + p9.geom_segment(
            x = out.ind.values - 0.4,
            y = out['corr'].values,
            xend = out.ind.values + 0.4,
            yend = out['corr'].values
          ) \
        + p9.coord_flip() \
        + p9.ylim(np.min(out.lower.values), np.max(out.upper.values)) \
        + p9.labs(x = "", y = "Correlation")
    return ggplt
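A hedged example of the summary table plot_cor expects: pairwise correlations with confidence bounds and p-values (all values invented):

import numpy as np
import pandas as pd
import plotnine as p9

df = pd.DataFrame({
    'col_1': ['age', 'age', 'bmi'],
    'col_2': ['bmi', 'hr', 'hr'],
    'corr': [0.62, -0.18, 0.05],
    'lower': [0.40, -0.45, -0.20],
    'upper': [0.78, 0.12, 0.29],
    'p_value': [0.001, 0.21, 0.70],
})
print(plot_cor(df))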
Example #12
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    mean_len_df = len_df.groupby(['Task', 'Method']).mean().reset_index()
    mean_len_df[' '] = 'Mean Length'

    plt = (
        ggplot(len_df)
        + aes(x='x', fill='Method', y='..density..')
        + geom_histogram(binwidth=2, position='identity', alpha=.6)
        + geom_text(
            aes(x='x', y=.22, label='x', color='Method'),
            mean_len_df,
            inherit_aes=False,
            format_string='{:.1f}',
            show_legend=False
        )
        + geom_segment(
            aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
            mean_len_df,
            inherit_aes=False, color='black'
        )
        + scale_linetype_manual(['dashed'])
        + facet_wrap('Task')
        + xlim(0, 20) + ylim(0, .23)
        + xlab('Example Length') + ylab('Frequency')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + theme(
            aspect_ratio=1,
            legend_title=element_blank(),
            legend_position=legend_position,
            legend_box=legend_box,
        )
    )

    return plt
Example #13
print("\n\nThe predicted acceptable range at age ", str(age), " is from ",
      str(min_acceptable_range), " to ", str(max_acceptable_range), "\n\n")

# save csv file
outlierfile = filename.replace('.csv', '_outliers.csv')

data_output.to_csv(outlierfile, index=False)

# plot overlay of IQR and mod-Z score outliers
p = (
    p9.ggplot(data=data_output,
              mapping=p9.aes(x='age_rounded', y='value', group='age_rounded'))
    + p9.geom_jitter(mapping=p9.aes(color='z_outlier'), alpha=0.1) +  # alpha is a layer setting, not an aesthetic
    p9.geom_boxplot(outlier_size=0, outlier_stroke=0) + p9.ggtitle(
        "Outliers detected via the IQR method (boxplot)\nand modified z-score method (dotplot)"
    ) + p9.ylim(-10, 175))
print(p)
plotfile = filename.replace('.csv', '_outlierplot.png')  # an extension lets ggsave infer the format
p9.ggsave(plot=p, filename=plotfile)

# plot regression
x = data_stats_regression['age_rounded']
y = data_stats_regression['median']
plt.plot(x, y, 'o')
plt.plot(x, r.func_linear(x, *linear_coeff))
plt.plot(x, r.func_log(x, *log10_coeff))
plt.plot(x, r.func_ln(x, *ln_coeff))
plt.title(
    "Regression performed on medians of age 1, 3 and 5\ndata with outliers removed"
)
plt.show()
Example #14
def log_HR_plot(inFile, label_unit=10, log_scale_color=True):
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    logHR_df = pd.DataFrame(index=[f"{i+1}" for i in components])
    logHR_df["tumor logHR"] = par["means"]["logHR"][components, 0]
    logHR_df["non-tumor logHR"] = par["means"]["logHR"][tf_components, 0]
    logHR_df["component"] = components
    logHR_df["label"] = [
        logHR_df.index[i] if i <= label_unit else "" for i in components
    ]
    logHR_df["tumor logHR sd"] = par["stds"]["logHR"][components, 0]
    logHR_df["non-tumor logHR sd"] = par["stds"]["logHR"][tf_components, 0]
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["tumor logHR sd"])
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2
    logHR_df["tumor -log10(p-value)"] = -np.log10(logHR_df["tumor p-value"])
    logHR_df["non-tumor -log10(p-value)"] = -np.log10(
        logHR_df["non-tumor p-value"])

    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(
        pn.aes(
            "non-tumor logHR",
            "tumor logHR",
            color="non-tumor p-value",
            fill="tumor p-value",
            label="label",
        ),
        logHR_df,
    ) + pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.geom_abline() +
          pn.geom_point() + pn.theme_minimal() +
          pn.geom_text(ha="left", va="bottom", color="black"))
    if log_scale_color:
        pl += pn.scale_color_cmap(trans="log")
        pl += pn.scale_fill_cmap(trans="log")

    lb = min(
        logHR_df["non-tumor -log10(p-value)"].min(),
        logHR_df["tumor -log10(p-value)"].min(),
    )
    ub = max(
        logHR_df["non-tumor -log10(p-value)"].max(),
        logHR_df["tumor -log10(p-value)"].max(),
    )
    pl_p = (pn.ggplot(
        pn.aes(
            "non-tumor -log10(p-value)",
            "tumor -log10(p-value)",
            color="component",
            label="label",
        ),
        logHR_df,
    ) + pn.geom_point() + pn.xlim(lb, ub) + pn.ylim(lb, ub) +
            pn.theme_minimal() +
            pn.geom_text(ha="left", va="bottom", color="black"))
    return pl, pl_p, logHR_df
Example #15
def gene_profile(genes: list, 
                 weights: pd.DataFrame, 
                 stddev: pd.DataFrame=None,
                 y_axis_label: str=None,
                 highlight_n: int=None, 
                 highlight_anno: list=None, 
                 figsize: tuple=None,
                 ylim: tuple=None) -> p9.ggplot:
    """
    
    Parameters
    ----------
    genes          : a single str or list of genes to include in plot as facets
    weights        : DataFrame of ES weights (genes x annotations)
    stddev         : optional DataFrame of ESw standard deviations, same shape as weights
    y_axis_label   : label for the y-axis (default: "Expression Specificity")
    highlight_n    : number of highest ESw to highlight
    highlight_anno : specific annotations to highlight
    figsize        : (float, float), optional; width and height of plot
    ylim           : (float, float), optional (default: (-1, 1))
    
    Returns
    -------
        g    : ggplot
        
    Todo:
        * find a better way for sorting cell-types along x-axis
        * report if gene in genes is not found in df
        * report if duplicate genes
        * replace hacky x-axis labelling
    
    """
    
    ### Reduce dataframe to genes of interest
    genes = [str.upper(s) for s in genes]
    idx = np.char.upper(weights.index.values.astype(str))
    mask = np.isin(idx, genes)
    df_tidy = weights[mask]
    n_genes = len(df_tidy)

    assert (n_genes >= 1), "No matching genes found in dataframe."

    stddev_tidy = None
    if stddev is not None:
        idx = np.char.upper(stddev.index.values.astype(str))
        mask = np.isin(idx, genes)
        stddev_tidy = stddev[mask]
        assert (len(stddev_tidy) >= 1), "No matching genes found in stddev dataframe."

    # Constants, height and width of plot.
    if figsize is None:
        H = 5*n_genes
        W = 15
    else:
        W, H = figsize

    if ylim is None:
        ylim = (-1,1)
    
    if y_axis_label is None:
        y_axis_label = "Expression Specificity"
    
    ### Convert to tidy / long format if necessary
    # Org:
    #       ABC  ACBG  ACMB
    # POMC  0.0   0.5   0.9
    # AGRP  0.2   0.0   0.0
    # LEPR  0.1   0.1   0.4
    
    # Tidy:
    #   gene_name annotation    es_weight
    # 1 POMC      ABC           0.0
    # 2 AGRP      ABC           0.6
    # 3 LEPR      ABC           1.0     

    df_tidy.index.name = None # ensure that index name is none, so "index" is used for id_vars
    df_tidy = pd.melt(df_tidy.reset_index(), id_vars="index", var_name="annotation", value_name="weight")
    
    if stddev_tidy is not None:
        stddev_tidy.index.name = None
        stddev_tidy = pd.melt(stddev_tidy.reset_index(), id_vars="index", var_name="annotation", value_name="stddev")
        df_tidy = df_tidy.merge(stddev_tidy, on=["index", "annotation"])


    ### Sort values by gene_name and es_weight and add order
    # Sorted:
    #   gene_name annotation   es_weight   x_order
    # 1 AGRP      MOL2         0.0         1
    # 2 AGRP      ACNT1        0.1         2
    # 3 AGRP      MOL1         0.2         3
    
    df_tidy = df_tidy.sort_values(by=["index", "weight"])
    df_tidy["order"] = np.arange(len(df_tidy)) + 1
    
    ### Generate highlight
    # Default: highlight top 5
    if ((highlight_n is None) and (highlight_anno is None)):
        highlight_n = 5

    # highlight a specific list of annotations if given, otherwise the top n
    if (highlight_anno is not None):
        df_tidy["highlight"] = df_tidy["annotation"].isin(highlight_anno)
    elif (highlight_n is not None):
        df_tidy["highlight"] = df_tidy.groupby("index")["order"].rank("first", ascending=False) <= highlight_n
    else:
        df_tidy["highlight"] = np.array([False] * len(df_tidy))
    
    df_highlight = df_tidy[df_tidy["highlight"]]
    
    ### Plot
    # linear function to compute x_axis text-size.
    # Mainly depends on the number of rows per facet, i.e. len(df_tidy) / len(genes).
    SIZE_TEXT_X_AXIS = 10.161 - 0.023 * (len(df_tidy) / len(genes))
    
    # Limits of the order for each index gene / facet, e.g. [0, 266, 531]
    # These limits are necessary to only plot the labels
    order_lims = [0, *(df_tidy.groupby("index")["order"].max().values)]
    
    def find_nearest(array,value):
        array = np.asarray(array)
        idx = (np.abs(array - value)).argmin()
        return array[idx]
        
    def getbreaks(lims):
        # function defined for use in debugging
        l = find_nearest(order_lims, lims[0])
        r = find_nearest(order_lims, lims[1])
        breaks = np.arange(l, r)
        return breaks

    def getlbls(idx):
        # labeller: map break positions back to annotation names
        lbls = df_tidy["annotation"].iloc[idx].values
        return lbls
    
    p = (
        ### data
        p9.ggplot(data=df_tidy, mapping=p9.aes(x="order", y="weight", label="annotation"))

        ### theming
        + p9.theme_classic()
        + p9.theme(
            figure_size = (W,H),
            axis_ticks_major_x = p9.element_blank(),
            axis_text_x = p9.element_text(rotation=75, hjust=0, size=SIZE_TEXT_X_AXIS),
            axis_text_y = p9.element_text(size=W),
            panel_spacing = 1,
            strip_background = p9.element_blank()
        )

        + p9.ylim(ylim[0],ylim[1])

        + p9.labs(
            x="", # e.g. "Cell-type"
            y=y_axis_label, # e.g. "ES weight"
        )

        ### viz
        # all
        + p9.geom_segment(mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
                       color="grey",
                       alpha=0.3,
                       show_legend=False
        )

        + p9.geom_point(mapping=p9.aes(size=2),
                     color="grey",
                    show_legend=False
        )

        # highlight
        + p9.geom_point(data=df_highlight, mapping=p9.aes(size=2), 
                     color="dodgerblue",
                    show_legend=False
        )

        + p9.geom_segment(data=df_highlight, mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
                       color="dodgerblue",
                       alpha=0.3,
                       show_legend=False
        )

        + p9.facet_wrap("index",
                     scales="free",
                     nrow=n_genes
                    )
        
        + p9.scale_x_continuous(
            # order_scale is continuous across all annotations
            # so the scale will look weird for each facet, e.g.
            # facet 1 may have order 1-7, and facet 2 has order 8-14.
            # therefore we must use a labeller function to get the 
            # correct labels for each interval of order.
            breaks = lambda lims: getbreaks(lims),
            labels = lambda idx: getlbls(idx)
        )
    )
    
    if stddev_tidy is not None:
        p = p + p9.geom_errorbar(mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"), 
                                    color="grey", width=0.1)\
                + p9.geom_errorbar(data=df_highlight, mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"),
                                color="dodgerblue", width=0.1)

    # add labels last for them to be on top
    p = p + p9.geom_label(data=df_highlight,
                    color = "dodgerblue",
                    adjust_text = {'expand_points': (2,2)}
        )

    return p
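A usage sketch under stated assumptions: a hypothetical ES-weight matrix with genes as the index and annotations as columns (all names invented):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
weights = pd.DataFrame(
    rng.uniform(0, 1, size=(3, 8)),
    index=['POMC', 'AGRP', 'LEPR'],
    columns=[f'celltype_{i}' for i in range(8)],
)
fig = gene_profile(['pomc', 'agrp'], weights, highlight_n=3)
fig.save('gene_profile.png')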
    
Example #16
targene_geo_mutant = output[output['status_sign'] == 1]
targene_geo_wt = output[output['status_sign'] == -1]

# Output t-test results
t_results_geo_targene = ttest_ind(a = targene_geo_mutant['weight'],
                              b = targene_geo_wt['weight'], equal_var = False)
print('Statistic = {:.2f}, p = {:.2E}'.format(t_results_geo_targene[0],
                                              Decimal(t_results_geo_targene[1])))

# graphical output for predictions
p = (gg.ggplot(output,
               gg.aes(x='weight', y='dummy_y', color='factor(status_sign)')) +
     gg.geom_hline(gg.aes(yintercept=0), linetype='solid') +
     gg.geom_point(size=4) +
     gg.scale_color_manual(values=["#377eb8", "#ff7f00"], labels=['WT', 'Mutant']) +
     gg.ylim([-0.1, 0.1]) +
     gg.xlim([-0.001, 1.001]) +
     gg.theme_seaborn(style='whitegrid') +
     gg.xlab('Targene Classifier Score') +
     gg.ylab('') +
     gg.labs(color='Sample_status') +
     gg.ggtitle('Mutant vs WT \n') +
     gg.theme(
        plot_title=gg.element_text(size=22),
        axis_title_x=gg.element_text(size=16),
        axis_text_x=gg.element_text(size=16),
        axis_text_y=gg.element_blank(),
        axis_ticks_length=4,
        axis_ticks_major_y=gg.element_blank(),
        axis_ticks_minor_y=gg.element_blank(),
        axis_ticks_minor_x=gg.element_blank()))
Example #17
                               index=image_meta_col_list + ["Ch"],
                               columns=["type"]).reset_index()
    cp_sat_df.columns = image_meta_col_list + [
        "Ch", "PercentMax", "StdIntensity"
    ]

    cp_saturation_ymax = max(cp_sat_df.PercentMax)
    if cp_saturation_ymax < 1:
        cp_saturation_ymax = 1

    cp_saturation_gg = (
        gg.ggplot(
            cp_sat_df,
            gg.aes(x="StdIntensity", y="PercentMax", label=image_cols["site"]),
        ) + gg.coord_fixed(ratio=0.25) + gg.geom_text(size=6) +
        gg.ylim([0, cp_saturation_ymax]) +
        gg.facet_wrap(["Ch", image_cols["well"]],
                      nrow=len(painting_image_names),
                      scales="free") + gg.theme_bw() +
        gg.ggtitle(f"Cell Painting Image Saturation \n {plate}") + gg.theme(
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
            strip_text=gg.element_text(size=7),
            axis_text=gg.element_text(size=6),
            subplots_adjust={"wspace": 0.2},
        ))
    output_file = pathlib.Path(output_figuresdir, "cp_saturation.png")
    if check_if_write(output_file, force, throw_warning=True):
        cp_saturation_gg.save(
            output_file,
            dpi=300,
            width=(len(cp_sat_df[image_cols["well"]].unique()) + 2),
Example #18
    def barchart_make(roi, df, list_rois, config, ylimit, save_function,
                      find_ylim_function):
        thisroi = list_rois[roi]

        current_df = df.loc[df['index'] == thisroi]

        current_df = current_df.sort_values([config.single_roi_fig_x_axis])
        current_df = current_df.reset_index(
            drop=True)  # Reset index to remove grouping
        current_df[config.single_roi_fig_x_axis] = pd.Categorical(
            current_df[config.single_roi_fig_x_axis],
            categories=current_df[config.single_roi_fig_x_axis].unique())

        figure = (
            pltn.ggplot(
                current_df,
                pltn.aes(x=config.single_roi_fig_x_axis,
                         y='Mean',
                         ymin="Mean-Conf_Int_95",
                         ymax="Mean+Conf_Int_95",
                         fill='factor({colour})'.format(
                             colour=config.single_roi_fig_colour))) +
            pltn.theme_538() + pltn.geom_col(position=pltn.position_dodge(
                preserve='single', width=0.8),
                                             width=0.8,
                                             na_rm=True) +
            pltn.geom_errorbar(size=1,
                               position=pltn.position_dodge(
                                   preserve='single', width=0.8)) +
            pltn.labs(x=config.single_roi_fig_label_x,
                      y=config.single_roi_fig_label_y,
                      fill=config.single_roi_fig_label_fill) +
            pltn.scale_x_discrete(labels=[]) +
            pltn.theme(panel_grid_major_x=pltn.element_line(alpha=0),
                       axis_title_x=pltn.element_text(
                           weight='bold', color='black', size=20),
                       axis_title_y=pltn.element_text(
                           weight='bold', color='black', size=20),
                       axis_text_y=pltn.element_text(size=20, color='black'),
                       legend_title=pltn.element_text(size=20, color='black'),
                       legend_text=pltn.element_text(size=18, color='black'),
                       subplots_adjust={'right': 0.85},
                       legend_position=(0.9, 0.8),
                       dpi=config.plot_dpi) +
            pltn.geom_text(pltn.aes(y=-.7, label=config.single_roi_fig_x_axis),
                           color='black',
                           size=20,
                           va='top') + pltn.scale_fill_manual(
                               values=config.colorblind_friendly_plot_colours))

        if ylimit:
            # Set y limit of figure (used to make it the same for every barchart)
            figure += pltn.ylim(None, ylimit)
            thisroi += '_same_ylim'

        returned_ylim = 0
        if config.use_same_axis_limits in ('Same limits',
                                           'Create both') and ylimit == 0:
            returned_ylim = find_ylim_function(thisroi, figure, 'yaxis')

        if config.use_same_axis_limits == 'Same limits' and ylimit == 0:
            return returned_ylim
        elif ylimit != 0:
            folder = 'Same_yaxis'
        else:
            folder = 'Different_yaxis'

        save_function(figure, thisroi, config, folder, 'barchart')

        return returned_ylim
Example #19
        cp_sat_df, index=image_meta_col_list + ["Ch"], columns=["type"]
    ).reset_index()
    cp_sat_df.columns = image_meta_col_list + ["Ch", "PercentMax", "StdIntensity"]

    cp_saturation_ymax = max(cp_sat_df.PercentMax)
    if cp_saturation_ymax < 1:
        cp_saturation_ymax = 1

    cp_saturation_gg = (
        gg.ggplot(
            cp_sat_df,
            gg.aes(x="StdIntensity", y="PercentMax", label=image_cols["site"]),
        )
        + gg.coord_fixed(ratio=0.25)
        + gg.geom_text(size=6)
        + gg.ylim([0, cp_saturation_ymax])
        + gg.facet_wrap(
            ["Ch", image_cols["well"]], nrow=len(painting_image_names), scales="free"
        )
        + gg.theme_bw()
        + gg.ggtitle(f"Cell Painting Image Saturation \n {plate}")
        + gg.theme(
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
            strip_text=gg.element_text(size=7),
            axis_text=gg.element_text(size=6),
            subplots_adjust={"wspace": 0.2},
        )
    )
    output_file = pathlib.Path(output_figuresdir, "cp_saturation.png")
    if check_if_write(output_file, force, throw_warning=True):
        cp_saturation_gg.save(
Example #20
    out_i = pandas.DataFrame(sim_res_fwd[i], columns=out.columns[3:])
    out_i['time'] = t
    out_i['signal'] = C3_scan[i]
    out_i['dir'] = 'fwd'
    out = pandas.concat([out, out_i[out.columns]])
for i in range(len(sim_res_rev)):
    out_i = pandas.DataFrame(sim_res_rev[i], columns=out.columns[3:])
    out_i['time'] = t
    out_i['signal'] = numpy.flip(C3_scan)[i]
    out_i['dir'] = 'rev'
    out = pandas.concat([out, out_i[out.columns]])
out.to_csv("sim.txt", sep="\t", index=False)

###################### plotting ##################################
g = (ggplot(out, aes('time', 's2', group='signal', color='signal')) +
     geom_line(size=0.5) + ylim(0, 20000) +
     scale_color_distiller(palette='RdYlBu', type="diverging") +
     facet_wrap('~dir') + theme_bw())
g.save(filename="./num_cont_graphs/sim_fwd_rev.png",
       format="png",
       width=8,
       height=4,
       units='in',
       verbose=False)

eq = out[out.time == max(out.time)]
g = (ggplot(eq) + aes(x='signal', y='s2', color='dir') +
     geom_path(size=2, alpha=0.5) + geom_point(color="black") + theme_bw())
g.save(filename="./num_cont_graphs/sim_bif_diag.png",
       format="png",
       width=8,
Example #21
sensitivities.append(0)
especifities_1.append(0)  # so the plotted curve ends on the diagonal
# now draw the curve
import matplotlib.pyplot as plt
"""%matplotlib inline
plt.plot(especifities_1,sensitivities, marker="o", linestyle="--", color="r")
x=[i*0.01 for i in range(100)]
y=[i*0.01 for i in range(100)]
plt.plot(x,y) #pinto la diagonal (el peor modelo que existe)
plt.xlabel("1-Especificidad")
plt.ylabel("Sensibilidad")
plt.title("Curva ROC")
#recordemos que mi seleccion de variables era una mierda absoluta
"""
# the larger the area between the curve and the diagonal, the better the predictive model
from sklearn import metrics
from plotnine import ggplot, aes, geom_line, geom_area, ggtitle, xlim, ylim  # use * to import everything

espec_1, sensit, _ = metrics.roc_curve(Y_test, prob)
df = pd.DataFrame({"x": espec_1, "y": sensit})

auc = metrics.auc(espec_1, sensit)  # area under the curve

print(df.head())
print(
    ggplot(df, aes(x="x", y="y")) + geom_line() +
    geom_line(linetype="dashed") + xlim(-0.01, 1.01) + ylim(-0.01, 1.01))
print(
    ggplot(df, aes(x="x", y="y")) + geom_area(alpha=0.25) +
    geom_line(aes(y="y")) + ggtitle("Curva ROC y AUC=%s " % str(auc)))
Example #22
lmb_data['demvoteshare_c'] = lmb_data['demvoteshare'] - 0.5
# drop missing values
lmb_data = lmb_data[~pd.isnull(lmb_data.demvoteshare_c)]
lmb_data['demvoteshare_sq'] = lmb_data['demvoteshare_c']**2

#aggregating the data
lmb_data = lmb_data[lmb_data.demvoteshare.between(.45, .55)]
categories = lmb_data.lagdemvoteshare
lmb_data['lagdemvoteshare_100'] = pd.cut(lmb_data.lagdemvoteshare, 100)

agg_lmb_data = lmb_data.groupby('lagdemvoteshare_100')['score'].mean().reset_index()
lmb_data['gg_group'] = [1 if x>.5 else 0 for x in lmb_data.lagdemvoteshare]
agg_lmb_data['lagdemvoteshare'] = np.arange(0.01, 1.01, .01)

# plotting
(p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
 + p.geom_point(p.aes(x='lagdemvoteshare', y='score'), data=agg_lmb_data)
 + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group='gg_group'),
                 data=lmb_data, method="lm", formula='y ~ x + I(x**2)')
 + p.xlim(0, 1) + p.ylim(0, 100)
 + p.geom_vline(xintercept=0.5))

(p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
 + p.geom_point(p.aes(x='lagdemvoteshare', y='score'), data=agg_lmb_data)
 + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group='gg_group'),
                 data=lmb_data, method="lowess")
 + p.xlim(0, 1) + p.ylim(0, 100)
 + p.geom_vline(xintercept=0.5))

(p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
 + p.geom_point(p.aes(x='lagdemvoteshare', y='score'), data=agg_lmb_data)
 + p.stat_smooth(p.aes('lagdemvoteshare', 'score', group='gg_group'),
                 data=lmb_data, method="lm")
 + p.xlim(0, 1) + p.ylim(0, 100)
 + p.geom_vline(xintercept=0.5))
Example #23
x_axis_label = 'T-SNE Component 1'
y_axis_label = 'T-SNE Component 2'
xlim = [tsne_results_df.iloc[:, 0].min(), tsne_results_df.iloc[:, 0].max()]
ylim = [tsne_results_df.iloc[:, 1].min(), tsne_results_df.iloc[:, 1].max()]

plot = (p9.ggplot(tsne_results_df,
                    p9.aes(y=tsne_results_df.columns[1], 
                           x=tsne_results_df.columns[0],
                           group=clusters_colname,
                           color=clusters_colname
                           ))
        + p9.geom_point(size=2)
        + p9.geom_rug()
        + p9.stat_ellipse()
        + p9.xlim(xlim[0], xlim[1])
        + p9.ylim(ylim[0], ylim[1])
        #+ p9.scale_color_gradient(low='blue', high='yellow')
        #+ p9.scale_color_manual(values=colors)
        + p9.theme_light(base_size=18)
        + p9.ggtitle(plot_title)
        + p9.labs(y=y_axis_label,
                  x=x_axis_label)
        )

plot_filename = 'shap_clusters.png'
plot.save(plot_filename, width=10, height=10)
from IPython.display import Image
Image(filename=plot_filename)

Example #24
overall_preprint_survival = kmf.survival_function_.reset_index().assign(
    label="all_papers"
)
overall_preprint_survival.head()

g = (
    p9.ggplot(
        overall_preprint_survival.assign(
            timeline=lambda x: pd.to_timedelta(x.timeline, "D")
        ),
        p9.aes(x="timeline", y="KM_estimate", color="label"),
    )
    + p9.scale_x_timedelta(labels=timedelta_format("d"))
    + p9.geom_line()
    + p9.ylim(0, 1)
)
print(g)

# # Calculate Category Survival Function

# This section measures how long it takes for certain categories to get preprints published.

entire_preprint_df = pd.DataFrame([], columns=["timeline", "KM_estimate", "category"])
half_life = []
for cat, grouped_df in preprints_w_published_dates.groupby("category"):
    temp_df = preprints_w_published_dates.query(f"category=='{cat}'")
    kmf.fit(
        temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24,
        event_observed=~temp_df["published_doi"].isna(),
    )
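The kmf object above is not constructed in this excerpt; a minimal setup sketch, assuming lifelines' KaplanMeierFitter and hypothetical publication-delay data:

import pandas as pd
from lifelines import KaplanMeierFitter

kmf = KaplanMeierFitter()
durations = pd.Series([30.0, 45.5, 90.0, 120.0])  # hypothetical days to publication
observed = pd.Series([True, True, False, True])   # False = still unpublished (censored)
kmf.fit(durations, event_observed=observed)
print(kmf.survival_function_.head())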
Example #25
def main():

    args = UserInput()

    if args.y_lim:
        y_lim = np.array(args.y_lim, dtype=np.float32)
    else:
        y_lim = None
    if args.size:
        size = np.array(args.size, dtype=np.float32)
    else:
        size = np.array([8.0, 6.0], dtype=np.float32)  # fall back to a default size; None would break size[0] below

###################################

    df_list = [
        pd.read_csv(f, sep=args.sep, skipinitialspace=True)
        for f in args.infile
    ]

    ## only take input with 1 or 2 columns; for 2 columns, 1st is always removed
    lg_list = []
    for idx, df in enumerate(df_list):
        xdf = pd.DataFrame(df.iloc[:, int(args.col) - 1])

        if args.col_names:
            xdf.columns = [args.col_names[idx]]

        lg_list.append(pd.melt(xdf))

    lg_df = pd.concat(lg_list)
    lg_df.columns = [args.x_name, args.y_name]
    print(lg_df)

    ## plotnine method
    if args.use_p9:
        import plotnine as p9
        Quant = [.25, .5, .75]

        if y_lim is not None:
            set_ylim = p9.ylim(y_lim)
        else:
            set_ylim = p9.ylim(
                [lg_df[args.y_name].min(), lg_df[args.y_name].max()])

        df_plot = (p9.ggplot(
            lg_df, p9.aes(x=args.x_name, y=args.y_name, fill=args.x_name)) +
                   p9.geom_violin(
                       width=.75, draw_quantiles=Quant, show_legend=False) +
                   p9.ggtitle(args.title) + p9.theme_classic() + set_ylim +
                   p9.scale_x_discrete(limits=args.col_names) +
                   p9.theme(text=p9.element_text(size=12, color='black'),
                            axis_text_x=p9.element_text(angle=33),
                            panel_grid_major_y=p9.element_line(color='gray',
                                                               alpha=.5)))

        p9.ggsave(filename='{0}.violin.{1}'.format(args.outpref, args.img),
                  plot=df_plot,
                  dpi=int(args.dpi),
                  format=args.img,
                  width=size[0],
                  height=size[1],
                  units='in',
                  verbose=False)

    else:
        ## Seaborn method
        import seaborn as sns
        sns.set(style='whitegrid')

        ax = sns.violinplot(x=args.x_name,
                            y=args.y_name,
                            data=lg_df,
                            linewidth=1,
                            inner='box')
        if args.title:
            ax.set_title(args.title)
        if y_lim is not None:
            ax.set(ylim=y_lim)

        plt.savefig('{0}.violin.{1}'.format(args.outpref, args.img),
                    figsize=tuple(size),
                    format=args.img,
                    dpi=int(args.dpi))
        plt.clf()
Example #26
sv = scale_predictors(df, predictor='SVC')
# ld = scale_predictors(df, predictor='LDA')
nb = scale_predictors(df, predictor='naive_bayes')
rn = scale_predictors(df, predictor='Random')
ac = scale_predictors(df, predictor='acg_ip_risk')
rf = scale_predictors(df, predictor='RandmForest')
ct = scale_predictors(df, predictor='cheating')

df2 = pd.concat([nb, rn, ac, rf, sv, ct])
# df2 = pd.concat([nb, rn, ac, rf, ct])

print(df2.head(20))
print(df2.describe())
p = pn.ggplot(df2, pn.aes(x='num_examined', y='num_detected', group='classifier', colour='classifier')) +\
    pn.geom_step() +\
    pn.ggtitle("How Many ppl would we need to intervene on to prevent Y hospitalizations?")
    # pn.scales.scale_x_reverse()

p.save(HOME_DIR + 'all_together_d.png', height=8, width=10, units='in', verbose=False)

p2 = pn.ggplot(df2, pn.aes(x='num_examined', y='num_detected', group='classifier', colour='classifier')) +\
    pn.geom_step() +\
    pn.ggtitle("How Many ppl would we need to intervene on to prevent Y hospitalizations?") +\
    pn.xlim(0, 300) + pn.ylim(0, 300)
    # pn.scales.scale_x_reverse()

p2.save(HOME_DIR + 'all_together_trunc.png', height=8, width=10, units='in', verbose=False)


print("Finished!")
Example #27

def read_data(file):
    return pd.read_stata(
        "https://raw.github.com/scunning1975/mixtape/master/" + file)


start_is_born = pd.DataFrame({
    'beauty': np.random.normal(size=2500),
    'talent': np.random.normal(size=2500)
})

start_is_born['score'] = start_is_born['beauty'] + start_is_born['talent']
start_is_born['c85'] = np.percentile(start_is_born['score'], q=85)
start_is_born['star'] = 0
start_is_born.loc[start_is_born['score'] > start_is_born['c85'], 'star'] = 1
start_is_born.head()

lm = sm.OLS.from_formula('beauty ~ talent', data=start_is_born).fit()

p.ggplot(start_is_born, p.aes(x='talent', y='beauty')) + p.geom_point(
    size=0.5) + p.xlim(-4, 4) + p.ylim(-4, 4)

p.ggplot(start_is_born[start_is_born.star == 1], p.aes(
    x='talent', y='beauty')) + p.geom_point(size=0.5) + p.xlim(-4, 4) + p.ylim(
        -4, 4)

p.ggplot(start_is_born[start_is_born.star == 0], p.aes(
    x='talent', y='beauty')) + p.geom_point(size=0.5) + p.xlim(-4, 4) + p.ylim(
        -4, 4)
Example #28
def graph_kda(df):
    try:
        lm = LinearRegression()
        X = df['games_ago']
        y = df['kda']
        X = X.values.reshape(-1, 1)
        y = y.values.reshape(-1, 1)
        lm.fit(X,y)
        coef = np.transpose(lm.coef_)
        if coef[0] < 0:
          lm_color = 'green'
        elif coef[0] > 0:
          lm_color = 'red'
        else:
          lm_color = 'black'        
        kda = (ggplot(df) + aes(x='games_ago', y='kda')
               + geom_line(color="black", linetype='dashed')
               + geom_point(aes(color='lane', size=3), show_legend={'size': False})
               + ylim(0, 30) + scale_x_reverse()
               + geom_smooth(method="lm", color=lm_color)
               + xlab('Games Ago') + ylab('KDA')
               + ggtitle("KDA Over the Last 5 Games"))
    except Exception:
        return None  # a bare except would leave kda unbound at the return
    return kda
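A quick hypothetical call (columns games_ago, kda, and lane are required; assumes from plotnine import * and from sklearn.linear_model import LinearRegression, which the snippet relies on):

import pandas as pd

df = pd.DataFrame({
    'games_ago': [5, 4, 3, 2, 1],
    'kda': [2.1, 3.4, 1.8, 4.0, 5.2],  # hypothetical match stats
    'lane': ['mid', 'mid', 'top', 'mid', 'top'],
})
print(graph_kda(df))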
Example #29
                    roll_credits_times_2 += collection.autorelease()

            case.record(pokemon, collection, results, slot_machine)

        print(
            f"{collection.num_unique()} / {len(pokemon)}, ({len(collection.pokemon)})"
        )

    # Output simulation results
    data = simulation_data.to_data_frame()

    num_unique_pokemon_plot = (
        plt9.ggplot(data, plt9.aes("roll_num", "num_unique_pokemon", color="case_id"))
        + plt9.geom_line()
        + plt9.geom_hline(yintercept=len(pokemon))
        + plt9.ylim(0, len(pokemon))
    )

    num_unique_pokemon_plot.save(args.num_unique_pokemon_plot, dpi=300)
    print("Output:", args.num_unique_pokemon_plot)

    data_2 = simulation_data.to_num_missing_data_frame()

    num_missing_pokemon_plot = (
        plt9.ggplot(
            data_2[data_2["case_id"] == 0],
            plt9.aes("roll_num", "num_missing", fill="rarity"),
        )
        + plt9.geom_area()
        + plt9.geom_hline(yintercept=len(pokemon))
        + plt9.ylim(0, len(pokemon))
Example #30
    out_i = pandas.DataFrame(sim_res_fwd[i], columns=out.columns[3:])
    out_i['time'] = t
    out_i['signal'] = C3_scan[i]
    out_i['dir'] = 'Low $[S^{**}]$'
    out = pandas.concat([out, out_i[out.columns]])
for i in range(len(sim_res_rev)):
    out_i = pandas.DataFrame(sim_res_rev[i], columns=out.columns[3:])
    out_i['time'] = t
    out_i['signal'] = numpy.flip(C3_scan)[i]
    out_i['dir'] = 'High $[S^{**}]$'
    out = pandas.concat([out, out_i[out.columns]])
out.to_csv("./num_cont_graphs/sim2.txt", sep="\t", index=False)

###################### plotting ##################################
g = (ggplot(out, aes('time', response, group='signal', color='signal')) +
     geom_line(size=0.5) + ylim(0, 202) + labs(x="time", y="$[S^{**}]$") +
     scale_color_distiller(
         palette='RdYlBu', type="diverging", name="$B_{tot}$") +
     facet_wrap('~dir') + theme_bw())
g.save(filename="./num_cont_graphs/sim_fwd_rev2.png",
       format="png",
       width=8,
       height=4,
       units='in',
       verbose=False)

eq = out[out.time == max(out.time)]

g = (ggplot(eq) + aes(x='signal', y=response, color='dir') +
     labs(x="$B_{tot}$", y="$[S^{**}]$", color="") +
     geom_path(size=2, alpha=0.5) + geom_point(color="black") + theme_bw() +
Example #31
median_ci_l, median_ci_u

# In[9]:

overall_preprint_survival = kmf.survival_function_.reset_index().assign(
    label="all_papers")
overall_preprint_survival.head()

# In[10]:

g = (p9.ggplot(
    overall_preprint_survival.assign(
        timeline=lambda x: pd.to_timedelta(x.timeline, "D")),
    p9.aes(x="timeline", y="KM_estimate", color="label"),
) + p9.scale_x_timedelta(labels=timedelta_format("d")) + p9.geom_line() +
     p9.ylim(0, 1))
print(g)

# # Calculate Category Survival Function

# This section measures how long it takes for certain categories to get preprints published.

# In[11]:

entire_preprint_df = pd.DataFrame(
    [], columns=["timeline", "KM_estimate", "category"])
half_life = []
for cat, grouped_df in preprints_w_published_dates.groupby("category"):
    temp_df = preprints_w_published_dates.query(f"category=='{cat}'")
    kmf.fit(
        temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24,