Example #1
def plot_ROC(label_list, pred_list, names=None, **args):
    """
    Plot multiple ROC curves.
    :param label_list: array of ground-truth label lists, given as [(y1, y2, ...), (y1, y2, ...)], matching pred_list
    :param pred_list: array of predicted-probability lists; must have the same length as label_list
    :param names: model names; None or a sequence of the same length as label_list. If omitted,
            ['train', 'test'] / ['train', 'valid', 'test'] is used for 2 or 3 label sets; with more than 3, sequential numbers are used.
    :param args: keyword arguments passed to sklearn.metrics.roc_curve
    :return: a plotnine object
    """
    if names is None:
        if len(label_list) == 2:
            names = ('train', 'test')
        elif len(label_list) == 3:
            names = ('train', 'valid', 'test')
        else:
            names = list(range(len(label_list)))
    else:
        pass
    roc = [roc_curve(y, p, **args) for y, p in zip(label_list, pred_list)]
    fpr, tpr = tuple([list(chain.from_iterable(x)) for x in zip(*roc)][0:2])
    models = chain.from_iterable([[name] * l for name, l in zip(names, [len(x) for x, y, _ in roc])])
    d_roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'model': models})
    return ggplot(
            d_roc,
            aes(x='fpr', y='tpr', group='model', color='model')
    ) + geom_segment(x=0, y=0, xend=1, yend=1, linetype=':', color='grey'
    ) + geom_line(
    ) + scale_color_discrete(breaks=names
    ) + labs(x='false positive rate', y='true positive rate'
    ) + coord_equal(ratio=1, xlim=[0, 1], ylim=[0, 1]
    ) + theme_classic() + theme(figure_size=(4, 4))
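
A minimal, hypothetical usage sketch for plot_ROC with synthetic labels and probabilities (data, seed and output file name are illustrative; roc_curve, chain, pandas and the plotnine API are assumed to be imported at module level, as the function requires):

import numpy as np

rng = np.random.default_rng(0)
y_train = rng.integers(0, 2, size=200)                      # hypothetical binary labels
p_train = np.clip(0.6 * y_train + 0.4 * rng.random(200), 0, 1)
y_test = rng.integers(0, 2, size=100)
p_test = np.clip(0.5 * y_test + 0.5 * rng.random(100), 0, 1)

# With two label/prediction pairs and names=None, the curves are
# labelled 'train' and 'test' automatically.
g = plot_ROC([y_train, y_test], [p_train, p_test])
g.save('roc.png', width=4, height=4, dpi=150)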
Example #2
def duration_TL(Data):
    print('======= Creating duration_TL =======')
    x = Data.Duration[pd.isna(Data.Duration) == True]
    
    if ((len(x)+10)) >= len(Data):
       print("WARNING: All values for Duration are NA's")
    
    else:
        #Filter Symptomes and Correct Durations
        Symptomes = Data[(Data.Group == "sy") & (Data.Duration < 180)]
        
        #Setting data with missing times
        Symptomes['Date'] = pd.to_datetime(Symptomes['Date'])
        
        if len(Symptomes) == 0:
            print('No duration for TL_2')
        else: 
            sdate = min(Symptomes["Date"])   # start date
            edate = max(Symptomes["Date"])   # end date
            delta = edate - sdate       # as timedelta
#            from datetime import timedelta
            day = []
            for i in range(delta.days + 1):
                d= sdate + timedelta(days=i)
                day.append(d)
                
            DF = pd.DataFrame(day)
            DF.columns = ['Date']
            data_with_missing_times = pd.merge(DF, Symptomes, on='Date', how='outer')
            data_with_missing_times.Date = pd.to_datetime(data_with_missing_times.Date)
            if delta.days > 1825:
                datebreaks = '18 months'
            elif delta.days > 1095:
                datebreaks = '12 months'
            else:
                datebreaks = '6 months'

                
            plot = (p9.ggplot(data=data_with_missing_times, mapping=p9.aes(x='Date', 
                                                                           y='Duration'))
            + p9.geom_smooth(color = 'red', size = 5, method="loess", se=False)
            + p9.theme_classic()
            + p9.theme(axis_text = p9.element_text(size=33), 
                       axis_title = p9.element_text(size = 33,face = 'bold'))
            + p9.scale_x_datetime(date_labels = '%Y-%m', date_breaks = datebreaks)
            + p9.labs(x='',y=''))    

            if (len(data_with_missing_times) > 0):

                plot.save(filename = 'TL_2.jpeg',
                         plot = plot,
                         path = "pdf/iteration/",
                         width = 25, height = 5,
                         dpi = 320)
                

            else: 
                print('Plot not created; no data found.')
        return(print('=================================duration_TL DONE ============================='))
def go_to_time_plot3(large_go_to_time_probs_new: list,
                     large_go_to_time_probs_old: list,
                     average_minutes_per_game_values: list):
    """ Plot go-to-time probability, old vs. new rules, no blowouts, 300 matches/round """

    large_time_prob_data = pd.DataFrame({
        'Average minutes per game':
        np.concatenate(
            [average_minutes_per_game_values,
             average_minutes_per_game_values]),
        'P(Go to time)':
        np.concatenate(
            [large_go_to_time_probs_new, large_go_to_time_probs_old]),
        'Rules':
        np.concatenate([
            np.repeat('New', len(average_minutes_per_game_values)),
            np.repeat('Old', len(average_minutes_per_game_values))
        ])
    })
    (plt.ggplot(
        large_time_prob_data,
        plt.aes(x='Average minutes per game', y='P(Go to time)',
                color='Rules')) + plt.geom_line() + plt.geom_point() +
     plt.ylim([0, 1]) + plt.theme_classic()).save(
         filename='figures/go_to_time_300_matches_prob_plot.png')
Example #4
    def plot_overlap_duration(self, data, options):
        matches = data["matches"]
        matches = matches.loc[matches.tag_overlap > 0]
        # matches.loc[:, "log_dur"] = log()

        plt = ggplot(data=matches, mapping=aes(x="tag_duration", y="tag_overlap",),)
        plt = (
            plt
            + geom_point()
            + xlab("Tag duration")
            + ylab("Proportion tag overlapping with matching event")
            + theme_classic()
            + theme(
                axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30}),
                plot_title=element_text(
                    weight="bold", size=14, margin={"t": 10, "b": 10}
                ),
                figure_size=(10, 10),
                text=element_text(size=12, weight="bold"),
            )
            + ggtitle(
                (
                    "Proportion of tag overlapping with matching event depending on duration "
                    + "size for model {}, database {}, class {}\n"
                    + "with detector options {}"
                ).format(
                    options["scenario_info"]["model"],
                    options["scenario_info"]["database"],
                    options["scenario_info"]["class"],
                    options,
                )
            )
        )

        return plt
def mixed_linear_factors_plot(df, x_axis, factor):
    plotnine.options.figure_size = (10, 10)
    factor_steps = df[factor].unique()
    reg_lines = pd.DataFrame({
        factor: factor_steps,
        'intercept': np.zeros_like(factor_steps),
        'slope': np.zeros_like(factor_steps)
    })
    for i, step in enumerate(factor_steps):
        factored_df = df[df[factor] == step]
        md = smf.mixedlm('mse ~ %s' % x_axis,
                         factored_df,
                         groups=factored_df.index.values)
        mdf = md.fit()
        reg_lines.iloc[i] = [step, mdf.params['Intercept'], mdf.params[x_axis]]

    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)
    reg_lines[factor] = reg_lines[factor].round().astype(int)
    gg = (
        plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='method')) +
        plotnine.geom_jitter(width=2.5, show_legend=False) +
        plotnine.scale_color_manual(['#DB5F57'] * 4) +
        plotnine.facet_wrap(factor) + plotnine.geom_abline(
            plotnine.aes(intercept='intercept', slope='slope'),
            data=reg_lines) + plotnine.theme_classic(base_size=20))
    gg.save('%s_vs_%s_rmse.pdf' % (x_axis, factor))
def go_to_time_plot2(go_to_time_probs_new: list, go_to_time_probs_old: list,
                     go_to_time_blowout_probs_new: list,
                     go_to_time_blowout_probs_old: list,
                     average_minutes_per_game_values: list):
    """ Plot go-to-time probability, new vs. old rules, blowouts vs. no blowouts, 85 matches/round """

    time_prob_blowout_data = pd.DataFrame({
        'Average minutes per game':
        np.concatenate([
            average_minutes_per_game_values, average_minutes_per_game_values,
            average_minutes_per_game_values, average_minutes_per_game_values
        ]),
        'P(Go to time)':
        np.concatenate([
            go_to_time_probs_new, go_to_time_probs_old,
            go_to_time_blowout_probs_new, go_to_time_blowout_probs_old
        ]),
        'Rules':
        np.concatenate([
            np.repeat('New, no blowouts',
                      len(average_minutes_per_game_values)),
            np.repeat('Old, no blowouts',
                      len(average_minutes_per_game_values)),
            np.repeat('New, blowouts', len(average_minutes_per_game_values)),
            np.repeat('Old, blowouts', len(average_minutes_per_game_values))
        ])
    })

    (plt.ggplot(
        time_prob_blowout_data,
        plt.aes(x='Average minutes per game', y='P(Go to time)',
                color='Rules')) + plt.geom_line() + plt.geom_point() +
     plt.ylim([0, 1]) + plt.theme_classic()).save(
         filename='figures/go_to_time_prob_with_blowouts_plot.png')
Example #7
def medicine(Data):
    print('======= Creating medicine =======')
    
    try:
        #Filter medicine 
        medicine = Data[(Data.Group == 'me')|(Data.Group == 'ma')]
        
        #Setting data with missing times
        medicine['Date'] = pd.to_datetime(medicine['Date'])
        
        sdate = min(medicine["Date"])   # start date
        edate = max(medicine["Date"])   # end date
        delta = edate - sdate       # as timedelta
        
#        from datetime import date, timedelta    
        
        day = []
        
        for i in range(delta.days + 1):
            d = sdate + timedelta(days=i)
            day.append(d)

        # Build the full date range once, after the loop, and merge it in
        DF = pd.DataFrame(day)
        DF.columns = ['Date']
        data_with_missing_times = pd.merge(DF, medicine, on='Date', how='outer')
        medicine = data_with_missing_times

        # TODO: how to handle medicine NA's in plots (i.e. not show them)
        # Drop rows with a missing Name so they are not plotted
        medicine = medicine[pd.isna(medicine.Name) == False]
        #Creating and saving Medicine plot
        
        
        if (len(medicine) > 5):        
            #Plot everything but Na's
            
            f_tl1 = (p9.ggplot(data=medicine,
                               mapping=p9.aes(x='Date', y = 'Name'))
        + p9.geom_point(color = 'red', size = 3)
        + p9.theme_classic()
        + p9.theme(axis_text = p9.element_text(size= 18),
                   axis_title = p9.element_text(size = 18,face = 'bold'))
        + p9.labs(title = '', x='',y='')
        )
        
            
            f_tl1.save(filename = 'Medicine.jpeg',
                       plot = f_tl1,
                       path = "pdf/iteration/",
                       width = 25, height = 5,
                       dpi = 320)
        
    except Exception:
        print("Medical graph failed")

    return(print('=================================medicine DONE ============================='))    
def pca_plot(pca_data: pd.DataFrame, dim1: str, dim2: str, dim3: str):
    """ 
    Returns a plot displaying 3 PCA variables (the third mapped to colour).

    Parameters
    ----------
    pca_data: DataFrame of principal-component scores to plot.
    dim1: Column name of the principal component to plot on the x-axis.
    dim2: Column name of the principal component to plot on the y-axis.
    dim3: Column name of the principal component to plot as colour.

    Returns
    ----------
    Plot of PCA with dim1 on the x-axis, dim2 on the y-axis, coloured by dim3.
    """
    #Set plot theme within function: 
    p9.theme_set(p9.theme_classic())

    num_components = len(pca_data.columns) - 1
    color_type = type(pca_data.loc[0, dim3])
    p = (p9.ggplot(pca_data, p9.aes(x=dim1, y=dim2, fill=dim3))
        + p9.geom_point()
    )
    if(color_type==str):
        print('color type is qualitative')
        #Can't find a better colour palette yet.
        #p = p + (p9.scale_fill_brewer(type="qual", palette='Accent'))
    return(p)
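
A hypothetical call to pca_plot with synthetic principal-component scores (the column names 'PC1'-'PC3' and the output file are illustrative; plotnine is assumed to be imported as p9 at module level, as the function uses it):

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
scores = pd.DataFrame(rng.normal(size=(60, 3)), columns=['PC1', 'PC2', 'PC3'])
# dim3 is numeric here, so the qualitative-palette branch is skipped
p = pca_plot(scores, dim1='PC1', dim2='PC2', dim3='PC3')
p.save('pca_scatter.png', width=5, height=4, dpi=150)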
Example #9
def derplot(adata=None,
            filename='derplot',
            embedding='tsne',
            feature='sample_type_tech',
            size=(12, 12),
            save=False,
            draw=False,
            psize=1):
    start = datetime.datetime.now()
    p.options.figure_size = size
    savename = filename + '.' + embedding + '.' + feature + '.derplot.png'
    print(
        start.strftime("%H:%M:%S"),
        'Starting ... \t',
        savename,
    )
    p.theme_set(p.theme_classic())
    pt = \
    p.ggplot(p.aes(embedding +'0', embedding + '1', color=feature), adata.obs) \
        + p.geom_point(size=psize, alpha = 1, stroke = 0 ) \
        + p.guides(color = p.guide_legend(override_aes={'size': 15}))

    if save: pt.save(savename, format='png', dpi=200)
    end = datetime.datetime.now()
    delta = end - start
    print(start.strftime("%H:%M:%S"), str(int(delta.total_seconds())),
          's to make: \t', savename)
Example #10
def frequency_TL(Data):
    print('======= Creating frequency_TL =======')
    #Filtering
    Data['date_4'] = Data['date'].dt.date
    tl4 = Data.groupby("date_4", sort = False, as_index = False).count()
    tl4 = tl4.iloc[:, 0:2]
    tl4 = tl4.rename(columns = {"Unnamed: 0": "n"})    
    
    sdate = min(tl4["date_4"])  # start date
    edate = max(tl4["date_4"])   # end date
    delta = edate - sdate       # as timedelta
    
#    tl4 = Data.groupby("Date", sort = False, as_index = False).count()
#    tl4 = tl4.iloc[:, 0:2]
#    tl4 = tl4.rename(columns = {"Unnamed: 0": "n"})
#    tl4['Date'] = pd.to_datetime(tl4['Date'])
    
#    #Setting data with missing times
#    sdate = min(tl4["Date"])  # start date
#    edate = max(tl4["Date"])   # end date
#    delta = edate - sdate       # as timedelta
    
    from datetime import timedelta    
    day = []
    for i in range(delta.days + 1):
        d= sdate + timedelta(days=i)
        day.append(d)
        
    DF = pd.DataFrame(day)
    DF.columns = ['date_4']
    data_with_missing_times = pd.merge(DF, tl4, on='date_4', how='outer')
    if delta.days > 1825:
        datebreaks = '18 months'
    elif delta.days > 1095:
        datebreaks = '12 months'
    else:
        datebreaks = '6 months'
    #Creating and saving TL_4
    
    plot =(p9.ggplot(data=data_with_missing_times,
                     mapping=p9.aes(x='date_4',y='n'))
        + p9.geom_col(fill = 'red')
        + p9.theme_classic()
        + p9.theme(axis_text = p9.element_text(size=40),
                   axis_title = p9.element_text(size = 40,face = 'bold'))
        + p9.scale_x_datetime(date_labels = '%Y-%m', date_breaks = datebreaks)
        + p9.labs(x='',y='')
        )
        
    if (len(data_with_missing_times) > 0):
        plot.save(filename = 'TL_4.jpeg',
                 plot = plot,
                 path = "pdf/iteration/",
                 width = 25, height = 5,
                 dpi = 320)
    else: 
        print('Plot not created; no data found.')
    return(print('=================================frequency_TL DONE ============================='))
Example #11
def summary(tags, opts=None):
    print(tags)
    tags_summary = (
        tags.groupby(["tag", "background"])
        .agg({"tag": "count"})
        .rename(columns={"tag": "n_tags"})
        .reset_index()
        .astype({"background": "category", "tag": "category"})
    )
    print(tags_summary)
    # tags_summary = tags_df.groupby(["species"]).agg(
    #     {"tag_duration": "sum", "species": "count"}
    # )

    # tags_summary.rename(columns={"species": "count"}, inplace=True)

    # tags_summary["tag_duration"] = tags_summary.tag_duration.astype(int)
    # tags_summary["duration"] = tags_summary.tag_duration.astype(str) + "s"
    # tags_summary = tags_summary.reindex(list(SPECIES_LABELS.keys()))
    # # tags_summary["species"] = tags_summary.index
    # tags_summary.reset_index(inplace=True)
    # tags_summary
    # (
    #     ggplot(
    #         data=tags_summary,
    #         mapping=aes(
    #             x="factor(species, ordered=False)",
    #             y="tag_duration",
    #             fill="factor(species, ordered=False)",
    #         ),
    #     )
    #     + geom_bar(stat="identity", show_legend=False)
    #     + xlab("Species")
    #     + ylab("Duration of annotations (s)")
    #     + geom_text(mapping=aes(label="count"), nudge_y=15)
    #     + theme_classic()
    #     + scale_x_discrete(limits=SPECIES_LIST, labels=xlabels)
    # ).save("species_repartition_duration_mini.png", width=10, height=8)

    plt = (
        ggplot(
            data=tags_summary,
            mapping=aes(
                x="tag",  # "factor(species, ordered=False)",
                y="n_tags",
                fill="background",  # "factor(species, ordered=False)",
            ),
        )
        + geom_bar(stat="identity", show_legend=True, position=position_dodge())
        + xlab("Species")
        + ylab("Number of annotations")
        + geom_text(mapping=aes(label="n_tags"), nudge_y=15)
        + theme_classic()
        + theme(axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30}))
        # + scale_x_discrete(limits=SPECIES_LIST, labels=xlabels)
    )
    plt.save("tag_species_bg.png", width=10, height=8)
    # print(tags_summary)

    print(plt)
def plot_pca_vis(pca: PCA, df: pd.DataFrame, pc_x: int = 0, pc_y: int = 1, num_dims: int = 5) -> plt:

    """
    Plot contribution of different dimensions to principal components. 
    
    Parameters
    ----------
    pca: Fitted pca object to plot. 
    df: Dataframe pca was fit on. Used for column names. 
    pc_x: Index of principal component to plot on x-axis. 
    pc_y: Index of principal component to plot on y-axis. 
    num_dims: Number of contributing elements to include for each axis. 

    Returns
    ----------
    matplotlib.pyplot, with the loading plot drawn on the current figure.

    https://stackoverflow.com/questions/45148539/project-variables-in-pca-plot-in-python
    Adapted into function by Tim Cashion
    """
    #Set plot theme within function: 
    p9.theme_set(p9.theme_classic())

    # Get the PCA components (loadings)
    PCs = pca.components_
    
    PC_x_index = PCs[pc_x, : ].argsort()[-num_dims:][::-1]
    PC_y_index = PCs[pc_y, : ].argsort()[-num_dims:][::-1]
    combined_index = set(list(PC_x_index) + list(PC_y_index))
    combined_index = sorted(combined_index)
    PCs = PCs[:, combined_index]
    # Use quiver to generate the basic plot
    fig = plt.figure(figsize=(5,5))
    plt.quiver(np.zeros(PCs.shape[1]), np.zeros(PCs.shape[1]),
            PCs[pc_x,:], PCs[pc_y,:], 
            angles='xy', scale_units='xy', scale=1)

    # Add labels based on feature names (here just numbers)
    feature_names = df.columns[combined_index]
    for i,j,z in zip(PCs[pc_y,:]+0.02, PCs[pc_x,:]+0.02, feature_names):
        plt.text(j, i, z, ha='center', va='center')

    # Add unit circle
    circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
    plt.gca().add_artist(circle)

    # Ensure correct aspect ratio and axis limits
    plt.axis('equal')
    plt.xlim([-1.0,1.0])
    plt.ylim([-1.0,1.0])

    # Label axes
    plt.xlabel('PC ' + str(pc_x))
    plt.ylabel('PC ' + str(pc_y))
    
    plt.tight_layout()
    return plt
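
A hypothetical call to plot_pca_vis on synthetic data (column names and sizes are illustrative; numpy, pandas, matplotlib.pyplot as plt, plotnine as p9 and sklearn are assumed to be imported at module level, as the function expects):

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.default_rng(2)
df = pd.DataFrame(rng.normal(size=(200, 10)),
                  columns=[f'var_{i}' for i in range(10)])
pca = PCA(n_components=4).fit(df)

# Show the loadings of the first two components, keeping the top 5 features per axis
plot_pca_vis(pca, df, pc_x=0, pc_y=1, num_dims=5)
plt.show()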
Example #13
def duration_graph(Data, Data_m):
    print('======= Creating duration_graph =======')
    #Filter current year and month, and correct Duration
    
    #Graph2_ALL.Duration = Graph2_ALL.Duration/60
    #Graph2_ALL.Duration = Graph2_ALL.Duration.astype(str)    
    x = Data.Duration[pd.isna(Data.Duration) == True]
    if (len(x) == len(Data)):
        logging.warning('=================================Graph_2 aborted =============================')
        return
    else:
        Graph2 = Data_m[(Data_m.Duration < 180)]
        Graph2_ALL = Data[(Data.Duration < 180)]
        if (len(Graph2_ALL) > 0):
                    plot= (p9.ggplot(data=Graph2_ALL,
                                     mapping=p9.aes(x='Duration'))
                                + p9.geom_bar(fill = 'red', stat = 'count', size = 100)
                                + p9.theme_classic()
                                + p9.theme(axis_text = p9.element_text(size=40),
                                           axis_title = p9.element_text(size = 40,face = 'bold'))
                                + p9.labs(title = '', x='',y='No. of attacks')
                                )
                    plot.save(filename = 'Graph_ALL_2.jpeg',plot = plot,
                          path = "pdf/iteration/",
                          width = 25, height = 5,
                          dpi = 320)
        else: 
                print('Plot not created; no data found.')
        if (len(Graph2) > 0):
                    plot_month= (p9.ggplot(data=Graph2,
                                           mapping=p9.aes(x='Duration'))
                                + p9.geom_bar(fill = 'red', stat = 'count', size = 100)
                                + p9.theme_classic()
                                + p9.theme(axis_text = p9.element_text(size=40),
                                           axis_title = p9.element_text(size = 40,face = 'bold'))
                                + p9.labs(title = '', x='',y='No. of attacks')
                                )
                    plot_month.save(filename = 'Graph_2.jpeg',
                                plot = plot_month,
                                path = "pdf/iteration/",
                                width = 25, height = 5,
                                dpi = 320)
        else: 
                print('Plot not created; no data found.')
    return(print('=================================duration_graph DONE ============================='))
Example #14
def intensity_TL(Data):
    print('======= Creating intensity_TL =======')    
    x = Data.Intensity[pd.isna(Data.Intensity) == True]
    if (len(x) == len(Data)):
       print("WARNING: All values for Intensity are NA's")

    else:
        #Filter Symptomes
        Symptomes = Data[(Data.Group == "sy")]
        tl3 = Symptomes.groupby("Date", as_index=False, sort=False)['Intensity'].mean()
        #tl3['Day'] = range(1,(len(tl3)+1))
        #tl3 = tl3.rename(columns = {'Intensity': "Intensity_mean"})
        tl3['Date'] = pd.to_datetime(tl3['Date'])
        #Setting data with missing times
        sdate = min(tl3["Date"])   # start date
        edate = max(tl3["Date"])   # end date
        delta = edate - sdate       # as timedelta
        
#        from datetime import timedelta
        day = []
        for i in range(delta.days + 1):
            d= sdate + timedelta(days=i)
            day.append(d)
            
        DF = pd.DataFrame(day)
        DF.columns = ['Date']
        data_with_missing_times = pd.merge(DF, tl3, on='Date', how='outer')
        if delta.days > 1825:
            datebreaks = '18 months'
        elif delta.days > 1095:
            datebreaks = '12 months'
        else:
            datebreaks = '6 months'
        
        plot =(p9.ggplot(data=data_with_missing_times,
                         mapping=p9.aes(x='Date',y='Intensity'))
            + p9.geom_point(color = 'red', size = 5)
            + p9.theme_classic()
            + p9.theme(axis_text = p9.element_text(size=40),
                       axis_title = p9.element_text(size = 40,face = 'bold'))
            + p9.scale_x_datetime(date_labels = '%Y-%m', date_breaks = datebreaks)
            + p9.labs(x='',y='')
            )    
    
        #Creating and saving TL_3
        if (len(data_with_missing_times) > 5):
            #TL3 = TL_3(data_with_missing_times)
            plot.save(filename = 'TL_3.jpeg',
                     plot = plot,
                     path = "pdf/iteration/",
                     width = 25, height = 5,
                     dpi = 320)
        else: 
            print('Plot not created; no data found.')
    return(print('=================================intensity_TL DONE ============================='))
 def create(self, file_path: str) -> None:
     (ggplot(self._data, aes(x="count", label="..count..")) +
      geom_bar(fill="#1e4f79") +
      geom_text(stat="count", va='bottom', size=24) +
      scale_x_discrete(limits=[
          "1", "2", "3", "5", "26", "52", "97", "100", "300", "537"
      ]) + scale_y_continuous(breaks=[0, 5, 10], limits=[0, 10]) +
      ggtitle("Case Study Sizes") + xlab("Number of Projects") +
      ylab("Number of Case Studies") +
      theme_classic(base_size=28, base_family="Helvetica") +
      theme(text=element_text(size=28))).save(file_path, width=14, height=7)
def density_plot1(num_matches_per_round: int,
                  match_lengths_from_one_round: list):
    """ Density plot for match lengths, new rules, no blowouts, 85 matches/round """

    match_lengths = pd.DataFrame(
        {'Match length': match_lengths_from_one_round})
    (plt.ggplot(match_lengths, plt.aes(x='Match length')) +
     plt.geom_density() +
     plt.geom_vline(xintercept=50, color='black', size=2) +
     plt.theme_classic() +
     plt.xlim([0, 55])).save(filename='figures/match_length_density_plot.png')
 def create(self, file_path: str) -> None:
     (ggplot(self._data, aes(x="pattern", y="count", label="fraction")) +
      geom_bar(stat="identity", fill="#1e4f79") +
      geom_text(va='bottom', size=24, format_string='{:.1%}') +
      scale_x_discrete(limits=self._data["pattern"]) +
      scale_y_continuous(labels=comma_format(), expand=[0.1, 0]) +
      ggtitle("Design Pattern Counts") + xlab("Design Pattern") +
      ylab("Count") + theme_classic(base_size=32, base_family="Helvetica") +
      theme(text=element_text(size=32),
            axis_text_x=element_text(rotation=45, ha="right"))).save(
                file_path, width=24, height=8)
 def create(self, file_path: str) -> None:
     (ggplot(self._data, aes(x="category", y="count", label="percent")) +
      geom_bar(stat="identity", fill="#1e4f79") +
      geom_text(va='bottom', size=24) +
      scale_x_discrete(limits=self._data["category"]) +
      scale_y_continuous(labels=comma_format(), expand=[0.1, 0]) +
      ggtitle("Classes per Category") + xlab("Category") +
      ylab("Number of Classes") +
      theme_classic(base_size=32, base_family="Helvetica") +
      theme(text=element_text(size=32),
            axis_text_x=element_text(rotation=45, ha="right"))).save(
                file_path, width=7, height=7)
Example #19
def barplot(df, key, figsize=(8, 6), vertical=False):
    if vertical: figsize = figsize[::-1]
    p9.options.figure_size = figsize
    top_l = df[key].value_counts().index.tolist()
    df[key] = pd.Categorical(df[key], categories=list(reversed(top_l)))
    fig = p9.ggplot(p9.aes(x=key, y='..count..', label='..count..'), data=df)
    fig += p9.geom_bar(alpha=0.5)
    if vertical: fig += p9.coord_flip()
    fig += p9.stat_count(geom="text",
                         position=p9.position_stack(vjust=0.5),
                         size=10)
    fig += p9.theme_classic()
    return fig
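
A minimal usage sketch for barplot with a hypothetical categorical column (the data and output file are illustrative; pandas and plotnine as p9 are assumed to be imported at module level):

import pandas as pd

df = pd.DataFrame({'species': ['cat', 'dog', 'dog', 'bird', 'dog', 'cat', 'cat']})
# Horizontal bars (coord_flip) with count labels placed mid-bar
fig = barplot(df, 'species', figsize=(6, 4), vertical=True)
fig.save('species_counts.png')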
def method_plot(df, baseline_rul, baseline_mse, method):
    plotnine.options.figure_size = (15, 8)

    jan = df[df['method'] == method]

    jan['percent_broken'] = jan['percent_broken'].round().astype(int)
    jan['percent_fail_runs'] = jan['percent_fail_runs'].round().astype(int)

    plotnine.ylim = (2, 10)
    gg = (plotnine.ggplot(
        jan, plotnine.aes(x='percent_broken', y='log_score', color='method')) +
          plotnine.facet_wrap('task', 2, 4) +
          plotnine.stat_boxplot(plotnine.aes(y='log_value', x=60),
                                data=baseline_rul,
                                width=80,
                                color='#14639e',
                                show_legend=False) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab('Grade of Degradation in %') +
          plotnine.ylab('Logarithmic RUL-Score') +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul.pdf' % method)

    plotnine.ylim = (90, 10)
    gg = (plotnine.ggplot(
        jan, plotnine.aes(x='percent_broken', y='mse', color='method')) +
          plotnine.facet_wrap('task', 2, 4) +
          plotnine.stat_boxplot(plotnine.aes(y='value', x=60),
                                data=baseline_mse,
                                width=80,
                                color='#14639e',
                                show_legend=False) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab('Grade of Degradation in %') + plotnine.ylab('RMSE') +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_rmse.pdf' % method)
 def create(self, file_path: str) -> None:
     (ggplot(self._data, aes("loc")) +
      geom_histogram(bins=100, fill="#1e4f79") +
      facet_grid(facets="category ~ .", scales='free_y') +
      scale_x_continuous(trans=asinh_trans(), labels=asinh_labels) +
      scale_y_continuous(labels=comma_format())
      #+ scale_y_continuous(labels=lambda l: ["%.2f%%" % (v * 100 / len(self._data)) for v in l])
      + ggtitle("Class Sizes") + xlab("Lines of Code") +
      ylab("Number of Classes") +
      theme_classic(base_size=32, base_family="Helvetica") +
      theme(text=element_text(size=32), subplots_adjust={"hspace": 0.1
                                                         })).save(file_path,
                                                                  width=8,
                                                                  height=18)
 def create(self, file_path: str) -> None:
     (ggplot(self._data, aes("value")) +
      geom_histogram(bins=100, fill="#1e4f79") +
      facet_wrap(facets="variable", scales="free", ncol=3) +
      scale_x_continuous(trans=asinh_trans(), labels=asinh_labels) +
      scale_y_continuous(labels=comma_format()) +
      ggtitle("Distributions of QMOOD Quality Attributes") +
      xlab("Quality Attribute Value") + ylab("Number of Projects") +
      theme_classic(base_size=32, base_family="Helvetica") +
      theme(text=element_text(size=32),
            subplots_adjust={
                "wspace": 0.35,
                "hspace": 0.35
            })).save(file_path, width=24, height=12)
 def create(self, file_path: str) -> None:
     (ggplot(self._data, aes("value")) +
      geom_histogram(bins=100, fill="#1e4f79") +
      facet_wrap(facets="variable", scales="free", ncol=3) + xlim(0, 1) +
      scale_y_continuous(labels=comma_format()) +
      ggtitle("Intensity of Design Pattern Use") +
      xlab("Percentage of Classes Participating in Design Pattern") +
      ylab("Number of Projects") +
      theme_classic(base_size=32, base_family="Helvetica") +
      theme(text=element_text(size=32),
            axis_title_y=element_text(margin={"r": 40}),
            subplots_adjust={
                "wspace": 0.3,
                "hspace": 0.5
            })).save(file_path, width=24, height=24)
Example #24
    def plot_overlap_duration_bar(self, data, options):
        matches = data["matches"]
        matches = matches.loc[matches.tag_overlap > 0]
        matches.loc[:, "tag_overlap_bin"] = pd.cut(
            matches.tag_overlap, [0, 0.25, 0.5, 0.75, 1]
        )
        matches.loc[:, "tag_duration_bin"] = pd.cut(
            matches.tag_duration, [0, 0.25, 0.5, 0.75, 1, 1.5, 2, float("inf")]
        )

        matches.loc[matches.tag_overlap < 0.3].to_csv("small_overlap.csv")

        # matches.loc[:, "log_dur"] = log()

        plt = ggplot(
            data=matches, mapping=aes(x="tag_duration_bin", fill="tag_overlap_bin",),
        )
        plt = (
            plt
            + geom_bar()
            + xlab("Tag duration")
            + ylab("Proportion tag overlapping with matching event")
            + theme_classic()
            + theme(
                axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30}),
                plot_title=element_text(
                    weight="bold", size=14, margin={"t": 10, "b": 10}
                ),
                figure_size=(10, 10),
                text=element_text(size=12, weight="bold"),
            )
            + ggtitle(
                (
                    "Proportion of tag overlapping with matching event depending on duration "
                    + "size for model {}, database {}, class {}\n"
                    + "with detector options {}"
                ).format(
                    options["scenario_info"]["model"],
                    options["scenario_info"]["database"],
                    options["scenario_info"]["class"],
                    options,
                )
            )
        )

        return plt
Example #25
    def create(self, file_path: str) -> None:
        metrics = self._data["metric"].unique()

        for metric in metrics:
            data = self._data[self._data["metric"] == metric]
            q75, q25 = np.percentile(data["value"], [98, 2])

            (ggplot(data, aes(x="category", y="value")) +
             geom_boxplot(outlier_shape="") +
             coord_cartesian(ylim=(q75 * 0.8, q25 * 1.2))
             #+ facet_wrap(facets="metric", scales="free", ncol=3)
             + ggtitle(metric)
             #+ ggtitle("QMOOD Quality Attributes")
             + xlab("Category") + ylab("Value") +
             theme_classic(base_size=28, base_family="Helvetica")
             #+ theme(subplots_adjust={"wspace": 0.25, "hspace": 0.2})
             ).save(f"{file_path}.{metric}.pdf", width=24, height=24)
Example #26
def plot_pred_hist(label_list, pred_list, names=None, n_bins=10):
    """
    Draw histograms of the predicted probabilities.
    :param label_list: array of ground-truth label lists, given as [(y1, y2, ...), (y1, y2, ...)], matching pred_list
    :param pred_list: array of predicted-probability lists; must have the same length as label_list
    :param names: model names; None or a sequence of the same length as label_list. If omitted, ['train', 'test'] / ['train', 'valid', 'test'] is used for 2 or 3 label sets; with more than 3, sequential numbers are used.
    :param n_bins: number of histogram bins
    :return: a plotnine object
    TODO: how to display the geom_vline reference lines
    """
    if names is None:
        if len(label_list) == 2:
            names = ('train', 'test')
        elif len(label_list) == 3:
            names = ('train', 'valid', 'test')
        else:
            names = list(range(len(label_list)))
    else:
        pass
    name_order = {k: v for v, k in enumerate(names)}
    name_order_rev = {str(k): v for v, k in name_order.items()}
    d = pd.DataFrame(
            {col: v for col, v in zip(('y', 'prediction'), [list(chain.from_iterable(x)) for x in ([label_list, pred_list])])}
    ).assign(
        model=list(chain.from_iterable([[name] * len(l) for name, l in zip(names, label_list)]))
    ).melt(
        id_vars='model'
    ).assign(
        order=lambda x: x.model.replace(name_order)
    ).sort_values(['order', 'variable'])
    # Data for drawing the per-group mean as a reference line
    d_mean = d.drop(columns='order').groupby(['variable', 'model']).mean(
            ).reset_index().rename(columns={'value': 'mean'})
    d = d.merge(d_mean, on=['variable', 'model'])
    return ggplot(
            d,
            aes(x='value', y='..density..', group='variable', fill='variable')
    ) + geom_histogram(position='identity', alpha=.5, bins=n_bins
    ) + geom_vline(
            aes(xintercept='mean', group='variable', color='variable',
                linetype='variable')
    ) + labs(x='prediction', fill='frequency', linetype='mean', color='mean'
    ) + facet_wrap(
            '~order', scales='free_y', labeller=lambda x: name_order_rev[x]
    ) + theme_classic() + theme(figure_size=(6, 4))
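
A hedged usage sketch for plot_pred_hist with synthetic train/test predictions (sizes, seed and the output file are illustrative; pandas, itertools.chain and the plotnine API are assumed to be imported at module level):

import numpy as np

rng = np.random.default_rng(0)
y_tr = rng.integers(0, 2, size=300)
p_tr = np.clip(0.6 * y_tr + 0.4 * rng.random(300), 0, 1)
y_te = rng.integers(0, 2, size=150)
p_te = np.clip(0.5 * y_te + 0.5 * rng.random(150), 0, 1)

# One facet per set; each shows the label and prediction distributions with their means
h = plot_pred_hist([y_tr, y_te], [p_tr, p_te], n_bins=20)
h.save('pred_hist.png', width=6, height=4, dpi=150)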
Example #27
def plot_calibration(label_list, pred_list, names=None, **args):
    """
    Draw multiple calibration curves.
    :param label_list: array of ground-truth label lists, given as [(y1, y2, ...), (y1, y2, ...)], matching pred_list
    :param pred_list: array of predicted-probability lists; must have the same length as label_list
    :param names: model names; None or a sequence of the same length as label_list. If omitted, ['train', 'test'] / ['train', 'valid', 'test'] is used for 2 or 3 label sets; with more than 3, sequential numbers are used.
    :param args: keyword arguments passed to sklearn.calibration.calibration_curve:
        strategy='quantile': binning strategy, 'quantile' or 'uniform'
        n_bins=10: number of bins
        normalize=False: whether the predicted probabilities need to be rescaled to [0, 1]
    :return: a plotnine object
    TODO: display range when the input data is heavily skewed
    """
    if names is None:
        if len(label_list) == 2:
            names = ('train', 'test')
        elif len(label_list) == 3:
            names = ('train', 'valid', 'test')
        elif len(label_list) == 1:
            names = 'model',
        else:
            names = list(range(len(label_list)))
    else:
        pass
    # args is collected via **args, so it is a (possibly empty) dict, never None;
    # fill in defaults for any missing keys.
    args['strategy'] = args.get('strategy', 'quantile')
    args['n_bins'] = args.get('n_bins', 10)
    calib = [calibration_curve(y, p, **args) for y, p in zip(label_list, pred_list)]
    frac, pred = tuple([list(chain.from_iterable(x)) for x in zip(*calib)][0:2])
    models = chain.from_iterable([[name] * l for name, l in zip(names, [len(x) for x, y in calib])])
    d_calib = pd.DataFrame({'pred': pred, 'frac': frac, 'model': models})
    return ggplot(
            d_calib,
            aes(x='pred', y='frac', group='model', color='model')
    ) + geom_segment(x=0, y=0, xend=1, yend=1, linetype=':', color='grey'
    ) + geom_line(
    ) + geom_point(
    ) + scale_color_discrete(breaks=names
    ) + labs(x='mean estimated probability', y='fraction of positives'
    ) + coord_equal(ratio=1) + theme_classic() + theme(figure_size=(4, 4))
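
A hypothetical call to plot_calibration with synthetic data (seed and output file are illustrative; calibration_curve is assumed to be imported from sklearn.calibration, along with pandas, itertools.chain and the plotnine API):

import numpy as np

rng = np.random.default_rng(3)
y = rng.integers(0, 2, size=500)
p = np.clip(0.5 * y + 0.5 * rng.random(500), 0, 1)

# A single label/prediction pair is labelled 'model' by default
g = plot_calibration([y], [p], strategy='quantile', n_bins=10)
g.save('calibration.png', width=4, height=4, dpi=150)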
Example #28
def general(Data):
    logging.info('======= Creating general =======')
    print('======= Creating general =======')
    x = Data.Intensity[pd.isna(Data.Intensity) == True]
    if (len(x) == len(Data)):
       print("WARNING: All values for Intensity are NA's")
    
    else:
        Data['Minutesss'] = Data['date']
        Data['Minutesss'] = pd.to_datetime(Data['Minutesss'], errors='coerce')
        Data.date= pd.to_datetime(Data.date, errors = 'coerce')
        Data['Minutesss'] = Data['Minutesss'].dt.hour*60 + Data['Minutesss'].dt.minute
        #Data.Intensity = Data.Intensity.astype(str)
        #Data.Intensity = Data.Intensity.astype(float)
        #Data.Intensity.fillna('0', inplace=True)
        plot =(p9.ggplot(data=Data,
                             mapping=p9.aes(x='date',y='Minutesss',
                                            colour = 'Intensity'))
                        + p9.geom_point(size = 2)
                        #+ p9.geom_smooth(method="loess", se=False, color = 'tomato', size = 5)
                        + p9.theme_classic()
                        + p9.scale_colour_gradient(low = "white", high = "red", aesthetics = "colour")
                        + p9.theme(axis_text = p9.element_text(size=18),
                                   axis_title = p9.element_text(size = 18,face = 'bold'),
                                   legend_position = 'none')
                        + p9.scale_x_datetime(date_labels = '%b %y', date_breaks = '6 months')
                        + p9.labs(x='',y='', colour = 'Intensity: ')
                        )
        #Creating and saving TL_1
        if (len(Data) > 0):
            #TL1 = TL_1(Data)
            plot.save(filename = 'TL_1.jpeg',
                     plot = plot,
                     path = "pdf/iteration/",
                     width = 25, height = 5,
                     dpi = 320)
        else: 
            print('Plot not created; no data found.')
    return(print('=================================general DONE ============================='))
def density_plot2(num_matches_per_round: int,
                  match_lengths_from_one_round: list,
                  match_lengths_from_one_round_with_blowouts: list):
    """ Density plot for match lengths, new rules, blowouts vs. no blowouts, 85 matches/round """

    match_lengths_blowout = pd.DataFrame({
        'Match length':
        np.concatenate([
            match_lengths_from_one_round,
            match_lengths_from_one_round_with_blowouts
        ]),
        'Blowouts':
        np.concatenate([
            np.repeat('No', num_matches_per_round),
            np.repeat('Yes', num_matches_per_round)
        ])
    })
    (plt.ggplot(match_lengths_blowout,
                plt.aes(x='Match length', color='Blowouts')) +
     plt.geom_density() +
     plt.geom_vline(xintercept=50, color='black', size=2) + plt.xlim([0, 55]) +
     plt.theme_classic()).save(
         filename='figures/match_length_with_blowout_density_plot.png')
Example #30
def pattern_research_plot(data):
    from colour import Color
    
    def colors_gradient_generator(low_color, high_color, color_steps):
        low_color_obj = Color(low_color)
        high_color_obj = Color(high_color)
        return map(lambda x : x.hex_l, low_color_obj.range_to(high_color_obj,color_steps))
    
    blue = list(colors_gradient_generator("#004996", "#018ace", 3))[::-1]
    data = data.melt(id_vars=['hour_category'], value_vars= ['D','W','MS'], var_name='series', value_name='count')
    time_unit_categories = pd.Categorical(data['series'], categories= ['D','W','MS'])
    data = data.assign(series = time_unit_categories)
    plot =(p9.ggplot(data=data,
                     mapping=p9.aes(x='hour_category', y ='count', fill ='series'))
        + p9.geom_bar(stat='identity', position='dodge') 
        + p9.scale_fill_manual(blue,labels = ['D','W','MS'])
        + p9.theme_classic()
        + p9.theme(axis_text = p9.element_text(size=8),
                   axis_title = p9.element_text(size = 8,face = 'bold'))
        + p9.coord_cartesian(ylim = (0,100))
        + p9.scale_y_continuous(labels=lambda l: ["%d%%" % (v) for v in l])
        + p9.labs(x='hour_category',y='Ratio of attacks'))
        
    return plot
Example #31
    def test_theme_classic(self):
        p = self.g + labs(title='Theme Classic') + theme_classic()

        assert p + _theme == 'theme_classic'