def plot_ROC(label_list, pred_list, names=None, **args):
    """
    Plot several ROC curves on a single set of axes.

    :param label_list: sequence of ground-truth label sequences, given as
        [(y1, y2, ...), (y1, y2, ...)]; each entry pairs with the entry of
        pred_list at the same position.
    :param pred_list: sequence of predicted-probability sequences; must be
        as long as label_list.
    :param names: model names, either None or a sequence as long as
        label_list. When None, two label sets are named ('train', 'test'),
        three are named ('train', 'valid', 'test'), and any other count is
        numbered from 0.
    :param args: keyword arguments forwarded to roc_curve.
    :return: plotnine plot object.
    """
    if names is None:
        n_sets = len(label_list)
        preset_names = {2: ('train', 'test'), 3: ('train', 'valid', 'test')}
        names = preset_names[n_sets] if n_sets in preset_names else list(range(n_sets))

    curves = [roc_curve(truth, score, **args)
              for truth, score in zip(label_list, pred_list)]

    # zip(*curves) groups the first element of every curve, then the second,
    # and so on; only the first two groups (fpr, tpr) are plotted.
    per_metric = [list(chain.from_iterable(group)) for group in zip(*curves)]
    fpr, tpr = tuple(per_metric[0:2])

    # One model label per curve point so the label column lines up with fpr/tpr.
    points_per_curve = [len(curve_fpr) for curve_fpr, _tpr, _thr in curves]
    models = chain.from_iterable(
        [[name] * count for name, count in zip(names, points_per_curve)])
    d_roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'model': models})

    base = ggplot(d_roc, aes(x='fpr', y='tpr', group='model', color='model'))
    diagonal = geom_segment(x=0, y=0, xend=1, yend=1, linetype=':', color='grey')
    return (
        base
        + diagonal
        + geom_line()
        + scale_color_discrete(breaks=names)
        + labs(x='false positive rate', y='true positive rate')
        + coord_equal(ratio=1, xlim=[0, 1], ylim=[0, 1])
        + theme_classic()
        + theme(figure_size=(4, 4))
    )
def duration_TL(Data):
    """
    Save a smoothed timeline of symptom durations as pdf/iteration/TL_2.jpeg.

    Rows with Group == "sy" and Duration < 180 are kept, the date axis is
    padded so that every calendar day between the first and last kept row is
    present, and a loess curve of Duration over Date is drawn.

    :param Data: DataFrame with 'Duration', 'Group' and 'Date' columns.
    :return: None; progress and skip messages are printed.
    """
    print('======= Creating duration_TL =======')
    n_missing = int(Data.Duration.isna().sum())
    # The plot is skipped unless more than 10 rows carry a Duration value.
    if n_missing + 10 >= len(Data):
        print("WARNING: All values for Duration are NA's")
    else:
        # .copy() so the Date conversion writes to an independent frame
        # instead of a slice of the caller's data.
        Symptomes = Data[(Data.Group == "sy") & (Data.Duration < 180)].copy()
        Symptomes['Date'] = pd.to_datetime(Symptomes['Date'])
        if Symptomes.empty:
            print('No duration for TL_2')
        else:
            sdate = Symptomes['Date'].min()  # start date
            edate = Symptomes['Date'].max()  # end date
            delta = edate - sdate

            # One row per day from sdate to edate; the outer merge then
            # leaves days without symptoms as rows with missing values.
            all_days = pd.DataFrame(
                {'Date': pd.date_range(sdate, edate, freq='D')})
            data_with_missing_times = pd.merge(
                all_days, Symptomes, on='Date', how='outer')

            # Wider tick spacing for longer observation periods.
            if delta.days > 1825:
                datebreaks = '18 months'
            elif delta.days > 1095:
                datebreaks = '12 months'
            else:
                datebreaks = '6 months'

            plot = (
                p9.ggplot(data=data_with_missing_times,
                          mapping=p9.aes(x='Date', y='Duration'))
                + p9.geom_smooth(color='red', size=5, method="loess", se=False)
                + p9.theme_classic()
                + p9.theme(
                    axis_text=p9.element_text(size=33),
                    axis_title=p9.element_text(size=33, face='bold'))
                + p9.scale_x_datetime(date_labels='%Y-%m',
                                      date_breaks=datebreaks)
                + p9.labs(x='', y='')
            )
            # ggplot.save forwards unknown keyword arguments to matplotlib's
            # savefig, so the plot itself must not be passed as `plot=`.
            plot.save(filename='TL_2.jpeg', path="pdf/iteration/",
                      width=25, height=5, dpi=320)
    print('=================================duration_TL DONE =============================')
def go_to_time_plot3(large_go_to_time_probs_new: list,
                     large_go_to_time_probs_old: list,
                     average_minutes_per_game_values: list):
    """
    Plot go-to-time probability, old vs. new rules, no blowouts,
    300 matches/round.

    The chart (lines plus points, y axis fixed to [0, 1]) is saved to
    'figures/go_to_time_300_matches_prob_plot.png'; nothing is returned.
    """
    n_points = len(average_minutes_per_game_values)

    # The x values are repeated once per rule set so all columns line up.
    minutes = np.concatenate([average_minutes_per_game_values] * 2)
    probabilities = np.concatenate(
        [large_go_to_time_probs_new, large_go_to_time_probs_old])
    rules = np.concatenate(
        [np.repeat(label, n_points) for label in ('New', 'Old')])

    large_time_prob_data = pd.DataFrame({
        'Average minutes per game': minutes,
        'P(Go to time)': probabilities,
        'Rules': rules,
    })

    mapping = plt.aes(x='Average minutes per game',
                      y='P(Go to time)',
                      color='Rules')
    chart = (
        plt.ggplot(large_time_prob_data, mapping)
        + plt.geom_line()
        + plt.geom_point()
        + plt.ylim([0, 1])
        + plt.theme_classic()
    )
    chart.save(filename='figures/go_to_time_300_matches_prob_plot.png')
def plot_overlap_duration(self, data, options):
    """
    Scatter tag overlap against tag duration for matches with overlap > 0.

    :param data: mapping whose "matches" entry is a DataFrame with
        'tag_overlap' and 'tag_duration' columns.
    :param options: mapping whose "scenario_info" entry provides "model",
        "database" and "class"; the whole mapping is also printed in the
        title.
    :return: the plotnine plot object.
    """
    overlapping = data["matches"]
    overlapping = overlapping.loc[overlapping.tag_overlap > 0]

    scenario = options["scenario_info"]
    title_template = (
        "Proportion of tag overlapping with matching event depending on duration "
        "size for model {}, database {}, class {}\n"
        "with detector options {}"
    )
    title = title_template.format(
        scenario["model"], scenario["database"], scenario["class"], options)

    styling = theme(
        axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30}),
        plot_title=element_text(weight="bold", size=14,
                                margin={"t": 10, "b": 10}),
        figure_size=(10, 10),
        text=element_text(size=12, weight="bold"),
    )

    chart = ggplot(data=overlapping,
                   mapping=aes(x="tag_duration", y="tag_overlap"))
    chart = (
        chart
        + geom_point()
        + xlab("Tag duration")
        + ylab("Proportion tag overlapping with matching event")
        + theme_classic()
        + styling
        + ggtitle(title)
    )
    return chart
def mixed_linear_factors_plot(df, x_axis, factor):
    """
    Fit one mixed linear model per level of ``factor`` and save a faceted
    jitter plot of 'mse' over ``x_axis`` with the fitted lines overlaid.

    For every unique value of ``df[factor]`` the formula ``mse ~ <x_axis>``
    is fitted with each row as its own group, and the resulting intercept and
    slope are drawn with geom_abline in the matching facet. The plot is
    written to '<x_axis>_vs_<factor>_rmse.pdf'.

    Side effects: sets plotnine.options.figure_size and rounds the
    'percent_broken' and 'percent_fail_runs' columns of ``df`` in place.

    :param df: DataFrame with 'mse', 'method', 'percent_broken',
        'percent_fail_runs', ``x_axis`` and ``factor`` columns.
    :param x_axis: name of the regressor column.
    :param factor: name of the column used for facetting.
    """
    plotnine.options.figure_size = (10, 10)

    # Collect the coefficients as plain records so they keep float precision
    # regardless of the dtype of the factor column.
    records = []
    for step in df[factor].unique():
        factored_df = df[df[factor] == step]
        model = smf.mixedlm('mse ~ %s' % x_axis, factored_df,
                            groups=factored_df.index.values)
        fitted = model.fit()
        records.append({
            factor: step,
            'intercept': fitted.params['Intercept'],
            'slope': fitted.params[x_axis],
        })
    reg_lines = pd.DataFrame(records, columns=[factor, 'intercept', 'slope'])

    # The builtin int replaces the removed np.int alias.
    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)
    reg_lines[factor] = reg_lines[factor].round().astype(int)

    gg = (
        plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='method'))
        + plotnine.geom_jitter(width=2.5, show_legend=False)
        + plotnine.scale_color_manual(['#DB5F57'] * 4)
        + plotnine.facet_wrap(factor)
        + plotnine.geom_abline(
            plotnine.aes(intercept='intercept', slope='slope'),
            data=reg_lines)
        + plotnine.theme_classic(base_size=20)
    )
    gg.save('%s_vs_%s_rmse.pdf' % (x_axis, factor))
def go_to_time_plot2(go_to_time_probs_new: list,
                     go_to_time_probs_old: list,
                     go_to_time_blowout_probs_new: list,
                     go_to_time_blowout_probs_old: list,
                     average_minutes_per_game_values: list):
    """
    Plot go-to-time probability, new vs. old rules, blowouts vs. no blowouts,
    85 matches/round.

    The chart (lines plus points, y axis fixed to [0, 1]) is saved to
    'figures/go_to_time_prob_with_blowouts_plot.png'; nothing is returned.
    """
    # Label/series pairs in the order they are stacked into the long table.
    series = (
        ('New, no blowouts', go_to_time_probs_new),
        ('Old, no blowouts', go_to_time_probs_old),
        ('New, blowouts', go_to_time_blowout_probs_new),
        ('Old, blowouts', go_to_time_blowout_probs_old),
    )
    n_points = len(average_minutes_per_game_values)

    minutes = np.concatenate([average_minutes_per_game_values] * len(series))
    probabilities = np.concatenate([values for _label, values in series])
    rules = np.concatenate(
        [np.repeat(label, n_points) for label, _values in series])

    time_prob_blowout_data = pd.DataFrame({
        'Average minutes per game': minutes,
        'P(Go to time)': probabilities,
        'Rules': rules,
    })

    mapping = plt.aes(x='Average minutes per game',
                      y='P(Go to time)',
                      color='Rules')
    chart = (
        plt.ggplot(time_prob_blowout_data, mapping)
        + plt.geom_line()
        + plt.geom_point()
        + plt.ylim([0, 1])
        + plt.theme_classic()
    )
    chart.save(filename='figures/go_to_time_prob_with_blowouts_plot.png')
def medicine(Data):
    """
    Save a point chart of medicine names over time as
    pdf/iteration/Medicine.jpeg.

    Rows with Group 'me' or 'ma' are kept and merged with one row per
    calendar day between the first and last kept date. The chart is only
    written when the merged table has more than 5 rows. Any failure is
    reported on stdout instead of being raised.

    :param Data: DataFrame with 'Group', 'Date' and 'Name' columns.
    :return: None.
    """
    print('======= Creating medicine =======')
    try:
        # .copy() so the Date conversion does not write into a slice of Data.
        selection = Data[(Data.Group == 'me') | (Data.Group == 'ma')].copy()
        selection['Date'] = pd.to_datetime(selection['Date'])
        if selection.empty:
            raise ValueError("no rows with Group 'me' or 'ma'")

        sdate = selection['Date'].min()  # start date
        edate = selection['Date'].max()  # end date

        # One row per day in the observed range; the outer merge keeps days
        # without medicine as rows with missing values.
        all_days = pd.DataFrame(
            {'Date': pd.date_range(sdate, edate, freq='D')})
        timeline = pd.merge(all_days, selection, on='Date', how='outer')

        if len(timeline) > 5:
            f_tl1 = (
                p9.ggplot(data=timeline, mapping=p9.aes(x='Date', y='Name'))
                + p9.geom_point(color='red', size=3)
                + p9.theme_classic()
                + p9.theme(
                    axis_text=p9.element_text(size=18),
                    axis_title=p9.element_text(size=18, face='bold'))
                + p9.labs(title='', x='', y='')
            )
            # ggplot.save forwards unknown keyword arguments to matplotlib's
            # savefig, so the plot itself must not be passed as `plot=`.
            f_tl1.save(filename='Medicine.jpeg', path="pdf/iteration/",
                       width=25, height=5, dpi=320)
    except Exception as exc:
        # Best effort: the report continues without this chart, but the
        # cause is shown so failures can be diagnosed.
        print("Medicical graph failed")
        print("  reason: %r" % (exc,))
    print('=================================medicine DONE =============================')
def pca_plot(pca_data: pd.DataFrame, dim1: str, dim2: str, dim3: str):
    """
    Return a scatter plot of two columns of ``pca_data`` filled by a third.

    Parameters
    ----------
    pca_data:
        Dataframe holding the columns to plot.
    dim1:
        Name of the column to plot on the x-axis.
    dim2:
        Name of the column to plot on the y-axis.
    dim3:
        Name of the column mapped to the fill colour.

    Returns
    ----------
    plotnine plot with dim1 on the x-axis, dim2 on the y-axis, filled by dim3.

    Notes
    ----------
    Sets the global plotnine theme to theme_classic as a side effect.
    """
    # Set plot theme within function:
    p9.theme_set(p9.theme_classic())

    plot = (p9.ggplot(pca_data, p9.aes(x=dim1, y=dim2, fill=dim3))
            + p9.geom_point())

    # Inspect the first row by position: the index need not contain label 0,
    # and an empty frame simply has no value to inspect.
    if len(pca_data) > 0 and isinstance(pca_data[dim3].iloc[0], str):
        print('color type is qualitative')
        # No dedicated qualitative palette is applied yet; plotnine's
        # default is used.
    return plot
def derplot(adata=None, filename='derplot', embedding='tsne',
            feature='sample_type_tech', size=(12, 12), save=False,
            draw=False, psize=1):
    """
    Build a scatter plot of an embedding stored in ``adata.obs`` and
    optionally save it as a PNG.

    The x/y columns are ``embedding + '0'`` and ``embedding + '1'``; points
    are coloured by ``feature``. Start time and elapsed seconds are printed.

    :param adata: object whose ``obs`` attribute is the data to plot.
    :param filename: prefix of the output file name.
    :param embedding: embedding column prefix.
    :param feature: column used for colour.
    :param size: value assigned to the global plotnine figure size.
    :param save: when true, write
        '<filename>.<embedding>.<feature>.derplot.png' at 200 dpi.
    :param draw: accepted but not used by this function.
    :param psize: point size.
    :return: None.
    """
    started = datetime.datetime.now()
    p.options.figure_size = size
    savename = '.'.join((filename, embedding, feature, 'derplot.png'))
    started_label = started.strftime("%H:%M:%S")
    print(started_label, 'Starting ... \t', savename)

    p.theme_set(p.theme_classic())
    mapping = p.aes(embedding + '0', embedding + '1', color=feature)
    points = p.geom_point(size=psize, alpha=1, stroke=0)
    # Enlarge the legend keys independently of the plotted point size.
    legend = p.guides(color=p.guide_legend(override_aes={'size': 15}))
    pt = p.ggplot(mapping, adata.obs) + points + legend

    if save:
        pt.save(savename, format='png', dpi=200)

    elapsed = datetime.datetime.now() - started
    print(started.strftime("%H:%M:%S"),
          str(int(elapsed.total_seconds())),
          's to make: \t',
          savename)
def frequency_TL(Data):
    """
    Save a bar chart of the number of records per day as
    pdf/iteration/TL_4.jpeg.

    Records are grouped by the calendar date of the 'date' column, and days
    without records between the first and last date are added as rows with
    a missing count.

    Side effect: a 'date_4' column (calendar date of 'date') is added to
    ``Data`` in place.

    :param Data: DataFrame whose 'date' column has a datetime dtype.
    :return: None; progress and skip messages are printed.
    """
    from datetime import timedelta

    print('======= Creating frequency_TL =======')
    Data['date_4'] = Data['date'].dt.date

    if Data.empty:
        # Nothing to count; without this guard min()/max() below would raise.
        print('Plot not created; no data found.')
        print('=================================frequency_TL DONE =============================')
        return None

    # count() yields one non-null count per original column; the first of
    # them is used as the per-day count, whatever that column is called.
    tl4 = Data.groupby("date_4", sort=False, as_index=False).count()
    tl4 = tl4.iloc[:, 0:2]
    tl4.columns = ['date_4', 'n']

    sdate = min(tl4["date_4"])  # start date
    edate = max(tl4["date_4"])  # end date
    delta = edate - sdate

    # One row per calendar day so that days without records appear as gaps.
    all_days = pd.DataFrame(
        {'date_4': [sdate + timedelta(days=i) for i in range(delta.days + 1)]})
    data_with_missing_times = pd.merge(all_days, tl4, on='date_4', how='outer')

    # Wider tick spacing for longer observation periods.
    if delta.days > 1825:
        datebreaks = '18 months'
    elif delta.days > 1095:
        datebreaks = '12 months'
    else:
        datebreaks = '6 months'

    plot = (
        p9.ggplot(data=data_with_missing_times,
                  mapping=p9.aes(x='date_4', y='n'))
        + p9.geom_col(fill='red')
        + p9.theme_classic()
        + p9.theme(
            axis_text=p9.element_text(size=40),
            axis_title=p9.element_text(size=40, face='bold'))
        + p9.scale_x_datetime(date_labels='%Y-%m', date_breaks=datebreaks)
        + p9.labs(x='', y='')
    )
    # ggplot.save forwards unknown keyword arguments to matplotlib's savefig,
    # so the plot itself must not be passed as `plot=`.
    plot.save(filename='TL_4.jpeg', path="pdf/iteration/",
              width=25, height=5, dpi=320)
    print('=================================frequency_TL DONE =============================')
    return None
def summary(tags, opts=None):
    """
    Count annotations per (tag, background) pair and save a dodged bar chart
    of the counts to 'tag_species_bg.png'.

    The input frame and the summary table are printed.

    :param tags: DataFrame with 'tag' and 'background' columns.
    :param opts: accepted but not used by this function.
    :return: the plotnine plot that was saved.
    """
    print(tags)
    tags_summary = (
        tags.groupby(["tag", "background"])
        .agg({"tag": "count"})
        .rename(columns={"tag": "n_tags"})
        .reset_index()
        .astype({"background": "category", "tag": "category"})
    )
    print(tags_summary)

    plot = (
        ggplot(
            data=tags_summary,
            mapping=aes(x="tag", y="n_tags", fill="background"),
        )
        + geom_bar(stat="identity", show_legend=True, position=position_dodge())
        + xlab("Species")
        + ylab("Number of annotations")
        + geom_text(mapping=aes(label="n_tags"), nudge_y=15)
        + theme_classic()
        + theme(axis_text_x=element_text(angle=90, vjust=1, hjust=1,
                                         margin={"r": -30}))
    )
    # save() returns None, so keep the plot object itself rather than the
    # result of the save call.
    plot.save("tag_species_bg.png", width=10, height=8)
    return plot
def plot_pca_vis(pca: PCA, df: pd.DataFrame, pc_x: int = 0, pc_y: int = 1, num_dims: int = 5) -> plt:
    """
    Plot contribution of different dimensions to principal components.

    Draws one arrow per selected feature from the origin to its loadings on
    the two chosen components, labels the arrows with the column names of
    ``df`` and adds a unit circle.

    Parameters
    ----------
    pca: Fitted pca object to plot (its ``components_`` are used).
    df: Dataframe pca was fit on. Used for column names.
    pc_x: Index of principal component to plot on x-axis.
    pc_y: Index of principal component to plot on y-axis.
    num_dims: Number of features with the largest loading values to include
        for each axis.

    Returns
    ----------
    The ``plt`` module object with the figure drawn on it.

    https://stackoverflow.com/questions/45148539/project-variables-in-pca-plot-in-python
    Adapted into function by Tim Cashion
    """
    # Set plot theme within function:
    p9.theme_set(p9.theme_classic())

    loadings = pca.components_
    # argsort is ascending, so the last num_dims entries are the largest.
    top_for_x = loadings[pc_x, :].argsort()[-num_dims:]
    top_for_y = loadings[pc_y, :].argsort()[-num_dims:]
    keep = sorted(set(list(top_for_x) + list(top_for_y)))
    loadings = loadings[:, keep]

    x_loadings = loadings[pc_x, :]
    y_loadings = loadings[pc_y, :]
    origin = np.zeros(loadings.shape[1])

    plt.figure(figsize=(5, 5))
    plt.quiver(origin, np.zeros(loadings.shape[1]), x_loadings, y_loadings,
               angles='xy', scale_units='xy', scale=1)

    # Labels sit slightly offset from the arrow tips.
    labels = df.columns[keep]
    for label_x, label_y, label in zip(x_loadings + 0.02, y_loadings + 0.02, labels):
        plt.text(label_x, label_y, label, ha='center', va='center')

    unit_circle = plt.Circle((0, 0), 1, facecolor='none', edgecolor='b')
    plt.gca().add_artist(unit_circle)

    # Ensure correct aspect ratio and axis limits
    plt.axis('equal')
    plt.xlim([-1.0, 1.0])
    plt.ylim([-1.0, 1.0])

    plt.xlabel('PC ' + str(pc_x))
    plt.ylabel('PC ' + str(pc_y))
    plt.tight_layout()
    return plt
def _save_duration_bar_chart(frame, filename):
    """Save a bar chart of Duration counts in ``frame`` under pdf/iteration/."""
    chart = (
        p9.ggplot(data=frame, mapping=p9.aes(x='Duration'))
        + p9.geom_bar(fill='red', stat='count', size=100)
        + p9.theme_classic()
        + p9.theme(
            axis_text=p9.element_text(size=40),
            axis_title=p9.element_text(size=40, face='bold'))
        + p9.labs(title='', x='', y='No. of attacks')
    )
    # ggplot.save forwards unknown keyword arguments to matplotlib's savefig,
    # so the plot itself must not be passed as `plot=`.
    chart.save(filename=filename, path="pdf/iteration/",
               width=25, height=5, dpi=320)


def duration_graph(Data, Data_m):
    """
    Save bar charts of attack durations below 180.

    'Graph_ALL_2.jpeg' is drawn from ``Data`` and 'Graph_2.jpeg' from
    ``Data_m``; both are written to pdf/iteration/. A chart is skipped (with
    a printed message) when its frame has no row with Duration < 180.
    Nothing is drawn when every Duration in ``Data`` is missing.

    :param Data: DataFrame with a 'Duration' column.
    :param Data_m: DataFrame with a 'Duration' column.
    :return: None.
    """
    print('======= Creating duration_graph =======')
    if Data.Duration.isna().all():
        logging.warning('=================================Graph_2 aborted =============================')
        return None

    # Filter both frames before drawing anything so that a malformed frame
    # fails before any file is written.
    Graph2 = Data_m[Data_m.Duration < 180]
    Graph2_ALL = Data[Data.Duration < 180]

    for frame, filename in ((Graph2_ALL, 'Graph_ALL_2.jpeg'),
                            (Graph2, 'Graph_2.jpeg')):
        if len(frame) > 0:
            _save_duration_bar_chart(frame, filename)
        else:
            print('Plot not created; no data found.')

    print('=================================duration_graph DONE =============================')
    return None
def intensity_TL(Data):
    """
    Save a timeline of mean symptom intensity per date as
    pdf/iteration/TL_3.jpeg.

    Rows with Group == "sy" are averaged per Date, and days without symptoms
    between the first and last date are added as rows with a missing value.
    The chart is only written when the resulting table has more than 5 rows.

    :param Data: DataFrame with 'Intensity', 'Group' and 'Date' columns.
    :return: None; progress and skip messages are printed.
    """
    print('======= Creating intensity_TL =======')
    if Data.Intensity.isna().all():
        print("WARNING: All values for Intensity are NA's")
    else:
        Symptomes = Data[Data.Group == "sy"]
        # Plain .mean() instead of a dict passed to agg on a single column,
        # which pandas rejects as a nested renamer.
        tl3 = Symptomes.groupby("Date", as_index=False, sort=False)['Intensity'].mean()
        tl3['Date'] = pd.to_datetime(tl3['Date'])

        if tl3.empty:
            # No "sy" rows: there is no date range to build.
            print('Plot not created; no data found.')
        else:
            sdate = tl3['Date'].min()  # start date
            edate = tl3['Date'].max()  # end date
            delta = edate - sdate

            # One row per day in the observed range; the outer merge keeps
            # days without symptoms as rows with missing Intensity.
            all_days = pd.DataFrame(
                {'Date': pd.date_range(sdate, edate, freq='D')})
            data_with_missing_times = pd.merge(
                all_days, tl3, on='Date', how='outer')

            # Wider tick spacing for longer observation periods.
            if delta.days > 1825:
                datebreaks = '18 months'
            elif delta.days > 1095:
                datebreaks = '12 months'
            else:
                datebreaks = '6 months'

            if len(data_with_missing_times) > 5:
                plot = (
                    p9.ggplot(data=data_with_missing_times,
                              mapping=p9.aes(x='Date', y='Intensity'))
                    + p9.geom_point(color='red', size=5)
                    + p9.theme_classic()
                    + p9.theme(
                        axis_text=p9.element_text(size=40),
                        axis_title=p9.element_text(size=40, face='bold'))
                    + p9.scale_x_datetime(date_labels='%Y-%m',
                                          date_breaks=datebreaks)
                    + p9.labs(x='', y='')
                )
                # ggplot.save forwards unknown keyword arguments to
                # matplotlib's savefig, so no `plot=` argument is passed.
                plot.save(filename='TL_3.jpeg', path="pdf/iteration/",
                          width=25, height=5, dpi=320)
            else:
                print('Plot not created; no data found.')
    print('=================================intensity_TL DONE =============================')
def create(self, file_path: str) -> None:
    """
    Save the "Case Study Sizes" bar chart built from ``self._data``.

    Bars count rows per value of the 'count' column, each bar is labelled
    with its count, and the x axis is restricted to a fixed list of sizes.

    :param file_path: destination passed to the plot's save method.
    """
    size_levels = ["1", "2", "3", "5", "26", "52", "97", "100", "300", "537"]
    layers = [
        geom_bar(fill="#1e4f79"),
        geom_text(stat="count", va='bottom', size=24),
        scale_x_discrete(limits=size_levels),
        scale_y_continuous(breaks=[0, 5, 10], limits=[0, 10]),
        ggtitle("Case Study Sizes"),
        xlab("Number of Projects"),
        ylab("Number of Case Studies"),
        theme_classic(base_size=28, base_family="Helvetica"),
        theme(text=element_text(size=28)),
    ]

    chart = ggplot(self._data, aes(x="count", label="..count.."))
    for layer in layers:
        chart = chart + layer
    chart.save(file_path, width=14, height=7)
def density_plot1(num_matches_per_round: int,
                  match_lengths_from_one_round: list):
    """
    Density plot for match lengths, new rules, no blowouts, 85 matches/round.

    A vertical line is drawn at x = 50 and the x axis is limited to [0, 55].
    The chart is saved to 'figures/match_length_density_plot.png'.
    ``num_matches_per_round`` is accepted but not used here.
    """
    match_lengths = pd.DataFrame(
        {'Match length': match_lengths_from_one_round})

    marker = plt.geom_vline(xintercept=50, color='black', size=2)
    chart = (
        plt.ggplot(match_lengths, plt.aes(x='Match length'))
        + plt.geom_density()
        + marker
        + plt.theme_classic()
        + plt.xlim([0, 55])
    )
    chart.save(filename='figures/match_length_density_plot.png')
def create(self, file_path: str) -> None:
    """
    Save the "Design Pattern Counts" bar chart built from ``self._data``.

    Bars show 'count' per 'pattern' in the order the patterns appear in the
    data; each bar is labelled with its 'fraction' formatted as a percentage.

    :param file_path: destination passed to the plot's save method.
    """
    frame = self._data
    bar_labels = geom_text(va='bottom', size=24, format_string='{:.1%}')
    text_styling = theme(
        text=element_text(size=32),
        axis_text_x=element_text(rotation=45, ha="right"),
    )

    chart = (
        ggplot(frame, aes(x="pattern", y="count", label="fraction"))
        + geom_bar(stat="identity", fill="#1e4f79")
        + bar_labels
        + scale_x_discrete(limits=frame["pattern"])
        + scale_y_continuous(labels=comma_format(), expand=[0.1, 0])
        + ggtitle("Design Pattern Counts")
        + xlab("Design Pattern")
        + ylab("Count")
        + theme_classic(base_size=32, base_family="Helvetica")
        + text_styling
    )
    chart.save(file_path, width=24, height=8)
def create(self, file_path: str) -> None:
    """
    Save the "Classes per Category" bar chart built from ``self._data``.

    Bars show 'count' per 'category' in the order the categories appear in
    the data; each bar is labelled with its 'percent' value.

    :param file_path: destination passed to the plot's save method.
    """
    frame = self._data
    text_styling = theme(
        text=element_text(size=32),
        axis_text_x=element_text(rotation=45, ha="right"),
    )

    chart = (
        ggplot(frame, aes(x="category", y="count", label="percent"))
        + geom_bar(stat="identity", fill="#1e4f79")
        + geom_text(va='bottom', size=24)
        + scale_x_discrete(limits=frame["category"])
        + scale_y_continuous(labels=comma_format(), expand=[0.1, 0])
        + ggtitle("Classes per Category")
        + xlab("Category")
        + ylab("Number of Classes")
        + theme_classic(base_size=32, base_family="Helvetica")
        + text_styling
    )
    chart.save(file_path, width=7, height=7)
def barplot(df, key, figsize=(8, 6), vertical=False):
    """
    Return a count bar chart of ``df[key]`` with the count printed in each bar.

    Side effects: sets the global plotnine figure size and replaces
    ``df[key]`` in place with a categorical whose categories run from the
    least to the most frequent value.

    :param df: DataFrame containing ``key``.
    :param key: name of the column to count.
    :param figsize: figure size; its elements are reversed when ``vertical``.
    :param vertical: when true, flip the coordinates.
    :return: the plotnine plot object.
    """
    if vertical:
        figsize = tuple(reversed(list(figsize)))
    p9.options.figure_size = figsize

    # value_counts() is sorted most-frequent first; the categories are
    # stored in the opposite order.
    levels_by_count = df[key].value_counts().index.tolist()
    df[key] = pd.Categorical(df[key],
                             categories=list(reversed(levels_by_count)))

    layers = [p9.geom_bar(alpha=0.5)]
    if vertical:
        layers.append(p9.coord_flip())
    layers.append(
        p9.stat_count(geom="text",
                      position=p9.position_stack(vjust=0.5),
                      size=10))
    layers.append(p9.theme_classic())

    fig = p9.ggplot(p9.aes(x=key, y='..count..', label='..count..'), data=df)
    for layer in layers:
        fig += layer
    return fig
def _save_method_panel(data, baseline, y, baseline_y, y_label, filename):
    """Save one faceted per-task chart of ``y`` over 'percent_broken' with a baseline boxplot."""
    gg = (
        plotnine.ggplot(
            data, plotnine.aes(x='percent_broken', y=y, color='method'))
        + plotnine.facet_wrap('task', 2, 4)
        + plotnine.stat_boxplot(plotnine.aes(y=baseline_y, x=60),
                                data=baseline,
                                width=80,
                                color='#14639e',
                                show_legend=False)
        + plotnine.geom_jitter(width=2.5, show_legend=False)
        + plotnine.stat_smooth(method='gls', show_legend=False)
        + plotnine.xlab('Grade of Degradation in %')
        + plotnine.ylab(y_label)
        + plotnine.theme_classic(base_size=20)
    )
    gg.save(filename)


def method_plot(df, baseline_rul, baseline_mse, method):
    """
    Save the log-score and RMSE charts for a single method.

    Rows of ``df`` whose 'method' equals ``method`` are plotted against
    'percent_broken', faceted by 'task', with a boxplot of the respective
    baseline drawn at x = 60. Output files are '<method>_log_rul.pdf' and
    '<method>_rmse.pdf'.

    Side effect: sets plotnine.options.figure_size.

    :param df: DataFrame with 'method', 'task', 'percent_broken',
        'percent_fail_runs', 'log_score' and 'mse' columns.
    :param baseline_rul: DataFrame with a 'log_value' column.
    :param baseline_mse: DataFrame with a 'value' column.
    :param method: value of the 'method' column to select.
    """
    plotnine.options.figure_size = (15, 8)

    # .copy() so the rounding below writes to an independent frame.
    jan = df[df['method'] == method].copy()
    # The builtin int replaces the removed np.int alias.
    jan['percent_broken'] = jan['percent_broken'].round().astype(int)
    jan['percent_fail_runs'] = jan['percent_fail_runs'].round().astype(int)

    # NOTE: the previous `plotnine.ylim = (...)` assignments were dropped.
    # They replaced the library's ylim function with a tuple and never
    # affected the plots, so the y axis keeps its automatic limits.
    _save_method_panel(jan, baseline_rul, 'log_score', 'log_value',
                       'Logarithmic RUL-Score', '%s_log_rul.pdf' % method)
    _save_method_panel(jan, baseline_mse, 'mse', 'value',
                       'RMSE', '%s_rmse.pdf' % method)
def create(self, file_path: str) -> None:
    """
    Save the "Class Sizes" histograms built from ``self._data``.

    One 100-bin histogram of 'loc' is drawn per 'category' (stacked as rows
    with independent y scales) on an asinh-transformed x axis.

    :param file_path: destination passed to the plot's save method.
    """
    layers = [
        geom_histogram(bins=100, fill="#1e4f79"),
        facet_grid(facets="category ~ .", scales='free_y'),
        scale_x_continuous(trans=asinh_trans(), labels=asinh_labels),
        scale_y_continuous(labels=comma_format()),
        ggtitle("Class Sizes"),
        xlab("Lines of Code"),
        ylab("Number of Classes"),
        theme_classic(base_size=32, base_family="Helvetica"),
        theme(text=element_text(size=32), subplots_adjust={"hspace": 0.1}),
    ]

    chart = ggplot(self._data, aes("loc"))
    for layer in layers:
        chart = chart + layer
    chart.save(file_path, width=8, height=18)
def create(self, file_path: str) -> None:
    """
    Save the "Distributions of QMOOD Quality Attributes" histograms built
    from ``self._data``.

    One 100-bin histogram of 'value' is drawn per 'variable' in a
    three-column grid with free scales and an asinh-transformed x axis.

    :param file_path: destination passed to the plot's save method.
    """
    panel_spacing = {"wspace": 0.35, "hspace": 0.35}
    styling = theme(text=element_text(size=32), subplots_adjust=panel_spacing)

    chart = (
        ggplot(self._data, aes("value"))
        + geom_histogram(bins=100, fill="#1e4f79")
        + facet_wrap(facets="variable", scales="free", ncol=3)
        + scale_x_continuous(trans=asinh_trans(), labels=asinh_labels)
        + scale_y_continuous(labels=comma_format())
        + ggtitle("Distributions of QMOOD Quality Attributes")
        + xlab("Quality Attribute Value")
        + ylab("Number of Projects")
        + theme_classic(base_size=32, base_family="Helvetica")
        + styling
    )
    chart.save(file_path, width=24, height=12)
def create(self, file_path: str) -> None:
    """
    Save the "Intensity of Design Pattern Use" histograms built from
    ``self._data``.

    One 100-bin histogram of 'value' is drawn per 'variable' in a
    three-column grid with free scales; the x axis is limited to [0, 1].

    :param file_path: destination passed to the plot's save method.
    """
    styling = theme(
        text=element_text(size=32),
        axis_title_y=element_text(margin={"r": 40}),
        subplots_adjust={"wspace": 0.3, "hspace": 0.5},
    )
    layers = (
        geom_histogram(bins=100, fill="#1e4f79"),
        facet_wrap(facets="variable", scales="free", ncol=3),
        xlim(0, 1),
        scale_y_continuous(labels=comma_format()),
        ggtitle("Intensity of Design Pattern Use"),
        xlab("Percentage of Classes Participating in Design Pattern"),
        ylab("Number of Projects"),
        theme_classic(base_size=32, base_family="Helvetica"),
        styling,
    )

    chart = ggplot(self._data, aes("value"))
    for layer in layers:
        chart = chart + layer
    chart.save(file_path, width=24, height=24)
def plot_overlap_duration_bar(self, data, options):
    """
    Bar chart of matches per tag-duration bin, filled by tag-overlap bin.

    Only matches with overlap > 0 are used. Durations are binned at
    0, 0.25, 0.5, 0.75, 1, 1.5, 2 and infinity; overlaps at 0, 0.25, 0.5,
    0.75 and 1.

    Side effect: matches with overlap below 0.3 are written to
    'small_overlap.csv' in the working directory.

    :param data: mapping whose "matches" entry is a DataFrame with
        'tag_overlap' and 'tag_duration' columns.
    :param options: mapping whose "scenario_info" entry provides "model",
        "database" and "class"; the whole mapping is also printed in the
        title.
    :return: the plotnine plot object.
    """
    matches = data["matches"]
    # .copy() so the bin columns are added to an independent frame rather
    # than to a slice of the caller's data.
    matches = matches.loc[matches.tag_overlap > 0].copy()
    matches["tag_overlap_bin"] = pd.cut(
        matches.tag_overlap, [0, 0.25, 0.5, 0.75, 1]
    )
    matches["tag_duration_bin"] = pd.cut(
        matches.tag_duration, [0, 0.25, 0.5, 0.75, 1, 1.5, 2, float("inf")]
    )
    matches.loc[matches.tag_overlap < 0.3].to_csv("small_overlap.csv")

    scenario = options["scenario_info"]
    title = (
        "Proportion of tag overlapping with matching event depending on duration "
        + "size for model {}, database {}, class {}\n"
        + "with detector options {}"
    ).format(scenario["model"], scenario["database"], scenario["class"], options)

    plot = (
        ggplot(
            data=matches,
            mapping=aes(x="tag_duration_bin", fill="tag_overlap_bin"),
        )
        + geom_bar()
        + xlab("Tag duration")
        + ylab("Proportion tag overlapping with matching event")
        + theme_classic()
        + theme(
            axis_text_x=element_text(angle=90, vjust=1, hjust=1,
                                     margin={"r": -30}),
            plot_title=element_text(weight="bold", size=14,
                                    margin={"t": 10, "b": 10}),
            figure_size=(10, 10),
            text=element_text(size=12, weight="bold"),
        )
        + ggtitle(title)
    )
    return plot
def create(self, file_path: str) -> None:
    """
    Save one boxplot PDF per metric found in ``self._data``.

    For every unique value of the 'metric' column, 'value' is box-plotted per
    'category' without outlier markers. The visible y range runs from the
    2nd to the 98th percentile of the values, each bound pushed outwards by
    20% of its magnitude. Files are named '<file_path>.<metric>.pdf'.

    :param file_path: prefix of the output file names.
    """
    for metric in self._data["metric"].unique():
        data = self._data[self._data["metric"] == metric]

        # Lower bound first: the previous code passed the 98th percentile as
        # the lower limit and the 2nd as the upper one.
        # NOTE(review): confirm that an inverted window was not intended.
        low, high = np.percentile(data["value"], [2, 98])
        # Subtract/add a share of the magnitude so the padding also widens
        # the window when a bound is negative.
        y_limits = (low - 0.2 * abs(low), high + 0.2 * abs(high))

        (ggplot(data, aes(x="category", y="value"))
         + geom_boxplot(outlier_shape="")
         + coord_cartesian(ylim=y_limits)
         + ggtitle(metric)
         + xlab("Category")
         + ylab("Value")
         + theme_classic(base_size=28, base_family="Helvetica")
         ).save(f"{file_path}.{metric}.pdf", width=24, height=24)
def plot_pred_hist(label_list, pred_list, names=None, n_bins=10):
    """
    Draw histograms of labels and predicted probabilities, one facet per model.

    :param label_list: sequence of ground-truth label sequences, given as
        [(y1, y2, ...), (y1, y2, ...)]; each entry pairs with the entry of
        pred_list at the same position.
    :param pred_list: sequence of predicted-probability sequences; must be
        as long as label_list.
    :param names: model names, either None or a sequence as long as
        label_list. When None, two label sets are named ('train', 'test'),
        three are named ('train', 'valid', 'test'), and any other count is
        numbered from 0.
    :param n_bins: number of histogram bins.
    :return: plotnine plot object.
    TODO: how geom_vline should be displayed
    """
    if names is None:
        if len(label_list) == 2:
            names = ('train', 'test')
        elif len(label_list) == 3:
            names = ('train', 'valid', 'test')
        else:
            names = list(range(len(label_list)))

    # Facets are keyed by position so they keep the order of `names`; the
    # reverse map turns the facet key back into the model name for display.
    name_order = {name: position for position, name in enumerate(names)}
    name_order_rev = {str(position): name
                      for name, position in name_order.items()}

    model_column = list(chain.from_iterable(
        [[name] * len(labels) for name, labels in zip(names, label_list)]))
    d = pd.DataFrame({
        'y': list(chain.from_iterable(label_list)),
        'prediction': list(chain.from_iterable(pred_list)),
    }).assign(
        model=model_column
    ).melt(
        id_vars='model'
    ).assign(
        order=lambda x: x.model.map(name_order)
    ).sort_values(['order', 'variable'])

    # Per-model, per-variable means used for the vertical guide lines.
    d_mean = d.drop(columns='order').groupby(['variable', 'model']).mean(
    ).reset_index().rename(columns={'value': 'mean'})
    d = d.merge(d_mean, on=['variable', 'model'])

    return (
        ggplot(d, aes(x='value', y='..density..',
                      group='variable', fill='variable'))
        # bins now follows the n_bins argument instead of a fixed 10.
        + geom_histogram(position='identity', alpha=.5, bins=n_bins)
        + geom_vline(aes(xintercept='mean', group='variable',
                         color='variable', linetype='variable'))
        + labs(x='prediction', fill='frequency',
               linetype='mean', color='mean')
        + facet_wrap('~order', scales='free_y',
                     labeller=lambda x: name_order_rev[x])
        + theme_classic()
        + theme(figure_size=(6, 4))
    )
def plot_calibration(label_list, pred_list, names=None, **args):
    """
    Draw several calibration curves on a single set of axes.

    :param label_list: sequence of ground-truth label sequences, given as
        [(y1, y2, ...), (y1, y2, ...)]; each entry pairs with the entry of
        pred_list at the same position.
    :param pred_list: sequence of predicted-probability sequences; must be
        as long as label_list.
    :param names: model names, either None or a sequence as long as
        label_list. When None, a single label set is named ('model',), two
        are named ('train', 'test'), three are named
        ('train', 'valid', 'test'), and any other count is numbered from 0.
    :param args: keyword arguments forwarded to calibration_curve.
        'strategy' defaults to 'quantile' and 'n_bins' to 10 when not given.
    :return: plotnine plot object.
    TODO: display range when the input data is heavily skewed
    """
    if names is None:
        if len(label_list) == 2:
            names = ('train', 'test')
        elif len(label_list) == 3:
            names = ('train', 'valid', 'test')
        elif len(label_list) == 1:
            names = ('model',)
        else:
            names = list(range(len(label_list)))

    # **args is always a dict, so defaults are filled in directly; the old
    # `args is None` branch (with a different n_bins) could never run.
    args.setdefault('strategy', 'quantile')
    args.setdefault('n_bins', 10)

    calib = [calibration_curve(y, p, **args)
             for y, p in zip(label_list, pred_list)]

    # zip(*calib) groups the first element of every curve (fraction of
    # positives), then the second (mean predicted probability).
    frac, pred = tuple(
        [list(chain.from_iterable(x)) for x in zip(*calib)][0:2])

    # One model label per curve point so the label column lines up.
    points_per_curve = [len(curve_frac) for curve_frac, _pred in calib]
    models = chain.from_iterable(
        [[name] * count for name, count in zip(names, points_per_curve)])
    d_calib = pd.DataFrame({'pred': pred, 'frac': frac, 'model': models})

    return (
        ggplot(d_calib, aes(x='pred', y='frac', group='model', color='model'))
        + geom_segment(x=0, y=0, xend=1, yend=1, linetype=':', color='grey')
        + geom_line()
        + geom_point()
        + scale_color_discrete(breaks=names)
        + labs(x='mean estimated probability', y='fraction of positives')
        + coord_equal(ratio=1)
        + theme_classic()
        + theme(figure_size=(4, 4))
    )
def general(Data):
    """
    Save a scatter chart of time of day over date, coloured by intensity, as
    pdf/iteration/TL_1.jpeg.

    Nothing is drawn when every Intensity value is missing (which includes an
    empty frame).

    Side effects on ``Data`` when the chart is drawn: the 'date' column is
    converted to datetime (unparseable values become NaT) and a 'Minutesss'
    column holding minutes since midnight is added.

    :param Data: DataFrame with 'Intensity' and 'date' columns.
    :return: None; progress messages are printed and logged.
    """
    logging.info('======= Creating general =======')
    print('======= Creating general =======')
    if Data.Intensity.isna().all():
        print("WARNING: All values for Intensity are NA's")
    else:
        timestamps = pd.to_datetime(Data['date'], errors='coerce')
        Data['date'] = timestamps
        Data['Minutesss'] = timestamps.dt.hour * 60 + timestamps.dt.minute

        plot = (
            p9.ggplot(data=Data,
                      mapping=p9.aes(x='date', y='Minutesss',
                                     colour='Intensity'))
            + p9.geom_point(size=2)
            + p9.theme_classic()
            + p9.scale_colour_gradient(low="white", high="red",
                                       aesthetics="colour")
            + p9.theme(
                axis_text=p9.element_text(size=18),
                axis_title=p9.element_text(size=18, face='bold'),
                legend_position='none')
            + p9.scale_x_datetime(date_labels='%b %y',
                                  date_breaks='6 months')
            + p9.labs(x='', y='', colour='Intensity: ')
        )
        # Data is non-empty here (an empty frame takes the branch above).
        # ggplot.save forwards unknown keyword arguments to matplotlib's
        # savefig, so the plot itself must not be passed as `plot=`.
        plot.save(filename='TL_1.jpeg', path="pdf/iteration/",
                  width=25, height=5, dpi=320)
    print('=================================general DONE =============================')
def density_plot2(num_matches_per_round: int,
                  match_lengths_from_one_round: list,
                  match_lengths_from_one_round_with_blowouts: list):
    """
    Density plot for match lengths, new rules, blowouts vs. no blowouts,
    85 matches/round.

    Each series is labelled with ``num_matches_per_round`` repetitions of
    'No' or 'Yes'. A vertical line is drawn at x = 50 and the x axis is
    limited to [0, 55]. The chart is saved to
    'figures/match_length_with_blowout_density_plot.png'.
    """
    lengths = np.concatenate([
        match_lengths_from_one_round,
        match_lengths_from_one_round_with_blowouts,
    ])
    blowout_flags = np.concatenate(
        [np.repeat(flag, num_matches_per_round) for flag in ('No', 'Yes')])
    match_lengths_blowout = pd.DataFrame({
        'Match length': lengths,
        'Blowouts': blowout_flags,
    })

    mapping = plt.aes(x='Match length', color='Blowouts')
    chart = (
        plt.ggplot(match_lengths_blowout, mapping)
        + plt.geom_density()
        + plt.geom_vline(xintercept=50, color='black', size=2)
        + plt.xlim([0, 55])
        + plt.theme_classic()
    )
    chart.save(filename='figures/match_length_with_blowout_density_plot.png')
def pattern_research_plot(data):
    """
    Return a dodged bar chart of the 'D', 'W' and 'MS' columns per
    'hour_category'.

    The three columns are melted into a 'series'/'count' pair, 'series' is
    made categorical in the order D, W, MS, and the bars are filled with a
    three-step gradient between "#004996" and "#018ace" (taken in reverse).
    The y axis is shown from 0 to 100 with percent labels.

    :param data: DataFrame with 'hour_category', 'D', 'W' and 'MS' columns.
    :return: the plotnine plot object.
    """
    from colour import Color

    series_order = ['D', 'W', 'MS']

    gradient = Color("#004996").range_to(Color("#018ace"), 3)
    blue = [shade.hex_l for shade in gradient][::-1]

    long_data = data.melt(id_vars=['hour_category'],
                          value_vars=['D', 'W', 'MS'],
                          var_name='series',
                          value_name='count')
    long_data = long_data.assign(
        series=pd.Categorical(long_data['series'], categories=series_order))

    def as_percent(values):
        return ["%d%%" % (v) for v in values]

    plot = (
        p9.ggplot(data=long_data,
                  mapping=p9.aes(x='hour_category', y='count', fill='series'))
        + p9.geom_bar(stat='identity', position='dodge')
        + p9.scale_fill_manual(blue, labels=['D', 'W', 'MS'])
        + p9.theme_classic()
        + p9.theme(axis_text=p9.element_text(size=8),
                   axis_title=p9.element_text(size=8, face='bold'))
        + p9.coord_cartesian(ylim=(0, 100))
        + p9.scale_y_continuous(labels=as_percent)
        + p9.labs(x='hour_category', y='Ratio of attacks')
    )
    return plot
def test_theme_classic(self):
    """
    Check the shared plot ``self.g`` with theme_classic applied.

    The themed plot, with ``_theme`` added on top, is compared against the
    name 'theme_classic' (presumably an image baseline; how the comparison
    works is defined outside this function).
    """
    titled = self.g + labs(title='Theme Classic')
    themed = titled + theme_classic()
    # `+` binds tighter than `==`, so the comparison applies to the sum.
    assert (themed + _theme) == 'theme_classic'