def _plot_difficulties():
    """
    Plots the physical difficulty of every logfile over time into one shared scatter plot.

    Each logfile is converted to numbers and resampled before it is plotted.
    NOTE: no logfile is filtered out here; every entry of sd.df_list is drawn.

    Folder: Report/
    Plot name: difficulties.pdf
    """
    print("Plotting difficulties...")

    resolution = 10  # resample every x seconds -> the bigger, the smoother
    _, ax = plt.subplots()

    for df in sd.df_list:
        df_num_resampled = ph.resample_dataframe(pl.transform_df_to_numbers(df), resolution)
        # Low alpha so that overlapping points of different logfiles stay distinguishable
        ax.scatter(df_num_resampled['Time'], df_num_resampled['physDifficulty'],
                   c=ph.green_color, alpha=0.3)

    ax.set_ylabel('Physical Difficulty')
    ax.set_xlabel('Time (s)')
    plt.yticks([1, 2, 3], ['Low', 'Medium', 'High'])
    plt.title('Difficulties')
    ph.save_plot(plt, 'Report/', 'difficulties.pdf')
def _crashes_per_obstacle_arrangement():
    """
    Plots, for every obstacle arrangement, the percentage of its occurrences that ended in a crash.

    EVENT_OBSTACLE rows count as an occurrence, EVENT_CRASH rows count as an occurrence and a crash.
    Arrangements are ordered by the length of their name.

    Folder: Logfiles/
    Plot name: barplot_%crashes_per_obstacle_arrangement.pdf
    """
    all_logs = transform_df_to_numbers(pd.concat(sd.df_list, ignore_index=True))

    # arrangement -> [#occurences, #crashes]
    counts = {}
    for _, entry in all_logs.iterrows():
        logtype = entry['Logtype']
        if logtype == 'EVENT_CRASH':
            crashed = 1
        elif logtype == 'EVENT_OBSTACLE':
            crashed = 0
        else:
            continue
        arrangement = entry['obstacle']
        seen, crashes = counts.get(arrangement, (0, 0))
        counts[arrangement] = [seen + 1, crashes + crashed]

    ordered = collections.OrderedDict(
        sorted(counts.items(), key=lambda item: len(item[0])))

    arrangements = ordered.keys()
    table = np.zeros(shape=(len(arrangements), 3))
    for position, (seen, crashes) in enumerate(ordered.values()):
        table[position][0] = seen  # #Occurences
        table[position][1] = crashes  # #Crashes
        table[position][2] = crashes / seen * 100

    stats = pd.DataFrame(table, index=arrangements,
                         columns=["#Occurences", "#Crashes", "Crashes in %"])

    _, ax = plt.subplots()
    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_linewidth(0.3)

    plt.xticks(rotation=90)
    plt.title('Crashes vs. obstacle arrangement')
    plt.ylabel('Crashes at this arrangement [%]')
    plt.xlabel('Obstacle arrangement')
    plt.bar(stats.index, stats['Crashes in %'])

    hp.save_plot(plt, 'Logfiles/', 'barplot_%crashes_per_obstacle_arrangement.pdf')
def _plot_difficulty_vs_size_obstacle_scatter_plot():
    """
    Plots obstacle size against difficulty level in a scatter plot. The marker size (`s` of plt.scatter) of
    each point is proportional to the value returned for that (difficulty, size) combination.

    The values are normalized per difficulty so that the largest marker of each level has size 2000.
    Assumes _get_number_of_obstacles_per_difficulty() returns 15 values, ordered as 5 obstacle sizes (0-4)
    for each of the 3 difficulties - TODO confirm against that helper.

    Folder: Logfiles/
    Plot name: scatter_difficulty_vs_num_obstacles.pdf
    """
    values = _get_number_of_obstacles_per_difficulty()

    # Scale each difficulty's group of 5 values independently
    for i in [0, 1, 2]:
        li = values[5 * i:5 * i + 5]
        maximum = max(li) if (max(li) > 0) else 1  # avoid dividing by zero for an empty level
        values[5 * i:5 * i + 5] = [x / maximum * 2000 for x in li]

    # plt.subplots() already creates the figure; no separate plt.figure() is needed
    _, ax = plt.subplots()
    plt.title('Size of obstacle vs difficulty ')
    plt.ylabel('obstacle size')
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))  # Only show whole numbers as obstacle sizes
    plt.xticks([1, 2, 3], ['Low', 'Medium', 'High'])
    plt.xlabel('Difficulty')

    x = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
    y = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
    plt.scatter(x, y, s=values)

    hp.save_plot(plt, 'Logfiles/', 'scatter_difficulty_vs_num_obstacles.pdf')
def _plot_hr_vs_difficulty_scatter_plot():
    """
    Scatter plot of the heartrate against the difficulty, both averaged over short time windows.

    Folder: Logfiles/
    Plot name: scatter_difficulty_vs_heartrate.pdf
    """
    window_seconds = 3

    merged = transform_df_to_numbers(pd.concat(sd.df_list, ignore_index=True))
    merged.set_index('timedelta', inplace=True)

    # Averaging per window means a point can lie "between" Low/Medium/High, depending on how many
    # seconds of the window were spent on which level.
    window_means = merged.resample(f'{window_seconds}S').mean()

    plt.title('Difficulty vs. heartrate')
    plt.ylabel('heartrate')
    plt.xlabel('Difficulty')
    plt.scatter(window_means['physDifficulty'], window_means['Heartrate'], s=30)
    plt.xticks([1, 2, 3], ['Low', 'Medium', 'High'])

    hp.save_plot(plt, 'Logfiles/', 'scatter_difficulty_vs_heartrate.pdf')
def _plot_heartrate_change():
    """
    Plots, for each logfile that contains heartrate data, a histogram of the percentage change of the
    heartrate from one (1s-resampled) time point to the next.

    Logfiles whose heartrate is -1 everywhere are skipped.

    Folder: Logfiles/Abs Heartrate Changes/
    Plot name: histogram_hr_change_percentage_{logfile name}.pdf
    """
    bpm_changes = []  # Percentage changes in HR, one entry per plotted logfile
    logfile_indices = []  # Index into sd.df_list / sd.names_logfiles for each entry of bpm_changes

    for idx, df in enumerate(sd.df_list):
        if (df['Heartrate'] == -1).all():
            continue
        resampled = hp.resample_dataframe(df, 1)
        percentage_change = np.diff(resampled['Heartrate']) / resampled['Heartrate'][:-1] * 100.
        x = percentage_change[np.logical_not(np.isnan(percentage_change))]
        logfile_indices.append(idx)
        bpm_changes.append(x)

    # Skipped logfiles shift the positions in bpm_changes, so the name must be looked up
    # with the original logfile index.
    for logfile_idx, changes in zip(logfile_indices, bpm_changes):
        name = str(sd.names_logfiles[logfile_idx])
        plt.figure()
        plt.title('Heartrate change for plot ' + name)
        # Labels belong to the figure that was just created, so they are set per histogram
        plt.ylabel('#Times HR changed')
        plt.xlabel('Change in Heartrate [%]')
        plt.hist(changes, color=hp.blue_color)
        hp.save_plot(plt, 'Logfiles/Abs Heartrate Changes/',
                     'histogram_hr_change_percentage_' + name + '.pdf')
def _plot_mean_and_std_hr_boxplot():
    """
    Plots the heartrate distribution per user as a boxplot (one box per userID).

    The x tick labels are the first two characters of the logfile names, de-duplicated in order
    of first appearance.
    NOTE(review): this assumes that order matches the order of the userID groups - confirm.

    Folder: Logfiles/
    Plot name: boxplot_mean_hr_per_user.pdf
    """
    conc_dataframes = pd.concat(sd.df_list, ignore_index=True)

    conc_dataframes[['Heartrate', 'userID']].boxplot(by='userID', grid=False, sym='r+')

    names = [n[:2] for n in sd.names_logfiles]
    locs, _ = plt.xticks()  # Keep the tick locations chosen by boxplot, only replace the labels
    plt.xticks(locs, list(OrderedDict.fromkeys(names)))

    plt.ylabel('Heartrate (bpm)')
    plt.xlabel('User name')
    plt.title('')  # Remove the automatic title that boxplot(by=...) adds

    hp.save_plot(plt, 'Logfiles/', 'boxplot_mean_hr_per_user.pdf')
def _plot_average_hr_over_all_logfiles():
    """
    Line plot of the heartrate averaged over all logfiles.

    All logfiles are truncated to the duration of the shortest one before averaging.

    Folder: Logfiles/
    Plot name: lineplot_average_heartrate.pdf
    """
    plt.subplots()
    plt.ylabel('Heartrate (bpm)')
    plt.xlabel('Playing time (s)')
    plt.title('Average Heartrate across all users')

    all_logs = pd.concat(sd.df_list, ignore_index=True)

    # Duration of the shortest logfile = smallest of the per-logfile maximal times
    shortest_duration = all_logs.groupby(['userID', 'logID'])['Time'].max().min()

    # Work on a copy of the truncated frame to prevent SettingWithCopyWarning
    truncated = all_logs[all_logs['Time'] < shortest_duration].copy()

    # Mean over all logfiles per point in time
    mean_hr = truncated.groupby(['timedelta'])[['timedelta', 'Heartrate']].mean()
    mean_hr.reset_index(inplace=True)

    smoothed = hp.resample_dataframe(mean_hr, 10)
    plt.plot(smoothed['Time'], smoothed['Heartrate'])

    hp.save_plot(plt, 'Logfiles/', 'lineplot_average_heartrate.pdf')
def _plot_feature_distributions(X):
    """
    Draws one histogram per feature (= per column of the feature matrix).

    For 'timedelta_to_last_obst' the bins are 0.005 wide and restricted to mean +- 2 standard deviations;
    all other features use the default binning.

    :param X: Feature matrix

    Folder: Features/Feature_distributions/
    Plot name: histogram_{feature name}.pdf
    """
    print("Plotting histogram of each feature...")

    for column, feature_name in enumerate(f_factory.feature_names):
        column_values = X[:, column]
        plt.figure()

        if feature_name != 'timedelta_to_last_obst':
            plt.hist(column_values)
        else:
            center = np.mean(column_values)
            spread = np.std(column_values)
            bin_edges = np.arange(center - 2 * spread, center + 2 * spread, 0.005)
            plt.hist(column_values, bins=bin_edges)

        plt.title(feature_name)
        plt.tight_layout()
        hp.save_plot(plt, 'Features/Feature_distributions/',
                     'histogram_' + feature_name + '.pdf')
def _plot_hr_of_dataframes():
    """
    Creates one heartrate-over-time line plot per dataframe (used to compare normalized hr to original hr).
    Dataframes whose heartrate is -1 everywhere are skipped.

    Only works for real data at the moment, because the logfile names do not exist for synthesized data.

    Folder: Logfiles/Heartrate/
    Plot name: lineplot_hr_{logfile name}.pdf
    """
    print("Plotting heartrate of dataframes over time...")
    resolution = 5

    for logfile_pos, log in enumerate(sd.df_list):
        if (log['Heartrate'] == -1).all():
            continue

        smoothed = hp.resample_dataframe(log, resolution)

        _, hr_axis = plt.subplots()
        hr_axis.plot(smoothed['Time'], smoothed['Heartrate'], hp.blue_color)
        hr_axis.set_xlabel('Playing time (s)')
        hr_axis.set_ylabel('Heartrate', color=hp.blue_color)
        hr_axis.tick_params('y', colors=hp.blue_color)

        hp.save_plot(plt, 'Logfiles/Heartrate/',
                     'lineplot_hr_' + sd.names_logfiles[logfile_pos] + '.pdf')
def _plot_heat_map_of_grid_search(cv_results, Classifier):
    """
    Plots a heatmap over two hyperparameters, showing the corresponding mean test score.

    Problem: We can only show 2 hyperparameters. The first two 'param_*' entries of cv_results are used;
    an IndexError is raised if there are fewer than two.
    Assumes the search evaluated the full grid of those two parameters, with the first one varying
    slowest, so that the scores can be reshaped into a (first param x second param) matrix - TODO confirm
    for the searches this is called with.

    :param cv_results: cv_results_ of the hyperparameter search
    :param Classifier: the classifier (its `name` attribute is used as file name)

    Folder: Gridsearch/
    Plot name: {classifier name}.pdf
    """
    # Unique values per hyperparameter in order of first appearance. (A set would lose the order and
    # the tick labels would no longer line up with the rows/columns of the score matrix.)
    params = [
        list(dict.fromkeys(v.compressed()))
        for k, v in cv_results.items() if k.startswith('param_')
    ]

    plt.figure()
    results_df = pd.DataFrame(cv_results)
    scores = np.array(results_df.mean_test_score).reshape(len(params[0]), len(params[1]))

    # Rows of `scores` belong to the first parameter -> y axis; columns to the second -> x axis
    sns.heatmap(scores, annot=True, xticklabels=params[1], yticklabels=params[0], cmap=plt.cm.RdYlGn)
    plt.title('Grid Search roc_auc Score')

    plots_helpers.save_plot(plt, 'Gridsearch/', Classifier.name + '.pdf')
def _plot_mean_value_of_heartrate_at_crash():
    """
    Plots, for each logfile, the mean heartrate at crash events (EVENT_CRASH) next to the mean heartrate
    at obstacle events without crash (EVENT_OBSTACLE). The standard deviations are drawn as error bars.

    Folder: Report/
    Plot name: barplot_mean_heartrate_at_crash.pdf
    """
    print("Plotting mean value of heartrate when crash vs no crash happened...")

    means_when_crash = []
    means_when_no_crash = []
    stds_when_crash = []
    stds_when_no_crash = []

    for df in sd.df_list:
        df_with_crash = df[df['Logtype'] == 'EVENT_CRASH']
        df_without_crash = df[df['Logtype'] == 'EVENT_OBSTACLE']
        means_when_crash.append(df_with_crash['Heartrate'].mean())
        means_when_no_crash.append(df_without_crash['Heartrate'].mean())
        stds_when_crash.append(df_with_crash['Heartrate'].std())
        stds_when_no_crash.append(df_without_crash['Heartrate'].std())

    _, ax = plt.subplots()
    bar_width = 0.3
    line_width = 0.3
    index = np.arange(len(means_when_crash))
    error_style = {'elinewidth': line_width, 'capsize': 1.4, 'markeredgewidth': line_width}

    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_linewidth(line_width)

    # Each bar call gets its own copy of the error bar style
    plt.bar(index, means_when_crash, bar_width,
            color=ph.red_color,
            label='Heartrate when crash',
            yerr=stds_when_crash,
            error_kw=dict(error_style))
    plt.bar(index + bar_width, means_when_no_crash, bar_width,
            color=ph.blue_color,
            label='Heartrate when no crash',
            yerr=stds_when_no_crash,
            error_kw=dict(error_style))

    plt.ylabel('Heartrate (normalized)')
    plt.xlabel('Logfile')
    plt.title('Average value of Heartrate when crash or not crash')
    # One label per logfile (1-based), whatever the number of logfiles is
    plt.xticks(index + bar_width / 2, np.arange(1, len(index) + 1), rotation='horizontal')
    plt.legend(prop={'size': 6})

    filename = 'barplot_mean_heartrate_at_crash.pdf'
    ph.save_plot(plt, 'Report/', filename)
def _plot_feature(X, i):
    """
    Plots the feature at position i of each logfile over time and marks the obstacles with a crash.

    The rows of X are expected to be the obstacles of all logfiles stacked in the order of
    sd.obstacle_df_list; the rows of one logfile are cut out via a running offset.

    :param X: Feature matrix
    :param i: Feature index to plot (look at features_factory for order)

    Folder: Features/Feature Plots/{feature name}/
    Plot name: lineplot_{feature name}_{logfile_name}.pdf
    """
    feature_name = f_factory.feature_names[i]
    print('Plotting feature ' + feature_name + ' of each logfile over time...')

    start = 0  # Row in X at which the current logfile begins
    for idx, _ in enumerate(sd.df_list):
        obst_df = sd.obstacle_df_list[idx]
        times = obst_df['Time']
        samples = list(X[start:start + len(times), i])
        start += len(times)

        _, ax1 = plt.subplots()

        # Plot crashes. `samples` is positional, so the row position (not the index label of the
        # dataframe) has to be used to look up the feature value at a crash.
        crash_times = []
        crash_values = []
        for position, (_, row) in enumerate(obst_df.iterrows()):
            if row['crash']:
                crash_times.append(row['Time'])
                crash_values.append(samples[position])

        plt.scatter(crash_times, crash_values, c='r', marker='.', label='crash')
        plt.legend()

        ax1.plot(times, samples, c=hp.blue_color)
        ax1.set_xlabel('Playing time (s)')
        ax1.set_ylabel(feature_name, color=hp.blue_color)
        plt.title('Feature ' + feature_name + ' for logfile ' + sd.names_logfiles[idx])
        ax1.tick_params('y', colors=hp.blue_color)

        ax1.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
        ax1.set_axisbelow(True)
        ax1.spines['top'].set_linewidth(0.3)
        ax1.spines['right'].set_linewidth(0.3)

        filename = 'lineplot_' + feature_name + '_' + sd.names_logfiles[idx] + '.pdf'
        hp.save_plot(plt, 'Features/Feature Plots/' + feature_name + '/', filename)
def _plot_heartrate_and_events():
    """
    Draws the heartrate of the logfile at index 4 (user Is) as a line and overlays the crashes (red dots),
    the Brokenship events (yellow vertical lines) and the Shieldtutorial events (green vertical lines).

    Note: Same as plot_heartrate_and_events in plots_logfiles.py, but only for one specific logfile.
    The data is (re-)loaded via sd.setup() without heartrate normalization and with tutorials kept.

    Folder: Report/
    Plot name: lineplot_hr_and_events.pdf
    """
    sd.setup(
        fewer_data=False,  # Specify if we want fewer data (for debugging purposes...)
        normalize_heartrate=False,
        remove_tutorials=False  # We want tutorial to be exactly at 3 and 7.5 minutes!
    )

    print("Plotting heartrate and events...")

    logfile_idx = 4
    log = sd.df_list[logfile_idx]

    # Heartrate line
    _, hr_axis = plt.subplots()
    hr_axis.plot(log['Time'], log['Heartrate'], ph.blue_color, linewidth=1.0, label='Heartrate')
    hr_axis.set_xlabel('Playing time (s)')
    hr_axis.set_ylabel('Heartrate', color=ph.blue_color)
    hr_axis.tick_params('y', colors=ph.blue_color)

    # Crashes: look up the heartrate of the logfile row that has the same time as the crash
    crash_rows = [row for _, row in sd.obstacle_df_list[logfile_idx].iterrows() if row['crash']]
    times_crashes = [row['Time'] for row in crash_rows]
    heartrate_crashes = [
        log[log['Time'] == row['Time']].iloc[0]['Heartrate'] for row in crash_rows
    ]
    plt.scatter(times_crashes, heartrate_crashes, c='r', marker='.', label='Crash')

    # Brokenship and Shieldtutorial phases: one vertical line per row in that gamemode
    hr_max = log['Heartrate'].max()
    hr_min = log['Heartrate'].min()
    event_styles = (
        ('BROKENSHIP', 'y', 'Ship broken'),
        ('SHIELDTUTORIAL', 'g', 'Shield tutorial'),
    )
    for gamemode, line_color, legend_label in event_styles:
        event_times = [row['Time'] for _, row in log.iterrows() if row['Gamemode'] == gamemode]
        for event_time in event_times:
            plt.vlines(x=event_time, ymin=hr_min, ymax=hr_max + 0.2, color=line_color, linewidth=1,
                       label=legend_label)

    # Collapse identical labels, otherwise we'd have one legend entry for each vline
    handles, labels = plt.gca().get_legend_handles_labels()
    unique_entries = OrderedDict(zip(labels, handles))
    plt.legend(unique_entries.values(), unique_entries.keys())

    ph.save_plot(plt, 'Report/', 'lineplot_hr_and_events.pdf')
def _feature_selection(X, y, verbose=False):
    """
    Feature Selection with ExtraTreesClassifier. Plots (and optionally prints) the importance of the features.

    Source: http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

    :param X: Feature matrix
    :param y: labels
    :param verbose: Whether a detailed report should be printed out

    :return: tuple (new feature matrix with the selected features only, unchanged labels y)

    Folder: Features/
    Plot name: feature_importance_decision_tree.pdf
    """
    clf = ExtraTreesClassifier(n_estimators=250, class_weight='balanced')
    forest = clf.fit(X, y)

    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]  # feature indices, most important first

    # Reuse the fitted forest: the selection is then based on exactly the importances plotted below
    # and the (randomized) forest is not trained a second time.
    X_new = SelectFromModel(forest, prefit=True).transform(X)

    # Print the feature ranking
    if verbose:
        print("Feature ranking:")
        print('\n# features after feature-selection: ' + str(X_new.shape[1]))

    x_ticks = []
    for f in range(X.shape[1]):
        x_ticks.append(f_factory.feature_names[indices[f]])
        if verbose:
            print("%d. feature %s (%.3f)" % (f + 1, f_factory.feature_names[indices[f]],
                                             importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), x_ticks, rotation='vertical')
    plt.xlim([-1, X.shape[1]])
    plt.tight_layout()

    plots_helpers.save_plot(plt, 'Features/', 'feature_importance_decision_tree.pdf')

    return X_new, y
def _plot_mean_value_of_feature_at_crash(X, y):
    """
    For each feature, plots its mean value (with the standard deviation as error bar) over the samples
    with a crash (label 1) next to its mean value over the samples without a crash (label 0).

    :param X: Feature matrix
    :param y: labels

    Folder: Features/Crash Correlation/
    Plot name: barplot_mean_{feature name}_at_crash.pdf
    """
    print("Plotting mean value of each feature when crash vs no crash happened...")

    rows_with_crash = [val for (idx, val) in enumerate(X) if y[idx] == 1]
    rows_without_crash = [val for (idx, val) in enumerate(X) if y[idx] == 0]

    # Iterate over all features and plot corresponding plot
    for i in range(len(X[0])):
        feature_name = str(f_factory.feature_names[i])
        values_when_crash = [row[i] for row in rows_with_crash]
        values_when_no_crash = [row[i] for row in rows_without_crash]

        mean_when_crash = np.mean(values_when_crash)
        mean_when_no_crash = np.mean(values_when_no_crash)
        std_when_crash = np.std(values_when_crash)
        std_when_no_crash = np.std(values_when_no_crash)

        plt.subplots()

        # Each bar gets the standard deviation of its own group as error bar
        plt.bar(1, mean_when_no_crash, width=0.5, yerr=std_when_no_crash, color=hp.blue_color)
        plt.bar(2, mean_when_crash, width=0.5, yerr=std_when_crash, color=hp.green_color)

        plt.ylim(0)
        plt.xticks([1, 2], ['No crash', 'Crash'])
        plt.ylabel(feature_name)
        plt.title('Average value of feature ' + feature_name + ' when crash or not crash')

        filename = 'barplot_mean_' + feature_name + '_at_crash.pdf'
        hp.save_plot(plt, 'Features/Crash Correlation/', filename)
def _plot_hr(dataframe, i):
    """
    Draws the heartrate of one dataframe over the playing time and saves it.

    :param dataframe: Dataframe from which the heartrate should be plotted
    :param i: id to differentiate plots (becomes part of the file name)

    Folder: Logfiles/synthesized_data/
    Plot name: heartrate_testdata_{i}.pdf
    """
    figure, hr_axis = plt.subplots()
    figure.suptitle('heartrate')

    hr_axis.plot(dataframe['Time'], dataframe['Heartrate'])
    hr_axis.set_xlabel('Playing time (s)')
    hr_axis.set_ylabel('Heartrate')

    target_name = f'heartrate_testdata_{i!s}.pdf'
    plots_helpers.save_plot(plt, 'Logfiles/synthesized_data/', target_name)
def plot_precision_recall_curve(classifier, X, y, filename):
    """
    Trains the classifier on a 70/30 train/test split and plots and saves its precision-recall curve
    on the test part. Classifiers without a decision_function() only get a message printed.

    :param classifier: Classifier to generate precision-recall curve from
    :param X: Feature matrix
    :param y: labels
    :param filename: Name of the file the plot should be stored to

    Folder: Performance/Precision Recall Curves/
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Scaling: fit on the training set only, then apply the same transformation to the test set
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Same principle for the removal of highly correlated features
    corr = FindCorrelation(threshold=0.9)
    X_train = corr.fit(X_train).transform(X_train)
    X_test = corr.transform(X_test)

    classifier.fit(X_train, y_train)

    if not callable(getattr(classifier, "decision_function", None)):
        print("\tThis classifier doesn't implement decision_function(), "
              "thus no precision_recall curve can be generated")
        return

    y_score = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_score)

    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve')

    plots_helpers.save_plot(plt, 'Performance/Precision Recall Curves/', filename)
def _plot_heartrate_histogram():
    """
    Plots a histogram of heartrate data accumulated over all logfiles. Entries with heartrate -1 are ignored.
    Mean and standard deviation are shown in the title.

    Folder: Logfiles/
    Plot name: histogram_hr_all_logfiles.pdf
    """
    print("Plotting histogram of heartrate of accumulated logfiles...")

    _, ax = plt.subplots()

    all_logs = pd.concat(sd.df_list, ignore_index=True)
    heartrates = all_logs[all_logs['Heartrate'] != -1]['Heartrate']

    plt.hist(heartrates)
    # Raw string: \mu and \sigma are mathtext commands, not Python escape sequences
    plt.title(r'Histogram of HR: $\mu=%.3f$, $\sigma=%.3f$'
              % (float(np.mean(heartrates)), float(np.std(heartrates))))

    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_linewidth(0.3)

    hp.save_plot(plt, 'Logfiles/', 'histogram_hr_all_logfiles.pdf')
def _plot_crashes_vs_size_of_obstacle():
    """
    Plots the percentage of crashes depending on the size of the obstacle (sizes 0 to 4).

    The size of an obstacle is the number of its comma-separated parts; 'none' counts as size 0.

    Folder: Logfiles/
    Plot name: barplot_%crashes_per_size_of_obstacles.pdf
    """
    sizes = [0, 1, 2, 3, 4]

    conc_dataframes = pd.concat(sd.df_list, ignore_index=True)
    conc_dataframes = transform_df_to_numbers(conc_dataframes)

    # count number of obstacle parts per obstacle
    new = conc_dataframes['obstacle'].apply(lambda x: 0 if x == 'none' else x.count(",") + 1)
    conc_num = conc_dataframes.assign(numObstacles=new)

    # [a,b,c,d,e], i.e. #rows whose obstacle had size 0,1,2,3,4 respectively.
    # reindex keeps the list aligned with `sizes` even if some size never occurs in the data.
    num_obstacles_per_size = (
        conc_num.groupby('numObstacles').size().reindex(sizes, fill_value=0).tolist()
    )

    num_crashes_per_size = [0] * len(sizes)
    # For each crash, find corresponding row where we can find the size of the obstacle he crashed into.
    for _, row in conc_num.iterrows():
        if row['Logtype'] == 'EVENT_CRASH':
            num_crashes_per_size[row['numObstacles']] += 1

    percentage_of_crashes = [
        0 if (crashes == 0 or total == 0) else crashes / total * 100.0
        for crashes, total in zip(num_crashes_per_size, num_obstacles_per_size)
    ]

    plt.title('Crash percentage per size of obstacle')
    plt.ylabel('Crashes [%]')
    plt.xlabel('Size of obstacle')
    plt.bar(sizes, percentage_of_crashes)

    hp.save_plot(plt, 'Logfiles/', 'barplot_%crashes_per_size_of_obstacles.pdf')
def _plot_hr_or_points_and_difficulty(to_compare):
    """
    For each logfile with heartrate data, draws the chosen column and the difficulty over time in one
    line plot with two y axes.

    :param to_compare: 'Heartrate' or 'Points'

    Folder: Logfiles/{to_compare} Difficulty Corr/
    Plot name: lineplot_{to_compare}_difficulty_{logfile name}.pdf
    """
    resolution = 10  # resample every x seconds -> the bigger, the smoother

    for logfile_pos, raw_log in enumerate(sd.df_list):
        log = transform_df_to_numbers(raw_log)
        if (log['Heartrate'] == -1).all():
            continue

        smoothed = hp.resample_dataframe(log, resolution)
        logfile_name = sd.names_logfiles[logfile_pos]

        # Left axis: heartrate or points
        _, left_axis = plt.subplots()
        left_axis.plot(smoothed['Time'], smoothed[to_compare], hp.blue_color)
        left_axis.set_xlabel('Playing time (s)')
        left_axis.set_ylabel(to_compare, color=hp.blue_color)
        left_axis.tick_params('y', colors=hp.blue_color)

        # Right axis: difficulty
        right_axis = left_axis.twinx()
        right_axis.plot(smoothed['Time'], smoothed['physDifficulty'], hp.green_color)
        right_axis.set_ylabel('physDifficulty', color=hp.green_color)
        right_axis.tick_params('y', colors=hp.green_color)
        # Only show whole numbers as difficulties
        right_axis.yaxis.set_major_locator(MaxNLocator(integer=True))

        plt.yticks([1, 2, 3], ['Low', 'Medium', 'High'])
        plt.title('Difficulty and ' + to_compare + ' for user ' + logfile_name)

        relative_path = (to_compare + ' Difficulty Corr/lineplot_' + to_compare
                         + '_difficulty_' + str(logfile_name) + '.pdf')
        hp.save_plot(plt, 'Logfiles/', relative_path)
def _plot_roc_curve(predicted_probas, y, filename, title='ROC', plot_thresholds=False):
    """
    Draws the ROC curve (with the AUC in the legend and the diagonal as reference) for the given scores.

    :param predicted_probas: Probabilities of positive label
    :param y: labels
    :param filename: name of the file that the roc plot should be stored in
    :param title: title of the roc plot
    :param plot_thresholds: Also plot thresholds (on a second y axis)

    Folder: Report/
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y, predicted_probas)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    plt.figure()
    plt.title(title)
    plt.plot(false_pos_rate, true_pos_rate, plots_helpers.blue_color,
             label='AUC = %0.2f' % area_under_curve)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], c='gray', ls='--')  # diagonal reference line
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    if plot_thresholds:
        # Second y axis for the thresholds (scores)
        threshold_axis = plt.gca().twinx()
        threshold_axis.plot(false_pos_rate, cutoffs, markeredgecolor='r', linestyle='dashed', color='r')
        threshold_axis.set_ylabel('Threshold', color='r')
        threshold_axis.set_ylim([cutoffs[-1], cutoffs[0]])
        threshold_axis.set_xlim([false_pos_rate[0], false_pos_rate[-1]])

    plots_helpers.save_plot(plt, 'Report/', filename)
def _plot_feature_correlation_matrix(reduced_features=True):
    """
    Function plots a heatmap of the correlation matrix for each pair of columns (=features) in the
    feature matrix. Only the part below the diagonal is shown.

    Source: https://seaborn.pydata.org/examples/many_pairwise_correlations.html

    :param reduced_features: Should we use all features or only the reduced ones?

    Folder: Report/
    Plot name: correlation_matrix_all_features.pdf or correlation_matrix_reduced_features.pdf
    """
    print("Plotting correlation matrix...")

    X, _ = f_factory.get_feature_matrix_and_label(False, True, True, False, reduced_features)
    X = pd.DataFrame(X)
    corr = X.corr()

    sb.set(style="white")

    # Generate a mask for the upper triangle including the diagonal (k=0).
    # The builtin bool is used because the np.bool alias no longer exists in current NumPy.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask, k=0)] = True

    # Set up the matplotlib figure; its size grows with the number of features
    num_features = len(f_factory.feature_names)
    _, ax = plt.subplots(figsize=(num_features, num_features))

    # Generate a custom diverging colormap
    cmap = sb.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    ax.tick_params(labelsize=20)
    sb.heatmap(corr, mask=mask, cmap=cmap, center=0, annot=True,
               xticklabels=f_factory.feature_names, yticklabels=f_factory.feature_names,
               square=True, linewidths=0.0, cbar_kws={"shrink": .6}, vmin=-1, vmax=1)

    # The colorbar is the axes that was added last to the figure
    cax = plt.gcf().axes[-1]
    cax.tick_params(labelsize=20)

    if reduced_features:
        ph.save_plot(plt, 'Report/', 'correlation_matrix_reduced_features.pdf')
    else:
        ph.save_plot(plt, 'Report/', 'correlation_matrix_all_features.pdf')
def _plot_heartrate_change():
    """
    Plots, per logfile with heartrate data, the number of times the heartrate rose by more than
    {thresh} percent from one (1s-resampled) time point to the next.

    Logfiles whose heartrate is -1 everywhere are skipped.

    Folder: Report/
    Plot name: barplot_hr_change_thresh.pdf
    """
    thresh = 10  # in percent
    bpm_changes_over_thresh = []  # Stores #points where change > thresh per logfile

    for df in sd.df_list:
        if (df['Heartrate'] == -1).all():
            continue
        resampled = ph.resample_dataframe(df, 1)
        percentage_change = np.diff(resampled['Heartrate']) / resampled['Heartrate'][:-1] * 100.
        x = percentage_change[np.logical_not(np.isnan(percentage_change))]
        bpm_changes_over_thresh.append(int((x > thresh).sum()))

    _, ax = plt.subplots()
    plt.ylabel('Number of times')
    plt.xlabel('Logfile')

    index = np.arange(len(bpm_changes_over_thresh))
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))  # Only show whole numbers as counts
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # Only show whole numbers as logfile numbers
    # One label per plotted logfile (1-based), whatever the number of logfiles is
    plt.xticks(index, np.arange(1, len(index) + 1), rotation='horizontal')

    plt.bar(index, bpm_changes_over_thresh, color=ph.blue_color, width=0.25)

    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_linewidth(0.3)

    ph.save_plot(plt, 'Report/', 'barplot_hr_change_thresh.pdf')
def test_all_windows():
    """
    Keeps one window fixed and changes the other two. Calculates the mean roc_auc of the pre-tuned
    classifier `model_name` (currently 'Nearest Neighbor') for each window combination and plots the
    scores as a heatmap.

    Which window is constant is selected with `const_window`: 'hw' (default window), 'cw' (crash window)
    or anything else (gradient window).

    Folder: Performance/Windows/
    Plot name: windows_const_hw.pdf, windows_const_cw.pdf or windows_const_gradient_w.pdf
    """
    print("\n################# Testing all window sizes #################\n")

    const_window = 'cw'
    const_w = 10
    list_1 = [5, 10, 20, 30, 50, 60]
    list_2 = list_1[::-1]

    # fixed:  keyword of the window that is held at const_w
    # vary_1: keyword of the window that iterates over list_1 (y axis, label name1)
    # vary_2: keyword of the window that iterates over list_2 (x axis, label name2)
    if const_window == 'hw':
        fixed, vary_1, vary_2 = 'h_window', 'c_window', 'gradient_window'
        name1 = 'Crash window (s)'
        name2 = 'Gradient window (s)'
        filename = 'windows_const_hw.pdf'
    elif const_window == 'cw':
        fixed, vary_1, vary_2 = 'c_window', 'h_window', 'gradient_window'
        name1 = 'Default window (s)'
        name2 = 'Gradient window (s)'
        filename = 'windows_const_cw.pdf'
    else:
        fixed, vary_1, vary_2 = 'gradient_window', 'h_window', 'c_window'
        # list_1 drives h_window (the default window) here, so it labels the y axis
        name1 = 'Default window'
        name2 = 'Crash window'
        filename = 'windows_const_gradient_w.pdf'

    mean_scores = np.zeros((len(list_1), len(list_2)))
    model_name = 'Nearest Neighbor'

    for idx_w1, w1 in enumerate(list_1):
        for idx_w2, w2 in enumerate(list_2):
            windows = {fixed: const_w, vary_1: w1, vary_2: w2}
            X, y = f_factory.get_feature_matrix_and_label(
                verbose=True,
                use_cached_feature_matrix=True,
                save_as_pickle_file=True,
                reduced_features=False,
                **windows)

            model = classifiers.get_cclassifier_with_name(model_name, X, y).tuned_clf

            # The first returned value is the mean roc_auc; the rest is not needed here
            roc_auc_mean = model_factory.get_performance(
                model, model_name, X, y,
                tuned_params_keys=None, verbose=False, create_curves=False)[0]
            mean_scores[idx_w1][idx_w2] = roc_auc_mean

    # Flip both axes so that rows follow list_2 and columns follow list_1, matching the tick labels below
    mean_scores = np.fliplr(np.flipud(mean_scores))

    # Plot elements
    plt.subplot()
    plt.imshow(mean_scores, cmap='RdYlGn')
    plt.title('Average classifier performance when using constant ' + const_window)
    ax = plt.gca()
    ax.set_xticks(np.arange(0, len(list_1), 1))
    ax.set_yticks(np.arange(0, len(list_2), 1))
    ax.set_xticklabels(list_1)
    ax.set_yticklabels(list_2)
    ax.set_ylabel(name1)
    ax.set_xlabel(name2)
    plt.colorbar()

    plots_helpers.save_plot(plt, 'Performance/Windows/', filename)
def _plot_timedeltas_and_crash_per_logfile(do_normalize=True):
    """
    Plots for each logfile the mean and std of the time to the previous obstacle, separately for obstacles
    with and without a crash.

    The timedelta is recomputed from the obstacle times here. Values outside of [1, 3] seconds are
    replaced by 2 seconds.

    :param do_normalize: Whether to divide each timedelta by the previous (clamped) timedelta

    Folder: Features/Timedelta vs Crash Detailed
    Plot name: crash_logfile_{logfile_name}.pdf
    """
    for idx, df in enumerate(sd.obstacle_df_list):
        timedelta_crash = []
        timedelta_no_crash = []
        previous_timedelta = None  # clamped timedelta of the previous obstacle

        for i in range(len(df.index)):
            current_obstacle_row = df.iloc[i]
            # The first obstacle has no predecessor -> timedelta 0, which is clamped below
            previous_obstacle_row = df.iloc[i - 1] if i > 0 else current_obstacle_row
            timedelta = current_obstacle_row['Time'] - previous_obstacle_row['Time']

            # Clamp outliers (e.g. because of tutorials etc.). If timedelta > 3, it's most likely
            # e.g. 33 seconds, so it is clamped to approximately the average.
            if timedelta > 3 or timedelta < 1:
                timedelta = 2

            if do_normalize:
                # Normalization (since timedelta over time decreases slightly)
                value = timedelta / previous_timedelta if previous_timedelta is not None else 1
            else:
                value = timedelta

            if current_obstacle_row['crash']:
                timedelta_crash.append(value)
            else:
                timedelta_no_crash.append(value)

            previous_timedelta = timedelta

        # Evaluation
        mean_when_crash = np.mean(timedelta_crash)
        mean_when_no_crash = np.mean(timedelta_no_crash)
        std_when_crash = np.std(timedelta_crash)
        std_when_no_crash = np.std(timedelta_no_crash)

        plt.subplots()
        plt.ylim(0, 1.2)
        plt.ylabel('Feature value')
        plt.bar(1, mean_when_no_crash, width=0.5, yerr=std_when_no_crash)
        plt.bar(2, mean_when_crash, width=0.5, yerr=std_when_crash, label='Crash')
        plt.xticks([1, 2], ['No crash', 'Crash'])
        plt.title('Average timedelta value for logfile ' + str(idx) + ' when crash or not crash')

        filename = 'crash_logfile_' + sd.names_logfiles[idx] + '.pdf'
        hp.save_plot(plt, 'Features/Timedelta vs Crash Detailed/', filename)
def _plot_scores_with_different_feature_selections():
    """
    Draws a grouped bar chart with the roc_auc score of each classifier for
    four different feature selections.

    Note: The numbers are hard-coded; they were collected by analysing the
    performances!

    1. timedelta_to_last_obst only
    2. timedelta_to_last_obst + last_obstacle_crash
    3. all features
    4. old features (=all features without timedelta_to_last_obst)

    Folder:     Performance
    Plot name:  clf_performance_with_different_features.pdf

    """
    scores_timedelta_only = [0.69, 0.69, 0.84, 0.69, 0.86, 0.86, 0.8, 0.69]
    scores_timedelta_and_last_obst_crash = [
        0.745, 0.726, 0.99, 0.73, 0.99, 0.994, 0.96, 0.73
    ]
    scores_all_features = [0.68, 0.68, 0.61, 0.64, 0.96, 0.95, 0.965, 0.65]
    scores_old_features = [0.62, 0.63, 0.57, 0.622, 0.53, 0.6, 0.64, 0.74]

    bar_width = 0.2
    line_width = 0.3

    _, ax = plt.subplots()
    index = np.arange(len(scores_timedelta_only))

    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_linewidth(line_width)

    # One entry per bar series, in drawing order (left to right in a group)
    bar_series = [
        (scores_timedelta_and_last_obst_crash, hp.red_color,
         'timedelta_to_last_obst + last_obstacle_crash'),
        (scores_timedelta_only, hp.blue_color,
         'timedelta_to_last_obst'),
        (scores_all_features, hp.green_color,
         'all features'),
        (scores_old_features, hp.yellow_color,
         'all features, but without timedelta_to_last_obst'),
    ]
    for position, (scores, color, label) in enumerate(bar_series):
        plt.bar(
            index + position * bar_width,
            scores,
            bar_width,
            color=color,
            label=label,
        )

    plt.ylabel('roc_auc')
    plt.title('roc_auc when selecting different features')
    # NOTE(review): ticks are placed at bar_width / 4, i.e. close to the first
    # bar of each group rather than the group centre -- confirm this is wanted.
    plt.xticks(index + bar_width / 4, classifiers.names, rotation='vertical')
    ax.set_ylim([0, 1.2])
    plt.legend(prop={'size': 6})

    plt.yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
               [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    plt.tight_layout()

    hp.save_plot(plt, 'Performance/',
                 'clf_performance_with_different_features.pdf')
def _plot_timedelta_vs_obstacle_scatter(X, y):
    """
    Draws a scatter plot of two feature columns coloured by label, with the
    marginal histograms of a seaborn jointplot on top. One plot is created per
    logfile, plus one for the entire feature matrix.

    :param X: Feature matrix
    :param y: labels

    Folder:     Features/Timedelta vs crash/
    Plot name:  scatter_timedelta_crash_mean_over_all_users.pdf or
                scatter_timedelta_crash_{logfile_name}.pdf

    """
    # Split the feature matrix and the labels into one chunk per logfile
    feature_matrices = []
    label_lists = []
    start = 0
    for df in sd.obstacle_df_list:
        stop = start + len(df.index)
        feature_matrices.append(X.take(range(start, stop), axis=0))
        label_lists.append(y[start:stop])
        start = stop

    num_logfiles = len(sd.df_list)

    # The last iteration (plot_idx == num_logfiles) is the aggregated plot
    for plot_idx in range(num_logfiles + 1):
        is_aggregated = plot_idx == num_logfiles

        plt.subplot()
        if is_aggregated:
            features = X
            labels = y
            plt.title('Timedelta vs crash plot aggregated over all logfiles')
        else:
            features = feature_matrices[plot_idx]
            labels = label_lists[plot_idx]
            plt.title('Timedelta vs crash plot for logfile ' +
                      sd.names_logfiles[plot_idx])

        # Column 9 goes on the x-axis ('Crash at last obstacle'), column 8 on
        # the y-axis ('Time to last obstacle'); the indices are hard-coded.
        x_values = features[:, 9]
        y_values = features[:, 8]

        g = sb.jointplot(x_values, y_values, kind='reg')

        # Keep the marginal plots, but replace the joint plot by a scatter
        g.ax_joint.cla()
        plt.sca(g.ax_joint)

        colors = [
            hp.red_color if label == 1 else hp.green_color for label in labels
        ]
        plt.scatter(x_values, y_values, c=colors, alpha=0.3, s=150)
        plt.xticks([0, 1], ['False', 'True'])

        # Fix the y-axis to mean +/- 3 standard deviations
        center = np.mean(y_values)
        spread = 3 * np.std(y_values)
        plt.ylim([center - spread, center + spread])

        plt.ylabel('Time to last obstacle')
        plt.xlabel('Crash at last obstacle')

        green_patch = mpatches.Patch(color=hp.green_color, label='no crash')
        red_patch = mpatches.Patch(color=hp.red_color, label='crash')
        plt.legend(handles=[green_patch, red_patch])

        if is_aggregated:
            plot_name = 'scatter_timedelta_crash_mean_over_all_users.pdf'
        else:
            plot_name = ('scatter_timedelta_crash_' +
                         sd.names_logfiles[plot_idx] + '.pdf')
        hp.save_plot(plt, 'Features/Timedelta vs crash/', plot_name)
def _plot_crashes_vs_timedelta(X):
    """
    Plots the share of crashes depending on the (min-max scaled)
    timedelta-feature in a barchart, together with the number of crashes
    per bin on a second y-axis.

    :param X: Feature matrix; it must contain one row per row of the
              concatenated sd.obstacle_df_list, in the same order

    Folder:     Features/
    Plot name:  barplot_%crashes_vs_timedelta.pdf

    """
    print("Plotting percentage crashes vs timedelta...")

    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)

    timedelta_feature_index = f_factory.feature_names.index(
        'timedelta_to_last_obst')

    # Rows of X are addressed by position across all logfiles. The index
    # labels of the individual dataframes are not positions in X, so they are
    # discarded and the rows are paired with X by enumeration instead.
    obst_conc = pd.concat(sd.obstacle_df_list, ignore_index=True)

    timedelta_values_at_crashes = []
    timedelta_values_at_non_crashes = []
    for row_position, crashed in enumerate(obst_conc['crash']):
        timedelta_value = X[row_position, timedelta_feature_index]
        if crashed:
            timedelta_values_at_crashes.append(timedelta_value)
        else:
            timedelta_values_at_non_crashes.append(timedelta_value)

    def get_percentage_crashes_for_bin(i):
        """
        Returns the share of crashes when timedelta is in a certain bin,
        where bin i: [i/10 , i/10 + 0.1] (both borders inclusive)

        :param i: Bin
        :return: tuple with (share of crashes in [0, 1], #crashes in the bin);
                 (0, 0) if no value falls into the bin

        """
        lower = i / 10
        upper = i / 10 + 0.1
        crashes_in_bin = sum(
            1 for x in timedelta_values_at_crashes if lower <= x <= upper)
        non_crashes_in_bin = sum(
            1 for x in timedelta_values_at_non_crashes if lower <= x <= upper)
        total_in_bin = crashes_in_bin + non_crashes_in_bin
        if total_in_bin == 0:
            return 0, 0
        return crashes_in_bin / total_in_bin, crashes_in_bin

    x_tick_labels = [
        '[0.0, 0.1]', '[0.1, 0.2]', '[0.2, 0.3]', '[0.3, 0.4]', '[0.4, 0.5]',
        '[0.5, 0.6]', '[0.6, 0.7]', '[0.7, 0.8]', '[0.8, 0.9]', '[0.9, 1.0]'
    ]
    tuples = [get_percentage_crashes_for_bin(i) for i in range(0, 10)]
    # NOTE: the values are fractions in [0, 1] although the axis says '(%)'
    value_list = [t[0] for t in tuples]
    occurences_list = [t[1] for t in tuples]
    bar_positions = np.arange(len(value_list))

    bar_width = 0.2
    fig, ax = plt.subplots()
    plt.title('Percentage of crashes depending on timedelta')
    plt.ylabel('crashes (%)')
    plt.xlabel('timedelta to previous obstacle (s, normalized)')
    plt.xticks(bar_positions + bar_width / 2, rotation='vertical')
    ax.set_xticklabels(x_tick_labels)

    plt.bar(bar_positions, value_list, color=hp.blue_color, width=bar_width,
            label='Crashes (%)')

    # Second y-axis; the bars below are drawn through pyplot after twinx(),
    # and ax2 is where the legend code below reads them back from.
    ax2 = ax.twinx()
    plt.bar(bar_positions + bar_width, occurences_list, color=hp.red_color,
            width=bar_width, label='Occurences')
    ax2.set_ylabel('Occurences', color=hp.red_color)
    ax2.tick_params('y', colors=hp.red_color)

    # Add one legend covering both axes
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)

    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_linewidth(0.3)

    hp.save_plot(plt, 'Features/', 'barplot_%crashes_vs_timedelta.pdf')
def _plot_scores_normal_cv_vs_leaveone_group_out_cv(names,
                                                    auc_scores_scenario_1,
                                                    auc_stds_scenario_1,
                                                    auc_scores_scenario_2,
                                                    auc_stds_scenario_2):
    """
    Draws the roc_auc score (bar) and its standard deviation (error bar) of
    both scenarios side by side, one bar group per entry.

    :param names: Labels of the x-axis ticks, one per bar group
    :param auc_scores_scenario_1: list of roc_auc scores when doing normal cv
    :param auc_stds_scenario_1: list of roc_auc_std scores when doing normal cv
    :param auc_scores_scenario_2: list of roc_auc scores when doing
                                  leave_one_user_out cv
    :param auc_stds_scenario_2: list of roc_auc_std scores when doing
                                leave_one_user_out cv

    Folder:     Report/
    Plot name:  clf_performance_with_user_left_out_vs_normal.pdf

    """
    bar_width = 0.3
    line_width = 0.3

    _, ax = plt.subplots()
    index = np.arange(len(auc_scores_scenario_1))

    ax.yaxis.grid(True, zorder=0, color='grey', linewidth=0.3)
    ax.set_axisbelow(True)
    for spine in ax.spines.values():
        spine.set_linewidth(line_width)

    # Error bar styling shared by both scenarios
    error_bar_style = {
        'elinewidth': line_width,
        'capsize': 1.4,
        'markeredgewidth': line_width
    }

    scenarios = [
        (auc_scores_scenario_1, auc_stds_scenario_1,
         plots_helpers.blue_color, '10-fold cross-validation'),
        (auc_scores_scenario_2, auc_stds_scenario_2,
         plots_helpers.red_color, 'Leave One Group Out cross-validation'),
    ]
    for position, (scores, stds, color, label) in enumerate(scenarios):
        plt.bar(
            index + position * bar_width,
            scores,
            bar_width,
            color=color,
            label=label,
            yerr=stds,
            # Each call gets its own copy of the style dictionary
            error_kw=dict(error_bar_style),
        )

    plt.ylabel('AUC')
    plt.title('Performance when leaving one user out in training phase')
    plt.xticks(index + bar_width / 2, names, rotation='vertical')
    ax.set_ylim([0, 1.0])
    plt.legend(prop={'size': 10})

    # Disabled: attach a text label above each bar displaying its height
    #
    # def autolabel(rects):
    #     for rect in rects:
    #         height = rect.get_height()
    #         ax.text(rect.get_x() + rect.get_width() / 2., 1.1 * height,
    #                 '%0.2f' % height, ha='center', va='bottom', size=5)
    #
    # autolabel(r1)
    # autolabel(r2)

    plt.tight_layout()

    plots_helpers.save_plot(
        plt, 'Report/', 'clf_performance_with_user_left_out_vs_normal.pdf')
def plot_roc_curves(hyperparameter_tuning=False, pre_set=True, with_lstm=False):
    """
    Draws the ROC curves of all classifiers into one single plot. The
    predictions are collected over a 10-fold split; in each fold the features
    are min-max scaled and filtered with FindCorrelation, both fitted on the
    training part only.

    :param hyperparameter_tuning: Do hyperparameter tuning
    :param pre_set: Some classifiers have pre_tuned parameters (on Euler).
                    Take those instead of tuning
    :param with_lstm: Also include ROC of LSTM network (takes a little time...)

    Folder:     Report/
    Plot name:  roc_curves.pdf

    """
    X, y = f_factory.get_feature_matrix_and_label(
        verbose=False,
        use_cached_feature_matrix=True,
        save_as_pickle_file=True,
        reduced_features=False,
        use_boxcox=False
    )

    clf_names = ['SVM', 'Nearest Neighbor', 'Random Forest', 'Naive Bayes']

    # Take the pre-tuned estimator if requested, the plain one otherwise
    clf_attribute = 'tuned_clf' if pre_set else 'clf'
    clf_list = [
        getattr(classifiers.get_cclassifier_with_name(name, X, y),
                clf_attribute)
        for name in clf_names
    ]

    tprs = []
    fprs = []
    roc_aucs = []

    for clf_name, clf in zip(clf_names, clf_list):
        if hyperparameter_tuning:
            # NOTE(review): pre_set=True is hard-coded here instead of
            # forwarding the pre_set argument -- confirm this is intended.
            clf, _ = hyperparameter_optimization.get_tuned_clf_and_tuned_hyperparameters(
                X, y, clf_name=clf_name, verbose=False, pre_set=True
            )

        y = np.array(y)
        fold_probas = []

        for train_index, test_index in KFold(n_splits=10).split(X):
            # Fit and transform on training set, then transform test set too
            scaler = MinMaxScaler(feature_range=(0, 1))
            X_train = scaler.fit_transform(X[train_index])
            X_test = scaler.transform(X[test_index])

            corr = FindCorrelation(threshold=0.9)
            X_train = corr.fit(X_train).transform(X_train)
            X_test = corr.transform(X_test)

            clf.fit(X_train, y[train_index])
            # Keep the probabilities from column 1 of predict_proba
            fold_probas.append(clf.predict_proba(X_test)[:, 1])

        # The per-fold predictions are joined in fold order and compared to y
        fpr, tpr, _ = roc_curve(y, np.concatenate(fold_probas))
        fprs.append(fpr)
        tprs.append(tpr)
        roc_aucs.append(auc(fpr, tpr))

    # Also add LSTM scores:
    if with_lstm:
        clf_names.append("LSTM")
        fpr, tpr, roc_auc = LSTM.create_roc_curve(X, y, 130)
        fprs.append(fpr)
        tprs.append(tpr)
        roc_aucs.append(roc_auc)

    plt.figure()

    for name, fpr, tpr, roc_auc in zip(clf_names, fprs, tprs, roc_aucs):
        plt.plot(fpr, tpr, label='%s (AUC = %0.2f)' % (name, roc_auc))

    plt.title('Roc curves')
    plt.legend(loc='lower right')
    # Dashed diagonal as reference line
    plt.plot([0, 1], [0, 1], c='gray', ls='--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    plots_helpers.save_plot(plt, 'Report/', 'roc_curves.pdf')