def bagging_pasting(X_train, X_val, y_train, y_val, doplot=False):
    """
    Fit a single decision tree and a bagged ensemble of decision trees,
    print the validation accuracy of each and optionally plot both.

    Keyword arguments:
    X_train -- Training feature vectors
    X_val -- Validation feature vectors
    y_train -- Training labels
    y_val -- Validation labels
    doplot -- When true, draw both decision boundaries side by side,
              save the figure and show it
    """
    from sklearn.ensemble import BaggingClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.tree import DecisionTreeClassifier

    # Baseline: one tree with a fixed seed.
    single_tree = DecisionTreeClassifier(random_state=42)
    single_tree.fit(X_train, y_train)
    tree_accuracy = accuracy_score(y_val, single_tree.predict(X_val))
    print("DecisionTree classifier, accuracy score = %f\n" % tree_accuracy)

    # Ensemble: 500 trees, each fitted on 100 samples drawn with replacement.
    ensemble = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        n_estimators=500,
        max_samples=100,
        bootstrap=True,
        n_jobs=1,
        oob_score=True,
    )
    ensemble.fit(X_train, y_train)
    ensemble_accuracy = accuracy_score(y_val, ensemble.predict(X_val))
    print("Bagging classifier, accuracy score = %f\n" % ensemble_accuracy)

    if not doplot:
        return

    from utils import plot_decision_boundary, save_fig

    # NOTE(review): X, y, plt and CHAPTER_ID are not parameters or locals;
    # they are resolved from module scope -- confirm they are defined there.
    plt.figure(figsize=(11, 4))
    panels = (
        (121, single_tree, "Decision Tree"),
        (122, ensemble, "Decision Trees with Bagging"),
    )
    for position, fitted_clf, heading in panels:
        plt.subplot(position)
        plot_decision_boundary(fitted_clf, X, y)
        plt.title(heading, fontsize=14)
    save_fig("DT_without_and_with_bagging_plot", CHAPTER_ID)
    plt.show()
def visualize_k_fold_roc_plot(X, y_gold, classifier, K):
    """
    Plot one ROC curve per cross-validation fold plus their mean curve
    and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y_gold -- Expected labels
    classifier -- The classifier to be used (refitted on every fold)
    K -- Number of folds to perform
    """
    cross_validation = StratifiedKFold(y_gold, n_folds=K)

    tpr_total = 0.0
    fpr_total = 0.0
    for fold_no, (train, test) in enumerate(cross_validation, start=1):
        # fit on the training part, predict the held-out part
        classifier.fit(X[train], y_gold[train])
        fold_prediction = classifier.predict(X[test])

        # ROC of this fold, computed from the predict() output
        fold_fpr, fold_tpr, _ = roc_curve(y_gold[test], fold_prediction)
        fold_auc = auc(fold_fpr, fold_tpr)
        plt.plot(fold_fpr,
                 fold_tpr,
                 linewidth=1,
                 label='ROC fold %d (area = %0.2f)' % (fold_no, fold_auc))

        # NOTE(review): element-wise accumulation assumes every fold
        # yields curves with the same number of points.
        tpr_total += fold_tpr
        fpr_total += fold_fpr

    # average the accumulated curves over the folds
    fold_count = len(cross_validation)
    tpr_total /= fold_count
    fpr_total /= fold_count
    mean_auc = auc(fpr_total, tpr_total)

    plt.plot(fpr_total,
             tpr_total,
             'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc,
             linewidth=2)
    # diagonal reference line
    plt.plot([0, 1], [0, 1],
             '--',
             color=(0.6, 0.6, 0.6),
             label='Random Classifier')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    # save fig, then release the current figure
    output_dir = 'img'
    save_fig(output_dir, '{}/roc.png'.format(output_dir))
    plt.close()
def visualize_feature_boxplot(X, y, selected_feature, features):
    """
    Draw a box plot of one feature, grouped by the 'Y' column, and save it.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    selected_feature -- Name of the column to plot
    features -- Column names for X stacked with y; the plot groups on the
                column named 'Y'
    """
    # one table holding the features with the target appended as last column
    frame = pd.DataFrame(data=np.column_stack((X, y)), columns=features)

    sea.boxplot(data=frame,
                x='Y',
                y=selected_feature,
                hue="Y",
                palette="husl")
    plt.title('BoxPlot Distribution of ' + selected_feature)

    # save fig
    output_dir = "img"
    target = '{}/{}_boxplot.png'.format(output_dir, selected_feature)
    save_fig(output_dir, target)
def make_record_alg_cmp_bar(path, serv, operations, ylabel, stats):
    """
    Draw and save one grouped bar chart per operation, comparing the
    entries of stats.

    Keyword arguments:
    path -- Sub-directory (under 'statistics/') used in the output file name
    serv -- Service identifier; for any value other than 'conf' or 'int'
            the x tick labels are mapped through settings.sec_str
    operations -- Operation names; one figure is produced for each
    ylabel -- Metric name, used both as axis label and in the stats keys
    stats -- Mapping from series label to a dict holding 'keys' plus
             'mean_<ylabel>_<op>' and 'stddev_<ylabel>_<op>' entries
    """
    labels = list(stats.keys())

    # tick labels come from the first series; copy so stats is not mutated
    xtickslabels = deepcopy(stats[next(iter(stats))]['keys'])
    if serv not in ('conf', 'int'):
        xtickslabels[:] = [settings.sec_str[int(raw)] for raw in xtickslabels]

    for op in operations:
        fig, ax = plt.subplots(1, 1, figsize=(30, 10))

        mean_key = 'mean_' + ylabel + '_' + op
        stddev_key = 'stddev_' + ylabel + '_' + op
        means = [stats[series][mean_key] for series in stats]
        deviations = [stats[series][stddev_key] for series in stats]

        ax = utils.multiple_custom_bar(means,
                                       deviations,
                                       ax,
                                       title=op + ' (mean)',
                                       labels=labels,
                                       xtickslabels=xtickslabels,
                                       xlabel='security strength (in bits)',
                                       ylabel=ylabel)
        target = ('statistics/' + path + '/serv_' + serv + '_' + op + '_' +
                  ylabel + '.png')
        utils.save_fig(fig, target)
def visualize_lda2D(X, y):
    """
    Scatter-plot the samples projected on the first two linear
    discriminants, one marker style per class, and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector (samples with y == 0 are drawn as "Paid",
         samples with y == 1 as "Default")
    """
    lda = LDA(n_components=2, solver='eigen')
    discriminative_attributes = lda.fit(X, y).transform(X)

    # (class value, marker, colour, legend label)
    class_styles = (
        (0, 's', 'green', "Paid"),
        (1, '^', 'red', "Default"),
    )
    for class_value, marker, color, label in class_styles:
        members = y == class_value
        plt.scatter(discriminative_attributes[:, 0][members],
                    discriminative_attributes[:, 1][members],
                    marker=marker,
                    color=color,
                    label=label,
                    alpha=0.5)

    plt.xlabel('First Linear Discriminant')
    plt.ylabel('Second Linear Discriminant')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title("Linear Discriminant Analysis")
    # The original referenced plt.tight_layout without calling it, so the
    # layout was never adjusted.
    plt.tight_layout()

    # save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/lda.png'.format(output_dir))
def plot_hist(baseline_samples, target_samples, true_x, true_y):
    """
    Overlay kernel density estimates of the baseline and target samples,
    mark true_y with a vertical line and save the figure.

    Keyword arguments:
    baseline_samples -- Baseline samples; squeezed before plotting, and
                        their min/max define the x-axis limits
    target_samples -- Target samples; squeezed before plotting
    true_x -- Value shown in the title, or None to omit it
    true_y -- x position of the vertical reference line
    """
    baseline_samples = baseline_samples.squeeze()
    target_samples = target_samples.squeeze()
    lower, upper = baseline_samples.min(), baseline_samples.max()

    # baseline first, target second; the axes of the last call are kept
    densities = (
        (baseline_samples, (0.6, 0.1, 0.1, 0.2)),
        (target_samples, (0.1, 0.1, 0.6, 0.2)),
    )
    for samples, rgba in densities:
        ax = sns.kdeplot(samples, shade=True, color=rgba)
    ax.set_xlim(lower, upper)

    # vertical line stopping 1% of the y-range below the top of the axes
    y_bottom, y_top = ax.get_ylim()
    line_top = y_top - (y_top - y_bottom) * 0.01
    plt.plot([true_y, true_y], [0, line_top], linewidth=1, color='r')

    heading = 'Predictive'
    if true_x is not None:
        heading += f' at {true_x:.2f}'
    plt.title(heading)

    plt.gcf().set_size_inches(9, 9)

    name = utils.DATA_DIR.replace('/', '-')
    utils.save_fig('predictive-at-point-' + name)
def visualize_feature_boxplot(X, y, selected_feature, features):
    """
    Draw a box plot of one feature, grouped by the 'Y' column, and save it.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    selected_feature -- Name of the column to plot
    features -- Column names for X stacked with y; the plot groups on the
                column named 'Y'
    """
    # one table holding the features with the target appended as last column
    frame = pd.DataFrame(data=np.column_stack((X, y)), columns=features)

    sea.boxplot(data=frame,
                x='Y',
                y=selected_feature,
                hue="Y",
                palette="husl")
    plt.title('BoxPlot Distribution of ' + selected_feature)

    # save fig
    output_dir = "img"
    target = '{}/{}_boxplot.png'.format(output_dir, selected_feature)
    save_fig(output_dir, target)
def callback(X_next, Y_next, i):
    """
    Plot the surrogate and the acquisition function for one iteration,
    then append the new sample to the module-level sample arrays.

    Keyword arguments:
    X_next -- Next sampling location
    Y_next -- Objective value recorded for X_next
    i -- Zero-based iteration index (legends are shown only when i == 0)

    Reads gpr, X, Y and save_figures from module scope and rebinds the
    globals X_sample and Y_sample.
    """
    global X_sample, Y_sample

    iteration = i + 1
    first_iteration = i == 0

    # samples, surrogate function, noise-free objective and next location
    plt.figure()
    plot_approximation(gpr, X, Y, X_sample, Y_sample, X_next,
                       show_legend=first_iteration)
    plt.title(f'Iteration {iteration}')
    if save_figures:
        save_fig('bayes-opt-surrogate-{}.pdf'.format(iteration))
    plt.show()

    # acquisition function, evaluated before the new sample is added
    plt.figure()
    improvement = expected_improvement(X, X_sample, Y_sample, gpr)
    plot_acquisition(X, improvement, X_next, show_legend=first_iteration)
    if save_figures:
        save_fig('bayes-opt-acquisition-{}.pdf'.format(iteration))
    plt.show()

    # add sample to previous samples (as a new row)
    X_sample = np.append(X_sample, np.atleast_2d(X_next), axis=0)
    Y_sample = np.append(Y_sample, np.atleast_2d(Y_next), axis=0)
def visualize_hist_pairplot(X, y, selected_feature1, selected_feature2,
                            features, diag_kind):
    """
    Draw a seaborn pair plot of two features, coloured by the 'Y' column,
    and save it.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    selected_feature1 -- First feature (column name)
    selected_feature2 -- Second feature (column name)
    features -- Column names for X stacked with y; colouring uses the
                column named 'Y'
    diag_kind -- Kind of plot on the diagonal, forwarded to sea.pairplot
    """
    # one table holding the features with the target appended as last column
    frame = pd.DataFrame(data=np.column_stack((X, y)), columns=features)

    # class 0 takes the third hls colour, class 1 the first
    hls = sea.hls_palette()
    class_colors = {0: hls[2], 1: hls[0]}

    grid = sea.pairplot(frame,
                        hue="Y",
                        palette=class_colors,
                        vars=[selected_feature1, selected_feature2],
                        diag_kind=diag_kind)
    grid.fig.suptitle('Pairwise relationship: ' + selected_feature1 +
                      " vs " + selected_feature2)
    # hide the x tick labels on every panel
    grid.set(xticklabels=[])

    # save fig
    output_dir = "img"
    target = '{}/{}_{}_hist_pairplot.png'.format(output_dir,
                                                 selected_feature1,
                                                 selected_feature2)
    save_fig(output_dir, target)
def generate_histograms(start_feature, end_feature, features_base):
    """
    Plot the value distribution of a range of feature columns read from
    data/<features_base>.csv and save the combined figure.

    Keyword arguments:
    start_feature -- Index of the first feature column (inclusive)
    end_feature -- Index of the last feature column (inclusive)
    features_base -- Base name of the CSV file under data/

    Column 1 of the CSV is used as the dropout vector. Progress is reported
    with single-argument print() calls, which produce the same text on
    Python 2 and Python 3 (the former print statements were a syntax error
    on Python 3).
    """
    # presumably rows come in consecutive groups of 15 weeks per student;
    # verify against the data file
    num_weeks = 15
    num_features = end_feature - start_feature + 1
    in_file = "data/%s.csv" % features_base
    feature_set = validate_csv(in_file)

    start_time = time.time()
    data = np.genfromtxt(in_file, delimiter=',', skip_header=1)
    print("loaded data in %s seconds" % (time.time() - start_time))

    pl.clf()
    dropout_vector = data[:, 1]
    for feature_index in range(start_feature, end_feature + 1):
        feature_distribution = data[:, feature_index]
        start_time = time.time()

        # remove default values (-1)
        m1 = feature_distribution == -1
        masked = np.ma.masked_array(feature_distribution, m1)

        # remove values where the student was always dropped out (first
        # week of a group) or had already dropped out the prior week
        for x in range(len(masked)):
            first_week = x % num_weeks == 0
            if (first_week and dropout_vector[x] == 0) or (
                    not first_week and dropout_vector[x - 1] == 0):
                masked.mask[x] = True

        feature_name = feature_set[feature_index - 1]
        graph_distribution(masked.compressed(), feature_name,
                           feature_index - start_feature + 1, num_features)
        print("Ran Feature %s in %s seconds" %
              (feature_name, time.time() - start_time))

    pl.subplots_adjust(hspace=.5)
    pl.subplots_adjust(wspace=.5)
    # NOTE(review): the output location is a hard-coded absolute path.
    utils.save_fig(
        "/home/colin/evo/papers/thesis/figures/feature_distributions/%s_%s_%s"
        % (features_base, start_feature, end_feature))
def train(args, model, train_loader, optimizer, epoch_index):
    """
    Run one training epoch and return (mean batch loss, accuracy in percent).

    Keyword arguments:
    args -- Settings object; reads dataset_name, visual_flag, log_interval,
            save_weight_interval, output_folder and the
            image_fashion_mnist_width / image_fashion_mnist_height sizes
    model -- Network to train. When dataset_name is "fashion_mnist" and
             visual_flag is set, it must return (output, weights);
             otherwise it must return the output alone
    train_loader -- Iterable of (data, target) batches exposing len() and
                    a .dataset attribute
    optimizer -- Optimizer stepped once per batch
    epoch_index -- Epoch number, used for logging and in the saved file name

    The accuracy is derived from a tensor sum, so it is returned as a
    zero-dimensional tensor whenever at least one batch was processed.
    """
    model.train()
    correct = 0
    epoch_train_loss = 0.0

    # weights of the network, kept only for visualization
    weight_matrix = np.zeros(
        (args.image_fashion_mnist_width, args.image_fashion_mnist_height))

    # Only this configuration makes the model return its weights as well.
    capture_weights = (args.dataset_name == "fashion_mnist"
                       and args.visual_flag)

    # The batches are used directly: wrapping tensors in autograd.Variable
    # is a no-op on every torch version that provides loss.item().
    for data, target in train_loader:
        optimizer.zero_grad()

        if capture_weights:
            output, weight_ret = model(data)
            weight_matrix = weight_ret.detach().numpy()
        else:
            # Plain forward pass. This also covers dataset names other
            # than "cifar10" / "fashion_mnist", for which `output` was
            # previously left unbound.
            output = model(data)

        # predicted class = index of the largest softmax probability
        train_pred = torch.argmax(F.softmax(output, dim=1), dim=1).view(-1, )
        correct += train_pred.eq(target.data).sum()

        # F.cross_entropy applies softmax internally, so it gets raw outputs
        loss = F.cross_entropy(output, target)
        epoch_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    epoch_train_accuracy = 100. * correct / len(train_loader.dataset)
    epoch_loss_mean = epoch_train_loss / len(train_loader)

    if epoch_index % args.log_interval == 0:
        print('Train Epoch: {}\tLoss: {:.6f} Accuracy: {:.2f}%'.format(
            epoch_index, epoch_loss_mean, epoch_train_accuracy))

    # save the weights captured from the last batch of the epoch
    if capture_weights and epoch_index % args.save_weight_interval == 0:
        os.makedirs(args.output_folder, exist_ok=True)
        save_fig(
            args, weight_matrix,
            os.path.join(args.output_folder,
                         "network_weights_" + str(epoch_index) + ".pdf"))

    return epoch_loss_mean, epoch_train_accuracy
def make_errorbar(ylabel, operations, file_path, stats, types, xlabel):
    """
    Draw two side-by-side error-bar panels, one per operation, and save
    the figure as <file_path><ylabel>_deviation.png.

    Keyword arguments:
    ylabel -- Metric name, used as axis label and inside the stats keys
    operations -- Operation names; operations[0] and operations[1] are used
    file_path -- Prefix of the output file name
    stats -- Dict holding 'keys' plus '<type>_<ylabel>_<operation>' entries
    types -- Statistic name prefixes; types[0] and types[1] are passed, in
             that order, to utils.custom_errorbar
    xlabel -- Label of the x axis
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    colors = ('red', 'blue')

    for slot, axis in enumerate(axes):
        operation = operations[slot]
        suffix = '_' + ylabel + '_' + operation
        axes[slot] = utils.custom_errorbar(stats['keys'],
                                           stats[types[0] + suffix],
                                           stats[types[1] + suffix],
                                           axis,
                                           title=operation,
                                           xlabel=xlabel,
                                           ylabel=ylabel,
                                           kwargs={'color': colors[slot]})

    utils.save_fig(fig, file_path + ylabel + '_deviation.png')
def evaluate_config(X_test,
                    X_train,
                    Y_test,
                    Y_train,
                    acc_list,
                    e_loss,
                    p_loss,
                    model_orig,
                    inputs,
                    outputs,
                    model,
                    z_idx,
                    title,
                    fig_dir=None,
                    use_LIME=True):
    """
    Plot the training curves in a two-panel figure, optionally save it,
    then delegate to evaluate_config_2 and return its two results.

    Keyword arguments:
    acc_list -- Per-epoch records; column 1 is plotted as accuracy
    e_loss -- Per-epoch explanation loss
    p_loss -- Per-epoch performance loss
    fig_dir -- When not None, the figure is saved via save_fig under the
               name "loss_cf" with this directory
    X_test, X_train, Y_test, Y_train, model_orig, inputs, outputs, model,
    z_idx, title, use_LIME -- Not used here; forwarded to evaluate_config_2

    Returns the (df, df_test) pair produced by evaluate_config_2.
    """
    fig, (left, right) = plt.subplots(1, 2, figsize=(8, 4))
    loss_epochs = np.arange(len(e_loss))

    # left panel: explanation loss against accuracy on a twin y-axis
    left.plot(loss_epochs, e_loss, label="Explanation Loss",
              color='tab:orange')
    left.set_ylabel("loss")
    left_twin = left.twinx()
    left_twin.plot(np.arange(len(acc_list)),
                   np.array(acc_list)[:, 1],
                   label="Accuracy",
                   color='tab:green')
    left.set_xlabel("epochs")
    left_twin.set_ylabel("accuracy")

    # right panel: explanation loss against performance loss
    right.plot(loss_epochs, e_loss, color='tab:orange')
    right.set_ylabel("explanation loss")
    right_twin = right.twinx()
    right_twin.plot(np.arange(len(p_loss)),
                    p_loss,
                    label="Categorical Cross-Entropy",
                    color='tab:blue')
    right_twin.set_ylabel("performance loss")
    right.set_xlabel("epochs")

    fig.legend(loc='upper left')
    plt.tight_layout()

    if fig_dir is not None:
        save_fig(fig, "loss_cf", fig_dir)

    df, df_test = evaluate_config_2(X_test, X_train, Y_test, Y_train,
                                    fig_dir, inputs, model, model_orig,
                                    outputs, title, use_LIME, z_idx)
    return df, df_test
def render(env):
    """
    Render the environment to an image, display it without axes and save
    the figure under the name '<config.ENV_SHORT_NAME>_step1'.

    Keyword arguments:
    env -- Environment whose render(mode='rgb_array') result is shown
    """
    # mode='rgb_array' returns the frame as an array instead of opening
    # a viewer window (mode='human')
    frame = env.render(mode='rgb_array')

    plt.figure(figsize=(6, 8))
    plt.imshow(frame)
    plt.axis('off')

    figure_name = config.ENV_SHORT_NAME + '_step1'
    utils.save_fig(figure_name)
    plt.show()
def make_plot(ylabel, operations, file_path, stats, types, xlabel):
    """
    Draw three side-by-side panels, one per statistic type, each comparing
    the first two operations, and save the figure as
    <file_path><ylabel>_statistics.png.

    Keyword arguments:
    ylabel -- Metric name, used as axis label and inside the stats keys
    operations -- Operation names; operations[0] and operations[1] are used
    file_path -- Prefix of the output file name
    stats -- Dict holding 'keys' plus '<type>_<ylabel>_<operation>' entries
    types -- Statistic name prefixes, paired with the panels in order
    xlabel -- Label of the x axis
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    first_op = operations[0]
    second_op = operations[1]
    first_style = {'color': 'red', 'linestyle': '-', 'label': first_op}
    second_style = {'color': 'blue', 'linestyle': '--', 'label': second_op}

    for axis, stat_type in zip(axes, types):
        prefix = stat_type + '_' + ylabel + '_'
        utils.custom_plots(stats['keys'],
                           stats[prefix + first_op],
                           stats[prefix + second_op],
                           axis,
                           title=stat_type.capitalize(),
                           xlabel=xlabel,
                           ylabel=ylabel,
                           kwargs1=first_style,
                           kwargs2=second_style)

    utils.save_fig(fig, file_path + ylabel + '_statistics.png')
def make_record_alg_cmp_bar(path,
                            operations,
                            ylabel,
                            stats,
                            extra_labels,
                            handshake=False):
    """
    Draw and save stacked bar charts comparing every entry of stats, one
    figure per (operation, security level, axis scale) combination.

    Keyword arguments:
    path -- Sub-directory (under 'statistics/') used in the output file name
    operations -- Operation names
    ylabel -- Metric name, used as axis label and inside the stats keys
    stats -- Mapping from entry name to a dict holding 'keys' (strings of
             the form '<algorithm>_<level>') and 'mean_<ylabel>_<op>' lists
    extra_labels -- Mapping from entry name to strings that are joined and
                    appended, in parentheses, to that entry's tick label
    handshake -- Forwarded to utils.stacked_custom_bar
    """
    scale_type = ['linear', 'log']

    # distinct security levels, in order of appearance in the first entry
    first_entry = stats[list(stats.keys())[0]]
    sec_lvl = []
    for combo in first_entry['keys']:
        level = combo.split('_')[1]
        if level not in sec_lvl:
            sec_lvl.append(level)

    # one tick label per entry, optionally with its extra labels
    xtickslabels = []
    for name in stats:
        suffix = ''.join(extra_labels[name])
        xtickslabels.append((name + '\n(' + suffix + ')') if suffix else name)

    for op in operations:
        for lvl in sec_lvl:
            # every algorithm seen in any entry gets a (initially empty) row
            y = {}
            for entry in stats.values():
                for combo in entry['keys']:
                    y.setdefault(combo.split('_')[0], [])

            # one value per entry; 0 where the combination or metric is absent
            for alg, row in y.items():
                wanted = alg + '_' + lvl
                for entry in stats.values():
                    try:
                        idx = entry['keys'].index(wanted)
                        row.append(entry['mean_' + ylabel + '_' + op][idx])
                    except (ValueError, KeyError):
                        row.append(0)

            for scale in scale_type:
                fig, ax = plt.subplots(1, 1, figsize=(30, 10))
                ax = utils.stacked_custom_bar(y,
                                              ax,
                                              handshake=handshake,
                                              title=op + ' (mean)',
                                              scale=scale,
                                              xlabel='algorithms',
                                              xtickslabels=xtickslabels,
                                              ylabel=ylabel)
                target = ('statistics/' + path + '/serv_all_' + op + '_' +
                          settings.sec_str[int(lvl)] + '_' + ylabel + '_' +
                          scale + '.png')
                utils.save_fig(fig, target)
def visualize_k_fold_precision_recall_plot(X, y_gold, classifier, K):
    """
    Plot one precision-recall curve per cross-validation fold plus their
    mean curve and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y_gold -- Expected labels
    classifier -- The classifier to be used (refitted on every fold)
    K -- Number of folds to perform
    """
    cross_validation = StratifiedKFold(y_gold, n_folds=K)

    precision_total = 0.0
    recall_total = 0.0
    average_precision_total = 0.0
    for fold_no, (train, test) in enumerate(cross_validation, start=1):
        # fit on the training part, predict the held-out part
        classifier.fit(X[train], y_gold[train])
        fold_prediction = classifier.predict(X[test])

        # precision-recall of this fold, computed from the predict() output
        fold_precision, fold_recall, _ = precision_recall_curve(
            y_gold[test].ravel(), fold_prediction.ravel())
        fold_average = average_precision_score(y_gold[test], fold_prediction)
        plt.plot(fold_recall,
                 fold_precision,
                 label='Precision-recall fold %d (area = %0.2f)' %
                 (fold_no, fold_average))

        # NOTE(review): element-wise accumulation assumes every fold
        # yields curves with the same number of points.
        precision_total += fold_precision
        recall_total += fold_recall
        average_precision_total += fold_average

    # average the accumulated values over the folds
    fold_count = len(cross_validation)
    precision_total /= fold_count
    recall_total /= fold_count
    average_precision_total /= fold_count

    plt.plot(recall_total,
             precision_total,
             'k--',
             label='Mean Precision-Recall (area = %0.2f)' %
             average_precision_total,
             linewidth=2)
    # horizontal reference line at precision 0.5
    plt.plot([0, 1], [0.5, 0.5],
             '--',
             color=(0.6, 0.6, 0.6),
             label='Random Classifier')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")

    # save fig, then release the current figure
    output_dir = 'img'
    save_fig(output_dir, '{}/pr_curve.png'.format(output_dir))
    plt.close()
def plot_predictive_comparison(env,
                               baseline_samples,
                               target_samples,
                               stddev_mult=3.,
                               target_metrics=None,
                               title_name=None):
    """
    Plot baseline and target predictive distributions (mean with scaled
    standard-deviation error bars) against the true function and the
    training data, then save the figure. Single-variable regression only.

    Keyword arguments:
    env -- Provides get_train_x/get_train_y/get_test_x/get_test_y and the
           view_xrange / view_yrange axis limits
    baseline_samples -- Baseline predictions; squeezed, then reduced along
                        axis 0
    target_samples -- Target predictions; squeezed, then reduced along
                      axis 0
    stddev_mult -- Multiplier applied to the standard deviations
    target_metrics -- Optional mapping of metric name to value, annotated
                      in the top-left corner of the figure
    title_name -- Optional suffix appended to the title after ': '
    """
    baseline_samples = baseline_samples.squeeze()
    target_samples = target_samples.squeeze()

    train_x, train_y = env.get_train_x(), env.get_train_y()
    test_x, test_y = env.get_test_x(), env.get_test_y()

    # NaN-pad the training columns up to the test length so that all
    # columns of the frame have equal length
    pad_width = test_x.shape[0] - train_x.shape[0]
    train_x_padded = np.pad(train_x[:, 0], (0, pad_width),
                            'constant',
                            constant_values=np.nan)
    train_y_padded = np.pad(train_y[:, 0], (0, pad_width),
                            'constant',
                            constant_values=np.nan)

    df = pd.DataFrame.from_dict({
        'time': test_x[:, 0],
        'true_y': test_y[:, 0],
        'train_x': train_x_padded,
        'train_y': train_y_padded,
        'mean': target_samples.mean(axis=0),
        'std': stddev_mult * target_samples.std(axis=0),
        'base_mean': baseline_samples.mean(axis=0),
        'base_std': stddev_mult * baseline_samples.std(axis=0),
    }).reset_index()

    g = sns.FacetGrid(df, size=9, aspect=1.8)
    g.map(plt.errorbar, 'time', 'base_mean', 'base_std',
          color=(0.7, 0.1, 0.1, 0.09))
    g.map(plt.errorbar, 'time', 'mean', 'std', color=(0.1, 0.1, 0.7, 0.09))
    g.map(plt.plot, 'time', 'mean', color='b', lw=1)
    g.map(plt.plot, 'time', 'true_y', color='r', lw=1)
    g.map(plt.scatter, 'train_x', 'train_y', color='g', s=20)

    ax = g.ax
    # Build the title in two steps: the former one-line conditional applied
    # to the whole concatenation, so a missing title_name blanked the title.
    title = 'Posterior Predictive Distribution'
    if title_name is not None:
        title += ': ' + title_name
    ax.set_title(title)
    ax.set(xlabel='X', ylabel='Y')
    ax.set_xlim(env.view_xrange[0], env.view_xrange[1])
    ax.set_ylim(env.view_yrange[0], env.view_yrange[1])

    legend = [
        'Prediction mean', 'True f(x)', 'Training data', 'True StdDev',
        'Predicted StdDev'
    ]
    plt.legend(legend)

    if target_metrics is not None:
        # one annotation line per metric, stacked downwards
        offset = 0
        for tm, tv in target_metrics.items():
            position = (0.08, 0.92 - offset)
            ax.annotate('{}: {:.02f}'.format(tm, tv),
                        xy=position,
                        xytext=position,
                        xycoords='figure fraction',
                        textcoords='figure fraction')
            offset += 0.04

    name = utils.DATA_DIR.replace('/', '-')
    plt.tight_layout(pad=0.6)
    utils.save_fig('predictive-distribution-' + name)
def plot_predictive_baseline(env, samples, stddev_mult=3., title_name=None):
    """
    Plot a predictive distribution (mean with scaled standard-deviation
    error bars) against the true function and the training data, then save
    the figure. Single-variable regression only.

    Keyword arguments:
    env -- Provides get_train_x/get_train_y/get_test_x/get_test_y and the
           view_xrange / view_yrange axis limits
    samples -- Predictions; squeezed, then reduced along axis 0
    stddev_mult -- Multiplier applied to the standard deviation
    title_name -- Optional suffix appended to the title after ': '
    """
    samples = samples.squeeze()

    train_x, train_y = env.get_train_x(), env.get_train_y()
    test_x, test_y = env.get_test_x(), env.get_test_y()

    # NaN-pad the training columns up to the test length so that all
    # columns of the frame have equal length
    pad_width = test_x.shape[0] - train_x.shape[0]
    train_x_padded = np.pad(train_x[:, 0], (0, pad_width),
                            'constant',
                            constant_values=np.nan)
    train_y_padded = np.pad(train_y[:, 0], (0, pad_width),
                            'constant',
                            constant_values=np.nan)

    df = pd.DataFrame.from_dict({
        'time': test_x[:, 0],
        'true_y': test_y[:, 0],
        'train_x': train_x_padded,
        'train_y': train_y_padded,
        'mean': samples.mean(axis=0),
        'std': stddev_mult * samples.std(axis=0),
    }).reset_index()

    g = sns.FacetGrid(df, size=9, aspect=1.8)
    g.map(plt.errorbar, 'time', 'mean', 'std', color=(0.7, 0.1, 0.1, 0.09))
    g.map(plt.plot, 'time', 'mean', color='b', lw=1)
    g.map(plt.plot, 'time', 'true_y', color='r', lw=1)
    g.map(plt.scatter, 'train_x', 'train_y', color='g', s=20)

    ax = g.ax
    # Build the title in two steps: the former one-line conditional applied
    # to the whole concatenation, so a missing title_name blanked the title.
    title = 'Posterior Predictive Distribution'
    if title_name is not None:
        title += ': ' + title_name
    ax.set_title(title)
    ax.set(xlabel='X', ylabel='Y')
    ax.set_xlim(env.view_xrange[0], env.view_xrange[1])
    ax.set_ylim(env.view_yrange[0], env.view_yrange[1])

    legend = ['Prediction mean', 'True f(x)', 'Training data', 'StdDev']
    plt.legend(legend)

    name = utils.DATA_DIR.replace('/', '-')
    plt.tight_layout(pad=0.6)
    utils.save_fig('predictive-distribution-' + name)
def visualize_k_fold_roc_plot(X, y_gold, classifier, K):
    """
    Plot one ROC curve per cross-validation fold plus their mean curve
    and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y_gold -- Expected labels
    classifier -- The classifier to be used (refitted on every fold)
    K -- Number of folds to perform
    """
    cross_validation = StratifiedKFold(y_gold, n_folds=K)

    tpr_total = 0.0
    fpr_total = 0.0
    for fold_no, (train, test) in enumerate(cross_validation, start=1):
        # fit on the training part, predict the held-out part
        classifier.fit(X[train], y_gold[train])
        fold_prediction = classifier.predict(X[test])

        # ROC of this fold, computed from the predict() output
        fold_fpr, fold_tpr, _ = roc_curve(y_gold[test], fold_prediction)
        fold_auc = auc(fold_fpr, fold_tpr)
        plt.plot(fold_fpr,
                 fold_tpr,
                 linewidth=1,
                 label='ROC fold %d (area = %0.2f)' % (fold_no, fold_auc))

        # NOTE(review): element-wise accumulation assumes every fold
        # yields curves with the same number of points.
        tpr_total += fold_tpr
        fpr_total += fold_fpr

    # average the accumulated curves over the folds
    fold_count = len(cross_validation)
    tpr_total /= fold_count
    fpr_total /= fold_count
    mean_auc = auc(fpr_total, tpr_total)

    plt.plot(fpr_total,
             tpr_total,
             'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc,
             linewidth=2)
    # diagonal reference line
    plt.plot([0, 1], [0, 1],
             '--',
             color=(0.6, 0.6, 0.6),
             label='Random Classifier')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    # save fig, then release the current figure
    output_dir = 'img'
    save_fig(output_dir, '{}/roc.png'.format(output_dir))
    plt.close()
def plot_box(xs,
             ys,
             data,
             xaxis_label=None,
             yaxis_label=None,
             x_sci=False,
             y_sci=True,
             name=None):
    """
    Draw a seaborn box plot on a fresh figure, apply the shared axis
    helpers from utils and save the result.

    Keyword arguments:
    xs -- Passed to sns.boxplot as x
    ys -- Passed to sns.boxplot as y
    data -- Passed to sns.boxplot as data
    xaxis_label -- Label of the x axis
    yaxis_label -- Label of the y axis
    x_sci -- Forwarded to utils.set_sci_axis for the x axis
    y_sci -- Forwarded to utils.set_sci_axis for the y axis
    name -- Forwarded to utils.save_fig
    """
    # open a new figure; the axes actually styled are the ones returned
    # by seaborn
    plt.subplots()
    axis = sns.boxplot(x=xs, y=ys, data=data, linewidth=1, fliersize=6)

    utils.set_sci_axis(axis, x_sci, y_sci)
    utils.set_axis_labels(axis, xaxis_label, yaxis_label)
    utils.finalize(axis)
    utils.save_fig(name)
def plot_bar(x,
             y,
             hue=None,
             hue_count=1,
             line_values=None,
             line_label=None,
             legend=True,
             xaxis_label=None,
             yaxis_label=None,
             xticks=None,
             y_lim=None,
             x_sci=True,
             y_sci=True,
             fig_size=None,
             y_err=None,
             name=None):
    """
    Draw a bar chart with optional error bars and an optional overlaid
    line, apply the shared axis helpers from utils and save the result.

    Keyword arguments:
    x -- Bar positions (each is shifted right by the bar width)
    y -- Bar heights
    hue -- Not used by this function
    hue_count -- Number of legend columns; incremented when a labelled
                 line is drawn
    line_values -- Optional values of the overlaid line; drawn only when
                   their count equals the number of distinct x values
    line_label -- Optional legend label of the overlaid line
    legend -- When true, utils.set_legend is called
    xaxis_label -- Label of the x axis
    yaxis_label -- Label of the y axis
    xticks -- Optional list of tick labels
    y_lim -- Optional list [ymin] or [ymin, ymax]
    x_sci -- Forwarded to utils.set_sci_axis for the x axis
    y_sci -- Forwarded to utils.set_sci_axis for the y axis
    fig_size -- Optional list [width] or [width, height]
    y_err -- Optional error-bar sizes
    name -- Forwarded to utils.save_fig
    """
    fig, ax = plt.subplots()

    # a one-element list sets only the width
    if fig_size and isinstance(fig_size, list):
        fig.set_figwidth(fig_size[0])
        if len(fig_size) > 1:
            fig.set_figheight(fig_size[1])

    bar_width = 0.2
    bar_positions = [pos + bar_width for pos in x]
    tick_positions = [pos + 0.5 * bar_width for pos in bar_positions]

    if y_err:
        ax.errorbar(tick_positions,
                    y,
                    yerr=y_err,
                    fmt='o',
                    ecolor='r',
                    capthick=1,
                    elinewidth=1)
    plt.bar(bar_positions, y, width=bar_width)

    if (line_values and utils.is_list(line_values)
            and len(line_values) == len(set(x))):
        if line_label:
            ax.plot(ax.get_xticks(), line_values, label=line_label)
            # the labelled line takes one more legend column
            hue_count += 1
        else:
            ax.plot(ax.get_xticks(), line_values)

    if xticks and isinstance(xticks, list):
        plt.xticks(tick_positions, xticks)

    # a one-element list sets only the lower limit
    if y_lim and isinstance(y_lim, list):
        plt.ylim(ymin=y_lim[0])
        if len(y_lim) > 1:
            plt.ylim(ymax=y_lim[1])

    if legend:
        utils.set_legend(ncol=hue_count)

    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)
    utils.finalize(ax, x_grid=False)
    utils.save_fig(name)
def make_scatter(ylabel, operations, file_path, data, xlabel):
    """
    Draw two side-by-side scatter panels, one per operation, showing every
    recorded value of each data entry, and save the figure as
    <file_path><ylabel>_distribution.png.

    Keyword arguments:
    ylabel -- Metric name, used as axis label and inside the data keys
    operations -- Operation names; the first two get a panel each
    file_path -- Prefix of the output file name
    data -- Mapping from entry name to a dict holding
            '<ylabel>_<operation>' value lists
    xlabel -- Label of the x axis
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    xtickslabels = list(data.keys())
    colors = ('red', 'blue')

    # per operation: x is the entry position repeated once per value,
    # y is the values themselves
    x = {}
    y = {}
    for op in operations:
        positions = []
        values = []
        for position, entry in enumerate(data):
            recorded = data[entry][ylabel + '_' + op]
            positions.extend([position] * len(recorded))
            values.extend(recorded)
        x[op] = positions
        y[op] = values

    for slot, axis in enumerate(axes):
        operation = operations[slot]
        axes[slot] = utils.custom_scatter(x[operation],
                                          y[operation],
                                          axis,
                                          title=operation,
                                          xtickslabels=xtickslabels,
                                          xlabel=xlabel,
                                          ylabel=ylabel,
                                          kwargs={'color': colors[slot]})

    utils.save_fig(fig, file_path + ylabel + '_distribution.png')
def feature_cdf(X, y, selected_feature):
    """
    Plot the empirical cumulative distribution of one feature against a
    sampled standard normal cumulative distribution and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector (not used)
    selected_feature -- Feature name; everything after its first character
                        is parsed as the 1-based column index into X
    """
    # Standard normal CDF, estimated from N random draws
    N = len(X)
    Normal = np.random.normal(size=N)
    # density=True replaces the `normed` keyword, which NumPy no longer
    # accepts; for the equal-width bins used here both normalise the same.
    histogram, bin_edges = np.histogram(Normal, bins=N, density=True)
    dx = bin_edges[1] - bin_edges[0]
    G = np.cumsum(histogram) * dx

    # Empirical CDF: sorted feature values against steps of 1/N
    feature_index = int(selected_feature[1:]) - 1
    X_k = np.sort(X[:, feature_index])
    ECDF_k = np.arange(N) / float(N)

    # Kolmogorov-Smirnov test; only the statistic (first element) is
    # shown, the second element (formerly bound to p_value) is unused
    result = kolmogorov_smirnov_two_sample_test(G, ECDF_k)
    ks_statistic = result[0]

    plt.plot(bin_edges[1:],
             G,
             label="Standard Normal Cumulative Density Funcion")
    plt.plot(X_k, ECDF_k, label="Empirical Cumulative Density Function")
    plt.suptitle("Empirirical vs Standard Normal Cumulative Distribution of " +
                 selected_feature +
                 " Feature\nKolmogorov-Smirnov Statistic=" +
                 str(ks_statistic))
    plt.xlabel(selected_feature)
    plt.legend(loc='center right')

    # save fig
    output_dir = "img"
    save_fig(output_dir, '{}/{}_cdf.png'.format(output_dir, selected_feature))
def visualize_hist_pairplot(X, y, selected_feature1, selected_feature2,
                            features, diag_kind):
    """
    Draw a seaborn pair plot of two features, coloured by the 'Y' column,
    and save it.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    selected_feature1 -- First feature (column name)
    selected_feature2 -- Second feature (column name)
    features -- Column names for X stacked with y; colouring uses the
                column named 'Y'
    diag_kind -- Kind of plot on the diagonal, forwarded to sea.pairplot
    """
    # one table holding the features with the target appended as last column
    frame = pd.DataFrame(data=np.column_stack((X, y)), columns=features)

    # class 0 takes the third hls colour, class 1 the first
    hls = sea.hls_palette()
    class_colors = {0: hls[2], 1: hls[0]}

    grid = sea.pairplot(frame,
                        hue="Y",
                        palette=class_colors,
                        vars=[selected_feature1, selected_feature2],
                        diag_kind=diag_kind)
    grid.fig.suptitle('Pairwise relationship: ' + selected_feature1 +
                      " vs " + selected_feature2)
    # hide the x tick labels on every panel
    grid.set(xticklabels=[])

    # save fig
    output_dir = "img"
    target = '{}/{}_{}_hist_pairplot.png'.format(output_dir,
                                                 selected_feature1,
                                                 selected_feature2)
    save_fig(output_dir, target)
def plot_two_bars(ys,
                  labels,
                  legend=True,
                  xaxis_label=None,
                  yaxis_label=None,
                  xticks=None,
                  y_lim=None,
                  x_sci=True,
                  y_sci=True,
                  fig_size=None,
                  name=None):
    """
    Draw two data series as adjacent bars, apply the shared axis helpers
    from utils and save the result.

    Keyword arguments:
    ys -- The two series; ys[0] and ys[1] are plotted
    labels -- Legend labels; labels[0] and labels[1] are used
    legend -- When true, a two-column legend is drawn above the axes
    xaxis_label -- Label of the x axis
    yaxis_label -- Label of the y axis
    xticks -- Optional list of tick labels
    y_lim -- Optional list [ymin] or [ymin, ymax]
    x_sci -- Forwarded to utils.set_sci_axis for the x axis
    y_sci -- Forwarded to utils.set_sci_axis for the y axis
    fig_size -- Optional list [width] or [width, height]
    name -- Forwarded to utils.save_fig
    """
    fig, ax = plt.subplots()

    # a one-element list sets only the width
    if fig_size and isinstance(fig_size, list):
        fig.set_figwidth(fig_size[0])
        if len(fig_size) > 1:
            fig.set_figheight(fig_size[1])

    bar_width = 0.2
    # One slot per data point of a series. The slots were previously
    # derived from len(ys), i.e. the number of series (2), which only
    # matched series of exactly two values.
    x = range(len(ys[0]))
    new_x1 = list(x)
    new_x2 = [x_ + bar_width for x_ in x]

    rects1 = ax.bar(new_x1, ys[0], width=bar_width, color='b')
    rects2 = ax.bar(new_x2, ys[1], width=bar_width)

    if xticks and isinstance(xticks, list):
        plt.xticks([x_ + bar_width for x_ in new_x1], xticks)

    # a one-element list sets only the lower limit
    if y_lim and isinstance(y_lim, list):
        plt.ylim(ymin=y_lim[0])
        if len(y_lim) > 1:
            plt.ylim(ymax=y_lim[1])

    if legend:
        ax.legend((rects1[0], rects2[0]), (labels[0], labels[1]),
                  fontsize=28,
                  frameon=False,
                  bbox_to_anchor=(0, 0.95, 1.2, .10),
                  handlelength=0.5,
                  handletextpad=0.2,
                  loc=3,
                  ncol=2,
                  mode="expand",
                  borderaxespad=0.)

    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)
    utils.finalize(ax, x_grid=False)
    utils.save_fig(name)
def visualize_pca2D(X, y):
    """
    Scatter-plot the samples projected on the first two principal
    components, one marker style per class, and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector (samples with y == 0 are drawn as "Paid",
         samples with y == 1 as "Default")
    """
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)

    palette = sea.color_palette()
    # (class value, marker, colour, legend label, face colour)
    class_styles = (
        (0, 's', 'green', "Paid", palette[1]),
        (1, '^', 'red', "Default", palette[2]),
    )
    for class_value, marker, color, label, facecolor in class_styles:
        members = y == class_value
        plt.scatter(principal_components[members, 0],
                    principal_components[members, 1],
                    marker=marker,
                    color=color,
                    label=label,
                    alpha=0.5,
                    edgecolor='#262626',
                    facecolor=facecolor,
                    linewidth=0.15)

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title("Two-Dimensional Principal Component Analysis")
    # The original referenced plt.tight_layout without calling it, so the
    # layout was never adjusted.
    plt.tight_layout()

    # save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/pca2D.png'.format(output_dir))
def visualize_lda2D(X, y):
    """
    Scatter-plot the samples projected on the first two linear
    discriminants, one marker style per class, and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector (samples with y == 0 are drawn as "Paid",
         samples with y == 1 as "Default")
    """
    lda = LDA(n_components=2, solver='eigen')
    discriminative_attributes = lda.fit(X, y).transform(X)

    # (class value, marker, colour, legend label)
    class_styles = (
        (0, 's', 'green', "Paid"),
        (1, '^', 'red', "Default"),
    )
    for class_value, marker, color, label in class_styles:
        members = y == class_value
        plt.scatter(discriminative_attributes[:, 0][members],
                    discriminative_attributes[:, 1][members],
                    marker=marker,
                    color=color,
                    label=label,
                    alpha=0.5)

    plt.xlabel('First Linear Discriminant')
    plt.ylabel('Second Linear Discriminant')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title("Linear Discriminant Analysis")
    # The original referenced plt.tight_layout without calling it, so the
    # layout was never adjusted.
    plt.tight_layout()

    # save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/lda.png'.format(output_dir))
def feature_cdf(X, y, selected_feature):
    """
    Plot the empirical cumulative distribution of one feature against a
    sampled standard normal cumulative distribution and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector (not used)
    selected_feature -- Feature name; everything after its first character
                        is parsed as the 1-based column index into X
    """
    # Standard normal CDF, estimated from N random draws
    N = len(X)
    Normal = np.random.normal(size=N)
    # density=True replaces the `normed` keyword, which NumPy no longer
    # accepts; for the equal-width bins used here both normalise the same.
    histogram, bin_edges = np.histogram(Normal, bins=N, density=True)
    dx = bin_edges[1] - bin_edges[0]
    G = np.cumsum(histogram) * dx

    # Empirical CDF: sorted feature values against steps of 1/N
    feature_index = int(selected_feature[1:]) - 1
    X_k = np.sort(X[:, feature_index])
    ECDF_k = np.arange(N) / float(N)

    # Kolmogorov-Smirnov test; only the statistic (first element) is
    # shown, the second element (formerly bound to p_value) is unused
    result = kolmogorov_smirnov_two_sample_test(G, ECDF_k)
    ks_statistic = result[0]

    plt.plot(bin_edges[1:],
             G,
             label="Standard Normal Cumulative Density Funcion")
    plt.plot(X_k, ECDF_k, label="Empirical Cumulative Density Function")
    plt.suptitle("Empirirical vs Standard Normal Cumulative Distribution of " +
                 selected_feature +
                 " Feature\nKolmogorov-Smirnov Statistic=" +
                 str(ks_statistic))
    plt.xlabel(selected_feature)
    plt.legend(loc='center right')

    # save fig
    output_dir = "img"
    save_fig(output_dir, '{}/{}_cdf.png'.format(output_dir, selected_feature))
def visualize_pca2D(X, y):
    """
    Scatter-plot the samples projected on the first two principal
    components, one marker style per class, and save the figure.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector (samples with y == 0 are drawn as "Paid",
         samples with y == 1 as "Default")
    """
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)

    palette = sea.color_palette()
    # (class value, marker, colour, legend label, face colour)
    class_styles = (
        (0, 's', 'green', "Paid", palette[1]),
        (1, '^', 'red', "Default", palette[2]),
    )
    for class_value, marker, color, label, facecolor in class_styles:
        members = y == class_value
        plt.scatter(principal_components[members, 0],
                    principal_components[members, 1],
                    marker=marker,
                    color=color,
                    label=label,
                    alpha=0.5,
                    edgecolor='#262626',
                    facecolor=facecolor,
                    linewidth=0.15)

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title("Two-Dimensional Principal Component Analysis")
    # The original referenced plt.tight_layout without calling it, so the
    # layout was never adjusted.
    plt.tight_layout()

    # save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/pca2D.png'.format(output_dir))
def adaboost(X_train, X_val, y_train, y_val, doplot=False):
    """Fit AdaBoost over depth-1 decision trees and print validation accuracy.

    Args:
        X_train, y_train: data the ensemble is fitted on.
        X_val, y_val: data the printed accuracy score is computed on.
        doplot: when true, also draw and save the decision boundary.

    Returns:
        None.
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    stump = DecisionTreeClassifier(max_depth=1)  # weak classifier
    ada_clf = AdaBoostClassifier(base_estimator=stump,
                                 n_estimators=200,
                                 algorithm='SAMME.R',
                                 learning_rate=0.5)
    ada_clf.fit(X_train, y_train)

    val_accuracy = accuracy_score(y_val, ada_clf.predict(X_val))
    print("Adaboost classifier, accuracy score = %f\n" % val_accuracy)

    if not doplot:
        return

    from utils import plot_decision_boundary, save_fig
    # NOTE(review): X, y, plt and CHAPTER_ID are not parameters or locals of
    # this function; they must exist at module level for this branch to run.
    plt.figure(figsize=(11, 4))
    plot_decision_boundary(ada_clf, X, y)
    plt.title("Adaboost classifcation with Decision Tree base estimator",
              fontsize=12)
    save_fig("AdaBoost_with_DT", CHAPTER_ID)
    plt.show()
def generate_histograms(
        start_feature, end_feature, features_base,
        output_dir="/home/colin/evo/papers/thesis/figures/feature_distributions"):
    """Plot the value distribution of a range of feature columns.

    Loads ``data/<features_base>.csv``, and for every column index from
    ``start_feature`` to ``end_feature`` (inclusive) hands the unmasked values
    to ``graph_distribution``. The combined figure is saved through
    ``utils.save_fig``.

    Args:
        start_feature: first column index to plot.
        end_feature: last column index to plot (inclusive).
        features_base: CSV base name; also used in the output file name.
        output_dir: directory prefix of the saved figure. The default is the
            path that was previously hard-coded.
    """
    num_weeks = 15
    num_features = end_feature - start_feature + 1

    in_file = "data/%s.csv" % features_base
    feature_set = validate_csv(in_file)

    start_time = time.time()
    data = np.genfromtxt(in_file, delimiter=',', skip_header=1)
    # Single-argument print(...) gives the same output on Python 2 and 3
    print("loaded data in %s seconds" % (time.time() - start_time))

    pl.clf()
    dropout_vector = data[:, 1]
    for feature_index in range(start_feature, end_feature + 1):
        feature_distribution = data[:, feature_index]
        start_time = time.time()

        m1 = feature_distribution == -1  # remove default values
        masked = np.ma.masked_array(feature_distribution, m1)
        for x in range(len(masked)):
            # remove values where the student was always dropped out or has
            # already dropped out the prior week
            if (x % num_weeks == 0 and dropout_vector[x] == 0) or (
                    x % num_weeks != 0 and dropout_vector[x - 1] == 0):
                masked.mask[x] = True

        feature_name = feature_set[feature_index - 1]
        graph_distribution(masked.compressed(), feature_name,
                           feature_index - start_feature + 1, num_features)
        print("Ran Feature %s in %s seconds"
              % (feature_name, time.time() - start_time))

    pl.subplots_adjust(hspace=.5)
    pl.subplots_adjust(wspace=.5)
    # pl.show()
    utils.save_fig("%s/%s_%s_%s"
                   % (output_dir, features_base, start_feature, end_feature))
def save_figure(self, save_dir_path):
    """Save the three univariate-analysis plots into ``save_dir_path``.

    Raises:
        IsADirectoryError: ``check_is_dir(save_dir_path)`` is falsy.
        PermissionError: ``self._flag_univariate_analysis`` is not set.
    """
    if not check_is_dir(save_dir_path):
        raise IsADirectoryError(r'please give a correct dir_path')
    target_dir = Path(save_dir_path)

    if not self._flag_univariate_analysis:
        raise PermissionError("save_figure will be excute when univariate_analysis finished")

    # (attribute holding the plot, type_str passed to save_fig), in save order
    plot_specs = (
        ('_plot_distribute', 'distribute'),
        ('_plot_distribute_target', 'distribute_taget'),
        ('_plot_ar', 'ar'),
    )
    for attr_name, kind in plot_specs:
        save_fig(getattr(self, attr_name), target_dir, type_str=kind)
def enet_plot(l1_ratio):
    """Plot the elastic-net coefficient path for the given ``l1_ratio``.

    Reads ``A``, ``b``, ``alphas`` and ``path`` from the enclosing scope,
    saves the figure to the location returned by ``save_fig`` and shows it.

    Returns:
        The coefficient array produced by ``linear_model.enet_path``.
    """
    _unused_alphas, theta_enet, _unused_extra = linear_model.enet_path(
        A, b,
        alphas=alphas,
        fit_intercept=False,
        l1_ratio=l1_ratio,
        return_models=False)

    coef_fig = plt.figure(figsize=(12, 8))
    coef_ax = coef_fig.add_subplot(111)
    coef_ax.plot(alphas, np.transpose(theta_enet), linewidth=3)
    coef_ax.set_xscale('log')
    coef_ax.set_xlabel(r"$\lambda$")
    coef_ax.set_ylabel("Coefficient value")
    coef_ax.set_ylim([-1, 5])

    target = save_fig(path, "enet_coeffs", "pdf")
    plt.savefig(target)
    plt.show()
    return theta_enet
def plot_curves(ytrue, yscores, name):
    """Draw and save three evaluation figures for one set of scores.

    Saves, in this order, ``<name>-PR-vs-thresh.pdf``, ``<name>-PR.pdf`` and
    ``<name>-ROC.pdf`` through ``save_fig``.

    Args:
        ytrue: ground-truth labels.
        yscores: scores passed to the sklearn-style metric functions.
        name: prefix of the saved file names.
    """
    # Precision/recall curves
    pr_precision, pr_recall, pr_thresholds = precision_recall_curve(
        ytrue, yscores)
    avg_precision = average_precision_score(ytrue, yscores)

    plt.figure()
    plot_precision_recall_vs_threshold(pr_precision, pr_recall, pr_thresholds)
    save_fig("{}-PR-vs-thresh.pdf".format(name))

    plt.figure()
    plot_precision_vs_recall(pr_precision, pr_recall, avg_precision)
    save_fig("{}-PR.pdf".format(name))

    # ROC curve
    roc_fpr, roc_tpr, _roc_thresholds = roc_curve(ytrue, yscores)
    roc_area = roc_auc_score(ytrue, yscores)

    plt.figure()
    plot_roc_curve(roc_fpr, roc_tpr, roc_area)
    save_fig("{}-ROC.pdf".format(name))
# Histogram of every column of df
df.hist()
plt.show()

# scatter plot of response vs each feature
nrows = 3
ncols = 4
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, sharey=True, figsize=[15, 10])
plt.tight_layout()
# The figure is cleared right after creation; the panels are re-created one
# by one with plt.subplot below.
plt.clf()
# NOTE(review): loop extent reconstructed from a whitespace-mangled source;
# save_fig is assumed to run once, after the loop.
for i in range(0, 12):
    plt.subplot(nrows, ncols, i + 1)
    plt.scatter(X[:, i], y)
    plt.xlabel(boston.feature_names[i])
    plt.ylabel("house price")
    plt.grid()
save_fig("boston-housing-scatter")
plt.show()

# Rescale input data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# The scaler is fitted on the training split only
scaler = sklearn.preprocessing.StandardScaler()
scaler = scaler.fit(X_train)
Xscaled = scaler.transform(X_train)
# equivalent to Xscaled = scaler.fit_transform(X_train)
# Fit model
def train(self, num_epoch):
    """Run ``num_epoch`` epochs of alternating discriminator/generator updates.

    Each iteration over ``self.train_dl`` performs one discriminator step and
    one generator step, appends both losses to ``self.errD_records`` and
    ``self.errG_records``, and periodically prints losses, saves sample
    figures and writes snapshots according to ``self.loss_interval``,
    ``self.image_interval`` and ``self.snapshot_interval``.

    NOTE(review): this block was reconstructed from a whitespace-mangled
    source. Two choices are assumptions to confirm: the noise/fake-image
    regeneration is placed inside the ``resample`` branch of step (2), and
    the snapshot check is placed at loop-body level.

    Args:
        num_epoch: number of passes over ``self.train_dl``.
    """
    for epoch in range(num_epoch):
        if self.resample:
            # Second, independent pass over the loader for the generator step
            train_dl_iter = iter(self.train_dl)
        for i, data in enumerate(tqdm(self.train_dl)):
            # (1) : minimizes mean((D(x) - mean(D(G(z))) - 1)**2) + mean((D(G(z)) - mean(D(x)) + 1)**2)
            self.netD.zero_grad()
            real_images = data[0].to(self.device)
            bs = real_images.size(0)
            # real labels (bs)
            real_label = torch.full((bs, ), self.real_label,
                                    device=self.device)
            # noise (bs, nz, 1, 1), fake images (bs, cn, 64, 64)
            noise = generate_noise(bs, self.nz, self.device)
            fake_images = self.netG(noise)

            # calculate the discriminator results for both real & fake
            c_xr = self.netD(real_images).view(-1)  # (bs)
            # detach: the discriminator step must not backprop into G
            c_xf = self.netD(fake_images.detach()).view(-1)  # (bs)
            # calculate the Discriminator loss
            errD = (torch.mean((c_xr - torch.mean(c_xf) - real_label)**2) +
                    torch.mean((c_xf - torch.mean(c_xr) + real_label)**2)) / 2.0
            errD.backward()
            # update D using the gradients calculated previously
            self.optimizerD.step()

            # (2) : minimizes mean((D(G(z)) - mean(D(x)) - 1)**2) + mean((D(x) - mean(D(G(z))) + 1)**2)
            self.netG.zero_grad()
            if self.resample:
                real_images = next(train_dl_iter)[0].to(self.device)
                noise = generate_noise(bs, self.nz, self.device)
                fake_images = self.netG(noise)
            # we updated the discriminator once, therefore recalculate c_xr, c_xf
            c_xr = self.netD(real_images).view(-1)  # (bs)
            c_xf = self.netD(fake_images).view(-1)  # (bs)
            # calculate the Generator loss
            errG = (torch.mean((c_xf - torch.mean(c_xr) - real_label)**2) +
                    torch.mean((c_xr - torch.mean(c_xf) + real_label)**2)) / 2.0
            errG.backward()
            # update G using the gradients calculated previously
            self.optimizerG.step()

            self.errD_records.append(float(errD))
            self.errG_records.append(float(errG))

            if i % self.loss_interval == 0:
                print('[%d/%d] [%d/%d] errD : %.4f, errG : %.4f' %
                      (epoch + 1, num_epoch, i + 1,
                       self.train_iteration_per_epoch, errD, errG))

            if i % self.image_interval == 0:
                if self.special is None:
                    sample_images_list = get_sample_images_list(
                        'Unsupervised', (self.fixed_noise, self.netG))
                    self._save_sample_figure(
                        plot_multiple_images(sample_images_list, 4, 4),
                        epoch, i)
                elif self.special == 'Wave':
                    sample_audios_list = get_sample_images_list(
                        'Unsupervised_Audio', (self.fixed_noise, self.netG))
                    self._save_sample_figure(
                        plot_multiple_spectrograms(sample_audios_list, 4, 4,
                                                   freq=16000),
                        epoch, i)

            if self.snapshot_interval is not None:
                if i % self.snapshot_interval == 0:
                    save(
                        os.path.join(
                            self.save_snapshot_dir,
                            'Epoch' + str(epoch) + '_' + str(i) + '.state'),
                        self.netD, self.netG, self.optimizerD,
                        self.optimizerG)

def _save_sample_figure(self, plot_fig, epoch, i):
    """Save ``plot_fig`` under the next numbered file name, then clear it."""
    cur_file_name = os.path.join(
        self.save_img_dir,
        str(self.save_cnt) + ' : ' + str(epoch) + '-' + str(i) + '.jpg')
    self.save_cnt += 1
    save_fig(cur_file_name, plot_fig)
    plot_fig.clf()
def evaluate_config_2(X_test, X_train, Y_test, Y_train, fig_dir, inputs,
                      model, model_orig, outputs, title, use_LIME, z_idx):
    """Compare ``model`` against ``model_orig`` and report the differences.

    Prints loss/accuracy gaps, the explanation loss and the prediction
    mismatch on the train and test data, saves ranking-histogram figures into
    ``fig_dir`` and builds summary tables for both splits.

    Returns:
        (df, df_test): the train and test summary tables.
    """
    # Loss / accuracy of both models on both splits
    loss_tr, acc_tr = model.evaluate(X_train, Y_train)
    loss_ts, acc_ts = model.evaluate(X_test, Y_test)
    loss_tr_o, acc_tr_o = model_orig.evaluate(X_train, Y_train)
    loss_ts_o, acc_ts_o = model_orig.evaluate(X_test, Y_test)

    decimals = 4
    loss_diff_tr = np.round(np.abs(loss_tr - loss_tr_o), decimals)
    loss_diff_ts = np.round(np.abs(loss_ts - loss_ts_o), decimals)
    print("Loss Diff: train - {} ---- test - {}".format(
        loss_diff_tr, loss_diff_ts))

    acc_diff_tr = np.round(np.abs(acc_tr - acc_tr_o), decimals)
    acc_diff_ts = np.round(np.abs(acc_ts - acc_ts_o), decimals)
    print("Acc Diff: train - {:.2f}% ---- test - {:.2f}%".format(
        acc_diff_tr * 100, acc_diff_ts * 100))

    explanation_loss_tr = compute_explanation_loss(inputs, outputs, model,
                                                   z_idx)
    # 0 because suddenly none of the features are important!
    print("Explanation loss - {:.4f}".format(explanation_loss_tr))

    # Prediction mismatch between the two models, train then test
    num_p, percent = analyze_mismatch(model_orig, model, inputs)
    print("Prediction Mismatch (Train):", num_p, str(round(percent, 3)) + "%")

    inputs_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
    outputs_test = tf.convert_to_tensor(Y_test, dtype=tf.float32)
    num_p, percent = analyze_mismatch(model_orig, model, inputs_test)
    print("Prediction Mismatch (Test):", num_p, str(round(percent, 3)) + "%")

    from evaluate import plot_ranking_histograms
    from functools import partial

    models = [model_orig, model]
    # Without LIME the last attribution method is dropped
    att_methods = attribution_methods if use_LIME else attribution_methods[:-1]

    # TODO: the attribution methods are evaluated twice (once for the ranking
    # plots, once for the summary tables) for no reason.
    ranking_plot = partial(plot_ranking_histograms,
                           models=models,
                           z_idx=z_idx,
                           num_f=X_train.shape[1] - 1,
                           attribution_methods=att_methods)

    fig_train = ranking_plot(inputs=inputs, ys=outputs,
                             title="{} Train".format(title))
    save_fig(fig_train, fname="ranking_histograms_train", fig_dir=fig_dir)

    fig_test = ranking_plot(inputs=inputs_test, ys=outputs_test,
                            title="{} Test".format(title))
    save_fig(fig_test, fname="ranking_histograms_test", fig_dir=fig_dir)

    # Summary tables; the four analysis results are computed but not used
    print("Computing Summary Table Train")
    df = get_summary_table(models=models,
                           inputs=inputs,
                           outputs=outputs,
                           feature=z_idx,
                           attribution_methods=att_methods)
    _, _, _, _ = analyze_summary_table(df, title=title)

    print("Computing Summary Table Test")
    df_test = get_summary_table(models=models,
                                inputs=inputs_test,
                                outputs=outputs_test,
                                feature=z_idx,
                                attribution_methods=att_methods)
    _, _, _, _ = analyze_summary_table(df_test, title)

    return df, df_test
# NOTE(review): fragment of a larger script reconstructed from a
# whitespace-mangled source; the enclosing loop(s) that define `data`, `n`,
# `cohort`, `max_support` and `means` are not visible here.

# Heatmap of `data` (transposed), colour scale fixed to [0, 1]
ax = pl.gca()
pl.imshow(np.transpose(data), interpolation='nearest',origin='lower', vmin=0, vmax=1, cmap='RdBu')
ax.set_xlabel("The predicted week number", fontsize=fontsize)
ax.set_ylabel("Lag", fontsize=fontsize)
# pl.title('Logistic Regression AUC: %s' % cohort, fontsize=fontsize)
ax.set_xticks(range(n))
ax.set_yticks(range(n))
# Tick labels are offset from the 0-based tick positions
ax.set_xticklabels(range(2,n+2),fontsize=med_fontsize)
ax.set_yticklabels(range(1,n+1),fontsize=med_fontsize)
cb = pl.colorbar()
for t in cb.ax.get_yticklabels():
    t.set_fontsize(med_fontsize)
utils.save_fig("/home/colin/evo/papers/thesis/figures/hmm_logreg/%s_support_%s" % (cohort, max_support))

#plot mean AUC over support:
# NOTE(review): `benchmarks` is not used in the visible part of this fragment
benchmarks = { "no_collab": 0.775938784743, "wiki_only": 0.609065848058, "forum_and_wiki": 0.648563087051, "forum_only": 0.76697590925}
pl.clf()
ax = pl.gca()
pl.plot(range(3, 30,2), [means[support] for support in range(3, 30,2)])
# pl.title('HMM logreg: %s' % cohort, fontsize=fontsize)
ax.set_xlabel("Number of support", fontsize=fontsize)
ax.set_ylabel("Mean AUC of all leads and lags", fontsize=fontsize)
# NOTE(review): indentation reconstructed from a whitespace-mangled source;
# loop extents below are a best guess and should be confirmed.

# One bar chart of feature weights per cohort
for cohort in cohorts:
    n = len(feature_vectors[cohort])
    ax = pl.gca()
    pl.bar(range(n), feature_vectors[cohort], color='0.75')
    # pl.title('Randomized logistic Regression: %s' % cohort, fontsize=fontsize)
    ax.set_xlabel("Feature number", fontsize=fontsize)
    ax.set_ylabel("Average feature weight", fontsize=fontsize)
    ax.set_xticks(range(n))
    ax.set_xticklabels(features, fontsize=AUC_fontsize, rotation='vertical')
    utils.save_fig(
        "/home/colin/evo/papers/thesis/figures/logreg/randomized_%s" % cohort)
    # pl.show()
    # break
    pl.clf()

for cohort in cohorts:
    in_file = "results/randomized_logistic_reg_features_%s_time_averaged.csv" % cohort
    # Drops the first row and the last column of the CSV
    data = np.genfromtxt(in_file, delimiter=",")[1:, :-1]
    n = len(data)
    # n ends up as the index of the last all-zero column found (no break)
    for i in range(n):
        if np.all(data[:, i] == 0):
            n = i
    data = data[:n, :n]
    # Scale every column by norm() of that column
    for col in range(n):
        data[:, col] /= norm(data[:, col])
import matplotlib.pyplot as plt

# Per-epoch metrics recorded during training
acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)

# One figure per metric:
# (training series, validation series, validation line style,
#  legend suffix, title suffix, y-axis label, output file)
curve_specs = [
    (loss, val_loss, 'r-', 'loss', 'loss', 'Loss', "imdb-loss.pdf"),
    (acc, val_acc, 'r', 'acc', 'accuracy', 'Accuracy', "imdb-acc.pdf"),
]
for (train_series, val_series, val_style, legend_suffix, title_suffix,
     y_label, out_file) in curve_specs:
    fig, ax = plt.subplots()
    plt.plot(epochs, train_series, 'bo', label='Training ' + legend_suffix)
    plt.plot(epochs, val_series, val_style,
             label='Validation ' + legend_suffix)
    plt.title('Training and validation ' + title_suffix)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    save_fig(out_file)
    plt.show()

# Now turn on early stopping
# https://chrisalbon.com/deep_learning/keras/neural_network_early_stopping/
# NOTE(review): fragment reconstructed from a whitespace-mangled source. The
# statements before the `for` presumably sit inside an enclosing loop that
# defines `ax`, `data` and `cohort`, which is not visible here; the fragment
# also ends in the middle of the inner loop body.

# Heatmap of `data` (transposed), colour scale fixed to [0, 1]
pl.imshow(np.transpose(data), interpolation="nearest", origin="lower", vmin=0, vmax=1)
ax.set_xlabel("Number of hidden support", fontsize=fontsize)
ax.set_ylabel("Lead", fontsize=fontsize)
# ax.set_title('HMM AUC: %s' % cohort, fontsize=fontsize)
ax.set_xticks(range(num_supports))
ax.set_yticks(range(num_leads))
ax.set_xticklabels(range(3, 30, 2), fontsize=med_fontsize)
# NOTE(review): num_leads ticks but only num_leads - 1 labels here — confirm
ax.set_yticklabels(range(1, num_leads), fontsize=med_fontsize)
cb = pl.colorbar()
for t in cb.ax.get_yticklabels():
    t.set_fontsize(med_fontsize)
# pl.show()
utils.save_fig("/home/colin/evo/papers/thesis/figures/hmm/%s" % cohort)

for cohort in cohorts:
    # plot mean AUC over support:
    benchmarks = {
        "no_collab": 0.775938784743,
        "wiki_only": 0.609065848058,
        "forum_and_wiki": 0.648563087051,
        "forum_only": 0.76697590925,
    }
    support_dict = cohort_dict[cohort]
    means = {}
    for support in range(3, 30, 2):
        data = support_dict[support]
# NOTE(review): fragment reconstructed from a whitespace-mangled source. The
# statements before the `for` presumably form the body of an enclosing
# per-cohort loop that is not visible here; loop extents are a best guess.

# Bar chart of the feature weights of the current cohort
n = len(feature_vectors[cohort])
ax = pl.gca()
pl.bar(range(n), feature_vectors[cohort], color = '0.75')
# pl.title('Randomized logistic Regression: %s' % cohort, fontsize=fontsize)
ax.set_xlabel("Feature number", fontsize=fontsize)
ax.set_ylabel("Average feature weight", fontsize=fontsize)
ax.set_xticks(range(n))
ax.set_xticklabels(features,fontsize=AUC_fontsize, rotation='vertical')
utils.save_fig("/home/colin/evo/papers/thesis/figures/logreg/randomized_%s" % cohort)
# pl.show()
# break
pl.clf()

for cohort in cohorts:
    in_file = "results/randomized_logistic_reg_features_%s_time_averaged.csv" % cohort
    # Drops the first row and the last column of the CSV
    data = np.genfromtxt(in_file, delimiter=",")[1:, :-1]
    n = len(data)
    # n ends up as the index of the last all-zero column found (no break)
    for i in range(n):
        if np.all(data[:,i]==0):
            n = i
    data = data[:n,:n]
    # Scale every column by norm() of that column
    for col in range(n):
        data[:, col] /= norm(data[:, col])
def plot_scatter(xs, ys, line_labels=None, xaxis_label=None, yaxis_label=None,
                 xticks=None, vlines=None, vlines_kwargs=None, hlines=None,
                 hlines_kwargs=None, x_sci=True, y_sci=True, y_lim=None,
                 legend_top=True, fig_size=None, ls_cycle=False, name=None):
    """Draw one or several scatter series on a new figure and save it.

    Args:
        xs, ys: a single series, or lists of series (both must have the same
            nesting; ``line_labels`` must then be a list as well).
        line_labels: label, or list of labels, for the legend.
        xaxis_label, yaxis_label: forwarded to ``utils.set_axis_labels``.
        xticks: list of tick labels placed at the x values of the first
            series.
        vlines, hlines: x / y positions of reference lines.
        vlines_kwargs, hlines_kwargs: keyword arguments for those lines;
            ``None`` means no extra arguments.
        x_sci, y_sci: forwarded to ``utils.set_sci_axis``.
        y_lim: ``[bottom]`` or ``[bottom, top]`` limits of the y axis.
        legend_top: forwarded to ``utils.set_legend``.
        fig_size: ``[width]`` or ``[width, height]`` of the figure.
        ls_cycle: accepted for signature parity with ``plot_line``; unused.
        name: forwarded to ``utils.save_fig``.
    """
    multiple_x = utils.is_list_of_list(xs)
    multiple_y = utils.is_list_of_list(ys)
    multiple_line_label = utils.is_list(line_labels)
    assert multiple_x == multiple_y == multiple_line_label

    fig, ax = plt.subplots()
    if fig_size and isinstance(fig_size, list) and len(fig_size) > 0:
        fig.set_figwidth(fig_size[0])
        if len(fig_size) > 1:
            fig.set_figheight(fig_size[1])

    colors = iter(cm.rainbow(np.linspace(0, 1, len(ys))))
    # `color=` applies one RGBA value to the whole series; passing a single
    # RGBA row as `c=` is ambiguous when a series has exactly four points.
    if multiple_x:
        for x, y, line_label in zip(xs, ys, line_labels):
            ax.scatter(x, y, label=line_label, color=next(colors))
    else:
        ax.scatter(xs, ys, label=line_labels, color=next(colors))

    if xticks and isinstance(xticks, list):
        x = xs[0] if multiple_x else xs
        plt.xticks(x, xticks)

    # bottom/top/left replace the ymin/ymax/xmin keywords, which current
    # matplotlib no longer accepts.
    if y_lim and isinstance(y_lim, list) and len(y_lim) > 0:
        plt.ylim(bottom=y_lim[0])
        if len(y_lim) > 1:
            plt.ylim(top=y_lim[1])
    plt.xlim(left=0)

    ncol = len(xs) if multiple_x else 1
    utils.set_legend(legend_top, ncol)
    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)

    # `**None` raises TypeError, so fall back to empty keyword dicts
    vlines_kwargs = vlines_kwargs or {}
    hlines_kwargs = hlines_kwargs or {}
    for xvline in vlines or []:
        with ALTERNATIVE_PALETTE:
            plt.axvline(x=xvline, **vlines_kwargs)
    for yhline in hlines or []:
        with ALTERNATIVE_PALETTE:
            plt.axhline(y=yhline, **hlines_kwargs)

    utils.finalize(ax)
    utils.save_fig(name)
# NOTE(review): fragment reconstructed from a whitespace-mangled source. The
# first four statements presumably form the body of an enclosing loop over
# (lead, lag, auc) records that is not visible here.

# Convert the 1-based lead/lag to 0-based indices and store the AUC
lead = int(float((lead))) -1
lag = int(float((lag)))-1
week= lead+lag
data[week,lag] = float(auc)

# Annotate the cells with week >= lag with the value truncated to two decimals
for week in range(n):
    for lag in range(n):
        if week >= lag:
            pl.text(week - .3, lag - .1,((int)(100*data[week][lag]))/100.0,fontsize=AUC_fontsize)

# Heatmap of `data` (transposed), colour scale fixed to [0, 1]
ax = pl.gca()
pl.imshow(np.transpose(data), interpolation='nearest',origin='lower', vmin=0, vmax=1, cmap='RdBu')
ax.set_xlabel("The predicted week number", fontsize=fontsize)
ax.set_ylabel("Lag", fontsize=fontsize)
# pl.title('Logistic Regression AUC: %s' % cohort, fontsize=fontsize)
ax.set_xticks(range(n))
ax.set_yticks(range(n))
# Tick labels are offset from the 0-based tick positions
ax.set_xticklabels(range(2,n+2),fontsize=med_fontsize)
ax.set_yticklabels(range(1,n+1),fontsize=med_fontsize)
cb = pl.colorbar()
for t in cb.ax.get_yticklabels():
    t.set_fontsize(med_fontsize)
utils.save_fig("/home/colin/evo/papers/thesis/figures/logreg/%s" % cohort)
# print cohort, np.nanmean(data)
# pl.show()
# break
def plot_line(xs, ys, line_labels=None, xaxis_label=None, yaxis_label=None,
              xticks=None, vlines=None, vlines_kwargs=None, hlines=None,
              hlines_kwargs=None, x_sci=True, y_sci=True, y_lim=None,
              x_lim=None, legend_top=True, ls_cycle=False, marker_size=0,
              x_grid=True, y_grid=True, fig_size=None, name=None,
              draw_arrow=False):
    """Draw one or several line series on a new figure and save it.

    Args:
        xs, ys: a single series, or lists of series (both must have the same
            nesting; ``line_labels`` must then be a list as well).
        line_labels: label, or list of labels, for the legend.
        xaxis_label, yaxis_label: forwarded to ``utils.set_axis_labels``.
        xticks: list of tick labels placed at the x values of the first
            series.
        vlines, hlines: x / y positions of reference lines.
        vlines_kwargs, hlines_kwargs: keyword arguments for those lines;
            ``None`` means no extra arguments.
        x_sci, y_sci: forwarded to ``utils.set_sci_axis``.
        y_lim: ``[bottom]`` or ``[bottom, top]`` limits of the y axis.
        x_lim: ``[left]`` or ``[left, right]`` limits of the x axis.
        legend_top: forwarded to ``utils.set_legend``.
        ls_cycle: forwarded to ``utils.get_line_styles_cycler``.
        marker_size: marker size; markers are cycled only when it is > 0.
            Line and marker styles are applied to multi-series plots only.
        x_grid, y_grid: forwarded to ``utils.finalize``.
        fig_size: ``[width]`` or ``[width, height]`` of the figure.
        name: forwarded to ``utils.save_fig``.
        draw_arrow: accepted but not used by this function.
    """
    multiple_x = utils.is_list_of_list(xs)
    multiple_y = utils.is_list_of_list(ys)
    multiple_line_label = utils.is_list(line_labels)
    assert multiple_x == multiple_y == multiple_line_label

    fig, ax = plt.subplots()
    if fig_size and isinstance(fig_size, list) and len(fig_size) > 0:
        fig.set_figwidth(fig_size[0])
        if len(fig_size) > 1:
            fig.set_figheight(fig_size[1])

    ls_cycler = utils.get_line_styles_cycler(ls_cycle)
    ms_cycler = utils.get_marker_styles_cycler(marker_size > 0)
    if multiple_x:
        for x, y, line_label in zip(xs, ys, line_labels):
            ax.plot(x, y, label=line_label, ls=next(ls_cycler),
                    marker=next(ms_cycler), markersize=marker_size)
    else:
        ax.plot(xs, ys, label=line_labels)

    if xticks and isinstance(xticks, list):
        x = xs[0] if multiple_x else xs
        plt.xticks(x, xticks)

    # bottom/top/left/right replace the ymin/ymax/xmin/xmax keywords, which
    # current matplotlib no longer accepts.
    if y_lim and isinstance(y_lim, list) and len(y_lim) > 0:
        plt.ylim(bottom=y_lim[0])
        if len(y_lim) > 1:
            plt.ylim(top=y_lim[1])
    if x_lim and isinstance(x_lim, list) and len(x_lim) > 0:
        plt.xlim(left=x_lim[0])
        if len(x_lim) > 1:
            plt.xlim(right=x_lim[1])

    ncol = len(xs) if multiple_x else 1
    utils.set_legend(legend_top, ncol)
    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)

    # `**None` raises TypeError, so fall back to empty keyword dicts
    vlines_kwargs = vlines_kwargs or {}
    hlines_kwargs = hlines_kwargs or {}
    for xvline in vlines or []:
        with ALTERNATIVE_PALETTE:
            plt.axvline(x=xvline, **vlines_kwargs)
    for yhline in hlines or []:
        with ALTERNATIVE_PALETTE:
            plt.axhline(y=yhline, **hlines_kwargs)

    utils.finalize(ax, x_grid=x_grid, y_grid=y_grid)
    utils.save_fig(name)
def visualize_feature_hist_dist(X,y,selected_feature,features,normalize=False):
    """
    Visualize the histogram distribution of a feature, grouped by class

    Draws, for every distinct (integer-cast) value of the feature, one bar for
    the rows whose target equals 1 ("Default") and one for all other rows
    ("Paid"), then saves the figure as img/<selected_feature>_hist_dist.png
    through save_fig. Nothing is returned.

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    selected_feature -- The desired feature to obtain the histogram
    features -- Column names for X followed by the target column name
                (the last entry is used as the target column)
    normalize -- Whether to normalize the histogram (divide each class's
                 counts by the number of rows of that class)
    """
    from collections import Counter

    #create dataframe
    joint_data=np.column_stack((X,y))
    df=pd.DataFrame(data=joint_data,columns=features)

    #find the unique values (groups); a sorted list works on Python 2 and 3
    unique_values=sorted(
        int(v) for v in pd.unique(df[[selected_feature]].values.ravel()))
    n_groups=len(unique_values)

    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.4

    #split the feature values by class
    is_positive = df[features[-1]] == 1
    positive_class_index=df[is_positive].index.tolist()
    negative_class_index=df[~is_positive].index.tolist()
    positive_values=[
        int(v) for v in
        df[[selected_feature]].loc[positive_class_index].values.ravel()]
    negative_values=[
        int(v) for v in
        df[[selected_feature]].loc[negative_class_index].values.ravel()]

    #count occurrences of each unique VALUE (the original counted the loop
    #index, which is only right when the values are exactly 0..n-1)
    positive_counter=Counter(positive_values)
    negative_counter=Counter(negative_values)
    positive_counts=[positive_counter[value] for value in unique_values]
    negative_counts=[negative_counter[value] for value in unique_values]

    #normalize data (divide by class total); float() avoids Python 2 integer
    #division, `or 1` avoids dividing by zero for an empty class
    if normalize:
        n_positive_labels=float(len(positive_values) or 1)
        n_negative_labels=float(len(negative_values) or 1)
        positive_counts=[c/n_positive_labels for c in positive_counts]
        negative_counts=[c/n_negative_labels for c in negative_counts]

    #plot
    plt.bar(index, positive_counts, bar_width,alpha=opacity,color='b',label='Default') #class 1
    plt.bar(index+bar_width, negative_counts, bar_width,alpha=opacity,color='r',label='Paid') #class 0
    plt.xlabel(selected_feature)
    plt.ylabel('Proportion' if normalize else 'Frequency')
    # The "Normalized " prefix is only shown when the data is normalized
    title_prefix = "Normalized " if normalize else ""
    plt.title(title_prefix+"Histogram Distribution of the feature '"+selected_feature+"' grouped by class")
    plt.xticks(index + bar_width, [str(value) for value in unique_values])
    plt.legend()
    plt.tight_layout()
    # plt.show()

    #save fig
    output_dir = "img"
    save_fig(output_dir,'{}/{}_hist_dist.png'.format(output_dir,selected_feature))