Example #1
def bagging_pasting(X_train, X_val, y_train, y_val, doplot=False):
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    tree_clf = DecisionTreeClassifier(random_state=42)
    tree_clf.fit(X_train, y_train)
    y_pred_tree = tree_clf.predict(X_val)
    print("DecisionTree classifier, accuracy score = %f\n" %
          accuracy_score(y_val, y_pred_tree))

    bag_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=500,
                                max_samples=100,
                                bootstrap=True,
                                n_jobs=1,
                                oob_score=True)
    bag_clf.fit(X_train, y_train)
    y_pred = bag_clf.predict(X_val)
    print("Bagging classifier, accuracy score = %f\n" %
          accuracy_score(y_val, y_pred))

    if doplot:
        from utils import plot_decision_boundary, save_fig
        plt.figure(figsize=(11, 4))
        plt.subplot(121)
        plot_decision_boundary(tree_clf, X, y)
        plt.title("Decision Tree", fontsize=14)
        plt.subplot(122)
        plot_decision_boundary(bag_clf, X, y)
        plt.title("Decision Trees with Bagging", fontsize=14)
        save_fig("DT_without_and_with_bagging_plot", CHAPTER_ID)
        plt.show()
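A minimal, self-contained sketch of the same single-tree vs. bagging comparison, assuming scikit-learn's two-moons toy data (the original example's dataset and plotting helpers are not shown here):

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Assumed toy data (two interleaving half-circles), not the original dataset.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

# Single tree vs. 500 bagged trees; the estimator is passed positionally so the
# call works with both the old (base_estimator) and new (estimator) keyword.
tree_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=True, oob_score=True,
                            random_state=42).fit(X_train, y_train)

print("DecisionTree accuracy:", accuracy_score(y_val, tree_clf.predict(X_val)))
print("Bagging accuracy:     ", accuracy_score(y_val, bag_clf.predict(X_val)))
print("Out-of-bag estimate:  ", bag_clf.oob_score_)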
Example #2
def visualize_k_fold_roc_plot(X, y_gold, classifier, K):
    """
	Visualizes K ROC curves created from K-fold cross validation and the mean ROC curve.

	Keyword arguments:
	X -- The feature vectors
	y_gold -- Expected labels (gold standard).
	classifier -- The classifier to be used
	K -- Number of folds to perform
	"""

    cross_validation = StratifiedKFold(y_gold, n_folds=K)

    mean_true_positive_rate = 0.0
    mean_false_positive_rate = 0.0

    for i, (train, test) in enumerate(cross_validation):
        #classify
        classifier.fit(X[train], y_gold[train])
        y_predicted = classifier.predict(X[test])

        #compute ROC
        false_positive_rate, true_positive_rate, thresholds = roc_curve(
            y_gold[test], y_predicted)
        roc_auc = auc(false_positive_rate, true_positive_rate)
        plt.plot(false_positive_rate,
                 true_positive_rate,
                 linewidth=1,
                 label='ROC fold %d (area = %0.2f)' % (i + 1, roc_auc))

        #save means
        mean_true_positive_rate += true_positive_rate
        mean_false_positive_rate += false_positive_rate

    #compute final mean
    mean_true_positive_rate /= len(cross_validation)
    mean_false_positive_rate /= len(cross_validation)
    mean_auc = auc(mean_false_positive_rate, mean_true_positive_rate)
    plt.plot(mean_false_positive_rate,
             mean_true_positive_rate,
             'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc,
             linewidth=2)

    plt.plot([0, 1], [0, 1],
             '--',
             color=(0.6, 0.6, 0.6),
             label='Random Classifier')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    # plt.show()

    #save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/roc.png'.format(output_dir))

    plt.close()
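The example above relies on the pre-0.18 scikit-learn iterator StratifiedKFold(y, n_folds=K) and sums the raw per-fold TPR/FPR arrays, which only works when every fold happens to return arrays of the same length. A hedged sketch with the current API, interpolating each fold's curve onto a common FPR grid before averaging (assumes binary labels and a classifier exposing predict_proba):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc

def k_fold_roc_sketch(X, y, classifier, K=5):
    # Interpolate every fold's ROC onto a shared FPR grid so curves of
    # different lengths can be averaged.
    cv = StratifiedKFold(n_splits=K)
    grid_fpr = np.linspace(0.0, 1.0, 101)
    mean_tpr = np.zeros_like(grid_fpr)

    for i, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], y[train])
        scores = classifier.predict_proba(X[test])[:, 1]
        fpr, tpr, _ = roc_curve(y[test], scores)
        plt.plot(fpr, tpr, linewidth=1,
                 label='ROC fold %d (area = %0.2f)' % (i + 1, auc(fpr, tpr)))
        mean_tpr += np.interp(grid_fpr, fpr, tpr)

    mean_tpr /= K
    plt.plot(grid_fpr, mean_tpr, 'k--', linewidth=2,
             label='Mean ROC (area = %0.2f)' % auc(grid_fpr, mean_tpr))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Classifier')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')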
def visualize_feature_boxplot(X,y,selected_feature,features):
	"""
	Visualize the boxplot of a feature

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	selected_feature -- The desired feature to obtain the histogram
	features -- Vector of feature names (X1 to XN)
	"""

	#create data
	joint_data=np.column_stack((X,y))
	column_names=features

	#create dataframe
	df=pd.DataFrame(data=joint_data,columns=column_names)

	# palette = sea.hls_palette()
	splot=sea.boxplot(data=df,x='Y',y=selected_feature,hue="Y",palette="husl")
	plt.title('BoxPlot Distribution of '+selected_feature)

	#save fig
	output_dir = "img"
	save_fig(output_dir,'{}/{}_boxplot.png'.format(output_dir,selected_feature))
def make_record_alg_cmp_bar(path, serv, operations, ylabel, stats):
    labels = list(stats.keys())
    xtickslabels = deepcopy(stats[next(iter(stats))]['keys'])

    if serv != 'conf' and serv != 'int':
        for i, val in enumerate(xtickslabels):
            xtickslabels[i] = settings.sec_str[int(val)]

    for op in operations:
        fig, ax = plt.subplots(1, 1, figsize=(30, 10))
        y = []
        yerr = []

        for key in stats:
            y.append(stats[key]['mean_' + ylabel + '_' + op])
            yerr.append(stats[key]['stddev_' + ylabel + '_' + op])

        ax = utils.multiple_custom_bar(y,
                                       yerr,
                                       ax,
                                       title=op + ' (mean)',
                                       labels=labels,
                                       xtickslabels=xtickslabels,
                                       xlabel='security strength (in bits)',
                                       ylabel=ylabel)
        utils.save_fig(
            fig, 'statistics/' + path + '/serv_' + serv + '_' + op + '_' +
            ylabel + '.png')
def visualize_lda2D(X,y):
	"""
	Visualize the separation between classes using the two most discriminant features

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	"""
	labels=['Paid','Default']
	lda = LDA(n_components = 2,solver='eigen')
	# lda = LDA(n_components = 2)
	discriminative_attributes = lda.fit(X, y).transform(X)

	palette = sea.color_palette()
	# plt.plot(discriminative_attributes[:,0][y==0],'sg',label="Paid", alpha=0.5)
	# plt.plot(discriminative_attributes[:,0][y==1],'^r',label="Default", alpha=0.5)
	plt.scatter(discriminative_attributes[:,0][y==0],discriminative_attributes[:,1][y==0],marker='s',color='green',label="Paid", alpha=0.5)
	plt.scatter(discriminative_attributes[:,0][y==1],discriminative_attributes[:,1][y==1],marker='^',color='red',label="Default", alpha=0.5)
	plt.xlabel('First Linear Discriminant')
	plt.ylabel('Second Linear Discriminant')

	leg = plt.legend(loc='upper right', fancybox=True)
	leg.get_frame().set_alpha(0.5)
	plt.title("Linear Discriminant Analysis")
	plt.tight_layout()

	#save fig
	output_dir='img'
	save_fig(output_dir,'{}/lda.png'.format(output_dir))
Example #6
def plot_hist(baseline_samples, target_samples, true_x, true_y):
    baseline_samples = baseline_samples.squeeze()
    target_samples = target_samples.squeeze()

    bmin, bmax = baseline_samples.min(), baseline_samples.max()

    ax = sns.kdeplot(baseline_samples, shade=True, color=(0.6, 0.1, 0.1, 0.2))
    ax = sns.kdeplot(target_samples, shade=True, color=(0.1, 0.1, 0.6, 0.2))
    ax.set_xlim(bmin, bmax)

    y0, y1 = ax.get_ylim()

    plt.plot([true_y, true_y], [0, y1 - (y1 - y0) * 0.01],
             linewidth=1,
             color='r')
    plt.title('Predictive' +
              (f' at {true_x:.2f}' if true_x is not None else ''))

    fig = plt.gcf()
    fig.set_size_inches(9, 9)
    # plt.tight_layout()  # pad=0.4, w_pad=0.5, h_pad=1.0)

    name = utils.DATA_DIR.replace('/', '-')
    # plt.tight_layout(pad=0.6)
    utils.save_fig('predictive-at-point-' + name)
def visualize_feature_boxplot(X, y, selected_feature, features):
    """
	Visualize the boxplot of a feature

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	selected_feature -- The desired feature to obtain the histogram
	features -- Vector of feature names (X1 to XN)
	"""

    #create data
    joint_data = np.column_stack((X, y))
    column_names = features

    #create dataframe
    df = pd.DataFrame(data=joint_data, columns=column_names)

    # palette = sea.hls_palette()
    splot = sea.boxplot(data=df,
                        x='Y',
                        y=selected_feature,
                        hue="Y",
                        palette="husl")
    plt.title('BoxPlot Distribution of ' + selected_feature)

    #save fig
    output_dir = "img"
    save_fig(output_dir, '{}/{}_boxplot.png'.format(output_dir,
                                                    selected_feature))
def callback(X_next, Y_next, i):
    global X_sample, Y_sample
    # Plot samples, surrogate function, noise-free objective and next sampling location
    #plt.subplot(n_iter, 2, 2 * i + 1)
    plt.figure()
    plot_approximation(gpr,
                       X,
                       Y,
                       X_sample,
                       Y_sample,
                       X_next,
                       show_legend=i == 0)
    plt.title(f'Iteration {i+1}')
    if save_figures: save_fig('bayes-opt-surrogate-{}.pdf'.format(i + 1))
    plt.show()

    plt.figure()
    #plt.subplot(n_iter, 2, 2 * i + 2)
    plot_acquisition(X,
                     expected_improvement(X, X_sample, Y_sample, gpr),
                     X_next,
                     show_legend=i == 0)
    if save_figures: save_fig('bayes-opt-acquisition-{}.pdf'.format(i + 1))
    plt.show()

    # Add sample to previous samples
    X_sample = np.append(X_sample, np.atleast_2d(X_next), axis=0)
    Y_sample = np.append(Y_sample, np.atleast_2d(Y_next), axis=0)
def visualize_hist_pairplot(X,y,selected_feature1,selected_feature2,features,diag_kind):
	"""
	Visualize the pairwise relationships (Histograms and Density Functions) between classes and respective attributes

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	selected_feature1 -- First feature
	selected_feature2 -- Second feature
	features -- Vector of feature names (X1 to XN)
	diag_kind -- Type of plot in the diagonal (Histogram or Density Function)
	"""

	#create data
	joint_data=np.column_stack((X,y))
	column_names=features

	#create dataframe
	df=pd.DataFrame(data=joint_data,columns=column_names)

	#plot
	palette = sea.hls_palette()
	splot=sea.pairplot(df, hue="Y", palette={0:palette[2],1:palette[0]},vars=[selected_feature1,selected_feature2],diag_kind=diag_kind)
	splot.fig.suptitle('Pairwise relationship: '+selected_feature1+" vs "+selected_feature2)
	splot.set(xticklabels=[])
	# plt.subplots_adjust(right=0.94, top=0.94)

	#save fig
	output_dir = "img"
	save_fig(output_dir,'{}/{}_{}_hist_pairplot.png'.format(output_dir,selected_feature1,selected_feature2))
def generate_histograms(start_feature, end_feature, features_base):
	num_weeks = 15
	num_features = end_feature - start_feature +1
	in_file = "data/%s.csv" % features_base

	feature_set = validate_csv(in_file)	

	start_time = time.time()
	data = np.genfromtxt(in_file, delimiter = ',', skip_header = 1)	
	print("loaded data in", time.time() - start_time, "seconds")

	pl.clf()
	dropout_vector = data[:, 1]
	for feature_index in range(start_feature, end_feature + 1):
		feature_distribution = data[:, feature_index]
		start_time = time.time()

		m1 = feature_distribution == -1 # remove default values
		masked = np.ma.masked_array(feature_distribution, m1)

		for x, value in enumerate(masked):
			if (x % num_weeks == 0 and  dropout_vector[x] == 0) or (x % num_weeks != 0 and dropout_vector[x - 1] == 0) : #remove values where the student was always dropped out or has already dropped out the prior week
				masked.mask[x] = True

		graph_distribution(masked.compressed(), feature_set[feature_index -1], feature_index - start_feature + 1, num_features)
		print("Ran Feature %s in" % (feature_set[feature_index -1]), time.time() - start_time, "seconds")
	pl.subplots_adjust(hspace=.5)
	pl.subplots_adjust(wspace=.5)
	# pl.show()
	utils.save_fig("/home/colin/evo/papers/thesis/figures/feature_distributions/%s_%s_%s" % (features_base, start_feature, end_feature))
def train(args, model, train_loader, optimizer, epoch_index):
    model.train()

    correct = 0
    epoch_train_loss = 0.0

    ## use a weight matrix to store the weight of the network which will be used for visualization
    weight_matrix = np.zeros(
        (args.image_fashion_mnist_width, args.image_fashion_mnist_height))

    for batch_idx, (data, target) in enumerate(train_loader):

        ## preparing the data fed into the neural network
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()

        ## obtain the output of the network
        if args.dataset_name == "cifar10":
            output = model(data)
        elif args.dataset_name == "fashion_mnist":

            # if visualizing the weights of the neural network
            if args.visual_flag:
                output, weight_ret = model(data)
                weight_matrix = weight_ret.detach().numpy()
            # not visualizing the weights of the neural network
            else:
                output = model(data)

        # calculate the prediction
        train_pred = torch.argmax(F.softmax(output, dim=1), dim=1).view(-1, )

        # count the correct prediction
        correct += train_pred.eq(target.data).sum()

        # The cross entropy loss is calculated, softmax function is embedded in F.cross_entropy() function
        loss = F.cross_entropy(output, target)
        epoch_train_loss += loss.item()

        # loss is used for back-propagation (BP) in MLP
        loss.backward()
        optimizer.step()

    epoch_train_accuracy = 100. * correct / len(train_loader.dataset)
    epoch_loss_mean = epoch_train_loss / len(train_loader)

    if epoch_index % args.log_interval == 0:
        print('Train Epoch: {}\tLoss: {:.6f} Accuracy: {:.2f}%'.format(
            epoch_index, epoch_loss_mean, epoch_train_accuracy))

    # visualize the weights of neural network for Fashion MNIST dataset
    if args.dataset_name == "fashion_mnist" and args.visual_flag and epoch_index % args.save_weight_interval == 0:
        os.makedirs(args.output_folder, exist_ok=True)
        save_fig(
            args, weight_matrix,
            os.path.join(args.output_folder,
                         "network_weights_" + str(epoch_index) + ".pdf"))

    return epoch_loss_mean, epoch_train_accuracy
def make_errorbar(ylabel, operations, file_path, stats, types, xlabel):
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    params = [{'color': 'red'}, {'color': 'blue'}]

    for i in range(len(axes)):
        axes[i] = utils.custom_errorbar(stats['keys'], stats[types[0] + '_' + ylabel + '_' + operations[i]],
                                    stats[types[1] + '_' + ylabel + '_' + operations[i]], axes[i], title=operations[i],
                                    xlabel=xlabel, ylabel=ylabel, kwargs=params[i])

    utils.save_fig(fig, file_path + ylabel + '_deviation.png')
def evaluate_config(X_test,
                    X_train,
                    Y_test,
                    Y_train,
                    acc_list,
                    e_loss,
                    p_loss,
                    model_orig,
                    inputs,
                    outputs,
                    model,
                    z_idx,
                    title,
                    fig_dir=None,
                    use_LIME=True):
    fig, (ax_1, ax_2) = plt.subplots(1, 2, figsize=(8, 4))
    ax = ax_1
    color = 'tab:orange'
    ax.plot(np.arange(len(e_loss)),
            e_loss,
            label="Explanation Loss",
            color=color)
    ax.set_ylabel("loss")
    ax2 = ax.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:green'
    ax2.plot(np.arange(len(acc_list)),
             np.array(acc_list)[:, 1],
             label="Accuracy",
             color=color)
    ax.set_xlabel("epochs")
    ax2.set_ylabel("accuracy")
    # second ax
    ax = ax_2
    color = 'tab:orange'
    ax.plot(np.arange(len(e_loss)), e_loss, color=color)
    ax.set_ylabel("explanation loss")
    ax2 = ax.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.plot(np.arange(len(p_loss)),
             p_loss,
             label="Categorical Cross-Entropy",
             color=color)
    ax2.set_ylabel("performance loss")
    ax.set_xlabel("epochs")
    fig.legend(loc='upper left')
    # ax2.legend()
    plt.tight_layout()
    if fig_dir is not None:
        save_fig(fig, "loss_cf", fig_dir)

    df, df_test = evaluate_config_2(X_test, X_train, Y_test, Y_train, fig_dir,
                                    inputs, model, model_orig, outputs, title,
                                    use_LIME, z_idx)

    return df, df_test
Example #14
def render(env):
    # to render an environment, you can use the mode 'human'
    #img = env.render(mode='human')
    # to get back the image in the form of a NumPy array, use mode='rgb_array'
    img = env.render(mode='rgb_array')

    plt.figure(figsize=(6, 8))
    plt.imshow(img)
    plt.axis('off')
    utils.save_fig(config.ENV_SHORT_NAME + '_step1')
    plt.show()
def make_plot(ylabel, operations, file_path, stats, types, xlabel):
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    params1 = {'color': 'red', 'linestyle': '-', 'label': operations[0]}
    params2 = {'color': 'blue', 'linestyle': '--', 'label': operations[1]}

    for ax, tp in zip(axes, types):
        ax = utils.custom_plots(stats['keys'], stats[tp + '_' + ylabel + '_' + operations[0]],
                            stats[tp + '_' + ylabel + '_' + operations[1]], ax,
                            title=tp.capitalize(), xlabel=xlabel, ylabel=ylabel, kwargs1=params1, kwargs2=params2)

    utils.save_fig(fig, file_path + ylabel + '_statistics.png')
Example #16
def make_record_alg_cmp_bar(path, operations, ylabel, stats, extra_labels, handshake=False):
    xtickslabels = []
    sec_lvl = []
    scale_type = ['linear', 'log']

    for key in stats[list(stats.keys())[0]]['keys']:
        val = key.split('_')

        if val[1] not in sec_lvl:
            sec_lvl.append(val[1])

    for key in stats:
        tmp = ''

        for id in extra_labels[key]:
            tmp += id

        if tmp == '':
            xtickslabels.append(key)
        else:
            xtickslabels.append(key + '\n(' + tmp + ')')

    for op in operations:
        for lvl in sec_lvl:
            y = {}

            for key in stats:
                for key in stats[key]['keys']:
                    elem = key.split('_')

                    if elem[0] not in y:
                        y[elem[0]] = []
            
            for alg in y:
                for key in stats:
                    try:
                        idx = stats[key]['keys'].index(alg + '_' + lvl)
                        y[alg].append(stats[key]['mean_' + ylabel + '_' + op][idx])
                    
                    except (ValueError, KeyError):
                        y[alg].append(0)

            # print('')
            # for a in y:
            #     print(f'{a}: {y[a]} : {len(y[a])}')

            for scale in scale_type:
                fig, ax = plt.subplots(1, 1, figsize=(30, 10))

                ax = utils.stacked_custom_bar(y, ax, handshake=handshake, title=op + ' (mean)', scale=scale,
                                            xlabel='algorithms', xtickslabels=xtickslabels, ylabel=ylabel)
                utils.save_fig(fig, 'statistics/' + path + '/serv_all_' + op +
                                            '_' + settings.sec_str[int(lvl)] + '_' + ylabel + '_' + scale + '.png')
def visualize_k_fold_precision_recall_plot(X,y_gold,classifier,K):
	"""
	Visualizes K Average Precision-Recall curves created from K-fold cross validation and the mean Precision-Recall curve.

	Keyword arguments:
	X -- The feature vectors
	y_gold -- Expected labels (gold standard).
	classifier -- The classifier to be used
	K -- Number of folds to perform
	"""

	cross_validation = StratifiedKFold(y_gold, n_folds=K)

	mean_precision = 0.0
	mean_recall= 0.0
	avg_mean_precision_recall=0.0

	for i, (train, test) in enumerate(cross_validation):
		#classify
		classifier.fit(X[train], y_gold[train])
		y_predicted=classifier.predict(X[test])

		#compute Precision-Recall
		precision, recall, thresholds = precision_recall_curve(y_gold[test].ravel(),y_predicted.ravel())
		average_precision = average_precision_score(y_gold[test], y_predicted)
		plt.plot(recall, precision,label='Precision-recall fold %d (area = %0.2f)' % (i+1, average_precision) )

		#save means
		mean_precision += precision
		mean_recall += recall
		avg_mean_precision_recall+=average_precision

	#compute final mean
	mean_precision /= len(cross_validation)
	mean_recall /= len(cross_validation)
	avg_mean_precision_recall /= len(cross_validation)
	plt.plot(mean_recall,mean_precision, 'k--',label='Mean Precision-Recall (area = %0.2f)' % avg_mean_precision_recall, linewidth=2)

	plt.plot([0,1], [0.5, 0.5], '--', color=(0.6, 0.6, 0.6), label='Random Classifier')

	plt.xlim([0.0, 1.0])
	plt.ylim([0.0, 1.05])
	plt.xlabel('Recall')
	plt.ylabel('Precision')
	plt.title('Precision-Recall Curve')
	plt.legend(loc="lower left")
	# plt.show()

	#save fig
	output_dir='img'
	save_fig(output_dir,'{}/pr_curve.png'.format(output_dir))

	plt.close()
Example #18
def plot_predictive_comparison(env, baseline_samples, target_samples, stddev_mult=3., target_metrics=None,
                               title_name=None):
    # single var regression only
    baseline_samples = baseline_samples.squeeze()
    target_samples = target_samples.squeeze()

    train_x, train_y = env.get_train_x(), env.get_train_y()
    test_x, test_y = env.get_test_x(), env.get_test_y()

    pad_width = test_x.shape[0] - train_x.shape[0]
    train_x_padded = np.pad(train_x[:, 0], (0, pad_width), 'constant', constant_values=np.nan)
    train_y_padded = np.pad(train_y[:, 0], (0, pad_width), 'constant', constant_values=np.nan)

    df = pd.DataFrame.from_dict({
        'time': test_x[:, 0],
        'true_y': test_y[:, 0],
        'train_x': train_x_padded,
        'train_y': train_y_padded,
        'mean': target_samples.mean(axis=0),
        'std': stddev_mult * target_samples.std(axis=0),
        'base_mean': baseline_samples.mean(axis=0),
        'base_std': stddev_mult * baseline_samples.std(axis=0),
    }).reset_index()

    g = sns.FacetGrid(df, size=9, aspect=1.8)

    g.map(plt.errorbar, 'time', 'base_mean', 'base_std', color=(0.7, 0.1, 0.1, 0.09))
    g.map(plt.errorbar, 'time', 'mean', 'std', color=(0.1, 0.1, 0.7, 0.09))
    g.map(plt.plot, 'time', 'mean', color='b', lw=1)
    g.map(plt.plot, 'time', 'true_y', color='r', lw=1)
    g.map(plt.scatter, 'train_x', 'train_y', color='g', s=20)

    ax = g.ax
    ax.set_title('Posterior Predictive Distribution' + ((': ' + title_name) if title_name is not None else ''))
    ax.set(xlabel='X', ylabel='Y')
    ax.set_xlim(env.view_xrange[0], env.view_xrange[1])
    ax.set_ylim(env.view_yrange[0], env.view_yrange[1])

    legend = ['Prediction mean', 'True f(x)', 'Training data', 'True StdDev', 'Predicted StdDev']
    plt.legend(legend)

    if target_metrics is not None:
        offset = 0
        for tm, tv in target_metrics.items():
            ax.annotate('{}: {:.02f}'.format(tm, tv), xy=(0.08, 0.92 - offset), xytext=(0.08, 0.92 - offset),
                        xycoords='figure fraction', textcoords='figure fraction')
            offset += 0.04

    name = utils.DATA_DIR.replace('/', '-')
    plt.tight_layout(pad=0.6)
    utils.save_fig('predictive-distribution-' + name)
Example #19
def plot_predictive_baseline(env, samples, stddev_mult=3., title_name=None):
    # single var regression only
    samples = samples.squeeze()

    train_x, train_y = env.get_train_x(), env.get_train_y()
    test_x, test_y = env.get_test_x(), env.get_test_y()

    pad_width = test_x.shape[0] - train_x.shape[0]
    train_x_padded = np.pad(train_x[:, 0], (0, pad_width),
                            'constant',
                            constant_values=np.nan)
    train_y_padded = np.pad(train_y[:, 0], (0, pad_width),
                            'constant',
                            constant_values=np.nan)

    data = samples

    df = pd.DataFrame.from_dict({
        'time': test_x[:, 0],
        'true_y': test_y[:, 0],
        'train_x': train_x_padded,
        'train_y': train_y_padded,
        'mean': data.mean(axis=0),
        'std': stddev_mult * data.std(axis=0),
        # 'stdn': 2. * (data.std(axis=0) + .5 ** .5),
    }).reset_index()

    g = sns.FacetGrid(df, size=9, aspect=1.8)

    g.map(plt.errorbar, 'time', 'mean', 'std', color=(0.7, 0.1, 0.1, 0.09))
    g.map(plt.plot, 'time', 'mean', color='b', lw=1)
    g.map(plt.plot, 'time', 'true_y', color='r', lw=1)
    g.map(plt.scatter, 'train_x', 'train_y', color='g', s=20)

    ax = g.ax
    ax.set_title('Posterior Predictive Distribution' +
                 ((': ' + title_name) if title_name is not None else ''))
    ax.set(xlabel='X', ylabel='Y')
    ax.set_xlim(env.view_xrange[0], env.view_xrange[1])
    ax.set_ylim(env.view_yrange[0], env.view_yrange[1])

    legend = ['Prediction mean', 'True f(x)', 'Training data', 'StdDev']
    plt.legend(legend)

    # ax.annotate("MSE: {:.03f}".format(0), xy=(0.1, 0.9), xytext=(0.1, 0.9), xycoords='figure fraction',
    #             textcoords='figure fraction')

    name = utils.DATA_DIR.replace('/', '-')
    plt.tight_layout(pad=0.6)
    utils.save_fig('predictive-distribution-' + name)
def visualize_k_fold_roc_plot(X,y_gold,classifier,K):
	"""
	Visualizes K ROC curves created from K-fold cross validation and the mean ROC curve.

	Keyword arguments:
	X -- The feature vectors
	y_gold -- Expected labels (gold standard).
	classifier -- The classifier to be used
	K -- Number of folds to perform
	"""

	cross_validation = StratifiedKFold(y_gold, n_folds=K)

	mean_true_positive_rate = 0.0
	mean_false_positive_rate = 0.0

	for i, (train, test) in enumerate(cross_validation):
		#classify
		classifier.fit(X[train], y_gold[train])
		y_predicted=classifier.predict(X[test])

		#compute ROC
		false_positive_rate, true_positive_rate, thresholds = roc_curve(y_gold[test], y_predicted)
		roc_auc = auc(false_positive_rate, true_positive_rate)
		plt.plot(false_positive_rate, true_positive_rate, linewidth=1, label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))

		#save means
		mean_true_positive_rate += true_positive_rate
		mean_false_positive_rate += false_positive_rate

	#compute final mean
	mean_true_positive_rate /= len(cross_validation)
	mean_false_positive_rate /= len(cross_validation)
	mean_auc = auc(mean_false_positive_rate, mean_true_positive_rate)
	plt.plot(mean_false_positive_rate, mean_true_positive_rate, 'k--',label='Mean ROC (area = %0.2f)' % mean_auc, linewidth=2)


	plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random Classifier')

	plt.xlabel('False Positive Rate')
	plt.ylabel('True Positive Rate')
	plt.title('Receiver Operating Characteristic (ROC) Curve')
	plt.legend(loc="lower right")
	# plt.show()

	#save fig
	output_dir='img'
	save_fig(output_dir,'{}/roc.png'.format(output_dir))

	plt.close()
Example #21
def plot_box(xs,
             ys,
             data,
             xaxis_label=None,
             yaxis_label=None,
             x_sci=False,
             y_sci=True,
             name=None):
    fig, ax = plt.subplots()
    ax = sns.boxplot(x=xs, y=ys, data=data, linewidth=1, fliersize=6)
    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)
    utils.finalize(ax)
    utils.save_fig(name)
Example #22
def plot_bar(x, y, hue=None, hue_count=1,
             line_values=None, line_label=None, legend=True,
             xaxis_label=None, yaxis_label=None,
             xticks=None, y_lim=None,
             x_sci=True, y_sci=True,
             fig_size=None,
             y_err=None,
             name=None):
    fig, ax = plt.subplots()
    if fig_size and isinstance(fig_size, list) and len(fig_size) > 0:
        if len(fig_size) == 1:
            fig.set_figwidth(fig_size[0])
        else:
            fig.set_figwidth(fig_size[0])
            fig.set_figheight(fig_size[1])

    bar_width = 0.2
    new_x = [x_ + bar_width for x_ in x]
    ticks_x = [x_ + 0.5*bar_width for x_ in new_x]
    if y_err:
        ax.errorbar(ticks_x, y, yerr=y_err, fmt='o', ecolor='r', capthick=1, elinewidth=1)
    plt.bar(new_x, y, width=bar_width)

    if line_values and utils.is_list(line_values):
        x_set = set(x)
        if len(line_values) == len(x_set):
            if line_label:
                ax.plot(ax.get_xticks(), line_values, label=line_label)
                hue_count += 1
            else:
                ax.plot(ax.get_xticks(), line_values)
    if xticks and isinstance(xticks, list):
        plt.xticks(ticks_x, xticks)
    if y_lim and isinstance(y_lim, list) and len(y_lim) > 0:
        if len(y_lim) == 1:
            plt.ylim(ymin=y_lim[0])
        else:
            plt.ylim(ymin=y_lim[0])
            plt.ylim(ymax=y_lim[1])
    if legend:
        utils.set_legend(ncol=hue_count)
    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)
    utils.finalize(ax, x_grid=False)
    utils.save_fig(name)
def make_scatter(ylabel, operations, file_path, data, xlabel):
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    x = {}
    y = {}
    xtickslabels = list(data.keys())
    kwargs = [{'color': 'red'}, {'color': 'blue'}]

    for op in operations:
        x[op] = []
        y[op] = []

        for key, i in zip(data, range(len(xtickslabels))):
            x[op] += [i for j in range(len(data[key][ylabel + '_' + op]))]
            y[op] += data[key][ylabel + '_' + op]
        
    for i in range(len(axes)):
        axes[i] = utils.custom_scatter(x[operations[i]], y[operations[i]], axes[i], title=operations[i],
                                    xtickslabels=xtickslabels, xlabel=xlabel, ylabel=ylabel, kwargs=kwargs[i])
        utils.save_fig(fig, file_path + ylabel + '_distribution.png')
def feature_cdf(X, y, selected_feature):
    """
	Plot the empirical/standard cumulative density function of the given feature

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	selected_feature -- The desired feature to obtain the histogram
	"""

    #Standard Normal Cumulative Density Function
    N = len(X)
    Normal = np.random.normal(size=N)
    histogram, bin_edges = np.histogram(Normal, bins=N, density=True)
    dx = bin_edges[1] - bin_edges[0]
    G = np.cumsum(histogram) * dx

    #Empirical Cumulative Density Functions
    feature_index = int(selected_feature[1:]) - 1
    X_k = np.sort(X[:, feature_index])  #feature vector sorted
    ECDF_k = np.array(range(N)) / float(
        N)  #Empirical Cumulative Function F, steps of 1/N

    #Kolmogorov-Smirnov Test
    result = kolmogorov_smirnov_two_sample_test(G, ECDF_k)
    ks_statistic = result[0]
    p_value = result[1]

    plt.plot(bin_edges[1:],
             G,
             label="Standard Normal Cumulative Density Function")
    plt.plot(X_k, ECDF_k, label="Empirical Cumulative Density Function")
    plt.suptitle("Empirical vs Standard Normal Cumulative Distribution of " +
                 selected_feature + " Feature\nKolmogorov-Smirnov Statistic=" +
                 str(ks_statistic))
    plt.xlabel(selected_feature)
    plt.legend(loc='center right')

    # plt.show()

    #save fig
    output_dir = "img"
    save_fig(output_dir, '{}/{}_cdf.png'.format(output_dir, selected_feature))
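The helper above approximates the standard normal CDF by binning random draws and then calls a custom kolmogorov_smirnov_two_sample_test. A shorter, hedged alternative is to test the feature column directly with scipy.stats.kstest (here against a normal fitted with the sample's own mean and standard deviation, which is an assumption about the intended null hypothesis):

import numpy as np
from scipy import stats

def feature_normality_ks(X, feature_index):
    # Sort is not required by kstest but mirrors the example above.
    x = np.sort(X[:, feature_index])
    # One-sample KS test against N(mean, std) estimated from the sample itself.
    ks_statistic, p_value = stats.kstest(x, 'norm', args=(x.mean(), x.std()))
    return ks_statistic, p_value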
def visualize_hist_pairplot(X, y, selected_feature1, selected_feature2,
                            features, diag_kind):
    """
	Visualize the pairwise relationships (Histograms and Density Functions) between classes and respective attributes

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	selected_feature1 -- First feature
	selected_feature2 -- Second feature
	features -- Vector of feature names (X1 to XN)
	diag_kind -- Type of plot in the diagonal (Histogram or Density Function)
	"""

    #create data
    joint_data = np.column_stack((X, y))
    column_names = features

    #create dataframe
    df = pd.DataFrame(data=joint_data, columns=column_names)

    #plot
    palette = sea.hls_palette()
    splot = sea.pairplot(df,
                         hue="Y",
                         palette={
                             0: palette[2],
                             1: palette[0]
                         },
                         vars=[selected_feature1, selected_feature2],
                         diag_kind=diag_kind)
    splot.fig.suptitle('Pairwise relationship: ' + selected_feature1 + " vs " +
                       selected_feature2)
    splot.set(xticklabels=[])
    # plt.subplots_adjust(right=0.94, top=0.94)

    #save fig
    output_dir = "img"
    save_fig(
        output_dir,
        '{}/{}_{}_hist_pairplot.png'.format(output_dir, selected_feature1,
                                            selected_feature2))
Example #26
def plot_two_bars(ys, labels,
                  legend=True,
                  xaxis_label=None,
                  yaxis_label=None,
                  xticks=None, y_lim=None,
                  x_sci=True, y_sci=True,
                  fig_size=None,
                  name=None):
    fig, ax = plt.subplots()
    if fig_size and isinstance(fig_size, list) and len(fig_size) > 0:
        if len(fig_size) == 1:
            fig.set_figwidth(fig_size[0])
        else:
            fig.set_figwidth(fig_size[0])
            fig.set_figheight(fig_size[1])

    bar_width = 0.2
    x = range(len(ys))
    new_x1 = [x_ for x_ in x]
    new_x2 = [x_ + bar_width for x_ in x]
    rects1 = ax.bar(new_x1, ys[0], width=bar_width, color='b')
    rects2 = ax.bar(new_x2, ys[1], width=bar_width)

    if xticks and isinstance(xticks, list):
        plt.xticks([x_ + bar_width for x_ in new_x1], xticks)
    if y_lim and isinstance(y_lim, list) and len(y_lim) > 0:
        if len(y_lim) == 1:
            plt.ylim(ymin=y_lim[0])
        else:
            plt.ylim(ymin=y_lim[0])
            plt.ylim(ymax=y_lim[1])
    if legend:
        ax.legend((rects1[0], rects2[0]), (labels[0], labels[1]),
                  fontsize=28, frameon=False,
                  bbox_to_anchor=(0, 0.95, 1.2, .10), handlelength=0.5, handletextpad=0.2,
                  loc=3, ncol=2, mode="expand",
                  borderaxespad=0.)
    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)
    utils.finalize(ax, x_grid=False)
    utils.save_fig(name)
def visualize_pca2D(X, y):
    """
	Visualize the first two principal components

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	"""
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X)

    palette = sea.color_palette()
    plt.scatter(principal_components[y == 0, 0],
                principal_components[y == 0, 1],
                marker='s',
                color='green',
                label="Paid",
                alpha=0.5,
                edgecolor='#262626',
                facecolor=palette[1],
                linewidth=0.15)
    plt.scatter(principal_components[y == 1, 0],
                principal_components[y == 1, 1],
                marker='^',
                color='red',
                label="Default",
                alpha=0.5,
                edgecolor='#262626',
                facecolor=palette[2],
                linewidth=0.15)

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title("Two-Dimensional Principal Component Analysis")
    plt.tight_layout()

    #save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/pca2D.png'.format(output_dir))
def visualize_lda2D(X, y):
    """
	Visualize the separation between classes using the two most discriminant features

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	"""
    labels = ['Paid', 'Default']
    lda = LDA(n_components=2, solver='eigen')
    # lda = LDA(n_components = 2)
    discriminative_attributes = lda.fit(X, y).transform(X)

    palette = sea.color_palette()
    # plt.plot(discriminative_attributes[:,0][y==0],'sg',label="Paid", alpha=0.5)
    # plt.plot(discriminative_attributes[:,0][y==1],'^r',label="Default", alpha=0.5)
    plt.scatter(discriminative_attributes[:, 0][y == 0],
                discriminative_attributes[:, 1][y == 0],
                marker='s',
                color='green',
                label="Paid",
                alpha=0.5)
    plt.scatter(discriminative_attributes[:, 0][y == 1],
                discriminative_attributes[:, 1][y == 1],
                marker='^',
                color='red',
                label="Default",
                alpha=0.5)
    plt.xlabel('First Linear Discriminant')
    plt.ylabel('Second Linear Discriminant')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title("Linear Discriminant Analysis")
    plt.tight_layout()

    #save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/lda.png'.format(output_dir))
def feature_cdf(X,y,selected_feature):
	"""
	Plot the empirical/standard cumulative density function of the given feature

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	selected_feature -- The desired feature to obtain the histogram
	"""

	#Standard Normal Cumulative Density Function
	N = len(X)
	Normal = np.random.normal(size = N)
	histogram,bin_edges = np.histogram(Normal, bins = N, density = True )
	dx = bin_edges[1] - bin_edges[0]
	G = np.cumsum(histogram)*dx

	#Empirical Cumulative Density Functions
	feature_index=int(selected_feature[1:])-1
	X_k = np.sort(X[:,feature_index])					#feature vector sorted
	ECDF_k = np.array(range(N))/float(N)				#Empirical Cumulative Function F, steps of 1/N

	#Kolmogorov-Smirnov Test
	result=kolmogorov_smirnov_two_sample_test(G,ECDF_k)
	ks_statistic=result[0]
	p_value=result[1]

	plt.plot(bin_edges[1:], G, label="Standard Normal Cumulative Density Function")
	plt.plot(X_k, ECDF_k,label="Empirical Cumulative Density Function")
	plt.suptitle("Empirical vs Standard Normal Cumulative Distribution of "+selected_feature+" Feature\nKolmogorov-Smirnov Statistic="+str(ks_statistic))
	plt.xlabel(selected_feature)
	plt.legend(loc='center right')

	# plt.show()

	#save fig
	output_dir = "img"
	save_fig(output_dir,'{}/{}_cdf.png'.format(output_dir,selected_feature))
def visualize_pca2D(X,y):
	"""
	Visualize the first two principal components

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	"""
	pca = PCA(n_components = 2)
	principal_components = pca.fit_transform(X)

	palette = sea.color_palette()
	plt.scatter(principal_components[y==0, 0], principal_components[y==0, 1], marker='s',color='green',label="Paid", alpha=0.5,edgecolor='#262626', facecolor=palette[1], linewidth=0.15)
	plt.scatter(principal_components[y==1, 0], principal_components[y==1, 1], marker='^',color='red',label="Default", alpha=0.5,edgecolor='#262626', facecolor=palette[2], linewidth=0.15)

	leg = plt.legend(loc='upper right', fancybox=True)
	leg.get_frame().set_alpha(0.5)
	plt.title("Two-Dimensional Principal Component Analysis")
	plt.tight_layout()

	#save fig
	output_dir='img'
	save_fig(output_dir,'{}/pca2D.png'.format(output_dir))
Example #31
def adaboost(X_train, X_val, y_train, y_val, doplot=False):
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    ada_clf = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=1),  # weak classifier
        n_estimators=200,
        algorithm='SAMME.R',
        learning_rate=0.5)
    ada_clf.fit(X_train, y_train)
    y_pred = ada_clf.predict(X_val)
    print("Adaboost classifier, accuracy score = %f\n" %
          accuracy_score(y_val, y_pred))

    if doplot:
        from utils import plot_decision_boundary, save_fig
        plt.figure(figsize=(11, 4))
        plot_decision_boundary(ada_clf, X, y)
        plt.title("Adaboost classification with Decision Tree base estimator",
                  fontsize=12)
        save_fig("AdaBoost_with_DT", CHAPTER_ID)
        plt.show()
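A minimal, self-contained sketch of the same AdaBoost-with-stumps setup, again assuming the two-moons toy data rather than the original (unshown) dataset:

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Assumed toy data, not necessarily what the original example was run on.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

# Depth-1 trees (decision stumps) as the weak learner, passed positionally so
# the call works with both the old and new keyword name for the base model.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             n_estimators=200, learning_rate=0.5,
                             random_state=42)
ada_clf.fit(X_train, y_train)
print("AdaBoost accuracy:", accuracy_score(y_val, ada_clf.predict(X_val)))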
Example #32
def generate_histograms(start_feature, end_feature, features_base):
    num_weeks = 15
    num_features = end_feature - start_feature + 1
    in_file = "data/%s.csv" % features_base

    feature_set = validate_csv(in_file)

    start_time = time.time()
    data = np.genfromtxt(in_file, delimiter=',', skip_header=1)
    print("loaded data in", time.time() - start_time, "seconds")

    pl.clf()
    dropout_vector = data[:, 1]
    for feature_index in range(start_feature, end_feature + 1):
        feature_distribution = data[:, feature_index]
        start_time = time.time()

        m1 = feature_distribution == -1  # remove default values
        masked = np.ma.masked_array(feature_distribution, m1)

        for x, value in enumerate(masked):
            if (x % num_weeks == 0 and dropout_vector[x] == 0) or (
                    x % num_weeks != 0 and dropout_vector[x - 1] == 0
            ):  #remove values where the student was always dropped out or has already dropped out the prior week
                masked.mask[x] = True

        graph_distribution(masked.compressed(), feature_set[feature_index - 1],
                           feature_index - start_feature + 1, num_features)
        print("Ran Feature %s in" % (feature_set[feature_index - 1]),
              time.time() - start_time, "seconds")
    pl.subplots_adjust(hspace=.5)
    pl.subplots_adjust(wspace=.5)
    # pl.show()
    utils.save_fig(
        "/home/colin/evo/papers/thesis/figures/feature_distributions/%s_%s_%s"
        % (features_base, start_feature, end_feature))
Example #33
    def save_figure(self, save_dir_path):
        if not check_is_dir(save_dir_path):
            raise IsADirectoryError(r'please give a correct dir_path')
        save_dir_path = Path(save_dir_path)

        if not self._flag_univariate_analysis:
            raise PermissionError("save_figure can only be executed after univariate_analysis has finished")

        save_fig(self._plot_distribute, save_dir_path, type_str='distribute')
        save_fig(self._plot_distribute_target, save_dir_path, type_str='distribute_target')
        save_fig(self._plot_ar, save_dir_path, type_str='ar')
def enet_plot(l1_ratio):
    """Function plotting enet_path for some tuning parameter."""
    _, theta_enet, _ = linear_model.enet_path(A,
                                              b,
                                              alphas=alphas,
                                              fit_intercept=False,
                                              l1_ratio=l1_ratio,
                                              return_models=False)
    fig1 = plt.figure(figsize=(12, 8))
    ax1 = fig1.add_subplot(111)
    ax1.plot(alphas, np.transpose(theta_enet), linewidth=3)
    ax1.set_xscale('log')
    ax1.set_xlabel(r"$\lambda$")
    ax1.set_ylabel("Coefficient value")
    ax1.set_ylim([-1, 5])
    plt.savefig(save_fig(path, "enet_coeffs", "pdf"))
    plt.show()
    return theta_enet
Example #35
def plot_curves(ytrue, yscores, name):
    precisions, recalls, thresholds = precision_recall_curve(ytrue, yscores)
    ap = average_precision_score(ytrue, yscores)
    plt.figure()
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    save_fig("{}-PR-vs-thresh.pdf".format(name))
    plt.figure()
    plot_precision_vs_recall(precisions, recalls, ap)
    save_fig("{}-PR.pdf".format(name))
    fpr, tpr, thresholds = roc_curve(ytrue, yscores)
    auc = roc_auc_score(ytrue, yscores)
    plt.figure()
    plot_roc_curve(fpr, tpr, auc)
    save_fig("{}-ROC.pdf".format(name))
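plot_curves expects ground-truth labels plus continuous scores. A hypothetical usage sketch (the dataset and classifier below are placeholders, and the plot_* helpers called inside plot_curves are assumed to be defined as in this module):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Placeholder imbalanced data and model, only to illustrate the call.
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
clf = LogisticRegression(max_iter=1000)
# Out-of-sample decision scores, so the curves are not optimistically biased.
yscores = cross_val_predict(clf, X, y, cv=5, method="decision_function")
plot_curves(y, yscores, name="logreg")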
Example #36
df.hist()
plt.show()

# scatter plot of response vs each feature
nrows = 3
ncols = 4
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, sharey=True, figsize=[15, 10])
plt.tight_layout()
plt.clf()
for i in range(0, 12):
    plt.subplot(nrows, ncols, i + 1)
    plt.scatter(X[:, i], y)
    plt.xlabel(boston.feature_names[i])
    plt.ylabel("house price")
    plt.grid()
save_fig("boston-housing-scatter")
plt.show()

# Rescale input data

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

scaler = sklearn.preprocessing.StandardScaler()
scaler = scaler.fit(X_train)
Xscaled = scaler.transform(X_train)
# equivalent to Xscaled = scaler.fit_transform(X_train)

# Fit model
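The fragment stops right after the "# Fit model" comment; a hedged continuation using ordinary least squares on the scaled features (an assumption about what was intended, reusing the variables defined above):

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit OLS on the scaled training features (sketch, not the original code).
linreg = LinearRegression().fit(Xscaled, y_train)
# Reuse the scaler that was fit on the training split.
Xtest_scaled = scaler.transform(X_test)
test_mse = mean_squared_error(y_test, linreg.predict(Xtest_scaled))
print("test MSE:", test_mse)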
    def train(self, num_epoch):
        for epoch in range(num_epoch):
            if (self.resample):
                train_dl_iter = iter(self.train_dl)
            for i, data in enumerate(tqdm(self.train_dl)):
                # (1) : minimizes mean((D(x) - mean(D(G(z))) - 1)**2) + mean((D(G(z)) - mean(D(x)) + 1)**2)
                self.netD.zero_grad()
                real_images = data[0].to(self.device)
                bs = real_images.size(0)
                # real labels (bs)
                real_label = torch.full((bs, ),
                                        self.real_label,
                                        device=self.device)
                # fake labels (bs)
                fake_label = torch.full((bs, ),
                                        self.fake_label,
                                        device=self.device)
                # noise (bs, nz, 1, 1), fake images (bs, cn, 64, 64)
                noise = generate_noise(bs, self.nz, self.device)
                fake_images = self.netG(noise)
                # calculate the discriminator results for both real & fake
                c_xr = self.netD(real_images)  # (bs, 1, 1, 1)
                c_xr = c_xr.view(-1)  # (bs)
                c_xf = self.netD(fake_images.detach())  # (bs, 1, 1, 1)
                c_xf = c_xf.view(-1)  # (bs)
                # calculate the Discriminator loss
                errD = (torch.mean(
                    (c_xr - torch.mean(c_xf) - real_label)**2) + torch.mean(
                        (c_xf - torch.mean(c_xr) + real_label)**2)) / 2.0
                errD.backward()
                # update D using the gradients calculated previously
                self.optimizerD.step()

                # (2) : minimizes mean((D(G(z)) - mean(D(x)) - 1)**2) + mean((D(x) - mean(D(G(z))) + 1)**2)
                self.netG.zero_grad()
                if (self.resample):
                    real_images = next(train_dl_iter)[0].to(self.device)
                    noise = generate_noise(bs, self.nz, self.device)
                    fake_images = self.netG(noise)
                # we updated the discriminator once, therefore recalculate c_xr, c_xf
                c_xr = self.netD(real_images)  # (bs, 1, 1, 1)
                c_xr = c_xr.view(-1)  # (bs)
                c_xf = self.netD(fake_images)  # (bs, 1, 1, 1)
                c_xf = c_xf.view(-1)  # (bs)
                # calculate the Generator loss
                errG = (torch.mean(
                    (c_xf - torch.mean(c_xr) - real_label)**2) + torch.mean(
                        (c_xr - torch.mean(c_xf) + real_label)**2)) / 2.0
                errG.backward()
                # update G using the gradients calculated previously
                self.optimizerG.step()

                self.errD_records.append(float(errD))
                self.errG_records.append(float(errG))

                if (i % self.loss_interval == 0):
                    print('[%d/%d] [%d/%d] errD : %.4f, errG : %.4f' %
                          (epoch + 1, num_epoch, i + 1,
                           self.train_iteration_per_epoch, errD, errG))

                if (i % self.image_interval == 0):
                    if (self.special == None):
                        sample_images_list = get_sample_images_list(
                            'Unsupervised', (self.fixed_noise, self.netG))
                        plot_fig = plot_multiple_images(
                            sample_images_list, 4, 4)
                        cur_file_name = os.path.join(
                            self.save_img_dir,
                            str(self.save_cnt) + ' : ' + str(epoch) + '-' +
                            str(i) + '.jpg')
                        self.save_cnt += 1
                        save_fig(cur_file_name, plot_fig)
                        plot_fig.clf()

                    elif (self.special == 'Wave'):
                        sample_audios_list = get_sample_images_list(
                            'Unsupervised_Audio',
                            (self.fixed_noise, self.netG))
                        plot_fig = plot_multiple_spectrograms(
                            sample_audios_list, 4, 4, freq=16000)
                        cur_file_name = os.path.join(
                            self.save_img_dir,
                            str(self.save_cnt) + ' : ' + str(epoch) + '-' +
                            str(i) + '.jpg')
                        self.save_cnt += 1
                        save_fig(cur_file_name, plot_fig)
                        plot_fig.clf()

                if (self.snapshot_interval is not None):
                    if (i % self.snapshot_interval == 0):
                        save(
                            os.path.join(
                                self.save_snapshot_dir, 'Epoch' + str(epoch) +
                                '_' + str(i) + '.state'), self.netD, self.netG,
                            self.optimizerD, self.optimizerG)
def evaluate_config_2(X_test, X_train, Y_test, Y_train, fig_dir, inputs, model,
                      model_orig, outputs, title, use_LIME, z_idx):
    # In[66]:
    loss_tr, acc_tr = model.evaluate(X_train, Y_train)
    loss_ts, acc_ts = model.evaluate(X_test, Y_test)
    loss_tr_o, acc_tr_o = model_orig.evaluate(X_train, Y_train)
    loss_ts_o, acc_ts_o = model_orig.evaluate(X_test, Y_test)
    # In[81]:
    round_err = 4
    loss_diff_tr = np.round(np.abs(loss_tr - loss_tr_o), round_err)
    loss_diff_ts = np.round(np.abs(loss_ts - loss_ts_o), round_err)
    print("Loss Diff: train - {} ---- test - {}".format(
        loss_diff_tr, loss_diff_ts))
    acc_diff_tr = np.round(np.abs(acc_tr - acc_tr_o), round_err)
    acc_diff_ts = np.round(np.abs(acc_ts - acc_ts_o), round_err)
    print("Acc Diff: train - {:.2f}% ---- test - {:.2f}%".format(
        acc_diff_tr * 100, acc_diff_ts * 100))
    explanation_loss_tr = compute_explanation_loss(inputs, outputs, model,
                                                   z_idx)
    # if explanation_loss_tr != e_loss[-1]:
    #     print("Explanation loss not computed correctly! compute:{} e_loss:{}".format(
    #         float(explanation_loss_tr), float(e_loss[-1])))
    print("Explanation loss - {:.4f}".format(explanation_loss_tr))
    # 0 because suddenly none of the features are important!
    # In[68]:
    # In[74]:
    num_p, percent = analyze_mismatch(model_orig, model, inputs)
    print("Prediction Mismatch (Train):", num_p, str(round(percent, 3)) + "%")
    # Test Set Mismatch
    inputs_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
    outputs_test = tf.convert_to_tensor(Y_test, dtype=tf.float32)
    num_p, percent = analyze_mismatch(model_orig, model, inputs_test)
    print("Prediction Mismatch (Test):", num_p, str(round(percent, 3)) + "%")
    # In[76]:
    from evaluate import plot_ranking_histograms
    from functools import partial
    # In[78]:
    models = [model_orig, model]
    att_methods = attribution_methods
    if not use_LIME:
        att_methods = att_methods[:-1]
    # TODO: the attribution methods are evaluated twice -> once in plot_ranking and once in the summary-table analysis; there is no need for that.
    plot_ranking_histograms_ = partial(plot_ranking_histograms,
                                       models=models,
                                       z_idx=z_idx,
                                       num_f=X_train.shape[1] - 1,
                                       attribution_methods=att_methods)
    fig_train = plot_ranking_histograms_(inputs=inputs,
                                         ys=outputs,
                                         title="{} Train".format(title))
    save_fig(fig_train, fname="ranking_histograms_train", fig_dir=fig_dir)
    # In[80]:
    # with Test
    fig_test = plot_ranking_histograms_(inputs=inputs_test,
                                        ys=outputs_test,
                                        title="{} Test".format(title))
    save_fig(fig_test, fname="ranking_histograms_test", fig_dir=fig_dir)
    # ### Explain ranking
    # In[84]:
    print("Computing Summary Table Train")
    df = get_summary_table(models=models,
                           inputs=inputs,
                           outputs=outputs,
                           feature=z_idx,
                           attribution_methods=att_methods)
    diff_mean_tr, df_top_diff_tr, shifts_sum_tr, shifts_mean_tr = analyze_summary_table(
        df, title=title)
    print("Computing Summary Table Test")
    df_test = get_summary_table(models=models,
                                inputs=inputs_test,
                                outputs=outputs_test,
                                feature=z_idx,
                                attribution_methods=att_methods)
    diff_mean_ts, df_top_diff_ts, shifts_sum_ts, shifts_mean_ts = analyze_summary_table(
        df_test, title)
    return df, df_test
	ax = pl.gca()
	pl.imshow(np.transpose(data), interpolation='nearest',origin='lower', vmin=0, vmax=1, cmap='RdBu')
	ax.set_xlabel("The predicted week number", fontsize=fontsize)
	ax.set_ylabel("Lag", fontsize=fontsize)
	# pl.title('Logistic Regression AUC: %s' % cohort, fontsize=fontsize)

	ax.set_xticks(range(n))
	ax.set_yticks(range(n))
	ax.set_xticklabels(range(2,n+2),fontsize=med_fontsize)
	ax.set_yticklabels(range(1,n+1),fontsize=med_fontsize)
	
	cb = pl.colorbar()
	for t in cb.ax.get_yticklabels():
		t.set_fontsize(med_fontsize)

	utils.save_fig("/home/colin/evo/papers/thesis/figures/hmm_logreg/%s_support_%s" % (cohort, max_support))


	#plot mean AUC over support:
	benchmarks = {
	"no_collab": 0.775938784743,
	"wiki_only": 0.609065848058,
	"forum_and_wiki": 0.648563087051,
	"forum_only": 0.76697590925}

	pl.clf()
	ax = pl.gca()
	pl.plot(range(3, 30,2), [means[support] for support in range(3, 30,2)])
	# pl.title('HMM logreg: %s' % cohort, fontsize=fontsize)
	ax.set_xlabel("Number of support", fontsize=fontsize)
	ax.set_ylabel("Mean AUC of all leads and lags", fontsize=fontsize)
for cohort in cohorts:

    n = len(feature_vectors[cohort])

    ax = pl.gca()
    pl.bar(range(n), feature_vectors[cohort], color='0.75')
    # pl.title('Randomized logistic Regression: %s' % cohort, fontsize=fontsize)

    ax.set_xlabel("Feature number", fontsize=fontsize)
    ax.set_ylabel("Average feature weight", fontsize=fontsize)

    ax.set_xticks(range(n))
    ax.set_xticklabels(features, fontsize=AUC_fontsize, rotation='vertical')

    utils.save_fig(
        "/home/colin/evo/papers/thesis/figures/logreg/randomized_%s" % cohort)
    # pl.show()
    # break
    pl.clf()

for cohort in cohorts:
    in_file = "results/randomized_logistic_reg_features_%s_time_averaged.csv" % cohort
    data = np.genfromtxt(in_file, delimiter=",")[1:, :-1]
    n = len(data)
    for i in range(n):
        if np.all(data[:, i] == 0):
            n = i
    data = data[:n, :n]
    for col in range(n):
        data[:, col] /= norm(data[:, col])
Example #41
import matplotlib.pyplot as plt

acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
fig, ax = plt.subplots()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'r-', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
save_fig("imdb-loss.pdf")
plt.show()

fig, ax = plt.subplots()
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
save_fig("imdb-acc.pdf")
plt.show()

# Now turn on early stopping
# https://chrisalbon.com/deep_learning/keras/neural_network_early_stopping/
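The snippet ends with the intent to add early stopping but the code is cut off; a hedged sketch of the standard Keras callback (assumes a compiled model and placeholder arrays x_train, y_train, x_val, y_val):

from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen so far (x_train/y_train/x_val/y_val are placeholders).
early_stop = EarlyStopping(monitor='val_loss', patience=3,
                           restore_best_weights=True)
history = model.fit(x_train, y_train,
                    epochs=20, batch_size=512,
                    validation_data=(x_val, y_val),
                    callbacks=[early_stop])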
    pl.imshow(np.transpose(data), interpolation="nearest", origin="lower", vmin=0, vmax=1)
    ax.set_xlabel("Number of hidden support", fontsize=fontsize)
    ax.set_ylabel("Lead", fontsize=fontsize)
    # ax.set_title('HMM AUC: %s' % cohort, fontsize=fontsize)

    ax.set_xticks(range(num_supports))
    ax.set_yticks(range(num_leads))
    ax.set_xticklabels(range(3, 30, 2), fontsize=med_fontsize)
    ax.set_yticklabels(range(1, num_leads), fontsize=med_fontsize)

    cb = pl.colorbar()
    for t in cb.ax.get_yticklabels():
        t.set_fontsize(med_fontsize)
        # pl.show()

    utils.save_fig("/home/colin/evo/papers/thesis/figures/hmm/%s" % cohort)


for cohort in cohorts:
    # plot mean AUC over support:
    benchmarks = {
        "no_collab": 0.775938784743,
        "wiki_only": 0.609065848058,
        "forum_and_wiki": 0.648563087051,
        "forum_only": 0.76697590925,
    }

    support_dict = cohort_dict[cohort]
    means = {}
    for support in range(3, 30, 2):
        data = support_dict[support]
Exemple #44
0
def plot_scatter(xs,
                 ys,
                 line_labels=None,
                 xaxis_label=None,
                 yaxis_label=None,
                 xticks=None,
                 vlines=None,
                 vlines_kwargs=None,
                 hlines=None,
                 hlines_kwargs=None,
                 x_sci=True,
                 y_sci=True,
                 y_lim=None,
                 legend_top=True,
                 fig_size=None,
                 ls_cycle=False,
                 name=None):
    multiple_x = utils.is_list_of_list(xs)
    multiple_y = utils.is_list_of_list(ys)
    multiple_line_label = utils.is_list(line_labels)
    assert multiple_x == multiple_y == multiple_line_label

    fig, ax = plt.subplots()
    if fig_size and isinstance(fig_size, list) and len(fig_size) > 0:
        if len(fig_size) == 1:
            fig.set_figwidth(fig_size[0])
        else:
            fig.set_figwidth(fig_size[0])
            fig.set_figheight(fig_size[1])
    colors = iter(cm.rainbow(np.linspace(0, 1, len(ys))))
    if multiple_x:
        for x, y, line_label in zip(xs, ys, line_labels):
            ax.scatter(x, y, label=line_label, c=next(colors))
    else:
        ax.scatter(xs, ys, label=line_labels, c=next(colors))
    if xticks and isinstance(xticks, list):
        x = xs[0] if multiple_x else xs
        plt.xticks(x, xticks)
    if y_lim and isinstance(y_lim, list) and len(y_lim) > 0:
        if len(y_lim) == 1:
            plt.ylim(ymin=y_lim[0])
        else:
            plt.ylim(ymin=y_lim[0])
            plt.ylim(ymax=y_lim[1])
    plt.xlim(xmin=0)
    ncol = len(xs) if multiple_x else 1
    utils.set_legend(legend_top, ncol)
    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)

    vlines = vlines or []
    for xvline in vlines:
        with ALTERNATIVE_PALETTE:
            plt.axvline(x=xvline, **vlines_kwargs)

    hlines = hlines or []
    for yhline in hlines:
        with ALTERNATIVE_PALETTE:
            plt.axhline(y=yhline, **hlines_kwargs)

    utils.finalize(ax)
    utils.save_fig(name)
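# Hypothetical usage of plot_scatter above; the `utils` helpers and
# ALTERNATIVE_PALETTE are project-specific, so this is only a sketch with
# illustrative data and labels.
weeks = [[1, 2, 3, 4], [1, 2, 3, 4]]
aucs = [[0.61, 0.68, 0.72, 0.75], [0.58, 0.63, 0.69, 0.71]]
plot_scatter(weeks, aucs,
             line_labels=['forum_only', 'wiki_only'],
             xaxis_label='Week', yaxis_label='AUC',
             hlines=[0.5], hlines_kwargs={'ls': '--', 'color': '0.6'},
             y_lim=[0.0, 1.0],
             name='auc_scatter_example')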
			lead = int(float(lead)) - 1
			lag = int(float(lag)) - 1
			week = lead + lag
			data[week, lag] = float(auc)

	for week in range(n):
		for lag in range(n):
			if week >= lag:
				pl.text(week - .3, lag - .1, int(100 * data[week][lag]) / 100.0, fontsize=AUC_fontsize)

	ax = pl.gca()
	pl.imshow(np.transpose(data), interpolation='nearest',origin='lower', vmin=0, vmax=1, cmap='RdBu')
	ax.set_xlabel("The predicted week number", fontsize=fontsize)
	ax.set_ylabel("Lag", fontsize=fontsize)
	# pl.title('Logistic Regression AUC: %s' % cohort, fontsize=fontsize)

	ax.set_xticks(range(n))
	ax.set_yticks(range(n))
	ax.set_xticklabels(range(2,n+2),fontsize=med_fontsize)
	ax.set_yticklabels(range(1,n+1),fontsize=med_fontsize)
	
	cb = pl.colorbar()
	for t in cb.ax.get_yticklabels():
		t.set_fontsize(med_fontsize)

	utils.save_fig("/home/colin/evo/papers/thesis/figures/logreg/%s" % cohort)
	
	# print cohort, np.nanmean(data)
	# pl.show()

	# break
Exemple #46
0
def plot_line(xs,
              ys,
              line_labels=None,
              xaxis_label=None,
              yaxis_label=None,
              xticks=None,
              vlines=None,
              vlines_kwargs=None,
              hlines=None,
              hlines_kwargs=None,
              x_sci=True,
              y_sci=True,
              y_lim=None,
              x_lim=None,
              legend_top=True,
              ls_cycle=False,
              marker_size=0,
              x_grid=True, y_grid=True,
              fig_size=None,
              name=None, draw_arrow=False):
    multiple_x = utils.is_list_of_list(xs)
    multiple_y = utils.is_list_of_list(ys)
    multiple_line_label = utils.is_list(line_labels)
    assert multiple_x == multiple_y == multiple_line_label

    fig, ax = plt.subplots()
    if fig_size and isinstance(fig_size, list) and len(fig_size) > 0:
        if len(fig_size) == 1:
            fig.set_figwidth(fig_size[0])
        else:
            fig.set_figwidth(fig_size[0])
            fig.set_figheight(fig_size[1])
    ls_cycler = utils.get_line_styles_cycler(ls_cycle)
    ms_cycler = utils.get_marker_styles_cycler(marker_size > 0)
    if multiple_x:
        for x, y, line_label in zip(xs, ys, line_labels):
            ax.plot(x, y, label=line_label, ls=next(ls_cycler), marker=next(ms_cycler), markersize=marker_size)
    else:
        ax.plot(xs, ys, label=line_labels)
    if xticks and isinstance(xticks, list):
        x = xs[0] if multiple_x else xs
        plt.xticks(x, xticks)

    if y_lim and isinstance(y_lim, list) and len(y_lim) > 0:
        if len(y_lim) == 1:
            plt.ylim(ymin=y_lim[0])
        else:
            plt.ylim(ymin=y_lim[0])
            plt.ylim(ymax=y_lim[1])

    if x_lim and isinstance(x_lim, list) and len(x_lim) > 0:
        if len(x_lim) == 1:
            plt.xlim(xmin=x_lim[0])
        else:
            plt.xlim(xmin=x_lim[0])
            plt.xlim(xmax=x_lim[1])

    ncol = len(xs) if multiple_x else 1
    utils.set_legend(legend_top, ncol)
    utils.set_sci_axis(ax, x_sci, y_sci)
    utils.set_axis_labels(ax, xaxis_label, yaxis_label)

    vlines = vlines or []
    for xvline in vlines:
        with ALTERNATIVE_PALETTE:
            plt.axvline(x=xvline, **vlines_kwargs)

    hlines = hlines or []
    for yhline in hlines:
        with ALTERNATIVE_PALETTE:
            plt.axhline(y=yhline, **hlines_kwargs)

    utils.finalize(ax, x_grid=x_grid, y_grid=y_grid)
    utils.save_fig(name)
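# Hypothetical usage of plot_line above, again assuming the project-specific
# `utils` helpers; a single series with custom tick labels and one vertical
# marker, all values illustrative only.
weeks = [1, 2, 3, 4, 5, 6]
dropout_rate = [0.05, 0.09, 0.14, 0.18, 0.21, 0.23]
plot_line(weeks, dropout_rate,
          line_labels='no_collab',
          xaxis_label='Week', yaxis_label='Cumulative dropout rate',
          xticks=['wk1', 'wk2', 'wk3', 'wk4', 'wk5', 'wk6'],
          vlines=[3], vlines_kwargs={'ls': ':', 'color': '0.4'},
          name='dropout_line_example')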
def visualize_feature_hist_dist(X,y,selected_feature,features,normalize=False):
	"""
	Visualize the histogram distribution of a feature

	Keyword arguments:
	X -- The feature vectors
	y -- The target vector
	selected_feature -- The desired feature to obtain the histogram
	features -- Vector of feature names (X1 to XN)
	normalize -- Whether to normalize the histogram (Divide by total)
	"""

	#create data
	joint_data=np.column_stack((X,y))
	column_names=features

	#create dataframe
	df=pd.DataFrame(data=joint_data,columns=column_names)
	palette = sea.hls_palette()

	#find number of unique values (groups)
	unique_values=pd.unique(df[[selected_feature]].values.ravel())
	unique_values=sorted(int(v) for v in unique_values)
	n_groups=len(unique_values)

	fig, ax = plt.subplots()
	index = np.arange(n_groups)
	bar_width = 0.35
	opacity = 0.4

	#find values belonging to the positive class and values belonging to the negative class
	positive_class_index=df[df[features[-1]] == 1].index.tolist()
	negative_class_index=df[df[features[-1]] != 1].index.tolist()

	positive_values=df[[selected_feature]].loc[positive_class_index].values.ravel()
	positive_values=list(map(int, positive_values))

	negative_values=df[[selected_feature]].loc[negative_class_index].values.ravel()
	negative_values=list(map(int, negative_values))

	#normalize data (divide by total)
	n_positive_labels=n_negative_labels=1
	if normalize==True:
		n_positive_labels=len(y[y==1])
		n_negative_labels=len(y[y!=1])

	#count
	positive_counts=[0]*len(index)
	negative_counts=[0]*len(index)
	for v, value in enumerate(unique_values):
		positive_counts[v]=positive_values.count(value)/float(n_positive_labels)
		negative_counts[v]=negative_values.count(value)/float(n_negative_labels)

	#plot
	plt.bar(index, positive_counts, bar_width,alpha=opacity,color='b',label='Default')			#class 1
	plt.bar(index+bar_width, negative_counts, bar_width,alpha=opacity,color='r',label='Paid')	#class 0

	plt.xlabel(selected_feature)
	plt.ylabel('Proportion' if normalize else 'Frequency')
	title="Histogram Distribution of the feature '"+selected_feature+"' grouped by class"
	if normalize:
		title="Normalized "+title
	plt.title(title)
	plt.xticks(index + bar_width, [str(v) for v in unique_values])
	plt.legend()
	plt.tight_layout()

	# plt.show()

	#save fig
	output_dir = "img"
	save_fig(output_dir,'{}/{}_hist_dist.png'.format(output_dir,selected_feature))
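
# Hypothetical call to visualize_feature_hist_dist above; the data and column
# names are illustrative only (the last entry in `features` is the class label
# column, with 1 marking the positive/default class).
features = ['X1', 'X2', 'Y']
X = np.array([[0, 1], [1, 0], [2, 1], [0, 2], [1, 1], [2, 0]])
y = np.array([1, 0, 1, 0, 1, 0])
visualize_feature_hist_dist(X, y, selected_feature='X1', features=features, normalize=True)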