def extra_data():
    """Record SVC accuracy as extra examples are appended to the training set.

    Builds a PCA -> scale -> bias -> polynomial-SVC pipeline and refits it on
    ``Xtrain_full`` stacked with a growing prefix of two candidate sets:

    * the examples returned by ``gd.generate_extra_data`` and
    * the examples returned by ``gd.perturb_modified_digits``.

    For every prefix size the train and test scores are collected, and the
    two score lists are finally dumped to ``ignore/extra_data_graph.json``
    under the keys ``"scores_extra"`` and ``"scores_perturb"``.

    Reads the module-level names ``Xtrain_full``, ``Ytrain_full``, ``Xtest``,
    ``Ytest``, ``best_degree``, ``best_c``, ``best_gamma`` and ``gd``.
    Returns nothing.
    """
    pipeline = Pipeline([
        ("pca", IncrementalPCA(n_components=500)),
        ("scale", StandardScaler()),
        ("bias", AddBiasTerm()),
        ("svc", SVC(kernel="poly", degree=best_degree, C=best_c,
                    gamma=best_gamma)),
    ])

    n = 50000          # number of additional examples requested per source
    num_slices = 50    # number of prefix sizes evaluated after the baseline
    num_train = Xtrain_full.shape[0]

    generated_data, generated_targets = gd.generate_extra_data(n)
    perturbed_data, perturbed_targets = gd.perturb_modified_digits(
        Xtrain_full, Ytrain_full, n)

    scores_extra = []
    scores_perturb = []
    # Within one iteration the generated set is fitted first, then the
    # perturbed set; the same pipeline object is refitted each time.
    experiments = (
        (scores_extra, generated_data, generated_targets),
        (scores_perturb, perturbed_data, perturbed_targets),
    )

    # i == -1 appends an empty prefix, i.e. the un-augmented baseline.
    for i in range(-1, num_slices):
        print("Iteration " + str(i + 2))
        # Floor division keeps the slice bound an int even when true
        # division is in effect (Python 3 / `from __future__ import division`).
        num_added = (i + 1) * n // num_slices
        for scores, data, targets in experiments:
            Xp = np.vstack((Xtrain_full, data[0:num_added]))
            Yp = np.vstack((Ytrain_full, targets[0:num_added]))
            pipeline.fit(Xp, Yp)
            scores.append({
                "train_score": pipeline.score(Xp, Yp),
                "test_score": pipeline.score(Xtest, Ytest),
                "num_examples": num_train + num_added,
                "i": i,
            })

    results = {"scores_perturb": scores_perturb, "scores_extra": scores_extra}
    with open("ignore/extra_data_graph.json", "w") as f:
        json.dump(results, f)
# Load the training labels and store them as 32-bit integers.
# (No column reshape is applied; Y keeps the shape it was saved with.)
Y = np.load('./data/train_outputs.npy').astype(np.int32)

PIXELS = 48  # side length used when reshaping each example below

print('Original Dataset size:')
print(X.shape)

# Augment the dataset with 500000 examples from the perturbation helper.
import generate_extra_data as ged

x_new, y_new = ged.perturb_modified_digits(X, Y, 500000)
X = np.vstack((X, x_new))
Y = np.hstack((Y, y_new))

print('New dataset size:')
print('%s %s' % (X.shape, Y.shape))

# Reshape every example to (1, PIXELS, PIXELS).
X = X.reshape((-1, 1, PIXELS, PIXELS))

# Split points are fractions of the augmented dataset length.
validation_division = int(len(X) * validate_split)
top = int(len(X) * max_split)

# NOTE(review): the split is taken after the new examples were appended, so
# the validation slice may be drawn from the perturbed rows — confirm that
# this is intended.
X_train = X[:validation_division, :]
X_val = X[validation_division:top, :]
Y_train = Y[:validation_division]
Y_val = Y[validation_division:top]
# In[6]:

# Load the raw inputs and labels; labels are cast to 32-bit integers.
# (No column reshape is applied to Y.)
X = np.load('./data/train_inputs.npy')
Y = np.load('./data/train_outputs.npy')
Y = Y.astype(np.int32)

PIXELS = 48  # side length used when reshaping each example below

print('Original Dataset size:')
print(X.shape)

# Append 500000 examples produced by the perturbation helper.
import generate_extra_data as ged

x_new, y_new = ged.perturb_modified_digits(X, Y, 500000)
X = np.vstack((X, x_new))
Y = np.hstack((Y, y_new))

print('New dataset size:')
print('%s %s' % (X.shape, Y.shape))

# Give every example the shape (1, PIXELS, PIXELS).
X = X.reshape((-1, 1, PIXELS, PIXELS))

# Both boundaries are fractions of the augmented dataset length.
validation_division = int(len(X) * validate_split)
top = int(len(X) * max_split)

# NOTE(review): splitting happens after augmentation, so the validation
# slice may come from the appended perturbed rows — confirm this is intended.
X_train, X_val = X[:validation_division, :], X[validation_division:top, :]
Y_train, Y_val = Y[:validation_division], Y[validation_division:top]

# In[7]: