def grid_search():
    """Grid-search the logistic-regression C parameter of the NLTK pipeline.

    Fits a GridSearchCV over `logr__C`, reports the best score/params, drops
    into the tracer for inspection, saves one error-bar plot per searched
    parameter, and finally writes predicted probabilities for the test set.
    """
    train_texts, train_labels = load_data()
    search_space = dict(logr__C=np.arange(1, 20, 5))
    pipeline = build_nltk_model()
    # Legacy sklearn API: ShuffleSplit over n samples, 10 random 80/20 splits.
    splitter = ShuffleSplit(len(train_texts), n_iterations=10, test_size=0.2)
    search = GridSearchCV(pipeline, cv=splitter, param_grid=search_space,
                          verbose=4, n_jobs=12, score_func=auc_score)
    search.fit(train_texts, train_labels)
    print(search.best_score_)
    print(search.best_params_)
    tracer()  # interactive inspection point
    score_info = search.scores_
    for param_name in score_info.params:
        mean_scores, err = score_info.accumulated(param_name, 'max')
        plt.errorbar(score_info.values[param_name], mean_scores, yerr=err)
        plt.xlabel(param_name)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % param_name)
        plt.close()
    test_texts, test_dates = load_test()
    test_probs = search.best_estimator_.predict_proba(test_texts)
    # Column 1 holds the positive-class probability.
    write_test(test_probs[:, 1])
def loadData():
    """Load training rows, test queries and users for the current dataChoice.

    'sim' uses the small simulated files; otherwise the full ratings files.
    Unless dataChoice == 'full', the last 20% of the training rows is split
    off as a validation set (an empty dict is returned otherwise).
    """
    global dataChoice
    if dataChoice == 'sim':
        names = ('r-train100.csv', 'r-test100.csv', 'u100.csv')
    else:
        # dataChoice == 'validate' or dataChoice == 'full'
        names = ('ratings-train.csv', 'ratings-test.csv', 'users.csv')
    train_file, test_file, user_file = names

    train_rows = util.load_train(train_file)
    queries = util.load_test(test_file)
    users = util.load_users(user_file)

    held_out = {}
    if dataChoice != 'full':
        # Split training rows into 80% training and 20% validation.
        cut = int(len(train_rows) * 0.8)
        held_out = train_rows[cut:]
        train_rows = train_rows[:cut]
    return train_rows, queries, users, held_out
def make_predictions(ratings_data, mfact_data):
    """Build a rating prediction for every test query.

    Arguments:
        ratings_data : dict returned by build_ratings (centering/scaling
            values and the ISBN -> matrix-index map)
        mfact_data : dict returned by mfact (factor matrices, biases, mean)

    Returns:
        list of query dicts (as loaded by util.load_test), each augmented
        with a clamped 'rating' and the raw float 'rating_f', suitable for
        passing to util.write_predictions.
    """
    # Standardization parameters from the original training set.
    center = ratings_data["center"]
    scale = ratings_data["scale"]
    isbn_index = ratings_data["book_isbn_to_index"]

    # Matrix-factorization output. The bias terms and mean are loaded but
    # deliberately unused by the active prediction formula below.
    user_factors = mfact_data["P"]
    book_factors = mfact_data["Q"]
    user_bias = mfact_data["Bn"]
    book_bias = mfact_data["Bd"]
    mean = mfact_data["mean"]

    # The set of requested predictions.
    queries = util.load_test("../data/books/ratings-test.csv")
    n_queries = len(queries)
    debug("Making %d predictions", n_queries)

    for query in queries:
        u = query["user"] - 1
        b = isbn_index[query["isbn"]]
        # Undo standardization. Variant with bias terms, kept for reference:
        #   (dot + mean + user_bias[u] + book_bias[b]) * scale + center
        raw = np.dot(user_factors[u, :], book_factors[b, :]) * scale + center
        # Clamp to [1, 5]; note the stored 'rating' is NOT rounded here.
        query["rating"] = max(1, min(5, raw))
        # Keep the raw value too, for later visualization of distributions.
        query["rating_f"] = raw
    return queries
def test_data():
    """Load the saved Keras model and report its loss/metrics on the test set."""
    path = 'my_model.h5'
    model = keras.models.load_model(path)
    # Only the fitted tokenizer is needed from the train/val loader.
    _, _, _, _, tokenizer = util.load_train_val()
    # NOTE(review): `path_data` is presumably a module-level constant defined
    # elsewhere in this file — confirm.
    X, y = util.load_test(path_data, tokenizer)
    # BUG FIX: Keras models have no `eval` method (that is the PyTorch API);
    # `evaluate` is the correct call for computing loss/metrics.
    res = model.evaluate(x=X, y=y, batch_size=200)
    print(res)
def apply_models():
    """Fit four model variants on the extended data, write each model's test
    predictions, then write the average of their probabilities."""
    comments, labels = load_extended_data()
    comments_test = load_test("impermium_verification_set_.csv")

    models = [build_base_model(), build_elasticnet_model(),
              build_stacked_model(), build_nltk_model()]

    summed = np.zeros((len(comments_test), 2))
    for idx, model in enumerate(models):
        model.fit(comments, labels)
        probs = model.predict_proba(comments_test)
        # Held-out scoring disabled (no labels for the verification set):
        # print("score: %f" % auc_score(labels_test, probs[:, 1]))
        summed += probs
        write_test(probs[:, 1], "test_prediction_model_%d.csv" % idx,
                   ds="impermium_verification_set_.csv")

    summed /= 4.
    # score = auc_score(labels_test, summed[:, 1])
    # print("combined score: %f" % score)
    write_test(summed[:, 1], "test_prediction_combined.csv",
               ds="impermium_verification_set_.csv")
# acc = util.model_roc_score(learner, test_dat) acc = util.model_score(learner, test_dat) test_acc.append(acc) learned_size.append(total_pool_size - prof.get_pool_size() + init_size) if ITER_ENABLE: if count < 0: break count -= 1 return test_acc, learned_size pool_dat = load_pool() init_dat = load_init() test_dat = load_test() train_pool = np.array(gen_land_pool(pool_dat)) shuffle(train_pool) print "[info]Start passive learning..." test_acc_ps, learned_size_ps = run_stl_landm(pool_dat, init_dat, test_dat, do_active=False) util.curve_to_csv("res/ps_stl_non.csv", test_acc_ps, learned_size_ps) print "[info]Start active learning..." test_acc_ac, learned_size_ac = run_stl_landm(pool_dat, init_dat, test_dat,
# coding=utf8 import numpy as np import pandas as pd from scipy import interp from matplotlib import pyplot as plt from sklearn import preprocessing from sklearn.metrics import roc_curve, auc # load data # X1 = pd.read_csv(r'Data/Train/PPD_Training_Master_GBK_3_1_Training_Set.csv', encoding='gbk') X2 = pd.read_csv(r'Data/Test/PPD_Master_GBK_2_Test_Set.csv')# , encoding='gbk') from util import load_train, load_test X_train, y_train, w_train = load_train() X_test = load_test() print 'data loaded, transforming...' ''' 3.19 commented scaler = preprocessing.StandardScaler().fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) # train and predict from sklearn.linear_model import LogisticRegression clf = LogisticRegression(n_jobs=-1, class_weight='balanced', penalty='l1') clf.fit(X_train, y_train) probas_ = clf.predict_proba(X_test) # visualization on training set fpr, tpr, thresholds = roc_curve(y_train, p2[:, 1]) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 roc_auc = auc(fpr, tpr)
# # do prediction based on matrix factorization # K = 30 # run = 0 # step = 280 # mfact = su.unpickle("output/mfact_%d_run_%d/mfact_%d_%d" % (K, run, K, step)) # P = mfact["P"] # Q = mfact["Q"] # Bn = mfact["Bn"] # Bd = mfact["Bd"] # # this is the mean of the standardized training data, used for the learning/ # # prediction # standard_mean = mfact["mean"] # load the set of requested predictions queries = util.load_test("../data/books/ratings-test.csv") L = len(queries) # for each query for (i,query) in enumerate(queries): print ("%d / %d : " % (i+1,L)), user_index = query["user"] - 1 book_index = book_isbn_to_index[query["isbn"]] # calculate predicted rating rating_float = (np.dot(P[user_index,:],Q[book_index,:]) + standard_mean + Bn[user_index] + Bd[book_index]) * std + mean # coerce to range (1,5); round, convert to int rating = int(round(max(1,min(5,rating_float)))) # store both values so we can do visualization of distributions later
def generateFullTestMatrix(detectorSettings=(1, 1, 1), partitions=(9, 9, 9)):
    """Build the edge-feature matrix for the full test set.

    BUG FIX: the previous `range(0, util.TEST_COUNT - 1)` stopped at index
    TEST_COUNT - 2 and silently dropped the final test sample; iterate over
    all TEST_COUNT samples (0-based, matching util.load_test(i)).

    NOTE(review): detectorSettings and partitions are currently unused by the
    body; they are kept to preserve the call interface — confirm intent.

    Returns:
        list of feature vectors, one per test sample.
    """
    return [generateEdgeFeaturesVector(util.load_test(i))
            for i in range(util.TEST_COUNT)]
def predict(train, test, pred_file): y_hat, train_rss = run_model(train, test, 'prediction', 0) for i, yi in enumerate(y_hat): if yi < 0: y_hat[i] = 0 if yi > 5: y_hat[i] = 5 for i, entry in enumerate(test): entry['rating'] = float(y_hat[i]) util.write_predictions(test, pred_file) # prediction mode test_filename = 'data/ratings-test.csv' test = util.load_test(test_filename) pred_filename = 'predictions/sgd_converged.csv' predict(train_valid, test, pred_filename) """ x = np.zeros((n, r, 2)) # 2 layers, 1 for train predictions and 1 for valid predictions y = np.zeros((n, 1)) for i, entry in enumerate(train_valid): y[i] = float(entry['rating']) def build_matrix(m, v, fold_idx, param_idx): span = np.shape(v)[0] # train predictions
import numpy as np
import util

# This makes predictions based on the mean rating for each user in the
# training data. When there are no training data for a user, it
# defaults to the global mean.

pred_filename = 'pred-user-mean.csv'
train_filename = 'ratings-train.csv'
test_filename = 'ratings-test.csv'
user_filename = 'users.csv'

training_data = util.load_train(train_filename)
test_queries = util.load_test(test_filename)
user_list = util.load_users(user_filename)

# Compute the global mean rating for a fallback.
num_train = len(training_data)
mean_rating = float(sum(map(lambda x: x['rating'], training_data))) / num_train
# Python 2 print statement.
print "The global mean rating is %0.3f." % (mean_rating)

# Turn the list of users into a dictionary.
# Store data for each user to keep track of the per-user average.
users = {}
for user in user_list:
    users[user['user']] = {
        'total': 0,  # For storing the total of ratings.
        'count': 0,  # For storing the number of ratings.
    }

# Iterate over the training data to compute means.
import time
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from util import load_training, load_test, evaluate, standard
import warnings
warnings.filterwarnings('ignore')

### Parameters are defined here
n_components = 100  # target dimensionality for PCA
C = 1  # soft-margin penalty coefficient
decision_function = 'ovr'  # 'ovo' for OneVsOne and 'ovr' for OneVsRest
kernel = 'rbf'  # kernel type: 'rbf', 'linear', 'poly' or 'sigmoid'
gamma = 1e-5  # for rbf: the larger gamma is, the fewer support vectors
#####

training_data, training_label = load_training()
test_data, test_label = load_test()
print('training size: {}'.format(len(training_label)))
print('test size: {}'.format(len(test_label)))

# Flatten each sample to 1-D.
# NOTE(review): `np` is used below but `import numpy as np` is not visible in
# this chunk — confirm it exists elsewhere in the file, or add it.
training_data = np.array([x.flatten() for x in training_data])
training_label = np.array(training_label)
test_data = np.array([x.flatten() for x in test_data])
test_label = np.array(test_label)

pca = PCA(n_components=n_components)
model = SVC(C=C, random_state=0, max_iter=1000, kernel=kernel,
import numpy as np
import ensemble
from train_model_library import train_model_library
import util
import makePred

# Train the model library and pick up validation predictions/labels plus the
# fitted scaler and the grid of trained models.
ensemble_library_pred, validation_labels, scaler, model_grid = train_model_library(
    n_folds_to_compute=1)

# BUG FIX: the result was previously bound to the name `ensemble`, shadowing
# the imported `ensemble` module; use a distinct name instead.
best_ensemble, acc, n, c1acc = ensemble.generate_ensemble(
    ensemble_library_pred, validation_labels, n_init=3, tolerance=.00001)

# Score the held-out Kaggle test features and write the predictions.
ids, features = util.load_test("kaggle_test_tf_idf_l1_norm.csv")
labels = makePred.makePrediction(best_ensemble, model_grid, features, scaler)
util.write_predictions(labels, "idflabels_lean_2.csv")
print("done")
def reload_dat():
    """Run a garbage-collection pass, then reload and return the
    (pool, init, test) datasets."""
    gc.collect()
    return load_pool(), load_init(), load_test()
# Paths for the MATLAB-side ELLA implementation.
# NOTE(review): `eng` is presumably a running MATLAB engine session created
# earlier in the file — confirm.
ELLA_DIR = "/home/stpanda/Dropbox/STDreamSoft/Academics/SeniorThesis/Projects/al_ella/lib/ELLAv1.0"
eng.addpath("/home/stpanda/Dropbox/STDreamSoft/Academics/SeniorThesis/Projects/al_ella/ml")
eng.addpath(eng.genpath(ELLA_DIR))
# res = eng.runExperimentActiveTask()
# print res

######## Panda ######
# Comparing Multiple Active Learner vs ELLA + ATS vs ELLA + ATS + AL vs
# ELLA + AL
# This file runs ELLA + Active Task Selection
#####################

## Load all files (with a bias column appended to each dataset)
test_dat = util.add_bias(load_test())
pool_dat = util.add_bias(load_pool())
init_dat = util.add_bias(load_init())
init_size = util.dat_size(init_dat)

## Init ELLA Model with init set ##
ella_model = ELLA(eng, init_dat)
init_acc = ella_score(ella_model, test_dat)
# Accuracy / learned-size curves start from the initial model.
test_acc = [init_acc]
learned_size = [init_size]

prof = Professor(init_dat, pool_dat, multi_t=True, random=True)
total_pool_size = prof.get_pool_size()
# Python 2 print statement.
print "train pool size", total_pool_size
import math

NUM_CLUSTERS = 8

# This makes predictions based on the mean rating for each book in the
# training data. When there are no training data for a book, it
# defaults to the global mean.

pred_filename = 'predictor_age-kmeans1.csv'
train_filename = 'ratings-train.csv'
test_filename = 'ratings-test.csv'
book_filename = 'books.csv'
user_filename = 'users.csv'

# NOTE(review): `util` and `random` must be imported elsewhere in this file —
# confirm.
training_data = util.load_train(train_filename)
test_queries = util.load_test(test_filename)
book_list = util.load_books(book_filename)
user_list = util.load_users(user_filename)

# Random 50/50 split of the training data into local train/test sets.
train_data = []
test_data = []
for datum in training_data:
    if (random.randrange(2) == 0):
        train_data.append(datum)
    else:
        test_data.append(datum)

# Compute the global mean rating for a fallback.
num_train = len(train_data)
mean_rating = float(sum(map(lambda x: x['rating'], train_data)))/num_train
# Allow PIL to load images whose files are truncated on disk.
ImageFile.LOAD_TRUNCATED_IMAGES = True

import shutil
from torchvision.models import vgg16
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from dataset import DepthEigenDataset
from network import GlobalCoarseNet, LocalFineNet
from loss import ScaleInvariantLoss
import util

data_dir_test = Path('nyu/test')
bs = 32  # batch size
dataloader_test, datalen_test = util.load_test(data_dir_test, bs)
print(datalen_test)

# Load the trained coarse/fine models and put them in inference mode.
global_model = torch.load('models/global_model.pt')
global_model.eval()
local_model = torch.load('models/local_model.pt')
local_model.eval()

for i, samples in enumerate(dataloader_test):
    # NOTE(review): assumes each batch dict has 'rgb' and 'depth' tensors —
    # confirm against DepthEigenDataset.
    rgbs = samples['rgb'].float().to(device)
    depths = samples['depth'].float().to(device)
    # results from global coarse network
    with torch.no_grad():
        global_output = global_model(rgbs).unsqueeze(1)