def loadData():
    global dataChoice
    if dataChoice == 'sim':
        train_filename = 'r-train100.csv'
        test_filename = 'r-test100.csv'
        user_filename = 'u100.csv'
    else:  # dataChoice == 'validate' or dataChoice == 'full'
        train_filename = 'ratings-train.csv'
        test_filename = 'ratings-test.csv'
        user_filename = 'users.csv'

    training_data = util.load_train(train_filename)
    test_queries = util.load_test(test_filename)
    user_list = util.load_users(user_filename)

    validation_set = {}
    if dataChoice != 'full':
        # Split training_data into 80% training and 20% validation.
        split = int(len(training_data) * 0.8)
        validation_set = training_data[split:]
        training_data = training_data[:split]

    return training_data, test_queries, user_list, validation_set
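# Usage sketch (an assumption, not part of the module): dataChoice is the
# module-level flag read above, and 'validate' is one of the values the
# comments name.
dataChoice = 'validate'
training_data, test_queries, user_list, validation_set = loadData()
# With dataChoice != 'full', the last 20% of the ratings is held out in
# validation_set, so a predictor can be scored before touching test_queries.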
def _view(arg):
    try:
        # A numeric argument selects a training MRI by index.
        num = int(arg)
        mri = load_train(num)
    except ValueError:
        # Otherwise treat the argument as a path to a NIfTI-1 file.
        mri = load_nifti1(arg)
    view(mri)
def build_ratings_tuple():
    """
    Loads the training data for N users and D books, and builds a list of
    rating tuples.

    Returns: a dict with the following fields:
      - `ratings`: a list of tuples (i, j, r) where i is the user, j is the
        book, and r is the rating that the user gave the book
      - `book_isbn_to_index`: dict that maps the ISBN for each book to a
        numerical index j
      - `N`: the number of users
      - `D`: the number of books
      - `T`: the number of ratings in the training set
    """
    print "Loading Users..."
    users = util.load_users("../../data/books/users.csv")
    user_ids = sorted([user["user"] for user in users])
    N = len(user_ids)
    del users
    print "Loaded %d users." % N

    print "Loading Books..."
    books = util.load_books("../../data/books/books.csv")
    book_isbns = sorted([book["isbn"] for book in books])
    book_isbn_to_index = dict(zip(book_isbns, range(len(book_isbns))))
    D = len(book_isbns)
    print "Loaded %d books." % D

    print "Loading Ratings..."
    train = util.load_train("../../data/books/ratings-train.csv")
    T = len(train)
    print "Loaded %d ratings." % T

    # User ids are 1-based in the data, so shift them to 0-based indices.
    ratings = [(rating["user"] - 1,
                book_isbn_to_index[rating["isbn"]],
                rating["rating"])
               for rating in train]

    return {"ratings": ratings, "book_isbn_to_index": book_isbn_to_index,
            "N": N, "D": D, "T": T}
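# Usage sketch (an assumption, not part of the module): the (i, j, r)
# tuples drop straight into a scipy sparse ratings matrix.
import scipy.sparse as sp

data = build_ratings_tuple()
i, j, r = zip(*data["ratings"])
R = sp.coo_matrix((r, (i, j)), shape=(data["N"], data["D"])).tocsr()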
def book_biases():
    train_filename = 'ratings-train.csv'
    book_filename = 'books.csv'

    training_data = util.load_train(train_filename)
    book_list = util.load_books(book_filename)

    # Map each ISBN to a fixed index so biases land in a stable order.
    isbnIndex = dict((book['isbn'], idx) for idx, book in enumerate(book_list))

    books = {}
    for book in book_list:
        books[book['isbn']] = {
            'total': 0,  # For storing the total of ratings.
            'count': 0,  # For storing the number of ratings.
        }

    # Iterate over the training data to compute means.
    for rating in training_data:
        books[rating['isbn']]['total'] += rating['rating']
        books[rating['isbn']]['count'] += 1

    bBooks = np.zeros(len(book_list))
    for book in book_list:
        isbn = book['isbn']
        stats = books[isbn]
        # Books with no ratings keep a bias of 0.0.
        if stats['count'] > 0:
            bBooks[isbnIndex[isbn]] = float(stats['total']) / stats['count']
    return bBooks
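# Usage sketch (an assumption, not part of this module): unrated books keep
# a zero entry in bBooks, so fall back to the global training mean
# (4.070495, per the baseline scripts in this repo) for them.
bBooks = book_biases()
isbn_to_index = dict((b['isbn'], i)
                     for i, b in enumerate(util.load_books('books.csv')))
mean_rating = 4.070495

def predict(query):
    j = isbn_to_index[query['isbn']]
    return bBooks[j] if bBooks[j] > 0 else mean_rating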
from pathlib import Path
import warnings

import torch

from network import GlobalCoarseNet, LocalFineNet
from loss import ScaleInvariantLoss
import util

# ignore spurious warnings
warnings.filterwarnings('ignore')

# training device
device = torch.device('cuda')

# hyperparameters
epochs_num = 4
rate = 1  # rate = 5 in the paper, but that gave poor results, so it was tuned by hand
# specified in paper
batch_size = 32

# load dataset
dataloader_train, dataloader_valid, train_num, val_num = util.load_train(
    Path('nyu/train'), Path('nyu/val'), batch_size)
print("training number: {}, validation number: {}".format(train_num, val_num))

#########################################################
#                initializing the model                 #
#########################################################

# initialize the models and move them to the training device
global_net = GlobalCoarseNet().to(device)
local_net = LocalFineNet().to(device)

# loss
global_loss = ScaleInvariantLoss()
local_loss = ScaleInvariantLoss()
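# A minimal training-loop sketch, not the repo's actual loop: it assumes
# each batch yields (image, log_depth) tensors and that the coarse network
# is trained first, as in Eigen et al.; the learning rate is a guess,
# scaled by the hand-tuned `rate` above.
global_optimizer = torch.optim.SGD(global_net.parameters(), lr=1e-3 * rate)

for epoch in range(epochs_num):
    global_net.train()
    for images, depths in dataloader_train:
        images, depths = images.to(device), depths.to(device)
        loss = global_loss(global_net(images), depths)
        global_optimizer.zero_grad()
        loss.backward()
        global_optimizer.step()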
"""
Created on Sun Feb  9 16:20:34 2014

Type: driver that runs cross-validation to tune parameters.

@author: vincentli2010
"""
import numpy as np
import util
from matplotlib import pyplot as plt

user_list = util.user_list
book_list = util.book_list

train_filename = 'data/ratings-train.csv'
train_valid = util.load_train(train_filename)

######### Tuning Parameters #########
PARAM = [0.05, 0.1, 0.3, 0.5]
#PARAM = np.arange(0.05, 5, 0.05)
num_folds = 1  # always 5-fold cross-validation; this sets how many of the 5 folds to actually run

##import data_processing as dp
#dphelper = dp.data_processing()
#dense, sparse = dphelper.split(train_valid)
#train_valid = dense
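# Sketch of the driver loop this file builds toward (an assumption about
# how PARAM and num_folds are consumed; cross_validate is a hypothetical
# stand-in for the repo's actual fold routine).
def cross_validate(data, fold, param):
    # placeholder: train on 4/5 of `data`, score the held-out fold
    return 0.0

for param in PARAM:
    scores = [cross_validate(train_valid, fold, param)
              for fold in range(num_folds)]
    print param, np.mean(scores)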
# coding=utf8
import numpy as np
import pandas as pd
from scipy import interp
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc

# load data
# X1 = pd.read_csv(r'Data/Train/PPD_Training_Master_GBK_3_1_Training_Set.csv', encoding='gbk')
X2 = pd.read_csv(r'Data/Test/PPD_Master_GBK_2_Test_Set.csv')  # , encoding='gbk'
from util import load_train, load_test

X_train, y_train, w_train = load_train()
X_test = load_test()
print 'data loaded, transforming...'

''' commented out 3.19 (note: p2, mean_tpr and mean_fpr were never defined)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# train and predict
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(n_jobs=-1, class_weight='balanced', penalty='l1')
clf.fit(X_train, y_train)
probas_ = clf.predict_proba(X_test)

# visualization on training set
fpr, tpr, thresholds = roc_curve(y_train, p2[:, 1])
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
roc_auc = auc(fpr, tpr)
'''
import os
import util
import artifacts

# This script must run in CPython, not PyPy.

# This creates a dict like {filename: label} for the whole training set.
train = util.load_train(True)
artifacts.put_artifact(train, 'train_dict')

# This makes a similar dict, holding a sample of 20k positive
# and 20k negative instances.
# It is used for determining frequent tags, tokens, etc. for features.
# The dict is saved as artifacts/sample_20_20.pkl.
sample = util.create_sample('sample_20_20', 20000, 20000)
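# put_artifact isn't shown in this section; a plausible sketch based on
# the comment above (objects are pickled under artifacts/<name>.pkl):
import cPickle as pickle

def put_artifact_sketch(obj, name):
    with open(os.path.join('artifacts', name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)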
import random
import math

import util

NUM_CLUSTERS = 8

# This makes predictions based on the mean rating for each book in the
# training data. When there are no training data for a book, it
# defaults to the global mean.
pred_filename = 'predictor_age-kmeans1.csv'
train_filename = 'ratings-train.csv'
test_filename = 'ratings-test.csv'
book_filename = 'books.csv'
user_filename = 'users.csv'

training_data = util.load_train(train_filename)
test_queries = util.load_test(test_filename)
book_list = util.load_books(book_filename)
user_list = util.load_users(user_filename)

# Randomly split the training data 50/50 into local train and test sets.
train_data = []
test_data = []
for datum in training_data:
    if random.randrange(2) == 0:
        train_data.append(datum)
    else:
        test_data.append(datum)

# Compute the global mean rating for a fallback.
num_train = len(train_data)
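# The snippet ends here; a continuation sketch following the pattern of the
# other mean-based baselines in this repo:
mean_rating = float(sum(d['rating'] for d in train_data)) / num_train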
def train_model_library(n_folds=5, n_folds_to_compute=5):
    # ensemble_library_pred is an array of predictions made by the individual
    # models. Each row is an observation in the validation set, and each
    # column is the prediction of a cross-validation model.
    # validation_labels is a column vector corresponding to the labels of the
    # observations in the validation set.
    # model_grid is a list of lists; each element corresponds to a single
    # model. For m = model_grid[i]:
    #   m[0] is the model index (a column in ensemble_library_pred),
    #   m[1] is a list of n_folds_to_compute fitted model objects,
    #   m[2] holds the predictions of each model until the end,
    #   m[3] is the constructor for model i,
    #   m[4] is a dictionary specifying the parameters for model i,
    #   m[5] says whether the model wants standardized features.
    ids, features, labels = util.load_train(TRAIN_PATH)
    kf_cv = cross_validation.KFold(features.shape[0], n_folds=n_folds,
                                   shuffle=True)
    scaler = preprocessing.StandardScaler()
    model_grid = _generate_model_grid()

    tot_v_size = 0
    i = 1
    validation_labels = []
    for train_idx, validate_idx in kf_cv:
        print 'cross validation step # ', i
        training_features = scaler.fit_transform(features[train_idx, :])
        training_labels = labels[train_idx]
        validation_features = scaler.transform(features[validate_idx, :])
        validation_labels.append(labels[validate_idx])

        # Loop over all model type and model parameter pairs, train them,
        # and predict the current validation points.
        for model in model_grid:
            print model
            m = model[3](**model[4])
            if model[5] == 'unstandardized':
                model[1].append(m.fit(features[train_idx, :], training_labels))
                model[2].append(m.predict_proba(features[validate_idx, :])[:, 1])
            elif model[5] == 'standardized':
                model[1].append(m.fit(training_features, training_labels))
                model[2].append(m.predict_proba(validation_features)[:, 1])
            else:
                raise ValueError('dataset type not recognized')

        tot_v_size += validate_idx.size
        if i >= n_folds_to_compute:
            break
        i += 1

    # Calibrate the scaler to the entire training set for subsequent testing.
    scaler.fit(features)

    # Stack the individual validation folds.
    validation_labels = np.concatenate(validation_labels)

    # Populate ensemble_library_pred and empty the model store to reduce memory.
    ensemble_library_pred = np.zeros((tot_v_size, len(model_grid)))
    for model in model_grid:
        ensemble_library_pred[:, model[0]] = np.concatenate(model[2])
        model[2] = []

    return ensemble_library_pred, validation_labels, scaler, model_grid
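# Usage sketch (an assumption, not part of this module): fit a simple
# stacker on the library's out-of-fold predictions.
from sklearn.linear_model import LogisticRegression

preds, val_labels, scaler, grid = train_model_library(n_folds=5)
stacker = LogisticRegression()
stacker.fit(preds, val_labels)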
import numpy as np
import util
import operator
import math
# import matplotlib.pyplot as plt

user_list = util.user_list
book_list = util.book_list

pred_filename = 'pred-amazon-baseline.csv'
train_filename = 'data/ratings-train.csv'
test_filename = 'data/ratings-test.csv'
user_filename = 'data/users.csv'
book_filename = 'data/books.csv'

train = util.load_train(train_filename)
test_queries = util.load_test(test_filename)
user_list = util.load_users(user_filename)
book_list = util.load_books(book_filename)

# Compute the mean rating (4.070495).
train_mean = float(sum(map(lambda x: x['rating'], train))) / len(train)

# Turn the list of users into a dictionary.
# Store data for each user to keep track of the per-user average.
users = {}  # {user1: {isbn1: 4, isbn2: 5, ...}, user2: {...}, ...}
for user in user_list:
    users[user['user']] = {}

items = {}  # {isbn1: {user1: 4, user2: 5, ...}, isbn2: {...}, ...}
for item in book_list:
    items[item['isbn']] = {}
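# Continuation sketch (an assumption that follows the dict layouts noted in
# the comments above):
for rating in train:
    users[rating['user']][rating['isbn']] = rating['rating']
    items[rating['isbn']][rating['user']] = rating['rating']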
import numpy as np
import util

# This makes predictions based on the mean rating for each user in the
# training data. When there are no training data for a user, it
# defaults to the global mean.
pred_filename = 'pred-user-mean.csv'
train_filename = 'ratings-train.csv'
test_filename = 'ratings-test.csv'
user_filename = 'users.csv'

training_data = util.load_train(train_filename)
test_queries = util.load_test(test_filename)
user_list = util.load_users(user_filename)

# Compute the global mean rating for a fallback.
num_train = len(training_data)
mean_rating = float(sum(map(lambda x: x['rating'], training_data))) / num_train
print "The global mean rating is %0.3f." % (mean_rating)

# Turn the list of users into a dictionary.
# Store data for each user to keep track of the per-user average.
users = {}
for user in user_list:
    users[user['user']] = {
        'total': 0,  # For storing the total of ratings.
        'count': 0,  # For storing the number of ratings.
    }

# Iterate over the training data to compute means.
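# Continuation sketch, following the accumulation pattern used by the
# book-bias script in this repo:
for rating in training_data:
    users[rating['user']]['total'] += rating['rating']
    users[rating['user']]['count'] += 1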
from sys import argv

from matplotlib import pyplot as plt

from util import load_train, load_nifti1


def usage():
    print('%s <num> | <path>' % argv[0])
    print('  num:  index of the training MRI to display')
    print('  path: path to a NIfTI-1 file')
    exit()


if len(argv) != 2:
    usage()

try:
    num = int(argv[1])
    mri = load_train(num)
except ValueError:
    mri = load_nifti1(argv[1])

max_z = mri.shape[2] - 1
initial_z = int(max_z / 2)

# set up figure
fig = plt.figure()
ax = fig.add_subplot(111)
ax.autoscale(True)
plt.subplots_adjust(left=0.25, bottom=0.25)

# plot first data set
frame = 0
mri_plot = ax.imshow(mri[:, :, initial_z], cmap=plt.cm.gray)
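# A plausible continuation (an assumption: the 0.25 margins above leave
# room for a matplotlib Slider that scrolls through z-slices):
from matplotlib.widgets import Slider

slider_ax = plt.axes([0.25, 0.1, 0.5, 0.03])
z_slider = Slider(slider_ax, 'z', 0, max_z, valinit=initial_z, valstep=1)

def update(val):
    mri_plot.set_data(mri[:, :, int(val)])
    fig.canvas.draw_idle()

z_slider.on_changed(update)
plt.show()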
# coding=utf8
import numpy as np
import pandas as pd
from util import load_train

X, y, w = load_train()

if __name__ == '__main__':
    import sys
    clf_name = sys.argv[1]
    print clf_name + '======================='

    # from sklearn import svm
    # wclf = svm.SVC(kernel='linear', class_weight={1: 10})
    # svm doesn't provide predict_proba unless probability=True
    clf = None
    sample_weighted = True
    if clf_name == 'XGB':
        from train_predict import test_xgb
        test_xgb(X, y, w)
    elif clf_name == 'Ada':
        from sklearn.ensemble import AdaBoostClassifier
        clf = AdaBoostClassifier(n_estimators=100)
    elif clf_name == 'DT':
        from sklearn.tree import DecisionTreeClassifier
        clf = DecisionTreeClassifier(random_state=0)
    elif clf_name == 'GB':
        from sklearn.ensemble import GradientBoostingClassifier
        clf = GradientBoostingClassifier(n_estimators=500)
def generateFullTrainMatrix(detectorSettings=(1, 1, 1), partitions=(9, 9, 9)):
    # detectorSettings and partitions are currently unused.
    trainMatrix = []
    # range(util.TRAIN_COUNT) covers every training example; the original
    # range(0, util.TRAIN_COUNT - 1) skipped the last one.
    for i in range(util.TRAIN_COUNT):
        trainMatrix.append(generateEdgeFeaturesVector(util.load_train(i)))
    return trainMatrix
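# Usage sketch (an assumption): stack the per-example feature vectors into
# a numpy design matrix for whatever classifier consumes them.
import numpy as np

X_train = np.array(generateFullTrainMatrix())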
# coding=utf8
import numpy as np
import pandas as pd
from util import load_train

X, y = load_train()

if __name__ == '__main__':
    import sys
    clf_name = sys.argv[1]
    print clf_name + '======================='

    if clf_name == 'XGB':
        from train_predict import test_xgb
        test_xgb(X, y)
from pathlib import Path

import torch

from dataset import DepthEigenDataset
from network import GlobalCoarseNet, LocalFineNet
from loss import ScaleInvariantLoss
import util

# cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyperparameters
num_epochs = 100  # not specified in paper
data_dir_train = Path('nyu/train')
data_dir_valid = Path('nyu/test')
bs = 32

dataloader_train, dataloader_valid, datalen_train, datalen_valid = util.load_train(
    data_dir_train, data_dir_valid, bs)
print(datalen_train, datalen_valid)

# now the net
# initialize
global_model = GlobalCoarseNet(init=False).to(device)
local_model = LocalFineNet(init=False).to(device)

# loss
global_criterion = ScaleInvariantLoss()
local_criterion = ScaleInvariantLoss()

# optimizer
r = 0.1
global_optimizer = torch.optim.SGD([{'params': global_model.coarse6.parameters(),
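# loss.py isn't shown in this section; a minimal sketch of what
# ScaleInvariantLoss computes, assuming predictions and targets are
# log-depths, after Eigen et al. (2014) with lambda = 0.5:
import torch.nn as nn

class ScaleInvariantLossSketch(nn.Module):
    def __init__(self, lam=0.5):
        super(ScaleInvariantLossSketch, self).__init__()
        self.lam = lam

    def forward(self, pred, target):
        d = pred - target  # per-pixel log-depth error
        n = d.numel()
        return (d ** 2).mean() - self.lam * (d.sum() ** 2) / (n ** 2)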