def convert_data():
    # `train` is expected to be set at module level; it selects between the
    # train and test file lists. Conversion is fanned out over worker
    # processes via utils.parmap.
    if train:
        files = config.trainfiles
        num = 16  # worker processes for the train files
    else:
        files = config.testfiles
        num = 7
    utils.parmap(convert_train_file, files, num)
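
# utils.parmap is a project-local helper that is not shown in this file. From
# its call sites -- parmap(func, iterable, n_jobs), returning results in input
# order -- a minimal sketch could look like the following. This is an
# assumption-based illustration (the name parmap_sketch and the
# multiprocessing.Pool backend are guesses, not the project's actual
# implementation; the real helper must also handle lambdas, which Pool.map
# cannot pickle):

from multiprocessing import Pool

def parmap_sketch(func, iterable, n_jobs):
    """Apply func to every item of iterable across n_jobs worker processes."""
    pool = Pool(processes=n_jobs)
    try:
        return pool.map(func, iterable)
    finally:
        pool.close()
        pool.join()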
def run(X, y, XTest, yTest, foldsNum, path, inds, jobsNum=6, calcProbs=True):
    # y = y.astype(np.int)
    # yTest = yTest.astype(np.int)
    # heldoutSize = 1. / foldsNum
    # X, idx = shuffle(X)
    # y = y[idx]
    # XTest, idx = shuffle(XTest)
    # yTest = yTest[idx]
    # scaler = preprocessing.StandardScaler().fit(X)
    # X = scaler.transform(X)
    # XTest = scaler.transform(XTest)
    # X = X[:5000, :]
    # y = y[:5000]

    # Evaluate on a 500-sample slice of the test set to keep runs fast.
    XTest = XTest[:500, :]
    yTest = yTest[:500]

    # cv = StratifiedShuffleSplit(y, foldsNum, heldoutSize, random_state=0)
    cv = IndicesKFold(inds, 5)  # , 4000, 1000

    scores = ['roc_auc', 'precision', 'recall']
    aucScoreFunc = make_scorer(aucScore, greater_is_better=True)
    # scores = ['roc_auc']  # [aucScoreFunc]  # 'roc_auc'
    scores = ['precision']
    calcProbs = True

    print('Start the grid search')
    t = time.time()
    for tuned_param in tuned_parameters:
        # One task per CV fold: each worker fits a classifier with
        # tuned_param on its train split and scores it on its test split.
        params = []
        for fold_num, (train_index, test_index) in enumerate(cv):
            params.append((X, y, train_index, test_index, tuned_param,
                           fold_num, calcProbs))
        print(tuned_param)
        if jobsNum == 1:
            mapResults = [calc_cv_scores(p) for p in params]  # For debugging
        else:
            mapResults = utils.parmap(calc_cv_scores, params, jobsNum)
        cv_scores = np.array([score for (clf, score) in mapResults][:len(cv)])
        print(cv_scores)
        clf = mapResults[0][0]

    elapsed = time.time() - t
    print('Request took ' + str(elapsed) + ' sec.')
    print(str(datetime.now()))

    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    XTest = scaler.transform(XTest)
    printResults(clf, XTest, yTest, calcProbs, path, cv_scores)
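
# IndicesKFold is defined elsewhere in the project. The code above relies on
# three properties: it is built from per-sample trial indices plus a fold
# count, it supports len(), and iterating it yields (train_index, test_index)
# pairs. A minimal sketch under those assumptions, keeping all samples of one
# trial in the same fold (the name IndicesKFoldSketch and the grouping rule
# are hypothetical -- the real class may split differently):

import numpy as np

class IndicesKFoldSketch(object):
    def __init__(self, inds, folds_num):
        self.inds = np.asarray(inds)
        # Distribute the unique trial ids across folds_num folds.
        self.folds = np.array_split(np.unique(self.inds), folds_num)

    def __len__(self):
        return len(self.folds)

    def __iter__(self):
        for fold_trials in self.folds:
            test_mask = np.in1d(self.inds, fold_trials)
            yield np.where(~test_mask)[0], np.where(test_mask)[0]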
def run(x, y, x_test, y_test, folds_num, path, inds, jobs_num=6, calc_probs=True):
    # y = y.astype(np.int)
    # y_test = y_test.astype(np.int)
    # heldoutSize = 1. / folds_num
    # x, idx = shuffle(x)
    # y = y[idx]
    # x_test, idx = shuffle(x_test)
    # y_test = y_test[idx]
    # scaler = preprocessing.StandardScaler().fit(x)
    # x = scaler.transform(x)
    # x_test = scaler.transform(x_test)
    # x = x[:5000, :]
    # y = y[:5000]

    # Evaluate on a 500-sample slice of the test set to keep runs fast.
    x_test = x_test[:500, :]
    y_test = y_test[:500]

    # cv = StratifiedShuffleSplit(y, folds_num, heldoutSize, random_state=0)
    cv = IndicesKFold(inds, folds_num)  # , 4000, 1000

    scores = ['roc_auc', 'precision', 'recall']
    auc_score_func = make_scorer(auc_score, greater_is_better=True)
    # scores = ['roc_auc']  # [auc_score_func]  # 'roc_auc'
    scores = ['precision']
    # calc_probs = True

    # Results for every parameter set are dumped under DUMP_FOLDER so an
    # interrupted grid search can be resumed.
    parts_of_path = path.split('/')
    dump_path = os.path.join(DUMP_FOLDER, parts_of_path[-1])
    if not os.path.exists(dump_path):
        os.makedirs(dump_path)
    current_dump_folder = dump_path

    print('Start the grid search')
    t = time.time()
    for tuned_param in tuned_parameters:
        # Skip parameter sets whose results were already dumped.
        already_exist = check_if_params_were_calculated(dump_path, tuned_param)
        if already_exist:
            continue
        params = []
        for fold_num, (train_index, test_index) in enumerate(cv):
            params.append((x, y, train_index, test_index, tuned_param,
                           fold_num, calc_probs, inds))
        print(tuned_param)
        if jobs_num == 1:
            map_results = [calc_cv_scores(p) for p in params]  # For debugging
        else:
            map_results = utils.parmap(calc_cv_scores, params, jobs_num)
        cv_scores = np.array([(score, score_trial)
                              for (clf, score, score_trial) in map_results][:len(cv)])
        print(cv_scores)
        mean_cv_score = sum(cv_scores) / len(cv_scores)
        print('==============================mean auc score is ' +
              str(mean_cv_score) + '==============================')
        clf = map_results[0][0]
        elapsed = time.time() - t
        print('Request took ' + str(elapsed) + ' sec.')
        print(str(datetime.now()))
        # scaler = preprocessing.StandardScaler().fit(x)
        # x = scaler.transform(x)
        # x_test = scaler.transform(x_test)
        mini_path = path.split('/')[-1]
        mini_path = mini_path.replace(' ', '_')
        print_results(clf, x_test, y_test, calc_probs, path, None,
                      cv_scores, current_dump_folder, mean_cv_score)
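
# check_if_params_were_calculated is project-local; the resume logic above
# only needs a boolean "were results for this parameter set already dumped?".
# A minimal sketch that keys each dump file on a hash of the parameter dict,
# assuming each tuned_param is a dict of hyper-parameters. The file-naming
# scheme is hypothetical -- it just has to match whatever print_results
# writes into current_dump_folder:

import hashlib
import os

def check_if_params_were_calculated_sketch(dump_path, tuned_param):
    key = hashlib.md5(repr(sorted(tuned_param.items())).encode()).hexdigest()
    return os.path.exists(os.path.join(dump_path, key + '.pkl'))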
    sys.stdout.flush()
    return 1 - test_score


def run_single(train_files_glob, i):
    # Load subject i's train / validation / test splits, train one network
    # on them, and return its precision.
    print('... reading')
    sys.stdout.flush()
    x_train, y_train = gen_vw.read_train_data(train_files_glob, i)
    x_test, y_test = gen_vw.read_test_data(train_files_glob, i)
    x_valid, y_valid = gen_vw.read_valid_data(train_files_glob, i)
    print('... reading done')
    sys.stdout.flush()

    # Symbolic Theano inputs for the training function.
    x = T.matrix('x')
    y = T.ivector('y')
    error = train_batch(x, y, x_train, y_train, x_valid, y_valid,
                        x_test, y_test)
    return 1 - error


if __name__ == '__main__':
    # tfile = config.train_folder + "train_subject0[1-4].mat"
    # print(run_single(tfile, 0))
    tfile = config.train_folder + "*.mat"
    precisions = utils.parmap(lambda i: run_single(tfile, i),
                              len(glob(tfile)) - 1)
    print(sum(precisions) / len(precisions))
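
# gen_vw.read_train_data / read_valid_data / read_test_data are project-local.
# From the calls above, each takes (glob_pattern, subject_index) and returns
# an (X, y) pair for that subject. A minimal sketch assuming the .mat files
# expose 'X' and 'y' arrays directly (hypothetical field names -- the real
# readers almost certainly do feature extraction on top of the raw data):

from glob import glob
import scipy.io

def read_train_data_sketch(train_files_glob, i):
    mat = scipy.io.loadmat(sorted(glob(train_files_glob))[i])
    return mat['X'], mat['y'].ravel()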