def fabert_predict(train_data, labels, valid_data, test_data, output_dir, time_budget, target_num, is_sparse):
    print(strftime("%Y-%m-%d %H:%M:%S"))
    print("make multiclass prediction\n")

    # Seed numpy from the clock and log the seed so the run can be traced.
    np_seed = int(time.time())
    np.random.seed(np_seed)
    print("np seed = ", np_seed)

    print(train_data.shape)
    print("train_data.shape == (%d,%d)\n" % train_data.shape)
    n_features = train_data.shape[1]
    n_samples = train_data.shape[0]
    start_time = time.time()

    # Feature selection is disabled for this dataset; the code is kept for reference.
    # FS_iterations = max(1, int(5000 / target_num * (5000. / n_samples) * 2000. / n_features))
    # FS_iterations = 5000
    # print("FS_iterations = %d\n" % FS_iterations)
    #
    # select_clf = ExtraTreesClassifier(n_estimators=FS_iterations, max_depth=3)
    # select_clf.fit(train_data, labels)
    # print("FS time = ", time.time() - start_time)
    #
    # my_mean = 1. / (1000 * n_features)
    # print(my_mean)
    # print("feature importances: ", np.sort(select_clf.feature_importances_))
    #
    # train_data = select_clf.transform(train_data, threshold=my_mean)
    # valid_data = select_clf.transform(valid_data, threshold=my_mean)
    # test_data = select_clf.transform(test_data, threshold=my_mean)
    # print(my_mean)
    # print(train_data.shape)
    #
    # exit(1)

    ######################### Make validation/test predictions
    n_features = train_data.shape[1]

    # Fixed GBT settings for this dataset; the commented-out lines show the generic heuristics.
    # if n_features < 100:
    #     gbt_features = n_features
    # else:
    #     gbt_features = int(n_features ** 0.5)
    gbt_features = int(n_features ** 0.5)
    gbt_iterations = 15000  # int((time_budget / 3000.) * 3000000 / (gbt_features * target_num) * (7000. / n_samples))
    # gbt_params = GBT_params(n_iterations=gbt_iterations, depth=int(10 * np.log2(gbt_iterations) / 14.3),
    #                         learning_rate=0.01, subsample_part=0.6, n_max_features=gbt_features,
    #                         min_samples_split=5, min_samples_leaf=3)
    gbt_params = GBT_params(
        n_iterations=gbt_iterations,
        depth=4,
        learning_rate=0.01,
        subsample_part=0.6,
        n_max_features=gbt_features,
        min_samples_split=5,
        min_samples_leaf=3,
    )
    gbt_params.print_params()

    (y_valid, y_test) = make_classification(gbt_params, train_data, labels, valid_data, test_data)
    print("y_valid.shape = ", y_valid.shape)
    print("y_test.shape = ", y_test.shape)
    return (y_valid, y_test)
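# Editorial sketch: GBT_params and make_classification() are defined elsewhere in this
# repository.  The predict functions here only assume that make_classification() fits a
# gradient-boosted model with the given parameters and returns predictions for the
# validation and test sets (presumably class probabilities, given the multiclass setting).
# A minimal illustration of that contract (hypothetical helper, not the project's actual
# implementation) could look like:
def _example_make_classification(gbt_params, train_data, labels, valid_data, test_data):
    from sklearn import ensemble

    clf = ensemble.GradientBoostingClassifier(
        n_estimators=gbt_params.n_iterations,
        max_depth=gbt_params.depth,
        learning_rate=gbt_params.learning_rate,
        subsample=gbt_params.subsample_part,
        max_features=int(gbt_params.n_max_features),
        min_samples_split=gbt_params.min_samples_split,
        min_samples_leaf=gbt_params.min_samples_leaf,
    )
    clf.fit(train_data, labels)
    return clf.predict_proba(valid_data), clf.predict_proba(test_data)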
def make_cross_validation(data, solution, cv_folds, params_begin, params_mult_factor, params_add_factor, params_num_iter):
    # Grid search over the GBT hyper-parameters: every field starts at params_begin and,
    # after each step, is multiplied by params_mult_factor and incremented by params_add_factor.
    # params = GBT_params(params_begin.n_iterations, params_begin.depth, params_begin.learning_rate,
    #                     params_begin.subsample_part, params_begin.n_max_features)
    params = GBT_params()

    # Total number of grid points = product of the per-parameter step counts.
    cv_iterations = (
        params_num_iter.n_iterations
        * params_num_iter.depth
        * params_num_iter.learning_rate
        * params_num_iter.subsample_part
        * params_num_iter.n_max_features
    )
    cv_res = np.zeros(cv_iterations)
    cv_times = np.zeros(cv_iterations)
    cur_iter = 0

    params.n_iterations = params_begin.n_iterations
    for n_iterations in range(params_num_iter.n_iterations):
        params.learning_rate = params_begin.learning_rate
        for n_learning_rate in range(params_num_iter.learning_rate):
            params.depth = params_begin.depth
            for n_max_depth in range(params_num_iter.depth):
                params.subsample_part = params_begin.subsample_part
                for subsample_part in range(params_num_iter.subsample_part):
                    params.n_max_features = params_begin.n_max_features
                    for n_max_features in range(params_num_iter.n_max_features):
                        start_time = time.time()
                        params.print_params()
                        clf = ensemble.GradientBoostingClassifier(
                            n_estimators=params.n_iterations,
                            learning_rate=params.learning_rate,
                            max_depth=params.depth,
                            subsample=params.subsample_part,
                            max_features=int(params.n_max_features),
                        )
                        cv_res[cur_iter] = Calc_CV_ERROR(clf, data, solution, cv_folds)
                        print("CV score = %1.5f" % cv_res[cur_iter])
                        cv_times[cur_iter] = time.time() - start_time
                        print("CV time = %d" % cv_times[cur_iter])

                        params.n_max_features *= params_mult_factor.n_max_features
                        params.n_max_features += params_add_factor.n_max_features
                        cur_iter += 1
                    params.subsample_part *= params_mult_factor.subsample_part
                    params.subsample_part += params_add_factor.subsample_part
                params.depth *= params_mult_factor.depth
                params.depth += params_add_factor.depth
            params.learning_rate *= params_mult_factor.learning_rate
            params.learning_rate += params_add_factor.learning_rate
        params.n_iterations *= params_mult_factor.n_iterations
        params.n_iterations += params_add_factor.n_iterations
    return (cv_res, cv_times)
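# Editorial sketch of how make_cross_validation() might be driven (hypothetical values;
# assumes GBT_params accepts the keyword arguments used above and supplies defaults for
# the remaining fields).  Each field starts at params_begin.<field> and is swept
# params_num.<field> times, multiplied by params_mult.<field> and incremented by
# params_add.<field> after every step.
def _example_cv_grid(train_data, labels, cv_folds=5):
    params_begin = GBT_params(n_iterations=100, depth=3, learning_rate=0.05,
                              subsample_part=0.6, n_max_features=10)
    params_mult = GBT_params(n_iterations=2, depth=1, learning_rate=2,
                             subsample_part=1, n_max_features=2)
    params_add = GBT_params(n_iterations=0, depth=1, learning_rate=0,
                            subsample_part=0, n_max_features=0)
    # Step counts per field: 3 x 2 x 2 x 1 x 2 = 24 grid points in total.
    params_num = GBT_params(n_iterations=3, depth=2, learning_rate=2,
                            subsample_part=1, n_max_features=2)
    return make_cross_validation(train_data, labels, cv_folds,
                                 params_begin, params_mult, params_add, params_num)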
def multiclass_predict(train_data, labels, valid_data, test_data, output_dir, time_budget, target_num, is_sparse):
    print(strftime("%Y-%m-%d %H:%M:%S"))
    print("make multiclass prediction\n")

    np_seed = int(time.time())
    np.random.seed(np_seed)
    print("np seed = ", np_seed)

    print(train_data.shape)
    print("train_data.shape == (%d,%d)\n" % train_data.shape)
    n_features = train_data.shape[1]
    n_samples = train_data.shape[0]
    start_time = time.time()

    if is_sparse:
        # Convert sparse inputs to dense arrays before feature selection and boosting.
        print("no FS, it is sparse data\n")
        train_data = train_data.toarray()
        valid_data = valid_data.toarray()
        test_data = test_data.toarray()
        # train_data = select_clf.transform(train_data, threshold=my_mean)
        # valid_data = select_clf.transform(valid_data, threshold=my_mean)
        # test_data = select_clf.transform(test_data, threshold=my_mean)
        print("sparse converting time = ", time.time() - start_time)

    # Feature selection: fit extra-trees and keep features whose importance exceeds my_mean.
    start_time = time.time()
    FS_iterations = max(1, int(5000 / target_num * (5000. / n_samples) * 2000. / n_features))
    print("FS_iterations = %d\n" % FS_iterations)
    select_clf = ExtraTreesClassifier(n_estimators=FS_iterations, max_depth=3)
    select_clf.fit(train_data, labels)
    print("FS time = ", time.time() - start_time)

    my_mean = 1. / (10 * n_features)
    train_data = select_clf.transform(train_data, threshold=my_mean)
    valid_data = select_clf.transform(valid_data, threshold=my_mean)
    test_data = select_clf.transform(test_data, threshold=my_mean)
    print(my_mean)
    print(train_data.shape)

    ######################### Make validation/test predictions
    n_features = train_data.shape[1]
    if n_features < 100:
        gbt_features = n_features
    else:
        gbt_features = int(n_features ** 0.5)
    # Scale the number of boosting iterations to the time budget, feature count,
    # class count and sample count; depth grows with the iteration count.
    gbt_iterations = int((time_budget / 3000.) * 3000000 / (gbt_features * target_num) * (7000. / n_samples))
    gbt_params = GBT_params(
        n_iterations=gbt_iterations,
        depth=int(10 * np.log2(gbt_iterations) / 14.3),
        learning_rate=0.01,
        subsample_part=0.6,
        n_max_features=gbt_features,
        min_samples_split=5,
        min_samples_leaf=3,
    )
    gbt_params.print_params()

    (y_valid, y_test) = make_classification(gbt_params, train_data, labels, valid_data, test_data)
    print("y_valid.shape = ", y_valid.shape)
    print("y_test.shape = ", y_test.shape)
    return (y_valid, y_test)
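# Editorial note (assumption about the scikit-learn version): the
# select_clf.transform(X, threshold=...) calls in multiclass_predict() rely on the old
# estimator-side transform API, which was removed from later scikit-learn releases.  On a
# recent scikit-learn the same importance-threshold selection can be expressed with
# SelectFromModel; a minimal sketch:
def _example_select_features(select_clf, threshold, *datasets):
    from sklearn.feature_selection import SelectFromModel

    # prefit=True reuses the already fitted extra-trees model and its feature_importances_.
    selector = SelectFromModel(select_clf, threshold=threshold, prefit=True)
    return tuple(selector.transform(X) for X in datasets)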