def run_regression(data_file, lead, lag):
    """Fit a randomized logistic regression on flattened features and return per-feature scores.

    Flattens data_file into a temporary CSV via flatten_featureset, loads it,
    fits RandomizedLogisticRegression, and returns the stability-selection scores.

    Args:
        data_file: path to the raw featureset CSV to flatten.
        lead: number of weeks ahead to predict.
        lag: number of weeks of history used as features.

    Returns:
        logreg.scores_ — array of feature selection scores, one per feature column.
    """
    intermediate_file = "prediction/data/tmp.csv"
    flatten_featureset.create_features(intermediate_file, data_file, lead, lag)
    try:
        # File format is [label, feature_1, ..., feature_n]; skip the header row.
        train_data = np.genfromtxt(intermediate_file, delimiter=',', skip_header=1)
    finally:
        # Always clean up the temp file, even if loading fails.
        os.remove(intermediate_file)
    X_train = train_data[:, 1:]
    Y_train = train_data[:, 0]
    # NOTE(review): RandomizedLogisticRegression was deprecated in scikit-learn 0.19
    # and removed in 0.21 — pin sklearn < 0.21 or migrate to an alternative
    # (e.g. SelectFromModel with LogisticRegression).
    logreg = linear_model.RandomizedLogisticRegression()  # n_jobs=12
    logreg.fit(X_train, Y_train)
    return logreg.scores_
def _flatten_and_split(intermediate_file, source_file, lead, lag):
    """Flatten source_file into intermediate_file, load it, delete it, return (X, Y)."""
    flatten_featureset.create_features(intermediate_file, source_file, lead, lag)
    try:
        # File format is [label, feature_1, ..., feature_n]; skip the header row.
        data = np.genfromtxt(intermediate_file, delimiter=',', skip_header=1)
    finally:
        # Always remove the temp file, even if loading fails.
        os.remove(intermediate_file)
    return data[:, 1:], data[:, 0]

def load_data(train_file, test_file, lead, lag):
    """Build flattened train/test feature matrices and label vectors.

    Args:
        train_file: path to the raw training featureset CSV.
        test_file: path to the raw test featureset CSV.
        lead: number of weeks ahead to predict.
        lag: number of weeks of history used as features.

    Returns:
        (X_train, Y_train, X_test, Y_test) numpy arrays; labels come from
        column 0 of each flattened file, features from the remaining columns.
    """
    X_train, Y_train = _flatten_and_split("prediction/data/train.csv", train_file, lead, lag)
    X_test, Y_test = _flatten_and_split("prediction/data/test.csv", test_file, lead, lag)
    return X_train, Y_train, X_test, Y_test
def load_data(train_file, test_file, lead, lag):
    """Flatten the train and test featuresets and return (X_train, Y_train, X_test, Y_test).

    Each source file is flattened into a temporary CSV under data/, loaded with
    numpy, and the temporary file is then deleted. Column 0 of each flattened
    file is the label; the remaining columns are features.

    NOTE(review): this duplicates the other load_data except for the temp-file
    paths ("data/" vs "prediction/data/") — confirm which one callers expect.
    """
    splits = []
    jobs = (
        ("data/train.csv", train_file),
        ("data/test.csv", test_file),
    )
    for tmp_path, source in jobs:
        flatten_featureset.create_features(tmp_path, source, lead, lag)
        matrix = np.genfromtxt(tmp_path, delimiter=',', skip_header=1)
        os.remove(tmp_path)
        # Column 0 is the label; columns 1..n are the features.
        splits.append(matrix[:, 1:])
        splits.append(matrix[:, 0])
    X_train, Y_train, X_test, Y_test = splits
    return X_train, Y_train, X_test, Y_test
@author: Colin Taylor Run flatten_featureset for all cohorts to create a set of flattened datasets Name pattern of flattened featureset is: data/flat/features_cut_wiki_only_lead_5_lag_2_train.csv Name pattern of input featureset is: data/features_cut_wiki_only_train.csv ''' import flatten_featureset import time cohorts = ["forum_only", "wiki_only", "forum_and_wiki", "no_collab"] trains = ["train", "test"] features_base = "features_" in_data_file_prefix = "prediction/data/" + features_base out_data_file_prefix = "prediction/data/flat/" + features_base data_file_suffix = ".csv" for cohort in cohorts: for train in trains: start_time = time.time() for lead in range (1,14): for lag in range(1, 15 - lead): in_data_file = in_data_file_prefix + cohort + "_" + train + data_file_suffix out_data_file = out_data_file_prefix + cohort + "_lead_%s_lag_%s_" % (lead, lag) + train + data_file_suffix flatten_featureset.create_features(out_data_file, in_data_file, lead, lag) print "Ran flatten %s, %s in %s seconds" % (cohort, train, time.time() - start_time)