def run_regression(data_file, lead, lag):
	"""Fit a randomized logistic regression on flattened features and return feature scores.

	Flattens data_file into a temporary CSV via flatten_featureset, loads it,
	and fits sklearn's RandomizedLogisticRegression for stability selection.

	:param data_file: path to the raw input featureset CSV
	:param lead: lead window, forwarded to flatten_featureset.create_features
	:param lag: lag window, forwarded to flatten_featureset.create_features
	:return: per-feature stability scores (the estimator's scores_ attribute)
	"""
	intermediate_file = "prediction/data/tmp.csv"

	flatten_featureset.create_features(intermediate_file, data_file, lead, lag)
	try:
		# File format is [label, feature_1, ..., feature_n] with one header row.
		train_data = np.genfromtxt(intermediate_file, delimiter=',', skip_header=1)
	finally:
		# Clean up the temp file even if parsing fails, so a rerun never sees stale data.
		os.remove(intermediate_file)

	X_train = train_data[:, 1:]
	Y_train = train_data[:, 0]

	# NOTE(review): RandomizedLogisticRegression was deprecated in scikit-learn 0.19
	# and removed in 0.21 — this code requires sklearn < 0.21, or migration to a
	# maintained stability-selection implementation.
	logreg = linear_model.RandomizedLogisticRegression()  # n_jobs=12
	logreg.fit(X_train, Y_train)

	return logreg.scores_
# Esempio n. 2
# 0
def load_data(train_file, test_file, lead, lag):
	"""Flatten the train and test featuresets and return them as numpy arrays.

	:param train_file: path to the raw training featureset CSV
	:param test_file: path to the raw test featureset CSV
	:param lead: lead window, forwarded to flatten_featureset.create_features
	:param lag: lag window, forwarded to flatten_featureset.create_features
	:return: (X_train, Y_train, X_test, Y_test); column 0 of each flattened
	         file is the label, the remaining columns are features
	"""
	train_data = _flatten_and_load("prediction/data/train.csv", train_file, lead, lag)
	test_data = _flatten_and_load("prediction/data/test.csv", test_file, lead, lag)

	X_train = train_data[:, 1:]  # file format is [label list_of_features]
	Y_train = train_data[:, 0]
	X_test = test_data[:, 1:]
	Y_test = test_data[:, 0]

	return X_train, Y_train, X_test, Y_test

def _flatten_and_load(intermediate_file, source_file, lead, lag):
	"""Flatten source_file into intermediate_file, load it as an array, and delete it."""
	flatten_featureset.create_features(intermediate_file, source_file, lead, lag)
	try:
		return np.genfromtxt(intermediate_file, delimiter=',', skip_header=1)
	finally:
		# Remove the intermediate file even when parsing fails.
		os.remove(intermediate_file)
def load_data(train_file, test_file, lead, lag):
	"""Flatten the train and test featuresets and return them as numpy arrays.

	NOTE(review): this is a second definition of load_data in this file; it
	shadows the earlier one and differs only in the intermediate paths
	("data/..." instead of "prediction/data/...").

	:param train_file: path to the raw training featureset CSV
	:param test_file: path to the raw test featureset CSV
	:param lead: lead window, forwarded to flatten_featureset.create_features
	:param lag: lag window, forwarded to flatten_featureset.create_features
	:return: (X_train, Y_train, X_test, Y_test); column 0 of each flattened
	         file is the label, the remaining columns are features
	"""
	intermediate_file1 = "data/train.csv"
	intermediate_file2 = "data/test.csv"

	flatten_featureset.create_features(intermediate_file1, train_file, lead, lag)
	try:
		train_data = np.genfromtxt(intermediate_file1, delimiter=',', skip_header=1)
	finally:
		# Clean up even if parsing fails, so a rerun never sees stale data.
		os.remove(intermediate_file1)

	flatten_featureset.create_features(intermediate_file2, test_file, lead, lag)
	try:
		test_data = np.genfromtxt(intermediate_file2, delimiter=',', skip_header=1)
	finally:
		os.remove(intermediate_file2)

	X_train = train_data[:, 1:]  # file format is [label list_of_features]
	Y_train = train_data[:, 0]
	X_test = test_data[:, 1:]
	Y_test = test_data[:, 0]

	return X_train, Y_train, X_test, Y_test
# Esempio n. 4
# 0
'''
@author: Colin Taylor

Run flatten_featureset for all cohorts to create a set of flattened datasets
Name pattern of flattened featureset is: data/flat/features_cut_wiki_only_lead_5_lag_2_train.csv
Name pattern of input featureset is: data/features_cut_wiki_only_train.csv
'''
import flatten_featureset
import time

# Flatten every (cohort, split, lead, lag) combination into its own CSV.
cohorts = ["forum_only", "wiki_only", "forum_and_wiki", "no_collab"]
trains = ["train", "test"]

features_base = "features_"
in_data_file_prefix = "prediction/data/" + features_base
out_data_file_prefix = "prediction/data/flat/" + features_base
data_file_suffix = ".csv"

for cohort in cohorts:
	for train in trains:
		start_time = time.time()
		# The input file depends only on (cohort, train) — hoisted out of the lead/lag loops.
		in_data_file = in_data_file_prefix + cohort + "_" + train + data_file_suffix
		# lead + lag is capped at 14; enumerate every valid window size.
		for lead in range(1, 14):
			for lag in range(1, 15 - lead):
				out_data_file = out_data_file_prefix + cohort + "_lead_%s_lag_%s_" % (lead, lag) + train + data_file_suffix
				flatten_featureset.create_features(out_data_file, in_data_file, lead, lag)
		# Parenthesized print call is valid under both Python 2 and Python 3
		# (the original print statement is a syntax error on Python 3).
		print("Ran flatten %s, %s in %s seconds" % (cohort, train, time.time() - start_time))