import os

from sklearn.grid_search import GridSearchCV  # NOTE(review): sklearn.grid_search was removed in sklearn >= 0.20; modern location is sklearn.model_selection
from sklearn.model_selection import train_test_split

import TWK_feat_eng


# Relative directories holding the per-sample training / test files.
TRAIN_DIR = "train"
TEST_DIR = "test"

def create_submission(ids,predictions,filename):
	"""Dump (id, prediction) pairs to `filename` as a two-column CSV.

	Writes a fixed "Id,Prediction" header, then one `id,prediction` row
	per pair; pairing stops at the shorter of the two iterables (zip
	semantics). Any existing file at `filename` is overwritten.
	"""
	lines = ["Id,Prediction\n"]
	for sample_id, pred in zip(ids, predictions):
		lines.append(str(sample_id) + "," + str(pred) + "\n")
	with open(filename, "w") as out:
		out.writelines(lines)

if __name__ == "__main__":
	# NOTE(review): `os`, `TWK_feat_eng`, and `train_test_split` are never
	# imported in the visible lines of this file — confirm the imports
	# exist elsewhere, otherwise this block raises NameError.
	# One file per sample; the counts bound the file range passed to the
	# feature-engineering step below.
	num_of_train_files = len(os.listdir(TRAIN_DIR))
	num_of_test_files = len(os.listdir(TEST_DIR))
	# Presumably whitelists of feature names to keep — verify against
	# TWK_feat_eng.read_attributes.
	good_attributes = TWK_feat_eng.read_attributes('attributes.txt')
	good_calls = TWK_feat_eng.read_attributes('calls.txt')
	# Build the design matrices; with training=True the second return value
	# (t_train) carries the target labels, for the test set it is discarded.
	X_train, t_train, train_ids = TWK_feat_eng.create_data_matrix(0, num_of_train_files, good_attributes, good_calls, direc=TRAIN_DIR, training=True)
	full_test, _, test_ids = TWK_feat_eng.create_data_matrix(0, num_of_test_files, good_attributes, good_calls, direc=TEST_DIR, training=False)

	# Hold out a third of the training data for validation; fixed seed keeps
	# the split reproducible across runs.
	xX_train, xX_valid, xY_train, xY_valid = train_test_split(X_train, t_train, test_size=0.33, random_state=181)

	# Quickly check to see if distributions are well mixed... first pass showed that the histograms matched up well
	# plt.figure()
	# plt.hist(xY_train,bins=20,alpha=0.5)
	# plt.hist(xY_valid,bins=20,alpha=0.5)
	# plt.show()

	# Python 2 print statements — this script targets Python 2.
	print "We've compiled the training data and have split it into training/validation... stay tuned!\n"
	print "Train set dims: ", X_train.shape, "Number of training files: ", num_of_train_files
	print "Test set dims: ", full_test.shape, "Number of testing files: ", num_of_test_files
from sklearn.grid_search import GridSearchCV


# Relative directories holding the per-sample training / test files.
# NOTE(review): duplicate of the constants defined earlier in this file —
# the script appears to be two concatenated versions.
TRAIN_DIR = "train"
TEST_DIR = "test"

def create_submission(ids, predictions, filename, header="Id,Prediction"):
	"""Write a two-column submission CSV.

	Parameters
	----------
	ids : iterable
		Row identifiers, written to the first column.
	predictions : iterable
		Predicted values, paired with ids; pairing stops at the shorter
		of the two iterables (zip semantics).
	filename : str
		Output path; any existing file is overwritten.
	header : str, optional
		Header line written first. The default preserves the original
		hard-coded "Id,Prediction" behavior.
	"""
	with open(filename, "w") as f:
		f.write(header + "\n")
		for row_id, pred in zip(ids, predictions):
			f.write(str(row_id) + "," + str(pred) + "\n")

if __name__ == "__main__":
	# NOTE(review): `os`, `TWK_feat_eng`, and `train_test_split` are never
	# imported in the visible lines of this file — confirm the imports
	# exist elsewhere, otherwise this block raises NameError.
	# One file per sample; the counts bound the file range passed to the
	# feature-engineering step below.
	num_of_train_files = len(os.listdir(TRAIN_DIR))
	num_of_test_files = len(os.listdir(TEST_DIR))
	# Build the design matrices; with training=True the second return value
	# (t_train) carries the target labels, for the test set it is discarded.
	# NOTE(review): this call passes the directory positionally, unlike the
	# earlier copy of this script which passes attribute/call whitelists and
	# direc= as a keyword — confirm which signature TWK_feat_eng actually has.
	X_train, t_train, train_ids = TWK_feat_eng.create_data_matrix(0, num_of_train_files, TRAIN_DIR, training=True)
	full_test, _, test_ids = TWK_feat_eng.create_data_matrix(0, num_of_test_files, TEST_DIR, training=False)

	# Hold out a third of the training data for validation; fixed seed keeps
	# the split reproducible across runs.
	xX_train, xX_valid, xY_train, xY_valid = train_test_split(X_train, t_train, test_size=0.33, random_state=181)

	# Quickly check to see if distributions are well mixed... first pass showed that the histograms matched up well
	# plt.figure()
	# plt.hist(xY_train,bins=20,alpha=0.5)
	# plt.hist(xY_valid,bins=20,alpha=0.5)
	# plt.show()

	# Python 2 print statements — this script targets Python 2.
	print "We've compiled the training data and have split it into training/validation... stay tuned!\n"
	print "Train set dims: ", X_train.shape
	print "Test set dims: ", full_test.shape

	# Standardize the data!