# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20; the
# replacement module is sklearn.model_selection (same class names).
from sklearn.model_selection import GridSearchCV, train_test_split

import os

import TWK_feat_eng  # project-local feature-engineering helpers

TRAIN_DIR = "train"
TEST_DIR = "test"


def create_submission(ids, predictions, filename):
    """Write a Kaggle-style submission CSV.

    Emits a header line ``Id,Prediction`` followed by one ``id,prediction``
    row per element of ``zip(ids, predictions)``.

    :param ids: iterable of example identifiers.
    :param predictions: iterable of predictions, aligned with ``ids``.
    :param filename: path of the CSV file to (over)write.
    """
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i, p in zip(ids, predictions):
            f.write(str(i) + "," + str(p) + "\n")


if __name__ == "__main__":
    # One XML file per example; the count bounds the range of files to parse.
    num_of_train_files = len(os.listdir(TRAIN_DIR))
    num_of_test_files = len(os.listdir(TEST_DIR))

    # Whitelists of attribute / system-call names to keep as features.
    good_attributes = TWK_feat_eng.read_attributes('attributes.txt')
    good_calls = TWK_feat_eng.read_attributes('calls.txt')

    X_train, t_train, train_ids = TWK_feat_eng.create_data_matrix(
        0, num_of_train_files, good_attributes, good_calls,
        direc=TRAIN_DIR, training=True)
    full_test, _, test_ids = TWK_feat_eng.create_data_matrix(
        0, num_of_test_files, good_attributes, good_calls,
        direc=TEST_DIR, training=False)

    # Fixed random_state so the train/validation split is reproducible.
    xX_train, xX_valid, xY_train, xY_valid = train_test_split(
        X_train, t_train, test_size=0.33, random_state=181)

    # Quickly check to see if distributions are well mixed... first pass
    # showed that the histograms matched up well
    # plt.figure()
    # plt.hist(xY_train,bins=20,alpha=0.5)
    # plt.hist(xY_valid,bins=20,alpha=0.5)
    # plt.show()

    print("We've compiled the training data and have split it into training/validation... stay tuned!\n")
    print("Train set dims: ", X_train.shape, "Number of training files: ", num_of_train_files)
    print("Test set dims: ", full_test.shape, "Number of testing files: ", num_of_test_files)
# NOTE(review): sklearn.grid_search was removed in scikit-learn 0.20; the
# replacement module is sklearn.model_selection (same class names).
from sklearn.model_selection import GridSearchCV, train_test_split

import os

import TWK_feat_eng  # project-local feature-engineering helpers

TRAIN_DIR = "train"
TEST_DIR = "test"


def create_submission(ids, predictions, filename):
    """Write a Kaggle-style submission CSV.

    Emits a header line ``Id,Prediction`` followed by one ``id,prediction``
    row per element of ``zip(ids, predictions)``.

    :param ids: iterable of example identifiers.
    :param predictions: iterable of predictions, aligned with ``ids``.
    :param filename: path of the CSV file to (over)write.
    """
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i, p in zip(ids, predictions):
            f.write(str(i) + "," + str(p) + "\n")


if __name__ == "__main__":
    # One XML file per example; the count bounds the range of files to parse.
    num_of_train_files = len(os.listdir(TRAIN_DIR))
    num_of_test_files = len(os.listdir(TEST_DIR))

    # NOTE(review): this variant passes the directory positionally, with no
    # attribute/call whitelists — presumably an older signature of
    # create_data_matrix; verify against TWK_feat_eng before running.
    X_train, t_train, train_ids = TWK_feat_eng.create_data_matrix(
        0, num_of_train_files, TRAIN_DIR, training=True)
    full_test, _, test_ids = TWK_feat_eng.create_data_matrix(
        0, num_of_test_files, TEST_DIR, training=False)

    # Fixed random_state so the train/validation split is reproducible.
    xX_train, xX_valid, xY_train, xY_valid = train_test_split(
        X_train, t_train, test_size=0.33, random_state=181)

    # Quickly check to see if distributions are well mixed... first pass
    # showed that the histograms matched up well
    # plt.figure()
    # plt.hist(xY_train,bins=20,alpha=0.5)
    # plt.hist(xY_valid,bins=20,alpha=0.5)
    # plt.show()

    print("We've compiled the training data and have split it into training/validation... stay tuned!\n")
    print("Train set dims: ", X_train.shape)
    print("Test set dims: ", full_test.shape)

    # Standardize the data!