def main():
    """Score the validation pairs with the saved classifier and write a submission.

    Steps, in order: read the validation pairs, build features with
    ``fe.feature_extractor()``, replace NaN/inf values, save the feature
    matrix, load the persisted classifier, and write one score per pair.

    The score is ``predict_proba`` column 2 minus column 0.
    NOTE(review): presumably the classifier has three classes ordered so that
    column 0 and column 2 are the two opposite directions -- confirm against
    the training script.
    """
    print("Reading the valid pairs")
    valid_pairs = data_io.read_valid_pairs()
    extractor = fe.feature_extractor()

    print("Transforming features")
    # nan_to_num keeps the classifier from seeing NaN/inf feature values.
    valid_features = np.nan_to_num(extractor.fit_transform(valid_pairs))

    print("Saving Valid Features")
    data_io.save_features(valid_features)

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    class_probs = model.predict_proba(valid_features)
    scores = (class_probs[:, 2] - class_probs[:, 0]).flatten()

    print("Writing predictions to file")
    data_io.write_submission(scores)
def main():
    """Train a classifier on train + SUP 1-3 + old-train data and persist it.

    Reads each data set through ``data_io``, passes the pairs through
    ``combine_types`` and ``get_types``, appends everything into one training
    frame and one target frame, extracts features, fits the model via
    ``classify_catagory``, then saves both the feature matrix and the model.
    Prints progress messages and the elapsed wall-clock time in minutes.
    """
    started = time()

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    # make function later
    train = get_types(combine_types(train, train_info))
    target = data_io.read_train_target()

    print("Reading SUP data...")
    for sup_index in (1, 2, 3):
        print("%s %s" % ("SUP", str(sup_index)))
        sup_pairs = data_io.read_sup_pairs(sup_index)
        sup_info = data_io.read_sup_info(sup_index)
        sup_pairs = get_types(combine_types(sup_pairs, sup_info))
        sup_target = data_io.read_sup_target(sup_index)
        # train_info is only consumed by the disabled create_classifiers
        # call below, but it is still accumulated here.
        train_info = train_info.append(sup_info)
        train = train.append(sup_pairs)
        target = target.append(sup_target)

    # Old train
    print("Reading old train data...")
    old_pairs = data_io.read_old_train_pairs()
    old_info = data_io.read_old_train_info()
    old_pairs = get_types(combine_types(old_pairs, old_info))
    old_target = data_io.read_old_train_target()
    train = train.append(old_pairs)
    target = target.append(old_target)
    # End old train

    print("%s %s" % ("Train size = ", str(train.shape)))

    print("Extracting features and training model")
    extractor = fe.feature_extractor()
    # nan_to_num replaces NaN/inf so the model only sees finite values.
    features = numpy.nan_to_num(extractor.fit_transform(train))
    model = classify_catagory(features, target.Target)
    # (both_classifier, A_classifier, B_classifier, none_classifier) = \
    #     create_classifiers(features, target.Target, train_info)

    print("Saving features")
    data_io.save_features(features)

    print("Saving the classifier")
    # data_io.save_model((both_classifier, A_classifier, B_classifier, none_classifier))
    data_io.save_model(model)

    # Disabled feature-importance dump:
    # features = [x[0] for x in classifier.steps[0][1].features]
    # csv_fea = csv.writer(open('features.csv', 'wb'))
    # imp = sorted(zip(features, classifier.steps[1][1].feature_importances_),
    #              key=lambda tup: tup[1], reverse=True)
    # for fea in imp:
    #     print fea[0], fea[1]
    #     csv_fea.writerow([fea[0], fea[1]])

    elapsed = time() - started
    print("%s %s" % ("Time Taken (min):", round(elapsed / 60, 1)))
def main():
    """Build the full training set, fit the classifier, and save the results.

    The training set is the base train pairs plus the three SUP sets and the
    old train set, each run through ``combine_types`` then ``get_types``.
    Features come from ``fe.feature_extractor()``; the fitted model and the
    feature matrix are both written out through ``data_io``. Progress and the
    total run time (minutes, one decimal) are printed to stdout.
    """
    clock_start = time()

    # --- base training data -------------------------------------------
    print("Reading in the training data")
    pairs = data_io.read_train_pairs()
    info = data_io.read_train_info()
    pairs = combine_types(pairs, info)  # make function later
    pairs = get_types(pairs)
    labels = data_io.read_train_target()

    # --- supplementary (SUP) data sets 1..3 ---------------------------
    print("Reading SUP data...")
    sup_number = 1
    while sup_number < 4:
        print("%s %s" % ("SUP", str(sup_number)))
        extra_pairs = data_io.read_sup_pairs(sup_number)
        extra_info = data_io.read_sup_info(sup_number)
        extra_pairs = combine_types(extra_pairs, extra_info)
        extra_pairs = get_types(extra_pairs)
        extra_labels = data_io.read_sup_target(sup_number)
        info = info.append(extra_info)
        pairs = pairs.append(extra_pairs)
        labels = labels.append(extra_labels)
        sup_number += 1

    # --- old train data -------------------------------------------------
    print("Reading old train data...")
    legacy_pairs = data_io.read_old_train_pairs()
    legacy_info = data_io.read_old_train_info()
    legacy_pairs = combine_types(legacy_pairs, legacy_info)
    legacy_pairs = get_types(legacy_pairs)
    legacy_labels = data_io.read_old_train_target()
    pairs = pairs.append(legacy_pairs)
    labels = labels.append(legacy_labels)

    print("%s %s" % ("Train size = ", str(pairs.shape)))

    # --- features and model ---------------------------------------------
    print("Extracting features and training model")
    transformer = fe.feature_extractor()
    matrix = transformer.fit_transform(pairs)
    matrix = numpy.nan_to_num(matrix)  # drop NaN/inf before fitting
    fitted = classify_catagory(matrix, labels.Target)
    # Disabled alternative that trained four separate classifiers:
    # (both_classifier, A_classifier, B_classifier, none_classifier) = \
    #     create_classifiers(matrix, labels.Target, info)

    print("Saving features")
    data_io.save_features(matrix)

    print("Saving the classifier")
    # data_io.save_model((both_classifier, A_classifier, B_classifier, none_classifier))
    data_io.save_model(fitted)

    # Disabled feature-importance export:
    # features = [x[0] for x in classifier.steps[0][1].features]
    # csv_fea = csv.writer(open('features.csv', 'wb'))
    # imp = sorted(zip(features, classifier.steps[1][1].feature_importances_),
    #              key=lambda tup: tup[1], reverse=True)
    # for fea in imp:
    #     print fea[0], fea[1]
    #     csv_fea.writerow([fea[0], fea[1]])

    clock_end = time()
    minutes = round((clock_end - clock_start) / 60, 1)
    print("%s %s" % ("Time Taken (min):", minutes))
def main():
    """Train on train + SUP 1-3 data, score a held-out split, log importances.

    Reads the base training pairs and the three SUP sets through ``data_io``,
    runs each through ``combine_types`` and ``get_types``, and appends them
    into one frame. Features are extracted with the transformer returned by
    ``get_pipeline()``, split with ``train_test_split(random_state=1)``, and
    the classifier is fitted on the training part only. The full feature
    matrix and the fitted classifier are saved via ``data_io``; the held-out
    score is printed, and one feature importance per line is appended to
    ``feature_importance.csv``. Finally the elapsed time in minutes is printed.

    Note: the saved classifier has seen only the training part of the split,
    not the held-out rows.
    """
    t1 = time()

    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)  # make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print("Reading SUP data...")
    for i in range(1, 4):
        print("%s %s" % ("SUP", str(i)))
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train = train.append(sup)
        target = target.append(sup_target)

    print("%s %s" % ("Train size = ", str(train.shape)))

    print("Extracting features and training model")
    (feature_trans, classifier) = get_pipeline()
    orig_train = feature_trans.fit_transform(train)
    # Replace NaN/inf so the classifier only sees finite values.
    orig_train = numpy.nan_to_num(orig_train)

    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(
        orig_train, target.Target, random_state=1)
    print("%s %s" % ("TrainX size = ", str(trainX.shape)))
    print("%s %s" % ("TestX size = ", str(testX.shape)))

    print("Saving features")
    data_io.save_features(orig_train)

    classifier.fit(trainX, trainY)

    print("Saving the classifier")
    data_io.save_model(classifier)

    # orig_train was already sanitized above; kept as a harmless guard.
    testX = numpy.nan_to_num(testX)
    print("%s %s" % ("Score on held-out test data ->",
                     classifier.score(testX, testY)))

    # Disabled named feature-importance export:
    # features = [x[0] for x in classifier.steps[0][1].features]
    # csv_fea = csv.writer(open('features.csv', 'wb'))
    # imp = sorted(zip(features, classifier.steps[1][1].feature_importances_),
    #              key=lambda tup: tup[1], reverse=True)
    # for fea in imp:
    #     print fea[0], fea[1]
    #     csv_fea.writerow([fea[0], fea[1]])

    feature_importance = classifier.feature_importances_
    # The context manager guarantees the log is flushed and closed even if a
    # write fails; previously the handle was left open.
    with open("feature_importance.csv", "a") as logger:
        logger.writelines(str(fi) + "\n" for fi in feature_importance)

    t2 = time()
    t_diff = t2 - t1
    print("%s %s" % ("Time Taken (min):", round(t_diff / 60, 1)))