def main(): print("Reading the test data") test = data_io.read_test() print("Reading in the meta data") paper_author, paper_author_indexed = f.get_paper_author() print("Computing Relational Information") computed_features = f.get_all_computed_features(paper_author) print("Loading the classifier") classifier = data_io.load_model() print("Making predictions") predictions = [] for author_id, row in test.iterrows(): features = [] paper_ids = [] for paper_id in row["PaperIds"]: s = f.get_features(paper_id, author_id, paper_author_indexed, computed_features) if s is None: print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id)) else: features.append(s) paper_ids.append(paper_id) feature_matrix = pd.DataFrame(features) preds = classifier.predict_proba(feature_matrix)[:, 1] paper_ids_sorted = sorted(zip(preds, row["PaperIds"]), reverse=True) print(paper_ids_sorted) predictions.append([x[1] for x in paper_ids_sorted]) print("Writing predictions to file") data_io.write_submission(predictions)
def main(): print("Reading the test data") test = data_io.read_test() print("Reading in the meta data") paper_author, paper_author_indexed = f.get_paper_author() print("Computing Relational Information") computed_features = f.get_all_computed_features(paper_author) print("Loading the classifier") classifier = data_io.load_model() print("Making predictions") predictions = [] for author_id, row in test.iterrows(): features = [] paper_ids = [] for paper_id in row["PaperIds"]: s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features) if s is None: print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id)) else: features.append(s) paper_ids.append(paper_id) feature_matrix = pd.DataFrame(features) preds = classifier.predict_proba(feature_matrix)[:,1] paper_ids_sorted = sorted(zip(preds,row["PaperIds"]), reverse=True) print(paper_ids_sorted) predictions.append([x[1] for x in paper_ids_sorted]) print("Writing predictions to file") data_io.write_submission(predictions)
def main(): print("Reading the test data") test = data_io.read_test() print("Making predictions") np.random.seed(12341234) predictions = test.apply(shuffle, axis=1) print("Writing predictions to file") data_io.write_submission(predictions)
def main(): print("Reading test data") test = data_io.read_test() ordinals = np.arange(len(test)) recommendations = zip(test["srch_id"], test["prop_id"], ordinals) print("Writing predictions to file") data_io.write_submission(recommendations, "testOrderBenchmark.csv")
def __init__(self):
    self.train = data_io.read_train()
    self.test = data_io.read_test()
    self.destin = data_io.read_destin()

    # PCA on the destination latent features d1..d149, compressed to 3 components
    pca = PCA(n_components=3)
    self.dest_pca = pca.fit_transform(
        self.destin[["d{0}".format(i + 1) for i in range(149)]])
    self.dest_pca = pd.DataFrame(self.dest_pca)
    self.dest_pca["srch_destination_id"] = self.destin["srch_destination_id"]
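# The compressed destination components are typically joined back onto the
# search rows by srch_destination_id. A hypothetical sketch of lines one
# might append to the __init__ above; the dest_pca1..dest_pca3 column names
# are illustrative, not from the original code.
    self.dest_pca.columns = ["dest_pca1", "dest_pca2", "dest_pca3",
                             "srch_destination_id"]
    self.train = self.train.merge(self.dest_pca, on="srch_destination_id", how="left")
    self.test = self.test.merge(self.dest_pca, on="srch_destination_id", how="left")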
def main():
    test = data_io.read_test()
    # Deal with the NAs, and add features
    train.feature_eng(test)

    # Predict the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    b_prob = classifier.predict_proba(b_test_f)[:, 1]
    # Keep the scores as numpy arrays (negated so that sorting ascending
    # ranks the most likely bookings first); converting them to Python
    # lists would turn 4 * b_prob + c_prob below into list
    # repetition/concatenation instead of a weighted sum.
    b_prob = -1.0 * b_prob
    print("Time used,")
    print(datetime.now() - tstart)

    # Predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = classifier.predict_proba(c_test_f)[:, 1]
    c_prob = -1.0 * c_prob
    print("Time used,")
    print(datetime.now() - tstart)

    # Make recommendations, weighting bookings 4:1 relative to clicks
    recommendations = zip(test["srch_id"], test["prop_id"], 4 * b_prob + c_prob)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)
def main(): print("Reading test data ...") test = data_io.read_test() test.fillna(0, inplace=True) feature_names = list(test.columns) feature_names.remove("date_time") features = test[feature_names].values print("Loading the Regressor ...") regressor = data_io.load_model() print("Making predictions ...") predictions = regressor.predict(features) predictions = list(-1.0*predictions) recommendations = zip(test["srch_id"], test["prop_id"], predictions) print("Writing predictions to file ...") data_io.write_submission(recommendations)
def main(): print("Reading test data") test = data_io.read_test() test.fillna(0, inplace=True) feature_names = list(test.columns) feature_names.remove("date_time") features = test[feature_names].values print("Loading the classifier") classifier = data_io.load_model() print("Making predictions") predictions = classifier.predict_proba(features)[:, 1] predictions = list(-1.0 * predictions) recommendations = zip(test["srch_id"], test["prop_id"], predictions) print("Writing predictions to file") data_io.write_submission(recommendations)
def prediction(n_train_samples):
    proc_test_samples_file = get_paths()["proc_test_samples_path"]
    if os.path.exists(proc_test_samples_file):
        print("Loading processed test data...")
        new_test_samples = pd.read_csv(proc_test_samples_file)
    else:
        print("Reading test data...")
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print("Processing test samples")
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)

    test_feature = new_test_samples.values

    print("Loading the Random Forest Classifier")
    rf_classifier = data_io.load_model(model_name="rf_classifier.pkl")
    print("Random Forest Predicting")
    rf_predictions = rf_classifier.predict_proba(test_feature)[:, 1]

    print("Loading the Gradient Boosting Classifier")
    gb_classifier = data_io.load_model(model_name="gb_classifier.pkl")
    print("Gradient Boosting Predicting")
    gb_predictions = gb_classifier.predict_proba(test_feature)[:, 1]

    print("Loading the SGD Classifier")
    sgd_classifier = data_io.load_model(model_name="sgd_classifier.pkl")
    print("SGD Predicting")
    sgd_predictions = sgd_classifier.predict_proba(test_feature)[:, 1]

    # Score fusion: arithmetic mean of the three probability vectors,
    # negated so an ascending sort ranks the best candidates first.
    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))
    mean_score = np.mean(prob_arr, axis=0)
    mean_score = -1.0 * mean_score

    mean_recommendations = zip(new_test_samples["srch_id"],
                               new_test_samples["prop_id"],
                               mean_score)

    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations,
                             submission_file="mean_result_%i.csv" % n_train_samples)
            srch_length_of_stay_features.SrchLengthOfStayFeatures(self.X),
            srch_booking_window_features.SrchBookingWindowFeatures(self.X),
        ]
        # Materialise the map so the result can be concatenated as a list below.
        return list(map(self.transformer, feature_list))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate features using train/test data")
    parser.add_argument("--test", action="store_true", default=False,
                        help="Whether to use test data", required=False)
    result = parser.parse_args()

    if result.test:
        print("Reading test data")
        data = data_io.read_test()
    else:
        print("Reading training data")
        data = data_io.read_train()

    fm = FeatureExtractor(data)
    derived_features = fm.feature_extractor()
    data.fillna(0, inplace=True)
    data = pandas.concat([data] + derived_features, axis=1)

    if result.test:
        data_io.save_test_features(data)
    else:
        data_io.save_train_features(data)
def main():
    class bcolors:
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    print(bcolors.HEADER + "Start Training" + bcolors.ENDC)

    print(bcolors.OKBLUE + "Reading and making Trainingset" + bcolors.ENDC)
    train = data_io.read_train()
    train.fillna(0, inplace=True)
    train_sample = train[:1250000].fillna(value=0)  # change the sample size here

    # List of features that can be removed if you want
    feature_names = list(train_sample.columns)
    feature_names.remove("click_bool")
    feature_names.remove("booking_bool")
    feature_names.remove("gross_bookings_usd")
    feature_names.remove("date_time")
    feature_names.remove("position")

    features = train_sample[feature_names].values
    target = train_sample["booking_bool"].values

    print(bcolors.OKGREEN + "Training Dataset" + bcolors.ENDC)
    # You can find the algorithms at http://scikit-learn.org/stable/modules/ensemble.html

    # Random forest
    classifier = RandomForestClassifier(n_estimators=3200, verbose=2, n_jobs=-1,
                                        min_samples_split=10, random_state=1)

    # Extra Trees (better than random forest; best so far!)
    # classifier = ExtraTreesClassifier(n_estimators=300, verbose=2, n_jobs=-1,
    #                                   min_samples_split=10, random_state=1)

    # AdaBoost
    # classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
    #                                 n_estimators=600, learning_rate=1)

    # K-nearest neighbours with bagging
    # classifier = BaggingClassifier(KNeighborsClassifier(),
    #                                max_samples=0.5, max_features=0.5)

    # Gradient Boosting, possibly the best solution (will try tomorrow)
    # classifier = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
    #                                         n_estimators=100, subsample=1.0,
    #                                         min_samples_split=2, min_samples_leaf=1,
    #                                         max_depth=3, init=None, random_state=None,
    #                                         max_features=None, verbose=0)

    classifier.fit(features, target)

    print(bcolors.OKBLUE + "Saving Classifier" + bcolors.ENDC)
    data_io.save_model(classifier)

    print(bcolors.OKGREEN + "Start Making Predictions On Testset" + bcolors.ENDC)
    print(bcolors.OKBLUE + "Reading Testset" + bcolors.ENDC)
    test = data_io.read_test()
    test.fillna(0, inplace=True)

    feature_names = list(test.columns)
    feature_names.remove("date_time")
    features = test[feature_names].values

    classifier = data_io.load_model()

    print(bcolors.OKGREEN + "Make Predictions" + bcolors.ENDC)
    predictions = classifier.predict_proba(features)[:, 1]

    print(bcolors.OKBLUE + "Calculate NDcg" + bcolors.ENDC)
    predictions = list(-1.0 * predictions)

    print(bcolors.OKBLUE + "Sort Predictions" + bcolors.ENDC)
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)

    print(bcolors.OKGREEN + "Writing Predictions To Outputfile" + bcolors.ENDC)
    data_io.write_submission(recommendations)
    print("")
    print(bcolors.ENDC + "That's all folks, goodbye!" + bcolors.ENDC)
def do_prediction(n_train_samples):
    proc_test_samples_file = get_paths()['proc_test_samples_path']
    if os.path.exists(proc_test_samples_file):
        print("Loading processed test data...")
        new_test_samples = pd.read_csv(proc_test_samples_file)
        print("Loading processed test data done")
    else:
        # Prediction
        print("Reading test data...")
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print("done.")

        # Process test samples
        print("Processing test data...")
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
        print("Processing test data done.")

    test_features = new_test_samples.values

    # 5.1 Random forest prediction
    print("Loading the random forest classifier")
    rf_classifier = data_io.load_model(model_name='rf_classifier.pkl')
    print("Random forest predicting")
    # Take the positive-class probabilities
    rf_predictions = rf_classifier.predict_proba(test_features)[:, 1]

    # 5.2 Gradient Boosting prediction
    print("Loading the Gradient Boosting classifier")
    gb_classifier = data_io.load_model(model_name='gb_classifier.pkl')
    print("Gradient Boosting predicting")
    gb_predictions = gb_classifier.predict_proba(test_features)[:, 1]

    # 5.3 SGD prediction
    print("Loading the SGD classifier")
    sgd_classifier = data_io.load_model(model_name='sgd_classifier.pkl')
    print("SGD predicting")
    sgd_predictions = sgd_classifier.predict_proba(test_features)[:, 1]

    # 5.4 LR prediction
    # print("Loading the LR classifier")
    # lr_classifier = data_io.load_model(model_name='lr_classifier.pkl')
    # print("Logistic Regression predicting")
    # lr_predictions = lr_classifier.predict_proba(test_features)[:, 1]

    # Step 6: score fusion -- stack the three probability vectors into one array
    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))

    # Arithmetic mean of the probabilities; the upstream sort is ascending,
    # so multiply by -1 to rank the best candidates first.
    mean_score = np.mean(prob_arr, axis=0)
    mean_score = -1.0 * mean_score

    # Geometric mean (works somewhat worse in practice)
    gmean = stats.gmean(prob_arr, axis=0)
    gmean = -1.0 * gmean

    # Step 7: output results
    mean_recommendations = zip(new_test_samples['srch_id'],
                               new_test_samples['prop_id'],
                               mean_score)
    gmean_recommendations = zip(new_test_samples['srch_id'],
                                new_test_samples['prop_id'],
                                gmean)

    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations,
                             submission_file='mean_result_%i.csv' % n_train_samples)
    data_io.write_submission(gmean_recommendations,
                             submission_file='gmean_result_%i.csv' % n_train_samples)
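# A quick, self-contained illustration of the two fusion rules on toy
# probabilities (the numbers are made up for demonstration): the geometric
# mean penalises classifier disagreement more strongly than the arithmetic mean.
import numpy as np
from scipy import stats

toy_probs = np.array([[0.9, 0.2, 0.5, 0.7],
                      [0.8, 0.3, 0.6, 0.1],
                      [0.7, 0.1, 0.4, 0.4]])  # 3 classifiers x 4 candidates

print(np.mean(toy_probs, axis=0))     # [0.8  0.2  0.5  0.4]
print(stats.gmean(toy_probs, axis=0))  # last entry ~0.30: the 0.7/0.1/0.4 split is punished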