def main():
    """Rank every test author's papers by predicted probability and write them.

    For each author row of the test set a feature vector is requested per
    paper; papers for which ``f.get_features`` returns ``None`` are reported
    and left out.  The remaining papers are scored with the second
    ``predict_proba`` column and listed from highest to lowest score.
    """
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d"
                      % (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)

        if not features:
            # Nothing could be scored for this author; still emit one
            # (empty) ranking so the output keeps one entry per test row.
            predictions.append([])
            continue

        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        # Pair scores with paper_ids (the papers that actually produced a
        # feature row), not row["PaperIds"]: after a skipped paper the two
        # sequences no longer line up and scores would be attached to the
        # wrong papers.
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Rank validation papers per author with the "forest_" model.

    Feature rows come from the local pickle cache ``features_valid.obj`` when
    it exists, otherwise from ``data_io.get_features_db("ValidPaper")`` (and
    are then cached).  The first two fields of each row are the author and
    paper ids; the rest are passed to the classifier.
    """
    print("Getting features for valid papers from the database")
    cache_path = "features_valid.obj"
    if os.path.exists(cache_path):
        # Pickle data is binary, so the cache must be opened in binary mode.
        # NOTE(review): unpickling runs arbitrary code; only load a cache
        # file this script produced itself.
        with open(cache_path, 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open(cache_path, 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = list(classifier.predict_proba(features)[:, 1])

    # Group (score, paper) pairs by author.
    author_predictions = defaultdict(list)
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Per author, list paper ids from highest to lowest score.
    paper_predictions = {}
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
def main():
    """Score validation papers with the stored logistic model and rank them.

    Each database row is (author id, paper id, five feature values).  The
    feature values are converted to floats, scored with the second
    ``predict_proba`` column, and the paper ids are written per author from
    highest to lowest score.
    """
    print("Getting features for valid papers from the database")
    rows = data_io.get_features_db("ValidPaper")
    author_paper_ids = [row[:2] for row in rows]
    raw_features = [row[2:] for row in rows]

    # Exactly five feature values per row are expected (the unpacking
    # raises otherwise).
    featuresfloat = [
        (float(a), float(b), float(c), float(d), float(e))
        for a, b, c, d, e in raw_features
    ]
    print("Totoal number of samples: ", len(featuresfloat))

    print("Loading the logistic regression model")
    logistic = data_io.load_model()

    print("Making predictions")
    scores = list(logistic.predict_proba(featuresfloat)[:, 1])

    by_author = defaultdict(list)
    for (a_id, p_id), score in zip(author_paper_ids, scores):
        by_author[a_id].append((score, p_id))

    paper_predictions = {}
    for author_id in sorted(by_author):
        ranked = sorted(by_author[author_id], reverse=True)
        paper_predictions[author_id] = [pair[1] for pair in ranked]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
def main():
    """Score ``test_x.csv`` with the "predict_model" classifier and write it.

    The CSV is read from under the module-level ``path`` prefix.  Six
    columns are fed to the classifier; each output row consists of those six
    values followed by the second ``predict_proba`` column.  Elapsed time is
    printed for every stage.
    """
    testset = pd.read_csv(path + "test_x.csv", index_col=0)

    # NOTE: NA handling / feature engineering (train.feature_eng) is
    # disabled in this script.

    # print() with a single pre-formatted string yields the same output on
    # Python 2 and Python 3.
    print("Loading the predict_model classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model("predict_model")
    print("Time used %s" % (datetime.now() - tstart))

    print("Making predictions on the predict_model")
    tstart = datetime.now()
    fnames = ['year', 'month', 'trade_no', 'sigungu_no', 'price',
              'monthly_expense']
    test_f = testset[fnames].values
    predic_proba = classifier.predict_proba(test_f)[:, 1]
    print("Time used %s" % (datetime.now() - tstart))

    # One output row per test row: the six features, then the probability.
    prediction = zip(testset['year'], testset['month'], testset['trade_no'],
                     testset['sigungu_no'], testset['price'],
                     testset['monthly_expense'], predic_proba)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(prediction)
    print("Time used, %s" % (datetime.now() - tstart))
def main():
    """Score the test features and write (srch_id, prop_id, score) rows.

    All chunks from ``data_io.read_test_features`` are concatenated, every
    column is used as a feature, and ``class_probabilities`` produces the
    scores from the loaded classifiers.
    """
    print("Reading test data")
    test_chunks = data_io.read_test_features()
    test = pandas.concat([chunk for chunk in test_chunks], ignore_index=True)

    # Every column is currently treated as a feature.
    feature_names = list(test.columns)
    features = test[feature_names].values

    print("Loading the classifier")
    classifiers = data_io.load_model()

    print("Making predictions")
    predictions = class_probabilities(features, classifiers)
    print(predictions)  # print() call: valid on both Python 2 and 3

    # Scores are negated before being written.
    # NOTE(review): presumably so that an ascending sort downstream puts the
    # best property first - confirm against data_io.write_submission.
    predictions = list(-1.0 * predictions)
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)
def main():
    """Predict on the MATLAB-derived validation features and write the result.

    Prints the installed scikit-learn / numpy / pandas versions, loads the
    model and the validation features, drops unused columns, fills missing
    values with 0, prints feature importances and writes the flattened
    predictions.  Exits with status 1 when no feature data is available.
    """
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("sklearn version %s"
          % pkg_resources.get_distribution("scikit-learn").version)
    print("numpy version %s" % pkg_resources.get_distribution("numpy").version)
    print("pandas version %s"
          % pkg_resources.get_distribution("pandas").version)

    print("Loading the classifier")
    clf = data_io.load_model()

    X = data_io.load_matlab_valid_features()
    # The missing-data check has to run before X is used; checking after
    # X.fillna() could never be reached with X being None.
    if X is None:
        print("No feature file found!")
        raise SystemExit(1)
    X = delete_unused_columns(X)
    X = X.fillna(0)

    print_importances(X, clf, 0.0)
    print("Predictions outcomes with shape: " + str(X.shape))
    print(clf)

    predictions = clf.predict(X)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Rank validation papers per author with the stored classifier.

    Each database row is (author id, paper id, feature values...).  The
    classifier's feature importances are printed, the rows are scored with
    the second ``predict_proba`` column, and paper ids are written per author
    from highest to lowest score.
    """
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model()
    # print() call instead of the Python 2 statement: same output, and the
    # function now also parses on Python 3.
    print(classifier.feature_importances_)

    print("Making predictions")
    predictions = list(classifier.predict_proba(features)[:, 1])

    # Group (score, paper) pairs by author.
    author_predictions = defaultdict(list)
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Per author, list paper ids from highest to lowest score.
    paper_predictions = {}
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
def main():
    """Score the test features and write (srch_id, prop_id, score) rows.

    All chunks from ``data_io.read_test_features`` are concatenated, every
    column is used as a feature, and ``class_probabilities`` produces the
    scores from the loaded classifiers.
    """
    print("Reading test data")
    test_chunks = data_io.read_test_features()
    test = pandas.concat([chunk for chunk in test_chunks], ignore_index=True)

    # Every column is currently treated as a feature.
    feature_names = list(test.columns)
    features = test[feature_names].values

    print("Loading the classifier")
    classifiers = data_io.load_model()

    print("Making predictions")
    predictions = class_probabilities(features, classifiers)
    print(predictions)  # print() call: valid on both Python 2 and 3

    # Scores are negated before being written.
    # NOTE(review): presumably so that an ascending sort downstream puts the
    # best property first - confirm against data_io.write_submission.
    predictions = list(-1.0 * predictions)
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)
def main():
    """Predict on the MATLAB-derived validation features and write the result.

    Prints the installed scikit-learn / numpy / pandas versions, loads the
    model and the validation features, drops unused columns, fills missing
    values with 0, prints feature importances and writes the flattened
    predictions.  Exits with status 1 when no feature data is available.
    """
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("sklearn version %s"
          % pkg_resources.get_distribution("scikit-learn").version)
    print("numpy version %s" % pkg_resources.get_distribution("numpy").version)
    print("pandas version %s"
          % pkg_resources.get_distribution("pandas").version)

    print("Loading the classifier")
    clf = data_io.load_model()

    X = data_io.load_matlab_valid_features()
    # The missing-data check has to run before X is used; checking after
    # X.fillna() could never be reached with X being None.
    if X is None:
        print("No feature file found!")
        raise SystemExit(1)
    X = delete_unused_columns(X)
    X = X.fillna(0)

    print_importances(X, clf, 0.0)
    print("Predictions outcomes with shape: " + str(X.shape))
    print(clf)

    predictions = clf.predict(X)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Rank validation papers per author with the "forest_" model.

    Feature rows come from the local pickle cache ``features_valid.obj`` when
    it exists, otherwise from ``data_io.get_features_db("ValidPaper")`` (and
    are then cached).  The first two fields of each row are the author and
    paper ids; the rest are passed to the classifier.
    """
    print("Getting features for valid papers from the database")
    cache_path = "features_valid.obj"
    if os.path.exists(cache_path):
        # Pickle data is binary, so the cache must be opened in binary mode.
        # NOTE(review): unpickling runs arbitrary code; only load a cache
        # file this script produced itself.
        with open(cache_path, 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open(cache_path, 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = list(classifier.predict_proba(features)[:, 1])

    # Group (score, paper) pairs by author.
    author_predictions = defaultdict(list)
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Per author, list paper ids from highest to lowest score.
    paper_predictions = {}
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
def main():
    """Rank every test author's papers by predicted probability and write them.

    For each author row of the test set a feature vector is requested per
    paper; papers for which ``f.get_features`` returns ``None`` are reported
    and left out.  The remaining papers are scored with the second
    ``predict_proba`` column and listed from highest to lowest score.
    """
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d"
                      % (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)

        if not features:
            # Nothing could be scored for this author; still emit one
            # (empty) ranking so the output keeps one entry per test row.
            predictions.append([])
            continue

        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        # Pair scores with paper_ids (the papers that actually produced a
        # feature row), not row["PaperIds"]: after a skipped paper the two
        # sequences no longer line up and scores would be attached to the
        # wrong papers.
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Rank validation papers per author with the stored classifier.

    Each database row is (author id, paper id, feature values...).  Rows are
    scored with the second ``predict_proba`` column and the paper ids are
    written per author from highest to lowest score.
    """
    # print() calls with a single string behave identically on Python 2 and
    # also parse on Python 3, unlike the print statement.
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = list(classifier.predict_proba(features)[:, 1])

    # Group (score, paper) pairs by author.
    author_predictions = defaultdict(list)
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Per author, list paper ids from highest to lowest score.
    paper_predictions = {}
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
def main():
    """Score validation papers with the stored logistic model and rank them.

    Each database row is (author id, paper id, five feature values).  The
    feature values are converted to floats, scored with the second
    ``predict_proba`` column, and the paper ids are written per author from
    highest to lowest score.
    """
    print("Getting features for valid papers from the database")
    rows = data_io.get_features_db("ValidPaper")
    author_paper_ids = [row[:2] for row in rows]
    raw_features = [row[2:] for row in rows]

    # Exactly five feature values per row are expected (the unpacking
    # raises otherwise).
    featuresfloat = [
        (float(a), float(b), float(c), float(d), float(e))
        for a, b, c, d, e in raw_features
    ]
    print("Totoal number of samples: ", len(featuresfloat))

    print("Loading the logistic regression model")
    logistic = data_io.load_model()

    print("Making predictions")
    scores = list(logistic.predict_proba(featuresfloat)[:, 1])

    by_author = defaultdict(list)
    for (a_id, p_id), score in zip(author_paper_ids, scores):
        by_author[a_id].append((score, p_id))

    paper_predictions = {}
    for author_id in sorted(by_author):
        ranked = sorted(by_author[author_id], reverse=True)
        paper_predictions[author_id] = [pair[1] for pair in ranked]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
def get_cv_score():
    """Cross-validate the stored model on the training frame and print the score.

    Every column except ``'label'`` is used as a feature; ``'label'`` is the
    target.  Prints the mean score and twice its standard deviation.
    """
    model = data_io.load_model()
    frame = data_io.get_train_df()

    feature_columns = [name for name in frame.columns if name != 'label']
    fold_scores = cv.cross_val_score(model, frame[feature_columns],
                                     frame['label'])

    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, spread))
def runWithoutWndchrm(self):
    """Predict on the validation images and write both submission files.

    Features are recomputed with ``Utils.calculateFeatures`` unless
    ``self.load`` is truthy, in which case they are read back with
    ``Utils.loadFeatures``.  Returns whatever ``Predictor.finalResults``
    returns for the names, predictions and coordinates.
    """
    # print() calls with a single string: identical output on Python 2,
    # and valid syntax on Python 3.
    print("Loading the classifier")
    classifier = data_io.load_model()
    imageCollections = data_io.get_valid_df()
    featureGetter = FeatureGetter()

    print("Getting the features")
    fileName = data_io.get_savez_name_test()
    if not self.load:
        # Last features calculated from candidates
        (namesObservations, coordinates, valid) = Utils.calculateFeatures(
            fileName, featureGetter, imageCollections)
    else:
        (namesObservations, coordinates, valid) = Utils.loadFeatures(fileName)

    print("Making predictions")
    predictions = classifier.predict(valid)
    # Column vector: one prediction per row.
    predictions = predictions.reshape(len(predictions), 1)

    print("Writing predictions to file")
    data_io.write_submission(namesObservations, coordinates, predictions)
    data_io.write_submission_nice(namesObservations, coordinates, predictions)

    print("Calculating final results")
    return Predictor.finalResults(namesObservations, predictions, coordinates)
def main():
    """Score (user, product) feature rows user by user and build recommendations.

    Command line: ``-tv <threshold> -t <0|1>``.  ``-t`` selects which pair
    of settings entries names the input feature file and the output score
    file.  Input rows are ``uid, pid, feature...`` and are expected to be
    grouped by uid; each group is scored in one ``predict_proba`` call,
    every score is written to the output file, and products scoring above
    the threshold are collected per user and passed to
    ``data_io.write_submission``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-tv', type=float, action='store',
                        dest='threshold_val',
                        help='specify how to generate recommendation result.')
    parser.add_argument('-t', type=int, action='store', dest='target',
                        help='for validation or test dataset')

    if len(sys.argv) != 5:
        print('Command e.g.: python predict.py -tv 0.8 -t 0(1)')
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        file_name = settings["MTLR_TEST_FILE"]
        gbt_feature_file = settings["MTLR_FEATURE_FILE"]
    elif para.target == 1:
        file_name = settings["MTLR_TEST_FILE_FOR_SUBMIT"]
        gbt_feature_file = settings["MTLR_FEATURE_FILE_FOR_SUBMIT"]
    else:
        # Without this branch the file names would be undefined below.
        print('Invalid test data target choice...')
        sys.exit(1)

    classifier = data_io.load_model(settings["MTLR_MODEL_FILE"])
    user_recommend_result = defaultdict(list)

    # Context managers make sure both files are closed (and the output is
    # flushed) even if scoring fails part-way.
    with open(gbt_feature_file, "w") as out_file, open(file_name) as in_file:
        writer = csv.writer(out_file, lineterminator="\n")

        def score_group(group_ids, group_features):
            """Score one user's rows, log the scores, keep those above threshold."""
            predictions = classifier.predict_proba(group_ids, group_features)
            for (t_uid, t_pid), pred in zip(group_ids, predictions):
                writer.writerow([t_uid, t_pid, pred])
                if pred > para.threshold_val:
                    user_recommend_result[t_uid].append(t_pid)

        features = []
        user_product_ids = []
        cache_uid = None
        for entry in csv.reader(in_file):
            # Real lists (not lazy map objects) so the rows can be stored
            # and reused on Python 3 as well.
            feature = [float(value) for value in entry[2:]]
            uid, pid = [int(value) for value in entry[:2]]
            if cache_uid is None:
                cache_uid = uid
            if uid != cache_uid:
                # A new user starts: score the finished group first.
                score_group(user_product_ids, features)
                features = [feature]
                user_product_ids = [[uid, pid]]
                cache_uid = uid
            else:
                features.append(feature)
                user_product_ids.append([uid, pid])

        # The last user's group is not followed by a uid change, so it has
        # to be scored explicitly or it would be dropped from the output.
        if features:
            score_group(user_product_ids, features)

    data_io.write_submission(user_recommend_result)
def estimate(features, target):
    """Return the stored classifier's ``predict_proba`` output for *features*.

    *target* is accepted but not used by this function.
    """
    print("[INFO] Loading the classifier")
    model = data_io.load_model()

    print("[INFO] Making predictions")
    return model.predict_proba(features)
def get_cv_score():
    """Cross-validate the stored model on the training frame and print the score.

    Every column except ``'label'`` is used as a feature; ``'label'`` is the
    target.  Prints the mean score and twice its standard deviation.
    """
    model = data_io.load_model()
    frame = data_io.get_train_df()

    feature_columns = [name for name in frame.columns if name != 'label']
    fold_scores = cv.cross_val_score(model, frame[feature_columns],
                                     frame['label'])

    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, spread))
def main():
    """Combine booking and click probabilities into one score per test row.

    Two classifiers are loaded (``load_model(True)`` for booking,
    ``load_model(False)`` for click).  Each produces the second
    ``predict_proba`` column on its own feature set; the written score is
    ``-(4 * booking + click)``.  Elapsed time is printed for every stage.
    """
    test = data_io.read_test()

    ## deal with the NAs, and add features
    train.feature_eng(test)

    ## predict the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    # Kept as a NumPy array (not a list) so the weighting below is
    # elementwise arithmetic.
    b_prob = -1.0 * classifier.predict_proba(b_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = -1.0 * classifier.predict_proba(c_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## Making Recommendations
    # With Python lists, `4 * b_prob + c_prob` repeats b_prob four times and
    # appends c_prob, so zip() would only ever see the booking scores.  On
    # arrays the same expression is the intended weighted sum.
    # NOTE(review): scores are negated, presumably so an ascending sort
    # downstream ranks the best property first - confirm in write_submission.
    combined = list(4 * b_prob + c_prob)
    recommendations = zip(test["srch_id"], test["prop_id"], combined)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)
def main():
    """Write a constant prediction for every validation row.

    The loaded "model" is multiplied by a vector of ones whose length is the
    number of validation rows, and the result is written as the submission.
    """
    print("Loading the model")
    constant = data_io.load_model()

    print("Making predictions")
    validation = data_io.get_valid_df()
    row_count = len(validation)
    predictions = constant * np.ones(row_count)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Predict on the test data with the stored classifier and write the result."""
    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    test_data = data_io.get_test()
    predicted = model.predict(test_data)

    print("Writing predictions to file")
    data_io.write_submission(predicted)
def main():
    """Assign users and items to clusters and write ``id,label`` files.

    Command line: ``-t <0|1> -c1 <n> -c2 <n>``.  ``-t`` selects which
    settings entries name the input feature files and the output files.
    Each input row is ``id, feature...`` (all integers); the stored user and
    item cluster models predict one label per row.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store', dest='target',
                        help='for validation or test dataset')
    parser.add_argument('-c1', type=int, action='store', dest='ucluster_num',
                        help='cluster number of users')
    parser.add_argument('-c2', type=int, action='store', dest='icluster_num',
                        help='cluster number of items')

    if len(sys.argv) != 7:
        print('Command e.g.: python cluster.py -t 0(1) -c1 20 -c2 50')
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        suffix = ""
    elif para.target == 1:
        suffix = "_FOR_SUBMIT"
    else:
        print('Invalid train data target choice...')
        sys.exit(1)

    def read_int_rows(path):
        """Read a CSV file into a list of integer lists, closing the file."""
        with open(path) as handle:
            # Real lists: the rows are indexed and sliced below, which a
            # lazy map object does not support on Python 3.
            return [[int(value) for value in entry]
                    for entry in csv.reader(handle)]

    user_features = read_int_rows(settings["USER_CLUSTER_TRAIN_FILE" + suffix])
    item_features = read_int_rows(settings["ITEM_CLUSTER_TRAIN_FILE" + suffix])
    user_cluster_file = settings["USER_CLUSTER_TEST_FILE" + suffix]
    item_cluster_file = settings["ITEM_CLUSTER_TEST_FILE" + suffix]

    # Users first, then items; both follow the same steps.
    jobs = (
        (user_features, "USER_CLUSTER_MODEL_FILE", user_cluster_file),
        (item_features, "ITEM_CLUSTER_MODEL_FILE", item_cluster_file),
    )
    for rows, model_key, out_path in jobs:
        with open(out_path, "w") as out_file:
            writer = csv.writer(out_file, lineterminator="\n")
            cluster = data_io.load_model(settings[model_key])
            ids = [entry[0] for entry in rows]
            features = [entry[1:] for entry in rows]
            labels = cluster.predict(features)
            for entity_id, label in zip(ids, labels):
                writer.writerow([entity_id, label])
def main():
    """Combine booking and click probabilities into one score per test row.

    Two classifiers are loaded (``load_model(True)`` for booking,
    ``load_model(False)`` for click).  Each produces the second
    ``predict_proba`` column on its own feature set; the written score is
    ``-(4 * booking + click)``.  Elapsed time is printed for every stage.
    """
    test = data_io.read_test()

    ## deal with the NAs, and add features
    train.feature_eng(test)

    ## predict the booking_bool
    # print() calls with one argument give the same output on Python 2 and
    # also parse on Python 3.
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    # Kept as a NumPy array (not a list) so the weighting below is
    # elementwise arithmetic.
    b_prob = -1.0 * classifier.predict_proba(b_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = -1.0 * classifier.predict_proba(c_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## Making Recommendations
    # With Python lists, `4 * b_prob + c_prob` repeats b_prob four times and
    # appends c_prob, so zip() would only ever see the booking scores.  On
    # arrays the same expression is the intended weighted sum.
    # NOTE(review): scores are negated, presumably so an ascending sort
    # downstream ranks the best property first - confirm in write_submission.
    combined = list(4 * b_prob + c_prob)
    recommendations = zip(test["srch_id"], test["prop_id"], combined)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)
def test_trainingData():
    """Return the mean absolute error over the first 100 processed samples.

    The stored regressor predicts on the processed features; the absolute
    differences between the first 100 targets and predictions are averaged.
    """
    print("Loading processed data ...")
    features, targets = data_io.load_processed_data()

    print("Loading Regressor ... ")
    regressor = data_io.load_model()

    print("Doing prediction ...")
    predicted = regressor.predict(features)

    abs_errors = [
        abs(pair[0] - pair[1])
        for pair in zip(targets[:100], predicted[:100])
    ]
    # Summed in ascending order, as before.
    abs_errors.sort()
    return sum(abs_errors) / len(abs_errors)
def main():
    """Predict on the validation frame, round to integers, and write the result."""
    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    validation = data_io.get_valid_df()
    raw_predictions = model.predict(validation)
    # Round every prediction to the nearest integer value.
    rounded = np.rint(raw_predictions)

    print("Writing predictions to file")
    data_io.write_submission(rounded)
def main():
    """Predict on the validation frame and write the result as a column vector."""
    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    validation = data_io.get_valid_df()
    raw_predictions = model.predict(validation)
    # One prediction per row, shaped (n, 1).
    column = raw_predictions.reshape(len(raw_predictions), 1)

    print("Writing predictions to file")
    data_io.write_submission(column)
def main():
    """Predict each validation key with its own classifier and write all results.

    For every key of the validation data a classifier of the same name is
    loaded; its predictions are stored as an (n, 1) column under that key.
    """
    valid = data_io.get_valid_df()
    P = {}
    for key in valid:
        print("Loading the classifier for %s" %key)
        model = data_io.load_model(key)

        print("Making predictions")
        P[key] = model.predict(valid[key])
        row_count = len(P[key])
        P[key] = P[key].reshape(row_count, 1)

    print("Writing predictions to file")
    data_io.write_submission(P)
def estimate():
    """Return the stored classifier's ``predict_proba`` output for the test file.

    The test data is read in svmlight format from the path given by
    ``data_io.read_test_svm()`` and densified before prediction.  The
    targets stored in the file are read but not used.
    """
    sparse_features, target = load_svmlight_file(data_io.read_test_svm())
    dense_features = sparse_features.todense()

    print("[INFO] Loading the classifier")
    model = data_io.load_model()

    print("[INFO] Making predictions")
    return model.predict_proba(dense_features)
def main():
    """Gather validation features across MPI ranks, then rank papers on rank 0.

    Every rank above 0 fetches the single feature named at position
    ``rank - 1`` of ``feature_list.txt``; rank 0 fetches its part with
    ``get_trained_validation_data``.  The pieces are gathered on rank 0,
    which scores them with the stored classifier, writes the submission and
    then aborts the MPI job.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    conn = data_io.get_db_conn()
    with open("feature_list.txt") as feature_file:
        feature_name = feature_file.read().split()
    # TODO: handle comm.Get_size() being smaller than len(feature_name).

    data = None
    for table_name in ["ValidPaper"]:
        if rank > 0:
            # getting features by parallel computing
            print("getting features at node " + str(rank))
            feature = data_io_parallel.get_features_db_parallel(
                conn, rank, table_name, feature_name[rank - 1])
        else:
            feature = data_io_parallel.get_trained_validation_data(
                conn, table_name)

        # sending features to rank 0
        print("sending features to node " + str(rank))
        features = comm.gather(feature, root=0)

        if rank == 0:
            temp = []
            for piece in features:
                temp.extend(piece)
            print("Successfully got the features from " + table_name)
            # Transpose into one row per sample.  Built as a real list
            # because the rows are traversed twice below; a lazy map object
            # would be exhausted after the first pass on Python 3.
            data = [list(row) for row in np.array(temp).T]

    if rank == 0:
        author_paper_ids = [x[:2] for x in data]
        features = [x[2:] for x in data]

        print("Loading the classifier")
        classifier = data_io.load_model()
        print(classifier.feature_importances_)

        print("Making predictions")
        predictions = list(classifier.predict_proba(features)[:, 1])

        # Group (score, paper) pairs by author.
        author_predictions = defaultdict(list)
        for (a_id, p_id), pred in zip(author_paper_ids, predictions):
            author_predictions[a_id].append((pred, p_id))

        # Per author, list paper ids from highest to lowest score.
        paper_predictions = {}
        for author_id in sorted(author_predictions):
            paper_ids_sorted = sorted(author_predictions[author_id],
                                      reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

        print("Writing predictions to file")
        data_io.write_submission(paper_predictions)

        print("Prediction completed, exit...")
        # NOTE(review): Abort is kept on rank 0 only, after the submission is
        # written; calling it from the worker ranks would terminate the job
        # while rank 0 is still predicting - confirm this matches the intent.
        comm.Abort()
def run(self):
    """Predict on the validation dataset and write the flattened predictions.

    When ``f.preprocessedFeatures`` is non-empty, those columns are copied
    from the intermediate validation data into the dataset before
    prediction.
    """
    valid = self.getValidationDataset()

    if f.preprocessedFeatures != []:
        intermediate = data_io.read_intermediate_valid()
        for i in f.preprocessedFeatures:
            valid[i] = intermediate[i]

    # print() calls with a single string: identical output on Python 2,
    # and valid syntax on Python 3.
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict(valid)
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Predict on the validation pairs and write the flattened predictions."""
    print("Reading the valid pairs")
    pairs = data_io.read_valid_pairs()

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    flat_predictions = model.predict(pairs).flatten()

    print("Writing predictions to file")
    data_io.write_submission(flat_predictions)
def main():
    """Predict the test pairs with one or more named models.

    Options: ``-m <classifier key | all>`` selects the model(s) (default
    ``basic_python_benchmark``), ``-n <rows>`` limits the number of rows
    read, ``-h`` prints help.  Each model's flattened predictions are
    written to ``<modelname>.csv``.
    """
    cf = ClassifierFactory()
    filename = None
    modelnames = ["basic_python_benchmark"]
    numRows = None

    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:h")
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == "-f":
            # NOTE(review): this value is never used - the output name is
            # always rebuilt from the model name below.  Confirm what -f is
            # meant to control before wiring it in.
            filename = a
        elif o == "-n":
            numRows = int(a)
        elif o == "-m":
            if a == "all":
                modelnames = list(cf.get_all_keys())
            elif cf.is_valid_key(a):
                modelnames = [a]
        elif o == "-h":
            print('options:')
            print("\t -m [classifier key | all]")
            print("\t -f [filename]")
            sys.exit(0)
        else:
            print("try help: python predict.py -h")
            sys.exit(1)

    print("Reading the test pairs")
    test = data_io.read_test_pairs(numRows)
    testInfo = data_io.read_test_info(numRows)
    test['A type'] = testInfo['A type']
    test['B type'] = testInfo['B type']

    for modelname in modelnames:
        # Formatted into one string so print() produces the same text on
        # Python 2 and Python 3.
        print("Loading the classifier: %s"
              % (cf.get_classifier_name(modelname),))
        classifier = data_io.load_model(modelname)

        print("Making predictions")
        predictions = classifier.predict(test)
        predictions = predictions.flatten()

        filename = modelname + '.csv'
        data_io.write_submission(predictions, filename)
def reclassify():
    """Re-score the stored questions and update their close likelihood.

    Questions are fetched from Postgres, scored with the model stored in
    ``model.pickle``, and one minus the second ``predict_proba`` column is
    written back per ``PostId``.
    """
    print("Getting the questions in the database")
    questions = get_questions_from_postgres()
    question_count = len(questions)
    print("%d questions retrieved" % question_count)

    print("Loading the trained model")
    model = data_io.load_model("model.pickle")

    print("Making predictions")
    class_probs = model.predict_proba(questions)
    prob_closed = 1 - class_probs[:, 1]

    update_postgres_close_likelihood(questions["PostId"], prob_closed)
def prediction(n_train_samples):
    """Average three classifiers' probabilities and write the recommendations.

    Processed test samples are read from the cached CSV when it exists;
    otherwise the raw test data is read, NAs are filled with 0, the samples
    are processed and the cache is written.  The random forest, gradient
    boosting and SGD models each contribute the second ``predict_proba``
    column; the negated mean is written to
    ``mean_result_<n_train_samples>.csv``.
    """
    proc_test_samples_file = get_paths()["proc_test_samples_path"]
    if os.path.exists(proc_test_samples_file):
        print("Loading processed test data...")
        new_test_samples = pd.read_csv(proc_test_samples_file)
    else:
        print("Reading test data...")
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print("Porcessing test samples")
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
    test_feature = new_test_samples.values

    # (display label, model file) for each ensemble member, in the order
    # the scores are stacked.
    ensemble = (
        ("Random Forest", "rf_classifier.pkl"),
        ("Gradient Boosting", "gb_classifier.pkl"),
        ("SGD", "sgd_classifier.pkl"),
    )
    member_scores = []
    for label, model_file in ensemble:
        print("Loading the %s Classifier" % label)
        classifier = data_io.load_model(model_name=model_file)
        print("%s Predicting" % label)
        member_scores.append(classifier.predict_proba(test_feature)[:, 1])

    prob_arr = np.vstack(member_scores)
    mean_score = np.mean(prob_arr, axis=0)
    # NOTE(review): negated, presumably so an ascending sort downstream puts
    # the best candidate first - confirm in data_io.write_submission.
    mean_score = -1.0 * mean_score

    mean_recommendations = zip(new_test_samples["srch_id"],
                               new_test_samples["prod_id"], mean_score)

    print("Writing predictions to file")
    data_io.write_submission(
        mean_recommendations,
        submission_file="mean_result_%i.csv" % n_train_samples)
def main():
    """Train the pipeline on the training data and save it as ``model.pickle``.

    ``BodyMarkdown`` is converted with ``PagedownToHtml`` before the pipeline
    from ``get_pipeline()`` is fitted against the ``OpenStatus`` column.
    """
    markdown = PagedownToHtml()

    print("Reading in the training data")
    train = data_io.get_train_df()
    # Assign the converted column as a whole: element-wise chained
    # assignment (train["BodyMarkdown"][i] = ...) may write to a temporary
    # copy, leaving the frame unchanged.
    train["BodyMarkdown"] = train["BodyMarkdown"].apply(markdown.convert)

    print("Extracting features and training")
    classifier = get_pipeline()
    classifier.fit(train, train["OpenStatus"])

    print("Saving the classifier")
    data_io.save_model(classifier, "model.pickle")
    # The saved model is loaded back immediately; the result is not used.
    # NOTE(review): presumably a check that the file can be read - confirm.
    model = data_io.load_model("model.pickle")
def main():
    """Score the private leaderboard file and print the "open" AUC.

    ``BodyMarkdown`` is converted with ``PagedownToHtml``, the model stored
    in ``model.pickle`` produces class probabilities, and the second
    probability column is compared with the ``open`` column of the private
    leaderboard solution.
    """
    markdown = PagedownToHtml()

    print("Reading the private leaderboard file")
    test = data_io.get_test_df()
    # Assign the converted column as a whole: element-wise chained
    # assignment (test["BodyMarkdown"][i] = ...) may write to a temporary
    # copy, leaving the frame unchanged.
    test["BodyMarkdown"] = test["BodyMarkdown"].apply(markdown.convert)

    print("Loading the trained model")
    classifier = data_io.load_model("model.pickle")

    print("Making predictions")
    probs = classifier.predict_proba(test)

    solution = data_io.get_private_leaderboard_solution_df()
    print("Open AUC: %0.6f" % metrics.auc(solution["open"], probs[:, 1]))
def main():
    """Predict on the typed validation pairs and write the flattened result.

    The validation pairs and their info frame are joined column-wise and
    passed through ``train.get_types`` before prediction.  The submission
    is written to ``fn``, a name defined outside this function.
    """
    print("Reading the valid pairs")
    pairs = data_io.read_valid_pairs()
    pair_info = data_io.read_valid_info()
    combined = pd.concat([pairs, pair_info], axis=1)
    typed_valid = train.get_types(combined)

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    flat_predictions = model.predict(typed_valid).flatten()

    print("Writing predictions to file")
    data_io.write_submission(flat_predictions, fn)
def main():
    """Predict ``Cover_Type`` for the cleaned test data and write it with the ids.

    The ``Id`` column is set aside for the submission and removed from the
    features before prediction; predictions are cast to ``int``.
    """
    print("Loading the test data")
    model = data_io.load_model()

    print("Load test data. And Clean..")
    raw_frame = data_io.get_test_df()
    frame = FeatureConverter().clean_data(raw_frame)

    # Keep the ids for the output, then drop them from the feature columns.
    passengerIds = frame['Id']
    frame.drop(['Id'], axis=1, inplace=True)
    feature_values = frame.values

    print("Making predictions")
    predicted = model.predict(feature_values).astype(int)

    print("Writing predictions to file")
    data_io.write_submission(predicted, passengerIds, ['Id', 'Cover_Type'])
def main():
    """Rank validation papers per author with the stored "mlp_" network.

    Feature rows come from the local pickle cache ``features_valid.obj`` when
    it exists, otherwise from ``data_io.get_features_db("ValidPaper")`` (and
    are then cached).  Each row's five feature values are converted to ints
    and propagated through the network layer by layer; the first output of
    the last layer is used as the paper's score.
    """
    print("Getting features for valid papers from the database")
    cache_path = "features_valid.obj"
    if os.path.exists(cache_path):
        # Pickle data is binary, so the cache must be opened in binary mode.
        # NOTE(review): unpickling runs arbitrary code; only load a cache
        # file this script produced itself.
        with open(cache_path, 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open(cache_path, 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    # Exactly five feature values per row are expected (the unpacking
    # raises otherwise).
    predictInts = []
    for tup in features:
        a, b, c, d, e = tup
        predictInts.append((int(a), int(b), int(c), int(d), int(e)))

    print("Loading the classifier")
    mlp = data_io.load_model(prefix="mlp_")

    print("Making predictions")
    predictions = []
    for x in predictInts:
        # Propagate the inputs forward to compute the outputs.
        # outp holds the previous layer's output, i.e. the next layer's input.
        outp = list(x)
        for layer in mlp.layers[1:]:  # all layers after the input layer
            for i in range(layer.nNeurons):
                # Row 0 of W is added separately from the weighted sum over
                # the remaining rows.
                layer.net[i] = weightedSum(outp, layer.W[1:, i]) + layer.W[0, i]
                # Pass the net input through this layer's transfer function.
                layer.out[i] = g(layer.net[i], layer.transferF)
            outp = layer.out
        predictions.append(mlp.layers[-1].out[0])

    # Group (score, paper) pairs by author.
    author_predictions = defaultdict(list)
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Per author, list paper ids from highest to lowest score.
    paper_predictions = {}
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="mlp_")
def main():
    """Predict ``Cover_Type`` for the cleaned test data and write it with the ids.

    The ``Id`` column is set aside for the submission and removed from the
    features before prediction; predictions are cast to ``int``.
    """
    print("Loading the test data")
    model = data_io.load_model()

    print("Load test data. And Clean..")
    raw_frame = data_io.get_test_df()
    frame = FeatureConverter().clean_data(raw_frame)

    # Keep the ids for the output, then drop them from the feature columns.
    passengerIds = frame['Id']
    frame.drop(['Id'], axis=1, inplace=True)
    feature_values = frame.values

    print("Making predictions")
    predicted = model.predict(feature_values).astype(int)

    print("Writing predictions to file")
    data_io.write_submission(predicted, passengerIds, ['Id', 'Cover_Type'])
def main():
    """Rank validation papers per author with the "forest_" model plus keyword features.

    Feature rows come from the local pickle cache ``features_valid.obj`` when
    it exists, otherwise from ``data_io.get_features_db("ValidPaper")`` (and
    are then cached).  Each row is extended with the values after the first
    two fields of the matching keyword-feature row, converted to an int32
    array and scored with the second ``predict_proba`` column.
    """
    print("Getting features for valid papers from the database")
    cache_path = "features_valid.obj"
    if os.path.exists(cache_path):
        # Pickle data is binary, so the cache must be opened in binary mode.
        # NOTE(review): unpickling runs arbitrary code; only load a cache
        # file this script produced itself.
        with open(cache_path, 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open(cache_path, 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    # code for including keywords match feature
    # print() call: same output on Python 2, valid syntax on Python 3.
    print("adding addtional features...")
    import additional_features as af
    all_features = af.get_additional_features()
    _, _, kw_features = all_features
    # Index loop kept on purpose: rows are matched by position and a
    # missing keyword row raises instead of being skipped silently.
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    featuresnp = np.array(features, dtype='int32')

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = list(classifier.predict_proba(featuresnp)[:, 1])

    # Group (score, paper) pairs by author.
    author_predictions = defaultdict(list)
    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Per author, list paper ids from highest to lowest score.
    paper_predictions = {}
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
def predict_write(data, predict_type):
    """Score author/paper rows and write a per-author ranked submission.

    Args:
        data: rows whose first two items are ``(author_id, paper_id)`` and
            whose remaining items are the feature values.
        predict_type: ``"valid"`` reads ``dataRev2/Valid.csv``; any other
            value reads ``dataRev2/Test.csv``. It is also forwarded to
            ``data_io.write_submission``.

    Each paper id is emitted as many times as
    ``parse_targetset_maintain_duplicate`` reports for its
    ``(author_id, paper_id)`` key, in descending score order, and the
    resulting list is passed through ``processDuplicates``.
    """
    pair_ids = [row[:2] for row in data]
    feature_rows = [row[2:] for row in data]

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    feature_rows = np.array(feature_rows)  # This line is for xgboost
    scores = list(model.predict_proba(feature_rows)[:, 1])

    if predict_type == "valid":
        csv_path = 'dataRev2/Valid.csv'
    else:
        csv_path = 'dataRev2/Test.csv'
    targetset = pd.read_csv(csv_path)
    parsed_counter = parse_targetset_maintain_duplicate(targetset)

    # Group (score, paper_id) tuples under their author.
    by_author = defaultdict(list)
    for (a_id, p_id), score in zip(pair_ids, scores):
        by_author[a_id].append((score, p_id))

    paper_predictions = {}
    for author_id in sorted(by_author):
        ranked = sorted(by_author[author_id], reverse=True)
        expanded = []
        for _score, pid in ranked:
            repeat = parsed_counter[author_id, pid]
            expanded.extend(pid for _unused in range(repeat))
        paper_predictions[author_id] = processDuplicates(expanded)

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, predict_type)
def main():
    """Extract features for the valid pairs, score them and write a submission.

    The score for each row is column 2 minus column 0 of the classifier's
    ``predict_proba`` output. NaNs in the transformed feature matrix are
    replaced via ``np.nan_to_num`` and the matrix is saved before scoring.
    """
    print("Reading the valid pairs")
    valid_pairs = data_io.read_valid_pairs()
    extractor = fe.feature_extractor()

    print("Transforming features")
    matrix = extractor.fit_transform(valid_pairs)
    matrix = np.nan_to_num(matrix)

    print("Saving Valid Features")
    data_io.save_valid_features(matrix)

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    # A disabled variant picked one of four classifiers per row depending on
    # whether valid_info["A type"] / ["B type"] equal "Numerical"; only the
    # single-classifier path is live. The info file is still read here.
    valid_info = data_io.read_valid_info()

    class_probs = model.predict_proba(matrix)
    scores = (class_probs[:, 2] - class_probs[:, 0]).flatten()

    print("Writing predictions to file")
    data_io.write_submission(scores)
def predict_feature_from_aid(data):
    """Return the 25 highest-scoring ``(score, paper_id)`` tuples for ``data``.

    Args:
        data: rows whose first two items are ``(author_id, paper_id)`` and
            whose remaining items are the feature values.

    Returns:
        A list of at most 25 ``(score, paper_id)`` tuples in descending
        order, where the score is column 1 of ``predict_proba``.
    """
    pair_ids = [row[:2] for row in data]
    feature_rows = [row[2:] for row in data]

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions\n")
    feature_rows = np.array(feature_rows)  # This line is for xgboost
    scores = list(model.predict_proba(feature_rows)[:, 1])

    # The author id is unpacked only to validate the pair shape.
    scored = [(score, p_id) for (_a_id, p_id), score in zip(pair_ids, scores)]
    scored.sort(reverse=True)
    return scored[:25]
def main():
    """Score the test rows and write ``(srch_id, prop_id, score)`` recommendations.

    Missing values are filled with 0, every column except ``date_time`` is
    used as a feature, and the written score is the negated column 1 of
    ``predict_proba``.
    """
    print("Reading test data")
    frame = data_io.read_test()
    frame.fillna(0, inplace=True)

    # Everything but the timestamp column goes to the model.
    columns = list(frame.columns)
    columns.remove("date_time")
    matrix = frame[columns].values

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    positive = model.predict_proba(matrix)[:, 1]
    negated = list(-1.0 * positive)
    recommendations = zip(frame["srch_id"], frame["prop_id"], negated)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)
def run(self):
    """Build or load wndchrm features, predict, write submissions, return results.

    When ``self.loadWndchrm`` is false, candidate features are either
    calculated (``self.load`` false) or loaded (``self.load`` true) from the
    file named by ``data_io.get_savez_name_test()``, patch images are saved,
    and wndchrm is executed on the observations. Otherwise previously
    computed wndchrm features are loaded directly.

    Returns:
        The value of ``Predictor.finalResults(namesObservations,
        predictions, coordinates)``.
    """
    # print(...) with one argument behaves the same on Python 2 and 3.
    print("Preparing the environment")
    self.prepareEnvironment()

    print("Loading the classifier")
    classifier = data_io.load_model()
    imageCollections = data_io.get_valid_df()
    featureGetter = FeatureGetter()
    wndchrmWorker = WndchrmWorkerPredict()

    print("Getting the features")
    if not self.loadWndchrm:  # Last wndchrm set of features
        fileName = data_io.get_savez_name_test()
        if not self.load:  # Last features calculated from candidates
            (namesObservations, coordinates, _) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
        else:
            (namesObservations, coordinates, _) = Utils.loadFeatures(fileName)
        print("Saving images")
        imageSaver = ImageSaver(coordinates, namesObservations, imageCollections, featureGetter.patchSize)
        imageSaver.saveImages()
        print("Executing wndchrm algorithm")
        valid = wndchrmWorker.executeWndchrm(namesObservations)
    else:
        # NOTE(review): `coordinates` is never assigned on this path, so the
        # write_submission calls below raise UnboundLocalError when
        # self.loadWndchrm is true. The right source for the coordinates is
        # not visible from here -- confirm and fix.
        (valid, namesObservations) = wndchrmWorker.loadWndchrmFeatures()

    print("Making predictions")
    predictions = classifier.predict(valid)
    # Column vector: one prediction per row.
    predictions = predictions.reshape(len(predictions), 1)

    print("Writing predictions to file")
    data_io.write_submission(namesObservations, coordinates, predictions)
    data_io.write_submission_nice(namesObservations, coordinates, predictions)

    print("Calculating final results")
    return Predictor.finalResults(namesObservations, predictions, coordinates)
def _iter_user_batches(rows):
    """Yield ``(uid, user_product_ids, features)`` per run of rows sharing a uid.

    ``rows`` is an iterable of CSV records whose first two fields are the
    user and product ids and whose remaining fields are numeric features.
    Rows are grouped by *consecutive* uid, and the final run is yielded too.
    """
    batch_uid = None
    user_product_ids = []
    features = []
    for entry in rows:
        # Ids go through float first so values such as "12.0" still parse.
        uid, pid = (int(float(value)) for value in entry[:2])
        if user_product_ids and uid != batch_uid:
            yield batch_uid, user_product_ids, features
            user_product_ids, features = [], []
        batch_uid = uid
        user_product_ids.append([uid, pid])
        # A real list (not a lazy map object) so it is usable on Python 3.
        features.append([float(value) for value in entry[2:]])
    if user_product_ids:
        # No following row exists to trigger a flush for the last user.
        yield batch_uid, user_product_ids, features


def main():
    """Generate per-user product recommendations and write the submission.

    Command line: ``python predict.py -mPred N``.

    * ``N > 0``: for every user, keep the ``N`` products with the highest
      column-1 ``predict_proba`` score.
    * otherwise: for every product present in ``getProductSellNum()``,
      recommend it to its top-scoring users, as many as its sell number.

    Rows are read from ``settings["GBT_TEST_FILE"]`` and scored one user
    batch at a time; a running batch count is printed as progress.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-mPred', type=int, action='store', dest='rec_num',
                        help='specify how to generate recommendation result.')
    if len(sys.argv) != 3:
        print('Command e.g.: python predict.py -mPred(0 or >0)')
        sys.exit(1)
    # Parse before loading the model so bad arguments fail fast.
    para = parser.parse_args()

    classifier = data_io.load_model()
    user_recommend_result = defaultdict(list)

    per_user = para.rec_num > 0
    if not per_user:
        product_sellnum = getProductSellNum()
        product_predictions = defaultdict(list)

    finished_num = 0
    # The context manager closes the test file even if scoring fails.
    with open(settings["GBT_TEST_FILE"]) as test_file:
        for uid, user_product_ids, features in _iter_user_batches(csv.reader(test_file)):
            predictions = list(classifier.predict_proba(features)[:, 1])
            if per_user:
                ranked = sorted(
                    ((pred, t_pid) for (_t_uid, t_pid), pred in zip(user_product_ids, predictions)),
                    reverse=True)
                pid_sorted = [t_pid for _pred, t_pid in ranked]
                user_recommend_result[uid] = pid_sorted[:para.rec_num]
            else:
                for (t_uid, t_pid), pred in zip(user_product_ids, predictions):
                    product_predictions[t_pid].append((pred, t_uid))
            finished_num += 1
            print(finished_num)

    if not per_user:
        for pid in product_predictions:
            if pid not in product_sellnum:
                continue
            ranked = sorted(product_predictions[pid], reverse=True)
            # Each product goes to at most product_sellnum[pid] users.
            for _pred, uid in ranked[:product_sellnum[pid]]:
                user_recommend_result[uid].append(pid)

    data_io.write_submission(user_recommend_result)