def main():
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" %
                      (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)
        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        # Align scores with the papers that actually produced features
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
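
All of these examples depend on a project-local data_io module rather than a shared library, so its functions differ slightly from project to project. A minimal sketch of the interface the example above assumes (paths and file formats are hypothetical, not from the original):

import pickle
import pandas as pd

def read_test():
    # Hypothetical location; each project stores its test set differently.
    return pd.read_csv("data/test.csv")

def load_model(path="data/model.pkl"):
    # Unpickle a previously trained scikit-learn model.
    with open(path, "rb") as f:
        return pickle.load(f)

def write_submission(predictions, path="data/submission.csv"):
    # Output format is competition-specific; a plain CSV is assumed here.
    pd.DataFrame(predictions).to_csv(path, index=False)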
def main():
    print("Reading the test data") 
    test = data_io.read_test()

    print("Making predictions")
    np.random.seed(12341234) 
    predictions = test.apply(shuffle, axis=1)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
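
The shuffle helper applied to each row is not part of this snippet. A plausible sketch of what it does, assuming each test row carries a list of candidate ids under "PaperIds" as in the first example (the real helper lives elsewhere in the project):

def shuffle(row):
    # Return the row's paper ids in a random order.
    paper_ids = list(row["PaperIds"])
    np.random.shuffle(paper_ids)
    return paper_ids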
def main():
    print("Reading test data")
    test = data_io.read_test()

    ordinals = np.arange(len(test))

    recommendations = zip(test["srch_id"], test["prop_id"], ordinals)

    print("Writing predictions to file")
    data_io.write_submission(recommendations, "testOrderBenchmark.csv")
Example #6
    def __init__(self):
        self.train = data_io.read_train()
        self.test = data_io.read_test()
        self.destin = data_io.read_desin()

        # PCA on the 149 destination latent features (d1..d149),
        # reduced to 3 components and keyed by srch_destination_id
        pca = PCA(n_components=3)
        self.dest_pca = pca.fit_transform(
            self.destin[["d{0}".format(i + 1) for i in range(149)]])
        self.dest_pca = pd.DataFrame(self.dest_pca)
        self.dest_pca["srch_destination_id"] = self.destin[
            "srch_destination_id"]
def main():
    test = data_io.read_test()
    ## deal with the NAs, and add features
    train.feature_eng(test)

    ## predict the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)
    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    b_prob = classifier.predict_proba(b_test_f)[:, 1]
    # Keep as a NumPy array so the weighted sum below stays elementwise
    # (a list here would make 4 * b_prob repeat the list); negate so that
    # higher probabilities rank first.
    b_prob = -1.0 * b_prob
    print("Time used,")
    print(datetime.now() - tstart)

    ## predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)
    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = classifier.predict_proba(c_test_f)[:, 1]
    c_prob = -1.0 * c_prob
    print("Time used,")
    print(datetime.now() - tstart)

    ## Make recommendations, weighting the booking probability 4x the click probability
    recommendations = zip(test["srch_id"], test["prop_id"],
                          4 * b_prob + c_prob)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)
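
Note that the weighted sum 4 * b_prob + c_prob is elementwise only because the probabilities are kept as NumPy arrays. Wrapping them in list(...) first, as several of the single-score examples below do before writing out, would silently turn the expression into list repetition and concatenation:

import numpy as np

b_prob = np.array([0.2, 0.9])
print(4 * b_prob)        # [0.8 3.6] -- elementwise, as intended
print(4 * list(b_prob))  # [0.2, 0.9, 0.2, 0.9, ...] -- the list repeated 4 times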
Example #9
def main():
    print("Reading test data ...")
    test = data_io.read_test()
    test.fillna(0, inplace=True)
    
    feature_names = list(test.columns)
    feature_names.remove("date_time")

    features = test[feature_names].values

    print("Loading the Regressor ...")
    regressor = data_io.load_model()

    print("Making predictions ...")
    predictions = regressor.predict(features)
    predictions = list(-1.0 * predictions)  # negate so higher scores rank first
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)
    print("Writing predictions to file ...")
    data_io.write_submission(recommendations)
Example #10
def main():
    print("Reading test data")
    test = data_io.read_test()
    test.fillna(0, inplace=True)

    feature_names = list(test.columns)
    feature_names.remove("date_time")

    features = test[feature_names].values

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(-1.0 * predictions)
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)
Example #11
def prediction(n_train_samples):
    proc_test_samples_file = get_paths()["proc_test_samples_path"]
    if os.path.exists(proc_test_samples_file):
        print("Loading processed test data...")
        new_test_samples = pd.read_csv(proc_test_samples_file)
    else:
        print("Reading test data...")
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print("Processing test samples")
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
    test_feature = new_test_samples.values

    print("Loading the Random Forest Classifier")
    rf_classifier = data_io.load_model(model_name="rf_classifier.pkl")
    print("Random Forest Predicting")
    rf_predictions = rf_classifier.predict_proba(test_feature)[:, 1]

    print("Loading the Gradient Boosting Classifier")
    gb_classifier = data_io.load_model(model_name="gb_classifier.pkl")
    print("Gradient Boosting Predicting")
    gb_predictions = gb_classifier.predict_proba(test_feature)[:, 1]

    print("Loading the SGD Classifier")
    sgd_classifier = data_io.load_model(model_name="sgd_classifier.pkl")
    print("SGD Predicting")
    sgd_predictions = sgd_classifier.predict_proba(test_feature)[:, 1]

    # Average the three classifiers' probabilities; negate so that
    # higher probabilities rank first.
    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))
    mean_score = np.mean(prob_arr, axis=0)
    mean_score = -1.0 * mean_score

    mean_recommendations = zip(new_test_samples["srch_id"],
                               new_test_samples["prop_id"], mean_score)

    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations,
                             submission_file="mean_result_%i.csv" % n_train_samples)
Example #12
          srch_length_of_stay_features.SrchLengthOfStayFeatures(self.X),
          srch_booking_window_features.SrchBookingWindowFeatures(self.X),
          ]

      return list(map(self.transformer, feature_list))
  


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate features using train/test data")
    parser.add_argument("--test", action="store_true", default=False, help="Whether to use test data", required=False)
    result = parser.parse_args()

    if result.test:
        print("Reading test data")
        data = data_io.read_test()
    else:
        print("Reading training data")
        data = data_io.read_train()

    fm = FeatureExtractor(data)
    derived_features = fm.feature_extractor()
    data.fillna(0, inplace=True)
    data = pandas.concat([data] + derived_features, axis=1)
  
    if result.test:
        data_io.save_test_features(data)
    else:
        data_io.save_train_features(data)
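
The transformer callable mapped over feature_list is not shown in this fragment. A plausible sketch of the pattern, assuming each feature object derives its own columns and returns them as a pandas object (the method name is hypothetical):

    def transformer(self, feature):
        # Hypothetical: each feature object computes its derived columns
        # from the raw frame and returns them as a DataFrame.
        return feature.calculate()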
      
def main():
    # ANSI escape codes for colored terminal output
    class bcolors:
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    print(bcolors.HEADER + "Start Training" + bcolors.ENDC)
    print(bcolors.OKBLUE + "Reading and making training set" + bcolors.ENDC)

    train = data_io.read_train()
    train.fillna(0, inplace=True)

    train_sample = train[:1250000].fillna(value=0)  # change the sample size over here

    # list of features that can be removed if you want
    feature_names = list(train_sample.columns)
    feature_names.remove("click_bool")
    feature_names.remove("booking_bool")
    feature_names.remove("gross_bookings_usd")
    feature_names.remove("date_time")
    feature_names.remove("position")

    features = train_sample[feature_names].values
    target = train_sample["booking_bool"].values

    print(bcolors.OKGREEN + "Training Dataset" + bcolors.ENDC)

    # check over here; you can find the algorithms at
    # http://scikit-learn.org/stable/modules/ensemble.html

    # random forest
    classifier = RandomForestClassifier(n_estimators=3200, verbose=2, n_jobs=-1,
                                        min_samples_split=10, random_state=1)

    # extra trees (better than random forest; best so far!)
    #classifier = ExtraTreesClassifier(n_estimators=300, verbose=2, n_jobs=-1, min_samples_split=10, random_state=1)

    # AdaBoost
    #classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1)

    # K-nearest neighbours with bagging
    #classifier = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

    # gradient boosting (possibly the best solution; will try tomorrow)
    #classifier = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, verbose=0)

    classifier.fit(features, target)

    print(bcolors.OKBLUE + "Saving Classifier" + bcolors.ENDC)
    data_io.save_model(classifier)

    print(bcolors.OKGREEN + "Start Making Predictions On Testset" + bcolors.ENDC)

    print(bcolors.OKBLUE + "Reading Testset" + bcolors.ENDC)
    test = data_io.read_test()
    test.fillna(0, inplace=True)

    feature_names = list(test.columns)
    feature_names.remove("date_time")

    features = test[feature_names].values

    classifier = data_io.load_model()

    print(bcolors.OKGREEN + "Make Predictions" + bcolors.ENDC)
    predictions = classifier.predict_proba(features)[:, 1]

    print(bcolors.OKBLUE + "Calculate NDCG" + bcolors.ENDC)
    predictions = list(-1.0 * predictions)

    print(bcolors.OKBLUE + "Sort Predictions" + bcolors.ENDC)
    recommendations = zip(test["srch_id"], test["prop_id"], predictions)

    print(bcolors.OKGREEN + "Writing Predictions To Outputfile" + bcolors.ENDC)
    data_io.write_submission(recommendations)

    print("")
    print("That's all folks, goodbye!")
Example #14
def do_prediction(n_train_samples):
    proc_test_samples_file = get_paths()['proc_test_samples_path']

    if os.path.exists(proc_test_samples_file):
        print("Loading processed test data...")
        new_test_samples = pd.read_csv(proc_test_samples_file)
        print("Loading processed test data done")
    else:
        # prediction
        print("Reading test data...")
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print("done.")

        # process test samples
        print("Processing test data...")
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
        print("Processing test data done.")

    test_features = new_test_samples.values

    # 5.1 random forest prediction
    print("Loading the random forest classifier")
    rf_classifier = data_io.load_model(model_name='rf_classifier.pkl')
    print("Random forest predicting")
    # take the probability of the positive class
    rf_predictions = rf_classifier.predict_proba(test_features)[:, 1]

    # 5.2 gradient boosting prediction
    print("Loading the gradient boosting classifier")
    gb_classifier = data_io.load_model(model_name='gb_classifier.pkl')
    print("Gradient boosting predicting")
    gb_predictions = gb_classifier.predict_proba(test_features)[:, 1]

    # 5.3 SGD prediction
    print("Loading the SGD classifier")
    sgd_classifier = data_io.load_model(model_name='sgd_classifier.pkl')
    print("SGD predicting")
    sgd_predictions = sgd_classifier.predict_proba(test_features)[:, 1]

    # 5.4 LR prediction
    # print("Loading the LR classifier")
    # lr_classifier = data_io.load_model(model_name='lr_classifier.pkl')
    # print("Logistic regression predicting")
    # lr_predictions = lr_classifier.predict_proba(test_features)[:, 1]

    # step 6: score fusion -- stack the three probability vectors into one array
    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))

    # arithmetic mean of the probabilities
    mean_score = np.mean(prob_arr, axis=0)
    # the submission sort is ascending, so multiply by -1 to rank descending
    mean_score = -1.0 * mean_score
    # geometric mean (didn't work as well in practice)
    gmean = stats.gmean(prob_arr, axis=0)
    # for sorting
    gmean = -1.0 * gmean

    # step 7: output results
    mean_recommendations = zip(new_test_samples['srch_id'],
                               new_test_samples['prop_id'], mean_score)
    gmean_recommendations = zip(new_test_samples['srch_id'],
                                new_test_samples['prop_id'], gmean)

    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations,
                             submission_file='mean_result_%i.csv' % n_train_samples)
    data_io.write_submission(gmean_recommendations,
                             submission_file='gmean_result_%i.csv' % n_train_samples)
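
For a concrete sense of the two fusion rules, a toy comparison of the arithmetic and geometric means on made-up probabilities (the numbers are illustrative only):

import numpy as np
from scipy import stats

# Three hypothetical classifiers scoring four items
prob_arr = np.vstack((np.array([0.9, 0.2, 0.6, 0.5]),
                      np.array([0.8, 0.1, 0.7, 0.5]),
                      np.array([0.7, 0.4, 0.1, 0.5])))

print(np.mean(prob_arr, axis=0))     # approx. [0.8, 0.233, 0.467, 0.5]
print(stats.gmean(prob_arr, axis=0)) # approx. [0.796, 0.2, 0.348, 0.5];
                                     # the geometric mean penalizes items the
                                     # classifiers disagree on (the third here)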