def main():
    """Rank every test author's papers by predicted probability and write them.

    For each row of the test frame, features are built per (paper, author)
    pair; pairs for which ``f.get_features`` returns ``None`` are reported and
    left out of that author's ranking.
    """
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" %
                      (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)

        if not features:
            # Nothing could be scored for this author; keep one entry per
            # test row so the output stays aligned with the test frame.
            predictions.append([])
            continue

        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        # Pair each score with the paper it was computed for.  Zipping with
        # row["PaperIds"] would shift scores onto the wrong papers whenever a
        # paper was skipped above.
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #2
0
def main():
    """Score the "ValidPaper" rows with the forest model and write rankings.

    Features are read from the local cache file ``features_valid.obj`` when it
    exists; otherwise they are fetched through ``data_io.get_features_db`` and
    cached.  The first two fields of every row are treated as the
    (author id, paper id) pair and the remaining fields as the feature vector.
    """
    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        # Pickle data is binary: the file must be opened in binary mode,
        # otherwise HIGHEST_PROTOCOL payloads get corrupted on platforms that
        # translate newlines.
        # NOTE: unpickling executes arbitrary code; only load a cache file
        # this script wrote itself.
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Highest score first; ties fall back to the larger paper id because the
    # (score, paper id) tuples are compared as a whole.
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
Exemple #3
0
def main():
    """Score "ValidPaper" rows with the stored logistic model and write rankings.

    Each row's first two fields are used as (author id, paper id); the other
    five fields are converted to floats and fed to the model.
    """
    print("Getting features for valid papers from the database")
    rows = data_io.get_features_db("ValidPaper")
    author_paper_ids = [row[:2] for row in rows]
    raw_features = [row[2:] for row in rows]

    # Exactly five feature values per row are expected; anything else raises.
    featuresfloat = [
        (float(a), float(b), float(c), float(d), float(e))
        for a, b, c, d, e in raw_features
    ]
    print("Totoal number of samples: ", len(featuresfloat))

    print("Loading the logistic regression model")
    logistic = data_io.load_model()

    print("Making predictions")
    scores = list(logistic.predict_proba(featuresfloat)[:,1])

    # Group (score, paper id) pairs per author.
    by_author = defaultdict(list)
    for (a_id, p_id), score in zip(author_paper_ids, scores):
        by_author[a_id].append((score, p_id))

    # Rank each author's papers from highest to lowest score.
    paper_predictions = {}
    for author_id in sorted(by_author):
        ranked = sorted(by_author[author_id], reverse=True)
        paper_predictions[author_id] = [p_id for _, p_id in ranked]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
def main():
    testset = pd.read_csv(path + "test_x.csv", index_col=0)

    ## deal with the NAs, and add features
    #train.feature_eng(test)

    ## predict
    print "Loading the predict_model classifier.."
    tstart = datetime.now()

    classifier = data_io.load_model("predict_model")
    print "Time used", datetime.now() - tstart

    print "Making predictions on the predict_model"
    tstart = datetime.now()
    fnames = ['year', 'month', 'trade_no', 'sigungu_no', 'price', 'monthly_expense']
    test_f = testset[fnames].values
    predic_proba = classifier.predict_proba(test_f)[:,1]

    print "Time used", datetime.now() - tstart

    ## Making prediction
    prediction = zip(testset['year'],
                        testset['month'],
                        testset['trade_no'],
                        testset['sigungu_no'],
                        testset['price'],
                        testset['monthly_expense'],
                        predic_proba)

    print "Writing predictions to file.."
    tstart = datetime.now()
    data_io.write_submission(prediction)
    print "Time used,", datetime.now() - tstart
def main():
    """Score the test features and write (srch_id, prop_id, score) triples.

    The value written for each row is the negated output of
    ``class_probabilities``.
    """
    print("Reading test data")
    chunks = data_io.read_test_features()
    test = pandas.concat(list(chunks), ignore_index=True)

    # Every column of the test frame is used as a feature.
    feature_names = list(test.columns)
    features = test[feature_names].values

    print("Loading the classifier")
    classifiers = data_io.load_model()

    print("Making predictions")
    predictions = class_probabilities(features, classifiers)
    print(predictions)
    negated = list(-1.0*predictions)
    recommendations = zip(test["srch_id"], test["prop_id"], negated)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)
Exemple #6
0
def main():
    """Predict on the validation features and write the flattened output.

    Prints the installed scikit-learn / numpy / pandas versions, loads the
    stored model, prepares the validation features (unused columns dropped,
    missing values replaced by 0), prints the feature importances and writes
    the predictions.  Exits with status 1 when no features were loaded.
    """
    print("sklearn version %s" %
          pkg_resources.get_distribution("scikit-learn").version)
    print("numpy version %s" % pkg_resources.get_distribution("numpy").version)
    print("pandas version %s" %
          pkg_resources.get_distribution("pandas").version)
    print("Loading the classifier")
    clf = data_io.load_model()

    X = data_io.load_matlab_valid_features()
    # The missing-file check has to run before X is used: in its previous
    # position, after X.fillna(0), it could never be reached with X = None.
    if X is None:
        print("No feature file found!")
        raise SystemExit(1)

    X = delete_unused_columns(X)
    X = X.fillna(0)

    print_importances(X, clf, 0.0)
    print("Predictions outcomes with shape: " + str(X.shape))
    print(clf)
    predictions = clf.predict(X)
    #predictions = clf.predict_pruned(X,3000)

    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Score "ValidPaper" rows and write each author's papers ranked by score.

    Also prints the loaded classifier's ``feature_importances_``.
    """
    print("Getting features for valid papers from the database")
    rows = data_io.get_features_db("ValidPaper")
    # First two fields: (author id, paper id); the rest: feature values.
    author_paper_ids = [row[:2] for row in rows]
    features = [row[2:] for row in rows]

    print("Loading the classifier")
    classifier = data_io.load_model()
    print(classifier.feature_importances_)

    print("Making predictions")
    scores = list(classifier.predict_proba(features)[:,1])

    by_author = defaultdict(list)
    for (a_id, p_id), score in zip(author_paper_ids, scores):
        by_author[a_id].append((score, p_id))

    # Descending by score (ties: larger paper id first).
    paper_predictions = {}
    for author_id in sorted(by_author):
        ranked = sorted(by_author[author_id], reverse=True)
        paper_predictions[author_id] = [p_id for _, p_id in ranked]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
Exemple #8
0
def main():
    """Write (srch_id, prop_id, score) triples for the test features.

    Scores are the output of ``class_probabilities`` multiplied by -1.
    """
    print("Reading test data")
    test = pandas.concat(list(data_io.read_test_features()),
                         ignore_index=True)

    # All columns are passed to the model.
    feature_names = list(test.columns)
    features = test[feature_names].values

    print("Loading the classifier")
    classifiers = data_io.load_model()

    print("Making predictions")
    predictions = class_probabilities(features, classifiers)
    print(predictions)
    scores = list(-1.0 * predictions)
    recommendations = zip(test["srch_id"], test["prop_id"], scores)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)
Exemple #9
0
def main():
    """Predict on the validation features and write the flattened output.

    Prints library versions, loads the stored model, drops unused feature
    columns, fills missing values with 0, prints feature importances and
    writes the predictions.  Exits with status 1 when no features were loaded.
    """
    print("sklearn version %s" %
          pkg_resources.get_distribution("scikit-learn").version)
    print("numpy version %s" % pkg_resources.get_distribution("numpy").version)
    print("pandas version %s" %
          pkg_resources.get_distribution("pandas").version)
    print("Loading the classifier")
    clf = data_io.load_model()

    X = data_io.load_matlab_valid_features()
    # Check for missing features before touching X; after X.fillna(0) the
    # check was unreachable because None has no fillna attribute.
    if X is None:
        print("No feature file found!")
        raise SystemExit(1)

    X = delete_unused_columns(X)
    X = X.fillna(0)

    print_importances(X, clf, 0.0)
    print("Predictions outcomes with shape: " + str(X.shape))
    print(clf)
    predictions = clf.predict(X)
    #predictions = clf.predict_pruned(X,3000)

    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #10
0
def main():
    """Score the "ValidPaper" rows with the forest model and write rankings.

    Uses ``features_valid.obj`` as a local pickle cache of the feature rows,
    creating it from ``data_io.get_features_db`` on first run.  Each row is
    split into an (author id, paper id) prefix and a feature-vector suffix.
    """
    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        # Binary mode is required for pickle data (text mode corrupts
        # HIGHEST_PROTOCOL payloads where newlines are translated).
        # NOTE: unpickling runs arbitrary code; the cache must be trusted.
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # Sort (score, paper id) tuples descending so the best paper comes first.
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
def main():
    """Rank every test author's papers by predicted probability and write them.

    Papers for which ``f.get_features`` returns ``None`` are reported on
    stdout and excluded from that author's ranking.
    """
    print("Reading the test data")
    test = data_io.read_test()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = []
    for author_id, row in test.iterrows():
        features = []
        paper_ids = []
        for paper_id in row["PaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,
                               computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" %
                      (author_id, paper_id))
            else:
                features.append(s)
                paper_ids.append(paper_id)

        if not features:
            # No scorable paper for this author: emit an empty ranking so the
            # output keeps one entry per test row.
            predictions.append([])
            continue

        feature_matrix = pd.DataFrame(features)
        preds = classifier.predict_proba(feature_matrix)[:, 1]
        # Use paper_ids (the papers that were actually scored), not
        # row["PaperIds"]: a skipped paper would otherwise misalign every
        # following score with the wrong paper id.
        paper_ids_sorted = sorted(zip(preds, paper_ids), reverse=True)
        print(paper_ids_sorted)
        predictions.append([x[1] for x in paper_ids_sorted])

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Score "ValidPaper" rows and write each author's papers ranked by score."""
    print("Getting features for valid papers from the database")
    records = data_io.get_features_db("ValidPaper")
    # Leading pair: (author id, paper id); trailing fields: features.
    author_paper_ids = [rec[:2] for rec in records]
    features = [rec[2:] for rec in records]

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    probabilities = list(classifier.predict_proba(features)[:,1])

    scored_by_author = defaultdict(list)
    for (a_id, p_id), prob in zip(author_paper_ids, probabilities):
        scored_by_author[a_id].append((prob, p_id))

    # Best-scored paper first for every author.
    paper_predictions = {}
    for author_id in sorted(scored_by_author):
        ordered = sorted(scored_by_author[author_id], reverse=True)
        paper_predictions[author_id] = [p_id for _, p_id in ordered]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
Exemple #13
0
def main():
    """Score "ValidPaper" rows with the stored logistic model and write rankings.

    Rows are split into an (author id, paper id) prefix and five feature
    values, which are converted to floats before prediction.
    """
    print("Getting features for valid papers from the database")
    records = data_io.get_features_db("ValidPaper")
    author_paper_ids = [rec[:2] for rec in records]
    feature_rows = [rec[2:] for rec in records]

    # Unpacking enforces exactly five values per row.
    featuresfloat = [
        (float(a), float(b), float(c), float(d), float(e))
        for a, b, c, d, e in feature_rows
    ]
    print("Totoal number of samples: ", len(featuresfloat))

    print("Loading the logistic regression model")
    logistic = data_io.load_model()

    print("Making predictions")
    probabilities = list(logistic.predict_proba(featuresfloat)[:, 1])

    scored_by_author = defaultdict(list)
    for (a_id, p_id), prob in zip(author_paper_ids, probabilities):
        scored_by_author[a_id].append((prob, p_id))

    # Highest probability first within each author.
    paper_predictions = {}
    for author_id in sorted(scored_by_author):
        ordered = sorted(scored_by_author[author_id], reverse=True)
        paper_predictions[author_id] = [p_id for _, p_id in ordered]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
Exemple #14
0
def get_cv_score():
    """Cross-validate the stored model on the training frame and print the result.

    Every column except 'label' is used as a feature; 'label' is the target.
    """
    classifier = data_io.load_model()
    train = data_io.get_train_df()

    feature_columns = [col for col in train.columns if col != 'label']
    scores = cv.cross_val_score(classifier, train[feature_columns],
                                train['label'])

    # Report the mean score with a two-standard-deviation spread.
    mean_score = scores.mean()
    spread = scores.std() * 2
    print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, spread))
Exemple #15
0
 def runWithoutWndchrm(self):
     """Predict on the validation images and return ``Predictor.finalResults``.

     Features are loaded from the saved file when ``self.load`` is truthy and
     recomputed with a ``FeatureGetter`` otherwise.  Predictions are written
     through both ``write_submission`` and ``write_submission_nice``.
     """
     print("Loading the classifier")
     classifier = data_io.load_model()
     imageCollections = data_io.get_valid_df()
     featureGetter = FeatureGetter()

     print("Getting the features")
     fileName = data_io.get_savez_name_test()
     if self.load:
         # Reuse the features calculated on a previous run.
         loaded = Utils.loadFeatures(fileName)
     else:
         loaded = Utils.calculateFeatures(fileName, featureGetter,
                                          imageCollections)
     namesObservations, coordinates, valid = loaded

     print("Making predictions")
     raw = classifier.predict(valid)
     # Reshape to a single column: one prediction per row.
     predictions = raw.reshape(len(raw), 1)

     print("Writing predictions to file")
     data_io.write_submission(namesObservations, coordinates, predictions)
     data_io.write_submission_nice(namesObservations, coordinates,
                                   predictions)

     print("Calculating final results")
     return Predictor.finalResults(namesObservations, predictions,
                                   coordinates)
def main():
    """Score (user, product) rows per user and write recommendations.

    Command line: ``-tv <threshold>`` and ``-t <0|1>`` (0 selects the
    validation files, 1 the submission files, as named in ``settings``).

    The input CSV is read row by row; the first two fields are the user and
    product ids, the rest are float features.  Consecutive rows of the same
    user are scored together.  Every (user, product, prediction) triple is
    written to the feature file, and products whose prediction exceeds the
    threshold are collected per user and passed to
    ``data_io.write_submission``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-tv',
                        type=float,
                        action='store',
                        dest='threshold_val',
                        help='specify how to generate recommendation result.')
    parser.add_argument('-t',
                        type=int,
                        action='store',
                        dest='target',
                        help='for validation or test dataset')

    if len(sys.argv) != 5:
        print('Command e.g.: python predict.py -tv 0.8 -t 0(1)')
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        file_name = settings["MTLR_TEST_FILE"]
        gbt_feature_file = settings["MTLR_FEATURE_FILE"]
    elif para.target == 1:
        file_name = settings["MTLR_TEST_FILE_FOR_SUBMIT"]
        gbt_feature_file = settings["MTLR_FEATURE_FILE_FOR_SUBMIT"]
    else:
        # Any other value used to fall through and fail later with a
        # NameError on the unset file names.
        print('Invalid test data target choice...')
        sys.exit(1)

    classifier = data_io.load_model(settings["MTLR_MODEL_FILE"])
    user_recommend_result = defaultdict(list)

    # Context managers make sure both files are flushed and closed.
    with open(gbt_feature_file, "w") as out_file, open(file_name) as in_file:
        writer = csv.writer(out_file, lineterminator="\n")

        def score_batch(batch_ids, batch_features):
            """Score one user's rows, log them and record recommendations."""
            predictions = classifier.predict_proba(batch_ids, batch_features)
            for (t_uid, t_pid), pred in zip(batch_ids, predictions):
                writer.writerow([t_uid, t_pid, pred])
                if pred > para.threshold_val:
                    user_recommend_result[t_uid].append(t_pid)

        features = []
        user_product_ids = []
        cache_uid = None
        for entry in csv.reader(in_file):
            feature = [float(value) for value in entry[2:]]
            uid, pid = map(int, entry[:2])
            if user_product_ids and uid != cache_uid:
                # A new user starts: score the rows gathered so far.
                score_batch(user_product_ids, features)
                features = []
                user_product_ids = []
            cache_uid = uid
            features.append(feature)
            user_product_ids.append([uid, pid])

        # The last user's rows are still pending when the input ends; they
        # were previously dropped without being scored.
        if user_product_ids:
            score_batch(user_product_ids, features)

    data_io.write_submission(user_recommend_result)
Exemple #17
0
def estimate(features, target):
    """Return ``predict_proba(features)`` from the stored classifier.

    ``target`` is accepted but not used by this function.
    """
    print("[INFO] Loading the classifier")
    model = data_io.load_model()

    print("[INFO] Making predictions")
    return model.predict_proba(features)
Exemple #18
0
def get_cv_score():
    """Print the cross-validation score of the stored model on the training frame.

    The 'label' column is the target; all remaining columns are features.
    """
    classifier = data_io.load_model()
    train = data_io.get_train_df()

    predictors = [name for name in train.columns if name != 'label']
    scores = cv.cross_val_score(classifier, train[predictors], train['label'])

    # Mean score and a spread of two standard deviations.
    summary = (scores.mean(), scores.std() * 2)
    print("Accuracy: %0.2f (+/- %0.2f)" % summary)
def main():
    """Combine booking and click predictions into one score per test row.

    Two models are loaded through ``data_io.load_model(True)`` (booking) and
    ``data_io.load_model(False)`` (click).  For each row the written score is
    ``-(4 * booking_probability + click_probability)``, emitted together with
    the row's ``srch_id`` and ``prop_id``.
    """
    test = data_io.read_test()
    ## deal with the NAs, and add features
    train.feature_eng(test)

    ## predict the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)
    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    b_prob = classifier.predict_proba(b_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)
    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = classifier.predict_proba(c_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## Making Recommendations
    # Combine the probabilities while they are still arrays.  Doing this on
    # Python lists repeated the booking list four times and appended the
    # click list, so after zip() truncation only the booking score was used.
    scores = list(-1.0 * (4 * b_prob + c_prob))
    recommendations = zip(test["srch_id"], test["prop_id"], scores)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)
Exemple #20
0
def main():
    """Write one prediction per validation row: the loaded model times one."""
    print("Loading the model")
    model = data_io.load_model()

    print("Making predictions")
    row_count = len(data_io.get_valid_df())
    # Multiplying by a vector of ones repeats the model value for every row.
    predictions = model * np.ones(row_count)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Emit the loaded model value once for every row of the validation frame."""
    print("Loading the model")
    model = data_io.load_model()

    print("Making predictions")
    valid_frame = data_io.get_valid_df()
    ones = np.ones(len(valid_frame))
    predictions = model * ones

    print("Writing predictions to file")
    data_io.write_submission(predictions)
def main():
    """Predict on the test data with the stored classifier and write the result."""
    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    test_data = data_io.get_test()
    output = model.predict(test_data)

    print("Writing predictions to file")
    data_io.write_submission(output)
def main():
    """Assign cluster labels to users and items and write them as CSV.

    Command line: ``-t <0|1> -c1 <user clusters> -c2 <item clusters>``.
    ``-t`` selects which set of files from ``settings`` is used; the two
    cluster counts are parsed but not used by this function.

    Each input row is parsed as integers: the first value is the id, the rest
    are the features given to the stored cluster model.  One ``id,label`` row
    is written per input row.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store',
            dest='target', help='for validation or test dataset')
    parser.add_argument('-c1', type=int, action='store',
            dest='ucluster_num', help='cluster number of users')
    parser.add_argument('-c2', type=int, action='store',
            dest='icluster_num', help='cluster number of items')

    if len(sys.argv) != 7:
        print('Command e.g.: python cluster.py -t 0(1) -c1 20 -c2 50')
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        user_train_file = settings["USER_CLUSTER_TRAIN_FILE"]
        item_train_file = settings["ITEM_CLUSTER_TRAIN_FILE"]
        user_cluster_file = settings["USER_CLUSTER_TEST_FILE"]
        item_cluster_file = settings["ITEM_CLUSTER_TEST_FILE"]
    elif para.target == 1:
        user_train_file = settings["USER_CLUSTER_TRAIN_FILE_FOR_SUBMIT"]
        item_train_file = settings["ITEM_CLUSTER_TRAIN_FILE_FOR_SUBMIT"]
        user_cluster_file = settings["USER_CLUSTER_TEST_FILE_FOR_SUBMIT"]
        item_cluster_file = settings["ITEM_CLUSTER_TEST_FILE_FOR_SUBMIT"]
    else:
        print('Invalid train data target choice...')
        sys.exit(1)

    def read_int_rows(path):
        """Read a CSV file into a list of integer lists, closing the file."""
        with open(path) as handle:
            return [[int(value) for value in entry]
                    for entry in csv.reader(handle)]

    def write_cluster_labels(rows, model_file, out_path):
        """Predict a label for each row and write ``id,label`` lines."""
        ids = [entry[0] for entry in rows]
        features = [entry[1:] for entry in rows]
        # The with-block guarantees the output is flushed and closed.
        with open(out_path, "w") as handle:
            writer = csv.writer(handle, lineterminator="\n")
            cluster = data_io.load_model(model_file)
            labels = cluster.predict(features)
            for row_id, label in zip(ids, labels):
                writer.writerow([row_id, label])

    user_features = read_int_rows(user_train_file)
    item_features = read_int_rows(item_train_file)

    write_cluster_labels(user_features, settings["USER_CLUSTER_MODEL_FILE"],
                         user_cluster_file)
    write_cluster_labels(item_features, settings["ITEM_CLUSTER_MODEL_FILE"],
                         item_cluster_file)
def main():
    """Combine booking and click predictions into one score per test row.

    The booking model (``data_io.load_model(True)``) and the click model
    (``data_io.load_model(False)``) are applied in turn; each row is written
    as ``(srch_id, prop_id, -(4 * booking_probability + click_probability))``.
    """
    test = data_io.read_test()
    ## deal with the NAs, and add features
    train.feature_eng(test)

    ## predict the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)
    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    b_prob = classifier.predict_proba(b_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)
    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = classifier.predict_proba(c_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## Making Recommendations
    # The weighted sum must be computed on the arrays.  On Python lists,
    # `4 * b_prob + c_prob` is repetition plus concatenation, which made the
    # click score and the weight of 4 have no effect after zip() truncation.
    scores = list(-1.0 * (4 * b_prob + c_prob))
    recommendations = zip(test["srch_id"], test["prop_id"], scores)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)
Exemple #25
0
def test_trainingData():
    """Return the mean absolute error over the first 100 processed samples.

    Predictions come from the stored model applied to all processed features;
    only the first 100 target/prediction pairs enter the average.
    """
    print("Loading processed data ...")
    features, targets = data_io.load_processed_data()
    print("Loading Regressor ... ")
    regressor = data_io.load_model()
    print("Doing prediction ...")
    predictions = regressor.predict(features)

    pairs = zip(targets[:100], predictions[:100])
    # Errors are summed in ascending order, as sorted() returns them.
    abs_errors = sorted(abs(actual - predicted)
                        for actual, predicted in pairs)
    return sum(abs_errors) / len(abs_errors)
Exemple #26
0
def main():
    """Predict on the validation frame, round to integers and write the result."""
    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    valid_frame = data_io.get_valid_df()
    raw_predictions = model.predict(valid_frame)
    # np.rint rounds every prediction to the nearest integer value.
    predictions = np.rint(raw_predictions)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #27
0
def main():
    """Predict on the validation frame and write the result as a single column."""
    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    valid_frame = data_io.get_valid_df()
    raw_predictions = model.predict(valid_frame)
    # One row per prediction, one column wide.
    predictions = raw_predictions.reshape(len(raw_predictions), 1)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #28
0
def main():
    """Write validation predictions rounded to the nearest integer."""
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    validation = data_io.get_valid_df()
    # Predict, then round each value with np.rint.
    predictions = np.rint(classifier.predict(validation))

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #29
0
def main():
    """Write validation predictions reshaped into an (n, 1) column."""
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    validation = data_io.get_valid_df()
    flat = classifier.predict(validation)
    n_predictions = len(flat)
    predictions = flat.reshape(n_predictions, 1)

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #30
0
def main():
    """Predict each validation subset with its own model and write them all.

    ``data_io.get_valid_df()`` is iterated by key; for every key a model is
    loaded with that key and its predictions are stored as a single column.
    """
    valid = data_io.get_valid_df()
    P = {}
    for key in valid:
        print("Loading the classifier for %s" %key)
        model = data_io.load_model(key)
        print("Making predictions")
        column = model.predict(valid[key])
        # Store as an (n, 1) column.
        P[key] = column.reshape(len(column), 1)

    print("Writing predictions to file")
    data_io.write_submission(P)
def main():
    """Score (user, product) rows per user and write recommendations.

    Command line: ``-tv <threshold>`` and ``-t <0|1>`` (0 selects the
    validation files, 1 the submission files, as named in ``settings``).

    Rows of the input CSV start with the user id and product id followed by
    float features.  Consecutive rows of one user are scored in a single
    ``predict_proba`` call.  Each (user, product, prediction) triple goes to
    the feature file; products with a prediction above the threshold are
    collected per user and handed to ``data_io.write_submission``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-tv', type=float, action='store',
            dest='threshold_val', help='specify how to generate recommendation result.')
    parser.add_argument('-t', type=int, action='store',
            dest='target', help='for validation or test dataset')

    if len(sys.argv) != 5:
        print('Command e.g.: python predict.py -tv 0.8 -t 0(1)')
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        file_name = settings["MTLR_TEST_FILE"]
        gbt_feature_file = settings["MTLR_FEATURE_FILE"]
    elif para.target == 1:
        file_name = settings["MTLR_TEST_FILE_FOR_SUBMIT"]
        gbt_feature_file = settings["MTLR_FEATURE_FILE_FOR_SUBMIT"]
    else:
        # Without this branch an unknown target failed later with a
        # NameError because the file names were never assigned.
        print('Invalid test data target choice...')
        sys.exit(1)

    classifier = data_io.load_model(settings["MTLR_MODEL_FILE"])
    user_recommend_result = defaultdict(list)

    # Both files are closed (and the output flushed) when the block exits.
    with open(gbt_feature_file, "w") as out_file, open(file_name) as in_file:
        writer = csv.writer(out_file, lineterminator="\n")

        def score_batch(batch_ids, batch_features):
            """Score one user's rows, log them and record recommendations."""
            predictions = classifier.predict_proba(batch_ids, batch_features)
            for (t_uid, t_pid), pred in zip(batch_ids, predictions):
                writer.writerow([t_uid, t_pid, pred])
                if pred > para.threshold_val:
                    user_recommend_result[t_uid].append(t_pid)

        features = []
        user_product_ids = []
        cache_uid = None
        for entry in csv.reader(in_file):
            feature = [float(value) for value in entry[2:]]
            uid, pid = map(int, entry[:2])
            if user_product_ids and uid != cache_uid:
                # User changed: score everything gathered for the previous one.
                score_batch(user_product_ids, features)
                features = []
                user_product_ids = []
            cache_uid = uid
            features.append(feature)
            user_product_ids.append([uid, pid])

        # Score the final user's rows, which the loop alone never reaches.
        if user_product_ids:
            score_batch(user_product_ids, features)

    data_io.write_submission(user_recommend_result)
def estimate():
    """Return class probabilities for the svmlight test file.

    The sparse feature matrix is converted to a dense one before prediction;
    the targets stored in the file are read but not used.
    """
    sparse_features, _target = load_svmlight_file(data_io.read_test_svm())
    dense_features = sparse_features.todense()

    print("[INFO] Loading the classifier")
    model = data_io.load_model()

    print("[INFO] Making predictions")
    return model.predict_proba(dense_features)
def estimate():
    """Load the svmlight test file and return ``predict_proba`` on it.

    Features are densified with ``todense()``; the file's target vector is
    ignored.
    """
    svm_path = data_io.read_test_svm()
    loaded = load_svmlight_file(svm_path)
    features = loaded[0].todense()

    print("[INFO] Loading the classifier")
    classifier = data_io.load_model()

    print("[INFO] Making predictions")
    probabilities = classifier.predict_proba(features)
    return probabilities
def main():
    """Gather features over MPI, then predict and write rankings on rank 0.

    Ranks above 0 each fetch the feature named at position ``rank - 1`` of
    ``feature_list.txt``; rank 0 fetches the base validation data.  All parts
    are gathered on rank 0, which stacks them, transposes them into rows,
    scores them and writes each author's papers ordered by score.  Rank 0
    finally calls ``comm.Abort()``.
    """
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    conn = data_io.get_db_conn()
    # Close the feature list file once it has been read.
    with open("feature_list.txt") as feature_file:
        feature_name = feature_file.read().split()
    # if size < len(feature_name):	# to be done!
    for table_name in ["ValidPaper"]:
        if rank > 0:
            # getting features by parallel computing
            print("getting features at node " + str(rank))
            feature = data_io_parallel.get_features_db_parallel(
                conn, rank, table_name, feature_name[rank - 1])
        else:
            feature = data_io_parallel.get_trained_validation_data(
                conn, table_name)

        # sending features to rank 0
        print("sending features to node " + str(rank))
        features = comm.gather(feature, root=0)
        if rank == 0:
            temp = []
            for part in features:
                temp.extend(part)
            print("Successfully got the features from " + table_name)
            # Transpose so that each element of data is one row.  A real
            # list is needed because data is iterated twice below.
            data = [list(column) for column in np.array(temp).T]

    if rank == 0:
        author_paper_ids = [x[:2] for x in data]
        features = [x[2:] for x in data]

        print("Loading the classifier")
        classifier = data_io.load_model()
        print(classifier.feature_importances_)

        print("Making predictions")
        predictions = classifier.predict_proba(features)[:, 1]
        predictions = list(predictions)

        author_predictions = defaultdict(list)
        paper_predictions = {}

        for (a_id, p_id), pred in zip(author_paper_ids, predictions):
            author_predictions[a_id].append((pred, p_id))

        # Highest score first within each author.
        for author_id in sorted(author_predictions):
            paper_ids_sorted = sorted(author_predictions[author_id],
                                      reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

        print("Writing predictions to file")
        data_io.write_submission(paper_predictions)
        print("Prediction completed, exit...")
        comm.Abort()
Exemple #35
0
 def run(self):
     """Predict on the validation dataset and write the flattened predictions.

     When ``f.preprocessedFeatures`` is not an empty list, the named columns
     are copied from the intermediate validation data into the dataset first.
     """
     valid = self.getValidationDataset()
     if f.preprocessedFeatures != []:
         intermediate = data_io.read_intermediate_valid()
         for feature_name in f.preprocessedFeatures:
             valid[feature_name] = intermediate[feature_name]

     print("Loading the classifier")
     classifier = data_io.load_model()

     print("Making predictions")
     predictions = classifier.predict(valid).flatten()

     print("Writing predictions to file")
     data_io.write_submission(predictions)
def main():
    """Predict on the validation pairs and write the flattened predictions."""
    print("Reading the valid pairs")
    pairs = data_io.read_valid_pairs()

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    predictions = model.predict(pairs).flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #37
0
def main():
    """Load the validation pairs, predict with the stored model and write the output."""
    print("Reading the valid pairs")
    valid_pairs = data_io.read_valid_pairs()

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    raw_output = classifier.predict(valid_pairs)
    # Collapse the model output to one dimension before writing.
    predictions = raw_output.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #38
0
def main():
    """Run one or more saved classifiers over the test pairs.

    Command-line options (parsed with ``getopt``):
        -m KEY|all  classifier key to run, or ``all`` for every key reported
                    by ``ClassifierFactory.get_all_keys()``
                    (default: ``basic_python_benchmark``)
        -n ROWS     integer forwarded to ``data_io.read_test_pairs`` and
                    ``data_io.read_test_info``
        -f NAME     parsed, but currently has no effect (see NOTE below)
        -h          print the option summary and exit

    One submission named ``<model key>.csv`` is written per selected model.
    """
    cf = ClassifierFactory()

    filename = None
    modelnames = ["basic_python_benchmark"]
    numRows = None

    try:
        opts, args = getopt.getopt(sys.argv[1:], "f:m:n:h")
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == "-f":
            filename = a
        elif o == "-n":
            numRows = int(a)
        elif o == "-m":
            if a == "all":
                modelnames = list(cf.get_all_keys())
            elif cf.is_valid_key(a):
                modelnames = [a]
            else:
                # Previously an unknown key was dropped without any feedback.
                print("Unknown classifier key '%s'; keeping %s"
                      % (a, modelnames))
        elif o == "-h":
            print('options:')
            print("\t -m [classifier key | all]")
            print("\t -f [filename]")
            print("\t -n [number of rows]")
            sys.exit(0)
        else:
            print("try help: python predict.py -h")
            sys.exit(1)

    print("Reading the test pairs")
    test = data_io.read_test_pairs(numRows)
    testInfo = data_io.read_test_info(numRows)
    test['A type'] = testInfo['A type']
    test['B type'] = testInfo['B type']

    for modelname in modelnames:
        print("Loading the classifier: %s"
              % (cf.get_classifier_name(modelname),))
        classifier = data_io.load_model(modelname)

        print("Making predictions")
        predictions = classifier.predict(test)
        predictions = predictions.flatten()

        # NOTE(review): this overwrites whatever -f supplied, so the option
        # is effectively dead. Left as-is because the intended behaviour
        # with several models is not visible here.
        filename = modelname + '.csv'

        data_io.write_submission(predictions, filename)
def reclassify():
    """Re-score the questions held in Postgres and store the result.

    The value stored per ``PostId`` is ``1 - P[:, 1]``, where ``P`` is the
    trained model's ``predict_proba`` output for the retrieved questions.
    """
    print("Getting the questions in the database")
    posts = get_questions_from_postgres()
    print("%d questions retrieved" % len(posts))

    print("Loading the trained model")
    model = data_io.load_model("model.pickle")

    print("Making predictions")
    class_probs = model.predict_proba(posts)

    # Complement of the column-1 probability is what gets written back.
    update_postgres_close_likelihood(posts["PostId"], 1 - class_probs[:, 1])
Exemple #40
0
def prediction(n_train_samples):
    """Average three saved classifiers over the test set and write a ranking.

    Processed test samples are read from the path stored under
    ``get_paths()["proc_test_samples_path"]`` when that file exists;
    otherwise the raw test data is read, NaNs are filled with 0, the result
    is processed with ``process_test_samples`` and cached to that path.

    Args:
        n_train_samples: only used to build the output file name
            ``mean_result_<n_train_samples>.csv``.
    """
    proc_test_samples_file = get_paths()["proc_test_samples_path"]
    if os.path.exists(proc_test_samples_file):
        print("Loading processed test data...")
        new_test_samples = pd.read_csv(proc_test_samples_file)
    else:
        print("Reading test data...")
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print("Processing test samples")
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
    test_feature = new_test_samples.values

    # (display label, pickled model file) for every ensemble member.
    ensemble = (
        ("Random Forest", "rf_classifier.pkl"),
        ("Gradient Boosting", "gb_classifier.pkl"),
        ("SGD", "sgd_classifier.pkl"),
    )
    member_scores = []
    for label, model_file in ensemble:
        print("Loading the %s Classifier" % label)
        classifier = data_io.load_model(model_name=model_file)
        print("%s Predicting" % label)
        member_scores.append(classifier.predict_proba(test_feature)[:, 1])

    prob_arr = np.vstack(member_scores)
    mean_score = np.mean(prob_arr, axis=0)
    # Scores are negated before writing; presumably the writer sorts
    # ascending so the best candidate comes first -- confirm in data_io.
    mean_score = -1.0 * mean_score

    mean_recommendations = zip(new_test_samples["srch_id"], new_test_samples["prod_id"], mean_score)

    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations, submission_file="mean_result_%i.csv" % n_train_samples)
def main():
    """Train the pipeline on the training frame and save it as model.pickle.

    ``BodyMarkdown`` is converted with ``PagedownToHtml.convert`` before the
    pipeline from ``get_pipeline()`` is fitted against ``OpenStatus``.
    """
    markdown = PagedownToHtml()

    print("Reading in the training data")
    train = data_io.get_train_df()
    # Assign the whole column at once: the chained form
    # ``train["BodyMarkdown"][i] = ...`` may write to a temporary copy in
    # pandas and leave the frame unchanged.
    train["BodyMarkdown"] = train["BodyMarkdown"].map(markdown.convert)

    print("Extracting features and training")
    classifier = get_pipeline()
    classifier.fit(train, train["OpenStatus"])

    print("Saving the classifier")
    data_io.save_model(classifier, "model.pickle")
    # The saved model is reloaded straight away and the result discarded;
    # presumably a check that the pickle round-trips -- confirm.
    data_io.load_model("model.pickle")
def main():
    """Score the private leaderboard file and print the open-class AUC.

    ``BodyMarkdown`` is converted with ``PagedownToHtml.convert`` before the
    saved model's ``predict_proba`` is called on the frame.
    """
    markdown = PagedownToHtml()

    print("Reading the private leaderboard file")
    test = data_io.get_test_df()
    # Assign the whole column at once: the chained form
    # ``test["BodyMarkdown"][i] = ...`` may write to a temporary copy in
    # pandas and leave the frame unchanged.
    test["BodyMarkdown"] = test["BodyMarkdown"].map(markdown.convert)

    print("Loading the trained model")
    classifier = data_io.load_model("model.pickle")

    print("Making predictions")
    probs = classifier.predict_proba(test)

    solution = data_io.get_private_leaderboard_solution_df()
    # NOTE(review): `metrics` is not defined in this block; verify that its
    # auc() accepts (labels, scores) rather than curve points.
    print("Open AUC: %0.6f" % metrics.auc(solution["open"], probs[:,1]))
def main():
    """Predict on the validation pairs joined with their info columns.

    The pairs and info frames are concatenated column-wise, passed through
    ``train.get_types`` and then scored by the saved classifier.
    """
    print("Reading the valid pairs")
    pairs = data_io.read_valid_pairs()
    info = data_io.read_valid_info()
    combined = pd.concat([pairs, info], axis=1)
    typed = train.get_types(combined)

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    flat_predictions = model.predict(typed).flatten()

    print("Writing predictions to file")
    # `fn` is not bound inside this function; presumably a module-level
    # output file name -- verify against the rest of the module.
    data_io.write_submission(flat_predictions, fn)
Exemple #44
0
def main():
    """Predict on the cleaned test frame and write an Id/Cover_Type file.

    The ``Id`` column is set aside before it is dropped from the feature
    matrix, then passed to ``data_io.write_submission`` together with the
    integer predictions and the column names ``['Id', 'Cover_Type']``.
    """
    # This step loads the model; the old message claimed it was loading the
    # test data.
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Load test data. And Clean..")
    test = data_io.get_test_df()
    test = FeatureConverter().clean_data(test)
    passengerIds = test['Id']
    test.drop(['Id'], axis=1, inplace=True)
    test = test.values

    print("Making predictions")
    predictions = classifier.predict(test).astype(int)

    print("Writing predictions to file")
    data_io.write_submission(predictions, passengerIds, ['Id', 'Cover_Type'])
Exemple #45
0
def main():
    """Rank each author's papers with the saved MLP and write a submission.

    Rows of ``data`` are ``(author_id, paper_id, feature...)``: the first two
    fields are kept as ids and the rest are cast to int and fed through the
    network. Feature rows come from the ``features_valid.obj`` pickle cache
    when it exists, otherwise from ``data_io.get_features_db("ValidPaper")``
    (and are then cached).
    """
    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        # Pickle data is binary (HIGHEST_PROTOCOL is used when writing), so
        # the cache has to be opened in binary mode for reading and writing.
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    predictInts = [tuple(int(value) for value in row) for row in features]

    print("Loading the classifier")
    mlp = data_io.load_model(prefix="mlp_")

    print("Making predictions")
    predictions = []
    for x in predictInts:
        # Forward pass: the input row acts as the output of the input layer.
        outp = list(x)
        for layer in mlp.layers[1:]:
            for i in range(layer.nNeurons):
                # Row 0 of W is added as a bias term; rows 1.. are the
                # weights applied to the previous layer's outputs.
                layer.net[i] = weightedSum(outp, layer.W[1:, i]) + layer.W[0, i]
                layer.out[i] = g(layer.net[i], layer.transferF)
            # Advance to this layer's outputs only once every neuron has been
            # computed. Doing it inside the neuron loop made later neurons
            # read the layer's own partially updated outputs.
            outp = layer.out
        predictions.append(mlp.layers[-1].out[0])

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        # Highest prediction first; ties fall back to the paper id.
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="mlp_")
Exemple #46
0
def main():
    """Predict on the cleaned test frame and write an Id/Cover_Type file.

    The ``Id`` column is captured before being dropped from the feature
    matrix; it is handed to ``data_io.write_submission`` along with the
    integer predictions and the column names ``['Id', 'Cover_Type']``.
    """
    # The message now matches the step: this loads the model, not the data.
    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Load test data. And Clean..")
    test = data_io.get_test_df()
    test = FeatureConverter().clean_data(test)
    passengerIds = test['Id']
    test.drop(['Id'], axis=1, inplace=True)
    test = test.values

    print("Making predictions")
    predictions = classifier.predict(test).astype(int)

    print("Writing predictions to file")
    data_io.write_submission(predictions, passengerIds, ['Id', 'Cover_Type'])
Exemple #47
0
def main():
    """Rank each author's papers with the saved forest model plus keyword features.

    Rows of ``data`` are ``(author_id, paper_id, feature...)``. Each feature
    row is extended with fields 2.. of the matching row from the third
    element returned by ``additional_features.get_additional_features()``.
    The combined matrix is cast to int32 before ``predict_proba``.
    """
    print("Getting features for valid papers from the database")
    if os.path.exists("features_valid.obj"):
        # Pickle data is binary (HIGHEST_PROTOCOL is used when writing), so
        # the cache has to be opened in binary mode for reading and writing.
        with open("features_valid.obj", 'rb') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'wb') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    # Append the keyword-match features, row for row.
    print("adding addtional features...")
    import additional_features as af
    _, _, kw_features = af.get_additional_features()
    for i in range(len(features)):
        features[i] += tuple(kw_features[i][2:])

    featuresnp = np.array(features, dtype='int32')
    # NOTE: mean/std standardisation of featuresnp was left disabled in the
    # original code at this point.

    print("Loading the classifier")
    classifier = data_io.load_model(prefix="forest_")

    print("Making predictions")
    predictions = classifier.predict_proba(featuresnp)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        # Highest probability first; ties fall back to the paper id.
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="forest_")
Exemple #48
0
def predict_write(data, predict_type):
    """Score author/paper rows, rank papers per author and write them out.

    Args:
        data: rows of ``(author_id, paper_id, feature...)``.
        predict_type: ``"valid"`` selects ``dataRev2/Valid.csv`` as the
            target set, anything else ``dataRev2/Test.csv``; the value is
            also forwarded to ``data_io.write_submission``.

    Each ranked paper id is repeated as many times as
    ``parse_targetset_maintain_duplicate`` counts the (author, paper) pair
    in the target set, and the per-author list is then passed through
    ``processDuplicates``.
    """
    id_pairs = [row[:2] for row in data]
    feature_rows = [row[2:] for row in data]

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")

    # The model is given an ndarray (the original notes this is for xgboost).
    feature_rows = np.array(feature_rows)
    scores = list(model.predict_proba(feature_rows)[:, 1])

    by_author = defaultdict(list)
    paper_predictions = {}

    target_csv = 'dataRev2/Valid.csv' if predict_type == "valid" else 'dataRev2/Test.csv'
    targetset = pd.read_csv(target_csv)

    pair_counts = parse_targetset_maintain_duplicate(targetset)

    for (a_id, p_id), score in zip(id_pairs, scores):
        by_author[a_id].append((score, p_id))

    for author_id in sorted(by_author):
        ranked = sorted(by_author[author_id], reverse=True)
        # Emit every paper once per occurrence counted in the target set.
        expanded = [
            pid
            for _score, pid in ranked
            for _copy in range(pair_counts[author_id, pid])
        ]
        paper_predictions[author_id] = processDuplicates(expanded)

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, predict_type)
Exemple #49
0
def main():
    """Transform the validation pairs, score them and write a submission.

    The transformed features (NaNs replaced via ``np.nan_to_num``) are saved
    with ``data_io.save_valid_features`` before prediction. The submitted
    score per row is ``P[:, 2] - P[:, 0]`` of the classifier's
    ``predict_proba`` output, so the model must expose at least three
    classes.
    """
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_valid_features(trans_valid)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    # Read but not used below; the call is kept so the observable I/O of
    # this script is unchanged. A disabled variant of this function picked
    # one of four classifiers per row from the "A type"/"B type" columns.
    valid_info = data_io.read_valid_info()

    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
Exemple #50
0
def predict_feature_from_aid(data):
    """Return the 25 best-scoring ``(probability, paper_id)`` pairs.

    Args:
        data: rows of ``(author_id, paper_id, feature...)``.

    Returns:
        Up to 25 tuples sorted in descending order, where the probability is
        column 1 of the loaded classifier's ``predict_proba`` output.
    """
    id_pairs = [row[:2] for row in data]
    feature_rows = [row[2:] for row in data]

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions\n")

    # The model is given an ndarray (the original notes this is for xgboost).
    feature_rows = np.array(feature_rows)
    scores = list(model.predict_proba(feature_rows)[:, 1])

    scored = [(score, p_id) for (_a_id, p_id), score in zip(id_pairs, scores)]
    ranked = sorted(scored, reverse=True)
    return ranked[0:25]
def main():
    """Score every test row and write (srch_id, prop_id, score) triples.

    All columns except ``date_time`` are used as features, with NaNs filled
    with 0. The written score is the negated column-1 probability.
    """
    print("Reading test data")
    test = data_io.read_test()
    test.fillna(0, inplace=True)

    # Every column but the timestamp goes into the feature matrix.
    columns = list(test.columns)
    columns.remove("date_time")
    feature_matrix = test[columns].values

    print("Loading the classifier")
    model = data_io.load_model()

    print("Making predictions")
    positive_probs = model.predict_proba(feature_matrix)[:, 1]
    negated = list(-1.0 * positive_probs)
    recommendations = zip(test["srch_id"], test["prop_id"], negated)

    print("Writing predictions to file")
    data_io.write_submission(recommendations)
Exemple #52
0
 def run(self):
     """Build wndchrm features for the validation images, predict, and report.

     When ``self.loadWndchrm`` is false the candidate features are either
     calculated (``self.load`` false) or loaded (``self.load`` true), the
     image patches are saved and wndchrm is executed on them. When it is
     true, previously produced wndchrm features are loaded instead.

     Returns:
         Whatever ``Predictor.finalResults`` returns for the observation
         names, predictions and coordinates.
     """
     # Single-argument parenthesised print produces the same output on
     # Python 2 and is valid syntax on Python 3.
     print("Preparing the environment")
     self.prepareEnvironment()
     print("Loading the classifier")
     classifier = data_io.load_model()
     imageCollections = data_io.get_valid_df()
     featureGetter = FeatureGetter()
     wndchrmWorker = WndchrmWorkerPredict()
     print("Getting the features")
     if not self.loadWndchrm:  # do not reuse the last wndchrm feature set
         fileName = data_io.get_savez_name_test()
         if not self.load:  # do not reuse the last candidate features
             (namesObservations, coordinates,
              _) = Utils.calculateFeatures(fileName, featureGetter,
                                           imageCollections)
         else:
             (namesObservations, coordinates,
              _) = Utils.loadFeatures(fileName)
         print("Saving images")
         imageSaver = ImageSaver(coordinates, namesObservations,
                                 imageCollections, featureGetter.patchSize)
         imageSaver.saveImages()
         print("Executing wndchrm algorithm")
         valid = wndchrmWorker.executeWndchrm(namesObservations)
     else:
         # NOTE(review): `coordinates` is never assigned on this path, yet it
         # is used below, so this branch ends in a NameError. The right
         # source for the value is not visible here.
         (valid, namesObservations) = wndchrmWorker.loadWndchrmFeatures()
     print("Making predictions")
     predictions = classifier.predict(valid)
     # Reshape to a single column before writing.
     predictions = predictions.reshape(len(predictions), 1)
     print("Writing predictions to file")
     data_io.write_submission(namesObservations, coordinates, predictions)
     data_io.write_submission_nice(namesObservations, coordinates,
                                   predictions)
     print("Calculating final results")
     return Predictor.finalResults(namesObservations, predictions,
                                   coordinates)
Exemple #53
0
def _iter_user_batches(path):
    """Yield ``(uid, pids, features)`` per run of consecutive rows of one user.

    Each CSV row is ``uid, pid, feature...``. The ids are parsed as
    ``int(float(text))`` and the remaining fields as floats. The last run is
    yielded too, and the file is closed when iteration ends.
    """
    with open(path) as handle:
        current_uid = None
        pids, features = [], []
        for entry in csv.reader(handle):
            uid, pid = (int(float(field)) for field in entry[:2])
            if pids and uid != current_uid:
                yield current_uid, pids, features
                pids, features = [], []
            current_uid = uid
            pids.append(pid)
            # A real list, not a lazy map object, so the rows form a proper
            # 2-D input on Python 3 as well.
            features.append([float(field) for field in entry[2:]])
        if pids:
            yield current_uid, pids, features


def main():
    """Generate per-user product recommendations from the GBT test file.

    Expects exactly ``-mPred N`` on the command line:

    * ``N > 0``: for every user keep the ``N`` products with the highest
      column-1 probability.
    * otherwise: for every product listed by ``getProductSellNum()`` keep
      the users with the highest probability, up to that product's number.

    The result, a mapping of user id to product ids, is passed to
    ``data_io.write_submission``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-mPred',
                        type=int,
                        action='store',
                        dest='rec_num',
                        help='specify how to generate recommendation result.')
    if len(sys.argv) != 3:
        print('Command e.g.: python predict.py -mPred(0 or >0)')
        sys.exit(1)

    classifier = data_io.load_model()

    user_recommend_result = defaultdict(list)
    para = parser.parse_args()
    # Scoring used to run only when the user id changed, so the last user in
    # the file was never predicted. The helper also yields the final batch.
    batches = _iter_user_batches(settings["GBT_TEST_FILE"])
    if para.rec_num > 0:
        for finished_num, (uid, pids, features) in enumerate(batches, 1):
            predictions = classifier.predict_proba(features)[:, 1]
            ranked = sorted(zip(predictions, pids), reverse=True)
            pid_sorted = [pid for _, pid in ranked]
            user_recommend_result[uid] = pid_sorted[:para.rec_num]
            print(finished_num)
    else:
        product_sellnum = getProductSellNum()
        product_predictions = defaultdict(list)
        for finished_num, (uid, pids, features) in enumerate(batches, 1):
            predictions = classifier.predict_proba(features)[:, 1]
            for pid, pred in zip(pids, predictions):
                product_predictions[pid].append((pred, uid))
            print(finished_num)

        for pid in product_predictions:
            if pid not in product_sellnum:
                continue
            ranked = sorted(product_predictions[pid], reverse=True)
            # Each product goes to at most product_sellnum[pid] users.
            for _, uid in ranked[:product_sellnum[pid]]:
                user_recommend_result[uid].append(pid)

    data_io.write_submission(user_recommend_result)