#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features).astype(np.int64)
# Build the training and validation feature matrices with dio.join_features
# from a "%s_<type>_<short_id>_matrix" name template, the tf-idf column list
# and the extra feature arrays.
# (presumably the "%s" placeholder is filled once per entry of tfidf_columns
# inside join_features -- confirm against dio)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns, extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns,
    extra_valid_features)

# NOTE(review): the salaries are requested with log=True and then cast to
# int64; if log=True yields log-scaled floats the cast drops the fractional
# part -- confirm this is intended.
salaries = dio.get_salaries(type_n, log=True).astype(np.int64)
# valid_salaries is only defined when `submission` is falsy; code that reads
# it afterwards relies on that.
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=True)
# Stored predictions of a previously trained ExtraTree model for the "valid"
# data set.
best_predictions = dio.get_prediction(
    model_name="ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log",
    type_n="valid")

# Free-text description of the current experiment setup.
par = " classed from 0-11500 then 4 classes to 100 000 and to end NoNormal classTypeTime"


def encode_salaries(salaries, bins):
    """Map every salary to the index of the fixed-width bin it falls into.

    The interval 11500..100000 is divided into ``bins`` equal-width bins
    (``bins + 1`` edges). Each salary is then assigned a class index:

    * ``0`` for values below 11500,
    * ``k`` (``1 <= k <= bins``) for values inside the k-th bin, where each
      bin includes its lower edge and excludes its upper edge,
    * ``bins + 1`` for values of 100000 and above.

    The widths of the bins are printed as a side effect.

    Args:
        salaries: Array-like of salary values on the same scale as the bin
            edges.
        bins: Number of equal-width bins between 11500 and 100000.

    Returns:
        numpy integer array of class indices, one per entry of ``salaries``.
    """
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    # Single-argument print(...) behaves the same under Python 2 and 3;
    # the former print statement was a syntax error on Python 3.
    print(np.diff(bin_edges))
    # side="right" puts a value equal to an edge into the bin above it.
    return np.searchsorted(bin_edges, salaries, side="right")


#salaries_enc = encode_salaries(salaries, 4)
#valid_salaries_enc = encode_salaries(valid_salaries, 4)
Beispiel #2
0
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new" % (n_trees)
    model_names.append(name)
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % (n_trees)
    model_names.append(name)
# Add the two "vowpall" prediction sets to the list of models whose stored
# predictions are loaded below.
model_names.append("vowpall")
model_names.append("vowpall_loc5")


#fit_predict(model2)
#fit_predict(model1)
#fit_predict(model3)
#fit_predict(model6, "", "", "")

# Load each model's stored predictions for the `type_v` data set, bring them
# onto a common log scale and collect them for stacking.
all_model_predictions = []
for model_name in model_names:
    model_predictions = dio.get_prediction(model_name=model_name, type_n=type_v)
    #print model_predictions[0]
    # Predictions are log-transformed unless the model name ends with "log"
    # or starts with "vowpall" (presumably those are already stored on the
    # log scale -- confirm).
    if not model_name.endswith("log") and not model_name.startswith("vowpall"):
        model_predictions = np.log(model_predictions)
    #if model_name.startswith("vowpall"):
        #model_predictions = np.log(model_predictions)
    #print model_predictions[0]
    # Per-model MAE on the back-transformed (exp) predictions.
    # NOTE(review): this compares against valid_salaries directly, so
    # valid_salaries has to be on the raw (non-log) scale here -- confirm.
    print "%s\nMAE: %f\n" % (model_name, mean_absolute_error(valid_salaries, np.exp(model_predictions)))
    all_model_predictions.append(model_predictions)
# Stack and transpose so that each column holds one model's predictions
# (assumes every prediction set is a 1-D array of equal length -- confirm),
# then undo the log transform.
predictions = np.vstack(all_model_predictions).T
predictions = np.exp(predictions)
#predictions = np.random.randint(0,5, size=(10,3))
print predictions.shape
# Debug output of rows 1..9 (row 0 is not shown).
print predictions[1:10, :]
# One position per entry of model_names.
indexes = range(0, len(model_names))
#salaries_enc = encode_salaries(salaries, 4)
#valid_salaries_enc = encode_salaries(valid_salaries, 4)


# Debug output: shape of the training salary array.
print salaries.shape
# Local alias for the error metric provided by the dio module.
metric = dio.error_metric




for bins in [4]: #range(10,15):
    n_trees = 10
    #salaries_enc = encode_salaries(salaries, bins)
    #valid_salaries_enc = encode_salaries(valid_salaries, bins)
    salaries_enc = dio.get_prediction(model_name="randomForest_tfidf_titleFullLoc_bin4", type_n="train_classes")
    valid_salaries_enc = dio.get_prediction(model_name="randomForest_tfidf_titleFullLoc_bin4", type_n="valid_classes")
    par = " classed from 0-11500 then %d classes to 100 000 and to end NoNormal classTypeTime salaries and valid predicted with randomForest_tfidf_titleFullLoc_bin4" % (bins,)
    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_count_rf10_%dsplit_new_log" % (min_samples_split, n_trees, bins)
    print name
    num_classes = salaries_enc.max()
    print "classes:", num_classes

    def predict(class_id):
        print "predicting: ", class_id
        salaries_idx = np.where(salaries_enc == class_id)
        valid_idx = np.where(valid_salaries_enc == class_id)

        if len(salaries_idx[0]) == 0 or len(valid_idx[0]) == 0:
            return [], None
Beispiel #4
0
#fit_predict(model5)

#fit_predict(model4, features, salaries, validation_features, type_n="test_subm")

# Read predictions from text files below
# <dio.data_dir>/code/from_fastml/optional and store them through
# dio.save_prediction under the names "vowpall_submission" and "vowpall_loc5".
model_name = "predictions_submit_test.txt"
predictions = np.loadtxt(path_join(dio.data_dir, "code", "from_fastml", "optional", model_name))
dio.save_prediction("vowpall_submission", predictions, type_v)

model_name = "predictions_submit_test_loc5.txt"
predictions = np.loadtxt(path_join(dio.data_dir, "code", "from_fastml", "optional", model_name))
dio.save_prediction("vowpall_loc5", predictions, type_v)

# Load each model's stored "test_full" predictions and bring them onto a
# common log scale before stacking.
all_model_predictions = []
for model_name in model_names:
    #fit_predict(model_name, features, salaries, validation_features, type_n="test_subm")
    model_predictions = dio.get_prediction(model_name=model_name, type_n="test_full")
    # Predictions are log-transformed unless the model name ends with "log"
    # or starts with "vowpall" (presumably those are already stored on the
    # log scale -- confirm).
    if not model_name.endswith("log") and not model_name.startswith("vowpall"):
        model_predictions = np.log(model_predictions)
    print "modelp", model_predictions.shape
    #print "%s\nMAE: %f\n" % (model_name, log_mean_absolute_error(np.log(valid_salaries), model_predictions))
    all_model_predictions.append(model_predictions)
# Stack and transpose so that each column holds one model's predictions
# (assumes every prediction set is a 1-D array of equal length -- confirm),
# then undo the log transform.
predictions = np.vstack(all_model_predictions).T
predictions = np.exp(predictions)
#predictions = np.random.randint(0,5, size=(10,3))
print predictions.shape
# Debug output of rows 1..9 (row 0 is not shown).
print predictions[1:10, :]


# Row-wise mean over the stacked model predictions.
result = predictions.mean(axis=1)
# NOTE(review): the joined name assigned on the next line is overwritten
# immediately by the hard-coded name that follows, so it has no effect.
model_name = "-".join(model_names)
model_name = "vowpal_loc5-extra30_40log-extra40-extra40tfidf2log-mean-test"
                             #["Title", "FullDescription", "LocationRaw"],
                             #extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
                                        #["Title", "FullDescription", "LocationRaw"],
                                        #extra_valid_features).astype(np.int64)
# Build the training and validation feature matrices with dio.join_features
# from a "%s_<type>_<short_id>_matrix" name template, the tf-idf column list
# and the extra feature arrays.
# (presumably the "%s" placeholder is filled once per entry of tfidf_columns
# inside join_features -- confirm against dio)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features)

# NOTE(review): the salaries are requested with log=True and then cast to
# int64; if log=True yields log-scaled floats the cast drops the fractional
# part -- confirm this is intended.
salaries = dio.get_salaries(type_n, log=True).astype(np.int64)
# valid_salaries is only defined when `submission` is falsy; code that reads
# it afterwards relies on that.
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=True)
# Stored predictions of a previously trained ExtraTree model for the "valid"
# data set.
best_predictions = dio.get_prediction(model_name="ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log", type_n="valid")


# Free-text description of the current experiment setup.
par = " classed from 0-11500 then 4 classes to 100 000 and to end NoNormal classTypeTime"
def encode_salaries(salaries, bins):
    """Map every salary to the index of the fixed-width bin it falls into.

    The interval 11500..100000 is divided into ``bins`` equal-width bins
    (``bins + 1`` edges). Each salary is then assigned a class index:

    * ``0`` for values below 11500,
    * ``k`` (``1 <= k <= bins``) for values inside the k-th bin, where each
      bin includes its lower edge and excludes its upper edge,
    * ``bins + 1`` for values of 100000 and above.

    The widths of the bins are printed as a side effect.

    Args:
        salaries: Array-like of salary values on the same scale as the bin
            edges.
        bins: Number of equal-width bins between 11500 and 100000.

    Returns:
        numpy integer array of class indices, one per entry of ``salaries``.
    """
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    # Single-argument print(...) behaves the same under Python 2 and 3;
    # the former print statement was a syntax error on Python 3.
    print(np.diff(bin_edges))
    # side="right" puts a value equal to an edge into the bin above it.
    return np.searchsorted(bin_edges, salaries, side="right")




#salaries_enc = encode_salaries(salaries, 4)
#valid_salaries_enc = encode_salaries(valid_salaries, 4)
Beispiel #6
0
        n_trees)
    model_names.append(name)
    name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % (
        n_trees)
    model_names.append(name)
# Add the two "vowpall" prediction sets to the list of models whose stored
# predictions are loaded below.
model_names.append("vowpall")
model_names.append("vowpall_loc5")

#fit_predict(model2)
#fit_predict(model1)
#fit_predict(model3)
#fit_predict(model6, "", "", "")

# Load each model's stored predictions for the `type_v` data set, bring them
# onto a common log scale and collect them for stacking.
all_model_predictions = []
for model_name in model_names:
    model_predictions = dio.get_prediction(model_name=model_name,
                                           type_n=type_v)
    #print model_predictions[0]
    # Predictions are log-transformed unless the model name ends with "log"
    # or starts with "vowpall" (presumably those are already stored on the
    # log scale -- confirm).
    if not model_name.endswith("log") and not model_name.startswith("vowpall"):
        model_predictions = np.log(model_predictions)
    #if model_name.startswith("vowpall"):
    #model_predictions = np.log(model_predictions)
    #print model_predictions[0]
    # Per-model MAE on the back-transformed (exp) predictions.
    # NOTE(review): this compares against valid_salaries directly, so
    # valid_salaries has to be on the raw (non-log) scale here -- confirm.
    print "%s\nMAE: %f\n" % (model_name,
                             mean_absolute_error(valid_salaries,
                                                 np.exp(model_predictions)))
    all_model_predictions.append(model_predictions)
# Stack and transpose so that each column holds one model's predictions
# (assumes every prediction set is a 1-D array of equal length -- confirm),
# then undo the log transform.
predictions = np.vstack(all_model_predictions).T
predictions = np.exp(predictions)
#predictions = np.random.randint(0,5, size=(10,3))
print predictions.shape
# Debug output of rows 1..9 (row 0 is not shown).
print predictions[1:10, :]
Beispiel #7
0
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs


#salaries_enc = encode_salaries(salaries, 4)
#valid_salaries_enc = encode_salaries(valid_salaries, 4)

# Debug output: shape of the training salary array.
print salaries.shape
# Local alias for the error metric provided by the dio module.
metric = dio.error_metric

for bins in [4]:  #range(10,15):
    n_trees = 10
    #salaries_enc = encode_salaries(salaries, bins)
    #valid_salaries_enc = encode_salaries(valid_salaries, bins)
    salaries_enc = dio.get_prediction(
        model_name="randomForest_tfidf_titleFullLoc_bin4",
        type_n="train_classes")
    valid_salaries_enc = dio.get_prediction(
        model_name="randomForest_tfidf_titleFullLoc_bin4",
        type_n="valid_classes")
    par = " classed from 0-11500 then %d classes to 100 000 and to end NoNormal classTypeTime salaries and valid predicted with randomForest_tfidf_titleFullLoc_bin4" % (
        bins, )
    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_count_rf10_%dsplit_new_log" % (
        min_samples_split, n_trees, bins)
    print name
    num_classes = salaries_enc.max()
    print "classes:", num_classes

    def predict(class_id):
        print "predicting: ", class_id
        salaries_idx = np.where(salaries_enc == class_id)