def tfidf_cloud(n_trees):
    dio = DataIO("/data/Settings_cloud.json")
    submission = False
    min_samples_split = 2
    param = """Normal count vector with max 200. New submission which is repeatable and nicer.

    count_vector_titles = TfidfVectorizer(
        read_column(train_filename, column_name),
        max_features=200,
        norm='l1',
        smooth_idf=True,
        sublinear_tf=False,
        use_idf=True)
    """
    if submission:
        type_n = "train_full"
        type_v = "valid_full"
    else:
        type_n = "train"
        type_v = "valid"

    #features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
                                 #["Title", "FullDescription", "LocationRaw"],
                                 #extra_features)
    #validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
                                            #["Title", "FullDescription", "LocationRaw"],
                                            #extra_valid_features)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features)

    def load(filename):
        return joblib.load(path_join("/data", filename))

    # Load the precomputed, joblib-dumped feature matrices.
    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load(
        "train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")
    print "features", features.shape
    print "valid features", validation_features.shape

    #salaries = dio.get_salaries(type_n, log=True)
    #if not submission:
        #valid_salaries = dio.get_salaries(type_v, log=True)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries)
    #joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5)
    #joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5)
    #TODO: valid salaries were dumped incorrectly
    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    valid_salaries = load(
        "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    dio.is_log = True
    print salaries.shape

    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (
        min_samples_split, n_trees)
    print name
    #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno")
    classifier = ExtraTreesRegressor(
        n_estimators=n_trees,
        verbose=2,
        n_jobs=4,  # 2 jobs on submission / 4 on valid test
        oob_score=False,
        min_samples_split=min_samples_split,
        random_state=3465343)
    #dio.save_model(classifier, "testni_model", 99.)
    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
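# Hedged sketch (not part of the original file): how the cached
# "*_tfidf_matrix_max_f_200" feature blocks loaded in tfidf_cloud() could be
# rebuilt with the TfidfVectorizer settings recorded in `param` above.
# `read_column` and `train_filename` are assumed project helpers; only the
# scikit-learn calls are standard API, and np/joblib are the imports used
# elsewhere in this file.
def build_tfidf_column(train_filename, column_name):
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=200, norm='l1',
                                 smooth_idf=True, sublinear_tf=False,
                                 use_idf=True)
    texts = list(read_column(train_filename, column_name))
    # fit_transform returns a sparse (n_documents, 200) TF-IDF matrix.
    return vectorizer.fit_transform(texts)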
    if idx is not None:
        predictions[idx] = predictions_part
        mae_pred = metric(valid_salaries[idx], predictions_part)
        mae_best_pred = metric(valid_salaries[idx], best_predictions[idx])
        if mae_pred < mae_best_pred:
            isbetter = "YES"
        else:
            isbetter = "nope"
        ppara = "Curr MAE: %0.2f Best MAE: %0.2f %s\n" % (
            mae_pred, mae_best_pred, isbetter)
        print ppara
        param += ppara

if submission:
    dio.save_prediction(name, predictions, type_n=type_v)
    dio.write_submission(name + ".csv", predictions=predictions)
else:
    dio.compare_valid_pred(valid_salaries, predictions)
    metric = dio.error_metric
    mae = metric(valid_salaries, predictions)
    print "MAE validation: ", mae
    # Save an unfitted estimator carrying the same hyperparameters as a record
    # of the configuration, together with the accumulated `param` notes.
    dio.save_model(ExtraTreesRegressor(n_estimators=n_trees,
                                       min_samples_split=min_samples_split,
                                       random_state=3465343),
                   name, mae, parameters=param)
    dio.save_prediction(name, predictions, type_n=type_v)
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
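# Hedged sketch of the metric used above. dio.error_metric is a project method
# whose definition is not shown in this excerpt; with dio.is_log set it
# presumably compares salaries in pounds after undoing the log transform.
# This standalone helper (named after the log_mean_absolute_error mentioned in
# a comment further down) only illustrates that idea; np is the numpy import
# used throughout this file.
def log_mean_absolute_error(log_true, log_pred):
    return np.mean(np.abs(np.exp(log_true) - np.exp(log_pred)))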
predictions = np.loadtxt(path_join(dio.data_dir, "code", "from_fastml",
                                   "optional", model_name))
dio.save_prediction("vowpall_submission", predictions, type_v)

model_name = "predictions_submit_test_loc5.txt"
predictions = np.loadtxt(path_join(dio.data_dir, "code", "from_fastml",
                                   "optional", model_name))
dio.save_prediction("vowpall_loc5", predictions, type_v)

all_model_predictions = []
for model_name in model_names:
    #fit_predict(model_name, features, salaries, validation_features, type_n="test_subm")
    model_predictions = dio.get_prediction(model_name=model_name, type_n="test_full")
    # Models that neither end in "log" nor come from Vowpal Wabbit store their
    # predictions in pounds; take the log so every model is averaged in log space.
    if not model_name.endswith("log") and not model_name.startswith("vowpall"):
        model_predictions = np.log(model_predictions)
    print "modelp", model_predictions.shape
    #print "%s\nMAE: %f\n" % (model_name, log_mean_absolute_error(np.log(valid_salaries), model_predictions))
    all_model_predictions.append(model_predictions)

# Stack per-model log predictions column-wise, convert back to pounds and
# average across models.
predictions = np.vstack(all_model_predictions).T
predictions = np.exp(predictions)
#predictions = np.random.randint(0,5, size=(10,3))
print predictions.shape
print predictions[1:10, :]
result = predictions.mean(axis=1)
model_name = "-".join(model_names)  # overridden by the explicit name below
model_name = "vowpal_loc5-extra30_40log-extra40-extra40tfidf2log-mean-test"
dio.save_prediction(model_name, result, type_v)
dio.write_submission(model_name + ".csv", result)
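# Illustration only: the vstack/transpose/mean combination above averages
# per-model predictions sample by sample. Three fake model outputs of length
# four become a (4, 3) matrix, and mean(axis=1) blends them per row.
def averaging_shape_demo():
    fake = np.vstack([np.ones(4), 2 * np.ones(4), 3 * np.ones(4)]).T
    assert fake.shape == (4, 3)
    assert np.allclose(fake.mean(axis=1), 2.0)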
        random_state=3465343)
    print features[salaries_idx[0], :].shape
    print salaries[salaries_idx].shape
    classifier.fit(features[salaries_idx[0], :], salaries[salaries_idx])
    predictions_part = classifier.predict(validation_features[valid_idx[0]])
    return predictions_part, valid_idx

predictions = np.zeros_like(valid_salaries)
for cur_class_id in range(num_classes + 1):
    predictions_part, idx = predict(cur_class_id)
    if idx is not None:
        predictions[idx] = predictions_part
        print "Part MAE: ", metric(valid_salaries[idx], predictions_part)

if submission:
    dio.save_prediction(name, predictions, type_n=type_v)
    dio.write_submission(name + ".csv", predictions=predictions)
else:
    dio.compare_valid_pred(valid_salaries, predictions)
    metric = dio.error_metric
    mae = metric(valid_salaries, predictions)
    print "MAE validation: ", mae
    dio.save_model(ExtraTreesRegressor(), name, mae)
    dio.save_prediction(name, predictions, type_n=type_v)
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
#classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
                                  #verbose=1,
                                  #n_jobs=3,
                                  #oob_score=False,
                                  #min_samples_split=min_samples_split,
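# Hedged sketch of where the salaries_idx / valid_idx arrays used in predict()
# above could come from; the project's actual per-class grouping code is
# outside this excerpt. train_class_ids and valid_class_ids are hypothetical
# per-row class labels. np.where returns a one-element tuple, which matches
# the salaries_idx[0] / valid_idx[0] indexing in the fragment above.
def class_index_demo(train_class_ids, valid_class_ids, cur_class_id):
    salaries_idx = np.where(train_class_ids == cur_class_id)
    valid_idx = np.where(valid_class_ids == cur_class_id)
    return salaries_idx, valid_idx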