#print idx #i = i + 1 #if i < 50: #print prej, "<=", salarie, "<", after #if not (prej <= salarie < after): #print "NI OK" #print prej, "<=", salarie, "<", after #break #valid_salaries_enc = encode_salaries(valid_salaries, 4) #salaries = np.log(salaries) #valid_salaries = np.log(valid_salaries) dio.is_log = True print salaries.shape metric = dio.error_metric def make_grid_search(pipeline, parameters, model_name, params): print model_name grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=3, #loss_func=f1_score, scoring="f1", iid=False, refit=True) #model_name = "ExtraTree_min_sample2_10trees_gridcv_desc_log"
#after = bin_edges[idx] #except IndexError as iex: #print idx #i = i + 1 #if i < 50: #print prej, "<=", salarie, "<", after #if not (prej <= salarie < after): #print "NI OK" #print prej, "<=", salarie, "<", after #break #valid_salaries_enc = encode_salaries(valid_salaries, 4) #salaries = np.log(salaries) #valid_salaries = np.log(valid_salaries) dio.is_log = True print salaries.shape metric = dio.error_metric def make_grid_search(pipeline, parameters, model_name, params): print model_name grid_search = GridSearchCV( pipeline, parameters, n_jobs=4, verbose=3, #loss_func=f1_score, scoring="f1", iid=False,
def tfidf_cloud(n_trees): dio = DataIO("/data/Settings_cloud.json") submission = False min_samples_split = 2 param = """Normal count vector with max 200. New submission which is repeatable. and nicer count_vector_titles = TfidfVectorizer( read_column(train_filename, column_name), max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True) """ if submission: type_n = "train_full" type_v = "valid_full" else: type_n = "train" type_v = "valid" #features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_features) #validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_valid_features) #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features) #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features) def load(filename): return joblib.load(path_join("/data", filename)) features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl") validation_features = load( "train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl") print "features", features.shape print "valid features", validation_features.shape #salaries = dio.get_salaries(type_n, log=True) #if not submission: #valid_salaries = dio.get_salaries(type_v, log=True) #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries) #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries) #joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5) #joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5) #TODO: valid salaries so narobe dumpane salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl") valid_salaries = load( "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl") dio.is_log = True print salaries.shape name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % ( min_samples_split, n_trees) print name #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno") classifier = ExtraTreesRegressor( n_estimators=n_trees, verbose=2, n_jobs=4, # 2 jobs on submission / 4 on valid test oob_score=False, min_samples_split=min_samples_split, random_state=3465343) #dio.save_model(classifier, "testni_model", 99.) classifier.fit(features, salaries) predictions = classifier.predict(validation_features) if submission: dio.save_prediction(name, predictions, type_n=type_v) dio.write_submission(name + ".csv", predictions=predictions) else: dio.compare_valid_pred(valid_salaries, predictions) metric = dio.error_metric mae = metric(valid_salaries, predictions) print "MAE validation: ", mae dio.save_model(classifier, name, mae) dio.save_prediction(name, predictions, type_n=type_v)
def tfidf_cloud(n_trees): dio = DataIO("/data/Settings_cloud.json") submission = False min_samples_split = 2 param = """Normal count vector with max 200. New submission which is repeatable. and nicer count_vector_titles = TfidfVectorizer( read_column(train_filename, column_name), max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True) """ if submission: type_n = "train_full" type_v = "valid_full" else: type_n = "train" type_v = "valid" #features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_features) #validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_valid_features) #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features) #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features) def load(filename): return joblib.load(path_join("/data", filename)) features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl") validation_features = load("train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl") print "features", features.shape print "valid features", validation_features.shape #salaries = dio.get_salaries(type_n, log=True) #if not submission: #valid_salaries = dio.get_salaries(type_v, log=True) #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries) #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries) #joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5) #joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5) #TODO: valid salaries so narobe dumpane salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl") valid_salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl") dio.is_log = True print salaries.shape name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (min_samples_split, n_trees) print name #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno") classifier = ExtraTreesRegressor(n_estimators=n_trees, verbose=2, n_jobs=4, # 2 jobs on submission / 4 on valid test oob_score=False, min_samples_split=min_samples_split, random_state=3465343) #dio.save_model(classifier, "testni_model", 99.) classifier.fit(features, salaries) predictions = classifier.predict(validation_features) if submission: dio.save_prediction(name, predictions, type_n=type_v) dio.write_submission(name + ".csv", predictions=predictions) else: dio.compare_valid_pred(valid_salaries, predictions) metric = dio.error_metric mae = metric(valid_salaries, predictions) print "MAE validation: ", mae dio.save_model(classifier, name, mae) dio.save_prediction(name, predictions, type_n=type_v)