#["Title", "FullDescription", "LocationRaw"], #extra_valid_features) features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix", tfidf_columns, extra_features, sparse=True) validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns, extra_valid_features, sparse=True) print features.shape print validation_features.shape salaries = dio.get_salaries(type_n, log=True) if not submission: valid_salaries = dio.get_salaries(type_v, log=True) print salaries.shape bs = Bootstrap(len(salaries), random_state=45, train_size=0.6) train_index, test_index = next(iter(bs)) param = """Normal count vector with max 200. New submission which is repeatable. and nicer Bag of Words: %s\n Encoded cols: %s\n Logged
#["Title", "FullDescription", "LocationRaw"], #extra_valid_features).astype(np.int64) short_id = "tfidf_200f_l2" dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v) extra_features = [] extra_valid_features = [] features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix", tfidf_columns, extra_features) validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns, extra_valid_features) print features.max() print features.min() salaries = dio.get_salaries(type_n, log=False).astype(np.int64) if not submission: valid_salaries = dio.get_salaries(type_v, log=False) def encode_salaries(salaries, bins): bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True) #hist, bin_edges = np.histogram(salaries, bins) print np.diff(bin_edges) idxs = np.searchsorted(bin_edges, salaries, side="right") return idxs #bin_edges.insert(0, 0) #bin_edges.append(salaries.max() + 1) #print "hist", hist #print "edges", bin_edges
feature_category = extra_features[col_index]
validation_features_category = extra_valid_features[col_index]
#features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200",
#                             ["Title", "FullDescription", "LocationRaw"],
#                             extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
#                                        ["Title", "FullDescription", "LocationRaw"],
#                                        extra_valid_features).astype(np.int64)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features)

salaries = dio.get_salaries(type_n, log=True).astype(np.int64)
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=True)
best_predictions = dio.get_prediction(
    model_name="ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_log",
    type_n="valid")
par = "classed from 0-11500 then 4 classes to 100 000 and to end NoNormal classTypeTime"


def encode_salaries(salaries, bins):
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    #hist, bin_edges = np.histogram(salaries, bins)
    print np.diff(bin_edges)
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs
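# --- Hedged sketch (not repo code): one plausible classification stage that
# the `par` note above describes -- bucket raw salaries with encode_salaries
# and fit a classifier on the classes. ExtraTreesClassifier and bins=4 are
# assumptions; `features` and `validation_features` come from the lines above.
from sklearn.ensemble import ExtraTreesClassifier

raw_salaries = dio.get_salaries(type_n, log=False)
salary_classes = encode_salaries(raw_salaries, 4)
classifier = ExtraTreesClassifier(n_estimators=20, min_samples_split=2,
                                  random_state=42)
classifier.fit(features, salary_classes)
class_predictions = classifier.predict(validation_features)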
#columns = ["Category", "ContractTime", "ContractType"] #le_features = dio.get_le_features(columns, "train_full") #extra_features = dio.get_features(columns, type_n, le_features) #extra_valid_features = dio.get_features(columns, type_v, le_features) #features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_features) #validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_valid_features) #print "features", features.shape #print "valid features", validation_features.shape #salaries = dio.get_salaries(type_n, log=True) if not submission: valid_salaries = dio.get_salaries(type_v, log=False) print valid_salaries.shape model_names = [] for n_trees in [20, 30, 40]: name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new_log" % (n_trees) model_names.append(name) name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new" % (n_trees) model_names.append(name) name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % (n_trees) model_names.append(name) model_names.append("vowpall") model_names.append("vowpall_loc5") #fit_predict(model2)
#["Title", "FullDescription", "LocationRaw"], #extra_features) #validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_valid_features) features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix", tfidf_columns, extra_features, sparse=True) validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns, extra_valid_features, sparse=True) print features.shape print validation_features.shape salaries = dio.get_salaries(type_n, log=True) if not submission: valid_salaries = dio.get_salaries(type_v, log=True) print salaries.shape bs = Bootstrap(len(salaries), random_state=45, train_size=0.6) train_index, test_index = next(iter(bs)) param = """Normal count vector with max 200. New submission which is repeatable. and nicer Bag of Words: %s\n Encoded cols: %s\n Logged
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")
title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)
param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)
print map(len, extra_features)
# reshape each label-encoded column to (n_samples, 1) so it can be stacked
extra_features = map(lambda x: np.reshape(np.array(x), (len(x), 1)),
                     extra_features)

print type(title_corpus)
print title_corpus.shape
title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
print type(title_corpus_pca)
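# --- Hedged sketch (not repo code): a plausible continuation of this script,
# given the unused imports above and the "Fulldescription 200" note in param:
# PCA-reduce the description corpus the same way, stack both blocks with the
# encoded columns, and cross-validate. The corpus filename and cv=3 are
# assumptions; all other names are defined above.
desc_corpus = dio.read_gensim_corpus("train_desc_nltk_filtered.corpus.mtx")
desc_pca = clone(pca)
desc_pca.set_params(n_components=200)
desc_corpus_pca = desc_pca.fit_transform(desc_corpus)

features = np.hstack([title_corpus_pca, desc_corpus_pca] + extra_features)
model = ExtraTreesRegressor(n_estimators=20, min_samples_split=2,
                            random_state=42)
scores = cross_val_score(model, features, salaries, cv=3)
print param
print scores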
#columns = ["Category", "ContractTime", "ContractType"] #le_features = dio.get_le_features(columns, "train_full") #extra_features = dio.get_features(columns, type_n, le_features) #extra_valid_features = dio.get_features(columns, type_v, le_features) #features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_features) #validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_valid_features) #print "features", features.shape #print "valid features", validation_features.shape #salaries = dio.get_salaries(type_n, log=True) if not submission: valid_salaries = dio.get_salaries(type_v, log=False) print valid_salaries.shape model_names = [] for n_trees in [20, 30, 40]: name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new_log" % ( n_trees) model_names.append(name) name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_new" % ( n_trees) model_names.append(name) name = "ExtraTree_min_sample2_%dtrees_200f_noNorm_categoryTimeType_tfidfl2_new_log" % ( n_trees) model_names.append(name) model_names.append("vowpall") model_names.append("vowpall_loc5")
model_names = [
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_log",
    "vowpall",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_14split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_rf10_4split_new1_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_rf10_4split_newOKsalPredictValid_log",
    "ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_new_faked_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newOKsalPredictValid_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid_log",
    "Ridge_tfidf_05d_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid1_log",
]
valid_salaries = dio.get_salaries("valid", log=False)
ylim = (0, 8000)
xlim = (-50000, 50000)
grid = True


def encode_salaries(salaries, bins):
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    #hist, bin_edges = np.histogram(salaries, bins)
    #bin_edges = list(bin_edges)
    #bin_edges.insert(0, 0)
    #bin_edges.append(salaries.max() + 1)
    print np.diff(bin_edges)
    idxs = np.searchsorted(bin_edges, salaries, side="right")
    return idxs, bin_edges
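# --- Hedged sketch (not repo code): the error-histogram loop that the
# xlim/ylim/grid settings above suggest, one figure per model. matplotlib is
# an assumption; dio.get_prediction follows the call style used elsewhere in
# this repo, and np.exp for "_log" models follows the naming convention.
import matplotlib.pyplot as plt
import numpy as np

for model_name in model_names:
    predictions = dio.get_prediction(model_name=model_name, type_n="valid")
    if model_name.endswith("_log"):
        predictions = np.exp(predictions)
    errors = predictions - valid_salaries
    plt.figure()
    plt.hist(errors, bins=100)
    plt.xlim(xlim)
    plt.ylim(ylim)
    plt.grid(grid)
    plt.title(model_name)
    plt.savefig(model_name + "_error_hist.png")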