# Script: project the job-title corpus onto 200 RandomizedPCA components and
# load the categorical side features for the training set.
#
# Written so it parses and behaves the same on Python 2 and Python 3:
# single-argument print(...) calls and explicit lists instead of relying on
# Python 2's list-returning map().
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")
title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")

# Template estimator with a fixed seed; it is cloned before use so this
# instance itself is never fitted or reconfigured.
pca = RandomizedPCA(random_state=3465343)

# NOTE(review): presumably log-transformed training salaries -- confirm
# against DataIO.get_salaries.
salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

# Free-text description of this run's configuration.
param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)

# Report the length of every feature sequence. list(...) keeps the printed
# value a list on Python 3, where map() is lazy.
print(list(map(len, extra_features)))

# Turn each feature sequence into an (n, 1) column array. A list
# comprehension is used so the result can be iterated more than once on
# Python 3 as well.
extra_features = [
    np.reshape(np.array(feature), (len(feature), 1))
    for feature in extra_features
]

print(type(title_corpus))
print(title_corpus.shape)

# Fresh copy of the template, configured for 200 output components.
title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
# Script: project the job-title corpus onto 200 RandomizedPCA components,
# load the categorical side features for the training set, and report the
# type of the transformed corpus.
#
# Written so it parses and behaves the same on Python 2 and Python 3:
# single-argument print(...) calls and explicit lists instead of relying on
# Python 2's list-returning map().
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")
title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")

# Template estimator with a fixed seed; it is cloned before use so this
# instance itself is never fitted or reconfigured.
pca = RandomizedPCA(random_state=3465343)

# NOTE(review): presumably log-transformed training salaries -- confirm
# against DataIO.get_salaries.
salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

# Free-text description of this run's configuration.
param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)

# Report the length of every feature sequence. list(...) keeps the printed
# value a list on Python 3, where map() is lazy.
print(list(map(len, extra_features)))

# Turn each feature sequence into an (n, 1) column array. A list
# comprehension is used so the result can be iterated more than once on
# Python 3 as well.
extra_features = [
    np.reshape(np.array(feature), (len(feature), 1))
    for feature in extra_features
]

print(type(title_corpus))
print(title_corpus.shape)

# Fresh copy of the template, configured for 200 output components.
title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
print(type(title_corpus_pca))
# Script fragment: set up a RandomizedPCA -> ExtraTreesRegressor pipeline and
# the grid of PCA sizes to search over, using the raw-location corpus.
#
# NOTE(review): `logging`, `DataIO` and `RandomizedPCA` are used below but are
# not imported in the visible part of this file; presumably they are imported
# earlier -- confirm.
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time
from sklearn.ensemble import ExtraTreesRegressor

dio = DataIO("Settings_loc5.json")
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# NOTE(review): presumably log-transformed training salaries -- confirm
# against DataIO.get_salaries.
salaries = dio.get_salaries("train", log=True)

#title_corpus_csc = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
#desc_corpus_csc = dio.read_gensim_corpus("train_desc_nltk_filtered.corpus.mtx")
locraw_corpus_csc = dio.read_gensim_corpus(
    "train_locraw_nltk_filtered.corpus.mtx")

#print(title_corpus_csc.shape)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(locraw_corpus_csc.shape)

# Two-stage model: dimensionality reduction followed by a tree ensemble.
pipeline = Pipeline([
    ('pca', RandomizedPCA(random_state=3465343)),
    ('trees', ExtraTreesRegressor(min_samples_split=2,
                                  n_estimators=10,
                                  n_jobs=4)),
])

# Candidate PCA sizes: 100, 200, ..., 600. list(...) keeps this a concrete
# list on Python 3, where range() is lazy.
parameters = {
    'pca__n_components': list(range(100, 601, 100)),
}

metric = dio.error_metric