def __init__(self, n_clusters=50, pca_n_components=20, kmpca_n_components=3,
             kernel_n_components=30):
    self.counter = text.CountVectorizer(stop_words='english',
                                        ngram_range=(1, 2),
                                        min_df=30,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      n_init=10,
                                      batch_size=10000,
                                      verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(
        n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)
    self.X_names = [
        'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
        'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
    ]
    self.linear_feature_selector = None
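# A minimal, self-contained sketch (not the project's actual fit/transform
# methods) of how the components above might be wired: vectorize titles into
# binary n-gram counts, then derive cluster ids, low-rank scores, and RBF
# random features from them. The `titles` corpus is invented, and
# TruncatedSVD stands in for RandomizedPCA, which modern scikit-learn no
# longer ships.
from sklearn import cluster, decomposition, kernel_approximation
from sklearn.feature_extraction import text

titles = ["senior data engineer", "junior data analyst",
          "registered staff nurse", "senior staff nurse"] * 50
counter = text.CountVectorizer(ngram_range=(1, 2), min_df=2, binary=True)
CounterX = counter.fit_transform(titles)  # sparse binary n-gram counts
KmX = cluster.MiniBatchKMeans(n_clusters=4, n_init=10).fit_predict(CounterX)
PCAX = decomposition.TruncatedSVD(n_components=3).fit_transform(CounterX)
RbfX = kernel_approximation.RBFSampler(n_components=30).fit_transform(CounterX)
print(CounterX.shape, KmX.shape, PCAX.shape, RbfX.shape)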
def __init__(self, n_clusters=50, pca_n_components=30, kmpca_n_components=3,
             kernel_n_components=30):
    ## use (min_df=30, max_df=0.5) to generate a lot of features - more
    ## choice for feature selection
    ## use (min_df=0.001, max_df=0.05) to generate fewer features - better
    ## clustering
    self.counter = text.CountVectorizer(stop_words='english',
                                        charset='utf-8',
                                        charset_error='ignore',
                                        ngram_range=(1, 1),
                                        min_df=0.001,
                                        max_df=0.05,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      n_init=10,
                                      batch_size=10000,
                                      verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(
        n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)
    self.X_names = [
        'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
        'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
    ]
    self.linear_feature_selector = None
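# A quick, hypothetical check of the min_df/max_df trade-off noted in the
# comments above: document-frequency cut-offs shrink or grow the vocabulary
# the downstream clustering and RBFSampler steps consume. The corpus and
# thresholds below are made up for illustration.
from sklearn.feature_extraction import text

docs = ["cheap flights to london", "cheap hotels in london",
        "data science jobs", "remote data engineering jobs"] * 25
wide = text.CountVectorizer(stop_words='english', min_df=2, binary=True)
narrow = text.CountVectorizer(stop_words='english', min_df=0.3, max_df=0.6,
                              binary=True)
print(len(wide.fit(docs).vocabulary_),    # larger vocabulary (9 terms here)
      len(narrow.fit(docs).vocabulary_))  # smaller vocabulary (4 terms here)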
def kernel_rbf(x: torch.Tensor, x_t=None):
    """Approximate the RBF kernel matrix K(x, x') via random Fourier features.

    :param x: tensor of shape (n_samples, n_features)
    :param x_t: optional second feature map; transposed and multiplied in
    :return: approximate kernel matrix as a torch tensor
    """
    rbf_feature = kernel_approximation.RBFSampler(gamma=1, random_state=1)
    # RBFSampler works on numpy arrays, so move to numpy and back to torch;
    # the original called the torch methods .t()/.mm() on a numpy array.
    x_kernel = torch.from_numpy(
        rbf_feature.fit_transform(x.detach().cpu().numpy()))
    if x_t is None:
        x_t = x_kernel.t()
    else:
        x_t = x_t.t()
    return x_kernel.mm(x_t)
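# A hypothetical sanity check for kernel_rbf above: the product of the
# sampled feature maps should approach the exact RBF kernel as n_components
# grows. The input tensor is random stand-in data.
import numpy as np
import torch
from sklearn.metrics.pairwise import rbf_kernel

x = torch.randn(5, 3)
approx = kernel_rbf(x)                       # 100 random features by default
exact = rbf_kernel(x.numpy(), gamma=1)
print(np.abs(approx.numpy() - exact).max())  # shrinks as n_components grows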
def __init__(self, n_clusters=100, pca_n_components=10, kmpca_n_components=7,
             kernel_n_components=30):
    self.counter = text.CountVectorizer(stop_words='english',
                                        ngram_range=(1, 1),
                                        min_df=2,
                                        max_df=0.8,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      n_init=10,
                                      batch_size=10000,
                                      verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(
        n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)
    self.X_names = [
        'Loc_CounterX', 'Loc_ClusterdX', 'Loc_KmX', 'Loc_PCAX',
        'Loc_PCAClusterdX', 'Loc_RbfX', 'Loc_TreeX'
    ]
    self.linear_feature_selector = None

    ## BUILD a dictionary based on location_tree - faster for search
    location_tree = [
        row[0].lower().split('~')[::-1]
        for row in csv.reader(open(LOCATION_TREE_FILE))
    ]
    self.location_dict = {}
    for locs in location_tree:
        for i in range(len(locs)):
            if locs[i] not in self.location_dict:
                self.location_dict[locs[i]] = locs[i:]
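# A toy illustration (invented tree rows, no file needed) of the
# location_dict built above: each node maps to its path up to the root, so a
# lookup replaces a tree walk.
location_tree = [["uk~london~soho"], ["uk~manchester"]]
location_dict = {}
for row in location_tree:
    locs = row[0].lower().split('~')[::-1]
    for i in range(len(locs)):
        if locs[i] not in location_dict:
            location_dict[locs[i]] = locs[i:]
print(location_dict['soho'])  # ['soho', 'london', 'uk']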
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` in the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))

            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user-uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
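# Hedged usage sketch for _eval_search_params: the `params_builder` shape is
# inferred from the loop above (a 'param_set' list of dicts with 'sp_name'
# and 'sp_list'), and it assumes the module's SafeEval, NON_SEARCHABLE and
# N_JOBS globals are in scope. The parameter names are invented.
params_builder = {'param_set': [
    {'sp_name': 'C', 'sp_list': 'np.logspace(-3, 3, 7)'},
    {'sp_name': 'prep', 'sp_list': ": ['sk_prep_all']"},  # ':' = estimators
]}
search_params = _eval_search_params(params_builder)
# 'C' becomes a numeric grid; 'prep' expands to the seven sklearn scalers.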
def _build_feature_pipeline(self,
                            sample_mode='rollouts',
                            num_components=50,
                            gammas=None,
                            num_obs=10000,
                            use_standard_scaler=True,
                            featurizer_max_env_steps=10000):
    """Build the feature pipeline.

    Args:
      sample_mode: A string representing how to collect data from the
        environment to build features. Must be one of
        {'rollouts', 'reset', 'random'}.
        - `rollouts` will collect observations by executing a random policy
          in the env.
        - `reset` will collect observations by repeatedly resetting the env.
        - `random` will just sample the env observation space randomly.
      num_components: The number of components in each RBF.
      gammas: A list containing the frequency of each RBF. If None, defaults
        to `[0.5, 1.0, 2.5, 5.0]`.
      num_obs: The integer number of observations used to fit the kernels.
      use_standard_scaler: Boolean indicating if the observations should be
        normalized.
      featurizer_max_env_steps: Maximum number of steps taken in each rollout
        to estimate the kernels in the featurizer.

    Raises:
      ValueError: If the `sample_mode` is unknown.
    """
    env = self._env._envs[0]  # pylint: disable=protected-access
    if gammas is None:
        gammas = [0.5, 1.0, 2.5, 5.0]

    features = []
    for i, gamma in enumerate(gammas):
        features.append(
            ('rbf{}'.format(i),
             kernel_approximation.RBFSampler(gamma=gamma,
                                             n_components=num_components)))
    self.featurizer = pipeline.FeatureUnion(features)
    if use_standard_scaler:
        self.scaler = skl_preprocessing.StandardScaler()

    if sample_mode == 'random':
        # Randomly sample from the observation space to fit the featurizers.
        observation_examples = np.array(
            [env.observation_space.sample() for _ in range(num_obs)])
    elif sample_mode == 'reset':
        # Just reset the environment to obtain the observations.
        observation_examples = np.array(
            [env.reset() for _ in range(num_obs)])
    elif sample_mode == 'rollouts':
        # Rollout mode: execute a random policy.
        observations = []
        while True:
            observations.append(env.reset())
            done = False
            t = 0
            while not done and t < featurizer_max_env_steps:
                action = env.action_space.sample()
                obs, _, done, _ = env.step(action)
                observations.append(obs)
                t += 1  # the original never incremented the step counter
            if len(observations) > num_obs:
                break  # Collected enough observations.
        observation_examples = np.array(observations)
    else:
        raise ValueError('Unknown `sample_mode`!')

    if use_standard_scaler:
        self.scaler.fit(observation_examples)
        # Keep the scaled samples; the original discarded the result of
        # `transform`, so the featurizer was fitted on unscaled data.
        observation_examples = self.scaler.transform(observation_examples)
    self.featurizer.fit(observation_examples)
    self.use_standard_scaler = use_standard_scaler
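# A standalone sketch of the featurizer built above: a FeatureUnion of
# RBFSamplers at several gammas behind a StandardScaler. The observation
# matrix is random stand-in data rather than env rollouts.
import numpy as np
from sklearn import kernel_approximation, pipeline
from sklearn import preprocessing as skl_preprocessing

obs = np.random.randn(1000, 4)
featurizer = pipeline.FeatureUnion(
    [('rbf{}'.format(i),
      kernel_approximation.RBFSampler(gamma=g, n_components=50))
     for i, g in enumerate([0.5, 1.0, 2.5, 5.0])])
scaler = skl_preprocessing.StandardScaler().fit(obs)
features = featurizer.fit_transform(scaler.transform(obs))
print(features.shape)  # (1000, 200): 4 samplers x 50 components each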
def EpsFair_Prediction(self, filename, eps, hparams, avails, p):
    is_kernel = p.kernel
    rff = p.rff
    lmd = hparams["lmd"]
    gamma = hparams["gamma"]
    if is_kernel and rff:
        print("Error: either rff or kernel needs to be false")
        sys.exit()

    NTrain = len(self.trainX1)
    transform_s = p.nonlinears
    trainS, trainX1 = copy.deepcopy(self.trainS), copy.deepcopy(self.trainX1)
    if rff:
        if not transform_s:
            print("random fourier feature")
        else:
            print("random fourier feature (full ns)")
        ds_new = len(self.trainS[0]) * 10
        dx_new = len(self.trainX1[0]) * 10
        sys.stdout.flush()
        if transform_s:
            sampler_s = kernel_approximation.RBFSampler(
                gamma=hparams["gamma"], n_components=ds_new)
            sampler_s.fit(trainS)
            trainS = sampler_s.transform(trainS)
        sampler_x = kernel_approximation.RBFSampler(gamma=hparams["gamma"],
                                                    n_components=dx_new)
        sampler_x.fit(self.trainX1)
        trainX1 = sampler_x.transform(self.trainX1)
    else:
        trainX1 = self.trainX1

    S_std = [np.std(trainS[:, j]) for j in range(len(trainS[0]))]
    for j in range(len(trainS[0])):
        trainS[:, j] = trainS[:, j] / S_std[j]

    X1_size = len(trainX1[0])
    lr1 = []  # stage-1 regressors/classifiers
    for i in range(X1_size):
        lr1.append(copy.deepcopy(self.fstStageRegressor))
        lr1[-1].set_params(alpha=lmd)

    trainS_X2 = np.c_[trainS, self.trainX2]  # use S and X2 (not used currently)
    X1_hat_tmp = []
    self.train_X1_resid(trainX1, trainS_X2, trainS, lr1, use_X2=False)
    train_X1_resid = self.get_X1_resid(lr1, trainX1, trainS, trainS_X2,
                                       use_X2=False)
    X1_std = [
        np.std(train_X1_resid[:, j]) for j in range(len(self.trainX1[0]))
    ]
    for j in range(len(self.trainX1[0])):
        train_X1_resid[:, j] = train_X1_resid[:, j] / X1_std[j]

    trainX_rn = copy.deepcopy(train_X1_resid)
    for i in range(trainX_rn.shape[1]):
        trainX_rn[:, i] = trainX_rn[:, i] - np.mean(train_X1_resid[:, i])
    trainS_n = copy.deepcopy(trainS)
    for j in range(len(trainS[0])):
        trainS_n[:, j] = trainS[:, j] - np.mean(trainS[:, j])
    trainY_n = self.trainY - np.mean(self.trainY)

    validS, validX1, validX2, validY = self.getValidationData()
    if rff:
        validX1 = sampler_x.transform(validX1)
        if transform_s:
            validS = sampler_s.transform(validS)
    for j in range(len(trainS[0])):
        validS[:, j] = validS[:, j] / S_std[j]
    validS_X2 = np.c_[validS, validX2]  # use S and X2
    valid_X1_resid = self.get_X1_resid(lr1, validX1, validS, validS_X2,
                                       use_X2=False)
    valid_n = len(validS)
    for j in range(len(self.trainX1[0])):
        valid_X1_resid[:, j] = valid_X1_resid[:, j] / X1_std[j]
    validX_rn = valid_X1_resid
    for i in range(train_X1_resid.shape[1]):
        validX_rn[:, i] = validX_rn[:, i] - np.mean(train_X1_resid[:, i])
    validS_n = copy.deepcopy(validS)
    for j in range(len(trainS[0])):
        validS_n[:, j] = validS[:, j] - np.mean(trainS[:, j])
    validY_n = validY - np.mean(self.trainY)

    testS, testX1, testX2, testY = self.getPredictData()
    if rff:
        testX1 = sampler_x.transform(testX1)
        if transform_s:
            testS = sampler_s.transform(testS)
    for j in range(len(trainS[0])):
        testS[:, j] = testS[:, j] / S_std[j]
    testS_X2 = np.c_[testS, testX2]  # use S and X2
    test_X1_resid = self.get_X1_resid(lr1, testX1, testS, testS_X2,
                                      use_X2=False)
    for j in range(len(self.trainX1[0])):
        test_X1_resid[:, j] = test_X1_resid[:, j] / X1_std[j]
    test_n = len(testS)
    testX_rn = test_X1_resid
    for i in range(train_X1_resid.shape[1]):
        testX_rn[:, i] = testX_rn[:, i] - np.mean(train_X1_resid[:, i])
    testS_n = copy.deepcopy(testS)
    for j in range(len(trainS[0])):
        testS_n[:, j] = testS[:, j] - np.mean(trainS[:, j])
    testY_n = testY - np.mean(self.trainY)

    Vs = np.cov(trainS_n.T)
    Vx = np.cov(trainX_rn.T)
    vs = np.matmul(trainS_n.T, trainY_n) / NTrain
    vx = np.matmul(trainX_rn.T, trainY_n) / NTrain

    def linearKernel():
        return (lambda x, y: np.dot(x, y))

    def rbfKernel(gamma):
        return (lambda x, y: math.exp(-gamma * np.inner(x - y, x - y)))

    def polyKernel(gamma):
        return (lambda x, y: (gamma * np.inner(x, y) + 1.0)**3)

    if not is_kernel:  # Linear
        sol = self.Fair_Prediction_Optimization(eps, lmd / NTrain, Vs, Vx,
                                                vs, vx)  # main optimization
        train_S_X1_resid = np.c_[trainS_n, trainX_rn]
        trainYhat = [
            np.dot(sol, train_S_X1_resid[i]) for i in range(NTrain)
        ]
        valid_S_X1_resid = np.c_[validS_n, validX_rn]
        validYhat = [
            np.dot(sol, valid_S_X1_resid[i]) for i in range(valid_n)
        ]
        test_S_X1_resid = np.c_[testS_n, testX_rn]
        testYhat = [np.dot(sol, test_S_X1_resid[i]) for i in range(test_n)]
        result_train = Result(self.trainY, trainYhat + np.mean(self.trainY),
                              self.trainS, avails)
        result_valid = Result(validY, validYhat + np.mean(self.trainY),
                              self.validS, avails)
        result_test = Result(testY, testYhat + np.mean(self.trainY),
                             self.testS, avails)
    else:  # Kernel
        ks = rbfKernel(gamma)
        kx = rbfKernel(gamma)
        n = NTrain
        subsampling_ratio = 1.0  # 0.1
        n_sub = int(n * subsampling_ratio)
        if subsampling_ratio < 1.0:
            sample_ids = self.subsample_from_levscore(
                ks, kx, trainS_n, trainX_rn, gamma, 0.05, subsampling_ratio)
        else:
            sample_ids = [i for i in range(n)]
        Ks, Kx = np.zeros((n_sub, n_sub)), np.zeros((n_sub, n_sub))
        trainS_n_sub = trainS_n[sample_ids]
        trainX_rn_sub = trainX_rn[sample_ids]
        trainY_n_sub = trainY_n[sample_ids]
        for i in range(n_sub):
            for j in range(n_sub):
                Ks[i, j], Kx[i, j] = (ks(trainS_n_sub[i], trainS_n_sub[j]),
                                      kx(trainX_rn_sub[i], trainX_rn_sub[j]))
        sol = self.Fair_Prediction_Kernel_Optimization(
            eps, lmd, Ks, Kx, trainS_n_sub, trainX_rn_sub, trainY_n_sub)
        trainYhat = np.matmul(Ks, sol[:n_sub]) + np.matmul(Kx, sol[n_sub:])

        valid_n = len(validS)
        Ks_valid, Kx_valid = (np.zeros((valid_n, n_sub)),
                              np.zeros((valid_n, n_sub)))
        for i in range(valid_n):
            for j in range(n_sub):
                Ks_valid[i, j], Kx_valid[i, j] = \
                    ks(validS_n[i], trainS_n_sub[j]), \
                    kx(validX_rn[i], trainX_rn_sub[j])
        validYhat = (np.matmul(Ks_valid, sol[:n_sub]) +
                     np.matmul(Kx_valid, sol[n_sub:]))

        test_n = len(testS)
        Ks_test, Kx_test = (np.zeros((test_n, n_sub)),
                            np.zeros((test_n, n_sub)))
        for i in range(test_n):
            for j in range(n_sub):
                Ks_test[i, j], Kx_test[i, j] = \
                    ks(testS_n[i], trainS_n_sub[j]), \
                    kx(testX_rn[i], trainX_rn_sub[j])
        testYhat = (np.matmul(Ks_test, sol[:n_sub]) +
                    np.matmul(Kx_test, sol[n_sub:]))

        result_train = Result(self.trainY[sample_ids],
                              trainYhat + np.mean(self.trainY[sample_ids]),
                              self.trainS[sample_ids], avails)
        result_valid = Result(validY, validYhat + np.mean(self.trainY),
                              validS, avails)
        result_test = Result(testY, testYhat + np.mean(self.trainY), testS,
                             avails)

    return result_train, result_valid, result_test
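# Minimal sketch of the `rff` branch above in isolation: lift S and X1 into
# random Fourier feature spaces roughly 10x their original width before the
# downstream regression. The data and gamma here are invented.
import numpy as np
from sklearn import kernel_approximation

trainS = np.random.randn(200, 2)
trainX1 = np.random.randn(200, 5)
sampler_s = kernel_approximation.RBFSampler(gamma=0.5,
                                            n_components=len(trainS[0]) * 10)
sampler_x = kernel_approximation.RBFSampler(gamma=0.5,
                                            n_components=len(trainX1[0]) * 10)
trainS_rff = sampler_s.fit_transform(trainS)
trainX1_rff = sampler_x.fit_transform(trainX1)
print(trainS_rff.shape, trainX1_rff.shape)  # (200, 20) (200, 50)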
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (len(lst) == 2), (
            "Error, make sure there is one and only one colon in search "
            "parameter input.")
        literal = lst[1].strip()
        param_name = lst[0].strip()

        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name] = ev
            else:
                # only for estimator eval: `-` at the end of the param name
                # TODO: maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO: regular expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.Imputer(), preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(), preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
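# Hedged usage sketch for get_search_params: the input shape is inferred
# from the loop above (each entry carries a 'search_param_selector' with a
# "name: literal" string and a param type), and it assumes the module's
# SafeEval and N_JOBS globals are in scope. Names and values are invented.
params_builder = {'param_set': [
    {'search_param_selector': {
        'search_p': 'alpha: np.logspace(-4, 0, 5)',
        'selected_param_type': 'final_estimator_p'}},
]}
print(get_search_params(params_builder))
# expected: {'estimator__alpha': array([1.e-04, ..., 1.e+00])}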
prices = whiten_data(prices)
prices, dep_var = features_engineering(prices)
X_train, y_train, y_train_value, X_test, y_test, y_test_value, cols = \
    split_test_train(prices, dep_var)
print_statistics(y_train, y_test)

# First estimator: assume that after an up day there comes a down day.
y_MR_train = -X_train[:, cols.index('midDiff')]
y_MR_test = -X_test[:, cols.index('midDiff')]

# Second estimator: linear ridge regression on the raw features.
lin_ridge_reg = linear_model.Ridge(alpha=0.2, max_iter=1e5, normalize=True,
                                   tol=1e-8)
lin_ridge_reg.fit(X_train, y_train_value)
y_lin_ridge_reg_test = lin_ridge_reg.predict(X_test)
y_lin_ridge_reg_train = lin_ridge_reg.predict(X_train)

# Third estimator: ridge regression on random Fourier features.
rbf_feature = kernel_approximation.RBFSampler(gamma=1e-11, n_components=1000,
                                              random_state=0)
PhiX_train = rbf_feature.fit_transform(X_train)
# Reuse the feature map fitted on the training set; the original called
# fit_transform on the test set as well, which only worked because
# random_state was fixed.
PhiX_test = rbf_feature.transform(X_test)
RFF_lin_ridge_reg = linear_model.Ridge(alpha=1e-2, max_iter=1e5,
                                       normalize=True, tol=1e-8)
RFF_lin_ridge_reg.fit(PhiX_train, y_train)
y_RFF_train = RFF_lin_ridge_reg.predict(PhiX_train)
y_RFF_test = RFF_lin_ridge_reg.predict(PhiX_test)

# Vote of the two regressors; ties fall back to the mean-reversion signal.
y_ensemble_test = np.sign(y_RFF_test) + np.sign(y_lin_ridge_reg_test)
y_ensemble_train = np.sign(y_RFF_train) + np.sign(y_lin_ridge_reg_train)
y_ensemble_train[y_ensemble_train == 0] = \
    np.sign(y_MR_train[y_ensemble_train == 0])
y_ensemble_test[y_ensemble_test == 0] = \
    np.sign(y_MR_test[y_ensemble_test == 0])

classif_rates = {}
classif_rates['Linear ridge reg.'] = [
    classif_correct_rate(y_lin_ridge_reg_test, y_test),
    classif_correct_rate(y_lin_ridge_reg_train, y_train)
]
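# Illustrative side note on the RFF + Ridge pairing above (synthetic data):
# Ridge on RBFSampler features is a cheap approximation of kernel ridge
# regression, and the two agree closely once n_components is large.
import numpy as np
from sklearn import kernel_approximation, linear_model
from sklearn.kernel_ridge import KernelRidge

rng = np.random.RandomState(0)
X = rng.randn(300, 3)
y = np.sin(X).sum(axis=1) + 0.1 * rng.randn(300)
Phi = kernel_approximation.RBFSampler(gamma=0.5, n_components=500,
                                      random_state=0).fit_transform(X)
rff_pred = linear_model.Ridge(alpha=1e-2).fit(Phi, y).predict(Phi)
krr_pred = KernelRidge(kernel='rbf', gamma=0.5, alpha=1e-2).fit(X, y).predict(X)
print(np.abs(rff_pred - krr_pred).max())  # small for large n_components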
def map_fit_transform(self, X):
    self.rbf_feature = kernel_approx.RBFSampler(
        n_components=self.n_features,
        gamma=self.gamma,
        random_state=self.random_state)
    return self.rbf_feature.fit_transform(X)
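# Hedged sketch: an invented host class for map_fit_transform above. The
# attribute names mirror the ones the method reads; the class itself and its
# defaults are made up for illustration.
import numpy as np
from sklearn import kernel_approximation as kernel_approx


class RFFMapper:
    def __init__(self, n_features=100, gamma=1.0, random_state=None):
        self.n_features = n_features
        self.gamma = gamma
        self.random_state = random_state

    map_fit_transform = map_fit_transform  # bind the function defined above


print(RFFMapper(n_features=50).map_fit_transform(np.random.randn(10, 3)).shape)
# -> (10, 50)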