Example #1
 def __init__(self,
              n_clusters=50,
              pca_n_components=20,
              kmpca_n_components=3,
              kernel_n_components=30):
     self.counter = text.CountVectorizer(stop_words='english',
                                         ngram_range=(1, 2),
                                         min_df=30,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
         'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
     ]
     self.linear_feature_selector = None
 def __init__(self,
              n_clusters=50,
              pca_n_components=30,
              kmpca_n_components=3,
              kernel_n_components=30):
     ## use (min_df=30, max_df=0.5) to generate many features - more choices for feature selection
     ## use (min_df=0.001, max_df=0.05) to generate fewer features - better clustering
     self.counter = text.CountVectorizer(stop_words='english',
                                         encoding='utf-8',
                                         decode_error='ignore',
                                         ngram_range=(1, 1),
                                         min_df=0.001,
                                         max_df=0.05,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
         'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
     ]
     self.linear_feature_selector = None
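
The constructors above only assemble the text-feature components (counter, km, pca, rbf, tree_hasher); they do not show how the pieces are applied. Below is a minimal sketch of how such components are typically fit on raw text and combined; the toy documents and the TruncatedSVD stand-in for the removed RandomizedPCA are assumptions, not part of the original class:

from sklearn import cluster, decomposition, ensemble, kernel_approximation
from sklearn.feature_extraction import text

# Toy documents (hypothetical) standing in for the Title/Desc text columns.
docs = ["senior python developer", "junior data analyst", "python engineer"]

counter = text.CountVectorizer(stop_words='english', binary=True, lowercase=True)
counts = counter.fit_transform(docs)                    # sparse binary term matrix

km = cluster.MiniBatchKMeans(n_clusters=2, n_init=10)
km_dist = km.fit_transform(counts)                      # distances to the cluster centroids

# TruncatedSVD is a sparse-friendly stand-in for RandomizedPCA, which was removed from scikit-learn.
pca_x = decomposition.TruncatedSVD(n_components=2).fit_transform(counts)

rbf_x = kernel_approximation.RBFSampler(n_components=5, random_state=0).fit_transform(counts)

tree_x = ensemble.RandomTreesEmbedding(n_estimators=30, max_depth=5).fit_transform(counts)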
Example #3
def kernel_rbf(x: torch.Tensor, x_t=None):
    """
    Approximate the RBF kernel matrix K(x, x') with random Fourier features.

    :param x: input tensor of shape (n_samples, n_features)
    :param x_t: optional RBF feature map of a second batch (transposed internally);
        if None, the kernel of `x` with itself is returned
    :return: approximate kernel matrix as a torch.Tensor
    """
    rbf_feature = kernel_approximation.RBFSampler(gamma=1, random_state=1)
    # RBFSampler works on NumPy arrays, so map to NumPy and back to torch.
    x_kernel = torch.as_tensor(rbf_feature.fit_transform(x.cpu().numpy()),
                               dtype=x.dtype)
    if x_t is None:
        x_t = x_kernel.t()
    else:
        x_t = x_t.t()
    return x_kernel.mm(x_t)
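
As a quick sanity check (assuming `import torch` and `from sklearn import kernel_approximation` are in scope, as the snippet above requires), the helper returns a square Gram-style matrix when called on a single batch:

import torch

x = torch.randn(5, 3)
K = kernel_rbf(x)    # approximate RBF Gram matrix of x with itself
print(K.shape)       # torch.Size([5, 5])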
 def __init__(self,
              n_clusters=100,
              pca_n_components=10,
              kmpca_n_components=7,
              kernel_n_components=30):
     self.counter = text.CountVectorizer(stop_words='english',
                                         ngram_range=(1, 1),
                                         min_df=2,
                                         max_df=0.8,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Loc_CounterX', 'Loc_ClusterdX', 'Loc_KmX', 'Loc_PCAX',
         'Loc_PCAClusterdX', 'Loc_RbfX', 'Loc_TreeX'
     ]
     self.linear_feature_selector = None
     ## BUILD dictionary based on location_tree - faster for search
     location_tree = [
         row[0].lower().split('~')[::-1]
         for row in csv.reader(open(LOCATION_TREE_FILE))
     ]
     self.location_dict = {}
     for locs in location_tree:
         for i in range(len(locs)):
             if locs[i] not in self.location_dict:
                 self.location_dict[locs[i]] = locs[i:]
Example #5
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` before the search list requests estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
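
For orientation, the expected input is a dictionary with a `param_set` list whose entries carry `sp_name` and `sp_list` keys. The parameter names and value expressions below are purely illustrative and assume SafeEval accepts ordinary Python/numpy expressions:

# Hypothetical params_builder illustrating the expected structure.
params_builder = {
    'param_set': [
        {'sp_name': 'estimator__C', 'sp_list': '[0.1, 1.0, 10.0]'},
        {'sp_name': 'estimator__gamma', 'sp_list': 'np.logspace(-3, 1, 5)'},
    ]
}
grid = _eval_search_params(params_builder)
# grid maps each searchable parameter name to its evaluated list of candidate values.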
Example #6
    def _build_feature_pipeline(self,
                                sample_mode='rollouts',
                                num_components=50,
                                gammas=None,
                                num_obs=10000,
                                use_standard_scaler=True,
                                featurizer_max_env_steps=10000):
        """Build the feature pipeline.

    Args:
      sample_mode: A string representing how to collect data from the
        environment to build features. Must be one of {'rollouts', 'reset', 'random'}.
        - `rollouts` will collect observations by executing a random policy in
          the env.
        - `reset` will collect observations by repeatedly resetting the env.
        - `random` will just sample the env observation space randomly.
      num_components: The number of components in each RBF.
      gammas: A list containing the frequency of each RBF. If None will default
        to `[0.5, 1.0, 2.5, 5.0]`.
      num_obs: The integer number of observations to use to fit the Kernels.
      use_standard_scaler: Boolean indicating if the observations should be
        normalized.
      featurizer_max_env_steps: Maximum number of steps to be taken in each
        rollout to estimate the kernels in the featurizer.

    Raises:
      ValueError: If the `sample_mode` is unknown.
    """
        env = self._env._envs[0]  # pylint: disable=protected-access
        if gammas is None:
            gammas = [0.5, 1.0, 2.5, 5.0]

        features = []
        for i, gamma in enumerate(gammas):
            features.append(
                ('rbf{}'.format(i),
                 kernel_approximation.RBFSampler(gamma=gamma,
                                                 n_components=num_components)))
        self.featurizer = pipeline.FeatureUnion(features)
        if use_standard_scaler:
            self.scaler = skl_preprocessing.StandardScaler()

        if sample_mode == 'random':
            # Randomly sample from the observation space to fit the featurizers.
            observation_examples = np.array([env.observation_space.sample() for _ in range(num_obs)])  # pylint: disable=line-too-long
        elif sample_mode == 'reset':
            # Just reset the environment to obtain the observations.
            observation_examples = np.array(
                [env.reset() for _ in range(num_obs)])
        elif sample_mode == 'rollouts':
            # Rollout mode.
            observations = []
            while True:
                observations.append(env.reset())
                done = False
                t = 0
                while not done and t < featurizer_max_env_steps:
                    action = env.action_space.sample()
                    obs, _, done, _ = env.step(action)
                    observations.append(obs)
                    t += 1  # count steps so featurizer_max_env_steps actually caps the rollout
                if len(observations) > num_obs:
                    break  # Collected enough observations.
            observation_examples = np.array(observations)
        else:
            raise ValueError('Unknown `sample_mode`!')

        if use_standard_scaler:
            observation_examples = self.scaler.fit_transform(observation_examples)
        self.featurizer.fit(observation_examples)
        self.use_standard_scaler = use_standard_scaler
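
The method above fits the scaler and the featurizer but does not show the corresponding transform step. A minimal sketch of how observations might be mapped afterwards, using a hypothetical `_featurize` helper on the same object (not part of the original class):

    def _featurize(self, observations):
        """Map raw observations through the fitted scaler and RBF feature union."""
        obs = np.atleast_2d(observations)
        if self.use_standard_scaler:
            obs = self.scaler.transform(obs)
        return self.featurizer.transform(obs)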
Example #7
    def EpsFair_Prediction(self, filename, eps, hparams, avails, p):
        is_kernel = p.kernel
        rff = p.rff
        lmd = hparams["lmd"]
        gamma = hparams["gamma"]
        if is_kernel and rff:
            print("Error: either rff or kernel needs to be false")
            sys.exit()

        NTrain = len(self.trainX1)
        transform_s = p.nonlinears
        trainS, trainX1 = copy.deepcopy(self.trainS), copy.deepcopy(
            self.trainX1)

        if rff:
            if not transform_s:
                print("random fourier feature")
            else:
                print("random fourier feature (full ns)")
            ds_new = len(self.trainS[0]) * 10
            dx_new = len(self.trainX1[0]) * 10
            sys.stdout.flush()
            if transform_s:
                sampler_s = kernel_approximation.RBFSampler(
                    gamma=hparams["gamma"], n_components=ds_new)
                sampler_s.fit(trainS)
                trainS = sampler_s.transform(trainS)
            sampler_x = kernel_approximation.RBFSampler(gamma=hparams["gamma"],
                                                        n_components=dx_new)
            sampler_x.fit(self.trainX1)
            trainX1 = sampler_x.transform(self.trainX1)
        else:
            trainX1 = self.trainX1
        S_std = [np.std(trainS[:, j]) for j in range(len(trainS[0]))]
        for j in range(len(trainS[0])):
            trainS[:, j] = trainS[:, j] / S_std[j]
        X1_size = len(trainX1[0])
        lr1 = []  #stage1 regressor/classifiers
        for i in range(X1_size):
            lr1.append(copy.deepcopy(self.fstStageRegressor))
            lr1[-1].set_params(alpha=lmd)
        trainS_X2 = np.c_[trainS,
                          self.trainX2]  #use S and X2 (not used currently...)
        X1_hat_tmp = []
        self.train_X1_resid(trainX1, trainS_X2, trainS, lr1, use_X2=False)
        train_X1_resid = self.get_X1_resid(lr1,
                                           trainX1,
                                           trainS,
                                           trainS_X2,
                                           use_X2=False)
        X1_std = [
            np.std(train_X1_resid[:, j]) for j in range(len(self.trainX1[0]))
        ]
        for j in range(len(self.trainX1[0])):
            train_X1_resid[:, j] = train_X1_resid[:, j] / X1_std[j]
        trainX_rn = copy.deepcopy(train_X1_resid)  #self.trainX1

        for i in range(trainX_rn.shape[1]):
            trainX_rn[:, i] = trainX_rn[:, i] - np.mean(train_X1_resid[:, i])
        trainS_n = copy.deepcopy(trainS)
        for j in range(len(trainS[0])):
            trainS_n[:, j] = trainS[:, j] - np.mean(trainS[:, j])
        trainY_n = self.trainY - np.mean(self.trainY)

        validS, validX1, validX2, validY = self.getValidationData()
        if rff:
            validX1 = sampler_x.transform(validX1)
            if transform_s:
                validS = sampler_s.transform(validS)
        for j in range(len(trainS[0])):
            validS[:, j] = validS[:, j] / S_std[j]
        validS_X2 = np.c_[validS, validX2]  #use S and X2
        valid_X1_resid = self.get_X1_resid(lr1,
                                           validX1,
                                           validS,
                                           validS_X2,
                                           use_X2=False)
        valid_n = len(validS)
        for j in range(len(self.trainX1[0])):
            valid_X1_resid[:, j] = valid_X1_resid[:, j] / X1_std[j]
        validX_rn = valid_X1_resid  #self.trainX1
        for i in range(train_X1_resid.shape[1]):
            validX_rn[:, i] = validX_rn[:, i] - np.mean(train_X1_resid[:, i])
        validS_n = copy.deepcopy(validS)
        for j in range(len(trainS[0])):
            validS_n[:, j] = validS[:, j] - np.mean(trainS[:, j])
        validY_n = validY - np.mean(self.trainY)
        testS, testX1, testX2, testY = self.getPredictData()
        if rff:
            testX1 = sampler_x.transform(testX1)
            if transform_s:
                testS = sampler_s.transform(testS)
        for j in range(len(trainS[0])):
            testS[:, j] = testS[:, j] / S_std[j]
        testS_X2 = np.c_[testS, testX2]  #use S and X2
        test_X1_resid = self.get_X1_resid(lr1,
                                          testX1,
                                          testS,
                                          testS_X2,
                                          use_X2=False)
        for j in range(len(self.trainX1[0])):
            test_X1_resid[:, j] = test_X1_resid[:, j] / X1_std[j]
        test_n = len(testS)
        testX_rn = test_X1_resid  #self.trainX1
        for i in range(train_X1_resid.shape[1]):
            testX_rn[:, i] = testX_rn[:, i] - np.mean(train_X1_resid[:, i])
        testS_n = copy.deepcopy(testS)
        for j in range(len(trainS[0])):
            testS_n[:, j] = testS[:, j] - np.mean(trainS[:, j])
        testY_n = testY - np.mean(self.trainY)

        Vs = np.cov(trainS_n.T)
        Vx = np.cov(trainX_rn.T)
        vs = np.matmul(trainS_n.T, trainY_n) / NTrain
        vx = np.matmul(trainX_rn.T, trainY_n) / NTrain

        def linearKernel():
            return (lambda x, y: np.dot(x, y))

        def rbfKernel(gamma):
            return (lambda x, y: math.exp(-gamma * np.inner(x - y, x - y)))

        def polyKernel(gamma):
            return (lambda x, y: (gamma * np.inner(x, y) + 1.0)**3)

        if not is_kernel:  #Linear
            sol = self.Fair_Prediction_Optimization(eps, lmd / NTrain, Vs, Vx,
                                                    vs, vx)  #main optimization
            train_S_X1_resid = np.c_[trainS_n, trainX_rn]
            trainYhat = [
                np.dot(sol, train_S_X1_resid[i]) for i in range(NTrain)
            ]
            valid_S_X1_resid = np.c_[validS_n, validX_rn]
            validYhat = [
                np.dot(sol, valid_S_X1_resid[i]) for i in range(valid_n)
            ]
            test_S_X1_resid = np.c_[testS_n, testX_rn]
            testYhat = [np.dot(sol, test_S_X1_resid[i]) for i in range(test_n)]

            result_train = Result(self.trainY,
                                  trainYhat + np.mean(self.trainY),
                                  self.trainS, avails)
            result_valid = Result(validY, validYhat + np.mean(self.trainY),
                                  self.validS, avails)
            result_test = Result(testY, testYhat + np.mean(self.trainY),
                                 self.testS, avails)
        else:  #Kernel
            ks = rbfKernel(gamma)
            kx = rbfKernel(gamma)
            n = NTrain
            subsampling_ratio = 1.0  #0.1
            n_sub = int(n * subsampling_ratio)
            if subsampling_ratio < 1.0:
                sample_ids = self.subsample_from_levscore(
                    ks, kx, trainS_n, trainX_rn, gamma, 0.05,
                    subsampling_ratio)
            else:
                sample_ids = [i for i in range(n)]

            Ks, Kx = np.zeros((n_sub, n_sub)), np.zeros((n_sub, n_sub))
            trainS_n_sub = trainS_n[sample_ids]
            trainX_rn_sub = trainX_rn[sample_ids]
            trainY_n_sub = trainY_n[sample_ids]
            for i in range(n_sub):
                for j in range(n_sub):
                    Ks[i, j], Kx[i,
                                 j] = ks(trainS_n_sub[i], trainS_n_sub[j]), kx(
                                     trainX_rn_sub[i], trainX_rn_sub[j])
            sol = self.Fair_Prediction_Kernel_Optimization(
                eps, lmd, Ks, Kx, trainS_n_sub, trainX_rn_sub, trainY_n_sub)

            trainYhat = np.matmul(Ks, sol[:n_sub]) + np.matmul(Kx, sol[n_sub:])
            valid_n = len(validS)
            Ks_valid, Kx_valid = np.zeros((valid_n, n_sub)), np.zeros(
                (valid_n, n_sub))
            for i in range(valid_n):
                for j in range(n_sub):
                    Ks_valid[i,j], Kx_valid[i,j]\
                      = ks(validS_n[i],trainS_n_sub[j]), kx(validX_rn[i],trainX_rn_sub[j])
            validYhat = np.matmul(Ks_valid, sol[:n_sub]) + np.matmul(
                Kx_valid, sol[n_sub:])
            test_n = len(testS)
            Ks_test, Kx_test = np.zeros((test_n, n_sub)), np.zeros(
                (test_n, n_sub))
            for i in range(test_n):
                for j in range(n_sub):
                    Ks_test[i,j], Kx_test[i,j]\
                      = ks(testS_n[i],trainS_n_sub[j]), kx(testX_rn[i],trainX_rn_sub[j])
            testYhat = np.matmul(Ks_test, sol[:n_sub]) + np.matmul(
                Kx_test, sol[n_sub:])

            result_train = Result(self.trainY[sample_ids],
                                  trainYhat + np.mean(self.trainY[sample_ids]),
                                  self.trainS[sample_ids], avails)
            result_valid = Result(validY, validYhat + np.mean(self.trainY),
                                  validS, avails)
            result_test = Result(testY, testYhat + np.mean(self.trainY), testS,
                                 avails)

        return result_train, result_valid, result_test
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (
            len(lst) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name] = ev
            else:
                # estimator evaluation only: a trailing `-` on the param name requests it
                # TODO: maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO: regular expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
Example #9
    prices = whiten_data(prices)
    prices, dep_var = features_engineering(prices)
    X_train, y_train, y_train_value, X_test, y_test, y_test_value, cols = split_test_train(prices, dep_var)
    
    print_statistics(y_train, y_test)

    # First estimator: assume that after an up day there comes a down day
    y_MR_train = -X_train[:, cols.index('midDiff')]
    y_MR_test = -X_test[:, cols.index('midDiff')]

    lin_ridge_reg = linear_model.Ridge(alpha = 0.2, max_iter = 1e5, normalize = True, tol = 1e-8)
    lin_ridge_reg.fit(X_train, y_train_value)
    y_lin_ridge_reg_test = lin_ridge_reg.predict(X_test)
    y_lin_ridge_reg_train = lin_ridge_reg.predict(X_train)

    rbf_feature = kernel_approximation.RBFSampler(gamma = 1e-11, n_components = 1000, random_state = 0)
    PhiX_train = rbf_feature.fit_transform(X_train)
    PhiX_test = rbf_feature.transform(X_test)  # reuse the fitted feature map instead of refitting on test data
    RFF_lin_ridge_reg = linear_model.Ridge(alpha = 1e-2, max_iter = 1e5, normalize = True, tol = 1e-8)
    RFF_lin_ridge_reg.fit(PhiX_train, y_train)
    y_RFF_train = RFF_lin_ridge_reg.predict(PhiX_train)
    y_RFF_test = RFF_lin_ridge_reg.predict(PhiX_test)

    y_ensemble_test =  np.sign(y_RFF_test) + np.sign(y_lin_ridge_reg_test)
    y_ensemble_train = np.sign(y_RFF_train) + np.sign(y_lin_ridge_reg_train)

    y_ensemble_train[y_ensemble_train == 0] = np.sign(y_MR_train[y_ensemble_train == 0])
    y_ensemble_test[y_ensemble_test == 0] = np.sign(y_MR_test[y_ensemble_test == 0])

    classif_rates = {}
    classif_rates['Linear ridge reg.'] = [classif_correct_rate(y_lin_ridge_reg_test, y_test), classif_correct_rate(y_lin_ridge_reg_train, y_train)]
Example #10
 def map_fit_transform(self, X):
     self.rbf_feature = kernel_approx.RBFSampler(
         n_components=self.n_features,
         gamma=self.gamma,
         random_state=self.random_state)
     return self.rbf_feature.fit_transform(X)
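
As in the earlier snippets, the mapped features can replace an explicit kernel computation: the dot product between mapped batches approximates the RBF kernel of the original points. A short sketch, assuming `mapper` is an instance of the class above with `n_features`, `gamma`, and `random_state` already set:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(6, 4)

Z = mapper.map_fit_transform(X)    # shape (6, mapper.n_features)
K_approx = Z @ Z.T                 # approximate RBF Gram matrix of X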