Example #1
    def _fit_start(self, X, partial=False):

        self._set_features(X)

        if not partial or not hasattr(self, 'trials_'):

            self._reset_trials()
            self.k_gen_ = 0

            # Init toolbox
            self.toolbox = base.Toolbox()
            self.rstate = check_random_state(self.random_state)

            # Define individual
            k_min = self.min_features_
            k_max = self.max_features_

            def get_individual():
                ind_size = self.rstate.choice(range(k_min, k_max + 1))
                features = self.features_.sample(ind_size)
                return features

            self.toolbox.register("individual", get_individual)

            # Define population
            self.toolbox.register("population", tools.initRepeat, list,
                                  self.toolbox.individual)
            self.population = self.toolbox.population(n=self.pop_size)

        return self
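Example #1 validates the generator once per (non-partial) fit and stores it on the estimator for all later sampling. As a minimal, self-contained sketch of what check_random_state itself does (not part of the project above): it accepts None, an integer seed, or an existing RandomState and always returns a RandomState instance.

import numpy as np
from sklearn.utils import check_random_state

# None -> the global numpy RandomState singleton used by np.random
rng_global = check_random_state(None)

# int -> a freshly seeded RandomState, so repeated runs are reproducible
rng_a = check_random_state(42)
rng_b = check_random_state(42)
assert rng_a.randint(100) == rng_b.randint(100)

# an existing RandomState instance is passed through unchanged
rng = np.random.RandomState(0)
assert check_random_state(rng) is rng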
Example #2
def cxOnePoint(ind1, ind2, indpb=0.5, random_state=None, drop_attrs=['score']):

    rstate = check_random_state(random_state)

    n = ind1.n_features
    argsort = rstate.permutation(n)

    a = rstate.randint(n)

    mask1 = np.zeros((n, ), dtype=bool)
    mask2 = np.zeros((n, ), dtype=bool)

    for i in range(n):
        j = argsort[i]
        x = ind1.mask[i]
        y = ind2.mask[j]
        if a <= i:
            mask1[j] = x
            mask2[j] = y
        else:
            mask1[j] = y
            mask2[j] = x

    child1 = ind1.copy().set_mask(mask1)
    child2 = ind2.copy().set_mask(mask2)

    child1.parents = (ind1, ind2)
    child2.parents = (ind1, ind2)

    for attr in drop_attrs:
        for child in [child1, child2]:
            if hasattr(child, attr):
                delattr(child, attr)

    return child1, child2
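cxOnePoint above applies the cut in a randomly permuted feature order and carries Individual metadata along. A stripped-down sketch of the classic single cut-point crossover on plain boolean masks (hypothetical helper, no Individual class assumed):

import numpy as np
from sklearn.utils import check_random_state

def one_point_crossover(mask1, mask2, random_state=None):
    # Swap the tails of two boolean masks after a random cut point.
    rstate = check_random_state(random_state)
    mask1 = np.asarray(mask1, dtype=bool)
    mask2 = np.asarray(mask2, dtype=bool)
    cut = rstate.randint(1, len(mask1))  # cut point in [1, n - 1]
    child1 = np.concatenate([mask1[:cut], mask2[cut:]])
    child2 = np.concatenate([mask2[:cut], mask1[cut:]])
    return child1, child2

c1, c2 = one_point_crossover([1, 1, 1, 0, 0], [0, 0, 0, 1, 1], random_state=0)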
Example #3
def cxUniform(ind1, ind2, indpb=0.5, random_state=None, drop_attrs=['score']):

    rstate = check_random_state(random_state)
    mask1, mask2 = [], []

    for x, y in zip(ind1.mask, ind2.mask):
        if rstate.rand() < indpb:
            mask1.append(x)
            mask2.append(y)
        else:
            mask1.append(y)
            mask2.append(x)

    child1 = ind1.copy().set_mask(mask1)
    child2 = ind2.copy().set_mask(mask2)

    child1.parents = (ind1, ind2)
    child2.parents = (ind1, ind2)

    for attr in drop_attrs:
        for child in [child1, child2]:
            if hasattr(child, attr):
                delattr(child, attr)

    return child1, child2
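The same uniform-crossover rule can be written in vectorized form. A toy sketch with made-up parent masks, where indpb is the probability that a position keeps its own parent's gene:

import numpy as np
from sklearn.utils import check_random_state

parent1 = np.array([1, 1, 1, 1, 0, 0, 0, 0], dtype=bool)  # toy masks
parent2 = ~parent1

rstate = check_random_state(0)
indpb = 0.5
keep = rstate.rand(len(parent1)) < indpb   # one Bernoulli draw per position
child1 = np.where(keep, parent1, parent2)  # keep own gene where draw < indpb
child2 = np.where(keep, parent2, parent1)  # ...and swap elsewhere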
Example #4
    def __init__(self,
                 param_grid,
                 n_evaluations=10,
                 maximize=True,
                 random_state=None):
        """
        Works in the same way as sklearn.grid_search.RandomizedSearch.
        Each next point is generated independently.

        :param param_grid: dict with distributions used to sample each parameter:
          name -> list of possible values (in which case sampled uniformly from the options)
          name -> distribution (should implement '.rvs()', as scipy distributions do)
        :param bool maximize: ignored parameter, added for uniformity

        NB: this is the only optimizer that supports passing distributions for parameters.
        """
        self.maximize = maximize
        self.param_grid = OrderedDict(param_grid)
        self.n_evaluations = n_evaluations
        self.random_state = check_random_state(random_state)
        self.indices_to_parameters_ = OrderedDict()
        self.grid_scores_ = OrderedDict()
        self.queued_tasks_ = set()
        from sklearn.grid_search import ParameterSampler
        self.param_sampler = iter(
            ParameterSampler(param_grid,
                             n_iter=n_evaluations,
                             random_state=random_state))
Example #5
    def run(self, dataset_name, random_state=42):

        config = self.config
        X_train, y_train, X_test, y_test = fetch_load_data(dataset_name)

        for n_run in range(self.n_runs):
            seed_run = random_state * n_run
            logger.info(
                '\n\nRANDOM SEED = {} for data split.'.format(seed_run))
            rng = check_random_state(seed_run)
            if config['dataset']['is_stream']:
                logger.info('Dataset is a stream. Sampling observed labels.')
                # Just randomly sample ratio_labeled samples for mask_labeled
                n_burn_in = config['data']['n_burn_in_stream']

                for ratio_labeled in self.ratio_labeled_values:

                    config['data']['stream']['ratio_labeled'] = ratio_labeled
                    n_labeled = int(ratio_labeled * len(y_train))
                    ind_labeled = rng.choice(len(y_train),
                                             n_labeled,
                                             replace=False)
                    mask_labeled = np.zeros(len(y_train), dtype=bool)
                    mask_labeled[ind_labeled] = True
                    X_run, y_run = X_train, y_train

                    config['data']['n_burn_in'] = n_burn_in
                    config.setdefault('options', {})
                    config['options']['random_state'] = seed_run

                    self.pre_single_run(X_run, y_run, mask_labeled, n_burn_in,
                                        seed_run, X_test, y_test, n_run)
Example #6
 def _check_params(self):
     """Checking parameters of classifier set in __init__"""
     assert isinstance(self.loss, AbstractLossFunction), \
         'LossFunction should be derived from AbstractLossFunction'
     assert self.n_estimators > 0, 'n_estimators should be positive'
     assert 0 < self.subsample <= 1., 'subsample should be in (0, 1]'
     self.random_state = check_random_state(self.random_state)
Example #7
    def _sgd(self, data, labels):
        # shuffle data
        n, k = data.shape
        idx = np.arange(n)
        state = check_random_state(self.random_state)
        state.shuffle(idx)
        data = data[idx]
        labels = labels[idx]

        betas = np.zeros(k)
        rate = self.rate
        self.converged_ = False
        for epoch in range(self.max_iters):
            old_lcl = rlcl(data, labels, betas, self.mu)
            for i, (x, y) in enumerate(zip(data, labels)):
                betas = self._sgd_update(betas, x, y, rate)
            new_lcl = rlcl(data, labels, betas, self.mu)
            if np.abs(new_lcl - old_lcl) < 1e-8:
                self.converged_ = True
                break
            rate = rate * self.decay
        if self.converged_:
            print("converged after {} epochs".format(epoch))
        else:
            print("did not converge")

        self.lcl_ = lcl(data, labels, betas)
        self.rlcl_ = rlcl(data, labels, betas, self.mu)
        return betas
Example #8
    def fit(self, X, y, sample_weight=None):
        X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight)
        self.random_state = check_random_state(self.random_state)
        self.estimators = []
        score = numpy.zeros(len(X), dtype=float)
        y_signed = 2 * y - 1

        self.w_sig = []
        self.w_bck = []

        for _ in range(self.n_estimators):
            residual = y_signed
            # numpy.exp(- y_signed * score)
            # residual[y > 0.5] /= numpy.mean(residual[y > 0.5])
            # residual[y < 0.5] /= -numpy.mean(residual[y < 0.5])

            trainX, testX, trainY, testY, trainW, testW, trainR, testR, trainS, testS = \
                train_test_split(X, y, sample_weight, residual, score,
                                 train_size=self.train_part, test_size=self.test_size, random_state=self.random_state)

            tree = DecisionTreeRegressor(criterion=self.criterion, splitter=self.splitter,
                                         max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
                                         max_features=self.max_features, random_state=self.random_state)

            # fitting
            tree.fit(trainX, trainR, sample_weight=trainW, check_input=False)

            # post-pruning
            self.update_terminal_regions(tree.tree_, testX, testY, testW, testS)

            # updating score
            # score += self.learning_rate * tree.predict(X)
            self.estimators.append(tree)
Example #9
 def _check_params(self):
     """Checking parameters of classifier set in __init__"""
     assert isinstance(self.loss, AbstractLossFunction), \
         'LossFunction should be derived from AbstractLossFunction'
     assert self.n_estimators > 0, 'n_estimators should be positive'
     assert 0 < self.subsample <= 1., 'subsample should be in (0, 1]'
     self.random_state = check_random_state(self.random_state)
Example #10
    def __init__(self, param_grid, n_evaluations=10, maximize=True, random_state=None):
        """
        Abstract class for grid search algorithm.
        The aim of this class is to generate new points, where the function (estimator) will be computed.
        You can define your own algorithm of step location of parameters grid.

        Parameters:
        ----------
        :param OrderedDict param_grid: the grid with parameters to optimize on
        :param int n_evaluations: the number of evaluations to do
        :param random_state: random generator
        :param maximize: whether algorithm should maximize or minimize target function.
        :type random_state: int or RandomState or None
        """
        assert isinstance(param_grid, dict), 'the passed param_grid should be of OrderedDict class'
        self.param_grid = OrderedDict(param_grid)
        _check_param_grid(param_grid)

        self.dimensions = list([len(param_values) for param, param_values in self.param_grid.items()])
        size = numpy.prod(self.dimensions)
        assert size > 1, 'The space of parameters contains only %i points' % size
        self.n_evaluations = min(n_evaluations, size)

        # results on different parameters
        self.grid_scores_ = OrderedDict()
        self.maximize = maximize

        # all the tasks that are being computed or already computed
        self.queued_tasks_ = set()
        self.random_state = check_random_state(random_state)
        self.evaluations_done = 0
Example #11
    def _score(self, X, y):
        for switch in self.mapping:
            # Get column name (can be anything: str, number,...)
            column = switch.get('col')

            # Score the column
            transformed_column = pd.Series([np.nan] * X.shape[0], name=column)
            for val in switch.get('woe'):
                transformed_column.loc[X[column] == val] = switch.get('woe')[
                    val]  # THIS LINE IS SLOW

            # Replace missing values only in the computed columns
            if self.impute_missing:
                if self.handle_unknown == 'impute':
                    transformed_column.fillna(0, inplace=True)
                elif self.handle_unknown == 'error':
                    missing = transformed_column.isnull()
                    if any(missing):
                        raise ValueError(
                            'Unexpected categories found in column %s' %
                            switch.get('col'))

            # Randomization is meaningful only for training data -> we do it only if y is present
            if self.randomized and y is not None:
                random_state_generator = check_random_state(self.random_state)
                transformed_column = (
                    transformed_column * random_state_generator.normal(
                        1., self.sigma, transformed_column.shape[0]))

            X[column] = transformed_column.astype(float)
        return X
Example #12
    def transform_leave_one_out(self, X_in, y, mapping=None):
        """
        Leave one out encoding uses a single column of floats to represent the means of the target variables.
        """
        X = X_in.copy(deep=True)
        random_state_ = check_random_state(self.random_state)

        # Prepare the data
        if y is not None:
            # Convert bools to numbers (the target must be summable)
            y = y.astype('double')

            # Cumsum and cumcount do not work nicely with None.
            # This is a terrible workaround that will fail, when the
            # categorical input contains -999.9
            for cat_col in X.select_dtypes('category').columns.values:
                X[cat_col] = X[cat_col].cat.add_categories(-999.9)
            X = X.fillna(-999.9)

        for col, colmap in mapping.items():
            level_notunique = colmap['count'] > 1

            unique_train = colmap.index
            unseen_values = pd.Series([x for x in X_in[col].unique() if x not in unique_train])

            is_nan = X_in[col].isnull()
            is_unknown_value = X_in[col].isin(unseen_values.dropna())

            if self.handle_unknown == 'error' and is_unknown_value.any():
                raise ValueError('Columns to be encoded can not contain new values')

            if y is None:    # Replace level with its mean target; if level occurs only once, use global mean
                level_means = ((colmap['sum'] + self._mean) / (colmap['count'] + 1)).where(level_notunique, self._mean)
                X[col] = X[col].map(level_means)
            else:
                ## Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
                # The nice thing about this is that it helps to prevent overfitting. The bad thing
                # is that CatBoost uses many iterations over the data. But we run just one iteration.
                # Still, it works better than leave-one-out without any noise.
                # See:
                #   https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
                temp = y.groupby(X[col]).agg(['cumsum', 'cumcount'])
                X[col] = (temp['cumsum'] - y + self._mean) / (temp['cumcount'] + 1)

            if self.handle_unknown == 'value':
                X.loc[is_unknown_value, col] = self._mean
            elif self.handle_unknown == 'return_nan':
                X.loc[is_unknown_value, col] = np.nan

            if self.handle_missing == 'value':
                X.loc[is_nan & unseen_values.isnull().any(), col] = self._mean
            elif self.handle_missing == 'return_nan':
                X.loc[is_nan, col] = np.nan

            if self.sigma is not None and y is not None:
                X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])

        return X
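The cumsum/cumcount trick above is a single-pass, CatBoost-style approximation of leave-one-out target statistics. A toy pandas sketch of the plain leave-one-out mean these encoders are built on, i.e. for each row the target mean of the other rows in the same category:

import pandas as pd

df = pd.DataFrame({'cat': ['a', 'a', 'a', 'b', 'b'],
                   'y':   [1,   0,   1,   0,   1]})
stats = df.groupby('cat')['y'].agg(['sum', 'count'])

# (sum - y) / (count - 1): the category mean computed without the current row.
# Singleton categories would divide by zero and need the global-mean fallback
# used in the examples above.
loo = (df['cat'].map(stats['sum']) - df['y']) / (df['cat'].map(stats['count']) - 1)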
Example #13
    def transform_leave_one_out(self, X_in, y, mapping=None):
        """
        Leave one out encoding uses a single column of floats to represent the means of the target variables.
        """
        X = X_in.copy(deep=True)
        random_state_ = check_random_state(self.random_state)

        # Prepare the data
        if y is not None:
            # Convert bools to numbers (the target must be summable)
            y = y.astype('double')

            # Cumsum and cumcount do not work nicely with None.
            # This is a terrible workaround that will fail, when the
            # categorical input contains -999.9
            for cat_col in X.select_dtypes('category').columns.values:
                X[cat_col] = X[cat_col].cat.add_categories(-999.9)
            X = X.fillna(-999.9)

        for col, colmap in mapping.items():
            level_notunique = colmap['count'] > 1

            unique_train = colmap.index
            unseen_values = pd.Series([x for x in X_in[col].unique() if x not in unique_train])

            is_nan = X_in[col].isnull()
            is_unknown_value = X_in[col].isin(unseen_values.dropna())

            if self.handle_unknown == 'error' and is_unknown_value.any():
                raise ValueError('Columns to be encoded can not contain new values')

            if y is None:    # Replace level with its mean target; if level occurs only once, use global mean
                level_means = ((colmap['sum'] + self._mean) / (colmap['count'] + 1)).where(level_notunique, self._mean)
                X[col] = X[col].map(level_means)
            else:
                # Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
                # The nice thing about this is that it helps to prevent overfitting. The bad thing
                # is that CatBoost uses many iterations over the data. But we run just one iteration.
                # Still, it works better than leave-one-out without any noise.
                # See:
                #   https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
                temp = y.groupby(X[col]).agg(['cumsum', 'cumcount'])
                X[col] = (temp['cumsum'] - y + self._mean) / (temp['cumcount'] + 1)

            if self.handle_unknown == 'value':
                X.loc[is_unknown_value, col] = self._mean
            elif self.handle_unknown == 'return_nan':
                X.loc[is_unknown_value, col] = np.nan

            if self.handle_missing == 'value':
                X.loc[is_nan & unseen_values.isnull().any(), col] = self._mean
            elif self.handle_missing == 'return_nan':
                X.loc[is_nan, col] = np.nan

            if self.sigma is not None and y is not None:
                X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])

        return X
Example #14
    def transform_leave_one_out(self, X_in, y, mapping=None):
        """
        Leave one out encoding uses a single column of floats to represent the means of the target variables.
        """

        X = X_in.copy(deep=True)
        random_state_ = check_random_state(self.random_state)

        for col, colmap in mapping.items():
            level_notunique = colmap['count'] > 1

            unique_train = colmap.index
            unseen_values = pd.Series(
                [x for x in X[col].unique() if x not in unique_train],
                dtype=unique_train.dtype)

            is_nan = X[col].isnull()
            is_unknown_value = X[col].isin(
                unseen_values.dropna().astype(object))

            if X[col].dtype.name == 'category':  # Pandas 0.24 tries hard to preserve categorical data type
                X[col] = X[col].astype(str)

            if self.handle_unknown == 'error' and is_unknown_value.any():
                raise ValueError(
                    'Columns to be encoded can not contain new values')

            if y is None:  # Replace level with its mean target; if level occurs only once, use global mean
                level_means = (colmap['sum'] / colmap['count']).where(
                    level_notunique, self._mean)
                X[col] = X[col].map(level_means)
            else:  # Replace level with its mean target, calculated excluding this row's target
                # The y (target) mean for this level is normally just the sum/count;
                # excluding this row's y, it's (sum - y) / (count - 1)
                level_means = (X[col].map(colmap['sum']) -
                               y) / (X[col].map(colmap['count']) - 1)
                # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean
                X[col] = level_means.where(
                    X[col].map(colmap['count'][level_notunique]).notnull(),
                    self._mean)

            if self.handle_unknown == 'value':
                X.loc[is_unknown_value, col] = self._mean
            elif self.handle_unknown == 'return_nan':
                X.loc[is_unknown_value, col] = np.nan

            if self.handle_missing == 'value':
                X.loc[is_nan & unseen_values.isnull().any(), col] = self._mean
            elif self.handle_missing == 'return_nan':
                X.loc[is_nan, col] = np.nan

            if self.sigma is not None and y is not None:
                X[col] = X[col] * random_state_.normal(1., self.sigma,
                                                       X[col].shape[0])

        return X
Example #15
    def sample(self, size=None, random_state=None):

        rstate = check_random_state(random_state)

        if size:
            subset = rstate.choice(self.features, size=size, replace=False)
            return self.copy().set_subset(subset)

        else:
            mask = rstate.randint(0, 2, size=self.n_features, dtype=bool)
            return self.copy().set_mask(mask)
Example #16
    def _score(self, X, y):
        for col in self.cols:
            # Score the column
            X[col] = X[col].map(self.mapping[col])

            # Randomization is meaningful only for training data -> we do it only if y is present
            if self.randomized and y is not None:
                random_state_generator = check_random_state(self.random_state)
                X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0]))

        return X
Example #17
    def _score(self, X, y):
        for col in self.cols:
            # Score the column
            X[col] = X[col].map(self.mapping[col])

            # Randomization is meaningful only for training data -> we do it only if y is present
            if self.randomized and y is not None:
                random_state_generator = check_random_state(self.random_state)
                X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0]))

        return X
Example #18
    def _check_params(self):
        if self.param_generator_type is None:
            self.param_generator_type = SimpleParameterOptimizer
        self.generator = self.param_generator_type(self.param_grid, self.n_evaluations)
        # Deleting parameters
        self.n_evaluations = None
        self.param_grid = None

        if self.score_function is None:
            self.score_function = roc_auc_score
        assert self.fold_checks <= self.folds, "We cannot have more checks than folds"
        self.random_state = check_random_state(self.random_state)
Example #19
    def run(self, dataset_name, random_state=42):

        config = self.config
        X_train, y_train, X_test, y_test = fetch_load_data(dataset_name)

        for n_run in range(self.n_runs):
            seed_run = random_state * n_run
            logger.info(
                '\n\nRANDOM SEED = {} for data split.'.format(seed_run))
            rng = check_random_state(seed_run)
            if config['dataset']['is_stream']:
                logger.info('Dataset is a stream. Sampling observed labels.')
                # Just randomly sample ratio_labeled samples for mask_labeled
                n_burn_in = config['data']['n_burn_in_stream']
                ratio_labeled = config['data']['stream']['ratio_labeled']
                n_labeled = int(ratio_labeled * len(y_train))
                ind_labeled = rng.choice(len(y_train),
                                         n_labeled,
                                         replace=False)
                mask_labeled = np.zeros(len(y_train), dtype=bool)
                mask_labeled[ind_labeled] = True
                X_run, y_run = X_train, y_train
            else:
                burn_in_params = config['data']['burn_in']
                ind_burn_in, mask_labeled_burn_in = \
                    split_burn_in_rest(y_train, shuffle=True, seed=seed_run,
                                   **burn_in_params)
                X_burn_in, y_burn_in = X_train[ind_burn_in], \
                                       y_train[ind_burn_in]
                mask_rest = np.ones(len(X_train), dtype=bool)
                mask_rest[ind_burn_in] = False
                X_rest, y_rest = X_train[mask_rest], y_train[mask_rest]
                stream_params = config['data']['stream']
                mask_labeled_rest = split_labels_rest(y_rest,
                                                      seed=seed_run,
                                                      shuffle=True,
                                                      **stream_params)

                # Shuffle the rest
                indices = np.arange(len(y_rest))
                rng.shuffle(indices)
                X_run = np.concatenate((X_burn_in, X_rest[indices]))
                y_run = np.concatenate((y_burn_in, y_rest[indices]))
                mask_labeled = np.concatenate(
                    (mask_labeled_burn_in, mask_labeled_rest[indices]))
                n_burn_in = len(y_burn_in)

            config['data']['n_burn_in'] = n_burn_in
            config.setdefault('options', {})
            config['options']['random_state'] = seed_run

            self.pre_single_run(X_run, y_run, mask_labeled, n_burn_in,
                                seed_run, X_test, y_test, n_run)
Example #20
def blocksplit_trajs(trajs,
                     lag=1,
                     sliding=True,
                     shift=None,
                     random_state=None):
    """ Splits trajectories into approximately uncorrelated fragments.

    Will split trajectories into fragments of lengths lag or longer. These fragments
    are overlapping in order to conserve the transition counts at given lag.
    If sliding=True, the resulting trajectories will lead to exactly the same count
    matrix as when counted from dtrajs. If sliding=False (sampling at lag), the
    count matrices are only equal when also setting shift=0.

    Parameters
    ----------
    trajs : list of ndarray(int)
        Trajectories
    lag : int
        Lag time at which counting will be done.
    sliding : bool
        True for splitting trajectories for sliding count, False if lag-sampling will be applied
    shift : None or int
        Start of first full tau-window. If None, shift will be randomly generated
    random_state : None or int or np.random.RandomState
        Random seed to use.

    Returns
    -------
    blocks : list of ndarray
        The blocks.
    """
    from sklearn.utils.random import check_random_state
    random_state = check_random_state(random_state)
    blocks = []
    for traj in trajs:
        if len(traj) <= lag:
            continue
        if shift is None:
            s = random_state.randint(min(lag, traj.size - lag))
        else:
            s = shift
        if sliding:
            if s > 0:
                blocks.append(traj[:lag + s])
            for t0 in range(s, len(traj) - lag, lag):
                blocks.append(traj[t0:t0 + 2 * lag])
        else:
            for t0 in range(s, len(traj) - lag, lag):
                blocks.append(traj[t0:t0 + lag + 1])
    return blocks
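A hypothetical usage of the function above, just to illustrate the call signature; as the docstring notes, the overlapping blocks preserve the transition counts at the chosen lag.

import numpy as np

trajs = [np.arange(20), np.arange(7)]  # two toy discrete trajectories
blocks = blocksplit_trajs(trajs, lag=3, sliding=True, random_state=0)
# Trajectories shorter than or equal to `lag` are dropped; the rest are cut
# into overlapping fragments of roughly 2 * lag frames.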
Example #21
def get_col_score(estimator,
                  X,
                  y,
                  col,
                  n_repeats=5,
                  scoring=None,
                  random_state=None):
    """Calculate score when `col` is permuted."""

    scorer = check_scoring(estimator, scoring=scoring)
    rstate = check_random_state(random_state)

    scores = _get_col_score(estimator, X, y, col, n_repeats, scorer, rstate)

    return scores
Example #22
def get_group_score(estimator,
                    X,
                    y,
                    g,
                    n_repeats=5,
                    scoring=None,
                    random_state=None):
    """Calculate score when columns group `g` is permuted."""

    scorer = check_scoring(estimator, scoring=scoring)
    rstate = check_random_state(random_state)

    scores = _get_group_score(estimator, X, y, g, n_repeats, scorer, rstate)

    return scores
Example #23
    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # first check the type
        X = convert_input(X)
        y = pd.Series(y, name='target')
        assert X.shape[0] == y.shape[0]

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)
        self.random_state_ = check_random_state(self.random_state)

        _, categories = self.leave_one_out(X,
                                           y,
                                           mapping=self.mapping,
                                           cols=self.cols,
                                           impute_missing=self.impute_missing,
                                           handle_unknown=self.handle_unknown)
        self.mapping = categories

        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [
                x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5
            ]

        return self
Example #24
def mutSubset(ind, indpb, random_state=None, drop_attrs=['score']):

    rstate = check_random_state(random_state)
    mask = []

    for x in ind.mask:
        y = (rstate.rand() < indpb)
        mask.append(x ^ y)

    mutant = ind.set_mask(mask)

    for attr in drop_attrs:
        if hasattr(mutant, attr):
            delattr(mutant, attr)

    return mutant
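The mutation above flips each selected feature independently. A self-contained sketch of the same bit-flip rule on a plain boolean mask:

import numpy as np
from sklearn.utils import check_random_state

rstate = check_random_state(0)
mask = np.array([True, False, True, True, False])
indpb = 0.2
flip = rstate.rand(mask.size) < indpb  # flip each position with probability indpb
mutant = mask ^ flip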
Example #25
    def _fit_start(self, X, partial=False):

        self._set_features(X)
        self.k_features_ = _check_k_features(self.k_features, self.n_features_, 'k_features')

        if not partial:

            self.rstate_ = check_random_state(self.random_state)
            self.subset_ = self.features_.copy()

            if self.forward:
                self.subset_.set_subset([])

            self._reset_trials()

        return self
Example #26
    def transform_leave_one_out(self,
                                X_in,
                                y,
                                mapping=None,
                                impute_missing=True,
                                handle_unknown='impute'):
        """
        Leave one out encoding uses a single column of floats to represent the means of the target variables.
        """

        X = X_in.copy(deep=True)

        random_state_ = check_random_state(self.random_state)
        for switch in mapping:
            column = switch.get('col')
            transformed_column = pd.Series([np.nan] * X.shape[0], name=column)

            for val in switch.get('mapping'):
                if y is None:
                    transformed_column.loc[X[column] == val] = switch.get(
                        'mapping')[val]['mean']
                elif switch.get('mapping')[val]['count'] == 1:
                    transformed_column.loc[X[column] == val] = self._mean
                else:
                    transformed_column.loc[X[column] == val] = (
                        (switch.get('mapping')[val]['sum'] -
                         y[(X[column] == val).values]) /
                        (switch.get('mapping')[val]['count'] - 1))

            if impute_missing:
                if handle_unknown == 'impute':
                    transformed_column.fillna(self._mean, inplace=True)
                elif handle_unknown == 'error':
                    missing = transformed_column.isnull()
                    if any(missing):
                        raise ValueError(
                            'Unexpected categories found in column %s' %
                            column)

            if self.randomized and y is not None:
                transformed_column = (
                    transformed_column * random_state_.normal(
                        1., self.sigma, transformed_column.shape[0]))

            X[column] = transformed_column.astype(float)

        return X
Example #27
    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------

        self : encoder
            Returns self.

        """

        # first check the type
        X = convert_input(X)
        y = pd.Series(y, name='target')
        assert X.shape[0] == y.shape[0]

        self._dim = X.shape[1]

        # if columns aren't passed, just use every string column
        if self.cols is None:
            self.cols = get_obj_cols(X)
        self.random_state_ = check_random_state(self.random_state)

        _, categories = self.leave_one_out(
            X, y,
            mapping=self.mapping,
            cols=self.cols,
            impute_missing=self.impute_missing,
            handle_unknown=self.handle_unknown
        )
        self.mapping = categories

        if self.drop_invariant:
            self.drop_cols = []
            X_temp = self.transform(X)
            self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

        return self
Example #28
    def _fit_start(self, X, y, groups):

        # Basic
        self.rstate_ = check_random_state(self.random_state)
        self._set_features(X)
        self._reset_trials()

        # First trial
        k_min = self.min_features_
        k_max = self.max_features_
        k = self.rstate_.choice(range(k_min, k_max + 1))
        subset = self.features_.sample(size=k, random_state=self.rstate_)

        self.eval_subset(subset, X, y, groups)
        self.subset_ = subset

        return self
Example #29
    def __init__(self, param_grid, n_evaluations=10, random_state=None):
        assert isinstance(param_grid, dict), 'the passed param_grid should be of OrderedDict class'
        self.param_grid = OrderedDict(param_grid)
        _check_param_grid(param_grid)

        self.dimensions = list([len(param_values) for param, param_values in self.param_grid.items()])
        size = numpy.prod(self.dimensions)
        assert size > 1, 'The space of parameters contains only %i points' % size
        self.n_evaluations = min(n_evaluations, size)

        # results on different parameters
        self.grid_scores_ = OrderedDict()

        # all the tasks that are being computed or already computed
        self.queued_tasks_ = set()
        self.random_state = check_random_state(random_state)
        self.evaluations_done = 0
Example #30
def group_permutation_importance(estimator,
                                 X,
                                 y,
                                 subset=None,
                                 scoring=None,
                                 n_repeats=5,
                                 n_jobs=None,
                                 random_state=0,
                                 tqdm=False):

    columns = list(X.columns.get_level_values(0).unique())

    msg = "<subset> must contain only features from <X>"
    assert not subset or not set(subset) - set(columns), msg

    msg = "<subset> must contain only unique features"
    assert not subset or len(set(subset)) == len(subset), msg

    subset = subset if subset else columns
    subset = tqdm_notebook(subset) if tqdm else subset

    scorer = check_scoring(estimator, scoring=scoring)
    rstate = check_random_state(random_state)

    base_score = scorer(estimator, X, y)
    scores = np.zeros((len(subset), n_repeats))

    # FIXME: avoid <max_nbytes>
    scores = Parallel(n_jobs=n_jobs,
                      max_nbytes='512M',
                      backend='multiprocessing')(delayed(_get_group_score)(
                          estimator, X, y, feature, n_repeats, scorer, rstate)
                                                 for feature in subset)

    importances = np.full((len(columns), n_repeats), np.nan)
    ind = [columns.index(feature) for feature in subset]
    importances[ind] = base_score - np.array(scores)

    result = {
        'importances_mean': np.mean(importances, axis=1),
        'importances_std': np.std(importances, axis=1),
        'importances': importances,
        'score': base_score
    }

    return result
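The helper above parallelizes per-group scoring, but the core idea is simply "shuffle a column, rescore, compare with the baseline". A minimal sketch for a single column of a fitted estimator (hypothetical helper; recent scikit-learn releases also ship sklearn.inspection.permutation_importance for the single-column case):

import numpy as np
from sklearn.utils import check_random_state

def single_column_importance(estimator, X, y, col, n_repeats=5, random_state=0):
    # Mean drop in estimator.score when one DataFrame column is shuffled.
    rstate = check_random_state(random_state)
    base_score = estimator.score(X, y)
    drops = []
    for _ in range(n_repeats):
        X_perm = X.copy()
        X_perm[col] = rstate.permutation(X_perm[col].values)
        drops.append(base_score - estimator.score(X_perm, y))
    return np.mean(drops)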
Example #31
    def __init__(self, param_grid, n_evaluations=10, random_state=None):
        assert isinstance(param_grid, dict), 'the passed param_grid should be of OrderedDict class'
        self.param_grid = OrderedDict(param_grid)
        _check_param_grid(param_grid)

        self.dimensions = list([len(param_values) for param, param_values in self.param_grid.items()])
        size = numpy.prod(self.dimensions)
        assert size > 1, 'The space of parameters contains only %i points' % size
        self.n_evaluations = min(n_evaluations, size)

        # results on different parameters
        self.grid_scores_ = OrderedDict()

        # all the tasks that are being computed or already computed
        self.queued_tasks_ = set()
        self.random_state = check_random_state(random_state)
        self.evaluations_done = 0
Example #32
def perturb_subset(subset, step, random_state=None, drop_attrs=['score']):

    rstate = check_random_state(random_state)
    update = rstate.choice(subset.features, step, False)

    del_list = set(subset) & set(update)
    add_list = set(update) - set(subset)

    subset_ = subset.copy()
    subset_ = subset_.remove(*del_list)
    subset_ = subset_.append(*add_list)

    for attr in drop_attrs:
        if hasattr(subset_, attr):
            delattr(subset_, attr)

    subset_.parents = (subset, )
    return subset_
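perturb_subset draws <step> random features and toggles their membership in the subset. The same rule expressed on a plain Python set (toy feature names, no feature-subset class assumed):

from sklearn.utils import check_random_state

features = ['f0', 'f1', 'f2', 'f3', 'f4']
subset = {'f0', 'f1'}

rstate = check_random_state(0)
update = set(rstate.choice(features, size=2, replace=False))

# drawn features that were already selected get dropped, the rest get added,
# i.e. the perturbed subset is the symmetric difference
perturbed = (subset - update) | (update - subset)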
Example #33
    def _score(self, X, y):
        for switch in self.mapping:
            # Get column name (can be anything: str, number,...)
            column = switch.get('col')

            # Scoring or training time?
            if y is None:
                X[str(column) + '_tmp'] = np.nan
                for val in switch.get('woe'):
                    X.loc[X[column] == val,
                          str(column) +
                          '_tmp'] = switch.get('woe')[val]  # THIS LINE IS SLOW
                del X[column]
                X.rename(columns={str(column) + '_tmp': column}, inplace=True)
            else:
                X[str(column) + '_tmp'] = np.nan
                for val in switch.get('woe'):
                    X.loc[(X[column] == val) * (y == 1),
                          str(column) + '_tmp'] = switch.get('woe_positive')[
                              val]  # THIS LINE IS SLOW
                    X.loc[(X[column] == val) * (y == 0),
                          str(column) + '_tmp'] = switch.get('woe_negative')[
                              val]  # THIS LINE IS SLOW
                del X[column]
                X.rename(columns={str(column) + '_tmp': column}, inplace=True)

            # Replace missing values only in the computed columns
            if self.impute_missing:
                if self.handle_unknown == 'impute':
                    X[column].fillna(0, inplace=True)
                elif self.handle_unknown == 'error':
                    missing = X[switch.get('col')].isnull()
                    if any(missing):
                        raise ValueError(
                            'Unexpected categories found in column %s' %
                            switch.get('col'))

            # Randomization is meaningful only for training data -> we do it only if y is present
            if self.randomized and y is not None:
                random_state_generator = check_random_state(self.random_state)
                X[column] = (X[column] * random_state_generator.normal(
                    1., self.sigma, X[column].shape[0]))

        return X
Example #34
    def transform_leave_one_out(self, X_in, y, mapping=None):
        """
        Leave one out encoding uses a single column of floats to represent the means of the target variables.
        """

        X = X_in.copy(deep=True)
        random_state_ = check_random_state(self.random_state)

        for col, colmap in mapping.items():
            level_notunique = colmap['count'] > 1

            unique_train = colmap.index
            unseen_values = pd.Series([x for x in X[col].unique() if x not in unique_train])

            is_nan = X[col].isnull()
            is_unknown_value = X[col].isin(unseen_values.dropna())

            if self.handle_unknown == 'error' and is_unknown_value.any():
                raise ValueError('Columns to be encoded can not contain new values')

            if y is None:    # Replace level with its mean target; if level occurs only once, use global mean
                level_means = (colmap['sum'] / colmap['count']).where(level_notunique, self._mean)
                X[col] = X[col].map(level_means)
            else:            # Replace level with its mean target, calculated excluding this row's target
                # The y (target) mean for this level is normally just the sum/count;
                # excluding this row's y, it's (sum - y) / (count - 1)
                level_means = (X[col].map(colmap['sum']) - y) / (X[col].map(colmap['count']) - 1)
                # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean
                X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean)

            if self.handle_unknown == 'value':
                X.loc[is_unknown_value, col] = self._mean
            elif self.handle_unknown == 'return_nan':
                X.loc[is_unknown_value, col] = np.nan

            if self.handle_missing == 'value':
                X.loc[is_nan & unseen_values.isnull().any(), col] = self._mean
            elif self.handle_missing == 'return_nan':
                X.loc[is_nan, col] = np.nan

            if self.sigma is not None and y is not None:
                X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])

        return X
Example #35
    def _score(self, X, y):
        for col in self.cols:
            # Score the column
            X[col] = X[col].map(self.mapping[col])

            # Replace missing values only in the computed columns
            if self.impute_missing:
                if self.handle_unknown == 'impute':
                    X[col].fillna(0, inplace=True)
                elif self.handle_unknown == 'error':
                    if X[col].isnull().any():
                        raise ValueError('Unexpected categories found in column %s' % col)

            # Randomization is meaningful only for training data -> we do it only if y is present
            if self.randomized and y is not None:
                random_state_generator = check_random_state(self.random_state)
                X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0]))

        return X
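Several of the encoders above multiply the encoded column by Gaussian noise centred on 1 during training. A tiny sketch of that randomization step on a made-up encoded column:

import pandas as pd
from sklearn.utils import check_random_state

encoded = pd.Series([0.2, 0.5, 0.8, 0.5])  # toy target-encoded column
sigma = 0.05
rng = check_random_state(0)
noisy = encoded * rng.normal(1., sigma, encoded.shape[0])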
Example #36
def cvsplit_dtrajs(dtrajs, random_state=None):
    """ Splits the trajectories into a training and test set with approximately equal number of trajectories

    Parameters
    ----------
    dtrajs : list of ndarray(int)
        Discrete trajectories
    random_state : None or int or np.random.RandomState
        Random seed to use.
    """
    from sklearn.utils.random import check_random_state
    if len(dtrajs) == 1:
        raise ValueError('Only have a single trajectory. Cannot be split into train and test set')
    random_state = check_random_state(random_state)
    I0 = random_state.choice(len(dtrajs), int(len(dtrajs) / 2), replace=False)
    I1 = np.array(list(set(list(np.arange(len(dtrajs)))) - set(list(I0))))
    dtrajs_train = [dtrajs[i] for i in I0]
    dtrajs_test = [dtrajs[i] for i in I1]
    return dtrajs_train, dtrajs_test
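A hypothetical call to the function above with toy trajectories; half of them (rounded down) end up in the training set and the remainder in the test set:

import numpy as np

rng = np.random.RandomState(0)
dtrajs = [rng.randint(0, 3, size=50) for _ in range(6)]  # six toy discrete trajectories
train, test = cvsplit_dtrajs(dtrajs, random_state=0)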
Example #37
    def transform_leave_one_out(self,
                                X_in,
                                y,
                                mapping=None,
                                impute_missing=True,
                                handle_unknown='impute'):
        """
        Leave one out encoding uses a single column of floats to represent the means of the target variables.
        """

        X = X_in.copy(deep=True)
        random_state_ = check_random_state(self.random_state)

        for col, colmap in mapping.items():
            level_notunique = colmap['count'] > 1
            if y is None:  # Replace level with its mean target; if level occurs only once, use global mean
                level_means = (colmap['sum'] / colmap['count']).where(
                    level_notunique, self._mean)
                X[col] = X[col].map(level_means)
            else:  # Replace level with its mean target, calculated excluding this row's target
                # The y (target) mean for this level is normally just the sum/count;
                # excluding this row's y, it's (sum - y) / (count - 1)
                level_means = (X[col].map(colmap['sum']) -
                               y) / (X[col].map(colmap['count']) - 1)
                # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean
                X[col] = level_means.where(
                    X[col].map(colmap['count'][level_notunique]).notnull(),
                    self._mean)

            if impute_missing:
                if handle_unknown == 'impute':
                    X[col].fillna(self._mean, inplace=True)
                elif handle_unknown == 'error':
                    if X[col].isnull().any():
                        raise ValueError(
                            'Unexpected categories found in column %s' % col)

            if self.sigma is not None and y is not None:
                X[col] = X[col] * random_state_.normal(1., self.sigma,
                                                       X[col].shape[0])

        return X
Example #38
def cvsplit_trajs(trajs, random_state=None):
    """ Splits the trajectories into a training and test set with approximately equal number of trajectories

    Parameters
    ----------
    trajs : list of ndarray(int)
        Discrete trajectories
    random_state : None or int or np.random.RandomState
        Random seed to use.
    """
    from sklearn.utils.random import check_random_state
    assert len(trajs) > 1, \
        'Only have a single trajectory. Cannot be split into train and test set'
    random_state = check_random_state(random_state)
    I0 = random_state.choice(len(trajs), int(len(trajs) / 2), replace=False)
    I1 = np.array(list(set(list(np.arange(len(trajs)))) - set(list(I0))))
    train_set = [trajs[i] for i in I0]
    test_set = [trajs[i] for i in I1]
    return train_set, test_set
Example #39
    def __init__(self, param_grid, n_evaluations=10, maximize=True, random_state=None):
        """
        Works in the same way as sklearn.grid_search.RandomizedSearch.
        Each next point is generated independently.

        :param param_grid: dict with distributions used to sample each parameter:
          name -> list of possible values (in which case sampled uniformly from the options)
          name -> distribution (should implement '.rvs()', as scipy distributions do)
        :param bool maximize: ignored parameter, added for uniformity

        NB: this is the only optimizer that supports passing distributions for parameters.
        """
        self.maximize = maximize
        self.param_grid = OrderedDict(param_grid)
        self.n_evaluations = n_evaluations
        self.random_state = check_random_state(random_state)
        self.indices_to_parameters_ = OrderedDict()
        self.grid_scores_ = OrderedDict()
        self.queued_tasks_ = set()
        from sklearn.grid_search import ParameterSampler
        self.param_sampler = iter(ParameterSampler(param_grid, n_iter=n_evaluations, random_state=random_state))
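sklearn.grid_search was removed in later scikit-learn releases; the equivalent sampler now lives in sklearn.model_selection. A small sketch (with made-up parameter names) of the kind of grid this optimizer accepts, mixing a plain list with a scipy distribution that implements .rvs():

from scipy.stats import uniform
from sklearn.model_selection import ParameterSampler  # sklearn.grid_search in old releases

param_grid = {'max_depth': [2, 4, 6],               # sampled uniformly from the list
              'learning_rate': uniform(0.01, 0.3)}  # any object with .rvs()
for params in ParameterSampler(param_grid, n_iter=5, random_state=42):
    print(params)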
Example #40
    def fit(self, inp, y):
        self.precomputed_probs_ = None
        self.precomputed_weights_ = None

        self.classes_, y = unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)
        self.random_state_ = check_random_state(self.random_state)

        if self.pipeline is not None:
            inp = self.pipeline.fit_transform(inp)

        self.weighting_strategy.prepare(inp, y)
        self.classifiers_ = self.training_strategy.train_estimators(
            self.n_estimators, inp, y,
            self.weighting_strategy, self.random_state_
        )

        # Reset it to null because the previous line uses self.predict
        self.precomputed_probs_ = None
        self.precomputed_weights_ = None
        return self
Example #41
    def _fit_start(self, X, partial=False):

        if not partial:
            self._reset_trials()

        if not partial and hasattr(self, 'random_state'):
            self.rstate_ = check_random_state(self.random_state)

        self._set_features(X)

        weights_vals = ['uniform', 'binomal']

        if self.weights == 'binomal':
            self.weights_ = binomal_weights(self.min_features_,
                                            self.max_features_,
                                            self.n_features_)
        elif self.weights == 'uniform':
            self.weights_ = uniform_weights(self.min_features_,
                                            self.max_features_)
        else:
            raise ValueError(f'<weights> must be from {weights_vals}')

        return self
Example #42
 def __init__(self, param_grid, n_evaluations, random_state=None):
     """
     The aim of this class is to generate new points, where the function (estimator) will be computed.
     :type param_grid: OrderedDict, the grid with parameters to optimize on
     :type n_evaluations: int, the number of evaluations to do
     :type random_state: int | RandomState | None
     """
     assert isinstance(param_grid, dict), 'the passed param_grid should be of OrderedDict class'
     self.param_grid = OrderedDict(param_grid)
     _check_param_grid(self.param_grid)
     self.dimensions = list([len(param_values) for param, param_values in self.param_grid.items()])
     size = numpy.prod(self.dimensions)
     assert size > 1, 'The space of parameters contains only %i points' % size
     if n_evaluations > size / 2:
         warn('The number of evaluations was decreased to %i' % (size // 2), UserWarning)
         n_evaluations = size // 2
     self.n_evaluations = n_evaluations
     # results on different parameters
     self.grid_scores_ = OrderedDict()
     # all the tasks that are being computed or already computed
     self.queued_tasks_ = set()
     self.random_state = check_random_state(random_state)
     self.evaluations_done = 0
Example #43
 def _make_estimator(self, inp, y, sample_weights, random_state):
     seed = random_state.randint(MAX_INT)
     est = clone(self.base_estimator)
     est.set_params(random_state=check_random_state(seed))
     est.fit(inp, y, sample_weight=sample_weights)
     return est
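The pattern above derives one integer seed per estimator from a shared parent generator, so every clone is reproducible yet draws an independent stream. A self-contained sketch, assuming MAX_INT is the largest 32-bit integer as in scikit-learn's ensembles:

import numpy as np
from sklearn.utils import check_random_state

MAX_INT = np.iinfo(np.int32).max

parent = check_random_state(42)
seeds = [parent.randint(MAX_INT) for _ in range(3)]         # one seed per estimator
children = [np.random.RandomState(seed) for seed in seeds]  # independent, reproducible streams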
Example #44
    def fit(self, X, y, sample_weight=None, neighbours_matrix=None):
        """Build a boosted classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples.

        y : array-like of shape = [n_samples]
            The target values (integers that correspond to classes).

        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            ``1 / n_samples``.

        neighbours_matrix: array-like of shape [n_samples, n_neighbours],
            each row contains indices of signal neighbours
            (neighbours should be computed for background too),
            if None, this matrix is computed.

        Returns
        -------
        self : object
            Returns self.
        """
        if self.smoothing < 0:
            raise ValueError("Smoothing must be non-negative")
        if not isinstance(self.base_estimator, BaseEstimator):
            raise TypeError("estimator must be a subclass of BaseEstimator")
        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero.")
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        # Check that algorithm is supported
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported"
                             % self.algorithm)
        if self.algorithm == 'SAMME.R':
            if not hasattr(self.base_estimator, 'predict_proba'):
                raise TypeError(
                    "uBoostBDT with algorithm='SAMME.R' requires "
                    "that the weak learner have a predict_proba method.\n"
                    "Please change the base estimator or set "
                    "algorithm='SAMME' instead.")

        assert np.in1d(y, [0, 1]).all(), \
            "only two-class classification is implemented, with labels 0 and 1"
        self.signed_uniform_label = 2 * self.uniform_label - 1

        if neighbours_matrix is not None:
            assert np.shape(neighbours_matrix) == (len(X), self.n_neighbors), \
                "Wrong shape of neighbours_matrix"
            self.knn_indices = neighbours_matrix
        else:
            assert self.uniform_variables is not None, \
                "uniform_variables should be set"
            self.knn_indices = compute_knn_indices_of_same_class(
                X.loc[:, self.uniform_variables], y, self.n_neighbors)

        sample_weight = commonutils.check_sample_weight(y, sample_weight=sample_weight, normalize=True)
        assert np.all(sample_weight >= 0.), 'the weights should be non-negative'

        # Clear any previous fit results
        self.estimators_ = []
        self.estimator_weights_ = []
        # score cuts correspond to
        # global efficiency == target_efficiency on each iteration.
        self.score_cuts_ = []

        X_train_variables = self.get_train_vars(X)
        X_train_variables, y, sample_weight = check_xyw(X_train_variables, y, sample_weight)

        # A dictionary to keep all intermediate weights, efficiencies and so on
        if self.keep_debug_info:
            self.debug_dict = defaultdict(list)

        self.random_generator = check_random_state(self.random_state)

        self._boost(X_train_variables, y, sample_weight)

        self.score_cut = self.signed_uniform_label * compute_cut_for_efficiency(
            self.target_efficiency, y == self.uniform_label, self.predict_score(X) * self.signed_uniform_label)
        assert np.allclose(self.score_cut, self.score_cuts_[-1], rtol=1e-10, atol=1e-10), \
            "score cut doesn't appear to coincide with the staged one"
        assert len(self.estimators_) == len(self.estimator_weights_) == len(self.score_cuts_)
        return self
Example #45
    def fit(self, X, y, sample_weight=None):
        shuffler = Shuffler(X, random_state=self.random_state)
        X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        y = column_or_1d(y, warn=True)
        n_samples = len(X)
        n_inbag = int(self.subsample * n_samples)
        sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
        self.random_state = check_random_state(self.random_state)

        # skipping all checks
        assert self.update_on in ['all', 'same', 'other', 'random']
        y_pred = numpy.zeros(len(y), dtype=float)

        self.classifiers = []
        self.learning_rates = []
        self.loss_values = []
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)
        iter_X = shuffler.generate(0.)

        prev_smearing = 1
        for iteration in range(self.n_estimators):
            if iteration % self.recount_step == 0:
                if prev_smearing > 0:
                    iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                    prev_smearing = iter_smearing
                    iter_X = shuffler.generate(iter_smearing)
                    iter_X, = check_arrays(iter_X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
                    y_pred = numpy.zeros(len(y))
                    y_pred += sum(cl.predict(X) * rate for rate, cl in zip(self.learning_rates, self.classifiers))


            self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
                min_samples_split=self.min_samples_split,
                min_samples_leaf=interpolate(self.min_samples_leaf, iteration, self.n_estimators, use_log=True),
                max_features=self.max_features,
                random_state=self.random_state)

            sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
            tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))
            residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

            tree.fit(numpy.array(iter_X)[sample_mask, :],
                     residual[sample_mask],
                     sample_weight=tree_weight[sample_mask], check_input=False)
            # update tree leaves
            if self.update_tree:
                if self.update_on == 'all':
                    update_mask = numpy.ones(len(sample_mask), dtype=bool)
                elif self.update_on == 'same':
                    update_mask = sample_mask
                elif self.update_on == 'other':
                    update_mask = ~sample_mask
                else:  # random
                    update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
                self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y, residual=residual, pred=y_pred,
                                                  sample_mask=update_mask, sample_weight=sample_weight)
            iter_learning_rate = interpolate(self.learning_rate, iteration, self.n_estimators, use_log=True)
            y_pred += iter_learning_rate * tree.predict(X)
            self.classifiers.append(tree)
            self.learning_rates.append(iter_learning_rate)

        return self
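
The subsampling step above calls _random_sample_mask on every iteration. A hedged, pure-Python stand-in (an assumption about its behaviour, not the actual helper) that produces the same kind of boolean in-bag mask from a check_random_state-derived generator:

import numpy as np
from sklearn.utils import check_random_state

def random_sample_mask(n_samples, n_inbag, random_state=None):
    # boolean mask with exactly n_inbag True entries, drawn without replacement
    rng = check_random_state(random_state)
    mask = np.zeros(n_samples, dtype=bool)
    mask[rng.choice(n_samples, size=n_inbag, replace=False)] = True
    return mask
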
Beispiel #46
0
    def fit(self, X, y, sample_weight=None, neighbours_matrix=None):
        """Build a boosted classifier from the training set (X, y).

        :param X: array-like of shape [n_samples, n_features]
        :param y: labels, array of shape [n_samples] with 0 and 1.
        :param sample_weight: array-like of shape [n_samples] or None

        :param neighbours_matrix: array-like of shape [n_samples, n_neighbours],
            each row contains indices of signal neighbours
            (neighbours should be computed for background too),
            if None, this matrix is computed.

        :return: self
        """
        if self.smoothing < 0:
            raise ValueError("Smoothing must be non-negative")
        if self.base_estimator is None:
            self.base_estimator = DecisionTreeClassifier(max_depth=2)
        if not isinstance(self.base_estimator, BaseEstimator):
            raise TypeError("estimator must be a subclass of BaseEstimator")
        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero.")
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")
        # Check that algorithm is supported
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported" % self.algorithm)
        if self.algorithm == 'SAMME.R':
            if not hasattr(self.base_estimator, 'predict_proba'):
                raise TypeError(
                    "uBoostBDT with algorithm='SAMME.R' requires "
                    "that the weak learner have a predict_proba method.\n"
                    "Please change the base estimator or set algorithm='SAMME' instead.")

        assert np.in1d(y, [0, 1]).all(), \
            "only two-class classification is implemented, with labels 0 and 1"
        self.signed_uniform_label = 2 * self.uniform_label - 1

        if neighbours_matrix is not None:
            assert np.shape(neighbours_matrix) == (len(X), self.n_neighbors), \
                "Wrong shape of neighbours_matrix"
            self.knn_indices = neighbours_matrix
        else:
            assert self.uniform_features is not None, \
                "uniform_features should be set"
            self.knn_indices = compute_knn_indices_of_same_class(
                X.loc[:, self.uniform_features], y, self.n_neighbors)

        sample_weight = commonutils.check_sample_weight(y, sample_weight=sample_weight, normalize=True)
        assert np.all(sample_weight >= 0.), 'the weights should be non-negative'

        # Clear any previous fit results
        self.estimators_ = []
        self.estimator_weights_ = []
        # score cuts correspond to
        # global efficiency == target_efficiency on each iteration.
        self.score_cuts_ = []

        x_train_features = self._get_train_features(X)
        x_train_features, y, sample_weight = check_xyw(x_train_features, y, sample_weight)

        self.random_state_ = check_random_state(self.random_state)

        self._boost(x_train_features, y, sample_weight)

        self.score_cut = self.signed_uniform_label * compute_cut_for_efficiency(
            self.target_efficiency, y == self.uniform_label, self.decision_function(X) * self.signed_uniform_label)
        assert np.allclose(self.score_cut, self.score_cuts_[-1], rtol=1e-10, atol=1e-10), \
            "score cut doesn't appear to coincide with the staged one"
        assert len(self.estimators_) == len(self.estimator_weights_) == len(self.score_cuts_)
        return self
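
A hedged usage sketch for the fit() signature documented above: precompute the kNN matrix once and pass it via neighbours_matrix so repeated fits skip the neighbour search. The constructor arguments and toy data below are assumptions for illustration; only the helper call mirrors what fit() itself does when neighbours_matrix is None.

import numpy as np
import pandas as pd

X = pd.DataFrame({'mass': np.random.normal(size=200),
                  'pt': np.random.exponential(size=200)})
y = np.random.randint(0, 2, size=200)  # labels 0 and 1, as fit() requires

# same call fit() would make internally; returns an array of shape [n_samples, n_neighbours]
knn = compute_knn_indices_of_same_class(X[['mass']], y, 50)

# hypothetical constructor kwargs, named after the attributes the snippet reads
clf = uBoostBDT(uniform_features=['mass'], uniform_label=1, n_neighbors=50)
clf.fit(X, y, neighbours_matrix=knn)
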
    def _sgd(self, data, labels):
        #split off validation set
        data_train, data_valid, labels_train, labels_valid = \
            train_test_split(data, labels, test_size=0.3, random_state=0)        
        
        # shuffle data
        n = len(data_train)
        idx = np.arange(n)
        state = check_random_state(self.random_state)
        state.shuffle(idx)
        data_train = data_train[idx]
        labels_train = labels_train[idx]
        
        labels_valid = self.preproclabels(labels_valid)
        labels_train = self.preproclabels(labels_train)
        
        self.ADict = dict.fromkeys(range(n))
        self.BDict = dict.fromkeys(range(n))

        self.ws = np.zeros(ffs.numJ)
        rate = self.rate
        self.converged_ = False
        old_score = 0
        epochscores = []
        minorscores = []
        for epoch in range(self.max_iters):
            for i, (x, y) in enumerate(zip(data_train, labels_train)):
                self.ws = self._sgd_update(self.ws, i, x, y, rate)
                if i > 0 and i % 1000 == 0:
                    prediction = self.predict(data_valid)
                    tagscores = self.tagAccuracy(labels_valid, prediction)
                    score = np.mean(tagscores)
                    minorscores.append(score)
                    print("sample:{}".format(i), score, max(tagscores), min(tagscores))
                    sys.stdout.flush()
                    if score > 0.85 or (score > 0 and score <= old_score):  # or: np.abs(score - old_score) < 1e-8
                        self.converged_ = True
                        break
                    old_score = score
            if self.converged_:
                break
            prediction = self.predict(data_valid)
            tagscores = self.tagAccuracy(labels_valid, prediction)
            score = np.mean(tagscores)
            epochscores.append(score)
            print("epoch:{}".format(epoch), score, max(tagscores), min(tagscores))
            sys.stdout.flush()
            if score > 0.85 or (score > 0 and score <= old_score):  # or: np.abs(score - old_score) < 1e-8
                self.converged_ = True
                break
            rate = rate * self.decay
            old_score = score
        if self.converged_:
            print("converged after {} epochs".format(epoch))
        else:
            print("did not converge")
        
        
#        np.savetxt(self.method+"_tagscore_per_epoch.csv", epochscores, delimiter=",", fmt='%1.4e')
#        np.savetxt(self.method+"_tagscore_minor.csv", minorscores, delimiter=",", fmt='%1.4e')
#        
#        if self.method == "collins":
#            np.savetxt(self.method+"_yhat.csv", self.unpreproclabels(self.yhats), delimiter=",", fmt='%s')
#        elif self.method == "cd":
#            np.savetxt(self.method+"_ystar.csv", self.unpreproclabels(self.ystars), delimiter=",", fmt='%s')
#        np.savetxt(self.method+"_labels.csv", self.unpreproclabels(labels_train), delimiter=",", fmt='%s')
            
        return self.ws
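
The shuffling block at the top of _sgd is a common reproducibility pattern; a compact, hedged restatement of it as a standalone helper:

import numpy as np
from sklearn.utils import check_random_state

def shuffled_copy(data, labels, random_state=None):
    # permute samples and labels together, reproducibly for a given seed
    rng = check_random_state(random_state)
    idx = rng.permutation(len(data))
    return data[idx], labels[idx]
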
Beispiel #48
0
    def check_params(self):
        assert isinstance(self.loss, AbstractLossFunction), \
            'LossFunction should be derived from AbstractLossFunction'
        assert self.n_estimators > 0, 'n_estimators should be positive'
        assert 0 < self.subsample <= 1., 'subsample should be in (0, 1]'
        self.random_state = check_random_state(self.random_state)