def _fit_start(self, X, partial=False):
    """Prepare the selector state before the evolutionary search starts.

    Registers the input features and, unless this call continues a
    previous fit, resets the trial history and builds a fresh toolbox
    together with the initial population.

    Parameters
    ----------
    X : object
        Training data; forwarded unchanged to ``self._set_features``.
    partial : bool, default False
        When True and a ``trials_`` attribute already exists, the current
        toolbox, generator and population are kept untouched.

    Returns
    -------
    self
    """
    self._set_features(X)

    if partial and hasattr(self, 'trials_'):
        # Continue from whatever the previous fit left behind.
        return self

    self._reset_trials()
    self.k_gen_ = 0

    # Init toolbox
    self.toolbox = base.Toolbox()
    self.rstate = check_random_state(self.random_state)

    # Define individual
    k_min = self.min_features_
    k_max = self.max_features_

    def get_individual():
        ind_size = self.rstate.choice(range(k_min, k_max + 1))
        # Draw the members from the selector's own generator; without it
        # the sample falls back to an unseeded source and a fixed
        # ``random_state`` no longer gives a reproducible population.
        return self.features_.sample(ind_size, random_state=self.rstate)

    self.toolbox.register("individual", get_individual)

    # Define population
    self.toolbox.register("population", tools.initRepeat, list,
                          self.toolbox.individual)
    self.population = self.toolbox.population(n=self.pop_size)

    return self
def cxOnePoint(ind1, ind2, indpb=0.5, random_state=None, drop_attrs=['score']):
    """One-point crossover applied over a randomly permuted gene order.

    The feature positions are shuffled, a cut point is drawn, and the
    genes that come before the cut (in shuffled order) are exchanged
    between the two parents. Each child gene at position ``j`` is taken
    from position ``j`` of one of the parents.

    Parameters
    ----------
    ind1, ind2 : individuals
        Parents; must expose ``mask``, ``n_features``, ``copy()`` and
        ``set_mask()``.
    indpb : float, default 0.5
        Unused; kept so the signature matches ``cxUniform``.
    random_state : None, int or RandomState
        Passed to ``check_random_state``.
    drop_attrs : list of str
        Attributes removed from the children when present. The default
        list is only read, never modified.

    Returns
    -------
    (child1, child2)
        Copies of the parents carrying the new masks and a ``parents``
        attribute.
    """
    rstate = check_random_state(random_state)

    n = ind1.n_features
    order = rstate.permutation(n)
    cut = rstate.randint(n)

    mask1 = np.zeros((n, ), dtype=bool)
    mask2 = np.zeros((n, ), dtype=bool)

    for rank, pos in enumerate(order):
        # Both parents are read at the same position ``pos``; reading
        # the first parent at ``rank`` would move its genes around
        # instead of recombining them.
        gene1 = ind1.mask[pos]
        gene2 = ind2.mask[pos]
        if cut <= rank:
            mask1[pos] = gene1
            mask2[pos] = gene2
        else:
            mask1[pos] = gene2
            mask2[pos] = gene1

    child1 = ind1.copy().set_mask(mask1)
    child2 = ind2.copy().set_mask(mask2)

    child1.parents = (ind1, ind2)
    child2.parents = (ind1, ind2)

    for attr in drop_attrs:
        for child in [child1, child2]:
            if hasattr(child, attr):
                delattr(child, attr)

    return child1, child2
def cxUniform(ind1, ind2, indpb=0.5, random_state=None, drop_attrs=['score']):
    """Uniform crossover: every gene is kept or exchanged independently.

    For each position one uniform number is drawn; when it is below
    ``indpb`` both children keep their own parent's gene, otherwise the
    two genes are exchanged.

    Returns a ``(child1, child2)`` tuple of parent copies carrying the
    new masks and a ``parents`` attribute, with the attributes named in
    ``drop_attrs`` removed when present.
    """
    rng = check_random_state(random_state)

    genes1 = []
    genes2 = []
    for own, other in zip(ind1.mask, ind2.mask):
        keep = rng.rand() < indpb
        genes1.append(own if keep else other)
        genes2.append(other if keep else own)

    child1 = ind1.copy().set_mask(genes1)
    child2 = ind2.copy().set_mask(genes2)

    both_parents = (ind1, ind2)
    child1.parents = both_parents
    child2.parents = both_parents

    for attr in drop_attrs:
        for child in (child1, child2):
            if hasattr(child, attr):
                delattr(child, attr)

    return child1, child2
def __init__(self, param_grid, n_evaluations=10, maximize=True, random_state=None):
    """
    Random search over parameters: each next point is generated
    independently with scikit-learn's ``ParameterSampler``.

    :param_grid: dict with distributions used to sample each parameter.
        name -> list of possible values (in which case sampled uniformly from options)
        name -> distribution (should implement '.rvs()' as scipy distributions)
    :param int n_evaluations: number of points the sampler will produce
    :param bool maximize: ignored parameter, added for uniformity
    :param random_state: seed or generator; used both for
        ``self.random_state`` and for the sampler

    NB: this is the only optimizer, which supports passing distributions for parameters.
    """
    # ``sklearn.grid_search`` no longer exists in current scikit-learn;
    # the sampler lives in ``sklearn.model_selection``. The old location
    # is kept as a fallback for installations that predate the move.
    try:
        from sklearn.model_selection import ParameterSampler
    except ImportError:
        from sklearn.grid_search import ParameterSampler

    self.maximize = maximize
    self.param_grid = OrderedDict(param_grid)
    self.n_evaluations = n_evaluations
    self.random_state = check_random_state(random_state)
    self.indices_to_parameters_ = OrderedDict()
    self.grid_scores_ = OrderedDict()
    self.queued_tasks_ = set()
    self.param_sampler = iter(
        ParameterSampler(param_grid, n_iter=n_evaluations,
                         random_state=random_state))
def run(self, dataset_name, random_state=42):
    """Run the experiment ``self.n_runs`` times for every labelled ratio.

    NOTE(review): the indentation of this method was lost in the source;
    the nesting below (everything after the stream check lives inside the
    ratio loop) is a reconstruction and should be confirmed against the
    original file. With this nesting nothing happens for datasets whose
    ``config['dataset']['is_stream']`` is falsy.

    Parameters
    ----------
    dataset_name : object
        Forwarded to ``fetch_load_data``.
    random_state : int, default 42
        Base seed; the seed of run ``n_run`` is ``random_state * n_run``.
    """
    config = self.config
    X_train, y_train, X_test, y_test = fetch_load_data(dataset_name)
    for n_run in range(self.n_runs):
        # The product means run 0 always uses seed 0, whatever the base
        # seed is.
        seed_run = random_state * n_run
        logger.info(
            '\n\nRANDOM SEED = {} for data split.'.format(seed_run))
        rng = check_random_state(seed_run)
        if config['dataset']['is_stream']:
            logger.info('Dataset is a stream. Sampling observed labels.')
            # Just randomly sample ratio_labeled samples for mask_labeled
            n_burn_in = config['data']['n_burn_in_stream']
            for ratio_labeled in self.ratio_labeled_values:
                # The shared config dict is updated in place for each ratio.
                config['data']['stream']['ratio_labeled'] = ratio_labeled
                n_labeled = int(ratio_labeled * len(y_train))
                # The same generator is reused across ratios, so each
                # ratio draws a different set of labelled indices.
                ind_labeled = rng.choice(len(y_train), n_labeled,
                                         replace=False)
                mask_labeled = np.zeros(len(y_train), dtype=bool)
                mask_labeled[ind_labeled] = True
                X_run, y_run = X_train, y_train
                config['data']['n_burn_in'] = n_burn_in
                config.setdefault('options', {})
                config['options']['random_state'] = seed_run
                self.pre_single_run(X_run, y_run, mask_labeled, n_burn_in,
                                    seed_run, X_test, y_test, n_run)
def _check_params(self):
    """Validate the constructor arguments and materialise the generator.

    Asserts that ``loss`` is an ``AbstractLossFunction``, that
    ``n_estimators`` is positive and that ``subsample`` lies in (0, 1],
    then replaces ``self.random_state`` by the object returned from
    ``check_random_state``.
    """
    assert isinstance(self.loss, AbstractLossFunction), (
        'LossFunction should be derived from AbstractLossFunction')
    assert self.n_estimators > 0, (
        'n_estimators should be positive')
    assert 0 < self.subsample <= 1., (
        'subsample should be in (0, 1]')

    self.random_state = check_random_state(self.random_state)
def _sgd(self, data, labels):
    """Fit the coefficient vector with stochastic gradient descent.

    The rows are shuffled once, then visited one at a time for at most
    ``self.max_iters`` epochs. After every epoch the learning rate is
    multiplied by ``self.decay``. Training stops early when the value
    returned by ``rlcl`` changes by less than 1e-8 over an epoch.

    Side effects: sets ``self.converged_``, ``self.lcl_`` and
    ``self.rlcl_`` and prints a one-line convergence message.

    Parameters
    ----------
    data : ndarray, shape (n, k)
    labels : ndarray, shape (n,)

    Returns
    -------
    betas : ndarray, shape (k,)

    NOTE(review): ``lcl`` / ``rlcl`` are defined elsewhere; presumably the
    plain and regularised objective -- confirm.
    """
    # shuffle data
    n, k = data.shape
    idx = np.arange(n)
    state = check_random_state(self.random_state)
    state.shuffle(idx)
    data = data[idx]
    labels = labels[idx]

    betas = np.zeros(k)
    rate = self.rate
    self.converged_ = False

    for epoch in range(self.max_iters):
        old_lcl = rlcl(data, labels, betas, self.mu)
        for x, y in zip(data, labels):
            betas = self._sgd_update(betas, x, y, rate)
        new_lcl = rlcl(data, labels, betas, self.mu)
        if np.abs(new_lcl - old_lcl) < 1e-8:
            self.converged_ = True
            break
        rate = rate * self.decay

    # Single-argument print calls behave the same on Python 2 and 3,
    # whereas the print statement does not parse on Python 3.
    if self.converged_:
        print("converged after {} epochs".format(epoch))
    else:
        print("did not converge")

    self.lcl_ = lcl(data, labels, betas)
    self.rlcl_ = rlcl(data, labels, betas, self.mu)
    return betas
def fit(self, X, y, sample_weight=None):
    """Train ``self.n_estimators`` regression trees with post-pruning.

    For every tree the data are split anew into a training and a holdout
    part; the tree is fitted on the training part and then passed to
    ``self.update_terminal_regions`` together with the holdout part.

    NOTE: ``score`` is never updated inside the loop, so every tree is
    fitted against the same target ``2 * y - 1``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    sample_weight : array-like or None

    Returns
    -------
    self
        Returned so the call can be chained, as the other ``fit``
        methods do; previously nothing was returned.
    """
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense",
                        check_ccontiguous=True)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)
    sample_weight = normalize_weight(y, sample_weight,
                                     sig_weight=self.sig_weight)
    self.random_state = check_random_state(self.random_state)

    self.estimators = []
    self.w_sig = []
    self.w_bck = []

    score = numpy.zeros(len(X), dtype=float)
    y_signed = 2 * y - 1

    for _ in range(self.n_estimators):
        residual = y_signed

        # Names with a leading underscore are produced by the split but
        # not needed afterwards.
        (trainX, testX, _trainY, testY, trainW, testW,
         trainR, _testR, _trainS, testS) = train_test_split(
            X, y, sample_weight, residual, score,
            train_size=self.train_part, test_size=self.test_size,
            random_state=self.random_state)

        tree = DecisionTreeRegressor(criterion=self.criterion,
                                     splitter=self.splitter,
                                     max_depth=self.max_depth,
                                     min_samples_leaf=self.min_samples_leaf,
                                     max_features=self.max_features,
                                     random_state=self.random_state)

        # fitting
        tree.fit(trainX, trainR, sample_weight=trainW, check_input=False)

        # post-pruning
        self.update_terminal_regions(tree.tree_, testX, testY, testW, testS)

        self.estimators.append(tree)

    return self
def __init__(self, param_grid, n_evaluations=10, maximize=True, random_state=None):
    """
    Base initialiser for grid-search point generators.

    Stores the grid, derives the number of values per parameter and caps
    the number of evaluations by the size of the grid.

    :param OrderedDict param_grid: the grid with parameters to optimize on
    :param int n_evaluations: the number of evaluations to do
    :param maximize: whether algorithm should maximize or minimize target function.
    :param random_state: random generator
    :type random_state: int or RandomState or None
    """
    assert isinstance(param_grid, dict), \
        'the passed param_grid should be of OrderedDict class'
    self.param_grid = OrderedDict(param_grid)
    _check_param_grid(param_grid)

    # one entry per parameter: how many candidate values it has
    self.dimensions = [len(values) for values in self.param_grid.values()]
    n_points = numpy.prod(self.dimensions)
    assert n_points > 1, \
        'The space of parameters contains only %i points' % n_points
    self.n_evaluations = min(n_evaluations, n_points)

    # scores collected so far, keyed by evaluated point
    self.grid_scores_ = OrderedDict()
    self.maximize = maximize
    # every task that is running or finished
    self.queued_tasks_ = set()
    self.random_state = check_random_state(random_state)
    self.evaluations_done = 0
def _score(self, X, y):
    """Replace every mapped column of ``X`` by its weight-of-evidence value.

    Parameters
    ----------
    X : pandas.DataFrame
        Modified in place and returned.
    y : array-like or None
        Only its presence matters: noise is applied when ``y`` is given
        and ``self.randomized`` is true.

    Returns
    -------
    X : pandas.DataFrame

    Raises
    ------
    ValueError
        When ``handle_unknown == 'error'`` and a value has no mapping.
    """
    for switch in self.mapping:
        # Get column name (can be anything: str, number,...)
        column = switch.get('col')
        woe = switch.get('woe')
        source = X[column]

        # The scratch series shares X's index. With a default RangeIndex
        # the boolean masks below and the final write-back would not
        # line up for a frame whose index is not 0..n-1.
        transformed_column = pd.Series(np.nan, index=X.index, name=column)
        for val in woe:
            transformed_column.loc[source == val] = woe[val]

        # Replace missing values only in the computed columns
        if self.impute_missing:
            if self.handle_unknown == 'impute':
                transformed_column = transformed_column.fillna(0)
            elif self.handle_unknown == 'error':
                if transformed_column.isnull().any():
                    raise ValueError(
                        'Unexpected categories found in column %s'
                        % switch.get('col'))

        # Randomization is meaningful only for training data -> we do it
        # only if y is present
        if self.randomized and y is not None:
            random_state_generator = check_random_state(self.random_state)
            transformed_column = (
                transformed_column * random_state_generator.normal(
                    1., self.sigma, transformed_column.shape[0]))

        X[column] = transformed_column.astype(float)
    return X
def transform_leave_one_out(self, X_in, y, mapping=None):
    """
    Leave one out encoding uses a single column of floats to represent the means of the target variables.

    Works on a deep copy of ``X_in``. Without ``y`` each level is replaced
    by ``(sum + global mean) / (count + 1)`` taken from ``mapping``; with
    ``y`` each row is encoded from the running sum and running count of
    the rows of the same level that precede it.

    NOTE(review): the indentation was lost in the source; placing the
    ``-999.9`` fill inside ``if y is not None`` is a reconstruction --
    confirm against the original file.

    :param X_in: DataFrame to encode (left untouched)
    :param y: target Series, or None at prediction time
    :param mapping: dict ``column -> frame`` with ``sum`` and ``count``
        columns indexed by level
    :return: the encoded copy
    :raises ValueError: if ``handle_unknown == 'error'`` and an unseen
        level is present
    """
    X = X_in.copy(deep=True)
    random_state_ = check_random_state(self.random_state)

    # Prepare the data
    if y is not None:
        # Convert bools to numbers (the target must be summable)
        y = y.astype('double')

        # Cumsum and cumcount do not work nicely with None.
        # This is a terrible workaround that will fail, when the
        # categorical input contains -999.9
        for cat_col in X.select_dtypes('category').columns.values:
            X[cat_col] = X[cat_col].cat.add_categories(-999.9)
        X = X.fillna(-999.9)

    for col, colmap in mapping.items():
        level_notunique = colmap['count'] > 1

        # Masks are computed on the untouched input, i.e. before the
        # placeholder fill above.
        unique_train = colmap.index
        unseen_values = pd.Series([x for x in X_in[col].unique() if x not in unique_train])

        is_nan = X_in[col].isnull()
        is_unknown_value = X_in[col].isin(unseen_values.dropna())

        if self.handle_unknown == 'error' and is_unknown_value.any():
            raise ValueError('Columns to be encoded can not contain new values')

        if y is None:
            # Replace level with its mean target; if level occurs only once, use global mean
            level_means = ((colmap['sum'] + self._mean) / (colmap['count'] + 1)).where(level_notunique, self._mean)
            X[col] = X[col].map(level_means)
        else:
            # Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
            # The nice thing about this is that it helps to prevent overfitting. The bad thing
            # is that CatBoost uses many iterations over the data. But we run just one iteration.
            # Still, it works better than leave-one-out without any noise.
            # See:
            #   https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
            temp = y.groupby(X[col]).agg(['cumsum', 'cumcount'])
            # ``cumsum - y`` is the sum over the earlier rows of the level.
            X[col] = (temp['cumsum'] - y + self._mean) / (temp['cumcount'] + 1)

        if self.handle_unknown == 'value':
            X.loc[is_unknown_value, col] = self._mean
        elif self.handle_unknown == 'return_nan':
            X.loc[is_unknown_value, col] = np.nan

        if self.handle_missing == 'value':
            # The right-hand operand is a single bool: the mean is only
            # written when NaN was not among the training levels.
            X.loc[is_nan & unseen_values.isnull().any(), col] = self._mean
        elif self.handle_missing == 'return_nan':
            X.loc[is_nan, col] = np.nan

        # Multiplicative Gaussian noise, applied only when a target is given.
        if self.sigma is not None and y is not None:
            X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])

    return X
def transform_leave_one_out(self, X_in, y, mapping=None):
    """
    Leave one out encoding uses a single column of floats to represent the means of the target variables.

    Works on a deep copy of ``X_in``. Without ``y`` each level is replaced
    by ``(sum + global mean) / (count + 1)`` taken from ``mapping``; with
    ``y`` each row is encoded from the running sum and running count of
    the rows of the same level that precede it.

    NOTE(review): the indentation was lost in the source; placing the
    ``-999.9`` fill inside ``if y is not None`` is a reconstruction --
    confirm against the original file.

    :param X_in: DataFrame to encode (left untouched)
    :param y: target Series, or None at prediction time
    :param mapping: dict ``column -> frame`` with ``sum`` and ``count``
        columns indexed by level
    :return: the encoded copy
    :raises ValueError: if ``handle_unknown == 'error'`` and an unseen
        level is present
    """
    X = X_in.copy(deep=True)
    random_state_ = check_random_state(self.random_state)

    # Prepare the data
    if y is not None:
        # Convert bools to numbers (the target must be summable)
        y = y.astype('double')

        # Cumsum and cumcount do not work nicely with None.
        # This is a terrible workaround that will fail, when the
        # categorical input contains -999.9
        for cat_col in X.select_dtypes('category').columns.values:
            X[cat_col] = X[cat_col].cat.add_categories(-999.9)
        X = X.fillna(-999.9)

    for col, colmap in mapping.items():
        level_notunique = colmap['count'] > 1

        # Masks are computed on the untouched input, i.e. before the
        # placeholder fill above.
        unique_train = colmap.index
        unseen_values = pd.Series([x for x in X_in[col].unique() if x not in unique_train])

        is_nan = X_in[col].isnull()
        is_unknown_value = X_in[col].isin(unseen_values.dropna())

        if self.handle_unknown == 'error' and is_unknown_value.any():
            raise ValueError('Columns to be encoded can not contain new values')

        if y is None:
            # Replace level with its mean target; if level occurs only once, use global mean
            level_means = ((colmap['sum'] + self._mean) / (colmap['count'] + 1)).where(level_notunique, self._mean)
            X[col] = X[col].map(level_means)
        else:
            # Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
            # The nice thing about this is that it helps to prevent overfitting. The bad thing
            # is that CatBoost uses many iterations over the data. But we run just one iteration.
            # Still, it works better than leave-one-out without any noise.
            # See:
            #   https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
            temp = y.groupby(X[col]).agg(['cumsum', 'cumcount'])
            # ``cumsum - y`` is the sum over the earlier rows of the level.
            X[col] = (temp['cumsum'] - y + self._mean) / (temp['cumcount'] + 1)

        if self.handle_unknown == 'value':
            X.loc[is_unknown_value, col] = self._mean
        elif self.handle_unknown == 'return_nan':
            X.loc[is_unknown_value, col] = np.nan

        if self.handle_missing == 'value':
            # The right-hand operand is a single bool: the mean is only
            # written when NaN was not among the training levels.
            X.loc[is_nan & unseen_values.isnull().any(), col] = self._mean
        elif self.handle_missing == 'return_nan':
            X.loc[is_nan, col] = np.nan

        # Multiplicative Gaussian noise, applied only when a target is given.
        if self.sigma is not None and y is not None:
            X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])

    return X
def transform_leave_one_out(self, X_in, y, mapping=None):
    """
    Encode every mapped column by the target mean of its level.

    A deep copy of ``X_in`` is encoded and returned. At prediction time
    (``y is None``) a level becomes ``sum / count``; during training the
    row's own target is left out, giving ``(sum - y) / (count - 1)``.
    Levels seen only once fall back to the global mean ``self._mean``.
    Unknown and missing values follow ``handle_unknown`` and
    ``handle_missing``; with a target and a non-None ``sigma`` the result
    is multiplied by Gaussian noise centred on 1.
    """
    X = X_in.copy(deep=True)
    rng = check_random_state(self.random_state)

    for col, colmap in mapping.items():
        counts = colmap['count']
        repeated = counts > 1

        known_levels = colmap.index
        novel = pd.Series(
            [level for level in X[col].unique() if level not in known_levels],
            dtype=known_levels.dtype)

        missing_rows = X[col].isnull()
        unknown_rows = X[col].isin(novel.dropna().astype(object))

        if X[col].dtype.name == 'category':
            # pandas 0.24 keeps the categorical dtype through ``map``,
            # so switch to plain strings first
            X[col] = X[col].astype(str)

        if self.handle_unknown == 'error' and unknown_rows.any():
            raise ValueError(
                'Columns to be encoded can not contain new values')

        sums = colmap['sum']
        if y is None:
            # plain level mean; singleton levels use the global mean
            per_level = (sums / counts).where(repeated, self._mean)
            X[col] = X[col].map(per_level)
        else:
            # mean of the level without the current row's target
            row_sums = X[col].map(sums)
            row_counts = X[col].map(counts)
            held_out = (row_sums - y) / (row_counts - 1)
            # rows of singleton levels (division by zero above) get the
            # global mean
            has_peers = X[col].map(counts[repeated]).notnull()
            X[col] = held_out.where(has_peers, self._mean)

        if self.handle_unknown == 'value':
            X.loc[unknown_rows, col] = self._mean
        elif self.handle_unknown == 'return_nan':
            X.loc[unknown_rows, col] = np.nan

        if self.handle_missing == 'value':
            X.loc[missing_rows & novel.isnull().any(), col] = self._mean
        elif self.handle_missing == 'return_nan':
            X.loc[missing_rows, col] = np.nan

        if self.sigma is not None and y is not None:
            noise = rng.normal(1., self.sigma, X[col].shape[0])
            X[col] = X[col] * noise

    return X
def sample(self, size=None, random_state=None):
    """Return a copy of this object holding a random selection.

    With a truthy ``size`` exactly that many entries of ``self.features``
    are drawn without replacement; otherwise (``None`` or ``0``) a random
    boolean mask over all ``self.n_features`` positions is used.
    """
    rng = check_random_state(random_state)

    if not size:
        random_mask = rng.randint(0, 2, size=self.n_features, dtype=bool)
        return self.copy().set_mask(random_mask)

    chosen = rng.choice(self.features, size=size, replace=False)
    return self.copy().set_subset(chosen)
def _score(self, X, y):
    """Map each column in ``self.cols`` through its entry in ``self.mapping``.

    ``X`` is modified in place and returned. When ``self.randomized`` is
    true and a target is supplied, every mapped column is multiplied by
    Gaussian noise with mean 1 and standard deviation ``self.sigma``.
    """
    for name in self.cols:
        # score the column
        X[name] = X[name].map(self.mapping[name])

        # noise only makes sense on training data, i.e. when y is given
        if not (self.randomized and y is not None):
            continue
        generator = check_random_state(self.random_state)
        noise = generator.normal(1., self.sigma, X[name].shape[0])
        X[name] = (X[name] * noise)

    return X
def _check_params(self):
    """Resolve defaults and build the parameter generator.

    Falls back to ``SimpleParameterOptimizer`` and ``roc_auc_score`` when
    no generator type / score function was given, creates
    ``self.generator`` from the grid and the evaluation budget, clears
    both of those attributes, and checks that ``fold_checks`` does not
    exceed ``folds``.
    """
    generator_type = self.param_generator_type
    if generator_type is None:
        generator_type = self.param_generator_type = SimpleParameterOptimizer
    self.generator = generator_type(self.param_grid, self.n_evaluations)

    # the generator now owns these; drop them from the instance
    self.n_evaluations = None
    self.param_grid = None

    if self.score_function is None:
        self.score_function = roc_auc_score

    assert self.fold_checks <= self.folds, \
        "We cannot have more checks than folds"
    self.random_state = check_random_state(self.random_state)
def run(self, dataset_name, random_state=42):
    """Execute ``self.n_runs`` independent runs on one dataset.

    For each run the seed is ``random_state * n_run``. Stream datasets
    get a random labelled mask over the whole training set; other
    datasets are split into a burn-in part followed by the shuffled
    remainder. The shared config dict is updated in place with the
    burn-in size and the seed before ``self.pre_single_run`` is called.
    """
    config = self.config
    X_train, y_train, X_test, y_test = fetch_load_data(dataset_name)

    for n_run in range(self.n_runs):
        seed_run = random_state * n_run
        logger.info('\n\nRANDOM SEED = {} for data split.'.format(seed_run))
        rng = check_random_state(seed_run)

        if config['dataset']['is_stream']:
            logger.info('Dataset is a stream. Sampling observed labels.')
            # label a random fraction of the training samples
            n_burn_in = config['data']['n_burn_in_stream']
            fraction = config['data']['stream']['ratio_labeled']
            n_labeled = int(fraction * len(y_train))
            labeled_idx = rng.choice(len(y_train), n_labeled, replace=False)
            mask_labeled = np.zeros(len(y_train), dtype=bool)
            mask_labeled[labeled_idx] = True
            X_run = X_train
            y_run = y_train
        else:
            # burn-in part
            burn_in_idx, burn_in_mask = split_burn_in_rest(
                y_train, shuffle=True, seed=seed_run,
                **config['data']['burn_in'])
            X_burn_in = X_train[burn_in_idx]
            y_burn_in = y_train[burn_in_idx]

            # everything that is not burn-in
            is_rest = np.ones(len(X_train), dtype=bool)
            is_rest[burn_in_idx] = False
            X_rest = X_train[is_rest]
            y_rest = y_train[is_rest]
            rest_mask = split_labels_rest(
                y_rest, seed=seed_run, shuffle=True,
                **config['data']['stream'])

            # shuffle the remainder and append it to the burn-in part
            order = np.arange(len(y_rest))
            rng.shuffle(order)
            X_run = np.concatenate((X_burn_in, X_rest[order]))
            y_run = np.concatenate((y_burn_in, y_rest[order]))
            mask_labeled = np.concatenate((burn_in_mask, rest_mask[order]))
            n_burn_in = len(y_burn_in)

        config['data']['n_burn_in'] = n_burn_in
        config.setdefault('options', {})
        config['options']['random_state'] = seed_run
        self.pre_single_run(X_run, y_run, mask_labeled, n_burn_in,
                            seed_run, X_test, y_test, n_run)
def blocksplit_trajs(trajs, lag=1, sliding=True, shift=None, random_state=None):
    """ Cuts trajectories into overlapping, approximately uncorrelated blocks.

    Every block has at least ``lag`` frames and consecutive blocks
    overlap so that transition counts at the given lag are conserved.
    With ``sliding=True`` the blocks give exactly the same count matrix
    as the input trajectories; with ``sliding=False`` (sampling at lag)
    this only holds for ``shift=0``. Trajectories not longer than ``lag``
    are skipped.

    Parameters
    ----------
    trajs : list of ndarray(int)
        Trajectories
    lag : int
        Lag time at which counting will be done.
    sliding : bool
        True when the blocks are meant for a sliding-window count, False
        when lag-sampling will be applied
    shift : None or int
        Start of the first full lag window. Drawn at random per
        trajectory when None.
    random_state : None or int or np.random.RandomState
        Random seed to use.

    Returns
    -------
    blocks : list of ndarray
        The blocks.

    """
    from sklearn.utils.random import check_random_state
    rng = check_random_state(random_state)

    # 2*lag frames per block for sliding counts, lag+1 for lag-sampling
    width = 2 * lag if sliding else lag + 1

    blocks = []
    for traj in trajs:
        n_frames = len(traj)
        if n_frames <= lag:
            continue

        if shift is None:
            start = rng.randint(min(lag, traj.size - lag))
        else:
            start = shift

        if sliding and start > 0:
            # leading piece in front of the first full window
            blocks.append(traj[:lag + start])
        blocks.extend(traj[t0:t0 + width]
                      for t0 in range(start, n_frames - lag, lag))

    return blocks
def get_col_score(estimator, X, y, col, n_repeats=5, scoring=None, random_state=None):
    """Score the estimator ``n_repeats`` times with column `col` permuted.

    Resolves the scorer and the random generator, then delegates to
    ``_get_col_score`` and returns its result unchanged.
    """
    scorer = check_scoring(estimator, scoring=scoring)
    rng = check_random_state(random_state)
    return _get_col_score(estimator, X, y, col, n_repeats, scorer, rng)
def get_group_score(estimator, X, y, g, n_repeats=5, scoring=None, random_state=None):
    """Score the estimator ``n_repeats`` times with column group `g` permuted.

    Resolves the scorer and the random generator, then delegates to
    ``_get_group_score`` and returns its result unchanged.
    """
    scorer = check_scoring(estimator, scoring=scoring)
    rng = check_random_state(random_state)
    return _get_group_score(estimator, X, y, g, n_repeats, scorer, rng)
def fit(self, X, y, **kwargs):
    """Learn the per-level statistics used by the encoder.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.

    """
    # normalise the inputs first
    X = convert_input(X)
    y = pd.Series(y, name='target')
    assert X.shape[0] == y.shape[0]

    self._dim = X.shape[1]

    # without an explicit column list every string column is encoded
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.random_state_ = check_random_state(self.random_state)

    fitted = self.leave_one_out(
        X, y,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown)
    self.mapping = fitted[1]

    if not self.drop_invariant:
        return self

    # remember the columns whose encoded variance is (almost) zero
    self.drop_cols = []
    encoded = self.transform(X)
    self.drop_cols = [
        name for name in encoded.columns.values
        if encoded[name].var() <= 10e-5
    ]
    return self
def mutSubset(ind, indpb, random_state=None, drop_attrs=['score']):
    """Flip each gene of ``ind`` independently with probability ``indpb``.

    The new mask is applied to ``ind`` itself through ``set_mask`` (no
    copy is made here); the attributes named in ``drop_attrs`` are
    removed from the result when present.
    """
    rng = check_random_state(random_state)

    # one uniform draw per gene, XOR-ed into the current value
    flipped = [gene ^ (rng.rand() < indpb) for gene in ind.mask]
    mutant = ind.set_mask(flipped)

    for name in drop_attrs:
        if hasattr(mutant, name):
            delattr(mutant, name)

    return mutant
def _fit_start(self, X, partial=False):
    """Initialise the selector before the search loop.

    Always registers the features and validates ``k_features``. For a
    fresh (non-partial) fit it also creates the random generator, sets
    the starting subset (all features, or none when ``self.forward``)
    and clears the trial history.
    """
    self._set_features(X)
    self.k_features_ = _check_k_features(
        self.k_features, self.n_features_, 'k_features')

    if partial:
        return self

    self.rstate_ = check_random_state(self.random_state)
    self.subset_ = self.features_.copy()
    if self.forward:
        # forward selection starts from the empty subset
        self.subset_.set_subset([])
    self._reset_trials()

    return self
def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
    """
    Leave one out encoding uses a single column of floats to represent the means of the target variables.

    Works on a deep copy of ``X_in``. Without ``y`` a level is replaced by
    its stored ``mean``; with ``y`` it becomes
    ``(sum - y) / (count - 1)``, and levels with a single training row
    get the global mean ``self._mean``.

    :param X_in: DataFrame to encode (left untouched)
    :param y: target values aligned row by row with ``X_in``, or None
    :param mapping: list of ``{'col': name, 'mapping': {level: stats}}``
        where ``stats`` holds ``sum``, ``count`` and ``mean``
    :param impute_missing: whether unmapped values are handled at all
    :param handle_unknown: ``'impute'`` fills with the global mean,
        ``'error'`` raises
    :return: the encoded copy
    :raises ValueError: on unmapped values when ``handle_unknown == 'error'``
    """
    X = X_in.copy(deep=True)
    random_state_ = check_random_state(self.random_state)

    for switch in mapping:
        column = switch.get('col')
        levels = switch.get('mapping')

        # The scratch series uses X's index; a default RangeIndex would
        # not line up with the masks for frames indexed otherwise.
        transformed_column = pd.Series(np.nan, index=X.index, name=column)

        for val in levels:
            stats = levels[val]
            is_level = X[column] == val
            if not is_level.any():
                continue

            if y is None:
                transformed_column.loc[is_level] = stats['mean']
            elif stats['count'] == 1:
                transformed_column.loc[is_level] = self._mean
            else:
                # ``y`` is sliced by position and stripped of any index
                # so the assignment is positional as well.
                y_level = np.asarray(y[is_level.values])
                transformed_column.loc[is_level] = (
                    (stats['sum'] - y_level) / (stats['count'] - 1))

        if impute_missing:
            if handle_unknown == 'impute':
                transformed_column = transformed_column.fillna(self._mean)
            elif handle_unknown == 'error':
                if transformed_column.isnull().any():
                    raise ValueError(
                        'Unexpected categories found in column %s' % column)

        if self.randomized and y is not None:
            transformed_column = (
                transformed_column * random_state_.normal(
                    1., self.sigma, transformed_column.shape[0]))

        X[column] = transformed_column.astype(float)

    return X
def fit(self, X, y, **kwargs):
    """Learn the per-level statistics used by the encoder.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.

    """
    # normalise the inputs first
    X = convert_input(X)
    y = pd.Series(y, name='target')
    assert X.shape[0] == y.shape[0]

    self._dim = X.shape[1]

    # without an explicit column list every string column is encoded
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.random_state_ = check_random_state(self.random_state)

    fitted = self.leave_one_out(
        X, y,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown)
    self.mapping = fitted[1]

    if not self.drop_invariant:
        return self

    # remember the columns whose encoded variance is (almost) zero
    self.drop_cols = []
    encoded = self.transform(X)
    self.drop_cols = [
        name for name in encoded.columns.values
        if encoded[name].var() <= 10e-5
    ]
    return self
def _fit_start(self, X, y, groups):
    """Set up the search and evaluate one random starting subset.

    Creates the random generator, registers the features, clears the
    trial history, then draws a subset whose size is chosen uniformly
    between ``min_features_`` and ``max_features_`` (inclusive),
    evaluates it and stores it as ``self.subset_``.
    """
    # basic state
    self.rstate_ = check_random_state(self.random_state)
    self._set_features(X)
    self._reset_trials()

    # first trial
    allowed_sizes = range(self.min_features_, self.max_features_ + 1)
    n_selected = self.rstate_.choice(allowed_sizes)
    first = self.features_.sample(size=n_selected,
                                  random_state=self.rstate_)
    self.eval_subset(first, X, y, groups)
    self.subset_ = first

    return self
def __init__(self, param_grid, n_evaluations=10, random_state=None):
    """Store the grid and derive the search-space bookkeeping.

    :param dict param_grid: parameter name -> candidate values
    :param int n_evaluations: requested number of evaluations; capped by
        the number of points in the grid
    :param random_state: int, RandomState or None
    """
    assert isinstance(param_grid, dict), \
        'the passed param_grid should be of OrderedDict class'
    self.param_grid = OrderedDict(param_grid)
    _check_param_grid(param_grid)

    # one entry per parameter: how many candidate values it has
    self.dimensions = [len(values) for values in self.param_grid.values()]
    n_points = numpy.prod(self.dimensions)
    assert n_points > 1, \
        'The space of parameters contains only %i points' % n_points
    self.n_evaluations = min(n_evaluations, n_points)

    # scores collected so far, keyed by evaluated point
    self.grid_scores_ = OrderedDict()
    # every task that is running or finished
    self.queued_tasks_ = set()
    self.random_state = check_random_state(random_state)
    self.evaluations_done = 0
def group_permutation_importance(estimator, X, y, subset=None, scoring=None, n_repeats=5, n_jobs=None, random_state=0, tqdm=False):
    """Permutation importance of top-level column groups.

    The groups are the unique level-0 values of ``X.columns``. For every
    group in ``subset`` (all groups when empty or None) the score with
    the group permuted is obtained ``n_repeats`` times through
    ``_get_group_score`` and subtracted from the unpermuted score.

    Parameters
    ----------
    estimator : fitted estimator
    X : DataFrame
    y : array-like
    subset : list or None
        Groups to evaluate; must be unique and contained in ``X``.
    scoring : passed to ``check_scoring``
    n_repeats : int
    n_jobs : int or None
    random_state : int, RandomState or None
    tqdm : bool
        Show a notebook progress bar over the groups.

    Returns
    -------
    dict
        ``importances`` has one row per group of ``X`` (NaN for groups
        outside ``subset``); ``importances_mean`` / ``importances_std``
        are its row-wise mean and standard deviation; ``score`` is the
        unpermuted score.
    """
    columns = list(X.columns.get_level_values(0).unique())

    msg = "<subset> must contain only features from <X>"
    assert not subset or not set(subset) - set(columns), msg

    msg = "<subset> must contain only unique features"
    assert not subset or len(set(subset)) == len(subset), msg

    subset = subset if subset else columns
    # Only the scoring loop goes through the progress bar; ``subset``
    # stays a plain sequence for the index lookup further down.
    features = tqdm_notebook(subset) if tqdm else subset

    scorer = check_scoring(estimator, scoring=scoring)
    rstate = check_random_state(random_state)

    base_score = scorer(estimator, X, y)

    # FIXME: avoid <max_nbytes>
    scores = Parallel(n_jobs=n_jobs, max_nbytes='512M',
                      backend='multiprocessing')(
        delayed(_get_group_score)(estimator, X, y, feature, n_repeats,
                                  scorer, rstate)
        for feature in features)

    importances = np.full((len(columns), n_repeats), np.nan)
    ind = [columns.index(feature) for feature in subset]
    importances[ind] = base_score - np.array(scores)

    result = {
        'importances_mean': np.mean(importances, axis=1),
        'importances_std': np.std(importances, axis=1),
        'importances': importances,
        'score': base_score
    }
    return result
def perturb_subset(subset, step, random_state=None, drop_attrs=['score']):
    """Return a neighbour of ``subset`` obtained by toggling ``step`` features.

    ``step`` distinct features are drawn from ``subset.features``; the
    drawn features already in the subset are removed and the others are
    added. The result is a copy with the attributes in ``drop_attrs``
    removed and ``parents`` set to ``(subset,)``.
    """
    rng = check_random_state(random_state)
    drawn = set(rng.choice(subset.features, step, False))
    current = set(subset)

    to_remove = current & drawn
    to_add = drawn - current

    neighbour = subset.copy().remove(*to_remove).append(*to_add)

    for name in drop_attrs:
        if hasattr(neighbour, name):
            delattr(neighbour, name)

    neighbour.parents = (subset, )
    return neighbour
def _score(self, X, y):
    """Replace every mapped column of ``X`` by its weight-of-evidence value.

    Without ``y`` the ``woe`` table is used; with ``y`` rows with target
    1 take ``woe_positive`` and rows with target 0 take ``woe_negative``.
    The encoded column is built under a temporary ``<col>_tmp`` name and
    then renamed, so it ends up as the last column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Modified in place and returned.
    y : Series-like or None

    Raises
    ------
    ValueError
        When ``handle_unknown == 'error'`` and a value has no mapping.
    """
    for switch in self.mapping:
        # Get column name (can be anything: str, number,...)
        column = switch.get('col')
        tmp = str(column) + '_tmp'
        X[tmp] = np.nan

        # Scoring or training time?
        if y is None:
            woe = switch.get('woe')
            for val in woe:
                X.loc[X[column] == val, tmp] = woe[val]
        else:
            positive = switch.get('woe_positive')
            negative = switch.get('woe_negative')
            for val in switch.get('woe'):
                is_val = X[column] == val
                X.loc[is_val & (y == 1), tmp] = positive[val]
                X.loc[is_val & (y == 0), tmp] = negative[val]

        del X[column]
        X.rename(columns={tmp: column}, inplace=True)

        # Replace missing values only in the computed columns
        if self.impute_missing:
            if self.handle_unknown == 'impute':
                # Assign the filled column back: an in-place fillna on
                # ``X[column]`` acts on a temporary object and leaves X
                # untouched under pandas Copy-on-Write.
                X[column] = X[column].fillna(0)
            elif self.handle_unknown == 'error':
                if X[column].isnull().any():
                    raise ValueError(
                        'Unexpected categories found in column %s'
                        % switch.get('col'))

        # Randomization is meaningful only for training data -> we do it
        # only if y is present
        if self.randomized and y is not None:
            random_state_generator = check_random_state(self.random_state)
            X[column] = (X[column] * random_state_generator.normal(
                1., self.sigma, X[column].shape[0]))

    return X
def transform_leave_one_out(self, X_in, y, mapping=None):
    """
    Encode every mapped column by the target mean of its level.

    A deep copy of ``X_in`` is encoded and returned. At prediction time
    (``y is None``) a level becomes ``sum / count``; during training the
    row's own target is left out, giving ``(sum - y) / (count - 1)``.
    Levels seen only once fall back to the global mean ``self._mean``.
    Unknown and missing values follow ``handle_unknown`` and
    ``handle_missing``; with a target and a non-None ``sigma`` the result
    is multiplied by Gaussian noise centred on 1.
    """
    X = X_in.copy(deep=True)
    rng = check_random_state(self.random_state)

    for col, colmap in mapping.items():
        counts = colmap['count']
        repeated = counts > 1

        known_levels = colmap.index
        novel = pd.Series(
            [level for level in X[col].unique() if level not in known_levels])

        missing_rows = X[col].isnull()
        unknown_rows = X[col].isin(novel.dropna())

        if self.handle_unknown == 'error' and unknown_rows.any():
            raise ValueError('Columns to be encoded can not contain new values')

        sums = colmap['sum']
        if y is None:
            # plain level mean; singleton levels use the global mean
            per_level = (sums / counts).where(repeated, self._mean)
            X[col] = X[col].map(per_level)
        else:
            # mean of the level without the current row's target
            row_sums = X[col].map(sums)
            row_counts = X[col].map(counts)
            held_out = (row_sums - y) / (row_counts - 1)
            # rows of singleton levels (division by zero above) get the
            # global mean
            has_peers = X[col].map(counts[repeated]).notnull()
            X[col] = held_out.where(has_peers, self._mean)

        if self.handle_unknown == 'value':
            X.loc[unknown_rows, col] = self._mean
        elif self.handle_unknown == 'return_nan':
            X.loc[unknown_rows, col] = np.nan

        if self.handle_missing == 'value':
            X.loc[missing_rows & novel.isnull().any(), col] = self._mean
        elif self.handle_missing == 'return_nan':
            X.loc[missing_rows, col] = np.nan

        if self.sigma is not None and y is not None:
            noise = rng.normal(1., self.sigma, X[col].shape[0])
            X[col] = X[col] * noise

    return X
def _score(self, X, y):
    """Map each column in ``self.cols`` through ``self.mapping`` and impute.

    Parameters
    ----------
    X : pandas.DataFrame
        Modified in place and returned.
    y : array-like or None
        Only its presence matters: noise is applied when ``y`` is given
        and ``self.randomized`` is true.

    Raises
    ------
    ValueError
        When ``handle_unknown == 'error'`` and a value has no mapping.
    """
    for col in self.cols:
        # Score the column
        X[col] = X[col].map(self.mapping[col])

        # Replace missing values only in the computed columns
        if self.impute_missing:
            if self.handle_unknown == 'impute':
                # Assign the filled column back: an in-place fillna on
                # ``X[col]`` acts on a temporary object and leaves X
                # untouched under pandas Copy-on-Write.
                X[col] = X[col].fillna(0)
            elif self.handle_unknown == 'error':
                if X[col].isnull().any():
                    raise ValueError(
                        'Unexpected categories found in column %s' % col)

        # Randomization is meaningful only for training data -> we do it
        # only if y is present
        if self.randomized and y is not None:
            random_state_generator = check_random_state(self.random_state)
            X[col] = (X[col] * random_state_generator.normal(
                1., self.sigma, X[col].shape[0]))

    return X
def cvsplit_dtrajs(dtrajs, random_state=None):
    """ Randomly divides the trajectories into a training and a test set

    Half of the trajectories (rounded down) go to the training set, the
    remaining ones to the test set.

    Parameters
    ----------
    dtrajs : list of ndarray(int)
        Discrete trajectories
    random_state : None or int or np.random.RandomState
        Random seed to use.

    Returns
    -------
    (dtrajs_train, dtrajs_test) : two lists of trajectories

    Raises
    ------
    ValueError
        If exactly one trajectory is given.

    """
    from sklearn.utils.random import check_random_state
    n_trajs = len(dtrajs)
    if n_trajs == 1:
        raise ValueError('Only have a single trajectory. Cannot be split into train and test set')

    rng = check_random_state(random_state)
    train_idx = rng.choice(n_trajs, int(n_trajs / 2), replace=False)
    # complement of the training indices
    leftover = set(np.arange(n_trajs)) - set(train_idx)
    test_idx = np.array(list(leftover))

    dtrajs_train = [dtrajs[k] for k in train_idx]
    dtrajs_test = [dtrajs[k] for k in test_idx]
    return dtrajs_train, dtrajs_test
def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
    """
    Leave one out encoding uses a single column of floats to represent the means of the target variables.

    Works on a deep copy of ``X_in``. Without ``y`` a level becomes
    ``sum / count``; with ``y`` the row's own target is left out, giving
    ``(sum - y) / (count - 1)``. Levels seen only once use the global
    mean ``self._mean``.

    :param X_in: DataFrame to encode (left untouched)
    :param y: target Series aligned with ``X_in``, or None
    :param mapping: dict ``column -> frame`` with ``sum`` and ``count``
        columns indexed by level
    :param impute_missing: whether unmapped values are handled at all
    :param handle_unknown: ``'impute'`` fills with the global mean,
        ``'error'`` raises
    :return: the encoded copy
    :raises ValueError: on unmapped values when ``handle_unknown == 'error'``
    """
    X = X_in.copy(deep=True)
    random_state_ = check_random_state(self.random_state)

    for col, colmap in mapping.items():
        level_notunique = colmap['count'] > 1

        if y is None:
            # Replace level with its mean target; if level occurs only
            # once, use global mean
            level_means = (colmap['sum'] / colmap['count']).where(
                level_notunique, self._mean)
            X[col] = X[col].map(level_means)
        else:
            # Replace level with its mean target, calculated excluding
            # this row's target: (sum - y) / (count - 1)
            level_means = ((X[col].map(colmap['sum']) - y)
                           / (X[col].map(colmap['count']) - 1))
            # The 'where' fills in singleton levels (count = 1 -> div by
            # 0) with the global mean
            X[col] = level_means.where(
                X[col].map(colmap['count'][level_notunique]).notnull(),
                self._mean)

        if impute_missing:
            if handle_unknown == 'impute':
                # Assign the filled column back: an in-place fillna on
                # ``X[col]`` acts on a temporary object and leaves X
                # untouched under pandas Copy-on-Write.
                X[col] = X[col].fillna(self._mean)
            elif handle_unknown == 'error':
                if X[col].isnull().any():
                    raise ValueError(
                        'Unexpected categories found in column %s' % col)

        if self.sigma is not None and y is not None:
            X[col] = X[col] * random_state_.normal(
                1., self.sigma, X[col].shape[0])

    return X
def cvsplit_trajs(trajs, random_state=None):
    """ Splits the trajectories into a training and test set with approximately equal number of trajectories

    Half of the trajectories (rounded down) are drawn at random without
    replacement for the training set; every remaining trajectory goes to the
    test set.

    Parameters
    ----------
    trajs : list of ndarray(int)
        Discrete trajectories
    random_state : None or int or np.random.RandomState
        Random seed to use.

    Returns
    -------
    train_set : list
        Training trajectories, in the order they were drawn.
    test_set : list
        Remaining trajectories, in ascending order of their original index.

    Raises
    ------
    AssertionError
        If fewer than two trajectories are given.

    """
    n_trajs = len(trajs)
    if n_trajs < 2:
        # Raised explicitly instead of via `assert` so the check survives
        # `python -O`; the exception type is kept for existing callers.
        raise AssertionError('Only have a single trajectory. Cannot be split into train and test set')

    # check_random_state is part of the public sklearn.utils namespace.
    from sklearn.utils import check_random_state

    rng = check_random_state(random_state)
    train_idx = rng.choice(n_trajs, n_trajs // 2, replace=False)
    # setdiff1d returns the complement sorted, so the test-set order does not
    # depend on set iteration order.
    test_idx = np.setdiff1d(np.arange(n_trajs), train_idx)
    train_set = [trajs[i] for i in train_idx]
    test_set = [trajs[i] for i in test_idx]
    return train_set, test_set
def __init__(self, param_grid, n_evaluations=10, maximize=True, random_state=None):
    """
    Works in the same way as scikit-learn's randomized parameter search.
    Each next point is generated independently.

    :param_grid: dict with distributions used to sample each parameter.
        name -> list of possible values (in which case sampled uniformly from options)
        name -> distribution (should implement '.rvs()' as scipy distributions)
    :param int n_evaluations: number of parameter settings to sample
    :param bool maximize: ignored parameter, added for uniformity
    :param random_state: int | RandomState | None, forwarded to the sampler

    NB: this is the only optimizer, which supports passing distributions for parameters.
    """
    self.maximize = maximize
    self.param_grid = OrderedDict(param_grid)
    self.n_evaluations = n_evaluations
    self.random_state = check_random_state(random_state)
    self.indices_to_parameters_ = OrderedDict()
    self.grid_scores_ = OrderedDict()
    self.queued_tasks_ = set()
    # ParameterSampler lives in sklearn.model_selection; the old
    # sklearn.grid_search module was removed in scikit-learn 0.20 and is
    # only kept as a fallback for very old installations.
    try:
        from sklearn.model_selection import ParameterSampler
    except ImportError:
        from sklearn.grid_search import ParameterSampler
    self.param_sampler = iter(
        ParameterSampler(param_grid, n_iter=n_evaluations, random_state=random_state))
def fit(self, inp, y):
    """Train the ensemble on ``inp`` / ``y`` and return ``self``.

    Labels are re-encoded as indices into ``classes_``. When a pipeline is
    configured, the input is first passed through its ``fit_transform``.
    """
    # Start from a clean slate: no cached predictions from an earlier fit.
    self.precomputed_probs_ = self.precomputed_weights_ = None

    found_classes, y = unique(y, return_inverse=True)
    self.classes_ = found_classes
    self.n_classes_ = len(self.classes_)
    self.random_state_ = check_random_state(self.random_state)

    preprocessing = self.pipeline
    if preprocessing is not None:
        inp = preprocessing.fit_transform(inp)

    strategy = self.weighting_strategy
    strategy.prepare(inp, y)
    self.classifiers_ = self.training_strategy.train_estimators(
        self.n_estimators, inp, y, strategy, self.random_state_)

    # Training is noted to go through self.predict, which may have filled
    # the caches again -- drop them so later predictions start fresh.
    self.precomputed_probs_ = self.precomputed_weights_ = None
    return self
def _fit_start(self, X, partial=False): if not partial: self._reset_trials() if not partial and hasattr(self, 'random_state'): self.rstate_ = check_random_state(self.random_state) self._set_features(X) weights_vals = ['uniform', 'binomal'] if self.weights == 'binomal': self.weights_ = binomal_weights(self.min_features_, self.max_features_, self.n_features_) elif self.weights == 'uniform': self.weights_ = uniform_weights(self.min_features_, self.max_features_) else: raise ValueError(f'<weights> must be from {weights_vals}') return self
def __init__(self, param_grid, n_evaluations, random_state=None):
    """
    Base generator of new points at which the function (estimator) is evaluated.

    The number of evaluations is capped at half of the grid size; a
    ``UserWarning`` is issued when the requested number is reduced.

    :type param_grid: OrderedDict, the grid with parameters to optimize on
    :type n_evaluations: int, the number of evaluations to do
    :type random_state: int | RandomState | None
    """
    assert isinstance(param_grid, dict), 'the passed param_grid should be of OrderedDict class'

    grid = OrderedDict(param_grid)
    self.param_grid = grid
    _check_param_grid(grid)

    # One entry per parameter: how many candidate values it has.
    self.dimensions = [len(candidates) for candidates in grid.values()]
    size = numpy.prod(self.dimensions)
    assert size > 1, 'The space of parameters contains only %i points' % size

    if n_evaluations > size / 2:
        reduced = size // 2
        warn('The number of evaluations was decreased to %i' % reduced, UserWarning)
        n_evaluations = reduced
    self.n_evaluations = n_evaluations

    # results on different parameters
    self.grid_scores_ = OrderedDict()
    # all the tasks that are being computed or already computed
    self.queued_tasks_ = set()
    self.random_state = check_random_state(random_state)
    self.evaluations_done = 0
def _make_estimator(self, inp, y, sample_weights, random_state):
    """Return a fitted clone of ``base_estimator`` with its own seeded RNG.

    One integer below ``MAX_INT`` is drawn from ``random_state`` and turned
    into the generator handed to the clone before it is fitted on
    ``inp`` / ``y`` with ``sample_weights``.
    """
    # Draw the seed before anything else, as the original order does.
    child_seed = random_state.randint(MAX_INT)

    model = clone(self.base_estimator)
    child_rng = check_random_state(child_seed)
    model.set_params(random_state=child_rng)
    model.fit(inp, y, sample_weight=sample_weights)
    return model
def fit(self, X, y, sample_weight=None, neighbours_matrix=None):
    """Build a boosted classifier from the training set (X, y).

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The training input samples. When ``neighbours_matrix`` is None the
        columns ``self.uniform_variables`` are selected by label, so ``X``
        must support ``.loc`` (e.g. a pandas.DataFrame).

    y : array-like of shape = [n_samples]
        The target values (integers that correspond to classes).
        Only the labels 0 and 1 are accepted.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights. If None, the sample weights are initialized to
        ``1 / n_samples``.

    neighbours_matrix: array-like of shape [n_samples, n_neighbours],
        each row contains indices of signal neighbours
        (neighbours should be computed for background too),
        if None, this matrix is computed.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        For negative smoothing, non-positive ``n_estimators`` or
        ``learning_rate``, or an unsupported ``algorithm``.
    TypeError
        If the base estimator is not a ``BaseEstimator`` or lacks
        ``predict_proba`` while ``algorithm='SAMME.R'``.
    """
    if self.smoothing < 0:
        raise ValueError("Smoothing must be non-negative")
    if not isinstance(self.base_estimator, BaseEstimator):
        raise TypeError("estimator must be a subclass of BaseEstimator")
    if self.n_estimators <= 0:
        raise ValueError("n_estimators must be greater than zero.")
    if self.learning_rate <= 0:
        raise ValueError("learning_rate must be greater than zero")

    # Check that algorithm is supported
    if self.algorithm not in ('SAMME', 'SAMME.R'):
        raise ValueError("algorithm %s is not supported" % self.algorithm)
    if self.algorithm == 'SAMME.R':
        if not hasattr(self.base_estimator, 'predict_proba'):
            raise TypeError(
                "uBoostBDT with algorithm='SAMME.R' requires "
                "that the weak learner have a predict_proba method.\n"
                "Please change the base estimator or set "
                "algorithm='SAMME' instead.")

    assert np.in1d(y, [0, 1]).all(), \
        "only two-class classification is implemented, with labels 0 and 1"
    # Maps the uniform label from {0, 1} to {-1, +1}.
    self.signed_uniform_label = 2 * self.uniform_label - 1

    if neighbours_matrix is not None:
        assert np.shape(neighbours_matrix) == (len(X), self.n_neighbors), \
            "Wrong shape of neighbours_matrix"
        self.knn_indices = neighbours_matrix
    else:
        assert self.uniform_variables is not None, \
            "uniform_variables should be set"
        # `.ix` was removed in pandas 1.0; `.loc` is the label-based
        # replacement for selecting the uniform columns.
        self.knn_indices = compute_knn_indices_of_same_class(
            X.loc[:, self.uniform_variables], y, self.n_neighbors)

    sample_weight = commonutils.check_sample_weight(y, sample_weight=sample_weight, normalize=True)
    assert np.all(sample_weight >= 0.), 'the weights should be non-negative'

    # Clear any previous fit results
    self.estimators_ = []
    self.estimator_weights_ = []
    # score cuts correspond to
    # global efficiency == target_efficiency on each iteration.
    self.score_cuts_ = []

    X_train_variables = self.get_train_vars(X)
    X_train_variables, y, sample_weight = check_xyw(X_train_variables, y, sample_weight)

    # A dictionary to keep all intermediate weights, efficiencies and so on
    if self.keep_debug_info:
        self.debug_dict = defaultdict(list)

    self.random_generator = check_random_state(self.random_state)

    self._boost(X_train_variables, y, sample_weight)

    # Final cut, recomputed from the full model's scores on X.
    self.score_cut = self.signed_uniform_label * compute_cut_for_efficiency(
        self.target_efficiency, y == self.uniform_label,
        self.predict_score(X) * self.signed_uniform_label)
    assert np.allclose(self.score_cut, self.score_cuts_[-1], rtol=1e-10, atol=1e-10), \
        "score cut doesn't appear to coincide with the staged one"
    assert len(self.estimators_) == len(self.estimator_weights_) == len(self.score_cuts_)
    return self
def fit(self, X, y, sample_weight=None):
    """Fit a sequence of regression trees on (periodically regenerated) shuffled data.

    Each iteration fits a ``DecisionTreeRegressor`` to the negative gradient
    of ``self.loss`` on a random in-bag subsample of ``iter_X`` (the data
    produced by the ``Shuffler``), optionally updates the tree leaves through
    the loss, and adds the tree's prediction on the original ``X`` to the
    running prediction ``y_pred``.

    :param X: training data, passed to ``Shuffler`` and ``check_arrays``
    :param y: targets, converted with ``column_or_1d``
    :param sample_weight: optional weights, validated by ``check_sample_weight``
    :return: self

    Side effects: ``self.random_state`` is replaced by the checked generator,
    ``self.loss`` by a fitted shallow copy; ``self.classifiers``,
    ``self.learning_rates`` and ``self.loss_values`` are rebuilt.
    """
    # The shuffler receives the raw random_state, before it is converted below.
    shuffler = Shuffler(X, random_state=self.random_state)
    X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
    y = column_or_1d(y, warn=True)
    n_samples = len(X)
    # Number of samples used to fit each tree.
    n_inbag = int(self.subsample * n_samples)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
    self.random_state = check_random_state(self.random_state)

    # skipping all checks
    assert self.update_on in ['all', 'same', 'other', 'random']
    y_pred = numpy.zeros(len(y), dtype=float)

    self.classifiers = []
    self.learning_rates = []
    self.loss_values = []
    # Work on a copy so that fitting does not touch the loss object passed in.
    self.loss = copy.copy(self.loss)
    self.loss.fit(X, y, sample_weight=sample_weight)
    iter_X = shuffler.generate(0.)

    prev_smearing = 1
    for iteration in range(self.n_estimators):
        # Every `recount_step` iterations -- and only while the previous
        # smearing was positive -- regenerate the shuffled data and rebuild
        # y_pred from all trees fitted so far.
        # NOTE(review): nesting of the two y_pred lines below was
        # reconstructed from flattened source -- confirm.
        if iteration % self.recount_step == 0:
            if prev_smearing > 0:
                iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                prev_smearing = iter_smearing
                iter_X = shuffler.generate(iter_smearing)
                iter_X, = check_arrays(iter_X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
                y_pred = numpy.zeros(len(y))
                y_pred += sum(cl.predict(X) * rate for rate, cl in zip(self.learning_rates, self.classifiers))

        # Loss before this iteration's tree is added.
        self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
        tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
            min_samples_split=self.min_samples_split,
            min_samples_leaf=interpolate(self.min_samples_leaf, iteration, self.n_estimators, use_log=True),
            max_features=self.max_features,
            random_state=self.random_state)

        sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
        # The sample weights enter exactly one place: either the gradient
        # (weights_in_loss) or the tree fit; the other one gets all ones.
        loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
        tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))
        residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

        tree.fit(numpy.array(iter_X)[sample_mask, :],
                 residual[sample_mask],
                 sample_weight=tree_weight[sample_mask], check_input=False)
        # update tree leaves
        if self.update_tree:
            # Choose the samples used for the leaf update: everything, the
            # in-bag samples, the out-of-bag samples, or a fresh random draw.
            if self.update_on == 'all':
                update_mask = numpy.ones(len(sample_mask), dtype=bool)
            elif self.update_on == 'same':
                update_mask = sample_mask
            elif self.update_on == 'other':
                update_mask = ~sample_mask
            else:  # random
                update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y, residual=residual, pred=y_pred,
                                              sample_mask=update_mask, sample_weight=sample_weight)
        iter_learning_rate = interpolate(self.learning_rate, iteration, self.n_estimators, use_log=True)
        # Predictions are accumulated on the original X, not on iter_X.
        y_pred += iter_learning_rate * tree.predict(X)
        self.classifiers.append(tree)
        self.learning_rates.append(iter_learning_rate)

    return self
def fit(self, X, y, sample_weight=None, neighbours_matrix=None):
    """Build a boosted classifier from the training set (X, y).

    :param X: array-like of shape [n_samples, n_features]; when
        ``neighbours_matrix`` is None the columns ``self.uniform_features``
        are selected by label, so ``X`` must support ``.loc``
        (e.g. a pandas.DataFrame)
    :param y: labels, array of shape [n_samples] with 0 and 1.
    :param sample_weight: array-like of shape [n_samples] or None
    :param neighbours_matrix: array-like of shape [n_samples, n_neighbours],
        each row contains indices of signal neighbours
        (neighbours should be computed for background too),
        if None, this matrix is computed.
    :return: self
    :raises ValueError: negative smoothing, non-positive ``n_estimators`` or
        ``learning_rate``, unsupported ``algorithm``
    :raises TypeError: base estimator is not a ``BaseEstimator`` or lacks
        ``predict_proba`` while ``algorithm='SAMME.R'``
    """
    if self.smoothing < 0:
        raise ValueError("Smoothing must be non-negative")
    # Fill in the default *before* the type check: with the check first,
    # base_estimator=None always raised TypeError and the default below
    # could never be reached.
    if self.base_estimator is None:
        self.base_estimator = DecisionTreeClassifier(max_depth=2)
    if not isinstance(self.base_estimator, BaseEstimator):
        raise TypeError("estimator must be a subclass of BaseEstimator")
    if self.n_estimators <= 0:
        raise ValueError("n_estimators must be greater than zero.")
    if self.learning_rate <= 0:
        raise ValueError("learning_rate must be greater than zero")

    # Check that algorithm is supported
    if self.algorithm not in ('SAMME', 'SAMME.R'):
        raise ValueError("algorithm %s is not supported" % self.algorithm)
    if self.algorithm == 'SAMME.R':
        if not hasattr(self.base_estimator, 'predict_proba'):
            raise TypeError(
                "uBoostBDT with algorithm='SAMME.R' requires "
                "that the weak learner have a predict_proba method.\n"
                "Please change the base estimator or set algorithm='SAMME' instead.")

    assert np.in1d(y, [0, 1]).all(), \
        "only two-class classification is implemented, with labels 0 and 1"
    # Maps the uniform label from {0, 1} to {-1, +1}.
    self.signed_uniform_label = 2 * self.uniform_label - 1

    if neighbours_matrix is not None:
        assert np.shape(neighbours_matrix) == (len(X), self.n_neighbors), \
            "Wrong shape of neighbours_matrix"
        self.knn_indices = neighbours_matrix
    else:
        # The message now names the attribute that is actually checked.
        assert self.uniform_features is not None, \
            "uniform_features should be set"
        # `.ix` was removed in pandas 1.0; `.loc` is the label-based
        # replacement for selecting the uniform columns.
        self.knn_indices = compute_knn_indices_of_same_class(
            X.loc[:, self.uniform_features], y, self.n_neighbors)

    sample_weight = commonutils.check_sample_weight(y, sample_weight=sample_weight, normalize=True)
    assert np.all(sample_weight >= 0.), 'the weights should be non-negative'

    # Clear any previous fit results
    self.estimators_ = []
    self.estimator_weights_ = []
    # score cuts correspond to
    # global efficiency == target_efficiency on each iteration.
    self.score_cuts_ = []

    x_train_features = self._get_train_features(X)
    x_train_features, y, sample_weight = check_xyw(x_train_features, y, sample_weight)

    self.random_state_ = check_random_state(self.random_state)

    self._boost(x_train_features, y, sample_weight)

    # Final cut, recomputed from the full model's decision function on X.
    self.score_cut = self.signed_uniform_label * compute_cut_for_efficiency(
        self.target_efficiency, y == self.uniform_label,
        self.decision_function(X) * self.signed_uniform_label)
    assert np.allclose(self.score_cut, self.score_cuts_[-1], rtol=1e-10, atol=1e-10), \
        "score cut doesn't appear to coincide with the staged one"
    assert len(self.estimators_) == len(self.estimator_weights_) == len(self.score_cuts_)
    return self
def _sgd(self, data, labels):
    """Fit the weight vector ``self.ws`` by stochastic gradient descent.

    30% of the data is held out for validation; the remaining samples are
    shuffled and fed one at a time to ``self._sgd_update``. The mean tag
    accuracy on the validation split is checked every 1000 samples and at
    the end of every epoch. Training stops as soon as that score exceeds
    0.85, or is positive but no better than the previously recorded score,
    or after ``self.max_iters`` epochs. The learning rate is multiplied by
    ``self.decay`` after each completed epoch.

    Progress is printed to stdout and ``self.converged_`` records whether an
    early stop happened.

    NOTE: this is Python 2 code (``print`` statements, ``xrange``).

    :param data: indexable samples (indexed with an integer array below)
    :param labels: labels aligned with ``data``
    :return: the fitted weight vector ``self.ws``
    """
    # Split off the validation set. The split seed is fixed to 0 and does
    # not depend on self.random_state.
    data_train, data_valid, labels_train, labels_valid = \
        train_test_split(data, labels, test_size=0.3, random_state=0)

    # Shuffle the training data (this part does honour self.random_state).
    n = len(data_train)
    idx = np.arange(n)
    state = check_random_state(self.random_state)
    state.shuffle(idx)
    data_train = data_train[idx]
    labels_train = labels_train[idx]

    labels_valid=self.preproclabels(labels_valid)
    labels_train=self.preproclabels(labels_train)

    # One slot per training sample, initialised to None.
    # presumably per-sample caches filled by _sgd_update -- TODO confirm
    self.ADict = dict.fromkeys(range(n))
    self.BDict = dict.fromkeys(range(n))

    self.ws = np.zeros(ffs.numJ)
    rate = self.rate
    self.converged_ = False
    old_score = 0
    # Score histories; only consumed by the commented-out dumps at the end.
    epochscores = []
    minorscores = []
    for epoch in xrange(self.max_iters):
        for i, (x, y) in enumerate(zip(data_train, labels_train)):
            self.ws = self._sgd_update(self.ws, i, x, y, rate)
            # Intermediate validation check every 1000 samples.
            if i>0 and i%1000 == 0:
                prediction = self.predict(data_valid)
                tagscores = self.tagAccuracy(labels_valid, prediction)
                score = np.mean(tagscores)
                minorscores.append(score)
                print "sample:{}".format(i),score,max(tagscores),min(tagscores)#,self.ws
                sys.stdout.flush()
                if score > 0.85 or (score > 0 and score <= old_score):#np.abs(score - old_score) < 1e-8:
                    self.converged_ = True
                    break
                old_score = score
        # The break above only leaves the sample loop; leave the epoch loop too.
        if self.converged_ == True:
            break
        # End-of-epoch validation check, same stopping rule as above.
        prediction = self.predict(data_valid)
        tagscores = self.tagAccuracy(labels_valid, prediction)
        score = np.mean(tagscores)
        epochscores.append(score)
        print "epoch:{}".format(epoch),score,max(tagscores),min(tagscores)#,self.ws
        sys.stdout.flush()
        if score > 0.85 or (score > 0 and score <= old_score):#np.abs(score - old_score) < 1e-8:
            self.converged_ = True
            break
        rate = rate * self.decay
        old_score = score
    if self.converged_:
        print "converged after {} epochs".format(epoch)
    else:
        print "did not converge"
    # Disabled debugging dumps of the score histories and label sequences:
    # np.savetxt(self.method+"_tagscore_per_epoch.csv", epochscores, delimiter=",", fmt='%1.4e')
    # np.savetxt(self.method+"_tagscore_minor.csv", minorscores, delimiter=",", fmt='%1.4e')
    #
    # if self.method == "collins":
    #     np.savetxt(self.method+"_yhat.csv", self.unpreproclabels(self.yhats), delimiter=",", fmt='%s')
    # elif self.method == "cd":
    #     np.savetxt(self.method+"_ystar.csv", self.unpreproclabels(self.ystars), delimiter=",", fmt='%s')
    # np.savetxt(self.method+"_labels.csv", self.unpreproclabels(labels_train), delimiter=",", fmt='%s')
    return self.ws
def check_params(self):
    """Sanity-check the hyper-parameters and normalise ``random_state``.

    Asserts that the loss derives from ``AbstractLossFunction``, that
    ``n_estimators`` is positive and that ``subsample`` lies in (0, 1];
    afterwards ``self.random_state`` holds the object returned by
    ``check_random_state``.
    """
    assert isinstance(self.loss, AbstractLossFunction), (
        'LossFunction should be derived from AbstractLossFunction')
    assert self.n_estimators > 0, (
        'n_estimators should be positive')
    assert 0 < self.subsample <= 1., (
        'subsample should be in (0, 1]')

    generator = check_random_state(self.random_state)
    self.random_state = generator