# `np`, `pytest`, and the project's own `NeuralNetwork` class are assumed to be
# imported at module level in the original test file.
def test_neuralNetwork_adam():
    # Compare the custom network's Adam step against scikit-learn's implementation.
    from sklearn.neural_network._stochastic_optimizers import AdamOptimizer

    np.random.seed(2019)
    X = np.random.normal(size=(1, 500))
    target = 3.9285985 * X

    nn = NeuralNetwork(inputs=1,
                       neurons=3,
                       outputs=1,
                       activations='sigmoid',
                       silent=True)
    nn.addLayer()
    nn.addLayer()
    nn.addOutputLayer(activations='identity')
    learning_rate = 0.001

    yhat = nn.forward_pass(X)
    nn.backpropagation(yhat.T, target.T)
    nn.learning_rate = learning_rate
    nn.initializeAdam()
    nn.adam()

    skl_adam = AdamOptimizer(params=nn.param, learning_rate_init=learning_rate)
    upd = skl_adam._get_updates(nn.grad)

    for update_nn, update_skl in zip(nn.change, upd):
        assert update_nn == pytest.approx(update_skl)
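# For readers unfamiliar with the private optimizer API exercised above, the short
# sketch below shows the entry point the test compares against. This is a minimal,
# hedged example: it assumes the older-style constructor and `_get_updates(grads)`
# signature used throughout these snippets (newer scikit-learn releases keep the
# module private and pass `params` explicitly to `update_params`).
import numpy as np
from sklearn.neural_network._stochastic_optimizers import AdamOptimizer

params = [np.ones((3, 2)), np.zeros(2)]            # one weight matrix, one bias vector
grads = [0.1 * np.ones((3, 2)), 0.1 * np.ones(2)]  # matching gradients

opt = AdamOptimizer(params, learning_rate_init=0.001)
updates = opt._get_updates(grads)   # per-parameter deltas, as inspected in the test above
# opt.update_params(grads)          # alternatively, apply a step to opt.params in place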
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.neural_network._stochastic_optimizers import AdamOptimizer

# `shapes` is assumed to be defined at module scope in the original test file,
# e.g. a list of parameter shapes such as [(4, 6), (6, 8), (7, 8, 9)].
def test_adam_optimizer():
    params = [np.zeros(shape) for shape in shapes]
    lr = 0.001
    epsilon = 1e-8

    for beta_1 in np.arange(0.9, 1.0, 0.05):
        for beta_2 in np.arange(0.995, 1.0, 0.001):
            optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon)

            # seed the optimizer with known moment estimates and time step
            ms = [np.random.random(shape) for shape in shapes]
            vs = [np.random.random(shape) for shape in shapes]
            t = 10
            optimizer.ms = ms
            optimizer.vs = vs
            optimizer.t = t - 1
            grads = [np.random.random(shape) for shape in shapes]

            # hand-computed Adam update for comparison
            ms = [beta_1 * m + (1 - beta_1) * grad
                  for m, grad in zip(ms, grads)]
            vs = [beta_2 * v + (1 - beta_2) * (grad ** 2)
                  for v, grad in zip(vs, grads)]
            learning_rate = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
            updates = [-learning_rate * m / (np.sqrt(v) + epsilon)
                       for m, v in zip(ms, vs)]
            expected = [param + update
                        for param, update in zip(params, updates)]

            optimizer.update_params(grads)
            for exp, param in zip(expected, optimizer.params):
                assert_array_equal(exp, param)
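# For reference, the bias-corrected Adam step that both tests above verify can be
# restated in a few lines of plain NumPy. This is a self-contained sketch of the
# formulas only (names chosen here for illustration); it does not depend on
# scikit-learn's optimizer classes.
import numpy as np

def adam_step(param, grad, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8):
    """One Adam update, matching the expressions checked in the tests above."""
    m = beta_1 * m + (1 - beta_1) * grad                        # first-moment estimate
    v = beta_2 * v + (1 - beta_2) * grad ** 2                   # second-moment estimate
    lr_t = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)    # bias-corrected step size
    return param - lr_t * m / (np.sqrt(v) + eps), m, v

# toy usage: drive a single scalar parameter of f(p) = p**2 toward its minimum
p, m, v = 1.0, 0.0, 0.0
for t in range(1, 11):
    p, m, v = adam_step(p, 2 * p, m, v, t)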
class AdamWrapper(UpdadteFunction):

    def __init__(self, flags, agents, gpu_nr, memory_fraction, weights):
        super().__init__(flags, agents, gpu_nr, memory_fraction, weights)
        self.adam = AdamOptimizer(weights, learning_rate_init=1e-3 / self.agents)

    def __call__(self, weights, update, gradients, staleness, epoch):
        """
        :param weights: Copy of the model weights
        :param update: Current update
        :param gradients: List of gradients
        :param staleness: Staleness of each gradient
        :param epoch: Current epoch
        """
        # average the workers' gradients and take a single Adam step
        grads = np.mean(gradients, axis=0)
        self.adam.update_params(grads)
        return self.adam.params

    def close(self):
        pass
class MLPRegressor(BaseMLP): def __init__(self, bound=(-10, 10), popsize=5, strategy='best1bin', recombination=None, T=0.01, stepsize=0.01, *args, **kwargs): super(MLPRegressor, self).__init__(*args, **kwargs) self.lpath = dict(train=[], val=[]) self.n_iter_no_change = kwargs.get('n_iter_no_change', 1) self._iter = 0 # parameter for differents evolution self.bound = bound self.popsize = popsize self.straegy = strategy self.recombination = recombination # parameter for anneal self.T = T self.stepsize = stepsize def _calculator_activations(self, packed_parameters): """Extract the coefficients and intercepts from packed_parameters.""" coefs_ = [] intercepts_ = [] for i in range(self.n_layers_ - 1): start, end, shape = self._coef_indptr[i] coefs_.append(np.reshape(packed_parameters[start:end], shape)) start, end = self._intercept_indptr[i] intercepts_.append(packed_parameters[start:end]) """Perform a forward pass on the network by computing the values of the neurons in the hidden layers and the output layer. Parameters ---------- activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. with_output_activation : bool, default True If True, the output passes through the output activation function, which is either the softmax function or the logistic function """ # calculator predict validation set activations = list(self.val_activations) hidden_activation = ACTIVATIONS[self.activation] # Iterate over the hidden layers for i in range(self.n_layers_ - 1): activations[i + 1] = safe_sparse_dot(activations[i], coefs_[i]) activations[i + 1] += intercepts_[i] # For the hidden layers if (i + 1) != (self.n_layers_ - 1): activations[i + 1] = hidden_activation(activations[i + 1]) # For the last layer output_activation = ACTIVATIONS[self.out_activation_] activations[i + 1] = output_activation(activations[i + 1]) p_val = list(activations[-1]) # calculator predict trainset activations = list(self.train_activations) hidden_activation = ACTIVATIONS[self.activation] # Iterate over the hidden layers for i in range(self.n_layers_ - 1): activations[i + 1] = safe_sparse_dot(activations[i], coefs_[i]) activations[i + 1] += intercepts_[i] # For the hidden layers if (i + 1) != (self.n_layers_ - 1): activations[i + 1] = hidden_activation(activations[i + 1]) # For the last layer output_activation = ACTIVATIONS[self.out_activation_] activations[i + 1] = output_activation(activations[i + 1]) p_train = list(activations[-1]) return p_train, p_val def _callback(self, packed_parameters, *args, **kwargs): p_train, p_val = self._calculator_activations(packed_parameters) loss_train = LOSS_FUNCTIONS[self.loss](self.y_train, p_train) loss_val = LOSS_FUNCTIONS[self.loss](self.y_val, p_val) if self.verbose: print('Iter %3d loss train %.5f loss val %.5f' % \ (self._iter, loss_train, loss_val)) self._iter += 1 self.lpath['train'].append(loss_train) self.lpath['val'].append(loss_val) def _fit_bfgs(self, X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units): # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] 
self._intercept_indptr.append((start, end)) start = end # Run BFGS packed_coef_inter = _pack(self.coefs_, self.intercepts_) if self.verbose is True or self.verbose >= 1: iprint = 1 else: iprint = -1 optimal_parameters, self.loss_, d, Bopt, func_calls, grad_calls, warnflag = \ optimize.fmin_bfgs(x0=packed_coef_inter, f=self._loss_func, fprime=self._grad_func, maxiter=self.max_iter, disp=False, gtol=self.tol, args=(X, y, activations, deltas, coef_grads, intercept_grads), full_output=True, callback=self._callback) self._unpack(optimal_parameters) def _fit_lbfgs(self, X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units): # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] self._intercept_indptr.append((start, end)) start = end # Run LBFGS packed_coef_inter = _pack(self.coefs_, self.intercepts_) if self.verbose is True or self.verbose >= 1: iprint = 1 else: iprint = -1 optimal_parameters, self.loss_, d = optimize.fmin_l_bfgs_b( x0=packed_coef_inter, func=self._loss_grad_lbfgs, maxfun=self.max_iter, iprint=-1, pgtol=self.tol, args=(X, y, activations, deltas, coef_grads, intercept_grads), callback=self._callback) self._unpack(optimal_parameters) def _fit_evol(self, X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units): # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] self._intercept_indptr.append((start, end)) start = end # Run evolution packed_coef_inter = _pack(self.coefs_, self.intercepts_) if self.verbose is True or self.verbose >= 1: iprint = 1 else: iprint = -1 bounds = list() for i in range(len(packed_coef_inter)): bounds.append(self.bound) result = optimize.differential_evolution(func=self._loss_func, bounds=bounds, maxiter=self.max_iter, disp=False, polish=True, init='latinhypercube', popsize=self.popsize, strategy=self.straegy, seed=self.random_state, args=(X, y, activations, deltas, coef_grads, intercept_grads), callback=self._callback) optimal_parameters = result.x self.loss_ = result.fun self._unpack(optimal_parameters) def _fit_cg(self, X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units): # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] 
self._intercept_indptr.append((start, end)) start = end # Run CG packed_coef_inter = _pack(self.coefs_, self.intercepts_) optimal_parameters, self.loss_, func_calls, grad_calls, d = \ optimize.fmin_cg(x0=packed_coef_inter, f=self._loss_func, fprime=self._grad_func, maxiter=self.max_iter, disp=False, epsilon=self.epsilon, gtol=self.tol, args=(X, y, activations, deltas, coef_grads, intercept_grads), callback=self._callback, full_output=True) self._unpack(optimal_parameters) def _fit_ncg(self, X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units): # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] self._intercept_indptr.append((start, end)) start = end # Run Newton-CG packed_coef_inter = _pack(self.coefs_, self.intercepts_) optimal_parameters, self.loss_, func_calls, grad_calls, h_calls, d = \ optimize.fmin_ncg(x0=packed_coef_inter, f=self._loss_func, fprime=self._grad_func, maxiter=200, # maxiter=self.max_iter, disp=True, args=(X, y, activations, deltas, coef_grads, intercept_grads), callback=self._callback, full_output=True) self._unpack(optimal_parameters) def _fit_basinhopping(self, X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units): # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] self._intercept_indptr.append((start, end)) start = end # Run Basinhopping packed_coef_inter = _pack(self.coefs_, self.intercepts_) minimizer_kwargs = { 'method': 'L-BFGS-B', 'args': (X, y, activations, deltas, coef_grads, intercept_grads) } result = optimize.basinhopping(x0=packed_coef_inter, T=self.T, stepsize=self.stepsize, func=self._loss_func, niter=self.max_iter, callback=self._callback, minimizer_kwargs=minimizer_kwargs) optimal_parameters = result.x self.loss = result.fun self._unpack(optimal_parameters) def _fit_anneal(self, X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units): if scipy.__version__ == '0.14.0': # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] self._intercept_indptr.append((start, end)) start = end # Run Simulated Annealing packed_coef_inter = _pack(self.coefs_, self.intercepts_) result = optimize.anneal(x0=packed_coef_inter, T0=self.T, stepsize=self.stepsize, 
func=self._loss_func, maxiter=self.max_iter, args=(X, y, activations, deltas, coef_grads, intercept_grads)) optimal_parameters = result.x self.loss = result.fun self._unpack(optimal_parameters) else: raise ImportError('Anneal method require scipy version <= 0.14.0') def _grad_func(self, packed_coef_inter, X, y, activations, deltas, coef_grads, intercept_grads): loss, coef_grads, intercept_grads = \ self._backprop(X, y, activations, deltas, coef_grads, intercept_grads) grad = _pack(coef_grads, intercept_grads) return grad def _loss_func(self, packed_coef_inter, X, y, activations, deltas, coef_grads, intercept_grads): """Compute the MLP loss function and its corresponding derivatives with respect to the different parameters given in the initialization. Returned loss are packed in a single vector so it can be used in cg Parameters ---------- packed_coef_inter : array-like A vector comprising the flattened coefficients and intercepts. X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) The target values. activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. deltas : list, length = n_layers - 1 The ith element of the list holds the difference between the activations of the i + 1 layer and the backpropagated error. More specifically, deltas are gradients of loss with respect to z in each layer, where z = wx + b is the value of a particular layer before passing through the activation function coef_grads : list, length = n_layers - 1 The ith element contains the amount of change used to update the coefficient parameters of the ith layer in an iteration. intercept_grads : list, length = n_layers - 1 The ith element contains the amount of change used to update the intercept parameters of the ith layer in an iteration. 
Returns ------- loss : float """ self._unpack(packed_coef_inter) loss, coef_grads, intercept_grads = self._backprop( X, y, activations, deltas, coef_grads, intercept_grads) self.n_iter_ += 1 return loss def _fit_stochastic(self, X, y, X_val, y_val, activations, deltas, coef_grads, intercept_grads, layer_units, incremental): if not incremental or not hasattr(self, '_optimizer'): params = self.coefs_ + self.intercepts_ if self.solver == 'sgd': self._optimizer = SGDOptimizer(params, self.learning_rate_init, self.learning_rate, self.momentum, self.nesterovs_momentum, self.power_t) elif self.solver == 'adam': self._optimizer = AdamOptimizer(params, self.learning_rate_init, self.beta_1, self.beta_2, self.epsilon) # early_stopping in partial_fit doesn't make sense early_stopping = self.early_stopping and not incremental n_samples = X.shape[0] if self.batch_size == 'auto': batch_size = min(200, n_samples) else: batch_size = np.clip(self.batch_size, 1, n_samples) try: for it in range(self.max_iter): X, y = shuffle(X, y, random_state=self._random_state) accumulated_loss = 0.0 for batch_slice in gen_batches(n_samples, batch_size): activations[0] = X[batch_slice] batch_loss, coef_grads, intercept_grads = self._backprop( X[batch_slice], y[batch_slice], activations, deltas, coef_grads, intercept_grads) accumulated_loss += batch_loss * (batch_slice.stop - batch_slice.start) # update weights grads = coef_grads + intercept_grads self._optimizer.update_params(grads) # Get val path activations[0] = X_val activations = self._forward_pass(activations) loss_val = LOSS_FUNCTIONS[self.loss](y_val, activations[-1]) self.lpath['val'].append(loss_val) self.n_iter_ += 1 self.loss_ = accumulated_loss / X.shape[0] self.t_ += n_samples self.loss_curve_.append(self.loss_) self.lpath['train'].append(self.loss_) if self.verbose: print("Iteration %d, loss = %.8f" % (self.n_iter_, self.loss_)) # update no_improvement_count based on training loss or # validation score according to early_stopping self._update_no_improvement_count(early_stopping, X_val, y_val) # for learning rate that needs to be updated at iteration end self._optimizer.iteration_ends(self.t_) if self._no_improvement_count > self.n_iter_no_change: # not better than last `n_iter_no_change` iterations by tol # stop or decrease learning rate if early_stopping: msg = ("Validation score did not improve more than " "tol=%f for %d consecutive epochs." % (self.tol, self.n_iter_no_change)) else: msg = ("Training loss did not improve more than tol=%f" " for %d consecutive epochs." % (self.tol, self.n_iter_no_change)) is_stopping = self._optimizer.trigger_stopping( msg, self.verbose) if is_stopping: break else: self._no_improvement_count = 0 if incremental: break if self.n_iter_ == self.max_iter: warnings.warn( "Stochastic Optimizer: Maximum iterations (%d) " "reached and the optimization hasn't converged yet." % self.max_iter, ConvergenceWarning) except KeyboardInterrupt: warnings.warn("Training interrupted by user.") if early_stopping: # restore best weights self.coefs_ = self._best_coefs self.intercepts_ = self._best_intercepts def _backprop(self, X, y, activations, deltas, coef_grads, intercept_grads): """Compute the MLP loss function and its corresponding derivatives with respect to each parameter: weights and bias vectors. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) The target values. 
activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. deltas : list, length = n_layers - 1 The ith element of the list holds the difference between the activations of the i + 1 layer and the backpropagated error. More specifically, deltas are gradients of loss with respect to z in each layer, where z = wx + b is the value of a particular layer before passing through the activation function coef_grads : list, length = n_layers - 1 The ith element contains the amount of change used to update the coefficient parameters of the ith layer in an iteration. intercept_grads : list, length = n_layers - 1 The ith element contains the amount of change used to update the intercept parameters of the ith layer in an iteration. flag: True if calculator train error Returns ------- loss : float coef_grads : list, length = n_layers - 1 intercept_grads : list, length = n_layers - 1 """ n_samples = X.shape[0] # Forward propagate activations = self._forward_pass(activations) # Get loss loss_func_name = self.loss if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic': loss_func_name = 'binary_log_loss' loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1]) # Add L2 regularization term to loss values = np.sum( np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_])) loss += (0.5 * self.alpha) * values / n_samples # Backward propagate last = self.n_layers_ - 2 # The calculation of delta[last] here works with following # combinations of output activation and loss function: # sigmoid and binary cross entropy, softmax and categorical cross # entropy, and identity with squared loss deltas[last] = activations[-1] - y # Compute gradient for the last layer coef_grads, intercept_grads = self._compute_loss_grad( last, n_samples, activations, deltas, coef_grads, intercept_grads) # Iterate over the hidden layers for i in range(self.n_layers_ - 2, 0, -1): deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T) inplace_derivative = DERIVATIVES[self.activation] inplace_derivative(activations[i], deltas[i - 1]) coef_grads, intercept_grads = self._compute_loss_grad( i - 1, n_samples, activations, deltas, coef_grads, intercept_grads) return loss, coef_grads, intercept_grads def _fit(self, X, y, incremental=False): X, X_val, y, Y_val = train_test_split(X, y, test_size=0.1) # Make sure self.hidden_layer_sizes is a list hidden_layer_sizes = self.hidden_layer_sizes if not hasattr(hidden_layer_sizes, "__iter__"): hidden_layer_sizes = [hidden_layer_sizes] hidden_layer_sizes = list(hidden_layer_sizes) # Validate input parameters. self._validate_hyperparameters() if np.any(np.array(hidden_layer_sizes) <= 0): raise ValueError("hidden_layer_sizes must be > 0, got %s." % hidden_layer_sizes) X, y = self._validate_input(X, y, incremental) n_samples, n_features = X.shape # Ensure y is 2D if y.ndim == 1: y = y.reshape((-1, 1)) self.n_outputs_ = y.shape[1] layer_units = ([n_features] + hidden_layer_sizes + [self.n_outputs_]) # check random state self._random_state = check_random_state(self.random_state) if not hasattr(self, 'coefs_') or (not self.warm_start and not incremental): # First time training the model self._initialize(y, layer_units) # lbfgs does not support mini-batches if self.solver == 'lbfgs' or self.solver == 'cg': batch_size = n_samples elif self.batch_size == 'auto': batch_size = min(200, n_samples) else: if self.batch_size < 1 or self.batch_size > n_samples: warnings.warn("Got `batch_size` less than 1 or larger than " "sample size. 
It is going to be clipped") batch_size = np.clip(self.batch_size, 1, n_samples) # Initialize lists activations = [X] activations.extend( np.empty((batch_size, n_fan_out)) for n_fan_out in layer_units[1:]) self.val_activations = list(activations) self.train_activations = list(activations) self.val_activations[0] = [X_val] self.train_activations[0] = [X] self.y_val = Y_val self.y_train = y deltas = [np.empty_like(a_layer) for a_layer in activations] coef_grads = [ np.empty((n_fan_in_, n_fan_out_)) for n_fan_in_, n_fan_out_ in zip(layer_units[:-1], layer_units[1:]) ] intercept_grads = [ np.empty(n_fan_out_) for n_fan_out_ in layer_units[1:] ] # Run the Stochastic optimization solver if self.solver in _STOCHASTIC_SOLVERS: self._fit_stochastic(X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units, incremental) # Run the LBFGS solver elif self.solver == 'lbfgs': self._fit_lbfgs(X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units) # Run the BFGS solver elif self.solver == 'bfgs': self._fit_bfgs(X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units) # Run the Conjugate Gradient elif self.solver == 'cg': self._fit_cg(X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units) # Run the Newton-CG elif self.solver == 'ncg': self._fit_ncg(X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units) # Run the Evolution elif self.solver == 'evol': self._fit_evol(X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units) # Run the Basinhopping elif self.solver == 'basinhopping': self._fit_basinhopping(X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units) # Run the Simulated Annealing elif self.solver == 'anneal': self._fit_anneal(X, y, X_val, Y_val, activations, deltas, coef_grads, intercept_grads, layer_units) # Delete dataset is member class del self.train_activations del self.val_activations del self.y_train del self.y_val return self def _validate_hyperparameters(self): if not isinstance(self.shuffle, bool): raise ValueError("shuffle must be either True or False, got %s." % self.shuffle) if self.max_iter <= 0: raise ValueError("max_iter must be > 0, got %s." % self.max_iter) if self.alpha < 0.0: raise ValueError("alpha must be >= 0, got %s." % self.alpha) if (self.learning_rate in ["constant", "invscaling", "adaptive"] and self.learning_rate_init <= 0.0): raise ValueError("learning_rate_init must be > 0, got %s." % self.learning_rate) if self.momentum > 1 or self.momentum < 0: raise ValueError("momentum must be >= 0 and <= 1, got %s" % self.momentum) if not isinstance(self.nesterovs_momentum, bool): raise ValueError("nesterovs_momentum must be either True or False," " got %s." % self.nesterovs_momentum) if not isinstance(self.early_stopping, bool): raise ValueError("early_stopping must be either True or False," " got %s." % self.early_stopping) if self.validation_fraction < 0 or self.validation_fraction >= 1: raise ValueError("validation_fraction must be >= 0 and < 1, " "got %s" % self.validation_fraction) if self.beta_1 < 0 or self.beta_1 >= 1: raise ValueError("beta_1 must be >= 0 and < 1, got %s" % self.beta_1) if self.beta_2 < 0 or self.beta_2 >= 1: raise ValueError("beta_2 must be >= 0 and < 1, got %s" % self.beta_2) if self.epsilon <= 0.0: raise ValueError("epsilon must be > 0, got %s." 
% self.epsilon) # raise ValueError if not registered supported_activations = ('identity', 'logistic', 'tanh', 'relu') if self.activation not in supported_activations: raise ValueError("The activation '%s' is not supported. Supported " "activations are %s." % (self.activation, supported_activations)) if self.learning_rate not in ["constant", "invscaling", "adaptive"]: raise ValueError("learning rate %s is not supported. " % self.learning_rate) supported_solvers = _STOCHASTIC_SOLVERS + [ "lbfgs", "bfgs", "cg", "ncg", "evol", "anneal", "basinhopping" ] if self.solver not in supported_solvers: raise ValueError("The solver %s is not supported. " " Expected one of: %s" % (self.solver, ", ".join(supported_solvers))) supported_strategy = [ 'best1bin', 'best1exp', 'rand1exp', 'randtobest1exp', 'best2exp', 'rand2exp', 'randtobest1bin', 'best2bin', 'rand2bin', 'rand1bin' ] if self.straegy not in supported_strategy: raise ValueError("The strategy %s is not supported. " " Expected one of: %s" % (self.straegy, ", ".join(supported_strategy)))
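# A rough usage sketch for the class above. It assumes the BaseMLP parent provides
# the usual scikit-learn style keyword arguments (hidden_layer_sizes, solver,
# max_iter, ...) and a fit() entry point; the data and parameter values are
# illustrative only.
import numpy as np

X = np.random.normal(size=(200, 5))
y = X @ np.arange(1.0, 6.0)            # illustrative regression target

# select the differential-evolution solver together with its extra parameters
reg = MLPRegressor(hidden_layer_sizes=(10,), solver='evol',
                   bound=(-10, 10), popsize=5, strategy='best1bin',
                   max_iter=50, verbose=True)
reg.fit(X, y)

# loss history recorded by _callback / _fit_stochastic
train_curve, val_curve = reg.lpath['train'], reg.lpath['val']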
class ModMLPClassifier(MLPClassifier):
    """
    Extension of the MLPClassifier class in scikit-learn.
    This extension supports the new parameters train_folds and max_fail.

    param train_folds: number of folds merged into the training set. If train_folds
        is None, the default train_test_split is used to obtain the validation set
        based on the validation_fraction parameter; if it is a number, it gives the
        number of folds of the training set and the last fold is selected as the
        validation fold.
    param max_fail: overrides the default constant value (2) of max fail in the
        scikit-learn implementation
    """

    def __init__(self, hidden_layer_sizes=(100, ), activation="relu", solver='adam',
                 alpha=0.0001, batch_size='auto', learning_rate="constant",
                 learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True,
                 random_state=None, tol=1e-4, verbose=False, warm_start=False,
                 momentum=0.9, nesterovs_momentum=True, early_stopping=False,
                 validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 train_folds=None, max_fail=5):
        super().__init__(hidden_layer_sizes, activation, solver, alpha, batch_size,
                         learning_rate, learning_rate_init, power_t, max_iter,
                         shuffle, random_state, tol, verbose, warm_start, momentum,
                         nesterovs_momentum, early_stopping, validation_fraction,
                         beta_1, beta_2, epsilon)
        # EXTENSION added properties
        self.train_folds = train_folds
        self.max_fail = max_fail

    def _fit_stochastic(self, X, y, activations, deltas, coef_grads,
                        intercept_grads, layer_units, incremental):
        if not incremental or not hasattr(self, '_optimizer'):
            params = self.coefs_ + self.intercepts_

            if self.solver == 'sgd':
                self._optimizer = SGDOptimizer(
                    params, self.learning_rate_init, self.learning_rate,
                    self.momentum, self.nesterovs_momentum, self.power_t)
            elif self.solver == 'adam':
                self._optimizer = AdamOptimizer(
                    params, self.learning_rate_init, self.beta_1, self.beta_2,
                    self.epsilon)

        # early_stopping in partial_fit doesn't make sense
        early_stopping = self.early_stopping and not incremental
        if early_stopping:
            # EXTENSION train_folds (modifications here)
            X, X_val, y, y_val = self._split_train_validation(X, y)
            if is_classifier(self):
                y_val = self._label_binarizer.inverse_transform(y_val)
        else:
            X_val = None
            y_val = None

        n_samples = X.shape[0]

        if self.batch_size == 'auto':
            batch_size = min(200, n_samples)
        else:
            batch_size = np.clip(self.batch_size, 1, n_samples)

        try:
            for it in range(self.max_iter):
                X, y = shuffle(X, y, random_state=self._random_state)
                accumulated_loss = 0.0
                for batch_slice in gen_batches(n_samples, batch_size):
                    activations[0] = X[batch_slice]

                    batch_loss, coef_grads, intercept_grads = self._backprop(
                        X[batch_slice], y[batch_slice], activations, deltas,
                        coef_grads, intercept_grads)
                    accumulated_loss += batch_loss * (batch_slice.stop -
                                                      batch_slice.start)

                    # update weights
                    grads = coef_grads + intercept_grads
                    self._optimizer.update_params(grads)

                self.n_iter_ += 1
                self.loss_ = accumulated_loss / X.shape[0]

                self.t_ += n_samples
                self.loss_curve_.append(self.loss_)
                if self.verbose:
                    print("Iteration %d, loss = %.8f" % (self.n_iter_,
                                                         self.loss_))

                # update no_improvement_count based on training loss or
                # validation score according to early_stopping
                self._update_no_improvement_count(early_stopping, X_val, y_val)

                # for learning rate that needs to be updated at iteration end
                self._optimizer.iteration_ends(self.t_)

                # EXTENSION max_fail (modified next line)
                if self._no_improvement_count > self.max_fail:
                    # not better than last two iterations by tol.
                    # stop or decrease learning rate
                    if early_stopping:
                        msg = ("Validation score did not improve more than "
                               "tol=%f for two consecutive epochs." % self.tol)
                    else:
                        msg = ("Training loss did not improve more than tol=%f"
                               " for two consecutive epochs." % self.tol)

                    is_stopping = self._optimizer.trigger_stopping(
                        msg, self.verbose)
                    if is_stopping:
                        break
                    else:
                        self._no_improvement_count = 0

                if incremental:
                    break

            if self.n_iter_ == self.max_iter:
                warnings.warn(
                    "Stochastic Optimizer: Maximum iterations (%d) "
                    "reached and the optimization hasn't converged yet."
                    % self.max_iter, ConvergenceWarning)
        except KeyboardInterrupt:
            warnings.warn("Training interrupted by user.")

        if early_stopping:
            # restore best weights
            self.coefs_ = self._best_coefs
            self.intercepts_ = self._best_intercepts

    # EXTENSION train_folds (new function)
    def _split_train_validation(self, X, y):
        if self.train_folds is None or self.train_folds == 0:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, random_state=self._random_state,
                test_size=self.validation_fraction)
        else:
            split_index = int(
                len(y) * ((self.train_folds - 1) / self.train_folds))
            X_train = X[0:split_index]
            X_test = X[split_index:X.shape[0]]
            y_train = y[0:split_index]
            y_test = y[split_index:X.shape[0]]
        return X_train, X_test, y_train, y_test
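# A brief, hypothetical usage sketch for the two extension parameters above; the
# data is illustrative, and everything else follows the standard MLPClassifier
# interface.
import numpy as np

X = np.random.normal(size=(500, 20))
y = (X[:, 0] > 0).astype(int)

# use the last of 5 sequential folds as the validation set, and allow up to 10
# epochs without improvement before early stopping triggers
clf = ModMLPClassifier(hidden_layer_sizes=(50,), solver='adam',
                       early_stopping=True, train_folds=5, max_fail=10)
clf.fit(X, y)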
class BaseMultilayerPerceptron_Custom(six.with_metaclass(ABCMeta, BaseEstimator)): """Base class for MLP classification and regression. Warning: This class should not be used directly. Use derived classes instead. .. versionadded:: 0.18 """ @abstractmethod def __init__(self, hidden_layer_sizes, activation, solver, alpha, batch_size, learning_rate, learning_rate_init, power_t, max_iter, loss, shuffle, random_state, tol, verbose, warm_start, momentum, nesterovs_momentum, early_stopping, validation_fraction, beta_1, beta_2, epsilon): self.activation = activation self.solver = solver self.alpha = alpha self.batch_size = batch_size self.learning_rate = learning_rate self.learning_rate_init = learning_rate_init self.power_t = power_t self.max_iter = max_iter self.loss = loss self.hidden_layer_sizes = hidden_layer_sizes self.shuffle = shuffle self.random_state = random_state self.tol = tol self.verbose = verbose self.warm_start = warm_start self.momentum = momentum self.nesterovs_momentum = nesterovs_momentum self.early_stopping = early_stopping self.validation_fraction = validation_fraction self.beta_1 = beta_1 self.beta_2 = beta_2 self.epsilon = epsilon def _unpack(self, packed_parameters): """Extract the coefficients and intercepts from packed_parameters.""" for i in range(self.n_layers_ - 1): start, end, shape = self._coef_indptr[i] self.coefs_[i] = np.reshape(packed_parameters[start:end], shape) start, end = self._intercept_indptr[i] self.intercepts_[i] = packed_parameters[start:end] def _forward_pass(self, activations, training=True): """Perform a forward pass on the network by computing the values of the neurons in the hidden layers and the output layer. Parameters ---------- activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. with_output_activation : bool, default True If True, the output passes through the output activation function, which is either the softmax function or the logistic function """ hidden_activation = ACTIVATIONS[self.activation] '''for explanations start''' # Iterate over the hidden layers # activated neuron for each hidden layer will be saved here. # i'th element of the list will be i'th layer's activated neurons activated_neurons = [] activated_neurons_as_dict = [] activated_neurons_raw_sum = [] activated_neurons_as_np = np.zeros_like((len(activations[0]) * len(activations[0][0]))) '''for explanations end''' # Iterate over the hidden layers for i in range(self.n_layers_ - 1): activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i]) activations[i + 1] += self.intercepts_[i] # For the hidden layers if (i + 1) != (self.n_layers_ - 1): activations[i + 1] = hidden_activation(activations[i + 1]) '''for explanations start''' # elements of activations are the neurons # activations are list, and i'th element of the activations activation[i] is # matrix of dimension = instances * neurons # activations[0] = layer_1 * neurons_in_layer_1 # iterate over all input_instances # calculate the values only for test set... 
# we have to implement it for train set also if not training: # print('inside training') activated_n = set() activated_n_as_dict = {} activated_n_raw_sum = np.zeros(activations[i + 1].shape[1]) # print('layer : ',(i+1)) for input_instance_i in activations[i + 1]: neuron_set = set(index for index, value in enumerate( input_instance_i) if value >= np.mean(input_instance_i)) neuron_dict = dict([(index, value) if value >= np.mean(input_instance_i) else (index, 0) for index, value in enumerate(input_instance_i)]) activated_n_raw_sum += np.array([1 if value >= np.mean( input_instance_i) else 0 for value in input_instance_i]) activated_n |= neuron_set # print('neurons: ', neuron) # print('activated_n: ',activated_n) # print('') activated_neurons.append(activated_n) activated_neurons_raw_sum.append(activated_n_raw_sum) # print('activated_neurons: ',activated_neurons) # print('') # print('') # print('activations[%s]: ' %(i+1), type(activations[i+1]), activations[i+1].shape,activations[i+1]) '''for explanations end''' # For the last layer output_activation = ACTIVATIONS[self.out_activation_] activations[i + 1] = output_activation(activations[i + 1]) '''for explanations start''' return activations, activated_neurons, activated_neurons_raw_sum '''for explanations end''' def _compute_loss_grad(self, layer, n_samples, activations, deltas, coef_grads, intercept_grads): """Compute the gradient of loss with respect to coefs and intercept for specified layer. This function does backpropagation for the specified one layer. """ coef_grads[layer] = safe_sparse_dot(activations[layer].T, deltas[layer]) coef_grads[layer] += (self.alpha * self.coefs_[layer]) coef_grads[layer] /= n_samples intercept_grads[layer] = np.mean(deltas[layer], 0) return coef_grads, intercept_grads def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, coef_grads, intercept_grads): """Compute the MLP loss function and its corresponding derivatives with respect to the different parameters given in the initialization. Returned gradients are packed in a single vector so it can be used in lbfgs Parameters ---------- packed_parameters : array-like A vector comprising the flattened coefficients and intercepts. X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) The target values. activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. deltas : list, length = n_layers - 1 The ith element of the list holds the difference between the activations of the i + 1 layer and the backpropagated error. More specifically, deltas are gradients of loss with respect to z in each layer, where z = wx + b is the value of a particular layer before passing through the activation function coef_grad : list, length = n_layers - 1 The ith element contains the amount of change used to update the coefficient parameters of the ith layer in an iteration. intercept_grads : list, length = n_layers - 1 The ith element contains the amount of change used to update the intercept parameters of the ith layer in an iteration. 
Returns ------- loss : float grad : array-like, shape (number of nodes of all layers,) """ self._unpack(packed_coef_inter) loss, coef_grads, intercept_grads = self._backprop( X, y, activations, deltas, coef_grads, intercept_grads) self.n_iter_ += 1 grad = _pack(coef_grads, intercept_grads) return loss, grad def _backprop(self, X, y, activations, deltas, coef_grads, intercept_grads): """Compute the MLP loss function and its corresponding derivatives with respect to each parameter: weights and bias vectors. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) The target values. activations : list, length = n_layers - 1 The ith element of the list holds the values of the ith layer. deltas : list, length = n_layers - 1 The ith element of the list holds the difference between the activations of the i + 1 layer and the backpropagated error. More specifically, deltas are gradients of loss with respect to z in each layer, where z = wx + b is the value of a particular layer before passing through the activation function coef_grad : list, length = n_layers - 1 The ith element contains the amount of change used to update the coefficient parameters of the ith layer in an iteration. intercept_grads : list, length = n_layers - 1 The ith element contains the amount of change used to update the intercept parameters of the ith layer in an iteration. Returns ------- loss : float coef_grads : list, length = n_layers - 1 intercept_grads : list, length = n_layers - 1 """ n_samples = X.shape[0] # Forward propagate '''for explanations start''' activations, activated_neurons, activated_neurons_raw_sum = self._forward_pass(activations, training=True) '''for explanations end''' # Get loss loss_func_name = self.loss if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic': loss_func_name = 'binary_log_loss' loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1]) # Add L2 regularization term to loss values = np.sum( np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_])) loss += (0.5 * self.alpha) * values / n_samples # Backward propagate last = self.n_layers_ - 2 # The calculation of delta[last] here works with following # combinations of output activation and loss function: # sigmoid and binary cross entropy, softmax and categorical cross # entropy, and identity with squared loss deltas[last] = activations[-1] - y # Compute gradient for the last layer coef_grads, intercept_grads = self._compute_loss_grad( last, n_samples, activations, deltas, coef_grads, intercept_grads) # Iterate over the hidden layers for i in range(self.n_layers_ - 2, 0, -1): deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T) inplace_derivative = DERIVATIVES[self.activation] inplace_derivative(activations[i], deltas[i - 1]) coef_grads, intercept_grads = self._compute_loss_grad( i - 1, n_samples, activations, deltas, coef_grads, intercept_grads) # print('loss.len: ', len(loss)) '''for explanations start''' # for index, activation in enumerate(activations): # print('activations[%d]' %index, activation[index]) '''for explanations end''' return loss, coef_grads, intercept_grads def _initialize(self, y, layer_units): # set all attributes, allocate weights etc for first call # Initialize parameters self.n_iter_ = 0 self.t_ = 0 self.n_outputs_ = y.shape[1] # Compute the number of layers self.n_layers_ = len(layer_units) # Output for regression if not isinstance(self, ClassifierMixin): self.out_activation_ = 'identity' # Output for multi 
class elif self._label_binarizer.y_type_ == 'multiclass': self.out_activation_ = 'softmax' # Output for binary class and multi-label else: self.out_activation_ = 'logistic' # Initialize coefficient and intercept layers self.coefs_ = [] self.intercepts_ = [] for i in range(self.n_layers_ - 1): coef_init, intercept_init = self._init_coef(layer_units[i], layer_units[i + 1]) self.coefs_.append(coef_init) self.intercepts_.append(intercept_init) if self.solver in _STOCHASTIC_SOLVERS: self.loss_curve_ = [] self._no_improvement_count = 0 if self.early_stopping: self.validation_scores_ = [] self.best_validation_score_ = -np.inf else: self.best_loss_ = np.inf def _init_coef(self, fan_in, fan_out): if self.activation == 'logistic': # Use the initialization method recommended by # Glorot et al. init_bound = np.sqrt(2. / (fan_in + fan_out)) elif self.activation in ('identity', 'tanh', 'relu'): init_bound = np.sqrt(6. / (fan_in + fan_out)) else: # this was caught earlier, just to make sure raise ValueError("Unknown activation function %s" % self.activation) coef_init = self._random_state.uniform(-init_bound, init_bound, (fan_in, fan_out)) intercept_init = self._random_state.uniform(-init_bound, init_bound, fan_out) return coef_init, intercept_init def _fit(self, X, y, incremental=False): # Make sure self.hidden_layer_sizes is a list hidden_layer_sizes = self.hidden_layer_sizes if not hasattr(hidden_layer_sizes, "__iter__"): hidden_layer_sizes = [hidden_layer_sizes] hidden_layer_sizes = list(hidden_layer_sizes) # Validate input parameters. self._validate_hyperparameters() if np.any(np.array(hidden_layer_sizes) <= 0): raise ValueError("hidden_layer_sizes must be > 0, got %s." % hidden_layer_sizes) X, y = self._validate_input(X, y, incremental) n_samples, n_features = X.shape # Ensure y is 2D if y.ndim == 1: y = y.reshape((-1, 1)) self.n_outputs_ = y.shape[1] layer_units = ([n_features] + hidden_layer_sizes + [self.n_outputs_]) # check random state self._random_state = check_random_state(self.random_state) if not hasattr(self, 'coefs_') or (not self.warm_start and not incremental): # First time training the model self._initialize(y, layer_units) # lbfgs does not support mini-batches if self.solver == 'lbfgs': batch_size = n_samples elif self.batch_size == 'auto': batch_size = min(200, n_samples) else: if self.batch_size < 1 or self.batch_size > n_samples: warnings.warn("Got `batch_size` less than 1 or larger than " "sample size. 
It is going to be clipped") batch_size = np.clip(self.batch_size, 1, n_samples) # Initialize lists activations = [X] activations.extend(np.empty((batch_size, n_fan_out)) for n_fan_out in layer_units[1:]) deltas = [np.empty_like(a_layer) for a_layer in activations] # coef is weight matrix coef_grads = [np.empty((n_fan_in_, n_fan_out_)) for n_fan_in_, n_fan_out_ in zip(layer_units[:-1], layer_units[1:])] # intercept is bias intercept_grads = [np.empty(n_fan_out_) for n_fan_out_ in layer_units[1:]] # Run the Stochastic optimization solver if self.solver in _STOCHASTIC_SOLVERS: '''for explanation start''' activations_over_all_itr = self._fit_stochastic(X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental) '''for explanation end''' # Run the LBFGS solver elif self.solver == 'lbfgs': self._fit_lbfgs(X, y, activations, deltas, coef_grads, intercept_grads, layer_units) ''' for explanation start''' return self, activations_over_all_itr ''' for explanation end''' def _validate_hyperparameters(self): if not isinstance(self.shuffle, bool): raise ValueError("shuffle must be either True or False, got %s." % self.shuffle) if self.max_iter <= 0: raise ValueError("max_iter must be > 0, got %s." % self.max_iter) if self.alpha < 0.0: raise ValueError("alpha must be >= 0, got %s." % self.alpha) if (self.learning_rate in ["constant", "invscaling", "adaptive"] and self.learning_rate_init <= 0.0): raise ValueError("learning_rate_init must be > 0, got %s." % self.learning_rate) if self.momentum > 1 or self.momentum < 0: raise ValueError("momentum must be >= 0 and <= 1, got %s" % self.momentum) if not isinstance(self.nesterovs_momentum, bool): raise ValueError("nesterovs_momentum must be either True or False," " got %s." % self.nesterovs_momentum) if not isinstance(self.early_stopping, bool): raise ValueError("early_stopping must be either True or False," " got %s." % self.early_stopping) if self.validation_fraction < 0 or self.validation_fraction >= 1: raise ValueError("validation_fraction must be >= 0 and < 1, " "got %s" % self.validation_fraction) if self.beta_1 < 0 or self.beta_1 >= 1: raise ValueError("beta_1 must be >= 0 and < 1, got %s" % self.beta_1) if self.beta_2 < 0 or self.beta_2 >= 1: raise ValueError("beta_2 must be >= 0 and < 1, got %s" % self.beta_2) if self.epsilon <= 0.0: raise ValueError("epsilon must be > 0, got %s." % self.epsilon) # raise ValueError if not registered supported_activations = ('identity', 'logistic', 'tanh', 'relu') if self.activation not in supported_activations: raise ValueError("The activation '%s' is not supported. Supported " "activations are %s." % (self.activation, supported_activations)) if self.learning_rate not in ["constant", "invscaling", "adaptive"]: raise ValueError("learning rate %s is not supported. " % self.learning_rate) supported_solvers = _STOCHASTIC_SOLVERS + ["lbfgs"] if self.solver not in supported_solvers: raise ValueError("The solver %s is not supported. 
" " Expected one of: %s" % (self.solver, ", ".join(supported_solvers))) def _fit_lbfgs(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units): # Store meta information for the parameters self._coef_indptr = [] self._intercept_indptr = [] start = 0 # Save sizes and indices of coefficients for faster unpacking for i in range(self.n_layers_ - 1): n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] end = start + (n_fan_in * n_fan_out) self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) start = end # Save sizes and indices of intercepts for faster unpacking for i in range(self.n_layers_ - 1): end = start + layer_units[i + 1] self._intercept_indptr.append((start, end)) start = end # Run LBFGS packed_coef_inter = _pack(self.coefs_, self.intercepts_) if self.verbose is True or self.verbose >= 1: iprint = 1 else: iprint = -1 optimal_parameters, self.loss_, d = fmin_l_bfgs_b( x0=packed_coef_inter, func=self._loss_grad_lbfgs, maxfun=self.max_iter, iprint=iprint, pgtol=self.tol, args=(X, y, activations, deltas, coef_grads, intercept_grads)) self._unpack(optimal_parameters) def _fit_stochastic(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental): if not incremental or not hasattr(self, '_optimizer'): params = self.coefs_ + self.intercepts_ if self.solver == 'sgd': self._optimizer = SGDOptimizer( params, self.learning_rate_init, self.learning_rate, self.momentum, self.nesterovs_momentum, self.power_t) elif self.solver == 'adam': self._optimizer = AdamOptimizer( params, self.learning_rate_init, self.beta_1, self.beta_2, self.epsilon) # early_stopping in partial_fit doesn't make sense early_stopping = self.early_stopping and not incremental if early_stopping: X, X_val, y, y_val = train_test_split( X, y, random_state=self._random_state, test_size=self.validation_fraction) if isinstance(self, ClassifierMixin): y_val = self._label_binarizer.inverse_transform(y_val) else: X_val = None y_val = None n_samples = X.shape[0] if self.batch_size == 'auto': batch_size = min(200, n_samples) else: batch_size = np.clip(self.batch_size, 1, n_samples) '''for explanation start''' activations_over_all_itr = [] '''for explanation end''' try: for it in range(self.max_iter): X, y = shuffle(X, y, random_state=self._random_state) accumulated_loss = 0.0 for batch_slice in gen_batches(n_samples, batch_size): activations[0] = X[batch_slice] batch_loss, coef_grads, intercept_grads = self._backprop( X[batch_slice], y[batch_slice], activations, deltas, coef_grads, intercept_grads) accumulated_loss += batch_loss * (batch_slice.stop - batch_slice.start) # update weights grads = coef_grads + intercept_grads self._optimizer.update_params(grads) '''for explanation start''' activations_over_all_itr.append(activations) '''for explanation end''' self.n_iter_ += 1 self.loss_ = accumulated_loss / X.shape[0] self.t_ += n_samples self.loss_curve_.append(self.loss_) if self.verbose: print("Iteration %d, loss = %.8f" % (self.n_iter_, self.loss_)) # update no_improvement_count based on training loss or # validation score according to early_stopping self._update_no_improvement_count(early_stopping, X_val, y_val) # for learning rate that needs to be updated at iteration end self._optimizer.iteration_ends(self.t_) if self._no_improvement_count > 2: # not better than last two iterations by tol. # stop or decrease learning rate if early_stopping: msg = ("Validation score did not improve more than " "tol=%f for two consecutive epochs." 
% self.tol) else: msg = ("Training loss did not improve more than tol=%f" " for two consecutive epochs." % self.tol) is_stopping = self._optimizer.trigger_stopping( msg, self.verbose) if is_stopping: break else: self._no_improvement_count = 0 if incremental: break if self.n_iter_ == self.max_iter: warnings.warn( "Stochastic Optimizer: Maximum iterations (%d) " "reached and the optimization hasn't converged yet." % self.max_iter, ConvergenceWarning) except KeyboardInterrupt: warnings.warn("Training interrupted by user.") if early_stopping: # restore best weights self.coefs_ = self._best_coefs self.intercepts_ = self._best_intercepts '''for explanation start''' return activations_over_all_itr '''for explanation end''' def _update_no_improvement_count(self, early_stopping, X_val, y_val): if early_stopping: # compute validation score, use that for stopping self.validation_scores_.append(self.score(X_val, y_val)) if self.verbose: print("Validation score: %f" % self.validation_scores_[-1]) # update best parameters # use validation_scores_, not loss_curve_ # let's hope no-one overloads .score with mse last_valid_score = self.validation_scores_[-1] if last_valid_score < (self.best_validation_score_ + self.tol): self._no_improvement_count += 1 else: self._no_improvement_count = 0 if last_valid_score > self.best_validation_score_: self.best_validation_score_ = last_valid_score self._best_coefs = [c.copy() for c in self.coefs_] self._best_intercepts = [i.copy() for i in self.intercepts_] else: if self.loss_curve_[-1] > self.best_loss_ - self.tol: self._no_improvement_count += 1 else: self._no_improvement_count = 0 if self.loss_curve_[-1] < self.best_loss_: self.best_loss_ = self.loss_curve_[-1] def fit(self, X, y): """Fit the model to data matrix X and target(s) y. Parameters ---------- X : array-like or sparse matrix, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). Returns ------- self : returns a trained MLP model. """ return self._fit(X, y, incremental=False) @property def partial_fit(self): """Fit the model to data matrix X and target y. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. y : array-like, shape (n_samples,) The target values. Returns ------- self : returns a trained MLP model. """ if self.solver not in _STOCHASTIC_SOLVERS: raise AttributeError("partial_fit is only available for stochastic" " optimizers. %s is not stochastic." % self.solver) return self._partial_fit def _partial_fit(self, X, y, classes=None): return self._fit(X, y, incremental=True) def _predict(self, X): """Predict using the trained model Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data. Returns ------- y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs) The decision function of the samples for each class in the model. 
""" X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) # Make sure self.hidden_layer_sizes is a list hidden_layer_sizes = self.hidden_layer_sizes if not hasattr(hidden_layer_sizes, "__iter__"): hidden_layer_sizes = [hidden_layer_sizes] hidden_layer_sizes = list(hidden_layer_sizes) layer_units = [X.shape[1]] + hidden_layer_sizes + \ [self.n_outputs_] # Initialize layers # now activations is simply X or all input instances activations = [X] for i in range(self.n_layers_ - 1): m = np.empty((X.shape[0], layer_units[i + 1])) activations.append(m) # forward propagate '''for explanations start''' activations, activated_neurons, activated_neurons_raw_sum = self._forward_pass(activations, training=False) '''for explanations end''' y_pred = activations[-1] '''for explanations start''' return y_pred, activations, activated_neurons, activated_neurons_raw_sum '''for explanations end'''
def _fit_stochastic(
    self,
    X,
    y,
    activations,
    deltas,
    coef_grads,
    intercept_grads,
    layer_units,
    incremental,
):
    if not incremental or not hasattr(self, "_optimizer"):
        params = self.coefs_ + self.intercepts_

        if self.solver == "sgd":
            self._optimizer = SGDOptimizer(
                params,
                self.learning_rate_init,
                self.learning_rate,
                self.momentum,
                self.nesterovs_momentum,
                self.power_t,
            )
        elif self.solver == "adam":
            self._optimizer = AdamOptimizer(
                params,
                self.learning_rate_init,
                self.beta_1,
                self.beta_2,
                self.epsilon,
            )

    # early_stopping in partial_fit doesn't make sense
    early_stopping = self.early_stopping and not incremental
    if early_stopping:
        # don't stratify in multilabel classification
        # should_stratify = is_classifier(self) and self.n_outputs_ == 1
        # stratify = y if should_stratify else None
        # X, X_val, y, y_val = train_test_split(
        #     X,
        #     y,
        #     random_state=self._random_state,
        #     test_size=self.validation_fraction,
        #     stratify=stratify,
        # )
        # if is_classifier(self):
        #     y_val = self._label_binarizer.inverse_transform(y_val)

        # --------------------------- #
        #    Custom validation set    #
        # --------------------------- #
        X_val = self.custom_validation_data[0]
        y_val = self.custom_validation_data[1]

        if type(X_val) is not np.ndarray:
            X_val = X_val.to_numpy()
        if type(y_val) is not np.ndarray:
            y_val = y_val.to_numpy()
        # --------------------------- #
        #       Custom code end       #
        # --------------------------- #
    else:
        X_val = None
        y_val = None

    n_samples = X.shape[0]

    if self.batch_size == "auto":
        batch_size = min(200, n_samples)
    else:
        batch_size = np.clip(self.batch_size, 1, n_samples)

    try:
        for it in tqdm(range(self.max_iter), unit="iteration"):
            if self.shuffle:
                X, y = shuffle(X, y, random_state=self._random_state)

            accumulated_loss = 0.0
            for batch_slice in gen_batches(n_samples, batch_size):
                activations[0] = X[batch_slice]
                batch_loss, coef_grads, intercept_grads = self._backprop(
                    X[batch_slice],
                    y[batch_slice],
                    activations,
                    deltas,
                    coef_grads,
                    intercept_grads,
                )
                accumulated_loss += batch_loss * (batch_slice.stop -
                                                  batch_slice.start)

                # update weights
                grads = coef_grads + intercept_grads
                self._optimizer.update_params(grads)

            self.n_iter_ += 1
            self.loss_ = accumulated_loss / X.shape[0]

            self.t_ += n_samples
            self.loss_curve_.append(self.loss_)
            if self.verbose:
                print("Iteration %d, loss = %.8f" % (self.n_iter_,
                                                     self.loss_))

            # update no_improvement_count based on training loss or
            # validation score according to early_stopping
            self._update_no_improvement_count(early_stopping, X_val, y_val)

            # for learning rate that needs to be updated at iteration end
            self._optimizer.iteration_ends(self.t_)

            if self._no_improvement_count > self.n_iter_no_change:
                # not better than last `n_iter_no_change` iterations by tol
                # stop or decrease learning rate
                if early_stopping:
                    msg = ("Validation score did not improve more than "
                           "tol=%f for %d consecutive epochs."
                           % (self.tol, self.n_iter_no_change))
                else:
                    msg = ("Training loss did not improve more than tol=%f"
                           " for %d consecutive epochs."
                           % (self.tol, self.n_iter_no_change))

                is_stopping = self._optimizer.trigger_stopping(
                    msg, self.verbose)
                if is_stopping:
                    break
                else:
                    self._no_improvement_count = 0

            if incremental:
                break

            if self.n_iter_ == self.max_iter:
                warnings.warn(
                    "Stochastic Optimizer: Maximum iterations (%d) "
                    "reached and the optimization hasn't converged yet."
                    % self.max_iter,
                    ConvergenceWarning,
                )
    except KeyboardInterrupt:
        warnings.warn("Training interrupted by user.")

    if early_stopping:
        # restore best weights
        self.coefs_ = self._best_coefs
        self.intercepts_ = self._best_intercepts
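The only functional change relative to the stock scikit-learn training loop is the block marked `Custom validation set`: instead of carving a hold-out split out of the training data with `train_test_split`, the validation pair is read from `self.custom_validation_data`. A short sketch of the contract that block assumes follows (the data and names are purely illustrative):

import numpy as np
import pandas as pd

# custom_validation_data is an (X_val, y_val) pair; pandas objects are fine
# because the loop converts anything that is not an ndarray via .to_numpy()
X_val = pd.DataFrame(np.random.rand(20, 4))
y_val = pd.DataFrame(np.random.randint(0, 2, size=(20, 3)))
custom_validation_data = (X_val, y_val)

# the branch only runs when early_stopping=True and the fit is not
# incremental, so partial_fit never touches the custom validation pair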
class ModifiedMLPClassifier(MLPClassifier):
    def __init__(
        self,
        hidden_layer_sizes=(100,),
        activation="relu",
        solver="adam",
        alpha=0.0001,
        batch_size="auto",
        learning_rate="constant",
        learning_rate_init=0.001,
        power_t=0.5,
        max_iter=200,
        shuffle=True,
        random_state=None,
        tol=1e-4,
        verbose=False,
        warm_start=False,
        momentum=0.9,
        nesterovs_momentum=True,
        early_stopping=False,
        validation_fraction=0.1,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-8,
        n_iter_no_change=10,
        max_fun=15000,
        custom_validation_data=None,
    ):
        super().__init__(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            alpha=alpha,
            batch_size=batch_size,
            learning_rate=learning_rate,
            learning_rate_init=learning_rate_init,
            power_t=power_t,
            max_iter=max_iter,
            shuffle=shuffle,
            random_state=random_state,
            tol=tol,
            verbose=verbose,
            warm_start=warm_start,
            momentum=momentum,
            nesterovs_momentum=nesterovs_momentum,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            n_iter_no_change=n_iter_no_change,
            max_fun=max_fun,
        )
        self.custom_validation_data = custom_validation_data

    def score(self, X, y, sample_weight=None):
        """Return the LRAP on the given test data and labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            LRAP of self.predict_proba(X) wrt. y.
        """
        return LRAP(y, self._predict(X), sample_weight=sample_weight)

    def _fit_stochastic(
        self,
        X,
        y,
        activations,
        deltas,
        coef_grads,
        intercept_grads,
        layer_units,
        incremental,
    ):
        if not incremental or not hasattr(self, "_optimizer"):
            params = self.coefs_ + self.intercepts_

            if self.solver == "sgd":
                self._optimizer = SGDOptimizer(
                    params,
                    self.learning_rate_init,
                    self.learning_rate,
                    self.momentum,
                    self.nesterovs_momentum,
                    self.power_t,
                )
            elif self.solver == "adam":
                self._optimizer = AdamOptimizer(
                    params,
                    self.learning_rate_init,
                    self.beta_1,
                    self.beta_2,
                    self.epsilon,
                )

        # early_stopping in partial_fit doesn't make sense
        early_stopping = self.early_stopping and not incremental
        if early_stopping:
            # don't stratify in multilabel classification
            # should_stratify = is_classifier(self) and self.n_outputs_ == 1
            # stratify = y if should_stratify else None
            # X, X_val, y, y_val = train_test_split(
            #     X,
            #     y,
            #     random_state=self._random_state,
            #     test_size=self.validation_fraction,
            #     stratify=stratify,
            # )
            # if is_classifier(self):
            #     y_val = self._label_binarizer.inverse_transform(y_val)

            # --------------------------- #
            #    Custom validation set    #
            # --------------------------- #
            X_val = self.custom_validation_data[0]
            y_val = self.custom_validation_data[1]

            if type(X_val) is not np.ndarray:
                X_val = X_val.to_numpy()
            if type(y_val) is not np.ndarray:
                y_val = y_val.to_numpy()
            # --------------------------- #
            #       Custom code end       #
            # --------------------------- #
        else:
            X_val = None
            y_val = None

        n_samples = X.shape[0]

        if self.batch_size == "auto":
            batch_size = min(200, n_samples)
        else:
            batch_size = np.clip(self.batch_size, 1, n_samples)

        try:
            for it in tqdm(range(self.max_iter), unit="iteration"):
                if self.shuffle:
                    X, y = shuffle(X, y, random_state=self._random_state)

                accumulated_loss = 0.0
                for batch_slice in gen_batches(n_samples, batch_size):
                    activations[0] = X[batch_slice]
                    batch_loss, coef_grads, intercept_grads = self._backprop(
                        X[batch_slice],
                        y[batch_slice],
                        activations,
                        deltas,
                        coef_grads,
                        intercept_grads,
                    )
                    accumulated_loss += \
                        batch_loss * (batch_slice.stop - batch_slice.start)

                    # update weights
                    grads = coef_grads + intercept_grads
                    self._optimizer.update_params(grads)

                self.n_iter_ += 1
                self.loss_ = accumulated_loss / X.shape[0]

                self.t_ += n_samples
                self.loss_curve_.append(self.loss_)
                if self.verbose:
                    print("Iteration %d, loss = %.8f" % (self.n_iter_,
                                                         self.loss_))

                # update no_improvement_count based on training loss or
                # validation score according to early_stopping
                self._update_no_improvement_count(early_stopping, X_val, y_val)

                # for learning rate that needs to be updated at iteration end
                self._optimizer.iteration_ends(self.t_)

                if self._no_improvement_count > self.n_iter_no_change:
                    # not better than last `n_iter_no_change` iterations by tol
                    # stop or decrease learning rate
                    if early_stopping:
                        msg = ("Validation score did not improve more than "
                               "tol=%f for %d consecutive epochs."
                               % (self.tol, self.n_iter_no_change))
                    else:
                        msg = ("Training loss did not improve more than tol=%f"
                               " for %d consecutive epochs."
                               % (self.tol, self.n_iter_no_change))

                    is_stopping = self._optimizer.trigger_stopping(
                        msg, self.verbose)
                    if is_stopping:
                        break
                    else:
                        self._no_improvement_count = 0

                if incremental:
                    break

                if self.n_iter_ == self.max_iter:
                    warnings.warn(
                        "Stochastic Optimizer: Maximum iterations (%d) "
                        "reached and the optimization hasn't converged yet."
                        % self.max_iter,
                        ConvergenceWarning,
                    )
        except KeyboardInterrupt:
            warnings.warn("Training interrupted by user.")

        if early_stopping:
            # restore best weights
            self.coefs_ = self._best_coefs
            self.intercepts_ = self._best_intercepts
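Putting the pieces together, here is a hedged end-to-end sketch of how ModifiedMLPClassifier is meant to be used. It assumes that LRAP is sklearn.metrics.label_ranking_average_precision_score imported under that alias and that the targets are multilabel indicator arrays; early_stopping=True is what routes training through the custom validation set, which is then also used to restore the best weights.

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(200, 10)
y_train = (rng.rand(200, 5) > 0.7).astype(int)   # multilabel indicators
X_val = rng.rand(50, 10)
y_val = (rng.rand(50, 5) > 0.7).astype(int)

clf = ModifiedMLPClassifier(
    hidden_layer_sizes=(32,),
    max_iter=100,
    early_stopping=True,                    # enables the custom-validation branch
    custom_validation_data=(X_val, y_val),  # used for scoring and best weights
)
clf.fit(X_train, y_train)

# score() reports label ranking average precision of the predicted
# probabilities, which is what _update_no_improvement_count monitors
print(clf.score(X_val, y_val))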