def reset(self):
    """Return the error accumulators and window buffers to a pristine state.

    Both running sums go back to ``0.0``, the last seen pair is forgotten
    and the two correction buffers are replaced by new, empty ``FastBuffer``
    objects sized by ``self.window_size``.
    """
    self.total_square_error = self.average_error = 0.0
    self.last_true_label = self.last_prediction = None

    capacity = self.window_size
    self.total_square_error_correction = FastBuffer(capacity)
    self.average_error_correction = FastBuffer(capacity)
def __init__(self, n_max_components: int = 10,
             chunk_size: int = 500, window_size: int = 100, logging=True):
    """Create an empty ensemble together with its per-chunk bookkeeping.

    Parameters
    ----------
    n_max_components: int
        Capacity of the classifier and weight arrays.

    chunk_size: int
        Capacity of the per-chunk prediction buffers.

    window_size: int
        Accepted for interface compatibility; not used by this constructor.

    logging: bool
        Stored unchanged in ``self._Logging``.
    """
    super().__init__()

    # --- configuration -------------------------------------------------
    self._num_of_max_classifiers = n_max_components
    self._chunk_size = chunk_size
    self._Logging = logging

    # --- counters ------------------------------------------------------
    self._num_of_current_classifiers = 0
    self._num_of_processed_instances = 0

    # --- ensemble members and their weights ----------------------------
    capacity = self._num_of_max_classifiers
    self._classifiers = np.empty((capacity), dtype=object)
    self._weights = np.zeros((capacity, ))

    # --- per-chunk records ---------------------------------------------
    # Component predictions and ensemble predictions for the current
    # chunk; kept for weight adjustment, pruning and similar steps.
    self._chunk_comp_preds = FastBuffer(max_size=chunk_size)
    self._chunk_ensm_preds = FastBuffer(max_size=chunk_size)

    # Holds the chunk's instances and their ground truth. It can only be
    # built once n_features and n_targets are known, so it starts unset.
    self._chunk_data = None

    # --- dataset description, filled in later --------------------------
    self._num_classes = None
    self._target_values = None  # needed to train the HTs correctly
    self._record = False        # whether records are written to files
def __init__(self, window_size=200):
    """Initialise empty regression statistics over a fixed-size window.

    Parameters
    ----------
    window_size: int (Default: 200)
        Capacity given to both correction buffers.
    """
    super().__init__()
    self.window_size = window_size

    # Running sums and the most recent (truth, prediction) pair.
    self.total_square_error = self.average_error = 0.0
    self.last_true_label = self.last_prediction = None

    # One buffer per running sum.
    self.total_square_error_correction = FastBuffer(window_size)
    self.average_error_correction = FastBuffer(window_size)
def reset(self):
    """Clear the counters, the confusion matrix and the correction buffers.

    ``n_targets`` is recomputed from ``self.targets`` (``0`` when no targets
    are known) and the confusion matrix is restarted with that size.
    """
    self.n_targets = 0 if self.targets is None else len(self.targets)

    self.majority_classifier = 0
    self.correct_no_change = 0
    self.confusion_matrix.restart(self.n_targets)

    capacity = self.window_size
    self.majority_classifier_correction = FastBuffer(capacity)
    self.correct_no_change_correction = FastBuffer(capacity)
def __configure(self, missing_value, strategy, window_size, new_value=1):
    """Store the cleaner settings and create the window when it is needed.

    Parameters
    ----------
    missing_value: object
        Value(s) regarded as missing. Anything exposing ``append`` is kept
        as given; any other value is wrapped in a one-element list.

    strategy: string
        Replacement strategy name.

    window_size: int
        Capacity of the window built for 'mean', 'median' and 'mode'.

    new_value: int (Default: 1)
        Stored replacement value.
    """
    list_like = hasattr(missing_value, 'append')
    self.missing_value = missing_value if list_like else [missing_value]

    self.strategy = strategy
    self.window_size = window_size
    self.new_value = new_value

    # Only these strategies get a window.
    if strategy not in ('mean', 'median', 'mode'):
        return
    self.window = FastBuffer(max_size=window_size)
def reset(self):
    """Forget everything observed so far while keeping the configuration.

    ``n_targets`` is recomputed from ``self.targets``, all four buffers are
    replaced by new ``FastBuffer`` objects of size ``self.window_size``, the
    counters are zeroed and the confusion matrix is restarted.
    """
    self.n_targets = 0 if self.targets is None else len(self.targets)
    capacity = self.window_size

    # Windows over the observed labels and predictions.
    self.true_labels = FastBuffer(capacity)
    self.predictions = FastBuffer(capacity)

    self.temp = 0
    self.last_prediction = self.last_true_label = self.last_sample = None

    self.majority_classifier = 0
    self.correct_no_change = 0
    self.confusion_matrix.restart(self.n_targets)

    # Correction buffers paired with the two counters above.
    self.majority_classifier_correction = FastBuffer(capacity)
    self.correct_no_change_correction = FastBuffer(capacity)
def __init__(self, targets=None, dtype=np.int64, window_size=200):
    """Build empty windowed classification statistics.

    Parameters
    ----------
    targets: list
        Known labels, or None when they are not known yet.

    dtype: data type (Default: numpy.int64)
        Passed through to the ConfusionMatrix.

    window_size: int (Default: 200)
        Capacity of every FastBuffer created here.
    """
    super().__init__()

    self.targets = targets
    self.window_size = window_size
    self.n_targets = 0 if targets is None else len(targets)
    self.confusion_matrix = ConfusionMatrix(self.n_targets, dtype)

    # Windows over the observed labels and predictions.
    self.true_labels = FastBuffer(window_size)
    self.predictions = FastBuffer(window_size)

    self.temp = 0
    self.last_class = None
    self.last_prediction = self.last_true_label = self.last_sample = None

    # Counters and their correction buffers.
    self.majority_classifier = 0
    self.correct_no_change = 0
    self.majority_classifier_correction = FastBuffer(window_size)
    self.correct_no_change_correction = FastBuffer(window_size)
def __configure(self):
    """Create ``self.window`` when the strategy is 'mean', 'median' or 'mode'.

    For any other strategy nothing is changed.
    """
    if self.strategy not in ('mean', 'median', 'mode'):
        return
    self.window = FastBuffer(max_size=self.window_size)
class MissingValuesCleaner(StreamTransform):
    """ This is a transform object. It provides a simple way to replace
    missing values in samples with another value, which can be chosen from
    a set of replacing strategies.

    Parameters
    ----------
    missing_value: int, float, list, tuple, set or numpy.ndarray (Default: numpy.nan)
        Missing value(s) to replace. A collection is treated as several
        missing values; anything else is treated as a single one.

    strategy: string (Default: 'zero')
        The strategy adopted to find the missing value replacement. It can
        be one of the following: 'zero', 'mean', 'median', 'mode', 'custom'.

    window_size: int (Default: 200)
        Defines the window size for the 'mean', 'median' and 'mode'
        strategies.

    new_value: int (Default: 1)
        This is the replacement value in case the chosen strategy is
        'custom'.

    Examples
    --------
    >>> # Imports
    >>> import numpy as np
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.transform.missing_values_cleaner import MissingValuesCleaner
    >>> # Setting up a stream
    >>> stream = FileStream('skmultiflow/data/datasets/covtype.csv', -1, 1)
    >>> stream.prepare_for_use()
    >>> # Setting up the filter to substitute values -47 using the median
    >>> # strategy with a window of 10 samples
    >>> cleaner = MissingValuesCleaner(-47, 'median', 10)
    >>> X, y = stream.next_sample(10)
    >>> X[9, 0] = -47
    >>> # We will use this list to keep track of values
    >>> data = []
    >>> # Iterate over the first 9 samples, to build a sample window
    >>> for i in range(9):
    ...     X_transf = cleaner.partial_fit_transform([X[i].tolist()])
    ...     data.append(X_transf[0][0])
    >>>
    >>> # Transform the last sample. Its first feature is replaced using
    >>> # the median strategy over the window
    >>> X_transf = cleaner.partial_fit_transform([X[9].tolist()])
    >>> np.median(data)

    Notes
    -----
    A missing value in a sample can be coded in many different ways, but
    the most common one is to use numpy's NaN, that's why that is the
    default missing value parameter. NaN entries are always replaced, even
    when another ``missing_value`` is given.

    The user should choose the correct substitution strategy for their use
    case, as each strategy has its pros and cons. The strategy can be
    chosen from a set of predefined strategies, which are: 'zero', 'mean',
    'median', 'mode', 'custom'.

    Notice that `MissingValuesCleaner` can actually be used to replace
    arbitrary values.

    """

    # Strategies whose replacement is computed from the sample window.
    _WINDOW_STRATEGIES = ('mean', 'median', 'mode')

    def __init__(self, missing_value=np.nan, strategy='zero',
                 window_size=200, new_value=1):
        super().__init__()

        if isinstance(missing_value, list):
            self.missing_value = missing_value
        elif isinstance(missing_value,
                        (tuple, set, frozenset, np.ndarray)):
            # Expand other collections instead of wrapping them as one
            # element, which would make the membership test in
            # ``transform`` never match (or raise for arrays).
            self.missing_value = list(missing_value)
        else:
            self.missing_value = [missing_value]

        self.strategy = strategy
        self.window_size = window_size
        self.window = None
        self.new_value = new_value

        self.__configure()

    def __configure(self):
        """ Creates the sample window for the window based strategies. """
        if self.strategy in self._WINDOW_STRATEGIES:
            self.window = FastBuffer(max_size=self.window_size)

    def transform(self, X):
        """ transform

        Does the transformation process in the samples in X. X is modified
        in place and also returned.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data (the same object that was passed in).

        """
        r, c = get_dimensions(X)
        uses_window = self.strategy in self._WINDOW_STRATEGIES
        for i in range(r):
            if uses_window:
                # The sample enters the window before its own missing
                # values are replaced.
                self.window.add_element([X[i][:]])
            for j in range(c):
                if X[i][j] in self.missing_value or np.isnan(X[i][j]):
                    X[i][j] = self._get_substitute(j)

        return X

    def _get_substitute(self, column_index):
        """ _get_substitute

        Computes the replacement for a missing value.

        Parameters
        ----------
        column_index: int
            The index from the column where the missing value was found.

        Returns
        -------
        int or float
            The replacement. For the window based strategies
            ``new_value`` is returned while the window is empty. None is
            returned for an unknown strategy.

        """
        if self.strategy == 'zero':
            return 0
        if self.strategy == 'custom':
            return self.new_value
        if self.strategy not in self._WINDOW_STRATEGIES:
            return None

        if self.window.is_empty():
            return self.new_value

        # Extract the column once; every window strategy works on it.
        column = np.array(self.window.get_queue())[:, column_index]
        if self.strategy == 'mean':
            return np.nanmean(column)
        if self.strategy == 'median':
            return np.nanmedian(column)
        return stats.mode(column, nan_policy='omit')[0]

    def partial_fit_transform(self, X, y=None):
        """ partial_fit_transform

        Transforms the data. ``transform`` itself adds the samples to the
        window for the window based strategies.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: Array-like
            The true labels. Not used.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.

        """
        X = self.transform(X)

        return X

    def partial_fit(self, X, y=None):
        """ partial_fit

        Partial fits the model: adds X to the window when a window based
        strategy is in use.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples to add to the window.

        y: Array-like
            The true labels. Not used.

        Returns
        -------
        MissingValuesCleaner
            self

        """
        X = np.asarray(X)
        if self.strategy in self._WINDOW_STRATEGIES:
            self.window.add_element(X)
        return self

    def get_info(self):
        info = '{}:'.format(type(self).__name__)
        info += ' - strategy: {}'.format(self.strategy)
        info += ' - window_size: {}'.format(self.window_size)
        info += ' - new_value: {}'.format(self.new_value)
        return info
class WindowRegressionMeasurements(BaseObject):
    """ Keeps regression statistics for a learner over a fixed sized
    window.

    Two running sums are maintained, one of squared errors and one of
    absolute errors, together with one FastBuffer per sum. From them the
    MSE (mean square error) and the MAE (mean absolute error) can be
    requested at any moment.

    """

    def __init__(self, window_size=200):
        super().__init__()
        self.window_size = window_size
        self.total_square_error = self.average_error = 0.0
        self.last_true_label = self.last_prediction = None
        self.total_square_error_correction = FastBuffer(window_size)
        self.average_error_correction = FastBuffer(window_size)

    def reset(self):
        """ Zeroes the sums and replaces both buffers with empty ones. """
        self.total_square_error = self.average_error = 0.0
        self.last_true_label = self.last_prediction = None
        capacity = self.window_size
        self.total_square_error_correction = FastBuffer(capacity)
        self.average_error_correction = FastBuffer(capacity)

    def add_result(self, y_true, y_pred):
        """ Updates the statistics with one (true value, prediction) pair.

        Parameters
        ----------
        y_true: float
            The true value.

        y_pred: float
            The predicted value.

        """
        self.last_true_label = y_true
        self.last_prediction = y_pred

        squared = (y_true - y_pred) * (y_true - y_pred)
        absolute = np.absolute(y_true - y_pred)
        self.total_square_error += squared
        self.average_error += absolute

        # Each buffer stores the negated contribution, so whatever
        # add_element hands back can simply be added to the running sum.
        dropped_square = self.total_square_error_correction.add_element(
            np.array([-1 * squared]))
        dropped_absolute = self.average_error_correction.add_element(
            np.array([-1 * absolute]))

        if dropped_square is None or dropped_absolute is None:
            return
        self.total_square_error += dropped_square[0]
        self.average_error += dropped_absolute[0]

    def get_mean_square_error(self):
        """ Computes the window/current mean square error.

        Returns
        -------
        float
            The window/current mean square error, 0.0 when no sample is
            stored.

        """
        if self.sample_count != 0:
            return self.total_square_error / self.sample_count
        return 0.0

    def get_average_error(self):
        """ Computes the window/current average error.

        Returns
        -------
        float
            The window/current average error, 0.0 when no sample is
            stored.

        """
        if self.sample_count != 0:
            return self.average_error / self.sample_count
        return 0.0

    def get_last(self):
        return self.last_true_label, self.last_prediction

    @property
    def sample_count(self):
        return self.total_square_error_correction.get_current_size()

    def get_class_type(self):
        return 'measurement'

    def get_info(self):
        template = ('{}:'
                    ' - sample_count: {}'
                    ' - mean_square_error: {:.6f}'
                    ' - mean_absolute_error: {:.6f}')
        return template.format(type(self).__name__,
                               self.sample_count,
                               self.get_mean_square_error(),
                               self.get_average_error())
class WindowClassificationMeasurements(BaseObject):
    """ This class will maintain a fixed sized window of the newest
    information about one classifier. It can provide, as requested, any of
    the relevant current metrics about the classifier, measured inside the
    window.

    To keep track of statistics inside a window, the class will use a
    ConfusionMatrix object, alongside FastBuffers, to simulate fixed sized
    windows of the important classifier's attributes.

    Its functionality is somewhat similar to those of the
    ClassificationMeasurements class. The difference is that the statistics
    kept by this class are local, or partial, while the statistics kept by
    the ClassificationMeasurements class are global.

    At any given moment, it can compute the following statistics: accuracy,
    kappa, kappa_t, kappa_m, majority_class and error rate.

    Parameters
    ----------
    targets: list
        A list containing the possible labels. The object keeps its own
        copy, so labels discovered later are not appended to the caller's
        list.

    dtype: data type (Default: numpy.int64)
        The data type of the existing labels.

    window_size: int (Default: 200)
        The width of the window. Determines how many samples the object
        can see.

    """

    def __init__(self, targets=None, dtype=np.int64, window_size=200):
        super().__init__()
        # Copy: _get_target_index appends newly seen labels to this list,
        # which must not leak into a list owned by the caller.
        self.targets = list(targets) if targets is not None else None
        self.n_targets = len(self.targets) if self.targets is not None else 0
        self.confusion_matrix = ConfusionMatrix(self.n_targets, dtype)
        self.last_class = None

        self.window_size = window_size
        self.true_labels = FastBuffer(window_size)
        self.predictions = FastBuffer(window_size)
        self.temp = 0
        self.last_prediction = None
        self.last_true_label = None
        self.last_sample = None

        self.majority_classifier = 0
        self.correct_no_change = 0
        self.majority_classifier_correction = FastBuffer(window_size)
        self.correct_no_change_correction = FastBuffer(window_size)

    def reset(self):
        """ Clears all windowed statistics, keeping the known targets. """
        if self.targets is not None:
            self.n_targets = len(self.targets)
        else:
            self.n_targets = 0

        self.true_labels = FastBuffer(self.window_size)
        self.predictions = FastBuffer(self.window_size)
        self.temp = 0
        self.last_prediction = None
        self.last_true_label = None
        self.last_sample = None

        self.majority_classifier = 0
        self.correct_no_change = 0
        self.confusion_matrix.restart(self.n_targets)
        self.majority_classifier_correction = FastBuffer(self.window_size)
        self.correct_no_change_correction = FastBuffer(self.window_size)

    def add_result(self, y_true, y_pred):
        """ Updates its statistics with the results of a prediction. If
        needed it will remove samples from the observation window.

        Parameters
        ----------
        y_true: int
            The true label.

        y_pred: int
            The classifier's prediction

        """
        true_y = self._get_target_index(y_true, True)
        pred = self._get_target_index(y_pred, True)
        old_true = self.true_labels.add_element(np.array([y_true]))
        old_predict = self.predictions.add_element(np.array([y_pred]))

        # When both buffers handed back an old pair, take it out of the
        # confusion matrix and apply the corrections stored for it.
        if (old_true is not None) and (old_predict is not None):
            self.temp += 1
            self.confusion_matrix.remove(
                self._get_target_index(old_true[0]),
                self._get_target_index(old_predict[0]))
            self.correct_no_change += \
                self.correct_no_change_correction.peek()
            self.majority_classifier += \
                self.majority_classifier_correction.peek()

        # get_majority_class returns an index into self.targets, so it is
        # compared with the index of y_true, not with the raw label.
        majority = self.get_majority_class()
        if (majority is not None) and (majority == true_y):
            self.majority_classifier += 1
            self.majority_classifier_correction.add_element([-1])
        else:
            self.majority_classifier_correction.add_element([0])

        # Count the sample for the no-change baseline when its label
        # repeats the previous true label.
        if (self.last_true_label == y_true) and \
                (self.last_true_label is not None):
            self.correct_no_change += 1
            self.correct_no_change_correction.add_element([-1])
        else:
            self.correct_no_change_correction.add_element([0])

        self.confusion_matrix.update(true_y, pred)

        self.last_true_label = y_true
        self.last_prediction = y_pred

    def get_last(self):
        return self.last_true_label, self.last_prediction

    def get_majority_class(self):
        """ Computes the window/current true majority class.

        Returns
        -------
        int
            The index, in the targets list, of the class with the largest
            confusion matrix row total. 0 when every row total is zero and
            None when no target is known.

        """
        if (self.n_targets is None) or (self.n_targets == 0):
            return None
        majority_class = 0
        max_count = 0.0
        for i in range(self.n_targets):
            row_total = 0.0
            for j in range(self.n_targets):
                row_total += self.confusion_matrix.value_at(i, j)
            # Raw totals are compared: dividing every total by the same
            # window size would not change the winner and would divide by
            # zero on an empty window.
            if row_total > max_count:
                max_count = row_total
                majority_class = i

        return majority_class

    def get_accuracy(self):
        """ Computes the window/current accuracy.

        Returns
        -------
        float
            The window/current accuracy, 0.0 for an empty window.

        """
        n_samples = self.true_labels.get_current_size()
        if n_samples == 0:
            return 0.0
        sum_value = 0.0
        n, _ = self.confusion_matrix.shape()
        for i in range(n):
            sum_value += self.confusion_matrix.value_at(i, i)
        return sum_value / n_samples

    def get_incorrectly_classified_ratio(self):
        return 1.0 - self.get_accuracy()

    def _get_target_index(self, target, add=False):
        """ Computes the index of an element in the self.targets list.
        Also reshapes the ConfusionMatrix and adds new found targets if
        add is True.

        Parameters
        ----------
        target: int
            A class label.

        add: bool
            Either to add new found labels to the targets list or not.

        Returns
        -------
        int
            The target index in the self.targets list, or None when the
            target is unknown and was not added.

        """
        if self.targets is None:
            if not add:
                return None
            self.targets = []

        if add and target not in self.targets:
            self.targets.append(target)
            self.n_targets = len(self.targets)
            self.confusion_matrix.reshape(len(self.targets),
                                          len(self.targets))

        try:
            return self.targets.index(target)
        except ValueError:
            return None

    def get_kappa(self):
        """ Computes the window/current Cohen's kappa coefficient.

        Returns
        -------
        float
            The window/current Cohen's kappa coefficient, 0.0 for an empty
            window.

        """
        n_samples = self.true_labels.get_current_size()
        if n_samples == 0:
            # Same result get_kappa_t and get_kappa_m give for an empty
            # window (p0 = 0 and pc = 0).
            return 0.0

        p0 = self.get_accuracy()
        pc = 0.0
        n_rows, n_cols = self.confusion_matrix.shape()
        for i in range(n_rows):
            row = self.confusion_matrix.row(i)
            column = self.confusion_matrix.column(i)

            sum_row = np.sum(row) / n_samples
            sum_column = np.sum(column) / n_samples

            pc += sum_row * sum_column

        if pc == 1:
            return 1
        return (p0 - pc) / (1.0 - pc)

    def get_kappa_t(self):
        """ Computes the window/current Cohen's kappa T coefficient. This
        measures the temporal correlation between samples.

        Returns
        -------
        float
            The window/current Cohen's kappa T coefficient.

        """
        p0 = self.get_accuracy()
        if self.sample_count != 0:
            pc = self.correct_no_change / self.sample_count
        else:
            pc = 0
        if pc == 1:
            return 1
        return (p0 - pc) / (1.0 - pc)

    def get_kappa_m(self):
        """ Computes the window/current Cohen's kappa M coefficient.

        Returns
        -------
        float
            The window/current Cohen's kappa M coefficient.

        """
        p0 = self.get_accuracy()
        if self.sample_count != 0:
            pc = self.majority_classifier / self.sample_count
        else:
            pc = 0
        if pc == 1:
            return 1
        return (p0 - pc) / (1.0 - pc)

    @property
    def _matrix(self):
        return self.confusion_matrix.matrix

    @property
    def sample_count(self):
        return self.true_labels.get_current_size()

    def get_class_type(self):
        return 'measurement'

    def get_info(self):
        return '{}:'.format(type(self).__name__) + \
               ' - sample_count: {}'.format(self.sample_count) + \
               ' - window_size: {}'.format(self.window_size) + \
               ' - accuracy: {:.6f}'.format(self.get_accuracy()) + \
               ' - kappa: {:.6f}'.format(self.get_kappa()) + \
               ' - kappa_t: {:.6f}'.format(self.get_kappa_t()) + \
               ' - kappa_m: {:.6f}'.format(self.get_kappa_m()) + \
               ' - majority_class: {}'.format(self.get_majority_class())
class WindowMultiTargetRegressionMeasurements(BaseObject):
    """ This class is used to keep updated statistics over a multi-target
    regression learner in a multi-target regression problem context inside
    a fixed sized window. It uses FastBuffer objects to simulate the fixed
    sized windows.

    It will keep track of partial metrics, that can be provided at any
    moment. The relevant metrics kept by an instance of this class are:
    AMSE (average mean square error), AMAE (average mean absolute error)
    and ARMSE (average root mean square error).

    """

    def __init__(self, window_size=200):
        super().__init__()
        self.n_targets = 0
        self.total_square_error = 0.0
        self.average_error = 0.0
        self.last_true_label = None
        self.last_prediction = None
        self.total_square_error_correction = FastBuffer(window_size)
        self.average_error_correction = FastBuffer(window_size)
        self.window_size = window_size

    def reset(self):
        """ Zeroes the sums and replaces both buffers with empty ones. """
        self.total_square_error = 0.0
        self.average_error = 0.0
        self.last_true_label = None
        self.last_prediction = None
        self.total_square_error_correction = FastBuffer(self.window_size)
        self.average_error_correction = FastBuffer(self.window_size)

    def add_result(self, y, prediction):
        """ Use the true value and the prediction to update the statistics.

        Parameters
        ----------
        y: float or list or np.ndarray
            The true value(s).

        prediction: float or list or np.ndarray
            The predicted value(s).

        """
        self.last_true_label = y
        self.last_prediction = prediction

        # Work on arrays so that scalars and plain lists are handled like
        # ndarrays: a scalar counts as one target (it used to yield
        # n_targets == 0, breaking every averaged metric) and lists
        # support the element-wise subtraction below.
        y_values = np.asarray(y)
        predicted = np.asarray(prediction)
        self.n_targets = y_values.size

        error = y_values - predicted
        squared = error ** 2
        absolute = np.absolute(error)
        self.total_square_error += squared
        self.average_error += absolute

        # Each buffer stores the negated contribution, so whatever
        # add_element hands back can simply be added to the running sum.
        old_square = self.total_square_error_correction.add_element(
            np.array([-1 * squared]))
        old_average = self.average_error_correction.add_element(
            np.array([-1 * absolute]))

        if (old_square is not None) and (old_average is not None):
            self.total_square_error += old_square[0]
            self.average_error += old_average[0]

    def get_average_mean_square_error(self):
        """ Computes the window/current average mean square error.

        Returns
        -------
        float
            The window/current average mean square error, 0.0 when no
            sample is stored.

        """
        if self._sample_count == 0:
            return 0.0
        else:
            return np.sum(self.total_square_error / self._sample_count) \
                / self.n_targets

    def get_average_absolute_error(self):
        """ Computes the window/current average mean absolute error.

        Returns
        -------
        float
            The window/current average mean absolute error, 0.0 when no
            sample is stored.

        """
        if self._sample_count == 0:
            return 0.0
        else:
            return np.sum(self.average_error / self._sample_count) \
                / self.n_targets

    def get_average_root_mean_square_error(self):
        """ Computes the window/current average root mean square error.

        Returns
        -------
        float
            The window/current average root mean square error, 0.0 when
            no sample is stored.

        """
        if self._sample_count == 0:
            return 0.0
        else:
            return np.sum(np.sqrt(self.total_square_error /
                                  self._sample_count)) \
                / self.n_targets

    def get_last(self):
        return self.last_true_label, self.last_prediction

    @property
    def _sample_count(self):
        return self.total_square_error_correction.get_current_size()

    def get_class_type(self):
        return 'measurement'

    def get_info(self):
        return 'MultiTargetRegressionMeasurements: sample_count: ' + \
                str(self._sample_count) + ' - average_mean_square_error: ' + \
                str(self.get_average_mean_square_error()) + ' - average_mean_absolute_error: ' + \
                str(self.get_average_absolute_error()) + ' - average_root_mean_square_error: ' + \
                str(self.get_average_root_mean_square_error())
class Goowe(StreamModel):
    """ GOOWE (Geometrically Optimum Online Weighted Ensemble), as it is
    described in Bonab and Can (2017).

    The stream is consumed in chunks of ``chunk_size`` instances. At the end
    of every chunk the weights of the existing components are re-estimated by
    solving the linear least-squares system ``A w = d``, a fresh component is
    added (replacing the lowest-weighted one once the ensemble is full) and
    every component is trained on the chunk.

    Common notation in the code is as follows:
        K for the number of classifiers in the ensemble.
        N for data instances.
        A, d as they are, in the aforementioned paper.

    Parameters
    ----------
    n_max_components: int
        Ensemble size limit. Maximum number of component classifiers.
    chunk_size: int
        The amount of instances necessary for ensemble to learn concepts from.
        At each chunk_size many instances, some training is done.
    window_size: int
        Size of sliding window, which keeps record of the last k instances
        that are encountered in the data stream. Currently unused: the
        sliding-window evaluator is not implemented yet.
    logging: bool
        When True, intermediate predictions, weights and training progress
        are printed to stdout.

    Notes
    -----
    ``prepare_post_analysis_req`` has to be called before the first
    ``partial_fit`` / ``predict`` call: it creates the chunk storage and sets
    the number of classes, both of which are ``None`` after construction.
    """

    def __init__(self, n_max_components: int = 10,
                 chunk_size: int = 500, window_size: int = 100,
                 logging=True):
        super().__init__()
        self._num_of_max_classifiers = n_max_components
        self._chunk_size = chunk_size
        self._Logging = logging
        self._num_of_current_classifiers = 0
        self._num_of_processed_instances = 0
        self._classifiers = np.empty((self._num_of_max_classifiers),
                                     dtype=object)
        self._weights = np.zeros((self._num_of_max_classifiers, ))

        # What to save from current Data Chunk --> will be used for
        # adjusting weights, pruning purposes and so on.
        # Individual predictions of components, overall prediction of
        # ensemble, and ground truth info.
        self._chunk_comp_preds = FastBuffer(max_size=chunk_size)
        self._chunk_ensm_preds = FastBuffer(max_size=chunk_size)

        # chunk_data has instances in the chunk and their ground truth.
        # To be initialized after receiving n_features, n_targets
        self._chunk_data = None

        # Some external stuff that is about the data we are dealing with
        # but useful for recording predictions.
        self._num_classes = None
        self._target_values = None      # Required to correctly train HTs
        self._record = False            # Boolean for keeping records to files

        # TODO: Implement Sliding Window Continuous Evaluator.
        # What to save at Sliding Window (last n instances) --> will be
        # used for continuous evaluation.

    def prepare_post_analysis_req(self, num_features, num_targets,
                                  num_classes, target_values, record=False):
        """ Supplies the dataset information the ensemble needs.

        The dataset description is taken explicitly from the user, rather
        than as constructor arguments or per instance, and is used to create
        the chunk storage.

        Parameters
        ----------
        num_features: int
            Number of feature columns of an instance.
        num_targets: int
            Number of columns to be predicted.
        num_classes: int
            Number of possible values of the target column.
        target_values:
            Passed as ``classes`` to the components' ``partial_fit``.
        record: bool
            When True, "component_predictions.csv", "ground_truths.csv" and
            "weights.csv" are (re)created in the working directory and are
            appended to at the end of every chunk.
        """
        self._chunk_data = InstanceWindow(n_features=num_features,
                                          n_targets=num_targets,
                                          max_size=self._chunk_size)
        # num_targets shows how many columns you want to predict in the data.
        # num_classes is eqv to possible number of values that that column
        # can have.
        self._num_classes = num_classes
        self._target_values = target_values
        self._record = record

        if self._record:
            # Create (truncate) the files that keep record of:
            #   - individual component results for every instance
            #   - ground truths for every instance
            #   - weights at each chunk
            # The component predictions file starts with the chunk size.
            with open("component_predictions.csv", "w+") as f_comp_preds:
                f_comp_preds.write(str(self._chunk_size) + '\n')
            for path in ("ground_truths.csv", "weights.csv"):
                with open(path, "w+"):
                    pass
        return

    def _get_components_predictions_for_instance(self, inst):
        """ For a given data instance, takes predictions of
        individual components from the ensemble as a matrix.

        Parameters
        ----------
        inst: data instance for which votes of components are delivered.

        Returns
        ----------
        numpy.array
            A 2-d numpy array where each row corresponds to predictions of
            each classifier.
        """
        preds = np.zeros((self._num_of_current_classifiers,
                          self._num_classes))
        for k in range(self._num_of_current_classifiers):
            kth_comp_pred = self._classifiers[k].predict_proba(inst)
            # Only the scores of the first (and only expected) row are used.
            preds[k, :] = kth_comp_pred[0]
        if self._Logging:
            print('Component Predictions:')
            print(preds)
        return preds

    def _adjust_weights(self):
        """ Weight adjustment by solving linear least squares, as it is
        described in Bonab and Can (2017).

        For every instance of the chunk, with S the (K x num_classes) matrix
        of component scores recorded at prediction time, ``A`` accumulates
        ``S . S^T`` and ``d`` accumulates, per component, the score that
        component gave to the true class. The recorded predictions are
        consumed (popped) from the buffer while doing so.

        If no prediction was recorded for this chunk the weights are left
        untouched.
        """
        n_clfs = self._num_of_current_classifiers
        A = np.zeros(shape=(n_clfs, n_clfs))
        d = np.zeros(shape=(n_clfs, ))

        # Flattened so that every truth is a plain integer class index
        # (same conversion as the one used for training in _process_chunk).
        y_all = self._chunk_data.get_targets_matrix().astype(int).flatten()

        n_used = 0
        for class_index in y_all:
            comp_preds = self._chunk_comp_preds.get_next_element()
            if comp_preds is None:
                # Fewer predictions than instances were recorded (e.g. the
                # chunk was only trained on, never predicted); the buffer is
                # expected to yield None once it is exhausted.
                break
            A = A + comp_preds.dot(comp_preds.T)
            # One entry per component: its own score for the true class.
            d = d + comp_preds[:, class_index]
            n_used += 1

        if n_used == 0:
            # Nothing to estimate from: solving the all-zero system would
            # only wipe the current weights.
            return

        # A and d are filled. Now, the linear system Aw=d to be solved
        # to get our desired weights. w is of size K.
        w = np.linalg.lstsq(A, d, rcond=None)[0]

        # _weights has maximum size but what we found can be smaller;
        # only the entries of the current components are overwritten.
        self._weights[:n_clfs] = w
        return

    def _normalize_weights(self):
        """ Normalizes the weights of the ensemble to (0, 1) range.
        Performs (x_i - min(x)) / (max(x) - min(x)) on the weights of the
        current components. If all of them are equal, every component gets
        the weight 1 / K instead.
        """
        n_clfs = self._num_of_current_classifiers
        cur_weights = self._weights[:n_clfs]
        lowest = np.amin(cur_weights)
        highest = np.amax(cur_weights)

        if lowest == highest:   # all weights are the same
            self._weights[:n_clfs] = 1. / n_clfs
        else:
            self._weights[:n_clfs] = \
                (cur_weights - lowest) / (highest - lowest)
        return

    def _normalize_weights_softmax(self):
        """ Normalizes the weights of the current components with softmax:
        exp(x_i) / sum_j exp(x_j), so that they are positive and sum to 1.
        """
        n_clfs = self._num_of_current_classifiers
        if n_clfs == 0:
            return
        cur_weights = self._weights[:n_clfs]
        # Shifting by the maximum leaves the softmax value unchanged but
        # keeps exp() from overflowing on large least-squares weights.
        exps = np.exp(cur_weights - np.max(cur_weights))
        self._weights[:n_clfs] = exps / np.sum(exps)
        return

    def _process_chunk(self):
        """ A subroutine that runs at the end of each chunk, allowing
        the components to be trained and ensemble weights to be adjusted.
        Until the first _process_chunk call, the ensemble is not yet ready.
        At first call, the first component is learned.
        At the rest of the calls, new components are formed, and the older
        ones are trained by the given chunk.
        If the ensemble size is reached, then the lowest weighted component
        is removed from the ensemble.
        """
        new_clf = HoeffdingTree()  # with default parameters for now
        new_clf.reset()

        # Save records of previous chunk
        if self._record and self._num_of_current_classifiers > 0:
            self._record_truths_this_chunk()
            self._record_comp_preds_this_chunk()
            self._record_weights_this_chunk()

        # Case 1: No classifier in the ensemble yet, first chunk:
        if self._num_of_current_classifiers == 0:
            self._classifiers[0] = new_clf
            self._weights[0] = 1.0  # weight is 1 for the first clf
            self._num_of_current_classifiers += 1
        else:
            # First, adjust the weights of the old component classifiers
            # according to what happened in this chunk.
            self._adjust_weights()

            # Case 2: There are classifiers in the ensemble but
            # the ensemble size is still not capped.
            if (self._num_of_current_classifiers
                    < self._num_of_max_classifiers):
                # Put the new classifier to ensemble with the weight of 1
                self._classifiers[self._num_of_current_classifiers] = new_clf
                self._weights[self._num_of_current_classifiers] = float(1.0)
                self._num_of_current_classifiers += 1
            # Case 3: Ensemble size is capped. Need to replace the component
            # with lowest weight.
            else:
                assert (self._num_of_current_classifiers
                        == self._num_of_max_classifiers), "Ensemble not full."
                index_of_lowest_weight = np.argmin(self._weights)
                self._classifiers[index_of_lowest_weight] = new_clf
                self._weights[index_of_lowest_weight] = 1.0

            # Normalizing weights to simplify numbers
            self._normalize_weights_softmax()
            if self._Logging:
                print("After normalization weights: ")
                print(self._weights)

        # Ensemble maintenance is done. Now train all classifiers
        # in the ensemble from the current chunk.
        # Can be parallelized.
        data_features = self._chunk_data.get_attributes_matrix()
        data_truths = self._chunk_data.get_targets_matrix()
        data_truths = data_truths.astype(int).flatten()

        if self._Logging:
            print("Starting training the components with the current chunk...")
        for k in range(self._num_of_current_classifiers):
            if self._Logging:
                print("Training classifier {}".format(k))
            self._classifiers[k].partial_fit(data_features, data_truths,
                                             classes=self._target_values)
        if self._Logging:
            print(
                "Training the components with the current chunk completed...")
        return

    def _record_truths_this_chunk(self):
        """ Appends the ground truths of the current chunk to
        "ground_truths.csv" as one comma separated line. """
        data_truths = self._chunk_data.get_targets_matrix()
        data_truths = data_truths.astype(int).flatten()

        # Default behaviour is to store list of lists for savetxt.
        # Hence, to prevent newline after each element of list, we surround
        # the truth array with one more set of brackets.
        with open("ground_truths.csv", "ab") as f:
            np.savetxt(f, [data_truths], delimiter=",", fmt='%d')
        return

    def _record_comp_preds_this_chunk(self):
        """ Appends the number of current components, followed by the
        component predictions buffered for this chunk, to
        "component_predictions.csv". """
        with open("component_predictions.csv", "a+") as f:
            np.savetxt(f, [self._num_of_current_classifiers], fmt='%d')
            # Entries are written one by one, so the buffered matrices do
            # not have to be stackable into a single array.
            for comp_preds in self._chunk_comp_preds.get_queue():
                np.savetxt(f, comp_preds, delimiter=',', fmt='%1.5f')
        return

    def _record_weights_this_chunk(self):
        """ Appends the number of current components, followed by the whole
        weight vector as one comma separated line, to "weights.csv". """
        with open("weights.csv", "a+") as f:
            np.savetxt(f, [self._num_of_current_classifiers], fmt='%d')
            np.savetxt(f, [self._weights], delimiter=',', fmt='%1.5f')
        return

    # --------------------------------------------------
    # Overridden methods from the parent (StreamModel)
    # --------------------------------------------------
    def fit(self, X, y, classes=None, weight=None):
        """ Not supported: only the stream version is implemented.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError("For now, only the stream version "
                                  "is implemented. Use partial_fit()")

    def _store_instance(self, X_i, y_i):
        """ Adds one instance to the current chunk and processes the chunk
        when the number of seen instances reaches a multiple of chunk_size.
        """
        self._chunk_data.add_element(X_i, y_i)
        self._num_of_processed_instances += 1

        # If at the end of a chunk, start training components
        # and adjusting weights using information in this chunk.
        if self._num_of_processed_instances % self._chunk_size == 0:
            print("Instance {}".format(self._num_of_processed_instances))
            self._process_chunk()

    def partial_fit(self, X, y, classes=None, weight=None):
        """ Stores the given instance(s); components are only trained when a
        chunk gets full.

        Works with individual instances, as well as a bunch of instances,
        since there can be pre-training for warm start.

        Parameters
        ----------
        X: array-like with one row per instance
            A single instance (len(X) == 1) or several instances.
        y: array-like
            Ground truth(s) of the instance(s) in X.
        classes, weight:
            Not used by this method.
        """
        n_instances = len(X)
        if n_instances == 1:
            # Save X and y to train classifiers later.
            # y is required to be 1x1, and hence the square brackets.
            self._store_instance(X, np.array([y]))
        elif n_instances > 1:
            # Input is a chunk. Add them individually.
            for x_row, y_row in zip(X, y):
                self._store_instance(np.array([x_row]),
                                     np.array([[y_row]]))
        else:
            print("Something wrong with the data...")
            print("len(X) is: {}".format(len(X)))
        return

    def predict(self, X):
        """ For a given data instance, yields the prediction values.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            Samples for which we want to predict the labels.

        Returns
        -------
        numpy.array
            For every instance in X, the index of the class with the highest
            weighted ensemble score.
        """
        if len(X) == 1:
            rows = [X]
        else:
            # Every instance is handed over as a single-row 2-d array, the
            # same shape the single-instance path and partial_fit use.
            rows = [np.array([x_row]) for x_row in X]

        predictions = [np.argmax(self.predict_proba(row)) for row in rows]

        if self._Logging:
            print('Ensemble Prediction:')
            print(np.array(predictions))
        return np.array(predictions)

    def predict_proba(self, X):
        """ For a given data instance, takes WEIGHTED combination
        of components to get relevance scores for each class.

        The component predictions and the ensemble vote are also stored in
        the chunk buffers for later weight adjustment and recording.

        Parameters
        ----------
        X: data instance for which weighted combination is delivered.

        Returns
        ----------
        numpy.array
            A vector with number_of_classes elements where each element
            represents class score of corresponding class for this instance.
        """
        # Get only the weights of the components that exist so far.
        weights = np.array(self._weights)
        weights = weights[:self._num_of_current_classifiers]
        components_preds = self._get_components_predictions_for_instance(X)

        # Save individual component predictions and ensemble prediction
        # for later analysis. The matrix is wrapped in a list so that it is
        # handed to the buffer as one element.
        self._chunk_comp_preds.add_element([components_preds])

        weighted_ensemble_vote = np.dot(weights, components_preds)
        self._chunk_ensm_preds.add_element(weighted_ensemble_vote)
        return weighted_ensemble_vote

    def reset(self):
        """ Not implemented: does nothing. """
        pass

    def score(self, X, y):
        """ Not implemented: does nothing and returns None. """
        pass

    def get_info(self):
        """ Returns a one-line description of the ensemble configuration. """
        return 'The Ensemble GOOWE (Bonab and Can, 2017) with' + \
            ' - n_max_components: ' + str(self._num_of_max_classifiers) + \
            ' - num_of_current_components: ' + str(self._num_of_current_classifiers) + \
            ' - chunk_size: ' + str(self._chunk_size) + \
            ' - num_dimensions_in_label_space(num_classes): ' + str(self._num_classes) + \
            ' - recording: ' + str(self._record)

    def get_class_type(self):
        """ Not implemented: does nothing and returns None. """
        pass

    # Some getters and setters..
    def get_number_of_current_classifiers(self):
        """ Returns the number of components currently in the ensemble. """
        return self._num_of_current_classifiers

    def get_number_of_max_classifiers(self):
        """ Returns the ensemble size limit. """
        return self._num_of_max_classifiers

    # Helper methods for GooweMS
    def get_classifiers(self):
        """ Returns the array holding the component classifiers. """
        return self._classifiers

    def set_classifiers(self, classifiers):
        """ Replaces the array holding the component classifiers. """
        self._classifiers = classifiers

    def get_weights(self):
        """ Returns the weight vector (one entry per ensemble slot). """
        return self._weights