class MissingValuesCleaner(StreamTransform):
    """ Fill missing values with some defined value.

    Provides a simple way to replace missing values in data samples with some
    value. The imputation value can be set via a set of imputation strategies.

    Parameters
    ----------
    missing_value: int, float or list (Default: numpy.nan)
        Missing value to replace. NaN entries are always treated as missing,
        whatever is given here.

    strategy: string (Default: 'zero')
        The strategy adopted to find the missing value replacement. It can
        be one of the following: 'zero', 'mean', 'median', 'mode', 'custom'.

    window_size: int (Default: 200)
        Defines the window size for the 'mean', 'median' and 'mode' strategies.

    new_value: int (Default: 1)
        This is the replacement value in case the chosen strategy is 'custom'.
        It is also the fallback of the windowed strategies when the window
        holds no valid observation for the feature.

    Examples
    --------
    >>> # Imports
    >>> import numpy as np
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.transform.missing_values_cleaner import MissingValuesCleaner
    >>> # Setting up a stream
    >>> stream = FileStream('skmultiflow/data/datasets/covtype.csv', -1, 1)
    >>> # Setting up the filter to substitute values -47 by the median of the
    >>> # last 10 samples
    >>> cleaner = MissingValuesCleaner(-47, 'median', 10)
    >>> X, y = stream.next_sample(10)
    >>> X[9, 0] = -47
    >>> # We will use this list to keep track of values
    >>> data = []
    >>> # Iterate over the first 9 samples, to build a sample window
    >>> for i in range(9):
    ...     X_transf = cleaner.partial_fit_transform([X[i].tolist()])
    ...     data.append(X_transf[0][0])
    >>>
    >>> # Transform last sample. The first feature should be replaced by the list's
    >>> # median value
    >>> X_transf = cleaner.partial_fit_transform([X[9].tolist()])
    >>> np.median(data)

    Notes
    -----
    A missing value in a sample can be coded in many different ways, but the
    most common one is to use numpy's NaN, which is why that is the default
    missing value parameter.

    The user should choose the correct substitution strategy for their use
    case, as each strategy has its pros and cons. The strategy can be chosen
    from a set of predefined strategies, which are: 'zero', 'mean', 'median',
    'mode', 'custom'.

    Entries that are themselves missing (NaN or one of ``missing_value``) are
    stored in the window unchanged but are left out of the mean, median and
    mode computations.

    Notice that `MissingValuesCleaner` can actually be used to replace
    arbitrary values.

    """

    # Strategies that need a window of past samples.
    _WINDOW_STRATEGIES = ('mean', 'median', 'mode')

    def __init__(self, missing_value=np.nan, strategy='zero', window_size=200, new_value=1):
        super().__init__()
        if isinstance(missing_value, list):
            self.missing_value = missing_value
        else:
            self.missing_value = [missing_value]
        self.strategy = strategy
        self.window_size = window_size
        self.window = None
        self.new_value = new_value

        self.__configure()

    def __configure(self):
        # The window is only allocated for strategies that read it.
        if self.strategy in self._WINDOW_STRATEGIES:
            self.window = FastBuffer(max_size=self.window_size)

    def _is_missing(self, value):
        """ Tell whether a single entry is missing (sentinel value or NaN). """
        return value in self.missing_value or np.isnan(value)

    def transform(self, X):
        """ transform

        Does the transformation process in the samples in X. Missing entries
        are overwritten in place, and X itself is returned.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data (the same object as X).

        """
        r, c = get_dimensions(X)
        uses_window = self.strategy in self._WINDOW_STRATEGIES
        for i in range(r):
            if uses_window:
                # Store a copy: slicing an ndarray row yields a view, and the
                # in-place imputation below would otherwise leak imputed
                # values into the window statistics.
                self.window.add_element([np.array(X[i])])
            for j in range(c):
                if self._is_missing(X[i][j]):
                    X[i][j] = self._get_substitute(j)

        return X

    def _get_substitute(self, column_index):
        """ _get_substitute

        Computes the replacement for a missing value.

        Parameters
        ----------
        column_index: int
            The index from the column where the missing value was found.

        Returns
        -------
        int or float
            The replacement.

        Raises
        ------
        ValueError
            If the configured strategy is not one of the supported ones.

        """
        if self.strategy == 'zero':
            return 0
        if self.strategy == 'custom':
            return self.new_value
        if self.strategy not in self._WINDOW_STRATEGIES:
            raise ValueError("Unknown strategy '{}'. Expected one of: 'zero', 'mean', "
                             "'median', 'mode', 'custom'.".format(self.strategy))

        if self.window.is_empty():
            return self.new_value

        column = np.array(self.window.get_queue())[:, column_index]
        # Leave out every entry that is itself missing, so that a sentinel such
        # as -47 does not skew the statistic it is about to be replaced with.
        missing = np.isnan(column)
        for sentinel in self.missing_value:
            missing |= (column == sentinel)
        observed = column[~missing]
        if observed.size == 0:
            # Nothing valid has been seen for this feature yet.
            return self.new_value

        if self.strategy == 'mean':
            return np.mean(observed)
        if self.strategy == 'median':
            return np.median(observed)
        # 'mode': most frequent value, smallest one on ties. np.unique returns
        # sorted values and argmax picks the first maximum.
        values, counts = np.unique(observed, return_counts=True)
        return values[np.argmax(counts)]

    def partial_fit_transform(self, X, y=None):
        """ partial_fit_transform

        Partially fits the model and then apply the transform to the data.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: Array-like
            The true labels. Not used.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.

        """
        # transform() already feeds the window, so no separate fit is needed.
        return self.transform(X)

    def partial_fit(self, X, y=None):
        """ partial_fit

        Partial fits the model.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: Array-like
            The true labels. Not used.

        Returns
        -------
        MissingValuesCleaner
            self

        """
        if self.strategy in self._WINDOW_STRATEGIES:
            # Copy so that later in-place edits of X do not alter the window.
            self.window.add_element(np.array(X))
        return self
class MyKNNClassifier(KNNClassifier):
    """ k-nearest-neighbours classifier with optional extras.

    Extends `KNNClassifier` with two optional behaviours:

    * distance-weighted voting, where each neighbour votes with the inverse
      of its distance to the query sample;
    * online standardization, where every feature is centred and scaled with
      the mean and standard deviation of a sliding window of raw samples.

    Parameters
    ----------
    n_neighbors: int (Default: 5)
        Number of neighbours used for a prediction.

    max_window_size: int (Default: 1000)
        Forwarded to `KNNClassifier`; also the size of the window of raw
        samples used for standardization.

    leaf_size: int (Default: 30)
        Forwarded to `KNNClassifier`.

    metric: string (Default: 'euclidean')
        Forwarded to `KNNClassifier`.

    weighted_vote: bool (Default: False)
        If True, votes are weighted by the inverse neighbour distance.

    standardize: bool (Default: False)
        If True, samples are standardized before being stored or queried.

    Notes
    -----
    With ``standardize=True`` both `partial_fit` and `predict_proba` add the
    raw samples they receive to the statistics window.

    """

    def __init__(self, n_neighbors=5, max_window_size=1000, leaf_size=30, metric='euclidean',
                 weighted_vote=False, standardize=False):
        self.weighted_vote = weighted_vote
        self.standardize = standardize
        super().__init__(n_neighbors=n_neighbors,
                         max_window_size=max_window_size,
                         leaf_size=leaf_size,
                         metric=metric)
        self.window_size = max_window_size
        self.window = None
        self.__configure()

    def __configure(self):
        # Sliding window of raw samples backing the running mean / std.
        self.window = FastBuffer(max_size=self.window_size)

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """ Add the samples in X, with targets y, to the model.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples to store.

        y: Array-like
            The target of each sample.

        classes: list, optional (Default: None)
            Known class labels, merged into ``self.classes``.

        sample_weight: Not used.

        Returns
        -------
        MyKNNClassifier
            self

        """
        if self.standardize:
            # transform_vector() records the raw rows in the statistics window
            # itself. The standardized result must not be added as well, or
            # the running mean / std would mix raw and scaled values.
            # A float copy keeps the caller's data untouched and avoids
            # truncation when X holds integers.
            X = self.transform_vector(np.array(X, dtype=float))
        else:
            self.window.add_element(np.array(X))

        r, c = get_dimensions(X)
        if classes is not None:
            self.classes = list(set().union(self.classes, classes))

        for i in range(r):
            self.data_window.add_sample(X[i], y[i])
        return self

    def standardization(self, X):
        """ Standardize X as a batch with a freshly fitted `StandardScaler`.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples to standardize.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The standardized samples.

        """
        return StandardScaler().fit_transform(X)

    def predict_proba(self, X):
        """ Estimate the class scores of the samples in X.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples to score.

        Returns
        -------
        numpy.ndarray
            One row per sample and one column per class index up to the
            largest known class. While fewer than ``n_neighbors`` samples are
            stored, an all-zero array of shape (n_samples, 1) is returned.
            With ``weighted_vote=True`` the scores are inverse-distance
            weighted and are not normalized to sum to one.

        """
        if self.standardize:
            X = self.transform_vector(np.array(X, dtype=float))

        r, c = get_dimensions(X)
        if self.data_window is None or self.data_window.size < self.n_neighbors:
            # The model is empty, defaulting to zero
            return np.zeros(shape=(r, 1))

        # `np.int` was removed from NumPy; the builtin `int` is the
        # documented replacement.
        self.classes = list(set().union(
            self.classes, np.unique(self.data_window.targets_buffer.astype(int))))

        new_dist, new_ind = self._get_neighbors(X)
        n_slots = int(max(self.classes) + 1)
        # Lower bound on distances so an exact match does not divide by zero.
        tiny = np.finfo(float).eps

        proba = []
        for i in range(r):
            votes = [0.0] * n_slots
            n_found = len(new_ind[i])
            # Pair each neighbour with its own distance. A running position
            # counter shared across rows would run past the end of the
            # distance array from the second sample on.
            for index, distance in zip(new_ind[i], new_dist[i]):
                if self.weighted_vote:
                    weight = 1. / max(distance, tiny)
                else:
                    weight = 1.
                votes[int(self.data_window.targets_buffer[index])] += weight / n_found
            proba.append(votes)

        return np.asarray(proba)

    def calculate_mean(self, column_index):
        """ Mean of one feature over the window (0 while the window is empty). """
        mean = 0.
        if not self.window.is_empty():
            mean = np.nanmean(np.array(self.window.get_queue())[:, column_index])
        return mean

    def calculate_stddev(self, column_index):
        """ Standard deviation of one feature over the window.

        Returns 1 while the window is empty or when the deviation is zero, so
        the value is always safe to divide by.
        """
        std = 1.
        if not self.window.is_empty():
            std = np.nanstd(np.array(self.window.get_queue())[:, column_index])
        if std == 0.:
            std = 1.
        return std

    def _window_statistics(self, n_features):
        """ Per-feature (mean, std) of the window, with the same defaults as
        `calculate_mean` and `calculate_stddev`. """
        if self.window.is_empty():
            return np.zeros(n_features), np.ones(n_features)
        history = np.array(self.window.get_queue())
        means = np.nanmean(history, axis=0)
        stds = np.nanstd(history, axis=0)
        stds[stds == 0.] = 1.
        return means, stds

    def transform_vector(self, X):
        """ Standardize the samples in X in place, row by row.

        Each row is scaled with the statistics of the samples seen before it,
        and its raw values are then appended to the statistics window.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The samples to standardize. Should hold floats, since the values
            are overwritten in place.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The standardized samples (the same object as X).

        """
        r, c = get_dimensions(X)
        for i in range(r):
            # Keep the raw values: the window must never see scaled data.
            raw_row = np.copy([X[i][:]])
            # The window only changes between rows, so the statistics are
            # computed once per row rather than once per feature.
            means, stds = self._window_statistics(c)
            for j in range(c):
                X[i][j] = (X[i][j] - means[j]) / stds[j]
            self.window.add_element(raw_row)
        return X
class WindowedMinmaxScaler(StreamTransform):
    """ Transform features by scaling each feature to the range [0, 1].

    This estimator scales and translates each feature individually, using the
    minimum and maximum observed for that feature in a sliding window of past
    samples: ``(value - min) / (max - min)``.

    Parameters
    ----------
    window_size: int (Default: 200)
        Defines the window size to compute min and max values.

    Notes
    -----
    A sample is scaled with the bounds of the samples seen before it and is
    added to the window afterwards, so a value outside the current bounds
    maps outside [0, 1]. While the window is empty the bounds default to
    0 and 1, which leaves the first sample unchanged. A feature that is
    constant over the window is mapped to 0.

    """

    def __init__(self, window_size=200):
        super().__init__()
        self.window_size = window_size
        self.window = None

        self.__configure()

    def __configure(self):
        self.window = FastBuffer(max_size=self.window_size)

    def transform(self, X):
        """ Does the transformation process in the samples in X.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed. Float
            arrays and nested lists are modified in place; other arrays are
            converted to a float copy first.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.

        """
        if isinstance(X, np.ndarray) and not np.issubdtype(X.dtype, np.floating):
            # Scaled values written into an integer array would be truncated.
            X = X.astype(float)

        r, c = get_dimensions(X)
        for i in range(r):
            # Keep the raw values: the window must never see scaled data.
            raw_row = np.copy([X[i][:]])
            # The window only changes between rows, so the bounds are
            # computed once per row rather than once per feature.
            mins, maxs = self._window_bounds(c)
            for j in range(c):
                span = maxs[j] - mins[j]
                if span == 0:
                    X[i][j] = 0
                else:
                    # Target range is [0, 1]. Mapping back with the data
                    # bounds would simply reproduce the input value.
                    X[i][j] = (X[i][j] - mins[j]) / span
            self.window.add_element(raw_row)
        return X

    def _window_bounds(self, n_features):
        """ Per-feature (min, max) of the window, with the same defaults as
        `_get_min` and `_get_max`. """
        if self.window.is_empty():
            return np.zeros(n_features), np.ones(n_features)
        history = np.array(self.window.get_queue())
        return np.nanmin(history, axis=0), np.nanmax(history, axis=0)

    def _get_min(self, column_index):
        """ Minimum of one feature over the window (0 while it is empty). """
        min_val = 0.
        if not self.window.is_empty():
            min_val = np.nanmin(np.array(self.window.get_queue())[:, column_index])
        return min_val

    def _get_max(self, column_index):
        """ Maximum of one feature over the window (1 while it is empty). """
        max_val = 1.
        if not self.window.is_empty():
            max_val = np.nanmax(np.array(self.window.get_queue())[:, column_index])
        return max_val

    def partial_fit_transform(self, X, y=None):
        """ Partially fits the model and then apply the transform to the data.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: numpy.ndarray (optional, default=None)
            The target values. Not used.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            The transformed data.

        """
        # transform() already feeds the window, so no separate fit is needed.
        return self.transform(X)

    def partial_fit(self, X, y=None):
        """ Partial fits the model.

        Parameters
        ----------
        X: numpy.ndarray of shape (n_samples, n_features)
            The sample or set of samples that should be transformed.

        y: numpy.ndarray (optional, default=None)
            The target values. Not used.

        Returns
        -------
        WindowedMinmaxScaler
            self

        """
        # Copy so that later in-place edits of X do not alter the window.
        self.window.add_element(np.array(X))
        return self