def __add__(self, other):
    """Add method for GaussianNB models.

    Merges the sufficient statistics (class counts, per-class means and
    variances) of two fitted models using the pooled mean/variance formulas.
    """
    this = copy.deepcopy(self)
    this_class2idx = {cls: idx for idx, cls in enumerate(this.classes_)}
    other_class2idx = {cls: idx for idx, cls in enumerate(other.classes_)}
    for class_i in this.classes_:
        i = this_class2idx[class_i]
        j = other_class2idx[class_i]
        N_x = this.class_count_[i]
        N_y = other.class_count_[j]
        mu_x = this.theta_[i, :]
        mu_y = other.theta_[j, :]
        sigma_x = this.sigma_[i, :]
        sigma_y = other.sigma_[j, :]  # was this.sigma_[j, :], a bug
        N_total = N_x + N_y
        # Pooled mean and variance of the two sub-samples
        mu_xy = N_x * mu_x + N_y * mu_y
        sigma_xy = (sigma_x * N_x + sigma_y * N_y
                    + (N_x * N_y * (mu_x - mu_y) ** 2) / N_total)
        this.theta_[i, :] = mu_xy / N_total
        this.sigma_[i, :] = sigma_xy / N_total
        this.class_count_[i] += N_y
    this.class_prior_[:] = this.class_count_ / np.sum(this.class_count_)
    return this
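# A minimal usage sketch (assumptions: the __add__ above is attached to
# sklearn's GaussianNB, and the installed sklearn still exposes the per-class
# variances as `sigma_`; newer releases renamed it to `var_`). Two models fit
# on disjoint shards are merged and compared against a single full-data fit.
import copy

import numpy as np
from sklearn.naive_bayes import GaussianNB

GaussianNB.__add__ = __add__  # attach the merge operator

X, y = np.random.randn(200, 5), np.repeat([0, 1], 100)
nb_a = GaussianNB().fit(X[::2], y[::2])    # even rows
nb_b = GaussianNB().fit(X[1::2], y[1::2])  # odd rows
nb_full = GaussianNB().fit(X, y)

merged = nb_a + nb_b
assert np.allclose(merged.theta_, nb_full.theta_)  # pooled means are exact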
def __add__(self, other): """Add method for Linear models with coef and intercept attributes. Parameters ---------- other : fitted sklearn linear model Model to add. Returns ------- model : Linear model Model with updated coefficients. """ model = copy.deepcopy(self) model.coef_ += other.coef_ model.intercept_ += other.intercept_ return model
def __add__(self, other): """ Add method for DiscreteNB models. Parameters ---------- other : fitted splearn multinomilal NB model with class_count_ and feature_count_ attribute Model to add. Returns ------- model : splearn Naive Bayes model Model with updated coefficients. """ # The rdd operator add does not consider __radd__ :( if other == 0: return self model = copy.deepcopy(self) model.class_count_ += other.class_count_ model.feature_count_ += other.feature_count_ model._update_class_log_prior() model._update_feature_log_prob() return model
minibatch_iterators = iter_minibatchs(data_streamer, minibatch_size)

def learn(classifier, stats, batch):
    X_train, y_train = batch  # tuple-parameter unpacking is Python 2 only
    if 't0' not in stats:
        stats['t0'] = time.time()
    classifier.partial_fit(X_train, y_train, classes=all_classes)
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append((stats['accuracy'],
                                     time.time() - stats['t0']))
    return classifier, stats

import copy  # deepcopy lives in the standard library, not sklearn.base

def merge(left, right):
    cf1, stats1 = left
    cf2, stats2 = right
    new = copy.deepcopy(cf1)
    new.coef_ += cf2.coef_
    new.intercept_ += cf2.intercept_
    return new, stats1

# Map/Reduce on Spark
sgd, stats = (sc.parallelize(minibatch_iterators)
              .map(lambda batch: learn(classifier, stats, batch))
              .reduce(merge))

def plot_accuracy(x, y, plot_placement, x_legend):
    """Plot accuracy as a function of x."""
    x = np.array(x)
    y = np.array(y)
    pl.subplots_adjust(hspace=0.5)
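# `iter_minibatchs` is called above but never shown; here is a minimal sketch
# of what it might look like, assuming `data_streamer` yields one (x, y) pair
# at a time. Only the name and the (X, y) batch contract come from the call
# site above; the body is my guess.
import numpy as np

def iter_minibatchs(stream, minibatch_size):
    """Group a stream of (x, y) pairs into (X, y) minibatch arrays."""
    X_batch, y_batch = [], []
    for x, y in stream:
        X_batch.append(x)
        y_batch.append(y)
        if len(X_batch) == minibatch_size:
            yield np.vstack(X_batch), np.asarray(y_batch)
            X_batch, y_batch = [], []
    if X_batch:  # flush the final, possibly smaller batch
        yield np.vstack(X_batch), np.asarray(y_batch)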
def merge(left, right):
    # Merge two fitted ensembles by concatenating their fitted sub-estimators.
    new = copy.deepcopy(left)
    new.estimators_ += right.estimators_
    new.n_estimators = len(new.estimators_)
    return new
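# A minimal usage sketch: two forests grown on disjoint halves of the data
# are combined into one larger ensemble. This works because sklearn's
# bagging-style ensembles predict by aggregating over `estimators_`
# (assumption: both halves cover the same classes and feature space).
import copy

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X, y = np.random.randn(400, 8), np.tile([0, 1], 200)
half_a = RandomForestClassifier(n_estimators=50).fit(X[:200], y[:200])
half_b = RandomForestClassifier(n_estimators=50).fit(X[200:], y[200:])

forest = merge(half_a, half_b)
assert forest.n_estimators == 100
forest.predict(X[:5])  # the merged forest now votes with all 100 trees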
def preprocess_data(self, data: pd.DataFrame, stage: str = "inference") -> Tuple[pd.DataFrame, list]: """The preprocessing, like Categorical Encoding, Normalization, etc. which any dataframe should undergo before feeding into the dataloder Args: data (pd.DataFrame): A dataframe with the features and target stage (str, optional): Internal parameter. Used to distinguisj between fit and inference. Defaults to "inference". Returns: tuple[pd.DataFrame, list]: Returns the processed dataframe and the added features(list) as a tuple """ logger.info(f"Preprocessing data: Stage: {stage}...") added_features = None if self.config.encode_date_columns: for field_name, freq in self.config.date_columns: data = self.make_date(data, field_name) data, added_features = self.add_datepart(data, field_name, frequency=freq, prefix=None, drop=True) # The only features that are added are the date features extracted # from the date which are categorical in nature if (added_features is not None) and (stage == "fit"): logger.debug( f"Added {added_features} features after encoding the date_columns" ) self.config.categorical_cols += added_features self.config.categorical_dim = (len(self.config.categorical_cols) if self.config.categorical_cols is not None else 0) # Encoding Categorical Columns if len(self.config.categorical_cols) > 0: if stage == "fit": if self.do_leave_one_out_encoder(): logger.debug( "Encoding Categorical Columns using LeavOneOutEncoder") self.categorical_encoder = ce.LeaveOneOutEncoder( cols=self.config.categorical_cols, random_state=42) # Multi-Target Regression uses the first target to encode the categorical columns if len(self.config.target) > 1: logger.warning( f"Multi-Target Regression: using the first target({self.config.target[0]}) to encode the categorical columns" ) data = self.categorical_encoder.fit_transform( data, data[self.config.target[0]]) else: logger.debug( "Encoding Categorical Columns using OrdinalEncoder") self.categorical_encoder = OrdinalEncoder( cols=self.config.categorical_cols) data = self.categorical_encoder.fit_transform(data) else: data = self.categorical_encoder.transform(data) # Transforming Continuous Columns if (self.config.continuous_feature_transform is not None) and (len(self.config.continuous_cols) > 0): if stage == "fit": transform = self.CONTINUOUS_TRANSFORMS[ self.config.continuous_feature_transform] self.continuous_transform = transform["callable"]( **transform["params"]) # TODO implement quantile noise data.loc[:, self.config. continuous_cols] = self.continuous_transform.fit_transform( data.loc[:, self.config.continuous_cols]) else: data.loc[:, self.config. continuous_cols] = self.continuous_transform.transform( data.loc[:, self.config.continuous_cols]) # Normalizing Continuous Columns if (self.config.normalize_continuous_features) and (len( self.config.continuous_cols) > 0): if stage == "fit": self.scaler = StandardScaler() data.loc[:, self.config. 
continuous_cols] = self.scaler.fit_transform( data.loc[:, self.config.continuous_cols]) else: data.loc[:, self.config.continuous_cols] = self.scaler.transform( data.loc[:, self.config.continuous_cols]) # Converting target labels to a 0 indexed label if self.config.task == "classification": if stage == "fit": self.label_encoder = LabelEncoder() data[self.config.target[0]] = self.label_encoder.fit_transform( data[self.config.target[0]]) else: if self.config.target[0] in data.columns: data[self.config.target[0]] = self.label_encoder.transform( data[self.config.target[0]]) # Target Transforms if all([col in data.columns for col in self.config.target]): if self.do_target_transform: target_transforms = [] for col in self.config.target: _target_transform = copy.deepcopy( self.target_transform_template) data[col] = _target_transform.fit_transform( data[col].values.reshape(-1, 1)) target_transforms.append(_target_transform) self.target_transforms = target_transforms return data, added_features
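# A minimal standalone sketch of the stage="fit" / stage="inference" split
# above: encoders are fit once on the training data and only applied at
# inference time, so both splits share one category vocabulary. Assumption:
# OrdinalEncoder here is category_encoders' OrdinalEncoder (the `ce` alias
# in the method suggests category_encoders is in scope).
import pandas as pd
from category_encoders import OrdinalEncoder

train = pd.DataFrame({"city": ["ams", "nyc", "ams"], "target": [1, 0, 1]})
test = pd.DataFrame({"city": ["nyc", "ams"], "target": [0, 1]})

encoder = OrdinalEncoder(cols=["city"])  # created only when stage == "fit"
train = encoder.fit_transform(train)     # learns the category -> int mapping
test = encoder.transform(test)           # inference reuses the stored mapping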
def merge(left, right):
    new = copy.deepcopy(left)
    new.coef_ += right.coef_
    new.intercept_ += right.intercept_
    return new
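# A minimal usage sketch: fold several independently trained linear models
# into one with the merge above, then divide by the shard count to average
# the weights (merge only sums; the averaging step is my addition).
import copy
from functools import reduce

import numpy as np
from sklearn.linear_model import SGDClassifier

X, y = np.random.randn(300, 10), np.tile([0, 1], 150)
shards = [SGDClassifier().fit(X[i::3], y[i::3]) for i in range(3)]

avg = reduce(merge, shards)
avg.coef_ /= len(shards)
avg.intercept_ /= len(shards)
avg.predict(X[:5])  # behaves like one model with averaged weights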
""" # Init the best model to base rest of tests on base = KNeighborsClassifier(p=8, n_neighbors=50, leaf_size=70, algorithm='kd_tree', weights='distance', n_jobs=-1) train_sizes = np.linspace(0.1, 1, 10) # Compare different n_neighbors values clfs_neighbors = dict() for n_neighbors in range(15, 56, 5): clf = copy.deepcopy(base) clf.n_neighbors = n_neighbors clfs_neighbors['{}-nn'.format(n_neighbors)] = clf compare_models_all_metrics(clfs_neighbors, x_cr, y_cr, train_sizes=train_sizes, title_prefix="Credit Fraud", plot_learning_curve=False) # Compare different leaf sizes clfs_leaf_size = dict() for leaf_size in range(5, 71, 10): clf = copy.deepcopy(base) clf.leaf_size = leaf_size