def __add__(self, other):
        this = copy.deepcopy(self)
        this_class2idx = {cls: idx for idx, cls in enumerate(this.classes_)}
        other_class2idx = {cls: idx for idx, cls in enumerate(other.classes_)}
        for class_i in this.classes_:
            i = this_class2idx[class_i]
            j = other_class2idx[class_i]
            N_x = this.class_count_[i]
            N_y = other.class_count_[j]
            mu_x = this.theta_[i, :]
            mu_y = other.theta_[j, :]
            sigma_x = this.sigma_[i, :]
            sigma_y = other.sigma_[j, :]
            N_total = N_x + N_y

            # Count-weighted (pooled) mean, and the parallel-variance update
            # with a between-means correction term.
            mu_xy = N_x * mu_x + N_y * mu_y
            sigma_xy = (sigma_x * N_x + sigma_y * N_y +
                        (N_x * N_y * (mu_x - mu_y) ** 2) / N_total)

            this.theta_[i, :] = mu_xy / N_total
            this.sigma_[i, :] = sigma_xy / N_total
            this.class_count_[i] += N_y

        this.class_prior_[:] = this.class_count_ / np.sum(this.class_count_)
        return this
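A hedged usage sketch for the merge above: attach the function to a GaussianNB subclass (MergeableGNB is a hypothetical name), fit two disjoint shards, and add the fitted models. This assumes a scikit-learn version whose GaussianNB still exposes theta_/sigma_ (newer releases renamed sigma_ to var_):

import copy

import numpy as np
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB

class MergeableGNB(GaussianNB):
    pass

MergeableGNB.__add__ = __add__  # wire up the function defined above

X, y = load_iris(return_X_y=True)
rng = np.random.RandomState(0)
a, b = np.array_split(rng.permutation(len(X)), 2)

merged = MergeableGNB().fit(X[a], y[a]) + MergeableGNB().fit(X[b], y[b])
print(merged.class_count_)  # per-class counts from both shards combined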
Example #2
    def __add__(self, other):
        """Add method for Linear models with coef and intercept attributes.

        Parameters
        ----------
        other : fitted sklearn linear model
            Model to add.

        Returns
        -------
        model : Linear model
            Model with updated coefficients.
        """
        model = copy.deepcopy(self)
        model.coef_ += other.coef_
        model.intercept_ += other.intercept_
        return model
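A hedged sketch of how this summing __add__ is typically used: sum the shard models, then divide by their count to average. MergeableSGD is a hypothetical stand-in carrying the same method:

import copy

import numpy as np
from sklearn.linear_model import SGDClassifier

class MergeableSGD(SGDClassifier):
    def __add__(self, other):        # mirrors the method defined above
        model = copy.deepcopy(self)
        model.coef_ += other.coef_
        model.intercept_ += other.intercept_
        return model

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = (X[:, 0] > 0).astype(int)

models = [MergeableSGD(max_iter=1000).fit(X[i::2], y[i::2]) for i in range(2)]
avg = models[0] + models[1]
avg.coef_ /= len(models)             # __add__ sums, so divide to average
avg.intercept_ /= len(models)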
    def __add__(self, other):
        """
        Add method for DiscreteNB models.

        Parameters
        ----------
        other : fitted splearn multinomial NB model with class_count_
                and feature_count_ attributes
            Model to add.

        Returns
        -------
        model : splearn Naive Bayes model
            Model with updated coefficients.
        """
        # The rdd operator add does not consider __radd__ :(
        if other == 0:
            return self
        model = copy.deepcopy(self)
        model.class_count_ += other.class_count_
        model.feature_count_ += other.feature_count_
        model._update_class_log_prior()
        model._update_feature_log_prob()
        return model
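Because the guard above makes `model + 0` return the model unchanged, Python's built-in sum() (which starts from 0) can fold a list of shard models. A hedged, self-contained sketch; MergeableMNB is a hypothetical stand-in, and the _update_* calls are private scikit-learn helpers whose signatures vary across versions:

import copy

import numpy as np
from sklearn.naive_bayes import MultinomialNB

class MergeableMNB(MultinomialNB):
    def __add__(self, other):            # mirrors the method defined above
        if other == 0:                   # lets sum() start from 0
            return self
        model = copy.deepcopy(self)
        model.class_count_ += other.class_count_
        model.feature_count_ += other.feature_count_
        model._update_class_log_prior()
        model._update_feature_log_prob(model.alpha)  # newer sklearn takes alpha
        return model
    __radd__ = __add__                   # 0 + model also resolves to the model

rng = np.random.RandomState(0)
X = rng.randint(0, 5, size=(300, 10))
y = rng.randint(0, 3, size=300)

models = [MergeableMNB().fit(X[i::3], y[i::3]) for i in range(3)]
merged = sum(models)                     # folds all shard models into one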
minibatch_iterators = iter_minibatchs(data_streamer, minibatch_size)
def learn(classifier, stats, batch):
    # Python 3 removed tuple parameters, so unpack the minibatch explicitly.
    X_train, y_train = batch
    if 't0' not in stats:
        stats['t0'] = time.time()

    classifier.partial_fit(X_train, y_train, classes=all_classes)
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append((stats['accuracy'],
                                     time.time() - stats['t0']))
    return classifier, stats

import copy  # deepcopy lives in the stdlib copy module, not sklearn.base

def merge(left, right):
    # Python 3 removed tuple parameters, so unpack the (model, stats) pairs.
    cf1, stats1 = left
    cf2, stats2 = right
    new = copy.deepcopy(cf1)
    new.coef_ += cf2.coef_
    new.intercept_ += cf2.intercept_
    return new, stats1

# Map/Reduce on Spark
sgd, stats = (sc.parallelize(minibatch_iterators)
              .map(lambda batch: learn(classifier, stats, batch))
              .reduce(merge))
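# Note: the reduce above *sums* coefficients across minibatches rather than
# averaging them; a hedged post-processing sketch, assuming
# minibatch_iterators is a materialized list (a lazy generator has no len()):
n_batches = len(minibatch_iterators)
sgd.coef_ /= n_batches        # turn the coefficient sum into an average
sgd.intercept_ /= n_batches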


def plot_accuracy(x, y, plot_placement, x_legend):
    """Plot accuracy as a function of x."""
    x = np.array(x)
    y = np.array(y)
    pl.subplots_adjust(hspace=0.5)
    pl.subplot(plot_placement)
    # Render the accuracy curve (assumes `pl` is pylab/matplotlib).
    pl.title('Accuracy as a function of %s' % x_legend)
    pl.plot(x, y)
Example #6
def merge(left, right):
    # Pool the fitted trees from both forests and keep n_estimators
    # consistent with the combined list.
    new = copy.deepcopy(left)
    new.estimators_ += right.estimators_
    new.n_estimators = len(new.estimators_)
    return new
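A hedged usage sketch for the forest merge above: train two forests independently and pool their trees (both fits must see the same set of classes):

import copy

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 4)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

left = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
right = RandomForestClassifier(n_estimators=10, random_state=1).fit(X, y)

forest = merge(left, right)   # uses the merge defined above
print(forest.n_estimators)    # 20 trees, pooled from both forests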
Example #7
    def preprocess_data(self,
                        data: pd.DataFrame,
                        stage: str = "inference") -> Tuple[pd.DataFrame, list]:
        """The preprocessing, like Categorical Encoding, Normalization, etc. which any dataframe should undergo before feeding into the dataloder

        Args:
            data (pd.DataFrame): A dataframe with the features and target
            stage (str, optional): Internal parameter. Used to distinguisj between fit and inference. Defaults to "inference".

        Returns:
            tuple[pd.DataFrame, list]: Returns the processed dataframe and the added features(list) as a tuple
        """
        logger.info(f"Preprocessing data: Stage: {stage}...")
        added_features = None
        if self.config.encode_date_columns:
            for field_name, freq in self.config.date_columns:
                data = self.make_date(data, field_name)
                data, added_features = self.add_datepart(data,
                                                         field_name,
                                                         frequency=freq,
                                                         prefix=None,
                                                         drop=True)
        # The only features that are added are the date features extracted
        # from the date which are categorical in nature
        if (added_features is not None) and (stage == "fit"):
            logger.debug(
                f"Added {added_features} features after encoding the date_columns"
            )
            self.config.categorical_cols += added_features
            self.config.categorical_dim = (len(self.config.categorical_cols)
                                           if self.config.categorical_cols
                                           is not None else 0)
        # Encoding Categorical Columns
        if len(self.config.categorical_cols) > 0:
            if stage == "fit":
                if self.do_leave_one_out_encoder():
                    logger.debug(
                        "Encoding Categorical Columns using LeavOneOutEncoder")
                    self.categorical_encoder = ce.LeaveOneOutEncoder(
                        cols=self.config.categorical_cols, random_state=42)
                    # Multi-Target Regression uses the first target to encode the categorical columns
                    if len(self.config.target) > 1:
                        logger.warning(
                            f"Multi-Target Regression: using the first target({self.config.target[0]}) to encode the categorical columns"
                        )
                    data = self.categorical_encoder.fit_transform(
                        data, data[self.config.target[0]])
                else:
                    logger.debug(
                        "Encoding Categorical Columns using OrdinalEncoder")
                    self.categorical_encoder = OrdinalEncoder(
                        cols=self.config.categorical_cols)
                    data = self.categorical_encoder.fit_transform(data)
            else:
                data = self.categorical_encoder.transform(data)

        # Transforming Continuous Columns
        if (self.config.continuous_feature_transform
                is not None) and (len(self.config.continuous_cols) > 0):
            if stage == "fit":
                transform = self.CONTINUOUS_TRANSFORMS[
                    self.config.continuous_feature_transform]
                self.continuous_transform = transform["callable"](
                    **transform["params"])
                # TODO implement quantile noise
                data.loc[:, self.config.
                         continuous_cols] = self.continuous_transform.fit_transform(
                             data.loc[:, self.config.continuous_cols])
            else:
                data.loc[:, self.config.
                         continuous_cols] = self.continuous_transform.transform(
                             data.loc[:, self.config.continuous_cols])

        # Normalizing Continuous Columns
        if (self.config.normalize_continuous_features) and (len(
                self.config.continuous_cols) > 0):
            if stage == "fit":
                self.scaler = StandardScaler()
                data.loc[:, self.config.
                         continuous_cols] = self.scaler.fit_transform(
                             data.loc[:, self.config.continuous_cols])
            else:
                data.loc[:,
                         self.config.continuous_cols] = self.scaler.transform(
                             data.loc[:, self.config.continuous_cols])

        # Converting target labels to a 0 indexed label
        if self.config.task == "classification":
            if stage == "fit":
                self.label_encoder = LabelEncoder()
                data[self.config.target[0]] = self.label_encoder.fit_transform(
                    data[self.config.target[0]])
            else:
                if self.config.target[0] in data.columns:
                    data[self.config.target[0]] = self.label_encoder.transform(
                        data[self.config.target[0]])
        # Target Transforms
        if all([col in data.columns for col in self.config.target]):
            if self.do_target_transform:
                target_transforms = []
                for col in self.config.target:
                    _target_transform = copy.deepcopy(
                        self.target_transform_template)
                    data[col] = _target_transform.fit_transform(
                        data[col].values.reshape(-1, 1))
                    target_transforms.append(_target_transform)
                self.target_transforms = target_transforms
        return data, added_features
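The stage switch above follows the usual fit/transform asymmetry: encoders, scalers, and the label encoder are fitted only when stage == "fit" and merely applied otherwise. A minimal, self-contained illustration of that pattern (not this class's API), assuming the category_encoders package is installed:

import category_encoders as ce
import pandas as pd

train = pd.DataFrame({"color": ["red", "blue", "red"], "y": [1, 0, 1]})
test = pd.DataFrame({"color": ["blue", "red"]})

enc = ce.OrdinalEncoder(cols=["color"])
train_enc = enc.fit_transform(train)  # stage == "fit": learn the mapping
test_enc = enc.transform(test)        # stage == "inference": reuse it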
Example #8
def merge(left, right):
    new = copy.deepcopy(left)
    new.coef_ += right.coef_
    new.intercept_ += right.intercept_
    return new
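These two-argument merge helpers compose with functools.reduce to fold a whole list of shard models into one; a hedged sketch assuming linear models with matching coef_ shapes:

import copy
from functools import reduce

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
X = rng.randn(300, 5)
y = (X[:, 0] > 0).astype(int)

models = [SGDClassifier(max_iter=1000).fit(X[i::3], y[i::3]) for i in range(3)]
avg = reduce(merge, models)   # pairwise merge (defined above) sums parameters
avg.coef_ /= len(models)      # divide to turn the sum into an average
avg.intercept_ /= len(models)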
Example #9

    # Init the best model to base rest of tests on
    base = KNeighborsClassifier(p=8,
                                n_neighbors=50,
                                leaf_size=70,
                                algorithm='kd_tree',
                                weights='distance',
                                n_jobs=-1)

    train_sizes = np.linspace(0.1, 1, 10)

    # Compare different n_neighbors values
    clfs_neighbors = dict()
    for n_neighbors in range(15, 56, 5):
        clf = copy.deepcopy(base)
        clf.n_neighbors = n_neighbors
        clfs_neighbors['{}-nn'.format(n_neighbors)] = clf

    compare_models_all_metrics(clfs_neighbors,
                               x_cr,
                               y_cr,
                               train_sizes=train_sizes,
                               title_prefix="Credit Fraud",
                               plot_learning_curve=False)

    # Compare different leaf sizes
    clfs_leaf_size = dict()
    for leaf_size in range(5, 71, 10):
        clf = copy.deepcopy(base)
        clf.leaf_size = leaf_size