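The test functions below are fragments from scikit-learn's multi-output test suite and rely on module-level data and helpers (X, y, classes, n_samples, n_outputs, assert_equal, assert_false) that are not part of this listing. A minimal sketch of that setup, assuming multilabel data from make_multilabel_classification and simple stand-ins for the deprecated sklearn.utils.testing assertion helpers (names and values are illustrative, not the exact upstream fixtures; also note that newer scikit-learn releases spell the SGD loss 'log_loss' instead of 'log'):

import numpy as np
import pytest
from joblib import cpu_count
from numpy.testing import assert_array_equal
from sklearn import datasets
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

# Module-level data assumed by the tests below (illustrative values):
# three binary output columns.
X, y = datasets.make_multilabel_classification(n_classes=3, random_state=0)
n_samples, n_outputs = y.shape
classes = [np.unique(y[:, i]) for i in range(n_outputs)]


# Stand-ins for the deprecated sklearn.utils.testing assertion helpers.
def assert_equal(a, b):
    assert a == b


def assert_false(x):
    assert not x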
def test_multi_output_classification_partial_fit():
    # Check that MultiOutputClassifier supports partial_fit with a base
    # estimator and that predict returns arrays of the expected shape.

    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # Fit a fresh clone of the base estimator on each output column and check
    # that its predictions match the corresponding multi-output column after
    # both the first and the second partial_fit.
    for i in range(3):
        # Create an unfitted clone with the same hyper-parameters and
        # random_state as the wrapped base estimator.
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
Example #3
def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=-1)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    # parallelism requires this to be the case for a sane implementation
    assert_false(est1 is est2)
Example #4
def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert est1 is not est2
Example #6
def test_multi_output_classification_partial_fit_no_first_classes_exception():
    sgd_linear_clf = SGDClassifier(loss="log", random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    msg = "classes must be passed on the first call to partial_fit."
    with pytest.raises(ValueError, match=msg):
        multi_target_linear.partial_fit(X, y)
Example #7
import pickle

import pandas as pd
from fastparquet import ParquetFile
from IPython.display import clear_output
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

# DataframePreprocessing and get_multilabel_metrics are project-local helpers
# that are not included in this listing.


class MultilabelTraining:

    X_COLUMN_NAME = "page_text_extract"

    DEFAULT_TARGET_THEMES = [
        5,
        6,
        26,
        33,
        139,
        163,
        232,
        313,
        339,
        350,
        406,
        409,
        555,
        589,
        597,
        634,
        660,
        695,
        729,
        766,
        773,
        793,
        800,
        810,
        852,
        895,
        951,
        975,
    ]

    OTHER_THEMES_VALUE = 4242

    def __init__(
        self,
        df=pd.DataFrame(),
        x_column_name=X_COLUMN_NAME,
        group_processes=True,
        classifier=PassiveAggressiveClassifier(random_state=42),
        vectorizer=HashingVectorizer(n_features=2**14),
        target_themes=DEFAULT_TARGET_THEMES,
        other_themes_value=OTHER_THEMES_VALUE,
        remove_processes_without_theme=True,
        is_incremental_training=False,
        vocab_path="",
    ):
        self.is_incremental_training = is_incremental_training
        self.vocab_path = vocab_path
        self.remove_processes_without_theme = remove_processes_without_theme
        self.mo_classifier = MultiOutputClassifier(classifier, n_jobs=-1)
        self.classifier = classifier
        self.vectorizer = vectorizer
        self.target_themes = target_themes
        self.other_themes_value = other_themes_value
        self.group_processes = group_processes
        self.x_column_name = x_column_name
        self._initialize_dataframe(df)

    def _initialize_dataframe(self, df):
        if not df.empty:
            self.dp = DataframePreprocessing(
                df.copy(),
                group_processes=self.group_processes,
                x_column_name=self.x_column_name,
                target_themes=self.target_themes,
                other_themes_value=self.other_themes_value,
                is_incremental_training=self.is_incremental_training,
                remove_processes_without_theme=self.remove_processes_without_theme,
                vocab_path=self.vocab_path,
            )
            self.y_columns_names = self.dp.distinct_themes
            self.df = self.dp.processed_df
        else:
            self.df = df

    def _split(self, X, y):
        print("Splitting dataset...")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=42)

    def _vectorize(self, X_train):
        print("Vectorizing...")
        return self.vectorizer.fit_transform(X_train)

    def train(self, split_df=False):
        print("Training...")
        self.X_train, self.y_train = (
            self.df[self.x_column_name],
            self.df[self.y_columns_names],
        )
        if split_df:
            self._split(self.X_train, self.y_train)
        vector = self._vectorize(self.X_train)
        self.mo_classifier.fit(vector, self.y_train)
        if split_df:
            vector_test = self._vectorize(self.X_test)
            self.y_pred = self.mo_classifier.predict(vector_test)
            metrics = get_multilabel_metrics(self.y_test, self.y_pred)
            return metrics
        return None

    def _update_dataframe(self,
                          df,
                          is_incremental_training=True,
                          is_parquet=False,
                          labels_freq={}):
        self.dp = DataframePreprocessing(
            df.copy(),
            x_column_name=self.x_column_name,
            group_processes=self.group_processes,
            target_themes=self.target_themes,
            other_themes_value=self.other_themes_value,
            is_incremental_training=is_incremental_training,
            remove_processes_without_theme=self.remove_processes_without_theme,
            is_parquet=is_parquet,
            vocab_path=self.vocab_path,
            labels_freq=labels_freq,
        )
        self.df = self.dp.processed_df

    def incremental_train(self, df_path, nrows=5000):
        print("Training incrementally...")
        columns_names = pd.read_csv(df_path, nrows=1).columns.tolist()
        skiprows = 1
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes).get_unique_binarized_labels(
                df_path, "tema")
        while True:
            df = pd.read_csv(
                df_path,
                nrows=nrows,
                skiprows=skiprows,
                header=None,
                names=columns_names,
            )
            if df.empty:
                break
            self._update_dataframe(df, labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[self.target_themes + [self.other_themes_value]],
            )
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector, y_train, classes=classes)
            skiprows += nrows
            print("{} rows already trained\n".format(skiprows - 1))

    def incremental_train_with_parquet(self, parquet_path):
        print("Training incrementally with parquet...")
        nrows = 0
        pf = ParquetFile(parquet_path)
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes).get_unique_binarized_labels(
                parquet_path, "tema", True)
        for df in pf.iter_row_groups():
            df = df.reset_index()
            self._update_dataframe(df,
                                   is_parquet=True,
                                   labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[self.target_themes + [self.other_themes_value]],
            )
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector.toarray(),
                                           y_train,
                                           classes=classes)
            nrows += len(self.df)
            print("{} rows already trained\n".format(nrows))
            clear_output(wait=True)

    def predict(self):
        return self.mo_classifier.predict(
            self._vectorize(self.X_test).todense())

    def set_X_test(self, X):
        self.X_test = X

    def set_y_test(self, y):
        self.y_test = y

    def get_pickle(self):
        return pickle.dumps(self.mo_classifier)
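
The trained multi-output model is persisted via get_pickle(). A minimal sketch of loading those bytes later and predicting on new text, assuming the pickle was written to a file named multilabel_model.pkl (the file name is illustrative) and that the HashingVectorizer is re-created with the same n_features used for training; the vectorizer is stateless, so this reproduces the training-time feature space:

import pickle

from sklearn.feature_extraction.text import HashingVectorizer

# Illustrative file name: bytes previously produced by
# MultilabelTraining.get_pickle() and written to disk by the caller.
with open("multilabel_model.pkl", "rb") as f:
    mo_classifier = pickle.load(f)

# Re-create the stateless hashing vectorizer with the training-time settings.
vectorizer = HashingVectorizer(n_features=2**14)
X_new = vectorizer.transform(["texto do novo processo"])

# One binary column per target theme, plus the "other themes" column.
predictions = mo_classifier.predict(X_new)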