def test_multi_output_classification_partial_fit():
    """Check ``MultiOutputClassifier.partial_fit`` against per-column fitting.

    The wrapped estimator is trained in two halves through the multi-output
    wrapper; after each half its predictions must have shape
    ``(n_samples, n_outputs)`` and, column by column, equal those of a clone
    of the base estimator trained on that single output column.
    """
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit.
    # Iterate over n_outputs (the same constant the shape checks above use)
    # rather than a hard-coded 3, so every output column is always verified.
    for i in range(n_outputs):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
def test_multi_output_classification_partial_fit():
    """Check ``MultiOutputClassifier.partial_fit`` against per-column fitting.

    The multi-output wrapper is trained incrementally on the first and then
    the second half of ``X``/``y``. After each step the prediction shape is
    checked and every output column is compared with a clone of the base
    ``SGDClassifier`` trained on that column alone.
    """
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    expected_shape = (n_samples, n_outputs)

    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)
    first_predictions = multi_target_linear.predict(X)
    assert_equal(expected_shape, first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal(expected_shape, second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit.
    # The loop bound is n_outputs, not a literal 3, so it stays in step with
    # the shape assertions above if the fixture data ever changes.
    for i in range(n_outputs):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
def test_multi_output_classification_partial_fit_parallelism():
    """Check that a parallel ``partial_fit`` replaces the fitted sub-estimators.

    ``partial_fit`` is called twice on a ``MultiOutputClassifier`` built with
    ``n_jobs=-1`` and the first sub-estimator is captured after each call.
    The two captures are required to be distinct objects, but only when more
    than one CPU is reported.
    """
    # Function-scope import so the guard does not depend on module imports.
    import os

    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=-1)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    # NOTE(review): with a single CPU, n_jobs=-1 presumably resolves to one
    # worker, in which case the identity check need not hold; confirm against
    # the joblib backend in use. os.cpu_count() may return None, hence `or 1`.
    if (os.cpu_count() or 1) > 1:
        # parallelism requires this to be the case for a sane implementation
        assert_false(est1 is est2)
def test_multi_output_classification_partial_fit_parallelism():
    """Two parallel ``partial_fit`` calls must not reuse the same sub-estimator.

    The first entry of ``estimators_`` is captured after each of two
    ``partial_fit`` calls (``n_jobs=4``). The identity comparison is skipped
    when ``cpu_count()`` reports a single CPU.
    """
    base_estimator = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_output = MultiOutputClassifier(base_estimator, n_jobs=4)

    multi_output.partial_fit(X, y, classes)
    before = multi_output.estimators_[0]

    multi_output.partial_fit(X, y)
    after = multi_output.estimators_[0]

    if cpu_count() <= 1:
        # Nothing to verify when only one CPU is available.
        return
    # parallelism requires this to be the case for a sane implementation
    assert before is not after
def test_multi_output_classification_partial_fit_parallelism():
    """Verify that parallel ``partial_fit`` yields fresh sub-estimator objects.

    Fits a ``MultiOutputClassifier`` (``n_jobs=4``) incrementally twice and
    compares the identity of ``estimators_[0]`` between the two calls. The
    comparison only runs when more than one CPU is reported.
    """
    model = MultiOutputClassifier(
        SGDClassifier(loss='log', random_state=1, max_iter=5),
        n_jobs=4,
    )

    # First call supplies the classes; the second one relies on them.
    model.partial_fit(X, y, classes)
    initial_estimator = model.estimators_[0]
    model.partial_fit(X, y)
    updated_estimator = model.estimators_[0]

    has_multiple_cpus = cpu_count() > 1
    if has_multiple_cpus:
        # parallelism requires this to be the case for a sane implementation
        assert initial_estimator is not updated_estimator
def test_multi_output_classification_partial_fit_no_first_classes_exception():
    """The first ``partial_fit`` call without ``classes`` must raise ValueError."""
    expected_message = "classes must be passed on the first call to partial_fit."
    estimator = MultiOutputClassifier(
        SGDClassifier(loss="log", random_state=1, max_iter=5)
    )

    # No `classes` argument on the very first call: this has to be rejected.
    with pytest.raises(ValueError, match=expected_message):
        estimator.partial_fit(X, y)
class MultilabelTraining:
    """Train a multi-output classifier on a text column, in one pass or in chunks.

    The class wires together a text vectorizer, a ``MultiOutputClassifier``
    wrapped around a base classifier, and ``DataframePreprocessing`` (defined
    elsewhere), which turns a raw dataframe into ``processed_df``.

    Training entry points:

    * ``train`` - fit on the dataframe given at construction time.
    * ``incremental_train`` - ``partial_fit`` over a CSV read ``nrows`` at a
      time.
    * ``incremental_train_with_parquet`` - ``partial_fit`` over the row
      groups of a parquet file.
    """

    # Name of the dataframe column that holds the text to vectorize.
    X_COLUMN_NAME = "page_text_extract"
    # Theme identifiers used as target columns by default.
    DEFAULT_TARGET_THEMES = [
        5,
        6,
        26,
        33,
        139,
        163,
        232,
        313,
        339,
        350,
        406,
        409,
        555,
        589,
        597,
        634,
        660,
        695,
        729,
        766,
        773,
        793,
        800,
        810,
        852,
        895,
        951,
        975,
    ]
    # Extra target column appended after the target themes.
    # NOTE(review): presumably the bucket for themes outside the target list;
    # confirm against DataframePreprocessing.
    OTHER_THEMES_VALUE = 4242

    def __init__(
        self,
        df=None,
        x_column_name=X_COLUMN_NAME,
        group_processes=True,
        classifier=None,
        vectorizer=None,
        target_themes=DEFAULT_TARGET_THEMES,
        other_themes_value=OTHER_THEMES_VALUE,
        remove_processes_without_theme=True,
        is_incremental_training=False,
        vocab_path="",
    ):
        """Store the configuration and preprocess ``df`` when it is not empty.

        Args:
            df: Raw dataframe to train on. ``None`` (the default) means an
                empty dataframe, e.g. when only incremental training is used.
            x_column_name: Column holding the input text.
            group_processes: Forwarded to ``DataframePreprocessing``.
            classifier: Base estimator wrapped by ``MultiOutputClassifier``.
                Defaults to ``PassiveAggressiveClassifier(random_state=42)``.
            vectorizer: Text vectorizer. Defaults to
                ``HashingVectorizer(n_features=2**14)``.
            target_themes: Theme identifiers used as target columns.
            other_themes_value: Identifier of the extra target column.
            remove_processes_without_theme: Forwarded to
                ``DataframePreprocessing``.
            is_incremental_training: Forwarded to ``DataframePreprocessing``.
            vocab_path: Forwarded to ``DataframePreprocessing``.
        """
        # Build the defaults per instance: objects placed in the signature
        # are created once at definition time and shared by every instance.
        if df is None:
            df = pd.DataFrame()
        if classifier is None:
            classifier = PassiveAggressiveClassifier(random_state=42)
        if vectorizer is None:
            vectorizer = HashingVectorizer(n_features=2**14)

        self.is_incremental_training = is_incremental_training
        self.vocab_path = vocab_path
        self.remove_processes_without_theme = remove_processes_without_theme
        self.mo_classifier = MultiOutputClassifier(classifier, n_jobs=-1)
        self.classifier = classifier
        self.vectorizer = vectorizer
        # Copy so the class-level default list is never shared or mutated;
        # this also lets callers pass any iterable of theme identifiers.
        self.target_themes = list(target_themes)
        self.other_themes_value = other_themes_value
        self.group_processes = group_processes
        self.x_column_name = x_column_name
        self._initialize_dataframe(df)

    def _initialize_dataframe(self, df):
        """Preprocess ``df`` into ``self.df``; an empty ``df`` is stored as is.

        For a non-empty dataframe this also sets ``self.dp`` and
        ``self.y_columns_names`` (taken from ``dp.distinct_themes``).
        """
        if df.empty:
            self.df = df
            return

        self.dp = DataframePreprocessing(
            df.copy(),
            group_processes=self.group_processes,
            x_column_name=self.x_column_name,
            target_themes=self.target_themes,
            other_themes_value=self.other_themes_value,
            is_incremental_training=self.is_incremental_training,
            remove_processes_without_theme=self.remove_processes_without_theme,
            vocab_path=self.vocab_path,
        )
        self.y_columns_names = self.dp.distinct_themes
        self.df = self.dp.processed_df

    def _split(self, X, y):
        """Store an 80/20 train/test split of ``X``/``y``, stratified on ``y``."""
        print("Splitting dataset...")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=42)

    def _vectorize(self, X_train, fit=True):
        """Turn raw text into a feature matrix with ``self.vectorizer``.

        Args:
            X_train: Text samples to vectorize.
            fit: When true (the default, matching the historical behaviour)
                the vectorizer is fitted on the samples before transforming.
                Pass ``False`` for held-out data so a stateful vectorizer
                keeps the state learned during training.

        Returns:
            Whatever the vectorizer's ``fit_transform``/``transform`` returns.
        """
        print("Vectorizing...")
        if fit:
            return self.vectorizer.fit_transform(X_train)
        return self.vectorizer.transform(X_train)

    def train(self, split_df=False):
        """Fit the multi-output classifier on the preprocessed dataframe.

        Args:
            split_df: When true, hold out 20% of the rows, predict on them
                and return the evaluation metrics.

        Returns:
            The result of ``get_multilabel_metrics`` when ``split_df`` is
            true, otherwise ``None``.
        """
        print("Training...")
        self.X_train, self.y_train = (
            self.df[self.x_column_name],
            self.df[self.y_columns_names],
        )
        if split_df:
            self._split(self.X_train, self.y_train)

        vector = self._vectorize(self.X_train)
        self.mo_classifier.fit(vector, self.y_train)
        if not split_df:
            return None

        # Transform only: refitting the vectorizer on the held-out rows would
        # change the feature space the classifier was trained on.
        vector_test = self._vectorize(self.X_test, fit=False)
        self.y_pred = self.mo_classifier.predict(vector_test)
        return get_multilabel_metrics(self.y_test, self.y_pred)

    def _update_dataframe(self,
                          df,
                          is_incremental_training=True,
                          is_parquet=False,
                          labels_freq=None):
        """Preprocess one chunk ``df`` and store the result in ``self.df``.

        Args:
            df: Raw chunk; a copy is handed to ``DataframePreprocessing``.
            is_incremental_training: Forwarded to ``DataframePreprocessing``.
            is_parquet: Forwarded to ``DataframePreprocessing``.
            labels_freq: Forwarded to ``DataframePreprocessing``; ``None``
                (the default) means an empty dict.
        """
        # A fresh dict per call avoids sharing one mutable default object.
        if labels_freq is None:
            labels_freq = {}
        self.dp = DataframePreprocessing(
            df.copy(),
            x_column_name=self.x_column_name,
            group_processes=self.group_processes,
            target_themes=self.target_themes,
            other_themes_value=self.other_themes_value,
            is_incremental_training=is_incremental_training,
            remove_processes_without_theme=self.remove_processes_without_theme,
            is_parquet=is_parquet,
            vocab_path=self.vocab_path,
            labels_freq=labels_freq,
        )
        self.df = self.dp.processed_df

    def incremental_train(self, df_path, nrows=5000):
        """``partial_fit`` the classifier over a CSV file, ``nrows`` at a time.

        Args:
            df_path: Path of the CSV file; its first row is the header.
            nrows: Number of data rows read per chunk.
        """
        print("Training incrementally...")
        columns_names = pd.read_csv(df_path, nrows=1).columns.tolist()
        # Start after the header row; advanced by `nrows` after every chunk.
        skiprows = 1
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes).get_unique_binarized_labels(
                df_path, "tema")
        y_columns = self.target_themes + [self.other_themes_value]
        while True:
            df = pd.read_csv(
                df_path,
                nrows=nrows,
                skiprows=skiprows,
                header=None,
                names=columns_names,
            )
            # An empty read means the previous chunk was the last one.
            if df.empty:
                break
            self._update_dataframe(df, labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[y_columns],
            )
            # NOTE(review): the vectorizer is re-fitted on every chunk, which
            # is only consistent for a stateless vectorizer such as the
            # default HashingVectorizer.
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector, y_train, classes=classes)
            skiprows += nrows
            print("{} rows already trained\n".format(skiprows - 1))

    def incremental_train_with_parquet(self, parquet_path):
        """``partial_fit`` the classifier over each row group of a parquet file.

        Args:
            parquet_path: Path of the parquet file.
        """
        print("Training incrementally with parquet...")
        nrows = 0
        pf = ParquetFile(parquet_path)
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes).get_unique_binarized_labels(
                parquet_path, "tema", True)
        y_columns = self.target_themes + [self.other_themes_value]
        for df in pf.iter_row_groups():
            df = df.reset_index()
            self._update_dataframe(df,
                                   is_parquet=True,
                                   labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[y_columns],
            )
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector.toarray(),
                                           y_train,
                                           classes=classes)
            nrows += len(self.df)
            print("{} rows already trained\n".format(nrows))
            clear_output(wait=True)

    def predict(self):
        """Predict labels for ``self.X_test`` (set it with ``set_X_test``).

        Returns:
            The output of ``self.mo_classifier.predict`` on the vectorized
            test samples.
        """
        # Transform without refitting, and densify with `toarray()` (a plain
        # ndarray) instead of `todense()` (an `np.matrix`).
        features = self._vectorize(self.X_test, fit=False).toarray()
        return self.mo_classifier.predict(features)

    def set_X_test(self, X):
        """Set the samples used by ``predict``."""
        self.X_test = X

    def set_y_test(self, y):
        """Set the expected labels for the test samples."""
        self.y_test = y

    def get_pickle(self):
        """Return the multi-output classifier serialized with ``pickle.dumps``."""
        return pickle.dumps(self.mo_classifier)