def test_input_type_1d():
    """Check that a 1D numpy array and a plain list yield identical encodings.

    NOTE(review): this test was originally also named ``test_input_type``,
    which is redefined later in this file — the duplicate name meant this
    first definition was shadowed and never collected by pytest. Renamed
    with a ``_1d`` suffix so both tests run.
    """
    # Numpy array input.
    X = np.array(['alice', 'bob'])
    enc = GapEncoder(n_components=2, random_state=42)
    X_enc_array = enc.fit_transform(X)
    # Equivalent plain-list input.
    X = ['alice', 'bob']
    enc = GapEncoder(n_components=2, random_state=42)
    X_enc_list = enc.fit_transform(X)
    # With the same random_state, both containers must encode identically.
    np.testing.assert_array_equal(X_enc_array, X_enc_list)
def test_partial_fit(n_samples=70):
    """A single ``partial_fit`` on one full batch must match a one-iteration fit."""
    newsgroups = fetch_20newsgroups(subset='train')['data']
    X = newsgroups[:n_samples]
    # Reference encoder: fit all samples in one batch, one iteration.
    enc_batch = GapEncoder(random_state=42, batch_size=n_samples, max_iter=1)
    X_enc = enc_batch.fit_transform(X)
    # Online encoder: one partial_fit call over the same data.
    enc_online = GapEncoder(random_state=42)
    enc_online.partial_fit(X)
    X_enc_partial = enc_online.transform(X)
    # Both training paths must produce (almost) identical encodings.
    np.testing.assert_almost_equal(X_enc, X_enc_partial)
def test_input_type():
    """Encodings must not depend on the container type of the input data."""
    enc_kwargs = dict(n_components=2, random_state=42)

    # One-column inputs: numpy array vs. nested list.
    arr_one_col = np.array([['alice'], ['bob']])
    enc = GapEncoder(**enc_kwargs)
    enc_from_array = enc.fit_transform(arr_one_col)
    list_one_col = [['alice'], ['bob']]
    enc = GapEncoder(**enc_kwargs)
    enc_from_list = enc.fit_transform(list_one_col)
    np.testing.assert_array_equal(enc_from_array, enc_from_list)

    # Two-column inputs: numpy array vs. pandas DataFrame.
    arr_two_col = np.array([['alice', 'charlie'], ['bob', 'delta']])
    enc = GapEncoder(**enc_kwargs)
    enc_from_array2 = enc.fit_transform(arr_two_col)
    frame = pd.DataFrame(arr_two_col)
    enc = GapEncoder(**enc_kwargs)
    enc_from_frame = enc.fit_transform(frame)
    np.testing.assert_array_equal(enc_from_array2, enc_from_frame)
def test_missing_values(missing):
    """Exercise every value of ``handle_missing``: error, zero_impute, invalid.

    ``missing`` is presumably supplied by a ``pytest.mark.parametrize``
    decorator elsewhere — the else branch expects the literal 'aaa'.
    """
    observations = np.array(
        [['alice', 'bob'],
         ['bob', 'alice'],
         ['bob', np.nan],
         ['alice', 'charlie'],
         [np.nan, 'alice']],
        dtype=object,
    )
    enc = GapEncoder(handle_missing=missing, n_components=3)
    if missing == 'error':
        # NaNs must be rejected with an explicit error.
        with pytest.raises(ValueError,
                           match=r'Input data contains missing values.'):
            enc.fit_transform(observations)
    elif missing == 'zero_impute':
        # NaNs are imputed; both fitting entry points must accept them.
        enc.fit_transform(observations)
        enc.partial_fit(observations)
    else:
        # Any other value is an invalid parameter and must raise.
        with pytest.raises(ValueError,
                           match=r"handle_missing should be either "
                                 r"'error' or 'zero_impute', got 'aaa'"):
            enc.fit_transform(observations)
# NOTE(review): sphinx-gallery example fragment. `employee_salaries` and
# `dirty_column` are defined earlier in the script (outside this view), and
# `enc` / `X_enc` / `topic_labels` are presumably used further down — verify
# against the full example before renaming anything here.
X_dirty = employee_salaries.X[[dirty_column]]
print(X_dirty.head(), end='\n\n')
print(f'Number of dirty entries = {len(X_dirty)}')

###############################################################################
# Encoding dirty job titles
# -------------------------
#
# We first create an instance of the GapEncoder with n_components=10:
from dirty_cat import GapEncoder

enc = GapEncoder(n_components=10, random_state=42)

###############################################################################
# Then we fit the model on the dirty categorical data and transform it to
# obtain encoded vectors of size 10:
X_enc = enc.fit_transform(X_dirty)
print(f'Shape of encoded vectors = {X_enc.shape}')

###############################################################################
# Interpreting encoded vectors
# ----------------------------
#
# The GapEncoder can be understood as a continuous encoding on a set of latent
# topics estimated from the data. The latent topics are built by
# capturing combinations of substrings that frequently co-occur, and encoded
# vectors correspond to their activations.
# To interpret these latent topics, we select for each of them a few labels
# from the input data with the highest activations.
# In the example below we select 3 labels to summarize each topic.
topic_labels = enc.get_feature_names_out(n_labels=3)