Ejemplo n.º 1
0
def test_input_type():
    """Check that a NumPy array and a list give the same GapEncoder output."""
    names = ['alice', 'bob']

    # Encode the data supplied as a NumPy array.
    array_encoder = GapEncoder(n_components=2, random_state=42)
    encoded_from_array = array_encoder.fit_transform(np.array(names))

    # Encode the same data supplied as a plain list, using a fresh encoder
    # built with identical parameters.
    list_encoder = GapEncoder(n_components=2, random_state=42)
    encoded_from_list = list_encoder.fit_transform(names)

    # Both input types must produce exactly the same encoded vectors.
    np.testing.assert_array_equal(encoded_from_array, encoded_from_list)
Ejemplo n.º 2
0
def test_partial_fit(n_samples=70):
    """Compare a one-batch, one-iteration fit with a single partial_fit.

    The first ``n_samples`` documents of the 20 newsgroups training subset
    are encoded twice: once with ``fit_transform`` on an encoder configured
    with ``batch_size=n_samples`` and ``max_iter=1``, and once with
    ``partial_fit`` followed by ``transform``. The two encodings must be
    almost equal.
    """
    corpus = fetch_20newsgroups(subset='train')['data']
    documents = corpus[:n_samples]

    # Reference encoding: full fit limited to one batch and one iteration.
    full_encoder = GapEncoder(
        random_state=42, batch_size=n_samples, max_iter=1)
    encoded_full = full_encoder.fit_transform(documents)

    # Encoding obtained through partial_fit, then transform.
    partial_encoder = GapEncoder(random_state=42)
    partial_encoder.partial_fit(documents)
    encoded_partial = partial_encoder.transform(documents)

    np.testing.assert_almost_equal(encoded_full, encoded_partial)
Ejemplo n.º 3
0
def test_input_type():
    """Check that GapEncoder output is the same across input containers.

    Covers a one-column NumPy array versus a nested list, and a two-column
    NumPy array versus a pandas DataFrame built from that array.
    """
    def encode(data):
        # A fresh encoder with identical parameters is used for every input.
        return GapEncoder(n_components=2, random_state=42).fit_transform(data)

    # One column: NumPy array versus nested list.
    single_column = [['alice'], ['bob']]
    encoded_array = encode(np.array(single_column))
    encoded_list = encode(single_column)
    np.testing.assert_array_equal(encoded_array, encoded_list)

    # Two columns: NumPy array versus pandas DataFrame.
    two_columns = np.array([['alice', 'charlie'], ['bob', 'delta']])
    encoded_array = encode(two_columns)
    encoded_frame = encode(pd.DataFrame(two_columns))
    np.testing.assert_array_equal(encoded_array, encoded_frame)
Ejemplo n.º 4
0
def test_missing_values(missing):
    """Check how GapEncoder reacts to NaN entries for a handle_missing value.

    ``missing`` is passed as ``handle_missing`` (presumably supplied by a
    pytest parametrization that is not visible here -- confirm):

    * ``'error'``: ``fit_transform`` must raise a ValueError about missing
      values.
    * ``'zero_impute'``: ``fit_transform`` and ``partial_fit`` must both run
      without raising.
    * anything else: ``fit_transform`` must raise a ValueError about the
      invalid option; the expected message hard-codes ``'aaa'``.
    """
    rows = [
        ['alice', 'bob'],
        ['bob', 'alice'],
        ['bob', np.nan],
        ['alice', 'charlie'],
        [np.nan, 'alice'],
    ]
    observations = np.array(rows, dtype=object)
    enc = GapEncoder(handle_missing=missing, n_components=3)

    # Only 'zero_impute' is expected to succeed; handle it first and leave.
    if missing == 'zero_impute':
        enc.fit_transform(observations)
        enc.partial_fit(observations)
        return

    # Every other value must raise; only the expected message differs.
    if missing == 'error':
        expected_message = r'Input data contains missing values.'
    else:
        expected_message = (r"handle_missing should be either "
                            r"'error' or 'zero_impute', got 'aaa'")
    with pytest.raises(ValueError, match=expected_message):
        enc.fit_transform(observations)
# Keep only the column named by ``dirty_column``. Both names are defined
# outside this excerpt; the double brackets presumably yield a one-column
# DataFrame (assumes ``employee_salaries.X`` is a pandas DataFrame -- TODO
# confirm).
X_dirty = employee_salaries.X[[dirty_column]]
# Show the first rows; end='\n\n' leaves a blank line before the next output.
print(X_dirty.head(), end='\n\n')
print(f'Number of dirty entries = {len(X_dirty)}')

###############################################################################
# Encoding dirty job titles
# -------------------------
#
# We first create an instance of the GapEncoder with n_components=10:
from dirty_cat import GapEncoder
enc = GapEncoder(n_components=10, random_state=42)

###############################################################################
# Then we fit the model on the dirty categorical data and transform it to
# obtain encoded vectors of size 10:
X_enc = enc.fit_transform(X_dirty)
print(f'Shape of encoded vectors = {X_enc.shape}')

###############################################################################
# Interpreting encoded vectors
# ----------------------------
#
# The GapEncoder can be understood as a continuous encoding on a set of latent
# topics estimated from the data. The latent topics are built by
# capturing combinations of substrings that frequently co-occur, and encoded
# vectors correspond to their activations.
# To interpret these latent topics, we select for each of them a few labels
# from the input data with the highest activations.
# In the example below we select 3 labels to summarize each topic.

# n_labels=3 asks for three summarizing labels per topic, as described above.
topic_labels = enc.get_feature_names_out(n_labels=3)