Esempio n. 1
0
def test_gap_encoder(hashing, init, analyzer, add_words, n_samples=70):
    """Check GapEncoder output shape, topic L1-norms and seed determinism.

    The first ``n_samples`` documents of the 20 newsgroups training set are
    encoded with 10 components. The test then verifies that:

    * the transformed output has shape ``(n_samples, n_components)`` and the
      first row contains ``n_components`` distinct values;
    * every row of ``W_`` sums (in absolute value) to approximately 1;
    * a second encoder built with the same seed produces exactly the same
      output (``rescale_W`` is passed explicitly only to the first encoder).
    """
    documents = fetch_20newsgroups(subset='train')['data'][:n_samples]
    n_components = 10
    # Keyword arguments common to both encoders built below.
    shared_params = dict(
        n_components=n_components,
        hashing=hashing,
        init=init,
        analyzer=analyzer,
        add_words=add_words,
        random_state=42,
    )

    # Output shape.
    first_encoder = GapEncoder(rescale_W=True, **shared_params)
    first_encoder.fit(documents)
    y = first_encoder.transform(documents)
    assert y.shape == (n_samples, n_components), str(y.shape)
    assert len(set(y[0])) == n_components

    # L1-norm of the topics W: one value per component, each close to 1.
    np.testing.assert_array_almost_equal(
        np.abs(first_encoder.W_).sum(axis=1),
        np.ones(n_components),
    )

    # The same seed must give the same output.
    second_encoder = GapEncoder(**shared_params)
    second_encoder.fit(documents)
    y2 = second_encoder.transform(documents)
    np.testing.assert_array_equal(y, y2)
Esempio n. 2
0
def test_analyzer(init1, analyzer1, analyzer2):
    """Check that two different analyzers produce different encodings.

    Two GapEncoders that differ only in their ``analyzer`` (for instance
    'word' versus 'char') are fitted on the same two-column text array.
    The test passes when the transformed outputs are not equal.

    Parameters
    ----------
    init1 : str
        Initialisation method forwarded to both encoders as ``init``.
    analyzer1 : str
        Analyzer used by the first encoder.
    analyzer2 : str
        Analyzer used by the second encoder.
    """
    add_words = False
    n_samples = 70
    X_txt = fetch_20newsgroups(subset='train')['data'][:n_samples]
    # Duplicate the text into two columns to get a (n_samples, 2) input.
    X = np.array([X_txt, X_txt]).T
    n_components = 10
    # Test first analyzer output. ``init1`` is used here (it was previously
    # ignored in favour of a hard-coded 'k-means++'), so each parametrised
    # initialisation is really exercised.
    encoder = GapEncoder(n_components=n_components,
                         init=init1,
                         analyzer=analyzer1,
                         add_words=add_words,
                         random_state=42,
                         rescale_W=True)
    encoder.fit(X)
    y = encoder.transform(X)

    # Test the other analyzer output, with the same initialisation and seed
    # so that the analyzer is the only difference between the two runs.
    encoder = GapEncoder(n_components=n_components,
                         init=init1,
                         analyzer=analyzer2,
                         add_words=add_words,
                         random_state=42)
    encoder.fit(X)
    y2 = encoder.transform(X)

    # Test inequality between the two analyzer outputs: assert_array_equal
    # must raise AssertionError, i.e. the arrays must differ.
    np.testing.assert_raises(AssertionError, np.testing.assert_array_equal, y,
                             y2)
Esempio n. 3
0
def test_partial_fit(n_samples=70):
    """Compare a one-batch, one-iteration ``fit`` with one ``partial_fit``.

    Both encoders use ``random_state=42`` and see the same first
    ``n_samples`` documents of the 20 newsgroups training set; their
    encodings of that data are expected to be almost equal.
    """
    corpus = fetch_20newsgroups(subset='train')['data'][:n_samples]

    # Regular fit, limited to a single batch covering all samples and a
    # single iteration.
    batch_encoder = GapEncoder(
        random_state=42, batch_size=n_samples, max_iter=1)
    fit_encoding = batch_encoder.fit_transform(corpus)

    # Single call to partial_fit on the same data, then transform.
    incremental_encoder = GapEncoder(random_state=42)
    incremental_encoder.partial_fit(corpus)
    partial_encoding = incremental_encoder.transform(corpus)

    # The two encodings must match.
    np.testing.assert_almost_equal(fit_encoding, partial_encoding)
# In the example below we select 3 labels to summarize each topic.

topic_labels = enc.get_feature_names_out(n_labels=3)
for k, labels in enumerate(topic_labels):
    print(f'Topic n°{k}: {labels}')

###############################################################################
# As expected, topics capture labels that frequently co-occur. For instance,
# the labels *firefighter*, *rescuer*, *rescue* appear together in
# *Firefighter/Rescuer III*, or *Fire/Rescue Lieutenant*.
#
# This enables us to understand the encoding of different samples

import matplotlib.pyplot as plt

# Number of data entries displayed in the heat-map.
n_shown = 20
shown_entries = X_dirty[:n_shown]
encoded_labels = enc.transform(shown_entries)
plt.figure(figsize=(8, 10))
plt.imshow(encoded_labels)
plt.xlabel('Latent topics', size=12)
# Tick positions are derived from the labels themselves, so the number of
# ticks always matches the number of labels.
plt.xticks(range(len(topic_labels)), labels=topic_labels,
           rotation=50, ha='right')
plt.ylabel('Data entries', size=12)
plt.yticks(range(len(shown_entries)),
           labels=shown_entries.to_numpy().flatten())
plt.colorbar().set_label(label='Topic activations', size=12)
plt.tight_layout()
plt.show()

###############################################################################
# As we can see, each dirty category encodes on a small number of topics.
# These can thus be reliably used to summarize each topic, which is in
# effect a latent category captured from the data.