Example #1
def test_errors_loss_output_activation():
    """Make sure cross-entropy loss with activations not equal to
    `tensorflow.nn.sigmoid` or `tensorflow.nn.softmax` fails."""

    # This data will not actually be fit.
    # I am just using it to call the `fit` method.
    X = np.ones((1000, 4))

    # There are two code paths for this test. One for all features
    # using the default loss and one for a mix of losses.

    # All features use the default loss.
    ae = Autoencoder(loss='cross-entropy', output_activation=tf.exp)
    with pytest.raises(ValueError) as e:
        ae.fit(X)
    assert "'cross-entropy' loss!" in str(e.value), (
        "Wrong error raised for testing 'cross-entropy' loss with "
        "output activation that is not allowed for all features!")

    # Not all features use the default loss.
    ae = Autoencoder(loss='cross-entropy',
                     output_activation=tf.exp,
                     sigmoid_indices=[0])
    with pytest.raises(ValueError) as e:
        ae.fit(X)
    assert "'cross-entropy' loss!" in str(e.value), (
        "Wrong error raised for testing 'cross-entropy' loss with "
        "output activation that is not allowed for a subset of features!")
Example #2
def test_errors_overlapping_sigmoid_softmax_indices():
    """Make sure overlapping sigmoid and softmax indices raise an error."""

    # This data will not actually be fit.
    # I am just using it to call the `fit` method.
    X = np.ones((1000, 4))

    ae = Autoencoder(loss='blah',
                     sigmoid_indices=[0],
                     softmax_indices=[[0, 2]])
    with pytest.raises(ValueError) as e:
        ae.fit(X)
    assert "Sigmoid indices and softmax indices" in str(e.value), (
        "Wrong error raised for overlapping sigmoid and softmax indices")
Example #3
def test_persistence():
    """Make sure we can pickle it."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(1,),
                     n_epochs=1000,
                     random_state=4556,
                     learning_rate=1e-2,
                     keep_prob=1.0)
    Xenc = ae.fit_transform(X)

    b = BytesIO()
    pickle.dump(ae, b)
    ae_pickled = pickle.loads(b.getvalue())
    Xenc_pickled = ae_pickled.transform(X)
    assert_array_almost_equal(Xenc, Xenc_pickled)
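Because the round-trip above goes through pickle, the same fitted estimator can be persisted to disk as well. A minimal sketch (the filename is illustrative):

# Persist the fitted autoencoder to disk and restore it.
with open('autoencoder.pkl', 'wb') as f:
    pickle.dump(ae, f)
with open('autoencoder.pkl', 'rb') as f:
    ae_restored = pickle.load(f)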
Example #4
def test_refitting():
    """Make sure that refitting resets internals."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    # Use digitize to make binary features.
    for i in range(X.shape[1]):
        bins = [0.0, np.median(X[:, i]), 1.1]
        X[:, i] = np.digitize(X[:, i], bins) - 1.0

    ae = Autoencoder(hidden_units=(1,),
                     n_epochs=1000,
                     random_state=4556,
                     learning_rate=1e-2,
                     keep_prob=1.0,
                     loss='cross-entropy',
                     output_activation=tf.nn.sigmoid)
    ae.fit(X)
    assert ae.input_layer_size_ == 4, ("Input layer is the wrong size for "
                                       "the Autoencoder!")

    X_small = X[:, 0:-1]
    assert X_small.shape != X.shape, "Test data for refitting does not work!"
    ae.fit(X_small)
    assert ae.input_layer_size_ == 3, ("Input layer is the wrong size for "
                                       "the Autoencoder!")
Example #5
def test_monitor_ae():
    """Test the monitor keyword."""
    # Use the iris features.
    X = iris.data
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(3, 2,),
                     n_epochs=7500,
                     random_state=4556,
                     learning_rate=DEFAULT_LEARNING_RATE,
                     keep_prob=1.0,
                     hidden_activation=tf.nn.sigmoid,
                     encoding_activation=tf.nn.sigmoid,
                     output_activation=tf.nn.sigmoid)

    def _monitor(epoch, est, stats):
        assert epoch <= 1000, "The autoencoder has been running too long!"
        if stats['loss'] < 0.2:
            assert epoch > 10, "The autoencoder returned too soon!"
            return True
        else:
            return False
    ae.fit(X, monitor=_monitor)
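The monitor callback receives the current epoch, the estimator, and a stats dict, and training stops once it returns True (the test asserts epoch <= 1000 even though n_epochs=7500, so the early return must end training). A simpler monitor that only logs progress, as a sketch:

def _logging_monitor(epoch, est, stats):
    # Log the loss every 100 epochs; never stop training early.
    if epoch % 100 == 0:
        _LOGGER.warning("epoch %d: loss %g", epoch, stats['loss'])
    return False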
Example #6
def test_errors_unallowed_loss():
    """Make sure unallowed losses cause an error."""

    # This data will not actually be fit.
    # I am just using it to call the `fit` method.
    X = np.ones((1000, 4))

    # There are two code paths for this test. One for all features
    # using the default loss and one for a mix of losses.

    # All features use the default loss.
    ae = Autoencoder(loss='blah')
    with pytest.raises(ValueError) as e:
        ae.fit(X)
    assert "Loss 'blah'" in str(e.value), (
        "Wrong error raised for testing unallowed losses!")

    # Not all features use the default loss.
    ae = Autoencoder(loss='blah', sigmoid_indices=[0])
    with pytest.raises(ValueError) as e:
        ae.fit(X)
    assert "Loss 'blah'" in str(e.value), (
        "Wrong error raised for testing unallowed losses!")
Example #7
def test_mse_sigmoid_activations():
    """Test the MSE loss w/ sigmoid activation."""
    # Use the iris features.
    X = iris.data
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(3, 2,),
                     n_epochs=7500,
                     random_state=4556,
                     learning_rate=DEFAULT_LEARNING_RATE,
                     keep_prob=1.0,
                     hidden_activation=tf.nn.sigmoid,
                     encoding_activation=tf.nn.sigmoid,
                     output_activation=tf.nn.sigmoid)
    Xenc = ae.fit_transform(X)
    Xdec = ae.inverse_transform(Xenc)

    assert Xenc.shape == (X.shape[0], 2), ("Encoded iris data "
                                           "is not the right"
                                           " shape!")

    assert Xdec.shape == X.shape, ("Decoded iris data is not the right "
                                   "shape!")

    # Compute and test the scores.
    scores = 0.0
    for i in range(X.shape[1]):
        scores += np.sum((X[:, i:i+1] - Xdec[:, i:i+1]) ** 2, axis=1)

    ae_scores = ae.score_samples(X)
    assert_array_almost_equal(scores, ae_scores, decimal=5)

    score = np.mean(scores)
    ae_score = ae.score(X)
    assert_almost_equal(score, ae_score, decimal=5)

    max_score = 0.1
    _LOGGER.warning("\ntest info:\n    ae: %s\n"
                    "    score: %g\n    X[10]: %s\n    Xdec[10]: %s",
                    str(ae), ae_score,
                    pprint.pformat(list(X[10])),
                    pprint.pformat(list(Xdec[10])))

    assert ae_score < max_score, ("Autoencoder should have a score "
                                  "less than %f for the iris features." %
                                  max_score)
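The per-feature loop above is equivalent to one vectorized sum of squared errors per sample:

# Same per-sample reconstruction score, computed in one step.
scores = np.sum((X - Xdec) ** 2, axis=1)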
Example #8
def test_replicability():
    """Make sure it can be seeded properly."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    ae1 = Autoencoder(hidden_units=(1,),
                      n_epochs=1000,
                      random_state=4556,
                      learning_rate=1e-2,
                      keep_prob=1.0)
    Xenc1 = ae1.fit_transform(X)

    ae2 = Autoencoder(hidden_units=(1,),
                      n_epochs=1000,
                      random_state=4556,
                      learning_rate=1e-2,
                      keep_prob=1.0)
    Xenc2 = ae2.fit_transform(X)

    assert_array_almost_equal(Xenc1, Xenc2)
Example #9
def _check_ae(max_score,
              hidden_units=(1,),
              keep_prob=1.0,
              learning_rate=None,
              sparse_type=None,
              bin_inds=None,
              bin_inds_to_use=None,
              cat_inds=None,
              n_epochs=7500,
              loss='mse'):
    """Helper function for testing the Autoencoder.

    This function does in order:

    1. Loads the Iris data.
    2. Converts columns to either binary (bin_inds) or categorical (cat_inds).
        Binary columns are passed as sigmoid indices and categorical
        columns as softmax indices.
    3. Converts the data to a sparse type (sparse_type).
    4. Builds and trains the autoencoder (learning_rate, n_epochs, dropout,
        hidden_units, loss, bin_inds_to_use)
    5. Tests the outputs (max_score).

    The `bin_inds_to_use` parameter in particular specifies which columns
    the autoencoder is explicitly told are sigmoid values.
    """
    # Use the iris features.
    X = iris.data
    X = MinMaxScaler().fit_transform(X)

    # Make some columns binary or one-hot encoded.
    cat_size = []
    cat_begin = []
    binary_inds = []
    keep_cols = []
    def_inds = []
    num_cols = 0
    for i in range(X.shape[1]):
        if bin_inds is not None and i in bin_inds:
            bins = [0.0, np.median(X[:, i]), 1.1]
            keep_cols.append((np.digitize(X[:, i], bins) - 1.0)[:, np.newaxis])
            binary_inds.append(num_cols)
            num_cols += 1
        elif cat_inds is not None and i in cat_inds:
            # Vary the number of categories to shake out bugs.
            bins = np.percentile(X[:, i], 100.0 / (i + 3) * np.arange(i + 4))
            bins[0] = 0.0
            bins[-1] = 1.1
            oe = OneHotEncoder(sparse=False)
            col = oe.fit_transform(
                (np.digitize(X[:, i], bins) - 1.0)[:, np.newaxis])
            keep_cols.append(col)
            cat_begin.append(num_cols)
            cat_size.append(col.shape[1])
            num_cols += col.shape[1]
        else:
            keep_cols.append(X[:, i:i+1])
            def_inds.append(num_cols)
            num_cols += 1

    X = np.hstack(keep_cols)

    if len(cat_size) == 0:
        cat_indices = None
    else:
        cat_indices = np.hstack([np.array(cat_begin)[:, np.newaxis],
                                 np.array(cat_size)[:, np.newaxis]])
        assert cat_indices.shape[1] == 2, ("Categorical indices are the "
                                           "wrong shape!")

    if len(binary_inds) == 0:
        binary_inds = None

    if sparse_type is not None:
        X = getattr(sp, sparse_type + '_matrix')(X)

    # For sigmoid runs, we can either set the loss or use sigmoid_indices.
    # This block handles those cases.
    if bin_inds_to_use is None and binary_inds is not None:
        bin_inds_to_use = binary_inds
    elif bin_inds_to_use == -1:
        bin_inds_to_use = None

    if loss == 'cross-entropy':
        output_activation = tf.nn.sigmoid
    else:
        output_activation = None

    if learning_rate is None:
        learning_rate = DEFAULT_LEARNING_RATE

    ae = Autoencoder(hidden_units=hidden_units,
                     n_epochs=n_epochs,
                     random_state=4556,
                     learning_rate=learning_rate,
                     keep_prob=keep_prob,
                     loss=loss,
                     sigmoid_indices=bin_inds_to_use,
                     softmax_indices=cat_indices,
                     hidden_activation=tf.nn.relu,
                     encoding_activation=None,
                     output_activation=output_activation)

    Xenc = ae.fit_transform(X)
    Xdec = ae.inverse_transform(Xenc)

    if sparse_type is not None:
        X = X.toarray()

    assert Xenc.shape == (X.shape[0], hidden_units[-1]), ("Encoded iris data "
                                                          "is not the right"
                                                          " shape!")

    assert Xdec.shape == X.shape, ("Decoded iris data is not the right "
                                   "shape!")

    # One-hot encoded stuff should come back out normalized.
    if cat_size:
        for begin, size in zip(cat_begin, cat_size):
            assert_array_almost_equal(
                np.sum(Xdec[:, begin: begin + size], axis=1), 1.0, decimal=5)

    # Compute and test the scores.
    scores = 0.0
    for i in range(X.shape[1]):
        if binary_inds is not None and i in binary_inds:
            scores += _cross_entropy(X[:, i], Xdec[:, i])
        elif cat_size and i in cat_begin:
            ind = cat_begin.index(i)
            b = cat_begin[ind]
            s = cat_size[ind]
            scores += _cross_entropy(X[:, b:b+s], Xdec[:, b:b+s])
        elif i in def_inds:
            if loss == 'mse':
                scores += np.sum((X[:, i:i+1] - Xdec[:, i:i+1]) ** 2, axis=1)
            else:
                scores += _cross_entropy(X[:, i], Xdec[:, i])

    ae_scores = ae.score_samples(X)
    assert_array_almost_equal(scores, ae_scores, decimal=5)

    score = np.mean(scores)
    ae_score = ae.score(X)
    assert_almost_equal(score, ae_score, decimal=5)

    _LOGGER.warning("\ntest info:\n    ae: %s\n    sparse format: %s\n"
                    "    score: %g\n    X[10]: %s\n    Xdec[10]: %s",
                    str(ae), sparse_type, ae_score,
                    pprint.pformat(list(X[10])),
                    pprint.pformat(list(Xdec[10])))

    assert ae_score < max_score, ("Autoencoder should have a score "
                                  "less than %f for the iris features." %
                                  max_score)

    return ae_score
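The _cross_entropy helper called above is defined elsewhere in the test module and not shown in these excerpts. A minimal sketch consistent with how it is called (1-D arrays for sigmoid columns, 2-D one-hot blocks for softmax groups); the clipping epsilon is an assumption:

def _cross_entropy(y, p, eps=1e-7):
    """Assumed helper: per-sample cross-entropy between targets `y`
    and reconstructed probabilities `p`."""
    p = np.clip(p, eps, 1.0 - eps)
    if y.ndim == 1:
        # Binary cross-entropy for a single sigmoid column.
        return -(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))
    # Categorical cross-entropy summed over a one-hot block.
    return -np.sum(y * np.log(p), axis=1)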
Example #10
def test_sigmoid_softmax_cross_entropy_loss():
    """Test the cross-entropy loss w/ softmax and sigmoid."""

    # Use the iris features.
    X = iris.data
    X = MinMaxScaler().fit_transform(X)

    # Normalize the rows so each sample's features sum to unity.
    X = X / np.sum(X, axis=1)[:, np.newaxis]

    for i in range(2):
        if i == 1:
            bins = [0.0, np.median(X[:, 0]), 1.1]
            X[:, 0] = np.digitize(X[:, 0], bins) - 1.0
            X[:, 1:] = X[:, 1:] / np.sum(X[:, 1:], axis=1)[:, np.newaxis]
            binary_indices = [0]
        else:
            binary_indices = None

        ae = Autoencoder(hidden_units=(2,),
                         n_epochs=7500,
                         random_state=4556,
                         learning_rate=DEFAULT_LEARNING_RATE,
                         keep_prob=1.0,
                         loss='cross-entropy',
                         output_activation=tf.nn.softmax,
                         sigmoid_indices=binary_indices,
                         hidden_activation=tf.nn.relu,
                         encoding_activation=None)

        Xenc = ae.fit_transform(X)
        Xdec = ae.inverse_transform(Xenc)

        assert Xenc.shape == (X.shape[0], 2), ("Encoded iris data is not the "
                                               "right shape!")

        assert Xdec.shape == X.shape, ("Decoded iris data is not the right "
                                       "shape!")

        if i == 1:
            # Softmax stuff should come back out normalized.
            assert_array_almost_equal(np.sum(Xdec[:, 1:], axis=1), 1.0)

            # Compute and test the scores.
            scores = _cross_entropy(X[:, 1:], Xdec[:, 1:])
            scores += _cross_entropy(X[:, 0], Xdec[:, 0])

            ae_scores = ae.score_samples(X)
            assert_array_almost_equal(scores, ae_scores, decimal=5)
        else:
            # Softmax stuff should come back out normalized.
            assert_array_almost_equal(np.sum(Xdec, axis=1), 1.0)

            # Compute and test the scores.
            scores = _cross_entropy(X, Xdec)
            ae_scores = ae.score_samples(X)
            assert_array_almost_equal(scores, ae_scores, decimal=5)

        score = np.mean(scores)
        ae_score = ae.score(X)
        assert_almost_equal(score, ae_score, decimal=5)

        _LOGGER.warning("\ntest info:\n    ae: %s\n"
                        "    score: %g\n    X[10]: %s\n    Xdec[10]: %s",
                        str(ae), ae_score,
                        pprint.pformat(list(X[10])),
                        pprint.pformat(list(Xdec[10])))

        assert ae_score < 2.5, ("Autoencoder should have a score "
                                "less than 2.5 for the iris features.")
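All of the snippets above come from one test module and rely on shared imports and fixtures that are not repeated per example. A plausible preamble; the muffnn import path and the DEFAULT_LEARNING_RATE value are assumptions:

import logging
import pickle
import pprint
from io import BytesIO

import numpy as np
import pytest
import scipy.sparse as sp
import tensorflow as tf
from numpy.testing import assert_almost_equal, assert_array_almost_equal
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from muffnn import Autoencoder  # assumed import path

_LOGGER = logging.getLogger(__name__)
iris = load_iris()
DEFAULT_LEARNING_RATE = 1e-2  # assumed value; matches the rate in other tests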