Example #1
import tensorflow as tf

# Local helper modules assumed to sit alongside this script:
# explore_data, vectorize_data and build_model.
import build_model
import explore_data
import vectorize_data


def train_mlp_model(data,
                    learning_rate=1e-3,
                    epochs=1000,
                    batch_size=128,
                    layers=2,
                    units=64,
                    dropout_rate=0.3):
    (train_texts, train_labels), (val_texts, val_labels) = data

    num_classes = explore_data.get_num_classes(train_labels)

    unexpected_labels = [i for i in val_labels if i not in range(num_classes)]
    if len(unexpected_labels) > 0:
        raise ValueError(
            'Unexpected label values found in validation set: '
            '{unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    x_train, x_val = vectorize_data.tfidf_vectorize(train_texts, train_labels,
                                                    val_texts)

    model = build_model.mlp_model(layers=layers,
                                  units=units,
                                  dropout_rate=dropout_rate,
                                  input_shape=x_train.shape[1:],
                                  num_classes=num_classes)

    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,
                        batch_size=batch_size)

    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    model.save('mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
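
A minimal usage sketch for train_mlp_model (not part of the original snippet): load_dataset is a hypothetical helper; any loader that returns ((train_texts, train_labels), (val_texts, val_labels)) tuples with integer labels starting at 0 will do.

# Hypothetical loader; swap in your own data-loading code.
data = load_dataset('./data')

# Returns the final validation accuracy and loss; the function also saves
# the trained model to 'mlp_model.h5'.
val_acc, val_loss = train_mlp_model(data,
                                    learning_rate=1e-3,
                                    epochs=1000,
                                    batch_size=128)
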
def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    """Trains n-gram model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val = vectorize_data.ngram_vectorize(train_texts, train_labels,
                                                    val_texts)

    # Create model instance.
    model = build_model.mlp_model(layers=layers,
                                  units=units,
                                  dropout_rate=dropout_rate,
                                  input_shape=x_train.shape[1:],
                                  num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Train and validate model.
    history = model.fit(
        x_train,
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val, val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('tuned_model.h5')
    return model, history['val_acc'][-1], history['val_loss'][-1]
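
train_ngram_model follows the same calling convention and additionally returns the trained model, which it also writes to 'tuned_model.h5'. A short hedged sketch, assuming data is already packed as described in the docstring:

# data = ((train_texts, train_labels), (val_texts, val_labels))
model, val_acc, val_loss = train_ngram_model(data)

# The saved model can later be restored from the HDF5 file.
restored_model = tf.keras.models.load_model('tuned_model.h5')
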

# A separate snippet: building and compiling the same MLP from a pandas
# DataFrame using scikit-learn utilities. `training_data` is assumed to be a
# DataFrame with 'article_words' (text) and 'topic' (label) columns, and
# `ngram_vectorize` is a local helper that returns the vectorized train and
# validation matrices together with the fitted vectorizer.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

X_train, X_val, y_train, y_val = train_test_split(
    training_data['article_words'],
    training_data['topic'],
    test_size=0.10,
    random_state=42)
X_train, X_val, vectorizer = ngram_vectorize(X_train, y_train, X_val)

# Encode string labels as integers; fit on the training labels only and
# reuse the same mapping for the validation labels.
LB = LabelEncoder()
train_labels = LB.fit_transform(y_train)
val_labels = LB.transform(y_val)

num_classes = training_data['topic'].nunique()

model = build_model.mlp_model(layers=2,
                              units=32,
                              dropout_rate=0.2,
                              input_shape=X_train.shape[1:],
                              num_classes=num_classes)

# Compile model with learning parameters.
if num_classes == 2:
    loss = 'binary_crossentropy'
else:
    loss = 'sparse_categorical_crossentropy'
optimizer = Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

# Create callback for early stopping on validation loss. If the loss does
# not decrease in two consecutive tries, stop training.
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
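
The snippet above stops after defining the early-stopping callback. A minimal continuation, assuming the variables defined above, would mirror the fit/report pattern of the earlier functions:

# Train and validate, then report the final validation metrics.
history = model.fit(X_train,
                    train_labels,
                    epochs=1000,
                    callbacks=callbacks,
                    validation_data=(X_val, val_labels),
                    verbose=2,
                    batch_size=128)

history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(
    acc=history['val_acc'][-1], loss=history['val_loss'][-1]))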