Example #1
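Both examples assume the following imports and module-level settings; the values shown here are illustrative placeholders, not the repo's actual configuration (note that max_fatures keeps the original identifier's spelling):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

max_fatures = 2000   # vocabulary size for the tokenizer
max_len = 80         # padded sequence length
embed_dim = 128      # embedding dimension
lstm_out = 196       # LSTM units
batch_size = 32

The constructors simple_lstm and one_hot_cnn are also defined elsewhere in the repo. A plausible simple_lstm, assuming a standard Embedding -> LSTM -> softmax stack (the dropout argument is given a default so the sketch matches both call sites below; this is a reconstruction, not the repo's code):

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

def simple_lstm(max_features, dense_out, input_length, embed_dim, lstm_out, dropout=0.2):
    # Hypothetical reconstruction of the repo's LSTM classifier
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=input_length))
    model.add(LSTM(lstm_out, dropout=dropout, recurrent_dropout=dropout))
    model.add(Dense(dense_out, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model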
def ensemble_models(data, args):
    # Preprocess the raw data
    data.process()
    # Output layer size equals the number of label classes
    dense_out = len(data.labels[0])
    # Create one train/test split shared by both models
    X_train_, X_test_, Y_train, Y_test = train_test_split(data.text, data.labels,
                                                          test_size=0.20, random_state=42)

    # Prep data for the LSTM model (max_fatures, max_len, embed_dim, lstm_out,
    # and batch_size are module-level settings; illustrative values sketched above)
    tokenizer = Tokenizer(num_words=max_fatures, split=' ')
    tokenizer.fit_on_texts(X_train_)
    X_train = tokenizer.texts_to_sequences(X_train_)
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = tokenizer.texts_to_sequences(X_test_)
    X_test = pad_sequences(X_test, maxlen=max_len)

    # Train the LSTM model
    lstm_model = simple_lstm(max_fatures, dense_out, X_train.shape[1], embed_dim, lstm_out)
    model_hist = lstm_model.fit(X_train, Y_train, epochs=args.epochs, batch_size=batch_size,
                                verbose=1, validation_data=(X_test, Y_test))
    # History key 'acc' is the final-epoch training accuracy (newer Keras versions name it 'accuracy')
    lstm_acc = model_hist.history['acc'][-1]
    print("LSTM model accuracy: ", lstm_acc)

    # And make predictions using the LSTM model
    lstm_predictions = lstm_model.predict(X_test)

    # Now prep data for the one-hot CNN model: to_one_hot (defined elsewhere)
    # encodes each raw text as a fixed-size one-hot matrix
    X_train_cnn = np.asarray([to_one_hot(x) for x in X_train_])
    X_test_cnn = np.asarray([to_one_hot(x) for x in X_test_])

    # And train the one-hot CNN classifier
    model_cnn = one_hot_cnn(dense_out, max_len)
    model_hist_cnn = model_cnn.fit(X_train_cnn, Y_train, batch_size=batch_size, epochs=args.epochs,
                                   verbose=1, validation_data=(X_test_cnn, Y_test))
    cnn_acc = model_hist_cnn.history['acc'][-1]
    print("CNN model accuracy: ", cnn_acc)

    # And make predictions
    one_hot_cnn_predictions = model_cnn.predict(X_test_cnn)

    # Weight each model by its accuracy, normalized so the weights sum to 1
    accuracies = [lstm_acc, cnn_acc]
    norm_accuracies = [a / sum(accuracies) for a in accuracies]

    print("Ensembling with weights: ")
    for na in norm_accuracies:
        print(na)
    ensembled_predictions = simple_ensembler([lstm_predictions, one_hot_cnn_predictions],
                                             norm_accuracies)
    # Collapse the weighted class probabilities to a single predicted class
    final_preds = np.argmax(ensembled_predictions, axis=1)

    # Report per-class precision/recall/F1 for the ensemble
    print(classification_report(np.argmax(Y_test, axis=1), final_preds,
                                target_names=data.labels_0.columns.values))
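
Two helpers used above, simple_ensembler and to_one_hot, are not shown in this listing. A minimal sketch of what simple_ensembler plausibly does, assuming it takes a list of per-model softmax outputs plus matching weights and returns their weighted sum (weighted soft voting; inferred from the call site, not confirmed):

def simple_ensembler(model_predictions, weights):
    # Weighted soft vote: sum of w_i * P_i across models.
    # model_predictions: list of (n_samples, n_classes) arrays
    # weights: floats summing to 1 (the normalized accuracies above)
    ensembled = np.zeros_like(model_predictions[0])
    for preds, w in zip(model_predictions, weights):
        ensembled = ensembled + w * preds
    return ensembled

Similarly, a hypothetical character-level to_one_hot, assuming a fixed alphabet and the module-level max_len (the repo's actual encoding may differ):

def to_one_hot(text, alphabet='abcdefghijklmnopqrstuvwxyz0123456789 '):
    # One row per character position, one column per alphabet symbol
    encoded = np.zeros((max_len, len(alphabet)))
    for i, ch in enumerate(str(text).lower()[:max_len]):
        idx = alphabet.find(ch)
        if idx >= 0:
            encoded[i, idx] = 1.0
    return encoded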
Example #2
def run_loss(args):
    data = args['data']

    # For each run we want to get a new random balance
    data.process()
    # Output layer size equals the number of label classes
    dense_out = len(data.labels[0])
    # Split into train/test sets
    X_train_, X_test_, Y_train, Y_test = train_test_split(data.text, data.labels,
                                                          test_size=0.20, random_state=42)

    print(args)

    # Prep data for the LSTM model.
    # Note: the tokenizer is fit on all text (unbalanced and train/test combined),
    # so test vocabulary leaks in; a pretrained embedding on a larger corpus would be preferable.

    tokenizer = Tokenizer(num_words=int(args['max_features']), split=' ')
    tokenizer.fit_on_texts(data.all_text)
    X_train = tokenizer.texts_to_sequences(X_train_)
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = tokenizer.texts_to_sequences(X_test_)
    X_test = pad_sequences(X_test, maxlen=max_len)

    # Train the LSTM model
    lstm_model = simple_lstm(int(args['max_features']), dense_out, X_train.shape[1],
                             int(args['embed_dim']), int(args['lstm_out']), args['dropout'])

    # Guard against a zero-epoch suggestion from the hyperparameter search
    if args['epochs'] == 0:
        args['epochs'] = 1

    # Stop early once validation accuracy fails to improve for 6 consecutive epochs
    es = EarlyStopping(monitor='val_acc', min_delta=0, patience=6, verbose=0, mode='max')
    model_hist = lstm_model.fit(X_train, Y_train, epochs=int(args['epochs']), batch_size=batch_size,
                                verbose=1, validation_data=(X_test, Y_test), callbacks=[es])
    lstm_acc = model_hist.history['val_acc'][-1]
    print("LSTM model accuracy: ", lstm_acc)
    # The search minimizes this objective, so return the complement of
    # validation accuracy in order to maximize it
    return 1 - lstm_acc
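
Because run_loss returns 1 - val_acc, it can be handed directly to a minimizer. A hypothetical driver using hyperopt (the search ranges, max_evals, and the assumption that data is already constructed are all illustrative, not from the original repo):

from hyperopt import fmin, tpe, hp

space = {
    'data': data,  # passed through unchanged to run_loss
    'max_features': hp.quniform('max_features', 1000, 20000, 1000),
    'embed_dim': hp.quniform('embed_dim', 32, 256, 32),
    'lstm_out': hp.quniform('lstm_out', 32, 256, 32),
    'dropout': hp.uniform('dropout', 0.1, 0.5),
    'epochs': hp.quniform('epochs', 0, 20, 5),  # the zero-epoch guard above bumps 0 to 1
}

# Minimizing 1 - val_acc maximizes validation accuracy
best = fmin(fn=run_loss, space=space, algo=tpe.suggest, max_evals=25)
print(best)

quniform returns floats, which is why run_loss casts max_features, embed_dim, lstm_out, and epochs to int before use.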