Example #1
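These examples assume a shared module-level preamble; a minimal sketch of what it might look like (the exact import paths for the project helpers are assumptions, not given in the source):

# Assumed preamble -- the helper module names below are hypothetical.
import os
import time

import numpy as np
from sklearn.metrics import classification_report, f1_score
from transformers import AdamW, BertForSequenceClassification, BertTokenizer

# Project-specific helpers referenced by the examples (definitions not shown):
# load_train_data, load_test_data_a, process_train_data, process_test_data,
# build_ngrams_dataset, build_glove_featurized_dataset, build_LSTM_dataset,
# generate_glove_embedding, change_to_binary, TorchBertClassifier,
# TorchLSTMClassifier, TorchLSTM_CNNClassifier, and the RESULT_FOLDER constant.
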
def prepare_data(featurizer, dim):
    """Featurize the train/test splits with either n-gram or GloVe features."""
    if featurizer not in ('ngram', 'glove'):
        raise ValueError("featurizer must be 'ngram' or 'glove'.")

    # Load and preprocess the data.
    train_data = load_train_data()
    train_data = process_train_data(train_data)
    test_data = load_test_data_a()
    test_data = process_test_data(test_data)

    # Build the featurized train/test splits.
    if featurizer == 'ngram':
        train_set = build_ngrams_dataset(train_data)
        # Reuse the vectorizer fitted on the training data for the test data.
        vectorizer = train_set['vectorizer']
        test_set = build_ngrams_dataset(test_data, vectorizer=vectorizer)
    else:
        train_set = build_glove_featurized_dataset(train_data, dim)
        test_set = build_glove_featurized_dataset(test_data, dim)
    train_X = train_set['X']
    train_y = train_set['y']
    print("Shape of train_X: {}".format(train_X.shape))
    test_X = test_set['X']
    test_y = test_set['y']
    print("Shape of test_X: {}".format(test_X.shape))
    return {'train_X': train_X,
            'train_y': train_y,
            'test_X': test_X,
            'test_y': test_y}
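
A hypothetical usage sketch for prepare_data; the LogisticRegression baseline is an illustrative assumption, not part of the source:

# Hypothetical usage: n-gram features with a simple linear baseline.
from sklearn.linear_model import LogisticRegression

data = prepare_data('ngram', dim=50)  # dim is only used by the GloVe branch
clf = LogisticRegression(max_iter=1000)
clf.fit(data['train_X'], data['train_y'])
print("Test accuracy: {:.3f}".format(clf.score(data['test_X'], data['test_y'])))
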
def BERT_model(max_set_length=128,
               max_iter=2,
               batch_size=32,
               eta=2e-5,
               eps=1e-8):
    """Fine-tune bert-base-uncased on subtask A and report the macro F1 score."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",         # the 12-layer BERT model with an uncased vocab
        num_labels=2,                # 2 output labels for binary classification;
                                     # increase this for multi-class tasks
        output_attentions=False,     # do not return attention weights
        output_hidden_states=False,  # do not return all hidden states
    )

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = list(train_data['tweet']), list(train_data['subtask_a'])

    bert_classifier = TorchBertClassifier(tokenizer=tokenizer,
                                          model=model,
                                          optimizer=AdamW,
                                          max_set_length=max_set_length,
                                          max_iter=max_iter,
                                          batch_size=batch_size,
                                          eta=eta,
                                          eps=eps)
    print(bert_classifier)

    bert_classifier.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = list(test_data['tweet']), list(test_data['subtask_a'])

    predictions = bert_classifier.predict(X_test)
    test_data['prediction'] = np.array(predictions)
    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER, "BERT_Iter_{}_prediction.csv".format(max_iter))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))

    F1_score = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')
    print("f1 score: {}".format(F1_score))
    return F1_score
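
The change_to_binary helper is referenced but never defined here; a minimal sketch, assuming OffensEval-style 'OFF'/'NOT' string labels for subtask A (the real label set is not shown in the source):

# Assumed implementation of the change_to_binary helper (hypothetical).
def change_to_binary(labels):
    """Map 'OFF'/'NOT' string labels to 1/0 for binary metric computation."""
    return [1 if label == 'OFF' else 0 for label in labels]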
Example #3
def BiLSTM_CNN_model(embed_dim=50,
                     batch_size=1024,
                     max_iter=10,
                     hidden_dim=50,
                     bidirectional=True,
                     out_channels=30,
                     kernel_sizes=(3, 4, 5),  # tuple avoids the mutable-default pitfall
                     dropout_prob=0.1):
    """Train a (Bi)LSTM+CNN classifier on GloVe features and report macro F1."""
    start_time = time.time()
    vocab, embedding = generate_glove_embedding(embed_dim)

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = build_LSTM_dataset(train_data, 128)

    mod = TorchLSTM_CNNClassifier(vocab=vocab,
                                  embedding=embedding,
                                  embed_dim=embed_dim,
                                  max_iter=max_iter,
                                  bidirectional=bidirectional,
                                  hidden_dim=hidden_dim,
                                  out_channels=out_channels,
                                  kernel_sizes=kernel_sizes,
                                  dropout_prob=dropout_prob,
                                  batch_size=batch_size)

    mod.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = build_LSTM_dataset(test_data, 128)

    predictions = mod.predict(X_test)
    test_data['prediction'] = np.array(predictions)
    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER,
        "BiLSTM_CNN_{}-embedding_{}-batchsize_{}-hidden_{}-filters_{}-iter_prediction.csv"
        .format(embed_dim, batch_size, hidden_dim, out_channels, max_iter))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))

    f1_macro = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')
    print("BiLSTM+CNN embedding dim: {}, batch size: {}, hiddend dim: {}, out channels: {}, max_iter: {}, dropout: {}, macro f1 score: {}" \
        .format(embed_dim, batch_size, hidden_dim, out_channels, max_iter, dropout_prob, f1_macro))

    end_time = time.time()
    print("Finish BiLSTM+CNN in {} mins.".format((end_time - start_time) / 60))
    return f1_macro
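
A small hypothetical sweep built on BiLSTM_CNN_model; 50/100/200/300 are the standard pre-trained GloVe dimensions:

# Hypothetical hyperparameter sweep over GloVe embedding dimensions.
results = {}
for dim in (50, 100, 200, 300):
    results[dim] = BiLSTM_CNN_model(embed_dim=dim, max_iter=10)
best_dim = max(results, key=results.get)
print("Best embedding dim: {} (macro F1 = {:.4f})".format(best_dim, results[best_dim]))
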
def LSTM_model(embed_dim=300,
               max_iter=100,
               batch_size=32,
               hidden_dim=50,
               eta=0.001,
               bidirectional=False):
    """Train a (Bi)LSTM classifier on GloVe features and report the macro F1 score."""
    vocab, embedding = generate_glove_embedding(embed_dim)

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = build_LSTM_dataset(train_data, 128)

    mod = TorchLSTMClassifier(vocab=vocab,
                              embedding=embedding,
                              embed_dim=embed_dim,
                              max_iter=max_iter,
                              batch_size=batch_size,
                              eta=eta,
                              bidirectional=bidirectional,
                              hidden_dim=hidden_dim)

    print(mod)

    mod.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = build_LSTM_dataset(test_data, 128)

    predictions = mod.predict(X_test)
    test_data['prediction'] = np.array(predictions)
    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER, "LSTM_{}-embedding_{}-hidden_prediction.csv".format(
            embed_dim, hidden_dim))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))

    F1_score = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')

    print("LSTM embedding dim: {}, f1 score: {}".format(embed_dim, F1_score))
    return F1_score
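
Finally, a minimal comparison sketch using LSTM_model; the specific settings are illustrative assumptions:

# Hypothetical comparison of a unidirectional vs. bidirectional LSTM.
f1_uni = LSTM_model(embed_dim=300, max_iter=100, bidirectional=False)
f1_bi = LSTM_model(embed_dim=300, max_iter=100, bidirectional=True)
print("Unidirectional F1: {:.4f}, bidirectional F1: {:.4f}".format(f1_uni, f1_bi))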