Code example #1
File: baseline.py Project: tsproisl/EmotiKLUE
def bow_baseline_naive_bayes(train_data, train_labels, test_data, test_labels):
    # Bag-of-words baseline: raw token counts classified with multinomial Naive Bayes.
    clf = sklearn.naive_bayes.MultinomialNB()
    cv = sklearn.feature_extraction.text.CountVectorizer()
    train_data = strip_triggerword(train_data)
    test_data = strip_triggerword(test_data)
    train = cv.fit_transform(train_data)
    test = cv.transform(test_data)
    clf.fit(train, train_labels)
    pred = clf.predict(test)
    print("\n## Bag of words (Naive Bayes) ##\n")
    evaluate_iest.calculatePRF(test_labels, pred.tolist())
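A minimal standalone sketch of the same bag-of-words / Naive Bayes pipeline, for running it outside the project: the toy sentences and the use of sklearn.metrics.classification_report are assumptions that stand in for the project-specific strip_triggerword() and evaluate_iest.calculatePRF() helpers.

import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.naive_bayes

# Toy data; the real inputs come from the IEST data files.
train_data = ["i was happy when it finally worked", "i was angry when it failed again"]
train_labels = ["joy", "anger"]
test_data = ["so happy that it worked"]
test_labels = ["joy"]

cv = sklearn.feature_extraction.text.CountVectorizer()
clf = sklearn.naive_bayes.MultinomialNB()
train = cv.fit_transform(train_data)   # fit the vocabulary on the training split only
test = cv.transform(test_data)         # reuse that vocabulary for the test split
clf.fit(train, train_labels)
pred = clf.predict(test)
print(sklearn.metrics.classification_report(test_labels, pred.tolist()))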
Code example #2
File: baseline.py Project: tsproisl/EmotiKLUE
def tfidf_baseline_svm(train_data, train_labels, test_data, test_labels):
    # tf-idf baseline: TfidfVectorizer features classified with a linear SVM.
    cv = sklearn.feature_extraction.text.TfidfVectorizer()
    clf = sklearn.svm.LinearSVC()
    # scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    # train_data = strip_triggerword(train_data)
    # test_data = strip_triggerword(test_data)
    train = cv.fit_transform(train_data)
    # train = scaler.fit_transform(train)
    test = cv.transform(test_data)
    # test = scaler.transform(test)
    clf.fit(train, train_labels)
    pred = clf.predict(test)
    print("\n## Bag of words tf-idf (Linear SVC) ##\n")
    evaluate_iest.calculatePRF(test_labels, pred.tolist())
Code example #3
File: baseline.py Project: tsproisl/EmotiKLUE
def bigrams_svm(train_data, train_labels, test_data, test_labels):
    # Uni- and bigram baseline: CountVectorizer(ngram_range=(1, 2)) features with a linear SVM.
    cv = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1, 2))
    clf = sklearn.svm.LinearSVC()
    # scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    # train_data = strip_triggerword(train_data)
    # test_data = strip_triggerword(test_data)
    train = cv.fit_transform(train_data)
    # train = scaler.fit_transform(train)
    test = cv.transform(test_data)
    # test = scaler.transform(test)
    clf.fit(train, train_labels)
    pred = clf.predict(test)
    print("\n## Bag of uni- and bigrams (Linear SVC) ##\n")
    evaluate_iest.calculatePRF(test_labels, pred.tolist())
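A quick illustration of the ngram_range=(1, 2) setting used above: the vectorizer emits both unigrams and bigrams as features. The toy sentence is an assumption, and the feature listing uses get_feature_names_out(), which is available in recent scikit-learn versions.

import sklearn.feature_extraction.text

cv = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1, 2))
cv.fit(["it never felt so good"])
# Unigrams and bigrams, sorted alphabetically:
# ['felt' 'felt so' 'good' 'it' 'it never' 'never' 'never felt' 'so' 'so good']
print(cv.get_feature_names_out())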
Code example #4
File: emotiklue.py Project: tsproisl/EmotiKLUE
def test(args):
    # Load a trained model and its index maps, vectorize the test file, and report P/R/F.
    # with keras.utils.CustomObjectScope({L1L2_m.__name__: L1L2_m}):
    model = keras.models.load_model("%s.h5" % args.model)
    with open("%s.maps" % args.model, encoding="utf-8") as f:
        word_to_idx, tgt_to_idx, max_len_lw, max_len_rw = json.load(f)
    idx_to_tgt = {i: c for c, i in tgt_to_idx.items()}
    test_lw, test_rw, test_tgt, _, _ = read_dataset(args.FILE)
    if args.lda:
        topics = topic_distribution(args.dict, args.lda, test_lw, test_rw)
    test_lw = [vectorize_words(lw, word_to_idx) for lw in test_lw]
    test_rw = [
        vectorize_words(rw, word_to_idx, reverse=True) for rw in test_rw
    ]
    test_left_words = keras.preprocessing.sequence.pad_sequences(
        test_lw, maxlen=max_len_lw, padding="pre", truncating="pre")
    test_right_words = keras.preprocessing.sequence.pad_sequences(
        test_rw, maxlen=max_len_rw, padding="pre", truncating="pre")
    if args.lda:
        predictions = model.predict(
            [test_left_words, test_right_words, topics])
    else:
        predictions = model.predict([test_left_words, test_right_words])
    predicted = [idx_to_tgt[p] for p in predictions.argmax(axis=1)]
    evaluate_iest.calculatePRF(list(test_tgt), predicted)
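The "%s.maps" file that test() reads is the JSON sidecar written by train() below: an array holding (word_to_idx, tgt_to_idx, max_len_lw, max_len_rw). A small sketch with toy contents; the file name and mappings here are illustrative only.

import json

# Toy contents; the real maps are produced by train() from the training data.
word_to_idx = {"happy": 1, "sad": 2}
tgt_to_idx = {"anger": 0, "joy": 1}
max_len_lw, max_len_rw = 25, 25

with open("model.maps", mode="w", encoding="utf-8") as f:
    json.dump((word_to_idx, tgt_to_idx, max_len_lw, max_len_rw), f, ensure_ascii=False)

with open("model.maps", encoding="utf-8") as f:
    word_to_idx, tgt_to_idx, max_len_lw, max_len_rw = json.load(f)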
Code example #5
File: baseline.py Project: tsproisl/EmotiKLUE
def majority_baseline(train_labels, test_labels):
    # Predict the most frequent training label for every test instance.
    train_freq = collections.Counter(train_labels)
    most_common = train_freq.most_common(1)[0][0]
    fake_prediction = [most_common] * len(test_labels)
    print("\n## Majority baseline ##\n")
    evaluate_iest.calculatePRF(test_labels, fake_prediction)
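The most_common(1) call above returns a list with a single (label, count) pair, hence the [0][0] indexing; a toy example:

import collections

train_labels = ["joy", "anger", "joy", "fear", "joy"]
train_freq = collections.Counter(train_labels)
print(train_freq.most_common(1))        # [('joy', 3)]
print(train_freq.most_common(1)[0][0])  # joy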
Code example #6
File: emotiklue.py Project: tsproisl/EmotiKLUE
def retrain(args):
    # Continue training an existing model on a new file, then evaluate on the validation set.
    BATCH_SIZE = 160
    # with keras.utils.CustomObjectScope({L1L2_m.__name__: L1L2_m}):
    model = keras.models.load_model("%s.h5" % args.model)
    # model.optimizer.set_state()
    with open("%s.maps" % args.model, encoding="utf-8") as f:
        word_to_idx, tgt_to_idx, max_len_lw, max_len_rw = json.load(f)
    train_lw, train_rw, train_tgt, _, _ = read_dataset(args.FILE)
    val_lw, val_rw, val_tgt, _, _ = read_dataset(args.val)
    if args.lda:
        train_topics = topic_distribution(args.dict, args.lda, train_lw,
                                          train_rw)
        val_topics = topic_distribution(args.dict, args.lda, val_lw, val_rw)
    train_lw = [vectorize_words(lw, word_to_idx) for lw in train_lw]
    train_rw = [
        vectorize_words(rw, word_to_idx, reverse=True) for rw in train_rw
    ]
    train_tgt = vectorize_words(train_tgt, tgt_to_idx)
    targets = keras.utils.to_categorical(train_tgt,
                                         num_classes=len(tgt_to_idx.values()))
    train_left_words = keras.preprocessing.sequence.pad_sequences(
        train_lw, maxlen=max_len_lw, padding="pre", truncating="pre")
    train_right_words = keras.preprocessing.sequence.pad_sequences(
        train_rw, maxlen=max_len_rw, padding="pre", truncating="pre")
    val_tgts = val_tgt  # keep the original string labels for the final evaluation
    val_lw = [vectorize_words(lw, word_to_idx) for lw in val_lw]
    val_rw = [vectorize_words(rw, word_to_idx, reverse=True) for rw in val_rw]
    val_tgt = vectorize_words(val_tgt, tgt_to_idx)
    val_targets = keras.utils.to_categorical(val_tgt,
                                             num_classes=len(
                                                 tgt_to_idx.values()))
    val_left_words = keras.preprocessing.sequence.pad_sequences(
        val_lw, maxlen=max_len_lw, padding="pre", truncating="pre")
    val_right_words = keras.preprocessing.sequence.pad_sequences(
        val_rw, maxlen=max_len_rw, padding="pre", truncating="pre")
    early_stopper = keras.callbacks.EarlyStopping(monitor="val_loss",
                                                  patience=2)
    if args.lda:
        model.fit(
            [train_left_words, train_right_words, train_topics],
            targets,
            batch_size=BATCH_SIZE,
            epochs=args.epochs,
            callbacks=[early_stopper],
            validation_data=([val_left_words, val_right_words,
                              val_topics], val_targets))
    else:
        model.fit([train_left_words, train_right_words],
                  targets,
                  batch_size=BATCH_SIZE,
                  epochs=args.epochs,
                  callbacks=[early_stopper],
                  validation_data=([val_left_words,
                                    val_right_words], val_targets))
    model.save("%s_retrain.h5" % args.model)
    with open("%s_retrain.maps" % args.model, mode="w", encoding="utf-8") as f:
        json.dump((word_to_idx, tgt_to_idx, max_len_lw, max_len_rw),
                  f,
                  ensure_ascii=False)

    # Official score
    if args.lda:
        predictions = model.predict(
            [val_left_words, val_right_words, val_topics])
    else:
        predictions = model.predict([val_left_words, val_right_words])
    idx_to_tgt = {i: c for c, i in tgt_to_idx.items()}
    predicted = [idx_to_tgt[p] for p in predictions.argmax(axis=1)]
    evaluate_iest.calculatePRF(list(val_tgts), predicted)
Code example #7
File: emotiklue.py Project: tsproisl/EmotiKLUE
def train(args):
    # Train the two-branch LSTM model (left and right context), optionally combined with LDA topics.
    WORD_LSTM_DIM = 300
    DENSE_DIM = WORD_LSTM_DIM
    DROPOUT = 0.2
    RECURRENT_DROPOUT = 0.0
    BATCH_SIZE = 160

    train_lw, train_rw, train_tgt, vocabulary, classes = read_dataset(
        args.FILE)
    val_lw, val_rw, val_tgt, _, _ = read_dataset(args.val)
    val_tgts = val_tgt  # keep the original string labels for the final evaluation
    embeddings_index, WORD_EMBEDDING_DIM = read_glove(args.embeddings)

    # mappings
    word_to_idx = {w: i for i, w in enumerate(sorted(vocabulary), start=1)}
    tgt_to_idx = {c: i for i, c in enumerate(sorted(classes), start=0)}

    # create embedding layers
    embedding_matrix = np.zeros((len(vocabulary) + 2, WORD_EMBEDDING_DIM))
    for word, idx in word_to_idx.items():
        embedding_vector = embeddings_index.get(word)
        # words not found in the embedding index keep their all-zero rows
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    # LDA models
    if args.lda:
        train_topics = topic_distribution(args.dict, args.lda, train_lw,
                                          train_rw)
        val_topics = topic_distribution(args.dict, args.lda, val_lw, val_rw)

    # vectorize
    train_lw = [vectorize_words(lw, word_to_idx) for lw in train_lw]
    train_rw = [
        vectorize_words(rw, word_to_idx, reverse=True) for rw in train_rw
    ]
    train_tgt = vectorize_words(train_tgt, tgt_to_idx)
    targets = keras.utils.to_categorical(train_tgt, num_classes=len(classes))
    val_lw = [vectorize_words(lw, word_to_idx) for lw in val_lw]
    val_rw = [vectorize_words(rw, word_to_idx, reverse=True) for rw in val_rw]
    val_tgt = vectorize_words(val_tgt, tgt_to_idx)
    val_targets = keras.utils.to_categorical(val_tgt, num_classes=len(classes))

    # pad sequences (max length = 110% of the longest sequence seen in train + val)
    max_len_lw = int(
        max((len(lw) for lw in itertools.chain(train_lw, val_lw))) * 1.1)
    max_len_rw = int(
        max((len(rw) for rw in itertools.chain(train_rw, val_rw))) * 1.1)

    train_left_words = keras.preprocessing.sequence.pad_sequences(
        train_lw, maxlen=max_len_lw, padding="pre", truncating="pre")
    train_right_words = keras.preprocessing.sequence.pad_sequences(
        train_rw, maxlen=max_len_rw, padding="pre", truncating="pre")
    val_left_words = keras.preprocessing.sequence.pad_sequences(
        val_lw, maxlen=max_len_lw, padding="pre", truncating="pre")
    val_right_words = keras.preprocessing.sequence.pad_sequences(
        val_rw, maxlen=max_len_rw, padding="pre", truncating="pre")

    # with keras.utils.CustomObjectScope({L1L2_m.__name__: L1L2_m}):
    # input layers
    input_lw = keras.layers.Input(shape=(train_left_words.shape[1], ))
    input_rw = keras.layers.Input(shape=(train_right_words.shape[1], ))
    input_topics = keras.layers.Input(shape=(100, ))

    # embedding layers
    embedding_lw = keras.layers.Embedding(len(vocabulary) + 2,
                                          WORD_EMBEDDING_DIM,
                                          mask_zero=True,
                                          weights=[embedding_matrix],
                                          trainable=False)(input_lw)
    embedding_rw = keras.layers.Embedding(len(vocabulary) + 2,
                                          WORD_EMBEDDING_DIM,
                                          mask_zero=True,
                                          weights=[embedding_matrix],
                                          trainable=False)(input_rw)

    # LSTMs
    # lstm_lw = keras.layers.LSTM(WORD_LSTM_DIM, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT,
    #                             kernel_regularizer=L1L2_m(l1=0, l2=0, prior_shape=(WORD_EMBEDDING_DIM, WORD_LSTM_DIM * 4)))(embedding_lw)
    # lstm_rw = keras.layers.LSTM(WORD_LSTM_DIM, dropout=DROPOUT, recurrent_dropout=RECURRENT_DROPOUT,
    #                             kernel_regularizer=L1L2_m(l1=0, l2=0, prior_shape=(WORD_EMBEDDING_DIM, WORD_LSTM_DIM * 4)))(embedding_rw)
    lstm_lw = keras.layers.LSTM(
        WORD_LSTM_DIM, dropout=DROPOUT,
        recurrent_dropout=RECURRENT_DROPOUT)(embedding_lw)
    lstm_rw = keras.layers.LSTM(
        WORD_LSTM_DIM, dropout=DROPOUT,
        recurrent_dropout=RECURRENT_DROPOUT)(embedding_rw)

    # concatenate
    lstm_out = keras.layers.Concatenate(axis=1)([lstm_lw, lstm_rw])

    # dense layers
    # dense01 = keras.layers.Dense(DENSE_DIM, activation="tanh", kernel_regularizer=L1L2_m(l1=0, l2=0, prior_shape=(LSTM_DIM, DENSE_DIM)))(lstm_out)

    if args.lda:
        if args.lda_mode == "feature":
            # TOPICS AS FEATURES #
            topics = keras.layers.Concatenate(axis=1)([lstm_out, input_topics])
            dense01 = keras.layers.Dense(DENSE_DIM, activation="tanh")(topics)
            dropout01 = keras.layers.Dropout(DROPOUT)(dense01)
            predictions = keras.layers.Dense(len(classes),
                                             activation="softmax")(dropout01)
        elif args.lda_mode == "filter":
            # MULTIPLY WITH TOPICS #
            dense01 = keras.layers.Dense(DENSE_DIM,
                                         activation="tanh")(lstm_out)
            # dropout01 = keras.layers.Dropout(DROPOUT)(dense01)
            topic_dense = keras.layers.Dense(
                DENSE_DIM, activation="softmax")(input_topics)

            # element-wise multiplication with topics
            # topic_filter = keras.layers.Multiply()([dropout01, topic_dense])
            topic_filter = keras.layers.Multiply()([dense01, topic_dense])
            predictions = keras.layers.Dense(
                len(classes), activation="softmax")(topic_filter)
    else:
        # NOTHING #
        dense01 = keras.layers.Dense(DENSE_DIM, activation="tanh")(lstm_out)
        dropout01 = keras.layers.Dropout(DROPOUT)(dense01)
        predictions = keras.layers.Dense(len(classes),
                                         activation="softmax")(dropout01)

    # dense02 = keras.layers.Dense(DENSE_DIM // 2, activation="tanh", kernel_regularizer=L1L2_m(l1=0, l2=0, prior_shape=(LSTM_DIM, DENSE_DIM)))(dropout01)
    # predictions = keras.layers.Dense(len(classes), activation="softmax", kernel_regularizer=L1L2_m(l1=0, l2=0, prior_shape=(DENSE_DIM, len(classes))))(dropout02)

    # predictions = keras.layers.Dense(len(classes), activation="softmax")(dropout01)

    if args.lda:
        model = keras.models.Model(inputs=[input_lw, input_rw, input_topics],
                                   outputs=predictions)
    else:
        model = keras.models.Model(inputs=[input_lw, input_rw],
                                   outputs=predictions)
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    model.summary()
    early_stopper = keras.callbacks.EarlyStopping(monitor="val_loss",
                                                  patience=2)
    if args.lda:
        model.fit(
            [train_left_words, train_right_words, train_topics],
            targets,
            batch_size=BATCH_SIZE,
            epochs=args.epochs,
            callbacks=[early_stopper],
            validation_data=([val_left_words, val_right_words,
                              val_topics], val_targets))
    else:
        model.fit([train_left_words, train_right_words],
                  targets,
                  batch_size=BATCH_SIZE,
                  epochs=args.epochs,
                  callbacks=[early_stopper],
                  validation_data=([val_left_words,
                                    val_right_words], val_targets))
    model.save("%s.h5" % args.model)
    with open("%s.maps" % args.model, mode="w", encoding="utf-8") as f:
        json.dump((word_to_idx, tgt_to_idx, max_len_lw, max_len_rw),
                  f,
                  ensure_ascii=False)

    # Official score
    if args.lda:
        predictions = model.predict(
            [val_left_words, val_right_words, val_topics])
    else:
        predictions = model.predict([val_left_words, val_right_words])
    idx_to_tgt = {i: c for c, i in tgt_to_idx.items()}
    predicted = [idx_to_tgt[p] for p in predictions.argmax(axis=1)]
    evaluate_iest.calculatePRF(list(val_tgts), predicted)
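Both train() and retrain() decode the softmax outputs the same way: argmax over the class axis, then the inverted tgt_to_idx map turns the indices back into label strings. A tiny sketch with toy values:

import numpy as np

tgt_to_idx = {"anger": 0, "joy": 1}                 # toy mapping
idx_to_tgt = {i: c for c, i in tgt_to_idx.items()}
predictions = np.array([[0.2, 0.8], [0.9, 0.1]])    # toy softmax outputs
predicted = [idx_to_tgt[p] for p in predictions.argmax(axis=1)]
print(predicted)  # ['joy', 'anger']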