def run(df):
    glove_dir = basepath + "glove.6B"
    model_name = "books_sup"
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100

    X = df["text"]
    y = df["label"]
    y_true = df["label"]

    labels, label_to_index, index_to_label = get_distinct_labels(df)
    y_one_hot = make_one_hot(y, label_to_index)
    # y = np.array(y)
    print("Fitting tokenizer...")
    tokenizer = fit_get_tokenizer(X, max_words)
    # print("Getting tokenizer")
    # tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val, X_test, y_test = create_train_dev_test(
        X,
        labels=y_one_hot,
        tokenizer=tokenizer,
        max_sentences=max_sentences,
        max_sentence_length=max_sentence_length,
        max_words=max_words)
    print("Creating Embedding matrix...")
    embedding_matrix = create_embedding_matrix(glove_dir, tokenizer,
                                               embedding_dim)
    # print("Getting Embedding matrix...")
    # embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))
    print("Initializing model...")
    model = HAN(max_words=max_sentence_length,
                max_sentences=max_sentences,
                output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)
    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy",
                  optimizer='adam',
                  metrics=['acc'])
    print("model fitting - Hierachical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    model.fit(X_train,
              y_train,
              validation_data=(X_val, y_val),
              epochs=100,
              batch_size=256,
              callbacks=[es])
    print("****************** CLASSIFICATION REPORT ********************")
    pred = model.predict(X_test)
    true_labels = get_from_one_hot(y_test, index_to_label)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(true_labels, pred_labels))
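# For reference, a minimal sketch of the one-hot helpers used above
# (an assumption, not the project's original implementation):
import numpy as np

def make_one_hot_sketch(y, label_to_index):
    # map each label string to a one-hot row vector
    one_hot = np.zeros((len(y), len(label_to_index)))
    for i, lbl in enumerate(y):
        one_hot[i, label_to_index[lbl]] = 1.0
    return one_hot

def get_from_one_hot_sketch(scores, index_to_label):
    # map each row of scores / one-hot vectors back to a label string via argmax
    return [index_to_label[int(i)] for i in np.argmax(scores, axis=-1)]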
def train_classifier(df,
                     labels,
                     label_term_dict,
                     label_adult_dict,
                     label_actor_dict,
                     label_actress_dict,
                     label_producer_dict,
                     label_writer_dict,
                     label_director_dict,
                     label_composer_dict,
                     label_cinematographer_dict,
                     label_editor_dict,
                     label_prod_designer_dict,
                     label_dir_adult_dict,
                     label_dir_actor_dict,
                     label_dir_actress_dict,
                     label_dir_producer_dict,
                     label_dir_writer_dict,
                     label_dir_composer_dict,
                     label_dir_cinematographer_dict,
                     label_dir_editor_dict,
                     label_dir_prod_designer_dict,
                     label_actor_actress_dict,
                     label_to_index, index_to_label, model_name, soft=False):
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "imdb/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    X, y, y_true = get_train_data(df,
                                  labels,
                                  label_term_dict,
                                  label_adult_dict,
                                  label_actor_dict,
                                  label_actress_dict,
                                  label_producer_dict,
                                  label_writer_dict,
                                  label_director_dict,
                                  label_composer_dict,
                                  label_cinematographer_dict,
                                  label_editor_dict,
                                  label_prod_designer_dict,
                                  label_dir_adult_dict,
                                  label_dir_actor_dict,
                                  label_dir_actress_dict,
                                  label_dir_producer_dict,
                                  label_dir_writer_dict,
                                  label_dir_composer_dict,
                                  label_dir_cinematographer_dict,
                                  label_dir_editor_dict,
                                  label_dir_prod_designer_dict,
                                  label_actor_actress_dict, tokenizer, label_to_index, soft=soft)
    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    if not soft:
        y_vec = make_one_hot(y, label_to_index)
        print(classification_report(y_true, y))
    else:
        y_vec = np.array(y)
        y_argmax = np.argmax(y, axis=-1)
        y_str = []
        for i in y_argmax:
            y_str.append(index_to_label[i])
        print(classification_report(y_true, y_str))
    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val, _, _ = create_train_dev(X, labels=y_vec, tokenizer=tokenizer,
                                                            max_sentences=max_sentences,
                                                            max_sentence_length=max_sentence_length,
                                                            max_words=max_words, val=False)
    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))
    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)
    print("Compiling model...")
    model.summary()
    if not soft:
        model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
    else:
        model.compile(loss=kullback_leibler_divergence, optimizer='adam', metrics=['acc'])
    print("model fitting - Hierachical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])
    # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
    # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
    #                         tokenizer=tokenizer)
    # pred = model.predict(X_label_all)
    # pred_labels = get_from_one_hot(pred, index_to_label)
    # print(classification_report(y_true, pred_labels))
    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
    X_all = prep_data(texts=df["text"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels))
    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred
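# The snippets above assume imports roughly along these lines
# (exact module paths in the original project may differ):
import os
import pickle
import numpy as np
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.losses import kullback_leibler_divergence
from sklearn.metrics import classification_report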
Example #3
def train_classifier(df, labels, label_term_dict, label_author_dict, label_pub_dict, label_year_dict,
                     label_author_pub_dict, label_pub_year_dict, label_author_year_dict, label_to_index, index_to_label,
                     model_name, clf, use_gpu, old=True, soft=False):
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "books/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    if old:
        X, y, y_true = get_train_data(df, labels, label_term_dict, label_author_dict, label_pub_dict, label_year_dict,
                                      label_author_pub_dict, label_pub_year_dict, label_author_year_dict, tokenizer,
                                      label_to_index, soft=soft, clf=clf)
        if clf == "BERT":
            df_orig = pickle.load(open(basepath + dataset + "df.pkl", "rb"))
            X = list(df_orig.iloc[X]["text"])
    else:
        X, y, y_true = get_confident_train_data(df, labels, label_term_dict, label_author_dict, label_pub_dict,
                                                label_year_dict, label_author_pub_dict, label_pub_year_dict,
                                                label_author_year_dict, tokenizer)
    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    if not soft:
        y_vec = make_one_hot(y, label_to_index)
        print(classification_report(y_true, y))
    else:
        y_vec = np.array(y)
        y_argmax = np.argmax(y, axis=-1)
        y_str = []
        for i in y_argmax:
            y_str.append(index_to_label[i])
        print(classification_report(y_true, y_str))
    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    if clf == "HAN":
        print("Splitting into train, dev...")
        X_train, y_train, X_val, y_val, _, _ = create_train_dev(X, labels=y_vec, tokenizer=tokenizer,
                                                                max_sentences=max_sentences,
                                                                max_sentence_length=max_sentence_length,
                                                                max_words=max_words, val=False)
        print("Getting Embedding matrix...")
        embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))
        print("Initializing model...")
        model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                    embedding_matrix=embedding_matrix)
        print("Compiling model...")
        model.summary()
        if not soft:
            model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
        else:
            model.compile(loss=kullback_leibler_divergence, optimizer='adam', metrics=['acc'])
        print("model fitting - Hierachical attention network...")
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
        mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                             verbose=1, save_weights_only=True, save_best_only=True)
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])
        # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
        # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
        #                         tokenizer=tokenizer)
        # pred = model.predict(X_label_all)
        # pred_labels = get_from_one_hot(pred, index_to_label)
        # print(classification_report(y_true, pred_labels))
        print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
        X_all = prep_data(texts=df["text"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                          tokenizer=tokenizer)
        y_true_all = df["label"]
        pred = model.predict(X_all)
        pred_labels = get_from_one_hot(pred, index_to_label)
        print("Dumping the model...")
        model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
        model.save(dump_dir + "model_" + model_name + ".h5")
    elif clf == "BERT":
        y_vec = []
        for lbl_ in y:
            y_vec.append(label_to_index[lbl_])
        model = train_bert(X, y_vec, use_gpu)

        y_true_all = []
        for lbl_ in df.label:
            y_true_all.append(label_to_index[lbl_])

        predictions = test(model, df_orig["text"], y_true_all, use_gpu)
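        # test() appears to return batch-wise prediction arrays; stack them into one array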
        for i, p in enumerate(predictions):
            if i == 0:
                pred = p
            else:
                pred = np.concatenate((pred, p))

        pred_labels = []
        for p in pred:
            pred_labels.append(index_to_label[p.argmax(axis=-1)])
        y_true_all = df["label"]
    elif clf == "CNN":
        y_vec = []
        for lbl_ in y:
            y_vec.append(label_to_index[lbl_])

        y_true_all = []
        for lbl_ in df.label:
            y_true_all.append(label_to_index[lbl_])

        pred_idxs, pred, true_idxs = train_cnn(X, y_vec, df["text"], y_true_all, use_gpu)

        pred_labels = []
        for p in pred_idxs:
            pred_labels.append(index_to_label[p])

        y_true_all = []
        for p in true_idxs:
            y_true_all.append(index_to_label[p])
    else:
        raise ValueError("clf can only be HAN or BERT or CNN")
    print(classification_report(y_true_all, pred_labels))
    return pred_labels, pred
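# For context, a rough sketch (an assumption, not the project's code) of how the
# label_to_index / index_to_label mappings passed into these functions can be built:
def get_distinct_labels_sketch(df):
    labels = sorted(df["label"].unique())
    label_to_index = {lbl: i for i, lbl in enumerate(labels)}
    index_to_label = {i: lbl for i, lbl in enumerate(labels)}
    return labels, label_to_index, index_to_label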
Example #4
        labels=y_one_hot,
        tokenizer=tokenizer,
        max_sentences=max_sentences,
        max_sentence_length=max_sentence_length,
        max_words=max_words)

    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)

    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(
        open(basepath + dataset + "embedding_matrix_topk_dict.pkl", "rb"))

    print("Initializing model...")
    model = HAN(max_words=max_sentence_length,
                max_sentences=max_sentences,
                output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)

    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy",
                  optimizer='adam',
                  metrics=['acc'])
    print("model fitting - Hierachical attention network...")

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
    mc = ModelCheckpoint(filepath=tmp_dir +
                         'model.{epoch:02d}-{val_loss:.2f}.hdf5',
                         monitor='val_acc',
                         mode='max',
                         verbose=1,
Example #5
    def train_classifier(df, labels, label_term_dict, label_to_index,
                         index_to_label, dataset_path):
        print("Going to train classifier..")
        basepath = dataset_path
        model_name = "conwea"
        dump_dir = basepath + "models/" + model_name + "/"
        tmp_dir = basepath + "checkpoints/" + model_name + "/"
        os.makedirs(dump_dir, exist_ok=True)
        os.makedirs(tmp_dir, exist_ok=True)
        max_sentence_length = 100
        max_sentences = 15
        max_words = 20000
        tokenizer = pickle.load(open(dataset_path + "tokenizer.pkl", "rb"))

        X, y, y_true = generate_pseudo_labels(df, labels, label_term_dict,
                                              tokenizer)
        y_one_hot = make_one_hot(y, label_to_index)
        print("Fitting tokenizer...")
        print("Splitting into train, dev...")
        X_train, y_train, X_val, y_val = create_train_dev(
            X,
            labels=y_one_hot,
            tokenizer=tokenizer,
            max_sentences=max_sentences,
            max_sentence_length=max_sentence_length,
            max_words=max_words)
        print("Creating Embedding matrix...")
        embedding_matrix = pickle.load(
            open(dataset_path + "embedding_matrix.pkl", "rb"))
        print("Initializing model...")
        model = HAN(max_words=max_sentence_length,
                    max_sentences=max_sentences,
                    output_size=len(y_train[0]),
                    embedding_matrix=embedding_matrix)
        print("Compiling model...")
        model.summary()
        model.compile(loss="categorical_crossentropy",
                      optimizer='adam',
                      metrics=['acc'])
        print("model fitting - Hierachical attention network...")
        es = EarlyStopping(monitor='val_loss',
                           mode='min',
                           verbose=1,
                           patience=3)
        mc = ModelCheckpoint(filepath=tmp_dir +
                             'model.{epoch:02d}-{val_loss:.2f}.hdf5',
                             monitor='val_acc',
                             mode='max',
                             verbose=1,
                             save_weights_only=True,
                             save_best_only=True)
        model.fit(X_train,
                  y_train,
                  validation_data=(X_val, y_val),
                  epochs=100,
                  batch_size=256,
                  callbacks=[es, mc])
        print(
            "****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************"
        )
        X_all = prep_data(texts=df["sentence"],
                          max_sentences=max_sentences,
                          max_sentence_length=max_sentence_length,
                          tokenizer=tokenizer)
        y_true_all = df["label"]
        pred = model.predict(X_all)
        pred_labels = get_from_one_hot(pred, index_to_label)
        print(classification_report(y_true_all, pred_labels))
        print("Dumping the model...")
        model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
        model.save(dump_dir + "model_" + model_name + ".h5")
        return pred_labels
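# prep_data is assumed to pad/truncate each document into a fixed
# (max_sentences, max_sentence_length) matrix of word indices; a rough sketch:
import numpy as np
from nltk import sent_tokenize

def prep_data_sketch(texts, max_sentences, max_sentence_length, tokenizer):
    data = np.zeros((len(texts), max_sentences, max_sentence_length), dtype="int32")
    for i, text in enumerate(texts):
        sentences = sent_tokenize(text)[:max_sentences]
        for j, seq in enumerate(tokenizer.texts_to_sequences(sentences)):
            seq = seq[:max_sentence_length]
            data[i, j, :len(seq)] = seq
    return data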
Example #6
def train_weight_classifier(df, labels, label_term_dict, label_author_dict, label_conf_dict, label_to_index,
                            index_to_label, model_name, AND=True):
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "dblp/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    X, y, y_true, weights = get_weighted_train_data(df, labels, label_term_dict, label_author_dict, label_conf_dict,
                                                    tokenizer, label_to_index, AND=AND)
    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    y_vec = make_one_hot(y, label_to_index)
    print(classification_report(y_true, y))
    # y = np.array(y)
    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val, weights_train, _ = create_train_dev_weights(X, labels=y_vec,
                                                                                weights=weights,
                                                                                tokenizer=tokenizer,
                                                                                max_sentences=max_sentences,
                                                                                max_sentence_length=max_sentence_length,
                                                                                max_words=max_words)
    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))
    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)
    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
    print("model fitting - Hierachical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc],
              sample_weight=np.array(weights_train))
    # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
    # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
    #                         tokenizer=tokenizer)
    # pred = model.predict(X_label_all)
    # pred_labels = get_from_one_hot(pred, index_to_label)
    # print(classification_report(y_true, pred_labels))
    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
    X_all = prep_data(texts=df["abstract"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels))
    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred
Example #7
# Loop through all the words in the word_index and where possible
# replace the random initialization with the GloVe vector.
for word, index in tqdm(word_tokenizer.word_index.items()):
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
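# For context, a rough sketch of how the `embeddings` dict and the randomly
# initialized `embedding_matrix` consumed by the loop above might be prepared
# (GloVe file path, dimension, and random init are assumptions):
import numpy as np

def load_glove_embeddings_sketch(glove_path, word_index, embedding_dim=100):
    # build a word -> vector dict from a GloVe text file
    embeddings = {}
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            embeddings[values[0]] = np.asarray(values[1:], dtype="float32")
    # random init; rows get overwritten with GloVe vectors where available
    embedding_matrix = np.random.normal(size=(len(word_index) + 1, embedding_dim))
    return embeddings, embedding_matrix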

#####################################################
# Model Training                                    #
#####################################################
logger.info("Training the model.")

han_model = HAN(
    MAX_WORDS_PER_SENT,
    MAX_SENT,
    embedding_matrix,
    word_encoding_dim=100,
    sentence_encoding_dim=100,
)

loss = tf.keras.losses.BinaryCrossentropy(name="loss")
# loss = WeightedBinaryCrossEntropy(pos_weight=442475 / 89972, name="loss")
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
han_model.compile(
    optimizer=opt,
    loss=loss,
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="acc"),
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.AUC(name="pr_auc", curve="PR"),
    ],
Example #8
# Loop through all the words in the word_index and where possible
# replace the random initialization with the GloVe vector.
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

#####################################################
# Model Training                                    #
#####################################################
logger.info("Training the model.")

han_model = HAN(MAX_WORDS_PER_SENT,
                MAX_SENT,
                2,
                embedding_matrix,
                word_encoding_dim=100,
                sentence_encoding_dim=100)

han_model.summary()

han_model.compile(optimizer='adagrad',
                  loss='categorical_crossentropy',
                  metrics=['acc'])

checkpoint_saver = ModelCheckpoint(
    filepath='./tmp/model.{epoch:02d}-{val_loss:.2f}.hdf5',
    verbose=1,
    save_best_only=True)

han_model.fit(X_train,
Example #9
    def train_classifier(df, labels, label_term_dict, label_to_index,
                         index_to_label, dataset_path):
        print("Going to train classifier..")
        basepath = dataset_path
        model_name = "conwea"
        dump_dir = basepath + "models/" + model_name + "/"
        tmp_dir = basepath + "checkpoints/" + model_name + "/"
        os.makedirs(dump_dir, exist_ok=True)
        os.makedirs(tmp_dir, exist_ok=True)
        max_sentence_length = 100
        #TODO what is max sentences???
        max_sentences = 15
        max_words = 20000
        tokenizer = pickle.load(open(dataset_path + "tokenizer.pkl", "rb"))

        X, y, y_true = generate_pseudo_labels(df, labels, label_term_dict,
                                              tokenizer)
        #y_one_hot = make_one_hot(y, label_to_index)
        y_one_hot = np.array(y)

        # code to see the distribution of pseudo-labels
        twodmatrix = np.stack(y, axis=0)
        labelcounts = np.sum(twodmatrix, axis=0)
        plt.bar(range(0, 13), labelcounts)
        plt.title('PSEUDOLABEL DISTRIBUTION')
        plt.show()

        print("Fitting tokenizer...")
        print("Splitting into train, dev...")
        X_train, y_train, X_val, y_val = create_train_dev(
            X,
            labels=y_one_hot,
            tokenizer=tokenizer,
            max_sentences=max_sentences,
            max_sentence_length=max_sentence_length,
            max_words=max_words)
        print("Creating Embedding matrix...")
        embedding_matrix = pickle.load(
            open(dataset_path + "embedding_matrix.pkl", "rb"))
        print("Initializing model...")
        model = HAN(max_words=max_sentence_length,
                    max_sentences=max_sentences,
                    output_size=len(y_train[0]),
                    embedding_matrix=embedding_matrix)
        print("Compiling model...")
        model.summary()
        model.compile(loss="binary_crossentropy",
                      optimizer='adam',
                      metrics=[TopKCategoricalAccuracy(k=3)])
        print("model fitting - Hierachical attention network...")
        es = EarlyStopping(monitor='val_loss',
                           mode='min',
                           verbose=1,
                           patience=3)
        mc = ModelCheckpoint(filepath=tmp_dir +
                             'model.{epoch:02d}-{val_loss:.2f}.hdf5',
                             monitor='val_top_k_categorical_accuracy',
                             mode='max',
                             verbose=1,
                             save_weights_only=True,
                             save_best_only=True)
        model.fit(X_train,
                  y_train,
                  validation_data=(X_val, y_val),
                  epochs=1,
                  batch_size=256,
                  callbacks=[es, mc])
        print(
            "****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************"
        )
        X_all = prep_data(texts=df["sentence"],
                          max_sentences=max_sentences,
                          max_sentence_length=max_sentence_length,
                          tokenizer=tokenizer)

        y_true_all = df["label"]

        # pred is an array of per-class scores for each document
        pred = model.predict(X_all)
        # it needs to be converted into a binary 0/1 array

        # code to see the prediction distribution
        twodmatrix = (pred > 0.5).astype(int)
        labelcounts = np.sum(twodmatrix, axis=0)
        plt.bar(range(0, 13), labelcounts)
        plt.title('NN PREDICTION DISTRIBUTION')
        plt.show()

        # binarize predictions with a 0.5 threshold for precision/recall computation
        lsprecrec = (pred > 0.5).astype(int)

        # array of predicted label strings; usually built from pred, here using the
        # thresholded lsprecrec instead
        pred_labels = get_from_one_hot(lsprecrec, index_to_label)

        y_true_allnp = np.array(y_true_all)
        # wrap each row as an array so all rows have consistent dimensions
        y_true_allnp = np.array([np.array(x) for x in y_true_allnp])

        from sklearn.metrics import confusion_matrix
        for i, l in enumerate(label_to_index.keys()):
            if sum(y_true_allnp.T[i]) == 0:
                print("no {} in dataset".format(l))
            if sum(lsprecrec.T[i]) == 0:
                print("no {} ever predicted".format(l))
            tn, fp, fn, tp = confusion_matrix(y_true_allnp.T[i],
                                              lsprecrec.T[i]).ravel()
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            print('{} : precision {}, recall: {}'.format(l, precision, recall))

        topk1_accuracypseudo = TopKCategoricalAccuracy(
            k=1, name="top_k1_categorical_accuracy", dtype=None)
        topk2_accuracypseudo = TopKCategoricalAccuracy(
            k=2, name="top_k2_categorical_accuracy", dtype=None)
        topk3_accuracypseudo = TopKCategoricalAccuracy(
            k=3, name="top_k3_categorical_accuracy", dtype=None)

        topk1_accuracypseudo.update_state(y_true=y_true, y_pred=y_one_hot)
        topk2_accuracypseudo.update_state(y_true=y_true, y_pred=y_one_hot)
        topk3_accuracypseudo.update_state(y_true=y_true, y_pred=y_one_hot)
        print("ACCURACY PSEUDO LABELS")
        print("K1: ", topk1_accuracypseudo.result().numpy())
        print("K2: ", topk2_accuracypseudo.result().numpy())
        print("K3: ", topk3_accuracypseudo.result().numpy())

        #keras top-k accuracy on nn prediction
        topk1_accuracy = TopKCategoricalAccuracy(
            k=1, name="top_k1_categorical_accuracy", dtype=None)
        topk2_accuracy = TopKCategoricalAccuracy(
            k=2, name="top_k2_categorical_accuracy", dtype=None)
        topk3_accuracy = TopKCategoricalAccuracy(
            k=3, name="top_k3_categorical_accuracy", dtype=None)

        topk1_accuracy.update_state(y_true=y_true_allnp.astype(np.float64),
                                    y_pred=pred)
        topk2_accuracy.update_state(y_true=y_true_allnp.astype(np.float64),
                                    y_pred=pred)
        topk3_accuracy.update_state(y_true=y_true_allnp.astype(np.float64),
                                    y_pred=pred)

        print("ACCURACY NN PREDICTION")
        print("K1: ", topk1_accuracy.result().numpy())
        print("K2: ", topk2_accuracy.result().numpy())
        print("K3: ", topk3_accuracy.result().numpy())

        #print(classification_report(y_true_all, pred_labels))
        print("Dumping the model...")
        # model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
        # model.save(dump_dir + "model_" + model_name + ".h5")
        return pred_labels
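# sklearn's precision_score / recall_score with zero_division=0 offer a guarded
# alternative to the manual per-label precision/recall loop above; a sketch:
from sklearn.metrics import precision_score, recall_score

def per_label_precision_recall(y_true_bin, y_pred_bin, index_to_label):
    # y_true_bin / y_pred_bin: binary matrices of shape (num_docs, num_labels)
    for i in range(y_true_bin.shape[1]):
        p = precision_score(y_true_bin[:, i], y_pred_bin[:, i], zero_division=0)
        r = recall_score(y_true_bin[:, i], y_pred_bin[:, i], zero_division=0)
        print("{} : precision {}, recall: {}".format(index_to_label[i], p, r))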
Example #10
def train_classifier(df, tokenizer, embedding_matrix, labels,
                     motpat_label_motifs_dict, label_to_index, index_to_label,
                     index_word, dataset_path, config):
    def generate_pseudo_labels(df, labels, motpat_label_motifs_dict, tokenizer,
                               index_word, config):
        y = []
        X = []

        for index, row in df.iterrows():
            count_dict = {}
            flag = 0
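            # count_dict accumulates per-label scores from matched seed phrases/entities;
            # flag records whether this row matched anything at all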
            for mot_pat in motpat_label_motifs_dict:
                label_motifs_dict = motpat_label_motifs_dict[mot_pat]
                if len(label_motifs_dict) == 0:
                    continue
                if mot_pat == "phrase":
                    tokens = tokenizer.texts_to_sequences([row["text"]])[0]
                    words = []
                    for tok in tokens:
                        words.append(index_word[tok])
                    for l in labels:
                        if len(label_motifs_dict[l]) == 0:
                            continue
                        seed_words = set(label_motifs_dict[l].keys())
                        int_words = list(set(words).intersection(seed_words))
                        for word in int_words:
                            flag = 1
                            try:
                                count_dict[l] += label_motifs_dict[l][word]
                            except KeyError:
                                count_dict[l] = label_motifs_dict[l][word]
                else:
                    size = len(mot_pat)
                    if size == 1:
                        first = mot_pat[0]
                        entities = get_entity_from_col(row[first], first,
                                                       config)
                    elif size == 2:
                        first = mot_pat[0]
                        second = mot_pat[1]
                        first_ents = get_entity_from_col(
                            row[first], first, config)
                        second_ents = get_entity_from_col(
                            row[second], second, config)
                        if first == second:
                            entities = set(
                                itertools.combinations(first_ents, 2))
                        else:
                            entities = set(
                                itertools.product(first_ents, second_ents))
                    else:
                        raise Exception(
                            "Motif patterns of size more than 2 not yet handled but can be easily extended."
                        )
                    for l in labels:
                        if len(label_motifs_dict[l]) == 0:
                            continue
                        seed_entities = set(label_motifs_dict[l].keys())
                        int_ents = list(entities.intersection(seed_entities))
                        for ent in int_ents:
                            flag = 1
                            try:
                                count_dict[l] += label_motifs_dict[l][ent]
                            except KeyError:
                                count_dict[l] = label_motifs_dict[l][ent]

            if flag:
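                # pseudo-label = the label with the highest accumulated motif score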
                lbl = max(count_dict, key=count_dict.get)
                if not lbl:
                    continue
                y.append(lbl)
                X.append(row["text"])
        return X, y

    basepath = dataset_path
    model_name = "meta"
    dump_dir = basepath + "models/" + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000

    print("Generating pseudo-labels", flush=True)
    X, y = generate_pseudo_labels(df, labels, motpat_label_motifs_dict,
                                  tokenizer, index_word, config)
    y_vec = make_one_hot(y, label_to_index)

    print("Splitting into train, dev...", flush=True)
    X_train, y_train, X_val, y_val = create_train_dev(
        X,
        labels=y_vec,
        tokenizer=tokenizer,
        max_sentences=max_sentences,
        max_sentence_length=max_sentence_length,
        max_words=max_words)

    print("Initializing model...", flush=True)
    model = HAN(max_words=max_sentence_length,
                max_sentences=max_sentences,
                output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)
    print("Compiling model...", flush=True)
    model.summary()
    model.compile(loss="categorical_crossentropy",
                  optimizer='adam',
                  metrics=['acc'])
    print("model fitting - Hierachical attention network...", flush=True)
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir +
                         'model.{epoch:02d}-{val_loss:.2f}.hdf5',
                         monitor='val_acc',
                         mode='max',
                         verbose=1,
                         save_weights_only=True,
                         save_best_only=True)
    model.fit(X_train,
              y_train,
              validation_data=(X_val, y_val),
              epochs=100,
              batch_size=256,
              callbacks=[es, mc])
    print(
        "****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************",
        flush=True)
    X_all = prep_data(texts=df["text"],
                      max_sentences=max_sentences,
                      max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels), flush=True)
    print("Dumping the model...", flush=True)
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred