train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str,
                                          tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str,
                                        tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)

logging.info("Dataset loaded. Generating candidate keyphrases...")

train_candidates = chunker.extract_candidates_from_set(train_doc_str,
                                                       tokenizer)
test_candidates = chunker.extract_candidates_from_set(test_doc_str, tokenizer)
val_candidates = chunker.extract_candidates_from_set(val_doc_str, tokenizer)
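
# For reference, candidate extraction of this kind is typically a POS-based
# noun-phrase chunker. A minimal illustrative sketch (NOT the project's
# chunker; the grammar and helper name here are assumptions):
import nltk

CANDIDATE_GRAMMAR = "NP: {<JJ>*<NN.*>+}"  # zero or more adjectives, then nouns

def candidate_phrases(tokens):
    """Return candidate keyphrases (as token lists) from one tokenized document."""
    tree = nltk.RegexpParser(CANDIDATE_GRAMMAR).parse(nltk.pos_tag(tokens))
    return [[token for token, _ in subtree.leaves()]
            for subtree in tree.subtrees() if subtree.label() == "NP"]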

logging.debug("Candidates recall on training set   : %.4f",
              metrics.recall(train_answer, train_candidates))
logging.debug("Candidates recall on test set       : %.4f",
              metrics.recall(test_answer, test_candidates))
logging.debug("Candidates recall on validation set : %.4f",
              metrics.recall(val_answer, val_candidates))
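
# metrics.recall here measures an upper bound: the fraction of gold keyphrases
# that survive into the candidate set. An illustrative re-implementation
# (assuming both arguments map document IDs to lists of token lists; the
# project's version may also stem or normalize):
def candidate_recall(answers, candidates):
    found = total = 0
    for doc_id, gold in answers.items():
        cand = {tuple(c) for c in candidates.get(doc_id, [])}
        total += len(gold)
        found += sum(1 for kp in gold if tuple(kp) in cand)
    return found / total if total else 0.0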

# POS-tag every gold keyphrase (each answer is already a token list)
train_pos = []
for answers in train_answer.values():
    for answer in answers:
        train_pos.append(nltk.pos_tag(answer))

test_pos = []
for answers in test_answer.values():
    for answer in answers:
        test_pos.append(nltk.pos_tag(answer))
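
# nltk.pos_tag operates on a token list, so each tokenized answer above yields
# a list of (token, tag) pairs (requires NLTK's averaged_perceptron_tagger):
example_tags = nltk.pos_tag(["support", "vector", "machine"])
# -> roughly [('support', 'NN'), ('vector', 'NN'), ('machine', 'NN')]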

# Example 2

# (assumed return signature, mirroring the prepare_answer_2 calls in the later examples)
train_x, train_y, test_x, test_y, val_x, val_y, embedding_matrix = preprocessing. \
    prepare_sequential(train_doc, train_answer, test_doc, test_answer, val_doc, val_answer,
                       max_document_length=MAX_DOCUMENT_LENGTH,
                       max_vocabulary_size=MAX_VOCABULARY_SIZE,
                       embeddings_size=EMBEDDINGS_SIZE)

check_randomness("After preprocess")

# Weight the training examples: every token that is not class 0 (i.e. is part
# of a keyphrase) gets the heavier KP_WEIGHT score.
train_y_weights = np.argmax(train_y, axis=2)  # collapse one-hot labels to class indices
train_y_weights[train_y_weights > 0] = KP_WEIGHT
train_y_weights[train_y_weights < 1] = 1
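
# The effect of the two masked assignments is easiest to see on a toy array:
# class-0 (non-keyphrase) positions get weight 1, all others get KP_WEIGHT.
toy_labels = np.array([[0, 1, 2, 0]])
toy_weights = toy_labels.copy()
toy_weights[toy_weights > 0] = KP_WEIGHT
toy_weights[toy_weights < 1] = 1
logging.debug("Toy sample weights: %s", toy_weights)  # with KP_WEIGHT == 10: [[ 1 10 10  1]]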

logging.info("Data preprocessing complete.")
logging.info("Maximum possible recall: %s",
             metrics.recall(test_answer,
                               postprocessing.get_words(test_doc,postprocessing.undo_sequential(test_y))))

if not SAVE_MODEL or not os.path.isfile(MODEL_PATH):

    logging.debug("Building the network...")
    model = Sequential()

    embedding_layer = Embedding(np.shape(embedding_matrix)[0],
                                EMBEDDINGS_SIZE,
                                weights=[embedding_matrix],
                                input_length=MAX_DOCUMENT_LENGTH,
                                trainable=False)

    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(150, activation='tanh',
                                 recurrent_activation='hard_sigmoid',
                                 return_sequences=True)))
    model.add(Dropout(0.25))
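
    # The snippet cuts off here; a sequence-labeling head consistent with the
    # one-hot targets and temporal sample weights above would look roughly like
    # this (optimizer and epochs are assumptions, not from the source, and
    # TimeDistributed/Dense are assumed imported from keras.layers):
    model.add(TimeDistributed(Dense(np.shape(train_y)[2], activation='softmax')))
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'],
                  sample_weight_mode='temporal')  # enables per-token train_y_weights

    model.fit(train_x, train_y,
              validation_data=(val_x, val_y),
              sample_weight=train_y_weights,
              epochs=10, batch_size=32)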

# Example 3

train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str,
                                          tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str,
                                        tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)

logging.info("Dataset loaded. Generating candidate keyphrases...")

train_candidates = chunker.extract_candidates_from_set(train_doc_str,
                                                       tokenizer)
test_candidates = chunker.extract_candidates_from_set(test_doc_str, tokenizer)
val_candidates = chunker.extract_candidates_from_set(val_doc_str, tokenizer)

logging.debug("Candidates recall on training set   : %.4f",
              metrics.recall(train_answer, train_candidates))
logging.debug("Candidates recall on test set       : %.4f",
              metrics.recall(test_answer, test_candidates))
logging.debug("Candidates recall on validation set : %.4f",
              metrics.recall(val_answer, val_candidates))

logging.info("Candidates generated. Preprocessing data...")

train_x, train_y, test_x, test_y, val_x, val_y, val_x_b, val_y_b, embedding_matrix, dictionary = preprocessing. \
    prepare_answer_2(train_doc, train_answer, train_candidates,
                     test_doc, test_answer, test_candidates,
                     val_doc, val_answer, val_candidates,
                     max_document_length=MAX_DOCUMENT_LENGTH,
                     max_answer_length=MAX_ANSWER_LENGTH,
                     max_vocabulary_size=MAX_VOCABULARY_SIZE,
                     embeddings_size=EMBEDDINGS_SIZE)
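
# A quick sanity check on the returned tensors (assuming numpy arrays; shapes
# depend on the padding lengths chosen above):
logging.debug("train_x: %s, train_y: %s", np.shape(train_x), np.shape(train_y))
logging.debug("val_x:   %s, val_x_b: %s", np.shape(val_x), np.shape(val_x_b))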

# Example 4

# Weight the training examples: every token that is not class 0 (i.e. is part
# of a keyphrase) gets a heavier score, here via balanced class weights.
from sklearn.utils import class_weight

train_y_weights = np.argmax(train_y, axis=2)
train_y_weights = np.reshape(
    class_weight.compute_sample_weight('balanced', train_y_weights.flatten()),
    np.shape(train_y_weights))
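
# compute_sample_weight('balanced', y) gives each sample the weight
# n_samples / (n_classes * count(class)), so rare keyphrase tokens are
# up-weighted automatically. A toy check:
toy_y = np.array([0, 0, 0, 1])
logging.debug("Balanced weights: %s",
              class_weight.compute_sample_weight('balanced', toy_y))
# -> [0.667 0.667 0.667 2.0], i.e. 4/(2*3) and 4/(2*1)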

logging.info("Data preprocessing complete.")
logging.info(
    "Maximum possible recall: %s",
    metrics.recall(
        test_answer,
        postprocessing.get_words(test_doc,
                                 postprocessing.undo_sequential(test_y)),
        STEM_MODE))

if not SAVE_MODEL or not os.path.isfile(MODEL_PATH):

    logging.debug("Building the network...")
    model = Sequential()

    embedding_layer = Embedding(np.shape(embedding_matrix)[0],
                                EMBEDDINGS_SIZE,
                                weights=[embedding_matrix],
                                input_length=MAX_DOCUMENT_LENGTH,
                                trainable=False)
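
    # The embedding_matrix passed above is typically built row-by-row from the
    # preprocessing dictionary; an illustrative sketch (word_vectors is a
    # hypothetical word -> pretrained-vector lookup, not part of the source):
    #
    #     embedding_matrix = np.zeros((len(dictionary) + 1, EMBEDDINGS_SIZE))
    #     for word, index in dictionary.items():
    #         vector = word_vectors.get(word)
    #         if vector is not None:
    #             embedding_matrix[index] = vector  # OOV words keep the zero row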

    model.add(embedding_layer)

# Example 5

train_doc_str, train_answer_str = data.load_train()
test_doc_str, test_answer_str = data.load_test()
val_doc_str, val_answer_str = data.load_validation()

train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str, tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str, tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)

logging.info("Dataset loaded. Generating candidate keyphrases...")

train_candidates = chunker.extract_candidates_from_set(train_doc_str, tokenizer)
test_candidates = chunker.extract_candidates_from_set(test_doc_str, tokenizer)
val_candidates = chunker.extract_candidates_from_set(val_doc_str, tokenizer)

logging.debug("Candidates recall on training set   : %.4f",
              metrics.recall(train_answer, train_candidates))
logging.debug("Candidates recall on test set       : %.4f",
              metrics.recall(test_answer, test_candidates))
logging.debug("Candidates recall on validation set : %.4f",
              metrics.recall(val_answer, val_candidates))

logging.info("Candidates generated. Preprocessing data...")

train_x, train_y, test_x, test_y, val_x, val_y, val_x_b, val_y_b, embedding_matrix, dictionary = preprocessing. \
    prepare_answer_2(train_doc, train_answer, train_candidates,
                     test_doc, test_answer, test_candidates,
                     val_doc, val_answer, val_candidates,
                     max_document_length=MAX_DOCUMENT_LENGTH,
                     max_answer_length=MAX_ANSWER_LENGTH,
                     max_vocabulary_size=MAX_VOCABULARY_SIZE,
                     embeddings_size=EMBEDDINGS_SIZE)

logging.info("Data preprocessing complete.")