# Code example #1
# ======================================================================================================================
# Combine ABSTRACT + SUMMARIES (document text + keyphrases)
# ======================================================================================================================
print("data_summaries['abstract']")
print(data_summaries['abstract'])
# Prepend each document's abstract to its summary, so the extraction F1 score
# can check gold keyphrases against both the abstract and the summary text.
# NOTE(review): data_abstract['abstract'][pos] is a label lookup while .iat is
# positional — assumes the Series index is 0..n-1; confirm upstream.
summaries_col = data_summaries['abstract']
for pos, summary_text in enumerate(summaries_col):
    summaries_col.iat[pos] = data_abstract['abstract'][pos] + ' ' + summary_text
print(data_summaries['abstract'])

print('pred_keyphrases_abstract')
print(pred_keyphrases_abstract)
# Merge each document's summary-derived predictions into its abstract-derived
# predictions.  list.extend mutates the inner list in place, so the original
# write-back (pred_keyphrases_abstract[doc_indx] = pred_keyphrase_abstr) was a
# no-op reassignment of the same object and has been removed.
for doc_indx, pred_keyphrase_abstr in enumerate(pred_keyphrases_abstract):
    pred_keyphrase_abstr.extend(pred_keyphrases_summaries[doc_indx])
print(pred_keyphrases_abstract)

# ======================================================================================================================
# Evaluation
# ======================================================================================================================

# Traditional evaluation of the model's performance (project-local module).
# "x_filename" does not matter in this case — it is only used to combine
# paragraphs/sentences back into original documents — so it is left empty.
traditional_evaluation.evaluation(y_pred=pred_keyphrases_abstract,
                                  y_test=gold_keyphrases,
                                  x_test=data_summaries,
                                  x_filename='')
# ======================================================================================================================
# Combine ABSTRACT + SUMMARIES
# ======================================================================================================================

# Extraction F1 needs gold keyphrases checked against abstract AND summary, so
# join each document's token lists and concatenate them into one text.
for pos in range(len(x_test_summar['abstract'])):
    abstract_text = ' '.join(x_test['abstract'][pos])
    summary_text = ' '.join(x_test_summar['abstract'].iat[pos])
    x_test_summar['abstract'].iat[pos] = abstract_text + ' ' + summary_text

# ======================================================================================================================
# Evaluation
# ======================================================================================================================
# Load the gold keyphrases; each CSV cell holds a stringified Python list, so
# parse it back with ast.literal_eval.
gold_df = pd.read_csv(y_filename, encoding="utf8")
gold_keyphrases = gold_df['keyword'].map(ast.literal_eval)

# traditional evaluation of the model's performance
traditional_evaluation.evaluation(y_pred=y_pred,
                                  y_test=gold_keyphrases,
                                  x_test=x_test_summar,
                                  x_filename=x_filename)

# sequence evaluation of the model's performance (currently disabled)
#sequence_evaluation.evaluation(y_pred, MAX_LEN, y_test_filename)

# ======================================================================================================================
# Count the total running time
# ======================================================================================================================

# Compute elapsed wall-clock seconds since start_time and report as H:MM:SS.
elapsed_seconds = time.time() - start_time
total_time = str(timedelta(seconds=elapsed_seconds))
print("\n--- %s running time ---" % total_time)
    # NOTE(review): this fragment is the interior of a per-document loop whose
    # header lies outside this chunk — `extractor`, `pos`, `stoplist`, and
    # `pred_keyphrases` are presumably set up earlier; confirm in the full file.
    # 3. extend the stoplist with English stopwords and select candidates
    #    (pke-style extractor API — TODO confirm the extractor class).
    stoplist += stopwords.words('english')
    extractor.candidate_selection(pos=pos, stoplist=stoplist)

    # 4. build the Multipartite graph and rank candidates using random walk,
    #    alpha controls the weight adjustment mechanism, see TopicRank for
    #    threshold/method parameters.
    extractor.candidate_weighting(alpha=1.1,
                                  threshold=0.74,
                                  method='average')

    # 5. get the 10-highest scored candidates as keyphrases
    pred_kps = extractor.get_n_best(n=10)

    # keep only the predicted keyphrase text (tokenized) and discard its score
    pred_keyphrases.append([kp[0].split() for kp in pred_kps])

print(pred_keyphrases)
print(gold_keyphrases)

# ======================================================================================================================
# Evaluation
# ======================================================================================================================

# Traditional evaluation of the model's performance.  When full-text documents
# were split into paragraphs (use_fulltext == 2), also pass the paragraph-to-
# document index so the evaluator can reassemble predictions per original doc.
if use_fulltext == 2:
    traditional_evaluation.evaluation(y_pred=pred_keyphrases,
                                      y_test=gold_keyphrases,
                                      x_test=data,
                                      x_filename=file,
                                      paragraph_assemble_docs=data['assemble_documents_index'])
else:
    # fixed: replaced a leftover "WTF HERE" debug print with an informative one
    print('Evaluating with x_filename:', file)
    traditional_evaluation.evaluation(y_pred=pred_keyphrases,
                                      y_test=gold_keyphrases,
                                      x_test=data,
                                      x_filename=file)


# Code example #4
print('AFTER LOADING', model.get_weights())
# ======================================================================================================================
# Predict on validation data
# ======================================================================================================================

print('\nPredicting...')
# Run inference over the whole test generator; steps are inferred from the
# generator itself (the commented batch-generator variant needed an explicit
# steps count to avoid reading a partial last batch).
y_pred = model.predict(x=test_generator)
print(y_pred)
pred_shape = np.array(y_pred, dtype=object).shape
print('\nY_PRED SHAPE', pred_shape)


# ======================================================================================================================
# Evaluation
# ======================================================================================================================

# Traditional evaluation of the model's performance (project-local module);
# the test documents and gold keyphrases are loaded from the given filenames.
traditional_evaluation.evaluation(y_pred=y_pred, x_filename=x_filename, y_filename=y_filename)

# Sequence (per-token label) evaluation of the model's performance —
# presumably pads/truncates to MAX_LEN; confirm in sequence_evaluation.
sequence_evaluation.evaluation(y_pred, MAX_LEN, y_test_filename)


# ======================================================================================================================
# Count the total running time
# ======================================================================================================================

# Format the elapsed wall-clock time since start_time (H:MM:SS) and report it.
seconds_elapsed = time.time() - start_time
total_time = str(timedelta(seconds=seconds_elapsed))
print("\n--- %s running time ---" % total_time)