Example #1
def export_ke20k_testing_maui():
    from keyphrase.dataset import keyphrase_test_dataset
    import keyphrase.config  # needed for setup_keyphrase_all() below
    target_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/ke20k/'

    config = keyphrase.config.setup_keyphrase_all()  # load settings.
    doc_list = keyphrase_test_dataset.testing_data_loader(
        'ke20k', kwargs=dict(basedir=config['path'])).get_docs(False)

    for d in doc_list:
        d_id = d.name[:d.name.find('.txt')]
        print(d_id)
        with open(target_dir + d_id + '.txt', 'w') as textfile:
            textfile.write(d.title + '\n' + d.text)
        with open(target_dir + d_id + '.key', 'w') as phrasefile:
            for p in d.phrases:
                phrasefile.write('%s\t1\n' % p)
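
# Usage sketch for the exporter above (assumptions: the keyphrase package is
# importable and the hard-coded target_dir already exists on disk):
if __name__ == '__main__':
    export_ke20k_testing_maui()
    # each document yields <doc_id>.txt (title + text) and <doc_id>.key
    # (one keyphrase per line, tab-separated with a constant weight of 1)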
Example #2
import os
import numpy as np
from keyphrase import keyphrase_utils
from keyphrase.dataset.keyphrase_test_dataset import testing_data_loader, load_additional_testing_data
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file
from keyphrase.config import *

config = setup_keyphrase_all()  # alternative: setup_keyphrase_all_testing()

__author__ = "Rui Meng"
__email__ = "*****@*****.**"

if __name__ == '__main__':
    config = setup_keyphrase_all()  # load settings.

    loader = testing_data_loader('irbooks',
                                 kwargs=dict(basedir=config['path']))
    docs = loader.get_docs(return_dict=True)

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'],
                                             idx2word,
                                             word2idx,
                                             config,
                                             postagging=False,
                                             process_type=2)

    test_set, test_s_list, test_t_list, test_s_o_list, test_t_o_list, input_encodings, predictions, scores, output_encodings, idx2word \
        = deserialize_from_file(config['predict_path'] + 'predict.{0}.{1}.pkl'.format(config['predict_type'], 'irbooks'))

    do_stem = False
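
    # A hedged sketch of how the deserialized pieces above could be fed into
    # evaluate_multiple() from Example #3; the exact wiring of the original
    # script is not shown here, so the argument mapping below is an assumption.
    outs, overall_score = evaluate_multiple(
        config, test_set,
        inputs=test_s_list, outputs=test_t_list,
        original_input=test_s_o_list, original_outputs=test_t_o_list,
        samples=predictions, scores=scores,
        idx2word=idx2word, do_stem=do_stem,
        model_name=config['model_name'], dataset_name='irbooks')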
Example #3
import copy
import math
import os
import re

import numpy as np
from nltk.stem.porter import PorterStemmer

# Other names used below (logger, test_dataset, dataset_utils) are assumed to
# be provided by the surrounding keyphrase module.


def evaluate_multiple(config, test_set, inputs, outputs, original_input,
                      original_outputs, samples, scores, idx2word, do_stem,
                      model_name, dataset_name):
    '''
    Evaluate predicted keyphrases against the ground truth and report P/R/F1, Bpref and MRR.
    inputs_unk is the same as inputs, except that all low-frequency words are replaced by 1 (<unk>).
    Returns the top-K keyphrases for each document; the K values are set in config.
    :param original_input: same as inputs, the index vector of one input text
    :param original_outputs: index vectors of the corresponding multiple outputs (e.g. keyphrases)
    :return: (outs, overall_score) -- per-document report strings and the corpus-level scores
    '''

    # Generate keyphrases
    # if inputs_unk is None:
    #     samples, scores = self.generate_multiple(inputs[None, :], return_all=True)
    # else:
    #     samples, scores = self.generate_multiple(inputs_unk[None, :], return_all=True)

    stemmer = PorterStemmer()
    # Evaluation part
    outs = []
    micro_metrics = []
    micro_matches = []
    predict_scores = []

    # load stopword
    with open(config['path'] +
              '/dataset/stopword/stopword_en.txt') as stopword_file:
        stopword_set = set([stemmer.stem(w.strip()) for w in stopword_file])

    # postag_lists = [[s[1] for s in d] for d in test_set['tagged_source']]
    # postag_lists = [[] for d in test_set['tagged_source']]

    # one of: 'TfIdf', 'TextRank', 'SingleRank', 'ExpandRank', 'Maui', 'Kea', 'RNN', 'CopyRNN'
    model_nickname = config['model_name']
    base_dir = (config['path'] + '/dataset/keyphrase/prediction/'
                + model_nickname + '_' + config['timemark'] + '/')
    # text_dir = config['baseline_data_path'] + dataset_name + '/text/'
    # target_dir = config['baseline_data_path'] + dataset_name + '/keyphrase/'
    prediction_dir = base_dir + dataset_name
    # doc_names = [name[:name.index('.')] for name in os.listdir(text_dir)]

    loader = test_dataset.testing_data_loader(
        dataset_name, kwargs=dict(basedir=config['path']))
    docs = loader.get_docs(return_dict=False)
    doc_names = [d.name for d in docs]

    # reload the targets from corpus directly
    # target_dir = config['baseline_data_path'] + dataset_name + '/keyphrase/'

    # test_set['source_postag'] = test_set['target_str']

    # for input_sentence, target_list, predict_list, score_list in zip(inputs, original_outputs, samples, scores):
    for doc_name, source_str, input_sentence, target_list, predict_list, score_list, postag_list in zip(
            doc_names, test_set['source_str'], inputs, test_set['target_str'],
            samples, scores, test_set['source_postag']):
        '''
        enumerate each document, process target/predict/score and measure via p/r/f1
        '''
        target_outputs = []
        original_target_list = copy.copy(target_list)  # no stemming
        predict_indexes = []
        original_predict_outputs = []  # no stemming
        predict_outputs = []
        predict_score = []
        predict_set = set()
        correctly_matched = np.asarray(
            [0] * max(len(target_list), len(predict_list)), dtype='int32')
        is_copied = []

        # stem the original input; operate on source_str, not on the index list input_sentence
        # stemmed_input = [stemmer.stem(w) for w in cut_zero(input_sentence, idx2word)]
        stemmed_input = [stemmer.stem(w) for w in source_str]

        # convert target index into string
        for target in target_list:
            # target = cut_zero(target, idx2word)
            if do_stem:
                target = [stemmer.stem(w) for w in target]
            # print(target)

            keep = True
            # whether to filter ground-truth phrases; if config['target_filter'] is None, do nothing
            if config['target_filter']:
                match = None
                for i in range(len(stemmed_input) - len(target) + 1):
                    match = None
                    for j in range(len(target)):
                        if target[j] != stemmed_input[i + j]:
                            match = False
                            break
                    if j == len(target) - 1 and match == None:
                        match = True
                        break

                if match == True:
                    # if match and 'appear-only', keep this phrase
                    if config['target_filter'] == 'appear-only':
                        keep = keep and True
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and False
                elif match == False:
                    # if not match and 'appear-only', discard this phrase
                    if config['target_filter'] == 'appear-only':
                        keep = keep and False
                    # if not match and 'non-appear-only', keep this phrase
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and True
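            # Worked example for the filter above (illustrative tokens):
            #   stemmed_input = ['copi', 'mechan', 'for', 'keyphras', 'gener']
            #   target        = ['keyphras', 'gener']  -> match == True
            # With target_filter == 'appear-only', a phrase found in the source
            # text is kept and an absent one is discarded; 'non-appear-only'
            # does the opposite.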

            if not keep:
                continue

            target_outputs.append(target)

        # check whether predictions are noun phrases: build the noun-phrase filter here; note this must be done after stemming
        if config['noun_phrase_only']:
            stemmed_source = [stemmer.stem(w) for w in source_str]
            noun_phrases = dataset_utils.get_none_phrases(
                stemmed_source, postag_list, config['max_len'])
            noun_phrase_set = set([' '.join(p[0]) for p in noun_phrases])

        def cut_zero(sample_index, idx2word, source_str):
            sample_index = list(sample_index)
            # if 0 not in sample:
            #     return ['{}'.format(idx2word[w].encode('utf-8')) for w in sample]
            # # return the string before 0 (<eol>)
            # return ['{}'.format(idx2word[w].encode('utf-8')) for w in sample[:sample.index(0)]]

            if 0 in sample_index:
                sample_index = sample_index[:sample_index.index(0)]

            wordlist = []
            find_copy = False
            for w_index in sample_index:
                if w_index >= config['voc_size']:
                    wordlist.append(
                        source_str[w_index -
                                   config['voc_size']].encode('utf-8'))
                    find_copy = True
                else:
                    wordlist.append(idx2word[w_index].encode('utf-8'))
            if find_copy:
                logger.info('Find copy! - %s - %s' %
                            (' '.join(wordlist), str(sample_index)))
            return sample_index, wordlist
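        # Example of the copy-mechanism decoding in cut_zero (illustrative):
        # with config['voc_size'] == 50000, a predicted index of 50003 is out
        # of vocabulary and is resolved to source_str[3] (the 4th source word)
        # instead of an idx2word lookup.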

        single_word_maximum = 1
        # convert predict index into string
        for id, (predict, score) in enumerate(zip(predict_list, score_list)):
            predict_index, original_predict = cut_zero(predict, idx2word,
                                                       source_str)
            predict = [stemmer.stem(w) for w in original_predict]

            # filter out low-quality predictions
            keep = True
            if len(predict) == 0:
                keep = False
            number_digit = 0
            for w in predict:
                w = w.strip()
                if w == '<unk>' or w == '<eos>':
                    keep = False
                if re.match(r'[_,\(\)\.\'%]', w):
                    keep = False
                    # print('\t\tPunctuations! - %s' % str(predict))
                if w == '<digit>':
                    number_digit += 1

            if len(predict) >= 1 and (predict[0] in stopword_set
                                      or predict[-1] in stopword_set):
                keep = False

            # keep at most `single_word_maximum` single-word predictions
            if len(predict) <= 1:
                if single_word_maximum > 0:
                    single_word_maximum -= 1
                else:
                    keep = False

            # whether to filter predicted phrases; if config['predict_filter'] is None, do nothing
            if config['predict_filter']:
                match = None
                for i in range(len(stemmed_input) - len(predict) + 1):
                    match = None
                    for j in range(len(predict)):
                        if predict[j] != stemmed_input[i + j]:
                            match = False
                            break
                    if j == len(predict) - 1 and match == None:
                        match = True
                        break

                if match == True:
                    # if match and 'appear-only', keep this phrase
                    if config['predict_filter'] == 'appear-only':
                        keep = keep and True
                    elif config['predict_filter'] == 'non-appear-only':
                        keep = keep and False
                elif match == False:
                    # if not match and 'appear-only', discard this phrase
                    if config['predict_filter'] == 'appear-only':
                        keep = keep and False
                    # if not match and 'non-appear-only', keep this phrase
                    elif config['predict_filter'] == 'non-appear-only':
                        keep = keep and True

            # if all are <digit>, discard
            if number_digit == len(predict):
                keep = False

            # remove duplicates
            key = '-'.join(predict)
            if key in predict_set:
                keep = False

            # if #(word) == #(letter), it predicts like this: h a s k e l
            if sum([len(w)
                    for w in predict]) == len(predict) and len(predict) > 2:
                keep = False
                # print('\t\tall letters! - %s' % str(predict))

            # check if prediction is noun-phrase
            if config['noun_phrase_only']:
                if ' '.join(predict) not in noun_phrase_set:
                    print('Not a NP: %s' % (' '.join(predict)))
                    keep = False

            # discard invalid ones
            if not keep:
                continue

            if any(i_ > config['voc_size'] for i_ in predict_index):
                is_copied.append(1)
            else:
                is_copied.append(0)

            original_predict_outputs.append(original_predict)
            predict_indexes.append(predict_index)
            predict_outputs.append(predict)
            predict_score.append(score)
            predict_set.add(key)

        # optionally keep only the longest phrases, since many predictions are substrings of other, longer phrases
        if config['keep_longest']:
            match_phrase_index = []

            for ii, p_ii in enumerate(predict_outputs):  # shorter one
                match_times = 0
                for jj, p_jj in enumerate(predict_outputs):  # longer one
                    if ii == jj or len(p_ii) >= len(
                            p_jj):  # p_jj must be longer than p_ii
                        continue

                    match = None
                    for start in range(len(p_jj) - len(p_ii) +
                                       1):  # iterate the start of long phrase
                        match = None
                        for w_index in range(
                                len(p_ii)):  # iterate the short phrase
                            if (p_ii[w_index] != p_jj[start + w_index]):
                                match = False
                                break
                        if w_index == len(p_ii) - 1 and match == None:
                            match = True
                            match_times += 1
                if match_times == 1:  # p_ii is part of p_jj, discard
                    match_phrase_index.append(ii)
                    # print("Matched pair: %s \t - \t %s" % (str(p_ii), str(p_jj)))
                    # pass

            original_predict_outputs = np.delete(original_predict_outputs,
                                                 match_phrase_index)
            predict_indexes = np.delete(predict_indexes, match_phrase_index)
            predict_outputs = np.delete(predict_outputs, match_phrase_index)
            predict_score = np.delete(predict_score, match_phrase_index)
            is_copied = np.delete(is_copied, match_phrase_index)
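            # Example of the keep_longest filter (illustrative): given the
            # predictions ['neural', 'network'] and ['deep', 'neural', 'network'],
            # the shorter phrase matches exactly one longer prediction
            # (match_times == 1) and is deleted, keeping only the longest form.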

        # check whether the predicted phrase is correct (match any groundtruth)
        for p_id, predict in enumerate(predict_outputs):
            for target in target_outputs:
                if len(target) == len(predict):
                    flag = True
                    for i, w in enumerate(predict):
                        if predict[i] != target[i]:
                            flag = False
                    if flag:
                        correctly_matched[p_id] = 1
                        # print('%s correct!!!' % predict)

        original_predict_outputs = np.asarray(original_predict_outputs)
        predict_indexes = np.asarray(predict_indexes)
        predict_outputs = np.asarray(predict_outputs)
        predict_score = np.asarray(predict_score)
        is_copied = np.asarray(is_copied)
        # optionally normalize scores by phrase length and re-rank the predictions
        if config['normalize_score']:
            predict_score = np.asarray([
                math.log(math.exp(score) / len(predict))
                for predict, score in zip(predict_outputs, predict_score)
            ])
            score_list_index = np.argsort(predict_score)
            original_predict_outputs = original_predict_outputs[
                score_list_index]
            predict_indexes = predict_indexes[score_list_index]
            predict_outputs = predict_outputs[score_list_index]
            predict_score = predict_score[score_list_index]
            correctly_matched = correctly_matched[score_list_index]
            is_copied = is_copied[score_list_index]
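            # Worked example of the normalization above (illustrative): a
            # 2-word phrase with raw score s is rescored to
            # log(exp(s) / 2) == s - log(2); np.argsort then re-ranks all
            # predictions by the adjusted score in ascending order.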

        metric_dict = {}
        '''
        Compute micro metrics
        '''
        for number_to_predict in [5, 10, 15, 20, 30, 40, 50]:
            metric_dict['appear_target_number'] = len(target_outputs)
            metric_dict['target_number'] = len(target_list)
            metric_dict['correct_number@%d' % number_to_predict] = sum(
                correctly_matched[:number_to_predict])

            metric_dict['p@%d' % number_to_predict] = float(
                sum(correctly_matched[:number_to_predict])) / float(
                    number_to_predict)

            if len(target_outputs) != 0:
                metric_dict['r@%d' % number_to_predict] = float(
                    sum(correctly_matched[:number_to_predict])) / float(
                        len(target_outputs))
            else:
                metric_dict['r@%d' % number_to_predict] = 0

            if metric_dict['p@%d' % number_to_predict] + metric_dict[
                    'r@%d' % number_to_predict] != 0:
                metric_dict['f1@%d' % number_to_predict] = 2 * metric_dict[
                    'p@%d' % number_to_predict] * metric_dict[
                        'r@%d' % number_to_predict] / float(
                            metric_dict['p@%d' % number_to_predict] +
                            metric_dict['r@%d' % number_to_predict])
            else:
                metric_dict['f1@%d' % number_to_predict] = 0
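            # Worked example (illustrative): 2 correct predictions in the top 5
            # against 4 kept ground-truth phrases gives p@5 = 2/5 = 0.4,
            # r@5 = 2/4 = 0.5 and f1@5 = 2 * 0.4 * 0.5 / (0.4 + 0.5) ~= 0.444.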

            # Compute the binary preference measure (Bpref)
            bpref = 0.
            # take the first K predictions to evaluate
            trunked_match = correctly_matched[:number_to_predict].tolist()
            match_indexes = np.nonzero(trunked_match)[0]

            if len(match_indexes) > 0:
                for mid, mindex in enumerate(match_indexes):
                    # mindex predictions precede this correct one, of which mid are
                    # correct, so (mindex - mid) incorrect predictions rank above it
                    bpref += 1. - float(mindex - mid) / float(number_to_predict)
                metric_dict['bpref@%d' %
                            number_to_predict] = float(bpref) / float(
                                len(match_indexes))
            else:
                metric_dict['bpref@%d' % number_to_predict] = 0

            # Compute the mean reciprocal rank (MRR)
            rank_first = 0
            try:
                rank_first = trunked_match.index(1) + 1
            except ValueError:
                pass

            if rank_first > 0:
                metric_dict['mrr@%d' %
                            number_to_predict] = float(1) / float(rank_first)
            else:
                metric_dict['mrr@%d' % number_to_predict] = 0
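            # Worked example (illustrative): with number_to_predict == 5 and
            # correctly_matched[:5] == [0, 1, 0, 1, 0], the matches sit at
            # ranks 2 and 4, so bpref@5 = ((1 - 1/5) + (1 - 2/5)) / 2 = 0.7
            # and mrr@5 = 1/2 = 0.5.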

        micro_metrics.append(metric_dict)
        micro_matches.append(correctly_matched)
        predict_scores.append(predict_score)
        '''
        Output keyphrases to prediction folder
        '''
        if not os.path.exists(prediction_dir):
            os.makedirs(prediction_dir)

        with open(prediction_dir + '/' + doc_name + '.txt.phrases',
                  'w') as output_file:
            output_file.write('\n'.join(
                [' '.join(o_) for o_ in original_predict_outputs]))
        '''
        Print information on each prediction
        '''
        # build and log the per-document report
        a = '[SOURCE][{0}]: {1}'.format(len(input_sentence),
                                        ' '.join(source_str))
        logger.info(a)
        a += '\n'

        b = '[GROUND-TRUTH]: %d/%d ground-truth phrases\n\t\t' % (
            len(target_outputs), len(target_list))
        target_output_set = set(['_'.join(t) for t in target_outputs])
        for id, target in enumerate(original_target_list):
            if '_'.join([stemmer.stem(w)
                         for w in target]) in target_output_set:
                b += '[' + ' '.join(target) + ']; '
            else:
                b += ' '.join(target) + '; '
        logger.info(b)
        b += '\n'
        c = '[PREDICTION]: %d/%d predictions\n' % (len(predict_outputs),
                                                   len(predict_list))
        c += '[Correct@10] = %d\n' % metric_dict['correct_number@10']
        c += '[Correct@50] = %d\n' % metric_dict['correct_number@50']
        for id, (predict, score, predict_index) in enumerate(
                zip(original_predict_outputs, predict_score, predict_indexes)):
            c += ('\n\t\t[%.3f][%d][%d]' %
                  (score, len(predict), sum([len(w) for w in predict
                                             ]))) + ' '.join(predict)
            if correctly_matched[id] == 1:
                c += ' [correct!]'
            if is_copied[id] == 1:
                c += '[copied!] %s' % str(predict_index)
                # print(('\n\t\t[%.3f]'% score) + ' '.join(predict) + ' [correct!]')
                # print(('\n\t\t[%.3f]'% score) + ' '.join(predict))
        c += '\n'

        # c = '[DECODE]: {}'.format(' '.join(cut_zero(phrase, idx2word)))
        # if inputs_unk is not None:
        #     k = '[_INPUT]: {}\n'.format(' '.join(cut_zero(inputs_unk.tolist(),  idx2word, Lmax=len(idx2word))))
        #     logger.info(k)
        # a += k
        logger.info(c)
        a += b + c

        for number_to_predict in [5, 10, 15, 20, 30, 40, 50]:
            d = '@%d - Precision=%.4f, Recall=%.4f, F1=%.4f, Bpref=%.4f, MRR=%.4f' % (
                number_to_predict, metric_dict['p@%d' % number_to_predict],
                metric_dict['r@%d' % number_to_predict],
                metric_dict['f1@%d' % number_to_predict],
                metric_dict['bpref@%d' % number_to_predict],
                metric_dict['mrr@%d' % number_to_predict])
            logger.info(d)
            a += d + '\n'

        logger.info('*' * 100)
        outs.append(a)
        outs.append('*' * 100 + '\n')

    # count all documents; alternatively, documents with 0 targets could be omitted:
    # real_test_size = sum([1 if m['target_number'] > 0 else 0 for m in micro_metrics])
    real_test_size = len(inputs)
    '''
    Compute the corpus evaluation
    '''
    logger.info('Experiment result: %s' %
                (config['predict_path'] + '/' + model_name + '-' +
                 dataset_name + '.txt'))
    csv_writer = open(
        config['predict_path'] + '/' + model_name + '-' + dataset_name +
        '.txt', 'w')
    overall_score = {}
    for k in [5, 10, 15, 20, 30, 40, 50]:
        correct_number = sum(
            [m['correct_number@%d' % k] for m in micro_metrics])
        appear_target_number = sum(
            [m['appear_target_number'] for m in micro_metrics])
        target_number = sum([m['target_number'] for m in micro_metrics])

        # Compute the Micro Measures, by averaging the micro-score of each prediction
        overall_score['p@%d' % k] = float(
            sum([m['p@%d' % k]
                 for m in micro_metrics])) / float(real_test_size)
        overall_score['r@%d' % k] = float(
            sum([m['r@%d' % k]
                 for m in micro_metrics])) / float(real_test_size)
        overall_score['f1@%d' % k] = float(
            sum([m['f1@%d' % k]
                 for m in micro_metrics])) / float(real_test_size)

        output_str = 'Overall - %s valid testing data=%d, Number of Target=%d/%d, Number of Prediction=%d, Number of Correct=%d' % (
            config['predict_type'], real_test_size, appear_target_number,
            target_number, real_test_size * k, correct_number)
        outs.append(output_str + '\n')
        logger.info(output_str)
        output_str = 'Micro:\t\tP@%d=%f, R@%d=%f, F1@%d=%f' % (
            k, overall_score['p@%d' % k], k, overall_score['r@%d' % k], k,
            overall_score['f1@%d' % k])
        outs.append(output_str + '\n')
        logger.info(output_str)
        csv_writer.write(
            'Micro@%d, %f, %f, %f\n' %
            (k, overall_score['p@%d' % k], overall_score['r@%d' % k],
             overall_score['f1@%d' % k]))

        # Compute the Macro Measures
        overall_score['macro_p@%d' %
                      k] = correct_number / float(real_test_size * k)
        overall_score['macro_r@%d' %
                      k] = correct_number / float(appear_target_number)
        if overall_score['macro_p@%d' % k] + overall_score['macro_r@%d' %
                                                           k] > 0:
            overall_score[
                'macro_f1@%d' %
                k] = 2 * overall_score['macro_p@%d' % k] * overall_score[
                    'macro_r@%d' % k] / float(overall_score['macro_p@%d' % k] +
                                              overall_score['macro_r@%d' % k])
        else:
            overall_score['macro_f1@%d' % k] = 0

        output_str = 'Macro:\t\tP@%d=%f, R@%d=%f, F1@%d=%f' % (
            k, overall_score['macro_p@%d' % k], k,
            overall_score['macro_r@%d' % k], k,
            overall_score['macro_f1@%d' % k])
        outs.append(output_str + '\n')
        logger.info(output_str)
        csv_writer.write('Macro@%d, %f, %f, %f\n' %
                         (k, overall_score['macro_p@%d' % k],
                          overall_score['macro_r@%d' % k],
                          overall_score['macro_f1@%d' % k]))
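        # Note on the naming used above: "Micro" averages each document's
        # p/r/f1@k over all test documents, while "Macro" is computed from
        # corpus-level counts (correct_number / (real_test_size * k) for
        # precision, correct_number / appear_target_number for recall).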

        # Compute the binary preference measure (Bpref)
        overall_score['bpref@%d' % k] = float(
            sum([m['bpref@%d' % k]
                 for m in micro_metrics])) / float(real_test_size)

        # Compute the mean reciprocal rank (MRR)
        overall_score['mrr@%d' % k] = float(
            sum([m['mrr@%d' % k]
                 for m in micro_metrics])) / float(real_test_size)

        output_str = '\t\t\tBpref@%d=%f, MRR@%d=%f' % (
            k, overall_score['bpref@%d' % k], k, overall_score['mrr@%d' % k])
        outs.append(output_str + '\n')
        logger.info(output_str)

    # evaluate the score cutoff

    for cutoff in range(15):
        overall_predicted_number = 0
        overall_correct_number = 0
        overall_target_number = sum(
            [m['target_number'] for m in micro_metrics])

        for score_list, metric_dict, correctly_matched in zip(
                predict_scores, micro_metrics, micro_matches):
            predicted_number = len([s for s in score_list if s < cutoff])
            overall_predicted_number += predicted_number
            overall_correct_number += sum(correctly_matched[:predicted_number])

        if overall_predicted_number > 0:
            macro_p = float(overall_correct_number) / float(
                overall_predicted_number)
        else:
            macro_p = 0
        macro_r = float(overall_correct_number) / float(overall_target_number)

        if macro_p + macro_r > 0:
            macro_f1 = 2. * macro_p * macro_r / (macro_p + macro_r)
        else:
            macro_f1 = 0

        logger.info(
            'Macro,cutoff@%d, correct_number=%d, predicted_number=%d, target_number=%d, p=%f, r=%f, f1=%f'
            % (cutoff, overall_correct_number, overall_predicted_number,
               overall_target_number, macro_p, macro_r, macro_f1))
        csv_writer.write('Macro,cutoff@%d, %f, %f, %f\n' %
                         (cutoff, macro_p, macro_r, macro_f1))
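        # Note (assumption about score semantics): with length-normalized
        # scores sorted ascending, "s < cutoff" selects a prefix of each
        # ranked list, so sweeping integer cutoffs 0..14 traces a
        # precision/recall trade-off over the predictions.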

    csv_writer.close()

    return outs, overall_score
                # 6. Stop early if patience is exceeded
                if valid_param['valids_not_improved'] >= valid_param[
                        'patience']:
                    print("Not improved for %s epochs. Stopping..." %
                          valid_param['valids_not_improved'])
                    valid_param['early_stop'] = True
                    break
    '''
    Evaluate test accuracy and F-score at the end of each epoch.
    '''
    if do_predict:
        for dataset_name in config['testing_datasets']:
            # override the original test_set
            # if the dataset does not provide postag, use load_testing_data()
            test_set = keyphrase_test_dataset.testing_data_loader(
                dataset_name, kwargs=dict(
                    basedir=config['path'])).load_testing_data(word2idx)
            # test_set = keyphrase_test_dataset.testing_data_loader(dataset_name, kwargs=dict(basedir=config['path'])).load_testing_data_postag(word2idx)
            # test_set = test_sets[dataset_name]

            test_data_plain = list(
                zip(*(test_set['source_str'], test_set['target_str'],
                      test_set['source'], test_set['target'])))
            test_size = len(test_data_plain)

            print(dataset_name)
            print('Size of test data=%d' % test_size)
            print('Avg length=%d, Max length=%d' %
                  (np.average([len(s) for s in test_set['source']]),
                   np.max([len(s) for s in test_set['source']])))