Esempio n. 1
0
    def __call__(self, idx2word, word2idx, type=1):
        self.get_docs()

        pairs = []

        for doc in self.doclist:
            try:
                title = utils.get_tokens(doc.title, type)
                text = utils.get_tokens(doc.text, type)
                if type == 0:
                    title.append('<eos>')
                elif type == 1:
                    title.append('.')

                title.extend(text)
                text = title

                # trunk, many texts are too long, would lead to out-of-memory
                if len(text) > 1500:
                    text = text[:1500]

                keyphrases = [utils.get_tokens(k, type) for k in doc.phrases]
                pairs.append((text, keyphrases))

            except UnicodeDecodeError:
                print('UnicodeDecodeError detected! %s' % doc.name)
            # print(text)
            # print(keyphrases)
            # print('*'*50)
        dataset = utils.build_data(pairs, idx2word, word2idx)

        return dataset, self.doclist
Esempio n. 2
0
def check_data():
    config = setup_keyphrase_all()
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(
        config['dataset'])

    for dataset_name in config['testing_datasets']:
        print('*' * 50)
        print(dataset_name)

        number_groundtruth = 0
        number_present_groundtruth = 0

        loader = testing_data_loader(dataset_name,
                                     kwargs=dict(basedir=config['path']))

        if dataset_name == 'nus':
            docs = loader.get_docs(only_abstract=True, return_dict=False)
        else:
            docs = loader.get_docs(return_dict=False)

        stemmer = PorterStemmer()

        for id, doc in enumerate(docs):

            text_tokens = dataset_utils.get_tokens(doc.title.strip() + ' ' +
                                                   doc.text.strip())
            # if len(text_tokens) > 1500:
            #     text_tokens = text_tokens[:1500]
            print('[%d] length= %d' % (id, len(doc.text)))

            stemmed_input = [
                stemmer.stem(t).strip().lower() for t in text_tokens
            ]

            phrase_str = ';'.join([l.strip() for l in doc.phrases])
            phrases = dataset_utils.process_keyphrase(phrase_str)
            targets = [[stemmer.stem(w).strip().lower() for w in target]
                       for target in phrases]

            present_targets = []

            for target in targets:
                keep = True
                # whether do filtering on groundtruth phrases. if config['target_filter']==None, do nothing
                match = None
                for i in range(len(stemmed_input) - len(target) + 1):
                    match = None
                    for j in range(len(target)):
                        if target[j] != stemmed_input[i + j]:
                            match = False
                            break
                    if j == len(target) - 1 and match == None:
                        match = True
                        break

                if match == True:
                    # if match and 'appear-only', keep this phrase
                    if config['target_filter'] == 'appear-only':
                        keep = keep and True
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and False
                elif match == False:
                    # if not match and 'appear-only', discard this phrase
                    if config['target_filter'] == 'appear-only':
                        keep = keep and False
                    # if not match and 'non-appear-only', keep this phrase
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and True

                if not keep:
                    continue

                present_targets.append(target)

            number_groundtruth += len(targets)
            number_present_groundtruth += len(present_targets)

        print('number_groundtruth=' + str(number_groundtruth))
        print('number_present_groundtruth=' + str(number_present_groundtruth))
        '''