def __call__(self, idx2word, word2idx, type=1):
    """Tokenize every loaded document and build the indexed dataset.

    Calls ``self.get_docs()`` first, then turns each entry of
    ``self.doclist`` into a ``(text_tokens, keyphrase_token_lists)`` pair
    where ``text_tokens`` is the tokenized title, an optional separator and
    the tokenized text, cut to at most 1500 tokens.

    Args:
        idx2word: Passed through unchanged to ``utils.build_data``.
        word2idx: Passed through unchanged to ``utils.build_data``.
        type: Forwarded to ``utils.get_tokens``. It also selects the token
            inserted between title and text: ``'<eos>'`` for ``0``, ``'.'``
            for ``1``, and no separator for any other value.

    Returns:
        A tuple ``(dataset, self.doclist)`` where ``dataset`` is whatever
        ``utils.build_data`` returns for the collected pairs.

    NOTE(review): a document that raises ``UnicodeDecodeError`` is reported
    and left out of the pairs, but it is still present in the returned
    ``self.doclist``, so the two results may differ in length.
    """
    self.get_docs()

    # Many texts are very long; capping the length avoids out-of-memory.
    max_tokens = 1500

    samples = []
    for document in self.doclist:
        try:
            tokens = utils.get_tokens(document.title, type)
            body_tokens = utils.get_tokens(document.text, type)

            # Separate the title from the body with a mode-specific token.
            if type == 0:
                tokens.append('<eos>')
            elif type == 1:
                tokens.append('.')
            tokens.extend(body_tokens)

            # Drop everything beyond the cap (no-op for short documents).
            del tokens[max_tokens:]

            phrase_tokens = []
            for phrase in document.phrases:
                phrase_tokens.append(utils.get_tokens(phrase, type))

            samples.append((tokens, phrase_tokens))
        except UnicodeDecodeError:
            print('UnicodeDecodeError detected! %s' % document.name)

    return utils.build_data(samples, idx2word, word2idx), self.doclist
def check_data(): config = setup_keyphrase_all() train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file( config['dataset']) for dataset_name in config['testing_datasets']: print('*' * 50) print(dataset_name) number_groundtruth = 0 number_present_groundtruth = 0 loader = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path'])) if dataset_name == 'nus': docs = loader.get_docs(only_abstract=True, return_dict=False) else: docs = loader.get_docs(return_dict=False) stemmer = PorterStemmer() for id, doc in enumerate(docs): text_tokens = dataset_utils.get_tokens(doc.title.strip() + ' ' + doc.text.strip()) # if len(text_tokens) > 1500: # text_tokens = text_tokens[:1500] print('[%d] length= %d' % (id, len(doc.text))) stemmed_input = [ stemmer.stem(t).strip().lower() for t in text_tokens ] phrase_str = ';'.join([l.strip() for l in doc.phrases]) phrases = dataset_utils.process_keyphrase(phrase_str) targets = [[stemmer.stem(w).strip().lower() for w in target] for target in phrases] present_targets = [] for target in targets: keep = True # whether do filtering on groundtruth phrases. 
if config['target_filter']==None, do nothing match = None for i in range(len(stemmed_input) - len(target) + 1): match = None for j in range(len(target)): if target[j] != stemmed_input[i + j]: match = False break if j == len(target) - 1 and match == None: match = True break if match == True: # if match and 'appear-only', keep this phrase if config['target_filter'] == 'appear-only': keep = keep and True elif config['target_filter'] == 'non-appear-only': keep = keep and False elif match == False: # if not match and 'appear-only', discard this phrase if config['target_filter'] == 'appear-only': keep = keep and False # if not match and 'non-appear-only', keep this phrase elif config['target_filter'] == 'non-appear-only': keep = keep and True if not keep: continue present_targets.append(target) number_groundtruth += len(targets) number_present_groundtruth += len(present_targets) print('number_groundtruth=' + str(number_groundtruth)) print('number_present_groundtruth=' + str(number_present_groundtruth)) '''