def _from_json_entity_data_returner(self, json_path: str):
    """Load tokenized entity names and descriptions from one JSON file.

    Reads the ``doc_title2sents`` mapping and truncates titles and
    description sentences to the configured limits.

    Args:
        json_path: Path to a JSON file whose ``doc_title2sents`` value
            maps entity titles to dicts with ``sudachi_tokenized_title``
            and ``sudachi_tokenized_sents`` token lists.

    Returns:
        Tuple ``(entity_names, descriptions)`` of parallel lists:
        ``entity_names[i]`` is the truncated token list of one title and
        ``descriptions[i]`` is the flattened token list of its truncated
        description sentences.
    """
    entity_data = jopen(json_path)['doc_title2sents']
    entity_names, descriptions = [], []
    # The dict keys (entity titles) are not needed here; only the
    # tokenized payloads are consumed.
    for tokenized_data in entity_data.values():
        tokenized_title = tokenized_data[
            'sudachi_tokenized_title'][:self.max_token_in_one_entity_name]
        tokenized_descs = tokenized_data[
            'sudachi_tokenized_sents'][:self.max_sent_from_one_entity]
        # Truncate each sentence to the per-sentence token limit and
        # flatten all sentences into one token list in a single pass.
        # (The original wrapped the truncation in a redundant identity
        # comprehension before a separate flatten step.)
        flattened_desc = [
            token
            for sent in tokenized_descs
            for token in sent[:self.max_token_in_one_sentence_of_entity_desc]
        ]
        entity_names.append(tokenized_title)
        descriptions.append(flattened_desc)
    assert len(entity_names) == len(descriptions)
    return entity_names, descriptions
def _m2_collect_from_one_json(json_path: str) -> List[Tuple[str, str]]:
    """Collect (mention, entity title) pairs from one annotation JSON.

    Args:
        json_path: Path to a JSON file containing an ``annotations``
            list of dicts with ``mention`` and
            ``annotation_doc_entity_title`` keys.

    Returns:
        List of ``(mention, annotation_doc_entity_title)`` pairs,
        skipping annotations whose target entity title is ``None``.
    """
    annotations = jopen(json_path)['annotations']
    m2e = []
    for annotation in annotations:
        mention = annotation['mention']
        destination = annotation['annotation_doc_entity_title']
        # Use identity comparison for the None sentinel: '!= None'
        # invokes __eq__ and can misbehave for objects overriding
        # equality (PEP 8 mandates 'is not None').
        if destination is not None:
            m2e.append((mention, destination))
    return m2e
def _title2doc_loader(self) -> dict:
    """Load and return the title-to-document mapping from the configured file."""
    mapping_path = self.config.title2doc_file_path
    return jopen(file_path=mapping_path)
def _test_loader(self) -> List[Dict]:
    """Load the biencoder dataset file and return its 'test' split."""
    dataset = jopen(file_path=self.config.biencoder_dataset_file_path)
    return dataset['test']