def _parse_pack(
        self, doc_data: Tuple[Dict[str, str], Dict[str, List[state_type]]]
) -> Iterator[DataPack]:
    str_data, node_data = doc_data

    pack = DataPack()
    doc_name: str = str_data['doc_name']
    if doc_name in self.redirects:
        doc_name = self.redirects[doc_name]

    full_text: str = str_data['text']
    pack.set_text(full_text)

    page = WikiPage(pack, 0, len(full_text))
    pack.add_entry(page)
    page.set_page_id(str_data['oldid'])
    page.set_page_name(doc_name)

    if len(node_data['struct']) > 0:
        add_struct(pack, node_data['struct'])
    else:
        logging.warning('Structure info for %s not found.', doc_name)

    if len(node_data['links']) > 0:
        add_anchor_links(pack, node_data['links'], self.redirects)
    else:
        logging.warning('Links for [%s] not found.', doc_name)

    pack.meta.doc_id = doc_name

    yield pack
def add_info_boxes(pack: DataPack, info_box_statements: List, info_type: str):
    for _, v, o in info_box_statements:
        info_box = WikiInfoBoxMapped(pack)
        info_box.set_key(v.toPython())
        info_box.set_value(get_resource_name(o))
        info_box.set_infobox_type(info_type)
        pack.add_entry(info_box)
def _process(self, input_pack: MultiPack): r"""Searches ElasticSearch indexer to fetch documents for a query. This query should be contained in the input multipack with name `self.config.query_pack_name`. This method adds new packs to `input_pack` containing the retrieved results. Each result is added as a `ft.onto.base_ontology.Document`. Args: input_pack: A multipack containing query as a pack. """ query_pack = input_pack.get_pack(self.config.query_pack_name) # ElasticSearchQueryCreator adds a Query entry to query pack. We now # fetch it as the first element. first_query = list(query_pack.get_entries(Query))[0] results = self.index.search(first_query.value) hits = results["hits"]["hits"] packs = {} for idx, hit in enumerate(hits): document = hit["_source"] first_query.update_results({document["doc_id"]: hit["_score"]}) pack = DataPack(doc_id=document["doc_id"]) content = document[self.config.field] document = Document(pack=pack, begin=0, end=len(content)) pack.add_entry(document) pack.set_text(content) packs[f"{self.config.response_pack_name_prefix}_{idx}"] = pack input_pack.update_pack(packs)
def _process(self, input_pack: MultiPack):
    query = input_pack.get_pack(self.in_pack_name).text
    params = '?' + urlencode(
        {
            'api-version': '3.0',
            'from': self.src_language,
            'to': [self.target_language]
        },
        doseq=True)
    microsoft_constructed_url = self.microsoft_translate_url + params

    response = requests.post(
        microsoft_constructed_url,
        headers=self.microsoft_headers,
        json=[{"text": query}])

    if response.status_code != 200:
        raise RuntimeError(response.json()['error']['message'])

    text = response.json()[0]["translations"][0]["text"]
    pack = DataPack()

    document = Document(pack, 0, len(text))
    utterance = Utterance(pack, 0, len(text))
    pack.add_entry(document)
    pack.add_entry(utterance)
    pack.set_text(text=text)

    input_pack.update_pack({self.out_pack_name: pack})
def add_info_boxes(pack: DataPack, info_box_statements: List):
    for _, v, o in info_box_statements:
        slot_name = v.toPython()
        slot_value = get_resource_name(o)
        info_box = WikiInfoBoxMapped(pack)
        info_box.set_key(slot_name)
        info_box.set_value(slot_value)
        pack.add_entry(info_box)
def add_property(pack: DataPack, statements: List):
    for _, v, o in statements:
        slot_name = v.toPython()
        slot_value = get_resource_name(o)
        info_box = WikiInfoBoxProperty(pack)
        info_box.set_key(slot_name)
        info_box.set_value(slot_value)
        pack.add_entry(info_box)
def _process(self, input_pack: DataPack): pattern = r"\s+" start = 0 for m in re.finditer(pattern, input_pack.text): input_pack.add_entry(Token(input_pack, start, m.start())) start = m.end() if start < len(input_pack.text): input_pack.add_entry(Token(input_pack, start, len(input_pack.text)))
def _process(self, input_pack: DataPack): pattern = "\\.\\s*" start = 0 for m in re.finditer(pattern, input_pack.text): end = m.end() Sentence(input_pack, start, end) start = end if start < len(input_pack.text): input_pack.add_entry( Sentence(input_pack, start, len(input_pack.text)))
def _process(self, input_pack: MultiPack):
    query_pack = input_pack.get_pack(self.config.query_pack_name)
    first_query = list(query_pack.get_entries(Query))[0]
    results = self.index.search(first_query.value, self.k)
    documents = [r[1] for result in results for r in result]

    packs = {}
    for i, doc in enumerate(documents):
        pack = DataPack()

        document = Document(pack=pack, begin=0, end=len(doc))
        pack.add_entry(document)
        pack.set_text(doc)
        packs[self.config.response_pack_name[i]] = pack

    input_pack.update_pack(packs)
def _process(self, input_pack: MultiPack): query_pack = input_pack.get_pack("pack") first_query = list(query_pack.get_entries(Query))[0] results = self.index.search(first_query.value, self.k) documents = [r[1] for result in results for r in result] packs = {} counter = 0 for doc in documents: pack = DataPack() document = Document(pack=pack, begin=0, end=len(doc)) pack.add_entry(document) pack.set_text(doc) packs[f"doc_{counter}"] = pack counter += 1 input_pack.update_pack(packs)
def _process(self, input_pack: DataPack):
    kp = KeywordProcessor(case_sensitive=True)

    anchor_entities = {}
    existing_anchors = set()

    anchor: WikiAnchor
    for anchor in input_pack.get(WikiAnchor):
        kp.add_keyword(anchor.text)
        existing_anchors.add((anchor.span.begin, anchor.span.end))

        try:
            anchor_entities[anchor.text].append(anchor)
        except KeyError:
            anchor_entities[anchor.text] = [anchor]

    for kw, b, e in kp.extract_keywords(input_pack.text, span_info=True):
        targets = anchor_entities[kw]

        if (b, e) in existing_anchors:
            # Ignore existing anchors.
            continue

        copy_from: WikiAnchor
        if len(targets) == 1:
            copy_from = targets[0]
        elif len(targets) > 1:
            # Multiple anchors share this surface text: copy from the last
            # one that begins before the new mention.
            latest_ = targets[0]
            for t in targets:
                if t.begin < b:
                    latest_ = t
            copy_from = latest_
        else:
            raise RuntimeError(f"Unknown target length {len(targets)}")

        anchor = WikiAnchor(input_pack, b, e)
        anchor.target_page_name = copy_from.target_page_name
        anchor.is_external = copy_from.is_external
        input_pack.add_entry(anchor)
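# Minimal illustration of the flashtext `KeywordProcessor` behavior that the
# anchor-copying `_process` above relies on: with `span_info=True`,
# `extract_keywords` returns a (keyword, begin, end) tuple for each match.
# A standalone sketch, independent of the processor above:
def _flashtext_span_demo():
    from flashtext import KeywordProcessor

    kp = KeywordProcessor(case_sensitive=True)
    kp.add_keyword("Big Apple")
    # Prints [('Big Apple', 11, 20)]: character offsets into the input text.
    print(kp.extract_keywords("I love the Big Apple.", span_info=True))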
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    link_grouped: DefaultDict[
        str, Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]
        anchor = WikiAnchor(pack, begin, end)

        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            if info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                if target_page_name in redirects:
                    target_page_name = redirects[target_page_name]
                anchor.set_target_page_name(target_page_name)

        pack.add_entry(anchor)
def add_struct(pack: DataPack, struct_statements: List):
    for nif_range, rel, struct_type in struct_statements:
        r = get_resource_fragment(rel)
        if r == 'type':
            range_ = get_resource_attribute(nif_range, 'char')
            begin, end = [int(d) for d in range_.split(',')]

            struct_ = get_resource_fragment(struct_type)

            if struct_ == 'Section':
                section = WikiSection(pack, begin, end)
                pack.add_entry(section)
            elif struct_ == 'Paragraph':
                para = WikiParagraph(pack, begin, end)
                pack.add_entry(para)
            elif struct_ == 'Title':
                title = WikiTitle(pack, begin, end)
                pack.add_entry(title)
            else:
                logging.warning("Unknown struct type: %s", struct_type)
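# The URI helpers used throughout (`get_resource_attribute`,
# `get_resource_fragment`, `get_resource_name`) are assumed to parse NIF and
# DBpedia URIs roughly as sketched below. This is an illustrative
# approximation, not the actual implementations.
from urllib import parse


def _get_resource_attribute(url: str, param_name: str) -> str:
    # '...?nif=phrase&char=18,23' with param_name='char' -> '18,23'
    return parse.parse_qs(parse.urlparse(url).query)[param_name][0]


def _get_resource_fragment(url: str) -> str:
    # 'http://.../nif-core#Paragraph' -> 'Paragraph'
    return parse.urlparse(url).fragment


def _get_resource_name(url: str) -> str:
    # 'http://dbpedia.org/resource/Python_(programming_language)'
    #   -> 'Python_(programming_language)'
    return parse.urlparse(url).path.split('/')[-1]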