def _parse_pack(
        self, doc_data: Tuple[Dict[str, str], Dict[str, List[state_type]]]
    ) -> Iterator[DataPack]:
        str_data, node_data = doc_data

        pack = DataPack()
        doc_name: str = str_data['doc_name']
        if doc_name in self.redirects:
            doc_name = self.redirects[doc_name]

        full_text: str = str_data['text']

        pack.set_text(full_text)
        page = WikiPage(pack, 0, len(full_text))
        pack.add_entry(page)
        page.set_page_id(str_data['oldid'])
        page.set_page_name(doc_name)

        if len(node_data['struct']) > 0:
            add_struct(pack, node_data['struct'])
        else:
            logging.warning('Structure info for %s not found.', doc_name)

        if len(node_data['links']) > 0:
            add_anchor_links(pack, node_data['links'], self.redirects)
        else:
            logging.warning('Links for [%s] not found.', doc_name)

        pack.meta.doc_id = doc_name

        yield pack
Example #2
def add_info_boxes(pack: DataPack, info_box_statements: List, info_type: str):
    for _, v, o in info_box_statements:
        info_box = WikiInfoBoxMapped(pack)
        info_box.set_key(v.toPython())
        info_box.set_value(get_resource_name(o))
        info_box.set_infobox_type(info_type)
        pack.add_entry(info_box)
Example #3
    def _process(self, input_pack: MultiPack):
        r"""Searches ElasticSearch indexer to fetch documents for a query. This
        query should be contained in the input multipack with name
        `self.config.query_pack_name`.

        This method adds new packs to `input_pack` containing the retrieved
        results. Each result is added as a `ft.onto.base_ontology.Document`.

        Args:
             input_pack: A multipack containing query as a pack.
        """
        query_pack = input_pack.get_pack(self.config.query_pack_name)

        # ElasticSearchQueryCreator adds a Query entry to query pack. We now
        # fetch it as the first element.
        first_query = list(query_pack.get_entries(Query))[0]
        results = self.index.search(first_query.value)
        hits = results["hits"]["hits"]
        packs = {}
        for idx, hit in enumerate(hits):
            source = hit["_source"]
            # Record the retrieval score of this hit on the query entry.
            first_query.update_results({source["doc_id"]: hit["_score"]})

            # Wrap each retrieved document in its own DataPack.
            pack = DataPack(doc_id=source["doc_id"])
            content = source[self.config.field]
            document = Document(pack=pack, begin=0, end=len(content))
            pack.add_entry(document)
            pack.set_text(content)
            packs[f"{self.config.response_pack_name_prefix}_{idx}"] = pack

        input_pack.update_pack(packs)
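
A minimal sketch, not from the source, of the Elasticsearch response shape that the loop above consumes; the "doc_id" key appears in the code, while the "content" field name stands in for whatever `config.field` points to.

example_results = {
    "hits": {
        "hits": [
            {
                "_score": 12.3,
                "_source": {"doc_id": "doc_0", "content": "Full text of the hit."},
            },
        ]
    }
}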
Example #4
    def _process(self, input_pack: MultiPack):
        query = input_pack.get_pack(self.in_pack_name).text
        params = '?' + urlencode(
            {
                'api-version': '3.0',
                'from': self.src_language,
                'to': [self.target_language]
            },
            doseq=True)
        microsoft_constructed_url = self.microsoft_translate_url + params

        response = requests.post(microsoft_constructed_url,
                                 headers=self.microsoft_headers,
                                 json=[{
                                     "text": query
                                 }])

        if response.status_code != 200:
            raise RuntimeError(response.json()['error']['message'])

        text = response.json()[0]["translations"][0]["text"]
        pack = DataPack()

        document = Document(pack, 0, len(text))
        utterance = Utterance(pack, 0, len(text))
        pack.add_entry(document)
        pack.add_entry(utterance)

        pack.set_text(text=text)
        input_pack.update_pack({self.out_pack_name: pack})
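
A rough sketch, with illustrative values, of the Microsoft Translator v3 response body that the snippet above unpacks via `response.json()[0]["translations"][0]["text"]`: a JSON array with one object per input text, each carrying a "translations" list.

example_response_body = [
    {
        "translations": [
            {"text": "Translated text would appear here.", "to": "de"},
        ]
    }
]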
Example #5
def add_info_boxes(pack: DataPack, info_box_statements: List):
    for _, v, o in info_box_statements:
        slot_name = v.toPython()
        slot_value = get_resource_name(o)
        info_box = WikiInfoBoxMapped(pack)
        info_box.set_key(slot_name)
        info_box.set_value(slot_value)
        pack.add_entry(info_box)
Example #6
def add_property(pack: DataPack, statements: List):
    for _, v, o in statements:
        slot_name = v.toPython()
        slot_value = get_resource_name(o)
        info_box = WikiInfoBoxProperty(pack)
        info_box.set_key(slot_name)
        info_box.set_value(slot_value)
        pack.add_entry(info_box)
Example #7
    def _process(self, input_pack: DataPack):
        # Treat each maximal run of non-whitespace characters as a Token.
        pattern = r"\s+"
        start = 0

        for m in re.finditer(pattern, input_pack.text):
            # Skip the empty span produced by leading whitespace.
            if m.start() > start:
                input_pack.add_entry(Token(input_pack, start, m.start()))
            start = m.end()

        # Text after the final whitespace run forms the last token.
        if start < len(input_pack.text):
            input_pack.add_entry(Token(input_pack, start, len(input_pack.text)))
Example #8
    def _process(self, input_pack: DataPack):
        # Treat a period plus any trailing whitespace as a sentence boundary.
        pattern = r"\.\s*"
        start = 0

        for m in re.finditer(pattern, input_pack.text):
            end = m.end()
            input_pack.add_entry(Sentence(input_pack, start, end))
            start = end

        # Any trailing text without a closing period becomes the last sentence.
        if start < len(input_pack.text):
            input_pack.add_entry(
                Sentence(input_pack, start, len(input_pack.text)))
Example #9
    def _process(self, input_pack: MultiPack):
        query_pack = input_pack.get_pack(self.config.query_pack_name)
        first_query = list(query_pack.get_entries(Query))[0]
        results = self.index.search(first_query.value, self.k)
        documents = [r[1] for result in results for r in result]

        packs = {}
        for i, doc in enumerate(documents):
            pack = DataPack()
            document = Document(pack=pack, begin=0, end=len(doc))
            pack.add_entry(document)
            pack.set_text(doc)
            packs[self.config.response_pack_name[i]] = pack

        input_pack.update_pack(packs)
Example #10
    def _process(self, input_pack: MultiPack):
        query_pack = input_pack.get_pack("pack")
        first_query = list(query_pack.get_entries(Query))[0]
        results = self.index.search(first_query.value, self.k)
        documents = [r[1] for result in results for r in result]

        packs = {}
        for counter, doc in enumerate(documents):
            pack = DataPack()
            document = Document(pack=pack, begin=0, end=len(doc))
            pack.add_entry(document)
            pack.set_text(doc)
            packs[f"doc_{counter}"] = pack

        input_pack.update_pack(packs)
Example #11
    def _process(self, input_pack: DataPack):
        # Collect the surface text of every existing anchor so that other
        # occurrences of the same text can be linked by copying from them.
        kp = KeywordProcessor(case_sensitive=True)
        anchor_entities = {}
        existing_anchors = set()

        anchor: WikiAnchor
        for anchor in input_pack.get(WikiAnchor):
            kp.add_keyword(anchor.text)
            existing_anchors.add((anchor.span.begin, anchor.span.end))

            try:
                anchor_entities[anchor.text].append(anchor)
            except KeyError:
                anchor_entities[anchor.text] = [anchor]

        for kw, b, e in kp.extract_keywords(input_pack.text, span_info=True):
            targets = anchor_entities[kw]

            if (b, e) in existing_anchors:
                # Ignore existing anchors.
                continue

            copy_from: WikiAnchor
            if len(targets) == 1:
                copy_from = targets[0]
            elif len(targets) > 1:
                # Copy from the closest anchor that occurs before this mention,
                # falling back to the first one if none precedes it.
                latest_ = targets[0]
                for t in targets:
                    if t.begin < b:
                        latest_ = t
                copy_from = latest_
            else:
                raise RuntimeError(f"Unknown target length {len(targets)}")

            anchor = WikiAnchor(input_pack, b, e)
            anchor.target_page_name = copy_from.target_page_name
            anchor.is_external = copy_from.is_external
            input_pack.add_entry(anchor)
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    link_grouped: DefaultDict[str, Dict[str,
                                        rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]
        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            if info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                if target_page_name in redirects:
                    target_page_name = redirects[target_page_name]
                anchor.set_target_page_name(target_page_name)
        pack.add_entry(anchor)
def add_struct(pack: DataPack, struct_statements: List):
    for nif_range, rel, struct_type in struct_statements:
        r = get_resource_fragment(rel)
        if r == 'type':
            range_ = get_resource_attribute(nif_range, 'char')
            begin, end = [int(d) for d in range_.split(',')]

            struct_ = get_resource_fragment(struct_type)

            if struct_ == 'Section':
                section = WikiSection(pack, begin, end)
                pack.add_entry(section)
            elif struct_ == 'Paragraph':
                para = WikiParagraph(pack, begin, end)
                pack.add_entry(para)
            elif struct_ == 'Title':
                title = WikiTitle(pack, begin, end)
                pack.add_entry(title)
            else:
                logging.warning("Unknown struct type: %s", struct_type)