Ejemplo n.º 1
0
 def setUp(self) -> None:
     """Create three data packs and register them in a fresh multi pack."""
     self.data_pack1 = DataPack(doc_id="1")
     self.data_pack2 = DataPack(doc_id="2")
     self.data_pack3 = DataPack(doc_id="Three")
     self.multi_pack = MultiPack()

     # Register every pack under its own name, in creation order.
     named_packs = (
         ("pack1", self.data_pack1),
         ("pack2", self.data_pack2),
         ("pack_three", self.data_pack3),
     )
     for name, member in named_packs:
         self.multi_pack.add_pack(member, pack_name=name)
Ejemplo n.º 2
0
    def _parse_pack(
        self, doc_data: Tuple[Dict[str, str], Dict[str, List[state_type]]]
    ) -> Iterator[DataPack]:
        """Build a data pack for one page from its string and node data.

        Args:
            doc_data: A pair ``(str_data, node_data)``. The first element
                supplies the ``doc_name``, ``text`` and ``oldid`` fields;
                the second supplies the ``struct`` and ``links`` lists.

        Yields:
            A single pack whose text is the page text, carrying a
            ``WikiPage`` entry over the whole text and whose ``doc_id`` is
            the (redirect-resolved) page name.
        """
        fields, nodes = doc_data

        pack = DataPack()

        # Follow a redirect when one is registered for this page name.
        title: str = fields['doc_name']
        title = self.redirects[title] if title in self.redirects else title

        body: str = fields['text']
        pack.set_text(body)

        # One WikiPage entry spanning the entire text.
        wiki_page = WikiPage(pack, 0, len(body))
        pack.add_entry(wiki_page)
        wiki_page.set_page_id(fields['oldid'])
        wiki_page.set_page_name(title)

        # Missing structure or link data is only logged, never fatal.
        struct_nodes = nodes['struct']
        if len(struct_nodes) == 0:
            logging.warning('Structure info for %s not found.', title)
        else:
            add_struct(pack, struct_nodes)

        link_nodes = nodes['links']
        if len(link_nodes) == 0:
            logging.warning('Links for [%s] not found.', title)
        else:
            add_anchor_links(pack, link_nodes, self.redirects)

        pack.meta.doc_id = title

        yield pack
Ejemplo n.º 3
0
    def _parse_pack(self, nli_instance) -> Iterator[PackType]:
        """Build a data pack holding one premise/hypothesis pair.

        Args:
            nli_instance: A 4-tuple ``(pair_id, source, target, label)``.
                ``label`` must equal 0 (entailment), 1 (neutral) or
                2 (contradiction).

        Yields:
            A single pack whose text is the source and the target, each
            followed by a newline.

        Raises:
            ValueError: If ``label`` is not one of the three known values.
        """
        pair_id, source, target, label = nli_instance

        pack = DataPack(pair_id)
        text = source + "\n" + target + "\n"
        pack.set_text(text)

        # The premise covers the first line; the hypothesis starts right
        # after the separating newline and stops before the trailing one.
        premise_end = len(source)
        premise = Premise(pack, 0, premise_end)
        hypothesis = Hypothesis(pack, premise_end + 1, len(text) - 1)

        nli_pair = NLIPair(pack)
        nli_pair.set_parent(premise)
        nli_pair.set_child(hypothesis)

        # Start with every class at 0, then flag the one the label names.
        nli_pair.entailment = dict.fromkeys(
            ("entailment", "neutral", "contradiction"), 0
        )

        if label == 2:
            flagged = "contradiction"
        elif label == 0:
            flagged = "entailment"
        elif label == 1:
            flagged = "neutral"
        else:
            raise ValueError("Unknown label value.")
        nli_pair.entailment[flagged] = 1

        yield pack
Ejemplo n.º 4
0
    def _process(self, input_pack: MultiPack):
        r"""Run the query stored in the multipack against the index and attach
        every hit to the multipack as a new pack.

        The query is read from the pack named `self.config.query_pack_name`.
        Each hit becomes a pack whose text is the hit's `self.config.field`
        value, covered by one `ft.onto.base_ontology.Document` entry, and is
        stored under `<self.config.response_pack_name_prefix>_<rank>`. The
        score of every hit is also recorded on the query entry.

        Args:
             input_pack: A multipack containing query as a pack.
        """
        query_pack = input_pack.get_pack(self.config.query_pack_name)

        # ElasticSearchQueryCreator adds a Query entry to query pack; it is
        # taken here as the first Query entry found.
        query_entry = list(query_pack.get_entries(Query))[0]
        search_hits = self.index.search(query_entry.value)["hits"]["hits"]

        response_packs = {}
        for rank, hit in enumerate(search_hits):
            source = hit["_source"]
            query_entry.update_results({source["doc_id"]: hit["_score"]})

            result_pack = DataPack(doc_id=source["doc_id"])
            body = source[self.config.field]
            result_pack.add_entry(
                Document(pack=result_pack, begin=0, end=len(body))
            )
            result_pack.set_text(body)

            pack_name = f"{self.config.response_pack_name_prefix}_{rank}"
            response_packs[pack_name] = result_pack

        input_pack.update_pack(response_packs)
    def _process(self, input_pack: MultiPack):
        r"""Translate the text of the input pack and store the translation as
        a new pack in the multipack.

        The text of the pack named `self.in_pack_name` is posted to
        `self.microsoft_translate_url` (API version 3.0) with
        `self.src_language` as source and `self.target_language` as target.
        The first translation in the response becomes the text of a new pack,
        covered by one `Document` and one `Utterance` entry, which is stored
        under `self.out_pack_name`.

        Args:
            input_pack: A multipack containing the pack to translate.

        Raises:
            RuntimeError: If the service does not answer with HTTP 200. The
                message is the service's own error message when the response
                body provides one, otherwise the status code and raw body.
        """
        query = input_pack.get_pack(self.in_pack_name).text
        params = '?' + urlencode(
            {
                'api-version': '3.0',
                'from': self.src_language,
                'to': [self.target_language]
            },
            doseq=True)
        microsoft_constructed_url = self.microsoft_translate_url + params

        response = requests.post(microsoft_constructed_url,
                                 headers=self.microsoft_headers,
                                 json=[{
                                     "text": query
                                 }])

        if response.status_code != 200:
            # An error body is not guaranteed to be JSON of the expected
            # shape (e.g. a proxy error page); never let the decoding failure
            # hide the actual HTTP failure.
            try:
                message = response.json()['error']['message']
            except (ValueError, KeyError, TypeError):
                message = 'Translation request failed with status %s: %s' % (
                    response.status_code, response.text)
            raise RuntimeError(message)

        text = response.json()[0]["translations"][0]["text"]
        pack = DataPack()

        document = Document(pack, 0, len(text))
        utterance = Utterance(pack, 0, len(text))
        pack.add_entry(document)
        pack.add_entry(utterance)

        pack.set_text(text=text)
        input_pack.update_pack({self.out_pack_name: pack})
Ejemplo n.º 6
0
    def _process(self, input_pack: MultiPack):
        """Search the index with the first query of the query pack and add one
        pack per retrieved document to the multipack.

        The i-th retrieved document is stored under
        `self.config.response_pack_name[i]`.

        Args:
            input_pack: A multipack containing the query pack named
                `self.config.query_pack_name`.
        """
        query_pack = input_pack.get_pack(self.config.query_pack_name)
        query_entry = list(query_pack.get_entries(Query))[0]
        search_results = self.index.search(query_entry.value, self.k)

        # Results arrive grouped; element 1 of every hit is used as the
        # document text.
        retrieved_texts = [
            hit[1]
            for group in search_results
            for hit in group
        ]

        response_packs = {}
        for position, text in enumerate(retrieved_texts):
            result_pack = DataPack()
            result_pack.add_entry(
                Document(pack=result_pack, begin=0, end=len(text))
            )
            result_pack.set_text(text)
            response_packs[self.config.response_pack_name[position]] = (
                result_pack
            )

        input_pack.update_pack(response_packs)
Ejemplo n.º 7
0
    def _process(self, input_pack: MultiPack):
        """Search the index with the first query of the pack named "pack" and
        add one pack per retrieved document to the multipack.

        Retrieved documents are stored as "doc_0", "doc_1", ... in retrieval
        order.

        Args:
            input_pack: A multipack containing the query pack.
        """
        query_pack = input_pack.get_pack("pack")
        query_entry = list(query_pack.get_entries(Query))[0]
        search_results = self.index.search(query_entry.value, self.k)

        # Results arrive grouped; element 1 of every hit is used as the
        # document text.
        retrieved_texts = [
            hit[1]
            for group in search_results
            for hit in group
        ]

        response_packs = {}
        for position, text in enumerate(retrieved_texts):
            result_pack = DataPack()
            result_pack.add_entry(
                Document(pack=result_pack, begin=0, end=len(text))
            )
            result_pack.set_text(text)
            response_packs[f"doc_{position}"] = result_pack

        input_pack.update_pack(response_packs)
Ejemplo n.º 8
0
    def _parse_pack(self, doc_info: DocInfoType) -> Iterator[DataPack]:
        # pylint: disable = no-self-use
        """
        Takes the `doc_info` returned by the `_collect` method and returns a
        `data_pack` that either contains entry of the type `Query`, or contains
        entries of the types `Passage` and `Document`.

        For a document, the pack text is the passages joined by `os.linesep`;
        every passage gets a `Passage` entry covering exactly its own text
        inside that joined text, and one `Document` entry covers all of it.

        :param doc_info: document info to be populated in the data_pack, as
            the tuple `(is_query, doc_id, doc_content, rel_docs)`
        :return: query or document data_pack
        """
        data_pack: DataPack = DataPack()

        is_query, doc_id, doc_content, rel_docs = doc_info
        data_pack.meta.doc_id = doc_id

        annotations: List[Union[Passage, Document, Query]] = []

        if not is_query:
            # add passages; each one starts where the previous passage plus
            # its separator ended, so the spans line up with the joined text
            separator = os.linesep
            offset = 0
            for passage in doc_content:
                passage_end = offset + len(passage)
                annotations.append(Passage(data_pack, offset, passage_end))
                offset = passage_end + len(separator)

            doc_text = separator.join(doc_content)
            annotations.append(Document(data_pack, 0, len(doc_text)))
            data_pack.set_text(doc_text)
        else:
            query = Query(data_pack)
            query.query = doc_content[0]
            if rel_docs is not None:
                query.doc_ids = {"relevant_docs": rel_docs}
            annotations = [query]

        # add annotations to data_pack
        for annotation in annotations:
            data_pack.add_or_get_entry(annotation)

        yield data_pack
Ejemplo n.º 9
0
    def _parse_pack(self, input_paths) -> Iterator[PackType]:
        """Read one source-text / XML-annotation file pair into a data pack.

        Args:
            input_paths: A pair ``(src_path, ere_path)``: the path of the raw
                source text and the path of the XML annotation file.

        Yields:
            A single pack holding the source text plus the hoppers, event
            mentions, entity mentions, fillers and event arguments read from
            the XML. The pack name is the ``doc_id`` attribute of the XML
            root.

        Raises:
            KeyError: If an event argument refers to a mention id that is
                found among neither the entity mentions nor the fillers.
        """
        src_path, ere_path = input_paths

        # ET.parse reads the whole document, so the annotation file is closed
        # here instead of staying open while the generator is suspended.
        with open(ere_path) as f:
            root = ET.parse(f).getroot()

        pack = DataPack()

        with open(src_path) as src_text_file:
            src_text = replace_unicode(src_text_file.read())
        self.set_text(pack, src_text)
        # The offsets read from the XML below are applied to this text, so
        # setting it must not have changed its length.
        assert len(src_text) == len(pack.text)

        pack.pack_name = root.get("doc_id")

        # Arguments can only be linked once every mention is known, so they
        # are collected first and resolved after the loop.
        args = []
        ems = {}

        for c1 in root:
            if c1.tag == "hoppers":
                for c2 in c1.iter("hopper"):
                    hopper = Hopper(pack)
                    hopper.id = c2.get("id")

                    for em_node in c2.iter("event_mention"):
                        trigger = em_node.find("trigger")
                        begin = int(trigger.get("offset"))
                        end = begin + int(trigger.get("length"))
                        evm = EventMention(pack, begin, end)
                        evm.event_type = em_node.get(
                            "type") + "_" + em_node.get("subtype")
                        evm.realis = em_node.get("realis")
                        evm.audience = em_node.get("audience")
                        evm.formality = em_node.get("formality")
                        evm.medium = em_node.get("medium")
                        evm.schedule = em_node.get("schedule")
                        evm.id = em_node.get("id")

                        evm_arg = em_node.find("em_arg")

                        if evm_arg is not None:
                            # An argument points at either an entity mention
                            # or a filler.
                            arg_mention_id = evm_arg.get(
                                "entity_mention_id")
                            if arg_mention_id is None:
                                arg_mention_id = evm_arg.get("filler_id")

                            args.append((evm, (
                                arg_mention_id,
                                evm_arg.get("role"),
                                evm_arg.get("realis"),
                                evm_arg.get("id"),
                            )))

                        hopper.add_member(evm)
            elif c1.tag == "entities":
                for c2 in c1.iter("entity"):
                    for em_node in c2.iter("entity_mention"):
                        begin = int(em_node.get("offset"))
                        end = begin + int(em_node.get("length"))
                        em = EntityMention(pack, begin, end)
                        # Type and id are read from the enclosing <entity>
                        # element, not from the <entities> container.
                        # NOTE(review): the id stored here is the entity's,
                        # shared by all of its mentions; confirm this rather
                        # than the mention's own id is what consumers expect.
                        em.ner_type = c2.get("type")
                        em.id = c2.get("id")
                        em.is_filler = False

                        ems[em_node.get("id")] = em
            elif c1.tag == "fillers":
                for filler in c1.iter("filler"):
                    begin = int(filler.get("offset"))
                    end = begin + int(filler.get("length"))
                    em = EntityMention(pack, begin, end)
                    # Type and id are read from the <filler> element itself,
                    # not from the <fillers> container.
                    em.ner_type = filler.get("type")
                    em.id = filler.get("id")
                    em.is_filler = True

                    ems[filler.get("id")] = em

        # Link every collected argument to the mention it refers to.
        for evm, (em_id, role, realis, arg_id) in args:
            em = ems[em_id]
            argument = EventArgument(pack, evm, em)
            argument.role = role
            argument.realis = realis
            argument.id = arg_id

        yield pack
Ejemplo n.º 10
0
 def _parse_pack(self, data_source: str) -> Iterator[DataPack]:
     """Wrap a raw string in a data pack with one utterance over all of it.

     Args:
         data_source: The text to store in the pack.

     Yields:
         A single pack whose text is set from ``data_source``.
     """
     utterance_pack = DataPack()
     self.set_text(utterance_pack, data_source)
     # One Utterance spanning the whole input string.
     Utterance(utterance_pack, begin=0, end=len(data_source)) if False else \
         Utterance(utterance_pack, 0, len(data_source))
     yield utterance_pack