Example 1
    def _process(self, input_pack: MultiPack):
        input_ids = []
        segment_ids = []

        query_pack = input_pack.get_pack("pack")
        context = [query_pack.text]

        # use context to build the query
        if "user_utterance" in input_pack.pack_names:
            user_pack = input_pack.get_pack("user_utterance")
            context.append(user_pack.text)

        if "bot_utterance" in input_pack.pack_names:
            bot_pack = input_pack.get_pack("bot_utterance")
            context.append(bot_pack.text)

        for text in context:
            t = self.tokenizer.encode_text(text)
            input_ids.append(t[0])
            segment_ids.append(t[1])

        input_ids = torch.LongTensor(input_ids).to(self.device)
        segment_ids = torch.LongTensor(segment_ids).to(self.device)
        _, query_vector = self.get_embeddings(input_ids, segment_ids)
        query_vector = torch.mean(query_vector, dim=0, keepdim=True)
        query_vector = query_vector.cpu().numpy()
        query = Query(pack=query_pack, value=query_vector)
        query_pack.add_or_get_entry(query)
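`encode_text` and `get_embeddings` are not shown in this excerpt; the averaging step assumes `encode_text` returns a `(token_ids, segment_ids)` pair per text and that `get_embeddings` returns a BERT-style `(sequence_output, pooled_output)` tuple with one pooled vector per context text. A minimal sketch of the shapes under those assumptions:

    import torch

    # One pooled embedding per context text; hidden size 768 is assumed.
    pooled = torch.randn(3, 768)
    # Averaging over the context texts yields a single (1, hidden) query
    # vector, as in the processor above.
    query_vector = torch.mean(pooled, dim=0, keepdim=True)
    assert query_vector.shape == (1, 768)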
Example 2
    def pack(self, data_pack: MultiPack, output_dict):
        """
        Write the prediction results back to datapack. If :attr:`_overwrite`
        is `True`, write the predicted ner to the original tokens.
        Otherwise, create a new set of tokens and write the predicted ner
        to the new tokens (usually use this configuration for evaluation.)
        """
        assert output_dict is not None
        output_pack = data_pack.get_pack(self.output_pack_name)

        input_sent_tids = output_dict["input_sents_tids"]
        output_sentences = output_dict["output_sents"]

        text = output_pack.text
        input_pack = data_pack.get_pack(self.input_pack_name)
        for input_id, output_sentence in zip(input_sent_tids, output_sentences):
            offset = len(text)  # offset into the text accumulated so far
            sent = Sentence(output_pack, offset, offset + len(output_sentence))
            output_pack.add_entry(sent)
            text += output_sentence + "\n"

            input_sent = input_pack.get_entry(input_id)
            cross_link = MultiPackLink(
                data_pack, data_pack.subentry(self.input_pack_name, input_sent),
                data_pack.subentry(self.output_pack_name, sent))
            data_pack.add_entry(cross_link)
            # We may also consider adding two links with opposite directions.
            # Here the unidirectional link indicates the generation dependency
        output_pack.set_text(text)
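Note that the offsets must be computed from the accumulated local `text`, not from `output_pack.text`: `set_text` is only called after the loop, so the pack's own text would not grow in between and every sentence would get the same start offset. A small self-contained illustration:

    text = ""
    spans = []
    for sentence in ["Hello world.", "Second sentence."]:
        offset = len(text)
        spans.append((offset, offset + len(sentence)))
        text += sentence + "\n"
    assert spans == [(0, 12), (13, 29)]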
Example 3
    def _get_data_batch(
            self,
            data_pack: MultiPack,
            context_type: Type[Annotation],
            requests: Optional[Dict[Type[Entry], Union[Dict, List]]] = None,
            offset: int = 0) -> Iterable[Tuple[Dict, int]]:
        r"""Try to get batches of size ``batch_size``. If the tail instances
        cannot make up a full batch, will generate a small batch with the tail
        instances.

        Returns:
            An iterator of tuples ``(batch, cnt)``, where ``batch`` is a dict
            containing the requested annotations and context, and ``cnt`` is
            the number of instances in the batch.
        """
        input_pack = data_pack.get_pack(self.input_pack_name)

        instances: List[Dict] = []
        for data in input_pack.get_data(context_type, requests, offset):
            instances.append(data)
            if len(instances) == self.batch_size:
                batch = batch_instances(instances)
                self.batch_is_full = True
                yield (batch, len(instances))
                instances = []
                self.batch_is_full = False

        if instances:
            # Emit the remaining tail instances as a smaller final batch.
            batch = batch_instances(instances)
            self.last_batch = True
            yield (batch, len(instances))
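`batch_instances` is not defined in this excerpt; a plausible reading, consistent with the per-instance dicts collected above, is that it collates a list of dicts into one dict of lists. A hypothetical sketch, not the library's actual implementation:

    from typing import Dict, List

    def batch_instances(instances: List[Dict]) -> Dict:
        # Collate a list of per-instance dicts into a single dict of lists.
        batch: Dict = {}
        for instance in instances:
            for key, value in instance.items():
                batch.setdefault(key, []).append(value)
        return batch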
Example 4
    def _process(self, input_pack: MultiPack):
        r"""Searches ElasticSearch indexer to fetch documents for a query. This
        query should be contained in the input multipack with name
        `self.config.query_pack_name`.

        This method adds new packs to `input_pack` containing the retrieved
        results. Each result is added as a `ft.onto.base_ontology.Document`.

        Args:
             input_pack: A multipack containing the query as one of its packs.
        """
        query_pack = input_pack.get_pack(self.config.query_pack_name)

        # ElasticSearchQueryCreator adds a Query entry to query pack. We now
        # fetch it as the first element.
        first_query = list(query_pack.get_entries(Query))[0]
        results = self.index.search(first_query.value)
        hits = results["hits"]["hits"]
        packs = {}
        for idx, hit in enumerate(hits):
            source = hit["_source"]
            first_query.update_results({source["doc_id"]: hit["_score"]})
            pack = DataPack(doc_id=source["doc_id"])
            content = source[self.config.field]
            document = Document(pack=pack, begin=0, end=len(content))
            pack.add_entry(document)
            pack.set_text(content)
            packs[f"{self.config.response_pack_name_prefix}_{idx}"] = pack

        input_pack.update_pack(packs)
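The loop assumes Elasticsearch's standard search response layout; the fields it reads look roughly like this, where the keys under `_source` depend on how the index was built and `self.config.field` names the text field (`content` here is illustrative):

    results = {
        "hits": {
            "hits": [
                {
                    "_score": 12.3,
                    "_source": {"doc_id": "d1", "content": "First document ..."},
                },
            ]
        }
    }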
Example 5
    def _process(self, input_pack: MultiPack):
        query = input_pack.get_pack(self.in_pack_name).text
        params = '?' + urlencode(
            {
                'api-version': '3.0',
                'from': self.src_language,
                'to': [self.target_language]
            },
            doseq=True)
        microsoft_constructed_url = self.microsoft_translate_url + params

        response = requests.post(microsoft_constructed_url,
                                 headers=self.microsoft_headers,
                                 json=[{
                                     "text": query
                                 }])

        if response.status_code != 200:
            raise RuntimeError(response.json()['error']['message'])

        text = response.json()[0]["translations"][0]["text"]
        pack = DataPack()

        document = Document(pack, 0, len(text))
        utterance = Utterance(pack, 0, len(text))
        pack.add_entry(document)
        pack.add_entry(utterance)

        pack.set_text(text=text)
        input_pack.update_pack({self.out_pack_name: pack})
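For context, this targets the Microsoft Translator v3.0 REST API, whose response body is shaped like `[{"translations": [{"text": ..., "to": ...}]}]`, matching the indexing above; the auth header is typically `Ocp-Apim-Subscription-Key`. A small sketch of how the query parameters are assembled (the language codes are illustrative):

    from urllib.parse import urlencode

    # doseq=True expands the list of target languages into repeated
    # "to=" parameters.
    params = "?" + urlencode(
        {"api-version": "3.0", "from": "en", "to": ["de", "fr"]}, doseq=True)
    assert params == "?api-version=3.0&from=en&to=de&to=fr"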
Example 6
    def _process_query(self, input_pack: MultiPack):
        query_pack = input_pack.get_pack(self.config.query_pack_name)
        context = [query_pack.text]

        # use context to build the query
        if "user_utterance" in input_pack.pack_names:
            user_pack = input_pack.get_pack("user_utterance")
            context.append(user_pack.text)

        if "bot_utterance" in input_pack.pack_names:
            bot_pack = input_pack.get_pack("bot_utterance")
            context.append(bot_pack.text)

        text = ' '.join(context)

        query_vector = self._build_query(text=text)

        return query_pack, query_vector
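The context-gathering block here is the same as in Example 1; if both lived in one codebase it could be factored into a shared helper. A sketch under that assumption:

    def _gather_context(self, input_pack: MultiPack, query_pack) -> str:
        # Start from the query text and append any available user/bot turns.
        context = [query_pack.text]
        for name in ("user_utterance", "bot_utterance"):
            if name in input_pack.pack_names:
                context.append(input_pack.get_pack(name).text)
        return " ".join(context)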
Example 7
    def _process(self, input_pack: MultiPack):
        src_pack = input_pack.get_pack(self.configs.source_pack_name)

        instance: NLIPair
        for instance in src_pack.get(NLIPair):
            premise = instance.get_parent()
            hypo = instance.get_child()

            for i, (new_prem, new_hypo) in enumerate(
                    self.tweak_nli_text(premise, hypo)):
                pack = input_pack.add_pack(f"generated_{i}",
                                           input_pack.pack_name + f"_{i}")
                create_nli(pack, new_prem, new_hypo)
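`NLIPair` is used as a link whose parent is the premise and whose child is the hypothesis. A runnable placeholder illustrating just that convention (not the actual ontology class, whose fields are not shown here):

    class NLIPairSketch:
        """Placeholder mirroring NLIPair's parent/child convention."""
        def __init__(self, parent, child):
            self._parent, self._child = parent, child

        def get_parent(self):
            return self._parent  # the premise

        def get_child(self):
            return self._child  # the hypothesis

    pair = NLIPairSketch("A man is sleeping.", "A person rests.")
    assert pair.get_parent() == "A man is sleeping."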
Example 8
    def _process(self, input_pack: MultiPack):
        query_pack = input_pack.get_pack(self.config.query_pack_name)
        first_query = list(query_pack.get_entries(Query))[0]
        results = self.index.search(first_query.value, self.k)
        documents = [r[1] for result in results for r in result]

        packs = {}
        for i, doc in enumerate(documents):
            pack = DataPack()
            document = Document(pack=pack, begin=0, end=len(doc))
            pack.add_entry(document)
            pack.set_text(doc)
            packs[self.config.response_pack_name[i]] = pack

        input_pack.update_pack(packs)
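The nested comprehension flattens a list of result lists; written out for clarity, assuming each inner element is a tuple whose second item is the document text:

    # Equivalent to: [r[1] for result in results for r in result]
    results = [[(0.9, "doc one"), (0.7, "doc two")], [(0.8, "doc three")]]
    documents = []
    for result in results:
        for r in result:
            documents.append(r[1])
    assert documents == ["doc one", "doc two", "doc three"]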
Example 9
    def _process(self, input_pack: MultiPack):
        query_pack = input_pack.get_pack("pack")
        first_query = list(query_pack.get_entries(Query))[0]
        results = self.index.search(first_query.value, self.k)
        documents = [r[1] for result in results for r in result]

        packs = {}
        for counter, doc in enumerate(documents):
            pack = DataPack()
            document = Document(pack=pack, begin=0, end=len(doc))
            pack.add_entry(document)
            pack.set_text(doc)
            packs[f"doc_{counter}"] = pack

        input_pack.update_pack(packs)
Example 10
    def _process_query(self, input_pack: MultiPack) -> \
            Tuple[DataPack, Dict[str, Any]]:
        query_pack = input_pack.get_pack(self.config.query_pack_name)
        query = self._build_query(text=query_pack.text)
        return query_pack, query
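Both here and in Example 6, `self._build_query` is left undefined. For an Elasticsearch-style backend it might produce a query dict along these lines (a hypothetical sketch; `field` and `size` are assumed config keys, not names from the source):

    from typing import Any, Dict

    def _build_query(self, text: str) -> Dict[str, Any]:
        # A standard Elasticsearch match query over a configured text
        # field, capped at a configured number of hits.
        return {
            "query": {"match": {self.config.field: text}},
            "size": self.config.size,
        }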