Example #1
    def _process(self, input_pack: DataPack):
        page = input_pack.get_single(WikiPage)
        sys.stdout.write(".")
        sys.stdout.flush()
        # Debugging hook: drop into pdb when one specific page id is seen.
        if page.page_id == '729678636':
            import pdb
            pdb.set_trace()
Example #2
def add_info_boxes(pack: DataPack, info_box_statements: List, info_type: str):
    for _, v, o in info_box_statements:
        info_box = WikiInfoBoxMapped(pack)
        info_box.set_key(v.toPython())
        info_box.set_value(get_resource_name(o))
        info_box.set_infobox_type(info_type)
        pack.add_entry(info_box)
Example #3
    def _parse_pack(self, nli_instance) -> Iterator[PackType]:
        pair_id, source, target, label = nli_instance

        pack = DataPack(pair_id)
        text = source + "\n" + target + "\n"
        pack.set_text(text)

        premise = Premise(pack, 0, len(source))
        hypo = Hypothesis(pack, len(source) + 1, len(text) - 1)

        pair = NLIPair(pack)
        pair.set_parent(premise)
        pair.set_child(hypo)

        pair.entailment = {
            "entailment": 0,
            "neutral": 0,
            "contradiction": 0,
        }

        if label == 2:
            pair.entailment["contradiction"] = 1
        elif label == 0:
            pair.entailment["entailment"] = 1
        elif label == 1:
            pair.entailment["neutral"] = 1
        else:
            raise ValueError("Unknown label value.")

        yield pack
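The offset arithmetic above (the premise ends at len(source); the hypothesis starts right after the newline separator) can be checked in isolation. A minimal sketch with made-up sentence strings, independent of the DataPack API:

source = "A man is playing a guitar."
target = "Someone is making music."
text = source + "\n" + target + "\n"

premise_span = (0, len(source))
hypothesis_span = (len(source) + 1, len(text) - 1)

assert text[premise_span[0]:premise_span[1]] == source
assert text[hypothesis_span[0]:hypothesis_span[1]] == target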
Example #4
    def _process(self, input_pack: DataPack):
        title_text = input_pack.get_single(WikiPage).page_name
        title_text = title_text.replace("_", " ")
        new_text = input_pack.text + "\n" + title_text
        title_begin = len(input_pack.text) + 1
        title_end = title_begin + len(title_text)
        input_pack.set_text(new_text)
        WikiArticleTitle(input_pack, title_begin, title_end)
Example #5
def add_property(pack: DataPack, statements: List):
    for _, v, o in statements:
        slot_name = v.toPython()
        slot_value = get_resource_name(o)
        info_box = WikiInfoBoxProperty(pack)
        info_box.set_key(slot_name)
        info_box.set_value(slot_value)
        pack.add_entry(info_box)
Example #6
    def setUp(self) -> None:
        self.data_pack1 = DataPack(doc_id="1")
        self.data_pack2 = DataPack(doc_id="2")
        self.data_pack3 = DataPack(doc_id="Three")
        self.multi_pack = MultiPack()
        self.multi_pack.add_pack(self.data_pack1, pack_name="pack1")
        self.multi_pack.add_pack(self.data_pack2, pack_name="pack2")
        self.multi_pack.add_pack(self.data_pack3, pack_name="pack_three")
Example #7
def add_info_boxes(pack: DataPack, info_box_statements: List):
    for _, v, o in info_box_statements:
        slot_name = v.toPython()
        slot_value = get_resource_name(o)
        info_box = WikiInfoBoxMapped(pack)
        info_box.set_key(slot_name)
        info_box.set_value(slot_value)
        pack.add_entry(info_box)
Example #8
    def __get_info_boxes(self, pack: DataPack) -> Iterable[EntryType]:
        if self.config.infobox_type == "property":
            yield from pack.get(WikiInfoBoxProperty)
        elif self.config.infobox_type == "mapped":
            yield from pack.get(WikiInfoBoxMapped)
        else:
            yield from pack.get(WikiInfoBoxProperty)
            yield from pack.get(WikiInfoBoxMapped)
Example #9
def create_nli(pack: DataPack, premise_text, hypothesis_text):
    text = premise_text + "\n" + hypothesis_text + "\n"
    pack.set_text(text)

    premise = Premise(pack, 0, len(premise_text))
    hypo = Hypothesis(pack, len(premise_text) + 1, len(text) - 1)

    pair = NLIPair(pack)
    pair.set_parent(premise)
    pair.set_child(hypo)
Example #10
    def _process(self, input_pack: DataPack):
        pattern = r"\s+"
        start = 0

        for m in re.finditer(pattern, input_pack.text):
            input_pack.add_entry(Token(input_pack, start, m.start()))
            start = m.end()

        if start < len(input_pack.text):
            input_pack.add_entry(Token(input_pack, start, len(input_pack.text)))
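As a standalone illustration of the span logic above, the same regex loop over a plain string produces these token boundaries; the input string is only an example and no Forte entries are involved.

import re

text = "Forte builds  NLP pipelines."
spans = []
start = 0
for m in re.finditer(r"\s+", text):
    spans.append((start, m.start()))
    start = m.end()
if start < len(text):
    spans.append((start, len(text)))

print([text[b:e] for b, e in spans])  # ['Forte', 'builds', 'NLP', 'pipelines.']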
Example #11
    def _process(self, input_pack: DataPack):
        pattern = r"\.\s*"
        start = 0

        for m in re.finditer(pattern, input_pack.text):
            end = m.end()
            Sentence(input_pack, start, end)
            start = end

        if start < len(input_pack.text):
            input_pack.add_entry(
                Sentence(input_pack, start, len(input_pack.text)))
Example #12
def sentence_clues(src_sent: Sentence, src_page: str, target_pack: DataPack):
    clues = []

    tgt_sent: Sentence
    for tgt_sent in target_pack.get(Sentence):
        bidirectional = False
        for target_anchor in target_pack.get(WikiAnchor, tgt_sent):
            if target_anchor.target_page_name == src_page:
                bidirectional = True
        overlap, all_grams = compute_overlap(src_sent, tgt_sent)
        clues.append((bidirectional, overlap, tgt_sent, all_grams))
    return sorted(clues, reverse=True)
Example #13
    def _process(self, input_pack: DataPack):
        doc = input_pack.text
        end_pos = 0

        # sentence parsing
        sentences = self.nlp(doc).sentences  # type: ignore

        # Iterating through stanfordnlp sentence objects
        for sentence in sentences:
            begin_pos = doc.find(sentence.words[0].text, end_pos)
            end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
                sentence.words[-1].text)
            sentence_entry = Sentence(input_pack, begin_pos, end_pos)
            input_pack.add_or_get_entry(sentence_entry)

            tokens: List[Token] = []
            if "tokenize" in self.processors:
                offset = sentence_entry.span.begin
                end_pos_word = 0

                # Iterating through stanfordnlp word objects
                for word in sentence.words:
                    begin_pos_word = sentence_entry.text. \
                        find(word.text, end_pos_word)
                    end_pos_word = begin_pos_word + len(word.text)
                    token = Token(input_pack, begin_pos_word + offset,
                                  end_pos_word + offset)

                    if "pos" in self.processors:
                        token.set_fields(pos=word.pos)
                        token.set_fields(upos=word.upos)
                        token.set_fields(xpos=word.xpos)

                    if "lemma" in self.processors:
                        token.set_fields(lemma=word.lemma)

                    token = input_pack.add_or_get_entry(token)
                    tokens.append(token)

            # For each sentence, get the dependency relations among tokens
            if "depparse" in self.processors:
                # Iterating through token entries in current sentence
                for token, word in zip(tokens, sentence.words):
                    child = token  # current token
                    parent = tokens[word.governor - 1]  # governor (head) token
                    relation_entry = Dependency(input_pack, parent, child)
                    relation_entry.set_fields(
                        rel_type=word.dependency_relation)

                    input_pack.add_or_get_entry(relation_entry)
Example #14
    def _process(self, input_pack: MultiPack):
        query = input_pack.get_pack(self.in_pack_name).text
        params = '?' + urlencode(
            {
                'api-version': '3.0',
                'from': self.src_language,
                'to': [self.target_language]
            },
            doseq=True)
        microsoft_constructed_url = self.microsoft_translate_url + params

        response = requests.post(microsoft_constructed_url,
                                 headers=self.microsoft_headers,
                                 json=[{
                                     "text": query
                                 }])

        if response.status_code != 200:
            raise RuntimeError(response.json()['error']['message'])

        text = response.json()[0]["translations"][0]["text"]
        pack = DataPack()

        document = Document(pack, 0, len(text))
        utterance = Utterance(pack, 0, len(text))
        pack.add_entry(document)
        pack.add_entry(utterance)

        pack.set_text(text=text)
        input_pack.update_pack({self.out_pack_name: pack})
Example #15
    def _process(self, input_pack: DataPack):
        # pylint: disable=no-self-use
        text = input_pack.text

        begin_pos = 0
        while begin_pos < len(text):
            end_pos = text.find('.', begin_pos)
            if end_pos == -1:
                end_pos = len(text) - 1
            sentence_entry = Sentence(input_pack, begin_pos, end_pos + 1)
            input_pack.add_or_get_entry(sentence_entry)

            begin_pos = end_pos + 1
            while begin_pos < len(text) and text[begin_pos] == " ":
                begin_pos += 1
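A Forte-free sketch of the same splitting loop on a plain string, to show the spans it produces (the input text is illustrative only):

text = "First sentence. Second one.  Third."
begin_pos = 0
sentences = []
while begin_pos < len(text):
    end_pos = text.find('.', begin_pos)
    if end_pos == -1:
        end_pos = len(text) - 1
    sentences.append(text[begin_pos:end_pos + 1])
    begin_pos = end_pos + 1
    while begin_pos < len(text) and text[begin_pos] == " ":
        begin_pos += 1

print(sentences)  # ['First sentence.', 'Second one.', 'Third.']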
Example #16
    def _get_data_batch(self,
                        data_pack: DataPack,
                        context_type: Type[Annotation],
                        requests: Optional[Dict[Type[Entry],
                                                Union[Dict, List]]] = None,
                        offset: int = 0) -> Iterable[Tuple[Dict, int]]:
        """
        Try to get batches from the data pack with ``batch_size``, but yield
        an incomplete batch if the data pack is exhausted.

        Returns:
            An iterator of tuples ``(batch, cnt)``, ``batch`` is a dict
            containing the required annotations and context, and ``cnt`` is
            the number of instances in the batch.
        """
        instances: List[Dict] = []
        for data in data_pack.get_data(context_type, requests, offset):
            instances.append(data)
            if len(instances) == self.batch_size:
                batch = batch_instances(instances)
                # self.instance_num_in_current_batch += len(instances)
                self.batch_is_full = True
                yield (batch, len(instances))
                instances = []
                self.batch_is_full = False

        # Flush the remaining data.
        if len(instances) > 0:
            # self.instance_num_in_current_batch += len(instances)
            batch = batch_instances(instances)
            yield (batch, len(instances))
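The batching pattern itself (fill a buffer of ``batch_size`` items, then flush whatever remains) can be sketched without the DataPack machinery; ``batched`` below is a hypothetical helper for illustration only.

from typing import Iterable, Iterator, List, Tuple

def batched(items: Iterable, batch_size: int) -> Iterator[Tuple[List, int]]:
    buffer: List = []
    for item in items:
        buffer.append(item)
        if len(buffer) == batch_size:
            yield buffer, len(buffer)
            buffer = []
    if buffer:
        # Flush the incomplete final batch.
        yield buffer, len(buffer)

print(list(batched(range(7), 3)))
# [([0, 1, 2], 3), ([3, 4, 5], 3), ([6], 1)]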
Example #17
def get_coref_chains(pack: DataPack) -> List[List[int]]:
    """Collect coreference chains from the pack.

    Args:
        pack: The data pack containing event mentions and hoppers.

    Returns:
        Coref chains, where each chain is a list of mention indices.
    """
    evm_id2index = {}

    for idx, mention in enumerate(all_valid_events(pack)):
        evm_id2index[mention.tid] = idx

    chains: List[List[int]] = []

    hopper: Hopper
    for hopper in pack.get(Hopper):
        chain = []
        for mention in hopper.get_members():
            # Invalid mentions should be removed.
            if mention.tid in evm_id2index:
                idx = evm_id2index[mention.tid]
                chain.append(idx)
        if len(chain) > 1:
            chains.append(sorted(chain))
    return chains
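The chain-building step can be reproduced on plain ids. In this sketch the mention ids and hopper memberships are toy values, and 999 stands in for a mention that was filtered out:

evm_id2index = {tid: idx for idx, tid in enumerate([101, 102, 103, 104])}
hoppers = [[101, 103], [102, 999], [104]]  # toy hopper memberships

chains = []
for members in hoppers:
    chain = sorted(evm_id2index[tid] for tid in members if tid in evm_id2index)
    if len(chain) > 1:
        chains.append(chain)

print(chains)  # [[0, 2]]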
Example #18
    def _process(self, data_pack: DataPack):
        """
        Process the data pack to collect vocabulary information.

        Args:
            data_pack: The NER data to build the vocabulary from.
        """
        # for data_pack in input_pack:
        for instance in data_pack.get_data(context_type=Sentence,
                                           request={
                                               Token: ["chunk", "pos", "ner"]
                                           }):
            for token in instance["Token"]["text"]:
                for char in token:
                    self.char_cnt[char] += 1
                word = self.normalize_func(token)
                self.word_cnt[word] += 1

            for pos in instance["Token"]["pos"]:
                self.pos_cnt[pos] += 1
            for chunk in instance["Token"]["chunk"]:
                self.chunk_cnt[chunk] += 1
            for ner in instance["Token"]["ner"]:
                self.ner_cnt[ner] += 1
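The counting itself relies only on ``collections.Counter``. A minimal sketch with a hand-written instance dict (mirroring the shape assumed from ``get_data``) and ``str.lower`` standing in for ``normalize_func``:

from collections import Counter

word_cnt: Counter = Counter()
char_cnt: Counter = Counter()

instance = {"Token": {"text": ["John", "lives", "in", "Paris"]}}
for token in instance["Token"]["text"]:
    char_cnt.update(token)          # counts individual characters
    word_cnt[token.lower()] += 1    # normalized word counts

print(word_cnt.most_common(2))  # e.g. [('john', 1), ('lives', 1)]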
Example #19
    def _parse_pack(
        self, collection: Tuple[str,
                                Dict[str,
                                     List[state_type]]]) -> Iterator[DataPack]:
        resource_name, info_box_data = collection

        if resource_name in self.redirects:
            resource_name = self.redirects[resource_name]

        if resource_name in self.pack_index:
            print_progress(f'Add infobox to resource: [{resource_name}]')

            pack_path = os.path.join(self.pack_dir,
                                     self.pack_index[resource_name])

            if os.path.exists(pack_path):
                with open(pack_path) as pack_file:
                    pack = DataPack.deserialize(pack_file.read())

                    add_info_boxes(pack, info_box_data['literals'], 'literal')
                    add_info_boxes(pack, info_box_data['objects'], 'object')
                    add_property(pack, info_box_data['properties'])
                    yield pack
        else:
            print_notice(f"Resource {resource_name} is not in the raw packs.")
            self.logger.warning("Resource %s is not in the raw packs.",
                                resource_name)
Example #20
    def _process(self, input_pack: DataPack):
        instance: NLIPair
        for instance in input_pack.get(NLIPair):
            premise = instance.get_parent().text
            hypo = instance.get_child().text
            results = self._nli_inference(premise, hypo)

            for k, v in enumerate(results):
                instance.entailment[self.__id2label[k]] = v
Example #21
    def _process(self, input_pack: DataPack):
        all_anchors = defaultdict(list)
        anchor: WikiAnchor
        for anchor in input_pack.get(WikiAnchor):
            all_anchors[(anchor.span.begin, anchor.span.end)].append(anchor)

        for span in all_anchors.keys():
            l_a: List[WikiAnchor] = all_anchors[span]
            if len(l_a) > 1:
                if len(l_a) > 2:
                    print(input_pack.pack_name, l_a[0].target_page_name,
                          len(l_a))
                    logging.error(
                        "There are links that have more than 2 copies.")
                    import pdb
                    pdb.set_trace()
                for a in l_a[1:]:
                    # Removing duplicates.
                    input_pack.delete_entry(a)
Example #22
    def _process(self, input_pack: DataPack):
        # handle existing entries
        self._process_existing_entries(input_pack)

        for sentence in input_pack.get(Sentence):
            result = self.predictor.predict(sentence=sentence.text)

            if "tokenize" in self.processors:
                # creating new tokens and dependencies
                tokens = self._create_tokens(input_pack, sentence, result)
                if "depparse" in self.processors:
                    self._create_dependencies(input_pack, tokens, result)
Example #23
    def _parse_pack(
        self, doc_data: Tuple[Dict[str, str], Dict[str, List[state_type]]]
    ) -> Iterator[DataPack]:
        str_data, node_data = doc_data

        pack = DataPack()
        doc_name: str = str_data['doc_name']
        if doc_name in self.redirects:
            doc_name = self.redirects[doc_name]

        full_text: str = str_data['text']

        pack.set_text(full_text)
        page = WikiPage(pack, 0, len(full_text))
        pack.add_entry(page)
        page.set_page_id(str_data['oldid'])
        page.set_page_name(doc_name)

        if len(node_data['struct']) > 0:
            add_struct(pack, node_data['struct'])
        else:
            logging.warning('Structure info for %s not found.', doc_name)

        if len(node_data['links']) > 0:
            add_anchor_links(pack, node_data['links'], self.redirects)
        else:
            logging.warning('Links for [%s] not found.', doc_name)

        pack.meta.doc_id = doc_name

        yield pack
Example #24
    def _process(self, input_pack: MultiPack):
        r"""Searches the ElasticSearch indexer to fetch documents for a query.
        The query should be contained in the input multipack under the name
        `self.config.query_pack_name`.

        This method adds new packs to `input_pack` containing the retrieved
        results. Each result is added as a `ft.onto.base_ontology.Document`.

        Args:
             input_pack: A multipack containing query as a pack.
        """
        query_pack = input_pack.get_pack(self.config.query_pack_name)

        # ElasticSearchQueryCreator adds a Query entry to query pack. We now
        # fetch it as the first element.
        first_query = list(query_pack.get_entries(Query))[0]
        results = self.index.search(first_query.value)
        hits = results["hits"]["hits"]
        packs = {}
        for idx, hit in enumerate(hits):
            document = hit["_source"]
            first_query.update_results({document["doc_id"]: hit["_score"]})
            pack = DataPack(doc_id=document["doc_id"])
            content = document[self.config.field]
            document = Document(pack=pack, begin=0, end=len(content))
            pack.add_entry(document)
            pack.set_text(content)
            packs[f"{self.config.response_pack_name_prefix}_{idx}"] = pack

        input_pack.update_pack(packs)
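For reference, the loop above only relies on this shape of search response; the dictionary below is a mocked result (field names follow the snippet, with "content" playing the role of ``self.config.field``):

results = {
    "hits": {
        "hits": [
            {"_source": {"doc_id": "d1", "content": "First document."}, "_score": 1.7},
            {"_source": {"doc_id": "d2", "content": "Second document."}, "_score": 0.9},
        ]
    }
}

for idx, hit in enumerate(results["hits"]["hits"]):
    source = hit["_source"]
    print(idx, source["doc_id"], hit["_score"], len(source["content"]))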
Example #25
    def _process(self, input_pack: DataPack):
        kp = KeywordProcessor(case_sensitive=True)
        anchor_entities = {}
        existing_anchors = set()

        anchor: WikiAnchor
        for anchor in input_pack.get(WikiAnchor):
            kp.add_keyword(anchor.text)
            existing_anchors.add((anchor.span.begin, anchor.span.end))

            try:
                anchor_entities[anchor.text].append(anchor)
            except KeyError:
                anchor_entities[anchor.text] = [anchor]

        for kw, b, e in kp.extract_keywords(input_pack.text, span_info=True):
            targets = anchor_entities[kw]

            if (b, e) in existing_anchors:
                # Ignore existing anchors.
                continue

            copy_from: WikiAnchor
            if len(targets) == 1:
                copy_from = targets[0]
            elif len(targets) > 1:
                latest_ = targets[0]
                for t in targets:
                    if t.begin < b:
                        latest_ = t
                copy_from = latest_
            else:
                raise RuntimeError(f"Unknown target length {len(targets)}")

            anchor = WikiAnchor(input_pack, b, e)
            anchor.target_page_name = copy_from.target_page_name
            anchor.is_external = copy_from.is_external
            input_pack.add_entry(anchor)
Example #26
    def _process(self, input_pack: DataPack):
        self._tbf_out.write(f"#BeginOfDocument {input_pack.pack_name}\n")

        eids: Dict[int, str] = {}
        for i, evm in enumerate(input_pack.get(EventMention)):
            self._tbf_out.write("\t".join([
                self.configs.system_name, input_pack.pack_name, f"E{i}",
                f"{evm.begin},{evm.end}",
                evm.text.replace("\n", ""), evm.event_type, "Actual"
            ]) + "\n")
            eids[evm.tid] = f"E{i}"

        hopper: Hopper
        for i, hopper in enumerate(input_pack.get(Hopper)):
            if len(hopper.get_members()) <= 1:
                continue

            member_text = ",".join(
                [eids[evm.tid] for evm in hopper.get_members()])
            self._tbf_out.write(
                "\t".join(["@Coreference", f"R{i}", member_text]) + "\n")

        self._tbf_out.write("#EndOfDocument\n")
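Writing a mention line reduces to a tab join of seven fields; the values below are placeholders that only illustrate the shape of one output line.

fields = ["my_system", "doc_42", "E0", "15,21", "attack", "Conflict.Attack", "Actual"]
print("\t".join(fields))  # prints one tab-separated mention line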
Example #27
def build_arguments(pack: DataPack):
    all_args: Dict[int, Dict[str, int]] = {}

    argument: EventArgument
    for argument in pack.get(EventArgument):
        evm: EventMention = argument.get_parent()
        arg: EntityMention = argument.get_child()

        try:
            all_args[evm.tid][argument.role] = arg
        except KeyError:
            all_args[evm.tid] = {argument.role: arg}

    return all_args
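The try/except-KeyError grouping above is equivalent to a ``defaultdict(dict)``; a small sketch with placeholder ids and roles:

from collections import defaultdict

all_args = defaultdict(dict)
triples = [(1, "Agent", "arg_a"), (1, "Place", "arg_b"), (2, "Agent", "arg_c")]
for evm_tid, role, arg in triples:
    all_args[evm_tid][role] = arg

print(dict(all_args))
# {1: {'Agent': 'arg_a', 'Place': 'arg_b'}, 2: {'Agent': 'arg_c'}}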
Example #28
def all_valid_events(pack: DataPack) -> List[EventMention]:
    """
    Collect event mentions that fall inside a sentence; events that are not in
    the filtered text are ignored.

    Args:
        pack: The data pack to collect event mentions from.

    Returns:
        The list of valid event mentions.
    """
    all_events: List[EventMention] = []
    for sent in pack.get(Sentence):
        all_events.extend(sent.get(EventMention))
    return all_events
Example #29
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    link_grouped: DefaultDict[str, Dict[str,
                                        rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]
        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if anchor_type not in ('Phrase', 'Word'):
                    logging.warning("Unknown anchor type: %s", info_value)
            if info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                if target_page_name in redirects:
                    target_page_name = redirects[target_page_name]
                anchor.set_target_page_name(target_page_name)
        pack.add_entry(anchor)
Example #30
def get_single(pack: DataPack, entry_type: Type[EntryType]) -> EntryType:
    r"""Take a single entry of type :attr:`entry_type` from the provided data
    pack. This is useful when the target entry type normally appears only once
    in the :class:`DataPack`, e.g., a Document entry.

    Args:
        pack: The provided data pack to take entries from.
        entry_type: The entry type to be retrieved.

    Returns:
        A single data entry.
    """
    for a in pack.get(entry_type):
        return a

    raise EntryNotFoundError(
        f"The entry {entry_type} is not found in the provided data pack.")
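The control flow is the generic "first element or raise" pattern; a self-contained sketch, with the builtin LookupError standing in for EntryNotFoundError:

from typing import Iterable, TypeVar

T = TypeVar("T")

def first_or_raise(items: Iterable[T]) -> T:
    for item in items:
        return item
    raise LookupError("No matching entry found.")

print(first_or_raise(["only-entry"]))  # only-entry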