def _process(self, input_pack: DataPack):
    page = input_pack.get_single(WikiPage)
    # Show progress with a dot per processed page.
    sys.stdout.write(".")
    sys.stdout.flush()
    if page.page_id == '729678636':
        # Conditional breakpoint for inspecting one specific page.
        import pdb
        pdb.set_trace()
def add_info_boxes(pack: DataPack, info_box_statements: List,
                   info_type: str):
    for _, v, o in info_box_statements:
        info_box = WikiInfoBoxMapped(pack)
        info_box.set_key(v.toPython())
        info_box.set_value(get_resource_name(o))
        info_box.set_infobox_type(info_type)
        pack.add_entry(info_box)
def _parse_pack(self, nli_instance) -> Iterator[PackType]:
    pair_id, source, target, label = nli_instance

    pack = DataPack(pair_id)
    text = source + "\n" + target + "\n"
    pack.set_text(text)

    premise = Premise(pack, 0, len(source))
    hypo = Hypothesis(pack, len(source) + 1, len(text) - 1)

    pair = NLIPair(pack)
    pair.set_parent(premise)
    pair.set_child(hypo)

    pair.entailment = {
        "entailment": 0,
        "neutral": 0,
        "contradiction": 0,
    }

    if label == 2:
        pair.entailment["contradiction"] = 1
    elif label == 0:
        pair.entailment["entailment"] = 1
    elif label == 1:
        pair.entailment["neutral"] = 1
    else:
        raise ValueError("Unknown label value.")

    yield pack
def _process(self, input_pack: DataPack):
    title_text = input_pack.get_single(WikiPage).page_name
    title_text = title_text.replace("_", " ")

    # Append the human-readable title to the end of the pack text and
    # annotate it as the article title.
    new_text = input_pack.text + "\n" + title_text
    title_begin = len(input_pack.text) + 1
    title_end = title_begin + len(title_text)

    input_pack.set_text(new_text)
    WikiArticleTitle(input_pack, title_begin, title_end)
def add_property(pack: DataPack, statements: List):
    for _, v, o in statements:
        slot_name = v.toPython()
        slot_value = get_resource_name(o)

        info_box = WikiInfoBoxProperty(pack)
        info_box.set_key(slot_name)
        info_box.set_value(slot_value)
        pack.add_entry(info_box)
def setUp(self) -> None:
    self.data_pack1 = DataPack(doc_id="1")
    self.data_pack2 = DataPack(doc_id="2")
    self.data_pack3 = DataPack(doc_id="Three")
    self.multi_pack = MultiPack()
    self.multi_pack.add_pack(self.data_pack1, pack_name="pack1")
    self.multi_pack.add_pack(self.data_pack2, pack_name="pack2")
    self.multi_pack.add_pack(self.data_pack3, pack_name="pack_three")
def add_info_boxes(pack: DataPack, info_box_statements: List):
    for _, v, o in info_box_statements:
        slot_name = v.toPython()
        slot_value = get_resource_name(o)

        info_box = WikiInfoBoxMapped(pack)
        info_box.set_key(slot_name)
        info_box.set_value(slot_value)
        pack.add_entry(info_box)
def __get_info_boxes(self, pack: DataPack) -> Iterable[EntryType]:
    if self.config.infobox_type == "property":
        yield from pack.get(WikiInfoBoxProperty)
    elif self.config.infobox_type == "mapped":
        yield from pack.get(WikiInfoBoxMapped)
    else:
        # Default: yield both kinds of info boxes.
        yield from pack.get(WikiInfoBoxProperty)
        yield from pack.get(WikiInfoBoxMapped)
def create_nli(pack: DataPack, premise_text, hypothesis_text):
    text = premise_text + "\n" + hypothesis_text + "\n"
    pack.set_text(text)

    premise = Premise(pack, 0, len(premise_text))
    hypo = Hypothesis(pack, len(premise_text) + 1, len(text) - 1)

    pair = NLIPair(pack)
    pair.set_parent(premise)
    pair.set_child(hypo)
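# Illustrative sketch (not part of the original module) of how create_nli
# would typically be called: build a fresh DataPack, then let create_nli
# lay out the premise on the first line and the hypothesis on the second,
# linked through an NLIPair. The example strings are made up.
def _example_create_nli() -> DataPack:
    pack = DataPack()
    create_nli(pack,
               "A man is playing a guitar.",
               "Someone is making music.")
    return pack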
def _process(self, input_pack: DataPack):
    # Split on runs of whitespace; every span between matches becomes a Token.
    pattern = r"\s+"
    start = 0

    for m in re.finditer(pattern, input_pack.text):
        input_pack.add_entry(Token(input_pack, start, m.start()))
        start = m.end()

    if start < len(input_pack.text):
        input_pack.add_entry(Token(input_pack, start, len(input_pack.text)))
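# Framework-free sketch of the same span logic, using only the standard
# library, to show which character offsets the tokenizer above would
# annotate. The helper name is made up for this example.
import re

def _whitespace_token_spans(text):
    spans = []
    start = 0
    for m in re.finditer(r"\s+", text):
        spans.append((start, m.start()))
        start = m.end()
    if start < len(text):
        spans.append((start, len(text)))
    return spans

# _whitespace_token_spans("a quick  test") -> [(0, 1), (2, 7), (9, 13)]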
def _process(self, input_pack: DataPack):
    # Split sentences on a period followed by optional whitespace.
    pattern = r"\.\s*"
    start = 0

    for m in re.finditer(pattern, input_pack.text):
        end = m.end()
        Sentence(input_pack, start, end)
        start = end

    if start < len(input_pack.text):
        input_pack.add_entry(
            Sentence(input_pack, start, len(input_pack.text)))
def sentence_clues(src_sent: Sentence, src_page: str,
                   target_pack: DataPack):
    clues = []
    tgt_sent: Sentence
    for tgt_sent in target_pack.get(Sentence):
        # A clue is bidirectional if the target sentence links back to the
        # source page.
        bidirectional = False
        for target_anchor in target_pack.get(WikiAnchor, tgt_sent):
            if target_anchor.target_page_name == src_page:
                bidirectional = True
        overlap, all_grams = compute_overlap(src_sent, tgt_sent)
        clues.append((bidirectional, overlap, tgt_sent, all_grams))
    # Rank bidirectional clues first, then by overlap.
    return sorted(clues, reverse=True)
def _process(self, input_pack: DataPack):
    doc = input_pack.text
    end_pos = 0

    # sentence parsing
    sentences = self.nlp(doc).sentences  # type: ignore

    # Iterating through stanfordnlp sentence objects
    for sentence in sentences:
        begin_pos = doc.find(sentence.words[0].text, end_pos)
        end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
            sentence.words[-1].text)
        sentence_entry = Sentence(input_pack, begin_pos, end_pos)
        input_pack.add_or_get_entry(sentence_entry)

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            offset = sentence_entry.span.begin
            end_pos_word = 0

            # Iterating through stanfordnlp word objects
            for word in sentence.words:
                begin_pos_word = sentence_entry.text. \
                    find(word.text, end_pos_word)
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack,
                              begin_pos_word + offset,
                              end_pos_word + offset)

                if "pos" in self.processors:
                    token.set_fields(pos=word.pos)
                    token.set_fields(upos=word.upos)
                    token.set_fields(xpos=word.xpos)

                if "lemma" in self.processors:
                    token.set_fields(lemma=word.lemma)

                token = input_pack.add_or_get_entry(token)
                tokens.append(token)

        # For each sentence, get the dependency relations among tokens
        if "depparse" in self.processors:
            # Iterating through token entries in current sentence
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                parent = tokens[word.governor - 1]  # Root token
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.set_fields(
                    rel_type=word.dependency_relation)
                input_pack.add_or_get_entry(relation_entry)
def _process(self, input_pack: MultiPack):
    query = input_pack.get_pack(self.in_pack_name).text
    params = '?' + urlencode(
        {
            'api-version': '3.0',
            'from': self.src_language,
            'to': [self.target_language]
        },
        doseq=True)
    microsoft_constructed_url = self.microsoft_translate_url + params

    response = requests.post(
        microsoft_constructed_url,
        headers=self.microsoft_headers,
        json=[{"text": query}])

    if response.status_code != 200:
        raise RuntimeError(response.json()['error']['message'])

    text = response.json()[0]["translations"][0]["text"]
    pack = DataPack()

    document = Document(pack, 0, len(text))
    utterance = Utterance(pack, 0, len(text))
    pack.add_entry(document)
    pack.add_entry(utterance)

    pack.set_text(text=text)
    input_pack.update_pack({self.out_pack_name: pack})
def _process(self, input_pack: DataPack):  # pylint: disable=no-self-use
    text = input_pack.text

    begin_pos = 0
    while begin_pos < len(text):
        # Find the next period; if there is none, take the rest of the text.
        end_pos = text.find('.', begin_pos)
        if end_pos == -1:
            end_pos = len(text) - 1
        sentence_entry = Sentence(input_pack, begin_pos, end_pos + 1)
        input_pack.add_or_get_entry(sentence_entry)

        # Skip the spaces after the period before starting the next sentence.
        begin_pos = end_pos + 1
        while begin_pos < len(text) and text[begin_pos] == " ":
            begin_pos += 1
def _get_data_batch(
        self,
        data_pack: DataPack,
        context_type: Type[Annotation],
        requests: Optional[Dict[Type[Entry], Union[Dict, List]]] = None,
        offset: int = 0) -> Iterable[Tuple[Dict, int]]:
    """Try to get batches of size ``batch_size`` from the data pack, and
    yield an incomplete batch if the data pack is exhausted.

    Returns:
        An iterator of tuples ``(batch, cnt)``, where ``batch`` is a dict
        containing the required annotations and context, and ``cnt`` is
        the number of instances in the batch.
    """
    instances: List[Dict] = []
    for data in data_pack.get_data(context_type, requests, offset):
        instances.append(data)
        if len(instances) == self.batch_size:
            batch = batch_instances(instances)
            # self.instance_num_in_current_batch += len(instances)
            self.batch_is_full = True
            yield (batch, len(instances))
            instances = []
            self.batch_is_full = False

    # Flush the remaining data.
    if len(instances) > 0:
        # self.instance_num_in_current_batch += len(instances)
        batch = batch_instances(instances)
        yield (batch, len(instances))
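# Framework-free sketch of the same batching pattern: emit full batches of
# ``batch_size`` and flush whatever is left at the end. The helper name
# ``chunk_instances`` is made up for this illustration.
def chunk_instances(instances, batch_size):
    buffer = []
    for item in instances:
        buffer.append(item)
        if len(buffer) == batch_size:
            yield buffer, len(buffer)
            buffer = []
    if buffer:
        # Incomplete final batch, mirroring the "flush" branch above.
        yield buffer, len(buffer)

# list(chunk_instances(range(5), 2)) -> [([0, 1], 2), ([2, 3], 2), ([4], 1)]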
def get_coref_chains(pack: DataPack) -> List[List[int]]:
    """Collect the coreference chains from the pack.

    Args:
        pack: The data pack containing event mentions and hoppers.

    Returns:
        Coref chains, where each chain is a sorted list of the indices of
        its member mentions.
    """
    evm_id2index = {}
    for idx, mention in enumerate(all_valid_events(pack)):
        evm_id2index[mention.tid] = idx

    chains: List[List[int]] = []
    hopper: Hopper
    for hopper in pack.get(Hopper):
        chain = []
        for mention in hopper.get_members():
            # Invalid mentions should be removed.
            if mention.tid in evm_id2index:
                idx = evm_id2index[mention.tid]
                chain.append(idx)
        if len(chain) > 1:
            chains.append(sorted(chain))

    return chains
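# Sketch of how the chains are typically consumed; the return shape
# (e.g. [[0, 3], [1, 2, 5]]) follows directly from get_coref_chains above.
# ``pack`` is any DataPack carrying Hopper annotations; the helper name is
# made up for this example.
def print_coref_chains(pack: DataPack):
    for chain in get_coref_chains(pack):
        # Each chain lists indices of coreferent event mentions, indexed in
        # the order produced by all_valid_events(pack).
        print("Cluster of size", len(chain), ":", chain)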
def _process(self, data_pack: DataPack):
    """Process the data pack to collect vocabulary information.

    Args:
        data_pack: The NER data to create the vocabulary with.
    """
    for instance in data_pack.get_data(
            context_type=Sentence,
            request={Token: ["chunk", "pos", "ner"]}):
        for token in instance["Token"]["text"]:
            for char in token:
                self.char_cnt[char] += 1
            word = self.normalize_func(token)
            self.word_cnt[word] += 1

        for pos in instance["Token"]["pos"]:
            self.pos_cnt[pos] += 1
        for chunk in instance["Token"]["chunk"]:
            self.chunk_cnt[chunk] += 1
        for ner in instance["Token"]["ner"]:
            self.ner_cnt[ner] += 1
def _parse_pack(
        self, collection: Tuple[str, Dict[str, List[state_type]]]
) -> Iterator[DataPack]:
    resource_name, info_box_data = collection

    if resource_name in self.redirects:
        resource_name = self.redirects[resource_name]

    if resource_name in self.pack_index:
        print_progress(f'Add infobox to resource: [{resource_name}]')

        pack_path = os.path.join(
            self.pack_dir, self.pack_index[resource_name])

        if os.path.exists(pack_path):
            with open(pack_path) as pack_file:
                pack = DataPack.deserialize(pack_file.read())

                add_info_boxes(pack, info_box_data['literals'], 'literal')
                add_info_boxes(pack, info_box_data['objects'], 'object')
                add_property(pack, info_box_data['properties'])

                yield pack
    else:
        print_notice(f"Resource {resource_name} is not in the raw packs.")
        self.logger.warning(
            "Resource %s is not in the raw packs.", resource_name)
def _process(self, input_pack: DataPack):
    instance: NLIPair
    for instance in input_pack.get(NLIPair):
        premise = instance.get_parent().text
        hypo = instance.get_child().text
        results = self._nli_inference(premise, hypo)
        for k, v in enumerate(results):
            instance.entailment[self.__id2label[k]] = v
def _process(self, input_pack: DataPack):
    all_anchors = defaultdict(list)

    anchor: WikiAnchor
    for anchor in input_pack.get(WikiAnchor):
        all_anchors[(anchor.span.begin, anchor.span.end)].append(anchor)

    for span in all_anchors.keys():
        l_a: List[WikiAnchor] = all_anchors[span]
        if len(l_a) > 1:
            if len(l_a) > 2:
                print(input_pack.pack_name,
                      l_a[0].target_page_name, len(l_a))
                logging.error(
                    "There are links that have more than 2 copies.")
                import pdb
                pdb.set_trace()
            for a in l_a[1:]:
                # Removing duplicates.
                input_pack.delete_entry(a)
def _process(self, input_pack: DataPack):
    # handle existing entries
    self._process_existing_entries(input_pack)

    for sentence in input_pack.get(Sentence):
        result = self.predictor.predict(sentence=sentence.text)

        if "tokenize" in self.processors:
            # creating new tokens and dependencies
            tokens = self._create_tokens(input_pack, sentence, result)
            if "depparse" in self.processors:
                self._create_dependencies(input_pack, tokens, result)
def _parse_pack(
        self, doc_data: Tuple[Dict[str, str], Dict[str, List[state_type]]]
) -> Iterator[DataPack]:
    str_data, node_data = doc_data

    pack = DataPack()
    doc_name: str = str_data['doc_name']
    if doc_name in self.redirects:
        doc_name = self.redirects[doc_name]

    full_text: str = str_data['text']

    pack.set_text(full_text)
    page = WikiPage(pack, 0, len(full_text))
    pack.add_entry(page)
    page.set_page_id(str_data['oldid'])
    page.set_page_name(doc_name)

    if len(node_data['struct']) > 0:
        add_struct(pack, node_data['struct'])
    else:
        logging.warning('Structure info for %s not found.', doc_name)

    if len(node_data['links']) > 0:
        add_anchor_links(pack, node_data['links'], self.redirects)
    else:
        logging.warning('Links for [%s] not found.', doc_name)

    pack.meta.doc_id = doc_name

    yield pack
def _process(self, input_pack: MultiPack): r"""Searches ElasticSearch indexer to fetch documents for a query. This query should be contained in the input multipack with name `self.config.query_pack_name`. This method adds new packs to `input_pack` containing the retrieved results. Each result is added as a `ft.onto.base_ontology.Document`. Args: input_pack: A multipack containing query as a pack. """ query_pack = input_pack.get_pack(self.config.query_pack_name) # ElasticSearchQueryCreator adds a Query entry to query pack. We now # fetch it as the first element. first_query = list(query_pack.get_entries(Query))[0] results = self.index.search(first_query.value) hits = results["hits"]["hits"] packs = {} for idx, hit in enumerate(hits): document = hit["_source"] first_query.update_results({document["doc_id"]: hit["_score"]}) pack = DataPack(doc_id=document["doc_id"]) content = document[self.config.field] document = Document(pack=pack, begin=0, end=len(content)) pack.add_entry(document) pack.set_text(content) packs[f"{self.config.response_pack_name_prefix}_{idx}"] = pack input_pack.update_pack(packs)
def _process(self, input_pack: DataPack):
    kp = KeywordProcessor(case_sensitive=True)

    anchor_entities = {}
    existing_anchors = set()

    anchor: WikiAnchor
    for anchor in input_pack.get(WikiAnchor):
        kp.add_keyword(anchor.text)
        existing_anchors.add((anchor.span.begin, anchor.span.end))

        try:
            anchor_entities[anchor.text].append(anchor)
        except KeyError:
            anchor_entities[anchor.text] = [anchor]

    for kw, b, e in kp.extract_keywords(input_pack.text, span_info=True):
        targets = anchor_entities[kw]

        if (b, e) in existing_anchors:
            # Ignore existing anchors.
            continue

        copy_from: WikiAnchor
        if len(targets) == 1:
            copy_from = targets[0]
        elif len(targets) > 1:
            # Multiple candidate anchors: copy from the closest one that
            # appears before this position.
            latest_ = targets[0]
            for t in targets:
                if t.begin < b:
                    latest_ = t
            copy_from = latest_
        else:
            raise RuntimeError(f"Unknown target length {len(targets)}")

        anchor = WikiAnchor(input_pack, b, e)
        anchor.target_page_name = copy_from.target_page_name
        anchor.is_external = copy_from.is_external
        input_pack.add_entry(anchor)
def _process(self, input_pack: DataPack):
    self._tbf_out.write(f"#BeginOfDocument {input_pack.pack_name}\n")

    eids: Dict[int, str] = {}
    for i, evm in enumerate(input_pack.get(EventMention)):
        self._tbf_out.write("\t".join([
            self.configs.system_name,
            input_pack.pack_name,
            f"E{i}",
            f"{evm.begin},{evm.end}",
            evm.text.replace("\n", ""),
            evm.event_type,
            "Actual",
        ]) + "\n")
        eids[evm.tid] = f"E{i}"

    hopper: Hopper
    for i, hopper in enumerate(input_pack.get(Hopper)):
        if len(hopper.get_members()) <= 1:
            continue
        member_text = ",".join(
            [eids[evm.tid] for evm in hopper.get_members()])
        self._tbf_out.write(
            "\t".join(["@Coreference", f"R{i}", member_text]) + "\n")

    self._tbf_out.write("#EndOfDocument\n")
def build_arguments(pack: DataPack):
    # Map each event mention's tid to its arguments, keyed by role.
    all_args: Dict[int, Dict[str, EntityMention]] = {}

    argument: EventArgument
    for argument in pack.get(EventArgument):
        evm: EventMention = argument.get_parent()
        arg: EntityMention = argument.get_child()
        try:
            all_args[evm.tid][argument.role] = arg
        except KeyError:
            all_args[evm.tid] = {argument.role: arg}

    return all_args
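# Sketch of how the argument table is typically used downstream: look up
# the arguments of a specific EventMention by its tid. The helper name is
# made up for this example; ``pack`` is any DataPack with EventArgument
# links.
def event_arguments_of(pack: DataPack, evm: EventMention):
    all_args = build_arguments(pack)
    # Returns a role -> EntityMention mapping, empty if this event has no
    # recorded arguments.
    return all_args.get(evm.tid, {})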
def all_valid_events(pack: DataPack) -> List[EventMention]:
    """Collect the event mentions that fall inside sentences. Some events
    are not in the filtered text, and we ignore them.

    Args:
        pack: The data pack to read event mentions from.

    Returns:
        The list of valid event mentions.
    """
    all_events: List[EventMention] = []
    for sent in pack.get(Sentence):
        all_events.extend(sent.get(EventMention))
    return all_events
def add_anchor_links(pack: DataPack, text_link_statements: List[state_type],
                     redirects: Dict[str, str]):
    link_grouped: DefaultDict[
        str, Dict[str, rdflib.term.Node]] = defaultdict(dict)
    for nif_range, rel, info in text_link_statements:
        range_ = get_resource_attribute(nif_range, 'char')
        r = get_resource_fragment(rel)
        link_grouped[range_][r] = info

    for range_, link_infos in link_grouped.items():
        begin, end = [int(d) for d in range_.split(',')]
        anchor = WikiAnchor(pack, begin, end)
        for info_key, info_value in link_infos.items():
            if info_key == 'type':
                anchor_type = get_resource_fragment(info_value)
                if not anchor_type == 'Phrase' and not anchor_type == 'Word':
                    logging.warning("Unknown anchor type: %s", info_value)
            if info_key == 'taIdentRef':
                target_page_name = get_resource_name(info_value)
                if target_page_name in redirects:
                    target_page_name = redirects[target_page_name]
                anchor.set_target_page_name(target_page_name)
        pack.add_entry(anchor)
def get_single(pack: DataPack, entry_type: Type[EntryType]) -> EntryType:
    r"""Take a single entry of type :attr:`entry_type` from the provided
    data pack. This is useful when the target entry type normally appears
    only once in the :class:`DataPack`, e.g., a Document entry.

    Args:
        pack: The provided data pack to take entries from.
        entry_type: The entry type to be retrieved.

    Returns:
        A single data entry.
    """
    for a in pack.get(entry_type):
        return a

    raise EntryNotFoundError(
        f"The entry {entry_type} is not found in the provided data pack.")
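# Usage sketch: get_single returns the first (and typically only) entry of
# the requested type, or raises EntryNotFoundError when none is present.
# WikiPage is used here because the processors above assume one page per
# pack; the helper name is made up for this example.
def page_title(pack: DataPack) -> str:
    page = get_single(pack, WikiPage)
    return page.page_name.replace("_", " ")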