def _process_parser(self, sentences, input_pack):
    """Parse the document. The default behaviour is to perform sentence
    segmentation, POS tagging, and lemmatization.

    Args:
        sentences: Generator object which yields sentences in the document.
        input_pack: The input pack to be modified in place.
    """
    for sentence in sentences:
        sentence_entry = Sentence(input_pack,
                                  sentence.start_char,
                                  sentence.end_char)
        input_pack.add_or_get_entry(sentence_entry)

        if "tokenize" in self.processors:
            # Iterate through the spaCy token objects in this sentence.
            for word in sentence:
                begin_pos_word = word.idx
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack, begin_pos_word, end_pos_word)

                if "pos" in self.processors:
                    token.set_fields(pos=word.tag_)

                if "lemma" in self.processors:
                    token.set_fields(lemma=word.lemma_)

                input_pack.add_or_get_entry(token)
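
# A minimal sketch of the offset arithmetic `_process_parser` relies on, shown
# with spaCy alone (no Forte). The model name "en_core_web_sm" and the sample
# text are illustrative assumptions, not taken from the code above. Each spaCy
# token carries its document-level character offset in `token.idx`, so spans
# can be built without re-searching the text.
def _demo_spacy_offsets():
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Forte wraps spaCy. It stores character spans.")
    for sent in doc.sents:
        # Sentence spans come straight from start_char / end_char.
        print(sent.start_char, sent.end_char, repr(sent.text))
        for word in sent:
            begin, end = word.idx, word.idx + len(word.text)
            # The slice of the raw text always matches the token text.
            assert doc.text[begin:end] == word.text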
def _create_tokens(self, input_pack, sentence, result):
    """Create `Token` entries from the predicted words and POS tags,
    locating each word's character span within the sentence text."""
    words, pos = result['words'], result['pos']
    tokens = []
    offset = sentence.span.begin
    word_end = 0
    for i, word in enumerate(words):
        # Search from the end of the previous token so that repeated
        # words resolve to distinct offsets.
        word_begin = sentence.text.find(word, word_end)
        word_end = word_begin + len(word)
        token = Token(input_pack, offset + word_begin, offset + word_end)
        if "pos" in self.processors:
            token.set_fields(pos=pos[i])
        tokens.append(token)
        input_pack.add_entry(token)
    return tokens
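
# Why `_create_tokens` threads `word_end` back into `str.find`: starting each
# search at the end of the previous match keeps repeated words from resolving
# to the same (earlier) offset. A self-contained sketch with plain strings
# (the sample sentence is made up for illustration):
def _demo_incremental_find():
    text = "the cat saw the dog"
    words = ["the", "cat", "saw", "the", "dog"]
    end = 0
    for word in words:
        begin = text.find(word, end)   # search only past the previous token
        end = begin + len(word)
        assert text[begin:end] == word
    # Without the moving start position, the second "the" would map back
    # to offset 0.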
def _process(self, input_pack: DataPack):
    doc = input_pack.text
    end_pos = 0

    # Sentence parsing.
    sentences = self.nlp(doc).sentences  # type: ignore

    # Iterate through stanfordnlp sentence objects.
    for sentence in sentences:
        begin_pos = doc.find(sentence.words[0].text, end_pos)
        end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
            sentence.words[-1].text)
        sentence_entry = Sentence(input_pack, begin_pos, end_pos)
        input_pack.add_or_get_entry(sentence_entry)

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            offset = sentence_entry.span.begin
            end_pos_word = 0

            # Iterate through stanfordnlp word objects.
            for word in sentence.words:
                begin_pos_word = sentence_entry.text.find(
                    word.text, end_pos_word)
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack,
                              begin_pos_word + offset,
                              end_pos_word + offset)

                if "pos" in self.processors:
                    token.set_fields(pos=word.pos)
                    token.set_fields(upos=word.upos)
                    token.set_fields(xpos=word.xpos)

                if "lemma" in self.processors:
                    token.set_fields(lemma=word.lemma)

                tokens.append(token)
                input_pack.add_or_get_entry(token)

        # For each sentence, get the dependency relations among tokens.
        if "depparse" in self.processors:
            # Iterate through the token entries in the current sentence.
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                # `governor` is the 1-based index of the head word;
                # 0 denotes the root.
                parent = tokens[word.governor - 1]
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.set_fields(
                    rel_type=word.dependency_relation)
                input_pack.add_or_get_entry(relation_entry)
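
# A caveat worth noting on the dependency block above: stanfordnlp's
# `word.governor` is a 1-based index into the sentence words, with 0 reserved
# for the root, so `tokens[word.governor - 1]` silently wraps root words
# around to tokens[-1]. A sketch of the index convention with plain lists
# (the sentence data is made up for illustration):
def _demo_governor_indexing():
    words = ["She", "slept"]
    governors = [2, 0]          # "She" depends on "slept"; "slept" is root
    for child, gov in zip(words, governors):
        if gov == 0:
            print(child, "<- ROOT")
        else:
            print(child, "<-", words[gov - 1])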
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = DataPack()

    text = ""
    offset = 0
    has_rows = False

    sentence_begin = 0
    sentence_cnt = 0

    with codecs.open(file_path, "r", encoding="utf8") as doc:
        for line in doc:
            line = line.strip()

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[1]
                pos = conll_components[2]
                chunk_id = conll_components[3]
                ner_tag = conll_components[4]

                word_begin = offset
                word_end = offset + len(word)

                # Add the token with its POS, chunk and NER fields.
                kwargs_i = {"pos": pos, "chunk": chunk_id, "ner": ner_tag}
                token = Token(pack, word_begin, word_end)
                token.set_fields(**kwargs_i)
                pack.add_or_get_entry(token)

                text += word + " "
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # Add a sentence covering the rows seen so far.
                sent = Sentence(pack, sentence_begin, offset - 1)
                pack.add_or_get_entry(sent)

                sentence_begin = offset
                sentence_cnt += 1
                has_rows = False

    if has_rows:
        # Add the last sentence if it exists.
        sent = Sentence(pack, sentence_begin, offset - 1)
        sentence_cnt += 1
        pack.add_or_get_entry(sent)

    document = Document(pack, 0, len(text))
    pack.add_or_get_entry(document)

    pack.set_text(text, replace_func=self.text_replace_operation)
    pack.meta.doc_id = file_path

    yield pack
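
# A minimal sketch of the offset bookkeeping in `_parse_pack`: each token
# contributes `len(word) + 1` characters (the word plus the joining space), so
# the running `offset` always equals the length of `text` built so far. The
# CoNLL rows below are made-up examples in the five-column layout the reader
# expects (index, word, POS, chunk, NER).
def _demo_conll_offsets():
    rows = [
        "1 EU NNP I-NP I-ORG",
        "2 rejects VBZ I-VP O",
    ]
    text, offset = "", 0
    for row in rows:
        word = row.split()[1]
        begin, end = offset, offset + len(word)
        text += word + " "
        offset = end + 1
        # The recorded span always slices back out the original word.
        assert text[begin:end] == word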
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = DataPack()
    with open(file_path, encoding="utf8") as doc:
        text = ""
        offset = 0
        has_rows = False

        speaker = part_id = document_id = None
        sentence_begin = 0

        # Auxiliary structures.
        current_entity_mention: Optional[Tuple[int, str]] = None
        verbal_predicates: List[PredicateMention] = []
        current_pred_arg: List[Optional[Tuple[int, str]]] = []
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []
        groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for line in doc:
            line = line.strip()

            if line.startswith("#end document"):
                break

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                document_id = conll_components[0]
                part_id = int(conll_components[1])
                word = conll_components[3]
                pos_tag = conll_components[4]
                lemmatised_word = conll_components[6]
                framenet_id = conll_components[7]
                word_sense = conll_components[8]
                speaker = conll_components[9]
                entity_label = conll_components[10]
                pred_labels = conll_components[11:-1]

                word_begin = offset
                word_end = offset + len(word)

                # Add the token.
                kwargs_i: Dict[str, Any] = {"pos": pos_tag,
                                            "sense": word_sense}
                token = Token(pack, word_begin, word_end)
                token.set_fields(**kwargs_i)
                pack.add_or_get_entry(token)

                # Add entity mentions.
                current_entity_mention = self._process_entity_annotations(
                    pack, entity_label, word_begin, word_end,
                    current_entity_mention
                )

                # Add predicate mentions.
                if lemmatised_word != "-":
                    word_is_verbal_predicate = any(
                        "(V" in x for x in pred_labels
                    )
                    kwargs_i = {
                        "framenet_id": framenet_id,
                        "pred_lemma": lemmatised_word,
                        "pred_type": ("verb" if word_is_verbal_predicate
                                      else "other")
                    }
                    pred_mention = PredicateMention(
                        pack, word_begin, word_end)
                    pred_mention.set_fields(**kwargs_i)
                    pred_mention = pack.add_or_get_entry(pred_mention)

                    if word_is_verbal_predicate:
                        verbal_predicates.append(pred_mention)

                if not verbal_pred_args:
                    current_pred_arg = [None for _ in pred_labels]
                    verbal_pred_args = [[] for _ in pred_labels]

                # Add predicate arguments.
                self._process_pred_annotations(
                    pack,
                    pred_labels,
                    word_begin,
                    word_end,
                    current_pred_arg,
                    verbal_pred_args,
                )

                # Add coreference mentions.
                self._process_coref_annotations(
                    pack,
                    conll_components[-1],
                    word_begin,
                    word_end,
                    coref_stacks,
                    groups,
                )

                text += word + " "
                offset = word_end + 1
                has_rows = True

            else:
                if not has_rows:
                    continue

                # Add predicate links in the sentence.
                for predicate, pred_arg in zip(verbal_predicates,
                                               verbal_pred_args):
                    for arg in pred_arg:
                        kwargs_i = {"arg_type": arg[1]}
                        link = PredicateLink(pack, predicate, arg[0])
                        link.set_fields(**kwargs_i)
                        pack.add_or_get_entry(link)

                verbal_predicates = []
                current_pred_arg = []
                verbal_pred_args = []

                # Add the sentence.
                kwargs_i = {"speaker": speaker, "part_id": part_id}
                sent = Sentence(pack, sentence_begin, offset - 1)
                sent.set_fields(**kwargs_i)
                pack.add_or_get_entry(sent)

                sentence_begin = offset
                has_rows = False

        # Group the coreference mentions in the whole document.
        for _, mention_list in groups.items():
            # kwargs_i = {"coref_type": group_id}
            group = CoreferenceGroup(pack)
            # group.set_fields(**kwargs_i)
            group.add_members(mention_list)
            pack.add_or_get_entry(group)

        document = Document(pack, 0, len(text))
        pack.add_or_get_entry(document)

        kwargs_i = {"doc_id": document_id}
        pack.set_meta(**kwargs_i)
        pack.set_text(text, replace_func=self.text_replace_operation)

        yield pack
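
# A sketch of the bracket-matching idea behind `_process_coref_annotations`
# (whose body is not shown in this section): the last CoNLL column encodes
# coreference spans as bracketed cluster ids such as "(0", "0)", or "(1)",
# and a per-cluster stack pairs each opening bracket with its closing one.
# The column values below are illustrative.
def _demo_coref_brackets():
    from collections import defaultdict

    columns = ["(0", "-", "0)|(1", "1)"]        # one column per word
    stacks = defaultdict(list)                  # cluster id -> open positions
    spans = defaultdict(list)                   # cluster id -> (begin, end)
    for pos, column in enumerate(columns):
        if column == "-":
            continue
        for part in column.split("|"):
            cluster = int(part.strip("()"))
            if part.startswith("("):
                stacks[cluster].append(pos)
            if part.endswith(")"):
                spans[cluster].append((stacks[cluster].pop(), pos))
    print(dict(spans))  # {0: [(0, 2)], 1: [(2, 3)]}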
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = DataPack()
    with open(file_path, encoding="utf8") as doc:
        words = []
        offset = 0
        has_rows = False

        speaker = part_id = document_id = None
        sentence_begin = 0

        # Auxiliary structures.
        current_entity_mention: Optional[Tuple[int, str]] = None
        verbal_predicates: List[PredicateMention] = []
        current_pred_arg: List[Optional[Tuple[int, str]]] = []
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []
        groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for line in doc:
            line = line.strip()

            if line.startswith("#end document"):
                break

            if line != "" and not line.startswith("#"):
                fields = self._parse_line(line)
                speaker = fields.speaker
                if fields.part_number is not None:
                    part_id = int(fields.part_number)
                document_id = fields.document_id

                assert fields.word is not None
                word_begin = offset
                word_end = offset + len(fields.word)

                # Add the token.
                token = Token(pack, word_begin, word_end)
                if fields.pos_tag is not None:
                    token.set_fields(pos=fields.pos_tag)
                if fields.word_sense is not None:
                    token.set_fields(sense=fields.word_sense)
                pack.add_entry(token)

                # Add entity mentions.
                current_entity_mention = self._process_entity_annotations(
                    pack, fields.entity_label, word_begin, word_end,
                    current_entity_mention,
                )

                # Add predicate mentions.
                if (fields.lemmatised_word is not None
                        and fields.lemmatised_word != "-"):
                    word_is_verbal_predicate = any(
                        "(V" in x for x in fields.predicate_labels)
                    kwargs_i = {
                        "pred_lemma": fields.lemmatised_word,
                        "pred_type": ("verb" if word_is_verbal_predicate
                                      else "other"),
                    }
                    pred_mention = PredicateMention(
                        pack, word_begin, word_end)
                    pred_mention.set_fields(**kwargs_i)
                    if fields.framenet_id is not None:
                        pred_mention.set_fields(
                            framenet_id=fields.framenet_id)
                    pack.add_entry(pred_mention)

                    if word_is_verbal_predicate:
                        verbal_predicates.append(pred_mention)

                if not verbal_pred_args:
                    current_pred_arg = [None] * len(
                        fields.predicate_labels)
                    verbal_pred_args = [[] for _ in fields.predicate_labels]

                # Add predicate arguments.
                self._process_pred_annotations(
                    pack,
                    fields.predicate_labels,
                    word_begin,
                    word_end,
                    current_pred_arg,
                    verbal_pred_args,
                )

                # Add coreference mentions.
                self._process_coref_annotations(
                    pack,
                    fields.coreference,
                    word_begin,
                    word_end,
                    coref_stacks,
                    groups,
                )

                words.append(fields.word)
                offset = word_end + 1
                has_rows = True

            else:
                if not has_rows:
                    continue

                # Add predicate links in the sentence.
                for predicate, pred_arg in zip(verbal_predicates,
                                               verbal_pred_args):
                    for arg in pred_arg:
                        kwargs_i = {"arg_type": arg[1]}
                        link = PredicateLink(pack, predicate, arg[0])
                        link.set_fields(**kwargs_i)
                        pack.add_entry(link)

                verbal_predicates = []
                current_pred_arg = []
                verbal_pred_args = []

                # Add the sentence.
                sent = Sentence(pack, sentence_begin, offset - 1)
                if speaker is not None:
                    sent.set_fields(speaker=speaker)
                if part_id is not None:
                    sent.set_fields(part_id=int(part_id))
                pack.add_entry(sent)

                sentence_begin = offset
                has_rows = False

        # Group the coreference mentions in the whole document.
        for _, mention_list in groups.items():
            # kwargs_i = {"coref_type": group_id}
            group = CoreferenceGroup(pack)
            # group.set_fields(**kwargs_i)
            group.add_members(mention_list)
            pack.add_entry(group)

        text = " ".join(words)
        document = Document(pack, 0, len(text))
        pack.add_entry(document)

        if document_id is not None:
            pack.set_meta(doc_id=document_id)
        pack.set_text(text, replace_func=self.text_replace_operation)

        yield pack
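
# The `fields` value consumed above comes from a `_parse_line` helper whose
# body is not shown in this section. Below is a hypothetical sketch of its
# return shape, inferred only from the attribute accesses above; the class
# name is invented and the real helper may differ.
from typing import List, NamedTuple, Optional

class _ParsedLineSketch(NamedTuple):
    document_id: Optional[str]
    part_number: Optional[str]    # column 1; converted with int() above
    word: Optional[str]
    pos_tag: Optional[str]
    lemmatised_word: Optional[str]
    framenet_id: Optional[str]
    word_sense: Optional[str]
    speaker: Optional[str]
    entity_label: Optional[str]
    predicate_labels: List[str]   # one label per predicate column
    coreference: Optional[str]    # last column, e.g. "(0", "0)", or "-"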