def _create_dependencies(input_pack, tokens, result):
    deps = result['predicted_dependencies']
    heads = result['predicted_heads']
    for i, token in enumerate(tokens):
        # Heads are 1-indexed into `tokens`: tokens[heads[i] - 1] is
        # the parent of the i-th token.
        relation = Dependency(input_pack,
                              parent=tokens[heads[i] - 1],
                              child=token)
        relation.rel_type = deps[i]
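A minimal sketch of the `result` this helper expects. The keys are the ones read above; the sentence and values are invented, and how a given model encodes the root head varies, so treat this purely as a shape illustration:

# Hypothetical parser output for the three tokens of "John sleeps soundly".
result = {
    "predicted_heads": [2, 2, 2],          # 1-indexed into `tokens`
    "predicted_dependencies": ["nsubj", "root", "advmod"],
}
# _create_dependencies(input_pack, tokens, result) would then attach
# tokens[0] and tokens[2] to tokens[1] ("sleeps") with those labels.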
def _process(self, input_pack: DataPack):
    doc = input_pack.text

    if len(doc) == 0:
        logging.warning("Found empty text in doc.")

    # sentence parsing
    sentences = self.nlp(doc).sentences

    # Iterating through stanfordnlp sentence objects
    for sentence in sentences:
        Sentence(
            input_pack,
            sentence.tokens[0].start_char,
            sentence.tokens[-1].end_char,
        )

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            # Iterating through stanfordnlp word objects
            for word in sentence.words:
                misc = word.misc.split("|")

                t_start = -1
                t_end = -1
                for m in misc:
                    k, v = m.split("=")
                    if k == "start_char":
                        t_start = int(v)
                    elif k == "end_char":
                        t_end = int(v)

                if t_start < 0 or t_end < 0:
                    raise ValueError(
                        "Cannot determine word start or end for "
                        "stanfordnlp."
                    )

                token = Token(input_pack, t_start, t_end)

                if "pos" in self.processors:
                    token.pos = word.pos
                    token.ud_xpos = word.xpos

                if "lemma" in self.processors:
                    token.lemma = word.lemma

                tokens.append(token)

        # For each sentence, get the dependency relations among tokens
        if "depparse" in self.processors:
            # Iterating through token entries in current sentence
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                parent = tokens[word.head - 1]  # head token
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.rel_type = word.deprel
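The spans above come from the `start_char`/`end_char` pairs stanfordnlp stores in each word's `misc` field. A standalone sketch of that parsing step; the sample string is hypothetical, but it follows the `key=value|key=value` layout the loop expects:

from typing import Tuple

def parse_char_span(misc: str) -> Tuple[int, int]:
    """Extract (start_char, end_char) from a misc string such as
    "start_char=0|end_char=4"; missing offsets stay -1."""
    t_start, t_end = -1, -1
    for m in misc.split("|"):
        k, v = m.split("=")
        if k == "start_char":
            t_start = int(v)
        elif k == "end_char":
            t_end = int(v)
    return t_start, t_end

assert parse_char_span("start_char=0|end_char=4") == (0, 4)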
def _process(self, input_pack: DataPack):
    doc = input_pack.text
    end_pos = 0

    # sentence parsing
    sentences = self.nlp(doc).sentences  # type: ignore

    # Iterating through stanfordnlp sentence objects
    for sentence in sentences:
        begin_pos = doc.find(sentence.words[0].text, end_pos)
        end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
            sentence.words[-1].text)
        sentence_entry = Sentence(input_pack, begin_pos, end_pos)
        input_pack.add_or_get_entry(sentence_entry)

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            offset = sentence_entry.span.begin
            end_pos_word = 0

            # Iterating through stanfordnlp word objects
            for word in sentence.words:
                begin_pos_word = sentence_entry.text.find(
                    word.text, end_pos_word)
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack,
                              begin_pos_word + offset,
                              end_pos_word + offset)

                if "pos" in self.processors:
                    token.set_fields(pos=word.pos)
                    token.set_fields(upos=word.upos)
                    token.set_fields(xpos=word.xpos)

                if "lemma" in self.processors:
                    token.set_fields(lemma=word.lemma)

                tokens.append(token)
                input_pack.add_or_get_entry(token)

        # For each sentence, get the dependency relations among tokens
        if "depparse" in self.processors:
            # Iterating through token entries in current sentence
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                parent = tokens[word.governor - 1]  # head (governor) token
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.set_fields(
                    rel_type=word.dependency_relation)
                input_pack.add_or_get_entry(relation_entry)
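Unlike the `misc`-based variant, this older version recovers character spans by scanning the raw text with `str.find` and a moving cursor, so a repeated word matches past the previous hit instead of at its first occurrence. A self-contained sketch of that cursor technique (the sentence is invented):

from typing import List, Tuple

def find_spans(text: str, words: List[str]) -> List[Tuple[int, int]]:
    """Locate each word's (begin, end) span, scanning left to right."""
    spans: List[Tuple[int, int]] = []
    cursor = 0
    for w in words:
        begin = text.find(w, cursor)  # search only past the last match
        cursor = begin + len(w)
        spans.append((begin, cursor))
    return spans

# The cursor keeps the second "the" from matching at position 0.
assert find_spans("the cat saw the dog",
                  ["the", "cat", "saw", "the", "dog"]) == \
    [(0, 3), (4, 7), (8, 11), (12, 15), (16, 19)]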
def add_dependency(dep_parent, dep_child, dep_label, dep_type, data_pack_):
    """Adds a dependency to a data pack.

    Args:
        dep_parent: dependency parent token
        dep_child: dependency child token
        dep_label: dependency label
        dep_type: "primary" or "enhanced" dependency
        data_pack_: data pack to which the dependency is to be added
    """
    dependency = Dependency(data_pack_, dep_parent, dep_child)
    dependency.dep_label = dep_label
    dependency.type = dep_type
    data_pack_.add_or_get_entry(dependency)
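A usage sketch, assuming the `DataPack`/`Token` API used elsewhere in this file; the text, spans, and label are invented:

pack = DataPack()
pack.set_text("John sleeps")
child = Token(pack, 0, 4)     # "John"
parent = Token(pack, 5, 11)   # "sleeps", the head
pack.add_or_get_entry(child)
pack.add_or_get_entry(parent)
add_dependency(parent, child, "nsubj", "primary", pack)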
def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
    """Parse the lines of one CoNLL-U document into a DataPack with
    Token, Dependency, EnhancedDependency, Sentence, and Document
    entries.
    """
    token_comp_fields = ["id", "form", "lemma", "pos",
                         "ud_xpos", "ud_features", "head", "label",
                         "enhanced_dependency_relations", "ud_misc"]

    token_multi_fields = ["ud_features", "ud_misc",
                          "enhanced_dependency_relations"]

    token_feature_fields = ["ud_features", "ud_misc"]

    data_pack: DataPack = DataPack()
    doc_sent_begin: int = 0
    doc_num_sent: int = 0
    doc_text: str = ''
    doc_offset: int = 0
    doc_id: str

    sent_text: str
    sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

    for line in doc_lines:
        line = line.strip()
        line_comps = line.split()

        if line.startswith("# newdoc"):
            doc_id = line.split("=")[1].strip()

        elif line.startswith("# sent"):
            sent_text = ''
            # Reset the token map so stale entries from the previous
            # sentence do not leak into this one.
            sent_tokens = {}

        elif len(line_comps) > 0 and \
                line_comps[0].strip().isdigit():
            # token line
            token_comps: Dict[str, Any] = {}

            for index, key in enumerate(token_comp_fields):
                token_comps[key] = str(line_comps[index])

                if key in token_multi_fields:
                    values = str(token_comps[key]).split("|") \
                        if token_comps[key] != '_' else []
                    if key not in token_feature_fields:
                        token_comps[key] = values
                    else:
                        feature_lst = [elem.split('=', 1)
                                       for elem in values]
                        feature_dict = {elem[0]: elem[1]
                                        for elem in feature_lst}
                        token_comps[key] = feature_dict

            word: str = token_comps["form"]
            word_begin = doc_offset
            word_end = doc_offset + len(word)

            # add token
            token: Token = Token(data_pack, word_begin, word_end)
            token.lemma = token_comps['lemma']
            token.pos = token_comps['pos']
            token.ud_xpos = token_comps['ud_xpos']
            token.ud_features = token_comps['ud_features']
            token.ud_misc = token_comps['ud_misc']

            sent_tokens[str(token_comps["id"])] = (token_comps, token)

            sent_text += word + " "
            doc_offset = word_end + 1

        elif line == "":
            # sentence ends
            sent_text = sent_text.strip()
            doc_text += ' ' + sent_text

            # add dependencies for a sentence once all of its tokens
            # have been added
            for token_id in sent_tokens:
                token_comps, token = sent_tokens[token_id]

                # add primary dependency
                label = token_comps["label"]
                if label == "root":
                    token.is_root = True
                else:
                    token.is_root = False
                    head = sent_tokens[token_comps["head"]][1]
                    dependency = Dependency(data_pack, head, token)
                    dependency.dep_label = label

                # add enhanced dependencies
                for dep in token_comps["enhanced_dependency_relations"]:
                    head_id, label = dep.split(":", 1)
                    if label != "root":
                        head = sent_tokens[head_id][1]
                        enhanced_dependency = \
                            EnhancedDependency(data_pack, head, token)
                        enhanced_dependency.dep_label = label

            # add sentence
            Sentence(data_pack, doc_sent_begin, doc_offset - 1)
            doc_sent_begin = doc_offset
            doc_num_sent += 1

    doc_text = doc_text.strip()
    data_pack.set_text(doc_text)

    # add doc to data_pack
    Document(data_pack, 0, len(doc_text))
    data_pack.pack_name = doc_id

    yield data_pack
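A sketch of the input this reader consumes: a `# newdoc` comment, a sentence comment, 10-column token lines matching `token_comp_fields`, and a blank line to close the sentence. The document id and annotations are invented:

doc_lines = [
    "# newdoc id = doc-1",
    "# sent_id = 1",
    "1\tJohn\tJohn\tPROPN\tNNP\t_\t2\tnsubj\t2:nsubj\t_",
    "2\tsleeps\tsleep\tVERB\tVBZ\t_\t0\troot\t0:root\t_",
    "",  # blank line triggers the sentence-end branch
]
# The yielded DataPack carries the text "John sleeps" plus Token,
# Dependency, EnhancedDependency, Sentence, and Document entries;
# the "0:root" enhanced relation is skipped, as in the primary case.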