def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = self.new_pack()
    doc = codecs.open(file_path, "r", encoding="utf8")

    text = ""
    offset = 0
    has_rows = False

    sentence_begin = 0
    sentence_cnt = 0

    for line in doc:
        line = line.strip()

        if line != "" and not line.startswith("#"):
            conll_components = line.split()

            word = conll_components[1]
            pos = conll_components[2]
            chunk_id = conll_components[3]
            ner_tag = conll_components[4]

            word_begin = offset
            word_end = offset + len(word)

            # Add tokens.
            token = Token(pack, word_begin, word_end)
            token.pos = pos
            token.chunk = chunk_id
            token.ner = ner_tag

            text += word + " "
            offset = word_end + 1
            has_rows = True
        else:
            if not has_rows:
                # Skip consecutive empty lines.
                continue
            # add sentence
            Sentence(pack, sentence_begin, offset - 1)

            sentence_begin = offset
            sentence_cnt += 1
            has_rows = False

    if has_rows:
        # Add the last sentence if exists.
        Sentence(pack, sentence_begin, offset - 1)
        sentence_cnt += 1

    pack.set_text(text, replace_func=self.text_replace_operation)
    Document(pack, 0, len(text))
    pack.pack_name = file_path
    doc.close()

    yield pack
def _process(self, input_pack: DataPack): pattern = "\\.\\s*" start = 0 for m in re.finditer(pattern, input_pack.text): end = m.end() Sentence(input_pack, start, end) start = end if start < len(input_pack.text): input_pack.add_entry( Sentence(input_pack, start, len(input_pack.text)))
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    data_pack: DataPack = self.new_pack()

    sent_begin: int = 0
    doc_text: str = ""

    with open(file_path, encoding="utf8") as doc:
        for para in doc:
            para = self.preprocess_reviews(para)
            sents = para.split("\n")
            for sent in sents:
                if len(sent) > 0:
                    sent = sent.strip()
                    doc_text += sent + " "
                    doc_offset = sent_begin + len(sent) + 1

                    # Add sentences.
                    Sentence(data_pack, sent_begin, doc_offset - 1)
                    sent_begin = doc_offset

    pos_dir: str = os.path.basename(os.path.dirname(file_path))
    movie_file: str = os.path.basename(file_path)
    title: List = movie_file.split('_')
    doc_id: str = pos_dir + title[0]
    score: float = float(title[1].split('.')[0])
    score /= 10.0

    data_pack.pack_name = doc_id
    data_pack.set_text(doc_text)

    # Add documents.
    document: Document = Document(data_pack, 0, len(doc_text))
    document.sentiment = {doc_id: score}

    yield data_pack
def _parse_pack(self, sent_lines) -> Iterator[DataPack]:
    data_pack: DataPack = DataPack()
    sent_bias: int = 0

    batch_text: str = "\n".join(
        [sent_text for _, sent_text, _ in sent_lines]
    )
    data_pack.set_text(batch_text)

    for i, sent_line in enumerate(sent_lines):
        sent_id: str = sent_line[0]
        sent_text: str = sent_line[1].strip()
        parent_pointer_list: List[int] = sent_line[2]

        # Name the data_pack with the first sentence id.
        if i == 0:
            data_pack.pack_name = sent_id

        # Add sentence to data_pack.
        Sentence(data_pack, sent_bias, sent_bias + len(sent_text))

        self._parse_parent_pointer_list(
            data_pack, sent_bias, sent_text, parent_pointer_list
        )

        sent_bias += len(sent_text) + 1

    yield data_pack
def _process_parser(self, sentences, input_pack: DataPack):
    """Parse the sentences. The default behaviour is to segment sentences,
    POS tag and lemmatize.

    Args:
        sentences: Generator object which yields sentences in the document.
        input_pack: Input pack which needs to be modified.
    """
    for sentence in sentences:
        Sentence(input_pack, sentence.start_char, sentence.end_char)

        if "tokenize" in self.processors:
            # Iterating through spaCy token objects
            for word in sentence:
                begin_pos_word = word.idx
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack, begin_pos_word, end_pos_word)

                if "pos" in self.processors:
                    token.pos = word.tag_

                if "lemma" in self.processors:
                    token.lemma = word.lemma_
def pack(self, data_pack: MultiPack, output_dict):
    """Write the prediction results back to the MultiPack. Each generated
    sentence is appended to the text of the output pack, added as a new
    `Sentence`, and linked back to its source sentence in the input pack
    via a `MultiPackLink`.
    """
    assert output_dict is not None
    output_pack = data_pack.get_pack(self.output_pack_name)

    input_sent_tids = output_dict["input_sents_tids"]
    output_sentences = output_dict["output_sents"]

    text = output_pack.text
    input_pack = data_pack.get_pack(self.input_pack_name)

    for input_id, output_sentence in zip(input_sent_tids, output_sentences):
        # Track the offset in the accumulated text; the pack text itself is
        # only updated by set_text() at the end.
        offset = len(text)
        sent = Sentence(output_pack, offset, offset + len(output_sentence))
        output_pack.add_entry(sent)
        text += output_sentence + "\n"

        input_sent = input_pack.get_entry(input_id)
        cross_link = MultiPackLink(
            data_pack,
            data_pack.subentry(self.input_pack_name, input_sent),
            data_pack.subentry(self.output_pack_name, sent))
        data_pack.add_entry(cross_link)
        # We may also consider adding two links with opposite directions.
        # Here the unidirectional link indicates the generation dependency.

    output_pack.set_text(text)
def _parse_pack(self, file_path: str) -> Iterator[MultiPack]:
    m_pack: MultiPack = MultiPack()

    input_pack_name = self.config.input_pack_name
    output_pack_name = self.config.output_pack_name

    text = ""
    offset = 0

    with open(file_path, "r", encoding="utf8") as doc:
        input_pack = DataPack(doc_id=file_path)

        for line in doc:
            line = line.strip()
            if len(line) == 0:
                continue

            # add sentence
            sent = Sentence(input_pack, offset, offset + len(line))
            input_pack.add_entry(sent)
            text += line + '\n'
            offset = offset + len(line) + 1

    input_pack.set_text(text, replace_func=self.text_replace_operation)

    output_pack = DataPack()

    m_pack.update_pack({
        input_pack_name: input_pack,
        output_pack_name: output_pack
    })

    yield m_pack
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack: DataPack = self.new_pack()
    text: str = ""
    offset: int = 0

    with open(file_path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()

            if line != "":
                line_list1 = line.split("\"sentenceTokens\":")
                line_list2 = line_list1[1].split(",\"verbEntries\"")
                sentence = line_list2[0]

                # Add sentence.
                temp_offset = offset
                temp_offset = temp_offset + len(line_list1[0])
                temp_offset = temp_offset + len("\"sentenceTokens\":")
                Sentence(pack, temp_offset, temp_offset + len(sentence))

                # For \n
                offset += len(line) + 1
                text += line + " "

    pack.set_text(text, replace_func=self.text_replace_operation)
    Document(pack, 0, len(text))
    pack.pack_name = file_path

    yield pack
def _parse_pack(self, base_and_path: Tuple[str, str]) -> Iterator[MultiPack]:
    base_dir, file_path = base_and_path

    m_pack: MultiPack = MultiPack()

    input_pack_name = self.config.input_pack_name
    output_pack_name = self.config.output_pack_name

    text = ""
    offset = 0

    with open(file_path, "r", encoding="utf8") as doc:
        # Remove the long base directory prefix from the beginning.
        if file_path.startswith(base_dir):
            doc_id = file_path[len(base_dir):]
        else:
            doc_id = file_path
        doc_id = doc_id.strip(os.path.sep)

        input_pack = m_pack.add_pack(input_pack_name)
        input_pack.doc_id = doc_id

        for line in doc:
            line = line.strip()
            if len(line) == 0:
                continue

            # add sentence
            Sentence(input_pack, offset, offset + len(line))
            text += line + '\n'
            offset = offset + len(line) + 1

        input_pack.set_text(text, replace_func=self.text_replace_operation)

    # Create an output pack without text.
    m_pack.add_pack(output_pack_name)

    yield m_pack
def _process(self, input_pack: DataPack):
    pattern = '\\.\\s*'
    start = 0

    for m in re.finditer(pattern, input_pack.text):
        end = m.end()
        Sentence(input_pack, start, end)
        start = end
def _process(self, input_pack: DataPack):
    doc = input_pack.text

    if len(doc) == 0:
        logging.warning("Find empty text in doc.")

    # sentence parsing
    sentences = self.nlp(doc).sentences

    # Iterating through stanfordnlp sentence objects
    for sentence in sentences:
        Sentence(
            input_pack,
            sentence.tokens[0].start_char,
            sentence.tokens[-1].end_char,
        )

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            # Iterating through stanfordnlp word objects
            for word in sentence.words:
                misc = word.misc.split("|")

                t_start = -1
                t_end = -1

                for m in misc:
                    k, v = m.split("=")
                    if k == "start_char":
                        t_start = int(v)
                    elif k == "end_char":
                        t_end = int(v)

                if t_start < 0 or t_end < 0:
                    raise ValueError(
                        "Cannot determine word start or end for "
                        "stanfordnlp."
                    )

                token = Token(input_pack, t_start, t_end)

                if "pos" in self.processors:
                    token.pos = word.pos
                    token.ud_xpos = word.xpos

                if "lemma" in self.processors:
                    token.lemma = word.lemma

                tokens.append(token)

        # For each sentence, get the dependency relations among tokens
        if "depparse" in self.processors:
            # Iterating through token entries in current sentence
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                parent = tokens[word.head - 1]  # head token
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.rel_type = word.deprel
def build_ngram(sent: Sentence, n: int):
    # Should exclude light words from ngrams.
    if n == 1:
        return [[t] for t in sent.get(Token)]

    ngrams = []
    ngram = []
    for t in sent.get(Token):
        ngram.append(t)
        if len(ngram) == n:
            ngrams.append(ngram)
            # Slide the window forward by one token.
            ngram = ngram[1:]
    return ngrams
def test_back_translation(self):
    random.seed(0)

    data_pack = DataPack()
    text = "Natural Language Processing has never been made this simple!"
    data_pack.set_text(text)
    sent = Sentence(data_pack, 0, len(text))
    data_pack.add_entry(sent)

    translated_text = "The treatment of natural language has never been easier!"
    assert translated_text == self.bta.replace(sent)[1]
def _parse_pack(self, file_path: str) -> Iterator[DataPack]: pack: DataPack = DataPack() text: str = "" offset: int = 0 with open(file_path, "r", encoding="utf8") as f: for line in f: line = line.strip() if line != "": oie_component: List[str] = line.split("\t") sentence: str = oie_component[0] # Add sentence. Sentence(pack, offset, offset + len(sentence)) offset += len(sentence) + 1 text += sentence + " " head_predicate: str = oie_component[1] full_predicate: str = oie_component[2] # Add head predicate. token: Token = Token(pack, offset, offset + len(head_predicate)) offset += len(head_predicate) + 1 text += head_predicate + " " # Add full predicate. predicate_mention: PredicateMention = PredicateMention(pack, offset, offset + len(full_predicate)) predicate_mention.headword = token offset += len(full_predicate) + 1 text += full_predicate + " " for arg in oie_component[3:]: # Add predicate argument. predicate_arg: PredicateArgument = \ PredicateArgument(pack, offset, offset + len(arg)) offset += len(arg) + 1 text += arg + " " # Add predicate link. PredicateLink(pack, predicate_mention, predicate_arg) pack.set_text(text, replace_func=self.text_replace_operation) Document(pack, 0, len(text)) pack.pack_name = file_path yield pack
def _process(self, input_pack: DataPack):
    text = input_pack.text

    end_pos = 0
    paragraphs = [p for p in text.split('\n') if p]
    for paragraph in paragraphs:
        sentences = sent_tokenize(paragraph)
        for sentence_text in sentences:
            begin_pos = text.find(sentence_text, end_pos)
            end_pos = begin_pos + len(sentence_text)
            sentence_entry = Sentence(input_pack, begin_pos, end_pos)
            input_pack.add_or_get_entry(sentence_entry)
def _parse_pack(self, file_path: str) -> Iterator[DataPack]: with open(file_path, "r", encoding="utf8") as doc: for line in doc: pack = DataPack(doc_id=file_path) line = line.strip() if len(line) == 0: continue sent = Sentence(pack, 0, len(line)) pack.add_entry(sent) pack.set_text(line) self.count += 1 yield pack
def _process(self, input_pack: DataPack):
    doc = input_pack.text
    end_pos = 0

    # sentence parsing
    sentences = self.nlp(doc).sentences  # type: ignore

    # Iterating through stanfordnlp sentence objects
    for sentence in sentences:
        begin_pos = doc.find(sentence.words[0].text, end_pos)
        end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
            sentence.words[-1].text)
        sentence_entry = Sentence(input_pack, begin_pos, end_pos)
        input_pack.add_or_get_entry(sentence_entry)

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            offset = sentence_entry.span.begin
            end_pos_word = 0

            # Iterating through stanfordnlp word objects
            for word in sentence.words:
                begin_pos_word = sentence_entry.text. \
                    find(word.text, end_pos_word)
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack,
                              begin_pos_word + offset,
                              end_pos_word + offset)

                if "pos" in self.processors:
                    token.set_fields(pos=word.pos)
                    token.set_fields(upos=word.upos)
                    token.set_fields(xpos=word.xpos)

                if "lemma" in self.processors:
                    token.set_fields(lemma=word.lemma)

                tokens.append(token)
                input_pack.add_or_get_entry(token)

        # For each sentence, get the dependency relations among tokens
        if "depparse" in self.processors:
            # Iterating through token entries in current sentence
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                parent = tokens[word.governor - 1]  # head token
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.set_fields(
                    rel_type=word.dependency_relation)
                input_pack.add_or_get_entry(relation_entry)
def _process(self, input_pack: MultiPack):
    from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
    copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

    copy_pack.set_text(from_pack.text)

    if from_pack.pack_name is not None:
        copy_pack.pack_name = from_pack.pack_name + '_copy'
    else:
        copy_pack.pack_name = 'copy'

    s: Sentence
    for s in from_pack.get(Sentence):
        Sentence(copy_pack, s.begin, s.end)
def _parse_pack(self, file_path: str) -> Iterator[DataPack]: # type: ignore with open(file_path, "r", encoding="utf8") as doc: for line in doc: line = line.strip() if len(line) == 0: continue m_pack = MultiPack() pack = m_pack.add_pack("pack") pack.set_text(line) Sentence(pack, 0, len(line)) self.count += 1 yield m_pack # type: ignore
def _parse_pack(self, file_path: str) -> Iterator[DataPack]: # type: ignore with open(file_path, "r", encoding="utf8") as doc: for line in doc: m_pack = MultiPack() pack = DataPack(doc_id=file_path) line = line.strip() if len(line) == 0: continue sent = Sentence(pack, 0, len(line)) pack.add_entry(sent) pack.set_text(line) self.count += 1 m_pack.update_pack({"pack": pack}) yield m_pack # type: ignore
def _process(self, input_pack: DataPack):  # pylint: disable=no-self-use
    text = input_pack.text

    begin_pos = 0
    while begin_pos < len(text):
        end_pos = text.find('.', begin_pos)
        if end_pos == -1:
            end_pos = len(text) - 1
        sentence_entry = Sentence(input_pack, begin_pos, end_pos + 1)
        input_pack.add_or_get_entry(sentence_entry)

        begin_pos = end_pos + 1
        while begin_pos < len(text) and text[begin_pos] == " ":
            begin_pos += 1
def _process(self, input_pack: MultiPack): from_pack: DataPack = input_pack.get_pack(self.configs.copy_from) copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to) copy_pack.set_text(from_pack.text) if from_pack.pack_name is not None: copy_pack.pack_name = from_pack.pack_name + "_copy" else: copy_pack.pack_name = "copy" s: Sentence for s in from_pack.get(Sentence): Sentence(copy_pack, s.begin, s.end) e: EntityMention for e in from_pack.get(EntityMention): EntityMention(copy_pack, e.begin, e.end)
def _parse_pack(self, file_path: str) -> Iterator[MultiPack]: m_pack: MultiPack = MultiPack() input_pack_name = "input_src" output_pack_name = "output_tgt" with open(file_path, "r", encoding="utf8") as doc: text = "" offset = 0 sentence_cnt = 0 input_pack = DataPack(doc_id=file_path) for line in doc: line = line.strip() if len(line) == 0: # skip empty lines continue # add sentence sent = Sentence(input_pack, offset, offset + len(line)) input_pack.add_entry(sent) text += line + '\n' offset = offset + len(line) + 1 sentence_cnt += 1 if sentence_cnt >= 20: break input_pack.set_text(text, replace_func=self.text_replace_operation) output_pack = DataPack() m_pack.update_pack({ input_pack_name: input_pack, output_pack_name: output_pack }) yield m_pack
def _parse_pack(self, file_path: str) -> Iterator[DataPack]: pack: DataPack = DataPack() text: str = "" offset: int = 0 with open(file_path, "r", encoding="utf8") as f: for line in f: line = line.strip() if line != "": oie_component: List[str] = line.split("\t") # Add sentence. sentence = oie_component[0] text += sentence + "\n" Sentence(pack, offset, offset + len(sentence)) # Find argument 1. arg1_begin = sentence.find(oie_component[3]) + offset arg1_end = arg1_begin + len(oie_component[3]) arg1: EntityMention = EntityMention( pack, arg1_begin, arg1_end) # Find argument 2. arg2_begin = sentence.find(oie_component[4]) + offset arg2_end = arg2_begin + len(oie_component[4]) arg2: EntityMention = EntityMention( pack, arg2_begin, arg2_end) head_relation = RelationLink(pack, arg1, arg2) head_relation.rel_type = oie_component[2] offset += len(sentence) + 1 self.set_text(pack, text) pack.pack_name = os.path.basename(file_path) yield pack
def _parse_pack(self, file_path: str) -> Iterator[DataPack]: pack = self.new_pack() with open(file_path, encoding="utf8") as doc: words = [] offset = 0 has_rows = False speaker = part_id = document_id = None sentence_begin = 0 # auxiliary structures current_entity_mention: Optional[Tuple[int, str]] = None verbal_predicates: List[PredicateMention] = [] current_pred_arg: List[Optional[Tuple[int, str]]] = [] verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = [] groups: DefaultDict[int, List[EntityMention]] = defaultdict(list) coref_stacks: DefaultDict[int, List[int]] = defaultdict(list) for line in doc: line = line.strip() if line.startswith("#end document"): break if line != "" and not line.startswith("#"): fields = self._parse_line(line) speaker = fields.speaker if fields.part_number is not None: part_id = int(fields.part_number) document_id = fields.document_id assert fields.word is not None word_begin = offset word_end = offset + len(fields.word) # add tokens token = Token(pack, word_begin, word_end) if fields.pos_tag is not None: token.pos = fields.pos_tag if fields.word_sense is not None: token.sense = fields.word_sense # add entity mentions current_entity_mention = self._process_entity_annotations( pack, fields.entity_label, word_begin, word_end, current_entity_mention, ) # add predicate mentions if (fields.lemmatised_word is not None and fields.lemmatised_word != "-"): word_is_verbal_predicate = any( "(V" in x for x in fields.predicate_labels) pred_mention = PredicateMention( pack, word_begin, word_end) pred_mention.predicate_lemma = fields.lemmatised_word pred_mention.is_verb = word_is_verbal_predicate if fields.framenet_id is not None: pred_mention.framenet_id = fields.framenet_id if word_is_verbal_predicate: verbal_predicates.append(pred_mention) if not verbal_pred_args: current_pred_arg = [None] * len( fields.predicate_labels) verbal_pred_args = [[] for _ in fields.predicate_labels] # add predicate arguments self._process_pred_annotations( pack, fields.predicate_labels, word_begin, word_end, current_pred_arg, verbal_pred_args, ) # add coreference mentions self._process_coref_annotations( pack, fields.coreference, word_begin, word_end, coref_stacks, groups, ) words.append(fields.word) offset = word_end + 1 has_rows = True else: if not has_rows: continue # add predicate links in the sentence for predicate, pred_arg in zip(verbal_predicates, verbal_pred_args): for arg in pred_arg: link = PredicateLink(pack, predicate, arg[0]) link.arg_type = arg[1] verbal_predicates = [] current_pred_arg = [] verbal_pred_args = [] # add sentence sent = Sentence(pack, sentence_begin, offset - 1) if speaker is not None: sent.speaker = speaker if part_id is not None: sent.part_id = int(part_id) sentence_begin = offset has_rows = False # group the coreference mentions in the whole document for _, mention_list in groups.items(): group = CoreferenceGroup(pack) group.add_members(mention_list) text = " ".join(words) pack.set_text(text, replace_func=self.text_replace_operation) _ = Document(pack, 0, len(text)) if document_id is not None: pack.pack_name = document_id yield pack
def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
    # pylint: disable=no-self-use
    token_comp_fields = [
        "id", "form", "lemma", "pos", "ud_xpos", "features", "head",
        "label", "enhanced_dependency_relations", "ud_misc"
    ]

    token_multi_fields = [
        "features", "ud_misc", "enhanced_dependency_relations"
    ]

    token_feature_fields = ["features", "ud_misc"]

    token_entry_fields = ["lemma", "pos", "ud_xpos", "features", "ud_misc"]

    data_pack: DataPack = DataPack()
    doc_sent_begin: int = 0
    doc_num_sent: int = 0
    doc_text: str = ''
    doc_offset: int = 0
    doc_id: str

    sent_text: str
    sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

    for line in doc_lines:
        line = line.strip()
        line_comps = line.split()

        if line.startswith("# newdoc"):
            doc_id = line.split("=")[1].strip()

        elif line.startswith("# sent"):
            sent_text = ''

        elif len(line_comps) > 0 and \
                line_comps[0].strip().isdigit():
            # token
            token_comps: Dict[str, Any] = {}

            for index, key in enumerate(token_comp_fields):
                token_comps[key] = str(line_comps[index])

                if key in token_multi_fields:
                    values = str(token_comps[key]).split("|") \
                        if token_comps[key] != '_' else []
                    if key not in token_feature_fields:
                        token_comps[key] = values
                    else:
                        feature_lst = [
                            elem.split('=', 1) for elem in values
                        ]
                        feature_dict = {
                            elem[0]: elem[1] for elem in feature_lst
                        }
                        token_comps[key] = feature_dict

            word: str = token_comps["form"]
            word_begin = doc_offset
            word_end = doc_offset + len(word)

            token: Token \
                = Token(data_pack, word_begin, word_end)
            kwargs = {key: token_comps[key] for key in token_entry_fields}

            # add token
            token.set_fields(**kwargs)
            data_pack.add_or_get_entry(token)

            sent_tokens[str(token_comps["id"])] = (token_comps, token)

            sent_text += word + " "
            doc_offset = word_end + 1

        elif line == "":
            # sentence ends
            sent_text = sent_text.strip()
            doc_text += ' ' + sent_text

            # add dependencies for a sentence when all the tokens have
            # been added
            for token_id in sent_tokens:
                token_comps, token = sent_tokens[token_id]

                def add_dependency(dep_parent, dep_child, dep_label,
                                   dep_type, data_pack_):
                    """Adds a dependency to a data_pack.

                    Args:
                        dep_parent: dependency parent token
                        dep_child: dependency child token
                        dep_label: dependency label
                        dep_type: "primary" or "enhanced" dependency
                        data_pack_: data_pack to which the dependency is
                            to be added
                    """
                    dependency = Dependency(data_pack_, dep_parent,
                                            dep_child)
                    dependency.dep_label = dep_label
                    dependency.type = dep_type
                    data_pack_.add_or_get_entry(dependency)

                # add primary dependency
                label = token_comps["label"]
                if label == "root":
                    token.is_root = True
                else:
                    token.is_root = False
                    head = sent_tokens[token_comps["head"]][1]
                    add_dependency(head, token, label, "primary",
                                   data_pack)

                # add enhanced dependencies
                for dep in token_comps["enhanced_dependency_relations"]:
                    head_id, label = dep.split(":", 1)
                    if label != "root":
                        head = sent_tokens[head_id][1]
                        add_dependency(head, token, label, "enhanced",
                                       data_pack)

            # add sentence
            sent = Sentence(data_pack, doc_sent_begin, doc_offset - 1)
            data_pack.add_or_get_entry(sent)

            doc_sent_begin = doc_offset
            doc_num_sent += 1

    # add doc to data_pack
    document = Document(data_pack, 0, len(doc_text))
    data_pack.add_or_get_entry(document)
    data_pack.meta.doc_id = doc_id

    data_pack.set_text(doc_text.strip())

    yield data_pack
def _parse_pack(self, doc_lines) -> Iterator[DataPack]: token_comp_fields = ["id", "form", "lemma", "pos", "ud_xpos", "ud_features", "head", "label", "enhanced_dependency_relations", "ud_misc"] token_multi_fields = ["ud_features", "ud_misc", "enhanced_dependency_relations"] token_feature_fields = ["ud_features", "ud_misc"] data_pack: DataPack = DataPack() doc_sent_begin: int = 0 doc_num_sent: int = 0 doc_text: str = '' doc_offset: int = 0 doc_id: str sent_text: str sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {} for line in doc_lines: line = line.strip() line_comps = line.split() if line.startswith("# newdoc"): doc_id = line.split("=")[1].strip() elif line.startswith("# sent"): sent_text = '' elif len(line_comps) > 0 and \ line_comps[0].strip().isdigit(): # token token_comps: Dict[str, Any] = {} for index, key in enumerate(token_comp_fields): token_comps[key] = str(line_comps[index]) if key in token_multi_fields: values = str(token_comps[key]).split("|") \ if token_comps[key] != '_' else [] if key not in token_feature_fields: token_comps[key] = values else: feature_lst = [elem.split('=', 1) for elem in values] feature_dict = {elem[0]: elem[1] for elem in feature_lst} token_comps[key] = feature_dict word: str = token_comps["form"] word_begin = doc_offset word_end = doc_offset + len(word) # add token token: Token = Token(data_pack, word_begin, word_end) token.lemma = token_comps['lemma'] token.pos = token_comps['pos'] token.ud_xpos = token_comps['ud_xpos'] token.ud_features = token_comps['ud_features'] token.ud_misc = token_comps['ud_misc'] sent_tokens[str(token_comps["id"])] = (token_comps, token) sent_text += word + " " doc_offset = word_end + 1 elif line == "": # sentence ends sent_text = sent_text.strip() doc_text += ' ' + sent_text # add dependencies for a sentence when all the tokens have been # added for token_id in sent_tokens: token_comps, token = sent_tokens[token_id] # add primary dependency label = token_comps["label"] if label == "root": token.is_root = True else: token.is_root = False head = sent_tokens[token_comps["head"]][1] dependency = Dependency(data_pack, head, token) dependency.dep_label = label # add enhanced dependencies for dep in token_comps["enhanced_dependency_relations"]: head_id, label = dep.split(":", 1) if label != "root": head = sent_tokens[head_id][1] enhanced_dependency = \ EnhancedDependency(data_pack, head, token) enhanced_dependency.dep_label = label # add sentence Sentence(data_pack, doc_sent_begin, doc_offset - 1) doc_sent_begin = doc_offset doc_num_sent += 1 doc_text = doc_text.strip() data_pack.set_text(doc_text) # add doc to data_pack Document(data_pack, 0, len(doc_text)) data_pack.pack_name = doc_id yield data_pack
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack: DataPack = DataPack()

    with open(file_path, 'r', encoding='utf8') as fp:
        txt = ""
        offset = 0

        while True:
            sent_line: str = fp.readline()
            if not sent_line:
                break

            if len(sent_line.split()) == 0:
                continue

            relation_line: str = fp.readline()
            # Comment line is not used.
            _ = fp.readline()

            sent_line = sent_line[sent_line.find('"') + 1:
                                  sent_line.rfind('"')]
            index1 = sent_line.find("<e1>")
            index2 = sent_line.find("<e2>")

            # 5 is the length of "</e1>", include both <e1> and
            # </e1> when extracting the string.
            e1 = sent_line[index1:sent_line.find("</e1>") + 5]
            e2 = sent_line[index2:sent_line.find("</e2>") + 5]

            # Remove <e1> and </e1> in the sentence.
            sent_line = sent_line.replace(e1, e1[4:-5])
            sent_line = sent_line.replace(e2, e2[4:-5])

            # Remove <e1> and </e1> in e1.
            e1 = e1[4:-5]
            e2 = e2[4:-5]

            # Re-calculate the index after removing <e1>, </e1>
            # in the sentence.
            if index1 < index2:
                diff1 = 0
                diff2 = 9
            else:
                diff1 = 9
                diff2 = 0

            index1 += offset - diff1
            index2 += offset - diff2

            Sentence(pack, offset, offset + len(sent_line))
            entry1 = EntityMention(pack, index1, index1 + len(e1))
            entry2 = EntityMention(pack, index2, index2 + len(e2))
            offset += len(sent_line) + 1
            txt += sent_line + " "

            pair = relation_line[relation_line.find("(") + 1:
                                 relation_line.find(")")]

            if "," in pair:
                parent, _ = pair.split(",")
                if parent == "e1":
                    relation = RelationLink(pack, entry1, entry2)
                else:
                    relation = RelationLink(pack, entry2, entry1)
                relation.rel_type = relation_line[:relation_line.find("(")]
            else:
                # For the "Other" relation, just set the parent as e1
                # and the child as e2.
                relation = RelationLink(pack, entry1, entry2)
                relation.rel_type = relation_line.strip()

    pack.set_text(txt, replace_func=self.text_replace_operation)
    pack.pack_name = os.path.basename(file_path)

    yield pack
def _parse_pack(self, file_path: str) -> Iterator[DataPack]: pack = DataPack() doc = codecs.open(file_path, "r", encoding="utf8") text = "" offset = 0 has_rows = False sentence_begin = 0 sentence_cnt = 0 for line in doc: line = line.strip() if line != "" and not line.startswith("#"): conll_components = line.split() word = conll_components[1] pos = conll_components[2] chunk_id = conll_components[3] ner_tag = conll_components[4] word_begin = offset word_end = offset + len(word) # Add tokens. kwargs_i = {"pos": pos, "chunk": chunk_id, "ner": ner_tag} token = Token(pack, word_begin, word_end) token.set_fields(**kwargs_i) pack.add_or_get_entry(token) text += word + " " offset = word_end + 1 has_rows = True else: if not has_rows: # Skip consecutive empty lines. continue # add sentence sent = Sentence(pack, sentence_begin, offset - 1) pack.add_or_get_entry(sent) sentence_begin = offset sentence_cnt += 1 has_rows = False if has_rows: # Add the last sentence if exists. sent = Sentence(pack, sentence_begin, offset - 1) sentence_cnt += 1 pack.add_or_get_entry(sent) document = Document(pack, 0, len(text)) pack.add_or_get_entry(document) pack.set_text(text, replace_func=self.text_replace_operation) pack.meta.doc_id = file_path doc.close() yield pack
def _process(self, input_pack: DataPack):
    for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
        Sentence(input_pack, begin, end)