def segment_text(self, text):
    """Segment the provided text into sentences."""
    self.tokenizer.setText(text)
    is_another = True
    sentences = []
    while is_another:
        u_sentence = Sentence()
        is_another = self.tokenizer.nextSentence(u_sentence)
        if is_another:
            sentences.append(u_sentence.getText())
    return sentences

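# A minimal usage sketch for segment_text() above, assuming it only needs a host object
# carrying a UDPipe tokenizer. The SimpleNamespace stand-in and the model path are
# illustrative; only Model.load/newTokenizer/Model.DEFAULT come from ufal.udpipe.
from types import SimpleNamespace

from ufal.udpipe import Model, Sentence

model = Model.load("english-ewt-ud-2.5-191206.udpipe")  # hypothetical model path
host = SimpleNamespace(tokenizer=model.newTokenizer(Model.DEFAULT))
print(segment_text(host, "First sentence. Second sentence."))
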
def _runApp(self, dto, opNotDone):
    text = dto.getText()
    tokenizer = self._model.newTokenizer(self._model.DEFAULT)
    tokenizer.setText(text)
    error = ProcessingError()
    sentence = Sentence()
    sid = 0

    while tokenizer.nextSentence(sentence, error):
        self._model.tag(sentence, self._model.DEFAULT)
        self._model.parse(sentence, self._model.DEFAULT)

        # Teprolin tokenized sentence
        ttsent = []
        # Teprolin string sentence
        tssent = sentence.getText()

        for w in sentence.words:
            if w.id == 0:
                continue

            tt = TeproTok()
            tt.setId(w.id)
            tt.setWordForm(w.form)
            tt.setCTAG(w.upostag)
            tt.setMSD(w.xpostag)
            tt.setLemma(w.lemma)
            tt.setHead(w.head)
            tt.setDepRel(w.deprel)

            ttsent.append(tt)
        # end for w

        if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
            dto.addSentenceString(tssent)
            dto.addSentenceTokens(ttsent)
        else:
            # Check and update annotations that only TTL
            # can produce or that are requested specifically from it.
            alignment = dto.alignSentences(ttsent, sid)

            for op in opNotDone:
                dto.copyTokenAnnotation(ttsent, sid, alignment, op)

        sentence = Sentence()
        sid += 1
    # end all split sentences.

    return dto

def parse(text, sentence_id):
    """Take a sentence in raw text and produce its CoNLL-U annotation by invoking UDPipe.

    Parameters:
        text - the sentence to be parsed
        sentence_id - the ID of the sentence

    Output:
        a UD graph
    """
    model = Model.load('./models/udpipe/english-ewt-ud-2.3-181115.udpipe')
    tokenizer = model.newTokenizer(model.TOKENIZER_PRESEGMENTED)
    # tokenizer = model.TOKENIZER_PRESEGMENTED(model.DEFAULT)
    conlluOutput = OutputFormat.newOutputFormat("conllu")
    sentence = Sentence()
    error = ProcessingError()
    tokenizer.setText(text)
    tokenizer.nextSentence(sentence, error)
    model.tag(sentence, model.DEFAULT)
    model.parse(sentence, model.DEFAULT)
    return conlluOutput.writeSentence(sentence).replace(
        '# sent_id = 1', '# sent_id = ' + sentence_id)

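# A quick usage sketch for parse() above: the example sentence and ID are illustrative,
# and the imports are the ones the function itself relies on. The model path is the one
# hard-coded in the function, so the file must exist for this to run.
from ufal.udpipe import Model, OutputFormat, ProcessingError, Sentence

print(parse("The quick brown fox jumps over the lazy dog .", "example-1"))
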
def transform(self, string: str):
    self.tokenizer.setText(string)
    sentences = []
    sentence = Sentence()
    while self.tokenizer.nextSentence(sentence, self.error):
        self.model.tag(sentence, self.model.DEFAULT)
        sentences.append(sentence)
        sentence = Sentence()
    if self.error.occurred():
        raise Exception(self.error.message)
    words = functools.reduce(
        lambda x, y: x + y,
        [[(w.lemma, w.upostag) for w in s.words] for s in sentences],
        [])
    return [word for word in words if word[1] not in ['<root>']]

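# A minimal sketch of the state transform() relies on (self.model, self.tokenizer,
# self.error) plus the functools import it uses. The SimpleNamespace host and the model
# path are illustrative assumptions, not part of the original code.
import functools
from types import SimpleNamespace

from ufal.udpipe import Model, ProcessingError, Sentence

model = Model.load("english-ewt-ud-2.5-191206.udpipe")  # hypothetical model path
host = SimpleNamespace(
    model=model,
    tokenizer=model.newTokenizer(Model.DEFAULT),
    error=ProcessingError(),
)
print(transform(host, "Dogs were barking."))  # -> [(lemma, upostag), ...]
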
def __call__(self, text):
    """Convert input text to a spaCy Doc.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    udpipe_sents = self.model(text) if text else [Sentence()]
    text = " ".join(s.getText() for s in udpipe_sents)
    tokens, heads = self.get_tokens_with_heads(udpipe_sents)
    if not tokens:
        return Doc(self.vocab)
    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    offset = 0
    is_aligned = self.check_aligned(text, tokens)
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not span:
            break
        while len(span) and span[0].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.form)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upostag or ""))
        # CoNLL xpostag-s, custom for each UD treebank
        # tags.append(self.vocab.strings.add(token.xpostag or ""))
        tags.append(self.vocab.strings.add(token.feats or ""))
        deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.form)
        span = text[offset:]
        if i == len(tokens) - 1 or "SpaceAfter=No" in token.misc:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.form))
    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    # Overwrite lemmas separately to prevent overwriting by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    if any(deps):
        doc.is_parsed = True
    return doc

def tokenize_tag_parse_tree(self, root):
    """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`."""
    if root.children:
        raise ValueError('Tree already contained nodes before tokenization')

    # tokenization (I cannot turn off segmenter, so I need to join the segments)
    self.tokenizer.setText(root.text)
    u_sentence = Sentence()
    is_another = self.tokenizer.nextSentence(u_sentence)
    u_words = u_sentence.words
    n_words = u_words.size() - 1
    if is_another:
        u_sent_cont = Sentence()
        while self.tokenizer.nextSentence(u_sent_cont):
            n_cont = u_sent_cont.words.size() - 1
            for i in range(1, n_cont + 1):
                u_w = u_sent_cont.words[i]
                n_words += 1
                u_w.id = n_words
                u_words.append(u_w)

    # tagging and parsing
    self.tool.tag(u_sentence, Model.DEFAULT)
    self.tool.parse(u_sentence, Model.DEFAULT)

    # converting UDPipe nodes to Udapi nodes
    heads, nodes = [], [root]
    for i in range(1, u_words.size()):
        u_w = u_words[i]
        node = root.create_child(
            form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
            xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel,
        )
        node.misc = u_w.misc
        heads.append(u_w.head)
        nodes.append(node)
    for node in nodes[1:]:
        head = heads.pop(0)
        node.parent = nodes[head]

def _read(self, text: str, input_format: str) -> List[Sentence]:
    """Convert the text to a UDPipe representation.

    text: Input text.
    input_format: Desired input format.
    RETURNS: Processed sentences.
    """
    input_format.setText(text)
    error = ProcessingError()
    sentences = []
    sentence = Sentence()
    while input_format.nextSentence(sentence, error):
        sentences.append(sentence)
        sentence = Sentence()
    if error.occurred():
        raise Exception(error.message)
    return sentences

def _read(self, text, input_format):
    """Convert the text to a UDPipe representation.

    text (unicode): Input text.
    input_format (unicode): Desired input format.
    RETURNS (list): Processed ufal.udpipe.Sentence-s.
    """
    input_format.setText(text)
    error = ProcessingError()
    sentences = []
    sentence = Sentence()
    while input_format.nextSentence(sentence, error):
        sentences.append(sentence)
        sentence = Sentence()
    if error.occurred():
        raise Exception(error.message)
    return sentences

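# Both _read() variants above expect an already-constructed ufal.udpipe reader object.
# A minimal sketch of the two usual ways to obtain one; the model path is illustrative,
# and newConlluInputFormat() is assumed to be the input-side counterpart of the
# OutputFormat.newConlluOutputFormat() helper used elsewhere in this file.
from ufal.udpipe import InputFormat, Model

model = Model.load("english-ewt-ud-2.5-191206.udpipe")  # hypothetical model path
text_reader = model.newTokenizer(Model.DEFAULT)     # plain text -> tokenized sentences
conllu_reader = InputFormat.newConlluInputFormat()  # pre-annotated CoNLL-U input
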
def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
    """Tokenize each line from stored lines and store tokens in sentence.

    sentence: UDPipe container for storing tokens.
    """
    try:
        line = next(self.lines)
    except StopIteration:
        return False
    tokens = line.split("\t")
    prev_word = Word()
    for token in tokens:
        word = sentence.addWord(token)
        if re.match(r"\W", token):
            # leave no space after previous token iff current token
            # is non-alphanumeric (i.e. punctuation)
            prev_word.misc = NO_SPACE
        prev_word = word
    return True

def preproc_item(text):
    if pd.isna(text):
        text = ''
    tokenizer.resetDocument()
    try:
        tokenizer.setText(text)
    except TypeError:
        # setText() expects a string; surface the offending value and fail fast
        print(text)
        raise
    sentence = Sentence()
    error = ProcessingError()
    text = ''
    while tokenizer.nextSentence(sentence, error):
        udpipe_model.tag(sentence, Pipeline.DEFAULT, error)
        # udpipe_model.parse(sentence, Pipeline.DEFAULT, error)
        text += OutputFormat.newConlluOutputFormat().writeSentence(sentence)
    return text

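# A sketch of the globals preproc_item() relies on (tokenizer, udpipe_model) and of how
# it might be applied to a pandas column; the model path and the tiny DataFrame are
# illustrative assumptions, not part of the original code.
import pandas as pd
from ufal.udpipe import Model, OutputFormat, Pipeline, ProcessingError, Sentence

udpipe_model = Model.load("english-ewt-ud-2.5-191206.udpipe")  # hypothetical model path
tokenizer = udpipe_model.newTokenizer(Model.DEFAULT)

df = pd.DataFrame({"text": ["First sentence. Second one.", None]})
df["conllu"] = df["text"].apply(preproc_item)
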
def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
    """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

    If resegment=True, the returned list of Udapi trees may contain multiple trees.
    """
    if root.children:
        raise ValueError('Tree already contained nodes before tokenization')

    # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
    self.tokenizer.setText(root.text)
    is_another = True
    u_sentences = []
    while is_another:
        u_sentence = Sentence()
        is_another = self.tokenizer.nextSentence(u_sentence)
        if is_another:
            u_sentences.append(u_sentence)

    # If resegmentation was not required, we need to join the segments.
    if not resegment and len(u_sentences) > 1:
        first_sent = u_sentences[0]
        n_words = first_sent.words.size() - 1
        for other_sent in u_sentences[1:]:
            other_words = other_sent.words.size() - 1
            for i in range(1, other_words + 1):
                u_w = other_sent.words[i]
                n_words += 1
                u_w.id = n_words
                first_sent.words.append(u_w)
        u_sentences = [first_sent]

    # tagging and parsing
    if tag:
        for u_sentence in u_sentences:
            self.tool.tag(u_sentence, Model.DEFAULT)
            if parse:
                self.tool.parse(u_sentence, Model.DEFAULT)
    elif parse:
        raise ValueError('Combination parse=True tag=False is not allowed.')

    # converting UDPipe nodes to Udapi nodes
    new_root = root
    trees = []
    for u_sentence in u_sentences:
        if not new_root:
            new_root = Root()
        heads, nodes = [], [new_root]
        u_words = u_sentence.words
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = new_root.create_child(
                form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel,
                misc=u_w.misc,
            )
            if parse:
                heads.append(u_w.head)
            nodes.append(node)
        if parse:
            for node in nodes[1:]:
                head = heads.pop(0)
                node.parent = nodes[head]
        trees.append(new_root)
        new_root = None
    return trees

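# A usage sketch for tokenize_tag_parse_tree() above. The import paths and the
# UDPipe(model=...) constructor are assumptions about the surrounding Udapi wrapper;
# only what the method itself touches (root.text, the returned trees) is taken from the code.
from udapi.core.root import Root          # assumed import path
from udapi.tool.udpipe import UDPipe      # assumed import path and class name

pipe = UDPipe(model="english-ewt-ud-2.5-191206.udpipe")  # hypothetical model path
root = Root()
root.text = "First sentence. Second sentence."
for tree in pipe.tokenize_tag_parse_tree(root, resegment=True):
    for node in tree.descendants:
        print(node.form, node.upos, node.deprel)
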
def execute(job, task):
    assert settings.UDPIPE_MODEL_FILE, "You must set UDPIPE_MODEL_FILE setting to begin"

    from .mongodb import get_db
    from .udpipe_model import Model as UDPipeModel
    from ufal.udpipe import Sentence  # type: ignore

    db = get_db()
    model = UDPipeModel(settings.UDPIPE_MODEL_FILE)
    total_docs = TagWithUDPipeJob.get_total_count(db, job, task)

    feat_categories = Counter()
    poses = Counter()
    feat_values = defaultdict(Counter)

    for i, (corpus, article) in enumerate(TagWithUDPipeJob.get_iter(db, job, task)):
        update_clause = defaultdict(list)
        article_lemmas = []
        article_postags = []
        article_features = []

        if "nlp" in article:
            for f in ["title", "text"]:
                if f not in article["nlp"]:
                    task.log(
                        logging.WARNING,
                        f"Cannot find field {f} in the document {article['_id']}"
                    )
                    continue

                if "tokens" not in article["nlp"][f]:
                    task.log(
                        logging.WARNING,
                        f"Cannot find tokenized version of field {f} in the document {article['_id']}",
                    )
                    continue

                for s in article["nlp"][f]["tokens"].split("\n"):
                    # ignoring default model tokenizer in order to use whitespace tokenizer
                    tok_sent = Sentence()
                    for w in s.split(" "):
                        tok_sent.addWord(w)

                    sent_lemmas = []
                    sent_postags = []
                    sent_features = []

                    model.tag(tok_sent)

                    for w in tok_sent.words[1:]:
                        poses.update([w.upostag])
                        sent_lemmas.append(w.lemma)

                        # Again, not moving that to a separate function to
                        # reduce number of unnecessary calls
                        try:
                            sent_postags.append(COMPRESS_UPOS_MAPPING[w.upostag])
                        except KeyError:
                            task.log(
                                logging.WARNING,
                                f"Cannot find {w.upostag} in the COMPRESS_UPOS_MAPPING, skipping for now",
                            )
                            sent_postags.append("Z")

                        sent_features.append(compress_features(w.feats))

                        for pair in w.feats.split("|"):
                            if not pair:
                                continue
                            cat, val = pair.split("=")
                            feat_categories.update([cat])
                            feat_values[cat].update([val])

                    update_clause[f"nlp.{f}.ud_lemmas"].append(" ".join(sent_lemmas))
                    # We don't need to have a separator for the postags as there is always one
                    # pos tag (which is a character) per word
                    update_clause[f"nlp.{f}.ud_postags"].append("".join(sent_postags))
                    update_clause[f"nlp.{f}.ud_features"].append(" ".join(sent_features))

            for k, v in update_clause.items():
                update_clause[k] = "\n".join(v)

            if update_clause:
                try:
                    db[corpus].update_one(
                        {"_id": article["_id"]},
                        {
                            "$set": update_clause,
                            "$addToSet": {"processing_status": "udpipe_tagged"},
                        },
                    )
                except pymongo.errors.WriteError:
                    task.log(
                        logging.WARNING,
                        f"Cannot store results back to the document {article['_id']}"
                    )
                    continue
        else:
            task.log(
                logging.WARNING,
                f"Cannot find any text in the document {article['_id']}"
            )

        task.set_progress((i + 1) * 100 // total_docs, step=1)

def __call__(self, text: Union[str, List[str], List[List[str]]]) -> Doc:
    """Convert input text to a spaCy Doc.

    text: The text to process. It can be presegmented or pretokenized:
        str             : raw text,
        List[str]       : presegmented text,
        List[List[str]] : pretokenized text.
    RETURNS: The spaCy Doc object.
    """
    if not text:
        return Doc(vocab=self.vocab)
    udpipe_sents = self.model(text=text) if text else [Sentence()]
    text = " ".join(s.getText() for s in udpipe_sents)
    tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents)
    words = []
    spaces = []
    pos = []
    tags = []
    morphs = []
    deps = []
    lemmas = []
    offset = 0
    is_aligned = self._check_aligned(text=text, tokens=tokens)
    if not is_aligned:
        text = ""
        for token in tokens:
            text += token.form
            if NO_SPACE not in token.misc:
                text += " "
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not span:
            break
        while len(span) and span[0].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.form)
        pos.append(token.upostag or "")
        morphs.append(token.feats or "")
        # CoNLL xpostag-s, custom for each UD treebank
        tags.append(token.xpostag or "")
        deps.append(_spacy_dep(token.deprel) or "")
        lemmas.append(token.lemma or "")
        offset += len(token.form)
        span = text[offset:]
        if i == len(tokens) - 1 or NO_SPACE in token.misc:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.form))
    doc = Doc(
        vocab=self.vocab,
        words=words,
        spaces=spaces,
        pos=pos,
        tags=tags,
        morphs=morphs,
        lemmas=lemmas,
        deps=deps,
        heads=[head + i for i, head in enumerate(heads)],
    )
    return doc

def __call__(self, text: Union[str, List[str], List[List[str]]]) -> Doc:
    """Convert input text to a spaCy Doc.

    text: The text to process. It can be presegmented or pretokenized:
        str             : raw text,
        List[str]       : presegmented text,
        List[List[str]] : pretokenized text.
    RETURNS: The spaCy Doc object.
    """
    udpipe_sents = self.model(text=text) if text else [Sentence()]
    text = " ".join(s.getText() for s in udpipe_sents)
    tokens, heads = self._get_tokens_with_heads(udpipe_sents=udpipe_sents)
    if not tokens:
        return Doc(vocab=self.vocab)
    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    offset = 0
    is_aligned = self._check_aligned(text=text, tokens=tokens)
    if not is_aligned:
        text = ""
        for token in tokens:
            text += token.form
            if NO_SPACE not in token.misc:
                text += " "
    for i, token in enumerate(tokens):
        span = text[offset:]
        if not span:
            break
        while len(span) and span[0].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
            span = text[offset:]
        words.append(token.form)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upostag or ""))
        # CoNLL xpostag-s, custom for each UD treebank
        tags.append(self.vocab.strings.add(token.xpostag or ""))
        deps.append(self.vocab.strings.add(self._dep(token.deprel) or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.form)
        span = text[offset:]
        if i == len(tokens) - 1 or NO_SPACE in token.misc:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            next_token = tokens[i + 1]
            spaces.append(not span.startswith(next_token.form))
    try:
        attrs = [POS, TAG, DEP, HEAD]
        array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
        doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    except ValueError as e:
        if '[E167]' in str(e):
            raise ValueError(
                "Could not properly assign morphology features. "
                f"Please update the tag map for '{self.model._lang}'"
                " language. See "
                "https://spacy.io/usage/adding-languages#tag-map "
                "for details. A quick workaround is to use the keyword "
                "argument ignore_tag_map=True when loading UDPipeLanguage."
            )
        else:
            raise e
    # Overwrite lemmas separately to prevent overwriting by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array(attrs=[LEMMA], array=lemma_array)
    doc.is_tagged = bool(any(pos) and any(tags))
    doc.is_parsed = bool(any(deps))
    return doc