import re

from ufal.udpipe import ProcessingError, Sentence, Word  # type: ignore

# Assumed definition: the standard CoNLL-U MISC value that suppresses the
# space after a token.
NO_SPACE = "SpaceAfter=No"


def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
    """Tokenize the next of the stored lines and store its tokens in sentence.

    sentence: UDPipe container for storing tokens.
    Returns False once the stored lines are exhausted.
    """
    try:
        line = next(self.lines)
    except StopIteration:
        return False

    tokens = line.split("\t")
    # Dummy Word so the first iteration has something safe to write misc to
    prev_word = Word()
    for token in tokens:
        word = sentence.addWord(token)

        if re.match(r"\W", token):
            # leave no space after the previous token iff the current token
            # is non-alphanumeric (i.e. punctuation)
            prev_word.misc = NO_SPACE

        prev_word = word

    return True
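
# For orientation, a hedged usage sketch: nextSentence above is assumed to
# live on a reader class that keeps an iterator of tab-separated lines in
# self.lines. That class is not part of this snippet, so TabTokenizer below
# is hypothetical, reusing the module-level function as its method.
class TabTokenizer:
    def __init__(self, lines):
        self.lines = iter(lines)

    nextSentence = nextSentence


def _demo_tokenizer():
    error = ProcessingError()
    tok = TabTokenizer(["Привіт\t,\tсвіте\t!"])
    sent = Sentence()
    while tok.nextSentence(sent, error):
        # words[0] is UDPipe's synthetic root token, hence the [1:]
        print([w.form for w in sent.words[1:]])  # ['Привіт', ',', 'світе', '!']
        sent = Sentence()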
import logging
from collections import Counter, defaultdict

import pymongo

# settings, TagWithUDPipeJob, COMPRESS_UPOS_MAPPING and compress_features are
# project-level names assumed to be imported elsewhere in this module.


def execute(job, task):
    assert settings.UDPIPE_MODEL_FILE, "You must set the UDPIPE_MODEL_FILE setting to begin"

    from .mongodb import get_db
    from .udpipe_model import Model as UDPipeModel
    from ufal.udpipe import Sentence  # type: ignore

    db = get_db()
    model = UDPipeModel(settings.UDPIPE_MODEL_FILE)
    total_docs = TagWithUDPipeJob.get_total_count(db, job, task)

    feat_categories = Counter()
    poses = Counter()
    feat_values = defaultdict(Counter)

    for i, (corpus, article) in enumerate(TagWithUDPipeJob.get_iter(db, job, task)):
        update_clause = defaultdict(list)
        article_lemmas = []
        article_postags = []
        article_features = []

        if "nlp" in article:
            for f in ["title", "text"]:
                if f not in article["nlp"]:
                    task.log(
                        logging.WARNING,
                        f"Cannot find field {f} in the document {article['_id']}",
                    )
                    continue

                if "tokens" not in article["nlp"][f]:
                    task.log(
                        logging.WARNING,
                        f"Cannot find tokenized version of field {f} in the document {article['_id']}",
                    )
                    continue

                for s in article["nlp"][f]["tokens"].split("\n"):
                    # Ignore the model's default tokenizer and reuse the
                    # whitespace tokenization already stored in the document
                    tok_sent = Sentence()
                    for w in s.split(" "):
                        tok_sent.addWord(w)

                    sent_lemmas = []
                    sent_postags = []
                    sent_features = []

                    model.tag(tok_sent)

                    # words[0] is UDPipe's synthetic root token, hence the [1:]
                    for w in tok_sent.words[1:]:
                        poses.update([w.upostag])
                        sent_lemmas.append(w.lemma)

                        # Again, not moving that to a separate function to
                        # reduce the number of unnecessary calls
                        try:
                            sent_postags.append(COMPRESS_UPOS_MAPPING[w.upostag])
                        except KeyError:
                            task.log(
                                logging.WARNING,
                                f"Cannot find {w.upostag} in the COMPRESS_UPOS_MAPPING, skipping for now",
                            )
                            sent_postags.append("Z")

                        sent_features.append(compress_features(w.feats))

                        for pair in w.feats.split("|"):
                            if not pair:
                                continue
                            cat, val = pair.split("=")
                            feat_categories.update([cat])
                            feat_values[cat].update([val])

                    update_clause[f"nlp.{f}.ud_lemmas"].append(" ".join(sent_lemmas))
                    # We don't need a separator for the postags as there is
                    # always exactly one POS tag (a single character) per word
                    update_clause[f"nlp.{f}.ud_postags"].append("".join(sent_postags))
                    update_clause[f"nlp.{f}.ud_features"].append(" ".join(sent_features))

            for k, v in update_clause.items():
                update_clause[k] = "\n".join(v)

            if update_clause:
                try:
                    db[corpus].update_one(
                        {"_id": article["_id"]},
                        {
                            "$set": update_clause,
                            "$addToSet": {"processing_status": "udpipe_tagged"},
                        },
                    )
                except pymongo.errors.WriteError:
                    task.log(
                        logging.WARNING,
                        f"Cannot store results back to the document {article['_id']}",
                    )
                    continue
        else:
            task.log(
                logging.WARNING,
                f"Cannot find any text in the document {article['_id']}",
            )

        task.set_progress((i + 1) * 100 // total_docs, step=1)
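
# COMPRESS_UPOS_MAPPING and compress_features are imported from elsewhere in
# this project. Hypothetical minimal stand-ins, shown only to make the data
# flow in execute() concrete; the real mapping and encoding scheme may differ.
COMPRESS_UPOS_MAPPING = {
    "ADJ": "A",
    "NOUN": "N",
    "PUNCT": "P",
    "VERB": "V",
    # ... exactly one single-character code per UPOS tag; "Z" is the
    # fallback used by execute() for unknown tags
}


def compress_features(feats: str) -> str:
    """Pack a CoNLL-U FEATS string (Cat=Val|Cat=Val) into a shorter token."""
    if not feats:
        return ""
    # Keep only the first two characters of each value, dropping category names
    return "|".join(pair.split("=", 1)[1][:2] for pair in feats.split("|"))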