Example #1
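    # Assumed context: a method on a reader class holding an iterator in
    # self.lines; needs `import re`, `from ufal.udpipe import Sentence, Word,
    # ProcessingError`, and a NO_SPACE constant (presumably the CoNLL-U
    # "SpaceAfter=No" misc value).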
    def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
        """Tokenize each line from stored lines and store tokens in sentence.

        sentence: UDPipe container for storing tokens.
        """
        try:
            line = next(self.lines)
        except StopIteration:
            return False
        tokens = line.split("\t")
        prev_word = Word()
        for token in tokens:
            word = sentence.addWord(token)
            if re.match(r"\W", token):
                # mark "no space after the previous token" when the current
                # token starts with a non-word character (i.e. punctuation)
                prev_word.misc = NO_SPACE
            prev_word = word
        return True
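
For context, here is a minimal sketch of how a reader like this could be driven against a UDPipe model. The LineReader wrapper class, the model path, and the input line are illustrative assumptions, not part of the example above; the Model, Sentence, and ProcessingError calls follow the standard ufal.udpipe API.

    import re
    from ufal.udpipe import Model, ProcessingError, Sentence, Word

    NO_SPACE = "SpaceAfter=No"  # CoNLL-U misc flag: no space after this token

    class LineReader:
        """Feeds pre-tokenized, tab-separated lines into UDPipe sentences."""

        def __init__(self, lines):
            self.lines = iter(lines)

        def nextSentence(self, sentence, error):
            # same logic as Example #1 above
            try:
                line = next(self.lines)
            except StopIteration:
                return False
            prev_word = Word()
            for token in line.split("\t"):
                word = sentence.addWord(token)
                if re.match(r"\W", token):
                    prev_word.misc = NO_SPACE
                prev_word = word
            return True

    model = Model.load("uk.udpipe")  # hypothetical model file path
    reader = LineReader(["Hello\t,\tworld\t!"])
    error = ProcessingError()
    sentence = Sentence()
    while reader.nextSentence(sentence, error):
        model.tag(sentence, Model.DEFAULT)  # fills in lemma, upostag, feats
        print([(w.form, w.upostag) for w in sentence.words[1:]])  # words[0] is <root>
        sentence = Sentence()
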
Example #2
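    # Assumed context: a method of TagWithUDPipeJob (it calls the class's own
    # get_total_count/get_iter helpers); `settings`, COMPRESS_UPOS_MAPPING,
    # and compress_features() are provided by the surrounding project.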
    def execute(job, task):
        assert settings.UDPIPE_MODEL_FILE, "You must set the UDPIPE_MODEL_FILE setting first"

        import logging
        from collections import Counter, defaultdict

        import pymongo

        from .mongodb import get_db
        from .udpipe_model import Model as UDPipeModel
        from ufal.udpipe import Sentence  # type: ignore

        db = get_db()

        model = UDPipeModel(settings.UDPIPE_MODEL_FILE)
        total_docs = TagWithUDPipeJob.get_total_count(db, job, task)

        feat_categories = Counter()
        poses = Counter()
        feat_values = defaultdict(Counter)

        for i, (corpus, article) in enumerate(TagWithUDPipeJob.get_iter(db, job, task)):
            update_clause = defaultdict(list)
            article_lemmas = []
            article_postags = []
            article_features = []
            if "nlp" in article:
                for f in ["title", "text"]:
                    if f not in article["nlp"]:
                        task.log(
                            logging.WARNING,
                            f"Cannot find field {f} in the document {article['_id']}"
                        )
                        continue

                    if "tokens" not in article["nlp"][f]:
                        task.log(
                            logging.WARNING,
                            f"Cannot find tokenized version of field {f} in the document {article['_id']}",
                        )
                        continue

                    for s in article["nlp"][f]["tokens"].split("\n"):
                        # Bypass the model's own tokenizer: the stored text is
                        # already tokenized, one sentence per line with
                        # space-separated tokens.

                        tok_sent = Sentence()
                        for w in s.split(" "):
                            tok_sent.addWord(w)

                        sent_lemmas = []
                        sent_postags = []
                        sent_features = []

                        model.tag(tok_sent)

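                        # words[0] is UDPipe's artificial <root> token, so
                        # iterate from index 1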
                        for w in tok_sent.words[1:]:
                            poses.update([w.upostag])
                            sent_lemmas.append(w.lemma)
                            # Kept inline (not a separate function) to avoid
                            # per-token call overhead
                            try:
                                sent_postags.append(
                                    COMPRESS_UPOS_MAPPING[w.upostag])
                            except KeyError:
                                task.log(
                                    logging.WARNING,
                                    f"Cannot find {w.upostag} in the COMPRESS_UPOS_MAPPING, skipping for now",
                                )
                                sent_postags.append("Z")

                            sent_features.append(compress_features(w.feats))

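                            # FEATS is a "|"-separated list of Category=Value
                            # pairs (e.g. "Case=Nom|Number=Sing"); count each
                            # category and value for corpus-level statistics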
                            for pair in w.feats.split("|"):
                                if not pair:
                                    continue
                                cat, val = pair.split("=")
                                feat_categories.update([cat])
                                feat_values[cat].update([val])

                        update_clause[f"nlp.{f}.ud_lemmas"].append(
                            " ".join(sent_lemmas))

                        # No separator is needed for the postags: each word
                        # maps to exactly one character
                        update_clause[f"nlp.{f}.ud_postags"].append(
                            "".join(sent_postags))
                        update_clause[f"nlp.{f}.ud_features"].append(
                            " ".join(sent_features))

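                # Collapse each field's per-sentence list into one
                # newline-joined string before writing it back.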
                for k, v in update_clause.items():
                    update_clause[k] = "\n".join(v)

                if update_clause:
                    try:
                        db[corpus].update_one(
                            {"_id": article["_id"]},
                            {
                                "$set": update_clause,
                                "$addToSet": {
                                    "processing_status": "udpipe_tagged"
                                },
                            },
                        )
                    except pymongo.errors.WriteError:
                        task.log(
                            logging.WARNING,
                            f"Cannot store results back to the document {article['_id']}"
                        )
                        continue
                else:
                    task.log(
                        logging.WARNING,
                        f"Cannot find any text in the document {article['_id']}"
                    )

            task.set_progress((i + 1) * 100 // total_docs, step=1)
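
For illustration, a successfully processed article would end up with fields shaped roughly like this. All values below are invented placeholders: the real one-character POS codes come from COMPRESS_UPOS_MAPPING and the feature strings from compress_features(), neither of which is shown in this example.

    # Hypothetical shape of a tagged document (placeholder values):
    {
        "_id": ...,
        "nlp": {
            "title": {
                "tokens": "Hello , world !",
                "ud_lemmas": "hello , world !",  # space-joined lemmas, one line per sentence
                "ud_postags": "IPNP",            # one char per token; "Z" for unmapped tags
                "ud_features": "...",            # compress_features() output per token
            },
            "text": {...},
        },
        "processing_status": ["udpipe_tagged"],
    }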