import re

from ufal.udpipe import ProcessingError, Sentence, Word  # type: ignore

# Assumed definition: the standard CoNLL-U MISC value that suppresses the
# space after a token.
NO_SPACE = "SpaceAfter=No"


def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
    """Tokenize the next of the stored lines and store its tokens in sentence.

    sentence: UDPipe container for storing tokens.
    Returns False once the stored lines are exhausted.
    """
    try:
        line = next(self.lines)
    except StopIteration:
        return False

    tokens = line.split("\t")
    # Dummy Word so the first iteration has something safe to write misc to
    prev_word = Word()
    for token in tokens:
        word = sentence.addWord(token)

        if re.match(r"\W", token):
            # leave no space after the previous token iff the current token
            # is non-alphanumeric (i.e. punctuation)
            prev_word.misc = NO_SPACE

        prev_word = word

    return True
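
# For orientation, a hedged usage sketch: nextSentence above is assumed to
# live on a reader class that keeps an iterator of tab-separated lines in
# self.lines. That class is not part of this snippet, so TabTokenizer below
# is hypothetical, reusing the module-level function as its method.
class TabTokenizer:
    def __init__(self, lines):
        self.lines = iter(lines)

    nextSentence = nextSentence


def _demo_tokenizer():
    error = ProcessingError()
    tok = TabTokenizer(["Привіт\t,\tсвіте\t!"])
    sent = Sentence()
    while tok.nextSentence(sent, error):
        # words[0] is UDPipe's synthetic root token, hence the [1:]
        print([w.form for w in sent.words[1:]])  # ['Привіт', ',', 'світе', '!']
        sent = Sentence()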
import logging
from collections import Counter, defaultdict

import pymongo

# settings, TagWithUDPipeJob, COMPRESS_UPOS_MAPPING and compress_features are
# project-level names assumed to be imported elsewhere in this module.


def execute(job, task):
    assert settings.UDPIPE_MODEL_FILE, "You must set the UDPIPE_MODEL_FILE setting to begin"

    from .mongodb import get_db
    from .udpipe_model import Model as UDPipeModel
    from ufal.udpipe import Sentence  # type: ignore

    db = get_db()
    model = UDPipeModel(settings.UDPIPE_MODEL_FILE)
    total_docs = TagWithUDPipeJob.get_total_count(db, job, task)

    feat_categories = Counter()
    poses = Counter()
    feat_values = defaultdict(Counter)

    for i, (corpus, article) in enumerate(TagWithUDPipeJob.get_iter(db, job, task)):
        update_clause = defaultdict(list)
        article_lemmas = []
        article_postags = []
        article_features = []

        if "nlp" in article:
            for f in ["title", "text"]:
                if f not in article["nlp"]:
                    task.log(
                        logging.WARNING,
                        f"Cannot find field {f} in the document {article['_id']}",
                    )
                    continue

                if "tokens" not in article["nlp"][f]:
                    task.log(
                        logging.WARNING,
                        f"Cannot find tokenized version of field {f} in the document {article['_id']}",
                    )
                    continue

                for s in article["nlp"][f]["tokens"].split("\n"):
                    # Ignore the model's default tokenizer and reuse the
                    # whitespace tokenization already stored in the document
                    tok_sent = Sentence()
                    for w in s.split(" "):
                        tok_sent.addWord(w)

                    sent_lemmas = []
                    sent_postags = []
                    sent_features = []

                    model.tag(tok_sent)

                    # words[0] is UDPipe's synthetic root token, hence the [1:]
                    for w in tok_sent.words[1:]:
                        poses.update([w.upostag])
                        sent_lemmas.append(w.lemma)

                        # Again, not moving that to a separate function to
                        # reduce the number of unnecessary calls
                        try:
                            sent_postags.append(COMPRESS_UPOS_MAPPING[w.upostag])
                        except KeyError:
                            task.log(
                                logging.WARNING,
                                f"Cannot find {w.upostag} in the COMPRESS_UPOS_MAPPING, skipping for now",
                            )
                            sent_postags.append("Z")

                        sent_features.append(compress_features(w.feats))

                        for pair in w.feats.split("|"):
                            if not pair:
                                continue
                            cat, val = pair.split("=")
                            feat_categories.update([cat])
                            feat_values[cat].update([val])

                    update_clause[f"nlp.{f}.ud_lemmas"].append(" ".join(sent_lemmas))
                    # We don't need a separator for the postags as there is
                    # always exactly one POS tag (a single character) per word
                    update_clause[f"nlp.{f}.ud_postags"].append("".join(sent_postags))
                    update_clause[f"nlp.{f}.ud_features"].append(" ".join(sent_features))

            for k, v in update_clause.items():
                update_clause[k] = "\n".join(v)

            if update_clause:
                try:
                    db[corpus].update_one(
                        {"_id": article["_id"]},
                        {
                            "$set": update_clause,
                            "$addToSet": {"processing_status": "udpipe_tagged"},
                        },
                    )
                except pymongo.errors.WriteError:
                    task.log(
                        logging.WARNING,
                        f"Cannot store results back to the document {article['_id']}",
                    )
                    continue
        else:
            task.log(
                logging.WARNING,
                f"Cannot find any text in the document {article['_id']}",
            )

        task.set_progress((i + 1) * 100 // total_docs, step=1)
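
# COMPRESS_UPOS_MAPPING and compress_features are imported from elsewhere in
# this project. Hypothetical minimal stand-ins, shown only to make the data
# flow in execute() concrete; the real mapping and encoding scheme may differ.
COMPRESS_UPOS_MAPPING = {
    "ADJ": "A",
    "NOUN": "N",
    "PUNCT": "P",
    "VERB": "V",
    # ... exactly one single-character code per UPOS tag; "Z" is the
    # fallback used by execute() for unknown tags
}


def compress_features(feats: str) -> str:
    """Pack a CoNLL-U FEATS string (Cat=Val|Cat=Val) into a shorter token."""
    if not feats:
        return ""
    # Keep only the first two characters of each value, dropping category names
    return "|".join(pair.split("=", 1)[1][:2] for pair in feats.split("|"))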