Example #1
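Sentence segmentation with a UDPipe tokenizer from the ufal.udpipe Python bindings: the tokenizer consumes the raw text and yields one Sentence object per detected sentence.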
    def segment_text(self, text):
        """Segment the provided text into sentences."""
        self.tokenizer.setText(text)
        is_another = True
        sentences = []
        while is_another:
            u_sentence = Sentence()
            # nextSentence() fills u_sentence and returns False once the text is exhausted.
            is_another = self.tokenizer.nextSentence(u_sentence)
            if is_another:
                sentences.append(u_sentence.getText())
        return sentences
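For context, a minimal sketch of how the tokenizer used above is typically created with the ufal.udpipe bindings; the model file name is hypothetical:

from ufal.udpipe import Model

model = Model.load("english-ewt.udpipe")  # hypothetical model path
if model is None:
    raise RuntimeError("Cannot load UDPipe model")
# A tokenizer that also performs sentence segmentation.
tokenizer = model.newTokenizer(Model.DEFAULT)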
Example #2
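A complete tokenize-tag-parse pass from a Teprolin processing module: every sentence produced by the UDPipe tokenizer is tagged and parsed, its words are copied into TeproTok objects, and the results are merged into the DTO according to which operations are still pending.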
    def _runApp(self, dto, opNotDone):
        text = dto.getText()

        tokenizer = self._model.newTokenizer(self._model.DEFAULT)
        tokenizer.setText(text)
        error = ProcessingError()
        sentence = Sentence()
        sid = 0

        while tokenizer.nextSentence(sentence, error):
            self._model.tag(sentence, self._model.DEFAULT)
            self._model.parse(sentence, self._model.DEFAULT)
            # Teprolin tokenized sentence
            ttsent = []
            # Teprolin string sentence
            tssent = sentence.getText()

            for w in sentence.words:
                # words[0] is the artificial root (id 0); skip it.
                if w.id == 0:
                    continue

                tt = TeproTok()

                tt.setId(w.id)
                tt.setWordForm(w.form)
                tt.setCTAG(w.upostag)
                tt.setMSD(w.xpostag)
                tt.setLemma(w.lemma)
                tt.setHead(w.head)
                tt.setDepRel(w.deprel)

                ttsent.append(tt)
            # end for w

            if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
                dto.addSentenceString(tssent)
                dto.addSentenceTokens(ttsent)
            else:
                # Check and update annotations that only TTL
                # can produce or that are requested specifically from it.
                alignment = dto.alignSentences(ttsent, sid)

                for op in opNotDone:
                    dto.copyTokenAnnotation(ttsent, sid, alignment, op)

            sentence = Sentence()  # start a fresh Sentence for the next iteration
            sid += 1
        # end all split sentences.

        return dto
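Note that tag() and parse() annotate each sentence in place before its words are copied into TeproTok objects. When sentence splitting was already performed by an earlier module, the freshly produced tokens are first aligned against the existing ones (alignSentences) and only the still-missing annotations listed in opNotDone are copied over.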
Example #3
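Udapi's UDPipe integration: tokenize the text stored on a Udapi root, optionally tag and parse it, and convert the resulting UDPipe words into Udapi nodes, returning one tree per sentence (or a single joined tree when resegment=False).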
    def tokenize_tag_parse_tree(self,
                                root,
                                resegment=False,
                                tag=True,
                                parse=True):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

        If resegment=True, the returned list of Udapi trees may contain multiple trees.
        """
        if root.children:
            raise ValueError(
                'Tree already contained nodes before tokenization')

        # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
        self.tokenizer.setText(root.text)
        is_another = True
        u_sentences = []
        while is_another:
            u_sentence = Sentence()
            is_another = self.tokenizer.nextSentence(u_sentence)
            if is_another:
                u_sentences.append(u_sentence)

        # If resegmentation was not required, we need to join the segments.
        if not resegment and len(u_sentences) > 1:
            first_sent = u_sentences[0]
            n_words = first_sent.words.size() - 1  # words[0] is the technical root
            for other_sent in u_sentences[1:]:
                other_words = other_sent.words.size() - 1
                for i in range(1, other_words + 1):
                    u_w = other_sent.words[i]
                    n_words += 1
                    u_w.id = n_words
                    first_sent.words.append(u_w)
            u_sentences = [first_sent]

        # tagging and parsing
        if tag:
            for u_sentence in u_sentences:
                self.tool.tag(u_sentence, Model.DEFAULT)
                if parse:
                    self.tool.parse(u_sentence, Model.DEFAULT)
        elif parse:
            raise ValueError(
                'Combination parse=True tag=False is not allowed.')

        # converting UDPipe nodes to Udapi nodes
        new_root = root
        trees = []
        for u_sentence in u_sentences:
            if not new_root:
                new_root = Root()
            new_root.text = u_sentence.getText() if resegment else root.text
            heads, nodes = [], [new_root]
            u_words = u_sentence.words
            for i in range(1, u_words.size()):  # skip words[0], the technical root
                u_w = u_words[i]
                node = new_root.create_child(
                    form=u_w.form,
                    lemma=u_w.lemma,
                    upos=u_w.upostag,
                    xpos=u_w.xpostag,
                    feats=u_w.feats,
                    deprel=u_w.deprel,
                    misc=u_w.misc,
                )
                if parse:
                    heads.append(u_w.head)
                    nodes.append(node)
            if parse:
                # Attach each node to its parent; a head of 0 points at the root.
                for node in nodes[1:]:
                    head = heads.pop(0)
                    node.parent = nodes[head]
            trees.append(new_root)
            new_root = None
        return trees
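A minimal usage sketch, assuming udapi is installed; udpipe_tool stands for a hypothetical instance of the wrapper class this method belongs to, holding a loaded UDPipe model in self.tool and its tokenizer in self.tokenizer:

from udapi.core.root import Root

root = Root()
root.text = "Hello world. This is a second sentence."
trees = udpipe_tool.tokenize_tag_parse_tree(root, resegment=True)
for tree in trees:
    for node in tree.descendants:
        print(node.ord, node.form, node.upos, node.deprel)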