Ejemplo n.º 1
0
 def read_tree(self, document=None):
     """Read a single line from ``self.filehandle`` and wrap it in a new root.

     Returns ``None`` when no file handle is set or when ``readline()``
     yields an empty string. Otherwise a fresh ``Root`` is returned whose
     ``text`` is the line with trailing whitespace stripped.
     The ``document`` argument is accepted but not used here.
     """
     stream = self.filehandle
     if stream is not None:
         raw = stream.readline()
         # An empty string (as opposed to '\n') is what readline() gives back
         # once nothing is left to read.
         if raw != '':
             tree = Root()
             tree.text = raw.rstrip()
             return tree
     return None
Ejemplo n.º 2
0
 def read_tree(self, document=None):
     """Read the next line from ``self.filehandle`` and wrap it in a new root.

     When ``self.ignore_empty_lines`` is true, lines equal to ``'\\n'`` or
     ``'\\r\\n'`` are skipped. Returns ``None`` if no file handle is set or
     if the input runs out before a usable line is found. Otherwise returns
     a fresh ``Root`` whose ``text`` is the line after
     ``rstrip(self.rstrip)``. The ``document`` argument is not used here.
     """
     stream = self.filehandle
     if stream is None:
         return None
     current = stream.readline()
     # readline() gives '' only at the end of the input; a blank line is
     # '\n' (or '\r\n' for a Windows file read on a Unix machine).
     if current != '' and self.ignore_empty_lines:
         while current == '\n' or current == '\r\n':
             current = stream.readline()
     # Covers both an immediate end of input and one reached while skipping.
     if current == '':
         return None
     tree = Root()
     tree.text = current.rstrip(self.rstrip)
     return tree
Ejemplo n.º 3
0
 def read_tree(self, document=None):
     """Read the next line from ``self.filehandle`` and wrap it in a new root.

     Lines equal to ``'\\n'`` or ``'\\r\\n'`` are skipped when
     ``self.ignore_empty_lines`` is true. Returns ``None`` if no file handle
     is set or if the input ends before a usable line is read. Otherwise
     returns a fresh ``Root`` whose ``text`` is the line with trailing
     whitespace stripped. The ``document`` argument is not used here.
     """
     stream = self.filehandle
     if stream is None:
         return None
     current = stream.readline()
     # '' means the end of the input was reached; a blank line would be
     # '\n' (or '\r\n' for a Windows file read on a Unix machine).
     if current == '':
         return None
     skip_blank = self.ignore_empty_lines
     while skip_blank and current in ('\n', '\r\n'):
         current = stream.readline()
         if current == '':
             return None
     tree = Root()
     tree.text = current.rstrip()
     return tree
Ejemplo n.º 4
0
    def tokenize_tag_parse_tree(self,
                                root,
                                resegment=False,
                                tag=True,
                                parse=True):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

        If resegment=True, the returned list of Udapi trees may contain multiple trees.

        Args:
            root: tree root whose `text` is processed. It must not have any
                children yet. It is reused as the root of the first returned tree.
            resegment: if true, every sentence produced by the tokenizer becomes
                its own tree; otherwise all sentences are merged into one.
            tag: if true, `self.tool.tag` is run on every sentence.
            parse: if true, `self.tool.parse` is run on every sentence and the
                created nodes are attached to their heads. Requires tag=True.

        Returns:
            A list of trees: `root` first, then one new `Root()` per further
            sentence. The list is empty if the tokenizer produced no sentence.

        Raises:
            ValueError: if `root` already has children, or if parse=True is
                combined with tag=False.
        """
        if root.children:
            raise ValueError(
                'Tree already contained nodes before tokenization')
        # Reject the unsupported combination before doing any tokenization work.
        if parse and not tag:
            raise ValueError(
                'Combination parse=True tag=False is not allowed.')

        # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
        self.tokenizer.setText(root.text)
        u_sentences = []
        while True:
            u_sentence = Sentence()
            if not self.tokenizer.nextSentence(u_sentence):
                break
            u_sentences.append(u_sentence)

        # If resegmentation was not required, we need to join the segments.
        if not resegment and len(u_sentences) > 1:
            first_sent = u_sentences[0]
            # Index 0 of `words` is skipped everywhere in this method, so it is
            # not counted as a word here either.
            n_words = first_sent.words.size() - 1
            for other_sent in u_sentences[1:]:
                for i in range(1, other_sent.words.size()):
                    u_w = other_sent.words[i]
                    # Continue the numbering of the first sentence.
                    n_words += 1
                    u_w.id = n_words
                    first_sent.words.append(u_w)
            u_sentences = [first_sent]

        # tagging and parsing
        if tag:
            for u_sentence in u_sentences:
                self.tool.tag(u_sentence, Model.DEFAULT)
                if parse:
                    self.tool.parse(u_sentence, Model.DEFAULT)

        # converting UDPipe nodes to Udapi nodes
        trees = []
        for index, u_sentence in enumerate(u_sentences):
            # The caller's root hosts the first sentence; later ones get a new root.
            new_root = root if index == 0 else Root()
            new_root.text = u_sentence.getText() if resegment else root.text
            heads, nodes = [], [new_root]
            u_words = u_sentence.words
            for i in range(1, u_words.size()):
                u_w = u_words[i]
                node = new_root.create_child(
                    form=u_w.form,
                    lemma=u_w.lemma,
                    upos=u_w.upostag,
                    xpos=u_w.xpostag,
                    feats=u_w.feats,
                    deprel=u_w.deprel,
                    misc=u_w.misc,
                )
                if parse:
                    heads.append(u_w.head)
                    nodes.append(node)
            # Heads can point to nodes created later, so parents are assigned
            # only after all nodes exist. Both lists stay empty/one-element
            # when parse is false, which makes this loop a no-op.
            for node, head in zip(nodes[1:], heads):
                node.parent = nodes[head]
            trees.append(new_root)
        return trees