Exemple #1
0
def parse(text, sentence_id):
    """Parse one raw-text sentence with UDPipe and return its CoNLL-U text.

    The model is loaded from a fixed relative path on every call. The text
    is fed to a tokenizer created with the TOKENIZER_PRESEGMENTED option,
    and only the first sentence the tokenizer yields is tagged, parsed and
    serialized.

    Parameters: text - the sentence to be parsed
                sentence_id - the ID to record in the ``# sent_id`` comment;
                              converted with ``str()``, so integers are
                              accepted as well as strings

    Output: the CoNLL-U serialization of the sentence (a string), with the
            ``# sent_id = 1`` comment rewritten to carry ``sentence_id``
    """
    model = Model.load('./models/udpipe/english-ewt-ud-2.3-181115.udpipe')
    tokenizer = model.newTokenizer(model.TOKENIZER_PRESEGMENTED)
    conllu_output = OutputFormat.newOutputFormat("conllu")

    sentence = Sentence()
    error = ProcessingError()

    tokenizer.setText(text)
    # NOTE(review): the return value and `error` are not inspected, so a
    # tokenizer failure is not reported here -- confirm this is intended.
    tokenizer.nextSentence(sentence, error)

    model.tag(sentence, model.DEFAULT)
    model.parse(sentence, model.DEFAULT)

    conllu = conllu_output.writeSentence(sentence)

    # Replace only the first match so that a sentence whose own text happens
    # to contain "# sent_id = 1" is not altered as well (the generated
    # sent_id comment is expected to come before the text comment).
    # str() lets callers pass a numeric ID without a TypeError.
    return conllu.replace('# sent_id = 1', '# sent_id = ' + str(sentence_id), 1)
Exemple #2
0
    def write(self, sentences: List[Sentence], out_format: str) -> str:
        """Serialize the given sentences using the requested output format.

        sentences: Input ufal.udpipe.Sentence-s.
        out_format: 'conllu'|'horizontal'|'vertical'.
        RETURNS: The serialized sentences followed by the format's
            document-finishing text, as one string.
        """
        writer = OutputFormat.newOutputFormat(out_format)

        pieces = []
        for sentence in sentences:
            pieces.append(writer.writeSentence(sentence))
        # The document trailer is requested only after every sentence
        # has been written.
        pieces.append(writer.finishDocument())

        return "".join(pieces)
Exemple #3
0
    def write(self, sentences, out_format):
        """Render the given sentences in the requested output format.

        sentences (list): Input ufal.udpipe.Sentence-s.
        out_format (unicode): One of conllu|horizontal|vertical.
        RETURNS (unicode): All sentences serialized in order, followed by
            whatever the format emits when the document is finished.
        """
        formatter = OutputFormat.newOutputFormat(out_format)
        # Serialize every sentence first; the document is finished afterwards.
        rendered = list(map(formatter.writeSentence, sentences))

        return ''.join(rendered) + formatter.finishDocument()
Exemple #4
0
        def preproc_item(text):
            """Tokenize and tag one text item, returning its CoNLL-U output.

            A missing value (as judged by ``pd.isna``) is treated as an
            empty string. The text is fed to ``tokenizer``; every sentence
            it yields is tagged with ``udpipe_model`` and serialized as
            CoNLL-U. Dependency parsing is not run.

            ``tokenizer``, ``udpipe_model`` and ``row`` are not defined in
            this function; presumably they come from the enclosing scope.

            text: The raw text to process, or a missing value.
            RETURNS: The concatenated CoNLL-U text of all sentences, or ''
                when the tokenizer yields none.
            """
            if pd.isna(text):
                text = ''
            tokenizer.resetDocument()
            try:
                tokenizer.setText(text)
            except TypeError:
                # NOTE(review): this prints the offending input and then
                # aborts via a division by zero. It is kept unchanged so the
                # exception seen by callers stays the same; consider
                # re-raising the TypeError instead.
                print(row, text)
                1/0

            sentence = Sentence()
            error = ProcessingError()
            # One writer is enough for all sentences; there is no need to
            # build a new output format on every loop iteration.
            conllu_writer = OutputFormat.newConlluOutputFormat()

            chunks = []
            while tokenizer.nextSentence(sentence, error):
                udpipe_model.tag(sentence, Pipeline.DEFAULT, error)
                chunks.append(conllu_writer.writeSentence(sentence))

            return ''.join(chunks)