def parse(text, sentence_id):
    """Parse one raw-text sentence with UDPipe and return its CoNLL-U text.

    Parameters:
        text - the sentence to be parsed. The tokenizer is created with
            TOKENIZER_PRESEGMENTED and only the first sentence it yields
            is tagged and parsed.
        sentence_id - the ID of the sentence. It is converted with str()
            before being written into the ``# sent_id`` comment, so
            integer IDs are accepted as well as strings.

    Output:
        the CoNLL-U string produced by the UDPipe "conllu" output format
        for the sentence, with ``# sent_id = 1`` replaced by
        ``# sent_id = <sentence_id>``.
    """
    # NOTE(review): the model is loaded from disk on every call; callers that
    # parse many sentences may want to cache it. The result of Model.load is
    # used unchecked here -- confirm how a failed load should be reported.
    model = Model.load('./models/udpipe/english-ewt-ud-2.3-181115.udpipe')
    tokenizer = model.newTokenizer(model.TOKENIZER_PRESEGMENTED)
    conllu_output = OutputFormat.newOutputFormat("conllu")
    sentence = Sentence()
    error = ProcessingError()

    tokenizer.setText(text)
    tokenizer.nextSentence(sentence, error)
    model.tag(sentence, model.DEFAULT)
    model.parse(sentence, model.DEFAULT)

    conllu = conllu_output.writeSentence(sentence)
    # str() keeps non-string IDs (e.g. ints) from raising TypeError during
    # concatenation; string IDs are unaffected.
    return conllu.replace('# sent_id = 1', '# sent_id = ' + str(sentence_id))
def write(self, sentences: List[Sentence], out_format: str) -> str:
    """Serialize sentences with the UDPipe output format named by out_format.

    sentences: Input ufal.udpipe.Sentence-s, written in iteration order.
    out_format: 'conllu'|'horizontal'|'vertical'.
    RETURNS: The concatenated per-sentence output followed by whatever the
        output format's finishDocument() returns.
    """
    writer = OutputFormat.newOutputFormat(out_format)
    # Render every sentence first; the document terminator is requested
    # only after all sentences have been written and joined.
    body = "".join(map(writer.writeSentence, sentences))
    return body + writer.finishDocument()
def write(self, sentences, out_format):
    """Render the given sentences as text in the requested output format.

    sentences (list): Input ufal.udpipe.Sentence-s, written in order.
    out_format (unicode): One of conllu|horizontal|vertical.
    RETURNS (unicode): The written sentences concatenated together, followed
        by the output format's finishDocument() result.
    """
    formatter = OutputFormat.newOutputFormat(out_format)
    rendered = []
    for sentence in sentences:
        rendered.append(formatter.writeSentence(sentence))
    body = ''.join(rendered)
    # finishDocument() is called last so its text closes the document.
    return body + formatter.finishDocument()
def preproc_item(text):
    """Tokenize and tag ``text`` with UDPipe and return its CoNLL-U output.

    Uses the module-level ``tokenizer`` and ``udpipe_model`` objects; the
    tokenizer's document state is reset at the start of every call.

    text: Raw input text. Values for which ``pd.isna`` is true are treated
        as the empty string.
    RETURNS: The CoNLL-U text of every sentence the tokenizer yields,
        concatenated in order ('' when it yields none).
    RAISES: TypeError if the tokenizer's setText() rejects ``text``; the
        message names the offending value and its type.
    """
    if pd.isna(text):
        text = ''
    tokenizer.resetDocument()
    try:
        tokenizer.setText(text)
    except TypeError as err:
        # Report the bad value directly instead of printing a name that is
        # not defined in this function and crashing via division by zero.
        raise TypeError(
            'preproc_item could not pass {!r} (type {}) to the tokenizer'
            .format(text, type(text).__name__)
        ) from err

    sentence = Sentence()
    error = ProcessingError()
    # One writer serves every sentence.
    # NOTE(review): assumes the CoNLL-U writer keeps no per-sentence state --
    # confirm against the UDPipe API documentation.
    conllu_writer = OutputFormat.newConlluOutputFormat()
    chunks = []
    while tokenizer.nextSentence(sentence, error):
        # Only tagging is run here; dependency parsing is not performed.
        udpipe_model.tag(sentence, Pipeline.DEFAULT, error)
        chunks.append(conllu_writer.writeSentence(sentence))
    return ''.join(chunks)