Esempio n. 1
0
 def put_sent(self, orig_sent: Sent, nlp_sent):
     """Copy the per-word analysis held by ``nlp_sent`` onto ``orig_sent``.

     Word forms are always stored via ``build_words``. UPOS tags, lemmas and
     the dependency tree are stored only when ``self.pred_upos``,
     ``self.pred_lemma`` and ``self.pred_dep`` are set, respectively. Character
     positions are stored on a best-effort basis: if any word cannot be
     located in the sentence text, no positions are stored at all.

     Args:
         orig_sent: Target sentence; must provide ``get_text()`` and the
             ``build_*`` methods called below.
         nlp_sent: Parsed sentence whose ``words`` each expose ``text``,
             ``upos``, ``lemma``, ``head``, ``deprel`` and ``parent`` (a token
             with ``text``, ``start_char`` and ``end_char``).
             NOTE(review): presumably a Stanza sentence -- confirm.
     """
     text = orig_sent.get_text()
     words = list(nlp_sent.words)
     # Locate every word's token inside the sentence text, scanning left to
     # right so repeated token strings resolve to successive occurrences.
     positions = []
     search_from = 0
     prev_token = None
     try:
         for w in words:
             token = w.parent
             if prev_token is not None and token is prev_token:
                 # Multi-word token: this word shares the previous word's
                 # token, so it shares that span instead of being searched
                 # for again past the token's end.
                 positions.append(positions[-1])
                 continue
             start = text.index(token.text, search_from)  # idx inside the sentence
             span = (start, token.end_char - token.start_char)  # [widx, wlen]
             positions.append(span)
             search_from = start + span[1]  # next search starts after this token
             prev_token = token
     except (AttributeError, TypeError, ValueError):
         # Missing token / missing offsets / text not found: positions are
         # all-or-nothing, so drop them for the whole sentence.
         positions = None
     # add them
     orig_sent.build_words([w.text for w in words])
     if self.pred_upos:
         orig_sent.build_uposes([w.upos for w in words])
     if self.pred_lemma:
         orig_sent.build_lemmas([w.lemma for w in words])
     if self.pred_dep:
         orig_sent.build_dep_tree([w.head for w in words], [w.deprel for w in words])
     if positions is not None:
         orig_sent.build_word_positions(positions)
Esempio n. 2
0
 def to_obj(self, inst: Sent) -> str:
     """Render ``inst`` as a single string.

     When ``self.do_tok_sep`` is falsy the sentence's own text
     (``inst.get_text()``) is returned unchanged. Otherwise the values in
     ``inst.seq_word.vals`` are joined with ``self.tok_sep``, falling back to
     a single space when ``self.tok_sep`` is ``None``.
     """
     # Guard clause: no token separation requested, use the stored text.
     if not self.do_tok_sep:
         return inst.get_text()
     joiner = self.tok_sep
     if joiner is None:
         joiner = " "  # default separator
     return joiner.join(inst.seq_word.vals)