def create_features(self, sentences):
    """
    Collect training instances by replaying the oracle over each sentence.

    Every sentence is parsed from a fresh state; at each step the oracle
    transition is queried and the features of the current state are
    recorded together with that transition before it is applied.
    @param sentences: the sentences to build instances from.
    @return: list(features), list(action), one entry per parser state
        visited while parsing each sentence
    """
    features, actions = [], []
    with tqdm(total=len(sentences)) as progress:
        for sentence in sentences:
            # arcs = [(head, dependent, deprel)]
            state = ParserState([self.root_token], sentence, [])  # FIXME
            while True:
                # finished once the buffer is empty and at most one item
                # is left on the stack
                if not state.buffer and len(state.stack) <= 1:
                    break
                oracle_action = state.get_oracle()
                if oracle_action is None:
                    # the oracle has no transition for this state
                    break
                # features must be taken before the state is advanced
                features.append(state.extract_features(self))
                actions.append(oracle_action)
                state.step(oracle_action)  # perform transition
            progress.update(1)
    return features, actions
def parse(self, sentences, model, conllu=False):
    """
    Parse sentences with a trained model and score the predicted arcs.

    Each sentence is parsed from a fresh state: at every step the
    transition is chosen as model.predict([feats])[0].argmax() and applied
    until the buffer is empty and at most one item is left on the stack,
    or until a transition is rejected by the state.
    @param sentences: a list of (list Token).
    @param model: a trained parser model.
    @param conllu: if True prints the parsed sentences in CoNLL-U format,
        otherwise prints one '.' per sentence as progress.
    @return: (UAS, LAS) as fractions of the predicted arcs whose head
        (resp. head and label) match the gold annotation; (0.0, 0.0) when
        no arc was produced at all.
    """
    vsentences = self.vectorize(sentences)
    UAS = LAS = all_tokens = 0.0
    for sent, vsent in zip(sentences, vsentences):
        if not conllu:
            print('.', end='')  # show progress
        state = ParserState([self.root_token], vsent, [])  # FIXME
        while state.buffer or len(state.stack) > 1:
            feats = state.extract_features(self)
            trans = model.predict([feats])[0].argmax()
            if not state.step(trans):
                break  # if transition is not feasible
        if conllu:
            for j, t in enumerate(sent):
                # First arc whose dependent is this token; tokens left
                # without an arc are printed with head 0 and label id 0.
                arc = next((a for a in state.arcs if a[1].id == t.id), None)
                if arc is None:
                    head = deprel = 0
                else:
                    head, deprel = arc[0].id, arc[2]
                print('\t'.join([
                    str(j + 1), t.form, '_', t.pos, '_', '_',
                    str(head), self.id2dep[deprel], '_', '_'
                ]))
            print()
        # arcs are (head, dependent, deprel); gold values are read from
        # the dependent token.
        # NOTE(review): the denominator counts predicted arcs, so tokens
        # left unattached after an aborted parse are not scored - confirm
        # this is intended.
        for arc in state.arcs:
            pred_h = arc[0].id
            gold_h = arc[1].head
            UAS += pred_h == gold_h
            pred_l = arc[2]
            gold_l = arc[1].deprel
            LAS += pred_h == gold_h and pred_l == gold_l
            all_tokens += 1
    if not all_tokens:
        # Nothing was scored (no sentences, or no arcs predicted): avoid
        # dividing by zero.
        return 0.0, 0.0
    return UAS / all_tokens, LAS / all_tokens