def wrapper(obj, *args, **kwargs):
  try:
    return f(obj, *args, **kwargs), f.__name__
  except Exception:
    logger.error("Error handling xnmt event at object: %s", obj.__class__.__name__)
    raise
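# Usage sketch (not part of the original source; names are illustrative).
# `wrapper` above is the inner function of a decorator closure, so `f` is the
# closed-over handler. A minimal decorator built around it might look like:
import functools

def on_xnmt_event(f):  # hypothetical decorator name, for illustration only
  @functools.wraps(f)
  def wrapper(obj, *args, **kwargs):
    try:
      return f(obj, *args, **kwargs), f.__name__
    except Exception:
      logger.error("Error handling xnmt event at object: %s", obj.__class__.__name__)
      raise
  return wrapper

# class MyComponent(object):
#   @on_xnmt_event
#   def on_start_sent(self, src):
#     ...  # any exception here is logged with the owning object's class name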
def read_sents(self, filename: str, filter_ids: Sequence[numbers.Integral] = None):
  # Routine to build a tree from the accumulated lines of one sentence
  def emit_tree(idx, lines):
    nodes = {}
    edge_list = []
    for node_id, form, lemma, pos, feat, head, deprel in lines:
      nodes[node_id] = sent.SyntaxTreeNode(node_id=node_id, value=form, head=pos)
    for node_id, form, lemma, pos, feat, head, deprel in lines:
      if head != 0 and deprel != "ROOT":
        edge_list.append(HyperEdge(head, [node_id], None, deprel))
    return sent.RNNGSequenceSentence(idx,
                                     HyperGraph(edge_list, nodes),
                                     self.surface_vocab,
                                     self.nt_vocab,
                                     all_surfaces=True)
  idx = 0
  lines = []
  # Loop over all lines in the file; sentences are separated by blank lines
  with open(filename) as fp:
    for line in fp:
      line = line.strip()
      if len(line) == 0:
        yield emit_tree(idx, lines)
        lines.clear()
        idx += 1
      else:
        try:
          node_id, form, lemma, pos, feat, head, deprel = line.split()
          lines.append((int(node_id), form, lemma, pos, feat, int(head), deprel))
        except ValueError:
          logger.error("Bad line: %s", line)
  if len(lines) != 0:
    yield emit_tree(idx, lines)
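# Input sketch (assumption: a 7-column, whitespace-separated CoNLL-like file
# with blank lines separating sentences; `reader` stands for an instance of
# the class defining read_sents above). A consumable file might look like:
#
#   1 The    the   DT  _ 2 det
#   2 cat    cat   NN  _ 3 nsubj
#   3 sleeps sleep VBZ _ 0 ROOT
#
# Columns are (node_id, form, lemma, pos, feat, head, deprel); a head of 0
# with deprel "ROOT" marks the root, which contributes no edge.
# for tree_sent in reader.read_sents("example.conll"):
#   print(tree_sent)  # one RNNGSequenceSentence per blank-line-separated block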
def evaluate(self, ref, hyp, desc=None):
  try:
    from xnmt.cython import xnmt_cython
  except ImportError:
    logger.error("BLEU evaluate fast requires the xnmt cython installation step. "
                 "Please check the documentation.")
    raise
  return xnmt_cython.bleu_sentence(self.ngram, self.smooth, ref, hyp)
def evaluate_one_sent(self, ref, hyp):
  try:
    from xnmt.cython import xnmt_cython
  except ImportError:
    logger.error("BLEU evaluate fast requires the xnmt cython installation step. "
                 "Please check the documentation.")
    raise
  if len(ref) == 0 or len(hyp) == 0:
    return 0
  return xnmt_cython.bleu_sentence(self.ngram, self.smooth, ref, hyp)
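# Usage sketch (assumption: ref and hyp are tokenized sequences and the
# evaluator is constructed with an n-gram order and smoothing value; the
# class name below is hypothetical).
# evaluator = FastBLEUEvaluator(ngram=4, smooth=1)
# score = evaluator.evaluate_one_sent(["the", "cat", "sleeps"],
#                                     ["the", "cat", "sleeps"])
# Empty ref or hyp short-circuits to 0 before the cython call, so
# bleu_sentence never sees a zero-length sentence.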
def read_sents(self, filename: str, filter_ids: Sequence[numbers.Integral] = None):
  # Routine to build a tree from the accumulated lines of one sentence
  def emit_tree(idx, lines):
    nodes = {}
    edge_list = []
    max_node = -1
    for node_id, form, lemma, pos, feat, head, deprel in lines:
      nodes[node_id] = sent.SyntaxTreeNode(node_id=node_id, value=form, head=pos)
      max_node = max(max_node, node_id)
    # Append an artificial end-of-sequence node after the last token
    nodes[max_node + 1] = sent.SyntaxTreeNode(node_id=max_node + 1,
                                              value=vocabs.Vocab.ES_STR,
                                              head=vocabs.Vocab.ES_STR)
    root = -1
    for node_id, form, lemma, pos, feat, head, deprel in lines:
      if head == 0:
        root = node_id
      else:
        edge_list.append(HyperEdge(head, [node_id], None, deprel))
    # Attach the end-of-sequence node under the root
    edge_list.append(HyperEdge(root, [max_node + 1], None, vocabs.Vocab.ES_STR))
    return sent.DepTreeRNNGSequenceSentence(idx,
                                            score=None,
                                            graph=HyperGraph(edge_list, nodes),
                                            surface_vocab=self.value_vocab,
                                            nt_vocab=self.node_vocab,
                                            edge_vocab=self.edge_vocab,
                                            all_surfaces=True,
                                            output_procs=self.output_procs)
  idx = 0
  lines = []
  # Loop over all lines in the file; sentences are separated by (near-)empty lines
  with open(filename) as fp:
    for line in fp:
      line = line.strip()
      if len(line) <= 1:
        yield emit_tree(idx, lines)
        lines.clear()
        idx += 1
      else:
        try:
          node_id, form, lemma, pos, feat, head, deprel = line.split("\t")
          lines.append((int(node_id), form, lemma, pos, feat, int(head), deprel))
        except ValueError:
          logger.error("Bad line: %s", line)
          raise
  if len(lines) != 0:
    yield emit_tree(idx, lines)
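# Structure sketch (derived from emit_tree above): for the two-token input
#   1<TAB>hello<TAB>hello<TAB>UH<TAB>_<TAB>0<TAB>ROOT
#   2<TAB>world<TAB>world<TAB>NN<TAB>_<TAB>1<TAB>obj
# emit_tree builds nodes {1, 2, 3}, where node 3 is the artificial
# end-of-sequence node (value vocabs.Vocab.ES_STR), and the edges
#   HyperEdge(1, [2], None, "obj")                 # head 1 -> dependent 2
#   HyperEdge(1, [3], None, vocabs.Vocab.ES_STR)   # root -> end-of-sequence
# The ROOT row itself (head == 0) adds no edge; it only selects `root`.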
def sparse_to_dense(self, actions, length):
  try:
    from xnmt.cython import xnmt_cython
  except ImportError:
    logger.error("sparse_to_dense requires the xnmt cython installation step. "
                 "Please check the documentation.")
    raise RuntimeError("xnmt cython extension not available")
  batch_dense = []
  for batch_action in actions:
    batch_dense.append(xnmt_cython.dense_from_sparse(batch_action, length))
  return np.array(batch_dense).transpose()
def sparse_to_dense(self, actions, length):
  try:
    from xnmt.cython import xnmt_cython
  except ImportError:
    logger.error("sparse_to_dense requires the xnmt cython installation step. "
                 "Please check the documentation.")
    raise RuntimeError("xnmt cython extension not available")
  dense_actions = []
  for sample_actions in actions:
    batch_dense = []
    for batch_action in sample_actions:
      batch_dense.append(xnmt_cython.dense_from_sparse(batch_action, length))
    dense_actions.append(batch_dense)
  arr = np.array(dense_actions)  # (sample, batch, length)
  return np.rollaxis(arr, 2)     # -> (length, sample, batch)
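# Semantics sketch (an assumption inferred from the name and call site of
# xnmt_cython.dense_from_sparse, not a documented contract): it appears to
# expand a list of action positions into a 0/1 vector of the given length.
# A pure-Python reference under that assumption:
def dense_from_sparse_py(sparse_indices, length):
  dense = [0] * length
  for i in sparse_indices:
    dense[i] = 1
  return dense

# dense_from_sparse_py([0, 3], 5) == [1, 0, 0, 1, 0]
# Under this reading, the method above maps nested action lists of shape
# (sample, batch, *) to a dense array of shape (length, sample, batch).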
def __exit__(self, et, ev, traceback):
  if et is not None:  # an exception occurred
    logger.error("------ Error Report ------")
    for key, val in self.args.items():
      logger.error(f"*** {key} ***")
      if callable(val):
        val()
      else:
        logger.error(str(val))
def __exit__(self, et, ev, traceback):
  if et is not None:  # an exception occurred
    logger.error("------ Error Report ------")
    for key, val in self.args.items():
      logger.error(f"*** {key} ***")
      if callable(val):
        val()
      elif batchers.is_batched(val):
        for sent in val:
          if hasattr(sent, "idx"):
            logger.error("{:>10}. {}".format(sent.idx, str(sent)[:100]))
          else:
            logger.error(str(sent))
      else:
        logger.error(str(val))
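# Usage sketch (assumption: the enclosing class is a context manager whose
# constructor stores keyword arguments in self.args; the name below is
# hypothetical). On an exception, callables are invoked so they can log
# lazily, batched values are reported sentence by sentence (truncated to
# 100 characters), and anything else is logged via str().
# with ErrorReport(src=src_batch, trg=trg_batch,
#                  state=lambda: logger.error(repr(model_state))):
#   loss = model.calc_loss(src_batch, trg_batch)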
def __call__(self, generator, src_file=None, trg_file=None, candidate_id_file=None):
  """
  Args:
    generator (GeneratorModel): the model to be used
    src_file (str): path of the input src file to be translated
    trg_file (str): path of the file where trg translations will be written
    candidate_id_file (str): if we are doing something like retrieval where we
      select from fixed candidates, sometimes we want to limit our candidates
      to a certain subset of the full set. This setting allows us to do this.
  """
  args = dict(src_file=src_file or self.src_file,
              trg_file=trg_file or self.trg_file,
              ref_file=self.ref_file,
              max_src_len=self.max_src_len,
              post_process=self.post_process,
              candidate_id_file=candidate_id_file,
              report_path=self.report_path,
              report_type=self.report_type,
              beam=self.beam,
              max_len=self.max_len,
              len_norm_type=self.len_norm_type,
              mode=self.mode)
  is_reporting = issubclass(generator.__class__, Reportable) \
                 and args["report_path"] is not None
  # Corpus
  src_corpus = list(generator.src_reader.read_sents(args["src_file"]))
  # Get the reference if it exists and is necessary
  if args["mode"] in ("forced", "forceddebug", "score"):
    if args["ref_file"] is None:
      raise RuntimeError("When performing {} decoding, must specify reference file"
                         .format(args["mode"]))
    score_src_corpus = []
    ref_corpus = []
    with open(args["ref_file"], "r", encoding="utf-8") as fp:
      for line in fp:
        if args["mode"] == "score":
          nbest = line.split("|||")
          assert len(nbest) > 1, \
            "When performing scoring, ref_file must have nbest format 'index ||| hypothesis'"
          src_index = int(nbest[0].strip())
          assert src_index < len(src_corpus), \
            "The src_file has only {} instances, nbest file has invalid src_index {}" \
            .format(len(src_corpus), src_index)
          score_src_corpus.append(src_corpus[src_index])
          trg_input = generator.trg_reader.read_sent(nbest[1].strip())
        else:
          trg_input = generator.trg_reader.read_sent(line)
        ref_corpus.append(trg_input)
    if args["mode"] == "score":
      src_corpus = score_src_corpus
    else:
      if self.max_len and any(len(s) > self.max_len for s in ref_corpus):
        logger.warning("Forced decoding with some targets being longer than max_len. "
                       "Increase max_len to avoid unexpected behavior.")
  else:
    ref_corpus = None
  # Vocab
  src_vocab = generator.src_reader.vocab if hasattr(generator.src_reader, "vocab") else None
  trg_vocab = generator.trg_reader.vocab if hasattr(generator.trg_reader, "vocab") else None
  # Perform initialization
  generator.set_train(False)
  generator.initialize_generator(**args)
  if hasattr(generator, "set_post_processor"):
    generator.set_post_processor(self.get_output_processor())
  if hasattr(generator, "set_trg_vocab"):
    generator.set_trg_vocab(trg_vocab)
  if hasattr(generator, "set_reporting_src_vocab"):
    generator.set_reporting_src_vocab(src_vocab)
  if is_reporting:
    generator.set_report_resource("src_vocab", src_vocab)
    generator.set_report_resource("trg_vocab", trg_vocab)
  # If we're debugging, calculate the loss for each target sentence
  ref_scores = None
  if args["mode"] == 'forceddebug' or args["mode"] == 'score':
    some_batcher = xnmt.batcher.InOrderBatcher(32)  # Arbitrary batch size
    if not isinstance(some_batcher, xnmt.batcher.InOrderBatcher):
      raise ValueError(f"forceddebug requires InOrderBatcher, got: {some_batcher}")
    batched_src, batched_ref = some_batcher.pack(src_corpus, ref_corpus)
    ref_scores = []
    for src, ref in zip(batched_src, batched_ref):
      dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE,
                  check_validity=settings.CHECK_VALIDITY)
      loss_expr = generator.calc_loss(src, ref, loss_calculator=LossCalculator())
      if isinstance(loss_expr.value(), Iterable):
        ref_scores.extend(loss_expr.value())
      else:
        ref_scores.append(loss_expr.value())
    ref_scores = [-x for x in ref_scores]
  # Make the parent directory if necessary
  make_parent_dir(args["trg_file"])
  # Perform generation of output
  if args["mode"] != 'score':
    # Save the translated output to the trg file
    with open(args["trg_file"], 'wt', encoding='utf-8') as fp:
      src_ret = []
      for i, src in enumerate(src_corpus):
        # This is necessary when the batcher does some sort of pre-processing,
        # e.g. when the batcher pads to a particular number of dimensions
        if self.batcher:
          self.batcher.add_single_batch(src_curr=[src], trg_curr=None,
                                        src_ret=src_ret, trg_ret=None)
          src = src_ret.pop()[0]
        # Do the decoding
        if args["max_src_len"] is not None and len(src) > args["max_src_len"]:
          output_txt = NO_DECODING_ATTEMPTED
        else:
          dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE,
                      check_validity=settings.CHECK_VALIDITY)
          ref_ids = ref_corpus[i] if ref_corpus is not None else None
          output = generator.generate_output(src, i, forced_trg_ids=ref_ids)
          # If debugging forced decoding, make sure it matches
          if ref_scores is not None and \
             (abs(output[0].score - ref_scores[i]) / abs(ref_scores[i])) > 1e-5:
            logger.error(f'Forced decoding score {output[0].score} and loss '
                         f'{ref_scores[i]} do not match at sentence {i}')
          output_txt = output[0].plaintext
        # Print to the trg file
        fp.write(f"{output_txt}\n")
  else:
    with open(args["trg_file"], 'wt', encoding='utf-8') as fp:
      with open(args["ref_file"], "r", encoding="utf-8") as nbest_fp:
        for nbest, score in zip(nbest_fp, ref_scores):
          fp.write("{} ||| score={}\n".format(nbest.strip(), score))
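# Format sketch (grounded in the parsing code above): in "score" mode,
# ref_file must be an nbest list with one 'index ||| hypothesis' pair per
# line, where the index points into src_corpus:
#   0 ||| the cat sleeps
#   0 ||| a cat is sleeping
#   1 ||| hello world
# The output file mirrors each input line with the model score appended,
# e.g. '0 ||| the cat sleeps ||| score=-3.21' (the number shown here is
# made up; real values come from generator.calc_loss).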