def testComputeNotEqual(self):
  """Smoothed BLEU of fully disjoint corpora is small but strictly positive."""
  hypotheses = [[1, 2, 3, 4]]
  references = [[5, 6, 7, 8]]
  score = bleu_hook.compute_bleu(references, hypotheses)
  # Smoothing keeps the score above zero even with no n-gram overlap.
  expected = 0.0798679
  self.assertAllClose(score, expected, atol=1e-03)
def testBLEU():
  """Smoke check: prints the BLEU score for one hypothesis/reference pair.

  NOTE: takes no ``self`` in the original, so it is a plain function rather
  than a unittest method; behavior (printing, no assertion) is preserved.
  """
  hypothesis_corpus = [['how', 'do', 'i', 'learn', 'linux']]
  reference_corpus = [[
      'how', 'do', 'i', 'learn', 'networking', 'with', 'linux'
  ]]
  print(bleu_hook.compute_bleu(reference_corpus, hypothesis_corpus))
def t2t_bleu(targets, predictions):
  """Computes corpus BLEU using the t2t library's tokenizer and scorer.

  Both sides are tokenized with ``bleu_hook.bleu_tokenize`` and then scored
  with ``bleu_hook.compute_bleu``.

  Args:
    targets: a list of strings, the targets from the validation set.
    predictions: a list of strings, the model predictions.

  Returns:
    A dictionary: {"t2t_bleu": bleu_value}, with the score scaled by 100.
  """
  tokenize = bleu_hook.bleu_tokenize
  tokenized_targets = [tokenize(target) for target in targets]
  tokenized_predictions = [tokenize(prediction) for prediction in predictions]
  score = bleu_hook.compute_bleu(tokenized_targets, tokenized_predictions)
  return {"t2t_bleu": 100 * score}
def eval_metrics(self):
  """Builds the dictionary of metric functions used during evaluation.

  Returns:
    A dict mapping metric names to metric callables. Several entries wrap a
    plain Python predicate with `make_pyfunc_metric_fn` (defined elsewhere in
    this file); `accuracy` and `grammar_accuracy` are also defined elsewhere.
  """

  def get_tokens(program):
    # Maps every token id in the program's first column to its string form.
    # NOTE(review): assumes `program` is a 2-D array-like indexed as
    # program[:, 0] — confirm against the caller.
    return [self.tokens[x] for x in program[:, 0].flatten().tolist()]

  def get_functions(program):
    # Token ids whose string form starts with '@' are treated as functions.
    return [x for x in program[:, 0] if self.tokens[x].startswith('@')]

  def get_devices(program):
    # Device name = everything before the last '.' of a function token,
    # e.g. '@device.method' -> '@device'.
    return [
        self.tokens[x].rsplit('.', 1)[0]
        for x in program[:, 0]
        if self.tokens[x].startswith('@')
    ]

  def accuracy_without_parameters(predictions, labels, features):
    # Exact-match accuracy on the first element of each time step only,
    # i.e. ignoring the parameter slots (elements 1 and 2 of each triple).
    batch_size, predictions, labels = adjust_predictions_labels(
        predictions, labels, num_elements_per_time=3)
    # Uniform weights: every example counts equally.
    weights = tf.ones((batch_size, ), dtype=tf.float32)
    # 1.0 where the whole first-column sequence matches, else 0.0.
    ok = tf.to_float(
        tf.reduce_all(tf.equal(predictions[:, :, 0], labels[:, :, 0]), axis=1))
    return ok, weights

  return {
      "accuracy": accuracy,
      "grammar_accuracy": grammar_accuracy,
      # Exact match on the set/sequence of devices mentioned.
      "device_accuracy": make_pyfunc_metric_fn(
          lambda pred, label: get_devices(pred) == get_devices(label)),
      # Exact match on the sequence of function tokens.
      "function_accuracy": make_pyfunc_metric_fn(
          lambda pred, label: get_functions(pred) == get_functions(label)),
      "accuracy_without_parameters": accuracy_without_parameters,
      # BLEU between the detokenized prediction and label token strings.
      "bleu_score": make_pyfunc_metric_fn(
          lambda pred, label: compute_bleu(
              [get_tokens(pred)], [get_tokens(label)])),
      # Weaker signal: only the *count* of function tokens must match.
      "num_function_accuracy": make_pyfunc_metric_fn(
          lambda pred, label: len(get_functions(pred)) == len(
              get_functions(label))),
      # Token-level F1 between prediction and label token strings.
      "token_f1_accuracy": make_pyfunc_metric_fn(
          lambda pred, label: compute_f1_score(
              get_tokens(pred), get_tokens(label)))
  }
def testComputeMultipleNgrams(self):
  """BLEU over a two-sentence corpus with partial n-gram overlap."""
  references = [[1, 2, 1, 13], [12, 6, 7, 4, 8, 9, 10]]
  hypotheses = [[1, 2, 1, 3], [5, 6, 7, 4]]
  score = bleu_hook.compute_bleu(references, hypotheses)
  expected = 0.3436
  self.assertAllClose(score, expected, atol=1e-03)
def testComputeMultipleBatch(self):
  """BLEU over a two-sentence batch where one token differs."""
  hypotheses = [[1, 2, 3, 4], [5, 6, 7, 0]]
  references = [[1, 2, 3, 4], [5, 6, 7, 10]]
  score = bleu_hook.compute_bleu(references, hypotheses)
  expected = 0.7231
  self.assertAllClose(score, expected, atol=1e-03)
def testComputeBleuEqual(self):
  """Identical hypothesis and reference corpora score a perfect 1.0."""
  hypotheses = [[1, 2, 3]]
  references = [[1, 2, 3]]
  score = bleu_hook.compute_bleu(references, hypotheses)
  self.assertEqual(score, 1.0)
# NOTE(review): duplicate of testComputeNotEqual defined earlier in this file.
# If both live in the same class, this later definition shadows the earlier
# one, and the two expect contradictory results (0.0 here vs the smoothed
# 0.0798679 above). Confirm which expectation matches the current
# bleu_hook.compute_bleu and remove/rename the stale copy.
def testComputeNotEqual(self):
  """Expects BLEU of 0.0 for fully disjoint corpora (no smoothing)."""
  translation_corpus = [[1, 2, 3, 4]]
  reference_corpus = [[5, 6, 7, 8]]
  bleu = bleu_hook.compute_bleu(reference_corpus, translation_corpus)
  actual_bleu = 0.0
  self.assertEqual(bleu, actual_bleu)
def evaluate(reference_corpus, translation_corpus):
  """Returns the corpus BLEU of translation_corpus against reference_corpus."""
  return bleu_hook.compute_bleu(reference_corpus, translation_corpus)