def test_case_where_n_is_bigger_than_hypothesis_length(self):
    # Test BLEU to nth order of n-grams, where n > len(hypothesis).
    references = ['John loves Mary ?'.split()]
    hypothesis = 'John loves Mary'.split()
    n = len(hypothesis) + 1
    weights = [1.0 / n] * n  # Uniform weights.
    self.assertAlmostEqual(
        sentence_bleu(references, hypothesis, weights), 0.7165, places=4)

    # Test case where n > len(hypothesis) and also n > len(reference), and
    # it's the special case where reference == hypothesis.
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary'.split()
    assert sentence_bleu(references, hypothesis, weights) == 1.0
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate a BLEU score.

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query (hypothesis) tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: option to re-normalize the weights uniformly
        penalty: whether to apply the brevity penalty

    Return:
        BLEU score
    """
    bleu_measure = sentence_bleu([y_true], y_predicted, weights,
                                 smoothing_function, auto_reweigh)
    hyp_len = len(y_predicted)
    hyp_lengths = hyp_len
    ref_lengths = closest_ref_length([y_true], hyp_len)
    bpenalty = brevity_penalty(ref_lengths, hyp_lengths)
    if penalty is True or bpenalty == 0:
        return bleu_measure
    # When the penalty is disabled, divide the brevity penalty back out.
    return bleu_measure / bpenalty
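# A minimal usage sketch for bleu_advanced, assuming the NLTK helpers it relies
# on are imported as below. The SMOOTH binding is an assumption (the original
# import is not shown); in a real module it must be defined before the function
# definition above, since it appears in a default argument.
from typing import Any, List, Tuple

from nltk.translate.bleu_score import (SmoothingFunction, brevity_penalty,
                                       closest_ref_length, sentence_bleu)

SMOOTH = SmoothingFunction()  # assumed name; not shown in the original snippet

ref = 'John loves Mary and Mike'.split()
hyp = 'John loves Mary'.split()
print(bleu_advanced(ref, hyp, weights=(0.5, 0.5)))                 # with brevity penalty
print(bleu_advanced(ref, hyp, weights=(0.5, 0.5), penalty=False))  # penalty divided back out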
def calc_test_bleu_and_loss(sess, epoch):
    test_feed_generator = get_batch(test_vect_eng_sentences, test_decoder_input_data,
                                    test_decoder_target_data, batch_size)
    number_of_batches_in_test = int(len(test_vect_eng_sentences) / batch_size)

    # Calculate the BLEU of the translations of the test data.
    bleu_scores = []
    average_loss = 0
    for i in tqdm(range(number_of_batches_in_test), desc="test metrics"):
        fd = next(test_feed_generator)
        predict_, loss_ = sess.run([decoder_prediction, loss], fd)
        # Use `j` for the inner loop so it does not shadow the outer `i`.
        for j, (inp, pred, exp) in enumerate(zip(fd[encoder_inputs].T, predict_.T,
                                                 fd[decoder_targets].T)):
            input_sentence = decode_sequence(inp[::-1], rev_eng_vocab)
            output_sentence = decode_sequence(pred, rev_heb_vocab)
            expected_sentence = decode_sequence(exp, rev_heb_vocab)
            # Note: the prediction is passed as the reference and the expected
            # sentence as the hypothesis, following the original code.
            score = sentence_bleu([decode_sequence(pred, rev_heb_vocab, False)],
                                  decode_sequence(exp, rev_heb_vocab, False),
                                  smoothing_function=chencherry.method1)
            bleu_scores.append(score)
        average_loss += (loss_ / number_of_batches_in_test)

    train_writer.add_summary(
        tf.Summary(value=[tf.Summary.Value(tag="test_loss", simple_value=average_loss)]),
        epoch)
    train_writer.add_summary(
        tf.Summary(value=[tf.Summary.Value(tag="test_bleu", simple_value=np.mean(bleu_scores))]),
        epoch)
def main():
    """
    bleu function parameters:
    bleu(candidate, references, weights)

    :param candidate: a candidate sentence
    :type candidate: list(str)
    :param references: reference sentences
    :type references: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    """
    # Command line argument checking.
    if len(sys.argv) != 3:
        sys.exit("ERROR: Invalid number of arguments, expecting 2")

    # Read the files: the candidate into cRaw and the reference into rRaw.
    with open(sys.argv[1]) as cFile, open(sys.argv[2]) as rFile:
        cRaw = cFile.read()
        rRaw = rFile.read()

    # Then tokenize them both.
    cToken = word_tokenize(cRaw)
    rToken = word_tokenize(rRaw)

    # Finally compute the BLEU score.
    bleuSc = bleu_score.sentence_bleu([rToken], cToken)
    print(bleuSc)
def main():
    """
    bleu function parameters:
    bleu(candidate, references, weights)

    :param candidate: a candidate sentence
    :type candidate: list(str)
    :param references: reference sentences
    :type references: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    """
    # First define some test strings to work with.
    refTextRaw = "This is the story of a man who fell from the fiftieth story of a building. While he fell, he reassured himself by repeating, 'So far, so good. So far, so good. So far, so good'. But, the important thing is not the fall - only the landing."
    candidateTextRaw = "This is the story of a man who fell from the 50th floor of a block. To reassure himself while he fell, he repeated, 'So far, so good. So far, so good. So far, so good'. However, the important thing is not the fall. Only the landing."
    refTextTokens = word_tokenize(refTextRaw)
    candidateTextTokens = word_tokenize(candidateTextRaw)

    # The classic examples from the Papineni et al. (2002) BLEU paper
    # (currently unused below).
    candidate1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
                  'that', 'the', 'military', 'always', 'obeys', 'the',
                  'commands', 'of', 'the', 'party']
    candidate2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 'forever',
                  'hearing', 'the', 'activity', 'guidebook', 'that', 'party',
                  'direct']
    reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
                  'that', 'the', 'military', 'will', 'forever', 'heed',
                  'Party', 'commands']
    reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
                  'guarantees', 'the', 'military', 'forces', 'always', 'being',
                  'under', 'the', 'command', 'of', 'the', 'Party']
    reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
                  'army', 'always', 'to', 'heed', 'the', 'directions', 'of',
                  'the', 'party']

    # Work out the BLEU score.
    bleuSc = bleu_score.sentence_bleu([refTextTokens], candidateTextTokens)
    print(bleuSc)
def computeSimple(sentence1, sentence2):
    features = [0] * 8  # was [0] * 7, but features[7] is assigned below
    tokenizer = RegexpTokenizer(r'\w+')
    words1 = tokenizer.tokenize(sentence1)
    words2 = tokenizer.tokenize(sentence2)
    n = len(words1)
    m = len(words2)

    # Word overlap features.
    count = 0  # number of shared words between the sentences
    for word1 in words1:
        for word2 in words2:
            if word1 == word2:
                count += 1
    features[0] = count / n  # "precision"
    features[1] = count / m  # "recall"

    # sentence_bleu expects tokenized references/hypotheses, not raw strings.
    features[2] = sentence_bleu([words1], words2)
    features[3] = sentence_bleu([words2], words1)

    # Obtain pairs of adjacent words. skipgrams() returns a generator, so
    # materialize both before the nested loop (otherwise the inner one is
    # exhausted after the first pass).
    skipgrams1 = list(skipgrams(words1, 2, 0))
    skipgrams2 = list(skipgrams(words2, 2, 0))
    count = 0
    for gram1 in skipgrams1:
        for gram2 in skipgrams2:
            if gram1 == gram2:
                count += 1
    features[4] = count / combinations(n, count)
    features[5] = count / combinations(m, count)

    # An earlier word-based length-ratio feature, kept for reference:
    # if n > m:
    #     features[6] = m / n
    # else:
    #     features[6] = n / m
    if len(sentence1) > len(sentence2):
        features[7] = len(sentence2) / len(sentence1)
    else:
        features[7] = len(sentence1) / len(sentence2)
    return features
def test_case_where_n_is_bigger_than_hypothesis_length(self):
    # Test BLEU to nth order of n-grams, where n > len(hypothesis).
    # TODO: Currently this test breaks the BLEU implementation (13.03.2016)
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary'.split()
    n = len(hypothesis) + 1
    weights = [1.0 / n] * n  # Uniform weights.
    assert sentence_bleu(references, hypothesis, weights) == 1.0
def evaluate_score(translation, score, smoothing_func):
    if score == 'BLEU':
        translation_split = translation.translation
        reference_split = translation.reference
        try:
            return bleu.sentence_bleu([reference_split], translation_split,
                                      smoothing_function=smoothing_func)
        except Exception:
            # Fall back to shorter n-gram weights when the sentences are too
            # short for the default 4-gram BLEU.
            word_count = min(len(reference_split), len(translation_split))
            weights = []
            weight = 0.25
            if word_count < 4:
                weight = 1 / float(word_count)
            for i in range(min(4, word_count)):
                weights.append(weight)
            return bleu.sentence_bleu([reference_split], translation_split,
                                      weights=weights,
                                      smoothing_function=smoothing_func)
    else:
        print("evaluate_score: unrecognized score '{0}'".format(score))
def test_partial_matches_hypothesis_longer_than_reference(self):
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary who loves Mike'.split()
    self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.4729, places=4)
    # Checks that the warning has been raised because len(reference) < 4.
    try:
        self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
    except AttributeError:
        pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
def test_case_where_n_is_bigger_than_hypothesis_length(self):
    # Test BLEU to nth order of n-grams, where n > len(hypothesis).
    references = ['John loves Mary ?'.split()]
    hypothesis = 'John loves Mary'.split()
    n = len(hypothesis) + 1
    weights = [1.0 / n] * n  # Uniform weights.
    self.assertAlmostEqual(
        sentence_bleu(references, hypothesis, weights), 0.7165, places=4)
    # Checks that the warning has been raised because len(hypothesis) < 4.
    try:
        self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
    except AttributeError:
        pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.

    # Test case where n > len(hypothesis) and also n > len(reference), and
    # it's the special case where reference == hypothesis.
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary'.split()
    assert sentence_bleu(references, hypothesis, weights) == 1.0
def test():
    """Test the translation model."""
    nltk.download('punkt')
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        src_lang_vocab_path = PATH_TO_DATA_FILES + FLAGS.src_lang + "_mapping%d.txt" % FLAGS.src_lang_vocab_size
        dst_lang_vocab_path = PATH_TO_DATA_FILES + FLAGS.dst_lang + "_mapping%d.txt" % FLAGS.dst_lang_vocab_size
        src_lang_vocab, _ = data_utils.initialize_vocabulary(src_lang_vocab_path)
        _, rev_dst_lang_vocab = data_utils.initialize_vocabulary(dst_lang_vocab_path)

        weights = [0.25, 0.25, 0.25, 0.25]
        first_lang_file = open(generate_src_lang_sentences_file_name(FLAGS.src_lang))
        second_lang_file = open(generate_src_lang_sentences_file_name(FLAGS.dst_lang))
        total_bleu_value = 0.0
        computing_bleu_iterations = 0
        for first_lang_raw in first_lang_file:
            second_lang_gold_raw = second_lang_file.readline()
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(first_lang_raw), src_lang_vocab)
            # Which bucket does it belong to? (range() was xrange() in the Python 2 original.)
            try:
                bucket_id = min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)])
            except ValueError:
                continue
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Build the sentence corresponding to outputs.
            model_tran_res = " ".join([tf.compat.as_str(rev_dst_lang_vocab[output]) for output in outputs])
            second_lang_gold_tokens = word_tokenize(second_lang_gold_raw)
            model_tran_res_tokens = word_tokenize(model_tran_res)
            try:
                # Note: the model output is passed as the reference and the gold
                # sentence as the hypothesis, following the original code.
                current_bleu_value = sentence_bleu([model_tran_res_tokens], second_lang_gold_tokens, weights)
                total_bleu_value += current_bleu_value
                computing_bleu_iterations += 1
            except ZeroDivisionError:
                pass
            # Guard against dividing by zero before the first successful iteration.
            if computing_bleu_iterations > 0 and computing_bleu_iterations % 10 == 0:
                print("BLEU value after %d iterations: %.2f"
                      % (computing_bleu_iterations, total_bleu_value / computing_bleu_iterations))
        final_bleu_value = total_bleu_value / computing_bleu_iterations
        print("Final BLEU value after %d iterations: %.2f"
              % (computing_bleu_iterations, final_bleu_value))
    return
def test_zero_matches(self):
    # Test case where there are 0 matches.
    references = ['The candidate has no alignment to any of the references'.split()]
    hypothesis = 'John loves Mary'.split()

    # Test BLEU for each order of n-grams up to n = len(hypothesis).
    for n in range(1, len(hypothesis) + 1):
        weights = [1.0 / n] * n  # Uniform weights.
        assert sentence_bleu(references, hypothesis, weights) == 0
def test_full_matches(self):
    # Test case where there are 100% matches.
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary'.split()

    # Test BLEU for each order of n-grams up to n = len(hypothesis).
    for n in range(1, len(hypothesis) + 1):
        weights = [1.0 / n] * n  # Uniform weights.
        assert sentence_bleu(references, hypothesis, weights) == 1.0
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    f = codecs.open('report-%s.csv' % args.model, 'w')
    # Note: the `encoding` keyword requires the `unicodecsv` package; the
    # standard library csv.writer does not accept it.
    csv_f = csv.writer(f, delimiter=',', encoding='utf-8')

    src_lines = codecs.open(args.src, 'r', 'utf-8').readlines()
    src_lines_nounk = codecs.open(args.src + '.nounk', 'r', 'utf-8').readlines()

    target_lines = codecs.open(args.target, 'r', 'utf-8').readlines()
    target_lines_nounk = codecs.open(args.target + '.nounk', 'r', 'utf-8').readlines()

    gold_lines = codecs.open(args.gold, 'r', 'utf-8').readlines()
    gold_lines_nounk = codecs.open(args.gold + '.nounk', 'r', 'utf-8').readlines()

    data = ['Src', 'Src_UNK', 'Target_UNK', 'Target', 'Gold_UNK', 'Gold', 'BLEU1']
    csv_f.writerow(data)

    num_lines = len(gold_lines)
    logging.info('Num Lines: %d' % num_lines)

    references = []
    hypotheses = []
    for index in range(num_lines):
        data = []
        data.append(src_lines_nounk[index].strip())
        data.append(src_lines[index].strip())
        data.append(target_lines[index].strip())
        data.append(target_lines_nounk[index].strip())
        data.append(gold_lines[index].strip())
        data.append(gold_lines_nounk[index].strip())

        gold = gold_lines[index].strip().split()
        output = target_lines[index].strip().split()
        default = 'UNK UNK UNK UNK'.split()

        if len(output) < 4:
            bleu_score = 0.0
            hypotheses.append(default)
        else:
            bleu_score = sentence_bleu([gold], output, weights=(1.0,))
            hypotheses.append(output)
        references.append([gold])

        logging.info('sentence:%d bleu:%f' % (index, bleu_score))
        data.append(str(bleu_score))
        csv_f.writerow(data)

    final_bleu = corpus_bleu(references, hypotheses)
    unigram_bleu = corpus_bleu(references, hypotheses, weights=(1.0,))
    logging.info('Final BLEU: %f Unigram_BLEU: %f' % (final_bleu, unigram_bleu))
def test_reference_or_hypothesis_shorter_than_fourgrams(self):
    # Test case where the length of reference or hypothesis
    # is shorter than 4.
    references = ['let it go'.split()]
    hypothesis = 'let go it'.split()
    # Checks that the hypothesis/reference pair returns a score of 1.0.
    assert sentence_bleu(references, hypothesis) == 1.0
    # Checks that the warning has been raised.
    try:
        self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
    except AttributeError:
        pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
def reward_function(self, reference, summary, measure='rouge_l/f_score'):
    """Calculate the reward between the reference and summary.

    Args:
        reference: A string representing the ground-truth data
        summary: A string representing the model-generated data

    Returns:
        A single value representing the evaluation value for reference and summary
    """
    if 'rouge' in measure:
        return rouge([summary], [reference])[measure]
    else:
        return sentence_bleu([reference.split()], summary.split(),
                             weights=(0.25, 0.25, 0.25, 0.25))
def get_validation_bleu_scores(sess, generator, iterations):
    """Calculate the mean BLEU score for a given number of minibatch iterations of a generator."""
    scores = []
    average_loss = 0
    for i in tqdm(range(iterations), desc="validation metrics"):
        feed = next(generator)
        validation_predict_, loss_ = sess.run([decoder_prediction, loss], feed)
        scores += [
            sentence_bleu([decode_sequence(pred, rev_heb_vocab, False)],
                          decode_sequence(exp, rev_heb_vocab, False),
                          smoothing_function=chencherry.method1)
            for pred, exp in zip(validation_predict_.T, feed[decoder_targets].T)
        ]
        average_loss += (loss_ / iterations)
    return np.mean(scores), average_loss
def bleu(reference, candidate):
    """
    Compute the BLEU score for a given candidate sentence, with respect to a
    given reference sentence.

    reference: the reference translation
    candidate: the candidate translation
    """
    chen_cherry = SmoothingFunction()
    try:
        return sentence_bleu([reference], candidate,
                             smoothing_function=chen_cherry.method7)
    except ZeroDivisionError:
        return 0
    except AttributeError:
        return 0
def bleu_ngram_score(arg):
    if arg == 1:
        sampled_poem = poem1_display.get('1.0', 'end')
    else:
        sampled_poem = poem2_display.get('1.0', 'end')
    sampled_poem = sampled_poem.rstrip()
    sampled_poem = sampled_poem.replace(',', '')
    sampled_poem = sampled_poem.split('\n')
    sampled_poem = [x.lower() for x in sampled_poem]
    sampled_token = [nltk.word_tokenize(x) for x in sampled_poem]

    # Smoothing function
    sf = SmoothingFunction().method4

    score = []
    for x in sampled_token:
        score.append(sentence_bleu(reference, x, weights=(0.25, 0.25, 0.25, 0.25),
                                   smoothing_function=sf))
    average_score = sum(score) / len(score)
    write_to_log("BLEU (cumulative 4-gram) score of Sonnet %d: %.2f"
                 % (arg, average_score), logger_index)
def calculate_bleu(reference, candidate):
    reference_list = tokenize_sentence(reference)
    candidate_list = tokenize_sentence(candidate)
    # Unigram-only BLEU (BLEU-1).
    return sentence_bleu([reference_list], candidate_list, weights=(1, 0, 0, 0))
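# tokenize_sentence is not shown in the snippet above; a minimal stand-in
# using NLTK's word_tokenize might look like this (the lower-casing is an
# assumption), followed by a usage example.
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

def tokenize_sentence(sentence):
    # Hypothetical helper: lower-case and tokenize with NLTK.
    return word_tokenize(sentence.lower())

print(calculate_bleu("the cat sat on the mat", "the cat is on the mat"))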
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__title__ = ''
__author__ = 'zhangjingjun'
__mtime__ = '2018/9/5'
"""
from nltk.translate.bleu_score import sentence_bleu

reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
score = sentence_bleu(reference, candidate)
print(score)
def bleuScore(s1, s2):
    # s1 must be a list of tokenized reference sentences and s2 a tokenized
    # candidate, e.g. bleuScore([ref.split()], cand.split()).
    return bleu_score.sentence_bleu(s1, s2)
from nltk.translate.bleu_score import sentence_bleu

# list() splits the Chinese strings into individual characters,
# so this computes character-level BLEU.
reference = list('她的故事在法国遥远的西部山上')
hypothesis = list('她的故事在法国的遥远山')
score = sentence_bleu([reference], hypothesis)
print(score)
if val is None:
    continue
elif val is True:
    constrained = constrained.replace(slot, slot[7:])
    attn += ' {}'.format(slot[7:])
    unconstrained += ' {}'.format(slot[7:])
else:
    constrained = constrained.replace(slot, str(val))
    attn += ' {}'.format(str(val))
    unconstrained += ' {}'.format(str(val))

# (A stray, unmatched triple-quote preceded this block in the original.)
if count % 1000 == 0:
    print(count)
count += 1

bleu_score[0] += sentence_bleu([target], attn)
bleu_score[1] += sentence_bleu([target], unconstrained)
bleu_score[2] += sentence_bleu([target], constrained)

r_score = RGE.get_scores(attn, target)
rouge_1[0] += r_score[0]['rouge-1']['f']
rouge_2[0] += r_score[0]['rouge-2']['f']
rouge_l[0] += r_score[0]['rouge-l']['f']

r_score = RGE.get_scores(unconstrained, target)
rouge_1[1] += r_score[0]['rouge-1']['f']
rouge_2[1] += r_score[0]['rouge-2']['f']
rouge_l[1] += r_score[0]['rouge-l']['f']

r_score = RGE.get_scores(constrained, target)
rouge_1[2] += r_score[0]['rouge-1']['f']
rouge_2[2] += r_score[0]['rouge-2']['f']
rouge_l[2] += r_score[0]['rouge-l']['f']
with open(GENERATED_FILE, 'r') as input_file:
    lines = input_file.readlines()
    for i in range(0, len(lines), 2):
        source = lines[i].strip()
        generated = lines[i + 1].strip()
        source_clean = re.sub(r'<UNK> ', "", source)
        key = ''
        for target_str in source_target_dict.keys():
            word_count = 0
            for word1 in source_clean.split():
                if word1 in target_str.split():
                    word_count += 1
            if word_count == len(source_clean.split()):
                key = target_str
                break
        target = source_target_dict[key]
        samples.append((source, target, generated))

total_bleu = 0
smoothing_function = SmoothingFunction()
for _, t, g in samples:
    target_words = t.strip().split()
    generated_words = g.strip().split()
    score = sentence_bleu([target_words], generated_words,
                          smoothing_function=smoothing_function.method4)
    total_bleu += score
print('BLEU: {:.4}'.format(total_bleu / len(samples)))
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting running in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        if FLAGS.mode == "train":
            os.makedirs(FLAGS.log_root)
        else:
            raise Exception("Logdir %s doesn't exist. Run in train mode to create it."
                            % (FLAGS.log_root))

    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
                   'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim',
                   'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps']
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_generator = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    hparam_list = ['lr', 'adagrad_init_acc', 'rand_unif_init_mag',
                   'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim',
                   'emb_dim', 'batch_size', 'max_dec_steps']
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_discriminator = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    tf.set_random_seed(111)  # a seed value for randomness

    if hps_generator.mode == 'train':
        print("Start pre-training......")
        model_class = Classification(hps_discriminator, vocab)
        cla_batcher = ClaBatcher(hps_discriminator, vocab)
        sess_cls, saver_cls, train_dir_cls = setup_training_classification(model_class)
        print("Start pre-training classification......")
        #run_pre_train_classification(model_class, cla_batcher, 10, sess_cls, saver_cls, train_dir_cls)
        #generated = Generate_training_sample(model_class, vocab, cla_batcher, sess_cls)
        #print("Generating training examples......")
        #generated.generate_training_example("train")
        #generated.generator_validation_example("valid")

        model_sentiment = Sentimentor(hps_generator, vocab)
        sentiment_batcher = SenBatcher(hps_generator, vocab)
        sess_sen, saver_sen, train_dir_sen = setup_training_sentimentor(model_sentiment)
        #run_pre_train_sentimentor(model_sentiment, sentiment_batcher, 1, sess_sen, saver_sen, train_dir_sen)
        sentiment_generated = Generate_non_sentiment_weight(
            model_sentiment, vocab, sentiment_batcher, sess_sen)
        #sentiment_generated.generate_training_example("train_sentiment")
        #sentiment_generated.generator_validation_example("valid_sentiment")

        model = Generator(hps_generator, vocab)
        # Create a batcher object that will create minibatches of data
        batcher = GenBatcher(vocab, hps_generator)

        sess_ge, saver_ge, train_dir_ge = setup_training_generator(model)
        util.load_ckpt(saver_sen, sess_sen, ckpt_dir="train-sentimentor")
        util.load_ckpt(saver_cls, sess_cls, ckpt_dir="train-classification")

        generated = Generated_sample(model, vocab, batcher, sess_ge)
        #print("Start pre-training generator......")
        run_pre_train_generator(model, batcher, 4, sess_ge, saver_ge, train_dir_ge,
                                generated, model_class, sess_cls,
                                cla_batcher)  # this is an infinite loop until interrupted
        #generated.generator_validation_negetive_example(
        #    "temp_negetive", batcher, model_class, sess_cls, cla_batcher)
        #generated.generator_validation_positive_example(
        #    "temp_positive", batcher, model_class, sess_cls, cla_batcher)

        loss_window = 0
        t0 = time.time()
        print("begin dual learning:")
        for epoch in range(30):
            batches = batcher.get_batches(mode='train')
            for i in range(len(batches)):
                current_batch = copy.deepcopy(batches[i])
                sentiment_batch = batch_sentiment_batch(current_batch, sentiment_batcher)
                result = model_sentiment.max_generator(sess_sen, sentiment_batch)
                weight = result['generated']
                current_batch.weight = weight
                sentiment_batch.weight = weight

                cla_batch = batch_classification_batch(current_batch, batcher, cla_batcher)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)

                cc = SmoothingFunction()
                reward_sentiment = 1 - np.abs(0.5 - result['y_pred_auc'])
                reward_BLEU = []
                for k in range(FLAGS.batch_size):
                    reward_BLEU.append(
                        sentence_bleu([current_batch.original_reviews[k].split()],
                                      cla_batch.original_reviews[k].split(),
                                      smoothing_function=cc.method1))
                reward_BLEU = np.array(reward_BLEU)
                # Harmonic mean of the sentiment and BLEU rewards.
                reward_de = (2 / (1.0 / (1e-6 + reward_sentiment)
                                  + 1.0 / (1e-6 + reward_BLEU)))

                result = model.run_train_step(sess_ge, current_batch)
                train_step = result['global_step']  # we need this to update our running average loss
                loss = result['loss']
                loss_window += loss
                if train_step % 100 == 0:
                    t1 = time.time()
                    tf.logging.info('seconds for %d training generator step: %.3f',
                                    train_step, (t1 - t0) / 100)
                    t0 = time.time()
                    tf.logging.info('loss: %f', loss_window / 100)  # print the loss to screen
                    loss_window = 0.0
                if train_step % 10000 == 0:
                    #bleu_score = generatored.compute_BLEU(str(train_step))
                    #tf.logging.info('bleu: %f', bleu_score)  # print the bleu to screen
                    generated.generator_validation_negetive_example(
                        "valid-generated-transfer/" + str(epoch) + "epoch_step"
                        + str(train_step) + "_temp_positive",
                        batcher, model_class, sess_cls, cla_batcher)
                    generated.generator_validation_positive_example(
                        "valid-generated/" + str(epoch) + "epoch_step"
                        + str(train_step) + "_temp_positive",
                        batcher, model_class, sess_cls, cla_batcher)
                    #saver_ge.save(sess, train_dir + "/model", global_step=train_step)

                cla_batch, bleu = output_to_classification_batch(
                    result['generated'], current_batch, batcher, cla_batcher, cc)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)
                reward_result_sentiment = result['y_pred_auc']
                reward_result_bleu = np.array(bleu)
                reward_result = (2 / (1.0 / (1e-6 + reward_result_sentiment)
                                      + 1.0 / (1e-6 + reward_result_bleu)))

                current_batch.score = 1 - current_batch.score
                result = model.max_generator(sess_ge, current_batch)

                cla_batch, bleu = output_to_classification_batch(
                    result['generated'], current_batch, batcher, cla_batcher, cc)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)
                reward_result_transfer_sentiment = result['y_pred_auc']
                reward_result_transfer_bleu = np.array(bleu)
                reward_result_transfer = (
                    2 / (1.0 / (1e-6 + reward_result_transfer_sentiment)
                         + 1.0 / (1e-6 + reward_result_transfer_bleu)))

                #tf.logging.info("reward_nonsentiment: " + str(reward_sentiment)
                #                + " output_original_sentiment: " + str(reward_result_sentiment)
                #                + " output_original_bleu: " + str(reward_result_bleu))

                reward = reward_result_transfer  # reward_de + reward_result_sentiment +
                #tf.logging.info("reward_de: " + str(reward_de))

                model_sentiment.run_train_step(sess_sen, sentiment_batch, reward)

    elif hps_generator.mode == 'decode':
        decode_model_hps = hps_generator  # This will be the hyperparameters for the decoder model
        #model = Generator(decode_model_hps, vocab)
        #generated = Generated_sample(model, vocab, batcher)
        #bleu_score = generated.compute_BLEU()
        #tf.logging.info('bleu: %f', bleu_score)  # print the bleu to screen
        pass  # the decoding path is currently disabled (a body is required for a valid elif)
    else:
        raise ValueError("The 'mode' flag must be one of train/eval/decode")
hypothesis_list.extend(fake_data_text)
reference.extend(real_data_text)

nll_gen_error = np.array(nll_gen_error)
nll_gen_error_mean = nll_gen_error.mean()
print(nll_gen_error_mean)

random.shuffle(hypothesis_list)
random.shuffle(reference)
reference = reference[:5000]

n_gram_bleu_scores = {"{}-gram".format(gram): 0 for gram in range(2, 6)}
for ngram in range(2, 6):
    weight = tuple((1. / ngram for _ in range(ngram)))
    bleu_score = []
    for h in hypothesis_list[:2000]:
        BLEUscore = sentence_bleu(reference, h, weight)
        bleu_score.append(BLEUscore)
    current_bleu = 1.0 * sum(bleu_score) / len(bleu_score)
    n_gram_bleu_scores["{}-gram".format(len(weight))] = current_bleu
    if current_bleu < 1e-2:
        break

text_log.write("\n\nGot nll_gen mean: {}".format(nll_gen_error_mean))
for gram, score in n_gram_bleu_scores.items():
    text_log.write("\nGot {} score: {}".format(gram, score))
text_log.close()

save_model(generator, summary_path)
save_model(discriminator, summary_path)
def train(model, train_loader, criterion, optimizer, lr_scheduler, dataloader):
    model.train()
    device = next(model.parameters()).device.index
    losses = []
    total_iter = len(train_loader)
    for i, batch in enumerate(train_loader):
        srcs, trgs = batch.src.cuda(device), batch.trg.cuda(device)
        # Empty gradients
        optimizer.zero_grad()
        # Predict targets (Forward propagation)
        preds, _, _, _ = model(srcs, trgs)
        # Unroll the preds and trgs
        preds_unroll = preds[1:].view(-1, preds.shape[-1])
        trgs_unroll = trgs[1:].view(-1)
        # Calculate loss
        loss = criterion(preds_unroll, trgs_unroll)
        losses.append(loss.item())
        # Calculate gradients (Backpropagation)
        loss.backward()
        # Clipping the parameters
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        # Update learning rate
        lr_scheduler.step()
        # Update parameters
        optimizer.step()
        sys.stdout.write(
            "[{:5d}/{:5d}] lrate: {:f} total steps: {:d}\r".format(
                i + 1, total_iter, optimizer.param_groups[0]['lr'],
                lr_scheduler.current_step))
        if lr_scheduler.current_step >= TRAINING_STEPS:
            break

    # Calculate average loss
    avg_loss = sum(losses) / len(losses)

    #===========================================================================
    # Check train metric
    model.eval()
    sum_bleu = 0.0
    num_sentence = 0.0
    sos_idx = dataloader.sos_idx
    with torch.no_grad():
        for i, batch in enumerate(train_loader):
            srcs, trgs = batch.src.cuda(device), batch.trg.cuda(device)
            # Predict targets (Forward propagation)
            preds, _, _, _ = model(srcs, trgs)
            # Unroll the preds and trgs
            preds_unroll = preds[1:].view(-1, preds.shape[-1])
            trgs_unroll = trgs[1:].view(-1)
            #===================================================================
            # For BLEU score
            # Target Decoding
            trans_preds = preds
            trans_preds = trans_preds.argmax(dim=2)
            # Greedy Decoding
            # trans_preds = model.translate_forward(srcs, sos_idx, trgs.size(1))
            trans_preds = trans_preds.cpu().detach().numpy()
            trgs = trgs.cpu().detach().numpy()
            for trans_pred, trg in zip(trans_preds, trgs):
                # Translate each sentence
                pred_sentence = dataloader.translate_sentence(trans_pred)
                trg_sentence = dataloader.translate_sentence(trg)
                # Calculate each sentence's BLEU score
                if len(pred_sentence) > 1:
                    sum_bleu += sentence_bleu(
                        [trg_sentence], pred_sentence,
                        smoothing_function=smoothing_func) * 100
                    num_sentence += 1
            #===================================================================
            sys.stdout.write("[{:5d}/{:5d}]\r".format(i + 1, total_iter))

    # Calculate the metrics
    # Perplexity
    ppl = np.exp(avg_loss)
    # Bilingual Evaluation Understudy Score
    bleu = sum_bleu / num_sentence
    return avg_loss, (ppl, bleu)
from nltk.translate.bleu_score import sentence_bleu

ref_google = "the sleeve of the shirt tore"
ref_bing = "the shirt sleeve tore"
# translated via the Systran Translate site
ref_yandex = "shirt sleeve ripped"

candidate = ""
candidates = [
    "the tore sleeve",
    "the collar has tear",
    "the short sleeve rent",
    "this shirt is torn",
    "shirt sleeve the rent",
    "the shirt sleeve tore"
]

reference = [ref_google.split(' '), ref_bing.split(' '), ref_yandex.split(' ')]

ref_text = [
    "the shirt",
    "the shirt is tore",
    "the shirt is black",
    "a shirt can tear",
    "shirt sleeve"
]

# Note: this loop rebuilds `reference` from ref_text, overwriting the
# machine-translation references defined above.
i = 0
reference = []
while i < len(ref_text):
    reference.append(ref_text[i].split(' '))
    i += 1

candidate = candidate.split(' ')
print("References: {}".format(reference))
print("Original: " + ' '.join(candidate))

i = 0
while i < len(candidates):
    candidate = candidates[i].split(' ')
    score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    print("Candidate {} ({}): {}".format(i, score, candidate))
    i += 1
def test(model, test_loader, criterion, dataloader):
    model.eval()
    device = next(model.parameters()).device.index
    losses = []
    total_iter = len(test_loader)
    sum_bleu = 0.0
    num_sentence = 0.0
    sos_idx = dataloader.sos_idx
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            srcs, trgs = batch.src.cuda(device), batch.trg.cuda(device)
            # Predict targets (Forward propagation)
            preds, enc_self, dec_self, dec_enc = model(srcs, trgs)
            # Unroll the preds and trgs
            preds_unroll = preds[1:].view(-1, preds.shape[-1])
            trgs_unroll = trgs[1:].view(-1)
            # Calculate loss
            loss = criterion(preds_unroll, trgs_unroll)
            losses.append(loss.item())
            #===================================================================
            # For BLEU score
            # Target Decoding
            trans_preds = preds
            trans_preds = trans_preds.argmax(dim=2)
            # Greedy Decoding
            # trans_preds = model.translate_forward(srcs, sos_idx, trgs.size(1))
            trans_preds = trans_preds.cpu().detach().numpy()
            trgs = trgs.cpu().detach().numpy()
            for idx, (trans_pred, trg, src) in enumerate(zip(trans_preds, trgs, srcs)):
                # Translate each sentence
                pred_sentence = dataloader.translate_sentence(trans_pred)
                trg_sentence = dataloader.translate_sentence(trg)
                src_sentence = dataloader.translate_sentence(src, type='src')
                # Calculate each sentence's BLEU score
                if len(pred_sentence) > 1:
                    each_bleu = sentence_bleu(
                        [trg_sentence], pred_sentence,
                        smoothing_function=smoothing_func) * 100
                    sum_bleu += each_bleu
                    num_sentence += 1
                #===========================================================
                # Monitoring the results
                # print('SRC :', src_sentence)
                # print('TRG :', trg_sentence)
                # print('PRED:', pred_sentence)
                # print(each_bleu)
                # input()
                #===========================================================
                # Visualize the attentions
                # visualize_attention(enc_self, idx, src_sentence, src_sentence, 'enc', i)
                # visualize_attention(dec_self, idx, trg_sentence, trg_sentence, 'dec', i)
                # visualize_attention(dec_enc, idx, src_sentence, trg_sentence, 'edc', i)
            #===================================================================
            sys.stdout.write("[{:5d}/{:5d}]\r".format(i + 1, total_iter))

    # Calculate average loss
    avg_loss = sum(losses) / len(losses)

    # Calculate the metrics
    # Perplexity
    ppl = np.exp(avg_loss)
    # Bilingual Evaluation Understudy Score
    bleu = sum_bleu / num_sentence
    return avg_loss, (ppl, bleu)
def bleu_score_char(dialogs, sample_amount):
    from nltk.translate.bleu_score import sentence_bleu
    from nltk.translate.bleu_score import SmoothingFunction
    smoothie = SmoothingFunction().method4  # defined but not used in this variant

    if dialogs == 'cornell':
        dataset_folder_name = 'cornell-dialogs'
    else:
        dataset_folder_name = dialogs

    data = get_outputs_and_references(dataset_folder_name, sample_amount)
    # print('DATAAA')
    # print(data)

    sum_bleu_score_1 = 0
    sum_bleu_score_2 = 0
    sum_bleu_score_3 = 0
    sum_bleu_score_4 = 0
    i = 0
    for datum in data:
        reference = []
        reference_str = datum['reference'].lower()
        output_str = datum['output']
        reference_str = reference_str.replace(".", "")
        reference_str = reference_str.replace(",", "")
        output_str = output_str.replace(".", "")
        output_str = output_str.replace(",", "")
        reference.append(nltk.tokenize.word_tokenize(reference_str))
        candidate = nltk.tokenize.word_tokenize(output_str)
        print(reference)
        print(candidate)
        try:
            bleu_1 = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
        except Exception:
            bleu_1 = 0.5
        try:
            bleu_2 = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0))
        except Exception:
            bleu_2 = 0.5
        try:
            bleu_3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0))
        except Exception:
            bleu_3 = 0.5
        try:
            bleu_4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
        except Exception:
            bleu_4 = 0.5
        print(bleu_1)
        print(bleu_2)
        print(bleu_3)
        print(bleu_4)
        sum_bleu_score_1 = sum_bleu_score_1 + bleu_1
        sum_bleu_score_2 = sum_bleu_score_2 + bleu_2
        sum_bleu_score_3 = sum_bleu_score_3 + bleu_3
        sum_bleu_score_4 = sum_bleu_score_4 + bleu_4
        i = i + 1
        print(i)

    print('BLEU-1 : ' + str(round(sum_bleu_score_1 / len(data), 4)))
    print('BLEU-2 : ' + str(round(sum_bleu_score_2 / len(data), 4)))
    print('BLEU-3 : ' + str(round(sum_bleu_score_3 / len(data), 4)))
    print('BLEU-4 : ' + str(round(sum_bleu_score_4 / len(data), 4)))
def bleu_score(dialogs, sample_amount):
    from nltk.translate.bleu_score import sentence_bleu
    from nltk.translate.bleu_score import SmoothingFunction
    smoothie = SmoothingFunction().method4

    if dialogs == 'cornell':
        dataset_folder_name = 'cornell-dialogs'
    else:
        dataset_folder_name = dialogs

    data = get_outputs_and_references(dataset_folder_name, sample_amount)
    # print('DATAAA')
    # print(data)

    sum_bleu_score_1 = 0
    sum_bleu_score_2 = 0
    sum_bleu_score_3 = 0
    sum_bleu_score_4 = 0
    i = 0
    for datum in data:
        reference = []
        reference_str = datum['reference'].lower()
        output_str = datum['output']
        reference_str = reference_str.replace(".", "")
        reference_str = reference_str.replace(",", "")
        output_str = output_str.replace(".", "")
        output_str = output_str.replace(",", "")
        reference.append(nltk.tokenize.word_tokenize(reference_str))
        candidate = nltk.tokenize.word_tokenize(output_str)
        print(reference)
        print(candidate)
        try:
            bleu_1 = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0),
                                   smoothing_function=smoothie)
        except Exception:
            bleu_1 = 0.5
        try:
            bleu_2 = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0),
                                   smoothing_function=smoothie)
        except Exception:
            bleu_2 = 0.5
        try:
            bleu_3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0),
                                   smoothing_function=smoothie)
        except Exception:
            bleu_3 = 0.5
        try:
            bleu_4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25),
                                   smoothing_function=smoothie)
        except Exception:
            bleu_4 = 0.5
        print(bleu_1)
        print(bleu_2)
        print(bleu_3)
        print(bleu_4)
        sum_bleu_score_1 = sum_bleu_score_1 + bleu_1
        sum_bleu_score_2 = sum_bleu_score_2 + bleu_2
        sum_bleu_score_3 = sum_bleu_score_3 + bleu_3
        sum_bleu_score_4 = sum_bleu_score_4 + bleu_4
        i = i + 1
        print(i)

    print('BLEU-1 : ' + str(round(sum_bleu_score_1 / len(data), 4)))
    print('BLEU-2 : ' + str(round(sum_bleu_score_2 / len(data), 4)))
    print('BLEU-3 : ' + str(round(sum_bleu_score_3 / len(data), 4)))
    print('BLEU-4 : ' + str(round(sum_bleu_score_4 / len(data), 4)))

    with open(os.environ['CURRENT_CORNELL_MODEL'] + '-' + str(sample_amount) + '.txt',
              'w', encoding="latin1") as out_file:
        out_file.write('BLEU-1 : ' + str(round(sum_bleu_score_1 / len(data), 4)))
        out_file.write('\n')
        out_file.write('BLEU-2 : ' + str(round(sum_bleu_score_2 / len(data), 4)))
        out_file.write('\n')
        out_file.write('BLEU-3 : ' + str(round(sum_bleu_score_3 / len(data), 4)))
        out_file.write('\n')
        out_file.write('BLEU-4 : ' + str(round(sum_bleu_score_4 / len(data), 4)))
        out_file.write('\n')
        out_file.write('Sample amount : ' + str(sample_amount))
def test_empty_references_and_hypothesis(self):
    # Test case where both the references and the hypothesis are empty.
    references = [[]]
    hypothesis = []
    assert sentence_bleu(references, hypothesis) == 0
def test_empty_references(self):
    # Test case where the reference is empty.
    references = [[]]
    hypothesis = 'John loves Mary'.split()
    assert sentence_bleu(references, hypothesis) == 0
def bleu_single(hypothesis: str, reference: str) -> float:
    return sentence_bleu([tokenizer.tokenize(reference)],
                         tokenizer.tokenize(hypothesis))
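# The module-level `tokenizer` above is not shown in the snippet; one plausible
# stand-in is NLTK's TreebankWordTokenizer (an assumption -- any object with a
# .tokenize(str) -> list[str] method would work).
from nltk.tokenize import TreebankWordTokenizer
from nltk.translate.bleu_score import sentence_bleu

tokenizer = TreebankWordTokenizer()  # assumed; the original binding is not shown

print(bleu_single("the cat sat on the mat", "the cat is on the mat"))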
def baseline_bleu(df):
    smoothie = SmoothingFunction().method1
    df['bleu'] = df.apply(
        lambda x: sentence_bleu(x['reference_token'], x['translation_token'],
                                weights=(1, 0, 0, 0), smoothing_function=smoothie),
        axis=1)
    return df
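# A minimal usage sketch for baseline_bleu. The column layout is an assumption
# inferred from the call above: 'reference_token' holds a list of tokenized
# references per row, and 'translation_token' holds one tokenized hypothesis.
import pandas as pd
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

df = pd.DataFrame({
    'reference_token': [[['the', 'cat', 'sat']], [['hello', 'world']]],
    'translation_token': [['the', 'cat', 'sat'], ['hello', 'there']],
})
print(baseline_bleu(df)['bleu'])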
def evaluate_summ_qa(model, dataset, mode, batch_size=64):
    assert mode in ('summ', 'qa'), 'Invalid mode!'
    model.eval()
    data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size,
                                              shuffle=False, collate_fn=lambda x: x)
    rouge1_f_sum = rouge2_f_sum = rougeL_f_sum = bleu_sum = 0
    examples_rouge = examples_bleu = 0
    rouge = Rouge()
    count = 0
    if mode == 'summ':
        for mini_batch in tqdm(data_loader):
            count += 1
            refs = [' '.join(data['question']) for data in mini_batch]
            x = [data['description'] for data in mini_batch]
            hyps_raw = beam_search('summ', model, x)
            hyps = [' '.join(list(sent)) for sent in hyps_raw]
            try:
                rouge_score = rouge.get_scores(hyps, refs, avg=True, ignore_empty=True)
                rouge1_f_sum += rouge_score['rouge-1']['f'] * len(mini_batch)
                rouge2_f_sum += rouge_score['rouge-2']['f'] * len(mini_batch)
                rougeL_f_sum += rouge_score['rouge-l']['f'] * len(mini_batch)
                examples_rouge += len(mini_batch)
            except ValueError as e:
                print(str(e) + ' | continuing...')
                continue
    elif mode == 'qa':
        for mini_batch in tqdm(data_loader):
            count += 1
            refs = [' '.join(data['answer']) for data in mini_batch]
            x = [data['question'] for data in mini_batch]
            hyps_raw = beam_search('qa', model, x)
            hyps = [' '.join(list(sent)) for sent in hyps_raw]
            try:
                rouge_score = rouge.get_scores(hyps, refs, avg=True, ignore_empty=True)
                rouge1_f_sum += rouge_score['rouge-1']['f'] * len(mini_batch)
                rouge2_f_sum += rouge_score['rouge-2']['f'] * len(mini_batch)
                rougeL_f_sum += rouge_score['rouge-l']['f'] * len(mini_batch)
                examples_rouge += len(mini_batch)
            except ValueError as e:
                print(str(e) + ' | continuing...')
                continue
            # Calculate BLEU score.
            refs = [data['answer'] for data in mini_batch]
            hyps = [list(sent) for sent in hyps_raw]
            smoothie = SmoothingFunction().method4
            for i in range(len(hyps)):
                try:
                    bleu = sentence_bleu([refs[i]], hyps[i], smoothing_function=smoothie)
                    bleu_sum += bleu
                    examples_bleu += 1
                except ZeroDivisionError as e:
                    print(str(e) + ' | continuing...')
                    continue

    rouge_1_f = rouge1_f_sum / examples_rouge
    rouge_2_f = rouge2_f_sum / examples_rouge
    rouge_L_f = rougeL_f_sum / examples_rouge
    if mode == 'qa':
        bleu_score = bleu_sum / examples_bleu

    # with open('output/test_{}.txt'.format(mode), 'w', encoding='utf-8') as f:
    #     f.write('rouge-1 f: ' + str(rouge_1_f) + '\n')
    #     f.write('rouge-2 f: ' + str(rouge_2_f) + '\n')
    #     f.write('rouge-L f: ' + str(rouge_L_f) + '\n')
    #     f.write('\n')
    #
    #     for i in range(len(candidates)):
    #         f.write('input: ' + inputs[i] + '\n')
    #         f.write('hyp: ' + ''.join(candidates[i]) + '\n')
    #         f.write('ref: ' + targets[i] + '\n\n')

    # Note: `is_training` is not defined in this snippet; presumably a
    # module-level flag in the original code.
    if is_training:
        model.train()
    print('rouge-1 f: ' + str(rouge_1_f))
    print('rouge-2 f: ' + str(rouge_2_f))
    print('rouge-L f: ' + str(rouge_L_f))
    if mode == 'qa':
        print('bleu: ', bleu_score)
def sample_results(preds, ind2word, word2ind, converted_summaries,
                   converted_texts, use_bleu=False):
    """Plots the actual text and summary and the corresponding created summary.
    Takes care of whether beam search or a greedy decoder was used.
    """
    beam = False
    if len(np.array(preds).shape) == 4:
        beam = True

    # Note: the BLEU score is not used correctly here, but serves as a reference.
    if use_bleu:
        bleu_scores = []

    for pred, summary, text, seq_length in zip(
            preds[0], converted_summaries, converted_texts,
            [len(inds) for inds in converted_summaries]):
        print('\n\n\n', 100 * '-')
        if beam:
            actual_text = [
                ind2word[word] for word in text
                if word != word2ind["<SOS>"] and word != word2ind["<EOS>"]
            ]
            actual_summary = [
                ind2word[word] for word in summary
                if word != word2ind['<EOS>'] and word != word2ind['<SOS>']
            ]
            created_summary = []
            for word in pred:
                if word[0] != word2ind['<SOS>'] and word[0] != word2ind['<EOS>']:
                    created_summary.append(ind2word[word[0]])
                    continue
                else:
                    continue
            print('Actual Text:\n{}\n'.format(' '.join(actual_text)))
            print('Actual Summary:\n{}\n'.format(' '.join(actual_summary)))
            print('Created Summary:\n{}\n'.format(' '.join(created_summary)))
            if use_bleu:
                bleu_score = sentence_bleu([actual_summary], created_summary)
                bleu_scores.append(bleu_score)
                print('Bleu-score:', bleu_score)
                print()
        else:
            actual_text = [
                ind2word[word] for word in text
                if word != word2ind["<SOS>"] and word != word2ind["<EOS>"]
            ]
            actual_summary = [
                ind2word[word] for word in summary
                if word != word2ind['<EOS>'] and word != word2ind['<SOS>']
            ]
            created_summary = [
                ind2word[word] for word in pred
                if word != word2ind['<EOS>'] and word != word2ind['<SOS>']
            ]
            print('Actual Text:\n{}\n'.format(' '.join(actual_text)))
            print('Actual Summary:\n{}\n'.format(' '.join(actual_summary)))
            print('Created Summary:\n{}\n'.format(' '.join(created_summary)))
            if use_bleu:
                bleu_score = sentence_bleu([actual_summary], created_summary)
                bleu_scores.append(bleu_score)
                print('Bleu-score:', bleu_score)

    if use_bleu:
        bleu_score = np.mean(bleu_scores)
        print('\n\n\nTotal Bleu Score:', bleu_score)
#!/usr/bin/python3
# coding: utf-8
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

##################################################################
## 1: sentence_bleu: Calculate BLEU score (Bilingual Evaluation Understudy)
# sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25),
#               smoothing_function=None, auto_reweigh=False, emulate_multibleu=False)
# The references must be provided as a list of sentences, where each sentence
# is a list of tokens; the candidate is provided as a single list of tokens.
reference = [['this', 'is', 'a', 'test'], ['this', 'is', 'test']]  # (the original was missing a comma here)
candidate = ['this', 'is', 'a', 'test']
score = sentence_bleu(reference, candidate)
print(score)  # prints a perfect score, because the candidate exactly matches one of the references

reference = [['the', 'cat', 'is', 'sitting', 'on', 'the', 'mat']]
test = ['on', 'the', 'mat', 'is', 'a', 'cat']
# The hypothesis contains 0 counts of 4-gram overlaps.
print(sentence_bleu(reference, test))  # 5.5546715329196825e-78
test = ['the', 'cat', 'is', 'sitting', 'on', 'mat']
print(sentence_bleu(reference, test))  # 0.6731821382417487

##################################################################
## 2: corpus_bleu: Calculate the BLEU score over multiple sentences (e.g. a paragraph or document)
# The references must be specified as a list of documents, where each document
# is a list of reference sentences, and each reference sentence is itself a
# list of tokens; i.e. the reference list is a list of lists of lists of tokens.
# The candidates must be specified as a list where each document is a list of
# tokens; i.e. the candidate list is a list of lists of tokens.
references = [[['this', 'is', 'a', 'test'], ['this', 'is', 'test']]]  # two references for one document
candidates = [['this', 'is', 'a', 'test']]
score = corpus_bleu(references, candidates)
print(score)  # 1.0; as before, this prints a perfect score
##################################################################
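# The near-zero 4-gram score above is the usual motivation for smoothing. A
# short sketch using NLTK's SmoothingFunction: method1 replaces zero n-gram
# counts with a small epsilon instead of letting the product collapse to ~0.
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

reference = [['the', 'cat', 'is', 'sitting', 'on', 'the', 'mat']]
test = ['on', 'the', 'mat', 'is', 'a', 'cat']

chencherry = SmoothingFunction()
print(sentence_bleu(reference, test))                                        # ~0 without smoothing
print(sentence_bleu(reference, test, smoothing_function=chencherry.method1)) # small but non-degenerate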
def validate(val_loader, encoder, decoder, criterion, tok_en, tok_zh):
    '''
    Performs one epoch's validation.
    '''
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    references_en = list()  # references (true captions) for calculating corpus BLEU-4 score
    hypotheses_en = list()  # hypotheses (predictions)
    references_zh = list()  # references (true captions) for calculating corpus BLEU-4 score
    hypotheses_zh = list()  # hypotheses (predictions)

    avg_loss = 0
    with torch.no_grad():
        # Batches
        for cnt, (encap, zhcap, video, caplen_en, caplen_zh, enrefs, zhrefs) in enumerate(val_loader, 1):
            encap, zhcap, video, caplen_en, caplen_zh = (
                encap.cuda(), zhcap.cuda(), video.cuda(), caplen_en.cuda(), caplen_zh.cuda())

            # Forward prop.
            init_hidden, vid_out = encoder(video)  # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim)
            scores_en, pred_lengths_en, scores_zh, pred_lengths_zh = decoder.inference(
                encap, zhcap, init_hidden, vid_out, args.MAX_INPUT_LENGTH)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets_en = encap[:, 1:]
            scores_copy_en = scores_en.clone()
            targets_zh = zhcap[:, 1:]
            scores_copy_zh = scores_zh.clone()

            # Calculate loss
            loss_en = criterion(
                scores_en[:, 1:].contiguous().view(-1, decoder.vocab_size_en),
                targets_en.contiguous().view(-1))
            loss_zh = criterion(
                scores_zh[:, 1:].contiguous().view(-1, decoder.vocab_size_zh),
                targets_zh.contiguous().view(-1))

            # Hypotheses
            _, preds_en = torch.max(scores_copy_en, dim=2)
            preds_en = preds_en.tolist()
            temp_preds_en = list()
            for j, p in enumerate(preds_en):
                temp_preds_en.append(preds_en[j][1:pred_lengths_en[j]])  # remove pads and idx-0
            preds_en = temp_preds_en
            hypotheses_en.extend(preds_en)  # preds = [1,2,3]

            enrefs = [list(map(int, i.split())) for i in enrefs]  # tgtrefs = [[1,2,3], [2,4,3], [1,4,5]]
            for r in enrefs:
                references_en.append([r])
            assert len(references_en) == len(hypotheses_en)

            _, preds_zh = torch.max(scores_copy_zh, dim=2)
            preds_zh = preds_zh.tolist()
            temp_preds_zh = list()
            for j, p in enumerate(preds_zh):
                temp_preds_zh.append(preds_zh[j][1:pred_lengths_zh[j]])  # remove pads and idx-0
            preds_zh = temp_preds_zh
            hypotheses_zh.extend(preds_zh)  # preds = [1,2,3]

            zhrefs = [list(map(int, i.split())) for i in zhrefs]  # tgtrefs = [[1,2,3], [2,4,3], [1,4,5]]
            for r in zhrefs:
                references_zh.append([r])
            assert len(references_zh) == len(hypotheses_zh)

            avg_loss += loss_en.item() + loss_zh.item()

        # Calculate metrics
        avg_loss = avg_loss / cnt
        scorers = {
            "Bleu": Bleu(4),
            "Meteor": Meteor(),
            "Rouge": Rouge(),
            "Cider": Cider(),
            "Spice": Spice()
        }
        gts_en = {}
        res_en = {}
        for i in range(len(references_en)):
            gts_en[i] = [tok_en.decode_sentence(references_en[i][0])]
            res_en[i] = [tok_en.decode_sentence(hypotheses_en[i])]
        scores = {}
        for name, scorer in scorers.items():
            score, all_scores = scorer.compute_score(gts_en, res_en)
            if isinstance(score, list):
                for i, sc in enumerate(score, 1):
                    scores[name + str(i)] = sc
            else:
                scores[name] = score
        print("Score of EN:")
        print(scores)

        """
        gts_zh = {}
        res_zh = {}
        for i in range(len(references_zh)):
            gts_zh[i] = [tok_zh.decode_sentence(references_zh[i][0])]
            res_zh[i] = [tok_zh.decode_sentence(hypotheses_zh[i])]
        scores = {}
        for name, scorer in scorers.items():
            score, all_scores = scorer.compute_score(gts_zh, res_zh)
            if isinstance(score, list):
                for i, sc in enumerate(score, 1):
                    scores[name + str(i)] = sc
            else:
                scores[name] = score
        print("Score of ZH:")
        print(scores)
        """

        corpbleu_en = corpus_bleu(references_en, hypotheses_en)
        sentbleu_en = 0
        # Note: `cc` is assumed to be a SmoothingFunction() defined at module
        # level; its binding is not shown in this snippet.
        for i, (r, h) in enumerate(zip(references_en, hypotheses_en), 1):
            sentbleu_en += sentence_bleu(r, h, smoothing_function=cc.method7)
        sentbleu_en /= i

    return avg_loss, sentbleu_en, corpbleu_en
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    bleu_score = 0  # was misspelled `blue_score` in the original

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        reference_words = []
        candidate_words = []
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                reference_words.append('</s>')
                break
            else:
                reference_words.append(indo_vocab.id2token[topi.item()])
            for index in target_tensor[di].data.topk(1):
                candidate_words.append(indo_vocab.id2token[index.item()])
            loss += criterion(decoder_output, target_tensor[di])
            #bleu_score += sentence_bleu([reference_words], candidate_words, weights=(1, 0, 0, 0))
            print("reference words", reference_words)
            print("candidate words", candidate_words)
            # sentence_bleu expects a list of tokenized references, so the
            # reference token list is wrapped in an outer list.
            print("bleu score:", sentence_bleu([reference_words], candidate_words,
                                               weights=(1, 0, 0, 0)))
            decoder_input = target_tensor[di]  # Teacher forcing
    else:
        # Without teacher forcing: use its own predictions as the next input
        reference_words = []
        candidate_words = []
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if topi.item() == EOS_token:
                reference_words.append('</s>')
                break
            else:
                reference_words.append(indo_vocab.id2token[topi.item()])
            for index in target_tensor[di].data.topk(1):
                candidate_words.append(indo_vocab.id2token[index.item()])
            print("reference words", reference_words)
            print("candidate words", candidate_words)
            print("bleu score:", sentence_bleu([reference_words], candidate_words,
                                               weights=(1, 0, 0, 0)))
            loss += criterion(decoder_output, target_tensor[di])
            #bleu_score += sentence_bleu([reference_words], candidate_words, weights=(1, 0, 0, 0))
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return (loss.item() / target_length), bleu_score
def test_empty_references(self):
    # Test case where the reference is empty.
    references = [[]]
    hypothesis = 'John loves Mary'.split()
    assert sentence_bleu(references, hypothesis) == 0
def crossUnigramsRatio(s1, s2):
    nPairs = min(len(s1), len(s2))
    l2 = [w2.pos_ for w2 in s2]
    cnt = 0.
    for w in s1:
        if w.pos_ in l2:
            cnt += 1.
            idx = l2.index(w.pos_)
            l2.pop(idx)
    cuRatio = cnt / nPairs
    return cuRatio

noLabel = 0.  # counts examples without a gold label
with open('myOutput_' + fname + '.csv', 'w') as outFile:
    outFile.write('label,bleu,similarity,wmd,crossUnigrams\n')
    for snt in reader:
        if snt['gold_label'] != '-':
            s1 = nlp(snt['sentence1'])
            s2 = nlp(snt['sentence2'])
            # sentence_bleu expects tokenized input, not the raw span text
            # (the original passed the .text strings directly).
            a = bleu_score.sentence_bleu([[t.text for t in s1]], [t.text for t in s2])
            b = s1.similarity(s2)
            c = wmd(s1.text, s2.text)
            d = crossUnigramsRatio(s1, s2)
            outFile.write('%s,%f,%f,%f,%f\n' % (snt['gold_label'], a, b, c, d))
        else:
            noLabel += 1.
print('Done calculating values')
# cumulative BLEU scores
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
print('Cumulative 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Cumulative 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)))
print('Cumulative 3-gram: %f' % sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0)))
print('Cumulative 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)))
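# A companion sketch for the cumulative scores above: individual
# (non-cumulative) n-gram BLEU puts all of the weight on a single order,
# rather than averaging over orders 1..n.
from nltk.translate.bleu_score import sentence_bleu

reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
print('Individual 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Individual 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 1, 0, 0)))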
def scoring(reference, candidate):
    # `reference` must be a list of tokenized reference sentences and
    # `candidate` a single tokenized sentence.
    score = sentence_bleu(reference, candidate)
    return score
def get_bleus(referencess, wordss):
    '''Return BLEU using nltk, and 0.0 for empty decoded sequences.'''
    return [sentence_bleu([r], s, smoothing_function=bleu_smoothing) if s else 0.0
            for r, s in zip(referencess, wordss)]
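# `bleu_smoothing` is referenced above but not defined in the snippet; a
# plausible binding (an assumption) plus a usage example:
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

bleu_smoothing = SmoothingFunction().method1  # assumed; the original binding is not shown

refs = [['the', 'cat', 'sat', 'on', 'the', 'mat'], ['hello', 'world']]
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat'], []]  # the empty hypothesis gets 0.0
print(get_bleus(refs, hyps))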
plot_attention("california is never pleasant during winter , and it is sometimes wonderful in december .") from nltk.translate.bleu_score import sentence_bleu # 存储每个句子的模型翻译结果# 存储每个句子 fr_preds = [] # 对样本中的每个英文进行翻译 for sentence in tqdm.tqdm(source_text.split("\n")): fr_pred = make_prediction(sentence) # 存储翻译结果 fr_preds.append(fr_pred) # 以样本中的法语翻译结果为reference references = target_text.split("\n") # 存储每个句子的BLEU分数 bleu_score = [] for i in tqdm.tqdm(range(len(fr_preds))): # 去掉特殊字符 pred = fr_preds[i].replace("<EOS>", "").replace("<PAD>", "").rstrip() reference = references[i].lower() # 计算BLEU分数 score = sentence_bleu([reference.split()], pred.split()) bleu_score.append(score) print("The BLEU score on our corpus is about {}".format(sum(bleu_score) / len(bleu_score)))
def bleu_score(dialogs, sample_amount):
    from nltk.translate.bleu_score import sentence_bleu
    from nltk.translate.bleu_score import SmoothingFunction
    smoothie = SmoothingFunction().method4

    if dialogs == 'cornell':
        dataset_folder_name = 'cornell-dialogs'
    else:
        dataset_folder_name = dialogs

    data = get_outputs_and_references(dataset_folder_name, sample_amount)
    # print('DATAAA')
    # print(data)

    sum_bleu_score_1 = 0
    sum_bleu_score_2 = 0
    sum_bleu_score_3 = 0
    sum_bleu_score_4 = 0
    i = 0
    for datum in data:
        reference = []
        reference_str = datum['reference'].lower()
        output_str = datum['output']
        reference_str = reference_str.replace(".", "")
        reference_str = reference_str.replace(",", "")
        output_str = output_str.replace(".", "")
        output_str = output_str.replace(",", "")
        reference.append(nltk.tokenize.word_tokenize(reference_str))
        candidate = nltk.tokenize.word_tokenize(output_str)
        print(reference)
        print(candidate)
        try:
            bleu_1 = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0),
                                   smoothing_function=smoothie)
        except Exception:
            print('BLEU Error')
            bleu_1 = float(os.getenv('defbleu1')) / float(100)
        try:
            bleu_2 = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0),
                                   smoothing_function=smoothie)
        except Exception:
            print('BLEU Error')
            bleu_2 = float(os.getenv('defbleu2')) / float(100)
        try:
            bleu_3 = sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0),
                                   smoothing_function=smoothie)
        except Exception:
            print('BLEU Error')
            bleu_3 = float(os.getenv('defbleu3')) / float(100)
        try:
            bleu_4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25),
                                   smoothing_function=smoothie)
        except Exception:
            print('BLEU Error')
            bleu_4 = float(os.getenv('defbleu4')) / float(100)
        print(bleu_1)
        print(bleu_2)
        print(bleu_3)
        print(bleu_4)
        sum_bleu_score_1 = sum_bleu_score_1 + bleu_1
        sum_bleu_score_2 = sum_bleu_score_2 + bleu_2
        sum_bleu_score_3 = sum_bleu_score_3 + bleu_3
        sum_bleu_score_4 = sum_bleu_score_4 + bleu_4
        i = i + 1
        print(i)

    # print('BLEU-1 : ' + str(round(sum_bleu_score_1/len(data), 4)))
    # print('BLEU-2 : ' + str(round(sum_bleu_score_2/len(data), 4)))
    # print('BLEU-3 : ' + str(round(sum_bleu_score_3/len(data), 4)))
    # print('BLEU-4 : ' + str(round(sum_bleu_score_4/len(data), 4)))
    # print('Cumulative BLEU-1 : ' + str(round(sum_bleu_score_1/len(data), 4)))
    # print('Cumulative BLEU-2 : ' + str(round(sum_bleu_score_2/len(data), 4)))
    # print('Cumulative BLEU-3 : ' + str(round(sum_bleu_score_3/len(data), 4)))
    num = os.getenv('bleu' + sys.argv[1]).split('/')
    print('Cumulative BLEU : ' + str(round(float(num[0]) / float(num[1]), 4)))
def similarity(attrs_text1, attrs_text2):
    # Compute the similarity of two attribute strings via BLEU.
    attrs1 = []
    attrs1.append(attrs_text1.split(' '))
    attrs2 = attrs_text2.split(' ')
    score = sentence_bleu(attrs1, attrs2)
    return score
pred_good, pred_bad, bleus = [], [], []
count = 0
for jpgfnm, image_feature, tokenized_text in zip(fnm_test, di_test, dt_test):
    count += 1
    if count % 200 == 0:
        print(" {:4.2f}% is done..".format(100 * count / float(len(fnm_test))))
    desc_true = [index_word[i] for i in tokenized_text]
    desc_true = desc_true[1:-1]  # remove startseq and endseq
    desc = predict_desc(image_feature.reshape(1, len(image_feature)))
    desc = desc.split()
    desc = desc[1:-1]  # remove startseq and endseq
    bleu = sentence_bleu([desc_true], desc)
    bleus.append(bleu)
    if bleu > 0.7 and len(pred_good) < nkeep:
        pred_good.append((bleu, jpgfnm, desc_true, desc))
    elif bleu < 0.3 and len(pred_bad) < nkeep:
        pred_bad.append((bleu, jpgfnm, desc_true, desc))

print('The average BLEU score is:', np.mean(bleus))


# demo: show the 'good' and 'bad' results
def plot_images(pred):
    def create_str(desc_true):
        line = ""
        for s in desc_true:
            line += " " + s
        return line
def test_partial_matches_hypothesis_longer_than_reference(self):
    references = ['John loves Mary'.split()]
    hypothesis = 'John loves Mary who loves Mike'.split()
    self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.4729, places=4)
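# Where 0.4729 comes from: the clipped precisions are p1 = 3/6 (the second
# "loves" in the hypothesis is clipped to its single reference count),
# p2 = 2/5, p3 = 1/4, and p4 = 0/3. With no smoothing, NLTK drops the
# zero-count order from the weighted log sum while keeping the uniform 0.25
# weights, and the brevity penalty is 1 because the hypothesis is longer than
# the reference. The arithmetic reproduces the expected value:
import math

p1, p2, p3 = 3 / 6, 2 / 5, 1 / 4  # p4 = 0/3 is dropped
bleu = math.exp(0.25 * (math.log(p1) + math.log(p2) + math.log(p3)))
print(round(bleu, 4))  # 0.4729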
def main():
    reports = {}
    with open(config.cleaned_reports) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=",")
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                line_count += 1
            else:
                uid, problems, findings, impression = row[1:]
                reports[str(uid)] = (parse_list(problems), findings, impression)
    train_reports, valid_reports, _ = create_report_splits(reports)
    train_dataset = data.XRayDataset(
        reports=train_reports,
        transform=transforms.Compose([
            transforms.Resize(299),
            transforms.RandomCrop((299, 299)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]))
    train_dataloader = torch.utils.data.dataloader.DataLoader(
        train_dataset, collate_fn=data.collate_fn, pin_memory=True,
        shuffle=True, drop_last=True, batch_size=config.batch_size,
        num_workers=config.batch_size)
    valid_dataset = data.XRayDataset(
        reports=valid_reports,
        transform=transforms.Compose([
            transforms.Resize(299),
            transforms.CenterCrop((299, 299)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]))
    valid_dataset.tokenizer = train_dataset.tokenizer
    valid_dataloader = torch.utils.data.dataloader.DataLoader(
        valid_dataset, collate_fn=data.collate_fn, pin_memory=True,
        shuffle=True, drop_last=True, batch_size=config.batch_size,
        num_workers=config.batch_size)
    num_classes = len(train_dataset.classes)
    encoder = models.EncoderCNN(config.emb_dim, num_classes).to(
        config.device, memory_format=memory_format)
    decoder = models.DecoderRNN_Word(
        config.emb_dim, config.hidden_dim, train_dataset.tokenizer,
        config.num_layers).to(config.device, memory_format=memory_format)
    classes_loss = torch.nn.BCEWithLogitsLoss()
    outputs_loss = torch.nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = apex.optimizers.FusedAdam(params, lr=config.learning_rate)
    [encoder, decoder], optimizer = apex.amp.initialize(
        [encoder, decoder], optimizer, opt_level="O1")

    def train_one_epoch(dataloader, batch_size, encoder, decoder,
                        classes_loss, outputs_loss, optimizer, train=True):
        total_step = len(dataloader.dataset) // batch_size
        if train:
            encoder.train()
            decoder.train()
        else:
            encoder.eval()
            decoder.eval()
        running_c_loss = torch.Tensor([0.0])
        running_o_loss = torch.Tensor([0.0])
        state_h, state_c = decoder.zero_state(batch_size)
        with torch.set_grad_enabled(train):
            for i, (images, class_labels, captions, lengths) in enumerate(progress_bar(dataloader)):
                images = images.to(config.device, non_blocking=True).contiguous(
                    memory_format=memory_format)
                captions = captions.to(config.device, non_blocking=True)
                class_labels = class_labels.to(config.device, non_blocking=True)
                lengths = [o - 1 for o in lengths]
                targets = torch.nn.utils.rnn.pack_padded_sequence(
                    captions[:, 1:], lengths, batch_first=True,
                    enforce_sorted=False)[0]
                encoder.zero_grad()
                decoder.zero_grad()
                logits, features = encoder(images)
                c_loss = classes_loss(logits, class_labels)
                outputs, (state_h, state_c) = decoder(
                    features, captions[:, :-1], lengths, (state_h, state_c))
                o_loss = outputs_loss(outputs, targets)
                if train:
                    with apex.amp.scale_loss(c_loss, optimizer) as scaled_loss:
                        scaled_loss.backward(retain_graph=True)
                    with apex.amp.scale_loss(o_loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    state_h = state_h.detach()
                    state_c = state_c.detach()
                    optimizer.step()
                running_c_loss += c_loss
                running_o_loss += o_loss
        c_loss = float(running_c_loss.item() / total_step)
        o_loss = float(running_o_loss.item() / total_step)
        return c_loss, o_loss

    batch_size = config.batch_size
    if not args.test:
        print("Start training")
        history = {
            "train_c_loss": [],
            "train_o_loss": [],
            "valid_c_loss": [],
            "valid_o_loss": []
        }
        best_loss = 100
        for epoch in range(num_epochs):
            print("\nEpoch", epoch + 1, "/", num_epochs, ":\n")
            train_c_loss, train_o_loss = train_one_epoch(
                train_dataloader, batch_size, encoder, decoder,
                classes_loss, outputs_loss, optimizer, train=True)
            print("* train_loss - ", round(train_c_loss, 3),
                  round(train_o_loss, 3), "- perplexity -",
                  round(np.exp(train_o_loss), 3))
            history["train_c_loss"].append(train_c_loss)
            history["train_o_loss"].append(train_o_loss)
            valid_c_loss, valid_o_loss = train_one_epoch(
                valid_dataloader, batch_size, encoder, decoder,
                classes_loss, outputs_loss, optimizer, train=False)
            print("* valid_loss - ", round(valid_c_loss, 3),
                  round(valid_o_loss, 3), "- perplexity -",
                  round(np.exp(valid_o_loss), 3))
            history["valid_c_loss"].append(valid_c_loss)
            history["valid_o_loss"].append(valid_o_loss)
            current_valid_loss = valid_o_loss
            if current_valid_loss < best_loss:
                print("* best loss, saving weights")
                best_loss = current_valid_loss
                torch.save(encoder.state_dict(), outdir + "encoder_word.pt")
                torch.save(decoder.state_dict(), outdir + "decoder_word.pt")
        print("Save history to CSV file")
        df = pd.DataFrame(
            list(zip(history["train_c_loss"], history["train_o_loss"],
                     history["valid_c_loss"], history["valid_o_loss"])),
            columns=["train_c_loss", "train_o_loss",
                     "valid_c_loss", "valid_o_loss"])
        df.to_csv(outdir + "history.csv")
    print("Load weights and run mAP and BLEU eval")
    encoder.load_state_dict(torch.load(outdir + "encoder_word.pt"))
    decoder.load_state_dict(torch.load(outdir + "decoder_word.pt"))
    y_true, y_pred = get_class_predictions(encoder, train_dataset)
    recall, precision, AP, train_mAP = evaluate_encoder_predictions(y_true, y_pred)
    y_true, y_pred = get_class_predictions(encoder, valid_dataset)
    recall, precision, AP, valid_mAP = evaluate_encoder_predictions(y_true, y_pred)
    print("* train mAP -", round(train_mAP, 3),
          "- valid mAP -", round(valid_mAP, 3))
    bleu_scores = []
    for name, dataloader in zip(["train", "valid"],
                                [train_dataloader, valid_dataloader]):
        encoder.eval()
        decoder.eval()
        running_bleu = 0.0
        dataset_len = len(dataloader.dataset)
        with torch.set_grad_enabled(False):
            for index in trange(0, dataset_len):
                image, problems, impression = dataloader.dataset.__getitem__(index)
                image_tensor = image.unsqueeze(0).to(device)
                logits, features = encoder(image_tensor)
                # seed = []
                # seed = torch.from_numpy(train_dataset.tokenizer.encode(seed)).unsqueeze(0).cuda()
                # predictions, seed, decode_lengths, alphas = decoder.sample(features, seed, [32, ])
                # sampled_ids = list(predictions[0].cpu().numpy())
                # sampled_ids = decoder.beam_decode(features)
                sampled_ids = decoder.greedy_decode(features)
                sampled_ids = [i for i in sampled_ids]
                original = train_dataset.tokenizer.decode(impression[1:-1])
                generated = train_dataset.tokenizer.decode(sampled_ids[:-1])
                reference = [nltk.word_tokenize(original)]
                candidate = nltk.word_tokenize(generated)
                bleu_score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
                running_bleu += bleu_score
        bleu_score = running_bleu / dataset_len
        bleu_scores.append(bleu_score)
    print("* train/valid BLEU-1 scores", bleu_scores)
def test_empty_hypothesis(self):
    # Test case where the hypothesis is empty.
    references = ['The candidate has no alignment to any of the references'.split()]
    hypothesis = []
    assert sentence_bleu(references, hypothesis) == 0
def calc_bleu_many(cand_seq, ref_sequences):
    sf = bleu_score.SmoothingFunction()
    return bleu_score.sentence_bleu(ref_sequences, cand_seq,
                                    smoothing_function=sf.method1,
                                    weights=(0.5, 0.5))
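# With weights=(0.5, 0.5) the helper above computes BLEU-2: a smoothed
# geometric mean of unigram and bigram precision only, which is less punishing
# than 4-gram BLEU for short sequences. A usage sketch with made-up data:
from nltk.translate import bleu_score

refs = ['i am fine thanks'.split(), 'i am doing well'.split()]
cand = 'i am fine'.split()
print(calc_bleu_many(cand, refs))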
def test_empty_references_and_hypothesis(self):
    # Test case where both the references and the hypothesis are empty.
    references = [[]]
    hypothesis = []
    assert sentence_bleu(references, hypothesis) == 0
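# The two tests above document that sentence_bleu returns 0 for empty inputs,
# though it may still print zero-overlap warnings along the way. A sketch of an
# explicit guard (hypothetical helper, in the spirit of get_bleus earlier):
from nltk.translate.bleu_score import sentence_bleu

def safe_bleu(references, hypothesis):
    if not hypothesis or not any(references):
        return 0.0
    return sentence_bleu(references, hypothesis)

print(safe_bleu([[]], []))  # 0.0 without invoking NLTK at all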
def translate_en_fr(src_sent):
    # read checkpoint path, number indicates the latest step
    CHECKPOINT_PATH = "INFO7374-12200"
    tf.reset_default_graph()
    # define the trained model
    with tf.variable_scope("nmt_model", reuse=None):
        model = NMTModel()
    # sentence for testing
    test_en_text = src_sent
    # vocab files
    SRC_VOCAB = "vocab.en"
    TRG_VOCAB = "vocab.fr"
    # convert the sentence to word indices according to the vocab
    with codecs.open(SRC_VOCAB, "r", "utf-8") as f_vocab:
        src_vocab = [w.strip() for w in f_vocab.readlines()]
        src_id_dict = dict((src_vocab[x], x) for x in range(len(src_vocab)))
    test_en_ids = [(src_id_dict[token] if token in src_id_dict else src_id_dict['<unk>'])
                   for token in test_en_text.split()]
    # build inference based on saved model weights
    output_op = model.inference(test_en_ids)
    sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(sess, CHECKPOINT_PATH)
    # read translation output
    output_ids = sess.run(output_op)
    sess.close()  # close the session once the output has been fetched
    # convert translation indices into words
    with codecs.open(TRG_VOCAB, "r", "utf-8") as f_vocab:
        trg_vocab = [w.strip() for w in f_vocab.readlines()]
    output_text = ' '.join([trg_vocab[x] for x in output_ids])
    # output translation; note str.strip takes a character set, so the
    # '<eos>' token is removed with replace() instead
    final_output_text = output_text.encode('utf8').decode(
        sys.stdout.encoding).replace('<eos>', '').strip()
    # load test set - size: 100
    src_test = []
    with open('test.en', 'r', encoding='utf-8') as f:
        for line in f:
            src_test.append(line.strip())
    tgt_test = []
    with open('test.fr', 'r', encoding='utf-8') as f:
        for line in f:
            tgt_test.append(line.strip())
    if src_sent in src_test:
        idx = src_test.index(src_sent)
        trgt_sent = tgt_test[idx]
        # sentence_bleu expects a list of tokenized references and a
        # tokenized hypothesis, not raw strings
        bleu = sentence_bleu([trgt_sent.split()], final_output_text.split())
        lst = levenshtein(trgt_sent, final_output_text)
    else:
        trgt_sent = 'Not Available In App Test Set'
        bleu = 'NA'
        lst = 'NA'
    return output_text[6:-7], trgt_sent, bleu, lst
def bleuScore(self, s1, s2):
    # s1 must be a list of tokenized reference sentences,
    # s2 a single tokenized hypothesis.
    return bleu_score.sentence_bleu(s1, s2)
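# The call shape matters: sentence_bleu's first argument is a list of
# tokenized references, the second a tokenized hypothesis. Passing a single
# token list as the first argument treats each token string as its own
# reference, so the overlap is computed against character n-grams and is
# silently wrong. A sketch of the intended usage:
from nltk.translate import bleu_score

reference = 'the cat is on the mat'.split()
hypothesis = 'the cat sat on the mat'.split()
print(bleu_score.sentence_bleu([reference], hypothesis))  # note the wrapping list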
def main(_):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    generator, rnnlm, style_discriminator, siamese_discriminator, semantic_discriminator, rollout, vocab, tsf_vocab_inv = \
        pretrain.create_model(sess, save_folder, FLAGS, embed_fn)
    saver = tf.train.Saver(tf.all_variables())
    MODEL = FLAGS.model_path
    try:
        saver.restore(sess, MODEL)
    except Exception:
        print("Error: No model found in {}".format(MODEL))
        sys.exit(0)
    # load test data
    test_orig_sents, test_orig_words, test_orig_len = data_helpers.loadTestInputs(FLAGS.max_sent_len, save_folder)
    print("test size: {}".format(len(test_orig_sents)))
    dump_folder = "../dump/" + str(FLAGS.data_type) + "/"
    # output_path = dump_folder + FLAGS.output_path
    output_path = FLAGS.output_path + "outputs.txt"
    log_path = output_path + "logs.txt"
    f = open(output_path, "w")
    g = open(log_path, "w")
    ind = 0
    total_bleu = 0
    total_sem = 0
    total_loss = 0
    while ind < len(test_orig_sents):
        input_sents = test_orig_sents[ind:ind + FLAGS.batch_size]
        input_len = test_orig_len[ind:ind + FLAGS.batch_size]
        orig_words = test_orig_words[ind:ind + FLAGS.batch_size]
        # pad to batch size
        if ind + FLAGS.batch_size > len(test_orig_sents):
            input_sents = test_orig_sents[ind:] + [test_orig_sents[ind]] * (ind + FLAGS.batch_size - len(test_orig_sents))
            input_len = test_orig_len[ind:] + [test_orig_len[ind]] * (ind + FLAGS.batch_size - len(test_orig_sents))
            orig_words = test_orig_words[ind:] + [test_orig_words[ind]] * (ind + FLAGS.batch_size - len(test_orig_sents))
        # generator_outputs: [batch_size, time, beam_width]
        beam_generator_outputs = generator.generate(sess, input_sents, input_len)
        # generator_outputs: [batch_size, time]
        generator_outputs = np.array(beam_generator_outputs)[:, :, 0]
        generator_outputs_raw, generator_outputs_len = data_helpers.cleanGeneratorOutputs(generator_outputs, FLAGS.max_sent_len)
        generator_outputs = data_helpers.cleanTexts(generator_outputs, FLAGS.max_sent_len)
        style_loss = np.sum(style_discriminator.getStyleReward(sess, generator_outputs_raw, generator_outputs_len), axis=0)
        lm_loss = np.sum(rnnlm.getLMReward(sess, generator_outputs_raw), axis=0)
        tsf_words = data_helpers.convertIdxToWords(generator_outputs, tsf_vocab_inv)
        tmp_ind = ind
        ind += FLAGS.batch_size
        batch_bleu = 0
        for (orig_word_seq, tsf_word_seq) in zip(orig_words, tsf_words):
            if tmp_ind >= len(test_orig_sents):
                break
            # output
            f.write(" ".join(tsf_word_seq) + "\n")
            # log
            g.write("orig:\t" + " ".join(orig_word_seq) + "\n")
            g.write("tsf:\t" + " ".join(tsf_word_seq) + "\n")
            # the reference must be wrapped in a list of tokenized references
            score = sentence_bleu([orig_word_seq], tsf_word_seq)
            batch_bleu += score
            g.write("\n")
            tmp_ind += 1
        print("bleu score is {}".format(batch_bleu / FLAGS.batch_size))
        total_bleu += batch_bleu
        total_loss += style_loss
        total_sem += lm_loss
    print("total bleu score is {}".format(total_bleu / len(test_orig_sents)))
    print("total style score is {}".format(total_loss / len(test_orig_sents)))
    print("total semantic score is {}".format(total_sem / len(test_orig_sents)))
    f.close()
    g.close()
    print("done saving tsf sents to", output_path)
    print("done saving both orig and tsf logs to", log_path)
def bleu_compute(reference, candidate):
    score = sentence_bleu(reference, candidate)
    return score
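# bleu_compute forwards its arguments unchanged, so `reference` must already
# be a list of tokenized reference sentences and `candidate` a token list.
# A usage sketch with made-up data:
from nltk.translate.bleu_score import sentence_bleu

references = ['the quick brown fox jumps'.split()]
candidate = 'a quick brown fox jumps'.split()
print(bleu_compute(references, candidate))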
def evaluate_autoencoder(whichdecoder, data_source, epoch):
    # Turn on evaluation mode which disables dropout.
    eos_id = corpus.dictionary.word2idx['<eos>']
    autoencoder.eval()
    ntokens = len(corpus.dictionary.word2idx)
    n_sents = 0.0
    total_loss = 0.0
    token_accuracies = 0.0
    all_source_sents = []
    all_transfer_sents = []
    pbar = tqdm(range(len(data_source)))
    for ii in pbar:
        batch = data_source[ii]
        source, target, lengths = batch
        source = to_gpu(use_cuda, Variable(source, requires_grad=False))
        target = to_gpu(use_cuda, Variable(target, requires_grad=False))
        n_sents += source.size()[0]
        mask = target.gt(0)
        masked_target = target.masked_select(mask)
        # examples x ntokens
        output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens)
        hidden = autoencoder(0, source, lengths, noise=False, encode_only=True)
        # output: batch x seq_len x ntokens
        if whichdecoder == 0:
            output = autoencoder(0, source, lengths, noise=False)
            flattened_output = output.view(-1, ntokens)
            masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens)
            # accuracy
            max_vals1, max_indices1 = torch.max(masked_output, 1)
            token_accuracies += torch.mean(max_indices1.eq(masked_target).float()).item()
            max_values1, max_indices1 = torch.max(output, 2)
            max_indices2 = autoencoder.generate(1, hidden, maxlen=50)
        else:
            output = autoencoder(1, source, lengths, noise=False)
            flattened_output = output.view(-1, ntokens)
            masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens)
            # accuracy
            max_vals2, max_indices2 = torch.max(masked_output, 1)
            token_accuracies += torch.mean(max_indices2.eq(masked_target).float()).item()
            max_values2, max_indices2 = torch.max(output, 2)
            max_indices1 = autoencoder.generate(0, hidden, maxlen=50)
        total_loss += criterion_ce(masked_output / args.temp, masked_target).data
        # all_source_sents, all_transfer_sents
        max_indices1 = max_indices1.view(output.size(0), -1).data.cpu().numpy()
        max_indices2 = max_indices2.view(output.size(0), -1).data.cpu().numpy()
        target = target.view(output.size(0), -1).data.cpu().numpy()
        tran_indices = max_indices2 if whichdecoder == 0 else max_indices1
        for t, tran_idx in zip(target, tran_indices):
            # real sentence
            truncated_to_eos = t.tolist().index(eos_id) if eos_id in t.tolist() else len(t)
            chars = " ".join([corpus.dictionary.idx2word[x] for x in t[:truncated_to_eos]])
            all_source_sents.append(chars)
            # transfer sentence
            truncated_to_eos = tran_idx.tolist().index(eos_id) if eos_id in tran_idx.tolist() else len(tran_idx)
            chars = " ".join([corpus.dictionary.idx2word[x] for x in tran_idx[:truncated_to_eos]])
            all_transfer_sents.append(chars)
    # compare the original and transfer
    aeoutf_from = "{}/{}_output_decoder_{}_from.txt".format(args.outf, epoch, whichdecoder)
    aeoutf_tran = "{}/{}_output_decoder_{}_tran.txt".format(args.outf, epoch, whichdecoder)
    with open(aeoutf_from, 'w') as f_from, open(aeoutf_tran, 'w') as f_trans:
        # laplacian smoothing
        # for word in corpus.dictionary.word2idx.keys():
        #     f_from.write(word + "\n")
        #     f_trans.write(word + "\n")
        for i in range(len(all_source_sents)):
            # real sentence
            f_from.write(all_source_sents[i])
            # transfer sentence
            f_trans.write(all_transfer_sents[i])
            if i != len(all_source_sents) - 1:
                f_from.write("\n")
                f_trans.write("\n")
    # bleu
    all_bleu_scores = 0.0
    for i in range(len(all_source_sents)):
        sou = all_source_sents[i].split(' ')
        tran = all_transfer_sents[i].split(' ')
        all_bleu_scores += sentence_bleu(
            [sou], tran,
            smoothing_function=SmoothingFunction().method7,
            weights=[1.0 / 3.0] * 3)
    bleu = all_bleu_scores / n_sents * 100.0
    # forward and reverse
    loss = total_loss.item() / len(data_source)
    ppl = math.exp(loss)
    # print('bleu {:4.2f} | ppl {:4.3f}'.format(bleu, ppl))
    # logging.info('bleu {:4.2f} | ppl {:4.3f}'.format(bleu, ppl))
    # transfer
    labels = fasttext_classifier.predict(all_transfer_sents)
    truth = str(1 - whichdecoder)
    transfer = float(sum([l == truth for ll in labels for l in ll])) / n_sents * 100.0
    # load sentences to evaluate on
    arpa_path = '{}/{}_lm_{}.arpa'.format(args.outf, epoch, whichdecoder)
    kenlm_model = train_ngram_lm(args.kenlm_path, aeoutf_from, arpa_path, args.N)
    forward = get_ppl(kenlm_model, all_transfer_sents)
    kenlm_model = train_ngram_lm(args.kenlm_path, aeoutf_tran, arpa_path, args.N)
    reverse = get_ppl(kenlm_model, all_source_sents)
    # print('transfer {:4.2f} | forward {:4.3f} | reverse {:4.3f}'.format(transfer, forward, reverse))
    # logging.info('transfer {:4.2f} | forward {:4.3f} | reverse {:4.3f}'.format(transfer, forward, reverse))
    return bleu, ppl, transfer, forward, reverse