コード例 #1
0
def moses_bl_rouge(p, l):
    """Print the Moses BLEU score followed by ROUGE-1/2/L F, P and R scores.

    Both arguments are forwarded unchanged, in the same order, to
    ``bleu.moses_multi_bleu`` and ``rouge.rouge``.

    Args:
        p: First scorer argument (presumably the predictions -- confirm
            against the scorer modules).
        l: Second scorer argument (presumably the references/labels).

    Returns:
        None. Ten ``%f``-formatted values are written to stdout, one per line.
    """
    bleu_score = bleu.moses_multi_bleu(p, l)
    rouge_scores = rouge.rouge(p, l)

    # Keys are listed in the exact order the report template consumes them.
    rouge_keys = (
        'rouge_1/f_score', 'rouge_1/p_score', 'rouge_1/r_score',
        'rouge_2/f_score', 'rouge_2/p_score', 'rouge_2/r_score',
        'rouge_l/f_score', 'rouge_l/p_score', 'rouge_l/r_score',
    )
    report = (
        'Moses BLEU: %f\n'
        'ROUGE1-F: %f\nROUGE1-P: %f\nROUGE1-R: %f\n'
        'ROUGE2-F: %f\nROUGE2-P: %f\nROUGE2-R: %f\n'
        'ROUGEL-F: %f\nROUGEL-P: %f\nROUGEL-R: %f'
    )
    values = (bleu_score, ) + tuple(rouge_scores[key] for key in rouge_keys)
    print(report % values)
コード例 #2
0
def bleu_eval(ref_dir, dec_dir):
    """Compute the Moses BLEU score of decoded files against reference files.

    Every ``*.txt`` file directly inside each directory is read, its lines are
    stripped and concatenated (without a separator) into one string, and the
    resulting lists are scored with ``bleu.moses_multi_bleu``.

    Files are paired by position after sorting each directory listing; if the
    directories hold a different number of files, the extra ones are ignored
    because ``zip`` stops at the shorter listing.

    Args:
        ref_dir: Directory containing the reference ``.txt`` files.
        dec_dir: Directory containing the decoded ``.txt`` files.

    Returns:
        Whatever ``bleu.moses_multi_bleu(decoded, references)`` returns.
    """

    def _read_joined(path):
        # Context manager guarantees the handle is closed (the previous
        # bare open() calls leaked one descriptor per file).
        with open(path) as handle:
            return ''.join(line.strip() for line in handle)

    dec_paths = sorted(glob.glob(dec_dir + '/' + '*.txt'))
    ref_paths = sorted(glob.glob(ref_dir + '/' + '*.txt'))

    ref = []
    dec = []
    for dec_path, ref_path in zip(dec_paths, ref_paths):
        dec.append(_read_joined(dec_path))
        ref.append(_read_joined(ref_path))

    return bleu.moses_multi_bleu(dec, ref)
コード例 #3
0
def get_metrics(f1, f2):
    """Score decoded files against reference files and format the result.

    Each file is collapsed into a single string by stripping its lines and
    concatenating them without a separator. Files are paired by position
    after sorting both glob results; ``zip`` stops at the shorter listing.

    Args:
        f1: Glob pattern matching the decoded files.
        f2: Glob pattern matching the reference files.

    Returns:
        A tab-prefixed, tab-separated string with four ``%.2f`` fields:
        the BLEU score, then ROUGE-1, ROUGE-2 and ROUGE-L F-scores, each
        multiplied by 100. The number of file pairs is printed to stdout
        as a side effect.
    """

    def _read_joined(path):
        # 'with' closes the handle; the previous bare open() leaked it.
        with open(path) as handle:
            return ''.join(line.strip() for line in handle)

    ref = []
    decoded = []
    for dec_path, ref_path in zip(sorted(glob.glob(f1)),
                                  sorted(glob.glob(f2))):
        decoded.append(_read_joined(dec_path))
        ref.append(_read_joined(ref_path))

    bl = bleu.moses_multi_bleu(decoded, ref)
    x = rouge.rouge(decoded, ref)
    s = "\t%.2f\t%.2f\t%.2f\t%.2f" % (bl, x['rouge_1/f_score'] * 100,
                                      x['rouge_2/f_score'] * 100,
                                      x['rouge_l/f_score'] * 100)
    # One entry is appended per processed pair, so this equals the pair count.
    print(len(decoded))
    return s
コード例 #4
0
def _bleu_fn(hypotheses, references):
    """Compute the Moses BLEU score for arrays of hypothesis/reference text.

    Args:
        hypotheses: NumPy array of strings. Unicode arrays (dtype kind "U")
            are first encoded to UTF-8 bytes; every element must be a bytes
            object afterwards because it is decoded again below.
        references: NumPy array of strings, handled like ``hypotheses``.

    Returns:
        The value returned by ``bleu.moses_multi_bleu`` for the sliced
        texts (called with ``lowercase=False``). The score is also printed.
    """
    # Deal with byte chars. ``dtype.kind`` is a one-character code, so it is
    # compared with the code "U" itself rather than with a dtype object
    # (which only worked through NumPy's implicit str -> dtype coercion).
    if hypotheses.dtype.kind == "U":
        hypotheses = np.char.encode(hypotheses, "utf-8")
    if references.dtype.kind == "U":
        references = np.char.encode(references, "utf-8")

    # Convert back to unicode objects
    hypotheses = [item.decode("utf-8") for item in hypotheses]
    references = [item.decode("utf-8") for item in references]

    # Slice all hypotheses and references up to SOS -> EOS
    sliced_hypotheses = [utils.slice_text(text) for text in hypotheses]
    sliced_references = [utils.slice_text(text) for text in references]

    bleu_score = bleu.moses_multi_bleu(sliced_hypotheses, sliced_references, lowercase=False) #pylint: disable=E1102
    print('bleu_score:', bleu_score)
    return bleu_score
コード例 #5
0
def get_metrics(f1, f2):
    """Compute the Moses BLEU score of decoded files against references.

    Each file is collapsed into a single string by stripping its lines and
    concatenating them without a separator. Files are paired by position
    after sorting both glob results; ``zip`` stops at the shorter listing.

    Args:
        f1: Glob pattern matching the decoded files.
        f2: Glob pattern matching the reference files.

    Returns:
        The tuple ``(0, 0, 0, bleu)``. The first three slots are constant
        zeros; only the last element carries a computed value. Both patterns
        and the sizes of both text lists are printed as a side effect.
    """

    def _read_joined(path):
        # 'with' closes the handle; the previous bare open() leaked it.
        with open(path) as handle:
            return ''.join(line.strip() for line in handle)

    ref = []
    decoded = []
    print(f1)
    print(f2)
    for dec_path, ref_path in zip(sorted(glob.glob(f1)),
                                  sorted(glob.glob(f2))):
        decoded.append(_read_joined(dec_path))
        ref.append(_read_joined(ref_path))
    print(len(decoded))
    print(len(ref))

    # NOTE(review): the ROUGE result is discarded. The call is kept so that
    # any error or side effect it has stays as before -- confirm whether it
    # can be dropped.
    rouge.rouge(decoded, ref)

    bl = bleu.moses_multi_bleu(decoded, ref)  #replace by pycoco bleu
    return 0, 0, 0, bl
コード例 #6
0
def calculate_metrics(y_pred,
                      y_true,
                      orig_y_pred=None,
                      verbose=False,
                      bleu=False):
    """Calculate token-level metrics between predictions and ground truths.

    Only the first ``min(len(y_pred), len(y_true))`` pairs are scored; a
    warning is printed when the lengths differ.

    Args:
        y_pred: List of predicted strings (whitespace-tokenised here).
        y_true: List of ground-truth strings.
        orig_y_pred: Optional list of "original" predictions. When given,
            per-sample counts are collected of positions where the original
            prediction matched the truth (``li_orig_match``) and of those
            positions where ``y_pred`` no longer matches (``li_exact_match``).
        verbose: Wrap the sample loop in a ``tqdm`` progress bar.
        bleu: Also compute ``moses_multi_bleu`` and store it under ``'BLEU'``.

    Returns:
        dict with precision, recall, f1, exact_match and word-level accuracy
        (all scaled by 100), ``total_samples``, the ``asr_*`` statistics, the
        two per-sample count lists and ``exact_match_idx`` (indices of exact
        matches longer than one token and without ``'<unk>'``).
        With zero samples (or no reference tokens) the corresponding ratios
        are reported as 0.0 instead of raising ``ZeroDivisionError``.
        ``asr_sample_std`` is NaN when no per-sample ratios exist.
    """
    N = min(len(y_pred), len(y_true))
    if len(y_pred) != len(y_true):
        print(
            'Warning: The number of predictions and ground truths are not equal, calculating metrics over %d points'
            % N)

    # for precision, recall, f1
    tp = 0
    fp = 0
    fn = 0

    # for exact match
    exact_match = 0
    exact_match_idx = []
    li_exact_match, li_orig_match = [], []

    # for word-level accuracy
    correct_words = 0
    total_words = 0

    # Developer switch: 1 = frequency-based counting, 2 = code2seq-style
    # presence-based counting.
    calc_type = 2

    indices = tqdm.tqdm(range(N)) if verbose else range(N)

    for i in indices:
        pred = y_pred[i].split()
        true = y_true[i].split()

        total_words += len(true)
        # Position-wise comparison over the common prefix length.
        for pred_word, true_word in zip(pred, true):
            if pred_word == true_word:
                correct_words += 1

        d_pred, d_true = get_freqs(pred, true)

        if pred == true:
            exact_match += 1
            if len(pred) > 1 and ('<unk>' not in pred):
                exact_match_idx.append(i)

        if orig_y_pred is not None:
            orig_pred = orig_y_pred[i].split()
            flipped, orig_correct = 0, 0
            for true_word, orig_word, pred_word in zip(true, orig_pred, pred):
                if true_word == orig_word:
                    orig_correct += 1
                    # Originally correct position that is now wrong.
                    if pred_word != true_word:
                        flipped += 1
            li_exact_match.append(flipped)
            li_orig_match.append(orig_correct)

        if calc_type == 1:
            # this is my implementation
            for word in d_pred:
                tp += min(d_pred[word], d_true[word])
                fp += max(0, d_pred[word] - d_true[word])
                fn += max(0, d_true[word] - d_pred[word])
        else:
            # this is the code2seq implementation
            # (relies on get_freqs giving both dicts an entry for every
            # word iterated here -- TODO confirm against get_freqs)
            for word in d_pred:
                if d_pred[word] > 0:
                    if d_true[word] > 0:
                        tp += 1
                    else:
                        fp += 1
                if d_true[word] > 0 and d_pred[word] == 0:
                    fn += 1

    # The small epsilon keeps these ratios defined when a denominator is 0.
    precision = tp / (tp + fp + 0.0000000001)
    recall = tp / (tp + fn + 0.0000000001)
    f1 = 2 * precision * recall / (precision + recall + 0.0000000001)
    # Guard the plain divisions: empty input must not crash the report.
    exact_match = exact_match / N if N else 0.0
    word_level_accuracy = correct_words / total_words if total_words else 0.0

    sum_li = sum(li_orig_match) or 1
    asr_dataset = round(sum(li_exact_match) / sum_li * 100, 2)
    ax = [
        e / o if o != 0 else 0 for e, o in zip(li_exact_match, li_orig_match)
    ]
    # NOTE(review): this "mean" divides by the summed match count rather than
    # by len(ax); kept unchanged to preserve reported numbers -- confirm.
    asr_sample_mean = round(sum(ax) / sum_li, 2)
    if ax:
        asr_sample_std = round(np.std(np.array(ax)), 2)
    else:
        # np.std of an empty array is NaN plus a RuntimeWarning; return the
        # same value without triggering the warning.
        asr_sample_std = float('nan')

    d = {
        'precision': precision * 100,
        'recall': recall * 100,
        'f1': f1 * 100,
        'exact_match': exact_match * 100,
        'word-level accuracy': word_level_accuracy * 100,
        'total_samples': N,
        'asr_dataset': asr_dataset,
        'asr_sample_mean': asr_sample_mean,
        'asr_sample_std': asr_sample_std,
        'li_exact_match': li_exact_match,
        'li_orig_match': li_orig_match,
        'exact_match_idx': exact_match_idx
    }

    if bleu:
        bleu_score = moses_multi_bleu(np.array(y_pred), np.array(y_true))
        d['BLEU'] = bleu_score

    return d
コード例 #7
0
def calculate_metrics(y_pred, y_true, verbose=False, bleu=False):
    """Calculate token-level metrics between predictions and ground truths.

    Only the first ``min(len(y_pred), len(y_true))`` pairs are scored; a
    warning is printed when the lengths differ.

    Args:
        y_pred: List of predicted strings (whitespace-tokenised here).
        y_true: List of ground-truth strings.
        verbose: Wrap the sample loop in a ``tqdm`` progress bar.
        bleu: Also compute ``moses_multi_bleu`` and store it under ``'BLEU'``.

    Returns:
        dict with ``precision``, ``recall``, ``f1``, ``exact_match`` and
        ``word-level accuracy``, all scaled by 100. With zero samples (or no
        reference tokens) the affected ratios are reported as 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    N = min(len(y_pred), len(y_true))
    if len(y_pred) != len(y_true):
        print('Warning: The number of predictions and ground truths are not equal, calculating metrics over %d points' % N)

    # for precision, recall, f1
    tp = 0
    fp = 0
    fn = 0

    # for exact match
    exact_match = 0

    # for word-level accuracy
    correct_words = 0
    total_words = 0

    # Developer switch: 1 = frequency-based counting, 2 = code2seq-style
    # presence-based counting.
    calc_type = 2

    indices = tqdm.tqdm(range(N)) if verbose else range(N)

    for i in indices:
        pred = y_pred[i].split()
        true = y_true[i].split()

        total_words += len(true)
        # Position-wise comparison over the common prefix length.
        for pred_word, true_word in zip(pred, true):
            if pred_word == true_word:
                correct_words += 1

        d_pred, d_true = get_freqs(pred, true)
        if pred == true:
            exact_match += 1

        if calc_type == 1:
            # this is my implementation
            for word in d_pred:
                tp += min(d_pred[word], d_true[word])
                fp += max(0, d_pred[word] - d_true[word])
                fn += max(0, d_true[word] - d_pred[word])
        else:
            # this is the code2seq implementation
            # (relies on get_freqs giving both dicts an entry for every
            # word iterated here -- TODO confirm against get_freqs)
            for word in d_pred:
                if d_pred[word] > 0:
                    if d_true[word] > 0:
                        tp += 1
                    else:
                        fp += 1

                if d_true[word] > 0 and d_pred[word] == 0:
                    fn += 1

    # The small epsilon keeps these ratios defined when a denominator is 0.
    precision = tp / (tp + fp + 0.0000000001)
    recall = tp / (tp + fn + 0.0000000001)
    f1 = 2 * precision * recall / (precision + recall + 0.0000000001)
    # Guard the plain divisions: empty input must not crash the report.
    exact_match = exact_match / N if N else 0.0
    word_level_accuracy = correct_words / total_words if total_words else 0.0

    d = {
        'precision': precision * 100,
        'recall': recall * 100,
        'f1': f1 * 100,
        'exact_match': exact_match * 100,
        'word-level accuracy': word_level_accuracy * 100,
    }

    if bleu:
        bleu_score = moses_multi_bleu(np.array(y_pred), np.array(y_true))
        d['BLEU'] = bleu_score

    return d
コード例 #8
0
import sys
import glob
import rouge
import bleu
import pandas as pd
# Command-line script: score decoded summaries against references.
#   argv[1]: path prefix of the decoded files   (globbed as <prefix>*.txt)
#   argv[2]: path prefix of the reference files (globbed as <prefix>*.txt)
# Writes the paired texts to analysis.csv and prints BLEU, ROUGE-1, ROUGE-2
# and ROUGE-L F-scores (ROUGE values scaled by 100), tab-separated.
f1 = sys.argv[1]  #decoded
f2 = sys.argv[2]  #reference
ref = []
decoded = []

# Files are paired by position after sorting; zip stops at the shorter list.
for dec_path, ref_path in zip(sorted(glob.glob(f1 + '*.txt')),
                              sorted(glob.glob(f2 + '*.txt'))):
    # Each file becomes one string: lines stripped and joined with no
    # separator. 'with' closes the handles, which the bare open() calls
    # used previously did not.
    with open(dec_path) as dec_file:
        dec_tex = ''.join(line.strip() for line in dec_file)
    with open(ref_path) as ref_file:
        ref_tex = ''.join(line.strip() for line in ref_file)
    ref.append(ref_tex)
    decoded.append(dec_tex)

data = {'decoded': decoded, 'reference': ref}
df = pd.DataFrame(data)
df.to_csv('analysis.csv', index=False)
bl = bleu.moses_multi_bleu(decoded, ref)
x = rouge.rouge(decoded, ref)
print('%.2f\t%.2f\t%.2f\t%.2f' %
      (bl, x['rouge_1/f_score'] * 100, x['rouge_2/f_score'] * 100,
       x['rouge_l/f_score'] * 100))
コード例 #9
0
def _train(epoch: int,
           enc: nn.Module,
           dec: nn.Module,
           disc: nn.Module,
           prior_size: int,
           dl: Iterator,
           vocab: Vocab,
           device: str,
           validate: bool = False) -> Tuple[float, float, float, float]:
    """Run one training or validation epoch of the adversarial autoencoder.

    For every batch the discriminator is scored on encoder latents (target 0)
    versus samples drawn from ``torch.randn`` (target 1), then the
    encoder/decoder pair is scored on reconstruction plus the loss of having
    the discriminator label the encoder latents as 1. Optimizer steps
    (``disc.optim``, ``dec.optim``, ``enc.optim``) are only taken when
    ``validate`` is False.

    Args:
        epoch: Epoch number, used only in the printed summary.
        enc: Encoder; called as ``enc(seq)`` and must expose ``.optim``.
        dec: Decoder; called as ``dec(x, latent, h, label)`` and must expose
            ``.optim``.
        disc: Discriminator; called as ``disc(latent, label)`` and must
            expose ``.optim``.
        prior_size: Width of the random prior sample ``z``.
        dl: Iterable of batches with ``.text`` and ``.label``; must support
            ``len()``.
        vocab: Vocabulary providing ``stoi['<sos>']``; also passed to
            ``seq_to_str``.
        device: Device the helper tensors are moved to.
        validate: When True, modules are put in eval mode and no gradients
            are applied.

    Returns:
        ``(ae_loss, g_loss, disc_loss, bleu)``: the three losses averaged
        over the number of batches, and the score returned by
        ``moses_multi_bleu`` for decoded vs. input strings.
    """

    # NOTE(review): validation only skips backward()/step(); autograd still
    # records graphs since no torch.no_grad() is used -- confirm intent.
    if not validate:
        enc.train()
        dec.train()
        disc.train()
    else:
        enc.eval()
        dec.eval()
        disc.eval()

    epoch_g_loss = 0.0
    epoch_ae_loss = 0.0
    epoch_disc_loss = 0.0

    # Input sentences and their reconstructions, collected for BLEU.
    strs = []
    dec_strs = []

    # NOTE(review): an empty loader makes the averages below divide by zero.
    n_batches = len(dl)

    for batch_idx, batch in enumerate(dl):

        seq = batch.text
        # Drop the first time step (presumably the <sos> token -- confirm
        # against the dataset field definition).
        seq = seq[1:]

        label = batch.label
        label = to_onehot(label, 2, device)

        # seq is indexed time-major: (seq_len, batch_size).
        (seq_len, batch_size) = seq.shape

        # Discriminator targets: 0 for encoder latents, 1 for prior samples.
        batch_zeros = torch.zeros((batch_size, 1)).to(device)
        batch_ones = torch.ones((batch_size, 1)).to(device)

        # ======== train/validate Discriminator ========

        if not validate:
            enc.zero_grad()
            disc.zero_grad()

        # Sample from the prior together with a random one-hot label.
        z = torch.randn((batch_size, prior_size)).to(device)
        z_label = to_onehot(
            torch.randint(0, 2, (batch_size, )).long(), 2, device)

        latent = enc(seq)
        fake_pred = disc(latent, label)
        true_pred = disc(z, z_label)

        fake_loss = F.binary_cross_entropy_with_logits(fake_pred, batch_zeros)
        true_loss = F.binary_cross_entropy_with_logits(true_pred, batch_ones)

        disc_loss = 0.5 * (fake_loss + true_loss)

        if not validate:
            # Only the discriminator is stepped here; gradients that reached
            # the encoder are cleared by the zero_grad() calls below.
            disc_loss.backward()
            disc.optim.step()

        # ======== train/validate Autoencoder ========

        if not validate:
            enc.zero_grad()
            dec.zero_grad()
            disc.zero_grad()

        # Re-encode so the autoencoder step uses a fresh graph.
        latent = enc(seq)
        # First decoder input: a row of <sos> indices.
        x = torch.zeros(1, batch_size).to(device).long() + vocab.stoi['<sos>']

        h = None

        output = None

        for i in range(seq_len):
            o, h = dec(x, latent, h, label)
            # Feed the ground-truth token as the next input (teacher forcing).
            x = seq[i].view(1, -1)
            output = o if output is None else torch.cat((output, o), 0)

        # assumes dec returns log-probabilities with one row per batch item,
        # so that output lines up with seq.view(-1) -- TODO confirm
        ae_loss = F.nll_loss(output, seq.view(-1))

        fake_pred_z = disc(latent, label)

        # Encoder is rewarded when the discriminator labels its latents as 1.
        enc_loss = F.binary_cross_entropy_with_logits(fake_pred_z, batch_ones)

        g_loss = ae_loss + enc_loss

        if not validate:
            g_loss.backward()
            dec.optim.step()
            enc.optim.step()

        # ----------------------------------------------------

        epoch_g_loss += g_loss.item()
        epoch_ae_loss += ae_loss.item()
        epoch_disc_loss += disc_loss.item()

        # Highest-scoring token index per row, reshaped back to time-major.
        _, w_idxs = output.topk(1, dim=1)
        dec_seq = w_idxs.view(seq_len, batch_size)

        strs.extend(seq_to_str(seq.detach(), vocab))
        dec_strs.extend(seq_to_str(dec_seq.detach(), vocab))

    epoch_g_loss /= n_batches
    epoch_ae_loss /= n_batches
    epoch_disc_loss /= n_batches

    # Decoded strings are passed first, input strings second.
    bleu = moses_multi_bleu(np.array(dec_strs), np.array(strs))

    mode = 'Valid' if validate else 'Train'

    print(
        "Epoch {:3} {:5}: BLEU: {:.2f}, AE: {:.5f}, G: {:.5f}, D: {:.5f} at {}"
        .format(epoch, mode, bleu, epoch_ae_loss, epoch_g_loss,
                epoch_disc_loss,
                datetime.now().strftime("%H:%M:%S")))

    return epoch_ae_loss, epoch_g_loss, epoch_disc_loss, bleu
コード例 #10
0
def optain_all_data():
    """Score every response checkpoint found under ``./result_data/``.

    For each sub-directory (except ``__pycache__``) the response files
    returned by ``sort_filenames_on_epoch(<dir>, 'response_str')`` are
    compared with ``../data/tokenized_target.txt`` using Moses BLEU and
    ROUGE-1/ROUGE-2 F1. ``"<bos> "`` and ``" <eos>"`` markers are removed
    from every line before scoring. Progress and scores are printed.

    Returns:
        List of ``(folder_name, epoch_data)`` tuples, where ``epoch_data`` is
        a list of ``(bleu, rouge1_f1 * 100, rouge2_f1 * 100)`` per response
        file, or ``(-1, -1, -1)`` for entries reported as ``None``.
    """

    def _read_clean_lines(path):
        # Read one sentence per line with the <bos>/<eos> markers removed.
        # 'with' closes the handle; the previous bare open() leaked it.
        with open(path) as handle:
            return [
                line.strip().replace("<bos> ", "").replace(" <eos>", "")
                for line in handle
            ]

    main_folder = './result_data/'
    # Obtain all folders
    folders = [
        f for f in os.listdir(main_folder)
        if f != '__pycache__' and os.path.isdir(os.path.join(main_folder, f))
    ]

    input_fname = '../data/tokenized_target.txt'

    # Process each checkpoint in the folders
    epochs_data = []
    for folder in folders:
        print('folder:{}'.format(folder))
        sorted_fname_responses = sort_filenames_on_epoch(
            os.path.join(main_folder, folder), 'response_str')

        epoch_data = []
        for response_fname in sorted_fname_responses:
            if response_fname is None:
                # Placeholder for a missing checkpoint.
                epoch_data.append((-1, -1, -1))
                continue

            # NOTE(review): the target file fills the "dec" list and the
            # model responses fill the "ref" list, so the scorers receive
            # targets as their first argument. Kept as in the original --
            # confirm this order is intended.
            dec_tex = _read_clean_lines(input_fname)
            ref_tex = _read_clean_lines(response_fname)

            # Bleu
            print("\nBleu score...")
            bl = bleu.moses_multi_bleu(dec_tex, ref_tex)
            print(bl)

            # Rouge 1
            print("\nRouge 1 score...")
            r1_f1_score, _, _ = rouge.rouge_n(dec_tex, ref_tex, 1)
            print(r1_f1_score * 100)

            # Rouge 2
            print("\nRouge 2 score...")
            r2_f1_score, _, _ = rouge.rouge_n(dec_tex, ref_tex, 2)
            print(r2_f1_score * 100)

            epoch_data.append((bl, r1_f1_score * 100, r2_f1_score * 100))

        epochs_data.append((folder, epoch_data))
    return epochs_data