Example #1
def main() -> None:
    args = parse_arguments(subtype="evaluate_shallow_metrics")
    # get verbosity
    if args.verbosity == 1:
        logger = logging.getLogger('base')
    else:
        logger = logging.getLogger('root')
    # define json glob
    json_glob = args.json_glob
    # define search space
    files = glob(json_glob)
    for input_file in files:
        # log information
        logger.info("Computing bleu and chrf scores: %s", input_file)
        # load single dictionary and compute surface similarity scores
        with open(input_file, "r") as f:
            store = json.load(f)
        for key in tqdm(store.keys()):
            source_orig_de = store[key]["sentence_original"]["source"]
            source_para_de = store[key]["sentence_paraphrase"]["source"]
            target_orig_en = store[key]["sentence_original"]["target"]
            target_para_en = store[key]["sentence_paraphrase"]["target"]
            chrf_bar_source = (sacrebleu.sentence_chrf(
                source_orig_de, [source_para_de]).score +
                               sacrebleu.sentence_chrf(
                                   source_para_de, [source_orig_de]).score) / 2
            chrf_bar_target = (sacrebleu.sentence_chrf(
                target_orig_en, [target_para_en]).score +
                               sacrebleu.sentence_chrf(
                                   target_para_en, [target_orig_en]).score) / 2
            bleu_bar_source = (sacrebleu.sentence_bleu(
                source_orig_de, [source_para_de]).score +
                               sacrebleu.sentence_bleu(
                                   source_para_de, [source_orig_de]).score) / 2
            bleu_bar_target = (sacrebleu.sentence_bleu(
                target_orig_en, [target_para_en]).score +
                               sacrebleu.sentence_bleu(
                                   target_para_en, [target_orig_en]).score) / 2
            store[key]["chrf_bar_source"] = chrf_bar_source
            store[key]["chrf_bar_target"] = chrf_bar_target
            store[key]["bleu_bar_source"] = bleu_bar_source
            store[key]["bleu_bar_target"] = bleu_bar_target
            store[key]["chrf_bar_mean"] = (chrf_bar_source +
                                           chrf_bar_target) / 2
            store[key]["bleu_bar_mean"] = (bleu_bar_source +
                                           bleu_bar_target) / 2
        # write back json to disk
        with open(input_file, "w") as f:
            json.dump(store, f, ensure_ascii=False)
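For reference, a minimal standalone sketch of the symmetric scoring used in the loop above, with invented sentences; symmetric_score is a hypothetical helper, not part of the original script.
import sacrebleu

def symmetric_score(a: str, b: str, metric=sacrebleu.sentence_chrf) -> float:
    # Average the metric in both directions, as the loop above does.
    return (metric(a, [b]).score + metric(b, [a]).score) / 2

orig_de = "Das ist ein kleines Beispiel."
para_de = "Dies ist ein kurzes Beispiel."
print(symmetric_score(orig_de, para_de))                           # chrf_bar
print(symmetric_score(orig_de, para_de, sacrebleu.sentence_bleu))  # bleu_bar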
Example #2
def validate(val_loader, encoder, decoder, criterion, tok_tgt):
    '''
    Performs one epoch's validation.
    '''
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    references = list()  # references (true captions) for calculating corpus BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    avg_loss = 0

    with torch.no_grad():
        # Batches
        for cnt, (srccap, tgtcap, video, audio, caplen_src, caplen_tgt, srcrefs, tgtrefs) in enumerate(val_loader, 1):
            srccap, tgtcap, caplen_src, caplen_tgt = srccap.cuda(), tgtcap.cuda(), caplen_src.cuda(), caplen_tgt.cuda()
            video, audio = video.cuda(), audio.cuda()
            # Forward prop.
            src_out, init_hidden, vid_out = encoder(srccap, video, audio) # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim)
            scores, pred_lengths = decoder.inference(srccap, tgtcap, init_hidden, src_out, vid_out, args.MAX_INPUT_LENGTH)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = tgtcap[:, 1:]
            scores_copy = scores.clone()

            # Calculate loss
            loss = criterion(scores[:, 1:].contiguous().view(-1, decoder.vocab_size), targets.contiguous().view(-1))

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][1:pred_lengths[j]])  # remove pads and idx-0

            preds = [tok_tgt.decode_sentence(t) for t in temp_preds]
            hypotheses.extend(preds) # preds= [1,2,3]

            tgtrefs = [list(map(int, i.split())) for i in tgtrefs] # tgtrefs = [[1,2,3], [2,4,3], [1,4,5,]]
            
            for r in tgtrefs:
                references.extend([tok_tgt.decode_sentence(r)]) 

            assert len(references) == len(hypotheses)

            avg_loss += loss.item()

        # Calculate metrics
        print('pred example:', hypotheses[0])
        print('ref example:', references[0])
        avg_loss = avg_loss/cnt
        corpbleu = sacrebleu.corpus_bleu(hypotheses, [references])
        print(corpbleu)
        sentbleu = 0
        for i, (r, h) in enumerate(zip(references, hypotheses), 1):
            sentbleu += sacrebleu.sentence_bleu(h, [r]).score
        sentbleu /= i

    return avg_loss, sentbleu, corpbleu.score
Example #3
def get_similarity_by_sacrebleu(text1, text2):
    # pip install sacrebleu
    references = [text1]
    hypothesis = text2
    score = sacrebleu.sentence_bleu(hypothesis, references).score

    return score
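A quick call with invented sentences (this assumes sacrebleu is imported at module level, as the pip comment in the snippet implies); sacrebleu reports BLEU on a 0-100 scale.
reference = "the cat sat on the mat"
hypothesis = "the cat is on the mat"
# Prints a sentence-level BLEU score between 0 and 100.
print(get_similarity_by_sacrebleu(reference, hypothesis))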
Example #4
 def sentence_level_eval(self):
     self.metrics["sentence_bleu"] = sacrebleu.sentence_bleu(
         self.prediction(), [self.reference()]).score
     self.metrics["latency"] = eval_all_latency(self.delays,
                                                self.source_length())
     self.metrics["latency_ca"] = eval_all_latency(self.elapsed,
                                                   self.source_length())
Example #5
def compare_lines(hyp_lines, ref_lines, return_sorted=False):
    scores = [
        sacrebleu.sentence_bleu(hyp, [ref]).score
        for hyp, ref in zip(hyp_lines, ref_lines)
    ]
    result = zip(scores, hyp_lines, ref_lines)
    return sorted(result, reverse=True) if return_sorted else result
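A usage sketch with invented lines (assumes sacrebleu is imported in the surrounding module); with return_sorted=True the best-scoring pairs come first.
hyps = ["the cat sat on the mat", "a dog barked loudly"]
refs = ["the cat sat on the mat", "the dog barked"]
# Each item is (BLEU score, hypothesis line, reference line).
for score, hyp, ref in compare_lines(hyps, refs, return_sorted=True):
    print(f"{score:6.2f}\t{hyp}\t{ref}")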
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('reference', help='Reference translation.')
    parser.add_argument('nbest', help='N-best list to score.')
    parser.add_argument(
        '-val_indices',
        help='File containing indices of validation sentences.')
    args = parser.parse_args()

    with open(args.reference, 'r') as f:
        refs = [line.rstrip('\n') for line in f]

    if args.val_indices:
        with open(args.val_indices, 'r') as f:
            val_indices = {int(line.rstrip('\n')) for line in f}
    else:
        val_indices = []

    with open(args.nbest, 'r') as f:
        for line in f:
            if ' ||| ' not in line:
                continue
            fields = line.rstrip('\n').split(' ||| ')
            sntno = int(fields[0])
            score = sacrebleu.sentence_bleu(fields[1],
                                            [refs[sntno]],
                                            smooth_method='floor',
                                            smooth_value=.1)
            outfile = sys.stderr if sntno in val_indices else sys.stdout
            print(sntno, fields[1], score.score, sep=' ||| ', file=outfile)
Example #7
 def evaluate_example(self, summary, reference):
     #print("BLEU is intended as a corpus-level metric. Be careful!")
     if isinstance(reference, str):
         reference = [reference]
     score = sacrebleu.sentence_bleu(summary, reference, smooth_method=self.sent_smooth_method, \
          smooth_value=self.sent_smooth_value, use_effective_order=self.sent_use_effective_order)
     score_dict = {"bleu" : score.score}
     return score_dict
Example #8
def compute_bleu(hyp: str, ref: str) -> float:
    """

    :param hyp:
    :param ref:
    :return:
    """

    return sacrebleu.sentence_bleu(hyp, [ref]).score
Example #9
def sent_bleu(hypotheses, references) -> List[float]:
    scores = []
    for h, r in zip(hypotheses, references):
        scores.append(
            sacrebleu.sentence_bleu(hypothesis=h,
                                    references=r,
                                    smooth_method="add-k",
                                    smooth_value=1).score)
    return scores
Example #10
def eval(test_loader, encoder, decoder, cp_file, tok_tgt, result_path):
    '''
    Testing the model
    '''
    ### the best model is the last model saved in our implementation
    epoch = torch.load(cp_file)['epoch']
    logging.info('Use epoch {0} as the best model for testing'.format(epoch))
    encoder.load_state_dict(torch.load(cp_file)['enc_state_dict'])
    decoder.load_state_dict(torch.load(cp_file)['dec_state_dict'])
    
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    ids = list() # sentence ids
    hypotheses = list()  # hypotheses (predictions)
    references = list()

    with torch.no_grad():
        # Batches
        for cnt, (srccap, tgtcap, video, audio, caplen_src, caplen_tgt, srcrefs, tgtrefs) in enumerate(tqdm(test_loader)):
            srccap, tgtcap, caplen_src, caplen_tgt = srccap.cuda(), tgtcap.cuda(), caplen_src.cuda(), caplen_tgt.cuda()
            video, audio = video.cuda(), audio.cuda()

            # Forward prop.
            src_out, init_hidden, vid_out = encoder(srccap, video, audio) # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim)
            preds, pred_lengths = decoder.beam_decoding(srccap, init_hidden, src_out, vid_out, args.MAX_INPUT_LENGTH, beam_size=5)

            # Hypotheses
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:pred_lengths[j]])  # remove pads and idx-0

            preds = [tok_tgt.decode_sentence(t) for t in temp_preds]

            hypotheses.extend(preds) # preds= [[1,2,3], ... ]
            
            tgtrefs = [ list(map(int, i.split())) for i in tgtrefs] # tgtrefs = [[1,2,3], [2,4,3], [1,4,5,]]
            for r in tgtrefs:
                references.extend([tok_tgt.decode_sentence(r)]) 
    # Calculate metrics
    print('pred example:', hypotheses[0])
    print('ref example:', references[0])
    corpbleu = sacrebleu.corpus_bleu(hypotheses, [references])
    print(corpbleu)
    sentbleu = 0
    for i, (r, h) in enumerate(zip(references, hypotheses), 1):
        sentbleu += sacrebleu.sentence_bleu(h, [r]).score
    sentbleu /= i
    print('beam5 bleu: ',corpbleu, sentbleu)
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    # Save results
    df = pd.DataFrame([hypotheses,references]).T
    df.columns = ['baseline NMT prediction','references']
    df.to_excel(result_path+'results.xlsx')
Example #11
def test_sentence_score():

    refs = ['The dog bit the man.', 'The dog had bit the man.']
    sys = 'The dog bit the man.'

    bleu = sacrebleu.sentence_bleu(sys, refs)
    mover = sentence_score(sys, refs)

    print(bleu.score)
    print(mover)
Example #12
 def ds_score(self):
     ds_avg = 0
     for prompt in self.pred:
         y = self.pred[prompt].keys()
         ds = 0
         for y1, y2 in combinations(y, 2):
             ds += 1 - sacrebleu.sentence_bleu(y1, [y2]).score / 100
         ds = ds / (len(y) * (len(y) - 1))
         ds_avg += ds
     return ds_avg / len(self.pred)
Example #13
def get_char_bleu(src: str, trg: str) -> float:
    """
    Function to calculate the character-level BLEU score
    :param src: string, source
    :param trg: string, target
    :return: float, BLEU score
    """
    src = " ".join(src)
    trg = " ".join(trg)
    return sacrebleu.sentence_bleu(src, [trg]).score
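A quick illustration with invented words (assumes sacrebleu is imported): because each string is split into space-separated characters, the BLEU n-grams become character n-grams.
# "kitten" vs "sitting" share many character n-grams even though the
# words differ, so the character-level BLEU is non-zero.
print(get_char_bleu("kitten", "sitting"))
# Identical strings score 100.
print(get_char_bleu("kitten", "kitten"))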
Example #14
 def evaluate_example(self, summary, reference):
     if isinstance(reference, str):
         reference = [reference]
     score = sacrebleu.sentence_bleu(
         summary,
         reference,
         smooth_method=self.sent_smooth_method,
         smooth_value=self.sent_smooth_value,
         use_effective_order=self.sent_use_effective_order,
     )
     score_dict = {"bleu": score.score}
     return score_dict
Example #15
def get_sent_bleu(hyp, ref):
    """
    Compute sentence-level BLEU with exponential smoothing.

    :param hyp: hypothesis string
    :param ref: reference string
    :return: a BLEUScore object describing sentence-level BLEU between hyp and ref
    """
    hyp_line = hyp.strip()
    ref_line = ref.strip()

    bleu_score = sacrebleu.sentence_bleu(hyp_line,
                                         [ref_line],
                                         smooth_method='exp')

    return bleu_score
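A usage sketch with invented sentences; the returned BLEUScore object prints a summary line and exposes the numeric value through .score.
result = get_sent_bleu("the cat is on the mat", "the cat sat on the mat")
print(result)        # summary string, e.g. "BLEU = ..."
print(result.score)  # numeric BLEU on the 0-100 scale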
Example #16
def single_reference_sentence_bleu(reference: str, variant: str, stem: bool = False):

    def stem_sentence(sentence):
        return " ".join([
            STEMMER.stem(w)
            for w in word_tokenize(sentence)
        ])

    if stem:
        variant = stem_sentence(variant)
        reference = stem_sentence(reference)
    return sacrebleu.sentence_bleu(variant, [reference]).score
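The snippet above assumes a stemmer and tokenizer are available in the module; a possible setup (an assumption, using NLTK) plus two invented calls, where the second stems both sides before scoring:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize  # requires the NLTK 'punkt' data

STEMMER = PorterStemmer()

print(single_reference_sentence_bleu("the cats are sleeping", "the cat sleeps"))
print(single_reference_sentence_bleu("the cats are sleeping", "the cat sleeps", stem=True))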
Example #17
def sentence_bleu(sentence, reference, detokenizer=None):
    """
    Utility function for calculating sentence BLEU.
    Expects the sentence and reference(s) as space-separated token strings.
    reference may be a single reference string or a list of reference strings.
    """
    if isinstance(reference, str):
        reference = [reference]
    if detokenizer is not None:
        sentence = detokenizer(sentence.split())
        reference = [detokenizer(r.split()) for r in reference]

    return sacrebleu.sentence_bleu(sentence, reference).score
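Usage sketch with invented sentences; the detokenizer here is an assumption, since any callable that joins a token list back into a string would do (sacremoses' MosesDetokenizer is one option).
from sacremoses import MosesDetokenizer  # assumed detokenizer, not part of the original
md = MosesDetokenizer(lang="en")

print(sentence_bleu("the cat sat on the mat", "the cat sat on the mat"))
print(sentence_bleu("the cat sat on the mat",
                    ["the cat sat on the mat", "a cat was sitting on the mat"]))
print(sentence_bleu("the cat sat on the mat .", "the cat sat on the mat .",
                    detokenizer=md.detokenize))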
Example #18
def main():
    args = parse_args()
    with open(args.ref) as infile, open(args.sys) as wefile:
        outputs = []
        for idx, (inline, weline) in enumerate(zip(infile, wefile), start=1):
            bleu = sacrebleu.sentence_bleu(weline.strip(), [inline.strip()])
            print(bleu)
            outputs.append((idx, bleu.score, inline.strip(), weline.strip()))
        outputs.sort(key=lambda x: -x[1])
        with open(args.out, 'w') as outfile:
            print('line_no\tbleu\tref\tsys', file=outfile)
            for idx, bleu, inline, weline in outputs:
                print('{}\t{}\t{}\t{}'.format(idx, bleu, inline, weline),
                      file=outfile)
Example #19
 def score_multi_all(
     self,
     summaries_list: List[List[SummaryType]],
     references_list: List[List[ReferenceType]],
     **kwargs,
 ) -> List[List[MetricsDict]]:
     scores_list = []
     for summaries, references in zip(summaries_list, references_list):
         references = [flatten(reference) for reference in references]
         scores_list.append([])
         for summary in summaries:
             summary = flatten(summary)
             score = sentence_bleu(summary, references, **self.kwargs)
             scores_list[-1].append(MetricsDict({'sent-bleu': score.score}))
     return scores_list
Example #20
def get_bleu_scores(trg_tensor, pred_tensor, TGT):
    bleus_per_sentence = torch.zeros(trg_tensor.shape[1], requires_grad=False)
    for col in range(trg_tensor.shape[1]):  #each column contains sentence
        true_sentence = [
            TGT.vocab.itos[i] for i in trg_tensor[:, col]
            if TGT.vocab.itos[i] != settings.BLANK_WORD
        ][1:-1]
        pred_sentence = [
            TGT.vocab.itos[i] for i in pred_tensor[:, col]
            if TGT.vocab.itos[i] != settings.BLANK_WORD
        ]
        #print('Before: ')
        #print(true_sentence)
        #print(pred_sentence)
        #now also need to stop pred_sentence after first EOS_WORD outputted
        #also don't want to use BOS chars
        ind_first_eos = 0
        for tok in pred_sentence:
            if tok == settings.EOS_WORD:
                break
            ind_first_eos += 1

        if ind_first_eos != 0:
            pred_sentence = pred_sentence[
                1:ind_first_eos]  #this gets rid of EOS_WORD

        #now undo some of the weird tokenization

        pred_sentence = fix_sentence(pred_sentence, as_str=True)
        true_sentence = fix_sentence(true_sentence, as_str=True)

        #NLTK's bleu_score defaults to BLEU-4; the commented line below shows the
        #1-gram-weighted variant that was tried previously. Sentence-level sacrebleu is used instead.
        #score = nltk.translate.bleu_score.sentence_bleu([true_sentence], pred_sentence, weights=(1, 0, 0, 0))
        score = sacrebleu.sentence_bleu(pred_sentence,
                                        [true_sentence],
                                        smooth_method='exp').score
        #score = score*len(true_sentence)

        #print(true_sentence)
        #print(pred_sentence)
        #print(score)
        #print()
        bleus_per_sentence[col] = score / 100.0
    return bleus_per_sentence
Example #21
def calc_bleu_score_document(references, MT):
    """
    Calculating bleu score by using sacrebleu module. In this method, all Complete segmented in MT is sys document and all Ts sentences is ref document. 
    
    :param references: a list of references
    :param MT: a list of MT senetnces 
    :return sacre_blue_score: the bleu score
    """

    merge_mt_sentences = []
    for i in range(len(MT)):
        mt = MT[i][-1][3:-1]
        merge_mt_sentences += mt
    merge_references_sentences = []
    for ref in references:
        l = []
        for sentence in ref:
            l.append(' '.join(sentence[:-1]))
        merge_references_sentences.append(l)
    refs = [' '.join(i) for i in merge_references_sentences]
    sys = ' '.join(merge_mt_sentences[:])
    b_sacre = sacrebleu.sentence_bleu(sys, refs)
    sacre_blue_score = b_sacre.score
    return sacre_blue_score
Example #22
# Open the test dataset human translation file and detokenize the references
refs = []

with open(target_test) as test:
    for line in test: 
        line = line.strip().split() 
        line = md.detokenize(line) 
        refs.append(line)
    
print("Reference 1st sentence:", refs[0])

# Open the translation file by the NMT model and detokenize the predictions
preds = []

with open(target_pred) as pred:  
    for line in pred: 
        line = line.strip().split() 
        line = md.detokenize(line) 
        preds.append(line)

# Calculate BLEU for sentence by sentence and save the result to a file
with open("bleu-" + target_pred + ".txt", "w+") as output:
    for line in zip(refs,preds):
        test = line[0]
        pred = line[1]
        print(test, "\t--->\t", pred)
        bleu = sacrebleu.sentence_bleu(pred, [test], smooth_method='exp')
        print(bleu.score, "\n")
        output.write(str(bleu.score) + "\n")
Example #23
def sentbleu(y_hat, y):
    return sentence_bleu(y_hat, y, smooth_method="add-n", smooth_value=1.0)
Example #24
 def score_sentence(self, hyp, ref, lang=None):
     return sacrebleu.sentence_bleu(hyp, ref, smooth_value=0.01).score / 100
Example #25
 def get_bleu(self, source, reference, beam=5):
     predicted = self.model.translate(source, beam)
     return sacrebleu.sentence_bleu(predicted, reference).score, predicted
Example #26
def compute_sacrebleu(references, translation):
    hypo = ' '.join(translation)
    refs = [' '.join(r) for r in references]
    return sacrebleu.sentence_bleu(hypo, refs).score
Example #27
 def bleu(self):
     """
     BLEU of the hypothesis
     """
     hyp = self.new_hyp.replace(' ', '').replace('</s>', '').replace('▁', ' ')
     return sacrebleu.sentence_bleu(hyp, [self.cur_ref]).score
Example #28
def calc_bleu_score_sentence_by_time(Ts, MT, time_step):
    """
    Calculates blue score using the NLTK module with time slice strategy.
    
    :param Ts: a list of T tables 
    :param MT: a list of MT senetnces 
    :param time_step: size of time-step
    :return blue_scores: the average bleu score between time-step scores
    :return avg_SacreBleu: a list of time-step scores 
    """

    tail_number = float(MT[-1][-1][2])
    start = 0
    end = time_step
    mt_sentences = list()
    while start <= float(tail_number):
        l = []
        for i in range(len(MT)):
            estimat_word_times = build_A_Time_Based_quality(MT[i])
            for k, v in estimat_word_times.items():
                if v >= start and v <= end:
                    l.append(k)
        mt_sentences.append(l)
        start += time_step
        end += time_step
    references_sentences = list()
    start = 0
    end = time_step
    while start <= float(tail_number):
        l = []
        for i in range(len(Ts)):
            s = []
            for sentence in Ts[i]:
                for k, v in sentence.items():
                    if v >= start and v <= end:
                        s.append(k)
            l.append(s)
        references_sentences.append(l)
        start += time_step
        end += time_step
    start = 0
    end = time_step
    sacreBLEU_list = []
    blue_scores = []
    for t in range(len(mt_sentences)):
        try:
            sys = ' '.join(mt_sentences[t])
            refs = [' '.join(ref) for ref in references_sentences[t]]
            b_sacre = sacrebleu.sentence_bleu(sys, refs)
            sacre_blue_score = b_sacre.score
            text1 = 'detailed sacreBLEU     span-' + format(
                start, '06') + '-' + format(end, '06') + '     ' + str(
                    "{0:.3f}".format(sacre_blue_score))
            sacreBLEU_list.append(sacre_blue_score)
            start += time_step
            end += time_step
            blue_scores.append(text1)
        except:
            pass
    avg_SacreBleu = "avg      sacreBLEU     span*                  " + str(
        "{0:.3f}".format(round(
            (sum(sacreBLEU_list) / len(sacreBLEU_list)), 3)))
    return blue_scores, avg_SacreBleu
Example #29
 def score_sentence(self, hyp, ref, lang=None):
     return sacrebleu.sentence_bleu(hyp, ref).score / 100
Example #30
def get_Bleu_for_beam(key, Src_tokens, Src_text, Tgt_tokens, Tgt_text, model,
                      plot_path, args):
    import sacrebleu
    from sacrebleu import sentence_bleu
    SMOOTH_VALUE_DEFAULT = 1e-8

    #-----------------------------------
    """ If you see best-hypothesis having worse WER that the remainig beam them tweak with the beam hyperpearmaeters Am_wt, len_pen, gamma 
        If you see best-hypothesis having better performance than the oothers in the beam then improve the model training
    """
    #-----------------------------------
    #-----------------------------------
    ####get the model predictions
    Output_seq = model.predict(Src_tokens, args)
    #Output_seq = model.predict(input,args.LM_model,args.Am_weight,args.beam,args.gamma,args.len_pen)

    ###get the true label if it exists
    True_label = Tgt_text
    #-----------------------------------

    llr = [item.get('score').unsqueeze(0) for item in Output_seq]
    norm_llr = torch.nn.functional.softmax(torch.cat(llr, dim=0), dim=0)

    print("final_ouputs", '====', 'key', 'Text_seq', 'LLR', 'Beam_norm_llr',
          'Yseq', 'CER')
    print("True_label", True_label)

    #-----------------------------------
    #-----------------------------------

    for ind, seq in enumerate(Output_seq):
        Text_seq = seq['Text_seq']
        if len(Text_seq) > 1:
            Text_seq = Text_seq[0]
            Text_seq_formatted = [x for x in Text_seq.split(' ') if x.strip()]
            Text_seq_formatted = " ".join(Text_seq_formatted)
        else:
            Text_seq_formatted = Text_seq[0]

        Yseq = seq['yseq'].data.numpy()
        Ynorm_llr = norm_llr[ind].data.numpy()
        Yllr = seq['score'].data.data.numpy()

        #---------------------------------------------
        attention_record = seq.get('alpha_i_list', 'None')

        if (torch.is_tensor(attention_record)):
            #---------------------------------------------
            attention_record = attention_record[:, :, 0].transpose(0, 1)
            attention_record = attention_record.data.cpu().numpy()

            #---------------------------------------------
            if args.plot_decoding_pics:
                pname = str(key) + '_beam_' + str(ind)
                plotting_name = join(plot_path, pname)
                plotting(plotting_name, attention_record)

        #-----------------------------------
        #-----------------------------------

        if True_label:
            if Text_seq_formatted.strip():
                CER = compute_cer(Text_seq_formatted, True_label,
                                  'doesnot_matter') * 100
            else:
                CER = 100
            #breakpoint()
            hyp_value = Text_seq_formatted
            ref_value = True_label
            Bleu_score = sentence_bleu(hyp_value, [ref_value],
                                       smooth_value=SMOOTH_VALUE_DEFAULT,
                                       smooth_method='exp',
                                       use_effective_order=True)

            Bleu_score = Bleu_score.score
        else:
            CER = None
            Bleu_score = None
        #---------------------------------------------
        if ind == 0:
            print("nbest_output", '=', key, '=', Text_seq_formatted, '=',
                  True_label, '=', CER, '=', Bleu_score)

        print("final_ouputs", '=', ind, '=', key, '=', Text_seq_formatted, '=',
              Yllr, '=', Ynorm_llr, '=', Yseq, '=', CER, '=', Bleu_score)