def compute_metrics(self, data, preds):
    """Score predictions against gold examples.

    Computes exact-match over recovered SQL queries plus corpus BLEU for
    both the SQL queries and the generated utterances. Outside training,
    the official evaluation metrics are merged in; during training the
    exact-match score stands in for the official one.
    """
    sql_refs, sql_hyps = [], []
    utt_refs, utt_hyps = [], []
    for example in data:
        pred = preds[example['id']]
        sql_refs.append([example['g_query_recov'].lower().split()])
        sql_hyps.append(pred['query'].lower().split())
        utt_hyps.append(pred['utt_toks'])
        utt_refs.append([example['g_question_toks']])
    exact = sum(hyp == ref[0] for hyp, ref in zip(sql_hyps, sql_refs))
    smooth = SmoothingFunction().method3
    metrics = {
        'em': exact / len(data),
        'bleu': corpus_bleu(sql_refs, sql_hyps, smoothing_function=smooth),
        'utt_bleu': corpus_bleu(utt_refs, utt_hyps, smoothing_function=smooth),
    }
    if self.training:
        metrics['official_em'] = metrics['em']
    else:
        metrics.update(self.compute_official_eval(data, preds))
    return metrics
def compute_metrics(self, data, preds):
    """Exact-match and corpus BLEU over predicted vs. gold SQL queries.

    During training, examples that have no prediction yet are skipped;
    note the 'em' denominator remains the full dataset size. Outside
    training the official evaluation results are merged in.
    """
    matches = 0
    references, hypotheses = [], []
    for example in data:
        if self.training and example['id'] not in preds:
            continue  # tolerate partial prediction sets while training
        gold = example['query'].lower().split()
        hyp = preds[example['id']]['query'].lower().split()
        references.append([gold])
        hypotheses.append(hyp)
        if hyp == gold:
            matches += 1
    metrics = {
        'em': matches / len(data),
        'bleu': corpus_bleu(references, hypotheses,
                            smoothing_function=SmoothingFunction().method3),
    }
    if self.training:
        metrics['official_em'] = metrics['em']
    else:
        metrics.update(self.compute_official_eval(data, preds))
    return metrics
def compute_metrics(self, data, preds):
    """Cycle-consistency evaluation: score generated utterances by BLEU,
    then (outside training) rebuild a dataset from those utterances and
    run the frozen nl2sql model over it, merging its official metrics.

    Args:
        data: iterable of example dicts (keyed by 'id', 'db_id', 'query',
              'question', 'g_values', ...).
        preds: mapping from example id to a prediction dict that carries
               'utt_toks' (the generated utterance tokens).

    Returns:
        dict of metric name -> value ('utt_bleu' always; official eval
        metrics from the nl2sql model when not training).
    """
    em = 0  # NOTE(review): unused in this variant; kept for parity with siblings
    utt_hyps, utt_refs = [], []
    generated = dataset.Dataset()
    for ex in data:
        p = preds[ex['id']]
        utt_hyps.append(p['utt_toks'])
        utt_refs.append([ex['g_question_toks']])
        # make new example: re-preprocess the generated utterance exactly as
        # the nl2sql pipeline would preprocess a real question.
        db_id = ex['db_id']
        db = self.conv.database_schemas[db_id]
        question_toks = p['utt_toks']
        query_context = preprocess_nl2sql.SQLDataset.build_contexts(
            question_toks, db, self.bert_tokenizer)
        # lazily cache the parsed gold SQL on the example (mutates `ex`)
        if 'g_sql' not in ex:
            ex['g_sql'] = self.conv.build_sql(ex['query'], db_id)
        new = dict(
            id=ex['id'],
            question=ex['question'],
            db_id=db_id,
            g_question_toks=question_toks,
            query=ex['query'],
            g_values=ex['g_values'],
            g_sql=ex['g_sql'],
            # BERT-style input: [CLS] utterance tokens [SEP]
            value_context=[self.bert_tokenizer.cls_token] + question_toks
            + [self.bert_tokenizer.sep_token],
            query_context=query_context,
            invalid=False,
            cands_query=preprocess_nl2sql.SQLDataset.make_column_cands(
                query_context),
        )
        new['cands_query'], new[
            'cands_value'] = preprocess_nl2sql.SQLDataset.make_cands(
                new, self.nl2sql.sql_vocab)
        generated.append(new)
    metrics = {
        'utt_bleu': corpus_bleu(utt_refs, utt_hyps,
                                smoothing_function=SmoothingFunction().method3),
    }
    if not self.training:
        # run the downstream parser on the regenerated utterances; rebinding
        # `preds` here is intentional — the official eval scores the cycle.
        with torch.no_grad():
            self.nl2sql.eval()
            preds = self.nl2sql.run_pred(generated, self.nl2sql.args,
                                         verbose=True, desc='cycle_pred')
        metrics.update(self.nl2sql.compute_official_eval(generated, preds))
    return metrics
def evaluate(test_generations, extral_reference, del_map):
    """Average sentence-BLEU of decoded generations against references.

    Token id 2 is end-of-sentence and id 0 is padding. Tokens present in
    the per-example del_map are expanded back to their original phrases
    before scoring. Short outputs (<= 4 tokens) are scored with bigram
    weights to avoid degenerate higher-order precision.
    """
    smooth = SmoothingFunction().method2
    total = 0.0
    for idx, id_seq in enumerate(test_generations):
        # decode ids to words, stopping at EOS (2) and skipping padding (0)
        decoded = []
        for tok_id in id_seq:
            if tok_id == 2:
                break
            if tok_id != 0:
                decoded.append(index2word[tok_id])
        # expand delexicalized placeholders back to their surface forms
        substitutions = del_map[idx]
        expanded = []
        for word in decoded:
            if word in substitutions:
                expanded.extend(substitutions[word].split())
            else:
                expanded.append(word)
        refs = [[tok.strip() for tok in sent.split()]
                for sent in extral_reference[idx]]
        if len(expanded) <= 4:
            total += sentence_bleu(refs, expanded,
                                   smoothing_function=smooth,
                                   weights=(0.5, 0.5))
        else:
            total += sentence_bleu(refs, expanded,
                                   smoothing_function=smooth)
    return total / len(test_generations)
def expert_policy(batch_generation, extral_reference, del_map):
    """Build expert (oracle) targets for the first example of a batch.

    Decodes the model's generation, then greedily grows a target sentence
    one word at a time, at each step picking the reference word that
    maximizes sentence-BLEU against the references. Keeps whichever of
    the greedy target or the (expanded) model generation scores higher,
    and returns it re-delexicalized and converted to word indices.

    NOTE(review): only processes index 0 of the batch (the `if ind >= 1:
    break` guard) — presumably for debugging; confirm before batch use.
    """
    batch_target = []
    for ind, w in enumerate(batch_generation):
        if ind >= 1:
            break
        # decode ids to words: id 2 is EOS, id 0 is padding
        gen = []
        end = False
        for i in w:
            if i == 2:
                end = True
            if not end and i != 0:
                gen.append(index2word[i])
        reference = [[i.strip() for i in s.split()]
                     for s in extral_reference[ind]]
        # bracket every reference with sentinel tokens
        for s in reference:
            s.insert(0, "@start@")
            s.append("@end@")
        # expand delexicalized placeholders back to surface words
        replaced_gen = []
        for w in gen:
            if w in del_map[ind]:
                replaced_gen.extend(del_map[ind][w].split())
            else:
                replaced_gen.append(w)
        replaced_gen.insert(0, "@start@")
        # action space for the greedy search: every reference word + @end@
        available_words = set()
        for sent in extral_reference[ind]:
            for word in sent.split():
                available_words.add(word.strip())
        available_words.add("@end@")
        # greedy search: extend target with the BLEU-maximizing word until
        # @end@ is emitted or a hard length cap (79) is hit
        target = replaced_gen[:1]
        while ("@end@" not in target and len(target) < 79):
            best_score = 0
            best_action = ""
            for action in available_words:
                temp_target = target[:]
                temp_target.append(action)
                # left-pad with @start@ so 5-gram BLEU has enough context
                while len(temp_target) < 5:
                    temp_target.insert(0, "@start@")
                bleu = sentence_bleu(
                    reference,
                    temp_target,
                    smoothing_function=SmoothingFunction().method2,
                    weights=(0.2, 0.2, 0.2, 0.2, 0.2))
                if bleu > best_score:
                    best_score = bleu
                    best_action = action
            target.append(best_action)
        replaced_gen.append("@end@")
        # compare greedy target vs. the model's own generation
        bleu = sentence_bleu(reference, target,
                             smoothing_function=SmoothingFunction().method2)
        ref_bleu = sentence_bleu(
            reference,
            replaced_gen,
            smoothing_function=SmoothingFunction().method2)
        # debug output — consider removing or gating behind a verbosity flag
        print(target)
        print(bleu)
        print(replaced_gen)
        print(ref_bleu)
        for s in reference:
            print(s)
        print("\n")
        best_bleu = bleu
        best_target = target[:]
        if ref_bleu > bleu:
            best_target = replaced_gen[:]
            best_bleu = ref_bleu
        best_target.insert(0, "@start@")
        # re-delexicalize: substitute surface phrases back to placeholders
        target_string = ""
        for wo in best_target:
            target_string += wo + " "
        for attr, value in del_map[ind].items():
            if target_string.__contains__(value):
                target_string = target_string.replace(value, attr)
        after_best_target = target_string.strip().split()
        # pad to fixed length with @null@
        while (len(after_best_target) < max_len):
            after_best_target.append("@null@")
        if len(after_best_target) > max_len:
            # over-length target is treated as fatal — dump state and abort
            print(len(after_best_target))
            print(after_best_target)
            print(best_target)
            print(best_bleu)
            exit(0)
        # NOTE(review): this indexes `best_target`, not the padded,
        # re-delexicalized `after_best_target` built just above — looks
        # like a possible bug; confirm which sequence is intended.
        target_ind = [
            word2index[w] if w in word2index else 0 for w in best_target
        ]
        batch_target.append(target_ind)
    return batch_target
def single_valid(model, epo):
    """Run greedy validation decoding one example at a time and print the
    mean sentence-BLEU over the validation set.

    Relies on module globals: v_input_vecotors / v_input_reference /
    v_input_delmap / v_input_extra_references, max_len, lr, index2word.

    Args:
        model: seq2seq model exposing a TF session, enc_inputs/dec_inputs
               placeholders, dec_outputs, and a learning_rate placeholder.
        epo: epoch number — NOTE(review): currently unused.
    """
    B = []  # per-example BLEU scores
    L = []  # NOTE(review): unused
    previous = ""
    # NOTE(review): file is opened but never written (writes are commented
    # out below) and never closed — consider `with open(...)` or removal.
    f = open('result.txt', 'w')
    for n in range(len(v_input_vecotors)):
        #for n in range(1):
        vec = v_input_vecotors[n]
        ref = v_input_reference[n]
        # pad decoder reference to fixed length (mutates the global list entry)
        while len(ref) < max_len:
            ref.append(0)
        map = v_input_delmap[n]  # NOTE(review): shadows builtin `map`
        extra_ref = v_input_extra_references[n]
        # feed one example (batch size 1) per placeholder timestep
        feed_dict = {model.learning_rate: lr}
        for i in range(len(ref)):
            feed_dict[model.dec_inputs[i]] = [ref[i]]
        for i in range(len(model.enc_inputs)):
            feed_dict[model.enc_inputs[i]] = [vec[i]]
        output = model.session.run(model.dec_outputs[0:len(ref)],
                                   feed_dict=feed_dict)
        # greedy decode: argmax over vocab, then transpose to (batch, time)
        result = np.argmax(output, axis=-1).T
        gen = []
        end = False
        for i in result[0]:
            if i == 2:  # EOS token id
                end = True
            if not end and i != 0:  # 0 is padding
                gen.append(index2word[i])
        reference = [[i.strip() for i in s.split()] for s in extra_ref]
        # expand delexicalized placeholders back to surface phrases
        replaced_gen = []
        for w in gen:
            if w in map:
                replaced_gen.extend(map[w].split())
            else:
                replaced_gen.append(w)
        string = ""
        for w in replaced_gen:
            string += w + " "
        bleu = sentence_bleu(reference, replaced_gen,
                             smoothing_function=SmoothingFunction().method2)
        # only print when the generation differs from the previous example
        if previous != string:
            previous = string
            #f.write(string +"\n")
            print(string)
            #f.write(str(bleu)+"\n")
            print(bleu)
        '''
        for s in reference:
            sen = ""
            for w in s:
                sen+=w+" "
            f.write(sen+"\n")
            print(sen)
        print("\n")
        f.write("\n")
        '''
        B.append(bleu)
    print("valid bleu score is {}".format(np.mean(B)))