def bert_based(gts, res):
    # out_file, bleurt_sc, bleurt_checkpoint and statistics are assumed to be
    # defined/imported at module level.
    refs, cands = [], []
    for refers in gts.values():
        sub_refs = []
        for ref in refers:
            sub_refs.append(ref + '.')
        refs.append(sub_refs)
    for cand in res.values():
        cands.append(cand[0] + '.')

    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    P, R, F1 = scorer.score(cands, refs, verbose=True)
    out_file.write('BERTScore = %s' % F1.mean().item() + "\n")
    BERTScore = F1.mean().item()

    total_bleurt_score = []
    scorer = bleurt_sc.BleurtScorer(bleurt_checkpoint)
    for ref_caption, cand in zip(refs, cands):
        bleurt_score_per_img = []
        for ref in ref_caption:
            bleurt_score_per_img.append(
                scorer.score([ref], [cand], batch_size=None)[0])
        total_bleurt_score.append(max(bleurt_score_per_img))
    out_file.write('BLEURT = %s' % statistics.mean(total_bleurt_score))
def bertscore_bias():
    scorer = BERTScorer(lang="zh", rescale_with_baseline=True)
    df = read_data()
    # index 2 of the (P, R, F1) tuple is the F1 score
    sample1 = df.apply(
        lambda x: scorer.score([x.context], [x['True']])[2].item(), axis=1)
    sample2 = df.apply(
        lambda x: scorer.score([x.context], [x['False']])[2].item(), axis=1)
    print('True', sample1.mean(), 'False', sample2.mean())
    # p-value of Welch's t-test
    return stats.ttest_ind(sample1, sample2, equal_var=False)[1]
def get_bertscore_sentence_scores(
    sys_sents: List[str],
    refs_sents: List[List[str]],
    lowercase: bool = False,
    tokenizer: str = "13a",
):
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    sys_sents = [utils_prep.normalize(sent, lowercase, tokenizer) for sent in sys_sents]
    refs_sents = [[utils_prep.normalize(sent, lowercase, tokenizer) for sent in ref_sents]
                  for ref_sents in refs_sents]
    # transpose so that each system sentence is paired with its list of references
    refs_sents = [list(r) for r in zip(*refs_sents)]
    return scorer.score(sys_sents, refs_sents)
def gen_samples():
    scorer = BERTScorer(lang="zh", rescale_with_baseline=True)
    data_file = '32-deduplicate-story.csv'
    df = pd.read_csv(data_file)
    stories = list(df.story.dropna())
    stories_split = [split_by_fullstop(x) for x in stories]
    stories_split_select = [
        random.randint(0, len(x) - 1) for x in stories_split
    ]
    stories_sentencesample = [
        x[y] for x, y in zip(stories_split, stories_split_select)
    ]
    stories_split_copy = copy.deepcopy(stories_split)
    stories_context = []
    for ss, sss in zip(stories_split_copy, stories_split_select):
        ss[sss] = '<MASK>'
        stories_context.append(ss)
    stories_context = [''.join(x) for x in stories_context]
    positive_samples = [
        (x, y, True) for x, y in zip(stories_context, stories_sentencesample)
    ]
    cands = stories_sentencesample
    assert len(cands) == len(stories_split)
    refs = []
    for i, cand in enumerate(cands):
        refs.append([
            x for j, y in enumerate(stories_split) for x in y
            if len(x) > 0 and j != i
        ])
    bestmatch = []
    print(len(cands))
    for i, (c, ref) in enumerate(zip(cands, refs)):
        print(i, 'th candidate...')
        cand = [c] * len(ref)
        P, R, F1 = scorer.score(cand, ref)
        bestmatch.append(int(torch.argmax(R)))
    negative_samples = [(x, y[z], False)
                        for x, y, z in zip(stories_context, refs, bestmatch)]
    return [(x, w, y[z]) for x, y, z, w in zip(
        stories_context, refs, bestmatch, stories_sentencesample)]
def run_bartscore(self):
    '''
    Computes BARTScore between the sets of hypothesis and reference summaries.
    '''
    print('\n===== BARTScore =====\n')
    bartscore = BERTScorer(lang="en",
                           model_type=self.bartscore_model,
                           num_layers=12)
    for hyps_path, refs_path in zip(self.hyps_paths, self.refs_paths):
        self.load_summs(hyps_path, refs_path)
        P, R, F1 = bartscore.score(self.hyps, self.refs, batch_size=64)
        self.df_scores.loc[self.df_scores['hyps_path'] == hyps_path,
                           'bartscore'] = F1.tolist()
        self.save_temp_csv()
        print(F1.mean())
    del P, R, F1, bartscore
    torch.cuda.empty_cache()
# output_path = "/data/private/E2E/predictions/final/*"
# output_path = "/data/private/E2E/predictions/reproduce/try_2/*"
output_path = "/data/private/E2E/predictions/no_pretrained/try_1/*"
pred_files = glob.glob(output_path)
# pred_files = ["/project/work/E2E/predictions/final/sampling30_1.txt"]

score_list = []
for i in range(len(pred_files)):
    cands = []
    pred_data_open = open(pred_files[i], "r")
    pred_data_dataset = pred_data_open.readlines()
    pred_len = len(pred_data_dataset)
    pred_data_open.close()
    for k in range(len(pred_data_dataset)):
        out_sen = pred_data_dataset[k].strip()
        # repeat the prediction once per human reference so cands lines up with human_compare
        repeat_num = len(human_references[k])
        for _ in range(repeat_num):
            cands.append(out_sen)
    # P, R, F1 = score(cands, human_compare, lang='en', verbose=True)
    P, R, F1 = scorer.score(cands, human_compare)
    F1_list = list(F1.numpy())
    BERT_score = sum(F1_list) / len(F1_list)
    score_list.append(BERT_score)

for i in range(len(pred_files)):
    print(pred_files[i])
    print(score_list[i])
class Scorer:
    def __init__(self,
                 src_path,
                 ref_path,
                 metric,
                 ref_sep,
                 fast_moverscore=False,
                 num_ref=1):
        self.src_path = src_path
        self.ref_path = ref_path
        self.metric = metric
        self.ref_sep = ref_sep
        self.num_ref = num_ref
        self.ref_lines_with_tags = read_file(ref_path)
        self.ref_lines = [
            ' '.join(
                get_sents_from_tags(ref.replace(self.ref_sep, ''),
                                    sent_start_tag='<t>',
                                    sent_end_tag='</t>'))
            for ref in self.ref_lines_with_tags
        ]
        for i, ref in enumerate(self.ref_lines):
            if len(ref) == 0:
                self.ref_lines[i] = '### DUPLICATE ###'
        self.idf_refs = None
        self.idf_hyps = None
        if metric == 'moverscore':
            from all_metrics.moverscore import get_idf_dict
            with open('all_metrics/stopwords.txt', 'r', encoding='utf-8') as f:
                self.stop_words = set(f.read().strip().split(' '))
            if fast_moverscore:
                assert src_path is not None, "src_path must be provided for fast moverscore"
                src_lines_with_tags = read_file(src_path)
                src_lines = [
                    ' '.join(
                        get_sents_from_tags(src,
                                            sent_start_tag='<t>',
                                            sent_end_tag='</t>'))
                    for src in src_lines_with_tags
                ]
                self.idf_refs = get_idf_dict(self.ref_lines)
                self.idf_hyps = get_idf_dict(src_lines)
        if metric == 'bertscore':
            from bert_score import BERTScorer
            self.bert_scorer = BERTScorer(lang='en', rescale_with_baseline=True)
        if metric == 'js2':
            ref_sents = [
                get_sents_from_tags(ref_line.replace(ref_sep, ''),
                                    sent_start_tag='<t>',
                                    sent_end_tag='</t>')
                for ref_line in self.ref_lines_with_tags
            ]
            self.ref_freq = [compute_tf(rs, N=2) for rs in ref_sents]
        if metric == 'rwe':
            self.embs = we.load_embeddings('../data/peyrard_s3/deps.words')

    def score(self, file_num, summ_path, model_name, variant_name):
        """
        :return: a list with format: [{score: value}] with scores for each doc in each dict
        """
        logger.info(
            f"getting scores for model: {model_name}, variant: {variant_name}, file num: {file_num}"
        )
        summ_lines_with_tags = read_file(summ_path)
        summ_lines = [
            ' '.join(
                get_sents_from_tags(summ,
                                    sent_start_tag='<t>',
                                    sent_end_tag='</t>'))
            for summ in summ_lines_with_tags
        ]
        for i, summ in enumerate(summ_lines):
            if len(summ) == 0:
                summ_lines[i] = '### DUPLICATE ###'
        if self.metric == 'moverscore':
            from all_metrics.moverscore import word_mover_score, get_idf_dict
            idf_refs = get_idf_dict(
                self.ref_lines) if self.idf_refs is None else self.idf_refs
            idf_hyps = get_idf_dict(
                summ_lines) if self.idf_hyps is None else self.idf_hyps
            scores = word_mover_score(self.ref_lines,
                                      summ_lines,
                                      idf_refs,
                                      idf_hyps,
                                      self.stop_words,
                                      n_gram=1,
                                      remove_subwords=True,
                                      batch_size=64,
                                      device='cuda:0')
            scores = [{'mover_score': s} for s in scores]
        elif self.metric == 'bertscore':
            (P, R, F) = self.bert_scorer.score(summ_lines, self.ref_lines)
            # keep precision/recall/F1 in their original order
            P, R, F = list(P.numpy()), list(R.numpy()), list(F.numpy())
            scores = [{
                'bert_precision_score': p,
                'bert_recall_score': r,
                'bert_f_score': f_score
            } for p, r, f_score in zip(P, R, F)]
        elif self.metric == 'js2':
            summ_sents = [
                get_sents_from_tags(summ_line,
                                    sent_start_tag='<t>',
                                    sent_end_tag='</t>')
                for summ_line in summ_lines_with_tags
            ]
            scores = [{
                'js-2': -js_divergence(summ_sent, ref_freq, N=2)
            } for summ_sent, ref_freq in zip(summ_sents, self.ref_freq)]
        elif self.metric == 'rouge':
            args = argparse.Namespace(check_repeats=True,
                                      delete=True,
                                      get_each_score=True,
                                      stemming=True,
                                      method='sent_tag_verbatim',
                                      n_bootstrap=1000,
                                      run_google_rouge=False,
                                      run_rouge=True,
                                      source=summ_path,
                                      target=self.ref_path,
                                      ref_sep=self.ref_sep,
                                      num_ref=self.num_ref,
                                      temp_dir='../data/temp/')
            scores = baseline_main(
                args, return_pyrouge_scores=True)['individual_score_results']
            scores = [scores[doc_id] for doc_id in range(len(self.ref_lines))]
        elif self.metric == 'rwe':
            scores = [{
                'rouge_1_we': pd_rouge.rouge_n_we(ref, [summ],
                                                  self.embs,
                                                  n=1,
                                                  alpha=0.5)
            } for ref, summ in zip(self.ref_lines, summ_lines)]
        elif self.metric == 'sms' or self.metric == 'wms':
            from all_metrics.sentence_mover.smd import smd
            scores = smd(self.ref_lines,
                         summ_lines,
                         word_rep='glove',
                         metric=self.metric)
            scores = [{self.metric: s} for s in scores]
        else:
            raise NotImplementedError(f"metric {self.metric} not supported")

        assert len(scores) == len(self.ref_lines)
        sd = {}
        for doc_id in range(len(self.ref_lines)):
            sd[doc_id] = {
                'doc_id': doc_id,
                'ref_summ': self.ref_lines_with_tags[doc_id],
                'system_summaries': {
                    f'{model_name}_{variant_name}': {
                        'system_summary': summ_lines_with_tags[doc_id],
                        'scores': scores[doc_id]
                    }
                }
            }
        return sd
def compute_bert_based_scores(test_path, path_results, sentences_generated_path):
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    bleurt_scorer = bleurt_sc.BleurtScorer(bleurt_checkpoint)

    with open(test_path) as json_file:
        test = json.load(json_file)
    test_sentences = defaultdict(list)
    for ref in test["annotations"]:
        image_id = ref["image_id"]
        caption = ref["caption"]
        test_sentences[image_id].append(caption)

    # load previous scores of the COCO metrics (BLEU, METEOR, etc.) to append the BERT-based scores
    scores_path = path_results
    with open(scores_path) as json_file:
        scores = json.load(json_file)

    # load previously generated sentences to compute BERTScore against the references
    generated_sentences_path = sentences_generated_path
    with open(generated_sentences_path) as json_file:
        generated_sentences = json.load(json_file)

    total_precision = 0.0
    total_recall = 0.0
    total_fmeasure = 0.0
    total_bleurt_score = []
    for dict_image_and_caption in generated_sentences:
        image_id = dict_image_and_caption["image_id"]
        caption = [dict_image_and_caption["caption"]]
        references = [test_sentences[image_id]]

        bleurt_score_per_img = []
        for ref in references[0]:
            bleurt_score_per_img.append(
                bleurt_scorer.score([ref], caption, batch_size=None)[0])
        total_bleurt_score.append(max(bleurt_score_per_img))

        P_mul, R_mul, F_mul = bert_scorer.score(caption, references)
        precision = P_mul[0].item()
        recall = R_mul[0].item()
        f_measure = F_mul[0].item()
        total_precision += precision
        total_recall += recall
        total_fmeasure += f_measure

        # store the BERT-based scores per image
        key_image_id = str(image_id)
        scores[key_image_id]["BertScore_P"] = precision
        scores[key_image_id]["BertScore_R"] = recall
        scores[key_image_id]["BertScore_F"] = f_measure
        scores[key_image_id]["BLEURT"] = max(bleurt_score_per_img)
        # print("\ncaption and score", caption, f_measure)

    n_captions = len(generated_sentences)
    scores["avg_metrics"]["BertScore_P"] = total_precision / n_captions
    scores["avg_metrics"]["BertScore_R"] = total_recall / n_captions
    scores["avg_metrics"]["BertScore_F"] = total_fmeasure / n_captions
    scores["avg_metrics"]["BLEURT"] = statistics.mean(total_bleurt_score)

    # save the scores dict back to the JSON file
    with open(scores_path, 'w+') as f:
        json.dump(scores, f, indent=2)
class KobeModel(pl.LightningModule):
    def __init__(self, args):
        super(KobeModel, self).__init__()
        self.encoder = Encoder(
            vocab_size=args.text_vocab_size + args.cond_vocab_size,
            max_seq_len=args.max_seq_len,
            d_model=args.d_model,
            nhead=args.nhead,
            num_layers=args.num_encoder_layers,
            dropout=args.dropout,
            mode=args.mode,
        )
        self.decoder = Decoder(
            vocab_size=args.text_vocab_size,
            max_seq_len=args.max_seq_len,
            d_model=args.d_model,
            nhead=args.nhead,
            num_layers=args.num_decoder_layers,
            dropout=args.dropout,
        )
        self.lr = args.lr
        self.d_model = args.d_model
        self.loss = nn.CrossEntropyLoss(reduction="mean",
                                        ignore_index=0,
                                        label_smoothing=0.1)
        self._reset_parameters()

        self.decoding_strategy = args.decoding_strategy
        self.vocab = BertTokenizer.from_pretrained(args.text_vocab_path)
        self.bleu = BLEU(tokenize=args.tokenize)
        self.sacre_tokenizer = _get_tokenizer(args.tokenize)()
        self.bert_scorer = BERTScorer(lang=args.tokenize,
                                      rescale_with_baseline=True)

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

    def _tokenwise_loss_acc(self, logits: torch.Tensor,
                            batch: Batched) -> Tuple[torch.Tensor, float]:
        unmask = ~batch.description_token_ids_mask.T[1:]
        unmasked_logits = logits[unmask]
        unmasked_targets = batch.description_token_ids[1:][unmask]
        acc = helpers.accuracy(unmasked_logits, unmasked_targets)
        return self.loss(logits.transpose(1, 2),
                         batch.description_token_ids[1:]), acc

    def training_step(self, batch: Batched, batch_idx: int):
        encoded = self.encoder.forward(batch)
        logits = self.decoder.forward(batch, encoded)
        loss, acc = self._tokenwise_loss_acc(logits, batch)
        self.lr_schedulers().step()
        self.log("train/loss", loss.item())
        self.log("train/acc", acc)
        return loss

    def _shared_eval_step(self, batch: Batched, batch_idx: int) -> DecodedBatch:
        encoded = self.encoder.forward(batch)
        logits = self.decoder.forward(batch, encoded)
        loss, acc = self._tokenwise_loss_acc(logits, batch)
        preds = self.decoder.predict(encoded_batch=encoded,
                                     decoding_strategy=self.decoding_strategy)
        generated = self.vocab.batch_decode(preds.T.tolist(),
                                            skip_special_tokens=True)
        return DecodedBatch(
            loss=loss.item(),
            acc=acc,
            generated=generated,
            descriptions=batch.descriptions,
        )

    def validation_step(self, batch, batch_idx):
        return self._shared_eval_step(batch, batch_idx)

    def test_step(self, batch, batch_idx, dataloader_idx=0):
        return self._shared_eval_step(batch, batch_idx)

    def _shared_epoch_end(self, outputs: List[DecodedBatch], prefix):
        loss = np.mean([o.loss for o in outputs])
        acc = np.mean([o.acc for o in outputs])
        self.log(f"{prefix}/loss", loss)
        self.log(f"{prefix}/acc", acc)

        generated = [g for o in outputs for g in o.generated]
        references = [r for o in outputs for r in o.descriptions]
        # fmt: off
        # BLEU score
        self.log(f"{prefix}/bleu",
                 self.bleu.corpus_score(generated, [references]).score)
        # Diversity score
        self.log(
            f"{prefix}/diversity_3",
            float(helpers.diversity([self.sacre_tokenizer(g) for g in generated], n=3)))
        self.log(
            f"{prefix}/diversity_4",
            float(helpers.diversity([self.sacre_tokenizer(g) for g in generated], n=4)))
        self.log(
            f"{prefix}/diversity_5",
            float(helpers.diversity([self.sacre_tokenizer(g) for g in generated], n=5)))
        # fmt: on

        # BERTScore
        p, r, f = self.bert_scorer.score(generated, references)
        self.log(f"{prefix}/BERTScore_P", p.mean().item())
        self.log(f"{prefix}/BERTScore_R", r.mean().item())
        self.log(f"{prefix}/BERTScore_F", f.mean().item())

        # Examples
        columns = ["Generated", "Reference"]
        data = list(zip(generated[:256:16], references[:256:16]))
        table = wandb.Table(data=data, columns=columns)
        self.logger.experiment.log({f"examples/{prefix}": table})

    def validation_epoch_end(self, outputs):
        self._shared_epoch_end(outputs, "val")

    def test_epoch_end(self, outputs):
        self._shared_epoch_end(outputs, "test")

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(),
                                lr=self.lr,
                                betas=(0.9, 0.98))
        scheduler = WarmupDecayLR(optimizer,
                                  warmup_steps=10000,
                                  d_model=self.d_model)
        return [optimizer], [scheduler]
import glob

from bert_score import BERTScorer

scorer = BERTScorer(lang="en", rescale_with_baseline=False)

human_files = "/data/private/WebNLG-models/prediction/challenge/reference.txt"
human_open = open(human_files, "r")
human_dataset = human_open.readlines()
human_open.close()

output_path = "/data/private/WebNLG-models/prediction/challenge/compare/*"
# output_path = "/data/private/WebNLG-models/prediction/challenge/my_output/*"
pred_files = glob.glob(output_path)

score_list = []
for i in range(len(pred_files)):
    cands = []
    pred_data_open = open(pred_files[i], "r")
    pred_data_dataset = pred_data_open.readlines()
    pred_data_open.close()
    P, R, F1 = scorer.score(human_dataset, pred_data_dataset)
    F1_list = list(F1.numpy())
    BERT_score = sum(F1_list) / len(F1_list)
    score_list.append(BERT_score)

for i in range(len(pred_files)):
    print(pred_files[i])
    print(score_list[i])
stories_context = []
for ss, sss in zip(stories_split, stories_split_select):
    ss[sss] = '<MASK>'
    stories_context.append(ss)
stories_context = [''.join(x) for x in stories_context]
positive_samples = [(x, y, True)
                    for x, y in zip(stories_context, stories_sentencesample)]

cands_pre = stories_sentencesample
len_refs = len(refs_pre)
len_cands = len(cands_pre)
# pair every candidate with every reference
cands = [x for x in cands_pre for i in range(len_refs)]
refs = refs_pre * len_cands

P, R, F1 = scorer.score(cands, refs)
print(R.reshape(len_cands, len_refs))
R = R.reshape(len_cands, len_refs)
# take the index of the second-highest recall match for each candidate
bestmatch = torch.topk(R, 2, dim=1).indices[:, 1]
negative_samples = [(x, refs_pre[y], False)
                    for x, y in zip(stories_context, bestmatch)]

samples = negative_samples + positive_samples
result_df = pd.DataFrame(samples, columns=['context', 'keysen', 'label'])
print(result_df.head())
result_df.to_csv('data/autocloze.csv', encoding="utf_8_sig")
result_df.to_excel('data/autocloze.xlsx', encoding="utf_8_sig")
import os

from bert_score import BERTScorer

# 'zh' selects the Chinese model; the original 'ch' is not a recognised language code
scorer = BERTScorer(lang="zh", batch_size=1, device='cuda:0')

logdir = './logs/S2S/decode_val_600maxenc_4beam_35mindec_150maxdec_ckpt-62256'
decodeddir = logdir + '/decoded'
referencedir = logdir + '/reference'

dir_or_files = os.listdir(decodeddir)
dir_or_files = sorted(dir_or_files)
for file in dir_or_files:
    f = open(os.path.join(decodeddir, file), 'r', encoding='utf-8')
    decodetext = []
    for line in f.readlines():
        decodetext.append(line[1:])
    f.close()

    f = open(os.path.join(referencedir, file[0:6] + '_reference.txt'),
             'r', encoding='utf-8')
    reftext = []
    for line in f.readlines():
        reftext.append(line[1:])
    f.close()

    print(scorer.score(decodetext, reftext))
def get_scores(nrows, metrics=None):
    '''
    Get correlations between metric similarity and label similarity
    '''
    df = pd.read_csv(QQP_DATA_PATH, nrows=nrows)
    start_time = time()
    if not metrics:
        metrics = [
            'mover-1',
            'mover-2',
            'bleurt',
            'bertscore',
            'bartscore',
            'rouge1',
            'rouge2',
            'rougeLsum',
        ]
    for m in tqdm(metrics):
        if m.startswith('rouge'):
            scorer = rouge_scorer.RougeScorer(
                [met for met in metrics if met.startswith('rouge')],
                use_stemmer=True)
            scores = [
                scorer.score(r, c)[m].fmeasure
                for c, r in zip(df.question1, df.question2)
            ]
        elif m == 'bertscore':
            scorer = BERTScorer(lang="en",
                                rescale_with_baseline=True,
                                model_type='roberta-large-mnli')
            _, _, scores = scorer.score(df.question1.tolist(),
                                        df.question2.tolist())
        elif m == 'bartscore':
            scorer = BERTScorer(lang="en",
                                model_type="facebook/bart-large-mnli",
                                num_layers=12)
            _, _, scores = scorer.score(df.question1.tolist(),
                                        df.question2.tolist())
        elif m == 'bleurt':
            checkpoint = "bleurt-large-512"
            scorer = score.BleurtScorer(checkpoint)
            scores = scorer.score(df.question1, df.question2, batch_size=50)
        elif m.startswith('mover'):
            # Truncate long questions, else moverscore gets OOM
            q1 = df['question1'].apply(lambda s: s[:300]).tolist()
            q2 = df['question2'].apply(lambda s: s[:300]).tolist()
            idf_dict_hyp = get_idf_dict(q1)
            idf_dict_ref = get_idf_dict(q2)
            n_gram = 1 if '1' in m else 2
            scores = word_mover_score(q2,
                                      q1,
                                      idf_dict_ref,
                                      idf_dict_hyp,
                                      stop_words=[],
                                      n_gram=n_gram,
                                      remove_subwords=True,
                                      batch_size=64)
        df[m] = scores
        print('\n' * 10, m, '\n' * 10)
    df.to_csv(QQP_OUT_PATH)
def main(params):
    # Loading data
    dataset, num_labels = load_data(params)
    dataset = dataset["train"]
    text_key = 'text'
    if params.dataset == "dbpedia14":
        text_key = 'content'
    print(f"Loaded dataset {params.dataset}, that has {len(dataset)} rows")

    # Load model and tokenizer from HuggingFace
    model_class = transformers.AutoModelForSequenceClassification
    model = model_class.from_pretrained(params.model,
                                        num_labels=num_labels).cuda()
    if params.ckpt is not None:
        state_dict = torch.load(params.ckpt)
        model.load_state_dict(state_dict)
    tokenizer = textattack.models.tokenizers.AutoTokenizer(params.model)
    model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(
        model, tokenizer, batch_size=params.batch_size)

    # Create radioactive directions and modify classification layer to use those
    if params.radioactive:
        torch.manual_seed(0)
        radioactive_directions = torch.randn(num_labels, 768)
        radioactive_directions /= torch.norm(radioactive_directions,
                                             dim=1,
                                             keepdim=True)
        print(radioactive_directions)
        model.classifier.weight.data = radioactive_directions.cuda()
        model.classifier.bias.data = torch.zeros(num_labels).cuda()

    start_index = params.chunk_id * params.chunk_size
    end_index = start_index + params.chunk_size
    if params.target_dir is not None:
        target_file = join(params.target_dir, f"{params.chunk_id}.csv")
        f = open(target_file, "w")
        f = csv.writer(f,
                       delimiter=',',
                       quotechar='"',
                       quoting=csv.QUOTE_NONNUMERIC)

    # Creating attack
    print(f"Building {params.attack} attack")
    if params.attack == "custom":
        current_label = -1
        if params.targeted:
            current_label = dataset[start_index]['label']
            assert all([
                dataset[i]['label'] == current_label
                for i in range(start_index, end_index)
            ])
        attack = build_attack(model_wrapper, current_label)
    elif params.attack == "bae":
        print(f"Building BAE method with threshold={params.bae_threshold:.2f}")
        attack = build_baegarg2019(model_wrapper,
                                   threshold_cosine=params.bae_threshold,
                                   query_budget=params.query_budget)
    elif params.attack == "bert-attack":
        assert params.query_budget is None
        attack = BERTAttackLi2020.build(model_wrapper)
    elif params.attack == "clare":
        assert params.query_budget is None
        attack = CLARE2020.build(model_wrapper)

    # Launching attack
    begin_time = time.time()
    samples = [
        (dataset[i][text_key],
         attack.goal_function.get_output(AttackedText(dataset[i][text_key])))
        for i in range(start_index, end_index)
    ]
    results = list(attack.attack_dataset(samples))

    # Storing attacked text
    bert_scorer = BERTScorer(model_type="bert-base-uncased", idf=False)
    n_success = 0
    similarities = []
    queries = []
    use = USE()
    for i_result, result in enumerate(results):
        print("")
        print(50 * "*")
        print("")
        text = dataset[start_index + i_result][text_key]
        ptext = result.perturbed_text()
        i_data = start_index + i_result
        if params.target_dir is not None:
            if params.dataset == 'dbpedia14':
                f.writerow([
                    dataset[i_data]['label'] + 1, dataset[i_data]['title'],
                    ptext
                ])
            else:
                f.writerow([dataset[i_data]['label'] + 1, ptext])
        print("True label ", dataset[i_data]['label'])
        print(f"CLEAN TEXT\n {text}")
        print(f"ADV TEXT\n {ptext}")
        if type(result) not in [SuccessfulAttackResult, FailedAttackResult]:
            print("WARNING: Attack neither succeeded nor failed...")
        print(result.goal_function_result_str())
        precision, recall, f1 = [
            r.item() for r in bert_scorer.score([ptext], [text])
        ]
        print(
            f"Bert scores: precision {precision:.2f}, recall: {recall:.2f}, f1: {f1:.2f}"
        )
        initial_logits = model_wrapper([text])
        final_logits = model_wrapper([ptext])
        print("Initial logits", initial_logits)
        print("Final logits", final_logits)
        print("Logits difference", final_logits - initial_logits)

        # Statistics
        n_success += 1 if type(result) is SuccessfulAttackResult else 0
        queries.append(result.num_queries)
        similarities.append(use.compute_sim([text], [ptext]))

    print("Processing all samples took %.2f" % (time.time() - begin_time))
    print(f"Total success: {n_success}/{len(results)}")
    logs = {
        "success_rate": n_success / len(results),
        "avg_queries": sum(queries) / len(queries),
        "queries": queries,
        "avg_similarity": sum(similarities) / len(similarities),
        "similarities": similarities,
    }
    print("__logs:" + json.dumps(logs))
    if params.target_dir is not None:
        f.close()
###############################
######## data #################
question_data, answer_data = utils.question_answers_dataset()
###############################

index_Q1 = []
index_Q2 = []
index_Q3 = []
count_Q1 = 0
count_Q2 = 0
count_Q3 = 0
t0 = time.time()
print(len(question_data))

scorer = BERTScorer(model_type='bert-base-uncased')
print('test_score:', scorer.score(["are you okay?"], [["are you good?"]]))

# Q1, Q2 and Q3 are the query questions, assumed to be defined earlier in the script
for i in range(len(question_data)):
    if i % 50 == 0:
        print("*", end='')
    P, R, F1 = scorer.score([Q1], [question_data[i]])
    if F1.item() > 0.9:
        count_Q1 += 1
        index_Q1.append(i)
    P, R, F1 = scorer.score([Q2], [question_data[i]])
    if F1.item() > 0.9:
        count_Q2 += 1
        index_Q2.append(i)
    P, R, F1 = scorer.score([Q3], [question_data[i]])
    if F1.item() > 0.9:
        count_Q3 += 1
        index_Q3.append(i)
class RhymeDistanceMeter:
    def __init__(self, chara_word='野球'):
        self.c = chara_word
        data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                '..', 'data')
        self.baseline_file_path = os.path.join(
            data_dir, 'bert-base-multilingual-cased.tsv')
        self.scorer = BERTScorer(
            model_type=os.path.join(
                data_dir, 'bert-base_mecab-ipadic-bpe-32k_whole-word-mask'),
            num_layers=11,
            lang='ja',
            rescale_with_baseline=True,
            baseline_path=self.baseline_file_path)
        self.min_rhyme = 2

    def throw(self, s1, s2):
        rhyme_count = self.count_rhyme(s1, s2)
        sim_s, sim_c = self.score_similarity(s1, s2)
        len_rate = self.len_rate(s1, s2)
        dist = self.calc_dist(rhyme_count, sim_s, sim_c, len_rate)
        return dist

    def most_rhyming(self, killer_phrase, candidates, topn=3):
        res = {}
        for c in candidates:
            res[c] = self.count_rhyme(killer_phrase, c)
        logger.debug(f'{res=}')
        sorted_res = sorted(res.items(), key=lambda item: item[1], reverse=True)
        return [w[0] for w in sorted_res[:topn]]

    def len_rate(self, s1, s2):
        return min(len(s1), len(s2)) / max(len(s1), len(s2))

    def count_rhyme(self, s1, s2):
        romaji1 = romanize_sentence(s1)
        romaji2 = romanize_sentence(s2)
        vowel1 = vowelize(romaji1)
        vowel2 = vowelize(romaji2)
        logger.debug(f'{vowel1=}')
        logger.debug(f'{vowel2=}')
        min_len = min(len(vowel1), len(vowel2))
        cnt = 0
        # end rhyme
        for i in range(1, min_len + 1):
            if vowel1[-i] == vowel2[-i]:
                cnt += 1
            else:
                break
        if cnt > 0:
            return cnt
        # head rhyme (alliteration)
        for i in range(min_len):
            if vowel1[i] == vowel2[i]:
                cnt += 1
            else:
                break
        return cnt

    def score_similarity(self, s1, s2):
        refs = [s1]
        hyps = [s2]
        s1_nouns = [
            w.surface for w in tagger(s1)
            if (w.feature[0] == '名詞' and w.surface != self.c)
        ]
        s2_nouns = [
            w.surface for w in tagger(s2)
            if (w.feature[0] == '名詞' and w.surface != self.c)
        ]
        logger.debug(f'{s1_nouns=}')
        logger.debug(f'{s2_nouns=}')
        for s in s1_nouns:
            refs.append(self.c)
            hyps.append(s)
        for s in s2_nouns:
            refs.append(self.c)
            hyps.append(s)
        logger.debug(f'{refs=}')
        logger.debug(f'{hyps=}')
        P, R, F1 = self.scorer.score(refs, hyps)
        dist_s = F1[0]
        logger.debug(f'{F1[1:]=}')
        dist_c = max(F1[1:])
        return dist_s, dist_c

    def calc_dist(self, count, sim_s, sim_c, len_rate):
        logger.debug(f'{count=}')
        logger.debug(f'{sim_s=}')
        logger.debug(f'{sim_c=}')
        logger.debug(f'{len_rate=}')
        return int(count ** ((1 - sim_s) * (sim_c * 10) * (1 + len_rate)))
# load hypotheses
cands = [line.strip().split("\t")[1] for line in open(hyp_file, 'r')]

# prepare reference list
ref_files = os.listdir(ref_path)
ref_dict = defaultdict(list)
for name in ref_files:
    _, _, sty, index = name.split(".")
    ref_dict[index].append((sty, ref_path + name))
for index in ref_dict:
    ref_dict[index].sort(key=lambda x: x[0])
    refs_i = []
    for _, file_path in ref_dict[index]:
        refs_i += [line.strip() for line in open(file_path, 'r')]
    ref_dict[index] = refs_i
ref_list = [refs for refs in ref_dict.values()]
ref_sents = [ref for refs in ref_list for ref in refs]
ref_list = list(zip(*ref_list))

# load BERT model
scorer = BERTScorer(model_type="albert-xlarge-v2",
                    lang="en",
                    rescale_with_baseline=True,
                    idf=True,
                    idf_sents=ref_sents,
                    batch_size=32)

P, R, F1 = scorer.score(cands, ref_list)
P, R, F1 = P.mean().item(), R.mean().item(), F1.mean().item()
print("P: %.4f; R: %.4f; F1: %.4f." % (P, R, F1))
result = json.load(f)
with open('data/R2R_val_unseen.json') as f:
    answer = json.load(f)

import logging

import transformers
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

from bert_score import BERTScorer
scorer = BERTScorer(lang='en', rescale_with_baseline=True)

#%%
scores = []
for traj in answer:
    instr_id = traj['path_id']
    for i in range(3):
        instr_id_i = '{}_{}'.format(instr_id, i)
        instr_ans = [traj['instructions'][i]]
        instr_pred = [' '.join(result[instr_id_i]['words'])]
        _, _, F1 = scorer.score(instr_pred, [instr_ans])
        scores.append(F1.numpy())
print(np.mean(scores))