def eval_reverse_proposal(input_original, masked_sent, input_ids_old, pos_set,
                          reverse_action_set, sim=None):
    """Evaluate the reverse proposal probability Q(x|x')."""
    proposal_prob_reverse = 1.0  # Q(x|x')
    input_ids_tmp = np.array(masked_sent)
    for step_i in range(len(pos_set)):
        ind = pos_set[step_i]  # note: here the positions are exchanged
        action = reverse_action_set[step_i]
        old_tok = input_ids_old[ind]
        # word replacement (action: 0) and word insertion (action: 1):
        # in both cases the reverse move refills the mask with the old token
        if action in (0, 1):
            prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
            input_ids_tmp[ind] = old_tok
            proposal_prob_reverse *= prob_mask[old_tok]  # Q(x|x')
            if sim is not None:
                proposal_prob_reverse *= similarity(input_ids_tmp, input_original)
        # word deletion (action: 2): the token is already gone,
        # so the reverse move contributes probability 1
        elif action == 2:
            proposal_prob_reverse *= 1.0  # Q(x|x')
    return proposal_prob_reverse, input_ids_tmp
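# eval_reverse_proposal above scores Q(x|x'): the probability of proposing the
# original sentence x from the edited sentence x'. Together with the forward
# proposal Q(x'|x) (see eval_template below), it enters the
# Metropolis-Hastings acceptance ratio computed in main():
#
#     alpha = min(1, pi(x') * Q(x|x') / (pi(x) * Q(x'|x)))
#
# where pi(.) is the unnormalized target: the sentence score times the
# constraint penalty times the optional similarity term.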
def eval_template(searcher, input_original, cand_template, masked_sent,
                  adjusted_pos_set, action_set, sim=None, verbose=False):
    proposal_prob = 1.0  # Q(x'|x)
    input_ids_tmp = np.array(masked_sent)  # copy
    for step_i in range(len(adjusted_pos_set)):
        ind = adjusted_pos_set[step_i]
        # ind_old = pos_set[step_i]
        action = action_set[step_i]
        # word replacement (action: 0) / word insertion (action: 1):
        # both fill the mask at ind with a candidate sampled under temp_tag
        if action in (0, 1):
            temp_tag = cand_template[ind]
            prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
            input_candidate, prob_candidate = \
                searcher.generate_candidate_input_with_mask(
                    input_ids_tmp, ind, prob_mask, config.search_size,
                    temp_tag=temp_tag)
            if sim is not None:
                similarity_candidate = similarity_batch(input_candidate,
                                                        input_original)
                prob_candidate = prob_candidate * similarity_candidate
            prob_candidate_norm = prob_candidate  # no normalize here
            prob_candidate_ind = sample_from_candidate(prob_candidate_norm)
            input_ids_tmp = input_candidate[prob_candidate_ind]  # changed
            proposal_prob *= prob_candidate_norm[prob_candidate_ind]  # Q(x'|x)
            if verbose:
                print('action:%d, pos:' % action, ind,
                      prob_candidate_norm[prob_candidate_ind])
        # word deletion (action: 2): already deleted, contributes 1
        elif action == 2:
            proposal_prob *= 1.0  # Q(x'|x)
            if verbose:
                print('action:2, pos:', ind, 1.0)
    if verbose:
        print(cand_template)
        print(tokenizer.decode(input_ids_tmp).encode('utf8', errors='ignore'))
    return proposal_prob, input_ids_tmp
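# normalize() and sample_from_candidate() are used throughout but defined
# elsewhere in this repo. The commented-out sketches below are minimal,
# plausible implementations, assuming the input is a 1-D numpy array of
# non-negative weights; the actual definitions may differ.
#
# def normalize(weights, eps=1e-12):
#     weights = np.asarray(weights, dtype=np.float64)
#     total = weights.sum()
#     if total <= eps:
#         # degenerate case: fall back to uniform
#         return np.full(len(weights), 1.0 / len(weights))
#     return weights / total
#
# def sample_from_candidate(probs):
#     # draw one index from the (normalized) candidate distribution
#     return int(np.random.choice(len(probs), p=normalize(probs)))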
def main():
    if os.path.exists(config.use_output_path):
        os.system('rm ' + config.use_output_path)
    with open(config.use_output_path, 'a') as g:
        g.write(str(config) + '\n\n')
    # for item in config.record_time:
    #     if os.path.exists(config.use_output_path + str(item)):
    #         os.system('rm ' + config.use_output_path + str(item))

    # CGMH sampling for paraphrase
    sim = config.sim
    # sta_vec = list(np.zeros([config.num_steps - 1]))
    config.shuffle = False

    # original sentence input
    use_data = dataset_str(config.use_data_path)
    config.batch_size = 1
    step_size = config.step_size

    start_time = time.time()
    proposal_cnt = 0
    accept_cnt = 0
    all_samples = []
    all_acc_samples = []
    all_chosen_samples = []
    for sen_id in range(use_data.length):
        sent_ids = use_data.token_ids[sen_id]
        keys = use_data.keys[sen_id]
        searcher = ConstraintSearch(keys)
        sequence_length = len(sent_ids)
        # generate for each sentence
        sta_vec = np.zeros(sequence_length)
        input_ids = np.array(sent_ids)
        input_original = use_data.tokens[sen_id]
        prev_inds = []
        old_prob = def_sent_scorer(tokenizer.decode(input_ids))
        old_prob *= penalty_constraint(
            searcher.count_unsafisfied_constraint(
                searcher.sent2tag(input_ids)))
        if sim is not None:
            old_prob *= similarity(input_ids, input_original, sta_vec)
        outputs = []
        output_p = []
        for it in range(config.sample_time):
            # if it in config.record_time:
            #     with open(config.use_output_path, 'a', encoding='utf-8') as g:
            #         g.write(bert_scorer.tokenizer.decode(input_ids) + '\n')
            #     print(bert_scorer.tokenizer.decode(input_ids).encode('utf8', errors='ignore'))
            pos_set = get_sample_positions(sequence_length, prev_inds, step_size)
            action_set = [
                choose_action(config.action_prob) for i in range(len(pos_set))
            ]
            # if not check_constraint(input_ids):
            #     if 0 not in pos_set:
            #         pos_set[-1] = 0
            keep_non = config.keep_non
            masked_sent, adjusted_pos_set = mask_sentence(input_ids, pos_set,
                                                          action_set)
            prev_inds = pos_set
            proposal_prob = 1.0  # Q(x'|x)
            proposal_prob_reverse = 1.0  # Q(x|x')
            input_ids_tmp = np.array(masked_sent)  # copy
            sequence_length_tmp = sequence_length
            for step_i in range(len(pos_set)):
                ind = adjusted_pos_set[step_i]
                ind_old = pos_set[step_i]
                action = action_set[step_i]
                if config.restrict_constr:
                    # only apply the similarity constraint on the last step
                    use_constr = (step_i == len(pos_set) - 1)
                else:
                    use_constr = True

                # word replacement (action: 0)
                if action == 0:
                    prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
                    input_candidate, prob_candidate, reverse_candidate_idx, _ = \
                        generate_candidate_input_with_mask(
                            input_ids_tmp, sequence_length_tmp, ind, prob_mask,
                            config.search_size, old_tok=input_ids[ind_old],
                            mode=action)
                    if sim is not None and use_constr:
                        similarity_candidate = similarity_batch(
                            input_candidate, input_original, sta_vec)
                        prob_candidate = prob_candidate * similarity_candidate
                    prob_candidate_norm = normalize(prob_candidate)
                    prob_candidate_ind = sample_from_candidate(prob_candidate_norm)
                    input_ids_tmp = input_candidate[prob_candidate_ind]  # changed
                    proposal_prob *= prob_candidate_norm[prob_candidate_ind]  # Q(x'|x)
                    proposal_prob_reverse *= prob_candidate_norm[reverse_candidate_idx]  # Q(x|x')
                    sequence_length_tmp += 0  # length unchanged
                    print('action:0', prob_candidate_norm[prob_candidate_ind],
                          prob_candidate_norm[reverse_candidate_idx])

                # word insertion (action: 1)
                if action == 1:
                    prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
                    input_candidate, prob_candidate, reverse_candidate_idx, non_idx = \
                        generate_candidate_input_with_mask(
                            input_ids_tmp, sequence_length_tmp, ind, prob_mask,
                            config.search_size, mode=action,
                            old_tok=input_ids[ind_old], keep_non=keep_non)
                    if sim is not None and use_constr:
                        similarity_candidate = similarity_batch(
                            input_candidate, input_original, sta_vec)
                        prob_candidate = prob_candidate * similarity_candidate
                    prob_candidate_norm = normalize(prob_candidate)
                    prob_candidate_ind = sample_from_candidate(prob_candidate_norm)
                    input_ids_tmp = input_candidate[prob_candidate_ind]
                    if prob_candidate_ind == non_idx:
                        # the "no insertion" candidate was chosen
                        if input_ids_tmp[-1] == PAD_IDX:
                            input_ids_tmp = input_ids_tmp[:-1]
                        print('action:1 insert non', 1.0, 1.0)
                    else:
                        proposal_prob *= prob_candidate_norm[prob_candidate_ind]  # Q(x'|x)
                        proposal_prob_reverse *= 1.0  # Q(x|x'), reverse action is deleting
                        sequence_length_tmp += 1
                        print('action:1',
                              prob_candidate_norm[prob_candidate_ind], 1.0)

                # word deletion (action: 2)
                if action == 2:
                    input_ids_for_del = np.concatenate(
                        [input_ids_tmp[:ind], [MASK_IDX], input_ids_tmp[ind:]])
                    if keep_non:
                        # also consider keeping the original token ("no deletion");
                        # note: the two candidates differ in length
                        non_cand = np.array(input_ids_for_del)
                        non_cand[ind] = input_ids[ind_old]
                        input_candidate = np.array([input_ids_tmp, non_cand])
                        prob_candidate = np.array([
                            bert_scorer.sent_score(x) for x in input_candidate
                        ])
                        non_idx = 1
                        if sim is not None and use_constr:
                            similarity_candidate = similarity_batch(
                                input_candidate, input_original, sta_vec)
                            prob_candidate = prob_candidate * similarity_candidate
                        prob_candidate_norm = normalize(prob_candidate)
                        prob_candidate_ind = sample_from_candidate(prob_candidate_norm)
                        input_ids_tmp = input_candidate[prob_candidate_ind]
                    else:
                        non_idx = -1
                        prob_candidate_ind = 0
                        # input_ids_tmp is already the deleted sentence
                    if prob_candidate_ind == non_idx:
                        print('action:2 delete non', 1.0, 1.0)
                    else:
                        # re-insert a mask to evaluate the reverse (insertion) probability
                        prob_mask = bert_scorer.mask_score(input_ids_for_del,
                                                           ind, mode=0)
                        input_candidate, prob_candidate, reverse_candidate_idx, _ = \
                            generate_candidate_input_with_mask(
                                input_ids_for_del, sequence_length_tmp, ind,
                                prob_mask, config.search_size, mode=0,
                                old_tok=input_ids[ind_old])
                        if sim is not None:
                            similarity_candidate = similarity_batch(
                                input_candidate, input_original, sta_vec)
                            prob_candidate = prob_candidate * similarity_candidate
                        prob_candidate_norm = normalize(prob_candidate)
                        proposal_prob *= 1.0  # Q(x'|x)
                        proposal_prob_reverse *= prob_candidate_norm[reverse_candidate_idx]  # Q(x|x'), reverse action is inserting
                        sequence_length_tmp -= 1
                        print('action:2', 1.0,
                              prob_candidate_norm[reverse_candidate_idx])

            # score the proposed sentence: LM score * constraint penalty * similarity
            new_prob = def_sent_scorer(tokenizer.decode(input_ids_tmp))
            new_prob *= penalty_constraint(
                searcher.count_unsafisfied_constraint(
                    searcher.sent2tag(input_ids_tmp)))
            if sim is not None:
                sim_constr = similarity(input_ids_tmp, input_original, sta_vec)
                new_prob *= sim_constr
            input_text_tmp = tokenizer.decode(input_ids_tmp)
            all_samples.append([
                input_text_tmp, new_prob,
                searcher.count_unsafisfied_constraint(
                    searcher.sent2tag(input_ids_tmp)),
                bert_scorer.sent_score(input_ids_tmp, log_prob=True),
                gpt2_scorer.sent_score(input_text_tmp, ppl=True)
            ])
            if input_text_tmp not in output_p:
                outputs.append(all_samples[-1])
                output_p.append(input_text_tmp)

            # Metropolis-Hastings acceptance
            if proposal_prob == 0.0 or old_prob == 0.0:
                alpha_star = 1.0
            else:
                alpha_star = (proposal_prob_reverse * new_prob) / \
                    (proposal_prob * old_prob)
            alpha = min(1, alpha_star)
            print(tokenizer.decode(input_ids_tmp).encode('utf8', errors='ignore'))
            print(alpha, old_prob, proposal_prob, new_prob, proposal_prob_reverse)
            proposal_cnt += 1
            if choose_action([alpha, 1 - alpha]) == 0 and (
                    new_prob > old_prob * config.threshold or just_acc() == 0):
                if tokenizer.decode(input_ids_tmp) != tokenizer.decode(input_ids):
                    accept_cnt += 1
                    print('Accept')
                    all_acc_samples.append(all_samples[-1])
                    input_ids = input_ids_tmp
                    sequence_length = sequence_length_tmp
                    old_prob = new_prob

        # choose output from samples, relaxing the length requirement if needed
        for num in range(config.min_length, 0, -1):
            outputs_filtered = [x for x in outputs if len(x[0].split()) >= num]
            print(num, outputs_filtered)
            if outputs_filtered:
                break
        if not outputs_filtered:
            outputs_filtered.append([tokenizer.decode(input_ids), 0])
        outputs_filtered = sorted(outputs_filtered, key=lambda x: x[1])[::-1]
        with open(config.use_output_path, 'a') as g:
            g.write(outputs_filtered[0][0] + '\t' + str(outputs_filtered[0][1]) + '\n')
        all_chosen_samples.append(outputs_filtered[0])
        print('Sentence %d, used time %.2f\n' %
              (sen_id, time.time() - start_time))
        print(proposal_cnt, accept_cnt, accept_cnt / proposal_cnt)

    # report mean of each metric over all / accepted / chosen samples
    for name, samples in [("All samples:", all_samples),
                          ("All accepted samples:", all_acc_samples),
                          ("All chosen samples:", all_chosen_samples)]:
        print(name)
        for metric in list(zip(*samples))[1:]:
            print(np.mean(np.array(metric)))
    with open(config.use_output_path + '-result.csv', 'w', newline='') as f:
        csv_writer = csv.writer(f, delimiter='\t')
        csv_writer.writerow(
            ['Sentence', 'Prob_sim', 'Constraint_num', 'Log_prob', 'PPL'])
        csv_writer.writerows(all_samples)
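# Entry point: a minimal addition, assuming this module is executed directly
# as a script.
if __name__ == '__main__':
    main()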