def compare_with_random(load_emb):
    """Baseline: compare every original sentence against a randomly chosen
    original sentence (the perturbed file is loaded but then replaced by a
    shuffled copy of the originals)."""
    pkl_path = './data-toxic-kaggle/toxic_comments_100.pkl'
    perturbed_path = './data-toxic-kaggle/toxic_comments_100_perturbed.pkl'  # perturbed by Edwin's script
    original, perturbed = data_helper.load_samples(perturbed_path, pkl_path)
    logging.info("data loaded")
    # discard the perturbed sentences and use a shuffled copy of the originals instead
    perturbed = original[:]
    np.random.shuffle(perturbed)
    logging.info("sentences are shuffled")
    output = []
    for index in range(len(original)):
        logging.info("index:%d" % index)
        original_sample = original[index]
        perturbed_sample = perturbed[index]
        vec_orig, vec_pert = embeddings.get_embeddings([original_sample, perturbed_sample], load_emb)
        output.append("%s,%s,%.2f" % (original_sample, perturbed_sample, cos_sim(vec_orig, vec_pert)))
    return output
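
# Usage sketch (illustrative, not part of the original module): summarise the
# random-shuffle baseline as a mean cosine similarity. Assumes an embedding
# model has already been loaded elsewhere in this repo and is passed in as
# `loaded_emb`.
def _mean_random_baseline(loaded_emb):
    rows = compare_with_random(loaded_emb)
    # each row is "original,perturbed,score"; the score is the last CSV field,
    # so splitting on the last comma is safe even if the sentences contain commas
    scores = [float(row.rsplit(',', 1)[1]) for row in rows]
    return sum(scores) / len(scores)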
def compare_with_probaility(load_emb, perturbation_script, p=1.0):
    """Compare original and perturbed sentences for a given perturbation
    script. For 'steffen_even' the pre-perturbed file for probability p is
    loaded directly; for 'edwin' each character of the original is swapped
    for its perturbed counterpart with probability p."""
    pkl_path = './data-toxic-kaggle/toxic_comments_100.pkl'
    if perturbation_script == 'steffen_even':
        perturbed_path = './data-toxic-kaggle/toxic_comments_100_mm_even_p%.1f.pkl' % p  # perturbed by Steffen's even script
        original, perturbed = data_helper.load_samples(perturbed_path, pkl_path)
        logging.info("perturbation_script: steffen_even")
        logging.info("data loaded")
        output = []
        for index in range(len(original)):
            logging.info("index:%d" % index)
            original_sample = original[index]
            perturbed_sample = perturbed[index]
            vec_orig, vec_pert = embeddings.get_embeddings([original_sample, perturbed_sample], load_emb)
            output.append("%s,%s,%.2f" % (original_sample, perturbed_sample, cos_sim(vec_orig, vec_pert)))
    elif perturbation_script == 'edwin':
        perturbed_path = './data-toxic-kaggle/toxic_comments_100_perturbed.pkl'  # perturbed by Edwin's script
        original, perturbed = data_helper.load_samples(perturbed_path, pkl_path)
        logging.info("perturbation_script: edwin")
        logging.info("data loaded")
        output = []
        for index in range(len(original)):
            logging.info("index:%d" % index)
            original_sample = original[index]
            perturbed_sample = perturbed[index]
            # rebuild the sentence character by character, taking the perturbed
            # character with probability p; characters in ignore_list are kept.
            # Assumes the perturbed sentence is character-aligned with the original.
            pert = ''
            for i, ch in enumerate(original_sample):
                prob = np.random.uniform()
                if (prob <= p) and (ch not in ignore_list):
                    pert += perturbed_sample[i]
                else:
                    pert += ch
            vec_orig, vec_pert = embeddings.get_embeddings([original_sample, pert], load_emb)
            output.append("%s,%s,%.2f" % (original_sample, pert, cos_sim(vec_orig, vec_pert)))
    else:
        raise ValueError("%s is not implemented!" % perturbation_script)
    return output
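
# Usage sketch (illustrative, not part of the original module): sweep the
# character-perturbation probability for the 'edwin' setting and collect one
# result list per p. `loaded_emb` is assumed to be an embedding model loaded
# elsewhere in this repo; the probability grid is arbitrary.
def _sweep_perturbation_probability(loaded_emb, probabilities=(0.2, 0.4, 0.6, 0.8, 1.0)):
    results = {}
    for p in probabilities:
        results[p] = compare_with_probaility(loaded_emb, 'edwin', p=p)
    return results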
def cosine_similarities(loaded_emb, clean_path, perturbed_path):
    """Compute the cosine similarity between each clean sentence and its
    perturbed counterpart, loaded from two aligned .txt files."""
    clean_sentences, perturbed_sentences = data_helper.load_samples_txt(
        clean_path, perturbed_path)
    similarities = []
    for index in range(len(clean_sentences)):
        clean_sentence = clean_sentences[index]
        perturbed_sentence = perturbed_sentences[index]
        vec_clean, vec_pert = embeddings.get_embeddings(
            [clean_sentence, perturbed_sentence], loaded_emb)
        similarities.append(cos_sim(vec_clean, vec_pert))
    return (clean_sentences, perturbed_sentences, similarities)
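
# Usage sketch (illustrative, not part of the original module): write the
# triples returned by cosine_similarities to a CSV file. The input .txt paths
# and the output path are hypothetical; `loaded_emb` is assumed to be a
# pre-loaded embedding model.
def _export_similarities(loaded_emb, clean_path, perturbed_path, out_path):
    clean, perturbed, sims = cosine_similarities(loaded_emb, clean_path, perturbed_path)
    with open(out_path, 'w') as f:
        f.write("clean,perturbed,cosine_similarity\n")
        for clean_sentence, perturbed_sentence, sim in zip(clean, perturbed, sims):
            f.write("%s,%s,%.2f\n" % (clean_sentence, perturbed_sentence, sim))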
def online_compare(load_emb, original, perturbed):
    """Compare a single original/perturbed sentence pair and return the
    cosine similarity of their embeddings."""
    vec_orig, vec_pert = embeddings.get_embeddings([original, perturbed], load_emb)
    return cos_sim(vec_orig, vec_pert)
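
# Usage sketch (illustrative, not part of the original module): spot-check a
# single clean/perturbed pair. `loaded_emb` is assumed to be a pre-loaded
# embedding model; the default example strings are made up.
def _spot_check(loaded_emb, clean="you are an idiot", perturbed="y0u are an 1diot"):
    score = online_compare(loaded_emb, clean, perturbed)
    logging.info("cosine similarity: %.2f" % score)
    return score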