Example #1
0
def compare_with_random(load_emb):
    """Score every original sample against a shuffled copy of the originals.

    The samples are loaded through ``data_helper.load_samples``; the
    perturbed samples returned by that call are not used. Instead, a
    copy of the original samples is shuffled with ``np.random.shuffle``
    and each original sample is paired with the sample sitting at the same
    position in the shuffled copy.

    Args:
        load_emb: Passed through unchanged as the second argument of
            ``embeddings.get_embeddings``.

    Returns:
        A list with one ``"original,shuffled,score"`` string per sample,
        where ``score`` is the result of ``cos_sim`` on the two embeddings,
        formatted with two decimals.
    """
    pkl_path = './data-toxic-kaggle/toxic_comments_100.pkl'
    # perturbed by Edwin's script
    perturbed_path = './data-toxic-kaggle/toxic_comments_100_perturbed.pkl'

    # Only the original samples are needed; the loaded perturbed ones are
    # replaced by a shuffled copy of the originals below.
    original, _ = data_helper.load_samples(perturbed_path, pkl_path)
    logging.info("data loaded")

    # NOTE(review): the slice is an independent copy only if `original` is a
    # list; for a numpy array it would be a view and the in-place shuffle
    # would reorder `original` as well -- confirm what load_samples returns.
    shuffled = original[:]
    np.random.shuffle(shuffled)
    logging.info("sentences are shuffled")

    rows = []
    for index, original_sample in enumerate(original):
        logging.info("index:%d" % index)
        shuffled_sample = shuffled[index]
        sentence_pair = [original_sample, shuffled_sample]
        vec_orig, vec_pert = embeddings.get_embeddings(sentence_pair, load_emb)
        similarity = cos_sim(vec_orig, vec_pert)
        rows.append(
            "%s,%s,%.2f" % (original_sample, shuffled_sample, similarity))

    return rows
Example #2
0
def _blend_perturbation(original_sample, perturbed_sample, p):
    """Return *original_sample* with characters swapped in from *perturbed_sample*.

    For each character position a uniform random number is drawn; the
    character from ``perturbed_sample`` at the same position is taken when
    the draw is ``<= p`` and the original character is not in
    ``ignore_list``, otherwise the original character is kept.
    """
    mixed = []
    for i, ch in enumerate(original_sample):
        # Draw for every character, including ignored ones, so the random
        # stream is consumed exactly once per position.
        prob = np.random.uniform()
        if prob <= p and ch not in ignore_list:
            # NOTE(review): assumes perturbed_sample is at least as long as
            # original_sample (position-aligned); a shorter one raises
            # IndexError here.
            mixed.append(perturbed_sample[i])
        else:
            mixed.append(ch)
    # Join once instead of growing a string with += inside the loop.
    return ''.join(mixed)


def compare_with_probaility(load_emb, perturbation_script, p=1.0):
    """Score original samples against samples perturbed with probability *p*.

    Args:
        load_emb: Passed through unchanged as the second argument of
            ``embeddings.get_embeddings``.
        perturbation_script: ``'steffen_even'`` or ``'edwin'``.
            For ``'steffen_even'`` the perturbed samples are read from a
            file whose name contains ``p`` formatted with one decimal and
            are used as loaded. For ``'edwin'`` a single perturbed file is
            read and each character is taken from the perturbed sample with
            probability ``p`` (characters in ``ignore_list`` are never
            replaced).
        p: Perturbation probability; see ``perturbation_script``.

    Returns:
        A list with one ``"original,perturbed,score"`` string per sample,
        where ``score`` is the result of ``cos_sim`` on the two embeddings,
        formatted with two decimals.

    Raises:
        ValueError: If ``perturbation_script`` is not one of the supported
            names. Raised before any data is loaded.
    """
    pkl_path = './data-toxic-kaggle/toxic_comments_100.pkl'

    if perturbation_script == 'steffen_even':
        # perturbed by Steffen_even script
        perturbed_path = './data-toxic-kaggle/toxic_comments_100_mm_even_p%.1f.pkl' % p
        resample_per_char = False
    elif perturbation_script == 'edwin':
        # perturbed by Edwin's script
        perturbed_path = './data-toxic-kaggle/toxic_comments_100_perturbed.pkl'
        resample_per_char = True
    else:
        raise ValueError("%s is not implemented!" % perturbation_script)

    original, perturbed = data_helper.load_samples(perturbed_path, pkl_path)
    logging.info("perturbation_script: %s", perturbation_script)
    logging.info("data loaded")

    output = []
    for index, original_sample in enumerate(original):
        logging.info("index:%d", index)
        perturbed_sample = perturbed[index]
        if resample_per_char:
            perturbed_sample = _blend_perturbation(
                original_sample, perturbed_sample, p)
        vec_orig, vec_pert = embeddings.get_embeddings(
            [original_sample, perturbed_sample], load_emb)
        output.append("%s,%s,%.2f" % (
            original_sample, perturbed_sample, cos_sim(vec_orig, vec_pert)))

    return output
Example #3
0
def cosine_similarities(loaded_emb, clean_path, perturbed_path):
    """Compute a ``cos_sim`` score for each clean/perturbed sentence pair.

    Sentences are loaded with ``data_helper.load_samples_txt`` and paired by
    position; one score is produced per clean sentence.

    Args:
        loaded_emb: Passed through unchanged as the second argument of
            ``embeddings.get_embeddings``.
        clean_path: First argument forwarded to ``load_samples_txt``.
        perturbed_path: Second argument forwarded to ``load_samples_txt``.

    Returns:
        A tuple ``(clean_sentences, perturbed_sentences, scores)`` where
        ``scores`` is a list holding the ``cos_sim`` result for each pair.
    """
    clean_sentences, perturbed_sentences = data_helper.load_samples_txt(
        clean_path, perturbed_path)

    def score_pair(position):
        # Embed both sentences of the pair in a single call, then compare.
        sentence_pair = [clean_sentences[position],
                         perturbed_sentences[position]]
        vec_clean, vec_pert = embeddings.get_embeddings(
            sentence_pair, loaded_emb)
        return cos_sim(vec_clean, vec_pert)

    # Index by position of the clean sentences so a shorter perturbed list
    # still surfaces as an error instead of being silently truncated.
    scores = [score_pair(position)
              for position in range(len(clean_sentences))]

    return (clean_sentences, perturbed_sentences, scores)
Example #4
0
def online_compare(load_emb, original, perturbed):
    """Return the ``cos_sim`` score between one original and one perturbed text.

    Prints ``"data loaded"`` to stdout before computing the embeddings.

    Args:
        load_emb: Passed through unchanged as the second argument of
            ``embeddings.get_embeddings``.
        original: The unmodified text.
        perturbed: The perturbed text to compare against ``original``.

    Returns:
        The result of ``cos_sim`` applied to the two embeddings.
    """
    print("data loaded")
    # Both texts are embedded in one call so they share the same embedding pass.
    vec_orig, vec_pert = embeddings.get_embeddings(
        [original, perturbed], load_emb)
    return cos_sim(vec_orig, vec_pert)