Example #1
def return_pca_debiasing(models, arguments, content):
    logging.info("APP-DE: Forwarding to related definitions")
    database = arguments['space']
    augment_flag = arguments['augments']
    target1, target2, attr1, attr2, augments1, augments2, augments3, augments4 = JSONFormatter.retrieve_vectors_debiasing(
        content, database,
        augment_flag)
    target1, target2 = calculation.check_sizes(target1, target2)
    attr1, attr2 = calculation.check_sizes(attr1, attr2)
    logging.info("APP: Final retrieved set sizes: T1=" + str(len(target1)) + " T2=" + str(len(target2)) + " A1=" + str(
        len(attr1)) + " A2=" + str(len(attr2)))
    if len(target1) == 0 or len(target2) == 0 or len(attr1) == 0 or len(attr2) == 0:
        logging.info("APP: Stopped, no values found in database")
        return jsonify(message="ERROR: No values found in database."), 404
    logging.info("APP: Debiasing process started")
    res1, res2, res3, res4 = {}, {}, {}, {}
    try:
        if models is None or models == 'gbdd':
            res1, res2, res3, res4 = gbdd.generalized_bias_direction_debiasing(target1, target2, attr1, attr2,
                                                                               augments1, augments2, augments3,
                                                                               augments4)
        elif models == 'bam':
            res1, res2, res3, res4 = bam.bias_alignment_model(target1, target2, attr1, attr2, augments1, augments2,
                                                              augments3, augments4)
        elif models == 'gbddxbam':
            res1, res2, res3, res4 = gbdd.generalized_bias_direction_debiasing(target1, target2, attr1, attr2,
                                                                               augments1, augments2, augments3,
                                                                               augments4)
            res1, res2, res3, res4 = bam.bias_alignment_model(res1, res2, res3, res4, augments1, augments2, augments3,
                                                              augments4)
        elif models == 'bamxgbdd':
            res1, res2, res3, res4 = bam.bias_alignment_model(target1, target2, attr1, attr2, augments1, augments2,
                                                              augments3, augments4)
            res1, res2, res3, res4 = gbdd.generalized_bias_direction_debiasing(res1, res2, res3, res4, augments1,
                                                                               augments2, augments3, augments4)
        target1_copy, target2_copy = calculation.create_duplicates(target1, target2)
        attr1_copy, attr2_copy = calculation.create_duplicates(attr1, attr2)
        res1_copy, res2_copy, res3_copy, res4_copy = calculation.create_duplicates(res1, res2, res3, res4)
        biased_terms = calculation.concatenate_dicts(target1_copy, target2_copy, attr1_copy, attr2_copy)
        debiased_terms = calculation.concatenate_dicts(res1_copy, res2_copy, res3_copy, res4_copy)
        biased_pca = calculation.principal_componant_analysis(target1, target2, attr1, attr2)
        debiased_pca = calculation.principal_componant_analysis(res1, res2, res3, res4)
        response = json.dumps(
            {"EmbeddingSpace": database, "Model": models,
             "BiasedVectorsPCA": JSONFormatter.dict_to_json(biased_pca),
             "DebiasedVectorsPCA": JSONFormatter.dict_to_json(debiased_pca),
             "BiasedVecs:": JSONFormatter.dict_to_json(biased_terms),
             "DebiasedVecs": JSONFormatter.dict_to_json(debiased_terms)})
    except Exception:
        return jsonify(message="DEBIASING ERROR"), 500
    logging.info("APP: Debiasing process finished")
    return response, 200
Example #2
def k_means_clustering(target_set1, target_set2, accuracy=50):
    logging.info("KMeans: Calculation started with " + str(accuracy) + " iterations")
    target1, target2 = calculation.create_duplicates(target_set1, target_set2)
    target1 = calculation.transform_dict_to_list(target1)
    target2 = calculation.transform_dict_to_list(target2)
    vector_list = target1 + target2
    logging.info("KMeans: Vector dictionaries and lists prepared successfully")

    gold_standard1 = [1] * len(target1) + [0] * len(target2)
    gold_standard2 = [0] * len(target1) + [1] * len(target2)
    cluster = list(zip(vector_list, gold_standard1, gold_standard2))
    scores = []
    logging.info("KMeans: Cluster & Gold Standards created")
    for i in range(accuracy):
        random.shuffle(cluster)
        k_means = KMeans(n_clusters=2, random_state=0, init='k-means++').fit(numpy.array([x[0] for x in cluster]))
        labels = k_means.labels_
        accuracy1 = len([i for i in range(len(labels)) if labels[i] == cluster[i][1]]) / len(labels)
        accuracy2 = len([i for i in range(len(labels)) if labels[i] == cluster[i][2]]) / len(labels)
        scores.append(max(accuracy1, accuracy2))

    result = sum(scores) / len(scores)
    logging.info("KMeans: Finished calculation")
    logging.info("KMeans: Results: " + str(result))
    return result
Example #3
def k_means_clustering(target_set1, target_set2, accuracy=50):
    logging.info("Eval-Engine: K-Means++ clustering started")
    target1, target2 = calculation.create_duplicates(target_set1, target_set2)
    target1 = calculation.transform_dict_to_list(target1)
    target2 = calculation.transform_dict_to_list(target2)
    vector_list = numpy.concatenate((target1, target2), axis=0)

    gold_standard1 = [1] * len(target1) + [0] * len(target2)
    gold_standard2 = [0] * len(target1) + [1] * len(target2)
    cluster = list(zip(vector_list, gold_standard1, gold_standard2))
    scores = []
    for i in range(accuracy):
        random.shuffle(cluster)
        k_means = KMeans(n_clusters=2, random_state=0, init='k-means++').fit(
            numpy.array([x[0] for x in cluster]))
        labels = k_means.labels_
        accuracy1 = len([
            i for i in range(len(labels)) if labels[i] == cluster[i][1]
        ]) / len(labels)
        accuracy2 = len([
            i for i in range(len(labels)) if labels[i] == cluster[i][2]
        ]) / len(labels)
        scores.append(max(accuracy1, accuracy2))

    result = sum(scores) / len(scores)
    logging.info("Eval-Engine: K-Means++ clustering score: " + str(result))
    return result
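For reference, the same two-cluster accuracy check can be sketched standalone with scikit-learn. The helper name, toy vectors, and run count below are illustrative assumptions, not part of debie-backend.

import numpy
from sklearn.cluster import KMeans

def clustering_accuracy_sketch(vectors1, vectors2, runs=10):
    # Stack both target sets and remember which set each vector came from.
    vectors = numpy.array(vectors1 + vectors2)
    gold = numpy.array([0] * len(vectors1) + [1] * len(vectors2))
    scores = []
    for _ in range(runs):
        order = numpy.random.permutation(len(vectors))
        labels = KMeans(n_clusters=2, n_init=10).fit(vectors[order]).labels_
        accuracy = (labels == gold[order]).mean()
        # Cluster ids are arbitrary, so take the better of the two label assignments.
        scores.append(max(accuracy, 1 - accuracy))
    return sum(scores) / len(scores)

print(clustering_accuracy_sketch([[0.9, 0.1], [0.8, 0.2]], [[0.1, 0.9], [0.2, 0.8]]))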
Example #4
File: ect.py  Project: umanlp/debie-backend
def embedding_coherence_test(test_set1, test_set2, attribute_set):
    logging.info("ECT: Calculation started")
    # Create duplicates
    test1, test2, argument = calculation.create_duplicates(
        test_set1, test_set2, attribute_set)
    # Transform vector sets in lists
    test_list1 = calculation.transform_dict_to_list(test1)
    test_list2 = calculation.transform_dict_to_list(test2)
    arg_list = calculation.transform_dict_to_list(argument)
    logging.info("ECT: Vector dictionaries/lists prepared successfully")
    mean_target_vector1 = calculation.target_set_mean_vector(test_list1)
    mean_target_vector2 = calculation.target_set_mean_vector(test_list2)
    logging.info("ECT: Target set mean vectors calculated successfully")
    array_sim1 = []
    array_sim2 = []
    for i in range(len(arg_list)):
        memory = arg_list[i]
        cos_sim1 = calculation.cosine_similarity(mean_target_vector1, memory)
        array_sim1.append(cos_sim1)
        cos_sim2 = calculation.cosine_similarity(mean_target_vector2, memory)
        array_sim2.append(cos_sim2)
    value_array, p_value = spearmanr(array_sim1, array_sim2)
    logging.info("ECT: Calculated successfully:")
    logging.info("ECT: Results: " + str(value_array) + " p: " + str(p_value))
    return value_array, p_value
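A minimal standalone version of the ECT idea (Spearman correlation between the attributes' similarities to the two target-set mean vectors) can be written with numpy and scipy alone; the function name and toy inputs below are illustrative assumptions.

import numpy
from scipy.stats import spearmanr

def ect_sketch(targets1, targets2, attributes):
    cos = lambda a, b: numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))
    mean1 = numpy.mean(numpy.array(targets1), axis=0)
    mean2 = numpy.mean(numpy.array(targets2), axis=0)
    sims1 = [cos(mean1, numpy.array(a)) for a in attributes]
    sims2 = [cos(mean2, numpy.array(a)) for a in attributes]
    # High correlation means both target groups relate to the attributes similarly.
    return spearmanr(sims1, sims2)

correlation, p_value = ect_sketch([[1.0, 0.0], [0.9, 0.1]],
                                  [[0.0, 1.0], [0.1, 0.9]],
                                  [[0.5, 0.5], [0.7, 0.3], [0.2, 0.8]])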
Example #5
def word_embedding_association_test(target_set1,
                                    target_set2,
                                    argument_set1,
                                    argument_set2,
                                    accuracy=100):
    logging.info("WEAT: Started calculation")
    target1, target2, arg1, arg2 = calculation.create_duplicates(
        target_set1, target_set2, argument_set1, argument_set2)
    target1, target2, arg1, arg2 = calculation.transform_multiple_dicts_to_lists(
        target1, target2, arg1, arg2)
    target_list = target1 + target2
    logging.info("WEAT: Vector dictionaries and lists prepared successfully")
    # Calculate effect size
    effect_size = effect_size_calculation(target_list, target1, target2, arg1,
                                          arg2)
    # Calculate p_value
    logging.info("WEAT: Started p-value calculation")
    s_b_e = differential_association(target1, target2, arg1, arg2)
    s_b_e_all = sum_up_diff_ass_all_permutations(target_list, arg1, arg2,
                                                 accuracy)
    p_value = p_value_calculation(s_b_e, s_b_e_all)
    logging.info("WEAT: Finished p-value calculation with result " +
                 str(p_value))
    logging.info("WEAT: Finished calculation")
    logging.info("WEAT: Results: effect-size: " + str(effect_size) +
                 " p-value: " + str(p_value))
    return effect_size, p_value
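The effect-size part of WEAT can be written down compactly with numpy alone. This is a sketch of the standard formula (difference of mean differential associations over the pooled standard deviation), with illustrative names; the permutation-based p-value is omitted.

import numpy

def weat_effect_size_sketch(targets1, targets2, attrs1, attrs2):
    cos = lambda a, b: numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))
    attrs1 = [numpy.array(a) for a in attrs1]
    attrs2 = [numpy.array(a) for a in attrs2]

    def association(word_vec):
        # s(w, A1, A2): mean similarity to A1 minus mean similarity to A2
        word_vec = numpy.array(word_vec)
        return (numpy.mean([cos(word_vec, a) for a in attrs1])
                - numpy.mean([cos(word_vec, a) for a in attrs2]))

    s1 = [association(t) for t in targets1]
    s2 = [association(t) for t in targets2]
    # Effect size: difference of means over the pooled standard deviation.
    return (numpy.mean(s1) - numpy.mean(s2)) / numpy.std(s1 + s2, ddof=1)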
Example #6
def full_analogy(word1, word2, word3):
    word1_copy, word2_copy, word3_copy = calculation.create_duplicates(
        word1, word2, word3)
    v1 = get_vector_from_small_dict(word1_copy)
    v2 = get_vector_from_small_dict(word2_copy)
    v3 = get_vector_from_small_dict(word3_copy)
    target_vector = numpy.array(v1) + numpy.array(v2) - numpy.array(v3)
    result_word = database_handler.word_for_nearest_vector(target_vector)
    return result_word
Example #7
def generalized_bias_direction_debiasing(target_set1, target_set2, attributes1, attributes2, augments1, augments2,
                                         augments3, augments4):
    logging.info("GBDD: Debiasing started")
    target1_copy, target2_copy = calculation.create_duplicates(target_set1, target_set2)
    attributes1_copy, attributes2_copy = calculation.create_duplicates(attributes1, attributes2)
    augments1_copy, augments2_copy = calculation.create_duplicates(augments1, augments2)
    augments3_copy, augments4_copy = calculation.create_duplicates(augments3, augments4)
    augments1_copy = calculation.concatenate_dicts(augments1_copy, augments3_copy)
    augments2_copy = calculation.concatenate_dicts(augments2_copy, augments4_copy)

    aug1, aug2 = calculation.transform_multiple_dicts_to_lists(augments1_copy, augments2_copy)
    logging.info("GBDD: Vector dictionaries and lists prepared successfully")
    gbdv = calculate_bias_direction_matrix(aug1, aug2)
    new_target1 = calculate_gbdd(gbdv, target1_copy)
    new_target2 = calculate_gbdd(gbdv, target2_copy)
    new_attributes1 = calculate_gbdd(gbdv, attributes1_copy)
    new_attributes2 = calculate_gbdd(gbdv, attributes2_copy)
    logging.info("GBDD: Debiasing finished successfully")
    return new_target1, new_target2, new_attributes1, new_attributes2
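The core of the GBDD step above can be sketched standalone: stack the difference vectors of the equivalence (augmentation) pairs, take the top right singular vector as the generalized bias direction, and remove each word vector's projection onto it. The helper name and argument layout are assumptions for illustration, not the project's own API.

import numpy

def gbdd_sketch(vector_dict, pairs_left, pairs_right):
    # Difference vectors of the equivalence pairs, one per row.
    diffs = numpy.array([numpy.array(a) - numpy.array(b)
                         for a, b in zip(pairs_left, pairs_right)])
    # The top right singular vector is the dominant bias direction.
    _, _, vh = numpy.linalg.svd(diffs, full_matrices=False)
    bias_direction = vh[0]
    debiased = {}
    for word, vec in vector_dict.items():
        vec = numpy.array(vec)
        # Remove the projection of the word vector onto the bias direction.
        debiased[word] = vec - numpy.dot(vec, bias_direction) * bias_direction
    return debiased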
Example #8
def return_eval_ect(target_vectors1, target_vectors2, attr_vectors1, attr_vectors2, database):
    logging.info("APP-BE: Starting ECT evaluation")
    arg_vecs = calculation.concatenate_dicts(calculation.create_duplicates(attr_vectors1),
                                             calculation.create_duplicates(attr_vectors2))
    ect_value, p_value = ect.embedding_coherence_test(target_vectors1, target_vectors2, arg_vecs)
    ect_value1, p_value1 = ect.embedding_coherence_test(target_vectors1, target_vectors2, attr_vectors1)
    ect_value2, p_value2 = ect.embedding_coherence_test(target_vectors1, target_vectors2, attr_vectors2)
    logging.info("APP-BE: ECT finished successfully")
    response = json.dumps(
        {"EmbeddingSpace": database, "EvaluationMethods": "all",
         "EctValue": ect_value, "EctPValue": p_value,
         "EctValue1": ect_value1, "EctPValue1": p_value1,
         "EctValue2": ect_value2, "EctPValue2": p_value2,
         "T1": JSONFormatter.dict_to_json(target_vectors1),
         "T2": JSONFormatter.dict_to_json(target_vectors2),
         "A1": JSONFormatter.dict_to_json(attr_vectors1),
         "A2": JSONFormatter.dict_to_json(attr_vectors2)
         })
    # response = jsonify(ect_value1=ect_value1, p_value1=p_value1, p_value2=p_value2,
    #                   ect_value2=ect_value2)
    logging.info("APP-BE: Results: " + str(response))
    return response
Example #9
def return_eval_all(target_vectors1, target_vectors2, attr_vectors1, attr_vectors2, database):
    logging.info("APP-BE: Starting all evaluations")
    try:
        arg_vecs = calculation.concatenate_dicts(calculation.create_duplicates(attr_vectors1),
                                                 calculation.create_duplicates(attr_vectors2))
        ect_value, p_value = ect.embedding_coherence_test(target_vectors1, target_vectors2, arg_vecs)
        ect_value1, p_value1 = ect.embedding_coherence_test(target_vectors1, target_vectors2, attr_vectors1)
        ect_value2, p_value2 = ect.embedding_coherence_test(target_vectors1, target_vectors2, attr_vectors2)
        bat_result = bat.bias_analogy_test(target_vectors1, target_vectors2, attr_vectors1, attr_vectors2)
        # bat_result = 'Currently not available'
        weat_effect_size, weat_p_value = weat.word_embedding_association_test(target_vectors1, target_vectors2,
                                                                              attr_vectors1,
                                                                              attr_vectors2)
        kmeans = k_means.k_means_clustering(target_vectors1, target_vectors2)
        logging.info("APP-BE: Evaluations finished successfully")
        response = json.dumps(
            {"EmbeddingSpace": database,
             "EvaluationMethods": "all",
             "EctValue": ect_value, "EctPValue": p_value,
             "EctValue1": ect_value1, "EctPValue1": p_value1,
             "EctValue2": ect_value2, "EctPValue2": p_value2,
             "BatValue": bat_result,
             "WeatEffectSize": weat_effect_size, "WeatPValue": weat_p_value,
             "KmeansValue": kmeans,
             "T1": JSONFormatter.dict_keys_to_string(target_vectors1),
             "T2": JSONFormatter.dict_keys_to_string(target_vectors2),
             "A1": JSONFormatter.dict_keys_to_string(attr_vectors1),
             "A2": JSONFormatter.dict_keys_to_string(attr_vectors2)
             })
        # response = jsonify(ect_value1=ect_value1, p_value1=p_value1, p_value2=p_value2, ect_value2=ect_value2,
        #                   bat_result=bat_result, weat_effect_size=weat_effect_size, weat_pvalue=weat_p_value,
        #                   k_means=kmeans)
        logging.info("APP-BE: Results: " + str(response))
        return response
    except RuntimeWarning as rw:
        print(rw)

    return jsonify(message="Internal Calculation Error")
Example #10
def load_multiple_augments(word_list, sourcefile):
    fin = io.open(sourcefile,
                  'r',
                  encoding='utf-8',
                  newline='\n',
                  errors='ignore')
    database = 'fasttext'
    n, d = map(int, fin.readline().split())
    # Read the embedding file once up front; iterating the file handle inside
    # the word loop would exhaust it after the first word.
    embedding_lines = fin.readlines()
    source_dict = database_handler.get_multiple_vectors_from_db(
        word_list, database)
    source_dict2 = calculation.create_duplicates(source_dict)
    source_words = list(source_dict2.keys())
    source_vectors = []
    for word in source_words:
        source_vectors.append(numpy.array(list(source_dict2[word])))
    augmentations = {}
    for i in range(len(source_words)):
        cosinesim = {}
        for line in embedding_lines:
            tokens = line.rstrip().split(' ')
            # Build the candidate vector without clobbering the outer loop index.
            vector = [float(token) for token in tokens[1:]]
            if tokens[0] != source_words[i]:
                cosinesim[tokens[0]] = calculation.cosine_similarity(
                    source_vectors[i], numpy.array(vector))
        # Keep the four nearest neighbours as augmentations.
        maximum1 = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum1)
        maximum2 = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum2)
        maximum3 = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum3)
        maximum4 = max(cosinesim, key=lambda k: cosinesim[k])
        cosinesim.pop(maximum4)
        augmentations[source_words[i]] = [
            maximum1, maximum2, maximum3, maximum4
        ]
    return augmentations
Example #11
File: bam.py  Project: umanlp/debie-backend
def bias_alignment_model(target_set1, target_set2, attributes1, attributes2,
                         augments1, augments2, augments3, augments4):
    target1_copy, target2_copy = calculation.create_duplicates(
        target_set1, target_set2)
    attr1_copy, attr2_copy = calculation.create_duplicates(
        attributes1, attributes2)
    augments1_copy, augments2_copy = calculation.create_duplicates(
        augments1, augments2)
    augments3_copy, augments4_copy = calculation.create_duplicates(
        augments3, augments4)

    term_list, vector_list = [], []
    augments_list, aug_vecs, aug1_list, aug2_list = [], [], [], []

    for word in target1_copy:
        term_list.append(word)
        vector_list.append(list(target1_copy[word]))
    for word in target2_copy:
        term_list.append(word)
        vector_list.append(list(target2_copy[word]))
    for word in attr1_copy:
        term_list.append(word)
        vector_list.append(numpy.array(list(attr1_copy[word])))
    for word in attr2_copy:
        term_list.append(word)
        vector_list.append(numpy.array(list(attr2_copy[word])))

    for word in augments1:
        augments_list.append(word)
        aug1_list.append(numpy.array(list(augments1_copy[word])))
    for word in augments2:
        augments_list.append(word)
        aug2_list.append(numpy.array(list(augments2_copy[word])))
    for word in augments3:
        augments_list.append(word)
        aug1_list.append(numpy.array(list(augments3_copy[word])))
    for word in augments4:
        augments_list.append(word)
        aug2_list.append(numpy.array(list(augments4_copy[word])))

    term_pairs = []
    for i in range(len(aug1_list)):
        for j in range(len(aug2_list)):
            term_pairs.append([aug1_list[i], aug2_list[j]])

    x_t1 = numpy.array([term_pairs[i][0] for i in range(len(term_pairs))])
    x_t2 = numpy.array([term_pairs[i][1] for i in range(len(term_pairs))])
    multi = numpy.matmul(numpy.transpose(x_t1), x_t2)
    u, s, vh = numpy.linalg.svd(multi)
    w_matrix = numpy.matmul(u, vh)
    new_x_matrix = numpy.matmul(vector_list, w_matrix)
    result_matrix = 0.5 * (numpy.array(vector_list) + new_x_matrix)
    result_dict = {}
    for i in range(len(term_list)):
        result_dict[term_list[i]] = result_matrix[i]
    t1, t2, a1, a2 = {}, {}, {}, {}
    for word in result_dict:
        if word in target1_copy:
            t1[word] = result_dict[word]
        if word in target2_copy:
            t2[word] = result_dict[word]
        if word in attr1_copy:
            a1[word] = result_dict[word]
        if word in attr2_copy:
            a2[word] = result_dict[word]

    return t1, t2, a1, a2
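The alignment step in bias_alignment_model is an orthogonal Procrustes solution followed by averaging each vector with its aligned image. A compact standalone sketch of that same transformation, with illustrative names, looks like this.

import numpy

def bam_sketch(vector_dict, pairs_left, pairs_right):
    x1 = numpy.array([numpy.array(v) for v in pairs_left])
    x2 = numpy.array([numpy.array(v) for v in pairs_right])
    # Orthogonal Procrustes: W = U V^T where U S V^T = SVD(X1^T X2).
    u, _, vh = numpy.linalg.svd(numpy.matmul(x1.T, x2))
    w = numpy.matmul(u, vh)
    aligned = {}
    for word, vec in vector_dict.items():
        vec = numpy.array(vec)
        # Average each vector with its aligned image, as the function above does.
        aligned[word] = 0.5 * (vec + numpy.matmul(vec, w))
    return aligned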
Example #12
def debias_net(target_set1, target_set2, argument_set, lambda_value=0.2):
    target1, target2, argument = calculation.create_duplicates(
        target_set1, target_set2, argument_set)
    t1_list, t2_list, arg_list = calculation.transform_multiple_dicts_to_lists(
        target1, target2, argument)
Example #13
def dict_keys_to_string(vector_dict):
    vector_dict_copy = calculation.create_duplicates(vector_dict)
    keys = ''
    for word in vector_dict_copy.keys():
        keys += str(word) + ' '
    return keys
Example #14
def dict_to_json(vector_dict):
    vector_dict_copy = calculation.create_duplicates(vector_dict)
    string_dict = {}
    for word in vector_dict_copy:
        string_dict[word] = str(list(vector_dict_copy[word]))
    return string_dict
Example #15
File: bat.py  Project: umanlp/debie-backend
def bias_analogy_test(target_set1, target_set2, attribute_set1, attribute_set2):
    logging.info("BAT: Calculation started")
    target1, target2, attribute1, attribute2 = calculation.create_duplicates(target_set1, target_set2, attribute_set1, attribute_set2)
    counter = 0
    vocab = {}
    vectors = []
    target_1 = []
    target_2 = []
    attributes_1 = []
    attributes_2 = []

    for word in target1:
        vocab[word] = counter
        counter += 1
        target_1.append(word)
        vectors.append(np.array(list(target1[word])))
    for word in target2:
        vocab[word] = counter
        counter += 1
        target_2.append(word)
        vectors.append(np.array(list(target2[word])))
    for word in attribute1:
        vocab[word] = counter
        counter += 1
        attributes_1.append(word)
        vectors.append(np.array(list(attribute1[word])))
    for word in attribute2:
        vocab[word] = counter
        counter += 1
        attributes_2.append(word)
        vectors.append(np.array(list(attribute2[word])))
    attributes_paired = []
    for a1 in attributes_1:
        for a2 in attributes_2:
            attributes_paired.append((a1, a2))

    temporary_vocab = list(set(target_1 + target_2 + attributes_1 + attributes_2))
    dictionary_list = []
    vector_matrix = []
    for w in temporary_vocab:
        if w in vocab:
            vector_matrix.append(vectors[vocab[w]])
            dictionary_list.append(w)

    vector_matrix = np.array(vector_matrix)
    vocab = {dictionary_list[i]: i for i in range(len(dictionary_list))}
    eq_pairs = []
    for t1 in target_1:
        for t2 in target_2:
            eq_pairs.append((t1, t2))

    biased = []
    totals = []
    for pair in eq_pairs:
        t1 = pair[0]
        t2 = pair[1]
        vec_t1 = vector_matrix[vocab[t1]]
        vec_t2 = vector_matrix[vocab[t2]]

        for a1, a2 in attributes_paired:
            vectors_a1 = vector_matrix[vocab[a1]]
            vectors_a2 = vector_matrix[vocab[a2]]

            diff_vec = vec_t1 - vec_t2

            query_1 = diff_vec + vectors_a2
            query_2 = vectors_a1 - diff_vec

            sims_q1 = np.sum(np.square(vector_matrix - query_1), axis=1)
            sorted_q1 = np.argsort(sims_q1)
            indices = np.where(sorted_q1 == vocab[a1])[0][0]
            other_attr_2 = [x for x in attributes_2 if x != a2]
            indices_other = [np.where(sorted_q1 == vocab[x])[0][0] for x in other_attr_2]
            number_biased = [x for x in indices_other if indices < x]
            biased.append(len(number_biased))
            totals.append(len(indices_other))

            sims_q2 = np.sum(np.square(vector_matrix - query_2), axis=1)
            sorted_q2 = np.argsort(sims_q2)
            indices = np.where(sorted_q2 == vocab[a2])[0][0]
            other_attr_1 = [x for x in attributes_1 if x != a1]
            indices_other = [np.where(sorted_q2 == vocab[x])[0][0] for x in other_attr_1]
            number_biased = [x for x in indices_other if indices < x]
            biased.append(len(number_biased))
            totals.append(len(indices_other))

    result = sum(biased) / sum(totals)
    logging.info("BAT: Calculated successfully, result: " + str(result))
    return result
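The ranking step that bias_analogy_test repeats for both analogy queries (rank every vector by distance to the query and locate one specific row) can be isolated into a small helper; the name and signature here are illustrative only.

import numpy

def analogy_rank_sketch(vector_matrix, query_vector, row_index):
    # Squared Euclidean distance from the query to every row of the matrix.
    distances = numpy.sum(numpy.square(vector_matrix - query_vector), axis=1)
    order = numpy.argsort(distances)
    # Rank of the requested row, 0 meaning it is the closest vector.
    return int(numpy.where(order == row_index)[0][0])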