Example #1
def make_replacements_faster(small_frame,
                             big_frame,
                             tree_depth=1000,
                             lang='en',
                             verbose=False):
    """
    Create a replacements dictionary that maps terms present only in big_frame to the
    closest term in small_frame. This is faster than make_replacements(), because it uses
    a fast implementation of an approximate nearest neighbor algorithm (Annoy).

    tree_depth=1000 provides a good balance of speed and accuracy.
    """
    intersected = big_frame.reindex(small_frame.index).dropna()
    index, index_map = build_annoy_tree(intersected, tree_depth)
    replacements = {}
    for term in big_frame.index:
        if term not in small_frame.index and not term.startswith('/x/'):
            most_similar_index = index.get_nns_by_vector(
                big_frame.loc[term], 1)[0]
            most_similar = index_map[most_similar_index]
            similarity = cosine_similarity(
                get_vector(big_frame, term, lang),
                get_vector(small_frame, most_similar, lang))
            replacements[term] = [most_similar, round(similarity, 2)]

            if verbose and not (len(replacements) % 20):
                print('{} ==> {}, {}'.format(term, most_similar, similarity))
    return replacements
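
The build_annoy_tree() helper called above is not shown in this example. A minimal sketch of what it might look like, assuming the annoy library's AnnoyIndex API, with index_map mapping Annoy item ids back to the frame's terms:

from annoy import AnnoyIndex

def build_annoy_tree(frame, n_trees=1000):
    # Sketch of the helper assumed by make_replacements_faster() above:
    # one Annoy item per row of the frame, using angular (cosine-like) distance.
    index = AnnoyIndex(frame.shape[1], 'angular')
    index_map = {}
    for i, (term, vector) in enumerate(frame.iterrows()):
        index.add_item(i, vector.values)
        index_map[i] = term
    index.build(n_trees)
    return index, index_map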
Example #2
def spearman_evaluate(vectors, standard, language='en', verbose=0):
    """
    Tests assoc_space's ability to recognize word correlation. This function
    computes the Spearman correlation between assoc_space's reported word
    correlation and the expected word correlation according to 'standard'.
    """
    gold_scores = []
    our_scores = []

    for term1, term2, gold_score in standard:
        uri1 = standardized_uri(language, term1)
        uri2 = standardized_uri(language, term2)
        if isinstance(vectors, VectorSpaceWrapper):
            our_score = vectors.get_similarity(uri1, uri2)
        else:
            our_score = cosine_similarity(get_vector(vectors, uri1),
                                          get_vector(vectors, uri2))
        if verbose > 1:
            print('%s\t%s\t%3.3f\t%3.3f' %
                  (term1, term2, gold_score, our_score))
        gold_scores.append(gold_score)
        our_scores.append(our_score)

    correlation = spearmanr(np.array(gold_scores), np.array(our_scores))[0]

    if verbose:
        print("Spearman correlation: %s" % (correlation, ))

    return confidence_interval(correlation, len(gold_scores))
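
confidence_interval() is not defined in the example. A plausible sketch using the standard Fisher z-transform for a 95% interval around a correlation; the Series labels are an assumption:

import numpy as np
import pandas as pd

def confidence_interval(rho, n):
    # Fisher z-transform of the correlation; the normal-approximation interval
    # has half-width 1.96 / sqrt(n - 3) and is mapped back with tanh.
    z = np.arctanh(rho)
    interval = 1.96 / np.sqrt(n - 3)
    return pd.Series([rho, np.tanh(z - interval), np.tanh(z + interval)],
                     index=['acc', 'low', 'high'])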
Example #3
def spearman_evaluate(vectors, standard, language='en', verbose=0):
    """
    Tests assoc_space's ability to recognize word correlation. This function
    computes the Spearman correlation between assoc_space's reported word
    correlation and the expected word correlation according to 'standard'.
    """
    gold_scores = []
    our_scores = []

    for term1, term2, gold_score in standard:
        uri1 = standardized_uri(language, term1)
        uri2 = standardized_uri(language, term2)
        if isinstance(vectors, VectorSpaceWrapper):
            our_score = vectors.get_similarity(uri1, uri2)
        else:
            our_score = cosine_similarity(get_vector(vectors, uri1), get_vector(vectors, uri2))
        if verbose > 1:
            print('%s\t%s\t%3.3f\t%3.3f' % (term1, term2, gold_score, our_score))
        gold_scores.append(gold_score)
        our_scores.append(our_score)

    correlation = spearmanr(np.array(gold_scores), np.array(our_scores))[0]

    if verbose:
        print("Spearman correlation: %s" % (correlation,))

    return confidence_interval(correlation, len(gold_scores))
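
standardized_uri() is used throughout these examples to turn a plain term into a ConceptNet-style URI. A rough sketch of its observable behavior (the real helper applies more text normalization than this):

def standardized_uri(language, term):
    # Terms that already look like URIs pass through; otherwise lowercase,
    # replace spaces with underscores, and build a '/c/<language>/<term>' URI.
    if term.startswith('/'):
        return term
    return '/c/{}/{}'.format(language, term.lower().replace(' ', '_'))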
Example #4
def measure_bias(frame):
    """
    Return a DataFrame that measures biases in a semantic space, on four
    data sets:

    - Gender
    - Fine-grained ethnicity
    - Coarse-grained ethnicity
    - Religious beliefs
    """
    gender_binary_axis = normalize_vec(
        get_category_axis(frame, FEMALE_WORDS) -
        get_category_axis(frame, MALE_WORDS))
    gender_bias_numbers = []
    for female_biased_word, male_biased_word in GENDER_BIAS_PAIRS:
        female_biased_uri = standardized_uri('en', female_biased_word)
        male_biased_uri = standardized_uri('en', male_biased_word)
        diff = normalize_vec(
            get_vector(frame, female_biased_uri) -
            get_vector(frame, male_biased_uri)).dot(gender_binary_axis)
        gender_bias_numbers.append(diff)

    mean = np.mean(gender_bias_numbers)
    sem = scipy.stats.sem(gender_bias_numbers)
    gender_bias = pd.Series([mean, mean - sem * 2, mean + sem * 2],
                            index=['bias', 'low', 'high'])

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_ETHNICITY)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    fine_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, COARSE_ETHNICITY_TERMS)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    coarse_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = pd.DataFrame(
        np.vstack(
            [get_category_axis(frame, names) for names in ETHNIC_NAME_SETS]))
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    name_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_BELIEF)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, BELIEF_STEREOTYPE_TERMS)
    belief_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    return pd.DataFrame({
        'gender': gender_bias,
        'ethnicity-fine': fine_ethnic_bias,
        'ethnicity-coarse': coarse_ethnic_bias,
        'ethnicity-names': name_ethnic_bias,
        'beliefs': belief_bias
    }).T
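
normalize_vec() and get_category_axis() are not shown above. A plausible sketch, assuming a category axis is simply the mean vector of the category's words:

import numpy as np

def normalize_vec(vec):
    # Scale a vector to unit length.
    return vec / np.linalg.norm(vec)

def get_category_axis(frame, words, language='en'):
    # Mean vector of a word list such as FEMALE_WORDS; measure_bias() then
    # normalizes the difference between two such axes to get a bias direction.
    return np.mean([get_vector(frame, word, language) for word in words], axis=0)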
Example #5
def test_get_vector(frame=None):
    """
    Check that get_vector() returns the same vector for a label given as a full URI
    ('/c/en/cat') and as a plain term plus a language ('cat', 'en').
    """
    if frame is not None:
        vectors = load_any_embeddings(frame)
        ok_(get_vector(vectors, '/c/en/cat').equals(get_vector(vectors, 'cat', 'en')))

    vectors = load_any_embeddings(DATA + '/vectors/glove12-840B.h5')
    ok_(get_vector(vectors, '/c/en/cat').equals(get_vector(vectors, 'cat', 'en')))
Example #6
def test_get_vector():
    ok_(get_vector(TEST_FRAME, '/c/en/cat').equals(get_vector(TEST_FRAME, 'cat', 'en')))
Example #7
def test_get_vector(simple_frame):
    assert get_vector(simple_frame, '/c/en/cat').equals(get_vector(simple_frame, 'cat', 'en'))
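
The get_vector() tests above all assert that a full URI and a (term, language) pair resolve to the same row. A minimal sketch of a get_vector() that would satisfy them; the URI scheme and the zero-vector fallback are assumptions, not the library's actual implementation:

import pandas as pd

def get_vector(frame, label, language=None):
    # Build a ConceptNet-style URI when a plain term (not already a URI) is given.
    if language is not None and not label.startswith('/'):
        label = '/c/{}/{}'.format(language, label.lower().replace(' ', '_'))
    if label in frame.index:
        return frame.loc[label]
    # Assumed fallback: an all-zero vector for out-of-vocabulary labels.
    return pd.Series(0.0, index=frame.columns)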
Example #8
def analogy_func(frame, a1, b1, a2):
    return get_vector(frame, b1) - get_vector(frame, a1) + get_vector(frame, a2)
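
Hypothetical usage of analogy_func() for the classic "a1 is to b1 as a2 is to ?" query, assuming frame holds L2-normalized row vectors so a dot product ranks candidates by cosine similarity:

# man : king :: woman : ?
guess = analogy_func(frame, '/c/en/man', '/c/en/king', '/c/en/woman')
# Rank every term in the frame by similarity to the analogy vector.
best_matches = frame.dot(guess).sort_values(ascending=False).head(5)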