Example #1
def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
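A minimal driver for this evaluation, assuming a term-vector frame has already been built (load_hdf and the filename follow the _load_vectors example further down; both are assumptions, not part of this function):

from conceptnet5.vectors.formats import load_hdf

frame = load_hdf('numberbatch.h5')  # assumed path to a term-vector frame
analogy_scores = eval_analogies(frame)
print(analogy_scores)  # accuracy plus its confidence bounds: acc, low, high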
class AttributeFeatureClassifier(AttributeClassifier):
    # The class statement was lost from this excerpt; the name above is
    # hypothetical, and the base class mirrors RelatednessClassifier below.
    def __init__(self, ablate=()):
        self.wrap = VectorSpaceWrapper(
            get_external_data_filename('numberbatch-20180108-biased.h5'),
            use_db=False)
        self.cache = {}
        self.wp_db = None
        self.sme = None
        self.queries = None
        self.phrases = None
        self.svm = None
        self.ablate = ablate

        self.feature_methods = [
            self.direct_relatedness_features, self.sme_features,
            self.wikipedia_relatedness_features,
            self.wordnet_relatedness_features, self.phrase_hit_features
        ]

        self.feature_names = [
            'ConceptNet vector relatedness',
            'SME: RelatedTo',
            'SME: (x IsA a)',
            'SME: (x HasA a)',
            'SME: (x PartOf a)',
            'SME: (x CapableOf a)',
            'SME: (x UsedFor a)',
            'SME: (x HasContext a)',
            'SME: (x HasProperty a)',
            'SME: (x AtLocation a)',
            'SME: (a PartOf x)',
            'SME: (a AtLocation x)',
            'Wikipedia lead sections',
            'WordNet relatedness',
            'Google Ngrams',
        ]
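Nothing in the excerpt shows how feature_methods is consumed, but the natural reading is that each method contributes columns to a feature matrix unless it has been ablated. A sketch of that glue, under the assumptions that each feature method returns an (n_examples, n_columns) array and that self.ablate holds names of methods to skip (extract_features itself is a hypothetical name):

import numpy as np

def extract_features(self, examples, mode):
    # Concatenate the columns produced by each non-ablated feature method
    columns = [
        method(examples, mode)
        for method in self.feature_methods
        if method.__name__ not in self.ablate
    ]
    return np.hstack(columns)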
Example #3
def evaluate(frame, subset='val'):
    """
    Evaluate a DataFrame containing term vectors on its ability to pick the
    right ending of a story in the Story Cloze Test data. Use a
    VectorSpaceWrapper to fill missing vocabulary from ConceptNet.

    Return a Series containing the accuracy and its confidence interval.
    """
    # Make subset names consistent with other datasets
    if subset == 'dev':
        subset = 'val'
    elif subset == 'all':
        # for the final evaluation, use just the test data
        subset = 'test'
    filename = get_support_data_filename('story-cloze/cloze_test_spring2016_%s.tsv' % subset)
    vectors = VectorSpaceWrapper(frame=frame)
    total = 0
    correct = 0
    for sentences, answers in read_cloze(filename):
        text = ' '.join(sentences)
        right_answer, wrong_answer = answers
        probe_vec = vectors.text_to_vector('en', text)
        right_vec = vectors.text_to_vector('en', right_answer)
        wrong_vec = vectors.text_to_vector('en', wrong_answer)

        right_sim = cosine_similarity(probe_vec, right_vec)
        wrong_sim = cosine_similarity(probe_vec, wrong_vec)
        if right_sim > wrong_sim:
            correct += 1
        total += 1
        # print("%+4.2f %s / %s / %s" % (right_sim - wrong_sim, text, right_answer, wrong_answer))
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
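The low and high bounds come from proportion_confint, presumably statsmodels' binomial confidence interval, so the result reads as an accuracy with a roughly 95% interval. A hypothetical call:

result = evaluate(frame, subset='val')
print(result['acc'], result['low'], result['high'])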
def test_missing_language():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()

    # The frame contains no Esperanto, of course, so the out-of-vocabulary
    # mechanism will fail. We should simply get no results, not crash.
    similarity = vectors.similar_terms('/c/eo/ekzemplo')
    eq_(len(similarity), 0)
def test_match_prefix():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    term = '/c/en/figure_skate'
    expected_prefix_matches = [('/c/en/figure', 0.0033333333333333335),
                               ('/c/en/figure skater', 0.0033333333333333335),
                               ('/c/en/figure skating', 0.0033333333333333335)]
    prefix_matches = vectors._match_prefix(term=term, prefix_weight=0.01)
    eq_(expected_prefix_matches, prefix_matches)
def test_cache_with_oov():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    # check the vector of all zeros is returned if the term is not present
    ok_(not vectors.get_vector('/c/en/test', oov_vector=False).any())

    # If oov_vector=True, the neighbor of 'test' in ConceptNet ('trial')
    # will be used to approximate its vector
    ok_(vectors.get_vector('/c/en/test', oov_vector=True).any())
Example #10
def test_expand_terms(multi_ling_frame):
    vectors = VectorSpaceWrapper(frame=multi_ling_frame)
    vectors.load()
    term = [('/c/en/ski_jumper', 1.0)]
    expanded_terms = vectors.expand_terms(terms=term, oov_vector=True)

    expected_expanded_terms = [
        ('/c/en/ski_jumper', 0.9900990099009901),
        ('/c/en/ski_jumping', 0.009900990099009901),
    ]
    assert expected_expanded_terms == expanded_terms
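The expected weights are consistent with normalizing the raw weights by their sum, with the prefix match entering at prefix_weight = 0.01. A quick check of that reading:

# 1 / (1 + 0.01) and 0.01 / (1 + 0.01), i.e. sum-normalized raw weights
assert abs(1 / 1.01 - 0.9900990099009901) < 1e-12
assert abs(0.01 / 1.01 - 0.009900990099009901) < 1e-12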
Example #11
def test_match_prefix(simple_frame):
    vectors = VectorSpaceWrapper(frame=simple_frame)
    vectors.load()
    term = '/c/en/figure_skate'
    expected_prefix_matches = [
        ('/c/en/figure', 0.0033333333333333335),
        ('/c/en/figure skater', 0.0033333333333333335),
        ('/c/en/figure skating', 0.0033333333333333335),
    ]
    prefix_matches = vectors._match_prefix(term=term, prefix_weight=0.01)
    assert expected_prefix_matches == prefix_matches
def test_lookup_neighbors():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    term = '/c/pl/skoki_narciarskie'
    neighbors = vectors._find_neighbors(term=term,
                                        limit_per_term=10,
                                        weight=1.0)
    expected_neighbors = [
        ('/c/en/ski_jumping', 0.02),
        ('http://pl.dbpedia.org/resource/Skoki_narciarskie', 0.01),
        ('/c/en/ski_jumping', 0.01),
        ('/c/en/ski_jumping', 0.005),
    ]
    eq_(expected_neighbors, neighbors)
def test_similar_terms_filter():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    ok_(
        '/c/pl/kombinacja'
        in vectors.similar_terms('/c/en/nordic_combined', filter='/c/pl', limit=1).index
    )

    ok_(
        '/c/en/present'
        in vectors.similar_terms('/c/en/gift', filter='/c/en/present', limit=1).index
    )
def test_expand_terms():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    term = [('/c/en/ski_jumper', 1.0)]
    expanded_terms = vectors.expand_terms(terms=term, limit_per_term=2, oov_vector=True)

    expected_expanded_terms = [
        ('/c/en/ski_jumper', 0.9523809523809523),
        ('/c/pt/saltadores_de_esqui', 0.019047619047619046),
        ('/c/pl/skoczek_narciarski', 0.019047619047619046),
        ('/c/en/ski_jumping', 0.009523809523809523),
    ]
    eq_(expected_expanded_terms, expanded_terms)
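The same sum-normalization reading fits here: raw weights of 1.0 for the term itself, 0.02 for each of the two neighbors, and 0.01 for the prefix match sum to 1.05. Checking the arithmetic:

assert abs(1 / 1.05 - 0.9523809523809523) < 1e-12
assert abs(0.02 / 1.05 - 0.019047619047619046) < 1e-12
assert abs(0.01 / 1.05 - 0.009523809523809523) < 1e-12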
def test_lookup_neighbors():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    term = '/c/pl/skoki_narciarskie'
    neighbors = vectors._find_neighbors(term=term, limit_per_term=10, weight=1.0)
    expected_neighbors = {
        ('/c/en/ski_jumping', 0.02),
        ('/c/en/ski_jumping', 0.01),
        ('http://pl.dbpedia.org/resource/Skoki_narciarskie', 0.01),
        ('/c/de/skispringen', 0.01),
        ('/c/en/ski_jumping', 0.005),
    }
    eq_(expected_neighbors, set(neighbors))
def test_load():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    ok_(vectors.frame is not None)
    ok_(vectors.small_frame is not None)
    ok_(all(label.startswith('/c/en/') for label in vectors.frame.index))
    ok_(vectors.frame.index.is_monotonic_increasing)
    ok_(vectors.small_frame.shape[1] <= 100)
    ok_(vectors._trie is not None)

    # test there are no transformations to raw terms other than adding the
    # English tag
    ok_('/c/en/figure skater' in vectors.frame.index)  # no underscore
    ok_('/c/en/Island' in vectors.frame.index)  # no case folding
Example #19
def test_load(simple_frame):
    vectors = VectorSpaceWrapper(frame=simple_frame)
    vectors.load()
    assert vectors.frame is not None
    assert vectors.small_frame is not None
    assert all(label.startswith('/c/en/') for label in vectors.frame.index)
    assert vectors.frame.index.is_monotonic_increasing
    assert vectors.small_frame.shape[1] <= 100
    assert vectors._trie is not None

    # test there are no transformations to raw terms other than adding the
    # English tag
    assert '/c/en/figure skater' in vectors.frame.index  # no underscore
    assert '/c/en/Island' in vectors.frame.index  # no case folding
def test_expand_terms():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    term = [('/c/en/ski_jumper', 1.0)]
    expanded_terms = vectors.expand_terms(terms=term,
                                          limit_per_term=2,
                                          oov_vector=True)

    # /c/en/bounder and /c/en/skier from neighbor search
    # /c/en/ski_jumping from prefix match
    expected_expanded_terms = [('/c/en/ski_jumper', 0.9523809523809523),
                               ('/c/en/bounder', 0.019047619047619046),
                               ('/c/en/skier', 0.019047619047619046),
                               ('/c/en/ski_jumping', 0.009523809523809523)]
    eq_(expected_expanded_terms, expanded_terms)
Example #22
def eval_pairwise_analogies(frame,
                            eval_filename,
                            subset='all',
                            weight_direct=0.35,
                            weight_transpose=0.65):
    total = 0
    correct = 0
    wrap = VectorSpaceWrapper(frame=frame)
    for idx, (prompt, choices, answer) in enumerate(
            read_turney_analogies(eval_filename)):
        # Enable an artificial dev/test split: even-indexed items serve as
        # 'dev', odd-indexed items as the held-out test set
        if subset == 'all' or (subset == 'dev') == (idx % 2 == 0):
            a1, b1 = prompt
            choice_values = []
            for choice in choices:
                a2, b2 = choice
                choice_values.append(
                    pairwise_analogy_func(wrap, a1, b1, a2, b2, weight_direct,
                                          weight_transpose))
            our_answer = np.argmax(choice_values)
            if our_answer == answer:
                correct += 1
            total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high],
                     index=['acc', 'low', 'high'])
class RelatednessClassifier(AttributeClassifier):
    def __init__(self):
        self.wrap = VectorSpaceWrapper(
            get_external_data_filename('numberbatch-20180108-biased.h5'),
            use_db=False)
        self.cache = {}

    def get_vector(self, uri):
        if uri in self.cache:
            return self.cache[uri]
        else:
            vec = normalize_vec(self.wrap.get_vector(uri))
            self.cache[uri] = vec
            return vec

    def get_similarity(self, uri1, uri2):
        return self.get_vector(uri1).dot(self.get_vector(uri2))

    def direct_relatedness(self, example):
        match1 = max(0, self.get_similarity(example.node1(),
                                            example.att_node()))**0.5
        match2 = max(0, self.get_similarity(example.node2(),
                                            example.att_node()))**0.5
        return match1 - match2

    def classify(self, examples, mode):
        return np.array(
            [self.direct_relatedness(example) > .1 for example in examples])
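A sketch of how this classifier might be exercised; the FakeExample stand-in is hypothetical and mirrors only the three accessors used above, and the Numberbatch file referenced in __init__ must be present for construction to succeed:

class FakeExample:
    def __init__(self, node1, node2, att):
        self._n1, self._n2, self._att = node1, node2, att
    def node1(self): return self._n1
    def node2(self): return self._n2
    def att_node(self): return self._att

clf = RelatednessClassifier()
example = FakeExample('/c/en/banana', '/c/en/cucumber', '/c/en/yellow')
# True where the attribute is more related to node1 than node2 by margin 0.1
print(clf.classify([example], mode='test'))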
Example #24
def evaluate(frame, subset='dev'):
    """
    Evaluate a DataFrame containing term vectors on its ability to predict term
    relatedness, according to MEN-3000, RW, MTurk-771, and WordSim-353. Use a
    VectorSpaceWrapper to fill missing vocabulary from ConceptNet.

    Return a Series containing these labeled results.
    """
    if subset == 'all':
        men_subset = 'test'
    else:
        men_subset = subset
    vectors = VectorSpaceWrapper(frame=frame)
    men_score = spearman_evaluate(vectors, read_men3000(men_subset))
    rw_score = spearman_evaluate(vectors, read_rw(subset))
    mturk_score = spearman_evaluate(vectors, read_mturk())
    ws_score = spearman_evaluate(vectors, read_ws353())
    ws_es_score = spearman_evaluate(vectors,
                                    read_ws353_multilingual('es'),
                                    language='es')
    ws_ro_score = spearman_evaluate(vectors,
                                    read_ws353_multilingual('ro'),
                                    language='ro')
    results = empty_comparison_table()
    results.loc['men3000'] = men_score
    results.loc['rw'] = rw_score
    results.loc['mturk'] = mturk_score
    results.loc['ws353'] = ws_score
    results.loc['ws353-es'] = ws_es_score
    results.loc['ws353-ro'] = ws_ro_score
    return results
Example #25
def evaluate(frame, subset='dev', semeval_scope='global'):
    """
    Evaluate a DataFrame containing term vectors on its ability to predict term
    relatedness, according to MEN-3000, RW, MTurk-771, WordSim-353, and Semeval2017-Task2. Use a
    VectorSpaceWrapper to fill missing vocabulary from ConceptNet.

    Return a Series containing these labeled results.
    """
    if subset == 'all':
        men_subset = 'test'
    else:
        men_subset = subset

    vectors = VectorSpaceWrapper(frame=frame)

    men_score = measure_correlation(spearmanr, vectors, read_men3000(men_subset))
    rw_score = measure_correlation(spearmanr, vectors, read_rw(subset))
    mturk_score = measure_correlation(spearmanr, vectors, read_mturk())
    simlex_score = measure_correlation(spearmanr, vectors, read_simlex())
    gur350_score = measure_correlation(spearmanr, vectors, read_gurevych('350'))
    zg222_score = measure_correlation(spearmanr, vectors, read_gurevych('222'))
    ws_score = measure_correlation(spearmanr, vectors, read_ws353())
    ws_es_score = measure_correlation(spearmanr, vectors, read_ws353_multilingual('es'))
    ws_ro_score = measure_correlation(spearmanr, vectors, read_ws353_multilingual('ro'))
    pku500_score = measure_correlation(spearmanr, vectors, read_pku500())
    jsim_score = measure_correlation(spearmanr, vectors, read_jsim())

    results = empty_comparison_table()
    results.loc['men3000'] = men_score
    results.loc['rw'] = rw_score
    results.loc['mturk'] = mturk_score
    results.loc['simlex'] = simlex_score
    results.loc['gur350-de'] = gur350_score
    results.loc['zg222-de'] = zg222_score
    results.loc['ws353'] = ws_score
    results.loc['ws353-es'] = ws_es_score
    results.loc['ws353-ro'] = ws_ro_score
    results.loc['pku500-zh'] = pku500_score
    results.loc['jsim-ja'] = jsim_score

    if semeval_scope == 'global':
        results.loc['semeval17-2a'] = evaluate_semeval_monolingual_global(vectors)
        results.loc['semeval17-2b'] = evaluate_semeval_crosslingual_global(vectors)

    else:
        languages = ['en', 'de', 'es', 'it', 'fa']

        for lang in languages:
            results.loc['semeval-2a-{}'.format(lang)] = evaluate_semeval_monolingual(
                vectors, lang
            )

        for lang1, lang2 in combinations(languages, 2):
            results.loc[
                'semeval-2b-{}-{}'.format(lang1, lang2)
            ] = evaluate_semeval_crosslingual(vectors, lang1, lang2)

    return results
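Any semeval_scope other than 'global' switches to the per-language breakdown. A hypothetical invocation and lookup (the row labels follow the format strings above):

results = evaluate(frame, subset='dev', semeval_scope='per-language')
print(results.loc['semeval-2a-en'])
print(results.loc['semeval-2b-en-de'])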
Example #26
def test_vector_space_wrapper_filter():
    wrap = VectorSpaceWrapper(frame=TEST_FRAME)
    wrap.load()
    ok_('/c/pl/kombinacja' in wrap.similar_terms('/c/en/nordic_combined', filter='/c/pl',
                                                 limit=1).index)

    ok_('/c/en/present' in wrap.similar_terms('/c/en/gift', filter='/c/en/present', limit=1).index)
Example #28
def test_similar_terms(simple_frame):
    """
    Check if VectorSpaceWrapper's index is sorted and its elements are concepts.
    """
    vectors = VectorSpaceWrapper(frame=simple_frame)
    vectors.load()
    assert ('/c/en/figure skating'
            in vectors.similar_terms('/c/en/figure skating', limit=3).index)
    assert ('/c/en/figure skater'
            in vectors.similar_terms('/c/en/figure skating', limit=3).index)
    assert ('/c/en/figure' in vectors.similar_terms('/c/en/figure skating',
                                                    limit=3).index)
def test_similar_terms():
    """
    Check if VectorSpaceWrapper's index is sorted and its elements are concepts.
    """
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    ok_('/c/en/figure skating' in vectors.similar_terms('/c/en/figure skating',
                                                        limit=3).index)
    ok_('/c/en/figure skater' in vectors.similar_terms('/c/en/figure skating',
                                                       limit=3).index)
    ok_('/c/en/figure' in vectors.similar_terms('/c/en/figure skating',
                                                limit=3).index)
Example #31
def test_vector_space_wrapper():
    """
    Check if VectorSpaceWrapper's index is sorted and its elements are concepts.
    """
    wrap = VectorSpaceWrapper(frame=TEST_FRAME)
    wrap.load()
    ok_(all(is_term(label) for label in wrap.frame.index))
    ok_(wrap.frame.index.is_monotonic_increasing)

    # test there are no transformations to raw terms other than adding the
    # English tag
    ok_('/c/en/figure skater' in wrap.frame.index)  # no underscore
    ok_('/c/en/Island' in wrap.frame.index)  # no case folding

    # test index_prefix_range
    ok_(wrap.index_prefix_range('/c/en/figure') == (3, 6))
    ok_(wrap.index_prefix_range('/c/en/skating') == (0, 0))

    # test_similar_terms
    ok_('/c/en/figure skating' in wrap.similar_terms('/c/en/figure skating', limit=3).index)
    ok_('/c/en/figure skater' in wrap.similar_terms('/c/en/figure skating', limit=3).index)
    ok_('/c/en/figure' in wrap.similar_terms('/c/en/figure skating', limit=3).index)
Example #32
def measure_bias(frame):
    """
    Return a DataFrame that measures biases in a semantic space, on four
    data sets:

    - Gender
    - Fine-grained ethnicity
    - Coarse-grained ethnicity
    - Religious beliefs
    """
    vsw = VectorSpaceWrapper(frame=frame)
    vsw.load()

    gender_binary_axis = normalize_vec(
        get_category_axis(frame, FEMALE_WORDS) -
        get_category_axis(frame, MALE_WORDS))
    gender_bias_numbers = []
    for female_biased_word, male_biased_word in GENDER_BIAS_PAIRS:
        female_biased_uri = standardized_uri('en', female_biased_word)
        male_biased_uri = standardized_uri('en', male_biased_word)
        diff = normalize_vec(
            vsw.get_vector(female_biased_uri) -
            vsw.get_vector(male_biased_uri)).dot(gender_binary_axis)
        gender_bias_numbers.append(diff)

    mean = np.mean(gender_bias_numbers)
    sem = scipy.stats.sem(gender_bias_numbers)
    gender_bias = pd.Series([mean, mean - sem * 2, mean + sem * 2],
                            index=['bias', 'low', 'high'])

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_ETHNICITY)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    fine_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, COARSE_ETHNICITY_TERMS)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    coarse_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = pd.DataFrame(
        np.vstack(
            [get_category_axis(frame, names) for names in ETHNIC_NAME_SETS]))
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    name_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_BELIEF)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, BELIEF_STEREOTYPE_TERMS)
    belief_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    return pd.DataFrame({
        'gender': gender_bias,
        'ethnicity-fine': fine_ethnic_bias,
        'ethnicity-coarse': coarse_ethnic_bias,
        'ethnicity-names': name_ethnic_bias,
        'beliefs': belief_bias
    }).T
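The gender 'low' and 'high' bounds above are the mean plus or minus two standard errors, roughly a 95% interval. Reading the table might look like this (hypothetical):

bias_table = measure_bias(frame)
print(bias_table)                          # one row per bias measurement
print(bias_table.loc['gender', 'bias'])    # the mean gender-bias projection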
Example #33
def _load_vectors():
    frame = load_hdf(resource_filename('codenames', 'data/mini.h5'))
    selections = [
        label for label in frame.index
        if label.startswith('/c/en/') and '_' not in label and '#' not in label
        and wordfreq.zipf_frequency(label[6:], 'en') > 3.0
    ]
    # Make sure all the words in Codenames are represented
    wordlist = [
        standardized_uri('en', line.strip()) for line in open(
            resource_filename('codenames', 'data/codenames-words.txt'))
    ]
    additions = [word for word in wordlist if word not in selections]
    selections += additions
    frame = l2_normalize_rows(frame.loc[selections].astype('f'))
    return VectorSpaceWrapper(frame=frame)
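Presumably the returned wrapper is then queried like the ones in the tests above; a hypothetical use with a common Codenames word:

wrapper = _load_vectors()
print(wrapper.similar_terms(standardized_uri('en', 'spy'), limit=5))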
Example #34
def test_similar_terms_filter(multi_ling_frame):
    vectors = VectorSpaceWrapper(frame=multi_ling_frame)
    vectors.load()
    assert ('/c/pl/kombinacja'
            in vectors.similar_terms('/c/en/nordic_combined',
                                     filter='/c/pl',
                                     limit=1).index)

    assert ('/c/en/present' in vectors.similar_terms('/c/en/gift',
                                                     filter='/c/en/present',
                                                     limit=1).index)
Example #36
from conceptnet5.vectors.query import VectorSpaceWrapper
from conceptnet5.query import field_match, VALID_KEYS
from conceptnet5.db.query import MAX_GROUP_SIZE
from conceptnet5.nodes import standardized_concept_uri
from conceptnet5.nodes import ld_node
import itertools

VECTORS = VectorSpaceWrapper()
FINDER = VECTORS.finder
VALID_KEYS = VALID_KEYS  # re-export
CONTEXT = [
    "http://api.conceptnet.io/ld/conceptnet5.5/context.ld.json",
    "http://api.conceptnet.io/ld/conceptnet5.5/pagination.ld.json"
]


def success(response):
    response['@context'] = CONTEXT
    return response


def error(response, status, details):
    response['@context'] = CONTEXT
    response['error'] = {'status': status, 'details': details}
    return response


def make_query_url(url, items):
    str_items = ['{}={}'.format(*item) for item in items]
    if not str_items:
        return url
    else:
        # The excerpt was truncated here; joining with '&' is the natural
        # completion for assembling the query string.
        return url + '?' + '&'.join(str_items)
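A quick check of the completed helper with hypothetical values:

url = make_query_url('http://api.conceptnet.io/query',
                     [('node', '/c/en/example'), ('limit', 10)])
print(url)  # http://api.conceptnet.io/query?node=/c/en/example&limit=10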
Example #37
def evaluate(frame,
             analogy_filename,
             subset='test',
             tune_analogies=True,
             scope='global',
             google_vocab_size=200000):
    """
    Run SAT and Semeval12-2 evaluations.

    Required parameters:
      frame
          a DataFrame containing term vectors
      analogy_filename
          the filename of Turney's SAT evaluation data

    Optional parameters:
      subset (string, default 'test')
          the subset of the data to evaluate on, either 'test' or 'dev'
      tune_analogies (boolean, default True)
          tune the weights in eval_pairwise_analogies()
      scope (string, default 'global')
          'global' to get the average of the results across all subclasses of
          semeval12-2, or any other value to get the results broken down by
          subclass (1a, 1b, etc.)
      google_vocab_size (int, default 200000)
          the vocabulary size to use for the Google analogies evaluation
    """
    vectors = VectorSpaceWrapper(frame=frame)
    results = empty_comparison_table()

    if tune_analogies:
        sat_weights = optimize_weights(eval_pairwise_analogies, vectors,
                                       analogy_filename)
        semeval_weights = optimize_weights(eval_semeval2012_global, vectors)
    else:
        sat_weights = (0.35, 0.65)
        semeval_weights = (0.3, 0.35)

    sat_results = eval_pairwise_analogies(vectors, analogy_filename,
                                          sat_weights[0], sat_weights[1],
                                          subset)
    results.loc['sat-analogies'] = sat_results

    for gsubset in ['semantic', 'syntactic']:
        google_results = eval_google_analogies(vectors,
                                               subset=gsubset,
                                               vocab_size=google_vocab_size)
        results.loc['google-%s' % gsubset] = google_results

    # There's no meaningful "all" subset for semeval12, because the dev and
    # test data are stored entirely separately. Just use "test".
    if subset == 'dev':
        semeval12_subset = 'dev'
    else:
        semeval12_subset = 'test'
    if scope == 'global':
        maxdiff_score, spearman_score = eval_semeval2012_global(
            vectors, semeval_weights[0], semeval_weights[1], semeval12_subset)
        results.loc['semeval12-spearman'] = spearman_score
        results.loc['semeval12-maxdiff'] = maxdiff_score
    else:
        for subclass in product(range(1, 11), 'abcdefghij'):
            subclass = ''.join([str(element) for element in subclass])
            try:
                maxdiff_score, spearman_score = eval_semeval2012_analogies(
                    vectors, semeval_weights[0], semeval_weights[1],
                    semeval12_subset, subclass)
                results.loc['semeval12-{}-spearman'.format(
                    subclass)] = spearman_score
                results.loc['semeval12-{}-maxdiff'.format(
                    subclass)] = maxdiff_score
            except FileNotFoundError:
                continue

    bats_results = []
    for category in product('DEIL', range(1, 11)):
        category = ''.join([str(element) for element in category])
        quads = read_bats(category)
        category_results = eval_open_vocab_analogies(vectors, quads)
        bats_results.append((category, category_results))

    if scope == 'global':
        average_scores = []
        for interval in ['acc', 'low', 'high']:
            average_scores.append(
                np.mean([result[interval] for name, result in bats_results]))
        results.loc['bats'] = pd.Series(average_scores,
                                        index=['acc', 'low', 'high'])
    else:
        for name, result in bats_results:
            results.loc['bats-{}'.format(''.join(name))] = result

    return results
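A hypothetical invocation of the full analogy suite; the SAT analogy data is distributed separately by its author, so the filename below is a placeholder:

results = evaluate(frame,
                   analogy_filename='path/to/SAT-package-V3.txt',
                   subset='dev',
                   tune_analogies=False,
                   scope='global')
print(results.loc['sat-analogies'])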
def test_index_prefix_range():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    eq_(vectors._index_prefix_range('/c/en/figure'), (3, 6))
    eq_(vectors._index_prefix_range('/c/en/skating'), (0, 0))
def test_englishify():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    eq_(vectors._englishify('/c/sv/harry_potter'), '/c/en/harry_potter')
Example #41
def test_cache_with_oov(multi_ling_frame):
    vectors = VectorSpaceWrapper(frame=multi_ling_frame)
    vectors.load()
    # check the vector of all zeros is returned if the term is not present
    assert not vectors.get_vector('/c/en/test', oov_vector=False).any()