def test_vector_space_wrapper_filter():
    """
    With a `filter` prefix and `limit=1`, `similar_terms` must return the
    expected term in its result index.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    # Restrict the results to terms under the Polish prefix.
    polish_results = wrapper.similar_terms(
        '/c/en/nordic_combined', filter='/c/pl', limit=1
    )
    ok_('/c/pl/kombinacja' in polish_results.index)

    # The filter may also be a complete term rather than a language prefix.
    present_results = wrapper.similar_terms(
        '/c/en/gift', filter='/c/en/present', limit=1
    )
    ok_('/c/en/present' in present_results.index)
def test_missing_language():
    """
    Querying a term from a language that the frame lacks gives an empty
    result.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    # The frame holds no Esperanto terms, so the out-of-vocabulary mechanism
    # has nothing to fall back on. The call should return no results rather
    # than crash.
    results = wrapper.similar_terms('/c/eo/ekzemplo')
    eq_(len(results), 0)
def test_similar_terms_filter():
    """
    `similar_terms` with a `filter` and `limit=1` returns the expected term.
    """
    vsw = VectorSpaceWrapper(frame=TEST_FRAME)
    vsw.load()

    # Each case is (query term, filter prefix, term expected in the result).
    cases = (
        ('/c/en/nordic_combined', '/c/pl', '/c/pl/kombinacja'),
        ('/c/en/gift', '/c/en/present', '/c/en/present'),
    )
    for query, prefix, expected in cases:
        found = vsw.similar_terms(query, filter=prefix, limit=1)
        ok_(expected in found.index)
def test_match_prefix():
    """
    `_match_prefix` returns the three '/c/en/figure...' terms, each with the
    same weight, for the term '/c/en/figure_skate'.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    query = '/c/en/figure_skate'
    # Every match carries the same weight (presumably prefix_weight split
    # evenly over the three matches -- confirm against _match_prefix).
    shared_weight = 0.0033333333333333335
    expected = [
        ('/c/en/figure', shared_weight),
        ('/c/en/figure skater', shared_weight),
        ('/c/en/figure skating', shared_weight),
    ]

    actual = wrapper._match_prefix(term=query, prefix_weight=0.01)
    eq_(expected, actual)
def test_cache_with_oov():
    """
    `get_vector` gives an all-zero vector for an absent term when
    `oov_vector=False` and a non-zero vector when `oov_vector=True`.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()
    absent_term = '/c/en/test'

    # Without the out-of-vocabulary fallback, a term that is not present
    # yields a vector of all zeros.
    plain_vector = wrapper.get_vector(absent_term, oov_vector=False)
    ok_(not plain_vector.any())

    # With oov_vector=True, the neighbor of 'test' in ConceptNet ('trial')
    # is expected to be used to approximate the vector, so it is non-zero.
    approximated_vector = wrapper.get_vector(absent_term, oov_vector=True)
    ok_(approximated_vector.any())
def measure_bias(frame):
    """
    Return a DataFrame that measures biases in a semantic space.

    `frame` holds the term vectors of the semantic space being measured.

    The result has one row per measurement:

    - 'gender': GENDER_BIAS_PAIRS projected onto a female-minus-male axis
    - 'ethnicity-fine': PEOPLE_BY_ETHNICITY against ETHNIC_STEREOTYPE_TERMS
    - 'ethnicity-coarse': COARSE_ETHNICITY_TERMS against
      ETHNIC_STEREOTYPE_TERMS
    - 'ethnicity-names': one category axis per entry of ETHNIC_NAME_SETS
      against ETHNIC_STEREOTYPE_TERMS
    - 'beliefs': PEOPLE_BY_BELIEF against BELIEF_STEREOTYPE_TERMS

    The 'gender' row has the columns 'bias', 'low' and 'high', where 'low'
    and 'high' are the mean bias minus and plus two standard errors. The
    other rows are whatever `correlation_bias` returns (presumably a Series
    with the same labels -- confirm against its definition).
    """
    vsw = VectorSpaceWrapper(frame=frame)
    vsw.load()

    # Normalized direction from the male word category to the female one.
    gender_binary_axis = normalize_vec(
        get_category_axis(frame, FEMALE_WORDS) - get_category_axis(frame, MALE_WORDS)
    )

    # Project each (female-biased, male-biased) word pair onto that axis.
    gender_bias_numbers = []
    for female_biased_word, male_biased_word in GENDER_BIAS_PAIRS:
        female_biased_uri = standardized_uri('en', female_biased_word)
        male_biased_uri = standardized_uri('en', male_biased_word)
        pair_direction = normalize_vec(
            vsw.get_vector(female_biased_uri) - vsw.get_vector(male_biased_uri)
        )
        gender_bias_numbers.append(pair_direction.dot(gender_binary_axis))

    mean = np.mean(gender_bias_numbers)
    sem = scipy.stats.sem(gender_bias_numbers)
    gender_bias = pd.Series(
        [mean, mean - sem * 2, mean + sem * 2], index=['bias', 'low', 'high']
    )

    # The ethnic stereotype vectors are shared by the three ethnicity
    # measurements, so they are looked up once instead of once per measurement.
    ethnic_stereotype_vecs = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)

    fine_ethnic_bias = correlation_bias(
        get_vocabulary_vectors(frame, PEOPLE_BY_ETHNICITY), ethnic_stereotype_vecs
    )
    coarse_ethnic_bias = correlation_bias(
        get_vocabulary_vectors(frame, COARSE_ETHNICITY_TERMS), ethnic_stereotype_vecs
    )

    # For names, each name set is reduced to a single category axis.
    name_axes = pd.DataFrame(
        np.vstack([get_category_axis(frame, names) for names in ETHNIC_NAME_SETS])
    )
    name_ethnic_bias = correlation_bias(name_axes, ethnic_stereotype_vecs)

    belief_bias = correlation_bias(
        get_vocabulary_vectors(frame, PEOPLE_BY_BELIEF),
        get_vocabulary_vectors(frame, BELIEF_STEREOTYPE_TERMS),
    )

    # The dict keys become columns; transposing turns them into rows.
    return pd.DataFrame({
        'gender': gender_bias,
        'ethnicity-fine': fine_ethnic_bias,
        'ethnicity-coarse': coarse_ethnic_bias,
        'ethnicity-names': name_ethnic_bias,
        'beliefs': belief_bias,
    }).T
def test_expand_terms(multi_ling_frame):
    """
    `expand_terms` with `oov_vector=True` turns a single weighted term into
    the expected list of weighted terms.
    """
    wrapper = VectorSpaceWrapper(frame=multi_ling_frame)
    wrapper.load()

    seed_terms = [('/c/en/ski_jumper', 1.0)]
    actual = wrapper.expand_terms(terms=seed_terms, oov_vector=True)

    # The original term keeps most of the weight; '/c/en/ski_jumping' gets
    # the remainder.
    expected = [
        ('/c/en/ski_jumper', 0.9900990099009901),
        ('/c/en/ski_jumping', 0.009900990099009901),
    ]
    assert expected == actual
def test_match_prefix():
    """
    `_match_prefix` on '/c/en/figure_skate' yields the three 'figure' terms
    with equal weights.
    """
    vsw = VectorSpaceWrapper(frame=TEST_FRAME)
    vsw.load()

    lookup_term = '/c/en/figure_skate'
    matched_uris = ('/c/en/figure', '/c/en/figure skater', '/c/en/figure skating')
    # All matches are expected to carry the same weight.
    wanted = [(uri, 0.0033333333333333335) for uri in matched_uris]

    got = vsw._match_prefix(term=lookup_term, prefix_weight=0.01)
    eq_(wanted, got)
def test_similar_terms_filter(multi_ling_frame):
    """
    `similar_terms` honours `filter`: the expected term is in the index of a
    single-result query.
    """
    wrapper = VectorSpaceWrapper(frame=multi_ling_frame)
    wrapper.load()

    # Filter by a language prefix.
    polish_index = wrapper.similar_terms(
        '/c/en/nordic_combined', filter='/c/pl', limit=1
    ).index
    assert '/c/pl/kombinacja' in polish_index

    # Filter by a complete term.
    present_index = wrapper.similar_terms(
        '/c/en/gift', filter='/c/en/present', limit=1
    ).index
    assert '/c/en/present' in present_index
def test_match_prefix(simple_frame):
    """
    `_match_prefix` returns the three '/c/en/figure...' terms, equally
    weighted, for '/c/en/figure_skate'.
    """
    wrapper = VectorSpaceWrapper(frame=simple_frame)
    wrapper.load()

    query = '/c/en/figure_skate'
    per_match_weight = 0.0033333333333333335
    expected = [
        ('/c/en/figure', per_match_weight),
        ('/c/en/figure skater', per_match_weight),
        ('/c/en/figure skating', per_match_weight),
    ]

    actual = wrapper._match_prefix(term=query, prefix_weight=0.01)
    assert expected == actual
def test_similar_terms():
    """
    The three 'figure' terms all appear among the top three terms similar to
    '/c/en/figure skating'.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    query = '/c/en/figure skating'
    # The query is repeated for every expected term, one assertion per term.
    for expected in ('/c/en/figure skating', '/c/en/figure skater', '/c/en/figure'):
        ok_(expected in wrapper.similar_terms(query, limit=3).index)
def test_lookup_neighbors():
    """
    `_find_neighbors` returns the expected (term, weight) pairs, in order,
    for a Polish term.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    query = '/c/pl/skoki_narciarskie'
    found = wrapper._find_neighbors(term=query, limit_per_term=10, weight=1.0)

    # The comparison is on lists, so both order and repeated terms matter.
    wanted = [
        ('/c/en/ski_jumping', 0.02),
        ('http://pl.dbpedia.org/resource/Skoki_narciarskie', 0.01),
        ('/c/en/ski_jumping', 0.01),
        ('/c/en/ski_jumping', 0.005),
    ]
    eq_(wanted, found)
def test_similar_terms_filter():
    """
    A filtered, single-result `similar_terms` query contains the expected
    term.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    def top_match_index(term, prefix):
        # Index of the single best match for `term` under `prefix`.
        return wrapper.similar_terms(term, filter=prefix, limit=1).index

    ok_('/c/pl/kombinacja' in top_match_index('/c/en/nordic_combined', '/c/pl'))
    ok_('/c/en/present' in top_match_index('/c/en/gift', '/c/en/present'))
def test_similar_terms(simple_frame):
    """
    The top three terms similar to '/c/en/figure skating' include all three
    'figure' terms.
    """
    wrapper = VectorSpaceWrapper(frame=simple_frame)
    wrapper.load()

    query = '/c/en/figure skating'
    expected_terms = ('/c/en/figure skating', '/c/en/figure skater', '/c/en/figure')
    # One query and one assertion per expected term.
    for expected in expected_terms:
        assert expected in wrapper.similar_terms(query, limit=3).index
def test_lookup_neighbors():
    """
    `_find_neighbors` returns the expected set of (term, weight) pairs for a
    Polish term.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    query = '/c/pl/skoki_narciarskie'
    found = wrapper._find_neighbors(term=query, limit_per_term=10, weight=1.0)

    # Compared as sets, so result order and exact duplicates are ignored.
    wanted = {
        ('/c/en/ski_jumping', 0.02),
        ('/c/en/ski_jumping', 0.01),
        ('http://pl.dbpedia.org/resource/Skoki_narciarskie', 0.01),
        ('/c/de/skispringen', 0.01),
        ('/c/en/ski_jumping', 0.005),
    }
    eq_(wanted, set(found))
def test_expand_terms():
    """
    `expand_terms` with `limit_per_term=2` and `oov_vector=True` expands a
    single weighted term into the expected weighted list.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    seed_terms = [('/c/en/ski_jumper', 1.0)]
    actual = wrapper.expand_terms(
        terms=seed_terms, limit_per_term=2, oov_vector=True
    )

    # The original term dominates; the other terms share the remaining weight.
    expected = [
        ('/c/en/ski_jumper', 0.9523809523809523),
        ('/c/pt/saltadores_de_esqui', 0.019047619047619046),
        ('/c/pl/skoczek_narciarski', 0.019047619047619046),
        ('/c/en/ski_jumping', 0.009523809523809523),
    ]
    eq_(expected, actual)
def test_load():
    """
    After `load()`, the wrapper exposes a sorted English-only frame, a small
    frame of at most 100 columns, and a trie.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    ok_(wrapper.frame is not None)
    ok_(wrapper.small_frame is not None)

    # Only read the index once the frame is known to exist.
    labels = wrapper.frame.index
    ok_(all(uri.startswith('/c/en/') for uri in labels))
    ok_(labels.is_monotonic_increasing)

    column_count = wrapper.small_frame.shape[1]
    ok_(column_count <= 100)
    ok_(wrapper._trie is not None)

    # Raw terms must not be transformed beyond receiving the English tag.
    ok_('/c/en/figure skater' in wrapper.frame.index)  # space kept, no underscore
    ok_('/c/en/Island' in wrapper.frame.index)  # capital kept, no case folding
def test_load(simple_frame):
    """
    After `load()`, the wrapper exposes a sorted English-only frame, a small
    frame of at most 100 columns, and a trie.
    """
    wrapper = VectorSpaceWrapper(frame=simple_frame)
    wrapper.load()

    assert wrapper.frame is not None
    assert wrapper.small_frame is not None

    # Only read the index once the frame is known to exist.
    labels = wrapper.frame.index
    assert all(uri.startswith('/c/en/') for uri in labels)
    assert labels.is_monotonic_increasing

    column_count = wrapper.small_frame.shape[1]
    assert column_count <= 100
    assert wrapper._trie is not None

    # Raw terms must not be transformed beyond receiving the English tag.
    assert '/c/en/figure skater' in wrapper.frame.index  # no underscore
    assert '/c/en/Island' in wrapper.frame.index  # no case folding
def test_expand_terms():
    """
    `expand_terms` combines neighbor-search and prefix-match results with the
    original term.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    seed_terms = [('/c/en/ski_jumper', 1.0)]
    actual = wrapper.expand_terms(
        terms=seed_terms, limit_per_term=2, oov_vector=True
    )

    expected = [
        ('/c/en/ski_jumper', 0.9523809523809523),
        # These two come from the neighbor search.
        ('/c/en/bounder', 0.019047619047619046),
        ('/c/en/skier', 0.019047619047619046),
        # This one comes from the prefix match.
        ('/c/en/ski_jumping', 0.009523809523809523),
    ]
    eq_(expected, actual)
def test_similar_terms():
    """
    Querying the three terms most similar to '/c/en/figure skating' returns
    each of the 'figure' terms.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    def top_three_index():
        # A fresh query for every assertion.
        return wrapper.similar_terms('/c/en/figure skating', limit=3).index

    ok_('/c/en/figure skating' in top_three_index())
    ok_('/c/en/figure skater' in top_three_index())
    ok_('/c/en/figure' in top_three_index())
def test_vector_space_wrapper():
    """
    Check that the wrapper's index is sorted and made of terms, then exercise
    `index_prefix_range` and `similar_terms`.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    labels = wrapper.frame.index
    ok_(all(is_term(uri) for uri in labels))
    ok_(labels.is_monotonic_increasing)

    # Raw terms must not be transformed beyond receiving the English tag.
    ok_('/c/en/figure skater' in wrapper.frame.index)  # no underscore
    ok_('/c/en/Island' in wrapper.frame.index)  # no case folding

    # index_prefix_range: a prefix with matches, then one without.
    ok_(wrapper.index_prefix_range('/c/en/figure') == (3, 6))
    ok_(wrapper.index_prefix_range('/c/en/skating') == (0, 0))

    # similar_terms: each 'figure' term is among the top three results.
    query = '/c/en/figure skating'
    for expected in ('/c/en/figure skating', '/c/en/figure skater', '/c/en/figure'):
        ok_(expected in wrapper.similar_terms(query, limit=3).index)
def test_vector_space_wrapper(frame=None):
    """
    Check that a VectorSpaceWrapper's index is sorted and that its labels,
    other than the first, start with '/c'.

    The check runs for a wrapper built from an optional user-supplied
    `frame`, from a vector filename, and from a frame loaded from that file.
    """

    def check_loaded(wrapper):
        # Load the wrapper and verify the label prefix and sort order.
        wrapper.load()
        ok_(all(uri.startswith('/c') for uri in wrapper.frame.index[1:]))
        ok_(wrapper.frame.index.is_monotonic_increasing)

    # Build from a user-supplied frame, if one was given.
    if frame:
        frame = load_any_embeddings(frame)
        check_loaded(VectorSpaceWrapper(frame=frame))

    glove_path = DATA + '/vectors/glove12-840B.h5'

    # Build from a filename.
    check_loaded(VectorSpaceWrapper(vector_filename=glove_path))

    # Build from a frame loaded from that same file.
    frame = load_any_embeddings(glove_path)
    check_loaded(VectorSpaceWrapper(frame=frame))
def test_index_prefix_range():
    """
    `_index_prefix_range` returns (3, 6) for a prefix that has matches and
    (0, 0) for one that has none.
    """
    wrapper = VectorSpaceWrapper(frame=TEST_FRAME)
    wrapper.load()

    # Each pair is (prefix, expected index range).
    expectations = (
        ('/c/en/figure', (3, 6)),
        ('/c/en/skating', (0, 0)),
    )
    for prefix, expected_range in expectations:
        eq_(wrapper._index_prefix_range(prefix), expected_range)
def test_cache_with_oov(multi_ling_frame):
    """
    `get_vector` returns an all-zero vector for an absent term when
    `oov_vector=False`.
    """
    wrapper = VectorSpaceWrapper(frame=multi_ling_frame)
    wrapper.load()

    # A term that is not present must come back as a vector of all zeros.
    absent_vector = wrapper.get_vector('/c/en/test', oov_vector=False)
    assert not absent_vector.any()
def test_index_prefix_range(simple_frame):
    """
    `_index_prefix_range` returns (3, 6) for a prefix that has matches and
    (0, 0) for one that has none.
    """
    wrapper = VectorSpaceWrapper(frame=simple_frame)
    wrapper.load()

    matching_range = wrapper._index_prefix_range('/c/en/figure')
    assert matching_range == (3, 6)

    empty_range = wrapper._index_prefix_range('/c/en/skating')
    assert empty_range == (0, 0)