Example 1
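These snippets are excerpted from a test module and omit their imports. A preamble along the following lines is needed to run them; the module paths assume the conceptnet5 codebase, and DATA is assumed to point at a local test-data directory:

import numpy as np
import pandas as pd
from nose.tools import ok_, assert_almost_equal

from conceptnet5.vectors import get_vector
from conceptnet5.vectors.formats import load_any_embeddings
from conceptnet5.vectors.query import VectorSpaceWrapper
from conceptnet5.vectors.transforms import (
    l1_normalize_columns, l2_normalize_rows, shrink_and_sort,
    standardize_row_labels)

DATA = 'testdata'  # assumption: root of the local test-data directory
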
def test_vector_space_wrapper(frame=None):
    """
    Check if VectorSpaceWrapper's index is sorted and its elements are concepts.
    """

    # Load a VSW from a user-supplied frame
    if frame:
        frame = load_any_embeddings(frame)
        wrap = VectorSpaceWrapper(frame=frame)
        wrap.load()
        ok_(all(label.startswith('/c') for label in wrap.frame.index[1:]))
        ok_(wrap.frame.index.is_monotonic_increasing)

    # Load a VSW from a filename
    vector_filename = DATA + '/vectors/glove12-840B.h5'
    wrap = VectorSpaceWrapper(vector_filename=vector_filename)
    wrap.load()
    ok_(all(label.startswith('/c') for label in wrap.frame.index[1:]))
    ok_(wrap.frame.index.is_monotonic_increasing)

    # Load a VSW from a frame
    frame = load_any_embeddings(DATA + '/vectors/glove12-840B.h5')
    wrap = VectorSpaceWrapper(frame=frame)
    wrap.load()
    ok_(all(label.startswith('/c') for label in wrap.frame.index[1:]))
    ok_(wrap.frame.index.is_monotonic_increasing)
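The two properties asserted in each branch are plain pandas index checks. A minimal sketch on a hypothetical toy frame, showing what the assertions require:

import pandas as pd

# Toy frame (assumption, for illustration): concept URIs as the row index.
toy = pd.DataFrame(
    [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]],
    index=['/c/en/apple', '/c/en/banana', '/c/en/cherry'])

assert all(label.startswith('/c') for label in toy.index)  # all concepts
assert toy.index.is_monotonic_increasing  # sorted, so .loc lookups are fast
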
Example 2
def test_get_vector(frame=None):
    """
    Check that vectors.get_vector() returns the same vector when the label is
    given as a full concept URI and as a term plus a language code.
    """
    if frame:
        vectors = load_any_embeddings(frame)
        vec1 = get_vector(vectors, '/c/en/cat')
        vec2 = get_vector(vectors, 'cat', 'en')
        ok_(vec1.equals(vec2))

    vectors = load_any_embeddings(DATA + '/vectors/glove12-840B.h5')
    vec1 = get_vector(vectors, '/c/en/cat')
    vec2 = get_vector(vectors, 'cat', 'en')
    ok_(vec1.equals(vec2))
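The equivalence being tested is between a full concept URI and a (term, language) pair. Here is a hypothetical helper mirroring that behavior on a toy frame; it is a sketch of the contract, not the library's implementation:

import pandas as pd

toy = pd.DataFrame([[1.0, 0.0]], index=['/c/en/cat'])

def lookup(frame, term, language=None):
    # Hypothetical: build the concept URI when given a bare term.
    uri = term if term.startswith('/c/') else '/c/{}/{}'.format(language, term)
    return frame.loc[uri]

assert lookup(toy, '/c/en/cat').equals(lookup(toy, 'cat', 'en'))
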
Example 3
def test_standardize_row_labels(frame=None):
    if not frame:
        frame = DATA + '/raw/vectors/glove12.840B.300d.txt.gz'
    vectors = load_any_embeddings(frame)

    vec1 = vectors.loc['island']
    vec2 = vectors.loc['Island']
    vec3 = vectors.loc['things']
    standardized_vectors = standardize_row_labels(vectors)

    # Check if all labels are concepts
    ok_(all(
        label.startswith('/c') for label in standardized_vectors.index[1:]))

    # Check if all terms standardized to the same concept are merged
    ok_(standardized_vectors.index.is_unique)
    ok_('/c/en/Island' not in standardized_vectors.index)
    ok_('/c/en/island' in standardized_vectors.index)
    ok_('/c/en/thing' in standardized_vectors.index)
    ok_(not standardized_vectors.loc['/c/en/island'].equals(vec1))
    ok_(not standardized_vectors.loc['/c/en/island'].equals(vec2))
    ok_(not standardized_vectors.loc['/c/en/thing'].equals(vec3))

    # Check if numbers are substituted with '#'
    ok_('/c/en/##' in standardized_vectors.index)
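The assertions pin down the index-level contract of standardize_row_labels: labels become lowercase /c/en/... concept URIs, rows that collapse to the same concept are merged into one combined vector, and runs of digits become '#'. A simplified toy version of that contract (an assumption for illustration; the library's actual merge rule, e.g. weighted averaging plus renormalization, may differ):

import re

import pandas as pd

def toy_standardize(frame):
    # Lowercase, replace each digit run with '#'s, prefix '/c/en/',
    # then merge duplicate concepts; this toy merges by plain mean.
    labels = ['/c/en/' + re.sub(r'\d+', lambda m: '#' * len(m.group()), t.lower())
              for t in frame.index]
    return frame.groupby(labels).mean().sort_index()

toy = pd.DataFrame([[1.0], [3.0], [5.0]], index=['Island', 'island', '42'])
out = toy_standardize(toy)
assert '/c/en/island' in out.index and '/c/en/Island' not in out.index
assert out.loc['/c/en/island', 0] == 2.0  # merged rows combined (toy: mean)
assert '/c/en/##' in out.index            # digits replaced with '#'
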
Example 4
def test_l1_normalize_columns(frame=None):
    if not frame:
        frame = DATA + '/vectors/glove12-840B.h5'
    vectors = load_any_embeddings(frame)

    vectors = l1_normalize_columns(vectors)
    sums = np.sum(np.abs(vectors))
    for s in sums:
        assert_almost_equal(s, 1.0, places=4)
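L1 column normalization divides each column by the sum of its absolute values, so each column's absolute values sum to 1, which is exactly what the loop above verifies. A minimal self-contained sketch of that computation:

import numpy as np
import pandas as pd

frame = pd.DataFrame([[1.0, -2.0], [3.0, 2.0]])
normalized = frame / frame.abs().sum(axis=0)
# column 0: |1| + |3| = 4 -> [0.25, 0.75]; column 1: |-2| + |2| = 4 -> [-0.5, 0.5]
assert np.allclose(normalized.abs().sum(axis=0), 1.0)
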
Example 5
def test_l2_normalize_rows(frame=None):
    if not frame:
        frame = DATA + '/vectors/glove12-840B.h5'
    vectors = load_any_embeddings(frame)

    vectors = l2_normalize_rows(vectors)

    lengths = np.sqrt(np.sum(np.power(vectors, 2), axis='columns'))
    for length in lengths:
        assert_almost_equal(length, 1.0, places=4)

    # Check if a data frame of all zeroes will be normalized to NaN
    frame = pd.DataFrame(np.zeros(shape=(1, 10)))
    frame = l2_normalize_rows(frame)
    lengths = np.sqrt(np.sum(np.power(frame, 2), axis=1))
    ok_(all(np.isnan(length) for length in lengths))
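L2 row normalization divides each row by its Euclidean norm, so every nonzero row ends up with length 1, while an all-zero row turns into NaN because of the 0/0 division; that NaN behavior is what the last block checks. A minimal sketch of the computation:

import numpy as np
import pandas as pd

frame = pd.DataFrame([[3.0, 4.0], [0.0, 0.0]])
norms = np.sqrt((frame ** 2).sum(axis=1))  # row norms: [5.0, 0.0]
normalized = frame.div(norms, axis=0)      # row 0 -> [0.6, 0.8]; row 1 -> NaN
assert np.isclose(np.sqrt((normalized.iloc[0] ** 2).sum()), 1.0)
assert normalized.iloc[1].isna().all()
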
Example 6
def test_shrink_and_sort(frame=None):
    if not frame:
        frame = DATA + '/vectors/glove12-840B.h5'
    vectors = load_any_embeddings(frame)

    n, k = 10, 20
    shrank = shrink_and_sort(vectors, n, k)

    # Check the size of the frame
    ok_(shrank.shape == (n, k))

    # Check if the frame is l2 normalized
    lengths = np.sqrt(np.sum(np.power(shrank, 2), axis='columns'))
    for length in lengths:
        assert_almost_equal(length, 1.0, places=4)

    # Check if the index is sorted
    ok_(shrank.index.is_monotonic_increasing)
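Taken together, the three assertions describe the contract of shrink_and_sort: truncate to the first n rows and k columns, L2-normalize the rows, and sort the index. A toy version of that contract (an assumption for illustration, not the library's exact implementation):

import numpy as np
import pandas as pd

def toy_shrink_and_sort(frame, n, k):
    shrunk = frame.iloc[:n, :k]                    # keep n rows, k columns
    norms = np.sqrt((shrunk ** 2).sum(axis=1))     # L2 norm of each row
    return shrunk.div(norms, axis=0).sort_index()  # unit rows, sorted index

toy = pd.DataFrame(np.arange(12, dtype=float).reshape(4, 3) + 1.0,
                   index=['d', 'c', 'b', 'a'])
out = toy_shrink_and_sort(toy, 2, 2)
assert out.shape == (2, 2) and out.index.is_monotonic_increasing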