Example 1
0
def test_vectorizing_and_similar_terms():
    """A vector built from weighted terms is a linear combination of the
    corresponding rows, and similar-term queries rank results sensibly."""
    # Build a small space; one of the weighted terms is absent from it
    # and should simply contribute nothing.
    assoc = AssocSpace.from_entries(ENTRIES, k=3)
    terms = [('apple', 5), ('banana', 22), ('not a term', 17)]
    apple = assoc.row_named('apple')
    banana = assoc.row_named('banana')
    vec = assoc.vector_from_terms(terms)

    # A term is (approximately) perfectly similar to itself...
    assert abs(assoc.assoc_between_two_terms('apple', 'apple') - 1.0) < 1e-3
    # ...while two distinct terms are at least 10% less similar.
    assert assoc.assoc_between_two_terms('apple', 'banana') < 0.9

    # Gram-Schmidt check: project the apple direction and then the
    # banana-orthogonal-to-apple direction out of the vector; almost
    # nothing should remain if it really is a combination of the two.
    apple_dir = normalize(apple)
    banana_dir = normalize(banana - apple_dir * apple_dir.dot(banana))
    leftover = vec - apple_dir * apple_dir.dot(vec)
    leftover = leftover - banana_dir * banana_dir.dot(leftover)
    assert norm(leftover) < 1e-3

    # Similar-term results come back sorted by descending score.
    labels, scores = zip(*assoc.terms_similar_to_vector(vec))
    eq_(list(scores), sorted(scores, reverse=True))

    # most_similar_to_vector must agree with the head of the full list.
    best = assoc.most_similar_to_vector(vec)
    eq_(best[0], labels[0])
    eq_(best[1], scores[0])

    # The ranking reflects the weights: banana (weight 22) outranks
    # apple (weight 5), which outranks unrelated terms.
    assert labels.index('banana') < labels.index('apple')
    assert labels.index('apple') < labels.index('green')
    assert labels.index('apple') < labels.index('celery')
def test_vectorizing_and_similar_terms():
    """Check vector_from_terms output and similar-term ranking."""
    space = AssocSpace.from_entries(ENTRIES, k=3)
    # 'not a term' is not in the space and should be ignored.
    weighted = [('apple', 5), ('banana', 22), ('not a term', 17)]
    row_apple = space.row_named('apple')
    row_banana = space.row_named('banana')
    combined = space.vector_from_terms(weighted)

    # Self-similarity is approximately 1.
    assert abs(space.assoc_between_two_terms('apple', 'apple') - 1.0) < 1e-3

    # 'apple' and 'banana' are at least 10% less similar to each other
    # than each is to itself.
    assert space.assoc_between_two_terms('apple', 'banana') < 0.9

    # The combined vector must lie in span(apple, banana): subtract its
    # components along an orthonormalized apple/banana basis and verify
    # the residual is negligible.
    u_apple = normalize(row_apple)
    u_banana = normalize(row_banana - u_apple * u_apple.dot(row_banana))
    residual = combined - u_apple * u_apple.dot(combined)
    residual -= u_banana * u_banana.dot(residual)
    assert norm(residual) < 1e-3

    # terms_similar_to_vector returns (label, score) pairs in descending
    # score order.
    labels, scores = zip(*space.terms_similar_to_vector(combined))
    eq_(list(scores), sorted(scores, reverse=True))

    # most_similar_to_vector is consistent with the top-ranked pair.
    top = space.most_similar_to_vector(combined)
    eq_(top[0], labels[0])
    eq_(top[1], scores[0])

    # Heavier-weighted and directly-included terms rank higher.
    assert labels.index('banana') < labels.index('apple')
    assert labels.index('apple') < labels.index('green')
    assert labels.index('apple') < labels.index('celery')
Example 3
0
def vector_from_terms(self, terms):
    """
    Get a vector representing a weighted set of terms, provided as a
    collection of (term, weight) tuples.

    Note that the rows of U e^(S/2) are NOT normalized before being
    summed with their weights; this applies a natural penalty to
    low-quality terms.  Terms that are not in the space are skipped.
    """
    total = np.zeros((self.k,))
    for term, weight in terms:
        # Unknown terms contribute nothing.
        if term not in self.labels:
            continue
        if term not in self._row_cache:
            # Keep the cache bounded: once it grows past 15000 entries,
            # throw everything away and start over.
            if len(self._row_cache) > 15000:
                self._row_cache = {}
            # Copy the row rather than caching a slice of self.u: Numpy
            # handles large numbers of memmap slices inefficiently
            # (especially in 1.7, but even in 1.6 or 1.8).
            self._row_cache[term] = np.copy(self.u[self.labels.index(term)])
        total += self._row_cache[term] * weight
    return eigenmath.normalize(total * np.exp(self.sigma / 2))
Example 4
0
def test_norm_and_normalize():
    """norm() gives the Euclidean length and normalize() divides by it;
    the zero vector normalizes to itself rather than raising."""
    v = np.asarray([8.0, 9.0, 12.0])
    assert np.allclose(norm(v), 17.0)
    assert np.allclose(normalize(v), v / 17.0)
    zeros = np.zeros(5)
    assert (normalize(zeros) == zeros).all()
Example 5
0
def cos_diff(a, b):
    """Return the cosine similarity of vectors a and b."""
    unit_a = normalize(a)
    unit_b = normalize(b)
    return unit_a.dot(unit_b)
Example 6
0
def cos_diff(a, b):
    # Cosine similarity: the dot product of the two unit vectors.
    ua = normalize(a)
    return ua.dot(normalize(b))
Example 7
0
def test_norm_and_normalize():
    """Sanity-check norm() and normalize() on an 8-9-12 vector (length
    17), and confirm normalize() maps the zero vector to itself."""
    vec = np.asarray([8.0, 9.0, 12.0])
    expected_length = 17.0
    assert np.allclose(norm(vec), expected_length)
    assert np.allclose(normalize(vec), vec / expected_length)
    # We normalize the zero vector to itself rather than raising an error.
    assert (np.zeros(5) == normalize(np.zeros(5))).all()