def test_content_hash_not_dependent_on_chunks_or_matches():
    """Check that attaching chunks or matches leaves the content hash unchanged.

    A baseline document with content ``'one'`` is hashed first. A second
    document with the same content must hash identically, and so must
    documents that additionally carry three chunks or three matches.
    """
    baseline = Document()
    baseline.content = 'one'
    baseline.update_content_hash()

    # Same content, nothing attached: hashes must agree.
    twin = Document()
    twin.content = 'one'
    twin.update_content_hash()
    assert baseline.content_hash == twin.content_hash

    # Same content plus three children in either collection: the hash of the
    # parent must still equal the baseline hash.
    for collection_name, child_text in (
        ('chunks', 'some chunk'),
        ('matches', 'some match'),
    ):
        parent = Document()
        parent.content = 'one'
        for _ in range(3):
            with Document() as child:
                child.content = child_text
            # Look the collection up on every pass, as a plain attribute
            # access on the parent would.
            getattr(parent, collection_name).append(child)
        parent.update_content_hash()
        assert baseline.content_hash == parent.content_hash
def test_uri_get_set():
    """Exercise the ``uri`` setter/getter and its interplay with ``content``.

    Asserts that a ``.jpg`` URL is stored as-is and yields the
    ``image/jpeg`` mime type, that a plain string is also stored as the URI,
    and that assigning the same string to ``content`` exposes it through
    ``text`` while leaving ``uri`` falsy.
    """
    image_url = 'https://abc.com/a.jpg'
    plain_value = 'abcdefg'

    doc = Document()

    doc.uri = image_url
    assert doc.uri == image_url
    assert doc.mime_type == 'image/jpeg'

    doc.uri = plain_value
    assert doc.uri == plain_value

    # After assigning content, the value is readable as text and the
    # previously set uri no longer evaluates as truthy.
    doc.content = plain_value
    assert doc.text == plain_value
    assert not doc.uri
def test_sparse_get_set():
    """Round-trip sparse matrices through ``content`` and ``blob``.

    A fresh document has no content. A COO matrix assigned via ``content``
    must read back element-wise equal, and a different matrix assigned via
    ``blob`` must then be what ``content`` returns.
    """

    def _differing_entries(left, right):
        # Number of stored entries in the element-wise "not equal" result;
        # zero means the two operands agree everywhere.
        return (left != right).nnz

    first = coo_matrix(np.array([1, 2, 3]))
    second = coo_matrix(np.array([3, 2, 1]))

    doc = Document()
    assert doc.content is None

    doc.content = first
    assert _differing_entries(doc.content, first) == 0

    # The document still holds the first matrix, so it must differ from the
    # second one until the blob is overwritten.
    assert _differing_entries(doc.content, second) != 0
    doc.blob = second
    assert _differing_entries(doc.content, second) == 0