def test_input_types():
    # A text string and a bytestring with the same characters must unpack to
    # the same values.
    from_text = unpack64('abcd')
    from_bytes = unpack64(b'abcd')
    assert np.all(from_text == from_bytes)

    # Anything that can be converted to a NumPy array is packable: a list, a
    # tuple, and arrays of several dtypes all yield the same packed string.
    packable_inputs = (
        [1.0, 2.0],
        (1.0, 2.0),
        np.array([1.0, 2.0], dtype=np.float32),
        np.array([1.0, 2.0], dtype=np.float64),
        np.array([1.0, 2.0], dtype=np.int32),
    )
    for candidate in packable_inputs:
        eq_(pack64(candidate), 'ZIAAQAA')
def test_speed():
    # Times a pack/unpack round trip of 40 random vectors (lengths 1..40)
    # through the reference implementation and through pack64/unpack64, and
    # requires the latter to be strictly faster.
    #
    # `range` is used rather than `xrange` so the test also runs on Python 3,
    # where `xrange` no longer exists; for 40 items the two are equivalent.
    vectors = [np.random.normal(size=(i % 40 + 1,)) for i in range(40)]

    # Baseline: the reference implementation, in milliseconds.
    start1 = time.time()
    for vec in vectors:
        reference_unpack64(reference_pack64(vec))
    time_reference = (time.time() - start1) * 1000

    # Candidate: the implementation under test, in milliseconds.
    start2 = time.time()
    for vec in vectors:
        unpack64(pack64(vec))
    time_ours = (time.time() - start2) * 1000

    assert time_ours < time_reference,\
        "Took %4.4f ms. Time to beat: %4.4f ms." % (time_ours, time_reference)
def _check(vector, expected=None, exact=False):
    # Checks that the given vector:
    # * Encodes successfully (and to a particular string, if specified)
    # * Decodes successfully to exactly the same value (if specified) or to
    #   within the expected tolerance (see below for what this tolerance is)
    # * Produces exactly the same string when the decoded value is reencoded
    # Returns the maximum absolute deviation between the given and decoded
    # vectors, and the tolerance to which it was compared.
    encoded = pack64(vector)
    if expected is not None:
        eq_(encoded, expected)
    decoded = unpack64(encoded)
    eq_(pack64(decoded), encoded)

    # np.max raises ValueError on a zero-size array, so an empty vector is
    # special-cased everywhere a maximum is taken (deviation AND tolerance).
    is_empty = not len(vector)

    if is_empty:
        deviation = 0.0
    else:
        deviation = np.max(np.abs(decoded - vector))

    if exact:
        tolerance = 0.0
    else:
        # Generally pack64 guarantees a precision of 2 ** -17 times the largest
        # magnitude entry.  However, we have to adjust for two details.
        # * The largest magnitude entry may be rounded for packing in such a
        #   way that the precision is slightly less than that guarantee.
        # * The smallest positive number that can be packed at all is
        #   2 ** -40, so the absolute precision available for very small
        #   vectors, regardless of the size of the vector, is 2 ** -41.
        # An empty vector has no largest entry; only the absolute floor
        # applies to it.
        largest = 0.0 if is_empty else np.max(np.abs(vector))
        tolerance = max(largest / (2.0 ** 17 - 0.5), 2.0 ** -41)

    assert deviation <= tolerance, \
        "deviation %r exceeds tolerance %r" % (deviation, tolerance)
    return deviation, tolerance
def unpack_vectors(dataframecolumn):
    """
    Unpack every pack64'd entry of a column into a single NumPy array.

    Arguments:
        dataframecolumn (pandas dataframe column): single column of a pandas
            dataframe containing pack64'd document vectors

    Returns:
        newarray (numpy array): array of unpacked document vectors, in the
            same order as the entries of the column
    """
    unpacked = []
    for packed in dataframecolumn:
        unpacked.append(unpack64(packed))
    return np.asarray(unpacked)
def round_trip_check(vec):
    # Packs `vec` (with rounded=True), unpacks it again, and asserts that the
    # result is close to the input.  The absolute tolerance is the largest
    # magnitude in `vec` times 2**-17, plus 2**-40; an empty vector gets a
    # tolerance of zero.
    restored = unpack64(pack64(vec, rounded=True))

    if len(vec):
        precision = np.max(np.abs(vec)) * (2**-17) + 2**-40
        maxdiff = np.max(np.abs(restored - vec))
    else:
        precision = 0.
        maxdiff = 0.

    assert np.allclose(restored, vec, rtol=1e-10, atol=precision),\
        "%s isn't close enough to %s; difference=%s, precision=%s" % (
            restored, vec, maxdiff, precision)
def test_errors():
    # Non-finite values cannot be packed.
    for nonfinite in (float('inf'), float('nan')):
        with assert_raises(ValueError):
            pack64([nonfinite])

    # Out of range values are rejected; probe both sides of the boundary,
    # for positive and then negative magnitudes.
    out_of_range = (2.0 ** 17 - 0.5) * 2.0 ** 23
    just_inside = (2.0 ** 17 - 0.6) * 2.0 ** 23

    with assert_raises(OverflowError):
        pack64([out_of_range])
    _check([just_inside], expected='_f__')

    with assert_raises(OverflowError):
        # (This could actually be encoded as '_gAA'.)
        pack64([-out_of_range])
    _check([-just_inside], expected='_gAB')

    # Strings with bad lengths or characters are rejected.
    malformed = ('', 'xx', b'xx', '\U0001f43c', 'Hey!', 'panda', 'rutabaga')
    for text in malformed:
        with assert_raises(ValueError):
            unpack64(text)

    # With error checking disabled, some bad strings are accepted ...
    tolerated_unchecked = ('xx', 'Hey!', 'panda')
    for text in tolerated_unchecked:
        unpack64(text, check=False)

    # ... but not all of them.
    with assert_raises(ValueError):
        unpack64('rutabaga', check=False)
def decoding_check(vec):
    # Encodes `vec` with the reference packer, then decodes that string with
    # both the reference unpacker and unpack64; the two decodings must agree
    # (within np.allclose's default tolerances).
    packed = reference_pack64(vec)
    from_reference = reference_unpack64(packed)
    from_ours = unpack64(packed)
    assert np.allclose(from_reference, from_ours), \
        '%s should have decoded to %s, got %s' % (
            packed, from_reference, from_ours)