Beispiel #1
0
    def test_multiindex_objects(self):
        """Check MultiIndex hashing before and after its levels are sorted.

        Builds a MultiIndex with unsorted levels plus the result of its
        ``_sort_levels_monotonic()``, then asserts that the two compare
        equal, that each one's ``_hashed_values`` matches
        ``hash_pandas_object(..., index=False)``, and that both hash arrays
        contain the same values once sorted.
        """
        unsorted_mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                                 labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                                 names=['col1', 'col2'])
        resorted_mi = unsorted_mi._sort_levels_monotonic()

        # the two objects must compare equal, both directly and by values
        assert unsorted_mi.equals(resorted_mi)
        assert Index(unsorted_mi.values).equals(Index(resorted_mi.values))

        # _hashed_values must agree with hash_pandas_object(..., index=False)
        # for each of the two objects
        for index in (unsorted_mi, resorted_mi):
            expected = hash_pandas_object(index, index=False).values
            tm.assert_numpy_array_equal(index._hashed_values, expected)

        # the hash values should match, but the order may differ, so the
        # comparison is made on sorted copies
        tm.assert_numpy_array_equal(np.sort(resorted_mi._hashed_values),
                                    np.sort(unsorted_mi._hashed_values))
Beispiel #2
0
    def test_multiindex_objects(self):
        """Verify hashes of a MultiIndex and its level-sorted counterpart.

        The index and the output of ``_sort_levels_monotonic()`` must be
        equal, ``_hashed_values`` must equal
        ``hash_pandas_object(..., index=False).values`` for both, and the two
        hash arrays must hold identical values after sorting.
        """
        levels = [['b', 'd', 'a'], [1, 2, 3]]
        source = MultiIndex(levels=levels,
                            labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                            names=['col1', 'col2'])
        monotonic = source._sort_levels_monotonic()

        # these are equal
        assert source.equals(monotonic)
        assert Index(source.values).equals(Index(monotonic.values))

        # _hashed_values and hash_pandas_object(..., index=False) equivalency
        for candidate in (source, monotonic):
            reference = hash_pandas_object(candidate, index=False).values
            tm.assert_numpy_array_equal(candidate._hashed_values, reference)

        # same hash values expected, though not necessarily in the same order
        sorted_result = np.sort(monotonic._hashed_values)
        sorted_expected = np.sort(source._hashed_values)
        tm.assert_numpy_array_equal(sorted_result, sorted_expected)
Beispiel #3
0
    def check_not_equal_with_index(self, obj):
        """Assert that including the index changes the hash of ``obj``.

        ``obj`` is hashed once with ``index=True`` and once with
        ``index=False``; the two results must not be equal everywhere.
        Nothing is checked when ``obj`` is itself an ``Index``.
        """
        if isinstance(obj, Index):
            return

        with_index = hash_pandas_object(obj, index=True)
        without_index = hash_pandas_object(obj, index=False)
        self.assertFalse((with_index == without_index).all())
Beispiel #4
0
    def check_not_equal_with_index(self, obj):
        """Check that hashing with and without the index gives different results.

        An ``Index`` input is skipped entirely; for anything else the hashes
        computed with ``index=True`` and ``index=False`` must differ in at
        least one position.
        """
        # an Index is skipped: only non-Index objects are compared
        if isinstance(obj, Index):
            return

        hashed_with = hash_pandas_object(obj, index=True)
        hashed_without = hash_pandas_object(obj, index=False)
        all_same = (hashed_with == hashed_without).all()
        self.assertFalse(all_same)
Beispiel #5
0
    def check_equal(self, obj, **kwargs):
        """Assert that hashing ``obj`` twice yields the same Series.

        The check runs first with ``kwargs`` as given and then again with
        any ``index`` keyword removed.
        """
        without_index_kw = {name: value for name, value in kwargs.items()
                            if name != 'index'}

        for options in (kwargs, without_index_kw):
            first = hash_pandas_object(obj, **options)
            second = hash_pandas_object(obj, **options)
            tm.assert_series_equal(first, second)
Beispiel #6
0
    def test_hash_keys(self):
        """Hashing the same data with two different hash keys must differ.

        A Series of the strings 'a', 'b', 'c' is hashed with two 16-character
        keys that differ only in their last two characters; every element's
        hash is required to differ between the two runs.
        """
        # this only matters for object dtypes
        letters = Series(list('abc'))

        hashed_first_key = hash_pandas_object(letters,
                                              hash_key='9876543210123456')
        hashed_second_key = hash_pandas_object(letters,
                                               hash_key='9876543210123465')

        differs_everywhere = (hashed_first_key != hashed_second_key).all()
        self.assertTrue(differs_everywhere)
Beispiel #7
0
    def check_equal(self, obj, **kwargs):
        """Check that ``hash_pandas_object`` is repeatable for ``obj``.

        Two hashes computed with the supplied ``kwargs`` must be equal, and
        so must two hashes computed after the ``index`` keyword (if any) has
        been dropped from ``kwargs``.
        """
        def assert_repeatable(options):
            # two independent calls with identical options must agree
            left = hash_pandas_object(obj, **options)
            right = hash_pandas_object(obj, **options)
            tm.assert_series_equal(left, right)

        assert_repeatable(kwargs)

        kwargs.pop('index', None)
        assert_repeatable(kwargs)
Beispiel #8
0
    def test_hash_keys(self):
        """Different hash keys must give different hashes for the same data."""
        # this only matters for object dtypes
        data = Series(list('abc'))
        keys = ('9876543210123456', '9876543210123465')

        hashed = [hash_pandas_object(data, hash_key=key) for key in keys]

        # every element must hash differently under the two keys
        self.assertTrue((hashed[0] != hashed[1]).all())
Beispiel #9
0
    def test_mixed(self):
        """Hash a Series that mixes a string with integers.

        Runs the shared ``check_equal`` and ``check_not_equal_with_index``
        helpers on the mixed Series, then asserts that its hash equals the
        hash of a Series holding the strings '1', '2', '3'.
        """
        mixed = Series(['1', 2, 3])
        self.check_equal(mixed)
        self.check_not_equal_with_index(mixed)

        # mixed values are expected to hash the same as their string forms
        all_strings = Series(list('123'))
        self.assert_series_equal(hash_pandas_object(mixed),
                                 hash_pandas_object(all_strings))
Beispiel #10
0
    def test_pandas_errors(self):
        """``hash_pandas_object`` must raise TypeError for unsupported inputs.

        The inputs exercised are a ``pd.Timestamp`` and the object returned
        by ``tm.makePanel()``.
        """
        timestamp = pd.Timestamp('20130101')
        with pytest.raises(TypeError):
            hash_pandas_object(timestamp)

        # warnings emitted while building the panel are recorded rather
        # than shown
        with catch_warnings(record=True):
            panel = tm.makePanel()
        with pytest.raises(TypeError):
            hash_pandas_object(panel)
Beispiel #11
0
    def test_pandas_errors(self):
        """Unsupported objects passed to ``hash_pandas_object`` raise TypeError."""
        def assert_rejected(candidate):
            # hashing the candidate must fail with TypeError
            with pytest.raises(TypeError):
                hash_pandas_object(candidate)

        assert_rejected(pd.Timestamp('20130101'))

        # record (instead of emitting) any warnings raised by makePanel()
        with catch_warnings(record=True):
            panel = tm.makePanel()
        assert_rejected(panel)
Beispiel #12
0
    def test_mixed(self):
        """Check hashing of a Series with mixed string and integer values."""
        # mixed objects
        mixed_values = Series(['1', 2, 3])
        self.check_equal(mixed_values)
        self.check_not_equal_with_index(mixed_values)

        # the mixed Series should hash exactly like the all-string Series
        # '1', '2', '3'
        hashed_mixed = hash_pandas_object(mixed_values)
        hashed_strings = hash_pandas_object(Series(list('123')))
        self.assert_series_equal(hashed_mixed, hashed_strings)
Beispiel #13
0
    def hash_pandas_object(obj,
                           index=True,
                           encoding='utf8',
                           hash_key=None,
                           categorize=True):
        """Return a uint64 Series of hashes for an Index, Series or DataFrame.

        Parameters
        ----------
        obj : pd.Index, pd.Series or pd.DataFrame
            Object to hash.
        index : bool, default True
            For a Series or DataFrame, also combine the hash of ``obj.index``
            into the result. Not used when ``obj`` is an Index.
        encoding : str, default 'utf8'
            Passed through to ``hash_array``.
        hash_key : optional
            Passed through to ``hash_array``; ``_default_hash_key`` is used
            when this is None.
        categorize : bool, default True
            Passed through to ``hash_array``.

        Returns
        -------
        pd.Series
            dtype ``uint64``; indexed by ``obj`` itself when ``obj`` is an
            Index, otherwise by ``obj.index``.

        Raises
        ------
        TypeError
            If ``obj`` is not an Index, Series or DataFrame.
        """
        if hash_key is None:
            hash_key = _default_hash_key

        def adder(h, hashed_to_add):
            # combine in place: h = h * 3 + hashed_to_add
            h = np.multiply(h, np.uint(3), h)
            return np.add(h, hashed_to_add, h)

        if isinstance(obj, pd.Index):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            h = pd.Series(h, index=obj, dtype='uint64')
        elif isinstance(obj, pd.Series):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            if index:
                h = adder(
                    h,
                    hash_pandas_object(obj.index,
                                       index=False,
                                       encoding=encoding,
                                       hash_key=hash_key,
                                       categorize=categorize).values)
            h = pd.Series(h, index=obj.index, dtype='uint64')
        elif isinstance(obj, pd.DataFrame):
            # DataFrame.iteritems() no longer exists in pandas >= 2.0;
            # items() yields the same (label, Series) pairs. iter() ensures
            # next() below receives an iterator.
            cols = iter(obj.items())
            first_series = next(cols)[1]
            h = hash_array(first_series.values, encoding, hash_key,
                           categorize).astype('uint64')
            # fold the remaining columns into the running hash, in order
            for _, col in cols:
                h = adder(
                    h, hash_array(col.values, encoding, hash_key, categorize))
            if index:
                h = adder(
                    h,
                    hash_pandas_object(obj.index,
                                       index=False,
                                       encoding=encoding,
                                       hash_key=hash_key,
                                       categorize=categorize).values)

            h = pd.Series(h, index=obj.index, dtype='uint64')
        else:
            raise TypeError("Unexpected type for hashing %s" % type(obj))
        return h
Beispiel #14
0
 def test_categorical_consistency(self):
     """Categoricals must hash consistently with their values, not codes.

     For string, integer and datetime Series, the plain Series, its
     categorical version and a categorical with reversed category order
     must all produce equal hashes, with ``categorize`` both True and False.
     """
     # GH15143
     plain_inputs = [Series(['a', 'b', 'c', 'd']),
                     Series([1000, 2000, 3000, 4000]),
                     Series(pd.date_range(0, periods=4))]

     for plain in plain_inputs:
         as_category = plain.astype('category').cat.set_categories(plain)
         reordered = as_category.cat.set_categories(list(reversed(plain)))

         for categorize in (True, False):
             hashed = [hash_pandas_object(candidate, categorize=categorize)
                       for candidate in (plain, as_category, reordered)]

             # all three representations should hash identically
             tm.assert_series_equal(hashed[0], hashed[1])
             tm.assert_series_equal(hashed[0], hashed[2])
Beispiel #15
0
 def test_consistency(self):
     """Pin the hashes of a fixed Index to hard-coded reference values.

     The reference numbers act as ground truth, so an accidental change to
     the hashing code shows up as a failure here.
     """
     labels = ['foo', 'bar', 'baz']
     reference = np.array([3600424527151052760, 1374399572096150070,
                           477881037637427054], dtype='uint64')

     result = hash_pandas_object(Index(labels))
     expected = Series(reference, index=labels)
     tm.assert_series_equal(result, expected)
Beispiel #16
0
 def test_consistency(self):
     """Guard against unintended changes to the hash output."""
     # the hard-coded values below are the ground truth: the hash must not
     # change because of a mistake in the actual code
     known_hashes = np.array(
         [3600424527151052760, 1374399572096150070, 477881037637427054],
         dtype='uint64')
     names = ['foo', 'bar', 'baz']

     actual = hash_pandas_object(Index(names))
     tm.assert_series_equal(actual, Series(known_hashes, index=names))
Beispiel #17
0
    def test_hash_tuples(self):
        """``hash_tuples`` must agree with hashing the equivalent MultiIndex.

        Checked for a list of tuples (array comparison) and for a single
        tuple (compared against the first element of the expected array).
        """
        pairs = [(1, 'one'), (1, 'two'), (2, 'one')]
        expected = hash_pandas_object(MultiIndex.from_tuples(pairs)).values

        # list of tuples -> array of hashes
        self.assert_numpy_array_equal(hash_tuples(pairs), expected)

        # single tuple -> hash of the first entry
        single = hash_tuples(pairs[0])
        self.assertEqual(single, expected[0])
Beispiel #18
0
    def test_hash_tuples(self):
        """Compare ``hash_tuples`` output with the MultiIndex-based hashes."""
        tuples = [(1, 'one'), (1, 'two'), (2, 'one')]
        from_multiindex = hash_pandas_object(
            MultiIndex.from_tuples(tuples)).values

        # hashing the whole list must reproduce the MultiIndex hashes
        tm.assert_numpy_array_equal(hash_tuples(tuples), from_multiindex)

        # hashing one tuple must reproduce the first MultiIndex hash
        first_hash = hash_tuples(tuples[0])
        self.assertEqual(first_hash, from_multiindex[0])
Beispiel #19
0
 def test_categorical_consistency(self):
     """Check that categoricals hash by value rather than by code.

     Runs for Series of strings, integers and datetimes, so the property is
     exercised for categoricals of several dtypes.
     """
     # GH15143
     def assert_same_hashes(series_group, categorize):
         # hash every member and compare each one against the first
         base, *others = [hash_pandas_object(s, categorize=categorize)
                          for s in series_group]
         for other in others:
             tm.assert_series_equal(base, other)

     sources = [
         Series(['a', 'b', 'c', 'd']),
         Series([1000, 2000, 3000, 4000]),
         Series(pd.date_range(0, periods=4)),
     ]
     for values in sources:
         categorical = values.astype('category').cat.set_categories(values)
         reversed_cats = categorical.cat.set_categories(
             list(reversed(values)))
         for categorize in [True, False]:
             # these should all hash identically
             assert_same_hashes((values, categorical, reversed_cats),
                                categorize)
Beispiel #20
0
    def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                           categorize=True):
        """Hash an Index, Series or DataFrame into a uint64 Series.

        Parameters
        ----------
        obj : pd.Index, pd.Series or pd.DataFrame
            Object to hash.
        index : bool, default True
            For a Series or DataFrame, additionally combine the hash of
            ``obj.index`` into the result. Not used for an Index.
        encoding : str, default 'utf8'
            Forwarded to ``hash_array``.
        hash_key : optional
            Forwarded to ``hash_array``; when None, ``_default_hash_key`` is
            used instead.
        categorize : bool, default True
            Forwarded to ``hash_array``.

        Returns
        -------
        pd.Series
            Values of dtype ``uint64``; the index is ``obj`` for an Index
            input and ``obj.index`` otherwise.

        Raises
        ------
        TypeError
            If ``obj`` is not an Index, Series or DataFrame.
        """
        if hash_key is None:
            hash_key = _default_hash_key

        def adder(h, hashed_to_add):
            # in-place combination: h = h * 3 + hashed_to_add
            h = np.multiply(h, np.uint(3), h)
            return np.add(h, hashed_to_add, h)

        if isinstance(obj, pd.Index):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            h = pd.Series(h, index=obj, dtype='uint64')
        elif isinstance(obj, pd.Series):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            if index:
                h = adder(h, hash_pandas_object(obj.index,
                                                index=False,
                                                encoding=encoding,
                                                hash_key=hash_key,
                                                categorize=categorize).values)
            h = pd.Series(h, index=obj.index, dtype='uint64')
        elif isinstance(obj, pd.DataFrame):
            # items() replaces iteritems(), which pandas >= 2.0 no longer
            # provides; both yield (label, Series) pairs. iter() guarantees
            # that next() below is handed an iterator.
            cols = iter(obj.items())
            first_series = next(cols)[1]
            h = hash_array(first_series.values, encoding,
                           hash_key, categorize).astype('uint64')
            # combine the remaining columns into the running hash, in order
            for _, col in cols:
                h = adder(h, hash_array(col.values, encoding, hash_key,
                                        categorize))
            if index:
                h = adder(h, hash_pandas_object(obj.index,
                                                index=False,
                                                encoding=encoding,
                                                hash_key=hash_key,
                                                categorize=categorize).values)

            h = pd.Series(h, index=obj.index, dtype='uint64')
        else:
            raise TypeError("Unexpected type for hashing %s" % type(obj))
        return h
Beispiel #21
0
 def f():
     """Call ``hash_pandas_object`` with this function object as its argument."""
     # NOTE(review): the function passes itself, not pandas data; presumably
     # used as a callable that is expected to raise - confirm with the caller
     unsupported = f
     hash_pandas_object(unsupported)
Beispiel #22
0
 def time_frame(self):
     """Hash ``self.df`` with ``hash_pandas_object``; the result is discarded."""
     frame = self.df
     hash_pandas_object(frame)
Beispiel #23
0
 def time_series_int(self):
     """Hash column ``E`` of ``self.df``; the result is discarded."""
     column = self.df.E
     hash_pandas_object(column)
Beispiel #24
0
 def time_series_string(self):
     """Hash column ``B`` of ``self.df``; the result is discarded."""
     column = self.df.B
     hash_pandas_object(column)
Beispiel #25
0
 def f():
     """Hash the Series 'a', 'b', 'c' using the hash key ``'foo'``."""
     # NOTE(review): 'foo' is much shorter than the 16-character keys used
     # elsewhere; presumably this call is expected to raise - confirm
     letters = Series(list('abc'))
     hash_pandas_object(letters, hash_key='foo')
Beispiel #26
0
 def time_series_categorical(self):
     """Hash column ``C`` of ``self.df``; the result is discarded."""
     column = self.df.C
     hash_pandas_object(column)
Beispiel #27
0
 def test_multiindex_unique(self):
     """A unique MultiIndex must produce unique hashes."""
     pairs = [(118, 472), (236, 118), (51, 204), (102, 51)]
     unique_mi = MultiIndex.from_tuples(pairs)
     self.assertTrue(unique_mi.is_unique)

     # the hashed Series must have no duplicate values either
     hashed = hash_pandas_object(unique_mi)
     self.assertTrue(hashed.is_unique)
Beispiel #28
0
 def test_multiindex_unique(self):
     """Hashing a MultiIndex with unique entries must not create duplicates."""
     index = MultiIndex.from_tuples(
         [(118, 472), (236, 118), (51, 204), (102, 51)])

     # precondition: the index itself is unique
     self.assertTrue(index.is_unique)

     # postcondition: so are its hashes
     self.assertTrue(hash_pandas_object(index).is_unique)
Beispiel #29
0
 def f():
     """Pass this function itself to ``hash_pandas_object``."""
     # NOTE(review): a plain function is not an Index/Series/DataFrame;
     # presumably the caller expects this to raise - confirm
     not_pandas_data = f
     hash_pandas_object(not_pandas_data)
Beispiel #30
0
 def f():
     """Call ``hash_pandas_object`` on a small string Series with key 'foo'."""
     # NOTE(review): presumably exercised for the error raised by the
     # 3-character hash key - confirm against the caller
     data = Series(list('abc'))
     hash_pandas_object(data, hash_key='foo')