Example #1
def test_multiindex_objects():
    mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]],
                    codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
                    names=["col1", "col2"])
    recons = mi._sort_levels_monotonic()

    # These are equal.
    assert mi.equals(recons)
    assert Index(mi.values).equals(Index(recons.values))

    # _hashed_values and hash_pandas_object(..., index=False) equivalency.
    expected = hash_pandas_object(mi, index=False).values
    result = mi._hashed_values

    tm.assert_numpy_array_equal(result, expected)

    expected = hash_pandas_object(recons, index=False).values
    result = recons._hashed_values

    tm.assert_numpy_array_equal(result, expected)

    expected = mi._hashed_values
    result = recons._hashed_values

    # Values should match, but in different order.
    tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
Example #2
def test_hash_pandas_object(obj):
    a = hash_pandas_object(obj)
    b = hash_pandas_object(obj)
    if isinstance(a, np.ndarray):
        np.testing.assert_equal(a, b)
    else:
        assert_eq(a, b)
Example #3
def test_multiindex_objects():
    mi = MultiIndex(
        levels=[["b", "d", "a"], [1, 2, 3]],
        codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
        names=["col1", "col2"],
    )
    recons = mi._sort_levels_monotonic()

    # These are equal.
    assert mi.equals(recons)
    assert Index(mi.values).equals(Index(recons.values))

    # _hashed_values and hash_pandas_object(..., index=False) equivalency.
    expected = hash_pandas_object(mi, index=False).values
    result = mi._hashed_values

    tm.assert_numpy_array_equal(result, expected)

    expected = hash_pandas_object(recons, index=False).values
    result = recons._hashed_values

    tm.assert_numpy_array_equal(result, expected)

    expected = mi._hashed_values
    result = recons._hashed_values

    # Values should match, but in different order.
    tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
Example #4
def test_object_missing_values():
    # Check that the presence of missing values doesn't change how object dtype
    # is hashed.
    s = pd.Series(['a', 'b', 'c', None])
    h1 = hash_pandas_object(s).iloc[:3]
    h2 = hash_pandas_object(s.iloc[:3])
    tm.assert_series_equal(h1, h2)
Example #5
def test_hash_pandas_object(obj):
    a = hash_pandas_object(obj)
    b = hash_pandas_object(obj)
    if isinstance(a, np.ndarray):
        np.testing.assert_equal(a, b)
    else:
        assert_eq(a, b)
Example #6
    def save(self, filename=None):
        """Save the current recommender.

        :param filename: string or None
            Name of the file to save to; defaults to self.serialized_rec_path.
        """
        if filename is None:
            fn = self.serialized_rec_path
        else:
            fn = filename
        if os.path.isfile(fn):
            logger.warning('overwriting ' + fn)

        save_dict = copy.deepcopy(self.__dict__)

        # remove results_df to save space. this gets loaded by load() fn.
        if 'results_df' in save_dict.keys():
            logger.debug('deleting save_dict[results_df]:' +
                         str(save_dict['results_df'].head()))
            rowHashes = hash_pandas_object(save_dict['results_df']).values
            save_dict['results_df_hash'] = hashlib.sha256(
                rowHashes).hexdigest()
            del save_dict['results_df']

        # remove ml_p to save space
        rowHashes = hash_pandas_object(save_dict['_ml_p'].apply(str)).values
        save_dict['ml_p_hash'] = hashlib.sha256(rowHashes).hexdigest()
        del save_dict['_ml_p']
        del save_dict['mlp_combos']

        logger.info('saving recommender as ' + fn)
        f = gzip.open(fn, 'wb')
        pickle.dump(save_dict, f, 2)
        f.close()
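For reference, the row-hash-then-SHA-256 bookkeeping that save() applies to results_df can be sketched on its own. A minimal standalone sketch; the frame_digest name and the toy DataFrame are illustrative, not part of the recommender code.

import hashlib

import pandas as pd
from pandas.util import hash_pandas_object

def frame_digest(df):
    # hash each row to a uint64, then fold the row hashes into a single hex digest
    row_hashes = hash_pandas_object(df, index=False).values
    return hashlib.sha256(row_hashes).hexdigest()

toy = pd.DataFrame({"algorithm": ["rf", "svm"], "score": [0.91, 0.88]})
print(frame_digest(toy))  # deterministic for identical content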
Example #7
    def test_multiindex_objects(self):
        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                        names=['col1', 'col2'])
        recons = mi._sort_levels_monotonic()

        # these are equal
        assert mi.equals(recons)
        assert Index(mi.values).equals(Index(recons.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency
        expected = hash_pandas_object(mi, index=False).values
        result = mi._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = hash_pandas_object(recons, index=False).values
        result = recons._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = mi._hashed_values
        result = recons._hashed_values

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
Example #8
def test_object_missing_values():
    # Check that the presence of missing values doesn't change how object dtype
    # is hashed.
    s = pd.Series(["a", "b", "c", None])
    h1 = hash_pandas_object(s).iloc[:3]
    h2 = hash_pandas_object(s.iloc[:3])
    tm.assert_series_equal(h1, h2)
Example #9
    def test_drift_detector_lightgbm(self):
        df = load_bank()
        y = df.pop('y')
        X_train, X_test = train_test_split(df.copy(), train_size=0.7, shuffle=True, random_state=9527)
        dd = DriftDetector()
        dd.fit(X_train, X_test)

        assert len(dd.feature_names_) == 17
        assert len(dd.feature_importances_) == 17
        assert dd.auc_
        assert len(dd.estimator_) == 5

        proba = dd.predict_proba(df)
        assert proba.shape[0] == df.shape[0]

        df = load_bank()
        y = df.pop('y')
        p = int(df.shape[0] * 0.2)
        X_train, X_test, y_train, y_test = dd.train_test_split(df.copy(), y, test_size=0.2)
        assert X_train.shape == (df.shape[0] - p, df.shape[1])
        assert y_train.shape == (df.shape[0] - p,)
        assert X_test.shape == (p, df.shape[1])
        assert y_test.shape == (p,)

        df['y'] = y
        X_train['y'] = y_train
        X_test['y'] = y_test
        df_split = pd.concat([X_train, X_test])
        df_hash = hash_pandas_object(df).sort_values()
        splitted_hash = hash_pandas_object(df_split).sort_values()
        assert (df_hash == splitted_hash).all()
Example #10
    def test_drift_detector_split(self):
        df = dd.from_pandas(load_bank(), npartitions=2)
        y = df.pop('y')
        X_train, X_test = DaskToolBox.train_test_split(df.copy(), train_size=0.7, shuffle=True, random_state=9527)
        ddr = dd_selector().get_detector()
        ddr.fit(X_train, X_test)

        assert len(ddr.feature_names_) == 17
        assert len(ddr.feature_importances_) == 17
        assert ddr.auc_
        assert len(ddr.estimator_) == 5

        proba = ddr.predict_proba(df)
        assert proba.compute().shape[0] == len(df)

        df = dd.from_pandas(load_bank(), npartitions=2)
        y = df.pop('y')
        p = int(len(df) * 0.2)
        X_train, X_test, y_train, y_test = ddr.train_test_split(df.copy(), y, test_size=0.2, remain_for_train=0.)

        df, X_train, X_test, y_train, y_test = DaskToolBox.compute(df, X_train, X_test, y_train, y_test)
        assert X_train.shape == (df.shape[0] - p, df.shape[1])
        assert y_train.shape == (df.shape[0] - p,)
        assert X_test.shape == (p, df.shape[1])
        assert y_test.shape == (p,)

        df['y'] = y
        X_train['y'] = y_train
        X_test['y'] = y_test
        df_split = pd.concat([X_train, X_test])
        df_hash = hash_pandas_object(df).sort_values()
        splitted_hash = hash_pandas_object(df_split).sort_values()
        assert (df_hash == splitted_hash).all()
Example #11
def test_hash_with_tuple():
    # GH#28969 array containing a tuple raises on call to arr.astype(str)
    #  apparently a numpy bug github.com/numpy/numpy/issues/9441

    df = DataFrame({"data": [tuple("1"), tuple("2")]})
    result = hash_pandas_object(df)
    expected = Series([10345501319357378243, 8331063931016360761],
                      dtype=np.uint64)
    tm.assert_series_equal(result, expected)

    df2 = DataFrame({"data": [(1, ), (2, )]})
    result = hash_pandas_object(df2)
    expected = Series([9408946347443669104, 3278256261030523334],
                      dtype=np.uint64)
    tm.assert_series_equal(result, expected)

    # require that the elements of such tuples are themselves hashable

    df3 = DataFrame({"data": [
        (
            1,
            [],
        ),
        (
            2,
            {},
        ),
    ]})
    with pytest.raises(TypeError, match="unhashable type: 'list'"):
        hash_pandas_object(df3)
Example #12
    def test_multiindex_objects(self):
        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                        names=['col1', 'col2'])
        recons = mi._sort_levels_monotonic()

        # these are equal
        assert mi.equals(recons)
        assert Index(mi.values).equals(Index(recons.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency
        expected = hash_pandas_object(
            mi, index=False).values
        result = mi._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = hash_pandas_object(
            recons, index=False).values
        result = recons._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = mi._hashed_values
        result = recons._hashed_values

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(result),
                                    np.sort(expected))
Example #13
def identify_compatible_groups(dataframes_with_metadata):
    already_classified = set()
    compatible_groups = []

    for t1, path1, md1 in dataframes_with_metadata:
        # these local variables are for this one view
        compatible_group = [path1]
        hashes1 = hash_pandas_object(t1, index=False)
        ht1 = hashes1.sum()
        if path1 in already_classified:
            continue
        for t2, path2, md2 in dataframes_with_metadata:
            if path1 == path2:  # same table
                continue
            # if t2 is in remove group
            if path2 in already_classified:
                continue
            hashes2 = hash_pandas_object(t2, index=False)
            ht2 = hashes2.sum()

            # are views compatible
            if ht1 == ht2:
                compatible_group.append(path2)
                already_classified.add(path1)
                already_classified.add(path2)
        # if len(compatible_group) > 1:
        #  cannot check this condition because now all views are analyzed from compatible groups
        compatible_groups.append(compatible_group)
    return compatible_groups
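A toy run of identify_compatible_groups above (the view names and frames are made up; the function and its hash_pandas_object import are assumed to be in scope). Frames with identical content have equal row-hash sums and end up in the same group.

import pandas as pd

view_a = pd.DataFrame({"x": [1, 2], "y": ["a", "b"]})
view_b = view_a.copy()                        # identical content, identical hash sum
view_c = pd.DataFrame({"x": [9], "y": ["z"]})

groups = identify_compatible_groups([
    (view_a, "view_a.csv", None),
    (view_b, "view_b.csv", None),
    (view_c, "view_c.csv", None),
])
print(groups)  # expected: [['view_a.csv', 'view_b.csv'], ['view_c.csv']]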
Example #14
    def test_pandas_errors(self):
        with pytest.raises(TypeError):
            hash_pandas_object(pd.Timestamp('20130101'))

        obj = tm.makePanel()

        with pytest.raises(TypeError):
            hash_pandas_object(obj)
Example #15
    def test_pandas_errors(self):
        with pytest.raises(TypeError):
            hash_pandas_object(pd.Timestamp('20130101'))

        obj = tm.makePanel()

        with pytest.raises(TypeError):
            hash_pandas_object(obj)
Example #16
    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)
Example #17
    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            if len(obj):
                assert not (a == b).all()
Example #18
    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all()
Example #19
    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            if len(obj):
                assert not (a == b).all()
Example #20
    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)
Example #21
    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all()
Example #22
    def test_pandas_errors(self):
        with pytest.raises(TypeError):
            hash_pandas_object(pd.Timestamp('20130101'))

        with catch_warnings(record=True):
            obj = tm.makePanel()

        with pytest.raises(TypeError):
            hash_pandas_object(obj)
Example #23
def test_df_hash_keys():
    # DataFrame version of the test_hash_keys.
    # https://github.com/pandas-dev/pandas/issues/41404
    obj = DataFrame({"x": np.arange(3), "y": list("abc")})

    a = hash_pandas_object(obj, hash_key="9876543210123456")
    b = hash_pandas_object(obj, hash_key="9876543210123465")

    assert (a != b).all()
Example #24
def summarize_views_and_find_candidate_complementary(dataframes_with_metadata):
    already_processed_complementary_pairs = set()

    contained_groups = []
    candidate_complementary_groups = []

    for df1, path1, md1 in dataframes_with_metadata:
        # these local variables are for this one view
        contained_group = [path1]

        hashes1_list = hash_pandas_object(
            df1, index=False)  # we only consider content
        hashes1_set = set(hashes1_list)
        for df2, path2, md2 in dataframes_with_metadata:
            if path1 == path2:  # same table
                continue
            hashes2_list = hash_pandas_object(df2, index=False)
            hashes2_set = set(hashes2_list)
            # are views potentially contained
            if len(hashes1_set) > len(hashes2_set):
                # is t2 contained in t1?
                if len(hashes2_set - hashes1_set) == 0:
                    contained_group.append(path2)
            else:
                if (path1 + "%%%" + path2) in already_processed_complementary_pairs\
                        or (path2 + "%%%" + path1) in already_processed_complementary_pairs:
                    continue  # already processed, skip computation
                # Verify that views are potentially complementary
                s12 = (hashes1_set - hashes2_set)
                s1_complement = set()
                if len(s12) > 0:
                    s1_complement.update((s12))
                s21 = (hashes2_set - hashes1_set)
                s2_complement = set()
                if len(s21) > 0:
                    s2_complement.update((s21))
                # both difference sets are non-empty; otherwise it is a containment relationship
                if len(s1_complement) > 0 and len(s2_complement) > 0:
                    idx1 = [
                        idx for idx, value in enumerate(hashes1_list)
                        if value in s1_complement
                    ]
                    idx2 = [
                        idx for idx, value in enumerate(hashes2_list)
                        if value in s2_complement
                    ]
                    candidate_complementary_groups.append(
                        (df1, md1, path1, idx1, df2, md2, path2, idx2))
                    already_processed_complementary_pairs.add(
                        (path1 + "%%%" + path2))
                    already_processed_complementary_pairs.add(
                        (path2 + "%%%" + path1))
        if len(contained_group) > 1:
            contained_groups.append(contained_group)
    return contained_groups, candidate_complementary_groups
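The containment test above reduces to set operations on per-row hashes. A minimal self-contained illustration with toy frames (big and small are not from the original code):

import pandas as pd
from pandas.util import hash_pandas_object

big = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
small = big.iloc[:2]  # every row of small also appears in big

h_big = set(hash_pandas_object(big, index=False))
h_small = set(hash_pandas_object(small, index=False))

print(h_small - h_big)  # set() -> small is row-wise contained in big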
Example #25
def test_hash_keys():
    # Using different hash keys, should have
    # different hashes for the same data.
    #
    # This only matters for object dtypes.
    obj = Series(list("abc"))

    a = hash_pandas_object(obj, hash_key="9876543210123456")
    b = hash_pandas_object(obj, hash_key="9876543210123465")

    assert (a != b).all()
Example #26
def test_hash_keys():
    # Using different hash keys, should have
    # different hashes for the same data.
    #
    # This only matters for object dtypes.
    obj = Series(list("abc"))

    a = hash_pandas_object(obj, hash_key="9876543210123456")
    b = hash_pandas_object(obj, hash_key="9876543210123465")

    assert (a != b).all()
Example #27
def test_deprecation():
    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        from pandas.tools.hashing import hash_pandas_object
        obj = Series(list('abc'))
        hash_pandas_object(obj, hash_key='9876543210123456')

    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        from pandas.tools.hashing import hash_array
        obj = np.array([1, 2, 3])
        hash_array(obj, hash_key='9876543210123456')
Example #28
def test_df_encoding():
    # Check that DataFrame recognizes optional encoding.
    # https://github.com/pandas-dev/pandas/issues/41404
    # https://github.com/pandas-dev/pandas/pull/42049
    obj = DataFrame({"x": np.arange(3), "y": list("a+c")})

    a = hash_pandas_object(obj, encoding="utf8")
    b = hash_pandas_object(obj, encoding="utf7")

    # Note that the "+" is encoded as "+-" in utf-7.
    assert a[0] == b[0]
    assert a[1] != b[1]
    assert a[2] == b[2]
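A quick standard-library check of the comment above: UTF-7 escapes the plus sign, so only the row containing "+" hashes differently between the two encodings.

print("a+c".encode("utf8"))  # b'a+c'
print("a+c".encode("utf7"))  # b'a+-c', different bytes, hence a different hash for that row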
Example #29
    def test_categorical_consistency(self, s1, categorize):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))

        # These should all hash identically
        h1 = hash_pandas_object(s1, categorize=categorize)
        h2 = hash_pandas_object(s2, categorize=categorize)
        h3 = hash_pandas_object(s3, categorize=categorize)
        tm.assert_series_equal(h1, h2)
        tm.assert_series_equal(h1, h3)
Example #30
def test_deprecation():

    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        from pandas.tools.hashing import hash_pandas_object
        obj = Series(list('abc'))
        hash_pandas_object(obj, hash_key='9876543210123456')

    with tm.assert_produces_warning(DeprecationWarning,
                                    check_stacklevel=False):
        from pandas.tools.hashing import hash_array
        obj = np.array([1, 2, 3])
        hash_array(obj, hash_key='9876543210123456')
Example #31
    def test_categorical_consistency(self, s1, categorize):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))

        # These should all hash identically
        h1 = hash_pandas_object(s1, categorize=categorize)
        h2 = hash_pandas_object(s2, categorize=categorize)
        h3 = hash_pandas_object(s3, categorize=categorize)
        tm.assert_series_equal(h1, h2)
        tm.assert_series_equal(h1, h3)
Example #32
def _check_equal(obj, **kwargs):
    """
    Check that hashing an object produces the same value each time.

    Parameters
    ----------
    obj : object
        The object to hash.
    kwargs : kwargs
        Keyword arguments to pass to the hashing function.
    """
    a = hash_pandas_object(obj, **kwargs)
    b = hash_pandas_object(obj, **kwargs)
    tm.assert_series_equal(a, b)
Example #33
def _check_equal(obj, **kwargs):
    """
    Check that hashing an object produces the same value each time.

    Parameters
    ----------
    obj : object
        The object to hash.
    kwargs : kwargs
        Keyword arguments to pass to the hashing function.
    """
    a = hash_pandas_object(obj, **kwargs)
    b = hash_pandas_object(obj, **kwargs)
    tm.assert_series_equal(a, b)
Example #34
    def hash_pandas_object(obj,
                           index=True,
                           encoding='utf8',
                           hash_key=None,
                           categorize=True):
        if hash_key is None:
            hash_key = _default_hash_key

        def adder(h, hashed_to_add):
            h = np.multiply(h, np.uint(3), h)
            return np.add(h, hashed_to_add, h)

        if isinstance(obj, pd.Index):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            h = pd.Series(h, index=obj, dtype='uint64')
        elif isinstance(obj, pd.Series):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            if index:
                h = adder(
                    h,
                    hash_pandas_object(obj.index,
                                       index=False,
                                       encoding=encoding,
                                       hash_key=hash_key,
                                       categorize=categorize).values)
            h = pd.Series(h, index=obj.index, dtype='uint64')
        elif isinstance(obj, pd.DataFrame):
            cols = obj.iteritems()
            first_series = next(cols)[1]
            h = hash_array(first_series.values, encoding, hash_key,
                           categorize).astype('uint64')
            for _, col in cols:
                h = adder(
                    h, hash_array(col.values, encoding, hash_key, categorize))
            if index:
                h = adder(
                    h,
                    hash_pandas_object(obj.index,
                                       index=False,
                                       encoding=encoding,
                                       hash_key=hash_key,
                                       categorize=categorize).values)

            h = pd.Series(h, index=obj.index, dtype='uint64')
        else:
            raise TypeError("Unexpected type for hashing %s" % type(obj))
        return h
Example #35
def test_categorical_consistency():
    # Check that categoricals hash consistent with their values, not codes
    # This should work for categoricals of any dtype
    for s1 in [pd.Series(['a', 'b', 'c', 'd']),
               pd.Series([1000, 2000, 3000, 4000]),
               pd.Series(pd.date_range(0, periods=4))]:
        s2 = s1.astype('category').cat.set_categories(s1)
        s3 = s2.cat.set_categories(list(reversed(s1)))
        for categorize in [True, False]:
            # These should all hash identically
            h1 = hash_pandas_object(s1, categorize=categorize)
            h2 = hash_pandas_object(s2, categorize=categorize)
            h3 = hash_pandas_object(s3, categorize=categorize)
            tm.assert_series_equal(h1, h2)
            tm.assert_series_equal(h1, h3)
Example #36
def _check_not_equal_with_index(obj):
    """
    Check the hash of an object with and without its index is not the same.

    Parameters
    ----------
    obj : object
        The object to hash.
    """
    if not isinstance(obj, Index):
        a = hash_pandas_object(obj, index=True)
        b = hash_pandas_object(obj, index=False)

        if len(obj):
            assert not (a == b).all()
Example #37
def test_hashable_tuple_args():
    # require that the elements of such tuples are themselves hashable

    df3 = DataFrame({"data": [
        (
            1,
            [],
        ),
        (
            2,
            {},
        ),
    ]})
    with pytest.raises(TypeError, match="unhashable type: 'list'"):
        hash_pandas_object(df3)
Example #38
def _check_not_equal_with_index(obj):
    """
    Check the hash of an object with and without its index is not the same.

    Parameters
    ----------
    obj : object
        The object to hash.
    """
    if not isinstance(obj, Index):
        a = hash_pandas_object(obj, index=True)
        b = hash_pandas_object(obj, index=False)

        if len(obj):
            assert not (a == b).all()
Example #39
def test_multiindex_unique():
    mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                 (51, 204), (102, 51)])
    assert mi.is_unique is True

    result = hash_pandas_object(mi)
    assert result.is_unique is True
Example #40
    def evaluate(self):
        """ Use cryptocomapre API to evaluate portfolio in given currency """
        # load backup file if exist and compare hash of previous p_raw to current p_raw before getting historical data cryptocompare API (slow)

        # create a tmp dir in the same folder as the ledger csv
        tmp_dir = os.path.dirname(os.path.abspath(
            self.ledger.csv_db_path)) + '/tmp'
        if not os.path.exists(
                tmp_dir):  # create the tmp directory if not existing
            os.makedirs(tmp_dir)

        file_path = tmp_dir + '/p_eval_' + self.eval_symbol + '.pkl'
        hash_path = tmp_dir + '/p_raw_hash_' + self.eval_symbol
        current_hash = str(hash_pandas_object(self.p_raw).sum())
        old_hash = ''
        is_loaded = True
        # NOTE: a better way would be to check whether the path is valid (although that doesn't guarantee it can be opened)
        try:
            self.p_eval = pd.read_pickle(
                file_path)  # try to load the previous file back
            with open(hash_path) as f:
                old_hash = f.read()
        except IOError:
            # If that fails, evaluate the portfolio and save the file afterwards
            is_loaded = False

        if not is_loaded or old_hash != current_hash:  # file was loaded but is not up to date (note that new transactions only show up on the next day)
            self.p_eval = self.p_raw.apply(Portfolio.valuation,
                                           args=(self.eval_symbol, ))
            self.p_eval.to_pickle(file_path)
            with open(hash_path, "w") as f:
                f.write(current_hash)

        return self.p_eval
Example #41
    def test_categorical_consistency(self):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        for s1 in [Series(['a', 'b', 'c', 'd']),
                   Series([1000, 2000, 3000, 4000]),
                   Series(pd.date_range(0, periods=4))]:
            s2 = s1.astype('category').cat.set_categories(s1)
            s3 = s2.cat.set_categories(list(reversed(s1)))
            for categorize in [True, False]:
                # These should all hash identically
                h1 = hash_pandas_object(s1, categorize=categorize)
                h2 = hash_pandas_object(s2, categorize=categorize)
                h3 = hash_pandas_object(s3, categorize=categorize)
                tm.assert_series_equal(h1, h2)
                tm.assert_series_equal(h1, h3)
Example #42
def compute_hll_array(obj, b):
    # b is the number of bits

    if not 8 <= b <= 16:
        raise ValueError("b should be between 8 and 16")
    num_bits_discarded = 32 - b
    m = 1 << b

    # Get an array of the hashes
    hashes = hash_pandas_object(obj, index=False)
    if isinstance(hashes, pd.Series):
        hashes = hashes._values
    hashes = hashes.astype(np.uint32)

    # Of the first b bits, which is the first nonzero?
    j = hashes >> num_bits_discarded
    first_bit = compute_first_bit(hashes)

    # Pandas can do the max aggregation
    df = pd.DataFrame({"j": j, "first_bit": first_bit})
    series = df.groupby("j").max()["first_bit"]

    # Return a dense array so we can concat them and get a result
    # that is easy to deal with
    return series.reindex(np.arange(m), fill_value=0).values.astype(np.uint8)
Example #43
    def __init__(self, args):
        super(RecoModelRTAE, self).__init__()

        self.cpu_device = torch.device('cpu')
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.device = self.cpu_device

        self.args = args
        self.run_name = f"{args['algorithm']}_{args['P']}_{args['batch_size']}_{args['K']}_{datetime.datetime.now()}"
        self.use_em = args['use_em']

        # Dataset stuff
        if isinstance(args['dataset'], str):

            self.data_hash = None

            fn_train = args['dataset'] + '/train.csv'
            fn_test = args['dataset'] + '/test.csv'
            self.dataset = PandasDataset(args['P'],
                                         pd.read_csv(fn_train),
                                         testing=True)
            self.dataset_test = PandasDataset(args['P'],
                                              pd.read_csv(fn_test),
                                              testing=True)

        else:  # support the direct insertion of a dataframe into the dataset
            self.dataset = PandasDataset(args['P'],
                                         args['dataset'],
                                         testing=True)
            self.data_hash = hash_pandas_object(args['dataset'].v).sum()
Example #44
def isotherm_to_hash(isotherm):
    """
    Convert an isotherm object to a unique hash.

    Parameters
    ----------
    isotherm : PointIsotherm
        Isotherm to be hashed.

    Returns
    -------
    str
        A string with the Isotherm hash.
    """
    # Isotherm properties
    raw_dict = isotherm.to_dict()

    # Isotherm data or model
    if isinstance(isotherm, pygaps.PointIsotherm):
        raw_dict["data_hash"] = str(
            hash_pandas_object(isotherm.data_raw.round(8)).sum())
    elif isinstance(isotherm, pygaps.ModelIsotherm):
        raw_dict["data_hash"] = isotherm.model.to_dict()

    md_hasher = hashlib.md5(
        json.dumps(raw_dict, sort_keys=True).encode('utf-8'))

    return md_hasher.hexdigest()
Example #45
def compute_hll_array(obj, b):
    # b is the number of bits

    if not 8 <= b <= 16:
        raise ValueError('b should be between 8 and 16')
    num_bits_discarded = 32 - b
    m = 1 << b

    # Get an array of the hashes
    hashes = hash_pandas_object(obj, index=False)
    if isinstance(hashes, pd.Series):
        hashes = hashes._values
    hashes = hashes.astype(np.uint32)

    # Of the first b bits, which is the first nonzero?
    j = hashes >> num_bits_discarded
    first_bit = compute_first_bit(hashes)

    # Pandas can do the max aggregation
    df = pd.DataFrame({'j': j, 'first_bit': first_bit})
    series = df.groupby('j').max()['first_bit']

    # Return a dense array so we can concat them and get a result
    # that is easy to deal with
    return series.reindex(np.arange(m), fill_value=0).values.astype(np.uint8)
Example #46
def _hash_array_like_obj_as_bytes(data):
    """
    Helper method to convert pandas dataframe/numpy array/list into bytes for
    MD5 calculation purpose.
    """
    from pandas.util import hash_pandas_object
    import numpy as np
    import pandas as pd

    if isinstance(data, pd.DataFrame):
        # check `'pyspark' in sys.modules` to avoid importing pyspark when the
        # user runs code that is not related to pyspark.
        if "pyspark" in sys.modules:
            from pyspark.ml.linalg import Vector as spark_vector_type
        else:
            spark_vector_type = None

        def _hash_array_like_element_as_bytes(v):
            if spark_vector_type is not None:
                if isinstance(v, spark_vector_type):
                    return _hash_ndarray_as_bytes(v.toArray())
            if isinstance(v, np.ndarray):
                return _hash_ndarray_as_bytes(v)
            if isinstance(v, list):
                return _hash_ndarray_as_bytes(np.array(v))
            return v

        data = data.applymap(_hash_array_like_element_as_bytes)
        return _hash_uint64_ndarray_as_bytes(hash_pandas_object(data))
    elif isinstance(data, np.ndarray):
        return _hash_ndarray_as_bytes(data)
    elif isinstance(data, list):
        return _hash_ndarray_as_bytes(np.array(data))
    else:
        raise ValueError("Unsupported data type.")
Example #47
    def test_hash_tuples(self):
        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
        result = hash_tuples(tups)
        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
        tm.assert_numpy_array_equal(result, expected)

        result = hash_tuples(tups[0])
        assert result == expected[0]
Example #48
    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)
Example #49
def test_hash_tuples():
    tuples = [(1, "one"), (1, "two"), (2, "one")]
    result = hash_tuples(tuples)

    expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
    tm.assert_numpy_array_equal(result, expected)

    result = hash_tuples(tuples[0])
    assert result == expected[0]
Example #50
    def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                           categorize=True):
        if hash_key is None:
            hash_key = _default_hash_key

        def adder(h, hashed_to_add):
            h = np.multiply(h, np.uint(3), h)
            return np.add(h, hashed_to_add, h)

        if isinstance(obj, pd.Index):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            h = pd.Series(h, index=obj, dtype='uint64')
        elif isinstance(obj, pd.Series):
            h = hash_array(obj.values, encoding, hash_key,
                           categorize).astype('uint64')
            if index:
                h = adder(h, hash_pandas_object(obj.index,
                                                index=False,
                                                encoding=encoding,
                                                hash_key=hash_key,
                                                categorize=categorize).values)
            h = pd.Series(h, index=obj.index, dtype='uint64')
        elif isinstance(obj, pd.DataFrame):
            cols = obj.iteritems()
            first_series = next(cols)[1]
            h = hash_array(first_series.values, encoding,
                           hash_key, categorize).astype('uint64')
            for _, col in cols:
                h = adder(h, hash_array(col.values, encoding, hash_key,
                                        categorize))
            if index:
                h = adder(h, hash_pandas_object(obj.index,
                                                index=False,
                                                encoding=encoding,
                                                hash_key=hash_key,
                                                categorize=categorize).values)

            h = pd.Series(h, index=obj.index, dtype='uint64')
        else:
            raise TypeError("Unexpected type for hashing %s" % type(obj))
        return h
Example #51
def partitioning_index(df, npartitions):
    """
    Computes a deterministic index mapping each record to a partition.

    Identical rows are mapped to the same partition.

    Parameters
    ----------
    df : DataFrame/Series/Index
    npartitions : int
        The number of partitions to group into.

    Returns
    -------
    partitions : ndarray
        An array of int64 values mapping each record to a partition.
    """
    return hash_pandas_object(df, index=False) % int(npartitions)
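A quick usage sketch for partitioning_index (toy data; the function above and its hash_pandas_object import are assumed in scope): identical rows hash identically and therefore map to the same partition.

import pandas as pd

df = pd.DataFrame({"k": ["a", "b", "a"], "v": [1, 2, 1]})
parts = partitioning_index(df, npartitions=4)
print(parts.tolist())  # rows 0 and 2 have the same content, so their partition ids match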
Example #52
def shuffle_group(df, col, stage, k, npartitions):
    """ Splits dataframe into groups

    The group is determined by each row's final partition and by which stage
    of the shuffle we are in.

    Parameters
    ----------
    df: DataFrame
    col: str
        Column name on which to split the dataframe
    stage: int
        We shuffle dataframes with many partitions in a few stages to avoid
        a quadratic number of tasks.  This number corresponds to which stage
        we're in, starting from zero up to some small integer
    k: int
        Desired number of splits from this dataframe
    npartitions: int
        Total number of output partitions for the full dataframe

    Returns
    -------
    out: Dict[int, DataFrame]
        A dictionary mapping integers in {0..k} to dataframes such that the
        hash values of ``df[col]`` are well partitioned.
    """
    if col == '_partitions':
        ind = df[col]
    else:
        ind = hash_pandas_object(df[col], index=False)

    c = ind._values
    typ = np.min_scalar_type(npartitions * 2)

    c = np.mod(c, npartitions).astype(typ, copy=False)
    np.floor_divide(c, k ** stage, out=c)
    np.mod(c, k, out=c)

    indexer, locations = groupsort_indexer(c.astype(np.int64), k)
    df2 = df.take(indexer)
    locations = locations.cumsum()
    parts = [df2.iloc[a:b] for a, b in zip(locations[:-1], locations[1:])]

    return dict(zip(range(k), parts))
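The staged bucketing above can be illustrated without dask: the final partition id is a content hash modulo npartitions, and each stage consumes one base-k digit of that id. A self-contained sketch with toy values (not dask's API):

import numpy as np
import pandas as pd
from pandas.util import hash_pandas_object

s = pd.Series(["a", "b", "c", "d", "e", "f"])
npartitions, k = 16, 4

final = (hash_pandas_object(s, index=False).values % npartitions).astype(np.int64)
stage0 = final // k ** 0 % k  # low base-4 digit, used for splitting in stage 0
stage1 = final // k ** 1 % k  # high base-4 digit, used for splitting in stage 1

print(list(zip(final, stage0, stage1)))  # final == stage1 * k + stage0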
Example #53
def test_pandas_errors(obj):
    msg = "Unexpected type for hashing"
    with pytest.raises(TypeError, match=msg):
        hash_pandas_object(obj)
Example #54
def test_invalid_key():
    # This only matters for object dtypes.
    msg = "key should be a 16-byte string encoded"

    with pytest.raises(ValueError, match=msg):
        hash_pandas_object(Series(list("abc")), hash_key="foo")
Example #55
    def f():
        hash_pandas_object(Series(list('abc')), hash_key='foo')
Example #56
    def test_invalid_key(self):
        # this only matters for object dtypes
        msg = 'key should be a 16-byte string encoded'
        with tm.assert_raises_regex(ValueError, msg):
            hash_pandas_object(Series(list('abc')), hash_key='foo')