Example #1
    def test_copy_from_csv_once(self):
        """We'd rather just convert csv to parquet or whatever one time;
        this test makes sure that that works"""
        dfsdir = 'dfs_test_copy_once'
        dfs_orig = dfset.make_dfset(dfsdir=dfsdir,
                                    filetype='parquet',
                                    csvsdir=MOCK_IN_DIR)
        dfs2 = dfset.make_dfset(dfsdir=dfsdir, filetype='parquet')
        assert dfs2.equals(dfs_orig)

        del dfs_orig
        dfs3 = dfset.make_dfset(dfsdir=dfsdir, filetype='parquet')
        assert dfs2.equals(dfs3)
        del dfs2
        dfs4 = dfset.make_dfset(dfsdir=dfsdir, filetype='parquet')
        assert dfs3.equals(dfs4)
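
To make the caching idea concrete, here is a minimal standalone sketch of the convert-once behavior this test exercises. The helper name convert_csvs_once is made up, and the idea that make_dfset skips conversion when converted files already exist is inferred from the test, not taken from the library's internals:

import os

import pandas as pd


def convert_csvs_once(csvsdir, dfsdir):
    """Convert each csv to parquet, skipping files converted on a prior call."""
    os.makedirs(dfsdir, exist_ok=True)
    for fname in os.listdir(csvsdir):
        if not fname.endswith('.csv'):
            continue
        outpath = os.path.join(dfsdir, fname[:-len('.csv')] + '.parquet')
        if os.path.exists(outpath):
            continue  # already converted; this is what makes dfs2 cheap above
        pd.read_csv(os.path.join(csvsdir, fname)).to_parquet(outpath)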
Example #2
    def _test_filetype(self, filetype):
        dfs = dfset.make_dfset(filetype=filetype, csvsdir=MOCK_IN_DIR)

        encs = [codec.Debug(), codec.Delta()]

        sizes_df_orig, sizes_df_comp = sq.encode_measure_decode(dfs, encs)
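
For reference, delta coding itself is just differencing; the sketch below shows the textbook transform in numpy. Treat it as an assumption about what codec.Delta does, not its actual implementation:

import numpy as np


def delta_encode(x):
    out = np.empty_like(x)
    out[0] = x[0]          # keep the first value as-is
    out[1:] = np.diff(x)   # store successive differences
    return out


def delta_decode(deltas):
    return np.cumsum(deltas)  # a running sum inverts the differencing


x = np.array([100, 101, 103, 106, 110], dtype=np.int64)
assert np.array_equal(delta_decode(delta_encode(x)), x)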
Example #3
def main():
    _populate_mock_input_dir()

    pipelines = []

    # just quantize
    pipelines.append([])
    # quantize and bz2
    pipelines.append([codec.Bzip2()])
    # quantize, byteshuf, bz2
    pipelines.append([codec.ByteShuffle(), codec.Bzip2()])
    # quantize + {dynamic,double,plain}-delta code, with and without byteshuf
    pipelines.append([codec.Delta(), codec.Zigzag(), codec.Bzip2()])  # noqa
    pipelines.append([codec.DoubleDelta(), codec.Zigzag(), codec.Bzip2()])  # noqa
    pipelines.append([codec.DynamicDelta(), codec.Zigzag(), codec.Bzip2()])  # noqa
    pipelines.append([codec.Delta(), codec.Zigzag(), codec.ByteShuffle(), codec.Bzip2()])  # noqa
    pipelines.append([codec.DoubleDelta(), codec.Zigzag(), codec.ByteShuffle(), codec.Bzip2()])  # noqa
    pipelines.append([codec.DynamicDelta(), codec.Zigzag(), codec.ByteShuffle(), codec.Bzip2()])  # noqa

    csearch = codec.CodecSearch(pipelines=pipelines, loss='nbytes')

    codeclist = [codec.Quantize(), csearch]

    dfs = dfset.make_dfset(filetype='parquet', csvsdir=MOCK_IN_DIR)
    sizes_df_orig, sizes_df_comp = sq.encode_measure_decode(
            dfs, codeclist)


    # TODO construct a bunch of pipelines of preprocs and formats/compressors,
    # get the sizes for every (dfid, col) combo, dump them to a file, and then
    # make simple plots or something breaking down the results
    #   -tricky part is that we might want to try different combos of stuff
    #   on different cols
    #       -in particular, the colsum codec is weird
    #   -need a class that tries different pipelines on each col and uses the
    #   best one
    #       -actually, do we even need this? if we just want plots, can't we
    #       figure out after the fact what the best pipeline would have been?
    #           -no; that only tells you conditional averages for each var. we
    #           want the actual overall compression of each df when you try
    #           "for real" to get as much compression as possible
    #       -would be nice to have it run a real compressor on the results as
    #       its loss, instead of just some proxy

    # SELF: pick up by adding this class to codec.py
    #   -then construct some pipelines and run them
    #   -also, have dyndelta actually bitpack its masks


    _rm_mock_input_files()
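
To make the pipeline stages above concrete, here is a sketch of textbook zigzag coding and byte shuffling feeding bz2. The real codec.Zigzag, codec.ByteShuffle, and codec.Bzip2 classes may differ in their details; this only illustrates the transforms they are named after:

import bz2

import numpy as np


def zigzag(x):  # interleave signs: 0, -1, 1, -2, 2 -> 0, 1, 2, 3, 4
    return (x << 1) ^ (x >> 63)


def unzigzag(z):
    return (z >> 1) ^ -(z & 1)


def byteshuffle(a):  # group byte 0 of every value, then byte 1, etc.,
    return a.view(np.uint8).reshape(-1, a.itemsize).T.copy()  # so zero bytes cluster


deltas = np.array([0, 1, -1, 2, -2], dtype=np.int64)  # e.g., output of Delta
z = zigzag(deltas)                                    # [0, 2, 1, 4, 3]
payload = bz2.compress(byteshuffle(z).tobytes())
assert np.array_equal(unzigzag(z), deltas)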
Example #4
    def _test_dfs(self, filetype):
        dfs = dfset.make_dfset(filetype=filetype, csvsdir=MOCK_IN_DIR)

        assert sorted(dfs.ids) == ['df0', 'df1']
        assert sorted(dfs._cols_stored_for_dfid('df0')) == 'a b c'.split()
        assert sorted(dfs._cols_stored_for_dfid('df1')) == 'a b d'.split()

        df0_hat = dfs['df0']
        df1_hat = dfs['df1']

        # print("df0_hat: ", df0_hat)
        # print("df0_hat cols: ", df0_hat.columns)
        # print("df0_hat shape: ", df0_hat.shape)

        assert self.df0.shape == df0_hat.shape
        assert self.df1.shape == df1_hat.shape
        if filetype != 'csv':
            assert set(self.df0.dtypes) == set(df0_hat.dtypes)
            assert set(self.df1.dtypes) == set(df1_hat.dtypes)
        assert set(self.df0.columns) == set(df0_hat.columns)
        assert set(self.df1.columns) == set(df1_hat.columns)
        for col in self.df0:
            # exact equality; array_equal passing implies allclose would too
            assert np.array_equal(self.df0[col], df0_hat[col])
        for col in self.df1:
            assert np.array_equal(self.df1[col], df1_hat[col])
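
The dtype assertions are skipped for csv because a CSV round trip re-infers dtypes from text, while parquet stores them explicitly. A quick self-contained demonstration of the pandas behavior:

import io

import pandas as pd

df = pd.DataFrame({'a': [1, 2]}, dtype='int32')
buf = io.StringIO()
df.to_csv(buf, index=False)
buf.seek(0)
df_hat = pd.read_csv(buf)
print(df['a'].dtype, df_hat['a'].dtype)  # int32 int64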
Example #5
    def test_sanitize_cols(self):
        _rm_mock_input_files()  # we only want df2 here
        df = _debug_df2()
        dfpath = os.path.join(MOCK_IN_DIR, 'df2.csv')
        df.to_csv(dfpath, index=False)

        dfs = dfset.make_dfset(filetype='parquet', csvsdir=MOCK_IN_DIR)

        assert sorted(dfs.ids) == ['df2']
        assert set(dfs._cols_stored_for_dfid('df2')) == set(df.columns)
        df_hat = dfs['df2']
        assert set(df.columns) == set(df_hat.columns)
        assert df.shape == df_hat.shape
        assert set(df.dtypes) == set(df_hat.dtypes)

        for col in df:
            assert np.allclose(df[col], df_hat[col], equal_nan=True)

        os.remove(dfpath)
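
_debug_df2 isn't shown here, but the test name suggests its column names need cleaning before storage. As a purely hypothetical illustration (not the library's actual sanitizer): parquet requires string column names, so a sanitizer might stringify and de-duplicate them along these lines:

import pandas as pd


def sanitize_cols_sketch(df):
    seen, newcols = {}, []
    for col in df.columns:
        name = str(col)  # parquet requires string column names
        count = seen.get(name, 0)
        seen[name] = count + 1
        newcols.append(name if count == 0 else '%s_%d' % (name, count))
    out = df.copy()
    out.columns = newcols
    return out


df = pd.DataFrame([[1, 2, 3]], columns=[0, '0', 'x'])
print(sanitize_cols_sketch(df).columns.tolist())  # ['0', '0_1', 'x']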
Example #6
    def _test_codecs_for_filetype(self, filetype, codeclist):
        dfs = dfset.make_dfset(filetype=filetype, csvsdir=MOCK_IN_DIR)

        sizes_df_orig, sizes_df_comp = sq.encode_measure_decode(dfs, codeclist)
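
For context, here is a rough sketch of the contract sq.encode_measure_decode appears to have, inferred from how its return values are used in these tests. The per-column loop and the encode/decode method names on codec objects are assumptions:

import numpy as np


def encode_measure_decode_sketch(col, codecs):
    orig_nbytes = col.nbytes
    enc = col
    for c in codecs:            # forward pass through the pipeline
        enc = c.encode(enc)
    comp_nbytes = enc.nbytes
    dec = enc
    for c in reversed(codecs):  # undo the codecs in reverse order
        dec = c.decode(dec)
    assert np.array_equal(dec, col)  # the round trip must be lossless
    return orig_nbytes, comp_nbytes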