Example #1
0
def test_hdf_reader(hdf_files, columns):
    """Check that ``cudf.read_hdf`` agrees with ``pd.read_hdf``.

    ``hdf_files`` unpacks into four items: the dataframe HDF file, a mapping
    whose values are passed to ``read_hdf`` one by one (presumably one
    single-series file per column -- confirm against the fixture), the HDF
    format name and the number of rows written. ``columns`` is forwarded to
    both readers as the column selection.

    The test is skipped for the two combinations that cannot be read:
    a column selection with the 'fixed' format, and an empty 'table' file.
    """
    df_path, series_paths, hdf_format, row_count = hdf_files

    # Bail out early on combinations the readers cannot handle.
    if hdf_format == 'fixed' and columns is not None:
        pytest.skip("Can't use columns with format 'fixed'")
    if hdf_format == 'table' and row_count == 0:
        pytest.skip("Can't read 0 row table with format 'table'")

    # Whole-dataframe comparison; categorical dtypes are not checked.
    pandas_df = pd.read_hdf(df_path, columns=columns)
    cudf_df = cudf.read_hdf(df_path, columns=columns)
    assert_eq(pandas_df, cudf_df, check_categorical=False)

    # Per-entry comparison of every stored series.
    for name in series_paths.keys():
        series_file = series_paths[name]
        pandas_series = pd.read_hdf(series_file)
        cudf_series = cudf.read_hdf(series_file)
        assert_eq(pandas_series, cudf_series)
Example #2
0
    def load_cache(self, filename=None):
        """
        Defines the behavior of how to load the cache file from the `filename`.
        Node can override this method. Default implementation assumes cudf
        dataframes.

        Arguments
        -------
        filename: str
            filename of the cache file. Leave as none to use default, which is
            `<cache dir>/<uid>.hdf5`. The cache directory is read from the
            `GQUANT_CACHE_DIR` environment variable and falls back to
            `self.cache_dir` when that variable is not set.

        Returns
        -------
        dict or object
            When the node uses ports: a dict mapping every output port name to
            the object read from the hdf key `<uid>/<port>`. Otherwise: the
            object read from the hdf key `<uid>`.

        Raises
        -------
        Exception
            If the node uses ports and the key of one of its output ports is
            not present in the hdf file.

        """
        cache_dir = os.getenv('GQUANT_CACHE_DIR', self.cache_dir)
        if filename is None:
            filename = cache_dir + '/' + self.uid + '.hdf5'

        if self._using_ports():
            output_df = {}
            with pd.HDFStore(filename, mode='r') as hf:
                for oport, pspec in \
                        self._get_output_ports(full_port_spec=True).items():
                    # A port type may be declared as a single type or as a
                    # list of types; normalize to a list for the check below.
                    ptype = pspec.get(PortsSpecSchema.port_type)
                    ptype = [ptype] if not isinstance(ptype, list) else ptype
                    key = '{}/{}'.format(self.uid, oport)
                    # check hdf store for the key
                    if key not in hf:
                        raise Exception(
                            'The task "{}" port "{}" key "{}" not found in '
                            'the hdf file "{}". Cannot load from cache.'.
                            format(self.uid, oport, key, filename))
                    if cudf.DataFrame not in ptype:
                        # warnings.warn takes the message first and the
                        # category second; the reverse order raises TypeError
                        # instead of emitting the warning.
                        warnings.warn(
                            'Task "{}" port "{}" port type is not set to '
                            'cudf.DataFrame. Attempting to load port data '
                            'with cudf.read_hdf.'.format(self.uid, oport),
                            RuntimeWarning)
                    output_df[oport] = cudf.read_hdf(hf, key)
        else:
            output_df = cudf.read_hdf(filename, key=self.uid)

        return output_df
Example #3
0
    def load_cache(self, filename):
        """
        Load this node's cached output from the hdf file `filename`.
        Node can override this method to change how the cache is read.

        Arguments
        -------
        filename: str
            filename of the cache file

        Returns
        -------
        The object `cudf.read_hdf` reads from `filename` under the key
        `self.uid`.

        """
        return cudf.read_hdf(filename, key=self.uid)