def test_hdf_reader(hdf_files, columns):
    """Compare ``cudf.read_hdf`` against ``pd.read_hdf``.

    The ``hdf_files`` fixture value is unpacked into a DataFrame file, a
    mapping of per-column Series files, the HDF format name and the row
    count. The DataFrame file is read with both libraries (honouring
    ``columns``) and compared, then every Series file is read with both
    libraries and compared.
    """
    frame_path, series_paths, hdf_format, row_count = hdf_files

    # Combinations the readers cannot handle are skipped, not failed.
    selecting_columns = columns is not None
    if hdf_format == 'fixed' and selecting_columns:
        pytest.skip("Can't use columns with format 'fixed'")
    if hdf_format == 'table' and row_count == 0:
        pytest.skip("Can't read 0 row table with format 'table'")

    pandas_frame = pd.read_hdf(frame_path, columns=columns)
    cudf_frame = cudf.read_hdf(frame_path, columns=columns)
    assert_eq(pandas_frame, cudf_frame, check_categorical=False)

    for series_path in series_paths.values():
        pandas_series = pd.read_hdf(series_path)
        cudf_series = cudf.read_hdf(series_path)
        assert_eq(pandas_series, cudf_series)
def load_cache(self, filename=None):
    """
    Defines the behavior of how to load the cache file from the `filename`.
    Node can override this method. Default implementation assumes cudf
    dataframes.

    Arguments
    -------
    filename: str
        filename of the cache file. Leave as none to use default, which is
        ``<cache_dir>/<uid>.hdf5`` where ``cache_dir`` is taken from the
        ``GQUANT_CACHE_DIR`` environment variable, falling back to
        ``self.cache_dir``.

    Returns
    -------
    dict or object
        When the node uses ports: a dict mapping each output port name to
        the data read from the key ``<uid>/<port>``. Otherwise the result
        of ``cudf.read_hdf(filename, key=self.uid)``.

    Raises
    -------
    Exception
        If an output port's key is not present in the hdf file.

    Warns
    -------
    RuntimeWarning
        If an output port's declared type does not include
        ``cudf.DataFrame``; the load is still attempted.
    """
    cache_dir = os.getenv('GQUANT_CACHE_DIR', self.cache_dir)
    if filename is None:
        filename = cache_dir + '/' + self.uid + '.hdf5'

    if not self._using_ports():
        return cudf.read_hdf(filename, key=self.uid)

    output_df = {}
    with pd.HDFStore(filename, mode='r') as hf:
        out_ports = self._get_output_ports(full_port_spec=True)
        for oport, pspec in out_ports.items():
            # A port type may be a single type or a list of types;
            # normalize to a list for the membership check below.
            ptype = pspec.get(PortsSpecSchema.port_type)
            ptype = [ptype] if not isinstance(ptype, list) else ptype

            key = '{}/{}'.format(self.uid, oport)
            # check hdf store for the key
            if key not in hf:
                raise Exception(
                    'The task "{}" port "{}" key "{}" not found in '
                    'the hdf file "{}". Cannot load from cache.'.
                    format(self.uid, oport, key, filename))

            if cudf.DataFrame not in ptype:
                # warnings.warn takes (message, category). With the two
                # swapped it raises TypeError instead of warning.
                warnings.warn(
                    'Task "{}" port "{}" port type is not set to '
                    'cudf.DataFrame. Attempting to load port data '
                    'with cudf.read_hdf.'.format(self.uid, oport),
                    RuntimeWarning)

            output_df[oport] = cudf.read_hdf(hf, key)

    return output_df
def load_cache(self, filename):
    """
    Load this node's cached output from the hdf file `filename`.
    Node can override this method.

    Arguments
    -------
    filename: str
        filename of the cache file

    Returns
    -------
    The object produced by ``cudf.read_hdf`` for the key ``self.uid``.
    """
    # The cache entry is stored under the node's uid.
    return cudf.read_hdf(filename, key=self.uid)