def read_dataframe(self, *args, **kwargs):
    """
    Read Parquet file as a dataframe (costing network requests).

    Pass *args and **kwargs to `fastparquet.ParquetFile.to_pandas()`.

    TODO make this raise OSError/FastparquetCouldNotHandleFile.
    (Currently we return an empty dataframe on error.)
    """
    try:
        # NOTE(review): `args` (a tuple) and `kwargs` (a dict) are passed
        # as two positional arguments, not unpacked with */**. That is
        # only correct if parquet.read's signature is e.g.
        # read(bucket, key, to_pandas_args, to_pandas_kwargs) — verify
        # against the parquet module; otherwise this silently drops the
        # caller's arguments.
        return parquet.read(minio.CachedRenderResultsBucket,
                            self.parquet_key, args, kwargs)
    except OSError:
        # Two possibilities:
        #
        # 1. The file is missing.
        # 2. The file is empty. (We used to write empty files in
        #    assign_wf_module.)
        #
        # Either way, our cached DataFrame is "empty", and we represent
        # that as an empty DataFrame. (An earlier comment said "None",
        # but the code below has always returned pd.DataFrame().)
        return pd.DataFrame()
    except parquet.FastparquetCouldNotHandleFile:
        # Treat bugs as "empty file"
        return pd.DataFrame()
def test_empty_categorical_has_object_dtype(self):
    # A zero-row text Categorical: its categories index must be
    # object-dtyped, and it must survive a Parquet round trip.
    df = pd.DataFrame({'A': []}, dtype='str').astype('category')
    categories = df['A'].cat.categories
    assert categories.dtype == object
    try:
        parquet.write(bucket, key, df)
        roundtripped = parquet.read(bucket, key)
    finally:
        minio.remove(bucket, key)
    assert_frame_equal(roundtripped, df)
def get_table(self):
    """
    Return this object's cached table, or an empty DataFrame when the
    table is missing or unreadable.
    """
    if not self.bucket or not self.key:
        # Old (obsolete) objects have no bucket/key, usually because
        # empty tables weren't being written.
        return pd.DataFrame()
    try:
        return parquet.read(self.bucket, self.key)
    except FileNotFoundError:
        # Consistency with the sibling get_table() implementations:
        # a pre-delete that never got committed, or some other
        # long-existing DB inconsistency, means "empty table" — not a
        # crash.
        return pd.DataFrame()
    except parquet.FastparquetCouldNotHandleFile:
        return pd.DataFrame()  # empty table
def test_na_only_categorical_has_object_dtype(self):
    # A Categorical holding only NA. (In Workbench, all Categoricals
    # are text, so the categories index must be object-dtyped.)
    df = pd.DataFrame({'A': [np.nan]}, dtype=str).astype('category')
    categories = df['A'].cat.categories
    assert categories.dtype == object
    try:
        parquet.write(bucket, key, df)
        roundtripped = parquet.read(bucket, key)
    finally:
        minio.remove(bucket, key)
    assert_frame_equal(roundtripped, df)
def get_table(self):
    """
    Return the stored table, or an empty DataFrame when it is absent
    or unreadable.
    """
    if not self.size:
        return pd.DataFrame()  # empty table
    try:
        return parquet.read(self.file.name)
    except FileNotFoundError:
        # Spotted on production for a duplicated workflow dated
        # 2018-08-01. [adamhooper, 2018-09-20] I can think of no harm in
        # returning an empty dataframe here.
        return pd.DataFrame()  # empty table
    except parquet.FastparquetCouldNotHandleFile:
        # Consistency with the sibling get_table() implementations that
        # read via parquet.read(): treat fastparquet bugs as "empty
        # table" rather than crashing.
        return pd.DataFrame()  # empty table
def get_table(self):
    """Return the cached table, or an empty DataFrame when unavailable."""
    # Old (obsolete) objects have no bucket/key, usually because empty
    # tables weren't being written.
    if not (self.bucket and self.key):
        return pd.DataFrame()
    try:
        return parquet.read(self.bucket, self.key)
    except FileNotFoundError:
        # There was a pre-delete that never got committed; or maybe
        # there's some other, long-existing DB inconsistency.
        return pd.DataFrame()
    except parquet.FastparquetCouldNotHandleFile:
        # Treat fastparquet bugs as an empty table.
        return pd.DataFrame()
def get_table(self):
    """Return the stored table, or an empty DataFrame when unavailable."""
    if not self.file:
        # Before 2018-11-09, we did not write empty data frames.
        #
        # We changed this because #160865813 shows that zero-row Parquet
        # files still contain important data: column information.
        return pd.DataFrame()
    try:
        return parquet.read(self.file.name)
    except FileNotFoundError:
        # Spotted on production for a duplicated workflow dated
        # 2018-08-01. [adamhooper, 2018-09-20] I can think of no harm in
        # returning an empty dataframe here.
        return pd.DataFrame()
    except parquet.FastparquetCouldNotHandleFile:
        # Treat fastparquet bugs as an empty table, too.
        return pd.DataFrame()
def test_read_issue_375_snappy(self):
    # fastparquet issue 375, snappy-compressed variant: reading this
    # file must raise our wrapper exception rather than crash.
    with self._file_on_s3('fastparquet-issue-375-snappy.par'), \
            self.assertRaises(parquet.FastparquetIssue375):
        parquet.read(bucket, key)
def test_read_issue_375_uncompressed(self):
    # fastparquet issue 375, uncompressed variant: reading this file
    # must raise our wrapper exception rather than crash.
    with self._file_on_s3('fastparquet-issue-375.par'), \
            self.assertRaises(parquet.FastparquetIssue375):
        parquet.read(bucket, key)
def test_read_issue_361(self):
    # https://github.com/dask/fastparquet/issues/361 — a file with
    # zero columns must still round-trip its row count.
    with self._file_on_s3('fastparquet-issue-361.par'):
        result = parquet.read(bucket, key)
        self.assertEqual(list(result.columns), [])
        self.assertEqual(len(result), 3)
def test_read_issue_375_snappy(self):
    # fastparquet issue 375, snappy-compressed variant: expect the
    # wrapper exception.
    path = _path('fastparquet-issue-375-snappy.par')
    with self.assertRaises(parquet.FastparquetIssue375):
        parquet.read(path)
def test_read_issue_375_uncompressed(self):
    # fastparquet issue 375, uncompressed variant: expect the wrapper
    # exception.
    path = _path('fastparquet-issue-375.par')
    with self.assertRaises(parquet.FastparquetIssue375):
        parquet.read(path)
def test_read_issue_361(self):
    # fastparquet issue 361: this fixture cannot be read; expect the
    # wrapper exception.
    path = _path('fastparquet-issue-361.par')
    with self.assertRaises(parquet.FastparquetIssue361):
        parquet.read(path)
def get_table(self):
    """
    Return the stored table, or an empty DataFrame when size is zero or
    the backing file is missing.
    """
    if not self.size:
        return pd.DataFrame()  # empty table
    try:
        return parquet.read(self.file.name)
    except FileNotFoundError:
        # Consistency with the sibling get_table() implementations: a
        # missing backing file (seen on production for a duplicated
        # workflow, 2018-08-01) means "empty table", not a crash.
        return pd.DataFrame()  # empty table