def parquet_file(self):
    if not hasattr(self, '_parquet_file'):
        try:
            self._parquet_file = parquet.read_header(self.parquet_path)
        except OSError:
            # Two possibilities:
            #
            # 1. The file is missing.
            # 2. The file is empty. (We used to write empty files in
            #    assign_wf_module.)
            #
            # Either way, our cached DataFrame is "empty", and we represent
            # that as None.
            self._parquet_file = None
        except IndexError:
            # TODO nix this when fastparquet resolves
            # https://github.com/dask/fastparquet/issues/361
            #
            # The file has a zero-length column list, and fastparquet can't
            # handle that.
            #
            # Our cached DataFrame should be "empty". No columns means no
            # rows.
            self._parquet_file = None
    return self._parquet_file

def parquet_file(self):
    if not hasattr(self, '_parquet_file'):
        try:
            self._parquet_file = parquet.read_header(self.parquet_path)
        except OSError:
            # Two possibilities:
            #
            # 1. The file is missing.
            # 2. The file is empty. (We used to write empty files in
            #    assign_wf_module.)
            #
            # Either way, our cached DataFrame is "empty", and we represent
            # that as None.
            self._parquet_file = None
        except parquet.FastparquetCouldNotHandleFile:
            # Treat bugs as "empty file"
            self._parquet_file = None
    # TODO keep a handle on the file, to guarantee it doesn't disappear
    # from disk until after this CachedRenderResult is destroyed. Until
    # then, every read from self._parquet_file is a race.
    return self._parquet_file
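
The refactored version above leans on a `parquet` wrapper module that converts fastparquet's raw `IndexError` into a dedicated exception hierarchy, so callers no longer need to know which upstream bug they are working around. Here is a minimal sketch of what that module might look like; the class hierarchy (`FastparquetIssue361` subclassing `FastparquetCouldNotHandleFile`) is an inference from how the caller catches the broad class while the test below asserts the narrow one, and the `read_header` body is an assumption, not the repository's actual implementation.

import fastparquet


class FastparquetCouldNotHandleFile(Exception):
    """fastparquet cannot read this file; callers treat it as an empty table."""


class FastparquetIssue361(FastparquetCouldNotHandleFile):
    """The file has a zero-length column list, which crashes fastparquet.

    https://github.com/dask/fastparquet/issues/361
    """


def read_header(path):
    """Return a fastparquet.ParquetFile describing the file at `path`.

    Raise FastparquetIssue361 if fastparquet chokes on a zero-column file.
    """
    try:
        return fastparquet.ParquetFile(path)
    except IndexError:
        # TODO nix this when fastparquet resolves
        # https://github.com/dask/fastparquet/issues/361
        raise FastparquetIssue361

Centralizing the translation here means each new fastparquet bug becomes one extra `except` clause in a single module, while every caller keeps catching the same `FastparquetCouldNotHandleFile`.
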
def test_read_header_issue_361(self):
    # https://github.com/dask/fastparquet/issues/361
    with self._file_on_s3('fastparquet-issue-361.par') as (bucket, key):
        header = parquet.read_header(bucket, key)
        self.assertEqual(header.columns, [])
        self.assertEqual(header.count, 3)

def test_read_header_issue_361(self):
    with self.assertRaises(parquet.FastparquetIssue361):
        parquet.read_header(_path('fastparquet-issue-361.par'))
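
The `_path` helper in the final test is a fixture locator whose definition isn't shown. A plausible implementation, assuming fixtures live in a `test_data` directory beside the test module (both the directory name and the layout are hypothetical):

from pathlib import Path


def _path(filename: str) -> Path:
    # Hypothetical: resolve a fixture file relative to this test module.
    # The actual fixture directory depends on the repository layout.
    return Path(__file__).parent / 'test_data' / filename
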