def read_arrow_table(
    bucket: str, key: str, only_columns: Optional[List[str]] = None
) -> pa.Table:
    """
    Download a Parquet file from minio and return it as an Apache Arrow Table.

    `only_columns` is forwarded to pyarrow as its `columns` argument. The
    returned table has its schema metadata removed.

    The table is stored entirely in RAM. TODO stream it to an mmapped file.
    """
    with minio.temporarily_download(bucket, key) as parquet_path:
        loaded = pyarrow.parquet.read_table(
            parquet_path, columns=only_columns, use_threads=False
        )

        # Strip schema metadata so .to_pandas() does not choke on files that
        # were dumped by fastparquet. With the metadata left in place we get:
        #
        #   File "pyarrow/array.pxi", line 441, in pyarrow.lib._PandasConvertible.to_pandas
        #   File "pyarrow/table.pxi", line 1367, in pyarrow.lib.Table._to_pandas
        #   File ".../pyarrow/pandas_compat.py", line 644, in table_to_blockmanager
        #     table = _add_any_metadata(table, pandas_metadata)
        #   File ".../pyarrow/pandas_compat.py", line 967, in _add_any_metadata
        #     idx = schema.get_field_index(raw_name)
        #   File "pyarrow/types.pxi", line 902, in pyarrow.lib.Schema.get_field_index
        #   File "stringsource", line 15, in string.from_py.__pyx_convert_string_from_py_std__in_string
        #   TypeError: expected bytes, dict found
        #
        # [2019-08-22] fastparquet-dumped files will be around for a long time.
        #
        # Dropping the metadata costs us nothing: Workbench has its own
        # restrictive schema, and we don't need extra Pandas-specific data
        # because we don't support everything Pandas supports.
        return loaded.replace_schema_metadata(None)  # FIXME unit-test this!
def _load_uploaded_file(bucket, key, mime_type) -> ProcessResult:
    """
    BLOCKING: download from S3 and load with parse_bytesio().

    The object is downloaded to a temporary local file, which is opened in
    binary mode and handed to `parse_bytesio(f, mime_type, None)`.

    Returns the parsed ProcessResult after calling
    `truncate_in_place_if_too_big()` on it. If `minio.error.ClientError` is
    raised while downloading or parsing, returns
    `ProcessResult(error=str(err))` instead.
    """
    import os  # local import: only used to recognise path-like values

    try:
        # Download, don't stream: it's faster because it's concurrent
        with minio.temporarily_download(bucket, key) as tf:
            # Other callers in this file use the yielded value directly as a
            # filesystem path (e.g. `path.read_bytes()`). On a pathlib.Path,
            # `.name` is only the final path component, so `open(tf.name)`
            # would look for the file relative to the working directory.
            # Use the full path when we are given something path-like, and
            # keep honouring `.name` for file objects that expose their path
            # that way.
            if isinstance(tf, (str, bytes, os.PathLike)):
                local_path = tf
            else:
                local_path = tf.name

            with open(local_path, 'rb') as f:
                result = parse_bytesio(f, mime_type, None)
    except minio.error.ClientError as err:
        return ProcessResult(error=str(err))

    result.truncate_in_place_if_too_big()
    return result
def _load_external_module_uncached(module_id_name: str, version_sha1: str) -> ModuleType:
    """
    Load a Python Module given a name and version.

    Lists the keys stored under `<module_id_name>/<version_sha1>/` in the
    external-modules bucket, picks the first one whose path relative to that
    prefix satisfies `_is_basename_python_code()`, downloads it to a
    temporary file and loads it via `module_loader.load_python_module()`.
    """
    bucket = minio.ExternalModulesBucket
    prefix = '%s/%s/' % (module_id_name, version_sha1)
    prefix_len = len(prefix)

    # NOTE(review): next() raises StopIteration when no key matches; confirm
    # that callers expect that if a module can be stored without Python code.
    python_code_key = next(
        candidate
        for candidate in minio.list_file_keys(bucket, prefix)
        if _is_basename_python_code(candidate[prefix_len:])
    )

    # Now we can load the code into memory. The module must be loaded while
    # the temporary download still exists, so return from inside the block.
    name = '%s.%s' % (module_id_name, version_sha1)
    with minio.temporarily_download(bucket, python_code_key) as path:
        logger.info(f'Loading {name} from {path}')
        loaded_module = module_loader.load_python_module(name, path)
        return loaded_module
def test_file_not_found(self):
    """temporarily_download() raises FileNotFoundError for a missing key."""
    with self.assertRaises(FileNotFoundError):
        with minio.temporarily_download(Bucket, Key):
            # Entering the context should already have raised. If it did
            # not, fall through so assertRaises reports "FileNotFoundError
            # not raised". (The previous `raise NotImplemented` raised a
            # non-exception value, which surfaces as a confusing TypeError.)
            pass
def test_allows_reading_file(self):
    """The path yielded by temporarily_download() holds the stored bytes."""
    expected = b'1234'
    _put(expected)
    with minio.temporarily_download(Bucket, Key) as path:
        # Read while the context is still open.
        downloaded = path.read_bytes()
        self.assertEqual(downloaded, expected)