Beispiel #1
0
def read_arrow_table(bucket: str,
                     key: str,
                     only_columns: Optional[List[str]] = None) -> pa.Table:
    """
    Download a Parquet file from minio and load it as an Apache Arrow Table.

    `only_columns` is handed to pyarrow as the `columns` argument; the
    default, None, is passed through unchanged.

    The whole table is held in RAM. TODO stream it to an mmapped file.

    The returned table has its schema metadata removed.
    """
    with minio.temporarily_download(bucket, key) as parquet_path:
        raw_table = pyarrow.parquet.read_table(
            parquet_path,
            columns=only_columns,
            use_threads=False,
        )

        # Strip schema metadata. Files dumped by fastparquet make a later
        # .to_pandas() call fail inside pyarrow's pandas_compat
        # (table_to_blockmanager -> _add_any_metadata ->
        # Schema.get_field_index) with:
        #
        #     TypeError: expected bytes, dict found
        #
        # [2019-08-22] fastparquet-dumped files will be around for a long time.
        #
        # Nothing we need is lost: Workbench has its own restrictive schema
        # and doesn't support everything Pandas supports, so the extra
        # Pandas-specific data is of no use to us.
        return raw_table.replace_schema_metadata(None)  # FIXME unit-test this!
Beispiel #2
0
def _load_uploaded_file(bucket, key, mime_type) -> ProcessResult:
    """
    BLOCKING: download from S3 and load with parse_bytesio().

    On success, the parsed result is truncated in place if it is too big and
    then returned. If a minio.error.ClientError is raised while downloading
    or parsing, a ProcessResult carrying the error message is returned
    instead.
    """
    try:
        # Download, don't stream: it's faster because it's concurrent
        #
        # NOTE(review): this relies on the object yielded by
        # temporarily_download() exposing the local file's path as `.name`
        # -- confirm against the current minio helper.
        with minio.temporarily_download(bucket, key) as tf, \
                open(tf.name, 'rb') as f:
            result = parse_bytesio(f, mime_type, None)
    except minio.error.ClientError as err:
        return ProcessResult(error=str(err))
    else:
        result.truncate_in_place_if_too_big()
        return result
Beispiel #3
0
def _load_external_module_uncached(module_id_name: str,
                                   version_sha1: str) -> ModuleType:
    """
    Load a Python Module given a name and version.

    The module's files are looked up in minio.ExternalModulesBucket under the
    "<module_id_name>/<version_sha1>/" prefix. The first key whose path below
    that prefix satisfies _is_basename_python_code() is downloaded and handed
    to module_loader.load_python_module().

    NOTE(review): when no key matches, the bare next() lets StopIteration
    escape to the caller -- confirm that callers expect this.
    """
    bucket = minio.ExternalModulesBucket
    prefix = '%s/%s/' % (module_id_name, version_sha1)
    prefix_len = len(prefix)

    candidate_keys = minio.list_file_keys(bucket, prefix)
    python_code_key = next(
        candidate
        for candidate in candidate_keys
        if _is_basename_python_code(candidate[prefix_len:])
    )

    # Now we can load the code into memory.
    name = '%s.%s' % (module_id_name, version_sha1)
    with minio.temporarily_download(bucket, python_code_key) as path:
        logger.info(f'Loading {name} from {path}')
        return module_loader.load_python_module(name, path)
Beispiel #4
0
 def test_file_not_found(self):
     """
     Entering temporarily_download() is expected to raise FileNotFoundError.

     This test uploads nothing beforehand, so the `with` body should never
     run.
     """
     with self.assertRaises(FileNotFoundError):
         with minio.temporarily_download(Bucket, Key):
             # Only reached if the download unexpectedly succeeds. Raise a
             # real exception class: `NotImplemented` is a constant, and
             # raising it would produce a confusing TypeError instead.
             raise NotImplementedError
Beispiel #5
0
 def test_allows_reading_file(self):
     """
     The path yielded by temporarily_download() can be read as bytes.

     _put() is called first with the payload the download is expected to
     contain.
     """
     _put(b'1234')
     with minio.temporarily_download(Bucket, Key) as path:
         downloaded = path.read_bytes()
         self.assertEqual(downloaded, b'1234')