def to_parquet(self, path, compression=None, profile_name=None):
    """Write Woodwork table to disk in the parquet format, location specified by `path`.

    Path could be a local path or an S3 path. If writing to S3 a tar archive
    of files will be written.

    Note:
        As the engine `fastparquet` cannot handle nullable pandas dtypes,
        `pyarrow` will be used for serialization to parquet.

    Args:
        path (str): Location on disk to write to (will be created as a directory).
        compression (str): Name of the compression to use.
            Possible values are: {'snappy', 'gzip', 'brotli', None}.
        profile_name (str): Name of AWS profile to use, False to use an
            anonymous profile, or None.
    """
    if self._schema is None:
        _raise_init_error()
    import_error_message = (
        "The pyarrow library is required to serialize to parquet.\n"
        "Install via pip:\n"
        "    pip install pyarrow\n"
        "Install via conda:\n"
        "    conda install pyarrow -c conda-forge"
    )
    import_or_raise('pyarrow', import_error_message)
    serialize.write_woodwork_table(
        self._dataframe,
        path,
        format='parquet',
        engine='pyarrow',
        compression=compression,
        profile_name=profile_name,
    )
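# Minimal usage sketch for the accessor method above. Assumes pandas and
# woodwork are installed and the method is exposed through the `df.ww`
# accessor; the DataFrame contents and output path are made up.
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the `ww` DataFrame accessor

df = pd.DataFrame({"id": [0, 1, 2], "value": [1.5, 2.5, 3.5]})
df.ww.init(index="id")  # initialize Woodwork typing information
df.ww.to_parquet("./my_table", compression="snappy")  # writes a directory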
def serialize(self, dataframe, profile_name, **kwargs):
    import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE)
    # Serialization to orc relies on pyarrow.Table.from_pandas,
    # which doesn't work with Dask
    if _is_dask_dataframe(dataframe):
        msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
        raise ValueError(msg)
    self.kwargs["engine"] = "pyarrow"
    return super().serialize(dataframe, profile_name, **kwargs)
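# Background sketch for the Dask restriction above: ORC output goes through
# pyarrow.Table.from_pandas, which requires an in-memory pandas DataFrame.
# Assumes a pyarrow build with ORC support; the output path is a placeholder.
import pandas as pd
import pyarrow as pa
import pyarrow.orc as orc

table = pa.Table.from_pandas(pd.DataFrame({"x": [1, 2, 3]}))
orc.write_table(table, "/tmp/example.orc")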
def serialize(self, dataframe, profile_name, **kwargs):
    import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE)
    if self.filename is not None and _is_dask_dataframe(dataframe):
        raise ValueError(
            "Writing a Dask dataframe to parquet with a filename specified is not supported"
        )
    if self.filename is not None and _is_spark_dataframe(dataframe):
        raise ValueError(
            "Writing a Spark dataframe to parquet with a filename specified is not supported"
        )
    self.kwargs["engine"] = "pyarrow"
    return super().serialize(dataframe, profile_name, **kwargs)
def get_transport_params(profile_name):
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    UNSIGNED = import_or_raise("botocore", BOTOCORE_ERR_MSG).UNSIGNED
    Config = import_or_raise("botocore.config", BOTOCORE_ERR_MSG).Config

    if isinstance(profile_name, str):
        # Use the named AWS profile for authenticated access
        session = boto3.Session(profile_name=profile_name)
        transport_params = {"client": session.client("s3")}
    elif profile_name is False or boto3.Session().get_credentials() is None:
        # Fall back to anonymous (unsigned) access
        session = boto3.Session()
        client = session.client("s3", config=Config(signature_version=UNSIGNED))
        transport_params = {"client": client}
    else:
        transport_params = None
    return transport_params
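# Hedged usage sketch: feed the transport params above into smart_open for an
# anonymous S3 read. The bucket and key are placeholders.
from smart_open import open as s3_open

params = get_transport_params(False)  # False -> anonymous/unsigned profile
with s3_open("s3://example-bucket/data.tar", "rb", transport_params=params) as fin:
    payload = fin.read()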
def get_transport_params(profile_name):
    # Variant of the helper above using the older smart_open transport_params
    # format (a boto3 session / resource kwargs instead of an explicit S3 client).
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    UNSIGNED = import_or_raise("botocore", BOTOCORE_ERR_MSG).UNSIGNED
    Config = import_or_raise("botocore.config", BOTOCORE_ERR_MSG).Config

    if isinstance(profile_name, str):
        transport_params = {
            'session': boto3.Session(profile_name=profile_name)
        }
    elif profile_name is False or boto3.Session().get_credentials() is None:
        transport_params = {
            'resource_kwargs': {'config': Config(signature_version=UNSIGNED)}
        }
    else:
        transport_params = None
    return transport_params
def use_smartopen(file_path, path, transport_params=None, read=True):
    open = import_or_raise("smart_open", SMART_OPEN_ERR_MSG).open
    if read:
        # Download: copy the remote file at `path` to the local `file_path`
        with open(path, "rb", transport_params=transport_params) as fin:
            with open(file_path, "wb") as fout:
                shutil.copyfileobj(fin, fout)
    else:
        # Upload: copy the local `file_path` to the remote `path`
        with open(file_path, "rb") as fin:
            with open(path, "wb", transport_params=transport_params) as fout:
                shutil.copyfileobj(fin, fout)
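# Hedged usage sketch for the helper above: download an S3 object to a local
# file using the default credential chain. URI and local path are placeholders.
use_smartopen(
    "/tmp/woodwork_table.tar",
    "s3://example-bucket/woodwork_table.tar",
    transport_params=get_transport_params(None),
    read=True,
)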
def _get_library(self):
    table_type = self.typing_info["loading_info"]["table_type"]
    if table_type == "dask":
        DASK_ERR_MSG = (
            "Cannot load Dask DataFrame - unable to import Dask.\n\n"
            "Please install with pip or conda:\n\n"
            'python -m pip install "woodwork[dask]"\n\n'
            "conda install dask"
        )
        lib = import_or_raise("dask.dataframe", DASK_ERR_MSG)
    elif table_type == "spark":
        SPARK_ERR_MSG = (
            "Cannot load Spark DataFrame - unable to import Spark.\n\n"
            "Please install with pip or conda:\n\n"
            'python -m pip install "woodwork[spark]"\n\n'
            "conda install spark\n\n"
            "conda install pyspark"
        )
        lib = import_or_raise("pyspark.pandas", SPARK_ERR_MSG)
        if "compression" in self.kwargs.keys():
            self.kwargs["compression"] = str(self.kwargs["compression"])
    else:
        lib = pd
    return lib
def test_import_or_raise():
    assert import_or_raise("pandas", "Module pandas could not be found") == pd

    error = "Module nonexistent could not be found."
    with pytest.raises(ImportError, match=error):
        import_or_raise("nonexistent", error)
def _typing_information_to_woodwork_table(table_typing_info, validate, **kwargs):
    """Deserialize Woodwork table from table description.

    Args:
        table_typing_info (dict): Woodwork typing information. Likely generated
            using :meth:`.serialize.typing_info_to_dict`
        validate (bool): Whether parameter and data validation should occur
            during table initialization
        kwargs (keywords): Additional keyword arguments to pass as keyword
            arguments to the underlying deserialization method.

    Returns:
        DataFrame: DataFrame with Woodwork typing information initialized.
    """
    _check_schema_version(table_typing_info['schema_version'])
    path = table_typing_info['path']
    loading_info = table_typing_info['loading_info']

    file = os.path.join(path, loading_info['location'])

    load_format = loading_info['type']
    assert load_format in FORMATS

    kwargs = loading_info.get('params', {})
    table_type = loading_info.get('table_type', 'pandas')

    logical_types = {}
    semantic_tags = {}
    column_descriptions = {}
    column_metadata = {}
    use_standard_tags = {}
    category_dtypes = {}
    for col in table_typing_info['column_typing_info']:
        col_name = col['name']

        ltype_metadata = col['logical_type']
        ltype = ww.type_system.str_to_logical_type(
            ltype_metadata['type'], params=ltype_metadata['parameters'])

        tags = col['semantic_tags']
        if 'index' in tags:
            tags.remove('index')
        elif 'time_index' in tags:
            tags.remove('time_index')

        logical_types[col_name] = ltype
        semantic_tags[col_name] = tags
        column_descriptions[col_name] = col['description']
        column_metadata[col_name] = col['metadata']
        use_standard_tags[col_name] = col['use_standard_tags']

        if col['physical_type']['type'] == 'category':
            # Make sure categories are recreated properly
            cat_values = col['physical_type']['cat_values']
            if table_type == 'pandas' and pd.__version__ > '1.1.5':
                cat_object = pd.CategoricalDtype(
                    pd.Index(cat_values, dtype='object'))
            else:
                cat_object = pd.CategoricalDtype(pd.Series(cat_values))
            category_dtypes[col_name] = cat_object

    compression = kwargs['compression']
    if table_type == 'dask':
        DASK_ERR_MSG = (
            'Cannot load Dask DataFrame - unable to import Dask.\n\n'
            'Please install with pip or conda:\n\n'
            'python -m pip install "woodwork[dask]"\n\n'
            'conda install dask')
        lib = import_or_raise('dask.dataframe', DASK_ERR_MSG)
    elif table_type == 'koalas':
        KOALAS_ERR_MSG = (
            'Cannot load Koalas DataFrame - unable to import Koalas.\n\n'
            'Please install with pip or conda:\n\n'
            'python -m pip install "woodwork[koalas]"\n\n'
            'conda install koalas\n\n'
            'conda install pyspark')
        lib = import_or_raise('databricks.koalas', KOALAS_ERR_MSG)
        compression = str(compression)
    else:
        lib = pd

    if load_format == 'csv':
        dataframe = lib.read_csv(
            file,
            engine=kwargs['engine'],
            compression=compression,
            encoding=kwargs['encoding'],
            dtype=category_dtypes,
        )
    elif load_format == 'pickle':
        dataframe = pd.read_pickle(file, **kwargs)
    elif load_format == 'parquet':
        dataframe = lib.read_parquet(file, engine=kwargs['engine'])

    dataframe.ww.init(name=table_typing_info.get('name'),
                      index=table_typing_info.get('index'),
                      time_index=table_typing_info.get('time_index'),
                      logical_types=logical_types,
                      semantic_tags=semantic_tags,
                      use_standard_tags=use_standard_tags,
                      table_metadata=table_typing_info.get('table_metadata'),
                      column_metadata=column_metadata,
                      column_descriptions=column_descriptions,
                      validate=validate)

    return dataframe
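# Worked sketch of the category-dtype reconstruction step above: rebuild a
# pandas CategoricalDtype from serialized category values so read_csv restores
# the original categories. The values here are made up.
import pandas as pd

cat_values = ['a', 'b', 'c']
cat_object = pd.CategoricalDtype(pd.Index(cat_values, dtype='object'))
s = pd.Series(['a', 'c', 'a'], dtype=cat_object)
assert list(s.cat.categories) == cat_values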
def description_to_datatable(table_description, **kwargs):
    '''Deserialize DataTable from table description.

    Args:
        table_description (dict): Description of a :class:`.DataTable`.
            Likely generated using :meth:`.serialize.datatable_to_description`
        kwargs (keywords): Additional keyword arguments to pass as keyword
            arguments to the underlying deserialization method.

    Returns:
        datatable (woodwork.DataTable): Instance of :class:`.DataTable`.
    '''
    _check_schema_version(table_description['schema_version'])
    path = table_description['path']
    loading_info = table_description['loading_info']

    file = os.path.join(path, loading_info['location'])

    load_format = loading_info['type']
    assert load_format in FORMATS

    kwargs = loading_info.get('params', {})
    table_type = loading_info.get('table_type', 'pandas')

    compression = kwargs['compression']
    if table_type == 'dask':
        DASK_ERR_MSG = (
            'Cannot load Dask DataTable - unable to import Dask.\n\n'
            'Please install with pip or conda:\n\n'
            'python -m pip install "woodwork[dask]"\n\n'
            'conda install dask'
        )
        lib = import_or_raise('dask.dataframe', DASK_ERR_MSG)
    elif table_type == 'koalas':
        KOALAS_ERR_MSG = (
            'Cannot load Koalas DataTable - unable to import Koalas.\n\n'
            'Please install with pip or conda:\n\n'
            'python -m pip install "woodwork[koalas]"\n\n'
            'conda install koalas\n\n'
            'conda install pyspark'
        )
        lib = import_or_raise('databricks.koalas', KOALAS_ERR_MSG)
        compression = str(compression)
    else:
        lib = pd

    if load_format == 'csv':
        dataframe = lib.read_csv(
            file,
            engine=kwargs['engine'],
            compression=compression,
            encoding=kwargs['encoding'],
        )
    elif load_format == 'pickle':
        dataframe = pd.read_pickle(file, **kwargs)
    elif load_format == 'parquet':
        dataframe = lib.read_parquet(file, engine=kwargs['engine'])

    logical_types = {}
    semantic_tags = {}
    column_descriptions = {}
    column_metadata = {}
    for col in table_description['column_metadata']:
        col_name = col['name']

        ltype_metadata = col['logical_type']
        ltype = ww.type_system.str_to_logical_type(
            ltype_metadata['type'], params=ltype_metadata['parameters'])

        tags = col['semantic_tags']
        if 'index' in tags:
            tags.remove('index')
        elif 'time_index' in tags:
            tags.remove('time_index')

        logical_types[col_name] = ltype
        semantic_tags[col_name] = tags
        column_descriptions[col_name] = col['description']
        column_metadata[col_name] = col['metadata']

    return DataTable(dataframe,
                     name=table_description.get('name'),
                     index=table_description.get('index'),
                     time_index=table_description.get('time_index'),
                     logical_types=logical_types,
                     semantic_tags=semantic_tags,
                     use_standard_tags=False,
                     table_metadata=table_description.get('table_metadata'),
                     column_metadata=column_metadata,
                     column_descriptions=column_descriptions)
def serialize(self, dataframe, profile_name, **kwargs):
    import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE)
    return super().serialize(dataframe, profile_name, **kwargs)