Example 1
    def to_parquet(self, path, compression=None, profile_name=None):
        """Write Woodwork table to disk in the parquet format, location specified by `path`.

            Path could be a local path or an S3 path.

            If writing to S3, a tar archive of files will be written.

            Note:
                As the engine `fastparquet` cannot handle nullable pandas dtypes, `pyarrow` will be used
                for serialization to parquet.

            Args:
                path (str): location on disk to write to (will be created as a directory)
                compression (str) : Name of the compression to use. Possible values are: {'snappy', 'gzip', 'brotli', None}.
                profile_name (str) : Name of AWS profile to use, False to use an anonymous profile, or None.
        """
        if self._schema is None:
            _raise_init_error()
        import_error_message = (
            "The pyarrow library is required to serialize to parquet.\n"
            "Install via pip:\n"
            "    pip install pyarrow\n"
            "Install via conda:\n"
            "   conda install pyarrow -c conda-forge")
        import_or_raise('pyarrow', import_error_message)
        serialize.write_woodwork_table(self._dataframe,
                                       path,
                                       format='parquet',
                                       engine='pyarrow',
                                       compression=compression,
                                       profile_name=profile_name)
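
A minimal usage sketch for the method above, assuming it is exposed through the Woodwork DataFrame accessor as `df.ww.to_parquet` (the `self._dataframe`/`self._schema` attributes suggest the accessor); the entry point and the output directory name are assumptions for illustration:

import pandas as pd
import woodwork as ww  # importing woodwork registers the DataFrame.ww accessor

df = pd.DataFrame({"id": [0, 1, 2], "value": [1.5, 2.0, 3.5]})
df.ww.init(index="id")  # initialize Woodwork typing information

# Write the data plus typing information to a local directory as parquet.
# compression must be one of {'snappy', 'gzip', 'brotli', None}; profile_name
# only matters when `path` points at S3.
df.ww.to_parquet("./woodwork_table", compression="snappy", profile_name=None)
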
Example 2
    def serialize(self, dataframe, profile_name, **kwargs):
        import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE)
        # Serialization to orc relies on pyarrow.Table.from_pandas which doesn't work with Dask
        if _is_dask_dataframe(dataframe):
            msg = "DataFrame type not compatible with orc serialization. Please serialize to another format."
            raise ValueError(msg)
        self.kwargs["engine"] = "pyarrow"
        return super().serialize(dataframe, profile_name, **kwargs)
Example 3
    def serialize(self, dataframe, profile_name, **kwargs):
        import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE)
        if self.filename is not None and _is_dask_dataframe(dataframe):
            raise ValueError(
                "Writing a Dask dataframe to parquet with a filename specified is not supported"
            )
        if self.filename is not None and _is_spark_dataframe(dataframe):
            raise ValueError(
                "Writing a Spark dataframe to parquet with a filename specified is not supported"
            )
        self.kwargs["engine"] = "pyarrow"
        return super().serialize(dataframe, profile_name, **kwargs)
Example 4
def get_transport_params(profile_name):
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    UNSIGNED = import_or_raise("botocore", BOTOCORE_ERR_MSG).UNSIGNED
    Config = import_or_raise("botocore.config", BOTOCORE_ERR_MSG).Config

    if isinstance(profile_name, str):
        session = boto3.Session(profile_name=profile_name)
        transport_params = {"client": session.client("s3")}
    elif profile_name is False or boto3.Session().get_credentials() is None:
        session = boto3.Session()
        client = session.client("s3", config=Config(signature_version=UNSIGNED))
        transport_params = {"client": client}
    else:
        transport_params = None
    return transport_params
Example 5
def get_transport_params(profile_name):
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    UNSIGNED = import_or_raise("botocore", BOTOCORE_ERR_MSG).UNSIGNED
    Config = import_or_raise("botocore.config", BOTOCORE_ERR_MSG).Config

    if isinstance(profile_name, str):
        transport_params = {
            'session': boto3.Session(profile_name=profile_name)
        }
    elif profile_name is False or boto3.Session().get_credentials() is None:
        transport_params = {
            'resource_kwargs': {
                'config': Config(signature_version=UNSIGNED)
            }
        }
    else:
        transport_params = None
    return transport_params
Example 6
def use_smartopen(file_path, path, transport_params=None, read=True):
    open = import_or_raise("smart_open", SMART_OPEN_ERR_MSG).open
    if read:
        with open(path, "rb", transport_params=transport_params) as fin:
            with open(file_path, 'wb') as fout:
                shutil.copyfileobj(fin, fout)
    else:
        with open(file_path, 'rb') as fin:
            with open(path, 'wb', transport_params=transport_params) as fout:
                shutil.copyfileobj(fin, fout)
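
A hypothetical end-to-end sketch combining Examples 4 and 6: build transport parameters for anonymous S3 access, then use them to download a remote archive to a local file. The bucket and key are made up, and the `client`-style transport params from Example 4 appear to target newer smart_open releases (Example 5 shows the older `session`/`resource_kwargs` form):

# profile_name=False takes the anonymous (UNSIGNED) branch in get_transport_params.
transport_params = get_transport_params(profile_name=False)

# read=True copies from `path` (the remote location) into `file_path` (local).
use_smartopen(
    "./woodwork_table.tar",                    # file_path: local destination (hypothetical)
    "s3://example-bucket/woodwork_table.tar",  # path: remote source (hypothetical)
    transport_params=transport_params,
    read=True,
)
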
Example 7
    def _get_library(self):
        table_type = self.typing_info["loading_info"]["table_type"]
        if table_type == "dask":
            DASK_ERR_MSG = (
                "Cannot load Dask DataFrame - unable to import Dask.\n\n"
                "Please install with pip or conda:\n\n"
                'python -m pip install "woodwork[dask]"\n\n'
                "conda install dask")
            lib = import_or_raise("dask.dataframe", DASK_ERR_MSG)
        elif table_type == "spark":
            SPARK_ERR_MSG = (
                "Cannot load Spark DataFrame - unable to import Spark.\n\n"
                "Please install with pip or conda:\n\n"
                'python -m pip install "woodwork[spark]"\n\n'
                "conda install spark\n\n"
                "conda install pyspark")
            lib = import_or_raise("pyspark.pandas", SPARK_ERR_MSG)
            if "compression" in self.kwargs.keys():
                self.kwargs["compression"] = str(self.kwargs["compression"])
        else:
            lib = pd

        return lib
Example 8
def test_import_or_raise():
    assert import_or_raise("pandas", "Module pandas could not be found") == pd

    error = "Module nonexistent could not be found."
    with pytest.raises(ImportError, match=error):
        import_or_raise("nonexistent", error)
Example 9
def test_import_or_raise():
    assert import_or_raise('pandas', 'Module pandas could not be found') == pd

    error = 'Module nonexistent could not be found.'
    with pytest.raises(ImportError, match=error):
        import_or_raise('nonexistent', error)
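
For reference, a minimal sketch of what `import_or_raise` presumably does, inferred from the two tests above and from its call sites in the other examples; the actual Woodwork helper may differ in its details:

import importlib


def import_or_raise(library, error_msg):
    """Import `library` and return the module, or raise ImportError with `error_msg`."""
    # Sketch inferred from test behavior; not necessarily the library's implementation.
    try:
        return importlib.import_module(library)
    except ImportError:
        raise ImportError(error_msg)
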
Example 10
def _typing_information_to_woodwork_table(table_typing_info, validate,
                                          **kwargs):
    """Deserialize Woodwork table from table description.

    Args:
        table_typing_info (dict) : Woodwork typing information. Likely generated using :meth:`.serialize.typing_info_to_dict`
        validate (bool): Whether parameter and data validation should occur during table initialization
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        DataFrame: DataFrame with Woodwork typing information initialized.
    """
    _check_schema_version(table_typing_info['schema_version'])

    path = table_typing_info['path']
    loading_info = table_typing_info['loading_info']

    file = os.path.join(path, loading_info['location'])

    load_format = loading_info['type']
    assert load_format in FORMATS

    kwargs = loading_info.get('params', {})
    table_type = loading_info.get('table_type', 'pandas')

    logical_types = {}
    semantic_tags = {}
    column_descriptions = {}
    column_metadata = {}
    use_standard_tags = {}
    category_dtypes = {}
    for col in table_typing_info['column_typing_info']:
        col_name = col['name']

        ltype_metadata = col['logical_type']
        ltype = ww.type_system.str_to_logical_type(
            ltype_metadata['type'], params=ltype_metadata['parameters'])

        tags = col['semantic_tags']

        if 'index' in tags:
            tags.remove('index')
        elif 'time_index' in tags:
            tags.remove('time_index')

        logical_types[col_name] = ltype
        semantic_tags[col_name] = tags
        column_descriptions[col_name] = col['description']
        column_metadata[col_name] = col['metadata']
        use_standard_tags[col_name] = col['use_standard_tags']

        if col['physical_type']['type'] == 'category':
            # Make sure categories are recreated properly
            cat_values = col['physical_type']['cat_values']
            if table_type == 'pandas' and pd.__version__ > '1.1.5':
                cat_object = pd.CategoricalDtype(
                    pd.Index(cat_values, dtype='object'))
            else:
                cat_object = pd.CategoricalDtype(pd.Series(cat_values))
            category_dtypes[col_name] = cat_object

    compression = kwargs['compression']
    if table_type == 'dask':
        DASK_ERR_MSG = (
            'Cannot load Dask DataFrame - unable to import Dask.\n\n'
            'Please install with pip or conda:\n\n'
            'python -m pip install "woodwork[dask]"\n\n'
            'conda install dask')
        lib = import_or_raise('dask.dataframe', DASK_ERR_MSG)
    elif table_type == 'koalas':
        KOALAS_ERR_MSG = (
            'Cannot load Koalas DataFrame - unable to import Koalas.\n\n'
            'Please install with pip or conda:\n\n'
            'python -m pip install "woodwork[koalas]"\n\n'
            'conda install koalas\n\n'
            'conda install pyspark')
        lib = import_or_raise('databricks.koalas', KOALAS_ERR_MSG)
        compression = str(compression)
    else:
        lib = pd

    if load_format == 'csv':
        dataframe = lib.read_csv(
            file,
            engine=kwargs['engine'],
            compression=compression,
            encoding=kwargs['encoding'],
            dtype=category_dtypes,
        )
    elif load_format == 'pickle':
        dataframe = pd.read_pickle(file, **kwargs)
    elif load_format == 'parquet':
        dataframe = lib.read_parquet(file, engine=kwargs['engine'])

    dataframe.ww.init(name=table_typing_info.get('name'),
                      index=table_typing_info.get('index'),
                      time_index=table_typing_info.get('time_index'),
                      logical_types=logical_types,
                      semantic_tags=semantic_tags,
                      use_standard_tags=use_standard_tags,
                      table_metadata=table_typing_info.get('table_metadata'),
                      column_metadata=column_metadata,
                      column_descriptions=column_descriptions,
                      validate=validate)

    return dataframe
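
A small illustration of the category-dtype reconstruction step above: the categories saved in the typing information are turned into an explicit CategoricalDtype so that read_csv restores the original categories rather than inferring them from the data. The column name and values below are hypothetical:

import io

import pandas as pd

cat_values = ["a", "b", "c"]  # stand-in for col['physical_type']['cat_values']
cat_dtype = pd.CategoricalDtype(pd.Index(cat_values, dtype="object"))

# A tiny in-memory CSV whose column only contains "a": the explicit dtype still
# preserves all three categories, which plain inference would drop.
csv_data = io.StringIO("letters\na\na\n")
df = pd.read_csv(csv_data, dtype={"letters": cat_dtype})
print(df["letters"].cat.categories.tolist())  # ['a', 'b', 'c']
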
Example 11
def description_to_datatable(table_description, **kwargs):
    '''Deserialize DataTable from table description.

    Args:
        table_description (dict) : Description of a :class:`.DataTable`. Likely generated using :meth:`.serialize.datatable_to_description`
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        datatable (woodwork.DataTable) : Instance of :class:`.DataTable`.
    '''
    _check_schema_version(table_description['schema_version'])

    path = table_description['path']
    loading_info = table_description['loading_info']

    file = os.path.join(path, loading_info['location'])

    load_format = loading_info['type']
    assert load_format in FORMATS

    kwargs = loading_info.get('params', {})
    table_type = loading_info.get('table_type', 'pandas')

    compression = kwargs['compression']
    if table_type == 'dask':
        DASK_ERR_MSG = (
            'Cannot load Dask DataTable - unable to import Dask.\n\n'
            'Please install with pip or conda:\n\n'
            'python -m pip install "woodwork[dask]"\n\n'
            'conda install dask'
        )
        lib = import_or_raise('dask.dataframe', DASK_ERR_MSG)
    elif table_type == 'koalas':
        KOALAS_ERR_MSG = (
            'Cannot load Koalas DataTable - unable to import Koalas.\n\n'
            'Please install with pip or conda:\n\n'
            'python -m pip install "woodwork[koalas]"\n\n'
            'conda install koalas\n\n'
            'conda install pyspark'
        )
        lib = import_or_raise('databricks.koalas', KOALAS_ERR_MSG)
        compression = str(compression)
    else:
        lib = pd

    if load_format == 'csv':
        dataframe = lib.read_csv(
            file,
            engine=kwargs['engine'],
            compression=compression,
            encoding=kwargs['encoding'],
        )
    elif load_format == 'pickle':
        dataframe = pd.read_pickle(file, **kwargs)
    elif load_format == 'parquet':
        dataframe = lib.read_parquet(file, engine=kwargs['engine'])

    logical_types = {}
    semantic_tags = {}
    column_descriptions = {}
    column_metadata = {}
    for col in table_description['column_metadata']:
        col_name = col['name']

        ltype_metadata = col['logical_type']
        ltype = ww.type_system.str_to_logical_type(ltype_metadata['type'], params=ltype_metadata['parameters'])

        tags = col['semantic_tags']

        if 'index' in tags:
            tags.remove('index')
        elif 'time_index' in tags:
            tags.remove('time_index')

        logical_types[col_name] = ltype
        semantic_tags[col_name] = tags
        column_descriptions[col_name] = col['description']
        column_metadata[col_name] = col['metadata']

    return DataTable(dataframe,
                     name=table_description.get('name'),
                     index=table_description.get('index'),
                     time_index=table_description.get('time_index'),
                     logical_types=logical_types,
                     semantic_tags=semantic_tags,
                     use_standard_tags=False,
                     table_metadata=table_description.get('table_metadata'),
                     column_metadata=column_metadata,
                     column_descriptions=column_descriptions)
Example 12
    def serialize(self, dataframe, profile_name, **kwargs):
        import_or_raise("pyarrow", PYARROW_IMPORT_ERROR_MESSAGE)
        return super().serialize(dataframe, profile_name, **kwargs)