Example #1
def read_woodwork_table(path, profile_name=None, validate=False, **kwargs):
    """Read Woodwork table from disk, S3 path, or URL.

        Args:
            path (str): Directory on disk, S3 path, or URL to read `woodwork_typing_info.json`.
            profile_name (str, bool): The AWS profile to use when reading from S3. Defaults to None, which searches for AWS credentials.
                Set to False to use an anonymous profile.
            validate (bool, optional): Whether parameter and data validation should occur when initializing the Woodwork dataframe
                during deserialization. Defaults to False. Note: If serialized data was modified outside of Woodwork and you
                are unsure of the validity of the data or typing information, `validate` should be set to True.
            kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

        Returns:
            DataFrame: DataFrame with Woodwork typing information initialized.
    """
    if _is_url(path) or _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            file_name = Path(path).name
            file_path = os.path.join(tmpdir, file_name)
            transport_params = None

            if _is_s3(path):
                transport_params = get_transport_params(profile_name)

            use_smartopen(file_path, path, transport_params)
            with tarfile.open(str(file_path)) as tar:
                tar.extractall(path=tmpdir)

            table_typing_info = read_table_typing_information(tmpdir)
            return _typing_information_to_woodwork_table(
                table_typing_info, validate, **kwargs)
    else:
        table_typing_info = read_table_typing_information(path)
        return _typing_information_to_woodwork_table(table_typing_info,
                                                     validate, **kwargs)
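A minimal usage sketch for read_woodwork_table, assuming a hypothetical local directory that was previously serialized, with validation enabled:

# Hypothetical path to a directory written earlier by write_woodwork_table (Example #3).
# validate=True re-runs parameter and data validation while the typing information is initialized.
df = read_woodwork_table("./my_woodwork_table", validate=True)
print(df.ww)  # the ww accessor holds the typing information initialized on read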
Example #2
def read_datatable(path, profile_name=None, **kwargs):
    '''Read DataTable from disk, S3 path, or URL.

        Args:
            path (str): Directory on disk, S3 path, or URL to read `table_description.json`.
            profile_name (str, bool): The AWS profile to use when reading from S3. Defaults to None, which searches for AWS credentials.
                Set to False to use an anonymous profile.
            kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.
    '''
    if _is_url(path) or _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            file_name = Path(path).name
            file_path = os.path.join(tmpdir, file_name)
            transport_params = None

            if _is_s3(path):
                transport_params = get_transport_params(profile_name)

            use_smartopen(file_path, path, transport_params)
            with tarfile.open(str(file_path)) as tar:
                tar.extractall(path=tmpdir)

            table_description = read_table_description(tmpdir)
            return description_to_datatable(table_description, **kwargs)
    else:
        table_description = read_table_description(path)
        return description_to_datatable(table_description, **kwargs)
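A comparable sketch for the DataTable variant, reading from a hypothetical S3 archive with an anonymous profile (profile_name=False, as documented above):

# Hypothetical S3 path; False requests anonymous access instead of stored AWS credentials.
dt = read_datatable("s3://my-bucket/datatable.tar", profile_name=False)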
Example #3
def write_woodwork_table(dataframe, path, profile_name=None, **kwargs):
    """Serialize Woodwork table and write to disk or S3 path.

    Args:
        dataframe (pd.DataFrame, dd.DataFrame, ks.DataFrame): DataFrame with Woodwork typing information initialized.
        path (str): Location on disk or S3 path to write the Woodwork table.
        profile_name (str, bool): The AWS profile specified to write to S3. Defaults to None, which searches for AWS credentials.
            Set to False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments to pass to the underlying serialization method or to specify the AWS profile.
    """
    if _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            os.makedirs(os.path.join(tmpdir, 'data'))
            _dump_table(dataframe, tmpdir, **kwargs)
            file_path = _create_archive(tmpdir)

            transport_params = get_transport_params(profile_name)
            use_smartopen(file_path,
                          path,
                          read=False,
                          transport_params=transport_params)
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        path = os.path.abspath(path)
        os.makedirs(os.path.join(path, 'data'), exist_ok=True)
        _dump_table(dataframe, path, **kwargs)
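A round-trip sketch under the assumption that df is a DataFrame with Woodwork typing already initialized and that the local directory name is hypothetical: write the table with write_woodwork_table, then read it back with read_woodwork_table from Example #1:

# Serialize to a local directory, then deserialize with validation enabled.
write_woodwork_table(df, "./my_woodwork_table")
df_roundtrip = read_woodwork_table("./my_woodwork_table", validate=True)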
Example #4
def write_datatable(datatable, path, profile_name=None, **kwargs):
    '''Serialize datatable and write to disk or S3 path.

    Args:
        datatable (DataTable): Instance of :class:`.DataTable`.
        path (str): Location on disk or S3 path to write datatable data and description.
        profile_name (str, bool): The AWS profile specified to write to S3. Defaults to None, which searches for AWS credentials.
            Set to False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments to pass to the underlying serialization method or to specify the AWS profile.
    '''
    if _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            os.makedirs(os.path.join(tmpdir, 'data'))
            dump_table(datatable, tmpdir, **kwargs)
            file_path = create_archive(tmpdir)

            transport_params = get_transport_params(profile_name)
            use_smartopen(file_path,
                          path,
                          read=False,
                          transport_params=transport_params)
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        path = os.path.abspath(path)
        os.makedirs(os.path.join(path, 'data'), exist_ok=True)
        dump_table(datatable, path, **kwargs)
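A sketch of an S3 write, assuming dt is an existing DataTable and that the bucket and profile names are hypothetical:

# The archive is assembled in a temporary directory and uploaded via smart_open
# using the named profile from the local AWS credentials file.
write_datatable(dt, "s3://my-bucket/datatable.tar", profile_name="my-profile")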
Example #5
    def configure_deserializer(self):
        """Extract info from typing information required to read data and initialize Woodwork"""
        _check_schema_version(self.typing_info["schema_version"])
        loading_info = self.typing_info["loading_info"]
        if not (_is_s3(self.path) or _is_url(self.path)):
            path = self.typing_info["path"]
            self.read_path = os.path.join(path, loading_info["location"])
        self.kwargs = loading_info.get("params", {})
        self._set_init_dict(loading_info)
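configure_deserializer only touches a handful of keys, so a sketch of the minimal typing-information shape it assumes looks like the following (values are illustrative placeholders, not the full serialized format):

typing_info = {
    "schema_version": "0.0.0",         # checked by _check_schema_version; value illustrative
    "path": "/local/table/dir",        # present for local reads (see Example #9)
    "loading_info": {
        "location": "data/table.csv",  # joined onto path to build read_path
        "params": {},                  # forwarded to the read method as kwargs
    },
}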
Example #6
    def deserialize(self, profile_name, validate):
        """Reconstruct Woodwork dataframe from saved data and typing information"""
        self.configure_deserializer()
        if _is_url(self.path) or _is_s3(self.path):
            dataframe = self.read_from_s3(profile_name)
        else:
            dataframe = self.read_from_local_path()
        dataframe.ww.init(**self.ww_init_dict, validate=validate)
        return dataframe
Example #7
    def serialize(self, dataframe, profile_name, **kwargs):
        """Serialize data and typing information to disk."""
        self.dataframe = dataframe
        self.typing_info = typing_info_to_dict(self.dataframe)

        if _is_s3(self.path):
            self.save_to_s3(profile_name)
        elif _is_url(self.path):
            raise ValueError("Writing to URLs is not supported")
        else:
            self.write_path = os.path.abspath(self.path)
            self.save_to_local_path()
Example #8
    def deserialize(self, profile_name, validate):
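        """Reconstruct Woodwork dataframe from saved data and typing information"""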
        if _is_url(self.path) or _is_s3(self.path):
            dataframe = self.read_from_s3(profile_name)
        else:
            if self.data_subdirectory:
                self.path = os.path.join(self.path, self.data_subdirectory)
            self.read_path = self.path
            if self.filename:
                self.read_path = os.path.join(self.path, self.filename)

            dataframe = self.read_from_local_path()
        dataframe.ww.init(**self.ww_init_dict, validate=validate)
        return dataframe
Example #9
def read_table_typing_information(path, typing_info_filename, profile_name):
    """Read Woodwork typing information from disk, S3 path, or URL.

    Args:
        path (str): Location on disk, S3 path, or URL to read typing info file.
        typing_info_filename (str): Name of JSON file in which typing info is stored.
        profile_name (str, bool): The AWS profile used to access S3.

    Returns:
        dict: Woodwork typing information dictionary
    """
    if _is_url(path) or _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            file_name = Path(path).name
            file_path = os.path.join(tmpdir, file_name)
            transport_params = None

            if _is_s3(path):
                transport_params = get_transport_params(profile_name)

            use_smartopen(file_path, path, transport_params)
            with tarfile.open(str(file_path)) as tar:
                tar.extractall(path=tmpdir)

            typing_info_path = os.path.join(tmpdir, typing_info_filename)
            with open(typing_info_path, "r") as f:
                typing_info = json.load(f)
    else:
        path = os.path.abspath(path)
        assert os.path.exists(path), '"{}" does not exist'.format(path)
        typing_info_path = os.path.join(path, typing_info_filename)
        with open(typing_info_path, "r") as f:
            typing_info = json.load(f)
        typing_info["path"] = path

    return typing_info
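A hypothetical local call, using the typing info filename mentioned in Example #1 and a directory written by write_woodwork_table:

typing_info = read_table_typing_information(
    "./my_woodwork_table", "woodwork_typing_info.json", profile_name=None)
print(typing_info["loading_info"]["location"])  # relative location of the data file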
Example #10
    def read_from_s3(self, profile_name):
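        """Read data from S3 into a dataframe"""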
        with tempfile.TemporaryDirectory() as tmpdir:
            tar_filename = Path(self.path).name
            tar_filepath = os.path.join(tmpdir, tar_filename)
            transport_params = None

            if _is_s3(self.path):
                transport_params = get_transport_params(profile_name)

            use_smartopen(tar_filepath, self.path, transport_params)
            with tarfile.open(str(tar_filepath)) as tar:
                tar.extractall(path=tmpdir)

            self.read_path = os.path.join(tmpdir, self.data_subdirectory, self.filename)

            return self.read_from_local_path()
Example #11
    def read_from_s3(self, profile_name):
        """Read data from S3 into a dataframe"""
        with tempfile.TemporaryDirectory() as tmpdir:
            tar_filename = Path(self.path).name
            tar_filepath = os.path.join(tmpdir, tar_filename)
            transport_params = None

            if _is_s3(self.path):
                transport_params = get_transport_params(profile_name)

            use_smartopen(tar_filepath, self.path, transport_params)
            with tarfile.open(str(tar_filepath)) as tar:
                tar.extractall(path=tmpdir)
            self.read_path = os.path.join(
                tmpdir, self.typing_info["loading_info"]["location"])
            return self.read_from_local_path()
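Both read_from_s3 variants expect the downloaded file to be a tar archive containing the serialized data plus the typing information. A hypothetical sketch of what _create_archive (used in Example #3) might look like under that assumption; the real implementation may differ:

import os
import tarfile

def _create_archive(tmpdir):
    # Hypothetical sketch: pack everything _dump_table wrote (the data
    # subdirectory plus the typing info JSON) into one tar file.
    file_path = os.path.join(tmpdir, "ww-table.tar")
    with tarfile.open(file_path, "w") as tar:
        for name in os.listdir(tmpdir):
            if name != os.path.basename(file_path):
                tar.add(os.path.join(tmpdir, name), arcname=name)
    return file_path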
Example #12
def test_is_s3():
    assert _is_s3("s3://test-bucket/test-key")
    assert not _is_s3("https://woodwork-static.s3.amazonaws.com/")
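The test above only pins down _is_s3 on two inputs; a minimal sketch of helpers consistent with it (the library's actual implementations may differ):

from urllib.parse import urlparse

def _is_s3(string):
    # Only s3:// URIs count, so plain HTTPS links to S3 buckets return False,
    # matching the assertions above.
    return urlparse(string).scheme == "s3"

def _is_url(string):
    # http(s) links are treated as URLs.
    return urlparse(string).scheme in ("http", "https")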