import json
import os
import tarfile
import tempfile
from pathlib import Path


def read_woodwork_table(path, profile_name=None, validate=False, **kwargs):
    """Read Woodwork table from disk, S3 path, or URL.

    Args:
        path (str): Directory on disk, S3 path, or URL to read `woodwork_typing_info.json`.
        profile_name (str, bool): The AWS profile to use to read from S3. Will default to None and search for
            AWS credentials. Set to False to use an anonymous profile.
        validate (bool, optional): Whether parameter and data validation should occur when initializing the Woodwork
            dataframe during deserialization. Defaults to False. Note: If serialized data was modified outside of
            Woodwork and you are unsure of the validity of the data or typing information, `validate` should be
            set to True.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to the underlying
            deserialization method.

    Returns:
        DataFrame: DataFrame with Woodwork typing information initialized.
    """
    if _is_url(path) or _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            file_name = Path(path).name
            file_path = os.path.join(tmpdir, file_name)
            transport_params = None

            if _is_s3(path):
                transport_params = get_transport_params(profile_name)

            use_smartopen(file_path, path, transport_params)
            with tarfile.open(str(file_path)) as tar:
                tar.extractall(path=tmpdir)

            table_typing_info = read_table_typing_information(tmpdir)
            return _typing_information_to_woodwork_table(table_typing_info, validate, **kwargs)
    else:
        table_typing_info = read_table_typing_information(path)
        return _typing_information_to_woodwork_table(table_typing_info, validate, **kwargs)

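# A minimal usage sketch for read_woodwork_table, assuming a table was previously
# serialized to the hypothetical directory "my_table". Passing validate=True
# re-checks the data against the stored typing information, as the docstring above
# recommends when data may have been modified outside of Woodwork.
df = read_woodwork_table("my_table", validate=True)
print(df.ww.schema)
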
def write_woodwork_table(dataframe, path, profile_name=None, **kwargs):
    """Serialize Woodwork table and write to disk or S3 path.

    Args:
        dataframe (pd.DataFrame, dd.DataFrame, ks.DataFrame): DataFrame with Woodwork typing information initialized.
        path (str): Location on disk or S3 path to write the Woodwork table.
        profile_name (str, bool): The AWS profile specified to write to S3. Will default to None and search for
            AWS credentials. Set to False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to the underlying serialization
            method or to specify AWS profile.
    """
    if _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            os.makedirs(os.path.join(tmpdir, 'data'))
            _dump_table(dataframe, tmpdir, **kwargs)
            file_path = _create_archive(tmpdir)

            transport_params = get_transport_params(profile_name)
            use_smartopen(file_path, path, read=False, transport_params=transport_params)
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        path = os.path.abspath(path)
        os.makedirs(os.path.join(path, 'data'), exist_ok=True)
        _dump_table(dataframe, path, **kwargs)

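# A minimal round-trip sketch combining write_woodwork_table and read_woodwork_table.
# The dataframe contents and the "my_table" directory are hypothetical examples.
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the .ww accessor

df = pd.DataFrame({"id": [0, 1, 2], "value": [1.5, 2.5, 3.5]})
df.ww.init(index="id")
write_woodwork_table(df, "my_table")
restored = read_woodwork_table("my_table")
assert restored.ww.schema == df.ww.schema
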
def read_datatable(path, profile_name=None, **kwargs):
    '''Read DataTable from disk, S3 path, or URL.

    Args:
        path (str): Directory on disk, S3 path, or URL to read `table_description.json`.
        profile_name (str, bool): The AWS profile to use to read from S3. Will default to None and search for
            AWS credentials. Set to False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to the underlying
            deserialization method.

    Returns:
        DataTable: DataTable with typing information initialized.
    '''
    if _is_url(path) or _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            file_name = Path(path).name
            file_path = os.path.join(tmpdir, file_name)
            transport_params = None

            if _is_s3(path):
                transport_params = get_transport_params(profile_name)

            use_smartopen(file_path, path, transport_params)
            with tarfile.open(str(file_path)) as tar:
                tar.extractall(path=tmpdir)

            table_description = read_table_description(tmpdir)
            return description_to_datatable(table_description, **kwargs)
    else:
        table_description = read_table_description(path)
        return description_to_datatable(table_description, **kwargs)

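# A usage sketch for read_datatable, reading from an S3 archive with an anonymous
# profile (profile_name=False). The bucket and key are hypothetical.
dt = read_datatable("s3://my-bucket/my_datatable.tar", profile_name=False)
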
def write_datatable(datatable, path, profile_name=None, **kwargs):
    '''Serialize datatable and write to disk or S3 path.

    Args:
        datatable (DataTable): Instance of :class:`.DataTable`.
        path (str): Location on disk or S3 path to write datatable data and description.
        profile_name (str, bool): The AWS profile specified to write to S3. Will default to None and search for
            AWS credentials. Set to False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to the underlying serialization
            method or to specify AWS profile.
    '''
    if _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            os.makedirs(os.path.join(tmpdir, 'data'))
            dump_table(datatable, tmpdir, **kwargs)
            file_path = create_archive(tmpdir)

            transport_params = get_transport_params(profile_name)
            use_smartopen(file_path, path, read=False, transport_params=transport_params)
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        path = os.path.abspath(path)
        os.makedirs(os.path.join(path, 'data'), exist_ok=True)
        dump_table(datatable, path, **kwargs)

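# A round-trip sketch for write_datatable and read_datatable. The DataTable
# construction is a hypothetical example of the older DataTable API; exact
# constructor arguments depend on the library version in use.
import pandas as pd
from woodwork import DataTable

dt = DataTable(pd.DataFrame({"id": [0, 1], "value": [10, 20]}), index="id")
write_datatable(dt, "my_datatable")
restored = read_datatable("my_datatable")
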
def configure_deserializer(self):
    """Extract info from typing information required to read data and initialize Woodwork."""
    _check_schema_version(self.typing_info["schema_version"])
    loading_info = self.typing_info["loading_info"]
    if not (_is_s3(self.path) or _is_url(self.path)):
        path = self.typing_info["path"]
        self.read_path = os.path.join(path, loading_info["location"])
    self.kwargs = loading_info.get("params", {})
    self._set_init_dict(loading_info)

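# For reference, the shape of the typing information this method consumes, inferred
# from the keys accessed above. All values shown are hypothetical examples; real
# typing info files may contain additional fields.
example_typing_info = {
    "schema_version": "10.0.2",           # checked by _check_schema_version
    "path": "/absolute/path/to/table",    # set for local reads only
    "loading_info": {
        "location": "data/my_table.csv",  # joined with "path" to form read_path
        "params": {},                     # forwarded as kwargs to the reader
    },
}
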
def deserialize(self, profile_name, validate):
    """Reconstruct Woodwork dataframe from saved data and typing information."""
    self.configure_deserializer()
    if _is_url(self.path) or _is_s3(self.path):
        dataframe = self.read_from_s3(profile_name)
    else:
        dataframe = self.read_from_local_path()

    dataframe.ww.init(**self.ww_init_dict, validate=validate)
    return dataframe

def serialize(self, dataframe, profile_name, **kwargs):
    """Serialize data and typing information to disk or S3 path."""
    self.dataframe = dataframe
    self.typing_info = typing_info_to_dict(self.dataframe)

    if _is_s3(self.path):
        self.save_to_s3(profile_name)
    elif _is_url(self.path):
        raise ValueError("Writing to URLs is not supported")
    else:
        self.write_path = os.path.abspath(self.path)
        self.save_to_local_path()

def deserialize(self, profile_name, validate):
    """Reconstruct Woodwork dataframe from saved data and typing information."""
    if _is_url(self.path) or _is_s3(self.path):
        dataframe = self.read_from_s3(profile_name)
    else:
        # Build the local read path from the optional data subdirectory and filename
        if self.data_subdirectory:
            self.path = os.path.join(self.path, self.data_subdirectory)
        self.read_path = self.path
        if self.filename:
            self.read_path = os.path.join(self.path, self.filename)
        dataframe = self.read_from_local_path()

    dataframe.ww.init(**self.ww_init_dict, validate=validate)
    return dataframe

def read_table_typing_information(path, typing_info_filename, profile_name):
    """Read Woodwork typing information from disk, S3 path, or URL.

    Args:
        path (str): Location on disk, S3 path, or URL to read typing info file.
        typing_info_filename (str): Name of JSON file in which typing info is stored.
        profile_name (str, bool): The AWS profile to use to access S3.

    Returns:
        dict: Woodwork typing information dictionary
    """
    if _is_url(path) or _is_s3(path):
        with tempfile.TemporaryDirectory() as tmpdir:
            file_name = Path(path).name
            file_path = os.path.join(tmpdir, file_name)
            transport_params = None

            if _is_s3(path):
                transport_params = get_transport_params(profile_name)

            use_smartopen(file_path, path, transport_params)
            with tarfile.open(str(file_path)) as tar:
                tar.extractall(path=tmpdir)

            typing_info_path = os.path.join(tmpdir, typing_info_filename)
            with open(typing_info_path, "r") as f:
                typing_info = json.load(f)
    else:
        path = os.path.abspath(path)
        assert os.path.exists(path), '"{}" does not exist'.format(path)
        typing_info_path = os.path.join(path, typing_info_filename)
        with open(typing_info_path, "r") as f:
            typing_info = json.load(f)
        typing_info["path"] = path

    return typing_info

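# A usage sketch for read_table_typing_information against a local directory. The
# "my_table" directory is hypothetical; "woodwork_typing_info.json" is the filename
# referenced in read_woodwork_table above.
typing_info = read_table_typing_information("my_table", "woodwork_typing_info.json", profile_name=None)
print(typing_info["loading_info"]["location"])
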
def test_is_url():
    assert _is_url("https://www.google.com/")
    assert not _is_url("google.com")