def _get_filesystem_and_path(passed_filesystem, path):
    if passed_filesystem is None:
        return get_filesystem_from_uri(path)
    else:
        passed_filesystem = _ensure_filesystem(passed_filesystem)
        parsed_path = _parse_uri(path)
        return passed_filesystem, parsed_path
def _get_filesystem_and_path(passed_filesystem, path):
    if passed_filesystem is None:
        return resolve_filesystem_and_path(path, passed_filesystem)
    else:
        passed_filesystem = _ensure_filesystem(passed_filesystem)
        parsed_path = _parse_uri(path)
        return passed_filesystem, parsed_path
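# --- Example (illustrative only, not part of the original module) ---
# A minimal sketch of how _get_filesystem_and_path might be called, assuming it
# is available in the surrounding module; the S3 URI is a made-up placeholder.
fs, resolved_path = _get_filesystem_and_path(None, "s3://example-bucket/dataset")
# With an explicit filesystem the path is only parsed, not resolved from the URI:
# fs, resolved_path = _get_filesystem_and_path(my_fs, "dataset/part-0.parquet")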
def write_dataset(
    df, path, partition_cols, preserve_index, session_primitives, file_format, mode
):
    fs = get_fs(session_primitives=session_primitives)
    fs = _ensure_filesystem(fs)
    mkdir_if_not_exists(fs, path)
    partition_paths = []
    dead_keys = []
    for keys, subgroup in df.groupby(partition_cols):
        subgroup = subgroup.drop(partition_cols, axis="columns")
        if not isinstance(keys, tuple):
            keys = (keys,)
        subdir = "/".join(
            [
                "{colname}={value}".format(colname=name, value=val)
                for name, val in zip(partition_cols, keys)
            ]
        )
        prefix = "/".join([path, subdir])
        if mode == "overwrite_partitions":
            dead_keys += list_objects(prefix, session_primitives=session_primitives)
        full_path = write_file(
            df=subgroup,
            path=prefix,
            preserve_index=preserve_index,
            session_primitives=session_primitives,
            file_format=file_format,
        )
        partition_path = full_path.rpartition("/")[0] + "/"
        keys_str = [str(x) for x in keys]
        partition_paths.append((partition_path, keys_str))
    if mode == "overwrite_partitions" and dead_keys:
        bucket = path.replace("s3://", "").split("/", 1)[0]
        delete_listed_objects(bucket, dead_keys, session_primitives=session_primitives)
    return partition_paths
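# --- Example (illustrative only) ---
# A minimal sketch of calling write_dataset, assuming a session_primitives
# object is already built by the surrounding library; the DataFrame contents,
# bucket name, and partition column are placeholders.
import pandas as pd

example_df = pd.DataFrame({"value": [1, 2, 3], "year": [2019, 2019, 2020]})
partitions = write_dataset(
    df=example_df,
    path="s3://example-bucket/dataset",
    partition_cols=["year"],
    preserve_index=False,
    session_primitives=session_primitives,  # assumed to exist in the caller
    file_format="parquet",
    mode="overwrite_partitions",
)
# Each returned entry pairs a partition prefix with its key values, e.g.
# ("s3://example-bucket/dataset/year=2019/", ["2019"]).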
def __init__(self, path_or_paths, filesystem=None, schema=None, metadata=None,
             split_row_groups=False, validate_schema=True, filters=None,
             metadata_nthreads=1):
    if filesystem is None:
        a_path = path_or_paths
        if isinstance(a_path, list):
            a_path = a_path[0]
        self.fs = _get_fs_from_path(a_path)
    else:
        self.fs = _ensure_filesystem(filesystem)

    self.paths = path_or_paths

    (self.pieces,
     self.partitions,
     self.common_metadata_path,
     self.metadata_path) = _make_manifest(
         path_or_paths, self.fs, metadata_nthreads=metadata_nthreads)

    if self.common_metadata_path is not None:
        with self.fs.open(self.common_metadata_path) as f:
            self.common_metadata = ParquetFile(f).metadata
    else:
        self.common_metadata = None

    if metadata is None and self.metadata_path is not None:
        with self.fs.open(self.metadata_path) as f:
            self.metadata = ParquetFile(f).metadata
    else:
        self.metadata = metadata

    self.schema = schema

    self.split_row_groups = split_row_groups

    if split_row_groups:
        raise NotImplementedError("split_row_groups not yet implemented")

    if validate_schema:
        self.validate_schemas()

    if filters is not None:
        filters = _check_filters(filters)
        self._filter(filters)
def write_file(df, path, preserve_index, session_primitives, file_format):
    fs = get_fs(session_primitives=session_primitives)
    fs = _ensure_filesystem(fs)
    mkdir_if_not_exists(fs, path)
    if file_format == "parquet":
        outfile = guid() + ".parquet"
    elif file_format == "csv":
        outfile = guid() + ".csv"
    full_path = "/".join([path, outfile])
    if file_format == "parquet":
        write_parquet_dataframe(
            df=df, path=full_path, preserve_index=preserve_index, fs=fs
        )
    elif file_format == "csv":
        write_csv_dataframe(df=df, path=full_path, preserve_index=preserve_index, fs=fs)
    return full_path
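# --- Example (illustrative only) ---
# write_file is the single-file counterpart of write_dataset above: it creates
# one uniquely named parquet/csv object under `path` and returns the full key.
# Reuses example_df and the assumed session_primitives from the sketch above;
# the prefix is a placeholder.
full_path = write_file(
    df=example_df,
    path="s3://example-bucket/dataset/year=2020",
    preserve_index=False,
    session_primitives=session_primitives,
    file_format="csv",
)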
def write_to_dataset(table, root_path, partition_cols=None,
                     filesystem=None, preserve_index=True, **kwargs):
    """
    Wrapper around parquet.write_table for writing a Table to
    Parquet format by partitions.
    For each combination of partition columns and values,
    subdirectories are created in the following manner:

    root_dir/
      group1=value1
        group2=value1
          <uuid>.parquet
        group2=value2
          <uuid>.parquet
      group1=valueN
        group2=value1
          <uuid>.parquet
        group2=valueN
          <uuid>.parquet

    Parameters
    ----------
    table : pyarrow.Table
    root_path : string,
        The root directory of the dataset
    filesystem : FileSystem, default None
        If nothing passed, paths assumed to be found in the local on-disk
        filesystem
    partition_cols : list,
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given
    preserve_index : bool,
        Parameter for instantiating Table; preserve pandas index or not.
    **kwargs : dict,
        kwargs for write_table function.
    """
    if filesystem is None:
        fs = _get_fs_from_path(root_path)
    else:
        fs = _ensure_filesystem(filesystem)

    _mkdir_if_not_exists(fs, root_path)

    if partition_cols is not None and len(partition_cols) > 0:
        df = table.to_pandas()
        partition_keys = [df[col] for col in partition_cols]
        data_df = df.drop(partition_cols, axis='columns')
        data_cols = df.columns.drop(partition_cols)
        if len(data_cols) == 0:
            raise ValueError('No data left to save outside partition columns')

        subschema = table.schema

        # ARROW-2891: Ensure the output_schema is preserved when writing a
        # partitioned dataset
        for partition_col in partition_cols:
            subschema = subschema.remove(
                subschema.get_field_index(partition_col))

        for keys, subgroup in data_df.groupby(partition_keys):
            if not isinstance(keys, tuple):
                keys = (keys, )
            subdir = '/'.join(
                ['{colname}={value}'.format(colname=name, value=val)
                 for name, val in zip(partition_cols, keys)])
            subtable = pa.Table.from_pandas(subgroup,
                                            preserve_index=preserve_index,
                                            schema=subschema,
                                            safe=False)
            prefix = '/'.join([root_path, subdir])
            _mkdir_if_not_exists(fs, prefix)
            outfile = guid() + '.parquet'
            full_path = '/'.join([prefix, outfile])
            with fs.open(full_path, 'wb') as f:
                write_table(subtable, f, **kwargs)
    else:
        outfile = guid() + '.parquet'
        full_path = '/'.join([root_path, outfile])
        with fs.open(full_path, 'wb') as f:
            write_table(table, f, **kwargs)
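# --- Example (illustrative only) ---
# A minimal sketch of writing a partitioned dataset with this function. With
# pyarrow installed, the equivalent public entry point is
# pyarrow.parquet.write_to_dataset; the data and output directory below are
# placeholders.
import pandas as pd
import pyarrow as pa

table = pa.Table.from_pandas(
    pd.DataFrame({"group1": ["a", "a", "b"],
                  "group2": [1, 2, 1],
                  "x": [0.1, 0.2, 0.3]})
)
write_to_dataset(table, "/tmp/example_dataset", partition_cols=["group1", "group2"])
# Produces /tmp/example_dataset/group1=a/group2=1/<uuid>.parquet and so on,
# matching the layout described in the docstring.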
def __init__(self, path, key=None, secret=None, endpoint=None,
             proxy=None, proxy_port=None, filesystem=None):
    self.path = path
    self.url_path = urlparse(path)

    if str(path).endswith(".manifest"):
        self.manifest_path = path
        if str(path).startswith(LOCAL_FILE_PREFIX):
            self.manifest_path = str(path)[len(LOCAL_FILE_PREFIX):]

    if filesystem is None:
        a_path = self.path
        if isinstance(a_path, list):
            a_path = a_path[0]
        self.fs = _get_fs_from_path(a_path)
    else:
        self.fs = _ensure_filesystem(filesystem)

    self.pieces = list()

    if self.url_path.scheme == 's3a':
        if key is None or secret is None or endpoint is None:
            raise ValueError('key, secret, endpoint should not be None')

        if proxy is None and proxy_port is None:
            carbon_splits = ArrowCarbonReader().builder(self.path) \
                .withHadoopConf("fs.s3a.access.key", key) \
                .withHadoopConf("fs.s3a.secret.key", secret) \
                .withHadoopConf("fs.s3a.endpoint", endpoint) \
                .getSplits(True)

            configuration = Configuration()
            configuration.set("fs.s3a.access.key", key)
            configuration.set("fs.s3a.secret.key", secret)
            configuration.set("fs.s3a.endpoint", endpoint)

            self.configuration = configuration
        elif proxy is not None and proxy_port is not None:
            carbon_splits = ArrowCarbonReader().builder(self.path) \
                .withHadoopConf("fs.s3a.access.key", key) \
                .withHadoopConf("fs.s3a.secret.key", secret) \
                .withHadoopConf("fs.s3a.endpoint", endpoint) \
                .withHadoopConf("fs.s3a.proxy.host", proxy) \
                .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
                .getSplits(True)

            configuration = Configuration()
            configuration.set("fs.s3a.access.key", key)
            configuration.set("fs.s3a.secret.key", secret)
            configuration.set("fs.s3a.endpoint", endpoint)
            configuration.set("fs.s3a.proxy.host", proxy)
            configuration.set("fs.s3a.proxy.port", proxy_port)

            self.configuration = configuration
        else:
            raise ValueError('wrong proxy & proxy_port configuration')

        if str(path).endswith(".manifest"):
            from obs import ObsClient
            obsClient = ObsClient(access_key_id=key,
                                  secret_access_key=secret,
                                  server=str(endpoint).replace('http://', ''),
                                  long_conn_mode=True)
            sources = manifest.getSources(self.manifest_path, CARBON, obsClient)
            if sources:
                self.file_path = sources[0]
            else:
                raise Exception("Manifest source can't be None!")
            carbon_schema = CarbonSchemaReader().readSchema(
                self.file_path, self.configuration.conf)
        else:
            carbon_schema = CarbonSchemaReader().readSchema(
                self.path, self.configuration.conf)

        for split in carbon_splits:
            # split = self.url_path.scheme + "://" + self.url_path.netloc + split
            folder_path = path
            if str(path).endswith(".manifest"):
                folder_path = str(self.file_path)[0:(str(self.file_path).rindex('/'))]
            self.pieces.append(
                CarbonDatasetPiece(folder_path, carbon_schema, split,
                                   key=key, secret=secret, endpoint=endpoint,
                                   proxy=proxy, proxy_port=proxy_port))
    else:
        if str(path).endswith(".manifest"):
            sources = manifest.getSources(self.manifest_path, CARBON)
            if sources:
                self.file_path = sources[0]
            else:
                raise Exception("Manifest source can't be None!")

            try:
                carbon_schema = CarbonSchemaReader().readSchema(self.file_path)
            except:
                raise Exception("readSchema has some errors: " + self.file_path)
        else:
            try:
                carbon_schema = CarbonSchemaReader().readSchema(self.path)
            except:
                raise Exception("readSchema has some errors")

        carbon_splits = ArrowCarbonReader().builder(self.path) \
            .getSplits(True)

        for split in carbon_splits:
            # split = self.url_path.scheme + "://" + self.url_path.netloc + split
            if str(path).endswith(".manifest"):
                self.pieces.append(
                    CarbonDatasetPiece(
                        str(self.file_path)[0:(str(self.file_path).rindex('/'))],
                        carbon_schema, split))
            else:
                self.pieces.append(
                    CarbonDatasetPiece(path, carbon_schema, split))

    self.number_of_splits = len(self.pieces)
    self.schema = self.getArrowSchema()
    # TODO add mechanism to get the file path based on file filter
    self.common_metadata_path = self.url_path.path + '/_common_metadata'
    self.common_metadata = None
    try:
        if self.fs.exists(self.common_metadata_path):
            with self.fs.open(self.common_metadata_path) as f:
                self.common_metadata = ParquetFile(f).metadata
    except:
        self.common_metadata = None
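# --- Example (illustrative only) ---
# A hedged sketch of constructing the dataset defined by the __init__ above,
# assuming it belongs to a class named CarbonDataset (implied by the
# CarbonDatasetPiece objects it builds) and that the CarbonData/pycarbon
# bindings it imports are installed. The endpoint, credentials, and path are
# placeholders, not working values.
dataset = CarbonDataset(
    "s3a://example-bucket/carbon_table",
    key="ACCESS_KEY",
    secret="SECRET_KEY",
    endpoint="http://obs.example.com",
)
print(dataset.number_of_splits, dataset.schema)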
def __init__(self, path, key=None, secret=None, endpoint=None,
             proxy=None, proxy_port=None, filesystem=None):
    self.path = path
    self.url_path = urlparse(path)

    if filesystem is None:
        a_path = self.path
        if isinstance(a_path, list):
            a_path = a_path[0]
        self.fs = _get_fs_from_path(a_path)
    else:
        self.fs = _ensure_filesystem(filesystem)

    self.pieces = list()

    if self.url_path.scheme == 's3a':
        if key is None or secret is None or endpoint is None:
            raise ValueError('key, secret, endpoint should not be None')

        if proxy is None and proxy_port is None:
            carbon_splits = CarbonReader().builder(self.path) \
                .withHadoopConf("fs.s3a.access.key", key) \
                .withHadoopConf("fs.s3a.secret.key", secret) \
                .withHadoopConf("fs.s3a.endpoint", endpoint) \
                .getSplits()

            configuration = Configuration()
            configuration.set("fs.s3a.access.key", key)
            configuration.set("fs.s3a.secret.key", secret)
            configuration.set("fs.s3a.endpoint", endpoint)

            self.configuration = configuration
        elif proxy is not None and proxy_port is not None:
            carbon_splits = CarbonReader().builder(self.path) \
                .withHadoopConf("fs.s3a.access.key", key) \
                .withHadoopConf("fs.s3a.secret.key", secret) \
                .withHadoopConf("fs.s3a.endpoint", endpoint) \
                .withHadoopConf("fs.s3a.proxy.host", proxy) \
                .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
                .getSplits()

            configuration = Configuration()
            configuration.set("fs.s3a.access.key", key)
            configuration.set("fs.s3a.secret.key", secret)
            configuration.set("fs.s3a.endpoint", endpoint)
            configuration.set("fs.s3a.proxy.host", proxy)
            configuration.set("fs.s3a.proxy.port", proxy_port)

            self.configuration = configuration
        else:
            raise ValueError('wrong proxy & proxy_port configuration')

        carbon_schema = CarbonSchemaReader().readSchema(
            self.path, self.configuration.conf)

        for split in carbon_splits:
            # split = self.url_path.scheme + "://" + self.url_path.netloc + split
            self.pieces.append(
                CarbonDatasetPiece(path, carbon_schema, split,
                                   key=key, secret=secret, endpoint=endpoint,
                                   proxy=proxy, proxy_port=proxy_port))
    else:
        carbon_splits = CarbonReader().builder(self.path) \
            .getSplits()

        carbon_schema = CarbonSchemaReader().readSchema(self.path)

        for split in carbon_splits:
            # split = self.url_path.scheme + "://" + self.url_path.netloc + split
            self.pieces.append(
                CarbonDatasetPiece(path, carbon_schema, split))

    self.number_of_splits = len(self.pieces)
    self.schema = self.getArrowSchema()
    # TODO add mechanism to get the file path based on file filter
    self.common_metadata_path = self.url_path.path + '/_common_metadata'
    self.common_metadata = None
    try:
        if self.fs.exists(self.common_metadata_path):
            with self.fs.open(self.common_metadata_path) as f:
                self.common_metadata = ParquetFile(f).metadata
    except:
        self.common_metadata = None
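# --- Example (illustrative only) ---
# The same constructor for a local CarbonData store: without an s3a scheme it
# skips the Hadoop S3A configuration and only needs a path. As above, the class
# name CarbonDataset is assumed from context, and the directory is a
# placeholder that must already contain CarbonData files.
dataset = CarbonDataset("/tmp/carbon_table")
for piece in dataset.pieces:
    print(piece)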