def add_partitions(self, database, table, partition_paths, file_format, compression, extra_args=None):
    if not partition_paths:
        return None
    partitions = list()
    for partition in partition_paths:
        if file_format == "parquet":
            partition_def = Glue.parquet_partition_definition(partition=partition, compression=compression)
        elif file_format == "csv":
            partition_def = Glue.csv_partition_definition(partition=partition,
                                                          compression=compression,
                                                          extra_args=extra_args)
        else:
            raise UnsupportedFileFormat(file_format)
        partitions.append(partition_def)
    pages_num = int(ceil(len(partitions) / 100.0))
    for _ in range(pages_num):
        page = partitions[:100]
        del partitions[:100]
        res = self._client_glue.batch_create_partition(DatabaseName=database,
                                                       TableName=table,
                                                       PartitionInputList=page)
        for error in res["Errors"]:
            if "ErrorDetail" in error:
                if "ErrorCode" in error["ErrorDetail"]:
                    if error["ErrorDetail"]["ErrorCode"] != "AlreadyExistsException":
                        raise ApiError(f"{error}")
def _data_to_s3_object_writer(dataframe, path, preserve_index, session_primitives, file_format):
    fs = s3.get_fs(session_primitives=session_primitives)
    fs = pyarrow.filesystem._ensure_filesystem(fs)
    s3.mkdir_if_not_exists(fs, path)
    if file_format == "parquet":
        outfile = pyarrow.compat.guid() + ".parquet"
    elif file_format == "csv":
        outfile = pyarrow.compat.guid() + ".csv"
    else:
        raise UnsupportedFileFormat(file_format)
    object_path = "/".join([path, outfile])
    if file_format == "parquet":
        Pandas.write_parquet_dataframe(
            dataframe=dataframe,
            path=object_path,
            preserve_index=preserve_index,
            fs=fs,
        )
    elif file_format == "csv":
        Pandas.write_csv_dataframe(
            dataframe=dataframe,
            path=object_path,
            preserve_index=preserve_index,
            fs=fs,
        )
    return object_path
def create_table(self,
                 database,
                 table,
                 schema,
                 path,
                 file_format,
                 compression,
                 partition_cols_schema=None,
                 extra_args=None,
                 description: Optional[str] = None,
                 parameters: Optional[Dict[str, str]] = None,
                 columns_comments: Optional[Dict[str, str]] = None) -> None:
    """
    Create Glue table (Catalog)

    :param database: AWS Glue Database name
    :param table: AWS Glue table name
    :param schema: Table schema
    :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
    :param file_format: "csv" or "parquet"
    :param compression: None, gzip, snappy, etc
    :param partition_cols_schema: Partitions schema
    :param extra_args: Extra arguments specific to each file format (E.g. "sep" for CSV)
    :param description: Table description
    :param parameters: Key/value pairs to tag the table (Optional[Dict[str, str]])
    :param columns_comments: Column names and the related comments (Optional[Dict[str, str]])
    :return: None
    """
    if file_format == "parquet":
        table_input = Glue.parquet_table_definition(table, partition_cols_schema, schema, path, compression)
    elif file_format == "csv":
        table_input = Glue.csv_table_definition(table,
                                                partition_cols_schema,
                                                schema,
                                                path,
                                                compression,
                                                extra_args=extra_args)
    else:
        raise UnsupportedFileFormat(file_format)
    if description is not None:
        table_input["Description"] = description
    if parameters is not None:
        for k, v in parameters.items():
            table_input["Parameters"][k] = v
    if columns_comments is not None:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name = col["Name"]
            if name in columns_comments:
                col["Comment"] = columns_comments[name]
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                par["Comment"] = columns_comments[name]
    self._client_glue.create_table(DatabaseName=database, TableInput=table_input)
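# Usage sketch (hedged): create_table above belongs to a Glue wrapper class that a Session
# object appears to expose as `session.glue` (see create_glue_table further below). The
# `awswrangler.Session()` constructor and every name below (bucket, database, table, schema)
# are illustrative assumptions, not taken from the source.
import awswrangler

session = awswrangler.Session()
session.glue.create_table(
    database="my_glue_db",                          # hypothetical Glue database
    table="my_table",                               # hypothetical Glue table
    schema=[("id", "bigint"), ("name", "string")],  # (column, Athena/Glue type) pairs
    partition_cols_schema=[("region", "string")],
    path="s3://my-bucket/my-dataset/",              # hypothetical S3 prefix
    file_format="parquet",
    compression="snappy",
    description="Example table",
    parameters={"owner": "data-team"},
    columns_comments={"id": "primary key", "region": "partition column"},
)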
def read(
    path,
    header="infer",
    names=None,
    dtype=None,
    sep=",",
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    escapechar=None,
    parse_dates=False,
    infer_datetime_format=False,
    encoding=None,
    file_format="csv",
    region=None,
    key=None,
    secret=None,
    profile=None,
):
    file_format = file_format.lower()
    if file_format not in ["parquet", "csv"]:
        raise UnsupportedFileFormat(file_format)
    session_primitives = SessionPrimitives(region=region, key=key, secret=secret, profile=profile)
    session = get_session(session_primitives=session_primitives)
    bucket_name, key_path = parse_path(path)
    s3_client = session.client("s3", use_ssl=True)
    buff = BytesIO()
    s3_client.download_fileobj(bucket_name, key_path, buff)
    buff.seek(0)
    df = None
    if file_format == "csv":
        df = pandas.read_csv(
            buff,
            header=header,
            names=names,
            sep=sep,
            quotechar=quotechar,
            quoting=quoting,
            escapechar=escapechar,
            parse_dates=parse_dates,
            infer_datetime_format=infer_datetime_format,
            lineterminator=lineterminator,
            dtype=dtype,
            encoding=encoding,
        )
    buff.close()
    return df
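# Usage sketch: a minimal call to the `read` helper defined just above. The S3 object,
# region, and the assumption that the function is importable from its defining module are
# illustrative only.
df = read(
    path="s3://my-bucket/data/sample.csv",  # hypothetical CSV object
    sep=",",
    header="infer",
    file_format="csv",
    region="us-east-1",
)
print(df.head())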
def _data_to_s3_object_writer(dataframe,
                              path,
                              preserve_index,
                              compression,
                              session_primitives,
                              file_format,
                              cast_columns=None,
                              extra_args=None,
                              isolated_dataframe=False):
    fs = s3.get_fs(session_primitives=session_primitives)
    fs = pyarrow.filesystem._ensure_filesystem(fs)
    s3.mkdir_if_not_exists(fs, path)
    if compression is None:
        compression_end = ""
    elif compression == "snappy":
        compression_end = ".snappy"
    elif compression == "gzip":
        compression_end = ".gz"
    else:
        raise InvalidCompression(compression)
    guid = pyarrow.compat.guid()
    if file_format == "parquet":
        outfile = f"{guid}.parquet{compression_end}"
    elif file_format == "csv":
        outfile = f"{guid}.csv{compression_end}"
    else:
        raise UnsupportedFileFormat(file_format)
    object_path = "/".join([path, outfile])
    if file_format == "parquet":
        Pandas.write_parquet_dataframe(dataframe=dataframe,
                                       path=object_path,
                                       preserve_index=preserve_index,
                                       compression=compression,
                                       fs=fs,
                                       cast_columns=cast_columns,
                                       isolated_dataframe=isolated_dataframe)
    elif file_format == "csv":
        Pandas.write_csv_dataframe(dataframe=dataframe,
                                   path=object_path,
                                   preserve_index=preserve_index,
                                   compression=compression,
                                   fs=fs,
                                   extra_args=extra_args)
    return object_path
def create_table(self, database, table, schema, path, file_format, partition_cols=None):
    if file_format == "parquet":
        table_input = Glue.parquet_table_definition(table, partition_cols, schema, path)
    elif file_format == "csv":
        table_input = Glue.csv_table_definition(table, partition_cols, schema, path)
    else:
        raise UnsupportedFileFormat(file_format)
    self._client_glue.create_table(DatabaseName=database, TableInput=table_input)
def _write_data(
    df,
    path,
    session_primitives,
    partition_cols=None,
    preserve_index=True,
    file_format="parquet",
    mode="append",
    num_procs=None,
    num_files=2,
):
    """
    Write the files (Parquet or CSV) to S3
    """
    if not num_procs:
        num_procs = mp.cpu_count()
    if path[-1] == "/":
        path = path[:-1]
    file_format = file_format.lower()
    if file_format not in ["parquet", "csv"]:
        raise UnsupportedFileFormat(file_format)
    partition_paths = None
    if partition_cols is not None and len(partition_cols) > 0:
        partition_paths = write_dataset_manager(
            df=df,
            path=path,
            partition_cols=partition_cols,
            session_primitives=session_primitives,
            preserve_index=preserve_index,
            file_format=file_format,
            mode=mode,
            num_procs=num_procs,
            num_files=num_files,
        )
    else:
        write_file_manager(
            df=df,
            path=path,
            preserve_index=preserve_index,
            session_primitives=session_primitives,
            file_format=file_format,
            num_procs=num_procs,
        )
    return partition_paths
def create_table(self, database, table, schema, path, file_format, partition_cols=None):
    client = self._session.boto3_session.client(service_name="glue", config=self._session.botocore_config)
    if file_format == "parquet":
        table_input = Glue.parquet_table_definition(table, partition_cols, schema, path)
    elif file_format == "csv":
        table_input = Glue.csv_table_definition(table, partition_cols, schema, path)
    else:
        raise UnsupportedFileFormat(file_format)
    client.create_table(DatabaseName=database, TableInput=table_input)
def add_partitions(self, database, table, partition_paths, file_format):
    if not partition_paths:
        return None
    partitions = list()
    for partition in partition_paths:
        if file_format == "parquet":
            partition_def = Glue.parquet_partition_definition(partition)
        elif file_format == "csv":
            partition_def = Glue.csv_partition_definition(partition)
        else:
            raise UnsupportedFileFormat(file_format)
        partitions.append(partition_def)
    pages_num = int(ceil(len(partitions) / 100.0))
    for _ in range(pages_num):
        page = partitions[:100]
        del partitions[:100]
        self._client_glue.batch_create_partition(DatabaseName=database,
                                                 TableName=table,
                                                 PartitionInputList=page)
def create_glue_table(self,
                      database,
                      path,
                      dataframe,
                      file_format,
                      compression,
                      table=None,
                      serde=None,
                      sep=",",
                      partition_by=None,
                      load_partitions=True,
                      replace_if_exists=True,
                      description: Optional[str] = None,
                      parameters: Optional[Dict[str, str]] = None,
                      columns_comments: Optional[Dict[str, str]] = None):
    """
    Create a Glue metadata table pointing to some dataset stored on AWS S3.

    :param dataframe: PySpark DataFrame
    :param file_format: File format (E.g. "parquet", "csv")
    :param partition_by: Columns used for partitioning
    :param path: AWS S3 path
    :param compression: Compression (e.g. gzip, snappy, lzo, etc)
    :param sep: Separator token for CSV formats (e.g. ",", ";", "|")
    :param serde: Serializer/Deserializer (e.g. "OpenCSVSerDe", "LazySimpleSerDe")
    :param database: Glue database name
    :param table: Glue table name. If not passed, extracted from the path
    :param load_partitions: Load partitions after the table creation
    :param replace_if_exists: Drop the table and recreate it if it already exists
    :param description: Table description
    :param parameters: Key/value pairs to tag the table (Optional[Dict[str, str]])
    :param columns_comments: Column names and the related comments (Optional[Dict[str, str]])
    :return: None
    """
    file_format = file_format.lower()
    if file_format not in ["parquet", "csv"]:
        raise UnsupportedFileFormat(file_format)
    table = table if table else self._session.glue.parse_table_name(path)
    table = table.lower().replace(".", "_")
    logger.debug(f"table: {table}")
    full_schema = dataframe.dtypes
    if partition_by is None:
        partition_by = []
    schema = [x for x in full_schema if x[0] not in partition_by]
    partitions_schema_tmp = {x[0]: x[1] for x in full_schema if x[0] in partition_by}
    partitions_schema = [(x, partitions_schema_tmp[x]) for x in partition_by]
    logger.debug(f"schema: {schema}")
    logger.debug(f"partitions_schema: {partitions_schema}")
    if replace_if_exists is not None:
        self._session.glue.delete_table_if_exists(database=database, table=table)
    extra_args = {}
    if file_format == "csv":
        extra_args["sep"] = sep
        if serde is None:
            serde = "OpenCSVSerDe"
        extra_args["serde"] = serde
    self._session.glue.create_table(database=database,
                                    table=table,
                                    schema=schema,
                                    partition_cols_schema=partitions_schema,
                                    path=path,
                                    file_format=file_format,
                                    compression=compression,
                                    extra_args=extra_args,
                                    description=description,
                                    parameters=parameters,
                                    columns_comments=columns_comments)
    if load_partitions:
        self._session.athena.repair_table(database=database, table=table)
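# Usage sketch (hedged): registering the schema of an existing S3 dataset from a PySpark
# DataFrame via create_glue_table above. The `awswrangler.Session(spark_session=...)` wiring,
# the `session.spark` attribute, and all names below are assumptions for illustration only.
from pyspark.sql import SparkSession
import awswrangler

spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame([(1, "us"), (2, "eu")], ["id", "region"])
session = awswrangler.Session(spark_session=spark)   # assumed constructor argument
session.spark.create_glue_table(
    database="my_glue_db",                # hypothetical Glue database
    path="s3://my-bucket/my-dataset/",    # hypothetical S3 prefix already holding the data
    dataframe=spark_df,
    file_format="csv",
    compression=None,
    sep=",",
    partition_by=["region"],
    load_partitions=True,                 # triggers athena.repair_table() as in the code above
)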
def to_s3(self,
          dataframe,
          path,
          file_format,
          database=None,
          table=None,
          partition_cols=None,
          preserve_index=True,
          mode="append",
          compression=None,
          procs_cpu_bound=None,
          procs_io_bound=None,
          cast_columns=None,
          extra_args=None,
          inplace=True):
    """
    Write a Pandas Dataframe on S3
    Optionally writes metadata on AWS Glue.

    :param dataframe: Pandas Dataframe
    :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
    :param file_format: "csv" or "parquet"
    :param database: AWS Glue Database name
    :param table: AWS Glue table name
    :param partition_cols: List of column names that will be partitions on S3
    :param preserve_index: Should preserve the index on S3?
    :param mode: "append", "overwrite", "overwrite_partitions"
    :param compression: None, gzip, snappy, etc
    :param procs_cpu_bound: Number of cores used for CPU bound tasks
    :param procs_io_bound: Number of cores used for I/O bound tasks
    :param cast_columns: Dictionary of column names and Athena/Glue types to be cast. (E.g. {"col name": "bigint", "col2 name": "int"}) (Only for "parquet" file_format)
    :param extra_args: Extra arguments specific to each file format (E.g. "sep" for CSV)
    :param inplace: True is cheapest (CPU and Memory) but False leaves your DataFrame intact
    :return: List of objects written on S3
    """
    if not partition_cols:
        partition_cols = []
    if not cast_columns:
        cast_columns = {}
    dataframe = Pandas.normalize_columns_names_athena(dataframe, inplace=inplace)
    cast_columns = {Athena.normalize_column_name(k): v for k, v in cast_columns.items()}
    logger.debug(f"cast_columns: {cast_columns}")
    partition_cols = [Athena.normalize_column_name(x) for x in partition_cols]
    logger.debug(f"partition_cols: {partition_cols}")
    dataframe = Pandas.drop_duplicated_columns(dataframe=dataframe, inplace=inplace)
    if compression is not None:
        compression = compression.lower()
    file_format = file_format.lower()
    if file_format == "csv":
        if compression not in Pandas.VALID_CSV_COMPRESSIONS:
            raise InvalidCompression(f"{compression} isn't a valid CSV compression. Try: {Pandas.VALID_CSV_COMPRESSIONS}")
    elif file_format == "parquet":
        if compression not in Pandas.VALID_PARQUET_COMPRESSIONS:
            raise InvalidCompression(f"{compression} isn't a valid PARQUET compression. Try: {Pandas.VALID_PARQUET_COMPRESSIONS}")
    else:
        raise UnsupportedFileFormat(file_format)
    if dataframe.empty:
        raise EmptyDataframe()
    if (mode == "overwrite") or ((mode == "overwrite_partitions") and (not partition_cols)):
        self._session.s3.delete_objects(path=path)
    elif mode not in ["overwrite_partitions", "append"]:
        raise UnsupportedWriteMode(mode)
    objects_paths = self.data_to_s3(dataframe=dataframe,
                                    path=path,
                                    partition_cols=partition_cols,
                                    preserve_index=preserve_index,
                                    file_format=file_format,
                                    mode=mode,
                                    compression=compression,
                                    procs_cpu_bound=procs_cpu_bound,
                                    procs_io_bound=procs_io_bound,
                                    cast_columns=cast_columns,
                                    extra_args=extra_args)
    if database:
        self._session.glue.metadata_to_glue(dataframe=dataframe,
                                            path=path,
                                            objects_paths=objects_paths,
                                            database=database,
                                            table=table,
                                            partition_cols=partition_cols,
                                            preserve_index=preserve_index,
                                            file_format=file_format,
                                            mode=mode,
                                            compression=compression,
                                            cast_columns=cast_columns,
                                            extra_args=extra_args)
    return objects_paths
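# Usage sketch (hedged): writing a partitioned Parquet dataset with the to_s3 method defined
# just above and letting it register metadata in Glue. The `awswrangler.Session()` constructor,
# the `session.pandas` attribute, and every name below (bucket, database, table) are
# illustrative assumptions.
import pandas as pd
import awswrangler

df = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3], "region": ["us", "us", "eu"]})
session = awswrangler.Session()
written_paths = session.pandas.to_s3(
    dataframe=df,
    path="s3://my-bucket/my-dataset/",   # hypothetical S3 prefix
    file_format="parquet",
    database="my_glue_db",               # hypothetical Glue database (enables metadata_to_glue)
    table="my_table",
    partition_cols=["region"],
    mode="overwrite_partitions",
    compression="snappy",
    cast_columns={"id": "bigint"},       # Athena/Glue type overrides, keyed by column name
)
print(written_paths)                     # list of S3 object paths written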
def to_s3(self,
          dataframe,
          path,
          file_format,
          database=None,
          table=None,
          partition_cols=None,
          preserve_index=True,
          mode="append",
          compression=None,
          procs_cpu_bound=None,
          procs_io_bound=None,
          cast_columns=None,
          extra_args=None):
    """
    Write a Pandas Dataframe on S3
    Optionally writes metadata on AWS Glue.

    :param dataframe: Pandas Dataframe
    :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/)
    :param file_format: "csv" or "parquet"
    :param database: AWS Glue Database name
    :param table: AWS Glue table name
    :param partition_cols: List of column names that will be partitions on S3
    :param preserve_index: Should preserve the index on S3?
    :param mode: "append", "overwrite", "overwrite_partitions"
    :param compression: None, gzip, snappy, etc
    :param procs_cpu_bound: Number of cores used for CPU bound tasks
    :param procs_io_bound: Number of cores used for I/O bound tasks
    :param cast_columns: Dictionary of column indexes and Arrow types to be cast. (E.g. {2: "int64", 5: "int32"}) (Only for "parquet" file_format)
    :param extra_args: Extra arguments specific to each file format (E.g. "sep" for CSV)
    :return: List of objects written on S3
    """
    if compression is not None:
        compression = compression.lower()
    file_format = file_format.lower()
    if file_format not in ["parquet", "csv"]:
        raise UnsupportedFileFormat(file_format)
    if file_format == "csv":
        if compression not in Pandas.VALID_CSV_COMPRESSIONS:
            raise InvalidCompression(f"{compression} isn't a valid CSV compression. Try: {Pandas.VALID_CSV_COMPRESSIONS}")
    elif file_format == "parquet":
        if compression not in Pandas.VALID_PARQUET_COMPRESSIONS:
            raise InvalidCompression(f"{compression} isn't a valid PARQUET compression. Try: {Pandas.VALID_PARQUET_COMPRESSIONS}")
    if dataframe.empty:
        raise EmptyDataframe()
    if not partition_cols:
        partition_cols = []
    if mode == "overwrite" or (mode == "overwrite_partitions" and not partition_cols):
        self._session.s3.delete_objects(path=path)
    elif mode not in ["overwrite_partitions", "append"]:
        raise UnsupportedWriteMode(mode)
    objects_paths = self.data_to_s3(dataframe=dataframe,
                                    path=path,
                                    partition_cols=partition_cols,
                                    preserve_index=preserve_index,
                                    file_format=file_format,
                                    mode=mode,
                                    compression=compression,
                                    procs_cpu_bound=procs_cpu_bound,
                                    procs_io_bound=procs_io_bound,
                                    cast_columns=cast_columns,
                                    extra_args=extra_args)
    if database:
        self._session.glue.metadata_to_glue(dataframe=dataframe,
                                            path=path,
                                            objects_paths=objects_paths,
                                            database=database,
                                            table=table,
                                            partition_cols=partition_cols,
                                            preserve_index=preserve_index,
                                            file_format=file_format,
                                            mode=mode,
                                            compression=compression,
                                            cast_columns=cast_columns,
                                            extra_args=extra_args)
    return objects_paths
def data_to_s3(
    self,
    dataframe,
    path,
    file_format,
    partition_cols=None,
    preserve_index=True,
    mode="append",
    procs_cpu_bound=None,
    procs_io_bound=None,
):
    if not procs_cpu_bound:
        procs_cpu_bound = self._session.procs_cpu_bound
    if not procs_io_bound:
        procs_io_bound = self._session.procs_io_bound
    logger.debug(f"procs_cpu_bound: {procs_cpu_bound}")
    logger.debug(f"procs_io_bound: {procs_io_bound}")
    if path[-1] == "/":
        path = path[:-1]
    file_format = file_format.lower()
    if file_format not in ["parquet", "csv"]:
        raise UnsupportedFileFormat(file_format)
    objects_paths = []
    if procs_cpu_bound > 1:
        bounders = _get_bounders(dataframe=dataframe, num_partitions=procs_cpu_bound)
        procs = []
        receive_pipes = []
        for bounder in bounders:
            receive_pipe, send_pipe = mp.Pipe()
            proc = mp.Process(
                target=self._data_to_s3_dataset_writer_remote,
                args=(
                    send_pipe,
                    dataframe.iloc[bounder[0]:bounder[1], :],
                    path,
                    partition_cols,
                    preserve_index,
                    self._session.primitives,
                    file_format,
                ),
            )
            proc.daemon = False
            proc.start()
            procs.append(proc)
            receive_pipes.append(receive_pipe)
        for i in range(len(procs)):
            objects_paths += receive_pipes[i].recv()
            procs[i].join()
            receive_pipes[i].close()
    else:
        objects_paths += self._data_to_s3_dataset_writer(
            dataframe=dataframe,
            path=path,
            partition_cols=partition_cols,
            preserve_index=preserve_index,
            session_primitives=self._session.primitives,
            file_format=file_format,
        )
    if mode == "overwrite_partitions" and partition_cols:
        if procs_io_bound > procs_cpu_bound:
            num_procs = floor(float(procs_io_bound) / float(procs_cpu_bound))
        else:
            num_procs = 1
        logger.debug(f"num_procs for delete_not_listed_objects: {num_procs}")
        self._session.s3.delete_not_listed_objects(objects_paths=objects_paths, procs_io_bound=num_procs)
    return objects_paths
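# Hedged sketch of the row-range splitting that `_get_bounders` (referenced above but not
# shown here) appears to perform: divide the DataFrame index into `num_partitions`
# contiguous [start, end) slices, one per worker process. This is an illustrative
# equivalent, not the library's implementation.
from math import ceil

def _get_bounders_sketch(dataframe, num_partitions):
    size = len(dataframe.index)
    step = max(int(ceil(size / float(num_partitions))), 1)
    return [(i, min(i + step, size)) for i in range(0, size, step)]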