def transition_s3_path(self, s3_path, transition_to, options={}, transformation_ctx=""):
    """
    Transitions files in a given s3 path recursively.

    :param s3_path: s3 path of the files to be transitioned in the format s3://<bucket>/<prefix>/
    :param transition_to: S3 storage class to transition to
        https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/StorageClass.html
    :param options: Options to filter files to be transitioned and to control manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours (7 days) by default
        excludeStorageClasses: Files with a storage class in the excludeStorageClasses set are not
            transitioned. Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully
            transitioned will be recorded in Success.csv and those that failed in Failed.csv
        accountId: AWS accountId to run the Transition batch job. Mandatory for the Transition transform
        roleArn: AWS role to run the Transition batch job. Mandatory for the Transition transform
    :param transformation_ctx: transformation context (used in manifest file path)
    :return: void return type
    """
    self._ssql_ctx.transitionS3Path(s3_path, transition_to, makeOptions(self._sc, options),
                                    transformation_ctx)
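# Hedged usage sketch (not part of the library source): transitioning every object under a
# prefix to GLACIER. The GlueContext variable name, bucket, account id, and role ARN below are
# placeholders, not values taken from this module.
#
#   glue_ctx.transition_s3_path(
#       s3_path="s3://example-bucket/raw/",
#       transition_to="GLACIER",
#       options={"retentionPeriod": 72,
#                "accountId": "111122223333",
#                "roleArn": "arn:aws:iam::111122223333:role/ExampleTransitionRole"},
#       transformation_ctx="transition_raw_prefix")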
def getSource(self, connection_type, format=None, transformation_ctx="", push_down_predicate="", **options):
    """Creates a DataSource object.

    This can be used to read DynamicFrames from external sources.

    Example:
    >>> data_source = context.getSource("file", paths=["/in/path"])
    >>> data_source.setFormat("json")
    >>> myFrame = data_source.getFrame()
    """
    options["callSite"] = callsite()
    if format and format.lower() in self.Spark_SQL_Formats:
        connection_type = format
    j_source = self._ssql_ctx.getSource(connection_type,
                                        makeOptions(self._sc, options),
                                        transformation_ctx,
                                        push_down_predicate)

    prefix = None
    if 'paths' in options and options['paths'] is not None:
        paths = options['paths']
        prefix = os.path.commonprefix(paths)
        if prefix is not None:
            prefix = prefix.split(':')[-1]
            prefix = re.sub('[:/.]', '', prefix)

    # In case 'paths' is not in options, or there is no common prefix
    if prefix is None:
        prefix = str(uuid.uuid1())
        prefix = re.sub('[-]', '_', prefix)

    return DataSource(j_source, self, prefix)
def write_dynamic_frame_from_catalog(self, frame, database=None, table_name=None, redshift_tmp_dir="",
                                     transformation_ctx="", additional_options={}, catalog_id=None, **kwargs):
    """
    Writes a DynamicFrame to a location defined by the catalog's database and table name,
    with an optional catalog id.

    :param frame: dynamic frame to be written
    :param database: database in catalog
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param additional_options: additional options passed to the underlying data sink
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: dynamic frame with potential errors
    """
    if database is not None and "name_space" in kwargs:
        raise Exception("Parameter name_space and database are both specified, choose one.")
    elif database is None and "name_space" not in kwargs:
        raise Exception("Parameter name_space or database is missing.")
    elif "name_space" in kwargs:
        db = kwargs.pop("name_space")
    else:
        db = database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    j_sink = self._ssql_ctx.getCatalogSink(db, table_name, redshift_tmp_dir, transformation_ctx,
                                           makeOptions(self._sc, additional_options), catalog_id)
    return DataSink(j_sink, self).write(frame)
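# Hedged usage sketch (not part of the library source): writing a DynamicFrame to a catalog
# table. The GlueContext, frame, database, and table names are placeholders.
#
#   glue_ctx.write_dynamic_frame_from_catalog(
#       frame=dyf,
#       database="example_db",
#       table_name="example_table",
#       transformation_ctx="write_example_table")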
def create_dynamic_frame_from_catalog(self, database=None, table_name=None, redshift_tmp_dir="",
                                      transformation_ctx="", push_down_predicate="",
                                      additional_options={}, catalog_id=None, **kwargs):
    """
    Creates a DynamicFrame from the catalog database and table name, with an optional catalog id.

    :param database: database in catalog
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate used to filter partitions without listing and reading
        all the files in the dataset
    :param additional_options: additional options passed to the underlying data source
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: dynamic frame with potential errors
    """
    if database is not None and "name_space" in kwargs:
        raise Exception("Parameter name_space and database are both specified, choose one.")
    elif database is None and "name_space" not in kwargs:
        raise Exception("Parameter name_space or database is missing.")
    elif "name_space" in kwargs:
        db = kwargs.pop("name_space")
    else:
        db = database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    source = DataSource(self._ssql_ctx.getCatalogSource(db, table_name, redshift_tmp_dir,
                                                        transformation_ctx, push_down_predicate,
                                                        makeOptions(self._sc, additional_options),
                                                        catalog_id),
                        self, table_name)
    return source.getFrame(**kwargs)
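# Hedged usage sketch (not part of the library source): reading a catalog table into a
# DynamicFrame with a partition push-down predicate. All names and the predicate are placeholders.
#
#   dyf = glue_ctx.create_dynamic_frame_from_catalog(
#       database="example_db",
#       table_name="example_table",
#       push_down_predicate="year == '2021'",
#       transformation_ctx="read_example_table")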
def relationalize(self, root_table_name, staging_path, options={}, transformation_ctx="",
                  info="", stageThreshold=0, totalThreshold=0):
    """
    Relationalizes a dynamic frame, i.e. produces a list of frames generated by unnesting
    nested columns and pivoting array columns. A pivoted array column can be joined back to
    the root table using the join key generated during the unnest phase.

    :param root_table_name: name for the root table
    :param staging_path: path to store partitions of pivoted tables in csv format. Pivoted
        tables are read back from this path
    :param options: dict of optional parameters for relationalize
    :param transformation_ctx: context key to retrieve metadata about the current transformation
    :param info: String, any string to be associated with errors in this transformation
    :param stageThreshold: Long, number of errors in the given transformation for which the
        processing needs to error out
    :param totalThreshold: Long, total number of errors up to and including this transformation
        for which the processing needs to error out
    :return: DynamicFrameCollection
    """
    _rFrames = _as_java_list(
        self._sc,
        self._jdf.relationalize(root_table_name, staging_path, makeOptions(self._sc, options),
                                transformation_ctx, _call_site(self._sc, callsite(), info),
                                long(stageThreshold), long(totalThreshold)))
    return DynamicFrameCollection(
        dict((df.getName(), DynamicFrame(df, self.glue_ctx, df.getName())) for df in _rFrames),
        self.glue_ctx)
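# Hedged usage sketch (not part of the library source): flattening a nested frame and picking
# the root table out of the returned DynamicFrameCollection. The frame variable and staging
# path are placeholders, and DynamicFrameCollection.select is assumed to be available as
# elsewhere in this library.
#
#   frames = dyf.relationalize("root", "s3://example-bucket/relationalize-staging/")
#   root_frame = frames.select("root")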
def transition_table(self, database, table_name, transition_to, options={}, transformation_ctx="",
                     catalog_id=None):
    """
    Transitions the storage class of the files stored on s3 for the given catalog's database and table.

    :param database: database name in catalog
    :param table_name: table name in catalog
    :param transition_to: S3 storage class to transition to
        https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/StorageClass.html
    :param options: Options to filter files to be transitioned and to control manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours (7 days) by default
        partitionPredicate: Partitions satisfying this predicate will be transitioned. Files within
            the retention period in these partitions will not be transitioned. "" - empty by default
        excludeStorageClasses: Files with a storage class in the excludeStorageClasses set are not
            transitioned. Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully
            transitioned will be recorded in Success.csv and those that failed in Failed.csv
        accountId: AWS accountId to run the Transition batch job. Mandatory for the Transition transform
        roleArn: AWS role to run the Transition batch job. Mandatory for the Transition transform
    :param transformation_ctx: transformation context (used in manifest file path)
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: void return type
    """
    self._ssql_ctx.transitionTable(database, table_name, transition_to,
                                   makeOptions(self._sc, options), transformation_ctx, catalog_id)
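# Hedged usage sketch (not part of the library source): transitioning the files of one catalog
# table, restricted by a partition predicate. All names and values below are placeholders.
#
#   glue_ctx.transition_table(
#       database="example_db",
#       table_name="example_table",
#       transition_to="GLACIER",
#       options={"retentionPeriod": 168,
#                "partitionPredicate": "year == '2020'",
#                "accountId": "111122223333",
#                "roleArn": "arn:aws:iam::111122223333:role/ExampleTransitionRole"},
#       transformation_ctx="transition_example_table")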
def purge_table(self, database, table_name, options={}, transformation_ctx="", catalog_id=None):
    """
    Deletes files from s3 for the given catalog's database and table. If all files in a partition
    are deleted, that partition is also deleted from the catalog.

    :param database: database name in catalog
    :param table_name: table name in catalog
    :param options: Options to filter files to be deleted and to control manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours (7 days) by default
        partitionPredicate: Partitions satisfying this predicate will be deleted. Files within the
            retention period in these partitions will not be deleted. "" - empty by default
        excludeStorageClasses: Files with a storage class in the excludeStorageClasses set are not
            deleted. Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully
            purged will be recorded in Success.csv and those that failed in Failed.csv
    :param transformation_ctx: transformation context (used in manifest file path)
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: void return type
    """
    self._ssql_ctx.purgeTable(database, table_name, makeOptions(self._sc, options),
                              transformation_ctx, catalog_id)
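# Hedged usage sketch (not part of the library source): purging old files from one catalog table
# while keeping archived objects and writing a manifest. All names are placeholders, and the
# excludeStorageClasses value is assumed to be accepted as a list of storage class names.
#
#   glue_ctx.purge_table(
#       database="example_db",
#       table_name="example_table",
#       options={"retentionPeriod": 24,
#                "excludeStorageClasses": ["GLACIER", "DEEP_ARCHIVE"],
#                "manifestFilePath": "s3://example-bucket/manifests/"},
#       transformation_ctx="purge_example_table")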
def mergeDynamicFrame(self, stage_dynamic_frame, primary_keys, transformation_ctx="", options={},
                      info="", stageThreshold=0, totalThreshold=0):
    """
    Merges this DynamicFrame with a staging DynamicFrame, using the provided primary keys to
    identify records. Duplicate records (records with the same primary keys) are not de-duplicated.
    Records from the source (including duplicates) are retained if there is no matching record in
    the staging frame. If the staging frame has matching records, those records overwrite the
    corresponding records in the source.

    :param stage_dynamic_frame: staging DynamicFrame
    :param primary_keys: list of primary key fields used to match records from the source and
        staging dynamic frames
    :param transformation_ctx: context key to retrieve metadata about the current transformation
    :param options: optional options for the transformation
    :param info: String, any string to be associated with errors in this transformation
    :param stageThreshold: Long, number of errors in the given transformation for which the
        processing needs to error out
    :param totalThreshold: Long, total number of errors up to and including this transformation
        for which the processing needs to error out
    :return: DynamicFrame
    """
    if isinstance(primary_keys, basestring):
        primary_keys = [primary_keys]
    return DynamicFrame(
        self._jdf.mergeDynamicFrames(
            stage_dynamic_frame._jdf,
            self.glue_ctx._jvm.PythonUtils.toSeq(primary_keys),
            transformation_ctx,
            makeOptions(self._sc, options),
            _call_site(self._sc, callsite(), info),
            long(stageThreshold),
            long(totalThreshold)),
        self.glue_ctx,
        self.name)
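# Hedged usage sketch (not part of the library source): merging a staging frame into a source
# frame on a single primary key. The frame variables, key name, and context are placeholders.
#
#   merged = source_dyf.mergeDynamicFrame(
#       stage_dynamic_frame=staging_dyf,
#       primary_keys=["id"],
#       transformation_ctx="merge_staging")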
def spigot(self, path, options={}, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    # Writes a sample of records from this DynamicFrame to the given path and returns the frame.
    return DynamicFrame(
        self._jdf.spigot(path,
                         makeOptions(self._sc, options),
                         transformation_ctx,
                         _call_site(self._sc, callsite(), info),
                         long(stageThreshold),
                         long(totalThreshold)),
        self.glue_ctx,
        self.name)
def write_from_jdbc_conf(self, frame_or_dfc, catalog_connection, connection_options={},
                         redshift_tmp_dir="", transformation_ctx="", catalog_id=None):
    if isinstance(frame_or_dfc, DynamicFrameCollection):
        # When writing a collection, each frame's name is used by the sink ("useFrameName").
        new_options = dict(connection_options.items() + [("useFrameName", True)])
    elif isinstance(frame_or_dfc, DynamicFrame):
        new_options = connection_options
    else:
        raise TypeError("frame_or_dfc must be DynamicFrame or "
                        "DynamicFrameCollection. Got " + str(type(frame_or_dfc)))

    j_sink = self._ssql_ctx.getJDBCSink(catalog_connection,
                                        makeOptions(self._sc, new_options),
                                        redshift_tmp_dir,
                                        transformation_ctx,
                                        catalog_id)
    return DataSink(j_sink, self).write(frame_or_dfc)
def purge_s3_path(self, s3_path, options={}, transformation_ctx=""):
    """
    Deletes files from a given s3 path recursively.

    :param s3_path: s3 path of the files to be deleted in the format s3://<bucket>/<prefix>/
    :param options: Options to filter files to be deleted and to control manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours (7 days) by default
        excludeStorageClasses: Files with a storage class in the excludeStorageClasses set are not
            deleted. Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully
            purged will be recorded in Success.csv and those that failed in Failed.csv
    :param transformation_ctx: transformation context (used in manifest file path)
    :return: void return type
    """
    self._ssql_ctx.purgeS3Path(s3_path, makeOptions(self._sc, options), transformation_ctx)
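# Hedged usage sketch (not part of the library source): deleting objects older than one day
# under a prefix. The GlueContext variable and bucket are placeholders.
#
#   glue_ctx.purge_s3_path(
#       s3_path="s3://example-bucket/tmp/",
#       options={"retentionPeriod": 24},
#       transformation_ctx="purge_tmp_prefix")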
def getSink(self, connection_type, format=None, transformation_ctx="", **options):
    """Gets a DataSink object.

    This can be used to write DynamicFrames to external targets.
    The SparkSQL formats are checked first to make sure the expected sink is returned.

    Example:
    >>> data_sink = context.getSink("s3")
    >>> data_sink.setFormat("json")
    >>> data_sink.writeFrame(myFrame)
    """
    if format and format.lower() in self.Spark_SQL_Formats:
        connection_type = format
    j_sink = self._ssql_ctx.getSink(connection_type,
                                    makeOptions(self._sc, options),
                                    transformation_ctx)
    return DataSink(j_sink, self)
def setFormat(self, format, **options):
    # Records the Python call site in the options, then sets the format and format options
    # on the underlying Java data source.
    options["callSite"] = callsite()
    self._jsource.setFormat(format, makeOptions(self._sql_ctx._sc, options))
def setFormat(self, format, **options):
    # Sets the format and format options on the underlying Java data sink.
    self._jsink.setFormat(format, makeOptions(self._sql_ctx._sc, options))
def spigot(self, path, options={}):
    # Variant of spigot without a transformation context; writes a sample of records from this
    # DynamicFrame to the given path and returns the frame.
    return DynamicFrame(
        self._jdf.pySpigot(path, makeOptions(self._sc, options)),
        self.glue_ctx,
        self.name)