Example 1
 def transition_s3_path(self,
                        s3_path,
                        transition_to,
                        options={},
                        transformation_ctx=""):
     """
     Transitions files in a given S3 path recursively
     :param s3_path: s3 path of the files to be transitioned in the format s3://<bucket>/<prefix>/
     :param transition_to: S3 storage class to transition to
         https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/StorageClass.html
     :param options: Options to filter files to be transitioned and manifest file generation
         retentionPeriod: Number of hours. Files newer than the retention period will be retained.
             168 hours - (7 days) by default
         excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not transitioned.
             Set() - empty set by default
         manifestFilePath: optional path for manifest file generation. All files that were successfully purged
             or transitioned will be recorded in Success.csv and those that failed in Failed.csv
         accountId: AWS accountId to run the Transition batch job. Mandatory for Transition transform
         roleArn: AWS role to run the Transition batch job. Mandatory for Transition transform
     :param transformation_ctx: transformation context (used in manifest file path)
     :return: void return type
     """
     self._ssql_ctx.transitionS3Path(s3_path, transition_to,
                                     makeOptions(self._sc, options),
                                     transformation_ctx)
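A minimal usage sketch, assuming a Glue job environment; the bucket name, account id, and role ARN below are illustrative placeholders:

from awsglue.context import GlueContext
from pyspark.context import SparkContext

glue_ctx = GlueContext(SparkContext.getOrCreate())

# Transition everything under the prefix that is older than 24 hours.
glue_ctx.transition_s3_path(
    s3_path="s3://example-bucket/archive/",
    transition_to="GLACIER",
    options={
        "retentionPeriod": 24,
        "accountId": "123456789012",  # mandatory for the Transition transform
        "roleArn": "arn:aws:iam::123456789012:role/GlueTransitionRole",
    },
    transformation_ctx="transition_archive",
)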
Example 2
    def getSource(self, connection_type, format = None, transformation_ctx = "", push_down_predicate = "", **options):
        """Creates a DataSource object.

        This can be used to read DynamicFrames from external sources.

        Example:
        >>> data_source = context.getSource("file", paths=["/in/path"])
        >>> data_source.setFormat("json")
        >>> myFrame = data_source.getFrame()
        """
        options["callSite"] = callsite()
        if format and format.lower() in self.Spark_SQL_Formats:
            connection_type = format

        j_source = self._ssql_ctx.getSource(connection_type,
                                            makeOptions(self._sc, options), transformation_ctx, push_down_predicate)

        prefix = None
        if 'paths' in options and options['paths'] is not None:
            paths = options['paths']
            prefix = os.path.commonprefix(paths)
            if prefix is not None:
                prefix = prefix.split(':')[-1]
                prefix = re.sub('[:/.]', '', prefix)

        # in case paths is not in options or no common prefix
        if prefix is None:
            prefix = str(uuid.uuid1())
            prefix = re.sub('[-]', '_', prefix)

        return DataSource(j_source, self, prefix)
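The docstring's example, expanded into a self-contained sketch (the input path is a placeholder):

from awsglue.context import GlueContext
from pyspark.context import SparkContext

glue_ctx = GlueContext(SparkContext.getOrCreate())

# Build a file source, declare its format, then materialize a DynamicFrame.
data_source = glue_ctx.getSource("file", paths=["/in/path"])
data_source.setFormat("json")
my_frame = data_source.getFrame()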
Example 3
    def write_dynamic_frame_from_catalog(self, frame, database = None, table_name = None, redshift_tmp_dir = "",
                                         transformation_ctx = "", additional_options = {}, catalog_id = None, **kwargs):
        """
        Writes a DynamicFrame to a location defined in the catalog's database, table name and an optional catalog id
        :param frame: dynamic frame to be written
        :param database: database in catalog
        :param table_name: table name
        :param redshift_tmp_dir: tmp dir
        :param transformation_ctx: transformation context
        :param additional_options: additional options passed to the sink
        :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
        :return: dynamic frame with potential errors
        """

        if database is not None and "name_space" in kwargs:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        elif database is None and "name_space" not in kwargs:
            raise Exception("Parameter name_space or database is missing.")
        elif "name_space" in kwargs:
            db = kwargs.pop("name_space")
        else:
            db = database

        if table_name is None:
            raise Exception("Parameter table_name is missing.")

        j_sink = self._ssql_ctx.getCatalogSink(db, table_name, redshift_tmp_dir, transformation_ctx,
                                               makeOptions(self._sc, additional_options), catalog_id)
        return DataSink(j_sink, self).write(frame)
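A usage sketch, reusing glue_ctx from the earlier sketches; the database and table names are placeholders, and dyf is assumed to be an existing DynamicFrame:

glue_ctx.write_dynamic_frame_from_catalog(
    frame=dyf,
    database="example_db",
    table_name="events_out",
    transformation_ctx="write_events_out",
)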
Example 4
    def create_dynamic_frame_from_catalog(self, database = None, table_name = None, redshift_tmp_dir = "",
                                          transformation_ctx = "", push_down_predicate="", additional_options = {},
                                          catalog_id = None, **kwargs):
        """
        Creates a DynamicFrame with catalog database, table name and an optional catalog id
        :param database: database in catalog
        :param table_name: table name
        :param redshift_tmp_dir: tmp dir
        :param transformation_ctx: transformation context
        :param push_down_predicate: predicate used to filter partitions without listing and reading all the files
        :param additional_options: additional options passed to the source
        :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
        :return: dynamic frame with potential errors
        """
        if database is not None and "name_space" in kwargs:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        elif database is None and "name_space" not in kwargs:
            raise Exception("Parameter name_space or database is missing.")
        elif "name_space" in kwargs:
            db = kwargs.pop("name_space")
        else:
            db = database

        if table_name is None:
            raise Exception("Parameter table_name is missing.")
        source = DataSource(self._ssql_ctx.getCatalogSource(db, table_name, redshift_tmp_dir, transformation_ctx,
                                                            push_down_predicate,
                                                            makeOptions(self._sc, additional_options), catalog_id),
                            self, table_name)
        return source.getFrame(**kwargs)
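A usage sketch with a push-down predicate, reusing glue_ctx from above; the database, table, and partition column names are placeholders:

dyf = glue_ctx.create_dynamic_frame_from_catalog(
    database="example_db",
    table_name="events",
    push_down_predicate="region == 'us-east-1'",  # prune partitions before reading
    transformation_ctx="read_events",
)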
Example 5
 def relationalize(self,
                   root_table_name,
                   staging_path,
                   options={},
                   transformation_ctx="",
                   info="",
                   stageThreshold=0,
                   totalThreshold=0):
     """
     Relationalizes a dynamic frame, i.e. produces a list of frames that are
     generated by unnesting nested columns and pivoting array columns. The
     pivoted array columns can be joined to the root table using the join key
     generated in the unnest phase
     :param root_table_name: name for the root table
     :param staging_path: path to store partitions of pivoted tables in CSV format. Pivoted tables are read back from
         this path
     :param options: dict of optional parameters for relationalize
     :param transformation_ctx: context key to retrieve metadata about the current transformation
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, number of errors in the given transformation at which the processing should error out.
     :param totalThreshold: Long, total number of errors up to and including this transformation
       at which the processing should error out.
     :return: DynamicFrameCollection
     """
     _rFrames = _as_java_list(
         self._sc,
         self._jdf.relationalize(root_table_name, staging_path,
                                 makeOptions(self._sc,
                                             options), transformation_ctx,
                                 _call_site(self._sc, callsite(), info),
                                 long(stageThreshold),
                                 long(totalThreshold)))
     return DynamicFrameCollection(
         dict((df.getName(), DynamicFrame(df, self.glue_ctx, df.getName()))
              for df in _rFrames), self.glue_ctx)
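A usage sketch; the staging path is a placeholder and dyf is assumed to be a nested DynamicFrame created earlier:

frames = dyf.relationalize("root", "s3://example-bucket/relationalize-tmp/")

# The returned collection maps generated table names to frames; the root
# table can be selected by the name passed above.
for name in frames.keys():
    print(name)
root = frames.select("root")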
Example 6
 def transition_table(self,
                      database,
                      table_name,
                      transition_to,
                      options={},
                      transformation_ctx="",
                      catalog_id=None):
     """
     Transitions the storage class of the files stored on S3 for the given catalog's database and table
     :param database: database name in catalog
     :param table_name: table name in catalog
     :param transition_to: S3 storage class to transition to
         https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/StorageClass.html
     :param options: Options to filter files to be transitioned and manifest file generation
         retentionPeriod: Number of hours. Files newer than the retention period will be retained.
             168 hours - (7 days) by default
         partitionPredicate: Partitions satisfying this predicate will be transitioned.
             Files within the retention period in these partitions will not be transitioned.
             "" - empty by default
         excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not transitioned.
             Set() - empty set by default
         manifestFilePath: optional path for manifest file generation. All files that were successfully purged
             or transitioned will be recorded in Success.csv and those that failed in Failed.csv
         accountId: AWS accountId to run the Transition batch job. Mandatory for Transition transform
         roleArn: AWS role to run the Transition batch job. Mandatory for Transition transform
     :param transformation_ctx: transformation context (used in manifest file path)
     :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
             Set to None by default (None defaults to the catalog id of the calling account in the service)
     :return: void return type
     """
     self._ssql_ctx.transitionTable(database, table_name, transition_to,
                                    makeOptions(self._sc, options),
                                    transformation_ctx, catalog_id)
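A usage sketch, reusing glue_ctx; the database, partition predicate, account id, and role ARN are placeholders (a 'year' partition column is assumed):

glue_ctx.transition_table(
    database="example_db",
    table_name="events",
    transition_to="GLACIER",
    options={
        "retentionPeriod": 168,
        "partitionPredicate": "year < '2020'",
        "accountId": "123456789012",
        "roleArn": "arn:aws:iam::123456789012:role/GlueTransitionRole",
    },
    transformation_ctx="transition_events",
)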
Example 7
 def purge_table(self,
                 database,
                 table_name,
                 options={},
                 transformation_ctx="",
                 catalog_id=None):
     """
     Deletes files from S3 for the given catalog's database and table. If all files in a partition are deleted, that
     partition is deleted from the catalog too
     :param database: database name in catalog
     :param table_name: table name in catalog
     :param options: Options to filter files to be deleted and manifest file generation
         retentionPeriod: Number of hours. Files newer than the retention period will be retained.
             168 hours - (7 days) by default
         partitionPredicate: Partitions satisfying this predicate will be deleted.
             Files within the retention period in these partitions will not be deleted.
             "" - empty by default
         excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not deleted.
             Set() - empty set by default
         manifestFilePath: optional path for manifest file generation. All files that were successfully purged
             or transitioned will be recorded in Success.csv and those that failed in Failed.csv
     :param transformation_ctx: transformation context (used in manifest file path)
     :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
             Set to None by default (None defaults to the catalog id of the calling account in the service)
     :return: void return type
     """
     self._ssql_ctx.purgeTable(database, table_name,
                               makeOptions(self._sc, options),
                               transformation_ctx, catalog_id)
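A usage sketch, reusing glue_ctx; the names and the 'year' partition column are placeholders:

glue_ctx.purge_table(
    database="example_db",
    table_name="events",
    options={
        "retentionPeriod": 72,  # keep anything written in the last 3 days
        "partitionPredicate": "year < '2020'",
        "manifestFilePath": "s3://example-bucket/purge-manifests/",
    },
    transformation_ctx="purge_events",
)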
Example 8
 def mergeDynamicFrame(self,
                       stage_dynamic_frame,
                       primary_keys,
                       transformation_ctx="",
                       options={},
                       info="",
                       stageThreshold=0,
                       totalThreshold=0):
     """
     Merges this DynamicFrame with a staging DynamicFrame based on the provided primary keys to identify records.
     Duplicate records (records with the same primary keys) are not de-duplicated. All records (including duplicates)
     are retained from the source if there is no matching record in the staging frame. If the staging frame has
     matching records, the records from the staging frame overwrite the records in the source.
     :param stage_dynamic_frame: Staging DynamicFrame
     :param primary_keys: List of primary key fields to match records from source and staging dynamic frame
     :param transformation_ctx: context key to retrieve metadata about the current transformation
     :param options: optional options for the transformation
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, number of errors in the given transformation at which the processing should error out.
     :param totalThreshold: Long, total number of errors up to and including this transformation
       at which the processing should error out.
     :return: DynamicFrame
     """
     if isinstance(primary_keys, basestring):
         primary_keys = [primary_keys]
     return DynamicFrame(
         self._jdf.mergeDynamicFrames(
             stage_dynamic_frame._jdf,
             self.glue_ctx._jvm.PythonUtils.toSeq(primary_keys),
             transformation_ctx, makeOptions(self._sc, options),
             _call_site(self._sc, callsite(), info), long(stageThreshold),
             long(totalThreshold)), self.glue_ctx, self.name)
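A usage sketch; source_dyf and staging_dyf are assumed to be existing DynamicFrames sharing an 'id' key:

merged = source_dyf.mergeDynamicFrame(
    stage_dynamic_frame=staging_dyf,
    primary_keys=["id"],  # a single string such as "id" is also accepted
    transformation_ctx="merge_events",
)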
Example 9
 def spigot(self,
            path,
            options={},
            transformation_ctx="",
            info="",
            stageThreshold=0,
             totalThreshold=0):
     """
     Writes sample records from this DynamicFrame to the given path (useful for
     verifying transformations mid-job) and passes the records through unchanged.
     """
     return DynamicFrame(
         self._jdf.spigot(path, makeOptions(self._sc, options),
                          transformation_ctx,
                          _call_site(self._sc, callsite(), info),
                          long(stageThreshold), long(totalThreshold)),
         self.glue_ctx, self.name)
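A usage sketch; the output path is a placeholder, and the "topk" option (write only the first k records) is an assumption based on the documented spigot options:

sampled = dyf.spigot(
    "s3://example-bucket/spigot/",
    options={"topk": 10},
    transformation_ctx="spigot_events",
)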
Example 10
    def write_from_jdbc_conf(self, frame_or_dfc, catalog_connection, connection_options={},
                             redshift_tmp_dir = "", transformation_ctx = "", catalog_id = None):
        """Writes a DynamicFrame or DynamicFrameCollection using connection information
        from the named catalog connection.
        """
        if isinstance(frame_or_dfc, DynamicFrameCollection):
            # list(...) keeps the concatenation valid on Python 3, where
            # dict.items() returns a view rather than a list.
            new_options = dict(list(connection_options.items())
                               + [("useFrameName", True)])
        elif isinstance(frame_or_dfc, DynamicFrame):
            new_options = connection_options
        else:
            raise TypeError("frame_or_dfc must be DynamicFrame or "
                            "DynamicFrameCollection. Got " +
                            str(type(frame_or_dfc)))

        j_sink = self._ssql_ctx.getJDBCSink(catalog_connection, makeOptions(self._sc, new_options), redshift_tmp_dir,
                                            transformation_ctx, catalog_id)
        return DataSink(j_sink, self).write(frame_or_dfc)
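A usage sketch, reusing glue_ctx; the catalog connection name and connection options are placeholders:

glue_ctx.write_from_jdbc_conf(
    frame_or_dfc=dyf,
    catalog_connection="example-jdbc-connection",  # a connection defined in the Data Catalog
    connection_options={"dbtable": "public.events", "database": "example_db"},
    redshift_tmp_dir="s3://example-bucket/redshift-tmp/",
    transformation_ctx="write_jdbc",
)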
Example 11
 def purge_s3_path(self, s3_path, options={}, transformation_ctx=""):
     """
     Deletes files from a given S3 path recursively
     :param s3_path: s3 path of the files to be deleted in the format s3://<bucket>/<prefix>/
     :param options: Options to filter files to be deleted and manifest file generation
         retentionPeriod: Number of hours. Files newer than the retention period will be retained.
             168 hours - (7 days) by default
         excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not deleted.
             Set() - empty set by default
         manifestFilePath: optional path for manifest file generation. All files that were successfully purged
             or transitioned will be recorded in Success.csv and those that failed in Failed.csv
     :param transformation_ctx: transformation context (used in manifest file path)
     :return: void return type
     """
     self._ssql_ctx.purgeS3Path(s3_path, makeOptions(self._sc, options),
                                transformation_ctx)
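A usage sketch, reusing glue_ctx; the prefix is a placeholder. Since files newer than the retention period are retained, a retentionPeriod of 0 hours deletes files regardless of age:

glue_ctx.purge_s3_path(
    "s3://example-bucket/stale-data/",
    options={"retentionPeriod": 0},
    transformation_ctx="purge_stale",
)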
Example 12
    def getSink(self, connection_type, format = None, transformation_ctx = "", **options):
        """Gets a DataSink object.

        This can be used to write DynamicFrames to external targets.
        The Spark SQL formats are checked first to make sure the expected sink is returned.

        Example:
        >>> data_sink = context.getSink("s3")
        >>> data_sink.setFormat("json"),
        >>> data_sink.writeFrame(myFrame)
        """

        if format and format.lower() in self.Spark_SQL_Formats:
            connection_type = format
        j_sink = self._ssql_ctx.getSink(connection_type,
                                        makeOptions(self._sc, options), transformation_ctx)
        return DataSink(j_sink, self)
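The docstring's example, expanded into a sketch; the output path is a placeholder and dyf is assumed to be an existing DynamicFrame:

data_sink = glue_ctx.getSink("s3", path="s3://example-bucket/out/")
data_sink.setFormat("json")
data_sink.writeFrame(dyf)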
Example 13
 def setFormat(self, format, **options):
     """Sets the data format and any format options for this DataSource."""
     options["callSite"] = callsite()
     self._jsource.setFormat(format, makeOptions(self._sql_ctx._sc,
                                                 options))
Example 14
 def setFormat(self, format, **options):
     """Sets the data format and any format options for this DataSink."""
     self._jsink.setFormat(format, makeOptions(self._sql_ctx._sc, options))
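Both setFormat variants accept the format name plus arbitrary format options as keyword arguments; a brief sketch (the "separator" option follows the Glue CSV format options):

# Write pipe-delimited CSV instead of the default comma-separated output.
data_sink.setFormat("csv", separator="|")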
Example 15
 def spigot(self, path, options={}):
     """
     Writes sample records from this DynamicFrame to the given path and passes
     the records through unchanged.
     """
     return DynamicFrame(
         self._jdf.pySpigot(path, makeOptions(self._sc, options)),
         self.glue_ctx, self.name)