Beispiel #1
0
    def unbox(self,
              path,
              format,
              transformation_ctx="",
              info="",
              stageThreshold=0,
              totalThreshold=0,
              **options):
        """
        Unbox a string field.

        The keyword options are serialized to a JSON string and passed, with
        the remaining arguments, to the ``unbox`` method of the wrapped Java
        frame. The result is wrapped in a new DynamicFrame that keeps this
        frame's name.

        :param path: full path to the StringNode you want to unbox
        :param format: format of the boxed string, e.g. "avro" or "json"
          (the usage example below passes "csv")
        :param transformation_ctx: String, passed through unchanged to the Java ``unbox`` call
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors up to and including this transformation
          for which the processing needs to error out.
        :param options: format options, serialized with ``json.dumps``:
            separator: String,
            escaper: String,
            skipFirst: Boolean,
            withSchema: String, schema string should always be produced with StructType.json()
            withHeader: Boolean
        :return: a new DynamicFrame with unboxed DynamicRecords

        Example::

            unbox("a.b.c", "csv", separator="|")
        """
        serialized_options = json.dumps(options)
        # callsite() is invoked directly in this method so the recorded call
        # stack is the same as before.
        site = _call_site(self._sc, callsite(), info)
        stage_limit = long(stageThreshold)
        total_limit = long(totalThreshold)

        unboxed_jdf = self._jdf.unbox(path, format, serialized_options,
                                      transformation_ctx, site,
                                      stage_limit, total_limit)
        return DynamicFrame(unboxed_jdf, self.glue_ctx, self.name)
Beispiel #2
0
    def apply_mapping(self, mappings, case_sensitive = False, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
        """
        Apply field mappings to this frame and return the mapped frame.

        Each mapping tuple is converted to a Scala tuple of the same arity and
        wrapped in a JVM ``MappingSpec`` before being passed to the Java
        frame's ``applyMapping``.

        :param mappings: a single mapping tuple, or an iterable of mapping tuples, each of length 2, 3 or 4
        :param case_sensitive: Boolean, passed through to the Java ``applyMapping`` call
        :param transformation_ctx: String, passed through to the Java ``applyMapping`` call
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors up to and including this transformation
          for which the processing needs to error out.
        :return: a new DynamicFrame carrying this frame's name
        :raises TypeError: if a mapping is not a tuple
        :raises ValueError: if a mapping tuple is not of length 2, 3 or 4
        """
        def _to_java_mapping(mapping_tup):
            if not isinstance(mapping_tup, tuple):
                # str() is required: concatenating the raw value would itself
                # raise an unrelated TypeError for any non-string input.
                raise TypeError("Mapping must be specified as a tuple. Got " +
                                str(mapping_tup))

            arity = len(mapping_tup)
            # Validate before touching the JVM so bad input fails fast with a
            # clear message.
            if arity not in (2, 3, 4):
                raise ValueError("Mapping tuple must be of length 2, 3, or 4. "
                                 "Got tuple of length " + str(arity))

            jvm = self.glue_ctx._jvm
            # scala.Tuple2 / Tuple3 / Tuple4, chosen by the tuple's length.
            scala_tuple_cls = getattr(jvm.scala, "Tuple%d" % arity)
            return jvm.MappingSpec.apply(scala_tuple_cls.apply(*mapping_tup))

        # Accept a single bare tuple as shorthand for a one-element list.
        if isinstance(mappings, tuple):
            mappings = [mappings]

        mappings_list = [_to_java_mapping(m) for m in mappings]

        new_jdf = self._jdf.applyMapping(
            self.glue_ctx._jvm.PythonUtils.toSeq(mappings_list),
            case_sensitive,
            transformation_ctx,
            _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold))

        return DynamicFrame(new_jdf, self.glue_ctx, self.name)
Beispiel #3
0
    def split_fields(self,
                     paths,
                     name1,
                     name2,
                     transformation_ctx="",
                     info="",
                     stageThreshold=0,
                     totalThreshold=0):
        """
        Split the given fields off into a separate DynamicFrame.

        :param paths: a path string or a list of strings, each the full path to a node to split into a new DynamicFrame
        :param name1: name for the dynamic frame that is split off
        :param name2: name for the dynamic frame that keeps the remaining nodes
        :param transformation_ctx: String, passed through to the Java ``splitFields`` call
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors up to and including this transformation
          for which the processing needs to error out.
        :return: DynamicFrameCollection with two DynamicFrames keyed by ``name1`` and ``name2``: the first holds the
          nodes that were split off, the second holds the nodes remaining on the original.
        """
        # A single path may be given as a plain string.
        if isinstance(paths, basestring):
            paths = [paths]

        path_seq = self.glue_ctx._jvm.PythonUtils.toSeq(paths)
        site = _call_site(self._sc, callsite(), info)
        java_result = self._jdf.splitFields(path_seq, transformation_ctx, site,
                                            long(stageThreshold),
                                            long(totalThreshold))
        jdfs = _as_java_list(self._sc, java_result)

        split_off = DynamicFrame(jdfs[0], self.glue_ctx, name1)
        remainder = DynamicFrame(jdfs[1], self.glue_ctx, name2)
        return DynamicFrameCollection({name1: split_off, name2: remainder},
                                      self.glue_ctx)
Beispiel #4
0
 def relationalize(self,
                   root_table_name,
                   staging_path,
                   options=None,
                   transformation_ctx="",
                   info="",
                   stageThreshold=0,
                   totalThreshold=0):
     """
     Relationalizes a dynamic frame, i.e. produces a list of frames that are
     generated by unnesting nested columns and pivoting array columns. The
     pivoted array column can be joined to the root table using the join key
     generated in the unnest phase.

     :param root_table_name: name for the root table
     :param staging_path: path to store partitions of pivoted tables in csv format. Pivoted tables are read back from
         this path
     :param options: dict of optional parameters for relationalize; ``None`` (the default) is treated as an empty dict
     :param transformation_ctx: context key to retrieve metadata about the current transformation
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
     :param totalThreshold: Long, total number of errors up to and including this transformation
       for which the processing needs to error out.
     :return: DynamicFrameCollection keyed by each resulting frame's name
     """
     # A None default avoids sharing one mutable dict object across calls.
     if options is None:
         options = {}

     java_frames = _as_java_list(
         self._sc,
         self._jdf.relationalize(root_table_name, staging_path,
                                 makeOptions(self._sc, options),
                                 transformation_ctx,
                                 _call_site(self._sc, callsite(), info),
                                 long(stageThreshold),
                                 long(totalThreshold)))

     frames = {}
     for jdf in java_frames:
         # Fetch the name from the JVM once and reuse it as both the
         # collection key and the wrapper's name.
         frame_name = jdf.getName()
         frames[frame_name] = DynamicFrame(jdf, self.glue_ctx, frame_name)
     return DynamicFrameCollection(frames, self.glue_ctx)
Beispiel #5
0
 def mergeDynamicFrame(self,
                       stage_dynamic_frame,
                       primary_keys,
                       transformation_ctx="",
                       options=None,
                       info="",
                       stageThreshold=0,
                       totalThreshold=0):
     """
     Merge this DynamicFrame with a staging DynamicFrame based on the provided primary keys to identify records.
     Duplicate records (records with the same primary keys) are not de-duplicated. All records (including duplicates)
     are retained from the source if there is no matching record in the staging frame. If the staging frame has
     matching records, the records from the staging frame overwrite the records in the source.

     :param stage_dynamic_frame: Staging DynamicFrame
     :param primary_keys: a field name or a list of primary key fields used to match records from the source and
       staging dynamic frames
     :param transformation_ctx: context key to retrieve metadata about the current transformation
     :param options: optional dict of options for the transformation; ``None`` (the default) is treated as an empty dict
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
     :param totalThreshold: Long, total number of errors up to and including this transformation
       for which the processing needs to error out.
     :return: DynamicFrame carrying this frame's name
     """
     # A None default avoids sharing one mutable dict object across calls.
     if options is None:
         options = {}

     # A single key may be given as a plain string.
     if isinstance(primary_keys, basestring):
         primary_keys = [primary_keys]

     merged_jdf = self._jdf.mergeDynamicFrames(
         stage_dynamic_frame._jdf,
         self.glue_ctx._jvm.PythonUtils.toSeq(primary_keys),
         transformation_ctx, makeOptions(self._sc, options),
         _call_site(self._sc, callsite(), info), long(stageThreshold),
         long(totalThreshold))
     return DynamicFrame(merged_jdf, self.glue_ctx, self.name)
Beispiel #6
0
    def getSource(self, connection_type, format = None, transformation_ctx = "", push_down_predicate= "", **options):
        """Creates a DataSource object.

        This can be used to read DynamicFrames from external sources.

        The DataSource is named after the common prefix of ``options['paths']``
        with its scheme and any ':', '/' and '.' characters removed. When
        ``paths`` is absent or that prefix is empty, a uuid-based name is used
        instead.

        :param connection_type: connection type passed to the underlying context; replaced by ``format`` when
          ``format`` (lower-cased) is one of ``self.Spark_SQL_Formats``
        :param format: optional format name
        :param transformation_ctx: String, passed through to the underlying ``getSource`` call
        :param push_down_predicate: String, passed through to the underlying ``getSource`` call
        :param options: extra source options; a ``callSite`` entry is added before they are converted
        :return: DataSource wrapping the Java source

        Example:
        >>> data_source = context.getSource("file", paths=["/in/path"])
        >>> data_source.setFormat("json")
        >>> myFrame = data_source.getFrame()
        """
        options["callSite"] = callsite()
        if format and format.lower() in self.Spark_SQL_Formats:
            connection_type = format

        j_source = self._ssql_ctx.getSource(connection_type,
                                            makeOptions(self._sc, options), transformation_ctx, push_down_predicate)

        prefix = None
        paths = options.get('paths')
        if paths is not None:
            # Drop the scheme (everything up to the last ':') and strip path
            # punctuation so the prefix can serve as a name.
            prefix = os.path.commonprefix(paths).split(':')[-1]
            prefix = re.sub('[:/.]', '', prefix)

        # commonprefix() returns '' rather than None when nothing is shared,
        # so test for emptiness to cover both "no paths" and "no common prefix".
        if not prefix:
            prefix = re.sub('[-]', '_', str(uuid.uuid1()))

        return DataSource(j_source, self, prefix)
Beispiel #7
0
 def repartition(self,
                 num_partitions,
                 transformation_ctx="",
                 info="",
                 stageThreshold=0,
                 totalThreshold=0):
     """
     Repartition this frame via the Java frame's ``repartition``.

     :param num_partitions: partition count passed to the Java ``repartition`` call
     :param transformation_ctx: String, passed through to the Java call
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, converted with ``long`` and passed to the Java call
     :param totalThreshold: Long, converted with ``long`` and passed to the Java call
     :return: DynamicFrame wrapping the result and carrying this frame's name
     """
     site = _call_site(self._sc, callsite(), info)
     return DynamicFrame(
         self._jdf.repartition(num_partitions, transformation_ctx, site,
                               long(stageThreshold), long(totalThreshold)),
         self.glue_ctx, self.name)
Beispiel #8
0
 def spigot(self,
            path,
            options=None,
            transformation_ctx="",
            info="",
            stageThreshold=0,
            totalThreshold=0):
     """
     Run the Java frame's ``spigot`` with the given path and options.

     NOTE(review): presumably this writes sample records to ``path`` while
     passing the frame through; confirm against the Glue documentation.

     :param path: path passed to the Java ``spigot`` call
     :param options: optional dict of options; ``None`` (the default) is treated as an empty dict
     :param transformation_ctx: String, passed through to the Java call
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, converted with ``long`` and passed to the Java call
     :param totalThreshold: Long, converted with ``long`` and passed to the Java call
     :return: DynamicFrame wrapping the result and carrying this frame's name
     """
     # A None default avoids sharing one mutable dict object across calls.
     if options is None:
         options = {}

     spigot_jdf = self._jdf.spigot(path, makeOptions(self._sc, options),
                                   transformation_ctx,
                                   _call_site(self._sc, callsite(), info),
                                   long(stageThreshold),
                                   long(totalThreshold))
     return DynamicFrame(spigot_jdf, self.glue_ctx, self.name)
Beispiel #9
0
    def resolveChoice(self,
                      specs=None,
                      choice="",
                      database=None,
                      table_name=None,
                      transformation_ctx="",
                      info="",
                      stageThreshold=0,
                      totalThreshold=0,
                      catalog_id=None):
        """
        Resolve choice types in this frame.

        Exactly one of ``specs`` and ``choice`` must be supplied.

        :param specs: a ``(path, action)`` tuple or a list of such tuples giving the resolve action per choice path
        :param choice: resolve option applied when no ``specs`` are given; per the original documentation the
                       backend then makes a pass over the data to obtain the schema and resolves based on it
        :param database: Glue catalog database name, required for MATCH_CATALOG choice
        :param table_name: Glue catalog table name, required for MATCH_CATALOG choice
        :param transformation_ctx: String, passed through to the Java ``resolveChoice`` call
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, converted with ``long`` and passed to the Java call
        :param totalThreshold: Long, converted with ``long`` and passed to the Java call
        :param catalog_id: optional value passed to the Java call as a Scala Option
                           (presumably the Glue Data Catalog id; confirm against the Glue documentation)
        :return: a new DynamicFrame carrying this frame's name
        :raises ValueError: if ``specs`` and ``choice`` are both missing or both specified
        """
        def _to_java_specs(specs_tup):
            path, action = specs_tup
            return self.glue_ctx._jvm.ResolveSpec.apply(path, action)

        # ValueError is a subclass of Exception, so existing
        # ``except Exception`` handlers keep working.
        if specs is None and not choice:
            raise ValueError(
                "Parameter specs and choice are both missing, add one.")

        if specs is not None and choice:
            raise ValueError(
                "Parameter specs and choice are both specified, choose one.")

        if specs is None:
            specs = []

        # Accept a single bare tuple as shorthand for a one-element list.
        if isinstance(specs, tuple):
            specs = [specs]

        specs_list = [_to_java_specs(m) for m in specs]

        choice_option = _as_scala_option(
            self._sc, _as_resolve_choiceOption(self._sc, choice))
        database_option = _as_scala_option(self._sc, database)
        table_name_option = _as_scala_option(self._sc, table_name)

        new_jdf = self._jdf.resolveChoice(
            self.glue_ctx._jvm.PythonUtils.toSeq(specs_list), choice_option,
            database_option, table_name_option, transformation_ctx,
            _call_site(self._sc, callsite(), info), long(stageThreshold),
            long(totalThreshold), _as_scala_option(self._sc, catalog_id))

        return DynamicFrame(new_jdf, self.glue_ctx, self.name)
Beispiel #10
0
 def mapPartitionsWithIndex(self,
                            f,
                            preservesPartitioning=False,
                            transformation_ctx="",
                            info="",
                            stageThreshold=0,
                            totalThreshold=0):
     """
     Build a new DynamicFrame from this frame's RDD pipelined through ``f``.

     :param f: function handed to ``PipelinedRDD`` together with this frame's ``_rdd``
     :param preservesPartitioning: Boolean, handed to ``PipelinedRDD``
     :param transformation_ctx: String, passed through to the JVM ``fromPythonRDD`` call
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, converted with ``long`` and passed to the JVM call
     :param totalThreshold: Long, converted with ``long`` and passed to the JVM call
     :return: DynamicFrame wrapping the result and carrying this frame's name
     """
     piped_jrdd = PipelinedRDD(self._rdd, f, preservesPartitioning)._jrdd
     site = _call_site(self._sc, callsite(), info)

     new_jdf = self.glue_ctx._jvm.DynamicFrame.fromPythonRDD(
         self._jdf, piped_jrdd, self.glue_ctx._ssql_ctx,
         transformation_ctx, self.name, site,
         long(stageThreshold), long(totalThreshold))
     return DynamicFrame(new_jdf, self.glue_ctx, self.name)
Beispiel #11
0
    def select_fields(self, paths, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
        """
        Select the given fields into a new DynamicFrame.

        :param paths: a path string or a list of strings, each the full path to a node you want to get
        :param transformation_ctx: String, passed through to the Java ``selectFields`` call
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors up to and including this transformation
          for which the processing needs to error out.
        :return: DynamicFrame carrying this frame's name
        """
        # A single path may be given as a plain string.
        if isinstance(paths, basestring):
            paths = [paths]

        path_seq = self.glue_ctx._jvm.PythonUtils.toSeq(paths)
        site = _call_site(self._sc, callsite(), info)
        selected_jdf = self._jdf.selectFields(path_seq, transformation_ctx, site,
                                              long(stageThreshold),
                                              long(totalThreshold))
        return DynamicFrame(selected_jdf, self.glue_ctx, self.name)
Beispiel #12
0
    def split_rows(self,
                   comparison_dict,
                   name1,
                   name2,
                   transformation_ctx="",
                   info="",
                   stageThreshold=0,
                   totalThreshold=0):
        """
        Split rows into two DynamicFrames according to column comparisons.

        :param comparison_dict: a dictionary where the key is the path to a column and the value is another
          dictionary mapping comparators to the value the column is compared against,
          e.g. {"age": {">": 10, "<": 20}} splits off the rows where age is between 10 and 20 (exclusive)
          from those that do not meet this criterion.
        :param name1: name for the dynamic frame that is split off
        :param name2: name for the dynamic frame that keeps the remaining rows
        :param transformation_ctx: String, passed through to the Java ``splitRows`` call
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors up to and including this transformation
          for which the processing needs to error out.
        :return: DynamicFrameCollection with two DynamicFrames keyed by ``name1`` and ``name2``: the first holds
          what was split off, the second holds what remains on the original.
        """
        # Flatten the nested dict into three parallel lists: one
        # (path, operator, value) triple per comparison.
        paths, values, operators = [], [], []
        for column_path, comparisons in comparison_dict.items():
            for comparator, operand in comparisons.items():
                paths.append(column_path)
                operators.append(comparator)
                # Integer operands are converted with long(); everything
                # else is passed as-is.
                values.append(long(operand) if isinstance(operand, int) else operand)

        to_seq = self.glue_ctx._jvm.PythonUtils.toSeq
        java_result = self._jdf.splitRows(
            to_seq(paths), to_seq(values), to_seq(operators),
            transformation_ctx, _call_site(self._sc, callsite(), info),
            long(stageThreshold), long(totalThreshold))
        jdfs = _as_java_list(self._sc, java_result)

        split_off = DynamicFrame(jdfs[0], self.glue_ctx, name1)
        remainder = DynamicFrame(jdfs[1], self.glue_ctx, name2)
        return DynamicFrameCollection({name1: split_off, name2: remainder},
                                      self.glue_ctx)
Beispiel #13
0
    def join(self,
             paths1,
             paths2,
             frame2,
             transformation_ctx="",
             info="",
             stageThreshold=0,
             totalThreshold=0):
        """
        Join this frame with ``frame2`` via the Java frame's ``pyJoin``.

        :param paths1: a path string or list of path strings (presumably the join keys of this frame)
        :param paths2: a path string or list of path strings (presumably the join keys of ``frame2``)
        :param frame2: the other DynamicFrame; its ``_jdf`` and ``name`` are used
        :param transformation_ctx: String, passed through to the Java call
        :param info: String, passed to the Java call alongside the call site
        :param stageThreshold: Long, converted with ``long`` and passed to the Java call
        :param totalThreshold: Long, converted with ``long`` and passed to the Java call
        :return: DynamicFrame named with the concatenation of both frames' names
        """
        # Single paths may be given as plain strings.
        if isinstance(paths1, basestring):
            paths1 = [paths1]
        if isinstance(paths2, basestring):
            paths2 = [paths2]

        to_seq = self.glue_ctx._jvm.PythonUtils.toSeq
        joined_jdf = self._jdf.pyJoin(to_seq(paths1), to_seq(paths2),
                                      frame2._jdf, transformation_ctx,
                                      callsite(), info,
                                      long(stageThreshold),
                                      long(totalThreshold))
        joined_name = self.name + frame2.name
        return DynamicFrame(joined_jdf, self.glue_ctx, joined_name)
Beispiel #14
0
    def unnest(self,
               transformation_ctx="",
               info="",
               stageThreshold=0,
               totalThreshold=0):
        """
        Unnest a dynamic frame, i.e. flatten nested objects to top-level elements.
        Join keys are also generated for array objects.

        :param transformation_ctx: String, passed through to the Java ``pyUnnest`` call
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors up to and including this transformation
          for which the processing needs to error out.
        :return: a new unnested dynamic frame

        Example::

            unnest()
        """
        stage_limit = long(stageThreshold)
        total_limit = long(totalThreshold)
        unnested_jdf = self._jdf.pyUnnest(transformation_ctx, callsite(), info,
                                          stage_limit, total_limit)
        return DynamicFrame(unnested_jdf, self.glue_ctx, self.name)
Beispiel #15
0
 def rename_field(self,
                  oldName,
                  newName,
                  transformation_ctx="",
                  info="",
                  stageThreshold=0,
                  totalThreshold=0):
     """
     Rename a field of this frame.

     :param oldName: String, full path to the node you want to rename
     :param newName: String, new name including full path
     :param transformation_ctx: String, passed through to the Java ``renameField`` call
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
     :param totalThreshold: Long, total number of errors up to and including this transformation
       for which the processing needs to error out.
     :return: DynamicFrame carrying this frame's name
     """
     site = _call_site(self._sc, callsite(), info)
     renamed_jdf = self._jdf.renameField(oldName, newName, transformation_ctx,
                                         site, long(stageThreshold),
                                         long(totalThreshold))
     return DynamicFrame(renamed_jdf, self.glue_ctx, self.name)
Beispiel #16
0
    def apply_mapping(self,
                      mappings,
                      case_sensitive=False,
                      transformation_ctx="",
                      info="",
                      stageThreshold=0,
                      totalThreshold=0):
        """
        Apply field mappings via the Java frame's ``pyApplyMapping``.

        :param mappings: a ``(source_path, source_type, target_path, target_type)`` tuple or a list of such
          tuples; each entry must unpack into exactly four values
        :param case_sensitive: Boolean, passed through to the Java call
        :param transformation_ctx: String, passed through to the Java call
        :param info: String, passed to the Java call alongside the call site
        :param stageThreshold: Long, converted with ``long`` and passed to the Java call
        :param totalThreshold: Long, converted with ``long`` and passed to the Java call
        :return: DynamicFrame carrying this frame's name
        """
        # Accept a single bare tuple as shorthand for a one-element list.
        if isinstance(mappings, tuple):
            mappings = [mappings]

        # Unpacking into four names enforces the 4-element shape of every entry.
        java_specs = [
            self.glue_ctx._jvm.MappingSpec.apply(src_path, src_type,
                                                 dst_path, dst_type)
            for src_path, src_type, dst_path, dst_type in mappings
        ]

        spec_seq = self.glue_ctx._jvm.PythonUtils.toSeq(java_specs)
        mapped_jdf = self._jdf.pyApplyMapping(
            spec_seq, case_sensitive, transformation_ctx, callsite(), info,
            long(stageThreshold), long(totalThreshold))

        return DynamicFrame(mapped_jdf, self.glue_ctx, self.name)
 def setFormat(self, format, **options):
     """
     Set the format on the wrapped Java source.

     :param format: format name passed to the Java ``setFormat`` call
     :param options: format options; a ``callSite`` entry is added before they are converted with ``makeOptions``
     """
     options["callSite"] = callsite()
     converted_options = makeOptions(self._sql_ctx._sc, options)
     self._jsource.setFormat(format, converted_options)
Beispiel #18
0
 def union(self, other_frame, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
     """Returns a DynamicFrame containing all records in this frame and all records in other_frame.

     :param other_frame: DynamicFrame to union with this one.
     :param transformation_ctx: context key to retrieve metadata about the current transformation
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
     :param totalThreshold: Long, total number of errors up to and including this transformation
       for which the processing needs to error out.
     :return: DynamicFrame named after the Java frame returned by the union
     """
     union_jdf = self._jdf.union(other_frame._jdf, transformation_ctx, _call_site(self._sc, callsite(), info),
                                 long(stageThreshold), long(totalThreshold))
     # Attribute access on a py4j Java object (``.name``) yields a member
     # handle, not the value. Call the getName() accessor so the wrapper
     # receives the actual name string.
     return DynamicFrame(union_jdf, self.glue_ctx, union_jdf.getName())
Beispiel #19
0
 def writeFrame(self, dynamic_frame, info = ""):
     """
     Write ``dynamic_frame`` through the Java sink's ``pyWriteDynamicFrame``.

     :param dynamic_frame: DynamicFrame to write; its ``_jdf``, ``glue_ctx`` and ``name`` are used
     :param info: String, passed to the Java call alongside the call site
     :return: DynamicFrame wrapping the Java call's result, named ``dynamic_frame.name + "_errors"``
     """
     result_jdf = self._jsink.pyWriteDynamicFrame(dynamic_frame._jdf,
                                                  callsite(), info)
     errors_name = dynamic_frame.name + "_errors"
     return DynamicFrame(result_jdf, dynamic_frame.glue_ctx, errors_name)