예제 #1
0
    def split_fields(self,
                     paths,
                     name1,
                     name2,
                     transformation_ctx="",
                     info="",
                     stageThreshold=0,
                     totalThreshold=0):
        """
        Split the fields at the given paths off into their own DynamicFrame.

        :param paths: List of strings, each the full path to a node to split into a new DynamicFrame.
          A single string is also accepted and is treated as a one-element list.
        :param name1: name for the DynamicFrame that is split off
        :param name2: name for the DynamicFrame that keeps what remains of the original
        :param transformation_ctx: String, forwarded unchanged to the underlying JVM splitFields call.
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in this transformation at which processing
          needs to error out.
        :param totalThreshold: Long, total number of errors up to and including this transformation
          at which processing needs to error out.
        :return: DynamicFrameCollection holding two DynamicFrames keyed by name1 and name2: the first
          contains the nodes that were split off, the second the nodes remaining on the original.
        """
        # Normalise a lone path string into a list so the JVM always receives a sequence.
        path_list = [paths] if isinstance(paths, basestring) else paths

        java_paths = self.glue_ctx._jvm.PythonUtils.toSeq(path_list)
        site = _call_site(self._sc, callsite(), info)
        java_result = self._jdf.splitFields(java_paths,
                                            transformation_ctx,
                                            site,
                                            long(stageThreshold),
                                            long(totalThreshold))
        split_pair = _as_java_list(self._sc, java_result)

        # Insert name1 before name2 so that, if both names are equal, the second frame wins.
        frames = {}
        frames[name1] = DynamicFrame(split_pair[0], self.glue_ctx, name1)
        frames[name2] = DynamicFrame(split_pair[1], self.glue_ctx, name2)
        return DynamicFrameCollection(frames, self.glue_ctx)
예제 #2
0
 def relationalize(self,
                   root_table_name,
                   staging_path,
                   options=None,
                   transformation_ctx="",
                   info="",
                   stageThreshold=0,
                   totalThreshold=0):
     """
     Relationalizes a dynamic frame, i.e. produces a list of frames that are
     generated by unnesting nested columns and pivoting array columns. The
     pivoted array column can be joined to the root table using the join key
     generated in the unnest phase.

     :param root_table_name: name for the root table
     :param staging_path: path to store partitions of pivoted tables in csv format. Pivoted tables are read back from
         this path
     :param options: dict of optional parameters for relationalize. ``None`` (the default) is treated as an
         empty dict.
     :param transformation_ctx: context key to retrieve metadata about the current transformation
     :param info: String, any string to be associated with errors in this transformation.
     :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
     :param totalThreshold: Long, total number of errors up to and including this transformation
       for which the processing needs to error out.
     :return: DynamicFrameCollection keyed by the name each resulting frame reports via ``getName()``
     """
     # Use a fresh dict per call: a ``{}`` default would be one shared object
     # across every invocation and is handed to a helper we cannot see here.
     if options is None:
         options = {}

     java_frames = _as_java_list(
         self._sc,
         self._jdf.relationalize(root_table_name, staging_path,
                                 makeOptions(self._sc, options),
                                 transformation_ctx,
                                 _call_site(self._sc, callsite(), info),
                                 long(stageThreshold),
                                 long(totalThreshold)))

     frames = {}
     for java_frame in java_frames:
         # Fetch the name once per frame; it is both the key and the frame's name.
         frame_name = java_frame.getName()
         frames[frame_name] = DynamicFrame(java_frame, self.glue_ctx, frame_name)
     return DynamicFrameCollection(frames, self.glue_ctx)
예제 #3
0
    def split_rows(self, comparison_dict, name1, name2, transformation_ctx = "", info= "", stageThreshold = 0, totalThreshold = 0):
        """
        Split rows into two DynamicFrames according to per-column comparisons.

        :param comparison_dict: a dictionary where the key is the path to a column and the value is another
          dictionary mapping comparators to the value to which the column will be compared.
          e.g. {"age": {">": 10, "<": 20}} splits off rows where age is between 10 and 20 exclusive
          from those that do not meet this criteria. Values that are ``int`` instances are converted
          with ``long()`` before being passed on; all other values are passed through unchanged.
        :param name1: name for the DynamicFrame that is split off
        :param name2: name for the DynamicFrame that keeps what remains of the original
        :param transformation_ctx: String, forwarded unchanged to the underlying JVM splitRows call.
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in this transformation at which processing
          needs to error out.
        :param totalThreshold: Long, total number of errors up to and including this transformation
          at which processing needs to error out.
        :return: DynamicFrameCollection holding two DynamicFrames keyed by name1 and name2: the first
          contains the rows that were split off, the second the rows remaining on the original.
        """
        # Three parallel lists: entry i of each describes one (column, comparator, operand) test.
        column_paths = []
        operands = []
        comparators = []

        for column, conditions in comparison_dict.items():
            # Repeat the column path once per comparator attached to it.
            column_paths += [column] * len(conditions)
            for comparator, operand in conditions.items():
                comparators.append(comparator)
                operands.append(long(operand) if isinstance(operand, int) else operand)

        python_utils = self.glue_ctx._jvm.PythonUtils
        java_result = self._jdf.splitRows(python_utils.toSeq(column_paths),
                                          python_utils.toSeq(operands),
                                          python_utils.toSeq(comparators),
                                          transformation_ctx,
                                          _call_site(self._sc, callsite(), info),
                                          long(stageThreshold),
                                          long(totalThreshold))
        split_pair = _as_java_list(self._sc, java_result)

        # Insert name1 before name2 so that, if both names are equal, the second frame wins.
        frames = {}
        frames[name1] = DynamicFrame(split_pair[0], self.glue_ctx, name1)
        frames[name2] = DynamicFrame(split_pair[1], self.glue_ctx, name2)
        return DynamicFrameCollection(frames, self.glue_ctx)