Exemple #1
0
    def reduce_group(self, operator, combinable=False):
        """
        Applies a GroupReduce transformation.

        The transformation calls a GroupReduceFunction once for each group of the DataSet, or one when applied on a
        non-grouped DataSet.
        The GroupReduceFunction can iterate over all elements of the DataSet and
        emit any number of output elements including none.

        :param operator: The GroupReduceFunction that is applied on the DataSet.
        :return:A GroupReduceOperator that represents the reduced DataSet.
        """
        self._finalize()
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = GroupReduceFunction()
            operator.reduce = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.GROUPREDUCE
        child.parent = self._info
        child.operator = operator
        child.types = _createArrayTypeInfo()
        child.name = "PythonGroupReduce"
        child.key1 = self._child_chain[0].keys
        self._info.parallelism = child.parallelism
        self._info.children.append(child)
        self._env._sets.append(child)

        return child_set
Exemple #2
0
    def using(self, operator):
        """
        Finalizes a CoGroup transformation.

        Applies a CoGroupFunction to groups of elements with identical keys.
        Each CoGroupFunction call returns an arbitrary number of keys.

        :param operator: The CoGroupFunction that is called for all groups of elements with identical keys.
        :return:An CoGroupOperator that represents the co-grouped result DataSet.
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = CoGroupFunction()
            operator.co_group = f
        new_set = OperatorSet(self._env, self._info)
        self._info.key1 = tuple([x for x in range(len(self._info.key1))])
        self._info.key2 = tuple([x for x in range(len(self._info.key2))])
        operator._keys1 = self._info.key1
        operator._keys2 = self._info.key2
        self._info.parent.parallelism = self._info.parallelism
        self._info.other.parallelism = self._info.parallelism
        self._info.operator = operator
        self._info.types = _createArrayTypeInfo()
        self._info.name = "PythonCoGroup"
        self._env._sets.append(self._info)
        return new_set
Exemple #3
0
    def using(self, operator):
        """
        Finalizes a CoGroup transformation.

        Applies a CoGroupFunction to groups of elements with identical keys.
        Each CoGroupFunction call returns an arbitrary number of keys.

        :param operator: The CoGroupFunction that is called for all groups of elements with identical keys.
        :return:An CoGroupOperator that represents the co-grouped result DataSet.
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = CoGroupFunction()
            operator.co_group = f
        new_set = OperatorSet(self._env, self._info)
        self._info.key1 = tuple([x for x in range(len(self._info.key1))])
        self._info.key2 = tuple([x for x in range(len(self._info.key2))])
        operator._keys1 = self._info.key1
        operator._keys2 = self._info.key2
        self._info.parent.parallelism = self._info.parallelism
        self._info.other.parallelism = self._info.parallelism
        self._info.operator = operator
        self._info.types = _createArrayTypeInfo()
        self._info.name = "PythonCoGroup"
        self._env._sets.append(self._info)
        return new_set
Exemple #4
0
    def map_partition(self, operator):
        """
        Applies a MapPartition transformation on a DataSet.

        The transformation calls a MapPartitionFunction once per parallel partition of the DataSet.
        The entire partition is available through the given Iterator.
        Each MapPartitionFunction may return an arbitrary number of results.

        The number of elements that each instance of the MapPartition function
        sees is non deterministic and depends on the degree of parallelism of the operation.

        :param operator: The MapFunction that is called for each element of the DataSet.
        :return:A MapOperator that represents the transformed DataSet
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = MapPartitionFunction()
            operator.map_partition = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.MAPPARTITION
        child.parent = self._info
        child.operator = operator
        child.types = _createArrayTypeInfo()
        child.name = "PythonMapPartition"
        self._info.children.append(child)
        self._env._sets.append(child)
        return child_set
Exemple #5
0
    def map_partition(self, operator):
        """
        Applies a MapPartition transformation on a DataSet.

        The transformation calls a MapPartitionFunction once per parallel partition of the DataSet.
        The entire partition is available through the given Iterator.
        Each MapPartitionFunction may return an arbitrary number of results.

        The number of elements that each instance of the MapPartition function
        sees is non deterministic and depends on the degree of parallelism of the operation.

        :param operator: The MapFunction that is called for each element of the DataSet.
        :return:A MapOperator that represents the transformed DataSet
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = MapPartitionFunction()
            operator.map_partition = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.MAPPARTITION
        child.parent = self._info
        child.operator = operator
        child.types = _createArrayTypeInfo()
        child.name = "PythonMapPartition"
        self._info.children.append(child)
        self._env._sets.append(child)
        return child_set
Exemple #6
0
    def reduce(self, operator):
        """
        Applies a Reduce transformation on a non-grouped DataSet.

        The transformation consecutively calls a ReduceFunction until only a single element remains which is the result
        of the transformation. A ReduceFunction combines two elements into one new element of the same type.

        :param operator:The ReduceFunction that is applied on the DataSet.
        :return:A ReduceOperator that represents the reduced DataSet.
        """
        self._finalize()
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = ReduceFunction()
            operator.reduce = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.REDUCE
        child.parent = self._info
        child.operator = operator
        child.name = "PythonReduce"
        child.types = _createArrayTypeInfo()
        child.key1 = self._child_chain[0].keys
        self._info.parallelism = child.parallelism
        self._info.children.append(child)
        self._env._sets.append(child)

        return child_set
Exemple #7
0
    def reduce_group(self, operator, combinable=False):
        """
        Applies a GroupReduce transformation.

        The transformation calls a GroupReduceFunction once for each group of the DataSet, or one when applied on a
        non-grouped DataSet.
        The GroupReduceFunction can iterate over all elements of the DataSet and
        emit any number of output elements including none.

        :param operator: The GroupReduceFunction that is applied on the DataSet.
        :return:A GroupReduceOperator that represents the reduced DataSet.
        """
        self._finalize()
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = GroupReduceFunction()
            operator.reduce = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.GROUPREDUCE
        child.parent = self._info
        child.operator = operator
        child.types = _createArrayTypeInfo()
        child.name = "PythonGroupReduce"
        child.key1 = self._child_chain[0].keys
        self._info.children.append(child)
        self._env._sets.append(child)

        return child_set
Exemple #8
0
    def reduce(self, operator):
        """
        Applies a Reduce transformation on a non-grouped DataSet.

        The transformation consecutively calls a ReduceFunction until only a single element remains which is the result
        of the transformation. A ReduceFunction combines two elements into one new element of the same type.

        :param operator:The ReduceFunction that is applied on the DataSet.
        :return:A ReduceOperator that represents the reduced DataSet.
        """
        self._finalize()
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = ReduceFunction()
            operator.reduce = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.REDUCE
        child.parent = self._info
        child.operator = operator
        child.name = "PythonReduce"
        child.types = _createArrayTypeInfo()
        child.key1 = self._child_chain[0].keys
        self._info.parallelism = child.parallelism
        self._info.children.append(child)
        self._env._sets.append(child)

        return child_set
Exemple #9
0
 def _createProjector(env, info):
     child = OperationInfo()
     child_set = Projector(env, child)
     child.identifier = _Identifier.MAP
     child.operator = MapFunction()
     child.parent = info
     child.types = _createArrayTypeInfo()
     child.name = "Projector"
     info.children.append(child)
     env._sets.append(child)
     return child_set
Exemple #10
0
 def _createProjector(env, info):
     child = OperationInfo()
     child_set = Projector(env, child)
     child.identifier = _Identifier.MAP
     child.operator = MapFunction()
     child.parent = info
     child.types = _createArrayTypeInfo()
     child.name = "Projector"
     info.children.append(child)
     env._sets.append(child)
     return child_set
Exemple #11
0
 def _reduce_group(self, operator, combinable=False):
     if isinstance(operator, TYPES.FunctionType):
         f = operator
         operator = GroupReduceFunction()
         operator.reduce = f
     child = OperationInfo()
     child.identifier = _Identifier.GROUPREDUCE
     child.parent = self._info
     child.operator = operator
     child.types = _createArrayTypeInfo()
     child.name = "PythonGroupReduce"
     return child
Exemple #12
0
 def _reduce_group(self, operator, combinable=False):
     if isinstance(operator, TYPES.FunctionType):
         f = operator
         operator = GroupReduceFunction()
         operator.reduce = f
     child = OperationInfo()
     child.identifier = _Identifier.GROUPREDUCE
     child.parent = self._info
     child.operator = operator
     child.types = _createArrayTypeInfo()
     child.name = "PythonGroupReduce"
     return child
Exemple #13
0
 def read_custom(self, path, filter, splits, format):
     """
     Creates a DataSet using a custom input format that is executed directly in the Python process.
     """
     child = OperationInfo()
     child_set = DataSet(self, child)
     child.identifier = _Identifier.SOURCE_CUSTOM
     child.name = "PythonInputFormat"
     child.path = path
     child.filter = filter
     child.computeSplits = splits
     child.operator = copy.deepcopy(format)
     child.types = _createArrayTypeInfo()
     self._sources.append(child)
     return child_set
Exemple #14
0
 def read_custom(self, path, filter, splits, format):
     """
     Creates a DataSet using a custom input format that is executed directly in the Python process.
     """
     child = OperationInfo()
     child_set = DataSet(self, child)
     child.identifier = _Identifier.SOURCE_CUSTOM
     child.name = "PythonInputFormat"
     child.path = path
     child.filter = filter
     child.computeSplits = splits
     child.operator = copy.deepcopy(format)
     child.types = _createArrayTypeInfo()
     self._sources.append(child)
     return child_set
Exemple #15
0
    def using(self, operator):
        """
        Finalizes a Cross transformation.

        Applies a CrossFunction to each pair of joined elements. Each CrossFunction call returns exactly one element.

        :param operator:The CrossFunction that is called for each pair of joined elements.
        :return:An Set that represents the joined result DataSet.
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = CrossFunction()
            operator.cross = f
        self._info.operator = operator
        self._info.types = _createArrayTypeInfo()
        self._info.name = "PythonCross"
        self._info.uses_udf = True
        return OperatorSet(self._env, self._info)
Exemple #16
0
    def using(self, operator):
        """
        Finalizes a Cross transformation.

        Applies a CrossFunction to each pair of joined elements. Each CrossFunction call returns exactly one element.

        :param operator:The CrossFunction that is called for each pair of joined elements.
        :return:An Set that represents the joined result DataSet.
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = CrossFunction()
            operator.cross = f
        self._info.operator = operator
        self._info.types = _createArrayTypeInfo()
        self._info.name = "PythonCross"
        self._info.uses_udf = True
        return OperatorSet(self._env, self._info)
Exemple #17
0
    def filter(self, operator):
        """
        Applies a Filter transformation on a DataSet.

        he transformation calls a FilterFunction for each element of the DataSet and retains only those element
        for which the function returns true. Elements for which the function returns false are filtered.

        :param operator: The FilterFunction that is called for each element of the DataSet.
        :return:A FilterOperator that represents the filtered DataSet.
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = FilterFunction()
            operator.filter = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.FILTER
        child.parent = self._info
        child.operator = operator
        child.name = "PythonFilter"
        child.types = _createArrayTypeInfo()
        self._info.children.append(child)
        self._env._sets.append(child)
        return child_set
Exemple #18
0
    def map(self, operator):
        """
        Applies a Map transformation on a DataSet.

        The transformation calls a MapFunction for each element of the DataSet.
        Each MapFunction call returns exactly one element.

        :param operator: The MapFunction that is called for each element of the DataSet.
        :return:A MapOperator that represents the transformed DataSet
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = MapFunction()
            operator.map = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.MAP
        child.parent = self._info
        child.operator = operator
        child.types = _createArrayTypeInfo()
        child.name = "PythonMap"
        self._info.children.append(child)
        self._env._sets.append(child)
        return child_set
Exemple #19
0
    def flat_map(self, operator):
        """
        Applies a FlatMap transformation on a DataSet.

        The transformation calls a FlatMapFunction for each element of the DataSet.
        Each FlatMapFunction call can return any number of elements including none.

        :param operator: The FlatMapFunction that is called for each element of the DataSet.
        :return:A FlatMapOperator that represents the transformed DataSe
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = FlatMapFunction()
            operator.flat_map = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.FLATMAP
        child.parent = self._info
        child.operator = operator
        child.types = _createArrayTypeInfo()
        child.name = "PythonFlatMap"
        self._info.children.append(child)
        self._env._sets.append(child)
        return child_set
Exemple #20
0
    def filter(self, operator):
        """
        Applies a Filter transformation on a DataSet.

        he transformation calls a FilterFunction for each element of the DataSet and retains only those element
        for which the function returns true. Elements for which the function returns false are filtered.

        :param operator: The FilterFunction that is called for each element of the DataSet.
        :return:A FilterOperator that represents the filtered DataSet.
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = FilterFunction()
            operator.filter = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.FILTER
        child.parent = self._info
        child.operator = operator
        child.name = "PythonFilter"
        child.types = _createArrayTypeInfo()
        self._info.children.append(child)
        self._env._sets.append(child)
        return child_set
Exemple #21
0
    def flat_map(self, operator):
        """
        Applies a FlatMap transformation on a DataSet.

        The transformation calls a FlatMapFunction for each element of the DataSet.
        Each FlatMapFunction call can return any number of elements including none.

        :param operator: The FlatMapFunction that is called for each element of the DataSet.
        :return:A FlatMapOperator that represents the transformed DataSe
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = FlatMapFunction()
            operator.flat_map = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.FLATMAP
        child.parent = self._info
        child.operator = operator
        child.types = _createArrayTypeInfo()
        child.name = "PythonFlatMap"
        self._info.children.append(child)
        self._env._sets.append(child)
        return child_set
Exemple #22
0
    def map(self, operator):
        """
        Applies a Map transformation on a DataSet.

        The transformation calls a MapFunction for each element of the DataSet.
        Each MapFunction call returns exactly one element.

        :param operator: The MapFunction that is called for each element of the DataSet.
        :return:A MapOperator that represents the transformed DataSet
        """
        if isinstance(operator, TYPES.FunctionType):
            f = operator
            operator = MapFunction()
            operator.map = f
        child = OperationInfo()
        child_set = OperatorSet(self._env, child)
        child.identifier = _Identifier.MAP
        child.parent = self._info
        child.operator = operator
        child.types = _createArrayTypeInfo()
        child.name = "PythonMap"
        self._info.children.append(child)
        self._env._sets.append(child)
        return child_set