def reduce(self, operator):
    """
    Applies a Reduce transformation on a non-grouped DataSet.

    The transformation consecutively calls a ReduceFunction until only a
    single element remains, which is the result of the transformation. A
    ReduceFunction combines two elements into one new element of the same
    type.

    :param operator: The ReduceFunction that is applied on the DataSet.
    :return: A ReduceOperator that represents the reduced DataSet.
    """
    # Push the grouping keys (head of the grouping chain) into the
    # operator before it is serialized.
    operator._set_grouping_keys(self._child_chain[0].keys)
    # Register every operation of the grouping chain with the environment.
    for i in self._child_chain:
        self._env._sets.append(i)
    child = OperationInfo()
    child_set = OperatorSet(self._env, child)
    child.identifier = _Identifier.REDUCE
    child.parent = self._info
    # The reduce side gets a deep copy with combining disabled, while the
    # original instance doubles as the combiner (combining enabled); the
    # deepcopy must happen before _combine is flipped on the original.
    child.operator = copy.deepcopy(operator)
    child.operator._combine = False
    child.meta = str(inspect.getmodule(operator)) + "|" + str(operator.__class__.__name__)
    child.combine = True
    child.combineop = operator
    child.combineop._combine = True
    child.name = "PythonReduce"
    child.types = deduct_output_type(self._info)
    self._info.children.append(child)
    self._env._sets.append(child)
    return child_set
def _join(self, other_set, identifier):
    """Begin a Join transformation; returns a JoinOperatorWhere to select keys."""
    info = OperationInfo()
    join_where = JoinOperatorWhere(self._env, info)
    info.identifier = identifier
    info.parent_set = self
    info.other_set = other_set
    return join_where
def _output(self, to_error):
    """Register a print sink for this DataSet; to_error selects stderr output."""
    sink = OperationInfo()
    sink.identifier = _Identifier.SINK_PRINT
    sink.parent = self._info
    sink.to_err = to_error
    self._info.sinks.append(sink)
    self._env._sinks.append(sink)
def reduce_group(self, operator, types, combinable=False):
    """
    Applies a GroupReduce transformation.

    The transformation calls a GroupReduceFunction once for each group of
    the DataSet, or once when applied on a non-grouped DataSet. The
    GroupReduceFunction can iterate over all elements of the DataSet and
    emit any number of output elements, including none.

    :param operator: The GroupReduceFunction that is applied on the DataSet.
    :param types: The type of the resulting DataSet.
    :param combinable: Whether the operator may also run as a combiner.
    :return: A GroupReduceOperator that represents the reduced DataSet.
    """
    # Accept a plain function by wrapping it in a GroupReduceFunction.
    if isinstance(operator, TYPES.FunctionType):
        f = operator
        operator = GroupReduceFunction()
        operator.reduce = f
    child = OperationInfo()
    child_set = OperatorSet(self._env, child)
    child.identifier = _Identifier.GROUPREDUCE
    child.parent = self._info
    # The reduce side gets a deep copy with combining disabled, while the
    # original instance doubles as the combiner (combining enabled); the
    # deepcopy must happen before _combine is flipped on the original.
    child.operator = copy.deepcopy(operator)
    child.operator._combine = False
    child.meta = str(inspect.getmodule(operator)) + "|" + str(operator.__class__.__name__)
    child.types = types
    child.combine = combinable
    child.combineop = operator
    child.combineop._combine = True
    child.name = "PythonGroupReduce"
    self._info.children.append(child)
    self._env._sets.append(child)
    return child_set
def _write_text(self, path, write_mode):
    """Register a text-file sink for this DataSet at the given path."""
    sink = OperationInfo()
    sink.identifier = _Identifier.SINK_TEXT
    sink.parent = self._info
    sink.path = path
    sink.write_mode = write_mode
    self._info.sinks.append(sink)
    self._env._sinks.append(sink)
def with_broadcast_set(self, name, set):
    """
    Registers the given DataSet under the given name as a broadcast
    variable of this operator.

    :param name: The name under which the broadcast set is registered.
    :param set: The DataSet to broadcast.
    :return: This operator, to allow call chaining.
    """
    # NOTE(review): unlike the sibling with_broadcast_set implementation,
    # this variant never sets child.identifier (_Identifier.BROADCAST) but
    # does register the record as a child of the broadcast set — confirm
    # this asymmetry is intentional.
    child = OperationInfo()
    child.parent = self._info
    child.other = set._info
    child.name = name
    self._info.bcvars.append(child)
    set._info.children.append(child)
    self._env._broadcast.append(child)
    return self
def with_broadcast_set(self, name, set):
    """
    Registers the given DataSet under the given name as a broadcast
    variable of this operator.

    :param name: The name under which the broadcast set is registered.
    :param set: The DataSet to broadcast.
    :return: This operator, to allow call chaining.
    """
    bc_info = OperationInfo()
    bc_info.identifier = _Identifier.BROADCAST
    bc_info.parent = self._info
    bc_info.other = set._info
    bc_info.name = name
    self._info.bcvars.append(bc_info)
    self._env._broadcast.append(bc_info)
    return self
def _cross(self, other_set, identifier):
    """Begin a Cross transformation with other_set; returns a CrossOperator."""
    info = OperationInfo()
    cross_op = CrossOperator(self._env, info)
    info.identifier = identifier
    info.parent = self._info
    info.other = other_set._info
    self._info.children.append(info)
    other_set._info.children.append(info)
    self._env._sets.append(info)
    return cross_op
def output(self, to_error=False):
    """
    Writes a DataSet to the standard output stream (stdout).

    :param to_error: When true, output is routed to stderr instead.
    """
    sink = OperationInfo()
    sink.identifier = _Identifier.SINK_PRINT
    sink.parent = self._info
    sink.to_err = to_error
    self._info.sinks.append(sink)
    self._env._sinks.append(sink)
def _output(self, to_error):
    """Register a print sink and return a DataSink handle for it."""
    info = OperationInfo()
    sink = DataSink(self._env, info)
    info.identifier = _Identifier.SINK_PRINT
    info.parent = self._info
    info.to_err = to_error
    self._info.parallelism = info.parallelism
    self._info.sinks.append(info)
    self._env._sinks.append(info)
    return sink
def _distinct(self, fields):
    """Register a Distinct operation on the given key fields; returns the new DataSet."""
    self._info.types = _createKeyValueTypeInfo(len(fields))
    info = OperationInfo()
    result = DataSet(self._env, info)
    info.identifier = _Identifier.DISTINCT
    info.parent = self._info
    info.keys = fields
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def reduce(self, operator):
    """
    Applies a Reduce transformation on a non-grouped DataSet.

    The transformation consecutively calls a ReduceFunction until only a
    single element remains, which is the result of the transformation. A
    ReduceFunction combines two elements into one new element of the same
    type.

    :param operator: The ReduceFunction that is applied on the DataSet.
    :return: A ReduceOperator that represents the reduced DataSet.
    """
    # Push the grouping keys (head of the grouping chain) into the
    # operator before it is serialized.
    operator._set_grouping_keys(self._child_chain[0].keys)
    # Register every operation of the grouping chain with the environment.
    for i in self._child_chain:
        self._env._sets.append(i)
    child = OperationInfo()
    child_set = OperatorSet(self._env, child)
    child.identifier = _Identifier.REDUCE
    child.parent = self._info
    # The reduce side gets a deep copy with combining disabled, while the
    # original instance doubles as the combiner (combining enabled); the
    # deepcopy must happen before _combine is flipped on the original.
    child.operator = copy.deepcopy(operator)
    child.operator._combine = False
    child.meta = str(inspect.getmodule(operator)) + "|" + str(operator.__class__.__name__)
    child.combine = True
    child.combineop = operator
    child.combineop._combine = True
    child.name = "PythonReduce"
    child.types = deduct_output_type(self._info)
    self._info.children.append(child)
    self._env._sets.append(child)
    return child_set
def reduce_group(self, operator, types, combinable=False):
    """
    Applies a GroupReduce transformation.

    The transformation calls a GroupReduceFunction once for each group of
    the DataSet, or once when applied on a non-grouped DataSet. The
    GroupReduceFunction can iterate over all elements of the DataSet and
    emit any number of output elements, including none.

    :param operator: The GroupReduceFunction that is applied on the DataSet.
    :param types: The type of the resulting DataSet.
    :param combinable: Whether the operator may also run as a combiner.
    :return: A GroupReduceOperator that represents the reduced DataSet.
    """
    # Accept a plain function by wrapping it in a GroupReduceFunction.
    if isinstance(operator, TYPES.FunctionType):
        f = operator
        operator = GroupReduceFunction()
        operator.reduce = f
    # Push the grouping keys and the chained sort orders into the operator
    # before serialization; the chain layout is [GROUP, SORT, SORT, ...].
    operator._set_grouping_keys(self._child_chain[0].keys)
    operator._set_sort_ops([(x.field, x.order) for x in self._child_chain[1:]])
    child = OperationInfo()
    child_set = OperatorSet(self._env, child)
    child.identifier = _Identifier.GROUPREDUCE
    child.parent = self._info
    # The reduce side gets a deep copy with combining disabled, while the
    # original instance doubles as the combiner (combining enabled); the
    # deepcopy must happen before _combine is flipped on the original.
    child.operator = copy.deepcopy(operator)
    child.operator._combine = False
    child.meta = str(inspect.getmodule(operator)) + "|" + str(operator.__class__.__name__)
    child.types = types
    child.combine = combinable
    child.combineop = operator
    child.combineop._combine = True
    child.name = "PythonGroupReduce"
    self._info.children.append(child)
    self._env._sets.append(child)
    return child_set
def _group_by(self, keys):
    """Start a grouping chain on the given keys; returns an UnsortedGrouping."""
    info = OperationInfo()
    chain = []
    grouping = UnsortedGrouping(self._env, info, chain)
    info.identifier = _Identifier.GROUP
    info.parent = self._info
    info.keys = keys
    chain.append(info)
    self._info.children.append(info)
    self._env._sets.append(info)
    return grouping
def _write_text(self, path, write_mode):
    """Register a text-file sink and return a DataSink handle for it."""
    info = OperationInfo()
    sink = DataSink(self._env, info)
    info.identifier = _Identifier.SINK_TEXT
    info.parent = self._info
    info.path = path
    info.write_mode = write_mode
    self._info.parallelism = info.parallelism
    self._info.sinks.append(info)
    self._env._sinks.append(info)
    return sink
def reduce_group(self, operator, combinable=False):
    """
    Applies a GroupReduce transformation.

    The transformation calls a GroupReduceFunction once for each group of
    the DataSet, or once when applied on a non-grouped DataSet. The
    GroupReduceFunction can iterate over all elements of the DataSet and
    emit any number of output elements, including none.

    :param operator: The GroupReduceFunction that is applied on the DataSet.
    :param combinable: Accepted for API compatibility.
    :return: A GroupReduceOperator that represents the reduced DataSet.
    """
    self._finalize()
    # A plain function is wrapped in a GroupReduceFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = GroupReduceFunction()
        operator.reduce = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.GROUPREDUCE
    info.parent = self._info
    info.operator = operator
    info.types = _createArrayTypeInfo()
    info.name = "PythonGroupReduce"
    info.key1 = self._child_chain[0].keys
    self._info.parallelism = info.parallelism
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def reduce(self, operator):
    """
    Applies a Reduce transformation on a non-grouped DataSet.

    The transformation consecutively calls a ReduceFunction until only a
    single element remains, which is the result of the transformation. A
    ReduceFunction combines two elements into one new element of the same
    type.

    :param operator: The ReduceFunction that is applied on the DataSet.
    :return: A ReduceOperator that represents the reduced DataSet.
    """
    self._finalize()
    # A plain function is wrapped in a ReduceFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = ReduceFunction()
        operator.reduce = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.REDUCE
    info.parent = self._info
    info.operator = operator
    info.name = "PythonReduce"
    info.types = _createArrayTypeInfo()
    info.key1 = self._child_chain[0].keys
    self._info.parallelism = info.parallelism
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def write_text(self, path, write_mode=WriteMode.NO_OVERWRITE):
    """
    Writes a DataSet as a text file to the specified location.

    :param path: The path pointing to the location the text file is written to.
    :param write_mode: OutputFormat.WriteMode value, indicating whether files should be overwritten.
    """
    child = OperationInfo()
    child.identifier = _Identifier.SINK_TEXT
    child.parent = self._info
    child.path = path
    child.write_mode = write_mode
    self._info.sinks.append(child)
    self._env._sinks.append(child)
def filter(self, operator):
    """
    Applies a Filter transformation on a DataSet.

    The transformation calls a FilterFunction for each element of the
    DataSet and retains only those elements for which the function returns
    true; elements for which it returns false are filtered out.

    :param operator: The FilterFunction that is called for each element of the DataSet.
    :return: A FilterOperator that represents the filtered DataSet.
    """
    # A plain function is wrapped in a FilterFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = FilterFunction()
        operator.filter = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.FILTER
    info.parent = self._info
    info.operator = operator
    info.meta = "|".join((str(inspect.getmodule(operator)), str(operator.__class__.__name__)))
    info.name = "PythonFilter"
    info.types = deduct_output_type(self._info)
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def map_partition(self, operator, types):
    """
    Applies a MapPartition transformation on a DataSet.

    The transformation calls a MapPartitionFunction once per parallel
    partition of the DataSet; the entire partition is available through the
    given Iterator. Each MapPartitionFunction may return an arbitrary
    number of results. The number of elements each function instance sees
    is non-deterministic and depends on the parallelism of the operation.

    :param operator: The MapPartitionFunction that is called once per partition.
    :param types: The type of the resulting DataSet.
    :return: A MapOperator that represents the transformed DataSet.
    """
    # A plain function is wrapped in a MapPartitionFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = MapPartitionFunction()
        operator.map_partition = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.MAPPARTITION
    info.parent = self._info
    info.operator = operator
    info.meta = "|".join((str(inspect.getmodule(operator)), str(operator.__class__.__name__)))
    info.types = types
    info.name = "PythonMapPartition"
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def map(self, operator, types):
    """
    Applies a Map transformation on a DataSet.

    The transformation calls a MapFunction for each element of the
    DataSet. Each MapFunction call returns exactly one element.

    :param operator: The MapFunction that is called for each element of the DataSet.
    :param types: The type of the resulting DataSet.
    :return: A MapOperator that represents the transformed DataSet.
    """
    # A plain function is wrapped in a MapFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = MapFunction()
        operator.map = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.MAP
    info.parent = self._info
    info.operator = operator
    info.meta = "|".join((str(inspect.getmodule(operator)), str(operator.__class__.__name__)))
    info.types = types
    info.name = "PythonMap"
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def flat_map(self, operator, types):
    """
    Applies a FlatMap transformation on a DataSet.

    The transformation calls a FlatMapFunction for each element of the
    DataSet. Each FlatMapFunction call can return any number of elements,
    including none.

    :param operator: The FlatMapFunction that is called for each element of the DataSet.
    :param types: The type of the resulting DataSet.
    :return: A FlatMapOperator that represents the transformed DataSet.
    """
    # A plain function is wrapped in a FlatMapFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = FlatMapFunction()
        operator.flat_map = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.FLATMAP
    info.parent = self._info
    info.operator = operator
    info.meta = "|".join((str(inspect.getmodule(operator)), str(operator.__class__.__name__)))
    info.types = types
    info.name = "PythonFlatMap"
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def generate_sequence(self, frm, to):
    """
    Creates a new data set that contains the given number sequence.

    :param frm: The start number for the sequence.
    :param to: The end number for the sequence.
    :return: A DataSet representing the given sequence of numbers.
    """
    info = OperationInfo()
    source = DataSet(self, info)
    info.identifier = _Identifier.SOURCE_SEQ
    info.frm = frm
    info.to = to
    self._sources.append(info)
    return source
def read_text(self, path):
    """
    Creates a DataSet that represents the Strings produced by reading the
    given file line-wise. The file is read with the system's default
    character set.

    :param path: The path of the file, as a URI (e.g., "file:///some/local/file"
        or "hdfs://host:port/file/path").
    :return: A DataSet that represents the data read from the given file as text lines.
    """
    info = OperationInfo()
    source = DataSet(self, info)
    info.identifier = _Identifier.SOURCE_TEXT
    info.path = path
    self._sources.append(info)
    return source
def rebalance(self):
    """
    Enforces a re-balancing of the DataSet, i.e., the DataSet is evenly
    distributed over all parallel instances of the following task. This
    can help to improve performance in case of heavy data skew and
    compute-intensive operations.

    Important: this operation shuffles the whole DataSet over the network
    and can take a significant amount of time.

    :return: The re-balanced DataSet.
    """
    info = OperationInfo()
    result = DataSet(self._env, info)
    info.identifier = _Identifier.REBALANCE
    info.parent = self._info
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def first(self, count):
    """
    Returns a new set containing the first n elements in this DataSet.

    :param count: The desired number of elements.
    :return: A DataSet containing the elements.
    """
    info = OperationInfo()
    result = DataSet(self._env, info)
    info.identifier = _Identifier.FIRST
    info.parent = self._info
    info.count = count
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def from_elements(self, *elements):
    """
    Creates a new data set that contains the given elements. The elements
    must all be of the same type, for example all Strings or Integers.
    The sequence of elements must not be empty.

    :param elements: The elements to make up the data set.
    :return: A DataSet representing the given list of elements.
    """
    info = OperationInfo()
    source = DataSet(self, info)
    info.identifier = _Identifier.SOURCE_VALUE
    info.values = elements
    self._sources.append(info)
    return source
def _partition_by_hash(self, fields):
    """
    Hash-partitions a DataSet on the specified key fields.

    Important: this operation shuffles the whole DataSet over the network
    and can take a significant amount of time.

    :param fields: The field indexes on which the DataSet is hash-partitioned.
    :return: The partitioned DataSet.
    """
    self._info.types = _createKeyValueTypeInfo(len(fields))
    info = OperationInfo()
    result = DataSet(self._env, info)
    info.identifier = _Identifier.PARTITION_HASH
    info.parent = self._info
    info.keys = fields
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def _write_csv(self, path, line_delimiter, field_delimiter, write_mode):
    """Register a CSV sink for this DataSet with the given delimiters and mode."""
    sink = OperationInfo()
    sink.identifier = _Identifier.SINK_CSV
    sink.path = path
    sink.parent = self._info
    sink.delimiter_field = field_delimiter
    sink.delimiter_line = line_delimiter
    sink.write_mode = write_mode
    self._info.sinks.append(sink)
    self._env._sinks.append(sink)
def filter(self, operator):
    """
    Applies a Filter transformation on a DataSet.

    The transformation calls a FilterFunction for each element of the
    DataSet and retains only those elements for which the function returns
    true; elements for which it returns false are filtered out.

    :param operator: The FilterFunction that is called for each element of the DataSet.
    :return: A FilterOperator that represents the filtered DataSet.
    """
    # A plain function is wrapped in a FilterFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = FilterFunction()
        operator.filter = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.FILTER
    info.parent = self._info
    info.operator = operator
    info.meta = "|".join((str(inspect.getmodule(operator)), str(operator.__class__.__name__)))
    info.name = "PythonFilter"
    info.types = deduct_output_type(self._info)
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def map(self, operator, types):
    """
    Applies a Map transformation on a DataSet.

    The transformation calls a MapFunction for each element of the
    DataSet. Each MapFunction call returns exactly one element.

    :param operator: The MapFunction that is called for each element of the DataSet.
    :param types: The type of the resulting DataSet.
    :return: A MapOperator that represents the transformed DataSet.
    """
    # A plain function is wrapped in a MapFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = MapFunction()
        operator.map = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.MAP
    info.parent = self._info
    info.operator = operator
    info.meta = "|".join((str(inspect.getmodule(operator)), str(operator.__class__.__name__)))
    info.types = types
    info.name = "PythonMap"
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def flat_map(self, operator, types):
    """
    Applies a FlatMap transformation on a DataSet.

    The transformation calls a FlatMapFunction for each element of the
    DataSet. Each FlatMapFunction call can return any number of elements,
    including none.

    :param operator: The FlatMapFunction that is called for each element of the DataSet.
    :param types: The type of the resulting DataSet.
    :return: A FlatMapOperator that represents the transformed DataSet.
    """
    # A plain function is wrapped in a FlatMapFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = FlatMapFunction()
        operator.flat_map = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.FLATMAP
    info.parent = self._info
    info.operator = operator
    info.meta = "|".join((str(inspect.getmodule(operator)), str(operator.__class__.__name__)))
    info.types = types
    info.name = "PythonFlatMap"
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def reduce_group(self, operator, combinable=False):
    """
    Applies a GroupReduce transformation.

    The transformation calls a GroupReduceFunction once for each group of
    the DataSet, or once when applied on a non-grouped DataSet. The
    GroupReduceFunction can iterate over all elements of the DataSet and
    emit any number of output elements, including none.

    :param operator: The GroupReduceFunction that is applied on the DataSet.
    :param combinable: Accepted for API compatibility.
    :return: A GroupReduceOperator that represents the reduced DataSet.
    """
    self._finalize()
    # A plain function is wrapped in a GroupReduceFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = GroupReduceFunction()
        operator.reduce = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.GROUPREDUCE
    info.parent = self._info
    info.operator = operator
    info.types = _createArrayTypeInfo()
    info.name = "PythonGroupReduce"
    info.key1 = self._child_chain[0].keys
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def map_partition(self, operator, types):
    """
    Applies a MapPartition transformation on a DataSet.

    The transformation calls a MapPartitionFunction once per parallel
    partition of the DataSet; the entire partition is available through the
    given Iterator. Each MapPartitionFunction may return an arbitrary
    number of results. The number of elements each function instance sees
    is non-deterministic and depends on the parallelism of the operation.

    :param operator: The MapPartitionFunction that is called once per partition.
    :param types: The type of the resulting DataSet.
    :return: A MapOperator that represents the transformed DataSet.
    """
    # A plain function is wrapped in a MapPartitionFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = MapPartitionFunction()
        operator.map_partition = func
    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.identifier = _Identifier.MAPPARTITION
    info.parent = self._info
    info.operator = operator
    info.meta = "|".join((str(inspect.getmodule(operator)), str(operator.__class__.__name__)))
    info.types = types
    info.name = "PythonMapPartition"
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def union(self, other_set):
    """
    Creates a union of this DataSet with another DataSet. The other
    DataSet must be of the same data type.

    :param other_set: The other DataSet which is unioned with the current DataSet.
    :return: The resulting DataSet.
    """
    info = OperationInfo()
    result = DataSet(self._env, info)
    info.identifier = _Identifier.UNION
    info.parent = self._info
    info.other = other_set._info
    self._info.children.append(info)
    other_set._info.children.append(info)
    self._env._sets.append(info)
    return result
def read_custom(self, path, filter, splits, format, types=None):
    """
    Creates a DataSet using a custom input format that is executed
    directly in the Python process.

    :param path: The input path.
    :param filter: The path filter applied to candidate inputs.
    :param splits: The split-computation setting for the format.
    :param format: The input format instance (deep-copied for the plan).
    :param types: Optional result type; defaults to an array type info.
    :return: A DataSet produced by the custom input format.
    """
    info = OperationInfo()
    source = DataSet(self, info)
    info.identifier = _Identifier.SOURCE_CUSTOM
    info.name = "PythonInputFormat"
    info.path = path
    info.filter = filter
    info.computeSplits = splits
    info.operator = copy.deepcopy(format)
    info.types = _createArrayTypeInfo() if types is None else types
    self._sources.append(info)
    return source
def _createProjector(env, info):
    """Chain a Projector MAP operation after info; returns the Projector."""
    op = OperationInfo()
    projector = Projector(env, op)
    op.identifier = _Identifier.MAP
    op.operator = MapFunction()
    op.parent = info
    op.types = _createArrayTypeInfo()
    op.name = "Projector"
    op.parallelism = info.parallelism
    info.children.append(op)
    env._sets.append(op)
    return projector
def co_group(self, other_set):
    """
    Initiates a CoGroup transformation which combines the elements of two
    DataSets into one DataSet.

    It groups each DataSet individually on a key and gives groups of both
    DataSets with equal keys together into a CoGroupFunction. If a DataSet
    has a group with no matching key in the other DataSet, the
    CoGroupFunction is called with an empty group for the non-existing
    group. The CoGroupFunction can iterate over the elements of both
    groups and return any number of elements, including none.

    :param other_set: The other DataSet of the CoGroup transformation.
    :return: A CoGroupOperatorWhere to continue the definition of the
        CoGroup transformation.
    """
    info = OperationInfo()
    other_set._info.children.append(info)
    where = CoGroupOperatorWhere(self._env, info)
    info.identifier = _Identifier.COGROUP
    info.parent_set = self
    info.other_set = other_set
    return where
def project(self, *fields):
    """
    Applies a Project transformation on a Tuple DataSet.

    Note: only Tuple DataSets can be projected. The transformation
    projects each Tuple of the DataSet onto a (sub)set of fields.

    :param fields: The field indexes of the input tuples that are retained.
        The order of fields in the output tuple corresponds to the order of
        field indexes.
    :return: The projected DataSet.
    """
    info = OperationInfo()
    result = DataSet(self._env, info)
    info.identifier = _Identifier.PROJECTION
    info.parent = self._info
    info.keys = fields
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def sort_group(self, field, order):
    """
    Sorts Tuple elements within a group on the specified field in the
    specified Order.

    Note: only groups of Tuple elements can be sorted. Groups can be
    sorted by multiple fields by chaining sort_group() calls.

    :param field: The Tuple field on which the group is sorted.
    :param order: The Order in which the specified Tuple field is sorted.
        See DataSet.Order.
    :return: A SortedGrouping with the specified order of group elements.
    """
    info = OperationInfo()
    sorted_grouping = SortedGrouping(self._env, info, self._child_chain)
    info.identifier = _Identifier.SORT
    info.parent = self._info
    info.field = field
    info.order = order
    self._info.children.append(info)
    self._child_chain.append(info)
    self._env._sets.append(info)
    return sorted_grouping
def _reduce_group(self, operator, combinable=False):
    """
    Build a GROUPREDUCE OperationInfo for this grouping and return it.

    Unlike the public variants, the record is not registered with the
    environment here; the caller is responsible for wiring it into the plan.
    """
    self._finalize()
    # A plain function is wrapped in a GroupReduceFunction instance.
    if isinstance(operator, TYPES.FunctionType):
        func = operator
        operator = GroupReduceFunction()
        operator.reduce = func
    info = OperationInfo()
    info.identifier = _Identifier.GROUPREDUCE
    info.parent = self._info
    info.operator = operator
    info.types = _createArrayTypeInfo()
    info.name = "PythonGroupReduce"
    info.key1 = self._child_chain[0].keys
    return info