def _distinct(self, fields):
    """
    Appends a DISTINCT operation keyed on the given fields to this DataSet.

    Before the new operation is created, the type info of this DataSet is replaced by a key/value type info
    built from the number of key fields.

    :param fields: The key fields stored on the new DISTINCT operation.
    :return: The DataSet wrapping the new DISTINCT operation.
    """
    key_count = len(fields)
    self._info.types = _createKeyValueTypeInfo(key_count)

    # The wrapping DataSet is constructed before the operation is configured, as the constructor receives
    # the (still unconfigured) operation info.
    distinct_op = OperationInfo()
    distinct_set = DataSet(self._env, distinct_op)

    distinct_op.parent = self._info
    distinct_op.keys = fields
    distinct_op.identifier = _Identifier.DISTINCT

    # Link the operation into the plan: first as a child of this DataSet, then in the environment's sets.
    self._info.children.append(distinct_op)
    self._env._sets.append(distinct_op)
    return distinct_set
def _group_by(self, keys):
    """
    Appends a GROUP operation keyed on the given keys to this DataSet.

    :param keys: The keys stored on the new GROUP operation.
    :return: The UnsortedGrouping wrapping the new GROUP operation.
    """
    group_op = OperationInfo()
    # The chain is handed to the grouping while still empty; the operation is added to it further below.
    op_chain = []
    grouping = UnsortedGrouping(self._env, group_op, op_chain)

    group_op.parent = self._info
    group_op.keys = keys
    group_op.identifier = _Identifier.GROUP

    op_chain.append(group_op)
    # Link the operation into the plan: first as a child of this DataSet, then in the environment's sets.
    self._info.children.append(group_op)
    self._env._sets.append(group_op)
    return grouping
def _partition_by_hash(self, fields):
    """
    Hash-partitions a DataSet on the specified key fields.

    Important: This operation shuffles the whole DataSet over the network and can take a significant amount
    of time.

    Before the new operation is created, the type info of this DataSet is replaced by a key/value type info
    built from the number of key fields.

    :param fields: The field indexes on which the DataSet is hash-partitioned.
    :return: The partitioned DataSet.
    """
    key_count = len(fields)
    self._info.types = _createKeyValueTypeInfo(key_count)

    # The wrapping DataSet is constructed before the operation is configured, as the constructor receives
    # the (still unconfigured) operation info.
    partition_op = OperationInfo()
    partitioned_set = DataSet(self._env, partition_op)

    partition_op.parent = self._info
    partition_op.keys = fields
    partition_op.identifier = _Identifier.PARTITION_HASH

    # Link the operation into the plan: first as a child of this DataSet, then in the environment's sets.
    self._info.children.append(partition_op)
    self._env._sets.append(partition_op)
    return partitioned_set
def project(self, *fields):
    """
    Applies a Project transformation on a Tuple DataSet.

    Note: Only Tuple DataSets can be projected. The transformation projects each Tuple of the DataSet onto a
    (sub)set of fields.

    :param fields: The field indexes of the input tuples that are retained. The order of fields in the output
                   tuple corresponds to the order of field indexes.
    :return: The projected DataSet.
    """
    # The wrapping DataSet is constructed before the operation is configured, as the constructor receives
    # the (still unconfigured) operation info.
    projection_op = OperationInfo()
    projected_set = DataSet(self._env, projection_op)

    projection_op.parent = self._info
    # `fields` is the tuple of positional arguments and is stored as given.
    projection_op.keys = fields
    projection_op.identifier = _Identifier.PROJECTION

    # Link the operation into the plan: first as a child of this DataSet, then in the environment's sets.
    self._info.children.append(projection_op)
    self._env._sets.append(projection_op)
    return projected_set
def group_by(self, *keys):
    """
    Groups a Tuple DataSet using field position keys.

    Note: Field position keys can only be specified for Tuple DataSets. The field position keys specify the
    fields of Tuples on which the DataSet is grouped.

    This method returns an UnsortedGrouping on which one of the following grouping transformations can be
    applied:
        sort_group() to get a SortedGrouping.
        reduce() to apply a Reduce transformation.
        group_reduce() to apply a GroupReduce transformation.

    :param keys: One or more field positions on which the DataSet will be grouped.
    :return: A Grouping on which a transformation needs to be applied to obtain a transformed DataSet.
    """
    group_op = OperationInfo()
    # The chain is handed to the grouping while still empty; the operation is added to it further below.
    op_chain = []
    grouping = UnsortedGrouping(self._env, group_op, op_chain)

    group_op.parent = self._info
    # `keys` is the tuple of positional arguments and is stored as given.
    group_op.keys = keys
    group_op.identifier = _Identifier.GROUP

    op_chain.append(group_op)
    # Link the operation into the plan: first as a child of this DataSet, then in the environment's sets.
    self._info.children.append(group_op)
    self._env._sets.append(group_op)
    return grouping