def equal_to(self, *fields):
    """
    Continues a Join transformation by fixing the join keys of the second DataSet.

    The keys are either Tuple field indexes, or a key selector: a
    KeySelectorFunction instance or a plain function passed as the first
    argument, followed by a second argument that is recorded as the type of
    the second input.

    Note: Field indexes can only be selected as join keys on Tuple DataSets.

    :param fields: The indexes of the Tuple fields of the second join DataSet that should be used as keys,
                   or a key selector followed by its type.
    :return: An incomplete Join Transformation. When neither side uses a key selector this is a
             JoinOperator; otherwise it is whatever the join rebuilt on the selector-mapped
             inputs returns from its own where(...).equal_to(...) chain.
    """
    selector_kinds = (KeySelectorFunction, TYPES.FunctionType)
    info = self._info

    def as_selector(key):
        # A plain function is adapted by attaching it as get_key of a fresh KeySelectorFunction.
        if isinstance(key, KeySelectorFunction):
            return key
        adapter = KeySelectorFunction()
        adapter.get_key = key
        return adapter

    # Record the second key; a selector additionally carries its type in the next argument.
    if isinstance(fields[0], selector_kinds):
        info[_Fields.KEY2] = fields[0]
        info[_Fields.TYPE2] = fields[1]
    else:
        info[_Fields.KEY2] = fields

    left_set = self._parent[0]
    right_set = self._parent[1]
    left_key = info[_Fields.KEY1]
    right_key = info[_Fields.KEY2]

    keyed_by_selector = False

    if isinstance(left_key, selector_kinds):
        left_selector = as_selector(left_key)
        left_set = self._parent[0].map(
            left_selector,
            (info[_Fields.TYPE1], deduct_output_type(self._parent[0]._info)))
        # The mapped set is joined on field position 0 instead of the selector.
        left_key = (0,)
        keyed_by_selector = True

    if isinstance(right_key, selector_kinds):
        right_selector = as_selector(right_key)
        right_set = self._parent[1].map(
            right_selector,
            (info[_Fields.TYPE2], deduct_output_type(self._parent[1]._info)))
        right_key = (0,)
        keyed_by_selector = True

    if not keyed_by_selector:
        # Plain field-index keys: register this operation with both inputs.
        info[_Fields.PARENT][_Fields.CHILDREN].append(info)
        info[_Fields.OTHER][_Fields.CHILDREN].append(info)
        return JoinOperator(self._env, info)

    # Rebuild the same flavour of join on the (possibly mapped) inputs.
    identifier = info[_Fields.IDENTIFIER]
    if identifier == _Identifier.JOIN:
        rebuilt = left_set.join(right_set)
    elif identifier == _Identifier.JOINH:
        rebuilt = left_set.join_with_huge(right_set)
    else:
        rebuilt = left_set.join_with_tiny(right_set)

    # A side is flagged for discarding exactly when a type was recorded for it.
    rebuilt._info[_Fields.DISCARD1] = info[_Fields.TYPE1] is not None
    rebuilt._info[_Fields.DISCARD2] = info[_Fields.TYPE2] is not None
    return rebuilt.where(*left_key).equal_to(*right_key)
def using(self, operator, types):
    """
    Finalizes a CoGroup transformation.

    Applies a CoGroupFunction to groups of elements with identical keys. A plain
    function may be passed instead; it is attached as the co_group method of a
    new CoGroupFunction.

    If either key is a key selector (a KeySelectorFunction or a plain function),
    the corresponding input is first mapped with that selector, the operator is
    flagged to discard that key, and the co-group is rebuilt on field position 0
    of the mapped input.

    :param operator: The CoGroupFunction that is called for all groups of elements with identical keys.
    :param types: The type of the resulting DataSet.
    :return: A CoGroupOperator that represents the co-grouped result DataSet.
    """
    selector_kinds = (KeySelectorFunction, TYPES.FunctionType)
    info = self._info

    def as_selector(key):
        # A plain function is adapted by attaching it as get_key of a fresh KeySelectorFunction.
        if isinstance(key, KeySelectorFunction):
            return key
        adapter = KeySelectorFunction()
        adapter.get_key = key
        return adapter

    if isinstance(operator, TYPES.FunctionType):
        user_function = operator
        operator = CoGroupFunction()
        operator.co_group = user_function

    left_set = self._parent[0]
    right_set = self._parent[1]
    left_key = info[_Fields.KEY1]
    right_key = info[_Fields.KEY2]

    keyed_by_selector = False

    if isinstance(left_key, selector_kinds):
        left_selector = as_selector(left_key)
        left_set = self._parent[0].map(
            left_selector,
            (info[_Fields.TYPE1], deduct_output_type(self._parent[0]._info)))
        keyed_by_selector = True
        operator._discard_key1 = True
        # The mapped set is co-grouped on field position 0 instead of the selector.
        left_key = (0,)

    if isinstance(right_key, selector_kinds):
        right_selector = as_selector(right_key)
        right_set = self._parent[1].map(
            right_selector,
            (info[_Fields.TYPE2], deduct_output_type(self._parent[1]._info)))
        keyed_by_selector = True
        operator._discard_key2 = True
        right_key = (0,)

    if keyed_by_selector:
        # Re-issue the whole co-group on the mapped inputs with positional keys.
        rebuilt = left_set.co_group(right_set)
        return rebuilt.where(*left_key).equal_to(*right_key).using(operator, types)

    # Positional keys on both sides: complete this operation in place.
    result_set = OperatorSet(self._env, info)
    operator._keys1 = info[_Fields.KEY1]
    operator._keys2 = info[_Fields.KEY2]
    info[_Fields.OPERATOR] = operator
    info[_Fields.META] = "|".join(
        [str(inspect.getmodule(operator)), str(operator.__class__.__name__)])
    info[_Fields.TYPES] = types
    info[_Fields.NAME] = "PythonCoGroup"
    info[_Fields.PARENT][_Fields.CHILDREN].append(info)
    info[_Fields.OTHER][_Fields.CHILDREN].append(info)
    self._env._sets.append(info)
    return result_set
def group_by(self, *keys):
    """
    Groups a DataSet, either by Tuple field positions or by a key selector.

    Note: Field position keys can only be specified for Tuple DataSets.

    The field position keys specify the fields of Tuples on which the DataSet is grouped.
    Alternatively the first argument may be a key selector (a KeySelectorFunction or a
    plain function) followed by its type; the DataSet is then mapped with the selector
    and the mapped result is grouped on field position 0.

    This method returns an UnsortedGrouping on which one of the following grouping
    transformations can be applied:
        sort_group() to get a SortedGrouping.
        reduce() to apply a Reduce transformation.
        group_reduce() to apply a GroupReduce transformation.

    :param keys: One or more field positions on which the DataSet will be grouped,
                 or a key selector followed by its type.
    :return: A Grouping on which a transformation needs to be applied to obtain a transformed DataSet.
    """
    if not isinstance(keys[0], (KeySelectorFunction, TYPES.FunctionType)):
        # Field-position grouping: create the GROUP operation and hook it into the plan.
        node = OperationInfo(self._env)
        chain = []
        grouping = UnsortedGrouping(self._env, node, chain)
        for field, value in (
                (_Fields.IDENTIFIER, _Identifier.GROUP),
                (_Fields.PARENT, self._info),
                (_Fields.KEYS, keys),
                (_Fields.TYPE1, None)):
            node[field] = value
        chain.append(node)
        self._info[_Fields.CHILDREN].append(node)
        self._env._sets.append(node)
        return grouping

    # Key-selector grouping: a plain function is wrapped, an existing selector is used as is.
    wraps_function = isinstance(keys[0], TYPES.FunctionType)
    selector = KeySelectorFunction() if wraps_function else keys[0]
    if wraps_function:
        selector.get_key = keys[0]

    mapped = self.map(selector, (keys[1], deduct_output_type(self._info)))
    grouping = mapped.group_by(0)
    # Mark the grouping as having been built through a key selector.
    grouping._info[_Fields.TYPE1] = True
    return grouping