def groupBy(self, keys, *fields, **kw):
    numSplits = kw.pop('numSplits', None)

    # Normalize the grouping keys and compile a key-extraction lambda from
    # their expressions.
    if not isinstance(keys, (list, tuple)):
        keys = [keys]
    key_names = [self._create_field_name(e) for e in keys]
    expr = ','.join(self._create_expression(e) for e in keys)
    gen_key = eval('lambda _v:(%s,)' % expr)

    # Collect the aggregated columns: positional expressions get generated
    # names, keyword arguments keep the caller-supplied names.
    values = [self._create_field_name(e) for e in fields] + list(kw.keys())
    kw.update((values[i], fields[i]) for i in range(len(fields)))

    # Each reducer yields four code fragments: create the per-key state,
    # merge a record into it, merge two partial states, and finalize it.
    codes = [self._create_reducer(i, kw[n]) for i, n in enumerate(values)]
    creater = eval('lambda _v:(%s,)' % (','.join(c[0] for c in codes)))
    merger = eval('lambda _x, _v:(%s,)' % (','.join(c[1] for c in codes)))
    combiner = eval('lambda _x, _y:(%s,)' % (','.join(c[2] for c in codes)))
    mapper = eval('lambda _x:(%s,)' % ','.join(c[3] for c in codes))

    agg = Aggregator(creater, merger, combiner)
    g = self.prev.map(lambda v: (gen_key(v), v)).combineByKey(agg, numSplits)
    return g.map(lambda k_v1: k_v1[0] + mapper(k_v1[1])).asTable(key_names + values, self.name)
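# For reference, the four generated lambdas follow the usual combine-by-key
# phases: `creater` builds per-key state from the first record, `merger`
# folds another record into that state, `combiner` merges two partial
# states, and `mapper` finalizes the state into the output columns. A
# minimal hand-written sketch, assuming a hypothetical avg(x) reducer over
# records of the form (key, x); the concrete expressions below are
# illustrative only, the real ones come from _create_reducer():
#
#     creater  = lambda _v: ((_v[1], 1),)                                      # first record -> (sum, count)
#     merger   = lambda _x, _v: ((_x[0][0] + _v[1], _x[0][1] + 1),)            # fold a record into the state
#     combiner = lambda _x, _y: ((_x[0][0] + _y[0][0], _x[0][1] + _y[0][1]),)  # merge two partial states
#     mapper   = lambda _x: (_x[0][0] / _x[0][1],)                             # finalize: state -> column
#
#     records = [('a', 2), ('a', 4), ('b', 10)]
#     state = {}
#     for v in records:
#         k = (v[0],)                                   # stands in for gen_key(v)
#         state[k] = merger(state[k], v) if k in state else creater(v)
#     print([k + mapper(s) for k, s in state.items()])  # [('a', 3.0), ('b', 10.0)]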
def combineByKey(self, createCombiner, mergeValue, mergeCombiner, partitioner):
    # Wrap the three per-key callbacks in an Aggregator and shuffle by key.
    agg = Aggregator(createCombiner, mergeValue, mergeCombiner)
    return ShuffledDStream(self, agg, partitioner)
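# A local sketch of the createCombiner / mergeValue / mergeCombiner contract
# that the shuffle applies per key (illustration only, not the ShuffledDStream
# implementation; a word-count style aggregation is assumed):
#
#     def combine_partition(pairs, createCombiner, mergeValue):
#         # Map-side combine: fold (key, value) pairs into per-key combiners.
#         combiners = {}
#         for k, v in pairs:
#             combiners[k] = mergeValue(combiners[k], v) if k in combiners else createCombiner(v)
#         return combiners
#
#     def merge_partitions(parts, mergeCombiner):
#         # Reduce-side merge: combine the per-partition results after the shuffle.
#         merged = {}
#         for part in parts:
#             for k, c in part.items():
#                 merged[k] = mergeCombiner(merged[k], c) if k in merged else c
#         return merged
#
#     p1 = combine_partition([('a', 1), ('b', 1), ('a', 1)], lambda v: v, lambda c, v: c + v)
#     p2 = combine_partition([('a', 1), ('c', 1)], lambda v: v, lambda c, v: c + v)
#     merge_partitions([p1, p2], lambda c1, c2: c1 + c2)   # {'a': 3, 'b': 1, 'c': 1}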