def filter(self, udf):
    """Append a "filter" step that keeps the elements accepted by ``udf``.

    Args:
        udf: Predicate called on each element; elements for which it
            returns a truthy value are kept.

    Returns:
        A new DataQuanta wrapping a "filter" Operator that follows
        ``self.operator`` and reuses ``self.descriptor``.
    """

    def keep_matching(elements):
        # The call below yields a lazy iterator over the accepted elements.
        return filter(udf, elements)

    filter_op = Operator(
        operator_type="filter",
        udf=keep_matching,
        previous=[self.operator],
        python_exec=True,
    )
    return DataQuanta(filter_op, descriptor=self.descriptor)
def union(self, other):
    """Append a "union" step that concatenates this data with ``other``.

    Args:
        other: Another DataQuanta-like object; its ``operator`` becomes the
            second predecessor of the new operator.

    Returns:
        A new DataQuanta wrapping a "union" Operator whose predecessors are
        ``self.operator`` and ``other.operator``, reusing ``self.descriptor``.
    """

    def chain_both(elements):
        # Elements of this side come first, then whatever the other
        # operator's iterator produces.
        other_elements = other.operator.getIterator()
        return itertools.chain(elements, other_elements)

    union_op = Operator(
        operator_type="union",
        udf=chain_both,
        previous=[self.operator, other.operator],
        python_exec=False,
    )
    return DataQuanta(union_op, descriptor=self.descriptor)
def sort(self, udf):
    """Append a "sort" step that orders the elements by ``udf``.

    Args:
        udf: Key function; called on each element to obtain its sort key.

    Returns:
        A new DataQuanta wrapping a "sort" Operator that follows
        ``self.operator`` and reuses ``self.descriptor``.
    """

    def order_by_key(elements):
        # Materialise the input, then sort it in place (stable, ascending).
        ordered = list(elements)
        ordered.sort(key=udf)
        return ordered

    sort_op = Operator(
        operator_type="sort",
        udf=order_by_key,
        previous=[self.operator],
        python_exec=True,
    )
    return DataQuanta(sort_op, descriptor=self.descriptor)
def reduce_by_key(self, keys, udf):
    """Append a "reduce_by_key" step configured with the given keys.

    Args:
        keys: Indexable, sized collection of keys; each entry is registered
            on the operator as a parameter named "dimension|<n>", with
            ``n`` counting from 1.
        udf: Passed unchanged to the Operator as its ``udf``.

    Returns:
        A new DataQuanta wrapping the configured "reduce_by_key" Operator,
        reusing ``self.descriptor``.
    """
    reduce_op = Operator(
        operator_type="reduce_by_key",
        udf=udf,
        previous=[self.operator],
        python_exec=False,
    )

    # An earlier, inactive draft registered integer keys as
    # "vector_position|<i>" and other keys as "dimension_key|<i>".
    # TODO maybe would be better just leave the number as key
    for position in range(1, len(keys) + 1):
        reduce_op.set_parameter(
            "dimension|{}".format(position), keys[position - 1]
        )

    return DataQuanta(reduce_op, descriptor=self.descriptor)
def group_by(self, udf):
    """Append a "group_by" step that groups elements by their first item.

    Args:
        udf: Currently unused; the grouping key is always ``element[0]``.

    Returns:
        A new DataQuanta wrapping a "group_by" Operator that follows
        ``self.operator`` and reuses ``self.descriptor``.
    """

    def group_consecutive(elements):
        # TODO key should be given by "udf"
        first_item = operator.itemgetter(0)
        # itertools.groupby only merges *adjacent* elements with equal
        # keys; the input is not sorted here.
        return itertools.groupby(elements, key=first_item)

    group_op = Operator(
        operator_type="group_by",
        udf=group_consecutive,
        previous=[self.operator],
        python_exec=True,
    )
    return DataQuanta(group_op, descriptor=self.descriptor)
def source(self, source):
    """Create a "source" step reading from a file path or an iterable.

    Args:
        source: Either a string, which is treated as a path and opened for
            reading in text mode, or any iterable, which is used directly.

    Returns:
        A new DataQuanta wrapping a "source" Operator with no predecessors.
        The Operator receives the original ``source`` argument as ``udf``
        and an iterator over the opened file / iterable as ``iterator``.

    Raises:
        OSError: If ``source`` is a string and the file cannot be opened.
        TypeError: If ``source`` is neither a string nor iterable.
    """
    # isinstance (instead of an exact type comparison) also treats str
    # subclasses as paths rather than iterating over their characters.
    if isinstance(source, str):
        # NOTE: the handle is not closed here; its iterator is handed to
        # the Operator below.
        source_ori = open(source, "r")
    else:
        source_ori = source

    return DataQuanta(
        Operator(
            operator_type="source",
            udf=source,
            iterator=iter(source_ori),
            previous=[],
            python_exec=False,
        ),
        descriptor=self.descriptor,
    )
def sink(self, path, end="\n"):
    """Append a "sink" step targeting ``path``.

    The Operator is given ``path`` itself as its ``udf``; the local writer
    helpers defined below are not passed to it and are therefore unused
    unless the direct-execution alternative is enabled.

    Args:
        path: Output location handed to the Operator.
        end: Terminator appended after each element by the local writer
            helper (only relevant for direct execution).

    Returns:
        A new DataQuanta wrapping a "sink" Operator that follows
        ``self.operator`` and reuses ``self.descriptor``.
    """

    def write_all(elements):
        # One str(element) + end per element, written to a fresh file.
        with open(path, "w") as out:
            out.writelines(str(item) + end for item in elements)

    def run_sink(elements):
        write_all(elements)

    sink_op = Operator(
        operator_type="sink",
        # To execute directly, pass udf=run_sink instead of the path.
        udf=path,
        previous=[self.operator],
        python_exec=False,
    )
    return DataQuanta(sink_op, descriptor=self.descriptor)
def flatmap(self, udf):
    """Append a "flatmap" step: map with ``udf`` and flatten one level.

    Args:
        udf: Called on each element; must return an iterable whose items
            are emitted individually.

    Returns:
        A new DataQuanta wrapping a "flatmap" Operator that follows
        ``self.operator`` and reuses ``self.descriptor``.
    """

    def expand(elements):
        # Generator: nothing is mapped until the result is iterated.
        for produced in map(udf, elements):
            for piece in produced:
                yield piece

    flatmap_op = Operator(
        operator_type="flatmap",
        udf=expand,
        previous=[self.operator],
        python_exec=True,
    )
    return DataQuanta(flatmap_op, descriptor=self.descriptor)