Esempio n. 1
0
    def filter(self, udf):
        """Append a "filter" operator that keeps the elements accepted by ``udf``.

        The predicate is wrapped in a closure that receives the upstream
        iterator and returns the builtin ``filter`` object over it, so the
        elements are evaluated lazily.

        :param udf: predicate forwarded as-is to the builtin ``filter``.
        :return: a new ``DataQuanta`` whose operator has this instance's
            operator as its only predecessor and which reuses this
            instance's descriptor.
        """
        def keep_matching(iterator):
            # ``filter`` here is the builtin, not this method: class-level
            # names are not visible from a nested function's scope.
            return filter(udf, iterator)

        filter_op = Operator(
            operator_type="filter",
            udf=keep_matching,
            previous=[self.operator],
            python_exec=True,
        )
        return DataQuanta(filter_op, descriptor=self.descriptor)
Esempio n. 2
0
    def union(self, other):
        """Append a "union" operator that concatenates this data with ``other``.

        :param other: object exposing an ``operator`` attribute whose
            ``getIterator()`` supplies the second stream (presumably another
            ``DataQuanta`` -- confirm against callers).
        :return: a new ``DataQuanta`` whose operator lists both this
            instance's operator and ``other.operator`` as predecessors and
            which reuses this instance's descriptor.
        """
        def concatenate(iterator):
            # The second stream is only fetched when the wrapped function is
            # actually invoked, not when the plan is being built.
            trailing = other.operator.getIterator()
            return itertools.chain(iterator, trailing)

        union_op = Operator(
            operator_type="union",
            udf=concatenate,
            previous=[self.operator, other.operator],
            python_exec=False,
        )
        return DataQuanta(union_op, descriptor=self.descriptor)
Esempio n. 3
0
    def sort(self, udf):
        """Append a "sort" operator that orders the elements by ``udf``.

        The wrapped function materialises the whole upstream iterator into a
        list and sorts it, so it returns a list rather than a lazy iterator.

        :param udf: key function applied to each element to obtain its sort
            key.
        :return: a new ``DataQuanta`` whose operator has this instance's
            operator as its only predecessor and which reuses this
            instance's descriptor.
        """
        def order_by_key(iterator):
            ordered = list(iterator)
            ordered.sort(key=udf)
            return ordered

        sort_op = Operator(
            operator_type="sort",
            udf=order_by_key,
            previous=[self.operator],
            python_exec=True,
        )
        return DataQuanta(sort_op, descriptor=self.descriptor)
Esempio n. 4
0
    def reduce_by_key(self, keys, udf):
        """Append a "reduce_by_key" operator configured with the given keys.

        Every entry of ``keys`` is registered on the operator as a parameter
        named ``"dimension|<n>"``, where ``<n>`` is the entry's 1-based
        position in ``keys``.

        :param keys: iterable of key descriptors, registered in iteration
            order.
        :param udf: reduce function, handed to the operator unchanged.
        :return: a new ``DataQuanta`` wrapping the configured operator and
            reusing this instance's descriptor.
        """
        reduce_op = Operator(
            operator_type="reduce_by_key",
            udf=udf,
            previous=[self.operator],
            python_exec=False
        )

        # TODO maybe would be better just leave the number as key
        # Parameter names are 1-based, hence ``start=1``.
        for position, key in enumerate(keys, start=1):
            reduce_op.set_parameter("dimension|" + str(position), key)

        return DataQuanta(
            reduce_op,
            descriptor=self.descriptor
        )
Esempio n. 5
0
    def group_by(self, udf):
        """Append a "group_by" operator that groups elements by a key.

        The key of each element is computed with ``udf``. When ``udf`` is not
        callable (for example ``None``), the first field of each element
        (``operator.itemgetter(0)``) is used as the key instead.

        The wrapped function relies on ``itertools.groupby``, which only
        merges *consecutive* elements sharing a key; the input is not sorted
        here, so the upstream data must already be ordered by that key to get
        one group per distinct key.

        :param udf: key function applied to each element; a non-callable
            value selects the first-field fallback.
        :return: a new ``DataQuanta`` whose operator has this instance's
            operator as its only predecessor and which reuses this
            instance's descriptor.
        """
        # Resolve the key once, outside the closure, so a non-callable udf
        # keeps the first-field grouping instead of failing at run time.
        key_func = udf if callable(udf) else operator.itemgetter(0)

        def func(iterator):
            return itertools.groupby(iterator, key=key_func)

        return DataQuanta(
            Operator(
                operator_type="group_by",
                udf=func,
                previous=[self.operator],
                python_exec=True
            ),
            descriptor=self.descriptor
        )
Esempio n. 6
0
    def source(self, source):
        """Create a "source" operator that reads from a file path or an iterable.

        When ``source`` is a string it is treated as a path: the file is
        opened immediately in text read mode and its lines are yielded by a
        generator that closes the file once the last line has been consumed
        (or when the generator itself is closed). Any other value is iterated
        directly through ``iter(source)``.

        :param source: path of a text file, or any iterable of elements. The
            original value is also handed to the operator as its ``udf``.
        :return: a new ``DataQuanta`` wrapping an operator without
            predecessors and reusing this instance's descriptor.
        :raises OSError: if ``source`` is a path that cannot be opened.
        """
        if isinstance(source, str):
            # Open eagerly so a bad path fails here, at plan-building time.
            handle = open(source, "r")

            def read_lines_then_close():
                # ``with`` releases the file handle as soon as iteration
                # finishes instead of leaving it open indefinitely.
                with handle:
                    yield from handle

            elements = read_lines_then_close()
        else:
            elements = iter(source)

        return DataQuanta(
            Operator(
                operator_type="source",
                udf=source,
                iterator=elements,
                previous=[],
                python_exec=False
            ),
            descriptor=self.descriptor
        )
Esempio n. 7
0
    def sink(self, path, end="\n"):
        """Append a "sink" operator targeting the file at ``path``.

        The operator receives ``path`` itself as its ``udf``. The local
        ``func`` writer below is an alternative for running the sink directly
        in Python; it is currently not passed to the operator, so ``end`` has
        no effect unless that alternative is enabled.

        :param path: destination file path, handed to the operator as ``udf``.
        :param end: text appended after each element by the direct-execution
            writer.
        :return: a new ``DataQuanta`` whose operator has this instance's
            operator as its only predecessor and which reuses this
            instance's descriptor.
        """
        def func(iterator):
            # Direct-execution writer: one ``str(element) + end`` per element.
            with open(path, "w") as out:
                for element in iterator:
                    out.write(str(element) + end)

        sink_op = Operator(
            operator_type="sink",
            # To execute directly, pass ``udf=func`` instead of the path.
            udf=path,
            previous=[self.operator],
            python_exec=False,
        )
        return DataQuanta(sink_op, descriptor=self.descriptor)
Esempio n. 8
0
    def flatmap(self, udf):
        """Append a "flatmap" operator that expands each element via ``udf``.

        ``udf`` is applied to every upstream element and must return an
        iterable; the items of those iterables are yielded one after another,
        flattening exactly one level. The wrapped function is a generator, so
        nothing is computed until it is iterated.

        :param udf: function mapping one element to an iterable of results.
        :return: a new ``DataQuanta`` whose operator has this instance's
            operator as its only predecessor and which reuses this
            instance's descriptor.
        """
        def expand(iterator):
            for element in iterator:
                for piece in udf(element):
                    yield piece

        flatmap_op = Operator(
            operator_type="flatmap",
            udf=expand,
            previous=[self.operator],
            python_exec=True,
        )
        return DataQuanta(flatmap_op, descriptor=self.descriptor)