Example #1
            def run(self, _, fn, group, inp):
                batching = isinstance(inp, list)
                serialized = False

                if batching:
                    if type(inp[0]) == bytes:
                        serialized = True
                        inp = [deserialize(tbl) for tbl in inp]
                else:
                    if type(inp) == bytes:
                        serialized = True
                        inp = deserialize(inp)

                if batching:
                    # Because batching is enabled by default, fn and group
                    # arrive as lists unless this operator has been merged
                    # into a multi operator. We check rather than assume,
                    # because a whole-flow operator will not pass lists even
                    # when batching is enabled.
                    if type(group) == list:
                        group = group[0]

                    if type(fn) == list:
                        fn = fn[0]
                    inp, mappings = merge_tables(inp)

                if group and not isinstance(inp, GroupbyTable):
                    raise RuntimeError(
                        "Can't run a group filter over a non-grouped table.")

                if group:
                    result = GroupbyTable(inp.schema, inp.col)
                    for gname, gtable in inp.get():
                        # Apply the filter to the group's first row; keep the
                        # whole group if it passes.
                        if fn(self, next(gtable.get())):
                            result.add_group(gname, gtable)
                else:
                    result = Table(inp.schema)
                    for row in inp.get():
                        if fn(self, row):
                            result.insert(row)

                if batching:
                    result = demux_tables(result, mappings)
                    if serialized:
                        result = [serialize(tbl) for tbl in result]
                else:
                    if serialized:
                        result = serialize(result)

                return result
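Every operator in this listing repeats the same wrapper: note whether the input arrived as bytes, deserialize on the way in, and re-serialize on the way out. A minimal self-contained sketch of that pattern, using pickle as a stand-in for the library's serialize/deserialize helpers (the name run_with_serde is hypothetical):

    import pickle

    def run_with_serde(fn, inp):
        # Remember whether the caller handed us bytes so we can
        # answer in kind on the way out.
        serialized = isinstance(inp, bytes)
        if serialized:
            inp = pickle.loads(inp)    # stand-in for deserialize()

        result = fn(inp)

        if serialized:
            result = pickle.dumps(result)  # stand-in for serialize()
        return result

    # The wrapper is transparent to callers on either side:
    assert run_with_serde(lambda x: x + 1, 41) == 42
    assert pickle.loads(run_with_serde(lambda x: x + 1,
                                       pickle.dumps(41))) == 42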
Example #2
            def run(self, cloudburst, aggregate, column, inp):
                serialized = False
                if type(inp) == bytes:
                    serialized = True
                    inp = deserialize(inp)

                if aggregate == 'count':
                    aggfn = self.count
                elif aggregate == 'min':
                    aggfn = self.min
                elif aggregate == 'max':
                    aggfn = self.max
                elif aggregate == 'sum':
                    aggfn = self.sum
                elif aggregate == 'average':
                    aggfn = self.average
                else:
                    raise RuntimeError(f'Unknown aggregate: {aggregate}')

                if isinstance(inp, GroupbyTable):
                    gb_col = inp.col
                    val, _ = next(inp.get())
                    gb_typ = get_type(type(val))

                    result = Table([(gb_col, gb_typ), (aggregate, FloatType)])

                    for val, tbl in inp.get():
                        agg = aggfn(tbl, column)
                        result.insert([val, float(agg)])
                else:
                    result = Table([(aggregate, FloatType)])
                    result.insert([float(aggfn(inp, column))])

                if serialized:
                    result = serialize(result)

                return result
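The elif chain above selects the aggregation function by name; a dispatch table is an equivalent, more compact alternative. A sketch with plain dicts standing in for rows, and hypothetical lambdas rather than the class's real count/min/max/sum/average methods:

    # Hypothetical stand-ins for the bound aggregation methods.
    AGG_FNS = {
        'count': lambda rows, col: float(len(rows)),
        'min': lambda rows, col: float(min(r[col] for r in rows)),
        'max': lambda rows, col: float(max(r[col] for r in rows)),
        'sum': lambda rows, col: float(sum(r[col] for r in rows)),
        'average': lambda rows, col: sum(r[col] for r in rows) / len(rows),
    }

    rows = [{'x': 1.0}, {'x': 3.0}]
    aggfn = AGG_FNS['average']      # raises KeyError on unknown names
    assert aggfn(rows, 'x') == 2.0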
Example #3
            def run(self, cloudburst, lookup_key, dynamic: bool, input_object,
                    inp: Table):
                from flow.types.basic import get_type

                serialized = False
                if type(inp) == bytes:
                    inp = deserialize(inp)
                    serialized = True

                if cloudburst is None or dynamic:
                    # The object is passed in directly; the key, reused below
                    # as the new column's name, comes from the first input
                    # row.
                    obj = input_object
                    lookup_key = next(inp.get())[lookup_key]
                else:
                    obj = cloudburst.get(lookup_key)

                schema = list(inp.schema)
                schema.append((lookup_key, get_type(type(obj))))

                new_table = Table(schema)
                for row in inp.get():
                    vals = [row[key] for key, _ in inp.schema]
                    vals.append(obj)

                    new_table.insert(vals)

                if serialized:
                    new_table = serialize(new_table)
                return new_table
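The loop above appends the same looked-up object to every row. A self-contained sketch of that column-append step, with tuples and plain lists standing in for the library's Row and Table types (the 'meta' column name is hypothetical):

    schema = [('user', str), ('score', float)]
    rows = [('a', 1.0), ('b', 2.0)]

    obj = {'model': 'v2'}                         # the looked-up object
    new_schema = schema + [('meta', type(obj))]   # one new trailing column

    # Every row gets the same object appended.
    new_rows = [row + (obj,) for row in rows]
    assert new_rows[0] == ('a', 1.0, {'model': 'v2'})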
Example #4
            def run(self, _, col: str, inp: Table):
                serialized = False
                if type(inp) == bytes:
                    serialized = True
                    inp = deserialize(inp)

                gb_table = GroupbyTable(inp.schema, col)

                for row in inp.get():
                    gb_table.add_row(row)

                if serialized:
                    gb_table = serialize(gb_table)

                return gb_table
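GroupbyTable.add_row above amounts to bucketing rows by the value of the grouping column. A minimal sketch of that behavior with plain dicts standing in for rows (group_rows is a hypothetical helper, not part of the library):

    from collections import defaultdict

    def group_rows(rows, col):
        # Bucket each row under its value for the grouping column.
        groups = defaultdict(list)
        for row in rows:
            groups[row[col]].append(row)
        return dict(groups)

    rows = [{'k': 'a', 'v': 1}, {'k': 'b', 'v': 2}, {'k': 'a', 'v': 3}]
    grouped = group_rows(rows, 'k')
    assert [r['v'] for r in grouped['a']] == [1, 3]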
Example #5
            def run(self, _, on, how, left, right):
                serialized = False
                if type(left) == bytes:
                    left = deserialize(left)
                    right = deserialize(right)
                    serialized = True

                # Note: We currently don't support batching with custom
                # serialization for joins. It shouldn't be hard to implement,
                # but we skip it for expediency.
                batching = False
                if type(left) == list:
                    batching = True
                    left, _ = merge_tables(left)
                    right, mappings = merge_tables(right)

                new_schema = merge_schema(left.schema, right.schema)
                result = Table(new_schema)
                ljoin = (how == 'left')
                ojoin = (how == 'outer')

                # Track whether each right row has been inserted for outer
                # joins.
                rindex_map = {}

                for lrow in left.get():
                    lrow_inserted = False

                    idx = 0
                    for rrow in right.get():
                        if lrow[on] == rrow[on]:
                            new_row = merge_row(lrow, rrow, new_schema)
                            result.insert(new_row)
                            lrow_inserted = True
                            rindex_map[idx] = True

                        # Advance the right-row index on every row so it stays
                        # aligned with the outer-join pass below.
                        idx += 1

                    if not lrow_inserted and (ljoin or ojoin):
                        rvals = [None] * len(right.schema)
                        rrow = Row(right.schema, rvals, lrow[Row.qid_key])
                        new_row = merge_row(lrow, rrow, new_schema)
                        result.insert(new_row)

                if ojoin:
                    idx = 0
                    for row in right.get():
                        if idx not in rindex_map:
                            lvals = [None] * len(left.schema)
                            lrow = Row(left.schema, lvals, row[Row.qid_key])
                            new_row = merge_row(lrow, row, new_schema)
                            result.insert(new_row)

                        idx += 1

                if batching:
                    result = demux_tables(result, mappings)

                if serialized:
                    result = serialize(result)

                return result
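The join above is a nested-loop join: unmatched left rows are padded with None for left/outer joins, and rindex_map records which right rows matched so the outer-join pass can emit the rest. A self-contained sketch of the same bookkeeping with dicts standing in for Rows (outer_join is a hypothetical helper):

    def outer_join(left, right, on):
        result, matched_right = [], set()
        for lrow in left:
            hit = False
            for idx, rrow in enumerate(right):
                if lrow[on] == rrow[on]:
                    result.append({**lrow, **rrow})
                    matched_right.add(idx)   # mirrors rindex_map
                    hit = True
            if not hit:  # pad the right side with None
                result.append({**lrow,
                               **{k: None for k in right[0] if k != on}})
        for idx, rrow in enumerate(right):   # unmatched right rows
            if idx not in matched_right:
                result.append({**{k: None for k in left[0] if k != on},
                               **rrow})
        return result

    L = [{'id': 1, 'a': 'x'}]
    R = [{'id': 1, 'b': 'y'}, {'id': 2, 'b': 'z'}]
    assert len(outer_join(L, R, 'id')) == 2  # one match, one right-only row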
Example #6
            def run(self, cloudburst, final, *inp):
                # inp is a tuple because we might take in multiple things for a
                # lookup situation.
                if len(inp) == 1:
                    inp = inp[0]

                prev = inp  # inp should either be a Table or a list of Tables.
                if type(inp) == bytes:
                    print('Received a non-batched serialized input.')

                # If the input is a list of Tables, then batching is enabled.
                # (`ops` is captured from the enclosing scope that builds this
                # operator.)
                batching = all(op.batching for op in ops)
                serialized = False
                if batching:
                    if type(prev[0]) == bytes:
                        serialized = True
                        prev = [deserialize(tbl) for tbl in prev]

                    prev, mappings = merge_tables(prev)

                    # This will all be repeated because of the way Cloudburst's
                    # batching works, so we just pick the first one.
                    final = final[0]
                else:
                    if type(prev) == bytes:
                        serialized = True
                        prev = deserialize(prev)

                # NOTE: We currently don't support inputs from
                # LookupHelperOperators with batching enabled.
                if type(inp) == tuple:
                    if type(inp[1]) == bytes:
                        serialized = True
                        # Reassign prev too: it still points at the original
                        # tuple otherwise.
                        inp = prev = (inp[0], deserialize(inp[1]))

                for i, logic in enumerate(self.logics):

                    if self.whole:
                        # Populate this once for instantiation.
                        if logic.cloudburst is None:
                            queue = [logic]

                            while len(queue) > 0:
                                op = queue.pop(0)
                                op.cloudburst = cloudburst

                                queue.extend(op.downstreams)

                        # prev will never be a tuple with whole because there
                        # will never be a lookup. See the comment at the top
                        # of this function for why inp might be a tuple.
                        args = self.exec_args[i] + (prev, )
                    else:
                        if type(prev) != tuple:
                            args = (cloudburst, ) + self.exec_args[i] + (
                                prev, )
                        else:
                            args = (cloudburst, ) + self.exec_args[i] + prev

                    prev = logic.run(*args)

                if self.whole:
                    prev = logic.results()[0]

                if batching:
                    if type(prev) == tuple:
                        prev = demux_tables(prev[0], mappings)
                    else:
                        prev = demux_tables(prev, mappings)

                    if serialized:
                        prev = [serialize(tbl) for tbl in prev]
                else:
                    if serialized and not isinstance(prev, tuple):
                        prev = serialize(prev)

                return prev
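The core of this operator is the loop that threads each logic's output into the next logic's final positional argument. A stripped-down sketch of that chaining, with plain callables standing in for the logic objects (run_chain is hypothetical):

    def run_chain(logics, exec_args, inp):
        prev = inp
        for logic, args in zip(logics, exec_args):
            # Each stage receives its static args plus the previous output.
            prev = logic(*args, prev)
        return prev

    # Hypothetical two-stage pipeline: keep evens, then scale by a factor.
    stages = [
        lambda xs: [x for x in xs if x % 2 == 0],
        lambda factor, xs: [x * factor for x in xs],
    ]
    assert run_chain(stages, [(), (2,)], [1, 2, 3, 4]) == [4, 8]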
Example #7
            def run(self, cloudburst, fn, fntype, col, names, inp):
                # Merge all of the tables.
                serialized = False
                batching = self.batching and isinstance(inp, list)
                if batching:
                    if type(inp[0]) == bytes:
                        inp = [deserialize(tbl) for tbl in inp]
                        serialized = True

                    # inp will be a list of Tables. If it is not, this is part
                    # of a MultiOperator, and everything is taken care of for
                    # us.
                    merged, mappings = merge_tables(inp)
                    inp = merged

                    # These arguments are repeated once per batch entry
                    # because of the way Cloudburst's batching works, so we
                    # just pick the first one. We check because inside a multi
                    # operator they are not lists, even with batching enabled.
                    if type(fn) == list:
                        fn = fn[0]
                    if type(fntype) == list:
                        fntype = fntype[0]
                    if type(col) == list:
                        col = col[0]
                    if type(names) == list and type(names[0]) == list:
                        names = names[0]
                else:
                    if type(inp) == bytes:
                        inp = deserialize(inp)
                        serialized = True

                schema = []
                if col is None:
                    if len(names) != 0:
                        schema = list(zip(names, fntype.ret))
                    else:
                        for i in range(len(fntype.ret)):
                            schema.append((str(i), fntype.ret[i]))
                else:
                    for name, tp in inp.schema:
                        if name != col:
                            schema.append((name, tp))
                        else:
                            if len(names) != 0:
                                schema.append((names[0], fntype.ret[0]))
                            else:
                                schema.append((name, fntype.ret[0]))

                if isinstance(inp, GroupbyTable):
                    result = GroupbyTable(schema, inp.col)
                    for group, gtable in inp.get():
                        result.add_group(
                            group,
                            self.run(cloudburst, fn, fntype, col, names,
                                     gtable))
                else:
                    result = Table(schema)

                    if self.batching or self.multi:
                        res = fn(self, inp)
                        for val in res:
                            if type(val) == tuple:
                                val = list(val)
                            elif type(val) != list:
                                val = [val]

                            result.insert(val)
                    else:
                        for row in inp.get():
                            if col is None:
                                vals = fn(self, row)
                                if type(vals) == tuple:
                                    vals = list(vals)
                                elif type(vals) != list:
                                    vals = [vals]

                                result.insert(vals, row[Row.qid_key])
                            else:
                                val = fn(self, row[col])
                                new_vals = []
                                for name, _ in inp.schema:
                                    if name == col:
                                        new_vals.append(val)
                                    else:
                                        new_vals.append(row[name])

                                result.insert(new_vals, row[Row.qid_key])

                if batching:
                    # Unmerge all the tables.
                    result = demux_tables(result, mappings)

                    if serialized:
                        result = [serialize(tbl) for tbl in result]
                else:
                    if serialized:
                        result = serialize(result)

                if self.send_broadcast:
                    import uuid
                    uid = str(uuid.uuid4())
                    cloudburst.put(uid, result)
                    result = uid

                return result
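The map above runs in two modes: with col=None, fn sees the whole row and produces the new row's values; with a column name, only that column's value is transformed and the rest of the row is copied through. A sketch of both modes with dicts standing in for Rows (map_rows is a hypothetical helper):

    def map_rows(rows, fn, col=None):
        if col is None:
            # Whole-row mode: fn consumes the row and emits the new value(s).
            return [fn(row) for row in rows]
        # Column mode: rewrite one column in place, copy the rest.
        return [{**row, col: fn(row[col])} for row in rows]

    rows = [{'a': 1, 'b': 2}]
    assert map_rows(rows, lambda r: r['a'] + r['b']) == [3]
    assert map_rows(rows, lambda v: v * 10, col='a') == [{'a': 10, 'b': 2}]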