Exemple #1
0
            def run(self, _, fn, group, inp):
                """Keep only the rows, or whole groups, that satisfy ``fn``.

                ``inp`` may be a single table or a list of tables (a batch),
                and either form may arrive as serialized ``bytes``. The result
                is returned in the same shape and encoding as the input.

                Args:
                    _: Unused.
                    fn: Predicate invoked as ``fn(self, row)``. In batch mode
                        it may arrive as a list, in which case the first entry
                        is used.
                    group: Truthy to filter group-by-group instead of
                        row-by-row. Unwrapped from a list the same way as
                        ``fn`` in batch mode.
                    inp: A table, a serialized table, or a list of either.

                Returns:
                    The filtered table; a list of tables when the input was a
                    batch; serialized when the input was serialized.

                Raises:
                    RuntimeError: If ``group`` is truthy but the (merged)
                        input is not a ``GroupbyTable``.
                """
                is_batch = isinstance(inp, list)

                # Decode serialized payloads up front and remember to
                # re-encode the result on the way out.
                if is_batch:
                    was_serialized = type(inp[0]) is bytes
                    if was_serialized:
                        inp = [deserialize(tbl) for tbl in inp]
                else:
                    was_serialized = type(inp) is bytes
                    if was_serialized:
                        inp = deserialize(inp)

                if is_batch:
                    # With batching on, the scalar arguments are repeated once
                    # per request unless this operator was fused into a multi
                    # operator (a whole-flow operator never gets lists), so
                    # unwrap them only when they really are lists.
                    if type(group) is list:
                        group = group[0]
                    if type(fn) is list:
                        fn = fn[0]

                    inp, mappings = merge_tables(inp)

                if group:
                    if not isinstance(inp, GroupbyTable):
                        raise RuntimeError(
                            "Can't run a group filter over a non-grouped" +
                            " table.")

                    # The predicate is evaluated on the first row yielded by
                    # each group's table; a passing group is kept whole.
                    result = GroupbyTable(inp.schema, inp.col)
                    for key, members in inp.get():
                        first_row = next(members.get())
                        if fn(self, first_row):
                            result.add_group(key, members)
                else:
                    result = Table(inp.schema)
                    for candidate in inp.get():
                        if fn(self, candidate):
                            result.insert(candidate)

                if not is_batch:
                    return serialize(result) if was_serialized else result

                # Split the merged result back into one table per request.
                tables = demux_tables(result, mappings)
                if was_serialized:
                    tables = [serialize(tbl) for tbl in tables]
                return tables
Exemple #2
0
            def run(self, cloudburst, final, *inp):
                """Run every fused operator in ``self.logics`` in sequence.

                The output of each operator is fed to the next one. Serialized
                (``bytes``) inputs are decoded before the first operator runs
                and the final result is re-encoded the same way.

                Args:
                    cloudburst: Client handle. Passed as the first argument to
                        each operator, or attached to the operator graph when
                        ``self.whole`` is set.
                    final: Only unwrapped (``final[0]``) in batch mode; not
                        otherwise read by this method.
                    *inp: Usually a single table (or list of tables when
                        batching). More than one positional input is received
                        in the lookup case, where the second element may be a
                        serialized table.

                Returns:
                    The last operator's output: a list of tables in batch
                    mode, serialized when the input was serialized (a tuple
                    result is returned as-is in the non-batch case).

                Note:
                    ``ops`` is not defined in this method; it is resolved from
                    an enclosing scope.
                """
                # inp is a tuple because we might take in multiple things for a
                # lookup situation.
                if len(inp) == 1:
                    inp = inp[0]

                prev = inp  # Either a Table or a list of Tables.
                if isinstance(inp, bytes):
                    print('Received a non-batched serialized input.')

                # If the input is a list of Tables, then batching is enabled.
                batching = all(op.batching for op in ops)
                serialized = False
                if batching:
                    if isinstance(prev[0], bytes):
                        serialized = True
                        prev = [deserialize(tbl) for tbl in prev]

                    prev, mappings = merge_tables(prev)

                    # This will all be repeated because of the way Cloudburst's
                    # batching works, so we just pick the first one.
                    final = final[0]
                elif isinstance(prev, bytes):
                    serialized = True
                    prev = deserialize(prev)
                elif (type(prev) is tuple and len(prev) > 1
                        and isinstance(prev[1], bytes)):
                    # NOTE: We currently don't support inputs from
                    # LookupHelperOperators with batching enabled.
                    #
                    # The decoded table must be written back into `prev`
                    # (which is what the operators below consume) and the real
                    # `serialized` flag must be set so the result is
                    # re-encoded on the way out.
                    serialized = True
                    prev = prev[:1] + (deserialize(prev[1]), ) + prev[2:]

                for i, logic in enumerate(self.logics):
                    if self.whole:
                        # Populate this once for instantiation: hand the
                        # client to every operator reachable downstream.
                        if logic.cloudburst is None:
                            pending = [logic]
                            while pending:
                                op = pending.pop()
                                op.cloudburst = cloudburst
                                pending.extend(op.downstreams)

                        # prev will never be a tuple with whole because there
                        # will never be a lookup. See the comment at the top
                        # of this function for why inp might be a tuple.
                        args = self.exec_args[i] + (prev, )
                    elif type(prev) is not tuple:
                        args = (cloudburst, ) + self.exec_args[i] + (prev, )
                    else:
                        # A tuple carries several positional inputs; splat it.
                        args = (cloudburst, ) + self.exec_args[i] + prev

                    prev = logic.run(*args)

                if self.whole:
                    # In whole mode the output is read from the last operator.
                    prev = logic.results()[0]

                if batching:
                    merged = prev[0] if type(prev) is tuple else prev
                    prev = demux_tables(merged, mappings)

                    if serialized:
                        prev = [serialize(tbl) for tbl in prev]
                elif serialized and not isinstance(prev, tuple):
                    prev = serialize(prev)

                return prev
Exemple #3
0
            def run(self, _, on, how, left, right):
                """Join ``left`` and ``right`` on the column ``on``.

                Rows are paired whenever ``lrow[on] == rrow[on]``. With
                ``how == 'left'`` or ``how == 'outer'`` an unmatched left row
                is emitted padded with ``None`` on the right; with
                ``how == 'outer'`` unmatched right rows are also emitted,
                padded with ``None`` on the left. Any other ``how`` value
                yields only the matched pairs.

                Args:
                    _: Unused.
                    on: Column key used for the equality test.
                    how: Join type; ``'left'`` and ``'outer'`` are recognized.
                    left: Table, serialized table, or list of tables (batch).
                    right: Same form as ``left``; when ``left`` is ``bytes``
                        it is deserialized too.

                Returns:
                    The joined table; serialized if the inputs were
                    serialized; a list of tables if the inputs were batches.
                """
                serialized = False
                if isinstance(left, bytes):
                    left = deserialize(left)
                    right = deserialize(right)
                    serialized = True

                # Note: We currently don't support batching with custom
                # serialization for joins. Shouldn't be hard to implement but
                # skipping it for expediency.
                batching = False
                if isinstance(left, list):
                    batching = True
                    # merge_tables is unpacked as (table, mappings) at every
                    # other call site in this file; the mappings derived from
                    # the right-hand batch are the ones used to demux.
                    left, _left_mappings = merge_tables(left)
                    right, mappings = merge_tables(right)

                new_schema = merge_schema(left.schema, right.schema)
                result = Table(new_schema)
                ljoin = how == 'left'
                ojoin = how == 'outer'

                # Positions of the right rows that matched at least one left
                # row, so the outer join can emit the remaining ones.
                matched_right = set()

                for lrow in left.get():
                    lrow_inserted = False

                    # enumerate() advances the position for every right row,
                    # not only for the ones that match.
                    for idx, rrow in enumerate(right.get()):
                        if lrow[on] == rrow[on]:
                            result.insert(merge_row(lrow, rrow, new_schema))
                            lrow_inserted = True
                            matched_right.add(idx)

                    if not lrow_inserted and (ljoin or ojoin):
                        # Pad the missing right side with None values.
                        rvals = [None] * len(right.schema)
                        rrow = Row(right.schema, rvals, lrow[Row.qid_key])
                        result.insert(merge_row(lrow, rrow, new_schema))

                if ojoin:
                    for idx, rrow in enumerate(right.get()):
                        if idx not in matched_right:
                            # Pad the missing left side with None values.
                            lvals = [None] * len(left.schema)
                            lrow = Row(left.schema, lvals, rrow[Row.qid_key])
                            result.insert(merge_row(lrow, rrow, new_schema))

                if serialized:
                    result = serialize(result)

                if batching:
                    result = demux_tables(result, mappings)

                return result
Exemple #4
0
            def run(self, cloudburst, fn, fntype, col, names, inp):
                """Apply ``fn`` to ``inp`` and return the mapped table.

                Args:
                    cloudburst: Client handle; used here only to ``put`` the
                        result when ``self.send_broadcast`` is set.
                    fn: The mapped function. Called as ``fn(self, table)``
                        when ``self.batching`` or ``self.multi`` is set,
                        otherwise as ``fn(self, row)`` (``col is None``) or
                        ``fn(self, row[col])``.
                    fntype: Object whose ``ret`` sequence gives the output
                        column types.
                    col: Name of the single column to map, or ``None`` to map
                        whole rows.
                    names: Output column names; may be empty.
                    inp: Table, ``GroupbyTable``, serialized table, or a list
                        of tables / serialized tables when batching.

                Returns:
                    The mapped table (a list of tables when batching,
                    serialized when the input was serialized), or the key
                    under which the result was stored when
                    ``self.send_broadcast`` is set.
                """
                # Merge all of the tables.
                serialized = False
                batching = self.batching and isinstance(inp, list)
                if batching:
                    if isinstance(inp[0], bytes):
                        inp = [deserialize(tbl) for tbl in inp]
                        serialized = True

                    # inp will be a list of Tables. If it is not, this is part
                    # of a MultiOperator, and everything is taken care of for
                    # us.
                    inp, mappings = merge_tables(inp)

                    # This will all be repeated because of the way Cloudburst's
                    # batching works, so we just pick the first one. But we
                    # check because even with batching enabled, in a multi
                    # operator, we will not have to deal with this.
                    if isinstance(fn, list):
                        fn = fn[0]
                    if isinstance(fntype, list):
                        fntype = fntype[0]
                    if isinstance(col, list):
                        col = col[0]
                    # Guard against an empty names list before peeking at its
                    # first element.
                    if (isinstance(names, list) and names
                            and isinstance(names[0], list)):
                        names = names[0]
                elif isinstance(inp, bytes):
                    inp = deserialize(inp)
                    serialized = True

                # Build the output schema.
                if col is None:
                    if names:
                        schema = list(zip(names, fntype.ret))
                    else:
                        # No names given: label the columns by position.
                        schema = [(str(i), tp)
                                  for i, tp in enumerate(fntype.ret)]
                else:
                    # Only the mapped column changes its type (and name, when
                    # one was supplied); every other column is carried over.
                    schema = []
                    for name, tp in inp.schema:
                        if name != col:
                            schema.append((name, tp))
                        elif names:
                            schema.append((names[0], fntype.ret[0]))
                        else:
                            schema.append((name, fntype.ret[0]))

                def _as_values(val):
                    """Normalize one function output into a list of values."""
                    if type(val) is tuple:
                        return list(val)
                    if type(val) is not list:
                        return [val]
                    return val

                def _map_table(table):
                    """Apply fn to one plain table and return the new Table."""
                    out = Table(schema)

                    if self.batching or self.multi:
                        # fn consumes the whole table and yields output rows.
                        for val in fn(self, table):
                            out.insert(_as_values(val))
                        return out

                    for row in table.get():
                        if col is None:
                            vals = _as_values(fn(self, row))
                            out.insert(vals, row[Row.qid_key])
                        else:
                            val = fn(self, row[col])
                            new_vals = [
                                val if name == col else row[name]
                                for name, _ in table.schema
                            ]
                            out.insert(new_vals, row[Row.qid_key])
                    return out

                if isinstance(inp, GroupbyTable):
                    # Map each group's table on its own; going through the
                    # helper (rather than re-entering run) keeps the
                    # serialization and broadcast steps below from running
                    # once per group.
                    result = GroupbyTable(schema, inp.col)
                    for group, gtable in inp.get():
                        result.add_group(group, _map_table(gtable))
                else:
                    result = _map_table(inp)

                if batching:  # Unmerge all the tables.
                    result = demux_tables(result, mappings)
                    if serialized:
                        result = [serialize(tbl) for tbl in result]
                elif serialized:
                    result = serialize(result)

                if self.send_broadcast:
                    # Store the result under a fresh key and return the key
                    # in its place.
                    import uuid
                    uid = str(uuid.uuid4())
                    cloudburst.put(uid, result)
                    result = uid

                return result