def run(self, _, fn, group, inp):
    batching = isinstance(inp, list)
    serialized = False
    if batching:
        if type(inp[0]) == bytes:
            serialized = True
            inp = [deserialize(tbl) for tbl in inp]
    else:
        if type(inp) == bytes:
            serialized = True
            inp = deserialize(inp)

    if batching:
        # Because batching is enabled by default, these arguments
        # arrive as lists unless they were merged into a multi
        # operator. We check because a whole flow operator will not
        # pass lists even when batching is enabled.
        if type(group) == list:
            group = group[0]
        if type(fn) == list:
            fn = fn[0]

        inp, mappings = merge_tables(inp)

    if group and not isinstance(inp, GroupbyTable):
        raise RuntimeError("Can't run a group filter over a non-grouped"
                           + " table.")

    if group:
        result = GroupbyTable(inp.schema, inp.col)
        for group, gtable in inp.get():
            # Apply the predicate to the first row of each group.
            if fn(self, next(gtable.get())):
                result.add_group(group, gtable)
    else:
        result = Table(inp.schema)
        for row in inp.get():
            if fn(self, row):
                result.insert(row)

    if batching:
        result = demux_tables(result, mappings)
        if serialized:
            result = [serialize(tbl) for tbl in result]
    else:
        if serialized:
            result = serialize(result)

    return result
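# Several of these operators (filter, map, multi) share the same
# merge-then-demux pattern when batching is enabled: the batched list of
# tables is merged into one table, the operator body runs once over the
# merged table, and the result is split back into per-request tables via
# the recorded mappings before optionally being re-serialized. The sketch
# below is an independent, plain-Python illustration of that pattern over
# lists of dicts; merge_batch, demux_batch, and the 'qid' tag are
# hypothetical stand-ins, not the library's merge_tables/demux_tables.
from typing import Any, Dict, List, Tuple

RowDict = Dict[str, Any]

def merge_batch(batch: List[List[RowDict]]) -> Tuple[List[RowDict], List[str]]:
    # Tag every row with the id of the request it came from, then
    # concatenate the per-request row lists into one merged list.
    merged, qids = [], []
    for i, rows in enumerate(batch):
        qid = str(i)
        qids.append(qid)
        for row in rows:
            merged.append(dict(row, qid=qid))
    return merged, qids

def demux_batch(merged: List[RowDict], qids: List[str]) -> List[List[RowDict]]:
    # Route each row back to its originating request by its tag.
    return [[r for r in merged if r['qid'] == qid] for qid in qids]

# One filter pass over the merged rows instead of one pass per request.
batch = [[{'x': 1}, {'x': 5}], [{'x': 7}]]
merged, qids = merge_batch(batch)
kept = [row for row in merged if row['x'] > 2]
print(demux_batch(kept, qids))  # [[{'x': 5, 'qid': '0'}], [{'x': 7, 'qid': '1'}]]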
def run(self, cloudburst, aggregate, column, inp):
    serialized = False
    if type(inp) == bytes:
        serialized = True
        inp = deserialize(inp)

    # Pick the aggregation helper that matches the requested aggregate.
    if aggregate == 'count':
        aggfn = self.count
    if aggregate == 'min':
        aggfn = self.min
    if aggregate == 'max':
        aggfn = self.max
    if aggregate == 'sum':
        aggfn = self.sum
    if aggregate == 'average':
        aggfn = self.average

    if isinstance(inp, GroupbyTable):
        gb_col = inp.col
        val, _ = next(inp.get())
        gb_typ = get_type(type(val))

        # One output row per group: the group key and the aggregate.
        result = Table([(gb_col, gb_typ), (aggregate, FloatType)])
        for val, tbl in inp.get():
            agg = aggfn(tbl, column)
            result.insert([val, float(agg)])
    else:
        result = Table([(aggregate, FloatType)])
        result.insert([float(aggfn(inp, column))])

    if serialized:
        result = serialize(result)

    return result
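# The chain of `if aggregate == ...` checks above binds an aggregate name
# to a helper method with signature aggfn(table, column). A dict-based
# dispatch is one way to tighten that and fail loudly on an unknown name.
# A minimal sketch under those assumptions; AggSketch and its helpers are
# hypothetical stand-ins operating on lists of dicts, not the operator's
# count/min/max/sum/average implementations.
class AggSketch:
    def count(self, rows, column):
        return len(rows)

    def sum(self, rows, column):
        return sum(row[column] for row in rows)

    def average(self, rows, column):
        return self.sum(rows, column) / self.count(rows, column)

    def dispatch(self, aggregate):
        # Table-driven lookup instead of a chain of if statements;
        # min/max would be registered the same way.
        table = {'count': self.count, 'sum': self.sum,
                 'average': self.average}
        try:
            return table[aggregate]
        except KeyError:
            raise ValueError('Unknown aggregate: ' + aggregate)

rows = [{'pop': 10.0}, {'pop': 30.0}]
aggfn = AggSketch().dispatch('average')
print(aggfn(rows, 'pop'))  # 20.0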
def run(self, cloudburst, lookup_key, dynamic: bool, input_object, inp: Table):
    from flow.types.basic import get_type

    serialized = False
    if type(inp) == bytes:
        inp = deserialize(inp)
        serialized = True

    if cloudburst is None or dynamic:
        obj = input_object
        lookup_key = next(inp.get())[lookup_key]
    else:
        obj = cloudburst.get(lookup_key)

    schema = list(inp.schema)
    schema.append((lookup_key, get_type(type(obj))))

    new_table = Table(schema)
    for row in inp.get():
        vals = [row[key] for key, _ in inp.schema]
        vals.append(obj)
        new_table.insert(vals)

    if serialized:
        new_table = serialize(new_table)

    return new_table
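# The lookup above fetches a single object, either from the key-value
# store via cloudburst.get(lookup_key) or from input_object when the key
# is dynamic, and appends it as one new column to every row. A dict-based
# sketch of that widening step; append_lookup_column and its arguments
# are illustrative names only, not the operator's API.
def append_lookup_column(rows, column_name, obj):
    # Every output row carries all original columns plus the object.
    return [dict(row, **{column_name: obj}) for row in rows]

rows = [{'user': 'a'}, {'user': 'b'}]
print(append_lookup_column(rows, 'model', 'weights-v1'))
# [{'user': 'a', 'model': 'weights-v1'}, {'user': 'b', 'model': 'weights-v1'}]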
def run(self, _, col: str, inp: Table):
    serialized = False
    if type(inp) == bytes:
        serialized = True
        inp = deserialize(inp)

    gb_table = GroupbyTable(inp.schema, col)
    for row in inp.get():
        gb_table.add_row(row)

    if serialized:
        gb_table = serialize(gb_table)

    return gb_table
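# The groupby above and the aggregate operator earlier form a
# groupby-then-aggregate pipeline: grouping buckets rows by one column,
# and the aggregate then emits one row per group holding the group key
# and a float result, matching the [(gb_col, gb_typ), (aggregate,
# FloatType)] schema. The plain-Python sketch below mirrors that shape
# with dicts; it illustrates the semantics, not the GroupbyTable class.
from collections import defaultdict

rows = [
    {'city': 'sf', 'pop': 1.0},
    {'city': 'sf', 'pop': 3.0},
    {'city': 'la', 'pop': 4.0},
]

# Grouping: bucket rows by the groupby column, as add_row does per row.
groups = defaultdict(list)
for row in rows:
    groups[row['city']].append(row)

# Aggregation: one output row per group, (group key, float aggregate).
result = [
    {'city': city, 'sum': float(sum(r['pop'] for r in grouped))}
    for city, grouped in groups.items()
]
print(result)  # [{'city': 'sf', 'sum': 4.0}, {'city': 'la', 'sum': 4.0}]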
def run(self, _, on, how, left, right):
    serialized = False
    if type(left) == bytes:
        left = deserialize(left)
        right = deserialize(right)
        serialized = True

    # Note: We currently don't support batching with custom
    # serialization for joins. Shouldn't be hard to implement, but we
    # are skipping it for expediency.
    batching = False
    if type(left) == list:
        batching = True
        left, _ = merge_tables(left)
        right, mappings = merge_tables(right)

    new_schema = merge_schema(left.schema, right.schema)
    result = Table(new_schema)

    ljoin = (how == 'left')
    ojoin = (how == 'outer')

    # Track whether each right row has been matched, for outer joins.
    rindex_map = {}

    for lrow in left.get():
        lrow_inserted = False

        idx = 0
        for rrow in right.get():
            if lrow[on] == rrow[on]:
                new_row = merge_row(lrow, rrow, new_schema)
                result.insert(new_row)

                lrow_inserted = True
                rindex_map[idx] = True
            idx += 1

        if not lrow_inserted and (ljoin or ojoin):
            rvals = [None] * len(right.schema)
            rrow = Row(right.schema, rvals, lrow[Row.qid_key])
            new_row = merge_row(lrow, rrow, new_schema)
            result.insert(new_row)

    if ojoin:
        idx = 0
        for row in right.get():
            if idx not in rindex_map:
                lvals = [None] * len(left.schema)
                lrow = Row(left.schema, lvals, row[Row.qid_key])
                new_row = merge_row(lrow, row, new_schema)
                result.insert(new_row)
            idx += 1

    if serialized:
        result = serialize(result)

    if batching:
        result = demux_tables(result, mappings)

    return result
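# The join above is a nested-loop match on the `on` column: unmatched
# left rows are padded with None for left and outer joins, and an index
# map over the right rows (rindex_map above) lets unmatched right rows be
# padded and emitted for outer joins. A minimal plain-Python sketch of
# that matching logic over lists of dicts; nested_loop_join is an
# illustration, not the operator itself.
def nested_loop_join(left, right, on, how='inner'):
    result, matched_right = [], set()
    for lrow in left:
        hit = False
        for idx, rrow in enumerate(right):
            if lrow[on] == rrow[on]:
                result.append({**lrow, **rrow})
                hit = True
                matched_right.add(idx)
        if not hit and how in ('left', 'outer'):
            # Pad the right-side columns with None for unmatched left rows.
            result.append({**lrow, **{k: None for k in right[0] if k != on}})
    if how == 'outer':
        for idx, rrow in enumerate(right):
            if idx not in matched_right:
                # Pad the left-side columns with None for unmatched right rows.
                result.append({**{k: None for k in left[0] if k != on}, **rrow})
    return result

left = [{'id': 1, 'a': 'x'}, {'id': 2, 'a': 'y'}]
right = [{'id': 1, 'b': 'u'}, {'id': 3, 'b': 'v'}]
print(nested_loop_join(left, right, 'id', how='outer'))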
def run(self, cloudburst, final, *inp):
    # inp is a tuple because we might take in multiple things for a
    # lookup situation.
    if len(inp) == 1:
        inp = inp[0]

    prev = inp

    # inp should either be a Table or a list of Tables.
    if type(inp) == bytes:
        print('Received a non-batched serialized input.')

    # If the input is a list of Tables, then batching is enabled.
    batching = all([op.batching for op in self.ops])
    serialized = False

    if batching:
        if type(prev[0]) == bytes:
            serialized = True
            prev = [deserialize(tbl) for tbl in prev]

        prev, mappings = merge_tables(prev)

        # This will all be repeated because of the way Cloudburst's
        # batching works, so we just pick the first one.
        final = final[0]
    else:
        if type(prev) == bytes:
            serialized = True
            prev = deserialize(prev)

        # NOTE: We currently don't support inputs from
        # LookupHelperOperators with batching enabled.
        if type(inp) == tuple:
            if type(inp[1]) == bytes:
                serialized = True
                inp = (inp[0], deserialize(inp[1]))

    for i in range(len(self.logics)):
        logic = self.logics[i]

        if self.whole:
            # Populate the Cloudburst client once for instantiation.
            if logic.cloudburst is None:
                queue = [logic]
                while len(queue) > 0:
                    op = queue.pop(0)
                    op.cloudburst = cloudburst
                    queue.extend(op.downstreams)

            # prev will never be a tuple with whole because there will
            # never be a lookup. See the comment at the top of this
            # function for why inp might be a tuple.
            args = self.exec_args[i] + (prev,)
        else:
            if type(prev) != tuple:
                args = (cloudburst,) + self.exec_args[i] + (prev,)
            else:
                args = (cloudburst,) + self.exec_args[i] + prev

        prev = logic.run(*args)

        if self.whole:
            prev = logic.results()[0]

    if batching:
        if type(prev) == tuple:
            prev = demux_tables(prev[0], mappings)
        else:
            prev = demux_tables(prev, mappings)

        if serialized:
            prev = [serialize(tbl) for tbl in prev]
    else:
        if serialized and not isinstance(prev, tuple):
            prev = serialize(prev)

    return prev
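# The multi operator above fuses a chain of operator logics into a single
# invocation: each logic's output table becomes the next logic's input,
# with deserialization, merging, demuxing, and re-serialization handled
# once at the boundaries instead of once per operator. A minimal sketch
# of that chaining, assuming each stage is simply a callable from rows to
# rows; run_fused and the lambda stages are illustrative, not the
# MultiOperator itself.
def run_fused(stages, rows):
    # Thread the output of each stage into the next.
    prev = rows
    for stage in stages:
        prev = stage(prev)
    return prev

stages = [
    lambda rows: [dict(r, x2=r['x'] * 2) for r in rows],  # map
    lambda rows: [r for r in rows if r['x2'] > 2],         # filter
]
print(run_fused(stages, [{'x': 1}, {'x': 3}]))  # [{'x': 3, 'x2': 6}]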
def run(self, cloudburst, fn, fntype, col, names, inp):
    # Merge all of the tables.
    serialized = False
    batching = self.batching and isinstance(inp, list)

    if batching:
        if type(inp[0]) == bytes:
            inp = [deserialize(tbl) for tbl in inp]
            serialized = True

        # inp will be a list of Tables. If it is not, this is part of
        # a MultiOperator, and everything is taken care of for us.
        merged, mappings = merge_tables(inp)
        inp = merged

        # These will all be repeated because of the way Cloudburst's
        # batching works, so we just pick the first one. We check
        # because in a multi operator we will not have to deal with
        # this, even with batching enabled.
        if type(fn) == list:
            fn = fn[0]
        if type(fntype) == list:
            fntype = fntype[0]
        if type(col) == list:
            col = col[0]
        if type(names) == list and type(names[0]) == list:
            names = names[0]
    else:
        if type(inp) == bytes:
            inp = deserialize(inp)
            serialized = True

    # Compute the output schema: either fn replaces the whole row
    # (col is None) or it replaces the single column named col.
    schema = []
    if col is None:
        if len(names) != 0:
            schema = list(zip(names, fntype.ret))
        else:
            for i in range(len(fntype.ret)):
                schema.append((str(i), fntype.ret[i]))
    else:
        for name, tp in inp.schema:
            if name != col:
                schema.append((name, tp))
            else:
                if len(names) != 0:
                    schema.append((names[0], fntype.ret[0]))
                else:
                    schema.append((name, fntype.ret[0]))

    if isinstance(inp, GroupbyTable):
        result = GroupbyTable(schema, inp.col)
        for group, gtable in inp.get():
            result.add_group(group, self.run(cloudburst, fn, fntype, col,
                                             names, gtable))
    else:
        result = Table(schema)
        if self.batching or self.multi:
            res = fn(self, inp)
            for val in res:
                if type(val) == tuple:
                    val = list(val)
                elif type(val) != list:
                    val = [val]

                result.insert(val)
        else:
            for row in inp.get():
                if col is None:
                    vals = fn(self, row)
                    if type(vals) == tuple:
                        vals = list(vals)
                    elif type(vals) != list:
                        vals = [vals]

                    result.insert(vals, row[Row.qid_key])
                else:
                    val = fn(self, row[col])
                    new_vals = []
                    for name, _ in inp.schema:
                        if name == col:
                            new_vals.append(val)
                        else:
                            new_vals.append(row[name])

                    result.insert(new_vals, row[Row.qid_key])

    if batching:
        # Unmerge all of the tables.
        tables = demux_tables(result, mappings)
        result = tables

        if serialized:
            result = [serialize(tbl) for tbl in result]
    else:
        if serialized:
            result = serialize(result)

    if self.send_broadcast:
        import uuid
        uid = str(uuid.uuid4())

        cloudburst.put(uid, result)
        result = uid

    return result
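# The map above runs in one of two per-row modes: with col set, fn is
# applied to that one cell and the other columns are carried through;
# with col of None, fn sees the whole row and returns one value per
# declared output column (fntype.ret / names). A plain-Python sketch of
# both modes over dicts; map_column and map_rows are illustrative
# helpers, not the operator.
def map_column(rows, col, fn):
    # col is set: replace a single cell, keep everything else.
    return [dict(row, **{col: fn(row[col])}) for row in rows]

def map_rows(rows, names, fn):
    # col is None: fn returns one value per output column in names.
    out = []
    for row in rows:
        vals = fn(row)
        vals = list(vals) if isinstance(vals, (tuple, list)) else [vals]
        out.append(dict(zip(names, vals)))
    return out

rows = [{'x': 1, 'y': 2}, {'x': 3, 'y': 4}]
print(map_column(rows, 'x', lambda v: v * 10))
print(map_rows(rows, ['sum', 'diff'], lambda r: (r['x'] + r['y'], r['x'] - r['y'])))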