def reduce(self, _table, func):
    """Apply a binary reduce *func* across all partitions of *_table*.

    Each partition is reduced remotely by its processor; the partial
    results are deserialized locally and folded together with *func*.
    Returns None when no partition produced a usable (non-empty,
    non-None) value.
    """
    func_id, func_bytes = self.serialize_and_hash_func(func)
    results = []
    for partition in range(_table.partition):
        operand = EggRoll.__get_storage_locator(_table, partition)
        proc_id = partition % len(self.proc_list)
        channel, stub = self.proc_list[proc_id]
        unary_p = processor_pb2.UnaryProcess(
            operand=operand,
            info=processor_pb2.TaskInfo(task_id=self.job_id,
                                        function_id=func_id,
                                        function_bytes=func_bytes))
        # NOTE(review): unlike the sibling methods this dispatches
        # synchronously (no .future) — partitions run one after another.
        results.extend(stub.reduce(unary_p))
    # Deserialize non-empty payloads, then drop None partials.
    rs = [self._serdes.deserialize(val.value)
          for val in results if len(val.value) > 0]
    rs = [r for r in rs if r is not None]
    # Bug fix: guard on rs (the usable partials), not results.  The old
    # check `len(results) <= 0` let `rs[0]` raise IndexError whenever all
    # partitions returned empty or None values.
    if not rs:
        return None
    rtn = rs[0]
    for r in rs[1:]:
        rtn = func(rtn, r)
    return rtn
def __create_task_info(self, func, is_in_place_computing):
    """Build a TaskInfo message for *func*.

    When *func* is falsy a placeholder task is produced with a fresh
    UUID and a b'blank' payload instead of serialized function bytes.
    """
    if func:
        func_id, func_bytes = self.serialize_and_hash_func(func)
    else:
        func_id, func_bytes = str(uuid.uuid1()), b'blank'
    return processor_pb2.TaskInfo(task_id=self.session_id,
                                  function_id=func_id,
                                  function_bytes=func_bytes,
                                  isInPlaceComputing=is_in_place_computing)
def glom(self, _table):
    """Fan out a glom task to every partition and wait for completion.

    A single function id (UUID) tags the whole job; the result table
    descriptor returned by the last partition is used to build the
    output _DTable.
    """
    func_id = str(uuid.uuid1())
    futures = []
    for part in range(_table.partition):
        locator = EggRoll.__get_storage_locator(_table, part)
        request = processor_pb2.UnaryProcess(
            operand=locator,
            info=processor_pb2.TaskInfo(task_id=self.job_id,
                                        function_id=func_id))
        channel, stub = self.proc_list[part % len(self.proc_list)]
        futures.append(stub.glom.future(request))
    # Block until every partition finishes; keep the last descriptor.
    for fut in futures:
        result = fut.result()
    return _DTable(self, result.type, result.namespace, result.name,
                   _table.partition)
def mapValues(self, _table, func):
    """Dispatch a mapValues task for *func* to every partition.

    All partitions are started asynchronously via gRPC futures and then
    awaited; the descriptor from the last partition describes the
    resulting table.
    """
    func_id, func_bytes = self.serialize_and_hash_func(func)
    task = processor_pb2.TaskInfo(task_id=self.job_id,
                                  function_id=func_id,
                                  function_bytes=func_bytes)
    futures = []
    for part in range(_table.partition):
        locator = EggRoll.__get_storage_locator(_table, part)
        request = processor_pb2.UnaryProcess(operand=locator, info=task)
        channel, stub = self.proc_list[part % len(self.proc_list)]
        futures.append(stub.mapValues.future(request))
    # Wait for all partitions; the last result names the output table.
    for fut in futures:
        result = fut.result()
    return _DTable(self, result.type, result.namespace, result.name,
                   _table.partition)
def join(self, left, right, func):
    """Join *left* and *right* partition-by-partition using *func*.

    Assumes both tables share the same partitioning; partition i of the
    left table is joined with partition i of the right.  Returns a new
    _DTable built from the last partition's result descriptor.
    """
    func_id, func_bytes = self.serialize_and_hash_func(func)
    task = processor_pb2.TaskInfo(task_id=self.job_id,
                                  function_id=func_id,
                                  function_bytes=func_bytes)
    futures = []
    for part in range(left.partition):
        request = processor_pb2.BinaryProcess(
            left=EggRoll.__get_storage_locator(left, part),
            right=EggRoll.__get_storage_locator(right, part),
            info=task)
        channel, stub = self.proc_list[part % len(self.proc_list)]
        futures.append(stub.join.future(request))
    res = None
    # Wait for all partitions; the last result names the output table.
    for fut in futures:
        res = fut.result()
    return _DTable(self, res.type, res.namespace, res.name, left.partition)
def sample(self, _table, fraction, seed):
    """Sample each partition of *_table* with the given fraction/seed.

    The (fraction, seed) pair is serialized as the task payload instead
    of a function.  Raises ValueError for a fraction outside [0, 1].
    """
    if fraction < 0 or fraction > 1:
        raise ValueError("fraction must be in [0, 1]")
    func_id = str(uuid.uuid1())
    func_bytes = self._serdes.serialize((fraction, seed))
    task = processor_pb2.TaskInfo(task_id=self.job_id,
                                  function_id=func_id,
                                  function_bytes=func_bytes)
    futures = []
    for part in range(_table.partition):
        locator = EggRoll.__get_storage_locator(_table, part)
        request = processor_pb2.UnaryProcess(operand=locator, info=task)
        channel, stub = self.proc_list[part % len(self.proc_list)]
        futures.append(stub.sample.future(request))
    # Wait for all partitions; the last result names the output table.
    for fut in futures:
        result = fut.result()
    return _DTable(self, result.type, result.namespace, result.name,
                   _table.partition)