def reduce(self, _table, func):
    """Reduce every value of *_table* into a single result using *func*.

    Each partition is reduced remotely by its processor stub; the partial
    results are then deserialized locally and folded together with *func*.

    :param _table: table handle; `_table.partition` gives the partition count
    :param func: binary combining function, applied pairwise to partial results
    :return: the combined value, or None when no partition produced a value
    """
    func_id, func_bytes = self.serialize_and_hash_func(func)
    results = []
    for partition in range(_table.partition):
        operand = EggRoll.__get_storage_locator(_table, partition)
        # Partitions are assigned to processors round-robin.
        proc_id = partition % len(self.proc_list)
        channel, stub = self.proc_list[proc_id]
        unary_p = processor_pb2.UnaryProcess(
            operand=operand,
            info=processor_pb2.TaskInfo(task_id=self.job_id,
                                        function_id=func_id,
                                        function_bytes=func_bytes))
        # extend() instead of repeated list concatenation (avoids O(n^2) copying)
        results.extend(stub.reduce(unary_p))
    # Deserialize non-empty payloads, then drop any that decode to None.
    rs = [self._serdes.deserialize(val.value)
          for val in results if len(val.value) > 0]
    rs = [r for r in rs if r is not None]
    # BUGFIX: guard on the filtered list `rs`, not on `results` — otherwise
    # rs[0] below raises IndexError when every partition returned an empty
    # or None value even though the RPCs themselves responded.
    if not rs:
        return None
    rtn = rs[0]
    for r in rs[1:]:
        rtn = func(rtn, r)
    return rtn
def glom(self, _table: _DTable):
    """Run a remote glom over *_table* and wrap the result in a new table.

    A fresh function id is generated since glom carries no user function.
    """
    task = processor_pb2.TaskInfo(task_id=self.job_id,
                                  function_id=str(uuid.uuid1()))
    locator = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                               type=_table._type,
                                               name=_table._name)
    request = processor_pb2.UnaryProcess(operand=locator, info=task)
    dest = self.proc_stub.glom(request)
    return self._create_table_from_locator(dest, _table._partitions)
def map_partitions(self, _table: _DTable, func):
    """Apply *func* to whole partitions of *_table* via the processor stub.

    The function is serialized once and shipped with the task; the RPC
    returns the destination storage locator of the result table.
    """
    func_id, func_bytes = self.serialize_and_hash_func(func)
    locator = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                               type=_table._type,
                                               name=_table._name)
    request = processor_pb2.UnaryProcess(
        operand=locator,
        info=processor_pb2.TaskInfo(task_id=self.job_id,
                                    function_id=func_id,
                                    function_bytes=func_bytes))
    dest = self.proc_stub.mapPartitions(request)
    return self._create_table_from_locator(dest, _table._partitions)
def __create_unary_process(self, table: _DTable, func):
    """Build a UnaryProcess message for *table* and *func*.

    Bundles the storage locator, the task info (honoring the table's
    in-place-computing flag) and the current naming policy into one request.
    """
    return processor_pb2.UnaryProcess(
        info=self.__create_task_info(
            func=func,
            is_in_place_computing=table.get_in_place_computing()),
        operand=self.__create_storage_locator_from_dtable(table),
        conf=processor_pb2.ProcessConf(
            namingPolicy=self.eggroll_context.get_naming_policy().name))
def sample(self, _table: _DTable, fraction, seed):
    """Draw a random sample of *_table* remotely.

    :param fraction: sampling fraction, must lie in [0, 1]
    :param seed: RNG seed forwarded to the remote sampler
    :raises ValueError: if *fraction* is outside [0, 1]
    """
    if fraction > 1 or fraction < 0:
        raise ValueError("fraction must be in [0, 1]")
    # (fraction, seed) travels as the "function bytes" of the task.
    payload = self.value_serdes.serialize((fraction, seed))
    task = processor_pb2.TaskInfo(task_id=self.job_id,
                                  function_id=str(uuid.uuid1()),
                                  function_bytes=payload)
    locator = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                               type=_table._type,
                                               name=_table._name)
    dest = self.proc_stub.sample(
        processor_pb2.UnaryProcess(operand=locator, info=task))
    return self._create_table_from_locator(dest, _table._partitions)
def glom(self, _table):
    """Run glom over every partition of *_table* asynchronously.

    Fires one future per partition (round-robin across processors), waits
    for them all, then builds the result table from the last reply's locator.
    """
    func_id = str(uuid.uuid1())
    futures = []
    for part in range(_table.partition):
        locator = EggRoll.__get_storage_locator(_table, part)
        request = processor_pb2.UnaryProcess(
            operand=locator,
            info=processor_pb2.TaskInfo(task_id=self.job_id,
                                        function_id=func_id))
        _, stub = self.proc_list[part % len(self.proc_list)]
        futures.append(stub.glom.future(request))
    # Block until every partition has completed; keep the final locator.
    for fut in futures:
        result = fut.result()
    return _DTable(self, result.type, result.namespace, result.name,
                   _table.partition)
def mapValues(self, _table, func):
    """Apply *func* to each value of *_table*, partition by partition.

    Dispatches one async mapValues call per partition, waits for all of
    them, and wraps the last reply's locator in a new _DTable.
    """
    func_id, func_bytes = self.serialize_and_hash_func(func)
    futures = []
    for part in range(_table.partition):
        locator = EggRoll.__get_storage_locator(_table, part)
        request = processor_pb2.UnaryProcess(
            operand=locator,
            info=processor_pb2.TaskInfo(task_id=self.job_id,
                                        function_id=func_id,
                                        function_bytes=func_bytes))
        _, stub = self.proc_list[part % len(self.proc_list)]
        futures.append(stub.mapValues.future(request))
    # Wait for every partition; the last result supplies the destination.
    for fut in futures:
        result = fut.result()
    return _DTable(self, result.type, result.namespace, result.name,
                   _table.partition)
def sample(self, _table, fraction, seed):
    """Sample every partition of *_table* remotely and asynchronously.

    :param fraction: sampling fraction, must lie in [0, 1]
    :param seed: RNG seed forwarded with the task
    :raises ValueError: if *fraction* is outside [0, 1]
    """
    if fraction > 1 or fraction < 0:
        raise ValueError("fraction must be in [0, 1]")
    # (fraction, seed) is shipped as the task's function bytes.
    payload = self._serdes.serialize((fraction, seed))
    func_id = str(uuid.uuid1())
    futures = []
    for part in range(_table.partition):
        locator = EggRoll.__get_storage_locator(_table, part)
        request = processor_pb2.UnaryProcess(
            operand=locator,
            info=processor_pb2.TaskInfo(task_id=self.job_id,
                                        function_id=func_id,
                                        function_bytes=payload))
        _, stub = self.proc_list[part % len(self.proc_list)]
        futures.append(stub.sample.future(request))
    # Join all futures; the final reply names the destination table.
    for fut in futures:
        result = fut.result()
    return _DTable(self, result.type, result.namespace, result.name,
                   _table.partition)
def reduce(self, _table: _DTable, func):
    """Reduce *_table* to a single value with the binary function *func*.

    The remote reduce yields one partial result per partition; these are
    deserialized, None entries are dropped, and the rest are folded left
    to right with *func*. Returns None when nothing remains.
    """
    func_id, func_bytes = self.serialize_and_hash_func(func)
    locator = storage_basic_pb2.StorageLocator(namespace=_table._namespace,
                                               type=_table._type,
                                               name=_table._name)
    request = processor_pb2.UnaryProcess(
        operand=locator,
        info=processor_pb2.TaskInfo(task_id=self.job_id,
                                    function_id=func_id,
                                    function_bytes=func_bytes))
    # Deserialize each partial result, discarding Nones.
    partials = [v for v in (_EggRoll._deserialize_operand(op)
                            for op in self.proc_stub.reduce(request))
                if v is not None]
    if not partials:
        return None
    # Left fold; a single element is returned unchanged without calling func.
    acc = partials[0]
    for item in partials[1:]:
        acc = func(acc, item)
    return acc