Beispiel #1
0
 def reduce(self, _table, func):
     """Fold the values of ``_table`` into a single value using ``func``.

     One ``reduce`` request is sent per partition; partition ``i`` goes to
     the stub at index ``i % len(self.proc_list)``.  Every non-empty
     response value is deserialized with ``self._serdes`` and the decoded
     values are then combined locally, in arrival order, with ``func``.

     :param _table: table to reduce; ``_table.partition`` is the number of
         partitions to query.
     :param func: binary callable ``func(acc, value)`` returning the
         combined value.
     :return: the combined value, or ``None`` when no partition yields a
         usable (non-empty, non-``None``) value.
     """
     func_id, func_bytes = self.serialize_and_hash_func(func)
     results = []
     for partition in range(_table.partition):
         operand = EggRoll.__get_storage_locator(_table, partition)
         # Only the stub of the (channel, stub) pair is needed here.
         _, stub = self.proc_list[partition % len(self.proc_list)]
         unary_p = processor_pb2.UnaryProcess(
             operand=operand,
             info=processor_pb2.TaskInfo(task_id=self.job_id,
                                         function_id=func_id,
                                         function_bytes=func_bytes))
         # extend() avoids re-copying the accumulated list on every partition.
         results.extend(stub.reduce(unary_p))

     # Skip empty payloads and values that decode to None.
     rs = []
     for val in results:
         if len(val.value) > 0:
             decoded = self._serdes.deserialize(val.value)
             if decoded is not None:
                 rs.append(decoded)

     # Guard on the decoded values, not the raw responses: responses may
     # exist while none of them carries a usable value, and indexing rs[0]
     # would then raise IndexError.
     if not rs:
         return None

     rtn = rs[0]
     for r in rs[1:]:
         rtn = func(rtn, r)
     return rtn
Beispiel #2
0
    def glom(self, _table: _DTable):
        """Invoke ``glom`` on the processor stub for ``_table``.

        A fresh UUID1 string is used as the function id (no function bytes
        are attached).  The stub's response and the input table's
        ``_partitions`` are handed to ``_create_table_from_locator``, whose
        result is returned.
        """
        request_id = str(uuid.uuid1())
        locator = storage_basic_pb2.StorageLocator(
            namespace=_table._namespace,
            type=_table._type,
            name=_table._name,
        )
        task_info = processor_pb2.TaskInfo(task_id=self.job_id, function_id=request_id)
        request = processor_pb2.UnaryProcess(operand=locator, info=task_info)

        response = self.proc_stub.glom(request)
        return self._create_table_from_locator(response, _table._partitions)
Beispiel #3
0
 def map_partitions(self, _table: _DTable, func):
     """Invoke ``mapPartitions`` on the processor stub with ``func``.

     ``func`` is serialized and hashed via ``serialize_and_hash_func``; the
     resulting id and bytes travel in the request's ``TaskInfo``.  The
     stub's response and the input table's ``_partitions`` are handed to
     ``_create_table_from_locator``, whose result is returned.
     """
     fn_id, fn_payload = self.serialize_and_hash_func(func)
     locator = storage_basic_pb2.StorageLocator(
         namespace=_table._namespace,
         type=_table._type,
         name=_table._name,
     )
     task_info = processor_pb2.TaskInfo(
         task_id=self.job_id,
         function_id=fn_id,
         function_bytes=fn_payload,
     )
     request = processor_pb2.UnaryProcess(operand=locator, info=task_info)

     response = self.proc_stub.mapPartitions(request)
     return self._create_table_from_locator(response, _table._partitions)
Beispiel #4
0
    def __create_unary_process(self, table: _DTable, func):
        """Assemble the ``UnaryProcess`` message for ``func`` applied to ``table``.

        The message carries a storage locator built from ``table``, a task
        info built from ``func`` and the table's in-place-computing flag,
        and a ``ProcessConf`` whose ``namingPolicy`` is the name of the
        context's current naming policy.
        """
        locator = self.__create_storage_locator_from_dtable(table)
        in_place = table.get_in_place_computing()
        info = self.__create_task_info(func=func, is_in_place_computing=in_place)

        policy_name = self.eggroll_context.get_naming_policy().name
        process_conf = processor_pb2.ProcessConf(namingPolicy=policy_name)

        return processor_pb2.UnaryProcess(info=info, operand=locator, conf=process_conf)
Beispiel #5
0
    def sample(self, _table: _DTable, fraction, seed):
        """Invoke ``sample`` on the processor stub for ``_table``.

        The ``(fraction, seed)`` pair is serialized with ``value_serdes``
        and sent in the ``function_bytes`` slot of the request, under a
        fresh UUID1 function id.  The stub's response and the input table's
        ``_partitions`` are handed to ``_create_table_from_locator``, whose
        result is returned.

        :raises ValueError: if ``fraction`` is below 0 or above 1.
        """
        if fraction < 0 or fraction > 1:
            raise ValueError("fraction must be in [0, 1]")

        sampling_args = self.value_serdes.serialize((fraction, seed))
        request_id = str(uuid.uuid1())
        locator = storage_basic_pb2.StorageLocator(
            namespace=_table._namespace,
            type=_table._type,
            name=_table._name,
        )
        task_info = processor_pb2.TaskInfo(
            task_id=self.job_id,
            function_id=request_id,
            function_bytes=sampling_args,
        )
        request = processor_pb2.UnaryProcess(operand=locator, info=task_info)

        response = self.proc_stub.sample(request)
        return self._create_table_from_locator(response, _table._partitions)
Beispiel #6
0
 def glom(self, _table):
     """Issue one asynchronous ``glom`` call per partition of ``_table``.

     Partition ``i`` is sent to the stub at index
     ``i % len(self.proc_list)``; all requests share one fresh UUID1
     function id.  Every future is awaited, and the returned ``_DTable`` is
     built from the type, namespace and name of the last response together
     with the input's partition count.

     NOTE: when ``_table.partition`` is 0 no response is ever bound, so the
     final ``return`` raises.
     """
     shared_func_id = str(uuid.uuid1())
     pending = []
     for index in range(_table.partition):
         locator = EggRoll.__get_storage_locator(_table, index)
         task_info = processor_pb2.TaskInfo(task_id=self.job_id, function_id=shared_func_id)
         request = processor_pb2.UnaryProcess(operand=locator, info=task_info)
         _channel, stub = self.proc_list[index % len(self.proc_list)]
         pending.append(stub.glom.future(request))

     # Wait for every call; only the last response feeds the returned table.
     for future in pending:
         response = future.result()
     return _DTable(self, response.type, response.namespace, response.name, _table.partition)
Beispiel #7
0
    def mapValues(self, _table, func):
        """Issue one asynchronous ``mapValues`` call per partition of ``_table``.

        ``func`` is serialized and hashed once via
        ``serialize_and_hash_func``; its id and bytes are attached to every
        request.  Partition ``i`` is sent to the stub at index
        ``i % len(self.proc_list)``.  Every future is awaited, and the
        returned ``_DTable`` is built from the type, namespace and name of
        the last response together with the input's partition count.

        NOTE: when ``_table.partition`` is 0 no response is ever bound, so
        the final ``return`` raises.
        """
        fn_id, fn_payload = self.serialize_and_hash_func(func)
        pending = []
        for index in range(_table.partition):
            locator = EggRoll.__get_storage_locator(_table, index)
            task_info = processor_pb2.TaskInfo(
                task_id=self.job_id,
                function_id=fn_id,
                function_bytes=fn_payload,
            )
            request = processor_pb2.UnaryProcess(operand=locator, info=task_info)
            _channel, stub = self.proc_list[index % len(self.proc_list)]
            pending.append(stub.mapValues.future(request))

        # Wait for every call; only the last response feeds the returned table.
        for future in pending:
            response = future.result()
        return _DTable(self, response.type, response.namespace, response.name, _table.partition)
Beispiel #8
0
 def sample(self, _table, fraction, seed):
     """Issue one asynchronous ``sample`` call per partition of ``_table``.

     The ``(fraction, seed)`` pair is serialized with ``self._serdes`` and
     sent in the ``function_bytes`` slot of every request, under one shared
     UUID1 function id.  Partition ``i`` is sent to the stub at index
     ``i % len(self.proc_list)``.  Every future is awaited, and the returned
     ``_DTable`` is built from the type, namespace and name of the last
     response together with the input's partition count.

     NOTE: when ``_table.partition`` is 0 no response is ever bound, so the
     final ``return`` raises.

     :raises ValueError: if ``fraction`` is below 0 or above 1.
     """
     if fraction < 0 or fraction > 1:
         raise ValueError("fraction must be in [0, 1]")

     sampling_args = self._serdes.serialize((fraction, seed))
     pending = []
     shared_func_id = str(uuid.uuid1())
     for index in range(_table.partition):
         locator = EggRoll.__get_storage_locator(_table, index)
         task_info = processor_pb2.TaskInfo(
             task_id=self.job_id,
             function_id=shared_func_id,
             function_bytes=sampling_args,
         )
         request = processor_pb2.UnaryProcess(operand=locator, info=task_info)
         _channel, stub = self.proc_list[index % len(self.proc_list)]
         pending.append(stub.sample.future(request))

     # Wait for every call; only the last response feeds the returned table.
     for future in pending:
         response = future.result()
     return _DTable(self, response.type, response.namespace, response.name, _table.partition)
Beispiel #9
0
 def reduce(self, _table: _DTable, func):
     """Fold the values of ``_table`` into a single value using ``func``.

     A single ``reduce`` request is sent through ``self.proc_stub``.  Each
     item of the response is decoded with ``_EggRoll._deserialize_operand``;
     items decoding to ``None`` are dropped and the remainder are combined
     locally, in arrival order, with ``func``.

     :return: the combined value; the sole decoded value when there is only
         one; ``None`` when nothing usable comes back.
     """
     fn_id, fn_payload = self.serialize_and_hash_func(func)
     locator = storage_basic_pb2.StorageLocator(
         namespace=_table._namespace,
         type=_table._type,
         name=_table._name,
     )
     task_info = processor_pb2.TaskInfo(
         task_id=self.job_id,
         function_id=fn_id,
         function_bytes=fn_payload,
     )
     request = processor_pb2.UnaryProcess(operand=locator, info=task_info)

     # Decode the whole response first, then discard the None entries.
     decoded = [_EggRoll._deserialize_operand(item) for item in self.proc_stub.reduce(request)]
     partials = [value for value in decoded if value is not None]
     if not partials:
         return None

     # With a single partial the loop body never runs and it is returned as is.
     combined = partials[0]
     for partial_value in partials[1:]:
         combined = func(combined, partial_value)
     return combined