Example #1
 def parallelize(self,
                 data: Iterable,
                 include_key=False,
                 name=None,
                 partition=1,
                 namespace=None,
                 create_if_missing=True,
                 error_if_exist=False,
                 persistent=False,
                 chunk_size=100000,
                 in_place_computing=False):
     if namespace is None:
         namespace = _EggRoll.get_instance().job_id
     if name is None:
         name = str(uuid.uuid1())
     storage_locator = storage_basic_pb2.StorageLocator(
         type=storage_basic_pb2.LMDB, namespace=namespace,
         name=name) if persistent else storage_basic_pb2.StorageLocator(
             type=storage_basic_pb2.IN_MEMORY,
             namespace=namespace,
             name=name)
     create_table_info = kv_pb2.CreateTableInfo(
         storageLocator=storage_locator, fragmentCount=partition)
     _table = self._create_table(create_table_info)
     _table.set_in_place_computing(in_place_computing)
     _iter = data if include_key else enumerate(data)
     _table.put_all(_iter, chunk_size=chunk_size)
     LOGGER.debug("created table: %s", _table)
     return _table
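
Note: when include_key is False, enumerate(data) supplies integer keys for the values. The persistent flag picks the storage backend for the locator; a minimal standalone sketch of that branch, assuming only the generated storage_basic_pb2 module (Example #6 below uses the same compact form):

    # Illustrative helper, not part of the EggRoll API: LMDB-backed
    # locators survive the job; IN_MEMORY ones do not.
    def make_locator(namespace, name, persistent=False):
        _type = storage_basic_pb2.LMDB if persistent else storage_basic_pb2.IN_MEMORY
        return storage_basic_pb2.StorageLocator(type=_type, namespace=namespace, name=name)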
Example #2
 def __get_locator(self, obj, name=None):
     if isinstance(obj, _DTable):
         return storage_basic_pb2.StorageLocator(type=obj._type, namespace=obj._namespace, name=obj._name,
                                                 fragment=obj._partitions)
     else:
         return storage_basic_pb2.StorageLocator(type=storage_basic_pb2.LMDB, namespace=self.job_id,
                                                 name=name)
Example #3
 def join(self, _left: _DTable, _right: _DTable, func):
     func_id, func_bytes = self.serialize_and_hash_func(func)
     l_op = storage_basic_pb2.StorageLocator(namespace=_left._namespace, type=_left._type, name=_left._name)
     r_op = storage_basic_pb2.StorageLocator(namespace=_right._namespace, type=_right._type, name=_right._name)
     binary_p = processor_pb2.BinaryProcess(left=l_op, right=r_op, info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                                               function_id=func_id,
                                                                                               function_bytes=func_bytes))
     resp = self.proc_stub.join(binary_p)
     return self._create_table_from_locator(resp, _left._partitions)
Example #4
    def sample(self, request, context):
        task_info = request.info
        LOGGER.debug(PROCESS_RECV_FORMAT.format('sample', task_info))

        op = request.operand
        _serdes = self._serdes
        fraction, seed = cloudpickle.loads(task_info.function_bytes)
        source_db_path = Processor.get_path(op)
        rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id, name=task_info.function_id,
                                               fragment=op.fragment,
                                               type=storage_basic_pb2.IN_MEMORY)

        with Processor.get_environment(Processor.get_path(rtn),
                                       create_if_missing=True) as dest_env, Processor.get_environment(
            source_db_path) as source_env:
            with source_env.begin() as source_txn:
                with dest_env.begin(write=True) as dest_txn:
                    cursor = source_txn.cursor()
                    cursor.first()
                    random_state = np.random.RandomState(seed)
                    for k, v in cursor:
                        if random_state.rand() < fraction:
                            dest_txn.put(k, v)
        LOGGER.debug(PROCESS_DONE_FORMAT.format('sample', rtn))
        return rtn
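
The loop keeps each record independently with probability fraction, using a seeded numpy RandomState so the sample is reproducible. The same decision in isolation, over a plain dict (bernoulli_sample is an illustrative name):

    # Illustrative only: the Bernoulli-sampling rule from the loop above,
    # applied to an in-memory dict instead of LMDB transactions.
    import numpy as np

    def bernoulli_sample(records, fraction, seed):
        random_state = np.random.RandomState(seed)
        return {k: v for k, v in records.items() if random_state.rand() < fraction}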
Example #5
    def join(self, request, context):
        task_info = request.info
        LOGGER.debug(PROCESS_RECV_FORMAT.format('join', task_info))
        _joiner, _serdes = self.get_function_and_serdes(task_info)
        left_op = request.left
        right_op = request.right
        rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id, name=task_info.function_id,
                                               fragment=left_op.fragment,
                                               type=storage_basic_pb2.IN_MEMORY)

        with Processor.get_environment(Processor.get_path(left_op)) as left_env, Processor.get_environment(
                Processor.get_path(right_op)) as right_env, Processor.get_environment(Processor.get_path(rtn),
                                                                                      create_if_missing=True) as dst_env:
            with left_env.begin() as left_txn, right_env.begin() as right_txn, dst_env.begin(write=True) as dst_txn:
                cursor = left_txn.cursor()
                for k_bytes, v1_bytes in cursor:
                    v2_bytes = right_txn.get(k_bytes)
                    if v2_bytes is None:
                        continue
                    v1 = _serdes.deserialize(v1_bytes)
                    v2 = _serdes.deserialize(v2_bytes)
                    v3 = _joiner(v1, v2)
                    dst_txn.put(k_bytes, _serdes.serialize(v3))
                cursor.close()
        LOGGER.debug(PROCESS_DONE_FORMAT.format('join', rtn))
        return rtn
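
Keys missing from the right table are skipped, so this is an inner join. The same semantics over plain dicts (inner_join is an illustrative name):

    # Mirrors the transaction loop above: keys absent from `right` are
    # dropped, matching the `continue` branch.
    def inner_join(left, right, joiner):
        out = {}
        for k, v1 in left.items():
            v2 = right.get(k)
            if v2 is not None:
                out[k] = joiner(v1, v2)
        return out

    # inner_join({1: 'a', 2: 'b'}, {2: 'x'}, lambda l, r: l + r) == {2: 'bx'}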
Example #6
 def table(self, name, namespace, partition=1, create_if_missing=True, error_if_exist=False, persistent=True):
     _type = storage_basic_pb2.LMDB if persistent else storage_basic_pb2.IN_MEMORY
     storage_locator = storage_basic_pb2.StorageLocator(type=_type, namespace=namespace, name=name)
     create_table_info = kv_pb2.CreateTableInfo(storageLocator=storage_locator, fragmentCount=partition)
     _table = self._create_table(create_table_info)
     LOGGER.debug("created table: %s", _table)
     return _table
Example #7
    def __create_output_storage_locator(self, src_op, task_info, process_conf,
                                        is_in_place_computing_effective):
        if is_in_place_computing_effective:
            if self.__get_in_place_computing_from_task_info(task_info):
                return src_op

        naming_policy = process_conf.namingPolicy
        LOGGER.info('naming policy in processor: {}'.format(naming_policy))
        if naming_policy == 'ITER_AWARE':
            storage_name = DELIMETER.join([
                src_op.namespace, src_op.name,
                storage_basic_pb2.StorageType.Name(src_op.type)
            ])
            name_ba = bytearray(storage_name.encode())
            name_ba.extend(DELIMETER_ENCODED)
            name_ba.extend(task_info.function_bytes)

            name = hashlib.md5(name_ba).hexdigest()
        else:
            name = task_info.function_id

        return storage_basic_pb2.StorageLocator(
            namespace=task_info.task_id,
            name=name,
            fragment=src_op.fragment,
            type=storage_basic_pb2.IN_MEMORY)
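
Under ITER_AWARE naming the output name is an MD5 over the source locator and the function bytes, so repeating the same function on the same input yields the same output table name. A standalone sketch; the DELIMETER constants are assumptions standing in for the module-level values, which are not shown in this listing:

    import hashlib

    DELIMETER = '-'                       # assumed value, not shown above
    DELIMETER_ENCODED = DELIMETER.encode()

    def iter_aware_name(namespace, name, type_name, function_bytes):
        name_ba = bytearray(DELIMETER.join([namespace, name, type_name]).encode())
        name_ba.extend(DELIMETER_ENCODED)
        name_ba.extend(function_bytes)
        return hashlib.md5(name_ba).hexdigest()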
Example #8
    def glom(self, _table: _DTable):
        func_id = str(uuid.uuid1())
        operand = storage_basic_pb2.StorageLocator(namespace=_table._namespace, type=_table._type, name=_table._name)

        unary_p = processor_pb2.UnaryProcess(operand=operand, info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                                          function_id=func_id))
        resp = self.proc_stub.glom(unary_p)
        return self._create_table_from_locator(resp, _table._partitions)
Example #9
 def map_partitions(self, _table: _DTable, func):
     func_id, func_bytes = self.serialize_and_hash_func(func)
     operand = storage_basic_pb2.StorageLocator(namespace=_table._namespace, type=_table._type, name=_table._name)
     unary_p = processor_pb2.UnaryProcess(operand=operand,
                                          info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                      function_id=func_id,
                                                                      function_bytes=func_bytes))
     resp = self.proc_stub.mapPartitions(unary_p)
     return self._create_table_from_locator(resp, _table._partitions)
Example #10
    def sample(self, _table: _DTable, fraction, seed):
        if fraction < 0 or fraction > 1:
            raise ValueError("fraction must be in [0, 1]")
        func_bytes = self.value_serdes.serialize((fraction, seed))
        func_id = str(uuid.uuid1())
        operand = storage_basic_pb2.StorageLocator(namespace=_table._namespace, type=_table._type, name=_table._name)

        unary_p = processor_pb2.UnaryProcess(operand=operand, info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                                          function_id=func_id,
                                                                                          function_bytes=func_bytes))
        resp = self.proc_stub.sample(unary_p)
        return self._create_table_from_locator(resp, _table._partitions)
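
The (fraction, seed) pair is packed into function_bytes here and unpacked in the processor (Example #4) with cloudpickle.loads, which suggests value_serdes is cloudpickle-backed. The round trip in isolation:

    # Round trip of the sampling parameters, as Example #4 recovers them.
    import cloudpickle

    func_bytes = cloudpickle.dumps((0.1, 42))        # client side
    fraction, seed = cloudpickle.loads(func_bytes)   # processor side
    assert (fraction, seed) == (0.1, 42)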
Example #11
    def cleanup(self, name, namespace, persistent):
        if namespace is None or name is None:
            raise ValueError("neither name nor namespace can be None")

        _type = storage_basic_pb2.LMDB if persistent else storage_basic_pb2.IN_MEMORY

        storage_locator = storage_basic_pb2.StorageLocator(type=_type, namespace=namespace, name=name)
        _table = _DTable(storage_locator=storage_locator)

        self.destroy_all(_table)

        LOGGER.debug("cleaned up: %s", _table)
Example #12
 def reduce(self, _table: _DTable, func):
     func_id, func_bytes = self.serialize_and_hash_func(func)
     operand = storage_basic_pb2.StorageLocator(namespace=_table._namespace, type=_table._type, name=_table._name)
     unary_p = processor_pb2.UnaryProcess(operand=operand,
                                          info=processor_pb2.TaskInfo(task_id=self.job_id,
                                                                      function_id=func_id,
                                                                      function_bytes=func_bytes))
     values = [_EggRoll._deserialize_operand(op) for op in self.proc_stub.reduce(unary_p)]
     values = [v for v in values if v is not None]
     if len(values) <= 0:
         return None
     if len(values) == 1:
         return values[0]
     else:
         val, *remain = values
         for _nv in remain:
             val = func(val, _nv)
     return val
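
The explicit loop is a left fold of func over the non-None per-partition results; functools.reduce expresses the same thing (fold is an illustrative name):

    from functools import reduce

    def fold(values, func):
        values = [v for v in values if v is not None]
        return reduce(func, values) if values else None

    # fold([3, None, 4, 5], lambda a, b: a + b) == 12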
Example #13
 def map(self, request, context):
     task_info = request.info
     LOGGER.debug(PROCESS_RECV_FORMAT.format('map', task_info))
     _mapper, _serdes = self.get_function_and_serdes(task_info)
     op = request.operand
     rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id, name=task_info.function_id,
                                            fragment=op.fragment,
                                            type=storage_basic_pb2.IN_MEMORY)
     src_db_path = Processor.get_path(op)
     dst_db_path = Processor.get_path(rtn)
     with Processor.get_environment(dst_db_path, create_if_missing=True) as dst_env, Processor.get_environment(
             src_db_path) as source_env:
         with source_env.begin() as source_txn, dst_env.begin(write=True) as dst_txn:
             cursor = source_txn.cursor()
             for k_bytes, v_bytes in cursor:
                 k, v = _serdes.deserialize(k_bytes), _serdes.deserialize(v_bytes)
                 k1, v1 = _mapper(k, v)
                 dst_txn.put(_serdes.serialize(k1), _serdes.serialize(v1))
             cursor.close()
     LOGGER.debug(PROCESS_DONE_FORMAT.format('map', rtn))
     return rtn
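
Every processor method follows this shape: iterate a cursor on the source LMDB environment inside a read transaction, and write results inside a write transaction on the destination. A minimal runnable version with the lmdb package; paths and data are illustrative:

    import lmdb, tempfile

    src_path, dst_path = tempfile.mkdtemp(), tempfile.mkdtemp()
    with lmdb.open(src_path) as src_env, lmdb.open(dst_path) as dst_env:
        with src_env.begin(write=True) as txn:       # seed some source data
            txn.put(b'k1', b'v1')
        with src_env.begin() as src_txn, dst_env.begin(write=True) as dst_txn:
            for k, v in src_txn.cursor():            # read-side cursor
                dst_txn.put(k, v.upper())            # the "mapped" write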
Example #14
    def mapPartitions(self, request, context):
        task_info = request.info
        LOGGER.debug(PROCESS_RECV_FORMAT.format('mapPartitions', task_info))
        _mapper, _serdes = self.get_function_and_serdes(task_info)
        op = request.operand

        rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id, name=task_info.function_id,
                                               fragment=op.fragment,
                                               type=storage_basic_pb2.IN_MEMORY)
        src_db_path = Processor.get_path(op)
        dst_db_path = Processor.get_path(rtn)
        with Processor.get_environment(dst_db_path, create_if_missing=True) as dst_env, Processor.get_environment(
                src_db_path) as src_env:
            with src_env.begin() as src_txn, dst_env.begin(write=True) as dst_txn:
                cursor = src_txn.cursor()
                v = _mapper(generator(_serdes, cursor))
                if cursor.last():
                    k_bytes = cursor.key()
                    dst_txn.put(k_bytes, _serdes.serialize(v))
                cursor.close()
        LOGGER.debug(PROCESS_DONE_FORMAT.format('mapPartitions', rtn))
        return rtn
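
The generator helper is not shown in this listing; plausibly it lazily deserializes every (key, value) pair from the cursor so the user function can consume the whole partition as one stream, with the single result then stored under the partition's last key. A sketch of such a helper, as an assumption rather than the actual implementation:

    # Hypothetical: yield deserialized pairs from an LMDB cursor.
    def generator(serdes, cursor):
        for k_bytes, v_bytes in cursor:
            yield serdes.deserialize(k_bytes), serdes.deserialize(v_bytes)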
Example #15
    def glom(self, request, context):
        task_info = request.info
        LOGGER.debug(PROCESS_RECV_FORMAT.format('glom', task_info))

        op = request.operand
        _serdes = self._serdes
        src_db_path = Processor.get_path(op)
        rtn = storage_basic_pb2.StorageLocator(namespace=task_info.task_id, name=task_info.function_id,
                                               fragment=op.fragment,
                                               type=storage_basic_pb2.IN_MEMORY)
        with Processor.get_environment(src_db_path) as source_env, Processor.get_environment(Processor.get_path(rtn),
                                                                                             create_if_missing=True) as dst_env:

            with source_env.begin() as src_txn, dst_env.begin(write=True) as dst_txn:
                cursor = src_txn.cursor()
                v_list = []
                k_bytes = None
                for k, v in cursor:
                    v_list.append((_serdes.deserialize(k), _serdes.deserialize(v)))
                    k_bytes = k
                if k_bytes is not None:
                    dst_txn.put(k_bytes, _serdes.serialize(v_list))
        LOGGER.debug(PROCESS_DONE_FORMAT.format('glom', rtn))
        return rtn
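
glom collapses a whole partition into a single entry: all deserialized (k, v) pairs go into one list, stored under the last key seen. The same semantics over an in-memory mapping (glom_partition is an illustrative name):

    def glom_partition(records):
        v_list = list(records.items())
        if not v_list:
            return {}
        return {v_list[-1][0]: v_list}

    # glom_partition({b'a': 1, b'b': 2}) == {b'b': [(b'a', 1), (b'b', 2)]}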
Example #16
 def __create_storage_locator(self, namespace, name, type):
     return storage_basic_pb2.StorageLocator(namespace=namespace,
                                             name=name,
                                             type=type)
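
StorageLocator is an ordinary generated protobuf message, so it travels over gRPC with the standard message API:

    # Standard protobuf round trip for a locator.
    loc = storage_basic_pb2.StorageLocator(namespace='ns', name='n',
                                           type=storage_basic_pb2.IN_MEMORY)
    data = loc.SerializeToString()
    assert storage_basic_pb2.StorageLocator.FromString(data) == loc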