def join(self, other, func, output=None, options: dict = None):
    if options is None:
        options = {}
    inputs = self.__repartition_with(other)
    outputs = self._maybe_set_output(output)
    functor = ErFunctor(name=RollPair.JOIN,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    final_options = {}
    final_options.update(self.__store._options)
    final_options.update(options)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.JOIN),
                name=RollPair.JOIN,
                inputs=inputs,
                outputs=outputs,
                functors=[functor],
                options=final_options)
    task_results = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_results)
    return RollPair(er_store, self.ctx)

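# Usage sketch (illustrative, not part of this module; assumes `left` and
# `right` are populated RollPairs sharing a serdes). join pairs values whose
# keys appear in both stores and applies `func` to each matched pair on the
# egg side:
#
#   joined = left.join(right, func=lambda v1, v2: v1 + v2)
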
def sample(self, fraction, seed=None, output=None, options: dict = None):
    if options is None:
        options = {}
    er_fraction = ErFunctor(name=RollPair.SAMPLE,
                            serdes=SerdesTypes.CLOUD_PICKLE,
                            body=cloudpickle.dumps(fraction))
    er_seed = ErFunctor(name=RollPair.SAMPLE,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(seed))
    outputs = []
    if output:
        outputs.append(output)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.SAMPLE),
                name=RollPair.SAMPLE,
                inputs=[self.__store],
                outputs=outputs,
                functors=[er_fraction, er_seed])
    job_result = self.__command_client.simple_sync_send(
            input=job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)
    er_store = job_result._outputs[0]
    return RollPair(er_store, self.ctx)

def map_values(self, func, output=None, options: dict = None):
    if options is None:
        options = {}
    functor = ErFunctor(name=RollPair.MAP_VALUES,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    outputs = []
    if output:
        outputs.append(output)
    # todo:1: options issues. refer to line 77
    final_options = {}
    final_options.update(self.__store._options)
    final_options.update(options)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.MAP_VALUES),
                name=RollPair.MAP_VALUES,
                inputs=[self.__store],
                outputs=outputs,
                functors=[functor],
                options=final_options)
    job_result = self.__command_client.simple_sync_send(
            input=job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)
    er_store = job_result._outputs[0]
    return RollPair(er_store, self.ctx)

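# Usage sketch (illustrative): map_values transforms each value and leaves
# keys unchanged:
#
#   doubled = rp.map_values(lambda v: v * 2)
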
def count(self):
    total_partitions = self.__store._store_locator._total_partitions
    job_id = generate_job_id(self.__session_id, tag=RollPair.COUNT)
    job = ErJob(id=job_id,
                name=RollPair.COUNT,
                inputs=[self.ctx.populate_processor(self.__store)])
    args = list()
    for i in range(total_partitions):
        partition_input = job._inputs[0]._partitions[i]
        task = ErTask(id=generate_task_id(job_id, i),
                      name=job._name,
                      inputs=[partition_input],
                      job=job)
        args.append(([task], partition_input._processor._command_endpoint))
    futures = self.__command_client.async_call(
            args=args,
            output_types=[ErPair],
            command_uri=CommandURI(
                    f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))
    done = wait(futures, return_when=FIRST_EXCEPTION).done
    result = 0
    for future in done:
        pair = future.result()[0]
        result += self.functor_serdes.deserialize(pair._value)
    return result

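# Usage sketch (illustrative): count fans one task out to every partition
# and sums the deserialized per-partition counts on the caller side:
#
#   n = rp.count()
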
def union(self, other, func=lambda v1, v2: v1, output=None, options: dict = None):
    if options is None:
        options = {}
    functor = ErFunctor(name=RollPair.UNION,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    outputs = []
    if output:
        outputs.append(output)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.UNION),
                name=RollPair.UNION,
                inputs=self.__repartition_with(other),
                outputs=outputs,
                functors=[functor])
    job_result = self.__command_client.simple_sync_send(
            input=job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)
    er_store = job_result._outputs[0]
    return RollPair(er_store, self.ctx)

def destroy(self):
    if len(self.ctx.get_session()._cluster_manager_client.get_store(
            self.get_store())._partitions) == 0:
        L.info(f"store:{self.get_store()} has been destroyed before")
        raise ValueError(f"store:{self.get_store()} has been destroyed before")
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.DESTROY),
                name=RollPair.DESTROY,
                inputs=[self.__store],
                outputs=[self.__store],
                functors=[])
    job_resp = self.__command_client.simple_sync_send(
            input=job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)
    self.ctx.get_session()._cluster_manager_client.delete_store(self.__store)
    L.info(f'{RollPair.DESTROY}: {self.__store}')
    self.destroyed = True

def filter(self, func, output=None, options: dict = None):
    if options is None:
        options = {}
    functor = ErFunctor(name=RollPair.FILTER,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    outputs = []
    if output:
        outputs.append(output)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.FILTER),
                name=RollPair.FILTER,
                inputs=[self.__store],
                outputs=outputs,
                functors=[functor])
    job_result = self.__command_client.simple_sync_send(
            input=job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)
    er_store = job_result._outputs[0]
    L.info(f'{RollPair.FILTER} output store: {er_store}')
    return RollPair(er_store, self.ctx)

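# Usage sketch (illustrative; the exact arguments the egg side passes to the
# predicate are not visible in this file, (k, v) is assumed):
#
#   positives = rp.filter(lambda k, v: v > 0)
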
def map_partitions(self, func, reduce_op=None, output=None, options: dict = None):
    if options is None:
        options = {}
    outputs = self._maybe_set_output(output)
    shuffle = options.get('shuffle', True)
    if not shuffle and reduce_op:
        raise ValueError("shuffle cannot be False when a reduce_op is given")
    functor = ErFunctor(name=RollPair.MAP_PARTITIONS,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    reduce_functor = ErFunctor(name=RollPair.MAP_PARTITIONS,
                               serdes=SerdesTypes.CLOUD_PICKLE,
                               body=cloudpickle.dumps(reduce_op))
    need_shuffle = ErFunctor(name=RollPair.MAP_PARTITIONS,
                             serdes=SerdesTypes.CLOUD_PICKLE,
                             body=cloudpickle.dumps(shuffle))
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.MAP_PARTITIONS),
                name=RollPair.MAP_PARTITIONS,
                inputs=[self.__store],
                outputs=outputs,
                functors=[functor, reduce_functor, need_shuffle])
    task_future = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_future)
    return RollPair(er_store, self.ctx)

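# Usage sketch (illustrative; assuming the egg side hands `func` an iterable
# of (k, v) pairs per partition, with `reduce_op` merging partition outputs):
#
#   summed = rp.map_partitions(
#           func=lambda pairs: [("sum", sum(v for _, v in pairs))],
#           reduce_op=lambda a, b: a + b)
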
def map_partitions_with_index(self, func, output=None, options: dict = None):
    if options is None:
        options = {}
    outputs = self._maybe_set_output(output)
    shuffle = options.get('shuffle', True)
    functor = ErFunctor(name=RollPair.MAP_PARTITIONS_WITH_INDEX,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    need_shuffle = ErFunctor(name=RollPair.MAP_PARTITIONS_WITH_INDEX,
                             serdes=SerdesTypes.CLOUD_PICKLE,
                             body=cloudpickle.dumps(shuffle))
    job = ErJob(id=generate_job_id(self.__session_id,
                                   RollPair.MAP_PARTITIONS_WITH_INDEX),
                name=RollPair.MAP_PARTITIONS_WITH_INDEX,
                inputs=[self.__store],
                outputs=outputs,
                functors=[functor, need_shuffle])
    task_future = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_future)
    return RollPair(er_store, self.ctx)

def subtract_by_key(self, other, output=None, options: dict = None):
    if options is None:
        options = {}
    functor = ErFunctor(name=RollPair.SUBTRACT_BY_KEY,
                        serdes=SerdesTypes.CLOUD_PICKLE)
    outputs = []
    if output:
        outputs.append(output)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.SUBTRACT_BY_KEY),
                name=RollPair.SUBTRACT_BY_KEY,
                inputs=self.__repartition_with(other),
                outputs=outputs,
                functors=[functor])
    job_result = self.__command_client.simple_sync_send(
            input=job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=self.__command_serdes)
    er_store = job_result._outputs[0]
    return RollPair(er_store, self.ctx)

def send_command():
    job = ErJob(id=job_id,
                name=RollPair.PUT_ALL,
                inputs=[self.__store],
                outputs=[self.__store],
                functors=[])
    task_results = self._run_job(job)
    return self.__get_output_from_result(task_results)

def send_command():
    job = ErJob(id=job_id,
                name=RollPair.GET_ALL,
                inputs=[self.__store],
                outputs=[self.__store],
                functors=[ErFunctor(name=RollPair.GET_ALL,
                                    body=cloudpickle.dumps(er_pair))])
    task_results = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_results)
    return er_store

def destroy(self, options: dict = None):
    tasks = [
        ErTask(id=f"{self._replicate_job_id}-partition-{self._partition_id}",
               name=RollPair.DESTROY,
               inputs=[self._er_partition],
               outputs=[],
               job=ErJob(id=self._replicate_job_id, name=RollPair.DESTROY))
    ]
    return self._cm_client.sync_send(inputs=tasks,
                                     output_types=[ErTask],
                                     endpoint=self.remote_cmd_endpoint,
                                     command_uri=RollPair.RUN_TASK_URI)

def count(self):
    job_id = generate_job_id(self.__session_id, tag=RollPair.COUNT)
    job = ErJob(id=job_id, name=RollPair.COUNT, inputs=[self.__store])
    task_results = self._run_job(job=job,
                                 output_types=[ErPair],
                                 create_output_if_missing=False)
    result = 0
    for task_result in task_results:
        pair = task_result[0]
        result += self.functor_serdes.deserialize(pair._value)
    return result

def aggregate(self, zero_value, seq_op, comb_op, output=None, options: dict = None):
    total_partitions = self.__store._store_locator._total_partitions
    job_id = generate_job_id(self.__session_id, tag=RollPair.AGGREGATE)
    serialized_zero_value = ErFunctor(name=RollPair.AGGREGATE,
                                      serdes=SerdesTypes.CLOUD_PICKLE,
                                      body=cloudpickle.dumps(zero_value))
    serialized_seq_op = ErFunctor(name=RollPair.AGGREGATE,
                                  serdes=SerdesTypes.CLOUD_PICKLE,
                                  body=cloudpickle.dumps(seq_op))
    job = ErJob(id=job_id,
                name=RollPair.AGGREGATE,
                inputs=[self.ctx.populate_processor(self.__store)],
                functors=[serialized_zero_value, serialized_seq_op])
    args = list()
    for i in range(total_partitions):
        partition_input = job._inputs[0]._partitions[i]
        task = ErTask(id=generate_task_id(job_id, i),
                      name=job._name,
                      inputs=[partition_input],
                      job=job)
        args.append(([task], partition_input._processor._command_endpoint))
    futures = self.__command_client.async_call(
            args=args,
            output_types=[ErPair],
            command_uri=CommandURI(
                    f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))
    done = wait(futures, return_when=FIRST_EXCEPTION).done
    # seq_op runs inside each egg; comb_op merges the per-partition partials here
    result = None
    first = True
    for future in done:
        pair = future.result()[0]
        seq_op_result = self.functor_serdes.deserialize(pair._value)
        if first:
            result = seq_op_result
            first = False
        else:
            result = comb_op(result, seq_op_result)
    return result

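# Usage sketch (illustrative): `zero_value` and `seq_op` are shipped to the
# eggs, so each partition folds its pairs locally; `comb_op` then merges the
# partial results in this process, as the loop above shows:
#
#   total = rp.aggregate(zero_value=0,
#                        seq_op=lambda acc, v: acc + v,   # per partition; exact
#                        comb_op=lambda a, b: a + b)      # arg shape is assumed
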
def send_command():
    job = ErJob(id=job_id,
                name=RollPair.PUT_ALL,
                inputs=[self.__store],
                outputs=[self.__store],
                functors=[])
    result = self.__command_client.simple_sync_send(
            input=job,
            output_type=ErJob,
            endpoint=self.ctx.get_roll()._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.ROLL_PAIR_URI_PREFIX}/{RollPair.RUN_JOB}'),
            serdes_type=SerdesTypes.PROTOBUF)
    return result

def map(self, func, output=None, options: dict = None):
    if options is None:
        options = {}
    functor = ErFunctor(name=RollPair.MAP,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    outputs = self._maybe_set_output(output)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.MAP),
                name=RollPair.MAP,
                inputs=[self.__store],
                outputs=outputs,
                functors=[functor],
                options=options)
    task_results = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_results)
    return RollPair(er_store, self.ctx)

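# Usage sketch (illustrative; assuming the egg-side map receives (k, v) and
# may emit a new key as well as a new value):
#
#   rekeyed = rp.map(lambda k, v: (f"{k}-copy", v))
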
def with_stores(self, func, others=None, options: dict = None):
    if options is None:
        options = {}
    tag = "withStores"
    if others is None:
        others = []
    total_partitions = self.get_partitions()
    for other in others:
        if other.get_partitions() != total_partitions:
            raise ValueError(
                    f"partition count mismatch: expected:{total_partitions}, "
                    f"actual:{other.get_partitions()}")
    job_id = generate_job_id(self.__session_id, tag=tag)
    job = ErJob(id=job_id,
                name=tag,
                inputs=[self.ctx.populate_processor(rp.get_store())
                        for rp in [self] + others],
                functors=[ErFunctor(name=tag,
                                    serdes=SerdesTypes.CLOUD_PICKLE,
                                    body=cloudpickle.dumps(func))],
                options=options)
    args = list()
    for i in range(total_partitions):
        partition_self = job._inputs[0]._partitions[i]
        task = ErTask(id=generate_task_id(job_id, i),
                      name=job._name,
                      inputs=[store._partitions[i] for store in job._inputs],
                      job=job)
        args.append(([task], partition_self._processor._command_endpoint))
    futures = self.__command_client.async_call(
            args=args,
            output_types=[ErPair],
            command_uri=CommandURI(
                    f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'))
    result = list()
    for future in futures:
        ret_pair = future.result()[0]
        result.append((self.functor_serdes.deserialize(ret_pair._key),
                       self.functor_serdes.deserialize(ret_pair._value)))
    return result

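# Usage sketch (hypothetical `func` signature; the egg-side calling
# convention is defined by the task server, not this file). with_stores runs
# `func` once per partition over the aligned stores and returns one
# deserialized (key, value) pair per partition:
#
#   per_partition = rp.with_stores(func=my_partition_func, others=[other_rp])
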
def delete(self, k, options: dict = None):
    if options is None:
        options = {}
    key = create_serdes(self.__store._store_locator._serdes).serialize(k)
    er_pair = ErPair(key=key, value=None)
    job_id = generate_job_id(self.__session_id, RollPair.DELETE)
    job = ErJob(id=job_id,
                name=RollPair.DELETE,
                inputs=[self.__store],
                outputs=[],
                functors=[ErFunctor(name=RollPair.DELETE,
                                    body=cloudpickle.dumps(er_pair))])
    self._run_job(job=job, create_output_if_missing=False)

def write(self):
    L.info("RemoteRollPairWriteBatch write calling")
    if len(self.manual_merger) == 0:
        L.info("manual_merger is empty, nothing to write")
        return
    self.has_write_op = True
    batches = TransferPair.pair_to_bin_batch(
            sorted(self.manual_merger.items(), key=lambda kv: kv[0]))
    task_id = f"{self.adapter._replicate_job_id}-partition-{self.adapter._partition_id}"
    L.info(f"task_id={task_id}")
    tasks = [
        ErTask(id=task_id,
               name=RollPair.PUT_BATCH,
               inputs=[self.adapter._er_partition],
               outputs=[self.adapter._er_partition],
               job=ErJob(id=self.adapter._replicate_job_id,
                         name=RollPair.PUT_BATCH))
    ]

    def send_command(tasks, remote_cmd_endpoint):
        cmd_client = CommandClient()
        return cmd_client.sync_send(
                inputs=tasks,
                output_types=[ErTask],
                endpoint=remote_cmd_endpoint,
                command_uri=CommandURI('v1/egg-pair/runTask'))

    L.info("start to send cmd")
    t = Thread(target=send_command,
               name=task_id,
               args=[tasks, self.adapter.remote_cmd_endpoint])
    t.start()
    transfer_client = TransferClient()
    f = transfer_client.send(batches,
                             endpoint=self.adapter.remote_transfer_endpoint,
                             tag=task_id)
    f.result()
    t.join()
    self.manual_merger.clear()
    L.info("RemoteRollPairWriteBatch write called")

def glom(self, output=None, options: dict = None):
    if options is None:
        options = {}
    outputs = self._maybe_set_output(output)
    functor = ErFunctor(name=RollPair.GLOM, serdes=SerdesTypes.CLOUD_PICKLE)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.GLOM),
                name=RollPair.GLOM,
                inputs=[self.__store],
                outputs=outputs,
                functors=[functor])
    task_results = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_results)
    return RollPair(er_store, self.ctx)

def get(self, k, options: dict = None):
    if options is None:
        options = {}
    L.debug(f"get k: {k}")
    k = create_serdes(self.__store._store_locator._serdes).serialize(k)
    er_pair = ErPair(key=k, value=None)
    partition_id = self.partitioner(k)
    egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
    L.info(f"partitions count: {self.__store._store_locator._total_partitions}, "
           f"target partition: {partition_id}, endpoint: {egg._command_endpoint}")
    task_inputs = [
        ErPartition(id=partition_id, store_locator=self.__store._store_locator)
    ]
    task_outputs = [
        ErPartition(id=partition_id, store_locator=self.__store._store_locator)
    ]
    job_id = generate_job_id(self.__session_id, RollPair.GET)
    job = ErJob(id=job_id,
                name=RollPair.GET,
                inputs=[self.__store],
                outputs=[],
                functors=[ErFunctor(body=cloudpickle.dumps(er_pair))])
    task = ErTask(id=generate_task_id(job_id, partition_id),
                  name=RollPair.GET,
                  inputs=task_inputs,
                  outputs=task_outputs,
                  job=job)
    job_resp = self.__command_client.simple_sync_send(
            input=task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'),
            serdes_type=self.__command_serdes)
    return (self.value_serdes.deserialize(job_resp._value)
            if job_resp._value != b'' else None)

def destroy(self, options: dict = None):
    if len(self.ctx.get_session()._cluster_manager_client.get_store(
            self.get_store())._partitions) == 0:
        L.error(f"store:{self.get_store()} has been destroyed before")
        raise ValueError(f"store:{self.get_store()} has been destroyed before")
    if options is None:
        options = {}
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.DESTROY),
                name=RollPair.DESTROY,
                inputs=[self.__store],
                outputs=[self.__store],
                functors=[],
                options=options)
    self._run_job(job=job, create_output_if_missing=False)
    self.ctx.get_session()._cluster_manager_client.delete_store(self.__store)
    L.debug(f'{RollPair.DESTROY}={self.__store}')
    self.destroyed = True

def subtract_by_key(self, other, output=None, options: dict = None):
    if options is None:
        options = {}
    inputs = self.__repartition_with(other)
    outputs = self._maybe_set_output(output)
    functor = ErFunctor(name=RollPair.SUBTRACT_BY_KEY,
                        serdes=SerdesTypes.CLOUD_PICKLE)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.SUBTRACT_BY_KEY),
                name=RollPair.SUBTRACT_BY_KEY,
                inputs=inputs,
                outputs=outputs,
                functors=[functor])
    task_future = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_future)
    return RollPair(er_store, self.ctx)

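# Usage sketch (illustrative; semantics implied by the name and implemented
# on the egg side): keep pairs from `self` whose keys are absent in `other`:
#
#   remaining = rp.subtract_by_key(other_rp)
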
def collapse_partitions(self, func, output=None, options: dict = None):
    if options is None:
        options = {}
    outputs = []
    if output:
        RollPair.__check_partition(self.get_partitions(),
                                   output._store_locator._total_partitions)
        outputs.append(output)
    functor = ErFunctor(name=RollPair.COLLAPSE_PARTITIONS,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    job = ErJob(id=generate_job_id(self.__session_id,
                                   RollPair.COLLAPSE_PARTITIONS),
                name=RollPair.COLLAPSE_PARTITIONS,
                inputs=[self.__store],
                outputs=outputs,
                functors=[functor])
    task_results = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_results)
    return RollPair(er_store, self.ctx)

def glom(self, output=None, options: dict = None):
    if options is None:
        options = {}
    outputs = []
    if output:
        RollPair.__check_partition(self.get_partitions(),
                                   output._store_locator._total_partitions)
        outputs.append(output)
    functor = ErFunctor(name=RollPair.GLOM, serdes=SerdesTypes.CLOUD_PICKLE)
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.GLOM),
                name=RollPair.GLOM,
                inputs=[self.__store],
                outputs=outputs,
                functors=[functor])
    task_results = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_results)
    return RollPair(er_store, self.ctx)

def get(self, k, options: dict = None):
    if options is None:
        options = {}
    k = create_serdes(self.__store._store_locator._serdes).serialize(k)
    er_pair = ErPair(key=k, value=None)
    partition_id = self.partitioner(k)
    egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
    inputs = [
        ErPartition(id=partition_id, store_locator=self.__store._store_locator)
    ]
    outputs = [
        ErPartition(id=partition_id, store_locator=self.__store._store_locator)
    ]
    job_id = generate_job_id(self.__session_id, RollPair.GET)
    job = ErJob(id=job_id,
                name=RollPair.GET,
                inputs=[self.__store],
                outputs=[self.__store],
                functors=[ErFunctor(name=RollPair.GET,
                                    body=cloudpickle.dumps(er_pair))])
    task = ErTask(id=generate_task_id(job_id, partition_id),
                  name=RollPair.GET,
                  inputs=inputs,
                  outputs=outputs,
                  job=job)
    job_resp = self.__command_client.simple_sync_send(
            input=task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=self.RUN_TASK_URI,
            serdes_type=self.__command_serdes)
    return (self.value_serdes.deserialize(job_resp._value)
            if job_resp._value != b'' else None)

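# Usage sketch (illustrative): get serializes the key, routes to the owning
# partition via the partitioner, and returns None when the egg replies with
# an empty value:
#
#   v = rp.get("some_key")
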
def sample(self, fraction, seed=None, output=None, options: dict = None):
    if options is None:
        options = {}
    outputs = []
    if output:
        RollPair.__check_partition(self.get_partitions(),
                                   output._store_locator._total_partitions)
        outputs.append(output)
    er_fraction = ErFunctor(name=RollPair.SAMPLE,
                            serdes=SerdesTypes.CLOUD_PICKLE,
                            body=cloudpickle.dumps(fraction))
    er_seed = ErFunctor(name=RollPair.SAMPLE,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(seed))
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.SAMPLE),
                name=RollPair.SAMPLE,
                inputs=[self.__store],
                outputs=outputs,
                functors=[er_fraction, er_seed])
    task_results = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_results)
    return RollPair(er_store, self.ctx)

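# Usage sketch (illustrative; the sampling itself happens on the egg side,
# so the exact per-record semantics of `fraction` are assumed, not shown here):
#
#   tenth = rp.sample(fraction=0.1, seed=42)
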
def delete(self, k, options: dict = None):
    if options is None:
        options = {}
    key = create_serdes(self.__store._store_locator._serdes).serialize(k)
    er_pair = ErPair(key=key, value=None)
    partition_id = self.partitioner(key)
    egg = self.ctx.route_to_egg(self.__store._partitions[partition_id])
    L.info(f"target egg endpoint: {egg._command_endpoint}")
    L.info(f"partition count: {self.__store._store_locator._total_partitions}")
    task_inputs = [
        ErPartition(id=partition_id, store_locator=self.__store._store_locator)
    ]
    task_outputs = [
        ErPartition(id=partition_id, store_locator=self.__store._store_locator)
    ]
    job_id = generate_job_id(self.__session_id, RollPair.DELETE)
    job = ErJob(id=job_id,
                name=RollPair.DELETE,
                inputs=[self.__store],
                outputs=[],
                functors=[ErFunctor(body=cloudpickle.dumps(er_pair))])
    task = ErTask(id=generate_task_id(job_id, partition_id),
                  name=RollPair.DELETE,
                  inputs=task_inputs,
                  outputs=task_outputs,
                  job=job)
    L.info("start sending delete request")
    self.__command_client.simple_sync_send(
            input=task,
            output_type=ErPair,
            endpoint=egg._command_endpoint,
            command_uri=CommandURI(
                    f'{RollPair.EGG_PAIR_URI_PREFIX}/{RollPair.RUN_TASK}'),
            serdes_type=self.__command_serdes)

def union(self, other, func=lambda v1, v2: v1, output=None, options: dict = None):
    if options is None:
        options = {}
    inputs = self.__repartition_with(other)
    outputs = []
    if output:
        RollPair.__check_partition(inputs[0]._store_locator._total_partitions,
                                   output._store_locator._total_partitions)
        outputs.append(output)
    functor = ErFunctor(name=RollPair.UNION,
                        serdes=SerdesTypes.CLOUD_PICKLE,
                        body=cloudpickle.dumps(func))
    job = ErJob(id=generate_job_id(self.__session_id, RollPair.UNION),
                name=RollPair.UNION,
                inputs=inputs,
                outputs=outputs,
                functors=[functor])
    task_future = self._run_job(job=job)
    er_store = self.__get_output_from_result(task_future)
    return RollPair(er_store, self.ctx)

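# Usage sketch (illustrative): union merges both stores; on a key collision
# `func` picks the value, and the default keeps the left-hand side's value:
#
#   merged = rp.union(other_rp)                                # keep rp's value
#   merged = rp.union(other_rp, func=lambda v1, v2: v1 + v2)   # or combine
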