def test_portal_worker(self):
    """Run the full portal pipeline end to end.

    Partitions all input files with a single streaming map task, then
    merges every output partition with a reduce task, and finally checks
    that the merged item count equals the total number of input items.
    """
    self._prepare_test()

    # Build one streaming map task that covers every input partition.
    map_task = dp_pb.MapTask()
    map_task.output_base_dir = self._partition_output_dir
    map_task.output_partition_num = self._output_partition_num
    map_task.partition_id = 0
    map_task.task_name = 'map_part_{}'.format(map_task.partition_id)
    map_task.part_field = 'example_id'
    map_task.data_portal_type = dp_pb.DataPortalType.Streaming
    for in_pid in range(self._input_partition_num):
        map_task.fpaths.append(self._get_input_fpath(in_pid))

    # Partition stage: run the map task and validate the partitioner output.
    response = dp_pb.NewTaskResponse()
    response.map_task.CopyFrom(map_task)
    self._portal_worker._run_map_task(response.map_task)
    self._check_partitioner(response.map_task)

    # Merge stage: reduce each output partition and tally the merged items.
    merged_total = 0
    for out_pid in range(self._output_partition_num):
        reduce_task = dp_pb.ReduceTask()
        reduce_task.map_base_dir = self._partition_output_dir
        reduce_task.reduce_base_dir = self._merge_output_dir
        reduce_task.partition_id = out_pid
        reduce_task.task_name = 'reduce_part_{}'.format(out_pid)
        self._portal_worker._run_reduce_task(reduce_task)
        merged_total += self._check_merge(reduce_task)

    # Every input item must survive the partition/merge round trip.
    self.assertEqual(merged_total,
                     self._partition_item_num * self._input_partition_num)
    self._clean_up()
def test_portal_worker(self):
    """Run the full portal pipeline end to end.

    Partitions all input files with a single map task, then merges every
    output partition with a reduce task, and checks that the merged item
    count equals the total number of input items.
    """
    self._prepare_test()

    map_task = dp_pb.MapTask()
    map_task.output_base_dir = self._partition_output_dir
    map_task.output_partition_num = self._output_partition_num
    map_task.partition_id = 0
    # Fix: the sibling variant of this test populates task_name,
    # part_field and data_portal_type on the map task; this copy left
    # them unset. Set them so both tests exercise the same task shape.
    map_task.task_name = 'map_part_{}'.format(map_task.partition_id)
    map_task.part_field = 'example_id'
    map_task.data_portal_type = dp_pb.DataPortalType.Streaming
    for partition_id in range(self._input_partition_num):
        map_task.fpaths.append(self._get_input_fpath(partition_id))

    # partitioner: run the map task and validate its output layout
    task = dp_pb.NewTaskResponse()
    task.map_task.CopyFrom(map_task)
    self._portal_worker._run_map_task(task.map_task)
    self._check_partitioner(task.map_task)

    # merge: reduce each output partition and tally merged items
    total_cnt = 0
    for partition_id in range(self._output_partition_num):
        reduce_task = dp_pb.ReduceTask()
        reduce_task.map_base_dir = self._partition_output_dir
        reduce_task.reduce_base_dir = self._merge_output_dir
        reduce_task.partition_id = partition_id
        # Fix: task_name was also missing here, unlike the sibling test.
        reduce_task.task_name = 'reduce_part_{}'.format(partition_id)
        self._portal_worker._run_reduce_task(reduce_task)
        total_cnt += self._check_merge(reduce_task)

    self.assertEqual(total_cnt,
                     self._partition_item_num * self._input_partition_num)
    self._clean_up()
def RequestNewTask(self, request, context):
    """gRPC handler: allocate the next portal task for the calling rank.

    Replies with a map or reduce task when one is available, with
    `pending` while the job is unfinished but nothing can be handed out
    yet, and with `finished` once the whole job is done.
    """
    finished, task = self._data_portal_job_manager.alloc_task(
        request.rank_id)
    response = dp_pb.NewTaskResponse()
    if task is None:
        # No task to hand out: report overall job state instead.
        marker = response.finished if finished else response.pending
        marker.MergeFrom(empty_pb2.Empty())
    elif isinstance(task, dp_pb.MapTask):
        response.map_task.MergeFrom(task)
    else:
        assert isinstance(task, dp_pb.ReduceTask)
        response.reduce_task.MergeFrom(task)
    return response
def _run_map_task(self):
    """Build a streaming map task over all input partitions, run it
    through the portal worker, and return the wrapping NewTaskResponse.
    """
    mtask = dp_pb.MapTask()
    mtask.output_base_dir = self._partition_output_dir
    mtask.output_partition_num = self._output_partition_num
    mtask.partition_id = 0
    mtask.task_name = 'map_part_{}'.format(mtask.partition_id)
    mtask.part_field = 'example_id'
    mtask.data_portal_type = dp_pb.DataPortalType.Streaming
    mtask.fpaths.extend(
        self._get_input_fpath(pid)
        for pid in range(self._input_partition_num))

    # partitioner: wrap the task in a response and let the worker run it
    wrapper = dp_pb.NewTaskResponse()
    wrapper.map_task.CopyFrom(mtask)
    self._portal_worker._run_map_task(wrapper.map_task)
    return wrapper