Example 1
    def _list_input_dir(self, portal_options, file_wildcard,
                        target_fnames, max_files_per_job=8000):
        # Seed the kvstore with a fresh manifest so the job manager starts
        # from a known state (no processing job yet, next job id 0).
        portal_manifest = dp_pb.DataPortalManifest(
            name=self._data_portal_name,
            data_portal_type=dp_pb.DataPortalType.Streaming,
            output_partition_num=4,
            input_file_wildcard=file_wildcard,
            input_base_dir=self._portal_input_base_dir,
            output_base_dir=self._portal_output_base_dir,
            raw_data_publish_dir=self._raw_data_publish_dir,
            processing_job_id=-1,
            next_job_id=0
        )
        self._kvstore.set_data(
            common.portal_kvstore_base_dir(self._data_portal_name),
            text_format.MessageToString(portal_manifest))

        with Timer("DataPortalJobManager initialization"):
            data_portal_job_manager = DataPortalJobManager(
                self._kvstore, self._data_portal_name,
                portal_options.long_running,
                portal_options.check_success_tag,
                portal_options.single_subfolder,
                portal_options.files_per_job_limit,
                max_files_per_job
            )
        # Trigger an input-directory scan and job allocation, then verify the
        # allocated job picked up exactly the expected files in sorted order.
        portal_job = data_portal_job_manager._sync_processing_job()
        target_fnames.sort()
        fpaths = [os.path.join(self._portal_input_base_dir, f)
                  for f in target_fnames]
        self.assertEqual(len(fpaths), len(portal_job.fpaths))
        for index, fpath in enumerate(fpaths):
            self.assertEqual(fpath, portal_job.fpaths[index])
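
A minimal sketch of how a test case might call this helper, assuming a simple namedtuple stand-in for the real portal_options object (the attribute names are taken from the constructor call above; the test name, wildcard and file names are invented for illustration):

import collections

PortalOptions = collections.namedtuple(
    'PortalOptions',
    ['long_running', 'check_success_tag', 'single_subfolder',
     'files_per_job_limit'])

    def test_list_flat_input_dir(self):
        # Hypothetical usage: expect every *.rd file under the input base
        # dir to be allocated to the first processing job, in sorted order.
        options = PortalOptions(long_running=False, check_success_tag=False,
                                single_subfolder=False, files_per_job_limit=0)
        self._list_input_dir(options, '*.rd',
                             ['part-0000.rd', 'part-0001.rd'])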
Example 2
 def __init__(self, portal_name, etcd, portal_options):
     super(DataPortalMaster, self).__init__()
     self._portal_name = portal_name
     self._etcd = etcd
     self._portal_options = portal_options
     self._data_portal_job_manager = DataPortalJobManager(
         self._etcd, self._portal_name, self._portal_options.long_running)
     self._bg_worker = None
Example 3
 def __init__(self, portal_name, kvstore, portal_options):
     super(DataPortalMaster, self).__init__()
     self._portal_name = portal_name
     self._kvstore = kvstore
     self._portal_options = portal_options
     self._data_portal_job_manager = DataPortalJobManager(
         self._kvstore,
         self._portal_name,
         self._portal_options.long_running,
         self._portal_options.check_success_tag,
     )
     self._bg_worker = None
Example 4
class DataPortalMaster(dp_grpc.DataPortalMasterServiceServicer):
    def __init__(self, portal_name, kvstore, portal_options):
        super(DataPortalMaster, self).__init__()
        self._portal_name = portal_name
        self._kvstore = kvstore
        self._portal_options = portal_options
        self._data_portal_job_manager = DataPortalJobManager(
            self._kvstore,
            self._portal_name,
            self._portal_options.long_running,
            self._portal_options.check_success_tag,
            self._portal_options.single_subfolder,
            self._portal_options.files_per_job_limit,
            start_date=self._portal_options.start_date,
            end_date=self._portal_options.end_date)
        self._bg_worker = None

    def GetDataPortalManifest(self, request, context):
        return self._data_portal_job_manager.get_portal_manifest()

    def RequestNewTask(self, request, context):
        # The response carries exactly one of: a map task, a reduce task,
        # a "pending" marker (no task available yet), or a "finished" marker.
        response = dp_pb.NewTaskResponse()
        finished, task = \
            self._data_portal_job_manager.alloc_task(request.rank_id)
        if task is not None:
            if isinstance(task, dp_pb.MapTask):
                response.map_task.MergeFrom(task)
            else:
                assert isinstance(task, dp_pb.ReduceTask)
                response.reduce_task.MergeFrom(task)
        elif not finished:
            response.pending.MergeFrom(empty_pb2.Empty())
        else:
            response.finished.MergeFrom(empty_pb2.Empty())
        return response

    def FinishTask(self, request, context):
        self._data_portal_job_manager.finish_task(request.rank_id,
                                                  request.partition_id,
                                                  request.part_state)
        return common_pb.Status()

    def start(self):
        # Run the job manager's background routine every 30 seconds for as
        # long as the master is alive.
        self._bg_worker = RoutineWorker(
            'portal_master_bg_worker',
            self._data_portal_job_manager.backgroup_task, lambda: True, 30)
        self._bg_worker.start_routine()

    def stop(self):
        if self._bg_worker is not None:
            self._bg_worker.stop_routine()
        self._bg_worker = None
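
For context, a worker process would call this servicer over gRPC. The sketch below only mirrors the response handling shown in RequestNewTask above; the stub name dp_grpc.DataPortalMasterServiceStub and the request message dp_pb.NewTaskRequest(rank_id=...) are assumptions about the generated code, not verified against the proto:

import grpc

def request_task(master_addr, rank_id):
    # Hypothetical client-side helper; stub and request names are assumed.
    channel = grpc.insecure_channel(master_addr)
    stub = dp_grpc.DataPortalMasterServiceStub(channel)
    response = stub.RequestNewTask(dp_pb.NewTaskRequest(rank_id=rank_id))
    if response.HasField('map_task'):
        return response.map_task
    if response.HasField('reduce_task'):
        return response.reduce_task
    # Either no task is available yet ("pending") or all jobs are finished.
    return None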