def _publish_raw_data(self, job_id):
    """Publish the per-partition output files of finished job *job_id*.

    PSI jobs finish after the map stage, so their output is read from the
    map output directory; all other (streaming) jobs publish the reduce
    output. Each partition's matching files are handed to the
    type-specific publish helper and the published paths are logged.

    Args:
        job_id: identifier of the portal job whose output is published.
    """
    portal_manifest = self._sync_portal_manifest()
    # PSI output lives under the map dir, streaming under the reduce dir.
    if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
        output_dir = common.portal_map_output_dir(
                portal_manifest.output_base_dir, job_id
            )
    else:
        output_dir = common.portal_reduce_output_dir(
                portal_manifest.output_base_dir, job_id
            )
    for partition_id in range(self._output_partition_num):
        dpath = path.join(output_dir, common.partition_repr(partition_id))
        # A partition directory may be absent when it produced no output;
        # only list it when it exists.
        fnames = []
        if gfile.Exists(dpath) and gfile.IsDirectory(dpath):
            fnames = [f for f in gfile.ListDirectory(dpath)
                      if f.endswith(common.RawDataFileSuffix)]
        if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
            publish_fpaths = self._publish_psi_raw_data(
                    partition_id, dpath, fnames
                )
        else:
            publish_fpaths = self._publish_streaming_raw_data(
                    partition_id, dpath, fnames
                )
        # Fixed log text: the job is not necessarily a streaming job here
        # (this branch also handles PSI), and "files" was misspelled.
        logging.info("Data Portal Master publish %d files for partition "
                     "%d of job %d\n----------\n",
                     len(publish_fpaths), partition_id, job_id)
        for seq, fpath in enumerate(publish_fpaths):
            logging.info("%d. %s", seq, fpath)
        logging.info("------------------------------------------\n")
def _check_reduce_task(self, reduce_task, partition_id, portal_manifest):
    """Assert that *reduce_task* targets the expected partition and that
    its map/reduce base directories match the manifest-derived paths."""
    base_dir = portal_manifest.output_base_dir
    name = portal_manifest.name
    expected_map_dir = common.portal_map_output_dir(base_dir, name, 0)
    expected_reduce_dir = common.portal_reduce_output_dir(base_dir, name, 0)
    self.assertEqual(reduce_task.partition_id, partition_id)
    self.assertEqual(reduce_task.map_base_dir, expected_map_dir)
    self.assertEqual(reduce_task.reduce_base_dir, expected_reduce_dir)
def _check_map_task(self, map_task, fnames, partition_id, portal_manifest):
    """Assert that *map_task* contains exactly the input files routed to
    *partition_id* (wildcard-matched, hash-partitioned, sorted order) and
    the expected output settings. Sorts *fnames* in place."""
    partition_num = map_task.output_partition_num
    self.assertEqual(partition_num, portal_manifest.output_partition_num)
    fnames.sort()
    expected_fpaths = []
    for fname in fnames:
        if not fnmatch(fname, portal_manifest.input_file_wildcard):
            continue
        # NOTE(review): builtin hash() on str is randomized per process in
        # Python 3 unless PYTHONHASHSEED is fixed — presumably the code
        # under test partitions identically in-process; verify.
        if hash(fname) % partition_num != partition_id:
            continue
        expected_fpaths.append(
                os.path.join(portal_manifest.input_base_dir, fname)
            )
    self.assertEqual(len(expected_fpaths), len(map_task.fpaths))
    for expected, actual in zip(expected_fpaths, map_task.fpaths):
        self.assertEqual(expected, actual)
    self.assertEqual(
            map_task.output_base_dir,
            common.portal_map_output_dir(portal_manifest.output_base_dir, 0)
        )
def _publish_raw_data(self, job_id):
    """Publish every output partition of job *job_id* through the publisher.

    PSI jobs read their output from the map directory and are additionally
    marked finished per partition; other jobs read the reduce directory.

    Args:
        job_id: identifier of the portal job whose output is published.
    """
    portal_manifest = self._sync_portal_manifest()
    if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
        output_dir = common.portal_map_output_dir(
                portal_manifest.output_base_dir,
                portal_manifest.name, job_id
            )
    else:
        output_dir = common.portal_reduce_output_dir(
                portal_manifest.output_base_dir,
                portal_manifest.name, job_id
            )
    for partition_id in range(self._output_partition_num):
        dpath = path.join(output_dir, common.partition_repr(partition_id))
        # Fix: guard against a missing/non-directory partition path (a
        # partition may have produced no output) — gfile.ListDirectory
        # raises on a nonexistent directory. Mirrors the sibling
        # _publish_raw_data variant; an empty fpaths list is still
        # published so per-partition bookkeeping proceeds unchanged.
        fpaths = []
        if gfile.Exists(dpath) and gfile.IsDirectory(dpath):
            fpaths = [path.join(dpath, f)
                      for f in gfile.ListDirectory(dpath)
                      if f.endswith(common.RawDataFileSuffix)]
        self._publisher.publish_raw_data(partition_id, fpaths)
        if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
            self._publisher.finish_raw_data(partition_id)
def _publish_raw_data(self, job_id):
    """Publish each partition's output of finished job *job_id* via the
    type-specific publish helper (PSI vs. streaming)."""
    manifest = self._sync_portal_manifest()
    is_psi = manifest.data_portal_type == dp_pb.DataPortalType.PSI
    # PSI output comes from the map stage, streaming from the reduce stage.
    if is_psi:
        output_dir = common.portal_map_output_dir(
                manifest.output_base_dir, job_id
            )
    else:
        output_dir = common.portal_reduce_output_dir(
                manifest.output_base_dir, job_id
            )
    for partition_id in range(self._output_partition_num):
        dpath = path.join(output_dir, common.partition_repr(partition_id))
        fnames = []
        # Skip listing when the partition directory was never created.
        if gfile.Exists(dpath) and gfile.IsDirectory(dpath):
            for fname in gfile.ListDirectory(dpath):
                if fname.endswith(common.RawDataFileSuffix):
                    fnames.append(fname)
        if is_psi:
            self._publish_psi_raw_data(partition_id, dpath, fnames)
        else:
            self._publish_streaming_raw_data(partition_id, dpath, fnames)
def _map_output_dir(self, job_id):
    """Return the map-stage output directory for job *job_id*."""
    base_dir = self._portal_manifest.output_base_dir
    return common.portal_map_output_dir(base_dir, job_id)