Example #1
0
 def _run_map_task(self, task):
     """Partition the input of one map task and wait for it to finish.

     A Streaming task is handled by RawDataSortPartitioner, a PSI task by
     RawDataPartitioner; any other data portal type trips the assertion.
     After the partitioner has finished, the local reference to it is
     dropped and a garbage collection pass is requested explicitly.

     Args:
         task: the map task. Its `data_portal_type` and `part_field` are
             read here, and it is handed to `_make_partitioner_options`.
     """
     options = self._make_partitioner_options(task)

     # Choose the partitioner class and its log label from the portal type.
     if task.data_portal_type == dp_pb.DataPortalType.Streaming:
         partitioner_cls, kind = RawDataSortPartitioner, 'streaming'
     else:
         assert task.data_portal_type == dp_pb.DataPortalType.PSI
         partitioner_cls, kind = RawDataPartitioner, 'psi'

     # Both partitioner classes are built from the same four arguments.
     worker = partitioner_cls(options, task.part_field,
                              self._kvstore_type, self._use_mock_etcd)

     start_msg = ("Partitioner rank_id-[%d] start run task %s of type %s "
                  "for partition %d, input %d files")
     logging.info(start_msg, self._rank_id, options.partitioner_name, kind,
                  options.partitioner_rank_id,
                  len(options.input_file_paths))

     worker.start_process()
     worker.wait_for_finished()

     finish_msg = ("Partitioner rank_id-[%d] finish run partition task %s "
                   "for partition %d.")
     logging.info(finish_msg, self._rank_id, options.partitioner_name,
                  options.partitioner_rank_id)

     # Drop the only local reference before asking for a collection.
     del worker
     gc.collect()
Example #2
0
 def finish(self):
     """Write the buffered items to an output file and reset the buffer.

     When the buffer holds items they are sorted via `_sort_buffer()`,
     written through the writer returned by `_get_output_writer()`, and the
     temporary file is renamed to a name derived from the file meta inside
     the partition's sub-directory of the output dir. The buffer and the
     begin/end indices are then cleared.

     Returns:
         The RawDataPartitioner.FileMeta built for the written file, or
         None when the buffer was empty and nothing was written.
     """
     if not self._buffer:
         return None

     writer = self._get_output_writer()
     try:
         self._sort_buffer()
         for item in self._buffer:
             writer.write_item(item)
     finally:
         # Close the writer even if sorting or writing fails, so the
         # writer is never left open on the error path.
         writer.close()

     meta = RawDataPartitioner.FileMeta(
         self._options.partitioner_rank_id, self._process_index,
         self._begin_index, self._end_index)
     fpath = os.path.join(self._options.output_dir,
                          common.partition_repr(self._partition_id),
                          meta.encode_meta_to_fname())
     # NOTE(review): the third argument is presumably gfile's overwrite
     # flag -- confirm against the gfile module imported by this file.
     gfile.Rename(self.get_tmp_fpath(), fpath, True)

     # Reset the state so later items start from a clean buffer.
     self._buffer = []
     self._begin_index = None
     self._end_index = None
     return meta
Example #3
0
    # Collect every partitioner setting into one RawDataPartitionerOptions
    # object; most values are copied straight from attributes of `args`.
    partitioner_options = dj_pb.RawDataPartitionerOptions(
        partitioner_name=args.partitioner_name,
        input_file_paths=all_fpaths,
        output_dir=args.output_dir,
        output_partition_num=args.output_partition_num,
        raw_data_options=dj_pb.RawDataOptions(
            raw_data_iter=args.raw_data_iter,
            compressed_type=args.compressed_type,
            read_ahead_size=args.read_ahead_size,
            read_batch_size=args.read_batch_size),
        writer_options=dj_pb.WriterOptions(
            output_writer=args.output_builder,
            compressed_type=args.builder_compressed_type,
        ),
        partitioner_rank_id=args.partitioner_rank_id,
        # The batch processor values are fixed here, not read from args.
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=4096, max_flying_item=-1))
    # Look up the kvstore settings for the kvstore type given in args.
    # NOTE(review): the values are unpacked as (database, addr, username,
    # password, base_dir) but passed below as (database, base_dir, addr,
    # username, password) -- confirm against the RawDataPartitioner signature.
    db_database, db_addr, db_username, db_password, db_base_dir = \
        get_kvstore_config(args.kvstore_type)
    partitioner = RawDataPartitioner(partitioner_options, args.part_field,
                                     db_database, db_base_dir, db_addr,
                                     db_username, db_password)
    logging.info("RawDataPartitioner %s of rank %d launched",
                 partitioner_options.partitioner_name,
                 partitioner_options.partitioner_rank_id)
    # Start the partitioner, then call wait_for_finished() before the
    # "finished" message is logged.
    partitioner.start_process()
    partitioner.wait_for_finished()
    logging.info("RawDataPartitioner %s of rank %d finished",
                 partitioner_options.partitioner_name,
                 partitioner_options.partitioner_rank_id)
Example #4
0
            input_file_paths=all_fpaths,
            output_dir=args.output_dir,
            output_partition_num=args.output_partition_num,
            raw_data_options=dj_pb.RawDataOptions(
                raw_data_iter=args.raw_data_iter,
                compressed_type=args.compressed_type,
                read_ahead_size=args.read_ahead_size,
                read_batch_size=args.read_batch_size
            ),
            writer_options=dj_pb.WriterOptions(
                output_writer=args.output_builder,
                compressed_type=args.builder_compressed_type,
            ),
            partitioner_rank_id=args.partitioner_rank_id,
            batch_processor_options=dj_pb.BatchProcessorOptions(
                batch_size=4096,
                max_flying_item=-1
            )
        )
    # Build the partitioner from the options object, the partition field and
    # the etcd name / addresses / base dir taken from args.
    partitioner = RawDataPartitioner(partitioner_options, args.part_field,
                                     args.etcd_name, args.etcd_addrs,
                                     args.etcd_base_dir)
    logging.info("RawDataPartitioner %s of rank %d launched",
                 partitioner_options.partitioner_name,
                 partitioner_options.partitioner_rank_id)
    # Start the partitioner, then call wait_for_finished() before the
    # "finished" message is logged.
    partitioner.start_process()
    partitioner.wait_for_finished()
    logging.info("RawDataPartitioner %s of rank %d finished",
                 partitioner_options.partitioner_name,
                 partitioner_options.partitioner_rank_id)
Example #5
0
                     "input files", args.partitioner_rank_id,
                     len(all_fpaths), origin_file_num)
    # Collect every partitioner setting into one RawDataPartitionerOptions
    # object; most values are copied straight from attributes of `args`.
    partitioner_options = dj_pb.RawDataPartitionerOptions(
        partitioner_name=args.partitioner_name,
        input_file_paths=all_fpaths,
        output_dir=args.output_dir,
        output_partition_num=args.output_partition_num,
        raw_data_options=dj_pb.RawDataOptions(
            raw_data_iter=args.raw_data_iter,
            compressed_type=args.compressed_type,
            read_ahead_size=args.read_ahead_size,
            read_batch_size=args.read_batch_size),
        writer_options=dj_pb.WriterOptions(
            output_writer=args.output_builder,
            compressed_type=args.builder_compressed_type,
        ),
        partitioner_rank_id=args.partitioner_rank_id,
        # The batch processor values are fixed here, not read from args.
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=4096, max_flying_item=-1),
        # The argument value is divided by 100 before being stored
        # (presumably a percentage turned into a fraction -- TODO confirm).
        memory_limit_ratio=args.memory_limit_ratio / 100)
    # Only the kvstore type is passed to the partitioner here.
    partitioner = RawDataPartitioner(partitioner_options, args.part_field,
                                     args.kvstore_type)
    logging.info("RawDataPartitioner %s of rank %d launched",
                 partitioner_options.partitioner_name,
                 partitioner_options.partitioner_rank_id)
    # Start the partitioner, then call wait_for_finished() before the
    # "finished" message is logged.
    partitioner.start_process()
    partitioner.wait_for_finished()
    logging.info("RawDataPartitioner %s of rank %d finished",
                 partitioner_options.partitioner_name,
                 partitioner_options.partitioner_rank_id)