def _make_partitioner_options(self, task):
    """Assemble the RawDataPartitionerOptions proto for *task*.

    The partitioner name embeds this worker's rank id so that
    concurrently running workers get distinct partitioner identities.
    Batch-processor, raw-data and writer options are copied straight
    from this worker's own options.

    NOTE(review): ``dj_pb`` and ``self._options`` come from the
    enclosing module/class, which is outside this view.
    """
    worker_opts = self._options
    partitioner_name = "dp_worker_partitioner_{}".format(self._rank_id)
    return dj_pb.RawDataPartitionerOptions(
        partitioner_name=partitioner_name,
        input_file_paths=task.fpaths,
        output_dir=task.output_base_dir,
        output_partition_num=task.output_partition_num,
        partitioner_rank_id=task.partition_id,
        batch_processor_options=worker_opts.batch_processor_options,
        raw_data_options=worker_opts.raw_data_options,
        writer_options=worker_opts.writer_options,
    )
def _make_partitioner_options(self, task):
    """Assemble the RawDataPartitionerOptions proto for *task*.

    The partitioner name is derived from the task name plus this
    worker's rank id, so identities stay unique across tasks and
    across concurrently running workers. All tuning knobs (batch
    processor, raw-data reader, writer, memory limit ratio) are
    forwarded from this worker's own options.

    NOTE(review): ``dj_pb`` and ``self._options`` come from the
    enclosing module/class, which is outside this view.
    """
    worker_opts = self._options
    partitioner_name = "{}-rank_{}".format(task.task_name, self._rank_id)
    return dj_pb.RawDataPartitionerOptions(
        partitioner_name=partitioner_name,
        input_file_paths=task.fpaths,
        output_dir=task.output_base_dir,
        output_partition_num=task.output_partition_num,
        partitioner_rank_id=task.partition_id,
        batch_processor_options=worker_opts.batch_processor_options,
        raw_data_options=worker_opts.raw_data_options,
        writer_options=worker_opts.writer_options,
        memory_limit_ratio=worker_opts.memory_limit_ratio,
    )
# NOTE(review): truncated fragment of a partitioner launcher script — it is
# cut mid-expression at BOTH ends (it opens inside a list comprehension that
# selects input files whose CityHash32(basename) % partitioner_num matches
# this rank, and it ends inside an unterminated logging.info call). Visible
# steps: log the selected-file count, build RawDataPartitionerOptions (with
# raw-data reader and writer sub-options and a hard-coded batch_size=4096,
# unbounded max_flying_item=-1), then construct a RawDataPartitioner with
# a partition field and etcd coordinates. Left byte-identical because the
# surrounding statements needed to restore valid syntax are outside this
# view — TODO: recover the full source before editing.
if CityHash32(os.path.basename(fpath)) % partitioner_num == \ args.partitioner_rank_id] logging.info("Partitioner of rank id %d will process %d/%d "\ "input files", args.partitioner_rank_id, len(all_fpaths), origin_file_num) partitioner_options = dj_pb.RawDataPartitionerOptions( partitioner_name=args.partitioner_name, input_file_paths=all_fpaths, output_dir=args.output_dir, output_partition_num=args.output_partition_num, raw_data_options=dj_pb.RawDataOptions( raw_data_iter=args.raw_data_iter, compressed_type=args.compressed_type, read_ahead_size=args.read_ahead_size, read_batch_size=args.read_batch_size ), writer_options=dj_pb.WriterOptions( output_writer=args.output_builder, compressed_type=args.builder_compressed_type, ), partitioner_rank_id=args.partitioner_rank_id, batch_processor_options=dj_pb.BatchProcessorOptions( batch_size=4096, max_flying_item=-1 ) ) partitioner = RawDataPartitioner(partitioner_options, args.part_field, args.etcd_name, args.etcd_addrs, args.etcd_base_dir) logging.info("RawDataPartitioner %s of rank %d launched", partitioner_options.partitioner_name,
# NOTE(review): truncated fragment of an (older-variant) partitioner
# launcher — it ends inside an unterminated logging.info call, so the tail
# of the statement is outside this view. Visible steps: when more than one
# partitioner runs, keep only the input files whose enumeration index
# modulo total_partitioner_num equals this rank; build
# RawDataPartitionerOptions (deduplicating paths via list(set(...)) —
# presumably order does not matter here, verify against the partitioner);
# construct a RawDataPartitioner with etcd coordinates; start it and block
# until finished. Left byte-identical — TODO: recover the full source
# before editing.
if args.total_partitioner_num > 1: rest_fpaths = [fpath for (index, fpath) in enumerate(all_fpaths) if index % args.total_partitioner_num == \ args.partitioner_rank_id] logging.info("Partitioner of rank id %d will process %d/%d "\ "input files", args.partitioner_rank_id, len(rest_fpaths), len(all_fpaths)) all_fpaths = rest_fpaths partitioner_options = dj_pb.RawDataPartitionerOptions( partitioner_name=args.partitioner_name, input_file_paths=list(set(all_fpaths)), output_dir=args.output_dir, output_partition_num=args.output_partition_num, raw_data_options=dj_pb.RawDataOptions( raw_data_iter=args.raw_data_iter, compressed_type=args.compressed_type, read_ahead_size=args.read_ahead_size), output_builder=args.output_builder, output_item_threshold=args.output_item_threshold, partitioner_rank_id=args.partitioner_rank_id, batch_processor_options=dj_pb.BatchProcessorOptions( batch_size=args.raw_data_batch_size, max_flying_item=args.max_flying_raw_data)) partitioner = RawDataPartitioner(partitioner_options, args.etcd_name, args.etcd_addrs, args.etcd_base_dir) logging.info("RawDataPartitioner %s of rank %d launched", partitioner_options.partitioner_name, partitioner_options.partitioner_rank_id) partitioner.start_process() partitioner.wait_for_finished() logging.info("RawDataPartitioner %s of rank %d finished",