Exemple #1
0
 def _make_partitioner_options(self, task):
     """Build the RawDataPartitionerOptions message for ``task``.

     Args:
         task: object supplying ``fpaths``, ``output_base_dir``,
             ``output_partition_num`` and ``partition_id``.

     Returns:
         A ``dj_pb.RawDataPartitionerOptions`` whose name embeds this
         worker's ``_rank_id`` and whose batch processor, raw data and
         writer options are forwarded from ``self._options``.
     """
     # Gather every field up front so the message is created in one call.
     fields = {
         "partitioner_name":
             "dp_worker_partitioner_{}".format(self._rank_id),
         "input_file_paths": task.fpaths,
         "output_dir": task.output_base_dir,
         "output_partition_num": task.output_partition_num,
         # The partitioner rank id is the task's partition id, not the
         # worker's own rank id used in the name above.
         "partitioner_rank_id": task.partition_id,
         "batch_processor_options": self._options.batch_processor_options,
         "raw_data_options": self._options.raw_data_options,
         "writer_options": self._options.writer_options,
     }
     return dj_pb.RawDataPartitionerOptions(**fields)
Exemple #2
0
 def _make_partitioner_options(self, task):
     """Build the RawDataPartitionerOptions message for ``task``.

     Args:
         task: object supplying ``task_name``, ``fpaths``,
             ``output_base_dir``, ``output_partition_num`` and
             ``partition_id``.

     Returns:
         A ``dj_pb.RawDataPartitionerOptions`` named
         ``<task_name>-rank_<rank id>`` whose batch processor, raw data,
         writer options and memory limit ratio are forwarded from
         ``self._options``.
     """
     # Gather every field up front so the message is created in one call.
     fields = {
         "partitioner_name":
             "{}-rank_{}".format(task.task_name, self._rank_id),
         "input_file_paths": task.fpaths,
         "output_dir": task.output_base_dir,
         "output_partition_num": task.output_partition_num,
         # The partitioner rank id is the task's partition id, not the
         # worker's own rank id used in the name above.
         "partitioner_rank_id": task.partition_id,
         "batch_processor_options": self._options.batch_processor_options,
         "raw_data_options": self._options.raw_data_options,
         "writer_options": self._options.writer_options,
         "memory_limit_ratio": self._options.memory_limit_ratio,
     }
     return dj_pb.RawDataPartitionerOptions(**fields)
Exemple #3
0
          if CityHash32(os.path.basename(fpath)) %  partitioner_num == \
                  args.partitioner_rank_id]
     logging.info("Partitioner of rank id %d will process %d/%d "\
                  "input files", args.partitioner_rank_id,
                  len(all_fpaths), origin_file_num)
 partitioner_options = dj_pb.RawDataPartitionerOptions(
         partitioner_name=args.partitioner_name,
         input_file_paths=all_fpaths,
         output_dir=args.output_dir,
         output_partition_num=args.output_partition_num,
         raw_data_options=dj_pb.RawDataOptions(
             raw_data_iter=args.raw_data_iter,
             compressed_type=args.compressed_type,
             read_ahead_size=args.read_ahead_size,
             read_batch_size=args.read_batch_size
         ),
         writer_options=dj_pb.WriterOptions(
             output_writer=args.output_builder,
             compressed_type=args.builder_compressed_type,
         ),
         partitioner_rank_id=args.partitioner_rank_id,
         batch_processor_options=dj_pb.BatchProcessorOptions(
             batch_size=4096,
             max_flying_item=-1
         )
     )
 partitioner = RawDataPartitioner(partitioner_options, args.part_field,
                                  args.etcd_name, args.etcd_addrs,
                                  args.etcd_base_dir)
 logging.info("RawDataPartitioner %s of rank %d launched",
              partitioner_options.partitioner_name,
Exemple #4
0
 if args.total_partitioner_num > 1:
     rest_fpaths = [fpath for (index, fpath) in enumerate(all_fpaths)
                    if index % args.total_partitioner_num == \
                            args.partitioner_rank_id]
     logging.info("Partitioner of rank id %d will process %d/%d "\
                  "input files", args.partitioner_rank_id,
                  len(rest_fpaths), len(all_fpaths))
     all_fpaths = rest_fpaths
 partitioner_options = dj_pb.RawDataPartitionerOptions(
     partitioner_name=args.partitioner_name,
     input_file_paths=list(set(all_fpaths)),
     output_dir=args.output_dir,
     output_partition_num=args.output_partition_num,
     raw_data_options=dj_pb.RawDataOptions(
         raw_data_iter=args.raw_data_iter,
         compressed_type=args.compressed_type,
         read_ahead_size=args.read_ahead_size),
     output_builder=args.output_builder,
     output_item_threshold=args.output_item_threshold,
     partitioner_rank_id=args.partitioner_rank_id,
     batch_processor_options=dj_pb.BatchProcessorOptions(
         batch_size=args.raw_data_batch_size,
         max_flying_item=args.max_flying_raw_data))
 partitioner = RawDataPartitioner(partitioner_options, args.etcd_name,
                                  args.etcd_addrs, args.etcd_base_dir)
 logging.info("RawDataPartitioner %s of rank %d launched",
              partitioner_options.partitioner_name,
              partitioner_options.partitioner_rank_id)
 partitioner.start_process()
 partitioner.wait_for_finished()
 logging.info("RawDataPartitioner %s of rank %d finished",