# Example 1
def _get_input_dimensions(known_args, pipeline_args):
    """Estimate input sizes with a small Beam pipeline and record them on known_args.

    Runs an estimation pipeline over ``known_args.all_patterns``, writes the
    five resulting numbers to a temp file, reads them back, and attaches them
    to ``known_args`` as ``estimated_variant_count``, ``estimated_sample_count``,
    ``estimated_value_count``, ``files_size`` and ``file_count``.

    Raises:
      ValueError: if the temp file does not contain exactly 5 lines.
    """
    mode = pipeline_common.get_pipeline_mode(known_args.all_patterns)
    options = pipeline_options.PipelineOptions(pipeline_args)
    cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)

    # Make the estimation job name unique, appending to any user-provided one.
    job_suffix = pipeline_common.generate_unique_name(_ESTIMATE_SIZES_JOB_NAME)
    cloud_options.job_name = (
        cloud_options.job_name + '-' + job_suffix
        if cloud_options.job_name else job_suffix)

    # The estimates are exchanged through a file so they survive pipeline exit.
    output_dir = cloud_options.temp_location or tempfile.mkdtemp()
    output_path = filesystems.FileSystems.join(
        output_dir,
        '-'.join([cloud_options.job_name, _ESTIMATE_SIZES_FILE_NAME]))

    with beam.Pipeline(options=options) as p:
        estimates = pipeline_common.get_estimates(
            p, mode, known_args.all_patterns)

        files_size = (
            estimates | 'GetFilesSize' >> extract_input_size.GetFilesSize())
        file_count = (
            estimates | 'CountAllFiles' >> beam.combiners.Count.Globally())
        sample_map = (
            estimates
            | 'ExtractSampleMap' >> extract_input_size.GetSampleMap())
        value_count = (
            sample_map | extract_input_size.GetEstimatedValueCount())
        sample_count = (
            sample_map | extract_input_size.GetEstimatedSampleCount())
        variant_count = (
            estimates
            | 'GetEstimatedVariantCount'
            >> extract_input_size.GetEstimatedVariantCount())
        # Fan the remaining estimates in as singleton side inputs and write
        # all five values to the temp file.
        _ = (
            variant_count
            | beam.ParDo(
                extract_input_size.print_estimates_to_file,
                beam.pvalue.AsSingleton(sample_count),
                beam.pvalue.AsSingleton(value_count),
                beam.pvalue.AsSingleton(files_size),
                beam.pvalue.AsSingleton(file_count),
                output_path))

    # Read the five newline-separated estimates back in the order written.
    with filesystems.FileSystems.open(output_path) as f:
        lines = f.readlines()
    if len(lines) != 5:
        raise ValueError('Exactly 5 estimates were expected in {}.'.format(
            output_path))

    (known_args.estimated_variant_count,
     known_args.estimated_sample_count,
     known_args.estimated_value_count,
     known_args.files_size,
     known_args.file_count) = [int(line.strip()) for line in lines]
# Example 2
    def test_get_estimated_value_count(self):
        """GetEstimatedValueCount over the sample map should yield [71]."""
        sample_map = self._create_sample_map()

        pipeline = TestPipeline()
        result = (
            pipeline
            | transforms.Create(sample_map)
            | 'GetEstimatedValueCount'
            >> extract_input_size.GetEstimatedValueCount())
        assert_that(result, equal_to([71]))
        pipeline.run()