def test_dataflow_worker_jar_flag_adds_use_staged_worker_jar_experiment(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--experiment=beam_fn_api')
    self.default_properties.append('--dataflow_worker_jar=test.jar')

    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

    experiments_for_job = (
        remote_runner.job.options.view_as(DebugOptions).experiments)
    self.assertIn('beam_fn_api', experiments_for_job)
    self.assertIn('use_staged_dataflow_worker_jar', experiments_for_job)
Example #2
  def test_remote_runner_display_data(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
      def __init__(self, fn, now):
        super(SpecialParDo, self).__init__(fn)
        self.fn = fn
        self.now = now

      # Expose the wrapped DoFn, the class and the timestamp as display data.
      def display_data(self):
        return {'asubcomponent': self.fn,
                'a_class': SpecialParDo,
                'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

      def process(self, unused_element):
        pass

    now = datetime.now()
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    remote_runner.job = apiclient.Job(p.options)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    steps = [step
             for step in job_dict['steps']
             if len(step['properties'].get('display_data', [])) > 0]
    step = steps[0]
    disp_data = step['properties']['display_data']
    disp_data = sorted(disp_data, key=lambda x: x['namespace']+x['key'])
    nspace = SpecialParDo.__module__ + '.'
    expected_data = [{'type': 'TIMESTAMP', 'namespace': nspace+'SpecialParDo',
                      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                      'key': 'a_time'},
                     {'type': 'STRING', 'namespace': nspace+'SpecialParDo',
                      'value': nspace+'SpecialParDo', 'key': 'a_class',
                      'shortValue': 'SpecialParDo'},
                     {'type': 'INTEGER', 'namespace': nspace+'SpecialDoFn',
                      'value': 42, 'key': 'dofn_value'}]
    expected_data = sorted(expected_data, key=lambda x: x['namespace']+x['key'])
    self.assertEqual(len(disp_data), 3)
    self.assertEqual(disp_data, expected_data)
Example #3
    def test_use_fastavro_experiment_is_not_added_when_use_avro_is_present(
            self):
        remote_runner = DataflowRunner()
        self.default_properties.append('--experiment=use_avro')

        with Pipeline(remote_runner,
                      PipelineOptions(self.default_properties)) as p:
            p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

        debug_options = remote_runner.job.options.view_as(DebugOptions)

        self.assertFalse(debug_options.lookup_experiment(
            'use_fastavro', False))
Example #4
    def test_dataflow_worker_jar_flag_non_fnapi_noop(self):
        remote_runner = DataflowRunner()
        self.default_properties.append('--experiment=some_other_experiment')
        self.default_properties.append('--dataflow_worker_jar=test.jar')

        p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
        p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        p.run()

        experiments_for_job = (
            remote_runner.job.options.view_as(DebugOptions).experiments)
        self.assertIn('some_other_experiment', experiments_for_job)
        self.assertNotIn('use_staged_dataflow_worker_jar', experiments_for_job)
Example #5
  def test_direct_runner_metrics(self):
    from apache_beam.metrics.metric import Metrics

    class MyDoFn(beam.DoFn):
      def start_bundle(self):
        count = Metrics.counter(self.__class__, 'bundles')
        count.inc()

      def finish_bundle(self):
        count = Metrics.counter(self.__class__, 'finished_bundles')
        count.inc()

      def process(self, element):
        count = Metrics.counter(self.__class__, 'elements')
        count.inc()
        distro = Metrics.distribution(self.__class__, 'element_dist')
        distro.update(element)
        return [element]

    runner = DirectRunner()
    p = Pipeline(runner,
                 options=PipelineOptions(self.default_properties))
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> beam.ParDo(MyDoFn()))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__,
                               MyDoFn.__name__)

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'elements')),
                5, 5),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'bundles')),
                1, 1),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                1, 1)))
    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
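            # DistributionData holds (sum, count, min, max); for the
            # elements 1..5 that is (15, 5, 1, 5).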
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'element_dist')),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))
Example #6
    def test_streaming_create_translation(self):
        remote_runner = DataflowRunner()
        self.default_properties.append("--streaming")
        p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
        p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        p.run()
        job_dict = json.loads(str(remote_runner.job))
        self.assertEqual(len(job_dict[u'steps']), 2)

        self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
        self.assertEqual(
            job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
            '_starting_signal/')
        self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #7
  def test_streaming_engine_flag_adds_windmill_experiments(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--streaming')
    self.default_properties.append('--enable_streaming_engine')
    self.default_properties.append('--experiment=some_other_experiment')

    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

    experiments_for_job = (
        remote_runner.job.options.view_as(DebugOptions).experiments)
    self.assertIn('enable_streaming_engine', experiments_for_job)
    self.assertIn('enable_windmill_service', experiments_for_job)
    self.assertIn('some_other_experiment', experiments_for_job)
Example #8
  def test_environment_override_translation(self):
    self.default_properties.append('--experiments=beam_fn_api')
    self.default_properties.append('--worker_harness_container_image=FOO')
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))
    (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
     | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
     | ptransform.GroupByKey())
    p.run()
    self.assertEqual(
        list(remote_runner.proto_pipeline.components.environments.values()),
        [beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=beam_runner_api_pb2.DockerPayload(
                container_image='FOO').SerializeToString())])
Example #9
  def test_streaming_create_translation(self):
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    remote_runner.job = apiclient.Job(p._options)
    # Performing configured PTransform overrides here.
    p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #10
    def test_call_names_combiner_pipeline(self):
        call_names = ['sample1', 'sample2', 'sample3']
        variant_calls = [
            vcfio.VariantCall(name=call_names[0]),
            vcfio.VariantCall(name=call_names[1]),
            vcfio.VariantCall(name=call_names[2])
        ]
        variants = [
            vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
            vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
        ]

        pipeline = TestPipeline()
        combined_call_names = (
            pipeline
            | transforms.Create(variants)
            | 'CombineCallNames' >> combine_call_names.CallNamesCombiner())
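        # The combiner emits a single element: the list of all call names.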
        assert_that(combined_call_names, equal_to([call_names]))
        pipeline.run()
Example #11
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        now = datetime.now()
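        # SpecialParDo and SpecialDoFn correspond to the helper classes
        # defined in Example #2 above.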
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
        # this test.
        p.run(test_runner_api=False)
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[1]
        disp_data = step['properties']['display_data']
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type': 'TIMESTAMP',
            'namespace': nspace + 'SpecialParDo',
            'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key': 'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        self.assertUnhashableCountEqual(disp_data, expected_data)
Example #12
  def test_convert_sample_info_to_row(self, mocked_obj):
    vcf_header_1 = vcf_header_io.VcfHeader(
        samples=SAMPLE_LINE, file_path='gs://bucket1/dir1/file1.vcf')
    vcf_header_2 = vcf_header_io.VcfHeader(
        samples=SAMPLE_LINE, file_path='gs://bucket1/dir1/file2.vcf')
    current_minute = mocked_obj()

    expected_rows = [
        {sample_info_table_schema_generator.SAMPLE_ID: 7715696391291253656,
         sample_info_table_schema_generator.SAMPLE_NAME: (
             'gs___bucket1_dir1_file1_vcf_Sample1'),
         sample_info_table_schema_generator.FILE_PATH: (
             'gs://bucket1/dir1/file1.vcf'),
         sample_info_table_schema_generator.INGESTION_DATETIME: current_minute},
        {sample_info_table_schema_generator.SAMPLE_ID: 5682150464643626236,
         sample_info_table_schema_generator.SAMPLE_NAME: (
             'gs___bucket1_dir1_file1_vcf_Sample2'),
         sample_info_table_schema_generator.FILE_PATH: (
             'gs://bucket1/dir1/file1.vcf'),
         sample_info_table_schema_generator.INGESTION_DATETIME: current_minute},
        {sample_info_table_schema_generator.SAMPLE_ID: 668336000922978678,
         sample_info_table_schema_generator.SAMPLE_NAME: (
             'gs___bucket1_dir1_file2_vcf_Sample1'),
         sample_info_table_schema_generator.FILE_PATH: (
             'gs://bucket1/dir1/file2.vcf'),
         sample_info_table_schema_generator.INGESTION_DATETIME: current_minute},
        {sample_info_table_schema_generator.SAMPLE_ID: 5498327443813165683,
         sample_info_table_schema_generator.SAMPLE_NAME: (
             'gs___bucket1_dir1_file2_vcf_Sample2'),
         sample_info_table_schema_generator.FILE_PATH: (
             'gs://bucket1/dir1/file2.vcf'),
         sample_info_table_schema_generator.INGESTION_DATETIME: current_minute},
    ]
    pipeline = test_pipeline.TestPipeline()
    bigquery_rows = (
        pipeline
        | transforms.Create([vcf_header_1, vcf_header_2])
        | 'ConvertToRow'
        >> transforms.ParDo(sample_info_to_bigquery.ConvertSampleInfoToRow(
            SampleNameEncoding.WITH_FILE_PATH), ))

    assert_that(bigquery_rows, equal_to(expected_rows))
    pipeline.run()
Example #13
    def test_call_names_combiner_pipeline_preserve_call_names_order_error(
            self):
        call_names = ['sample1', 'sample2', 'sample3']
        variant_calls = [
            vcfio.VariantCall(name=call_names[0]),
            vcfio.VariantCall(name=call_names[1]),
            vcfio.VariantCall(name=call_names[2])
        ]
        variants = [
            vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
            vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
        ]

        pipeline = TestPipeline()
        _ = (pipeline
             | transforms.Create(variants)
             | 'CombineCallNames' >> combine_call_names.CallNamesCombiner(
                 preserve_call_names_order=True))
        with self.assertRaises(ValueError):
            pipeline.run()
Example #14
    def test_sample_ids_combiner_pipeline(self):
        sample_ids = [
            hash_name('sample3'),
            hash_name('sample2'),
            hash_name('sample1')
        ]
        variant_calls = [
            vcfio.VariantCall(sample_id=sample_ids[0]),
            vcfio.VariantCall(sample_id=sample_ids[1]),
            vcfio.VariantCall(sample_id=sample_ids[2])
        ]
        variants = [
            vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
            vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
        ]

        pipeline = TestPipeline()
        combined_sample_ids = (
            pipeline
            | transforms.Create(variants)
            | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner()
            | combiners.ToList())
        assert_that(combined_sample_ids, equal_to([sample_ids]))
        pipeline.run()
Example #15
    def test_sample_ids_combiner_pipeline_preserve_sample_order_error(self):
        sample_ids = [
            hash_name('sample1'),
            hash_name('sample2'),
            hash_name('sample3')
        ]
        variant_calls = [
            vcfio.VariantCall(sample_id=sample_ids[0]),
            vcfio.VariantCall(sample_id=sample_ids[1]),
            vcfio.VariantCall(sample_id=sample_ids[2])
        ]
        variants = [
            vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
            vcfio.Variant(calls=[variant_calls[1], variant_calls[2]])
        ]

        pipeline = TestPipeline()
        _ = (pipeline
             | transforms.Create(variants)
             | 'CombineSampleIds' >>
             combine_sample_ids.SampleIdsCombiner(preserve_sample_order=True)
             | combiners.ToList())
        with self.assertRaises(ValueError):
            pipeline.run()
Example #16
def _bigquery_to_vcf_shards(
        known_args,  # type: argparse.Namespace
        beam_pipeline_options,  # type: pipeline_options.PipelineOptions
        vcf_data_temp_folder,  # type: str
        header_file_path,  # type: str
):
    # type: (...) -> None
    """Runs BigQuery to VCF shards pipelines.

  It reads the variants from BigQuery table, groups a collection of variants
  within a contiguous region of the genome (the size of the collection is
  adjustable through flag `--number_of_bases_per_shard`), sorts them, and then
  writes to one VCF file. All VCF data files are saved in
  `vcf_data_temp_folder`.

  Also, it writes the meta info and data header with the sample names to
  `vcf_header_file_path`.
  """
    schema = _get_schema(known_args.input_table)
    variant_query = _get_variant_query(known_args, schema)
    logging.info('Processing BigQuery query: %s', variant_query)
    project_id, dataset_id, table_id = bigquery_util.parse_table_reference(
        known_args.input_table)
    bq_variant_source = bigquery.BigQuerySource(query=variant_query,
                                                validate=True,
                                                use_standard_sql=True)
    annotation_names = _extract_annotation_names(schema)

    base_table_id = bigquery_util.get_table_base_name(table_id)
    sample_query = _SAMPLE_INFO_QUERY_TEMPLATE.format(
        PROJECT_ID=project_id,
        DATASET_ID=dataset_id,
        TABLE_NAME=bigquery_util.compose_table_name(base_table_id,
                                                    SAMPLE_INFO_TABLE_SUFFIX))
    bq_sample_source = bigquery.BigQuerySource(query=sample_query,
                                               validate=True,
                                               use_standard_sql=True)
    with beam.Pipeline(options=beam_pipeline_options) as p:
        variants = (p
                    | 'ReadFromBigQuery ' >> beam.io.Read(bq_variant_source)
                    | bigquery_to_variant.BigQueryToVariant(annotation_names))
        sample_table_rows = (
            p
            | 'ReadFromSampleTable' >> beam.io.Read(bq_sample_source))
        if known_args.sample_names:
            temp_sample_names = (p
                                 | transforms.Create(known_args.sample_names,
                                                     reshuffle=False))
        else:
            # Get sample names from sample IDs in the variants and sort.
            id_to_name_hash_table = (sample_table_rows
                                     | 'SampleIdToNameDict' >>
                                     sample_mapping_table.SampleIdToNameDict())
            temp_sample_ids = (
                variants
                | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
                    known_args.preserve_sample_order))
            temp_sample_names = (
                temp_sample_ids
                | 'GetSampleNames' >> sample_mapping_table.GetSampleNames(
                    beam.pvalue.AsSingleton(id_to_name_hash_table))
                | 'CombineToList' >> beam.combiners.ToList()
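                # beam.ParDo with a plain callable flattens the returned
                # iterable, so this re-emits the names from the single list
                # element one by one, in sorted order.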
                | 'SortSampleNames' >> beam.ParDo(sorted))

        name_to_id_hash_table = (
            sample_table_rows
            | 'SampleNameToIdDict' >>
            sample_mapping_table.SampleNameToIdDict())
        sample_ids = (temp_sample_names
                      | 'GetSampleIds' >> sample_mapping_table.GetSampleIds(
                          beam.pvalue.AsSingleton(name_to_id_hash_table))
                      | 'CombineSortedSampleIds' >> beam.combiners.ToList())
        sample_names = temp_sample_names | beam.combiners.ToList()

        _ = (sample_names
             | 'GenerateVcfDataHeader' >> beam.ParDo(
                 _write_vcf_header_with_sample_names, _VCF_FIXED_COLUMNS,
                 known_args.representative_header_file, header_file_path))

        _ = (variants
             | densify_variants.DensifyVariants(
                 beam.pvalue.AsSingleton(sample_ids))
             | 'PairVariantWithKey' >> beam.Map(
                 _pair_variant_with_key, known_args.number_of_bases_per_shard)
             | 'GroupVariantsByKey' >> beam.GroupByKey()
             | beam.ParDo(_get_file_path_and_sorted_variants,
                          vcf_data_temp_folder)
             | vcfio.WriteVcfDataLines(known_args.bq_uses_1_based_coordinate))
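
The helper functions _pair_variant_with_key, _get_file_path_and_sorted_variants,
and _write_vcf_header_with_sample_names are referenced above but not shown.
Purely as an illustration of the sharding step, below is a minimal sketch of
what a _pair_variant_with_key could look like; it assumes each variant exposes
reference_name and start attributes and keys the variant by the contiguous
region it starts in. It is a hypothetical stand-in, not the project's actual
implementation.

def _pair_variant_with_key(variant, number_of_bases_per_shard):
    # Hypothetical sketch: bucket the variant into the contiguous region of
    # `number_of_bases_per_shard` bases that it starts in, so the subsequent
    # GroupByKey collects all variants belonging to one VCF shard.
    region_index = variant.start // number_of_bases_per_shard
    return ((variant.reference_name, region_index), variant)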