def test_one_job_fails_all_jobs_fail(self):

    # If one of the import jobs fails, the other jobs must not be performed.
    # This avoids reinserting some records when a pipeline fails and is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project,
        self.dataset_id,
        output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None,
        None)
    self.bigquery_client.get_or_create_table(
        self.project,
        self.dataset_id,
        output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None,
        None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      # The pipeline below fails because neither a schema nor SCHEMA_AUTODETECT
      # is specified.
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (
            input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                table=lambda x:
                (output_table_1 if 'language' in x else output_table_2),
                create_disposition=(
                    beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                temp_file_format=bigquery_tools.FileFormat.JSON))

    hamcrest_assert(p, all_of(*pipeline_verifiers))
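The comment inside the test notes that the write fails because no schema is given. For comparison, here is a minimal sketch of a write that would not fail for that reason, assuming an explicit schema string is acceptable; the destination table is a placeholder and actually running this requires real GCP credentials.

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([{'name': 'beam', 'language': 'py'}])  # illustrative rows
      | beam.io.WriteToBigQuery(
          'my-project:my_dataset.my_table',  # placeholder destination
          schema='name:STRING,language:STRING',
          create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))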
Example #2
def has_expected_values(actual):
  from hamcrest.core import assert_that as hamcrest_assert
  from hamcrest.library.collection import contains
  from hamcrest.library.collection import only_contains
  ordered = sorted(actual)
  # Early firings.
  hamcrest_assert(ordered[:4], contains(1, 3, 6, 10))
  # Different runners produce a different number of 15s, but there should
  # be at least one 15.
  hamcrest_assert(ordered[4:], only_contains(15))
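assert_that from apache_beam.testing.util accepts any callable that raises on mismatch, so a checker like has_expected_values above can be passed to it directly. A self-contained sketch with made-up pane outputs:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that

with TestPipeline() as p:
  # Hypothetical early firings followed by the final sum of 15.
  firings = p | beam.Create([1, 3, 6, 10, 15])
  assert_that(firings, has_expected_values)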
    def check_many_files(output_pcs):
      dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]
      spilled_records_pc = output_pcs[
          bqfl.WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

      spilled_records_count = (spilled_records_pc |
                               beam.combiners.Count.Globally())
      assert_that(spilled_records_count, equal_to([3]), label='spilled count')

      files_per_dest = (dest_file_pc
                        | beam.Map(lambda x: x).with_output_types(
                            beam.typehints.KV[str, str])
                        | beam.combiners.Count.PerKey())
      files_per_dest = (
          files_per_dest
          | "GetDests" >> beam.Map(
              lambda x: (bigquery_tools.get_hashable_destination(x[0]),
                         x[1])))

      # Only table1 and table3 get files. table2 records get spilled.
      assert_that(files_per_dest,
                  equal_to([('project1:dataset1.table1', 1),
                            ('project1:dataset1.table3', 1)]),
                  label='file count')

      # Check that the files exist
      _ = dest_file_pc | beam.Map(lambda x: x[1]) | beam.Map(
          lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
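The checkers in this listing index a DoFn's tagged outputs by tag name (for example WRITTEN_FILE_TAG above). A small, self-contained illustration of that pattern, deliberately unrelated to BigQuery:

import apache_beam as beam
from apache_beam import pvalue
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

class SplitOddEven(beam.DoFn):
  ODD_TAG = 'odd'

  def process(self, element):
    if element % 2:
      yield pvalue.TaggedOutput(self.ODD_TAG, element)
    else:
      yield element  # untagged elements go to the main output

with TestPipeline() as p:
  outputs = (
      p
      | beam.Create([1, 2, 3, 4])
      | beam.ParDo(SplitOddEven()).with_outputs(
          SplitOddEven.ODD_TAG, main='even'))
  # Tagged outputs are addressed by name, just like the tags above.
  assert_that(outputs['even'], equal_to([2, 4]), label='CheckEven')
  assert_that(outputs[SplitOddEven.ODD_TAG], equal_to([1, 3]), label='CheckOdd')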
Example #4
        def check_many_files(output_pcs):
            dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]
            spilled_records_pc = output_pcs[
                bqfl.WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

            spilled_records_count = (spilled_records_pc
                                     | beam.combiners.Count.Globally())
            assert_that(spilled_records_count,
                        equal_to([3]),
                        label='spilled count')

            files_per_dest = (dest_file_pc
                              | beam.Map(lambda x: x).with_output_types(
                                  beam.typehints.KV[str, str])
                              | beam.combiners.Count.PerKey())
            files_per_dest = (
                files_per_dest
                | "GetDests" >> beam.Map(lambda x: (
                    bigquery_tools.get_hashable_destination(x[0]), x[1])))

            # Only table1 and table3 get files. table2 records get spilled.
            assert_that(files_per_dest,
                        equal_to([('project1:dataset1.table1', 1),
                                  ('project1:dataset1.table3', 1)]),
                        label='file count')

            # Check that the files exist
            _ = dest_file_pc | beam.Map(lambda x: x[1]) | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
Example #5
    def test_records_traverse_transform_with_mocks(self):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = bigquery_api.Job()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job

        bq_client.jobs.Insert.return_value = result_job

        transform = bqfl.BigQueryBatchFileLoads(
            destination,
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            coder=CustomRowCoder())

        # Need to test this with the DirectRunner to avoid serializing mocks
        with TestPipeline('DirectRunner') as p:
            outputs = p | beam.Create(_ELEMENTS) | transform

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_job = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

            jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> beam.combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())

            # All files exist
            _ = (files | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # One file per destination
            assert_that(files | beam.combiners.Count.Globally(),
                        equal_to([1]),
                        label='CountFiles')

            assert_that(destinations,
                        equal_to([destination]),
                        label='CheckDestinations')

            assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
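A side note on why the mocked client in the example above is sufficient (an illustration, not code from the test): mock.Mock() creates nested attributes on first access, so configuring jobs.Insert.return_value and jobs.Get.return_value is all the transform needs to "start" and "poll" jobs.

from unittest import mock

bq_client = mock.Mock()
bq_client.jobs.Insert.return_value = 'started-job'  # stand-in return values
bq_client.jobs.Get.return_value = 'done-job'

# Any arguments are accepted; the configured values come back unchanged.
assert bq_client.jobs.Insert(request='anything') == 'started-job'
assert bq_client.jobs.Get(projectId='p', jobId='j') == 'done-job'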
  def test_one_job_fails_all_jobs_fail(self):

    # If one of the import jobs fails, the other jobs must not be performed.
    # This avoids reinserting some records when a pipeline fails and is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None, None)
    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None, None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[])]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (input |
             "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                 table=lambda x: (output_table_1
                                  if 'language' in x
                                  else output_table_2),
                 create_disposition=(
                     beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    hamcrest_assert(p, all_of(*pipeline_verifiers))
  def test_records_traverse_transform_with_mocks(self):
    destination = 'project1:dataset1.table1'

    job_reference = bigquery_api.JobReference()
    job_reference.projectId = 'project1'
    job_reference.jobId = 'job_name1'
    result_job = bigquery_api.Job()
    result_job.jobReference = job_reference

    mock_job = mock.Mock()
    mock_job.status.state = 'DONE'
    mock_job.status.errorResult = None
    mock_job.jobReference = job_reference

    bq_client = mock.Mock()
    bq_client.jobs.Get.return_value = mock_job

    bq_client.jobs.Insert.return_value = result_job

    transform = bqfl.BigQueryBatchFileLoads(
        destination,
        custom_gcs_temp_location=self._new_tempdir(),
        test_client=bq_client,
        validate=False)

    # Need to test this with the DirectRunner to avoid serializing mocks
    with TestPipeline('DirectRunner') as p:
      outputs = p | beam.Create(_ELEMENTS) | transform

      dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
      dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

      jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

      files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
      destinations = (
          dest_files
          | "GetDests" >> beam.Map(
              lambda x: (
                  bigquery_tools.get_hashable_destination(x[0]), x[1]))
          | "GetUniques" >> beam.combiners.Count.PerKey()
          | "GetFinalDests" >>beam.Keys())

      # All files exist
      _ = (files | beam.Map(
          lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

      # One file per destination
      assert_that(files | beam.combiners.Count.Globally(),
                  equal_to([1]),
                  label='CountFiles')

      assert_that(destinations,
                  equal_to([destination]),
                  label='CheckDestinations')

      assert_that(jobs,
                  equal_to([job_reference]), label='CheckJobs')
    def check_files_created(output_pc):
      files = output_pc | "GetFiles" >> beam.Map(lambda x: x[1])
      file_count = files | "CountFiles" >> beam.combiners.Count.Globally()

      _ = files | "FilesExist" >> beam.Map(
          lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
      assert_that(file_count, equal_to([3]), label='check file count')

      destinations = output_pc | "GetDests" >> beam.Map(lambda x: x[0])
      assert_that(destinations, equal_to(list(_DISTINCT_DESTINATIONS)),
                  label='check destinations ')
    def check_many_files(output_pcs):
      dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

      files_per_dest = (dest_file_pc
                        | beam.Map(lambda x: x).with_output_types(
                            beam.typehints.KV[str, str])
                        | beam.combiners.Count.PerKey())
      assert_that(files_per_dest,
                  equal_to([('project1:dataset1.table1', 4),
                            ('project1:dataset1.table2', 2),
                            ('project1:dataset1.table3', 1)]))

      # Check that the files exist
      _ = dest_file_pc | beam.Map(lambda x: x[1]) | beam.Map(
          lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
    def check_files_created(output_pcs):
      dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

      files = dest_file_pc | "GetFiles" >> beam.Map(lambda x: x[1])
      file_count = files | "CountFiles" >> beam.combiners.Count.Globally()

      _ = files | "FilesExist" >> beam.Map(
          lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
      assert_that(file_count, equal_to([3]), label='check file count')

      destinations = (
          dest_file_pc
          | "GetDests" >> beam.Map(
              lambda x: bigquery_tools.get_hashable_destination(x[0])))
      assert_that(destinations, equal_to(list(_DISTINCT_DESTINATIONS)),
                  label='check destinations ')
    def check_files_created(output_pcs):
      dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

      files = dest_file_pc | "GetFiles" >> beam.Map(lambda x: x[1][0])
      file_count = files | "CountFiles" >> combiners.Count.Globally()

      _ = files | "FilesExist" >> beam.Map(
          lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
      assert_that(file_count, equal_to([3]), label='check file count')

      destinations = (
          dest_file_pc
          | "GetDests" >> beam.Map(
              lambda x: bigquery_tools.get_hashable_destination(x[0])))
      assert_that(destinations, equal_to(list(_DISTINCT_DESTINATIONS)),
                  label='check destinations ')
Example #14
        def check_many_files(output_pcs):
            dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

            files_per_dest = (dest_file_pc
                              | beam.Map(lambda x: x).with_output_types(
                                  beam.typehints.KV[str, Tuple[str, int]])
                              | combiners.Count.PerKey())
            files_per_dest = (
                files_per_dest
                | "GetDests" >> beam.Map(lambda x: (
                    bigquery_tools.get_hashable_destination(x[0]), x[1])))
            assert_that(
                files_per_dest,
                equal_to([('project1:dataset1.table1', 4),
                          ('project1:dataset1.table2', 2),
                          ('project1:dataset1.table3', 1)]))

            # Check that the files exist
            _ = dest_file_pc | beam.Map(lambda x: x[1][0]) | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
Example #15
    def _matches(actual):
        from hamcrest.core import assert_that as hamcrest_assert
        from hamcrest.library.collection import contains_inanyorder
        expected_list = list(expected)

        hamcrest_assert(actual, contains_inanyorder(*expected_list))
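A closure like _matches is normally produced by a small factory that captures `expected` and returns the callable that is then handed to assert_that or to a custom matcher. A sketch of that pattern; the factory name is illustrative:

def matches_in_any_order(expected):
  def _matches(actual):
    from hamcrest.core import assert_that as hamcrest_assert
    from hamcrest.library.collection import contains_inanyorder
    hamcrest_assert(actual, contains_inanyorder(*list(expected)))

  return _matches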
Example #16
    def test_multiple_partition_files(self):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = mock.Mock()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job

        bq_client.jobs.Insert.return_value = result_job
        bq_client.tables.Delete.return_value = None

        with TestPipeline('DirectRunner') as p:
            outputs = (p
                       | beam.Create(_ELEMENTS)
                       | bqfl.BigQueryBatchFileLoads(
                           destination,
                           custom_gcs_temp_location=self._new_tempdir(),
                           test_client=bq_client,
                           validate=False,
                           coder=CustomRowCoder(),
                           max_file_size=45,
                           max_partition_size=80,
                           max_files_per_partition=2))

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_load_jobs = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
            dest_copy_jobs = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_COPY_JOBID_PAIRS]

            load_jobs = dest_load_jobs | "GetLoadJobs" >> beam.Map(
                lambda x: x[1])
            copy_jobs = dest_copy_jobs | "GetCopyJobs" >> beam.Map(
                lambda x: x[1])

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())

            # All files exist
            _ = (files | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # max_file_size forces the rows to roll over into multiple files
            # (six in total) for the single destination.
            assert_that(files | "CountFiles" >> combiners.Count.Globally(),
                        equal_to([6]),
                        label='CheckFileCount')

            assert_that(destinations,
                        equal_to([destination]),
                        label='CheckDestinations')

            assert_that(load_jobs
                        | "CountLoadJobs" >> combiners.Count.Globally(),
                        equal_to([6]),
                        label='CheckLoadJobCount')
            assert_that(copy_jobs
                        | "CountCopyJobs" >> combiners.Count.Globally(),
                        equal_to([6]),
                        label='CheckCopyJobCount')
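Rough arithmetic behind the counts asserted above (a sketch; treating every file as close to max_file_size bytes is an assumption, not something stated in the test):

max_file_size = 45
max_partition_size = 80
max_files_per_partition = 2
files = 6  # asserted by CheckFileCount

# Two ~45-byte files would exceed max_partition_size, so each partition can
# only hold one file even though max_files_per_partition allows two.
files_per_partition = min(max_files_per_partition,
                          max_partition_size // max_file_size)
partitions = files // files_per_partition
assert partitions == 6  # matches the six load jobs and six copy jobs above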
Example #17
    def _matches(actual):
        expected_list = list(expected)

        hamcrest_assert(actual, contains_inanyorder(*expected_list))
    def test_triggering_frequency(self, is_streaming, with_auto_sharding):
        destination = 'project1:dataset1.table1'

        job_reference = bigquery_api.JobReference()
        job_reference.projectId = 'project1'
        job_reference.jobId = 'job_name1'
        result_job = bigquery_api.Job()
        result_job.jobReference = job_reference

        mock_job = mock.Mock()
        mock_job.status.state = 'DONE'
        mock_job.status.errorResult = None
        mock_job.jobReference = job_reference

        bq_client = mock.Mock()
        bq_client.jobs.Get.return_value = mock_job
        bq_client.jobs.Insert.return_value = result_job

        # Insert a fake clock to work with auto-sharding, which needs a
        # processing-time timer.
        class _FakeClock(object):
            def __init__(self, now=time.time()):
                self._now = now

            def __call__(self):
                return self._now

        start_time = timestamp.Timestamp(0)
        bq_client.test_clock = _FakeClock(now=start_time)

        triggering_frequency = 20 if is_streaming else None
        transform = bqfl.BigQueryBatchFileLoads(
            destination,
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            temp_file_format=bigquery_tools.FileFormat.JSON,
            is_streaming_pipeline=is_streaming,
            triggering_frequency=triggering_frequency,
            with_auto_sharding=with_auto_sharding)

        # Need to test this with the DirectRunner to avoid serializing mocks
        with TestPipeline(
                runner='BundleBasedDirectRunner',
                options=StandardOptions(streaming=is_streaming)) as p:
            if is_streaming:
                _SIZE = len(_ELEMENTS)
                first_batch = [
                    TimestampedValue(value, start_time + i + 1)
                    for i, value in enumerate(_ELEMENTS[:_SIZE // 2])
                ]
                second_batch = [
                    TimestampedValue(value, start_time + _SIZE // 2 + i + 1)
                    for i, value in enumerate(_ELEMENTS[_SIZE // 2:])
                ]
                # Advance processing time between batches of input elements to fire the
                # user triggers. Intentionally advance the processing time twice for the
                # auto-sharding case since we need to first fire the timer and then
                # fire the trigger.
                test_stream = (
                    TestStream()
                    .advance_watermark_to(start_time)
                    .add_elements(first_batch)
                    .advance_processing_time(30)
                    .advance_processing_time(30)
                    .add_elements(second_batch)
                    .advance_processing_time(30)
                    .advance_processing_time(30)
                    .advance_watermark_to_infinity())
                input = p | test_stream
            else:
                input = p | beam.Create(_ELEMENTS)
            outputs = input | transform

            dest_files = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
            dest_job = outputs[
                bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

            files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
            destinations = (
                dest_files
                | "GetDests" >>
                beam.Map(lambda x:
                         (bigquery_tools.get_hashable_destination(x[0]), x[1]))
                | "GetUniques" >> combiners.Count.PerKey()
                | "GetFinalDests" >> beam.Keys())
            jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

            # Check that all files exist.
            _ = (files
                 | beam.Map(
                     lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

            # Two load jobs are expected in the streaming case due to the
            # triggering frequency. Grouping happens per trigger, so we expect
            # two entries in the output as opposed to one.
            file_count = files | combiners.Count.Globally().without_defaults()
            expected_file_count = [1, 1] if is_streaming else [1]
            expected_destinations = (
                [destination, destination] if is_streaming else [destination])
            expected_jobs = (
                [job_reference, job_reference]
                if is_streaming else [job_reference])
            assert_that(file_count,
                        equal_to(expected_file_count),
                        label='CountFiles')
            assert_that(destinations,
                        equal_to(expected_destinations),
                        label='CheckDestinations')
            assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')
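The (is_streaming, with_auto_sharding) parameters suggest this test is expanded by a parameterizing decorator. A sketch of the usual pattern with the parameterized library; the specific combinations listed here are an assumption, not taken from the source:

import unittest
from parameterized import parameterized

class TriggeringFrequencySketch(unittest.TestCase):
  @parameterized.expand([
      (False, False),  # batch
      (True, False),   # streaming without auto-sharding
      (True, True),    # streaming with auto-sharding
  ])
  def test_triggering_frequency(self, is_streaming, with_auto_sharding):
    pass  # body would be the test above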