Example #1
    def test_files_created(self):
        """Test that the files are created and written."""

        fn = bqfl.WriteRecordsToFile(coder=CustomRowCoder())
        self.tmpdir = self._new_tempdir()

        def check_files_created(output_pcs):
            dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

            files = dest_file_pc | "GetFiles" >> beam.Map(lambda x: x[1])
            file_count = files | "CountFiles" >> beam.combiners.Count.Globally()

            _ = files | "FilesExist" >> beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
            assert_that(file_count, equal_to([3]), label='check file count')

            destinations = (
                dest_file_pc
                | "GetDests" >> beam.Map(
                    lambda x: bigquery_tools.get_hashable_destination(x[0])))
            assert_that(destinations,
                        equal_to(list(_DISTINCT_DESTINATIONS)),
                        label='check destinations')

        self._consume_input(fn, check_files_created)
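
These test methods are excerpts from a larger test class, so the imports and the _consume_input helper they call are not shown. Below is a minimal sketch of scaffolding that would satisfy the names used above; the helper body, its elements argument, and the assumed (destination, row) input shape are illustrative assumptions, not the actual Beam test harness.

# Sketch only: the imports and helper below mirror the names used in these
# excerpts. _consume_input here is a hypothetical stand-in for the real test
# helper, and elements is assumed to be an iterable of (destination, row)
# pairs built elsewhere in the test class.
import os

import apache_beam as beam
from apache_beam.io.gcp import bigquery_file_loads as bqfl
from apache_beam.io.gcp import bigquery_tools
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to
from hamcrest import assert_that as hamcrest_assert
from hamcrest import is_


def _consume_input(write_fn, check_fn, elements):
    """Applies the DoFn to the test elements and hands its tagged outputs to
    check_fn. The real helper may also pass side inputs the DoFn needs (for
    example a temp-file prefix); those are elided from this sketch."""
    with TestPipeline() as p:
        output_pcs = (
            p
            | beam.Create(list(elements))
            | beam.ParDo(write_fn).with_outputs())
        check_fn(output_pcs)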
Example #2
    def test_many_files(self):
        """Forces records to be written to many files.

    For each destination multiple files are necessary. This is because the max
    file length is very small, so only a couple records fit in each file.
    """

        fn = bqfl.WriteRecordsToFile(max_file_size=50, coder=CustomRowCoder())
        self.tmpdir = self._new_tempdir()

        def check_many_files(output_pcs):
            dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

            files_per_dest = (dest_file_pc
                              | beam.Map(lambda x: x).with_output_types(
                                  beam.typehints.KV[str, str])
                              | beam.combiners.Count.PerKey())
            files_per_dest = (
                files_per_dest
                | "GetDests" >> beam.Map(lambda x: (
                    bigquery_tools.get_hashable_destination(x[0]), x[1])))
            assert_that(
                files_per_dest,
                equal_to([('project1:dataset1.table1', 4),
                          ('project1:dataset1.table2', 2),
                          ('project1:dataset1.table3', 1)]))

            # Check that the files exist
            _ = dest_file_pc | beam.Map(lambda x: x[1]) | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True)))

        self._consume_input(fn, check_many_files)
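
The asserted per-destination file counts follow from the tiny max_file_size=50 limit: whenever the next record would push the current file past the limit, a new file is started for that destination. The following self-contained sketch illustrates that rollover idea only; it is not Beam's implementation, and the byte limit and record sizes are made-up values.

# Illustrative rollover sketch (not Beam's implementation): records are
# grouped into "files", and a new file is started once the next record would
# exceed the size limit.
from typing import List


def split_into_files(records: List[bytes], max_file_size: int) -> List[List[bytes]]:
    files: List[List[bytes]] = []
    current: List[bytes] = []
    current_size = 0
    for record in records:
        if current and current_size + len(record) > max_file_size:
            files.append(current)
            current, current_size = [], 0
        current.append(record)
        current_size += len(record)
    if current:
        files.append(current)
    return files


# With a 50-byte limit, five 20-byte records land in three files (2 + 2 + 1),
# which is why destinations with more rows produce more files above.
assert len(split_into_files([b"x" * 20] * 5, max_file_size=50)) == 3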
Example #3
    def test_records_are_spilled(self, file_format):
        """Forces records to be written to many files.

    For each destination multiple files are necessary, and at most two files
    can be created. This forces records to be spilled to the next stage of
    processing.
    """

        fn = bqfl.WriteRecordsToFile(schema=_ELEMENTS_SCHEMA,
                                     max_files_per_bundle=2,
                                     file_format=file_format)
        self.tmpdir = self._new_tempdir()

        def check_many_files(output_pcs):
            dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]
            spilled_records_pc = output_pcs[
                bqfl.WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

            spilled_records_count = (spilled_records_pc
                                     | combiners.Count.Globally())
            assert_that(spilled_records_count,
                        equal_to([3]),
                        label='spilled count')

            files_per_dest = (dest_file_pc
                              | beam.Map(lambda x: x).with_output_types(
                                  beam.typehints.KV[str, Tuple[str, int]])
                              | combiners.Count.PerKey())
            files_per_dest = (
                files_per_dest
                | "GetDests" >> beam.Map(lambda x: (
                    bigquery_tools.get_hashable_destination(x[0]), x[1])))

            # Only table1 and table3 get files. table2 records get spilled.
            assert_that(files_per_dest,
                        equal_to([('project1:dataset1.table1', 1),
                                  ('project1:dataset1.table3', 1)]),
                        label='file count')

            # Check that the files exist
            _ = dest_file_pc | beam.Map(lambda x: x[1][0]) | beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True)))

        self._consume_input(fn, check_many_files)
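
Because max_files_per_bundle=2, the DoFn only keeps two files open per bundle; records for any further destination are emitted on UNWRITTEN_RECORD_TAG instead of being written. In a full pipeline those spilled records need a follow-up stage; the sketch below shows one plausible shape for it (group by destination, then write per group). The WriteGroupedRecords DoFn and handle_spilled_records function are hypothetical placeholders, not Beam APIs.

# Hypothetical sketch of a follow-up stage for spilled records: group the
# (destination, row) pairs per destination, then write each group. This is a
# placeholder illustration, not Beam's actual second-stage transform.
import apache_beam as beam


class WriteGroupedRecords(beam.DoFn):
    def process(self, element):
        destination, rows = element
        # A real implementation would write every row for this destination to
        # one or more files; here we only emit a (destination, row count) pair.
        yield destination, sum(1 for _ in rows)


def handle_spilled_records(spilled_records_pc):
    """spilled_records_pc is assumed to contain (destination, row) pairs."""
    return (
        spilled_records_pc
        | "GroupSpilled" >> beam.GroupByKey()
        | "WriteSpilled" >> beam.ParDo(WriteGroupedRecords()))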