Example #1
0
    def test_multiple_files(self):
        """Forces records to be written to many files.

        For each destination multiple files are necessary. This is because the
        max file length is very small, so only a couple records fit in each
        file.
        """
        fn = bqfl.WriteGroupedRecordsToFile(max_file_size=50,
                                            coder=CustomRowCoder())
        self.tmpdir = self._new_tempdir()

        def check_multiple_files(output_pc):
            # Elements are indexed as (destination, file path) below, so a
            # per-key count yields the number of files for each destination.
            files_per_dest = output_pc | beam.combiners.Count.PerKey()
            files_per_dest = (
                files_per_dest
                | "GetDests" >> beam.Map(lambda x: (
                    bigquery_tools.get_hashable_destination(x[0]), x[1])))
            assert_that(
                files_per_dest,
                equal_to([
                    ('project1:dataset1.table1', 4),
                    ('project1:dataset1.table2', 2),
                    ('project1:dataset1.table3', 1),
                ]))

            # Check that the files exist. The result of os.path.exists must be
            # asserted on; merely mapping it over the paths would discard the
            # booleans and never fail the test for a missing file.
            files = output_pc | "GetFiles" >> beam.Map(lambda x: x[1])
            _ = files | "FilesExist" >> beam.Map(
                lambda x: hamcrest_assert(os.path.exists(x), is_(True)))

        self._consume_input(fn, _DESTINATION_ELEMENT_PAIRS,
                            check_multiple_files)
Example #2
0
    def test_files_are_created(self):
        """Check that the writer emits files and that each of them exists."""

        def check_files_created(output_pc):
            # Small named helpers instead of inline lambdas; every transform
            # keeps its explicit label.
            def path_of(element):
                return element[1]

            def assert_path_exists(path):
                return hamcrest_assert(os.path.exists(path), is_(True))

            def destination_of(element):
                return bigquery_tools.get_hashable_destination(element[0])

            written = output_pc | "GetFiles" >> beam.Map(path_of)
            total = written | "CountFiles" >> beam.combiners.Count.Globally()

            # Every emitted path must point at a file on disk.
            _ = written | "FilesExist" >> beam.Map(assert_path_exists)
            assert_that(total, equal_to([3]), label='check file count')

            # The emitted keys must match the distinct destinations.
            dests = output_pc | "GetDests" >> beam.Map(destination_of)
            expected_dests = list(_DISTINCT_DESTINATIONS)
            assert_that(dests,
                        equal_to(expected_dests),
                        label='check destinations ')

        writer_fn = bqfl.WriteGroupedRecordsToFile(coder=CustomRowCoder())
        self.tmpdir = self._new_tempdir()

        self._consume_input(writer_fn, _DESTINATION_ELEMENT_PAIRS,
                            check_files_created)