Example #1
0
 def test_read_all_from_parquet_file_pattern(self):
   """ReadAllFromParquet expands a glob pattern covering five files."""
   pattern = self._write_pattern(5)
   with TestPipeline() as pipeline:
     records = (
         pipeline
         | Create([pattern])
         | ReadAllFromParquet())
     assert_that(records, equal_to(self.RECORDS * 5))
Example #2
0
 def test_read_all_from_parquet_single_file(self):
   """ReadAllFromParquet reads every record from one concrete file path."""
   file_path = self._write_data()
   with TestPipeline() as pipeline:
     records = (
         pipeline
         | Create([file_path])
         | ReadAllFromParquet())
     assert_that(records, equal_to(self.RECORDS))
Example #3
0
 def test_read_all_from_parquet_with_filename(self):
   """with_filename=True yields a (file path, record) pair per record."""
   file_pattern, file_paths = self._write_pattern(3, with_filename=True)
   expected = [(f, r) for f in file_paths for r in self.RECORDS]
   with TestPipeline() as pipeline:
     output = (
         pipeline
         | Create([file_pattern])
         | ReadAllFromParquet(with_filename=True))
     assert_that(output, equal_to(expected))
Example #4
0
 def test_read_all_from_parquet_many_file_patterns(self):
   """Multiple glob patterns in one PCollection are all expanded and read."""
   patterns = [
       self._write_pattern(5),
       self._write_pattern(2),
       self._write_pattern(3),
   ]
   with TestPipeline() as pipeline:
     records = (
         pipeline
         | Create(patterns)
         | ReadAllFromParquet())
     # 5 + 2 + 3 files, each holding one copy of RECORDS.
     assert_that(records, equal_to(self.RECORDS * 10))
Example #5
0
 def test_read_all_from_parquet_many_single_files(self):
   """Several concrete file paths in one PCollection are all read."""
   paths = [self._write_data() for _ in range(3)]
   with TestPipeline() as pipeline:
     records = (
         pipeline
         | Create(paths)
         | ReadAllFromParquet())
     assert_that(records, equal_to(self.RECORDS * 3))
Example #6
0
 def _verify_data(self, pcol, init_size, data_size):
   """Read the parquet files named in ``pcol``, validate them, then delete them.

   Two validation branches share one read: the first sums every record's
   'number' field globally and runs it through ``_sum_verifier``; the second
   counts records per 'name' key and runs each pair through
   ``_count_verifier``. Both branch outputs plus the original file names are
   flattened and reshuffled before each file is deleted.

   Args:
     pcol: PCollection of parquet file names to read and, finally, delete.
     init_size: forwarded to the static verifier helpers — TODO confirm
       semantics against their definitions.
     data_size: forwarded to the static verifier helpers.
   """
   records = pcol | 'read' >> ReadAllFromParquet()
   sum_check = (
       records
       | 'get_number' >> Map(lambda row: row['number'])
       | 'sum_globally' >> CombineGlobally(sum)
       | 'validate_number' >> FlatMap(
           lambda total: TestParquetIT._sum_verifier(
               init_size, data_size, total)))
   count_check = (
       records
       | 'make_pair' >> Map(lambda row: (row['name'], row['number']))
       | 'count_per_key' >> Count.PerKey()
       | 'validate_name' >> FlatMap(
           lambda pair: TestParquetIT._count_verifier(
               init_size, data_size, pair)))
   _ = (
       (sum_check, count_check, pcol)
       | 'flatten' >> Flatten()
       | 'reshuffle' >> Reshuffle()
       | 'cleanup' >> Map(lambda name: FileSystems.delete([name])))