コード例 #1
0
 def __iter__(self):
     """Yield converted rows from this worker's share of the Parquet row groups.

     Every file in ``self.files`` is opened in turn and its row groups are
     counted with a single counter that keeps running across files. A row
     group is read only when ``counter % self.world_size == self.rank``;
     all other row groups are skipped without being read.

     Rows of the selected groups pass through a ``RandomShuffler`` (built
     with capacity ``self.shuffler_capacity`` when ``self.shuffle`` is
     truthy, otherwise 1, plus ``self.seed``) and are yielded as
     ``self._convert(row_dict, self.spark_row_metadata)``.
     """
     capacity = self.shuffler_capacity if self.shuffle else 1
     buffer = RandomShuffler(capacity, self.seed)

     # Running row-group counter; intentionally NOT reset per file.
     seen_groups = 0
     for uri in self.files:
         filesystem, local_path = FileSystem.from_uri(uri)
         with filesystem.open_input_file(local_path) as handle:
             reader = pg.ParquetFile(handle)
             for idx in range(reader.num_row_groups):
                 # Lightweight row-group bucketing across workers.
                 #  + Needs no communication between workers to set up.
                 #  + Needs no up-front pass that collects all row groups,
                 #    so memory and startup cost stay small.
                 #  - When the world size far exceeds the typical number of
                 #    row groups, many files are opened only for every one
                 #    of their groups to be skipped by this worker.
                 seen_groups += 1
                 if seen_groups % self.world_size == self.rank:
                     table = reader.read_row_group(idx, columns=self.columns)
                     for record_batch in table.to_batches():
                         # TODO: read batches not using pandas
                         frame = record_batch.to_pandas()
                         for _, record in frame.iterrows():
                             buffer.append(record)
                             # Drain while full so the buffer hovers
                             # around its capacity.
                             while buffer.full():
                                 yield self._convert(
                                     buffer.pop().to_dict(),
                                     self.spark_row_metadata)

     # Input exhausted: flush whatever is still buffered.
     while buffer:
         yield self._convert(buffer.pop().to_dict(),
                             self.spark_row_metadata)
コード例 #2
0
def test_fifo_with_single_item():
    """Check a capacity-1 ``RandomShuffler`` holding exactly one item.

    After one append the shuffler must be truthy, report full, and have
    length 1; popping must return the appended value and leave the
    shuffler no longer full.
    """
    buf = RandomShuffler(capacity=1)
    buf.append(1)

    # State with the single item buffered.
    assert buf
    assert buf.full()
    assert len(buf) == 1

    # The only buffered item is the one that comes back out.
    popped = buf.pop()
    assert popped == 1

    # Removing it frees the single slot again.
    assert not buf.full()