コード例 #1
0
ファイル: test_shuffler.py プロジェクト: AdaZhou/rikai
 def test_randomness(self):
     """Shuffling 100 distinct ints must permute them without loss."""
     source = list(range(100))
     shuffled = self.shuffle_numbers(RandomShuffler(16), source)
     # Same size, different order, same multiset of elements.
     self.assertEqual(100, len(shuffled))
     self.assertNotEqual(source, shuffled)
     self.assertEqual(source, sorted(shuffled))
コード例 #2
0
def test_randomness():
    """A capacity-16 shuffler over 100 ints yields a full permutation."""
    numbers = list(range(100))
    result = shuffle_numbers(RandomShuffler(16), numbers)
    # Nothing lost, order changed, element multiset preserved.
    assert len(result) == 100
    assert result != numbers
    assert sorted(result) == numbers
コード例 #3
0
ファイル: test_shuffler.py プロジェクト: AdaZhou/rikai
 def test_randomness_with_large_capacity(self):
     """Capacity (128) exceeding the element count (100) still shuffles."""
     source = list(range(100))
     shuffled = self.shuffle_numbers(RandomShuffler(128), source)
     # Even when everything fits in the buffer at once, the output must
     # be a permutation rather than the identity order.
     self.assertEqual(100, len(shuffled))
     self.assertNotEqual(source, shuffled)
     self.assertEqual(source, sorted(shuffled))
コード例 #4
0
 def __iter__(self):
     """Stream rows from all parquet files in ``self.files``, shuffled on the fly.

     Row groups are partitioned across distributed workers via a running
     counter modulo ``self.world_size``; each worker keeps only the groups
     whose counter matches its ``self.rank``.  Rows are buffered in a
     ``RandomShuffler`` and yielded (converted) once the buffer is full,
     then the buffer is drained at the end.
     """
     # With shuffling disabled, capacity 1 degenerates the shuffler to a
     # pass-through FIFO, so rows stream out in their original order.
     shuffler = RandomShuffler(
         self.shuffler_capacity if self.shuffle else 1, self.seed)
     group_count = 0
     for filepath in self.files:
         fs, path = FileSystem.from_uri(filepath)
         with fs.open_input_file(path) as fobj:
             # NOTE(review): ``pg`` is presumably pyarrow.parquet — confirm
             # against the module's imports.
             parquet = pg.ParquetFile(fobj)
             for group_idx in range(parquet.num_row_groups):
                 # A simple form of row-group level bucketing without memory overhead.
                 # Pros:
                 #  - It requires zero communication to initialize the distributed policy
                 #  - It uses little memory and no startup overhead, i.e. collecting row groups.
                 # Cons:
                 #   The drawback would be if the world size is much larger than
                 #   the average number of row groups. As a result, many of the
                 #   file open operations would be wasted.
                 group_count += 1
                 if group_count % self.world_size != self.rank:
                     continue
                 row_group = parquet.read_row_group(group_idx,
                                                    columns=self.columns)
                 for batch in row_group.to_batches():  # type: RecordBatch
                     # TODO: read batches not using pandas
                     for _, row in batch.to_pandas().iterrows():
                         shuffler.append(row)
                         # Maintain the shuffler buffer around its capacity.
                         while shuffler.full():
                             yield self._convert(shuffler.pop().to_dict(),
                                                 self.spark_row_metadata)
     # Drain whatever is left in the buffer after all files are consumed.
     while shuffler:
         yield self._convert(shuffler.pop().to_dict(),
                             self.spark_row_metadata)
コード例 #5
0
def test_randomness_with_large_capacity():
    """Shuffling still works when the buffer (128) can hold every
    element (100) at once.
    """
    numbers = list(range(100))
    result = shuffle_numbers(RandomShuffler(128), numbers)
    # Output is a permutation: same length, different order, same elements.
    assert len(result) == 100
    assert result != numbers
    assert sorted(result) == numbers
コード例 #6
0
def test_fifo_with_single_item():
    """A capacity-1 shuffler holds exactly one element at a time."""
    fifo = RandomShuffler(capacity=1)
    fifo.append(1)
    # One appended item makes the buffer truthy, full, and of length 1.
    assert fifo
    assert fifo.full()
    assert len(fifo) == 1
    # Popping the only element returns it and empties the buffer.
    assert fifo.pop() == 1
    assert not fifo.full()
コード例 #7
0
ファイル: test_shuffler.py プロジェクト: AdaZhou/rikai
 def test_fifo(self):
     """With capacity 1 the shuffler must preserve insertion order."""
     fifo = RandomShuffler(capacity=1)
     # No room to reorder, so output order equals input order.
     self.assertEqual(list(range(100)), self.shuffle_numbers(fifo, range(100)))
コード例 #8
0
def test_fifo():
    """A capacity-1 shuffler must behave as a FIFO queue.

    The original assertion only checked the element count, which any
    shuffler satisfies regardless of ordering.  Asserting the exact
    output order is what actually proves FIFO behavior, and matches the
    unittest variant of this test.
    """
    shuffler = RandomShuffler(capacity=1)
    returned = shuffle_numbers(shuffler, range(100))
    assert len(returned) == 100
    # Capacity 1 leaves no room to reorder: output order == input order.
    assert returned == list(range(100))