Example #1
0
    def test_metadata_comes_from_memory_when_available(self):
        """Metadata is served from the in-memory result when one is held.

        The parquet file is deleted right after caching; every accessor
        checked below must still succeed without it.
        """
        result = ProcessResult(
            pandas.DataFrame({
                'A': [1],  # int64
                'B': [datetime.datetime(2018, 8, 20)],  # datetime64[ns]
                'C': ['foo'],  # str
            }))
        # Categorical column: asserted below to be reported as 'text'.
        result.dataframe['D'] = pandas.Series(['cat'], dtype='category')
        cached_result = self.wf_module.cache_render_result(2, result)
        # cache_render_result() keeps its `result` parameter in memory, so we
        # can avoid disk entirely.
        #
        # This is great for the render pipeline: it never reads from the file
        # it writes, as it renders all modules sequentially.
        os.unlink(cached_result.parquet_path)
        # Sanity check: the in-memory copy must still be attached, otherwise
        # the assertions below would need the file we just removed.
        self.assertIsNotNone(cached_result._result)

        self.assertEqual(cached_result.result, result)

        self.assertEqual(len(cached_result), 1)
        self.assertEqual(cached_result.column_names, ['A', 'B', 'C', 'D'])
        self.assertEqual(cached_result.column_types,
                         ['number', 'datetime', 'text', 'text'])
        self.assertEqual(cached_result.columns, [
            Column('A', 'number'),
            Column('B', 'datetime'),
            Column('C', 'text'),
            Column('D', 'text'),
        ])
    def test_column_names_and_types_do_not_read_file(self):
        """Column metadata stays available after the parquet file is gone.

        The cached result is re-fetched from the wf_module, its
        `parquet_file` is touched once, and then the file is deleted. The
        metadata accessors must still answer; only `.result` is expected to
        fail, with FileNotFoundError.
        """
        result = ProcessResult(
            pandas.DataFrame({
                'A': [1],  # int64
                'B': [datetime.datetime(2018, 8, 20)],  # datetime64[ns]
                'C': ['foo'],  # str
            }))
        # Categorical column: asserted below to be reported as 'text'.
        result.dataframe['D'] = pandas.Series(['cat'], dtype='category')
        self.wf_module.cache_render_result(2, result)
        self.wf_module.save()

        cached_result = self.wf_module.get_cached_render_result()
        cached_result.parquet_file  # read header
        os.unlink(cached_result.parquet_path)
        self.assertEqual(cached_result.column_names, ['A', 'B', 'C', 'D'])
        self.assertEqual(cached_result.column_types,
                         ['number', 'datetime', 'text', 'text'])
        self.assertEqual(cached_result.columns, [
            Column('A', 'number'),
            Column('B', 'datetime'),
            Column('C', 'text'),
            Column('D', 'text'),
        ])

        with self.assertRaises(FileNotFoundError):
            # Prove that we didn't read from the file: the file really is
            # gone, so loading the full result now must fail. Only the
            # attribute access matters here; its value is never inspected.
            cached_result.result
Example #3
0
 def test_columns(self):
     """ProcessResult reports names, types and Column objects per column."""
     frame = DataFrame({
         'A': [1],  # number
         'B': ['foo'],  # str
         'C': datetime.datetime(2018, 8, 20),  # datetime64
     })
     # Categorical column: asserted below to be reported as 'text'.
     frame['D'] = Series(['cat'], dtype='category')
     result = ProcessResult(frame)

     expected_names = ['A', 'B', 'C', 'D']
     expected_types = ['number', 'text', 'datetime', 'text']
     expected_columns = [
         Column('A', 'number'),
         Column('B', 'text'),
         Column('C', 'datetime'),
         Column('D', 'text'),
     ]

     self.assertEqual(result.column_names, expected_names)
     self.assertEqual(result.column_types, expected_types)
     self.assertEqual(result.columns, expected_columns)
Example #4
0
 def columns(self):
     """Build a list of ``Column`` objects, one per (name, type) pair.

     Names come from ``self.column_names`` and types from
     ``self.column_types``; they are paired positionally, and any surplus
     entries in the longer sequence are ignored (``zip`` semantics).

     NOTE(review): whether this touches an on-disk header depends on how
     those two attributes are implemented, which is not visible here --
     confirm.
     """
     names = self.column_names
     types = self.column_types
     return [Column(name, type_) for name, type_ in zip(names, types)]