def test_metadata_comes_from_memory_when_available(self):
    """Check that a freshly cached result answers metadata queries from memory.

    The Parquet file is deleted right after caching, so the assertions
    that follow cannot rely on reading it back from disk.
    """
    result = ProcessResult(
        pandas.DataFrame({
            'A': [1],  # int64
            'B': [datetime.datetime(2018, 8, 20)],  # datetime64[ns]
            'C': ['foo'],  # str
        })
    )
    # The category column is added after construction, via the result's
    # own dataframe.
    result.dataframe['D'] = pandas.Series(['cat'], dtype='category')

    cached_result = self.wf_module.cache_render_result(2, result)

    # cache_render_result() keeps its `result` parameter in memory, so we
    # can avoid disk entirely.
    #
    # This is great for the render pipeline: it never reads from the file
    # it writes, as it renders all modules sequentially.
    os.unlink(cached_result.parquet_path)

    # assertIsNotNone (rather than assertFalse(x is None)) reports the
    # offending value when the check fails.
    self.assertIsNotNone(cached_result._result)
    self.assertEqual(cached_result.result, result)
    self.assertEqual(len(cached_result), 1)

    self.assertEqual(cached_result.column_names, ['A', 'B', 'C', 'D'])
    self.assertEqual(
        cached_result.column_types,
        ['number', 'datetime', 'text', 'text'],
    )
    self.assertEqual(cached_result.columns, [
        Column('A', 'number'),
        Column('B', 'datetime'),
        Column('C', 'text'),
        Column('D', 'text'),
    ])
def test_column_names_and_types_do_not_read_file(self):
    """Check that column metadata stays available once the file is deleted.

    The result is cached and saved, then fetched again through
    get_cached_render_result(). After `parquet_file` has been touched,
    the file is unlinked and the column names/types are queried.
    """
    frame = pandas.DataFrame({
        'A': [1],  # int64
        'B': [datetime.datetime(2018, 8, 20)],  # datetime64[ns]
        'C': ['foo'],  # str
    })
    rendered = ProcessResult(frame)
    rendered.dataframe['D'] = pandas.Series(['cat'], dtype='category')

    self.wf_module.cache_render_result(2, rendered)
    self.wf_module.save()

    cached = self.wf_module.get_cached_render_result()
    cached.parquet_file  # read header
    # From here on the file no longer exists on disk.
    os.unlink(cached.parquet_path)

    expected_names = ['A', 'B', 'C', 'D']
    expected_types = ['number', 'datetime', 'text', 'text']
    self.assertEqual(cached.column_names, expected_names)
    self.assertEqual(cached.column_types, expected_types)
    self.assertEqual(cached.columns, [
        Column('A', 'number'),
        Column('B', 'datetime'),
        Column('C', 'text'),
        Column('D', 'text'),
    ])

    # Prove that we didn't read from the file: accessing `.result` is
    # expected to raise, so the assertIsNone around it is never reached
    # when the test passes.
    with self.assertRaises(FileNotFoundError):
        self.assertIsNone(cached.result)
def test_columns(self):
    """Check the column names, types and Column objects of a ProcessResult.

    The input frame has an integer, a string, a datetime and a
    category column; the expected types are 'number', 'text',
    'datetime' and 'text' respectively.
    """
    table = DataFrame({
        'A': [1],  # number
        'B': ['foo'],  # str
        'C': datetime.datetime(2018, 8, 20),  # datetime64
    })
    table['D'] = Series(['cat'], dtype='category')

    process_result = ProcessResult(table)

    expected_names = ['A', 'B', 'C', 'D']
    expected_types = ['number', 'text', 'datetime', 'text']
    self.assertEqual(process_result.column_names, expected_names)
    self.assertEqual(process_result.column_types, expected_types)
    self.assertEqual(process_result.columns, [
        Column('A', 'number'),
        Column('B', 'text'),
        Column('C', 'datetime'),
        Column('D', 'text'),
    ])
def columns(self):
    """Return a list with one Column per column name/type pair.

    Names are taken from ``self.column_names`` and types from
    ``self.column_types``, matched by position. If the two differ in
    length, pairing stops at the shorter one.

    NOTE(review): the previous docstring said this scans the on-disk
    header; whether any file is read depends on how ``column_names``
    and ``column_types`` are implemented -- confirm there.
    """
    names = self.column_names
    types = self.column_types
    # map() over two iterables pairs items positionally, like zip().
    return list(map(Column, names, types))