def test_heterogeneous_configs(self):
    """Run one multi_load over three configs with different
    source/destination pairs and check each result independently.

    Only a query -> dataframe config produces a dataframe in the
    results list; the other two slots must be None and their output
    is checked at the destination (dataset table / bucket blob).
    """
    expected1 = pandas.DataFrame(data={'x': [3, 10]})
    expected2 = pandas.DataFrame(data={'y': [4]})
    expected3 = pandas.DataFrame(data={'x': ['b'], 'y': ['a']})
    populate()
    config1 = LoadConfig(
        source='dataframe', destination='dataset',
        dataframe=expected1, data_name='a10')
    config2 = LoadConfig(
        source='query', destination='dataframe',
        query='select 4 as y')
    config3 = LoadConfig(
        source='query', destination='bucket',
        query="select 'b' as x, 'a' as y", data_name='a11')
    gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path)
    load_results = gpl.multi_load([config1, config2, config3])
    self.assertEqual(len(load_results), 3)
    # assertIsNone gives a clearer failure message than
    # assertTrue(x is None).
    self.assertIsNone(load_results[0])
    self.assertIsNone(load_results[2])
    computed1 = load.dataset_to_dataframe('a10')
    self.assert_pandas_equal(expected1, computed1)
    computed2 = load_results[1]
    self.assert_pandas_equal(expected2, computed2)
    # BigQuery extracts sharded files; the single shard is numbered 0.
    blob_name = ids.build_blob_name_2('a11-000000000000.csv.gz')
    computed3 = load.bucket_to_dataframe(blob_name, decompress=True)
    self.assert_pandas_equal(expected3, computed3)
def test_query_to_dataframe(self):
    """A union query is loaded straight into a dataframe."""
    populate()
    loader = create_loader(separator='#')
    query = ("select 3 as x, 'a' as y "
             "union all select 2 as x, 'b' as y")
    result = loader.load(
        source='query', destination='dataframe', query=query)
    expected = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
    self.assert_pandas_equal(expected, result)
def test_dataframe_to_dataset(self):
    """A dataframe uploaded to the dataset round-trips unchanged."""
    populate()
    loader = create_loader_quick_setup()
    expected = pandas.DataFrame(data={'x': [1, 2, 3], 'y': [1, 2, 4]})
    loader.load(
        source='dataframe', destination='dataset',
        dataframe=expected, data_name='a1')
    result = load.dataset_to_dataframe('a1')
    self.assert_pandas_equal(expected, result)
def test_config_repeated(self):
    """The same config instance may be passed several times to
    multi_load; each repetition yields the same dataframe."""
    populate()
    expected = pandas.DataFrame(data={'x': [3]})
    config = LoadConfig(
        source='query', destination='dataframe', query='select 3 as x')
    loader = create_loader_quick_setup(
        local_dir_path=constants.local_subdir_path)
    results = loader.multi_load(configs=[config, config, config])
    for result in results:
        self.assert_pandas_equal(expected, result)
def test_dataframe_to_bucket(self):
    """A dataframe sent to the bucket can be read back from its
    compressed blob and matches the original."""
    populate()
    loader = create_loader()
    expected = pandas.DataFrame(data={'x': [1, 2, 3], 'y': [1, 2, 4]})
    loader.load(
        source='dataframe', destination='bucket',
        dataframe=expected, data_name='a1')
    blob_name = ids.build_blob_name_0('a1.csv.gz')
    result = load.bucket_to_dataframe(blob_name, decompress=True)
    self.assert_pandas_equal(expected, result)
def test_bucket_to_dataframe(self):
    """A blob in a bucket subdirectory downloads into a dataframe even
    when the loader has no BigQuery client or dataset configured."""
    populate()
    expected = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
    blob_name = ids.build_blob_name_2('a10')
    load.dataframe_to_bucket(expected, blob_name)
    loader = create_loader(
        bq_client=None,
        dataset_id=None,
        bucket_dir_path=constants.bucket_subdir_path,
        local_dir_path=constants.local_subdir_path)
    result = loader.load(
        source='bucket', destination='dataframe', data_name='a10')
    self.assert_pandas_equal(expected, result)
def test_post_clear_dataframe_to_dataset(self):
    """Loading to a dataset clears the intermediate blob and local
    file that carried the same data_name."""
    populate()
    blob_name = ids.build_blob_name_2('a10')
    local_file_path = ids.build_local_file_path_0('a10')
    # Both intermediates exist before the load...
    self.assertTrue(exist.blob_exists(blob_name))
    self.assertTrue(exist.local_file_exists(local_file_path))
    loader = create_loader_quick_setup(
        bucket_dir_path=constants.bucket_subdir_path)
    loader.load(
        source='dataframe', destination='dataset',
        dataframe=pandas.DataFrame(data={'x': [1]}), data_name='a10')
    # ...and are removed afterwards.
    self.assertFalse(exist.blob_exists(blob_name))
    self.assertFalse(exist.local_file_exists(local_file_path))
def test_post_clear_query_to_dataframe(self):
    """Loading a query to a dataframe clears the intermediate table,
    blob and local file that carried the same data_name."""
    populate()
    blob_name = ids.build_blob_name_0('a10')
    local_file_path = ids.build_local_file_path_1('a10')
    # All three intermediates exist before the load...
    self.assertTrue(exist.table_exists('a10'))
    self.assertTrue(exist.blob_exists(blob_name))
    self.assertTrue(exist.local_file_exists(local_file_path))
    loader = create_loader(local_dir_path=constants.local_subdir_path)
    loader.load(
        source='query', destination='dataframe',
        query='select 3', data_name='a10')
    # ...and are removed afterwards.
    self.assertFalse(exist.table_exists('a10'))
    self.assertFalse(exist.blob_exists(blob_name))
    self.assertFalse(exist.local_file_exists(local_file_path))
def test_upload_download(self):
    """Upload a dataframe to a table, then query it back; the
    round-trip through a tuned loader preserves the data."""
    populate()
    loader = create_loader(
        bucket_dir_path=constants.bucket_subdir_path,
        separator='#',
        chunk_size=2**18,
        timeout=15)
    expected = pandas.DataFrame(data={'x': [1], 'y': [3]})
    loader.load(
        source='dataframe', destination='dataset',
        dataframe=expected, data_name='a9')
    query = f'select * from {constants.dataset_id}.a9'
    result = loader.load(
        source='query', destination='dataframe', query=query)
    self.assert_pandas_equal(expected, result)
def test_no_skip_blank_lines(self):
    """Rows that are entirely null must survive the load.

    All-null rows serialize to blank CSV lines; this checks the
    loader does not skip them when parsing back into dataframes.
    """
    expecteds = [
        pandas.DataFrame(data={'x': [3, numpy.nan]}),
        pandas.DataFrame(data={'x': [numpy.nan, 4]}),
        pandas.DataFrame(
            data={'x': [numpy.nan, 5], 'y': [numpy.nan, 6]}),
        pandas.DataFrame(
            data={'x': [7, numpy.nan], 'y': [8, numpy.nan]}),
    ]
    populate()
    queries = [
        'select 3 as x union all select null as x',
        'select null as x union all select 4 as x',
        'select null as x, null as y union all '
        'select 5 as x, 6 as y',
        'select 7 as x, 8 as y union all '
        'select null as x, null as y',
    ]
    # Comprehension instead of a for-and-append loop.
    configs = [
        LoadConfig(source='query', destination='dataframe', query=query)
        for query in queries]
    gpl = create_loader()
    computed = gpl.multi_load(configs)
    for df, dg in zip(expecteds, computed):
        self.assert_pandas_equal(df, dg)