def test_raise_error_if_configs_is_not_a_list(self):
    """multi_load must reject a non-list ``configs`` argument (here: a set)."""
    cfg = LoadConfig(
        source='bucket', destination='local', data_name='a1')
    with self.assertRaises(ValueError) as ctx:
        create_loader().multi_load(configs={cfg})
    self.assertEqual('configs must be a list', str(ctx.exception))
# Esempio n. 2 (scraped-snippet separator)
    def test_list_local_file_paths(self):
        """list_local_file_paths returns the files matching a data-name prefix."""
        populate_local()
        gpl20 = create_loader(bucket_dir_path=constants.bucket_subdir_path)
        gpl21 = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                              local_dir_path=constants.local_subdir_path)

        def normalize(paths):
            # Normalize after sorting so ordering matches the original check.
            return [os.path.normpath(p) for p in paths]

        expected = normalize(
            sorted(ids.build_local_file_path_0(f'a{i}')
                   for i in range(7, 12)))
        computed = normalize(gpl20.list_local_file_paths('a'))
        self.assertEqual(expected, computed)

        expected = normalize(
            sorted(ids.build_local_file_path_1(f'a{i}')
                   for i in range(10, 13)))
        computed = normalize(gpl21.list_local_file_paths('a1'))
        self.assertEqual(expected, computed)

        # A name that matches no local file yields an empty list.
        self.assertEqual([], gpl21.list_local_file_paths('sub'))
# Esempio n. 3 (scraped-snippet separator)
    def test_list_blob_uris(self):
        """list_blob_uris lists sorted bucket URIs for a data-name prefix."""
        populate_bucket()

        gpl00 = create_loader()
        gpl10 = create_loader(bucket_dir_path=constants.bucket_dir_path)
        gpl20 = create_loader_quick_setup(
            bucket_dir_path=constants.bucket_subdir_path, separator='#')

        def expected_uris(build_name, lo, hi):
            # Sorted bucket URIs for blob names a<lo>..a<hi-1>.
            return sorted(
                ids.build_bucket_uri(build_name(f'a{i}'))
                for i in range(lo, hi))

        self.assertEqual(expected_uris(ids.build_blob_name_0, 7, 12),
                         gpl00.list_blob_uris('a'))
        self.assertEqual(expected_uris(ids.build_blob_name_1, 10, 13),
                         gpl10.list_blob_uris('a1'))
        self.assertEqual(expected_uris(ids.build_blob_name_2, 9, 14),
                         gpl20.list_blob_uris('a'))

        # Unmatched prefixes yield empty lists.
        self.assertEqual([], gpl00.list_blob_uris('dir'))
        self.assertEqual([], gpl10.list_blob_uris('su'))
    def test_raise_error_if_dataset_id_not_contain_exactly_one_dot(self):
        """create_loader rejects dataset ids with zero or multiple dots."""
        msg = 'dataset_id must contain exactly one dot'
        for bad_id in ('ab', 'a.b.c'):
            with self.assertRaises(ValueError) as ctx:
                create_loader(dataset_id=bad_id)
            self.assertEqual(msg, str(ctx.exception))
 def test_raise_error_if_write_empty_and_already_exists(self):
     """WRITE_EMPTY must raise Conflict when the target table already exists."""
     populate_dataset()
     populate_local()
     with self.assertRaises(Conflict) as ctx:
         create_loader().load(source='local',
                              destination='dataset',
                              data_name='a10',
                              write_disposition='WRITE_EMPTY')
     expected_msg = '409 Already Exists: Table dmp-y-tests:test_gpl.a10'
     self.assertEqual(str(ctx.exception), expected_msg)
 def test_raise_error_if_prefix(self):
     """multi_load rejects configs whose data_name is a prefix of another's."""
     frame = pandas.DataFrame(data={'x': [3]})
     short_config = LoadConfig(source='dataframe',
                               destination='dataset',
                               dataframe=frame,
                               data_name='a')
     long_config = LoadConfig(source='query',
                              destination='dataframe',
                              query='select 4 as y',
                              data_name='aa')
     with self.assertRaises(ValueError) as ctx:
         create_loader().multi_load(configs=[short_config, long_config])
     self.assertEqual('a is a prefix of aa', str(ctx.exception))
 def test_exist_in_local(self):
     """exist_in_local flips from False to True once local files exist."""
     gpl00 = create_loader()
     # Loader configured with only a local directory (no BQ / GS clients).
     gpl01 = create_loader(bq_client=None,
                           dataset_id=None,
                           gs_client=None,
                           bucket_name=None,
                           bucket_dir_path='bucket_dir_path',
                           local_dir_path=constants.local_subdir_path)
     cases = ((gpl00, 'a'), (gpl01, 'a9'))
     for loader, name in cases:
         self.assertFalse(loader.exist_in_local(name))
     populate_local()
     for loader, name in cases:
         self.assertTrue(loader.exist_in_local(name))
 def test_exist_in_bucket(self):
     """exist_in_bucket flips from False to True once blobs are populated."""
     gpl01 = create_loader(local_dir_path=constants.local_subdir_path)
     gpl11 = create_loader(bucket_dir_path=constants.bucket_dir_path,
                           local_dir_path=constants.local_subdir_path)
     gpl21 = create_loader_quick_setup(
         dataset_name=None,
         bucket_dir_path=constants.bucket_subdir_path,
         local_dir_path=constants.local_subdir_path)
     cases = ((gpl01, 'a1'), (gpl11, 'a10'), (gpl21, 'a'))
     for loader, name in cases:
         self.assertFalse(loader.exist_in_bucket(name))
     populate_bucket()
     for loader, name in cases:
         self.assertTrue(loader.exist_in_bucket(name))
# Esempio n. 9 (scraped-snippet separator)
    def test_heterogeneous_configs(self):
        """multi_load handles configs with mixed sources and destinations.

        Only the query -> dataframe config produces a return value; the
        dataframe -> dataset and query -> bucket configs return None and
        are checked through their side effects instead.
        """
        expected1 = pandas.DataFrame(data={'x': [3, 10]})
        expected2 = pandas.DataFrame(data={'y': [4]})
        expected3 = pandas.DataFrame(data={'x': ['b'], 'y': ['a']})
        populate()
        config1 = LoadConfig(source='dataframe',
                             destination='dataset',
                             dataframe=expected1,
                             data_name='a10')
        config2 = LoadConfig(source='query',
                             destination='dataframe',
                             query='select 4 as y')
        config3 = LoadConfig(source='query',
                             destination='bucket',
                             query="select 'b' as x, 'a' as y",
                             data_name='a11')
        gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path)
        load_results = gpl.multi_load([config1, config2, config3])
        self.assertEqual(len(load_results), 3)
        # assertIsNone reports the offending value on failure, unlike
        # assertTrue(x is None) which only reports "False is not true".
        self.assertIsNone(load_results[0])
        self.assertIsNone(load_results[2])

        computed1 = load.dataset_to_dataframe('a10')
        self.assert_pandas_equal(expected1, computed1)

        computed2 = load_results[1]
        self.assert_pandas_equal(expected2, computed2)

        blob_name = ids.build_blob_name_2('a11-000000000000.csv.gz')
        computed3 = load.bucket_to_dataframe(blob_name, decompress=True)
        self.assert_pandas_equal(expected3, computed3)
# Esempio n. 10 (scraped-snippet separator)
 def test_dataset_to_local(self):
     """A dataset -> local load writes a gzipped CSV that round-trips."""
     expected = pandas.DataFrame(data={'x': [1, 2, 3, 4]})
     load.multi_dataframe_to_dataset([expected], ['b1'])
     loader = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                            local_dir_path=constants.local_subdir_path)
     loader.load(source='dataset', destination='local', data_name='b1')
     path = ids.build_local_file_path_1('b1-000000000000.csv.gz')
     self.assert_pandas_equal(expected, load.local_to_dataframe(path))
# Esempio n. 11 (scraped-snippet separator)
 def test_query_to_dataframe(self):
     """A query -> dataframe load returns the query result directly."""
     expected = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
     populate()
     query = ("select 3 as x, 'a' as y "
              "union all select 2 as x, 'b' as y")
     computed = create_loader(separator='#').load(
         source='query', destination='dataframe', query=query)
     self.assert_pandas_equal(expected, computed)
# Esempio n. 12 (scraped-snippet separator)
 def test_dataset_to_bucket(self):
     """A dataset -> bucket load writes a gzipped CSV blob that round-trips."""
     expected = pandas.DataFrame(data={'x': ['a8_dataset']})
     populate_dataset()
     loader = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                            local_dir_path=None)
     loader.load(source='dataset', destination='bucket', data_name='a8')
     blob = ids.build_blob_name_2('a8-000000000000.csv.gz')
     self.assert_pandas_equal(
         expected, load.bucket_to_dataframe(blob, decompress=True))
# Esempio n. 13 (scraped-snippet separator)
 def test_local_to_dataframe(self):
     """Local files matching the data name are loaded into one dataframe."""
     values = [f'a{i}_local' for i in range(10, 13)]
     expected = pandas.DataFrame(data={'x': values})
     populate_local()
     loader = create_loader(bucket_dir_path=constants.bucket_dir_path,
                            local_dir_path=constants.local_subdir_path)
     computed = loader.load(source='local',
                            destination='dataframe',
                            data_name='a1')
     self.assert_pandas_equal(expected, computed)
 def test_call_loader_getters(self):
     """Loader getters expose the clients and paths they were built with."""
     gpl00 = create_loader()
     gpl10 = create_loader(bucket_dir_path=bucket_dir_path)
     gpl20 = create_loader(bucket_dir_path=bucket_subdir_path)
     gpl01 = create_loader_quick_setup(project_id=None,
                                       dataset_name=None,
                                       bucket_name=None,
                                       local_dir_path=local_subdir_path)
     # Client/bucket handles of a fully configured loader must be set.
     for handle in (gpl00.bq_client, gpl00.gs_client, gpl00.bucket):
         self.assertIsNotNone(handle)
     self.assertEqual(dataset_id, gpl00.dataset_id)
     self.assertEqual(dataset_name, gpl00.dataset_name)
     self.assertEqual(bucket_name, gpl00.bucket_name)
     # bucket_dir_path defaults to None and echoes what was passed in.
     self.assertIsNone(gpl00.bucket_dir_path)
     self.assertEqual(bucket_dir_path, gpl10.bucket_dir_path)
     self.assertEqual(bucket_subdir_path, gpl20.bucket_dir_path)
     self.assertEqual(local_dir_path, gpl00.local_dir_path)
     self.assertEqual(local_subdir_path, gpl01.local_dir_path)
# Esempio n. 15 (scraped-snippet separator)
 def test_dataframe_to_bucket(self):
     """A dataframe -> bucket load lands as a gzipped CSV blob."""
     expected = pandas.DataFrame(data={'x': [1, 2, 3], 'y': [1, 2, 4]})
     populate()
     create_loader().load(source='dataframe',
                          destination='bucket',
                          dataframe=expected,
                          data_name='a1')
     blob_name = ids.build_blob_name_0('a1.csv.gz')
     round_tripped = load.bucket_to_dataframe(blob_name, decompress=True)
     self.assert_pandas_equal(expected, round_tripped)
# Esempio n. 16 (scraped-snippet separator)
 def test_write_empty_local_to_dataset(self):
     """WRITE_EMPTY succeeds when the destination table does not exist yet."""
     expected = pandas.DataFrame(data={'x': [1]})
     load.dataframe_to_local(expected, ids.build_local_file_path_1('s12'))
     loader = create_loader(local_dir_path=constants.local_subdir_path)
     loader.load(source='local',
                 destination='dataset',
                 data_name='s12',
                 write_disposition='WRITE_EMPTY')
     self.assert_pandas_equal(expected, load.dataset_to_dataframe('s12'))
# Esempio n. 17 (scraped-snippet separator)
 def test_query_to_dataset(self):
     """A query -> dataset load works without any GS client configured."""
     expected = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
     populate_dataset()
     query = ("select 3 as x, 'a' as y "
              "union all select 2 as x, 'b' as y")
     create_loader(gs_client=None, bucket_name=None).load(
         source='query',
         destination='dataset',
         query=query,
         data_name='a0')
     self.assert_pandas_equal(expected, load.dataset_to_dataframe('a0'))
 def test_compress_query_to_bucket(self):
     """Query results exported to the bucket are gzip-compressed."""
     loader = create_loader(bucket_dir_path=constants.bucket_subdir_path)
     loader.load(source='query',
                 destination='bucket',
                 query='select 5',
                 data_name='b100')
     file_name = 'b100-000000000000.csv.gz'
     blob_name = ids.build_blob_name_2(file_name)
     local_file_path = ids.build_local_file_path_1(file_name)
     load.bucket_to_local(blob_name, local_file_path)
     # The downloaded file must carry a real gzip header.
     self.assertTrue(is_gz_file(local_file_path))
# Esempio n. 19 (scraped-snippet separator)
    def test_list_blobs(self):
        """list_blobs returns blobs whose names match the data-name prefix."""
        populate_bucket()

        gpl00 = create_loader_quick_setup()
        gpl10 = create_loader(bucket_dir_path=constants.bucket_dir_path)
        gpl20 = create_loader(bucket_dir_path=constants.bucket_subdir_path)

        cases = [
            (gpl00, 'a', ids.build_blob_name_0, range(7, 12)),
            (gpl10, 'a1', ids.build_blob_name_1, range(10, 13)),
            (gpl20, '', ids.build_blob_name_2, range(9, 14)),
        ]
        for loader, prefix, build_name, indices in cases:
            expected = sorted(build_name(f'a{i}') for i in indices)
            computed = [b.name for b in loader.list_blobs(prefix)]
            self.assertEqual(expected, computed)

        # Unmatched prefixes yield empty lists.
        self.assertEqual([], gpl00.list_blobs('dir'))
        self.assertEqual([], gpl10.list_blobs('su'))
    def test_raise_error_if_missing_required_resources(self):
        """load fails fast when the client needed by a location is absent."""
        # Dataset destination without a BigQuery client.
        with self.assertRaises(ValueError) as ctx:
            create_loader(bq_client=None, dataset_id=None).load(
                source='query',
                destination='dataset',
                data_name='e0',
                query='select 3')
        self.assertEqual('bq_client must be provided if dataset is used',
                         str(ctx.exception))

        # Bucket source without a Storage client.
        with self.assertRaises(ValueError) as ctx:
            create_loader(gs_client=None, bucket_name=None).load(
                source='bucket', destination='local', data_name='a')
        self.assertEqual('gs_client must be provided if bucket is used',
                         str(ctx.exception))

        # Local destination without a local directory.
        with self.assertRaises(ValueError) as ctx:
            create_loader(local_dir_path=None).load(
                source='dataframe',
                destination='local',
                dataframe=pandas.DataFrame(data={'x': [1]}),
                data_name='a')
        self.assertEqual('local_dir_path must be provided if local is used',
                         str(ctx.exception))
# Esempio n. 21 (scraped-snippet separator)
 def test_bucket_to_dataframe(self):
     """A blob uploaded to the bucket can be loaded back as a dataframe."""
     expected = pandas.DataFrame(data={'x': [3, 2], 'y': ['a', 'b']})
     populate()
     load.dataframe_to_bucket(expected, ids.build_blob_name_2('a10'))
     # No BQ access is required for a bucket -> dataframe load.
     loader = create_loader(bq_client=None,
                            dataset_id=None,
                            bucket_dir_path=constants.bucket_subdir_path,
                            local_dir_path=constants.local_subdir_path)
     computed = loader.load(source='bucket',
                            destination='dataframe',
                            data_name='a10')
     self.assert_pandas_equal(expected, computed)
# Esempio n. 22 (scraped-snippet separator)
 def test_download_upload(self):
     """A dataframe downloaded from a query can be re-uploaded to a table."""
     expected = pandas.DataFrame(data={'x': [3, 2]})
     loader = create_loader(bucket_dir_path=constants.bucket_dir_path,
                            local_dir_path=constants.local_subdir_path)
     downloaded = loader.load(
         source='query',
         destination='dataframe',
         query='select 3 as x union all select 2 as x')
     loader.load(source='dataframe',
                 destination='dataset',
                 dataframe=downloaded,
                 data_name='b1')
     self.assert_pandas_equal(expected, load.dataset_to_dataframe('b1'))
# Esempio n. 23 (scraped-snippet separator)
 def test_query_to_bucket(self):
     """A query -> bucket load emits the expected DEBUG log records."""
     with self.assertLogs('google_pandas_load.loader', level='DEBUG') as ctx:
         loader = create_loader(bucket_dir_path=constants.bucket_dir_path,
                                local_dir_path=None)
         loader.load(source='query',
                     destination='bucket',
                     query='select 3',
                     data_name='a0')
         records = ctx.records
         self.assertEqual(4, len(records))
         # The second record reports the query -> dataset step, with
         # elapsed seconds and a dollar cost.
         regexp = (r'^google_pandas_load.loader # DEBUG # '
                   r'Ended query to dataset \[[0-9]+s, [0-9]+\.[0-9]+\$\]$')
         log = formatter.format(records[1])
         self.assertIsNotNone(re.compile(regexp).search(log))
# Esempio n. 24 (scraped-snippet separator)
 def test_post_clear_query_to_dataframe(self):
     """A query -> dataframe load clears same-named dataset/bucket/local data."""
     populate()
     blob_name = ids.build_blob_name_0('a10')
     local_file_path = ids.build_local_file_path_1('a10')
     # Precondition: 'a10' exists in all three locations.
     for present in (exist.table_exists('a10'),
                     exist.blob_exists(blob_name),
                     exist.local_file_exists(local_file_path)):
         self.assertTrue(present)
     loader = create_loader(local_dir_path=constants.local_subdir_path)
     loader.load(source='query',
                 destination='dataframe',
                 query='select 3',
                 data_name='a10')
     # Postcondition: all three copies were cleared by the load.
     self.assertFalse(exist.table_exists('a10'))
     self.assertFalse(exist.blob_exists(blob_name))
     self.assertFalse(exist.local_file_exists(local_file_path))
# Esempio n. 25 (scraped-snippet separator)
 def test_write_append_dataframe_to_dataset(self):
     """WRITE_APPEND adds rows to an existing table instead of replacing it."""
     expected = pandas.DataFrame(data={'x': [0, 1]})
     first = pandas.DataFrame(data={'x': [0]})
     second = pandas.DataFrame(data={'x': [1]})
     loader = create_loader(chunk_size=2**18, timeout=5)
     loader.load(source='dataframe',
                 destination='dataset',
                 dataframe=first,
                 data_name='s13')
     loader.load(source='dataframe',
                 destination='dataset',
                 dataframe=second,
                 data_name='s13',
                 write_disposition='WRITE_APPEND')
     self.assert_pandas_equal(expected, load.dataset_to_dataframe('s13'))
# Esempio n. 26 (scraped-snippet separator)
 def test_upload_download(self):
     """A dataframe uploaded to a table can be read back via a query."""
     expected = pandas.DataFrame(data={'x': [1], 'y': [3]})
     populate()
     loader = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                            separator='#',
                            chunk_size=2**18,
                            timeout=15)
     loader.load(source='dataframe',
                 destination='dataset',
                 dataframe=expected,
                 data_name='a9')
     round_trip_query = f'select * from {constants.dataset_id}.a9'
     computed = loader.load(source='query',
                            destination='dataframe',
                            query=round_trip_query)
     self.assert_pandas_equal(expected, computed)
    def test_raise_error_if_data_name_contains_slash(self):
        """Every listing method rejects data names containing a slash."""
        msg = 'data_name=a/b must not contain a /'
        for method_name in ('list_blobs', 'list_blob_uris',
                            'list_local_file_paths'):
            # Fresh loader per call, as in the original three checks.
            with self.assertRaises(ValueError) as ctx:
                getattr(create_loader(), method_name)(data_name='a/b')
            self.assertEqual(msg, str(ctx.exception))
# Esempio n. 28 (scraped-snippet separator)
 def test_no_skip_blank_lines(self):
     """Rows that are entirely null survive the round trip unskipped."""
     nan = numpy.nan
     expecteds = [
         pandas.DataFrame(data={'x': [3, nan]}),
         pandas.DataFrame(data={'x': [nan, 4]}),
         pandas.DataFrame(data={'x': [nan, 5], 'y': [nan, 6]}),
         pandas.DataFrame(data={'x': [7, nan], 'y': [8, nan]}),
     ]
     populate()
     queries = [
         'select 3 as x union all select null as x',
         'select null as x union all select 4 as x',
         'select null as x, null as y union all select 5 as x, 6 as y',
         'select 7 as x, 8 as y union all select null as x, null as y',
     ]
     configs = [
         LoadConfig(source='query', destination='dataframe', query=q)
         for q in queries
     ]
     computed = create_loader().multi_load(configs)
     for df_expected, df_computed in zip(expecteds, computed):
         self.assert_pandas_equal(df_expected, df_computed)
    def test_raise_error_if_no_data(self):
        """load raises when the named data is absent from the source location."""
        cases = (('dataset', 'local'),
                 ('bucket', 'dataset'),
                 ('local', 'dataframe'))
        for source, destination in cases:
            with self.assertRaises(ValueError) as ctx:
                create_loader().load(source=source,
                                     destination=destination,
                                     data_name='e0')
            self.assertEqual(f'There is no data named e0 in {source}',
                             str(ctx.exception))
# Esempio n. 30 (scraped-snippet separator)
 def test_keep_source_in_dataset(self):
     """A dataset -> local load leaves the source table in place."""
     populate_dataset()
     loader = create_loader(bucket_dir_path=constants.bucket_subdir_path,
                            local_dir_path=constants.local_subdir_path)
     loader.load(source='dataset', destination='local', data_name='a7')
     self.assertTrue(exist.table_exists('a7'))