def test_raise_error_if_infer_bq_schema_from_no_columns_dataframe(self):
    """Schema inference on a dataframe without columns raises ValueError."""
    expected_message = ('A non empty bq_schema cannot be inferred '
                        'from a dataframe with no columns')
    empty_frame = pandas.DataFrame(data={})
    with self.assertRaises(ValueError) as ctx:
        LoadConfig.bq_schema_inferred_from_dataframe(dataframe=empty_frame)
    self.assertEqual(str(ctx.exception), expected_message)
Ejemplo n.º 2
0
    def test_xmload(self):
        """Check the report object returned by ``gpl3.xmload``.

        Three configs are submitted in a single call; the returned object is
        then inspected for its attribute names, data names, durations and
        query costs.
        """
        df0 = pandas.DataFrame(data={'x': [4]})
        config1 = LoadConfig(source='dataframe',
                             destination='query',
                             data_name='e100',
                             dataframe=df0)
        config2 = LoadConfig(source='query',
                             destination='bq',
                             data_name='e101',
                             query='select 3')
        config3 = LoadConfig(source='dataframe',
                             destination='query',
                             data_name='e102',
                             dataframe=df0)
        xmlr = gpl3.xmload(configs=[config1, config2, config3])
        self.assertEqual(
            set(vars(xmlr)), {
                'load_results', 'data_names', 'duration', 'durations',
                'query_cost', 'query_costs'
            })

        self.assertEqual(xmlr.data_names, ['e100', 'e101', 'e102'])

        # assertGreater / assertGreaterEqual print both operands on failure,
        # whereas assertTrue(a > b) only reports "False is not true".
        self.assertGreater(xmlr.duration, 0)

        durations = vars(xmlr.durations)
        self.assertEqual(set(durations), set(ATOMIC_FUNCTION_NAMES))

        for n in ATOMIC_FUNCTION_NAMES:
            duration = durations[n]
            # A duration of None is accepted and skipped.
            if duration is not None:
                self.assertGreaterEqual(duration, 0, msg=n)

        self.assertEqual(xmlr.query_cost, 0.0)

        # Only the config whose source is a query reports a cost.
        self.assertEqual(xmlr.query_costs, [None, 0.0, None])
Ejemplo n.º 3
0
    def test_heterogeneous_configs(self):
        """Run three configs with different source/destination pairs at once.

        The call to ``multi_load`` is expected to return one entry per config:
        ``None`` for the configs whose destination is 'dataset' or 'bucket',
        and a dataframe for the config whose destination is 'dataframe'.
        The data written to the dataset and to the bucket is read back and
        compared with the expected dataframes.
        """
        expected1 = pandas.DataFrame(data={'x': [3, 10]})
        expected2 = pandas.DataFrame(data={'y': [4]})
        expected3 = pandas.DataFrame(data={'x': ['b'], 'y': ['a']})
        populate()
        config1 = LoadConfig(source='dataframe',
                             destination='dataset',
                             dataframe=expected1,
                             data_name='a10')
        config2 = LoadConfig(source='query',
                             destination='dataframe',
                             query='select 4 as y')
        config3 = LoadConfig(source='query',
                             destination='bucket',
                             query="select 'b' as x, 'a' as y",
                             data_name='a11')
        gpl = create_loader(bucket_dir_path=constants.bucket_subdir_path)
        load_results = gpl.multi_load([config1, config2, config3])
        self.assertEqual(len(load_results), 3)
        # assertIsNone shows the unexpected value on failure, unlike
        # assertTrue(x is None).
        self.assertIsNone(load_results[0])
        self.assertIsNone(load_results[2])

        computed1 = load.dataset_to_dataframe('a10')
        self.assert_pandas_equal(expected1, computed1)

        computed2 = load_results[1]
        self.assert_pandas_equal(expected2, computed2)

        blob_name = ids.build_blob_name_2('a11-000000000000.csv.gz')
        computed3 = load.bucket_to_dataframe(blob_name, decompress=True)
        self.assert_pandas_equal(expected3, computed3)
    def test_raise_error_if_invalid_source_or_destination(self):
        """A misspelled source or destination is rejected with ValueError."""
        bad_source_message = ("source must be one of 'query' or 'bq' or 'gs' "
                              "or 'local' or 'dataframe'")
        bad_destination_message = ("destination must be one of 'query' or "
                                   "'bq' or 'gs' or 'local' or 'dataframe'")

        with self.assertRaises(ValueError) as ctx:
            LoadConfig(source='queryy', destination='dataframe')
        self.assertEqual(str(ctx.exception), bad_source_message)

        with self.assertRaises(ValueError) as ctx:
            LoadConfig(source='query', destination='dataframee')
        self.assertEqual(str(ctx.exception), bad_destination_message)
 def test_raise_error_if_prefix(self):
     """multi_load raises when one data_name is a prefix of another."""
     single_row_frame = pandas.DataFrame(data={'x': [3]})
     short_name_config = LoadConfig(source='dataframe',
                                    destination='dataset',
                                    dataframe=single_row_frame,
                                    data_name='a')
     long_name_config = LoadConfig(source='query',
                                   destination='dataframe',
                                   query='select 4 as y',
                                   data_name='aa')
     both_configs = [short_name_config, long_name_config]
     with self.assertRaises(ValueError) as ctx:
         create_loader().multi_load(configs=both_configs)
     self.assertEqual('a is a prefix of aa', str(ctx.exception))
    def test_raise_error_if_missing_required_values(self):
        """LoadConfig raises ValueError when a required argument is absent."""
        # Each case pairs the constructor arguments with the expected message;
        # cases run in order and the first failure stops the test.
        cases = [
            (dict(source='query', destination='bucket', query='select 3'),
             "data_name must be provided if source or destination is one of "
             "'dataset' or 'bucket' or 'local'"),
            (dict(source='query', destination='dataframe'),
             "query must be provided if source = 'query'"),
            (dict(source='dataframe', destination='local', data_name='a1'),
             "dataframe must be provided if source = 'dataframe'"),
        ]
        for kwargs, expected_message in cases:
            with self.assertRaises(ValueError) as ctx:
                LoadConfig(**kwargs)
            self.assertEqual(expected_message, str(ctx.exception))
 def test_raise_error_if_invalid_destination(self):
     """Using 'query' as a destination is rejected with ValueError."""
     expected_message = ("destination must be one of 'dataset' or 'bucket' "
                         "or 'local' or 'dataframe'")
     frame = pandas.DataFrame(data={'x': [1]})
     with self.assertRaises(ValueError) as ctx:
         LoadConfig(source='dataframe',
                    destination='query',
                    dataframe=frame)
     self.assertEqual(expected_message, str(ctx.exception))
 def test_raise_error_if_configs_is_not_a_list(self):
     """multi_load raises ValueError when configs is a set, not a list."""
     lone_config = LoadConfig(
         source='bucket', destination='local', data_name='a1')
     configs_as_set = {lone_config}
     with self.assertRaises(ValueError) as ctx:
         create_loader().multi_load(configs=configs_as_set)
     self.assertEqual('configs must be a list', str(ctx.exception))
 def test_raise_error_if_data_name_contains_slash(self):
     """A data_name containing '/' makes LoadConfig raise ValueError."""
     config_kwargs = {
         'source': 'query',
         'destination': 'dataset',
         'query': 'select 3',
         'data_name': 'a/b',
     }
     with self.assertRaises(ValueError) as ctx:
         LoadConfig(**config_kwargs)
     self.assertEqual('data_name=a/b must not contain a /',
                      str(ctx.exception))
 def test_raise_error_if_data_name_is_empty_string(self):
     """An empty data_name makes LoadConfig raise ValueError."""
     config_kwargs = {
         'source': 'query',
         'destination': 'dataset',
         'query': 'select 3',
         'data_name': '',
     }
     with self.assertRaises(ValueError) as ctx:
         LoadConfig(**config_kwargs)
     self.assertEqual('data_name must not be the empty string',
                      str(ctx.exception))
Ejemplo n.º 11
0
    def test_raise_error_if_missing_required_values(self):
        """LoadConfig raises ValueError when a required argument is absent."""
        # Each case pairs the constructor arguments with the expected message;
        # cases run in order and the first failure stops the test.
        cases = [
            (dict(source='query', destination='dataframe'),
             "query must be given if source = 'query'"),
            (dict(source='dataframe', destination='local'),
             "dataframe must be given if source = 'dataframe'"),
            (dict(source='query', destination='gs', query='select 3'),
             "data_name must be given if source or destination is one of "
             "'bq' or 'gs' or 'local'"),
            (dict(source='gs', destination='bq', data_name='e0'),
             'bq_schema is missing'),
        ]
        for kwargs, expected_message in cases:
            with self.assertRaises(ValueError) as ctx:
                LoadConfig(**kwargs)
            self.assertEqual(str(ctx.exception), expected_message)
 def test_raise_error_if_invalid_source(self):
     """A misspelled source makes LoadConfig raise ValueError."""
     # NOTE(review): the expected text has no closing quote after
     # 'dataframe; presumably it mirrors the library's message verbatim.
     # Confirm before "fixing" it.
     expected_message = ("source must be one of 'query' or 'dataset' or "
                         "'bucket' or 'local' or 'dataframe")
     config_kwargs = {
         'source': 'queryy',
         'destination': 'dataframe',
         'query': 'select 3',
     }
     with self.assertRaises(ValueError) as ctx:
         LoadConfig(**config_kwargs)
     self.assertEqual(expected_message, str(ctx.exception))
Ejemplo n.º 13
0
 def test_config_repeated(self):
     """Passing the same config three times yields three equal dataframes."""
     df0 = pandas.DataFrame(data={'x': [3]})
     populate()
     config = LoadConfig(source='query',
                         destination='dataframe',
                         query='select 3 as x')
     dfs = gpl5.mload(configs=[config] * 3)
     # Without this check the loop below would pass vacuously if mload
     # returned fewer results than configs.
     self.assertEqual(len(dfs), 3)
     for df in dfs:
         self.assertTrue(df0.equals(df))
Ejemplo n.º 14
0
 def test_config_repeated(self):
     """Passing the same config three times yields three equal dataframes."""
     expected = pandas.DataFrame(data={'x': [3]})
     populate()
     config = LoadConfig(source='query',
                         destination='dataframe',
                         query='select 3 as x')
     gpl = create_loader_quick_setup(
         local_dir_path=constants.local_subdir_path)
     computeds = gpl.multi_load(configs=[config] * 3)
     # Without this check the loop below would pass vacuously if multi_load
     # returned fewer results than configs.
     self.assertEqual(len(computeds), 3)
     for computed in computeds:
         self.assert_pandas_equal(expected, computed)
Ejemplo n.º 15
0
 def test_diamond(self):
     """The same query gives the same dataframe through xload and mload."""
     sql = 'select 3 as x'
     reference = pandas.DataFrame(data={'x': [3]})
     populate()
     xload_report = gpl5.xload(source='query',
                               destination='dataframe',
                               query=sql)
     from_xload = xload_report.load_result
     single_config = LoadConfig(source='query',
                                destination='dataframe',
                                query=sql)
     from_mload = gpl5.mload(configs=[single_config])[0]
     for loaded in (from_xload, from_mload):
         self.assertTrue(reference.equals(loaded))
Ejemplo n.º 16
0
 def test_mload(self):
     """Check the per-config results returned by ``gpl5.mload``.

     The expected results are, in config order: a query string selecting
     from the uploaded table, a dataframe, and ``None``.
     """
     populate()
     config1 = LoadConfig(source='dataframe',
                          destination='query',
                          data_name='a10',
                          dataframe=pandas.DataFrame(data={'x': [3]}))
     config2 = LoadConfig(source='query',
                          destination='dataframe',
                          query='select 4 as y')
     config3 = LoadConfig(source='query',
                          destination='gs',
                          data_name='e0',
                          query='select 4 as y')
     load_results = gpl5.mload(configs=[config1, config2, config3])
     self.assertEqual(len(load_results), 3)
     self.assertEqual(
         load_results[0],
         'select * from `{}.{}.a10`'.format(project_id, dataset_id))
     self.assertTrue(load_results[1].equals(
         pandas.DataFrame(data={'y': [4]})))
     # assertIsNone shows the unexpected value on failure, unlike
     # assertTrue(x is None).
     self.assertIsNone(load_results[2])
Ejemplo n.º 17
0
def upload_dfs(gpl, dfs):
    """Send one dataframe per kind to the 'bq' destination via ``gpl.mload``.

    For every entry of the module-level ``kinds``, the matching dataframe in
    ``dfs`` is restricted to the columns returned by
    ``build_cols_to_upload()`` and loaded under the name returned by
    ``build_data_names()``.

    Args:
        gpl: loader object exposing ``mload``.
        dfs: mapping from kind to dataframe.
    """
    names_by_kind = build_data_names()
    columns_by_kind = build_cols_to_upload()

    configs = []
    for kind in kinds:
        target_name = names_by_kind[kind]
        selected_columns = dfs[kind][columns_by_kind[kind]]
        configs.append(
            LoadConfig(source='dataframe',
                       destination='bq',
                       data_name=target_name,
                       dataframe=selected_columns,
                       timestamp_cols=['ts'],
                       date_cols=['d']))

    gpl.mload(configs)
Ejemplo n.º 18
0
 def test_no_skip_blank_lines(self):
     """Rows made only of nulls must survive a query-to-dataframe load.

     Four queries place an all-null row first or last, with one or two
     columns; each loaded dataframe must equal the expected one, NaN rows
     included.
     """
     df0 = pandas.DataFrame(data={'x': [3, numpy.nan]})
     df1 = pandas.DataFrame(data={'x': [numpy.nan, 4]})
     df2 = pandas.DataFrame(data={'x': [numpy.nan, 5], 'y': [numpy.nan, 6]})
     df3 = pandas.DataFrame(data={'x': [7, numpy.nan], 'y': [8, numpy.nan]})
     expecteds = [df0, df1, df2, df3]
     populate()
     query0 = 'select 3 as x union all select null as x'
     query1 = 'select null as x union all select 4 as x'
     query2 = ('select null as x, null as y union all '
               'select 5 as x, 6 as y')
     query3 = ('select 7 as x, 8 as y union all '
               'select null as x, null as y')
     queries = [query0, query1, query2, query3]
     configs = [
         LoadConfig(source='query', destination='dataframe', query=query)
         for query in queries
     ]
     gpl = create_loader()
     computed = gpl.multi_load(configs)
     # zip() stops at the shorter input, so a short result list would
     # otherwise let the comparisons below pass without covering every query.
     self.assertEqual(len(computed), len(expecteds))
     for df, dg in zip(expecteds, computed):
         self.assert_pandas_equal(df, dg)
Ejemplo n.º 19
0
 def test_raise_error_if_source_is_equal_to_destination(self):
     """Identical source and destination make LoadConfig raise ValueError."""
     expected_message = 'source must be different from destination'
     with self.assertRaises(ValueError) as ctx:
         LoadConfig(source='query', destination='query')
     self.assertEqual(str(ctx.exception), expected_message)
Ejemplo n.º 20
0
 def test_raise_error_if_bq_schema_inferred_from_dataframe_is_given_a_dataframe_with_no_columns(self):
     """Schema inference on a dataframe without columns raises ValueError."""
     expected_message = ('A non empty bq_schema cannot be inferred '
                         'from a dataframe with no columns')
     empty_frame = pandas.DataFrame(data={})
     with self.assertRaises(ValueError) as ctx:
         LoadConfig.bq_schema_inferred_from_dataframe(dataframe=empty_frame)
     self.assertEqual(str(ctx.exception), expected_message)
 def test_raise_error_if_source_is_equal_to_destination(self):
     """Identical source and destination make LoadConfig raise ValueError."""
     same_endpoint_kwargs = {
         'source': 'local',
         'destination': 'local',
         'data_name': 'a',
     }
     with self.assertRaises(ValueError) as ctx:
         LoadConfig(**same_endpoint_kwargs)
     self.assertEqual('source must be different from destination',
                      str(ctx.exception))