def test_upload_data_with_valid_user_schema(self, project_id):
    # Issue #46; tests scenarios with user-provided schemas.
    df = make_mixed_dataframe_v1()
    test_id = "18"
    test_schema = [
        {"name": "A", "type": "FLOAT"},
        {"name": "B", "type": "FLOAT"},
        {"name": "C", "type": "STRING"},
        {"name": "D", "type": "TIMESTAMP"},
    ]
    destination_table = self.destination_table + test_id
    gbq.to_gbq(
        df,
        destination_table,
        project_id,
        credentials=self.credentials,
        table_schema=test_schema,
    )
    dataset, table = destination_table.split(".")
    assert verify_schema(
        self.gbq_connector, dataset, table, dict(fields=test_schema)
    )

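# The schema tests in this section rely on a make_mixed_dataframe_v1 helper
# that is not shown here. A minimal sketch, assuming a five-row frame whose
# columns match the FLOAT/FLOAT/STRING/TIMESTAMP schema above and the
# num_rows == 5 assertion in the replace test below; the exact values are
# illustrative assumptions, not the project's actual fixture.
def make_mixed_dataframe_v1():
    import numpy as np
    import pandas

    return pandas.DataFrame(
        {
            "A": np.random.rand(5),  # FLOAT
            "B": np.random.rand(5),  # FLOAT
            "C": list("abcde"),  # STRING
            "D": pandas.date_range("2019-01-01", periods=5, freq="D"),  # TIMESTAMP
        },
        columns=["A", "B", "C", "D"],
    )
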
def test_to_gbq_with_no_project_id_given_should_fail(monkeypatch):
    import pydata_google_auth

    monkeypatch.setattr(pydata_google_auth, "default", mock_get_credentials_no_project)

    with pytest.raises(ValueError, match="Could not determine project ID"):
        gbq.to_gbq(DataFrame([[1]]), "dataset.tablename")

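# mock_get_credentials_no_project is referenced above but not defined in this
# section. A plausible sketch, assuming it mimics pydata_google_auth.default
# by returning credentials with no project ID; the exact shape is an
# assumption.
def mock_get_credentials_no_project(*args, **kwargs):
    from unittest import mock

    import google.auth.credentials

    mock_credentials = mock.create_autospec(google.auth.credentials.Credentials)
    # Return a (credentials, project_id) pair with no project, which is what
    # should force to_gbq to raise "Could not determine project ID".
    return mock_credentials, None
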
def test_to_gbq_doesnt_run_query(mock_bigquery_client):
    try:
        gbq.to_gbq(DataFrame([[1]]), "dataset.tablename", project_id="my-project")
    except gbq.TableCreationError:
        pass

    mock_bigquery_client.query.assert_not_called()

def test_upload_data_with_invalid_user_schema_raises_error(self, project_id):
    df = make_mixed_dataframe_v1()
    test_id = "19"
    test_schema = [
        {"name": "A", "type": "FLOAT"},
        {"name": "B", "type": "FLOAT"},
        {"name": "C", "type": "FLOAT"},
        {"name": "D", "type": "FLOAT"},
    ]
    destination_table = self.destination_table + test_id
    with pytest.raises(gbq.GenericGBQException):
        gbq.to_gbq(
            df,
            destination_table,
            project_id,
            credentials=self.credentials,
            table_schema=test_schema,
        )

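# verify_schema, used by the valid-schema test above, is also not defined in
# this section. A sketch under the assumption that it fetches the live table
# schema through the connector's client and compares name/type pairs against
# the expected fields; the connector attributes used here (client,
# project_id) are assumptions about its interface.
def verify_schema(gbq_connector, dataset_id, table_id, schema):
    table_ref = "{}.{}.{}".format(gbq_connector.project_id, dataset_id, table_id)
    remote = gbq_connector.client.get_table(table_ref)
    remote_fields = [
        {"name": field.name, "type": field.field_type} for field in remote.schema
    ]
    expected_fields = [
        {"name": field["name"], "type": field["type"]} for field in schema["fields"]
    ]
    return remote_fields == expected_fields
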
def test_upload_data_tokyo_non_existing_dataset(
    self, project_id, random_dataset_id, bigquery_client
):
    from google.cloud import bigquery

    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    non_existing_tokyo_dataset = random_dataset_id
    non_existing_tokyo_destination = "{}.to_gbq_test".format(
        non_existing_tokyo_dataset
    )

    # Initialize table with sample data
    gbq.to_gbq(
        df,
        non_existing_tokyo_destination,
        project_id,
        credentials=self.credentials,
        location="asia-northeast1",
    )

    table = bigquery_client.get_table(
        bigquery.TableReference(
            bigquery.DatasetReference(project_id, non_existing_tokyo_dataset),
            "to_gbq_test",
        )
    )
    assert table.num_rows > 0

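# make_mixed_dataframe_v2 is likewise assumed rather than shown. A minimal
# sketch that produces test_size rows covering the common BigQuery scalar
# types; the column names and values are illustrative assumptions.
def make_mixed_dataframe_v2(test_size):
    import datetime

    import numpy as np
    import pandas

    return pandas.DataFrame(
        {
            "bools": np.random.randint(2, size=test_size).astype(bool),
            "flts": np.random.randn(test_size),
            "ints": np.random.randint(1, 10, size=test_size),
            "strs": np.random.randint(1, 10, size=test_size).astype(str),
            "times": [datetime.datetime.utcnow() for _ in range(test_size)],
        },
        index=range(test_size),
    )
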
def test_upload_subset_columns_if_table_exists_append(self, project_id):
    # Issue 24: Upload is successful if the dataframe has columns
    # which are a subset of the current schema.
    test_id = "16"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_subset_cols = df.iloc[:, :2]

    # Initialize table with sample data
    gbq.to_gbq(
        df,
        self.destination_table + test_id,
        project_id,
        chunksize=10000,
        credentials=self.credentials,
    )

    # Test the if_exists parameter with value 'append'
    gbq.to_gbq(
        df_subset_cols,
        self.destination_table + test_id,
        project_id,
        if_exists="append",
        credentials=self.credentials,
    )

    result = gbq.read_gbq(
        "SELECT COUNT(*) AS num_rows FROM {0}".format(
            self.destination_table + test_id
        ),
        project_id=project_id,
        credentials=self.credentials,
        dialect="legacy",
    )
    assert result["num_rows"][0] == test_size * 2

def test_upload_data_if_table_exists_replace(self, project_id):
    test_id = "4"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = make_mixed_dataframe_v1()

    # Initialize table with sample data
    gbq.to_gbq(
        df,
        self.destination_table + test_id,
        project_id,
        chunksize=10000,
        credentials=self.credentials,
    )

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(
        df_different_schema,
        self.destination_table + test_id,
        project_id,
        if_exists="replace",
        credentials=self.credentials,
    )

    result = gbq.read_gbq(
        "SELECT COUNT(*) AS num_rows FROM {0}".format(
            self.destination_table + test_id
        ),
        project_id=project_id,
        credentials=self.credentials,
        dialect="legacy",
    )
    assert result["num_rows"][0] == 5

def test_upload_data_with_newlines(self, project_id):
    test_id = "data_with_newlines"
    test_size = 2
    df = DataFrame({"s": ["abcd", "ef\ngh"]})

    gbq.to_gbq(
        df,
        self.destination_table + test_id,
        project_id=project_id,
        credentials=self.credentials,
    )

    result_df = gbq.read_gbq(
        "SELECT * FROM {0}".format(self.destination_table + test_id),
        project_id=project_id,
        credentials=self.credentials,
        dialect="legacy",
    )

    assert len(result_df) == test_size

    if sys.version_info.major < 3:
        pytest.skip(msg="Unicode comparison in Py2 not working")

    result = result_df["s"].sort_values()
    expected = df["s"].sort_values()
    tm.assert_series_equal(expected, result)

def test_upload_mixed_float_and_int(self, project_id):
    """Test that we can upload a dataframe containing an int64 and a float64
    column. See: https://github.com/pydata/pandas-gbq/issues/116
    """
    test_id = "mixed_float_and_int"
    test_size = 2
    df = DataFrame(
        [[1, 1.1], [2, 2.2]],
        index=["row 1", "row 2"],
        columns=["intColumn", "floatColumn"],
    )

    gbq.to_gbq(
        df,
        self.destination_table + test_id,
        project_id=project_id,
        credentials=self.credentials,
    )

    result_df = gbq.read_gbq(
        "SELECT * FROM {0}".format(self.destination_table + test_id),
        project_id=project_id,
        credentials=self.credentials,
        dialect="legacy",
    )

    assert len(result_df) == test_size

def test_load_does_not_modify_schema_arg():
    # Test for Issue #277: to_gbq should not mutate the table_schema argument.
    df = DataFrame(
        {
            "field1": ["a", "b"],
            "field2": [1, 2],
            "field3": [datetime.date(2019, 1, 1), datetime.date(2019, 5, 1)],
        }
    )
    original_schema = [
        {"name": "field1", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field2", "type": "INTEGER"},
        {"name": "field3", "type": "DATE"},
    ]
    original_schema_cp = copy.deepcopy(original_schema)
    gbq.to_gbq(
        df,
        "dataset.schematest",
        project_id="my-project",
        table_schema=original_schema,
        if_exists="fail",
    )
    assert original_schema == original_schema_cp

    # Test again now that the table exists; behavior differs internally at
    # the `if table.exists(table_id)` branch.
    original_schema = [
        {"name": "field1", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field2", "type": "INTEGER"},
        {"name": "field3", "type": "DATE"},
    ]
    original_schema_cp = copy.deepcopy(original_schema)
    gbq.to_gbq(
        df,
        "dataset.schematest",
        project_id="my-project",
        table_schema=original_schema,
        if_exists="append",
    )
    assert original_schema == original_schema_cp

def test_to_gbq_with_if_exists_unknown():
    with pytest.raises(ValueError):
        gbq.to_gbq(
            DataFrame([[1]]),
            "my_dataset.my_table",
            project_id="myproj",
            if_exists="unknown",
        )

def test_to_gbq_with_private_key_raises_notimplementederror():
    with pytest.raises(NotImplementedError, match="private_key"):
        gbq.to_gbq(
            DataFrame([[1]]),
            "dataset.tablename",
            project_id="my-project",
            private_key="path/to/key.json",
        )

def test_to_gbq_with_no_project_id_given_should_fail(monkeypatch):
    from pandas_gbq import auth

    monkeypatch.setattr(
        auth, "get_application_default_credentials", mock_none_credentials
    )

    with pytest.raises(ValueError) as exception:
        gbq.to_gbq(DataFrame([[1]]), "dataset.tablename")
    assert "Could not determine project ID" in str(exception.value)

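# mock_none_credentials is another helper assumed by the test above. A
# minimal sketch: it stands in for get_application_default_credentials and
# reports that no credentials (and therefore no project ID) were found.
def mock_none_credentials(*args, **kwargs):
    return None, None
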
def test_to_gbq_creates_dataset(mock_bigquery_client):
    import google.api_core.exceptions

    mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound(
        "my_table"
    )
    mock_bigquery_client.get_dataset.side_effect = google.api_core.exceptions.NotFound(
        "my_dataset"
    )
    gbq.to_gbq(DataFrame([[1]]), "my_dataset.my_table", project_id="1234")
    mock_bigquery_client.create_dataset.assert_called_with(mock.ANY)

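# Several unit tests in this section take a mock_bigquery_client fixture that
# is not shown. A minimal sketch, assuming the fixture autospecs
# google.cloud.bigquery.Client and patches GbqConnector.get_client so the
# tests never reach the network; the patch target is an assumption about the
# module layout.
@pytest.fixture
def mock_bigquery_client(monkeypatch):
    from unittest import mock

    import google.cloud.bigquery

    mock_client = mock.create_autospec(google.cloud.bigquery.Client)
    # Point the connector at the mock so no real API calls are made.
    monkeypatch.setattr(gbq.GbqConnector, "get_client", lambda self: mock_client)
    return mock_client
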
def test_to_gbq_w_empty_df(mock_bigquery_client):
    import google.api_core.exceptions

    mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound(
        "my_table"
    )
    gbq.to_gbq(DataFrame(), "my_dataset.my_table", project_id="1234")
    mock_bigquery_client.create_table.assert_called_with(mock.ANY)
    mock_bigquery_client.load_table_from_dataframe.assert_not_called()
    mock_bigquery_client.load_table_from_file.assert_not_called()

def test_to_gbq_wo_verbose_w_new_pandas_no_warnings(monkeypatch, recwarn):
    monkeypatch.setattr(
        type(FEATURES),
        "pandas_has_deprecated_verbose",
        mock.PropertyMock(return_value=True),
    )
    try:
        gbq.to_gbq(DataFrame([[1]]), "dataset.tablename", project_id="my-project")
    except gbq.TableCreationError:
        pass
    assert len(recwarn) == 0

def test_to_gbq_create_dataset_translates_exception(mock_bigquery_client):
    mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound(
        "my_table"
    )
    mock_bigquery_client.get_dataset.side_effect = google.api_core.exceptions.NotFound(
        "my_dataset"
    )
    mock_bigquery_client.create_dataset.side_effect = (
        google.api_core.exceptions.InternalServerError("something went wrong")
    )
    with pytest.raises(gbq.GenericGBQException):
        gbq.to_gbq(DataFrame([[1]]), "my_dataset.my_table", project_id="1234")

def test_to_gbq_wo_verbose_w_new_pandas_no_warnings(recwarn, min_bq_version):
    import pkg_resources

    pandas_version = pkg_resources.parse_version("0.23.0")
    with mock.patch(
        "pkg_resources.Distribution.parsed_version",
        new_callable=mock.PropertyMock,
    ) as mock_version:
        mock_version.side_effect = [min_bq_version, pandas_version]
        try:
            gbq.to_gbq(
                DataFrame([[1]]), "dataset.tablename", project_id="my-project"
            )
        except gbq.TableCreationError:
            pass
        assert len(recwarn) == 0

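# The min_bq_version fixture used by the pkg_resources-based tests is not
# defined here. A plausible sketch, assuming it parses the minimum supported
# google-cloud-bigquery version; the exact version string is an assumption.
@pytest.fixture
def min_bq_version():
    import pkg_resources

    return pkg_resources.parse_version("1.9.0")
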
def test_upload_data_if_table_exists_raises_value_error(self, project_id):
    test_id = "4"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)

    # An invalid value for the if_exists parameter raises a ValueError.
    with pytest.raises(ValueError):
        gbq.to_gbq(
            df,
            self.destination_table + test_id,
            project_id,
            if_exists="xxxxx",
            credentials=self.credentials,
        )

def test_to_gbq_create_dataset_with_location(mock_bigquery_client):
    mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound(
        "my_table"
    )
    mock_bigquery_client.get_dataset.side_effect = google.api_core.exceptions.NotFound(
        "my_dataset"
    )
    gbq.to_gbq(
        DataFrame([[1]]), "my_dataset.my_table", project_id="1234", location="us-west1"
    )
    assert mock_bigquery_client.create_dataset.called
    args, _ = mock_bigquery_client.create_dataset.call_args
    sent_dataset = args[0]
    assert sent_dataset.location == "us-west1"

def test_upload_empty_data(self, project_id):
    test_id = "data_with_0_rows"
    df = DataFrame()

    gbq.to_gbq(
        df,
        self.destination_table + test_id,
        project_id,
        credentials=self.credentials,
    )

    table = self.bqclient.get_table(self.destination_table + test_id)
    assert table.num_rows == 0
    assert len(table.schema) == 0

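# The chunksize test below expects parametrized arguments. A hedged sketch of
# a parametrize decorator that could drive it; the api_method values, warning
# messages, and warning classes paired here are assumptions, not the
# project's actual values.
@pytest.mark.parametrize(
    ["api_method", "warning_message", "warning_type"],
    [
        ("load_parquet", "chunksize is ignored", DeprecationWarning),
        ("load_csv", "chunksize will be ignored", PendingDeprecationWarning),
    ],
)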
def test_to_gbq_with_chunksize_warns_deprecation(
    api_method, warning_message, warning_type
):
    with pytest.warns(warning_type, match=warning_message):
        try:
            gbq.to_gbq(
                DataFrame([[1]]),
                "dataset.tablename",
                project_id="my-project",
                api_method=api_method,
                chunksize=100,
            )
        except gbq.TableCreationError:
            pass

def test_to_gbq_with_not_verbose_new_pandas_warns_deprecation(min_bq_version):
    import pkg_resources

    pandas_version = pkg_resources.parse_version("0.23.0")
    with pytest.warns(FutureWarning), mock.patch(
        "pkg_resources.Distribution.parsed_version",
        new_callable=mock.PropertyMock,
    ) as mock_version:
        mock_version.side_effect = [min_bq_version, pandas_version]
        try:
            gbq.to_gbq(
                DataFrame([[1]]),
                "dataset.tablename",
                project_id="my-project",
                verbose=False,
            )
        except gbq.TableCreationError:
            pass

def test_to_gbq_w_default_project(mock_bigquery_client):
    """If no project is specified, we should be able to use the project from
    the default credentials.
    """
    import google.api_core.exceptions
    from google.cloud.bigquery.table import TableReference

    mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound(
        "my_table"
    )
    gbq.to_gbq(DataFrame(), "my_dataset.my_table")

    mock_bigquery_client.get_table.assert_called_with(
        TableReference.from_string("default-project.my_dataset.my_table")
    )
    mock_bigquery_client.create_table.assert_called_with(mock.ANY)
    table = mock_bigquery_client.create_table.call_args[0][0]
    assert table.project == "default-project"

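# The verbose deprecation test below takes a verbose argument; a one-line
# parametrize sketch that would exercise both values (an assumption about how
# the suite drives it).
@pytest.mark.parametrize("verbose", [True, False])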
def test_to_gbq_with_verbose_new_pandas_warns_deprecation(monkeypatch, verbose):
    monkeypatch.setattr(
        type(FEATURES),
        "pandas_has_deprecated_verbose",
        mock.PropertyMock(return_value=True),
    )
    with pytest.warns(FutureWarning, match="verbose is deprecated"):
        try:
            gbq.to_gbq(
                DataFrame([[1]]),
                "dataset.tablename",
                project_id="my-project",
                verbose=verbose,
            )
        except gbq.TableCreationError:
            pass

def test_to_gbq_with_if_exists_append(mock_bigquery_client, expected_load_method):
    from google.cloud.bigquery import SchemaField

    mock_bigquery_client.get_table.return_value = google.cloud.bigquery.Table(
        "myproj.my_dataset.my_table",
        schema=(
            SchemaField("col_a", "FLOAT", mode="REQUIRED"),
            SchemaField("col_b", "STRING", mode="REQUIRED"),
        ),
    )
    gbq.to_gbq(
        DataFrame({"col_a": [0.25, 1.5, -1.0], "col_b": ["a", "b", "c"]}),
        "my_dataset.my_table",
        project_id="myproj",
        if_exists="append",
    )
    expected_load_method.assert_called_once()

def test_to_gbq_load_method_translates_exception(
    mock_bigquery_client, expected_load_method
):
    mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound(
        "my_table"
    )
    expected_load_method.side_effect = google.api_core.exceptions.InternalServerError(
        "error loading data"
    )

    with pytest.raises(gbq.GenericGBQException):
        gbq.to_gbq(
            DataFrame({"int_col": [1, 2, 3]}),
            "my_dataset.my_table",
            project_id="myproj",
        )
    expected_load_method.assert_called_once()

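# expected_load_method is a fixture shared by the two tests above. A minimal
# sketch, assuming it is parametrized over the two load paths and returns the
# corresponding mocked client method; the parametrization is an assumption.
@pytest.fixture(params=["load_parquet", "load_csv"])
def expected_load_method(mock_bigquery_client, request):
    if request.param == "load_parquet":
        return mock_bigquery_client.load_table_from_dataframe
    return mock_bigquery_client.load_table_from_file
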
def test_to_gbq_with_if_exists_replace(mock_bigquery_client):
    mock_bigquery_client.get_table.side_effect = (
        # Initial check
        google.cloud.bigquery.Table("myproj.my_dataset.my_table"),
        # Recreate check
        google.api_core.exceptions.NotFound("my_table"),
    )
    gbq.to_gbq(
        DataFrame([[1]]),
        "my_dataset.my_table",
        project_id="myproj",
        if_exists="replace",
    )
    # TODO: We can avoid these API calls by using a write disposition in the
    # load job. See:
    # https://github.com/googleapis/python-bigquery-pandas/issues/118
    assert mock_bigquery_client.delete_table.called
    assert mock_bigquery_client.create_table.called

def test_to_gbq_with_verbose_new_pandas_warns_deprecation(min_bq_version):
    import pkg_resources

    pandas_version = pkg_resources.parse_version("0.23.0")
    with pytest.warns(FutureWarning), mock.patch(
        "pkg_resources.Distribution.parsed_version",
        new_callable=mock.PropertyMock,
    ) as mock_version:
        mock_version.side_effect = [min_bq_version, pandas_version]
        try:
            gbq.to_gbq(
                DataFrame([[1]]),
                "dataset.tablename",
                project_id="my-project",
                verbose=True,
            )
        except gbq.TableCreationError:
            pass

def write_data_to_bq(data, settings=None, destination_table=None, schema=None):
    df = data.copy()

    # Test branches writing to integration/feature schemas get a prefixed
    # dataset name; everything else writes to the schema as given.
    branch = settings["branch"]
    is_test_branch = any(
        name in branch for name in ("apple", "banana", "cherry", "staging")
    )
    is_test_schema = "integration" in schema or "feature" in schema

    if is_test_branch and is_test_schema:
        destination_table = "{}.{}".format(
            "ccdata_" + branch + "_" + schema, destination_table
        )
    else:
        destination_table = "{}.{}".format(schema, destination_table)
    print(destination_table)

    data_schema = data_types_bq(df)
    to_gbq(
        df,
        destination_table,
        project_id=settings["project_id"],
        chunksize=10000,
        if_exists="append",
        table_schema=data_schema,
    )

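# data_types_bq is not defined in this section. A minimal sketch, assuming it
# maps pandas dtypes to the table_schema list of {"name", "type"} dicts that
# to_gbq accepts; the dtype-kind-to-BigQuery-type mapping shown here is an
# assumption.
def data_types_bq(df):
    dtype_map = {
        "i": "INTEGER",
        "f": "FLOAT",
        "b": "BOOLEAN",
        "M": "TIMESTAMP",
        "O": "STRING",
    }
    return [
        {"name": col, "type": dtype_map.get(df[col].dtype.kind, "STRING")}
        for col in df.columns
    ]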