def test_upload_data_if_table_exists_replace(self):
    raise nose.SkipTest("buggy test")

    destination_table = DESTINATION_TABLE + "4"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_path())

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema, destination_table, _get_project_id(),
               if_exists='replace', private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                          .format(destination_table),
                          project_id=_get_project_id(),
                          private_key=_get_private_key_path())
    self.assertEqual(result['NUM_ROWS'][0], 5)

def main():
    query_android = "SELECT distinct if (af_channel is not null, concat(media_source,'_',af_channel),media_source) as source," \
                    "ifnull(campaign,fb_campaign_name) as campaign, regexp_extract(event_value,'09835[0-9]{7}') as orders " \
                    "FROM AppsFlyer.android_events " \
                    "WHERE regexp_extract(event_value, '09835[0-9]{7}') is not null and media_source!='Organic' " \
                    f"and _PARTITIONTIME = TIMESTAMP('{currdate}')"
    query_ios = "SELECT distinct if (af_channel is not null, concat(media_source,'_',af_channel),media_source) as source," \
                "ifnull(campaign,fb_campaign_name) as campaign, regexp_extract(event_value,'09835[0-9]{7}') as orders " \
                "FROM AppsFlyer.ios_events " \
                "WHERE regexp_extract(event_value , '09835[0-9]{7}') is not null and media_source!='Organic' " \
                f"and _PARTITIONTIME = TIMESTAMP('{currdate}')"

    app = [[row[i] for i in range(3)] for row in queryData(query_android)] + \
          [[row[i] for i in range(3)] for row in queryData(query_ios)]
    if len(app) > 0:
        # app = list(dict((x[2], x) for x in app).values())  # removes duplicate transactions -- kept for reference
        df_app = pd.DataFrame.from_records(app, columns=['source', 'campaign', 'orders'])
        df_app.drop_duplicates(subset='orders', inplace=True)  # remove duplicate transactions
        gbq.to_gbq(df_app, 'Mig_Data.temp_app', projectid, if_exists=import_action)

        QUERY_upd = "UPDATE `Mig_Data.Orders` o SET o.source = t.source, o.campaign=t.campaign " \
                    "FROM `Mig_Data.temp_app` t " \
                    "WHERE o.transaction = t.orders"
        queryData(QUERY_upd)

        table_ref = client.dataset('Mig_Data').table('temp_app')
        if client.get_table(table_ref):
            client.delete_table(table_ref)

def load_albums_bq(albums):
    """Loads json to BigQuery table."""
    from pandas.io import gbq

    # Constants
    project_id = 'secret-compass-181513'
    dataset_name = 'spotify'

    dic_flattened = [flatten_json(d) for d in albums['items']]
    df = pd.DataFrame(dic_flattened)

    # Save albums in Google BigQuery
    bigquery_client = bigquery.Client(project=project_id)
    dataset = bigquery_client.dataset(dataset_name)

    df_recent_albums = df[[
        'album_uri', 'added_at', 'album_artists_0_name', 'album_name',
        'album_type', 'album_label', 'album_release_date',
        'album_tracks_total', 'album_id'
    ]]
    gbq.to_gbq(df_recent_albums, 'spotify.new_albums', project_id,
               if_exists='append')
    return 'dataset loaded to BQ'

def userIdData():
    p = userIdDict()
    for dt, u_id in p.items():
        try:
            QUERY = "with a as ( SELECT distinct date, user.id, " \
                    "trafficSource.source, trafficSource.medium, trafficSource.campaign, clientId, if( visitNumber=1,'New Visitor','Returning Visitor') as userTypeGA," \
                    "trafficSource.keyword, t.device.browser as browser, t.device.deviceCategory as deviceCategory, geoNetwork.city," \
                    "count(user.id) OVER (PARTITION BY user.id order by user.id desc ) cont_number " \
                    "FROM `{0}.{1}.owoxbi_sessions_{2}` as t where safe_cast( user.id as int64) in UNNEST({3})) " \
                    "select * from a where cont_number=1".format(PROJECT_ID, DATASET_ID, dt.replace('-', ''), u_id)
            z = [[row[i] for i in range(11)] for row in queryData(QUERY)]
            df = pd.DataFrame.from_records(z, columns=[
                'date', 'user', 'source', 'Medium', 'campaign', 'clientId',
                'userTypeGA', 'keyword', 'browser', 'deviceCategory', 'city'
            ])
            gbq.to_gbq(df, 'Mig_Data.temp', projectid, if_exists=import_action)  # push the data to GBQ

            QUERY_update = "UPDATE `{0}.Mig_Data.Orders` as a set a.source=t.source, a.Medium=t.Medium, a.campaign=t.campaign, a.clientId = t.clientId, a.userTypeGA =t.userTypeGA, a.keyword=t.keyword," \
                           "a.browser=t.browser, a.device=t.deviceCategory, a.city=t.city " \
                           "from `{0}.Mig_Data.temp` as t " \
                           "where a.date=t.date and kpp=safe_cast(user as int64)".format(PROJECT_ID)
            queryData(QUERY_update, p='via the userId function')

            table_ref = client.dataset('Mig_Data').table('temp')
            if client.get_table(table_ref):
                client.delete_table(table_ref)
        except Exception:
            continue

def fin_obrabotchik(analytics, token):
    primary_data_VK = vk_data_posts(token)
    secondary_data_VK = vk_data_postID(token)
    third_data_GA = GA_Data(analytics, token)

    F_labels = [
        'Id', 'likes', 'reposts', 'comments', 'name', 'date', 'postUrl', 'url'
    ]
    S_labels = ['Id', 'reach_total', 'reach_subscribers', 'links', 'socialNet']
    Th_labels = [
        'Id', 'sessions', 'bounceRate', 'pagePerSession', 'duration',
        'transactions', 'revenue'
    ]

    F_df = pd.DataFrame.from_records(primary_data_VK, columns=F_labels)
    S_df = pd.DataFrame.from_records(secondary_data_VK, columns=S_labels)
    Th_df = pd.DataFrame.from_records(third_data_GA, columns=Th_labels)

    mergedData = pd.merge(F_df, S_df, on=['Id'])
    mergedData_Final = pd.merge(mergedData, Th_df, how='left', on=['Id'])
    mergedData_Final = mergedData_Final[[
        'Id', 'date', 'name', 'reach_total', 'reach_subscribers', 'links',
        'likes', 'reposts', 'comments', 'url', 'sessions', 'bounceRate',
        'pagePerSession', 'duration', 'transactions', 'revenue', 'postUrl',
        'socialNet'
    ]]  # set the column order
    # mergedData_Final.to_csv('111.csv')
    gbq.to_gbq(mergedData_Final, f'{DATASET_ID}.{TABLE_ID_Temp}',
               '78997000000', if_exists='replace')

def insert_row(df, table, replace_val):
    if len(df.index) == 0:
        print("gbq insert records zero")
        return

    project_id = _common_conf['bigquery']['project_id']
    private_key_path = get_abspath(_common_conf['bigquery']['key_path'])
    dataset = _common_conf['bigquery']['dataset']

    # Insert 10,000 rows at a time
    full_table = "{}.{}".format(dataset, table)
    client = get_client(json_key_file=private_key_path, readonly=False,
                        swallow_results=False)
    if client.check_table(dataset, table):
        bq_limit = 10000
        q_num = len(df.index) // bq_limit
        for i in range(0, q_num + 1):
            client = get_client(json_key_file=private_key_path,
                                readonly=False, swallow_results=False)
            ins_df = df[i * bq_limit:(i + 1) * bq_limit].replace(
                np.nan, replace_val)
            row_dict = ins_df.to_dict(orient='index')
            row_data = [x for x in row_dict.values()]
            ret = client.push_rows(dataset, table, row_data)
            if 'insertErrors' in ret:
                msg = "BigQuery Insert Error:\nsample:\n{}\nerror:\n{}"
                raise Exception(msg.format(row_data[0:5], ret))
    else:
        print('{} CREATE TABLE'.format(full_table))
        gbq.to_gbq(df, full_table, project_id)

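# Hedged usage sketch for insert_row above, not part of the original source: it assumes
# _common_conf and the get_client helper are already configured with valid credentials,
# and the table name 'events' plus the frame contents are illustrative placeholders only.
import numpy as np
import pandas as pd

demo_df = pd.DataFrame({'event_id': [1, 2], 'value': [3.5, np.nan]})
# NaNs are replaced with replace_val before the chunked push_rows calls.
insert_row(demo_df, 'events', replace_val=0)
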
def test_upload_data_if_table_exists_replace(self):
    table_name = 'new_test4'
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               chunksize=10000)

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema, "pydata_pandas_bq_testing." + table_name,
               PROJECT_ID, if_exists='replace')

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq(
        "SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name,
        project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], 5)

def test_upload_new_table(self):
    # Attempting to upload to a new table with valid data and a valid schema should succeed
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest(
            'Skipped because authentication information is not available.')

    schema = [
        'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
        'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
        'INTEGER', 'STRING', 'INTEGER'
    ]

    array = [[
        'TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999,
        '00.000.00.000', 1, 'hola', 99999999, False, False, 1, 'Jedi', 11210
    ]]
    df = DataFrame(array, columns=[
        'title', 'id', 'language', 'wp_namespace', 'is_redirect',
        'revision_id', 'contributor_ip', 'contributor_id',
        'contributor_username', 'timestamp', 'is_minor', 'is_bot',
        'reversion_id', 'comment', 'num_characters'
    ])

    gbq.to_gbq(df, 'pandas_testing_dataset.test_data2', schema=schema,
               col_order=None, if_exists='append')

    a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data2")
    self.assertTrue((a == df).all().all())

def test_upload_bad_data_table(self):
    # Attempting to upload data that does not match schema should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest(
            'Skipped because authentication information is not available.')

    schema = [
        'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
        'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
        'INTEGER', 'STRING', 'INTEGER'
    ]

    array = [[
        'TESTING_GBQ\',', False, 'hi', 0, True, 'STRING IN INTEGER',
        '00.000.00.000', 1, 'hola', 99999999, -100, 1000, 1, 'Jedi', 11210
    ]]
    df = DataFrame(array, columns=[
        'title', 'id', 'language', 'wp_namespace', 'is_redirect',
        'revision_id', 'contributor_ip', 'contributor_id',
        'contributor_username', 'timestamp', 'is_minor', 'is_bot',
        'reversion_id', 'comment', 'num_characters'
    ])

    with self.assertRaises(bigquery_client.BigqueryServiceError):
        gbq.to_gbq(df, 'pandas_testing_dataset.test_data1', schema=schema,
                   col_order=None, if_exists='append')

def test_upload_data(self):
    test_size = 1000001

    # create df to test for all BQ datatypes except RECORD
    bools = np.random.randint(2, size=(1, test_size)).astype(bool)
    flts = np.random.randn(1, test_size)
    ints = np.random.randint(1, 10, size=(1, test_size))
    strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
    times = [
        datetime.datetime.now(pytz.timezone('US/Arizona'))
        for t in xrange(test_size)
    ]

    df = DataFrame(
        {
            'bools': bools[0],
            'flts': flts[0],
            'ints': ints[0],
            'strs': strs[0],
            'times': times[0]
        },
        index=range(test_size))

    gbq.to_gbq(df, "pydata_pandas_bq_testing.new_test",
               project_id=PROJECT_ID, chunksize=10000)

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq(
        "SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing.new_test",
        project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size)

def test_upload_replace(self):
    # Attempting to overwrite an existing table with valid data and a valid schema should succeed
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information is not available.')

    schema = ['STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
              'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
              'INTEGER', 'STRING', 'INTEGER']

    # Setup an existing table
    array1 = [['', 1, '', 1, False, 1, '00.111.00.111', 1, 'hola', 1, True,
               True, 1, 'Sith', 1]]
    df1 = DataFrame(array1, columns=['title', 'id', 'language', 'wp_namespace',
                                     'is_redirect', 'revision_id',
                                     'contributor_ip', 'contributor_id',
                                     'contributor_username', 'timestamp',
                                     'is_minor', 'is_bot', 'reversion_id',
                                     'comment', 'num_characters'])
    gbq.to_gbq(df1, 'pandas_testing_dataset.test_data5', schema=schema,
               col_order=None, if_exists='fail')

    # Overwrite the existing table with different data
    array2 = [['TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999,
               '00.000.00.000', 1, 'hola', 99999999, False, False, 1, 'Jedi',
               11210]]
    df2 = DataFrame(array2, columns=['title', 'id', 'language', 'wp_namespace',
                                     'is_redirect', 'revision_id',
                                     'contributor_ip', 'contributor_id',
                                     'contributor_username', 'timestamp',
                                     'is_minor', 'is_bot', 'reversion_id',
                                     'comment', 'num_characters'])
    gbq.to_gbq(df2, 'pandas_testing_dataset.test_data5', schema=schema,
               col_order=None, if_exists='replace')

    # Read the table and confirm the new data is all that is there
    a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data5")
    self.assertTrue((a == df2).all().all())

def load_tracks_bq(tracks):
    """Loads json to BigQuery table."""
    from pandas.io import gbq
    from pandas.io.json import json_normalize  # package for flattening json into a pandas df

    # Constants
    project_id = 'secret-compass-181513'
    dataset_name = 'spotify'

    dic_flattened = [flatten_json(d) for d in tracks['items']]
    df = pd.DataFrame(dic_flattened)

    # Save tracks in Google BigQuery
    bigquery_client = bigquery.Client(project=project_id)
    dataset = bigquery_client.dataset(dataset_name)

    df_recently_played = df[[
        'track_name', 'played_at', 'track_album_artists_0_name',
        'track_album_name', 'track_type', 'track_uri'
    ]]
    gbq.to_gbq(df_recently_played, 'spotify.play_history_new', project_id,
               if_exists='append')
    return 'dataset loaded to BQ'

def test_upload_replace_schema_error(self):
    # Attempting to replace an existing table without specifying a schema should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information is not available.')

    df = DataFrame(self.correct_data_small)
    with self.assertRaises(gbq.SchemaMissing):
        gbq.to_gbq(df, 'pandas_testing_dataset.test_database', schema=None,
                   col_order=None, if_exists='replace')

def test_invalid_column_name_schema(self):
    # Specifying a schema that contains an invalid column name should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information is not available.')

    schema = ['INCORRECT']
    df = DataFrame([[1]], columns=['fake'])
    with self.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df, 'pandas_testing_dataset.test_data', schema=schema,
                   col_order=None, if_exists='append')

def test_invalid_number_of_columns_schema(self):
    # Specifying a schema that does not have the same shape as the dataframe should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information is not available.')

    schema = ['INTEGER']
    df = DataFrame([[1, 'STRING']], columns=['fake', 'fake'])
    with self.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df, 'pandas_testing_dataset.test_data4', schema=schema,
                   col_order=None, if_exists='append')

def test_google_upload_errors_should_raise_exception(self):
    table_name = 'new_test5'
    test_timestamp = datetime.now(pytz.timezone('US/Arizona'))
    bad_df = DataFrame({'bools': [False, False],
                        'flts': [0.0, 1.0],
                        'ints': [0, '1'],
                        'strs': ['a', 1],
                        'times': [test_timestamp, test_timestamp]},
                       index=range(2))
    with tm.assertRaises(gbq.StreamingInsertError):
        gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.' + table_name,
                   PROJECT_ID, verbose=True)

def test_google_upload_errors_should_raise_exception(self):
    destination_table = DESTINATION_TABLE + "5"
    test_timestamp = datetime.now(pytz.timezone('US/Arizona'))
    bad_df = DataFrame({'bools': [False, False],
                        'flts': [0.0, 1.0],
                        'ints': [0, '1'],
                        'strs': ['a', 1],
                        'times': [test_timestamp, test_timestamp]},
                       index=range(2))
    with tm.assertRaises(gbq.StreamingInsertError):
        gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True)

def test_google_upload_errors_should_raise_exception(self):
    test_timestamp = datetime.datetime.now(pytz.timezone('US/Arizona'))
    bad_df = DataFrame({'bools': [False, False],
                        'flts': [0.0, 1.0],
                        'ints': [0, '1'],
                        'strs': ['a', 1],
                        'times': [test_timestamp, test_timestamp]},
                       index=range(2))
    with tm.assertRaises(gbq.UnknownGBQException):
        gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.new_test',
                   project_id=PROJECT_ID)

def stream_dataToBQ(ins_data):
    # BigQuery params
    PROJECT_ID = '78997000000'
    DATASET_ID = 'Temp'
    TABLE_ID = 'OrderPandasPartit'
    import_action = 'append'
    gbq.to_gbq(ins_data, f'{DATASET_ID}.{TABLE_ID}', PROJECT_ID,
               if_exists=import_action)

def test_upload_public_data_error(self):
    # Attempting to upload to a public, read-only, dataset should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information is not available.')

    array = [['TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999,
              '00.000.00.000', 1, 'hola', 99999999, False, False, 1, 'Jedi',
              11210]]
    df = DataFrame(array)
    with self.assertRaises(bigquery_client.BigqueryServiceError):
        gbq.to_gbq(df, 'publicdata:samples.wikipedia', schema=None,
                   col_order=None, if_exists='append')

def test_upload_replace(self):
    # Attempting to overwrite an existing table with valid data and a valid schema should succeed
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest(
            'Skipped because authentication information is not available.')

    schema = [
        'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
        'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
        'INTEGER', 'STRING', 'INTEGER'
    ]

    # Setup an existing table
    array1 = [[
        '', 1, '', 1, False, 1, '00.111.00.111', 1, 'hola', 1, True, True, 1,
        'Sith', 1
    ]]
    df1 = DataFrame(array1, columns=[
        'title', 'id', 'language', 'wp_namespace', 'is_redirect',
        'revision_id', 'contributor_ip', 'contributor_id',
        'contributor_username', 'timestamp', 'is_minor', 'is_bot',
        'reversion_id', 'comment', 'num_characters'
    ])
    gbq.to_gbq(df1, 'pandas_testing_dataset.test_data5', schema=schema,
               col_order=None, if_exists='fail')

    # Overwrite the existing table with different data
    array2 = [[
        'TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999,
        '00.000.00.000', 1, 'hola', 99999999, False, False, 1, 'Jedi', 11210
    ]]
    df2 = DataFrame(array2, columns=[
        'title', 'id', 'language', 'wp_namespace', 'is_redirect',
        'revision_id', 'contributor_ip', 'contributor_id',
        'contributor_username', 'timestamp', 'is_minor', 'is_bot',
        'reversion_id', 'comment', 'num_characters'
    ])
    gbq.to_gbq(df2, 'pandas_testing_dataset.test_data5', schema=schema,
               col_order=None, if_exists='replace')

    # Read the table and confirm the new data is all that is there
    a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data5")
    self.assertTrue((a == df2).all().all())

def upload_data_to_gbq(data, name_table):
    try:
        gbq.to_gbq(data, name_table, project_id='de-exam-kittisak',
                   if_exists='append')
        message = "status:200 Success send data to gbq"
        print(message)
        return message
    except Exception as e:
        raise e

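# Hedged usage sketch for upload_data_to_gbq above, not part of the original source: the
# frame contents and the destination table 'exam_dataset.orders' are illustrative
# placeholders; the wrapper appends rows to that table in the 'de-exam-kittisak' project.
import pandas as pd

orders_df = pd.DataFrame({'order_id': [1, 2], 'amount': [9.99, 24.50]})
upload_data_to_gbq(orders_df, 'exam_dataset.orders')
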
def test_upload_data(self):
    table_name = 'new_test1'
    test_size = 1000001
    df = make_mixed_dataframe_v2(test_size)

    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               chunksize=10000)

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq(
        "SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name,
        project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size)

def dataToGBQ(analytics, token, sdate):
    z = [fin_obrabotchik(analytics, token, sdate)]
    labels = [
        'date', 'reach', 'reach_subscribers', 'comments', 'likes',
        'subscribed', 'unsubscribed', 'copies', 'sessions', 'bounceRate',
        'pageviews', 'duration', 'transactions', 'revenue', 'socialNet',
        'members'
    ]  # column order
    df = pd.DataFrame.from_records(z, columns=labels)
    gbq.to_gbq(df, 'Temp.Social', '78997000000', if_exists='append')

def test_upload_data(self):
    destination_table = DESTINATION_TABLE + "1"
    test_size = 1000001
    df = make_mixed_dataframe_v2(test_size)

    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
                          project_id=PROJECT_ID)
    self.assertEqual(result["NUM_ROWS"][0], test_size)

def test_upload_data(self):
    destination_table = DESTINATION_TABLE + "1"
    test_size = 1000001
    df = make_mixed_dataframe_v2(test_size)

    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size)

def test_upload_replace_schema_error(self):
    # Attempting to replace an existing table without specifying a schema should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest(
            'Skipped because authentication information is not available.')

    df = DataFrame(self.correct_data_small)
    with self.assertRaises(gbq.SchemaMissing):
        gbq.to_gbq(df, 'pandas_testing_dataset.test_database', schema=None,
                   col_order=None, if_exists='replace')

def sql_db_select():
    with open(file_db_connect) as f:
        param_connect = json.load(f)
    db_connect = MySQLdb.connect(user=param_connect['user'],
                                 passwd=param_connect['passwd'],
                                 host=param_connect['host'],
                                 charset='cp1251')
    sql_data = "select distinct z.ZAKAZ_ID,ifnull(purchase_group_id,-1) as group_id from `ukt_sess`.`ZAK` z " \
               "left join `ukt_sess`.`ZAK_ITEMS` zi on (zi.zakaz_id=z.zakaz_id) " \
               "left join `ukt_prod`.`ukt_purch_group_goods` gg on (gg.goods_id=zi.item_id) " \
               "where DATE(z.DATA) between '2019-01-01' and '2019-04-30' and z.date_ans is not NULL and z.ANS is not NULL and z.STATUS>0 and z.archive<>1"
    df_mysql = pd.read_sql(sql_data, con=db_connect)
    gbq.to_gbq(df_mysql, 'Temp.QQ', '78997000000', if_exists='append')  # push the data to GBQ

def test_upload_data_if_table_exists_fail(self):
    destination_table = DESTINATION_TABLE + "2"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    self.table.create(TABLE_ID + "2", gbq._generate_bq_schema(df))

    # Test the default value of if_exists is 'fail'
    with tm.assertRaises(gbq.TableCreationError):
        gbq.to_gbq(df, destination_table, PROJECT_ID)

    # Test the if_exists parameter with value 'fail'
    with tm.assertRaises(gbq.TableCreationError):
        gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='fail')

def dataToGBQ():
    df = pd.read_csv(
        r'C:\Python\ukt\txtcsvFile\appsflyers\organic-in-app-events_2019-01-01_2019-01-31.csv'
    )
    z = len(df)
    df.columns = [i.replace(' ', '_') for i in list(df.columns.values)]
    # alternatively: df.columns = df.columns.str.replace(' ', '_')
    df = df.drop_duplicates()
    gbq.to_gbq(df, f'{DATASET_ID}.{TABLE}', projectid,
               if_exists=import_action)  # push the data to GBQ
    print('Loaded {} rows into {}. Source file: {} rows. Duplicates removed: {}'
          .format(len(df), TABLE, z, z - len(df)))

def test_upload_data_flexible_column_order(self):
    destination_table = DESTINATION_TABLE + "13"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_path())

    df_columns_reversed = df[df.columns[::-1]]

    gbq.to_gbq(df_columns_reversed, destination_table, _get_project_id(),
               if_exists='append', private_key=_get_private_key_path())

def test_invalid_number_of_columns_schema(self):
    # Specifying a schema that does not have the same shape as the dataframe should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest(
            'Skipped because authentication information is not available.')

    schema = ['INTEGER']
    df = DataFrame([[1, 'STRING']], columns=['fake', 'fake'])
    with self.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df, 'pandas_testing_dataset.test_data4', schema=schema,
                   col_order=None, if_exists='append')

def test_invalid_column_name_schema(self):
    # Specifying a schema that contains an invalid column name should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest(
            'Skipped because authentication information is not available.')

    schema = ['INCORRECT']
    df = DataFrame([[1]], columns=['fake'])
    with self.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df, 'pandas_testing_dataset.test_data', schema=schema,
                   col_order=None, if_exists='append')

def test_upload_data(self):
    test_size = 1000001

    # create df to test for all BQ datatypes except RECORD
    bools = np.random.randint(2, size=(1, test_size)).astype(bool)
    flts = np.random.randn(1, test_size)
    ints = np.random.randint(1, 10, size=(1, test_size))
    strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
    times = [datetime.datetime.now(pytz.timezone('US/Arizona'))
             for t in xrange(test_size)]

    df = DataFrame({'bools': bools[0],
                    'flts': flts[0],
                    'ints': ints[0],
                    'strs': strs[0],
                    'times': times[0]},
                   index=range(test_size))

    gbq.to_gbq(df, "pydata_pandas_bq_testing.new_test",
               project_id=PROJECT_ID, chunksize=10000)

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing.new_test",
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size)

def test_upload_data_if_table_exists_fail(self):
    table_name = 'new_test2'
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    gbq.create_table('pydata_pandas_bq_testing.' + table_name,
                     gbq.generate_bq_schema(df), PROJECT_ID)

    # Test the default value of if_exists is 'fail'
    with tm.assertRaises(gbq.TableCreationError):
        gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID)

    # Test the if_exists parameter with value 'fail'
    with tm.assertRaises(gbq.TableCreationError):
        gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
                   if_exists='fail')

def load_df_to_bq_tmp(self, df, tmp_table):
    '''
    This function inserts the provided dataframe into a temp table in
    BigQuery, which is used in other parts of this class (e.g. L1 and L2
    expansions) to join on by patent number.
    '''
    print('Loading dataframe with cols {}, shape {}, to {}'.format(
        df.columns, df.shape, tmp_table))
    gbq.to_gbq(dataframe=df,
               destination_table=tmp_table,
               project_id=self.bq_project,
               if_exists='replace',
               verbose=False)
    print('Completed loading temp table.')

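# Hedged usage sketch for load_df_to_bq_tmp above, not part of the original source: the
# surrounding class is not shown, so `expander` (an instance of that class with a
# bq_project attribute), the patent numbers, and the temp table name
# 'tmp_dataset.patent_seed' are assumptions; the method replaces the temp table so that
# later expansion queries can join against it by patent number.
import pandas as pd

seed_df = pd.DataFrame({'patent_number': ['US-1234567-A', 'US-7654321-B2']})
expander.load_df_to_bq_tmp(seed_df, tmp_table='tmp_dataset.patent_seed')
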
def DfFromSQL():
    with open(file_db_connect) as f:
        param_connect = json.load(f)
    db_connect = MySQLdb.connect(user=param_connect['user'],
                                 passwd=param_connect['passwd'],
                                 host=param_connect['host'],
                                 db=param_connect['db_sess'],
                                 charset='cp1251')
    sql_query = "SELECT master_order_id,flag_cancel,pay_sum FROM ZAKAZ " \
                "where Date(created) between '{}' and '{}' and archive=0".format(DateStart, DateStop)
    df_mysql = pd.read_sql(sql_query, con=db_connect)
    db_connect.close()
    gbq.to_gbq(df_mysql, 'Mig_Data.Lud', '78997000000', if_exists='replace')  # push the data to GBQ

def test_upload_data(self):
    destination_table = DESTINATION_TABLE + "1"
    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                          .format(destination_table),
                          project_id=_get_project_id(),
                          private_key=_get_private_key_path())
    self.assertEqual(result['NUM_ROWS'][0], test_size)

def test_upload_data_as_service_account_with_key_contents(self): destination_table = "{0}.{1}".format(DATASET_ID + "3", TABLE_ID + "1") test_size = 10 df = make_mixed_dataframe_v2(test_size) gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, private_key=_get_private_key_contents()) sleep(30) # <- Curses Google!!! result = gbq.read_gbq( "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), project_id=_get_project_id(), private_key=_get_private_key_contents()) self.assertEqual(result['NUM_ROWS'][0], test_size)
def test_upload_data_as_service_account_with_key_contents(self):
    destination_table = DESTINATION_TABLE + "12"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)

    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000,
               private_key=PRIVATE_KEY_JSON_CONTENTS)

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq(
        "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
        project_id=PROJECT_ID,
        private_key=PRIVATE_KEY_JSON_CONTENTS)
    self.assertEqual(result['NUM_ROWS'][0], test_size)

def test_google_upload_errors_should_raise_exception(self):
    destination_table = DESTINATION_TABLE + "5"
    test_timestamp = datetime.now(pytz.timezone("US/Arizona"))
    bad_df = DataFrame(
        {
            "bools": [False, False],
            "flts": [0.0, 1.0],
            "ints": [0, "1"],
            "strs": ["a", 1],
            "times": [test_timestamp, test_timestamp],
        },
        index=range(2),
    )
    with tm.assertRaises(gbq.StreamingInsertError):
        gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True)

def test_upload_bad_data_table(self):
    # Attempting to upload data that does not match schema should fail
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information is not available.')

    schema = ['STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
              'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
              'INTEGER', 'STRING', 'INTEGER']

    array = [['TESTING_GBQ\',', False, 'hi', 0, True, 'STRING IN INTEGER',
              '00.000.00.000', 1, 'hola', 99999999, -100, 1000, 1, 'Jedi',
              11210]]
    df = DataFrame(array, columns=['title', 'id', 'language', 'wp_namespace',
                                   'is_redirect', 'revision_id',
                                   'contributor_ip', 'contributor_id',
                                   'contributor_username', 'timestamp',
                                   'is_minor', 'is_bot', 'reversion_id',
                                   'comment', 'num_characters'])
    with self.assertRaises(bigquery_client.BigqueryServiceError):
        gbq.to_gbq(df, 'pandas_testing_dataset.test_data1', schema=schema,
                   col_order=None, if_exists='append')

def test_upload_data_if_table_exists_replace(self):
    destination_table = DESTINATION_TABLE + "4"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID,
               if_exists='replace')

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], 5)

def test_upload_data_if_table_exists_replace(self):
    table_name = 'new_test4'
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               chunksize=10000)

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema, "pydata_pandas_bq_testing." + table_name,
               PROJECT_ID, if_exists='replace')

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name,
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], 5)

def test_upload_new_table(self):
    # Attempting to upload to a new table with valid data and a valid schema should succeed
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information is not available.')

    schema = ['STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'INTEGER',
              'STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN', 'BOOLEAN',
              'INTEGER', 'STRING', 'INTEGER']

    array = [['TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999,
              '00.000.00.000', 1, 'hola', 99999999, False, False, 1, 'Jedi',
              11210]]
    df = DataFrame(array, columns=['title', 'id', 'language', 'wp_namespace',
                                   'is_redirect', 'revision_id',
                                   'contributor_ip', 'contributor_id',
                                   'contributor_username', 'timestamp',
                                   'is_minor', 'is_bot', 'reversion_id',
                                   'comment', 'num_characters'])

    gbq.to_gbq(df, 'pandas_testing_dataset.test_data2', schema=schema,
               col_order=None, if_exists='append')

    a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data2")
    self.assertTrue((a == df).all().all())

def test_upload_data_as_service_account_with_key_contents(self):
    raise nose.SkipTest("flaky test")

    destination_table = DESTINATION_TABLE + "12"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)

    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_contents())

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq(
        "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
        project_id=_get_project_id(),
        private_key=_get_private_key_contents())
    self.assertEqual(result['NUM_ROWS'][0], test_size)

def test_upload_data_if_table_exists_append(self):
    destination_table = DESTINATION_TABLE + "3"
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_path())

    # Test the if_exists parameter with value 'append'
    gbq.to_gbq(df, destination_table, _get_project_id(), if_exists='append',
               private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                          .format(destination_table),
                          project_id=_get_project_id(),
                          private_key=_get_private_key_path())
    self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

    # Try inserting with a different schema, confirm failure
    with tm.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df_different_schema, destination_table,
                   _get_project_id(), if_exists='append',
                   private_key=_get_private_key_path())

def test_upload_data_if_table_exists_append(self):
    table_name = 'new_test3'
    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               chunksize=10000)

    # Test the if_exists parameter with value 'append'
    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               if_exists='append')

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing." + table_name,
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

    # Try inserting with a different schema, confirm failure
    with tm.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df_different_schema,
                   "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
                   if_exists='append')

def test_to_gbq_should_fail_if_invalid_table_name_passed(self):
    with tm.assertRaises(gbq.NotFoundException):
        gbq.to_gbq(DataFrame(), 'invalid_table_name', project_id="1234")

def test_to_gbq_with_no_project_id_given_should_fail(self):
    with tm.assertRaises(TypeError):
        gbq.to_gbq(DataFrame(), 'dataset.tablename')