def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
    _skip_if_no_private_key_path()

    with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
        gbq.read_gbq(
            'SELECT 1', project_id='x',
            private_key=re.sub('[a-z]', '9', _get_private_key_path()))
def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
    _skip_if_no_private_key_contents()

    with tm.assertRaises(gbq.InvalidPrivateKeyFormat):
        gbq.read_gbq(
            'SELECT 1', project_id='x',
            private_key=re.sub('[a-z]', '9', PRIVATE_KEY_JSON_CONTENTS))
def test_invalid_option_for_sql_dialect(self):
    sql_statement = "SELECT DISTINCT id FROM " \
                    "`publicdata.samples.wikipedia` LIMIT 10"

    # Test that an invalid option for `dialect` raises ValueError
    with tm.assertRaises(ValueError):
        gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
                     dialect='invalid')

    # Test that a correct option for dialect succeeds
    # to make sure ValueError was due to invalid dialect
    gbq.read_gbq(sql_statement, project_id=PROJECT_ID, dialect='standard')
def test_standard_sql(self):
    standard_sql = "SELECT DISTINCT id FROM " \
                   "`publicdata.samples.wikipedia` LIMIT 10"

    # Test that a standard sql statement fails when using
    # the legacy SQL dialect (default value)
    with tm.assertRaises(gbq.GenericGBQException):
        gbq.read_gbq(standard_sql, project_id=PROJECT_ID)

    # Test that a standard sql statement succeeds when
    # setting dialect='standard'
    df = gbq.read_gbq(standard_sql, project_id=PROJECT_ID,
                      dialect='standard')
    self.assertEqual(len(df.drop_duplicates()), 10)
def test_legacy_sql(self):
    legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"

    # Test that a legacy sql statement fails when
    # setting dialect='standard'
    with tm.assertRaises(gbq.GenericGBQException):
        gbq.read_gbq(legacy_sql, project_id=PROJECT_ID, dialect='standard')

    # Test that a legacy sql statement succeeds when
    # setting dialect='legacy'
    df = gbq.read_gbq(legacy_sql, project_id=PROJECT_ID, dialect='legacy')
    self.assertEqual(len(df.drop_duplicates()), 10)
def test_upload_data(self):
    test_size = 1000001
    # create df to test for all BQ datatypes except RECORD
    bools = np.random.randint(2, size=(1, test_size)).astype(bool)
    flts = np.random.randn(1, test_size)
    ints = np.random.randint(1, 10, size=(1, test_size))
    strs = np.random.randint(1, 10, size=(1, test_size)).astype(str)
    times = [datetime.datetime.now(pytz.timezone('US/Arizona'))
             for t in xrange(test_size)]

    df = DataFrame({'bools': bools[0],
                    'flts': flts[0],
                    'ints': ints[0],
                    'strs': strs[0],
                    'times': times[0]},
                   index=range(test_size))

    gbq.to_gbq(df, "pydata_pandas_bq_testing.new_test",
               project_id=PROJECT_ID, chunksize=10000)
    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq(
        "SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing.new_test",
        project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size)
def test_should_read_as_user_account(self):
    if _in_travis_environment():
        raise nose.SkipTest("Cannot run local auth in travis environment")

    query = 'SELECT "PI" as VALID_STRING'
    df = gbq.read_gbq(query, project_id=_get_project_id())
    tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))
def test_upload_data_if_table_exists_append(self):
    destination_table = DESTINATION_TABLE + "3"

    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_path())

    # Test the if_exists parameter with value 'append'
    gbq.to_gbq(df, destination_table, _get_project_id(),
               if_exists='append', private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                          .format(destination_table),
                          project_id=_get_project_id(),
                          private_key=_get_private_key_path())
    self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

    # Try inserting with a different schema, confirm failure
    with tm.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df_different_schema, destination_table,
                   _get_project_id(), if_exists='append',
                   private_key=_get_private_key_path())
def generate_analytics_cards(service, project_id, backup_date):
    # We generate the analytics cards from the ProblemLog table,
    # only if necessary.  Note that this query is expensive, so
    # we only do this once (and subsequent queries can just look
    # at the analytics cards - much smaller).
    #
    # Let's look at the most recently done analytics card
    # and if it's close enough to the backup date, assume
    # that we don't need to regenerate the table.
    results = gbq.read_gbq('SELECT MAX(time_done) as max_time \n'
                           'FROM [tony.analytics_cards_1]',
                           project_id=project_id)
    usec = int(results['max_time'][0])
    ts = usec / 1000 / 1000
    last_ac_time = datetime.datetime.fromtimestamp(ts)
    last_date = datetime.datetime.strptime(backup_date, '%Y_%m_%d')
    gap = last_date - last_ac_time
    gap_in_days = gap.days
    print 'Analytics card table is', gap, 'old'

    if gap_in_days > 1:
        print 'Regenerating analytics_cards_1...'
        query_string = (
            'SELECT user_id, task_type, time_done, exercise, correct\n'
            'FROM [%s.ProblemLog]\n'
            'WHERE task_type = \'mastery.analytics\'\n'
        ) % backup_date
        run_query(service, project_id, query_string, 'analytics_cards_1')
    else:
        print 'Table is up to date. Skipping...'
def test_upload_data_if_table_exists_replace(self):
    raise nose.SkipTest("buggy test")

    destination_table = DESTINATION_TABLE + "4"

    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
               private_key=_get_private_key_path())

    # Test the if_exists parameter with the value 'replace'.
    gbq.to_gbq(df_different_schema, destination_table,
               _get_project_id(), if_exists='replace',
               private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}"
                          .format(destination_table),
                          project_id=_get_project_id(),
                          private_key=_get_private_key_path())
    self.assertEqual(result['NUM_ROWS'][0], 5)
def test_download_dataset_larger_than_200k_rows(self):
    test_size = 200005
    # Test for known BigQuery bug in datasets larger than 100k rows
    # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
    df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] "
                      "GROUP EACH BY id ORDER BY id ASC LIMIT {0}"
                      .format(test_size),
                      project_id=PROJECT_ID)
    self.assertEqual(len(df.drop_duplicates()), test_size)
def test_index_column(self):
    query = "SELECT 'a' as STRING_1, 'b' as STRING_2"
    result_frame = gbq.read_gbq(
        query, project_id=PROJECT_ID, index_col="STRING_1")
    correct_frame = DataFrame(
        {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1")
    tm.assert_equal(result_frame.index.name, correct_frame.index.name)
def test_should_properly_handle_arbitrary_timestamp(self):
    query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP'
    df = gbq.read_gbq(query, project_id=_get_project_id(),
                      private_key=_get_private_key_path())
    tm.assert_frame_equal(df, DataFrame({
        'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')]
    }))
def test_should_read_as_service_account_with_key_contents(self):
    _skip_if_no_private_key_contents()
    query = 'SELECT "PI" as VALID_STRING'
    df = gbq.read_gbq(query, project_id=_get_project_id(),
                      private_key=_get_private_key_contents())
    tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))
def test_zero_rows(self):
    # Bug fix for https://github.com/pydata/pandas/issues/10273
    df = gbq.read_gbq("SELECT title, language FROM "
                      "[publicdata:samples.wikipedia] "
                      "where timestamp=-9999999",
                      project_id=PROJECT_ID)
    expected_result = DataFrame(columns=['title', 'language'])
    self.assert_frame_equal(df, expected_result)
def test_download_dataset_larger_than_200k_rows(self):
    # Test for known BigQuery bug in datasets larger than 100k rows
    # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
    df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] "
                      "GROUP EACH BY id ORDER BY id ASC LIMIT 200005",
                      project_id=PROJECT_ID)
    self.assertEqual(len(df.drop_duplicates()), 200005)
def test_unicode_string_conversion_and_normalization(self):
    correct_test_datatype = DataFrame({'UNICODE_STRING': [u("\xe9\xfc")]})
    query = 'SELECT "\xc3\xa9\xc3\xbc" as UNICODE_STRING'
    df = gbq.read_gbq(query, project_id=PROJECT_ID)
    tm.assert_frame_equal(df, correct_test_datatype)
def test_should_properly_handle_valid_floats(self):
    query = 'SELECT PI() as VALID_FLOAT'
    df = gbq.read_gbq(query, project_id=_get_project_id(),
                      private_key=_get_private_key_path())
    tm.assert_frame_equal(
        df, DataFrame({'VALID_FLOAT': [3.141592653589793]}))
def test_upload_replace(self):
    # Attempting to overwrite an existing table with valid data and a
    # valid schema should succeed
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information '
                            'is not available.')

    schema = ['STRING', 'INTEGER', 'STRING', 'INTEGER', 'BOOLEAN',
              'INTEGER', 'STRING', 'INTEGER', 'STRING', 'INTEGER',
              'BOOLEAN', 'BOOLEAN', 'INTEGER', 'STRING', 'INTEGER']

    # Setup an existing table
    array1 = [['', 1, '', 1, False, 1, '00.111.00.111', 1, 'hola', 1,
               True, True, 1, 'Sith', 1]]
    df1 = DataFrame(array1,
                    columns=['title', 'id', 'language', 'wp_namespace',
                             'is_redirect', 'revision_id',
                             'contributor_ip', 'contributor_id',
                             'contributor_username', 'timestamp',
                             'is_minor', 'is_bot', 'reversion_id',
                             'comment', 'num_characters'])
    gbq.to_gbq(df1, 'pandas_testing_dataset.test_data5', schema=schema,
               col_order=None, if_exists='fail')

    array2 = [['TESTING_GBQ', 999999999, 'hi', 0, True, 9999999999,
               '00.000.00.000', 1, 'hola', 99999999, False, False, 1,
               'Jedi', 11210]]
    # Overwrite the existing table with different data
    df2 = DataFrame(array2,
                    columns=['title', 'id', 'language', 'wp_namespace',
                             'is_redirect', 'revision_id',
                             'contributor_ip', 'contributor_id',
                             'contributor_username', 'timestamp',
                             'is_minor', 'is_bot', 'reversion_id',
                             'comment', 'num_characters'])
    gbq.to_gbq(df2, 'pandas_testing_dataset.test_data5', schema=schema,
               col_order=None, if_exists='replace')

    # Read the table and confirm the new data is all that is there
    a = gbq.read_gbq("SELECT * FROM pandas_testing_dataset.test_data5")
    self.assertTrue((a == df2).all().all())
def test_vasopressor_doses(dataset, project_id):
    # verify vasopressors have reasonable doses
    # based on uptodate graphic 99963 version 19.0
    # double the maximum dose used in refractory shock is the upper limit used
    itemids = {
        'milrinone': 221986,
        'dobutamine': 221653,
        'dopamine': 221662,
        'epinephrine': 221289,
        'norepinephrine': 221906,
        'phenylephrine': 221749,
        'vasopressin': 222315,
    }
    max_dose = {
        'milrinone': 1.5,
        'dobutamine': 40,
        'dopamine': 40,
        'epinephrine': 4,
        'norepinephrine': 6.6,
        'phenylephrine': 18.2,
        'vasopressin': 0.08,
    }

    for vaso, dose in max_dose.items():
        query = f"""
        select COUNT(vaso_rate) AS n_above_rate
        FROM mimic_derived.{vaso}
        WHERE vaso_rate >= {dose}
        """
        df = gbq.read_gbq(query, project_id=project_id, dialect="standard")
        n_above_rate = df.loc[0, 'n_above_rate']
        assert n_above_rate == 0, \
            f'found {vaso} rows with dose above {dose}, potentially incorrect'
def test_upload_data_if_table_exists_append(self):
    destination_table = DESTINATION_TABLE + "3"

    test_size = 10
    df = make_mixed_dataframe_v2(test_size)
    df_different_schema = tm.makeMixedDataFrame()

    # Initialize table with sample data
    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

    # Test the if_exists parameter with value 'append'
    gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='append')

    sleep(30)  # <- Curses Google!!!

    result = gbq.read_gbq(
        "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
        project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size * 2)

    # Try inserting with a different schema, confirm failure
    with tm.assertRaises(gbq.InvalidSchema):
        gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID,
                   if_exists='append')
def test_should_read_as_service_account_with_key_contents(self):
    _skip_if_no_private_key_contents()
    query = 'SELECT "PI" as VALID_STRING'
    df = gbq.read_gbq(query, project_id=PROJECT_ID,
                      private_key=PRIVATE_KEY_JSON_CONTENTS)
    tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))
def test_column_order_plus_index(self):
    query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
    col_order = ["STRING_3", "STRING_2"]
    result_frame = gbq.read_gbq(query, project_id=PROJECT_ID,
                                index_col="STRING_1", col_order=col_order)
    correct_frame = DataFrame({"STRING_1": ["a"], "STRING_2": ["b"],
                               "STRING_3": ["c"]})
    correct_frame.set_index("STRING_1", inplace=True)
    correct_frame = correct_frame[col_order]
    tm.assert_frame_equal(result_frame, correct_frame)
def flag_status():
    # Fetch data from the BigQuery table
    query = '''select count(*) from airflow.covid '''
    result = gbq.read_gbq(query, project_id='airflow29thmay-08')
    print(result)

    data = pd.read_csv('/home/nineleaps/PycharmProjects/air/file1.csv')
    total_rows = len(data.axes[0])
    print(total_rows)
def test_zero_rows(self):
    # Bug fix for https://github.com/pydata/pandas/issues/10273
    df = gbq.read_gbq(
        "SELECT title, language FROM "
        "[publicdata:samples.wikipedia] where "
        "timestamp=-9999999",
        project_id=PROJECT_ID,
    )
    expected_result = DataFrame(columns=["title", "language"])
    self.assert_frame_equal(df, expected_result)
def test_column_order_plus_index(self):
    query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
    col_order = ['STRING_3', 'STRING_2']
    result_frame = gbq.read_gbq(query, project_id=PROJECT_ID,
                                index_col='STRING_1', col_order=col_order)
    correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b'],
                               'STRING_3': ['c']})
    correct_frame.set_index('STRING_1', inplace=True)
    correct_frame = correct_frame[col_order]
    tm.assert_frame_equal(result_frame, correct_frame)
def test_column_order(self):
    query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
    col_order = ['STRING_3', 'STRING_1', 'STRING_2']
    result_frame = gbq.read_gbq(
        query, project_id=PROJECT_ID, col_order=col_order)
    correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b'],
                               'STRING_3': ['c']})[col_order]
    tm.assert_frame_equal(result_frame, correct_frame)
def test_index_column(self):
    query = "SELECT 'a' as STRING_1, 'b' as STRING_2"
    result_frame = gbq.read_gbq(query, project_id=_get_project_id(),
                                index_col="STRING_1",
                                private_key=_get_private_key_path())
    correct_frame = DataFrame(
        {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1")
    tm.assert_equal(result_frame.index.name, correct_frame.index.name)
def test_invalid_option_for_sql_dialect(self):
    sql_statement = "SELECT DISTINCT id FROM " \
                    "`publicdata.samples.wikipedia` LIMIT 10"

    # Test that an invalid option for `dialect` raises ValueError
    with tm.assertRaises(ValueError):
        gbq.read_gbq(sql_statement, project_id=_get_project_id(),
                     dialect='invalid',
                     private_key=_get_private_key_path())

    # Test that a correct option for dialect succeeds
    # to make sure ValueError was due to invalid dialect
    gbq.read_gbq(sql_statement, project_id=_get_project_id(),
                 dialect='standard',
                 private_key=_get_private_key_path())
def test_should_properly_handle_timestamp_unix_epoch(self):
    query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH'
    df = gbq.read_gbq(query, project_id=PROJECT_ID)
    tm.assert_frame_equal(
        df,
        DataFrame(
            {'UNIX_EPOCH': [np.datetime64('1970-01-01T00:00:00.000000Z')]}))
def load_training_data_from_pubs(self, training_publications_df):
    tmp_table = 'patents._tmp_training'
    self.load_df_to_bq_tmp(df=training_publications_df, tmp_table=tmp_table)

    training_data_query = '''
    SELECT DISTINCT
        REGEXP_EXTRACT(LOWER(p.publication_number),
                       r'[a-z]+-(\d+)-[a-z0-9]+') as pub_num,
        p.publication_number,
        p.family_id,
        p.priority_date,
        title.text as title_text,
        abstract.text as abstract_text,
        'unused' as claims_text,
        --SUBSTR(claims.text, 0, 5000) as claims_text,
        'unused' as description_text,
        --SUBSTR(description.text, 0, 5000) as description_text,
        STRING_AGG(citations.publication_number) AS refs,
        STRING_AGG(cpcs.code) AS cpcs
    FROM
        `patents-public-data.patents.publications` p,
        `{}` as tmp,
        UNNEST(p.title_localized) AS title,
        UNNEST(p.abstract_localized) AS abstract,
        UNNEST(p.claims_localized) AS claims,
        UNNEST(p.description_localized) AS description,
        UNNEST(p.title_localized) AS title_lang,
        UNNEST(p.abstract_localized) AS abstract_lang,
        UNNEST(p.claims_localized) AS claims_lang,
        UNNEST(p.description_localized) AS description_lang,
        UNNEST(citation) AS citations,
        UNNEST(cpc) AS cpcs
    WHERE
        p.publication_number = tmp.publication_number
        AND country_code = 'US'
        AND title_lang.language = 'en'
        AND abstract_lang.language = 'en'
        AND claims_lang.language = 'en'
        AND description_lang.language = 'en'
    GROUP BY
        p.publication_number, p.family_id, p.priority_date,
        title.text, abstract.text, claims.text, description.text
    ;
    '''.format(tmp_table)

    print('Loading patent texts from provided publication numbers.')
    # print('Training data query:\n{}'.format(training_data_query))
    training_data_df = gbq.read_gbq(
        query=training_data_query,
        project_id=self.bq_project,
        verbose=False,
        dialect='standard',
        configuration={'query': {'useQueryCache': True,
                                 'allowLargeResults': False}})

    return training_data_df
def test_should_properly_handle_arbitrary_timestamp(self):
    query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP'
    df = gbq.read_gbq(query, project_id=PROJECT_ID)
    tm.assert_frame_equal(df, DataFrame({
        'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')]
    }))
def test_column_order(self):
    query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
    col_order = ['STRING_3', 'STRING_1', 'STRING_2']
    result_frame = gbq.read_gbq(query, project_id=_get_project_id(),
                                col_order=col_order,
                                private_key=_get_private_key_path())
    correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b'],
                               'STRING_3': ['c']})[col_order]
    tm.assert_frame_equal(result_frame, correct_frame)
def test_standard_sql(self):
    standard_sql = "SELECT DISTINCT id FROM " \
                   "`publicdata.samples.wikipedia` LIMIT 10"

    # Test that a standard sql statement fails when using
    # the legacy SQL dialect (default value)
    with tm.assertRaises(gbq.GenericGBQException):
        gbq.read_gbq(standard_sql, project_id=_get_project_id(),
                     private_key=_get_private_key_path())

    # Test that a standard sql statement succeeds when
    # setting dialect='standard'
    df = gbq.read_gbq(standard_sql, project_id=_get_project_id(),
                      dialect='standard',
                      private_key=_get_private_key_path())
    self.assertEqual(len(df.drop_duplicates()), 10)
def test_legacy_sql(self):
    legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"

    # Test that a legacy sql statement fails when
    # setting dialect='standard'
    with tm.assertRaises(gbq.GenericGBQException):
        gbq.read_gbq(legacy_sql, project_id=_get_project_id(),
                     dialect='standard',
                     private_key=_get_private_key_path())

    # Test that a legacy sql statement succeeds when
    # setting dialect='legacy'
    df = gbq.read_gbq(legacy_sql, project_id=_get_project_id(),
                      dialect='legacy',
                      private_key=_get_private_key_path())
    self.assertEqual(len(df.drop_duplicates()), 10)
def test_download_dataset_larger_than_100k_rows(self):
    # Test for known BigQuery bug in datasets larger than 100k rows
    # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
    if not os.path.exists(self.bq_token):
        raise nose.SkipTest('Skipped because authentication information '
                            'is not available.')

    client = gbq._authenticate()
    a = gbq.read_gbq(
        "SELECT id, FROM [publicdata:samples.wikipedia] LIMIT 100005")
    self.assertTrue(len(a) == 100005)
def expand_l1(self, cpc_codes_series, refs_series):
    self.load_df_to_bq_tmp(pd.DataFrame(refs_series, columns=['pub_num']),
                           self.l1_tmp_table)

    cpc_where_clause = ",".join("'" + cpc_codes_series + "'")

    expansion_query = '''
    SELECT DISTINCT publication_number, ExpansionLevel, refs
    FROM (
        SELECT
            b.publication_number,
            'L1' as ExpansionLevel,
            STRING_AGG(citations.publication_number) AS refs
        FROM
            `patents-public-data.patents.publications` AS b,
            UNNEST(citation) AS citations,
            UNNEST(cpc) AS cpcs
        WHERE
            ( cpcs.code IN ( {} ) )
            AND citations.publication_number != ''
            AND country_code IN ('US')
        GROUP BY b.publication_number

        UNION ALL

        SELECT
            b.publication_number,
            'L1' as ExpansionLevel,
            STRING_AGG(citations.publication_number) AS refs
        FROM
            `patents-public-data.patents.publications` AS b,
            `{}` as tmp,
            UNNEST(citation) AS citations
        WHERE
            ( b.publication_number = tmp.pub_num )
            AND citations.publication_number != ''
        GROUP BY b.publication_number
    )
    ;
    '''.format(cpc_where_clause, self.l1_tmp_table)

    # print(expansion_query)
    expansion_df = gbq.read_gbq(query=expansion_query,
                                project_id=self.bq_project,
                                verbose=False,
                                dialect='standard')
    return expansion_df
def main(project_id, batch, num_retries, interval, use_legacy_sql):
    # [START build_service]
    # Construct the service object for interacting with the BigQuery API.
    bigquery = googleapiclient.discovery.build('bigquery', 'v2')
    # [END build_service]

    query_string = (
        "SELECT domain, date FROM (SELECT domain, date, bundle, "
        "SUM(available_inventory) AS auctionss, "
        "SUM(demand_side_revenue) AS demand_side_revenue "
        "FROM TABLE_DATE_RANGE(reporting_hudson_views.stats_daily_, "
        "TIMESTAMP('2018-03-01'), TIMESTAMP('2018-03-25')) "
        "GROUP BY date, domain, bundle) "
        "GROUP BY domain, bundle, date HAVING bundle IS NULL "
        "order by demand_side_revenue DESC")

    # Submit the job and wait for it to complete.
    query_job = async_query(bigquery, project_id, query_string, batch,
                            num_retries, use_legacy_sql)

    df3 = pd.read_csv("AllDevice.csv")
    df = gbq.read_gbq(query_string, project_id=project_id)
    print(df.shape)

    frames = [df, df3]
    df5 = pd.concat(frames)
    print(df5.shape)
    df5.to_csv("AllDevice.csv", index=False)

    df4 = df5["device_id"]
    df4.to_csv(filename, header=False, index=False)

    host = helper_config['ftp']['host']
    username = helper_config['ftp']['user_name']
    password = helper_config['ftp']['password']
    print(host + username + password)
    myFTP = ftplib.FTP(host, username, password)
    myFTP.set_pasv(False)
    file = open(filename, "rb")
    filename1 = 'STOR ' + filename
    myFTP.storbinary(filename1, file)

    poll_job(bigquery, query_job)

    # Page through the result set and print all results.
    page_token = None
    while True:
        page = bigquery.jobs().getQueryResults(
            pageToken=page_token,
            **query_job['jobReference']).execute(num_retries=2)
        page_token = page.get('pageToken')
        if not page_token:
            break
def bqtopd():
    query = """
    SELECT CMC_ID, CC_USD_PRICE, Timestamp
    FROM Market_Fetch.raw_prices
    WHERE CMC_ID = 'revain'
    ORDER BY Timestamp DESC
    LIMIT 10;
    """
    data_frame = gbq.read_gbq(query, "lambo-192519")
    print(data_frame.head())
def test_download_dataset_larger_than_200k_rows(self):
    test_size = 200005
    # Test for known BigQuery bug in datasets larger than 100k rows
    # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
    df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] "
                      "GROUP EACH BY id ORDER BY id ASC LIMIT {0}"
                      .format(test_size),
                      project_id=_get_project_id(),
                      private_key=_get_private_key_path())
    self.assertEqual(len(df.drop_duplicates()), test_size)
def conexao_bq(select):
    # Point to the service-account JSON credentials file exported from GCP
    credentials = service_account.Credentials.from_service_account_file(
        'C:\\Users\\bruno\\OneDrive\\BigQUery\\BigQuery-d49b6d4c827c.json')

    # Run the query against BigQuery with the credentials and project ID
    df = gbq.read_gbq(select, project_id='bigquery-194320',
                      credentials=credentials)

    # Return the query result
    return df
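A minimal usage sketch for the conexao_bq helper above; the dataset and table name in the query are hypothetical placeholders, not part of the original code.

# Hypothetical call to conexao_bq; 'meu_dataset.minha_tabela' is a placeholder table.
resultado = conexao_bq('SELECT COUNT(*) AS total FROM meu_dataset.minha_tabela')
print(resultado['total'][0])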
def test_zero_rows(self):
    # Bug fix for https://github.com/pydata/pandas/issues/10273
    df = gbq.read_gbq("SELECT title, id "
                      "FROM [publicdata:samples.wikipedia] "
                      "WHERE timestamp=-9999999",
                      project_id=PROJECT_ID)
    page_array = np.zeros(
        (0,), dtype=[('title', object), ('id', np.dtype(float))])
    expected_result = DataFrame(page_array, columns=['title', 'id'])
    self.assert_frame_equal(df, expected_result)
def get_row_count_from_table():
    try:
        sql = 'select count(*) as total_rows from Covid19.statewise_daily_cases'
        total_rows_df = gbq.read_gbq(query=sql, project_id=project_id,
                                     credentials=credentials)
        return total_rows_df['total_rows'][0]
    except GenericGBQException as e:
        logging.error("Exception: " + str(e))
        raise
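The function above relies on module-level project_id, credentials, logging, and GenericGBQException names. A sketch of how that setup might look, with a placeholder project ID and key path; the exception's import location can vary across pandas-gbq versions.

# Assumed module-level setup for get_row_count_from_table (placeholder values).
import logging
from google.oauth2 import service_account
from pandas.io import gbq
from pandas_gbq.gbq import GenericGBQException  # import path may differ by pandas-gbq version

project_id = 'my-gcp-project'  # placeholder project ID
credentials = service_account.Credentials.from_service_account_file(
    '/path/to/service_account_key.json')  # placeholder key path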
def test_query_inside_configuration(self):
    query_no_use = 'SELECT "PI_WRONG" as VALID_STRING'
    query = 'SELECT "PI" as VALID_STRING'
    config = {
        'query': {
            "query": query,
            "useQueryCache": False,
        }
    }

    # Test that it can't pass query both
    # inside config and as parameter
    with tm.assertRaises(ValueError):
        gbq.read_gbq(query_no_use, project_id=_get_project_id(),
                     private_key=_get_private_key_path(),
                     configuration=config)

    df = gbq.read_gbq(None, project_id=_get_project_id(),
                      private_key=_get_private_key_path(),
                      configuration=config)
    tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']}))
def test_upload_data(self):
    table_name = 'new_test1'
    test_size = 1000001
    df = make_mixed_dataframe_v2(test_size)

    gbq.to_gbq(df, "pydata_pandas_bq_testing." + table_name, PROJECT_ID,
               chunksize=10000)

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS "
                          "FROM pydata_pandas_bq_testing." + table_name,
                          project_id=PROJECT_ID)
    self.assertEqual(result['NUM_ROWS'][0], test_size)
def test_upload_data(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 1000001
    df = make_mixed_dataframe_v2(test_size)

    gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000)

    sleep(60)  # <- Curses Google!!!

    result = gbq.read_gbq(
        "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
        project_id=PROJECT_ID)
    self.assertEqual(result["NUM_ROWS"][0], test_size)
def test_unicode_string_conversion_and_normalization(self):
    correct_test_datatype = DataFrame({"UNICODE_STRING": [u("\xe9\xfc")]})

    unicode_string = "\xc3\xa9\xc3\xbc"
    if compat.PY3:
        unicode_string = unicode_string.encode("latin-1").decode("utf8")

    query = 'SELECT "{0}" as UNICODE_STRING'.format(unicode_string)
    df = gbq.read_gbq(query, project_id=PROJECT_ID)
    tm.assert_frame_equal(df, correct_test_datatype)
def pandas_get_table(dTable):
    "fetch a table and return dataframe"
    from pandas.io import gbq

    sProjectID = dTable['projectId']
    sDatasetID = dTable['datasetId']
    sTableID = dTable['tableId']
    sQuery = "SELECT * FROM [{}.{}]".format(sDatasetID, sTableID)
    df = gbq.read_gbq(sQuery, sProjectID)
    return df
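A minimal call sketch for pandas_get_table; the project, dataset, and table IDs below are hypothetical placeholders, not values from the original code.

# Hypothetical table reference passed to pandas_get_table (placeholder IDs).
dTable = {
    'projectId': 'my-project',
    'datasetId': 'my_dataset',
    'tableId': 'my_table',
}
df = pandas_get_table(dTable)
print(df.head())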
def test_query_with_parameters(self):
    sql_statement = "SELECT @param1 + @param2 as VALID_RESULT"
    config = {
        'query': {
            "useLegacySql": False,
            "parameterMode": "named",
            "queryParameters": [
                {
                    "name": "param1",
                    "parameterType": {"type": "INTEGER"},
                    "parameterValue": {"value": 1}
                },
                {
                    "name": "param2",
                    "parameterType": {"type": "INTEGER"},
                    "parameterValue": {"value": 2}
                }
            ]
        }
    }

    # Test that a query that relies on parameters fails
    # when parameters are not supplied via configuration
    with tm.assertRaises(ValueError):
        gbq.read_gbq(sql_statement, project_id=_get_project_id(),
                     private_key=_get_private_key_path())

    # Test that the query is successful because we have supplied
    # the correct query parameters via the 'config' option
    df = gbq.read_gbq(sql_statement, project_id=_get_project_id(),
                      private_key=_get_private_key_path(),
                      configuration=config)
    tm.assert_frame_equal(df, DataFrame({'VALID_RESULT': [3]}))
def test_configuration_without_query(self):
    sql_statement = 'SELECT 1'
    config = {
        'copy': {
            "sourceTable": {
                "projectId": _get_project_id(),
                "datasetId": "publicdata:samples",
                "tableId": "wikipedia"
            },
            "destinationTable": {
                "projectId": _get_project_id(),
                "datasetId": "publicdata:samples",
                "tableId": "wikipedia_copied"
            },
        }
    }

    # Test that only 'query' configurations are supported,
    # not 'copy', 'load' or 'extract'
    with tm.assertRaises(ValueError):
        gbq.read_gbq(sql_statement, project_id=_get_project_id(),
                     private_key=_get_private_key_path(),
                     configuration=config)