def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # Default dialect is changing to standard SQL. See:
    # https://github.com/pydata/pandas-gbq/issues/195
    mock_read_gbq = mock.Mock()
    mock_read_gbq.return_value = DataFrame([[1.0]])
    monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1")


def test_read_gbq_without_new_kwargs(monkeypatch):
    captured_kwargs = {}

    def mock_read_gbq(sql, **kwargs):
        captured_kwargs.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
    pd.read_gbq("SELECT 1")

    assert "use_bqstorage_api" not in captured_kwargs


def test_read_gbq_without_deprecated_kwargs(monkeypatch):
    captured_kwargs = {}

    def mock_read_gbq(sql, **kwargs):
        captured_kwargs.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
    pd.read_gbq("SELECT 1")

    assert "verbose" not in captured_kwargs
    assert "private_key" not in captured_kwargs


def test_read_gbq_with_deprecated_kwargs(monkeypatch):
    captured_kwargs = {}

    def mock_read_gbq(sql, **kwargs):
        captured_kwargs.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
    private_key = object()
    pd.read_gbq("SELECT 1", verbose=True, private_key=private_key)

    assert captured_kwargs["verbose"]
    assert captured_kwargs["private_key"] is private_key

def get_hist_series(table, varList, overflow=True, selection=None):
    if selection is None:
        selection = "INTEGER(selStatus)&2097151=2097151"
    variables = ",\n".join(
        [build_case_string(*var) for var in varList] + ["COUNT(1) as count"])
    groupers = ','.join([var[1] for var in varList])
    h = """SELECT {1}
           FROM {0}
           WHERE {3}
           GROUP BY {2}
           ORDER BY {2}""".format(table, variables, groupers, selection)

    # Making the request
    frame = pd.read_gbq(h, 'ams-test-kostya')
    if not overflow:
        for __, v, __ in varList:
            frame = frame[frame[v] != -1.0]
    for __, v, bins in varList:
        frame[v] = frame[v].map(lambda x: -1 if x == -1 else bins[x])

    # Populating
    edges = [np.append(v[2], -1) if overflow else v[2] for v in varList]
    newIndex = pd.MultiIndex.from_product(edges, names=[v[1] for v in varList])
    series = frame.set_index(list(frame.columns[:-1]))['count']
    series = series.reindex(newIndex).dropna()
    return series

def hist2DCustomCommand(nBinsX, firstBinX, lastBinX,
                        nBinsY, firstBinY, lastBinY, theCommand):
    binWidth = float(lastBinX - firstBinX) / nBinsX
    try:
        return pd.read_gbq(theCommand, project_id=theProjectID)
    except ValueError:
        print 'no json data'

def query_to_dataframe(query):
    import pandas as pd
    import pkgutil
    privatekey = pkgutil.get_data('trainer', 'privatekey.json')
    print(privatekey[:200])
    return pd.read_gbq(query, project_id='cloud-training-demos',
                       dialect='standard', private_key=privatekey)

def query_to_dataframe(query):
    import pandas as pd
    import pkgutil, json
    privatekey = pkgutil.get_data(KEYDIR, 'privatekey.json')
    print(privatekey[:200])
    return pd.read_gbq(query, project_id=PROJECT,
                       dialect='standard', private_key=privatekey)

def histCustomCommand(theCommand, requery=False):
    dirName = os.environ.get('HOME') + '/.bigQueryCached/'
    if requery is False:
        cachedResult = getHistDataFrame(dirName, theCommand)
        if cachedResult is not None:
            print 'CACHED'
            return cachedResult
    df = pd.read_gbq(theCommand, project_id=theProjectID)
    saveHistDataFrame(df, dirName, theCommand)
    return df

def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"
    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)
    df.to_gbq(destination_table, _get_project_id(), chunksize=None,
              private_key=_get_private_key_path())

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         private_key=_get_private_key_path())
    assert result['num_rows'][0] == test_size

def _query_timeseries_model(self):
    query = """
        SELECT *
        FROM ML.FORECAST(
            MODEL `japan-grid-carbon-api{bqStageName}.{utility}.model_intensity_timeseries`,
            STRUCT({horizon_size} AS horizon)
        )
        """.format(bqStageName=self.bqStageName,
                   utility=self.utility,
                   horizon_size=HORIZON)

    return pd.read_gbq(query)

def query_policy_for_current_date():
    today = date.today().strftime("%Y-%m-%d")
    query = """
        SELECT
            country_name, region_name, date,
            school_closing_notes,
            workplace_closing, workplace_closing_notes,
            cancel_public_events, cancel_public_events_notes,
            restrictions_on_gatherings, restrictions_on_gatherings_notes,
            close_public_transit, close_public_transit_notes,
            stay_at_home_requirements, stay_at_home_requirements_notes,
            contact_tracing, contact_tracing_notes,
            confirmed_cases, deaths,
            international_travel_controls, international_travel_controls_notes
        FROM `bigquery-public-data.covid19_govt_response.oxford_policy_tracker`
        where (country_name like 'United Kingdom'
               or country_name like 'China'
               or country_name like 'Australia')
          and date = '%s'
        order by date asc
        """ % today
    df = pd.read_gbq(query, dialect='standard', project_id=project_id,
                     credentials=credentials)
    df = define_data_type(df)
    return df

def test_pandas_gbq_query():
    # [START bigquery_migration_pandas_gbq_query]
    import pandas

    sql = """
        SELECT name
        FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = 'TX'
        LIMIT 100
    """

    # Run a Standard SQL query using the environment's default project
    df = pandas.read_gbq(sql, dialect='standard')

    # Run a Standard SQL query with the project set explicitly
    project_id = 'your-project-id'
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0
    project_id = os.environ['GOOGLE_CLOUD_PROJECT']
    # [START bigquery_migration_pandas_gbq_query]
    df = pandas.read_gbq(sql, project_id=project_id, dialect='standard')
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0

def _extract_daily_carbon_intensity_by_month_and_weekday_from_big_query(self):
    query = """
        SELECT
            EXTRACT(MONTH FROM datetime) AS month,
            EXTRACT(DAYOFWEEK FROM datetime) AS dayofweek,
            EXTRACT(HOUR FROM datetime) AS hour,
        """ + self._get_intensity_query_string() + """
        GROUP BY month, dayofweek, hour
        order by month, dayofweek, hour asc
        """

    return pd.read_gbq(query)

def get_all_status_board():
    """
    Extract the entire prod_status_board table.
    :return: DataFrame
    """
    query = f'''
    select * from `geultto.status_board.{cardinal}_prod`
    '''
    status_board_df = pd.read_gbq(query, project_id=_project_id,
                                  dialect='standard', private_key=_jwt)
    return status_board_df

def states_county(step):
    if (step == 0):
        query = firstQuery + str(tuple(states_list)) + 'AND county IN ' + str(tuple(county_list))
        print "-----------------------------------"
        print "Filtered by STATE AND COUNTY."
        print "-----------------------------------"
    else:
        query = firstQuery + str(tuple(states_list))
        print "-----------------------------------"
        print "Filtered by STATE."
        print "-----------------------------------"
    df = pd.read_gbq(query, "starry-braid-156516")
    return df, query

def api_pricing():
    sql_prices = '''select location_id, member_type, plan, amount
                    from `bikeshare-303620.TripsDataset.Pricing` '''
    pricing_df = pd.read_gbq(sql_prices, project_id=gcp_project,
                             credentials=credentials, dialect='standard')
    # print(pricing_df)
    json_obj = pricing_df.to_json(orient='records')
    json_loads = json.loads(json_obj)
    json_formatted_str = json.dumps(json_loads, indent=2)
    return json_formatted_str

def test_pandas_gbq_legacy_query():
    # [START bigquery_migration_pandas_gbq_query_legacy]
    import pandas

    sql = """
        SELECT name
        FROM [bigquery-public-data:usa_names.usa_1910_current]
        WHERE state = 'TX'
        LIMIT 100
    """

    df = pandas.read_gbq(sql, dialect='legacy')
    # [END bigquery_migration_pandas_gbq_query_legacy]
    assert len(df) > 0

def backup_database(event=None, context=None):
    # models currently running needing to be backed up
    # get live portfolio
    portfolio_query = """
        SELECT * FROM `oval-bot-232220.algorithmic_trader.portfolio`
        """
    pf_history = pd.read_gbq(portfolio_query).sort_values(
        by='Date').reset_index(drop=True)

    # get backup portfolio
    backup_query = """
        SELECT * FROM `oval-bot-232220.algorithmic_trader.backup`
        """
    backup_data = pd.read_gbq(backup_query).sort_values(by='Date').reset_index(
        drop=True)

    # add new portfolio to existing portfolio data and push to gbq
    pf_data = backup_data.append(pf_history.iloc[-1], ignore_index=True)
    pf_data.to_gbq(destination_table='algorithmic_trader.backup',
                   project_id="oval-bot-232220", if_exists='replace')

def api_stations():
    sql_stations = f'select * from `bikeshare-303620.TripsDataset.Stations` '
    print(sql_stations)
    stations_df = pd.read_gbq(sql_stations, project_id=gcp_project,
                              credentials=credentials, dialect='standard')
    stations_data = stations_df.to_json(orient='records')
    json_loads = json.loads(stations_data)
    json_formatted_str = json.dumps(json_loads, indent=2)
    return json_formatted_str

def get_all_slack_log():
    """
    Extract the entire prod_slack_log table, which accumulates entries from the second post onward.
    :return: DataFrame
    """
    query = f'''
    select * from `geultto.slack_log.{cardinal}_prod`
    '''
    slack_log_df = pd.read_gbq(query, project_id=_project_id,
                               dialect='standard', private_key=_jwt)
    return slack_log_df

def _load(self) -> pd.DataFrame:
    load_args = copy.deepcopy(self._load_args)

    if self._filepath:
        load_path = get_filepath_str(PurePosixPath(self._filepath), self._protocol)
        with self._fs.open(load_path, mode="r") as fs_file:
            load_args["query"] = fs_file.read()

    return pd.read_gbq(
        project_id=self._project_id,
        credentials=self._credentials,
        **load_args,
    )

def run():
    try:
        """Removing XML file from system if already exists"""
        for the_file in os.listdir(xml_folder+foldername):
            file_path = os.path.join(xml_folder+foldername, the_file)
            if os.path.isfile(file_path):
                os.unlink(file_path)
                logging.info('Old file removed from system ....')

        """Reading all detail records from SQL in pandas Dataframe -- in this case we are using Bigquery as Input SQL source"""
        df_detail_stg = pandas.read_gbq(readfileasstring(sql_query_folder+detail_sql_file),
                                        project_id=<PROJECT_ID>, dialect='standard')

        """Reading audit data from audit SQL in pandas Dataframe"""
        df_audit_stg = pandas.read_gbq(readfileasstring(sql_query_folder+audit_sql_file),
                                       project_id=<PROJECT_ID>, dialect='standard')

        i = 0

        """Creating root elements and record tags for XML"""
        root = ET.Element("aaa_ncnu_billing")

        """Adding audit data"""
        audit = ET.SubElement(root, "audit_fields")
        ET.SubElement(audit, 'run_id').text = str(jobrunid)
        ET.SubElement(audit, 'processing_date').text = str(time.strftime("%Y-%m-%d"))
        ET.SubElement(audit, 'detail_record_count').text = str(len(df_detail.index))
        for (col, c) in zip(df_audit.columns, list(df_audit)):
            ET.SubElement(audit, c).text = str(df_audit[col][i])

        """Adding detail records data"""
        doc = ET.SubElement(root, "billing_records")

        """Iterating through all rows in Dataframe"""
        while (i < len(df_detail.index)):
            rows = ET.SubElement(doc, "MEMBERSHIP")
            flg = 1
            """Creating all the subelements as rows and columns from Dataframe"""
            for (col, c) in zip(df_detail.columns, list(df_detail)):
                ET.SubElement(rows, c).text = str(df_detail[col][i])
            i = i + 1

        tree = ET.ElementTree(root)
        tree.write(xml_folder+foldername+"/"+file_name)

def fundamental_analysis_collection(project_name):
    #ROCE_table = pd.read_gbq('SELECT * FROM pecten_dataset.ROCE_t WHERE Status = "active";', project_id=project_name)
    sales_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.sales_t WHERE Status = "active";', project_id=project_name)
    profit_margin_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.profit_margin_t WHERE Status = "active";', project_id=project_name)
    PER_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.PER_t WHERE Status = "active";', project_id=project_name)
    EPS_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.EPS_t WHERE Status = "active";', project_id=project_name)
    EBITDA_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.EBITDA_t WHERE Status = "active";', project_id=project_name)
    dividend_table = pd.read_gbq('SELECT * FROM pecten_dataset_test.dividend_analysis_t WHERE Status = "active";', project_id=project_name)
    return sales_table, profit_margin_table, PER_table, EPS_table, EBITDA_table, dividend_table

def index_table(es, index_name, client, table, participant_id_column,
                sample_id_column, sample_file_columns, billing_project_id):
    """Indexes a BigQuery table.

    Args:
        es: Elasticsearch object.
        index_name: Name of Elasticsearch index.
        table: Fully-qualified table name of the format:
            "<project id>.<dataset id>.<table name>"
        participant_id_column: Name of the column containing the participant ID.
        sample_id_column: (optional) Name of the column containing the sample ID
            (only needed on samples tables).
        sample_file_columns: (optional) Mappings for columns which contain
            genomic files of a particular type (specified in ui.json).
        billing_project_id: GCP project ID to bill for reading the table.
    """
    _create_nested_mappings(es, index_name, table, sample_id_column)

    table_name = _table_name_from_table(table)
    start_time = time.time()
    logger.info('Indexing %s into %s.' % (table_name, index_name))

    # There is no easy way to import BigQuery -> Elasticsearch. Instead:
    # BigQuery table -> pandas dataframe -> dict -> Elasticsearch
    df = pd.read_gbq('SELECT * FROM `%s`' % table_name,
                     project_id=billing_project_id, dialect='standard')
    elapsed_time = time.time() - start_time
    elapsed_time_str = time.strftime('%Hh:%Mm:%Ss', time.gmtime(elapsed_time))
    logger.info('BigQuery -> pandas took %s' % elapsed_time_str)
    logger.info('%s has %d rows' % (table_name, len(df)))

    if participant_id_column not in df.columns:
        raise ValueError(
            'Participant ID column %s not found in BigQuery table %s' %
            (participant_id_column, table_name))

    if sample_id_column in df.columns:
        scripts_by_id = _sample_scripts_by_id(df, table_name,
                                              participant_id_column,
                                              sample_id_column,
                                              sample_file_columns)
        indexer_util.bulk_index_scripts(es, index_name, scripts_by_id)
    else:
        docs_by_id = _docs_by_id(df, table_name, participant_id_column)
        indexer_util.bulk_index_docs(es, index_name, docs_by_id)

    elapsed_time = time.time() - start_time
    elapsed_time_str = time.strftime("%Hh:%Mm:%Ss", time.gmtime(elapsed_time))
    logger.info('pandas -> ElasticSearch index took %s' % elapsed_time_str)

def read_tces_from_bq(input_tce_table, project_id):
    """Read a table of Kepler KOIs from BigQuery.

    Args:
        input_tce_table: BigQuery table name containing TCE data.
        project_id: GCP project ID containing input TCE table.

    Returns:
        pd.DataFrame of TCEs.
    """
    query = 'SELECT * from `{}`'.format(input_tce_table)
    tce_table = pd.read_gbq(query, project_id=project_id, dialect='standard')
    return tce_table

def query_bigquery_df(sql_query):
    """Query Google BigQuery with the provided SQL query and return a pandas
    DataFrame.

    Logs errors but does not throw any exceptions while accessing BigQuery.

    Args:
        sql_query (str): The SQL query string.

    Returns:
        pandas.DataFrame: The resulting DataFrame, or None if the query failed.
    """
    assert _BIGQUERY_CLIENT, 'Global bigquery client is not configured yet.'
    df = None
    try:
        df = pd.read_gbq(sql_query, credentials=GLOBAL_CREDS)
    except Exception as e:
        logger.log_error('Exception during query: {}'.format(e))
    return df

def get_profitability_color():
    QUERY = """SELECT Constituent, Constituent_id, Constituent_name, Total_profitability_score,
                      (CASE WHEN Total_profitability_score > 32 THEN '1'
                            WHEN Total_profitability_score < 20 THEN '-1'
                            ELSE '0' END) AS Profitability_color
               FROM pecten_dataset_test.Profitability_score_ranking
               WHERE Status="active"
               GROUP BY Constituent_name, Constituent, Constituent_id, Total_profitability_score, Profitability_color
               ORDER BY Constituent"""
    profitability_colors = pd.read_gbq(QUERY, project_id='igenie-project', private_key=None)
    return profitability_colors


def get_risk_color():
    QUERY = """SELECT Constituent, Constituent_id, Constituent_name, Risk_score,
                      (CASE WHEN Risk_score > 4 THEN '-1'
                            WHEN Risk_score < 2 THEN '1'
                            ELSE '0' END) AS Risk_color
               FROM pecten_dataset_test.Risk_ranking
               WHERE Status="active"
               GROUP BY Constituent_name, Constituent, Constituent_id, Risk_score, Risk_color
               ORDER BY Constituent"""
    risk_colors = pd.read_gbq(QUERY, project_id='igenie-project', private_key=None)
    return risk_colors

def get_stats(client, project, table_ref, empty_string='""', max_size=50):
    schema = client.get_table(f"{table_ref.dataset_id}.{table_ref.table_id}").schema
    num_columns = len(schema)
    num_repeats = -(-num_columns // max_size)  # ceiling division
    sqls = (
        " UNION ALL ".join(
            aggregate(f, table_ref, j * max_size + i, empty_string)
            for i, f in enumerate(schema[j * max_size : min(num_columns, (j + 1) * max_size)])
        )
        + " ORDER BY ord;"
        for j in range(num_repeats)
    )
    dfs = (pd.read_gbq(sql, project_id=project, dialect="standard") for sql in sqls)
    return pd.concat(dfs)

def get_london_cycle_hires(
    credentials: service_account.Credentials,
) -> pd.DataFrame:
    query = sql

    # Create dataframe of London cycle hires
    df = pd.read_gbq(
        query,
        project_id="london-cycles-306117",
        dialect="standard",
        credentials=credentials,
        location="EU",
    )
    return df


def get_covid_deaths(
    credentials: service_account.Credentials,
) -> pd.DataFrame:
    query = sql

    # Create dataframe of COVID deaths
    df = pd.read_gbq(
        query,
        project_id="covid-deaths",
        dialect="standard",
        credentials=credentials,
        location="EU",
    )
    return df

def process_standard(indicator, identifier):
    sql = """
    SELECT
        {} as {},
        {}_raw as raw,
        {}_score as score,
        {}_cat as cat,
        {}_label as label
    FROM `{}.{}.{}`
    """.format(identifier, IDENTIFIERS[identifier], indicator, indicator,
               indicator, indicator, BQ_PROJECT_ID, BQ_DATASET_NAME,
               BQ_IN[indicator])
    df = pd.read_gbq(query=sql, dialect="standard")
    return df

def main():
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hi:o:k:v',
            ['help', 'input=', 'output=', 'key=', 'verbose'])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)

    verbose = False
    inputfile = 'input.sql'
    outputfile = 'output.csv'
    key = 'key.json'
    project_id = ''

    for o, a in opts:
        if o in ('-h', '--help'):
            print(
                "usage: gbq-csv-cli.py -i <inputfile> -o <outputfile>. By default takes 'input.sql' as input and 'output.csv' as output, 'key.json' as credentials key"
            )
            sys.exit()
        elif o in ('-v', '--verbose'):
            verbose = True
        elif o in ('-i', '--input'):
            inputfile = a
        elif o in ('-o', '--output'):
            outputfile = a
        elif o in ('-k', '--key'):
            key = a

    print("input is {}, output is {}, key is {}\n".format(
        inputfile, outputfile, key)) if verbose else None

    try:
        query = open(inputfile, 'r').read()
        print("query: {}".format(query)) if verbose else None
        project_id = json.load(open(key))['project_id']
        print("project_id is {}".format(project_id)) if verbose else None
    except Exception as e:
        print("exception: {}".format(str(e)))
        sys.exit(2)

    try:
        df = pd.read_gbq(query, project_id=project_id, dialect="standard",
                         private_key=key)
        print(df.head()) if verbose else None
        df.to_csv(outputfile, index=False)
        print("Done! {} rows saved to {}".format(len(df), outputfile))
    except Exception as e:
        print("Error running query: {}".format(str(e)))
        sys.exit(2)

def get_recs_w_date_range(date_s, date_e):
    """Iterate through various date ranges to create a timeframe sample for
    later aggregation"""
    global dfx2  # Otherwise dfx2 would be treated as local and not refer to the global dataframe created above
    query = """SELECT Timestamp, dst_addr,
                      cast(dst_port as integer) as dst_port,
                      cast(duration_ms as integer) as duration_ms,
                      cast(bytes as integer) as bytes,
                      protocol, flow_direction
               FROM ipfix.ipfix
               WHERE Timestamp BETWEEN timestamp('{}') AND timestamp('{}')
               LIMIT 10
            """.format(date_s, date_e)
    #print('The value of local var date_s is: {}'.format(date_s))
    #print('The value of local var date_e is: {}'.format(date_e))
    dfx1 = pd.read_gbq(query, project_id="network-sec-analytics", reauth=True)
    dfx2 = dfx2.append(dfx1)  # Append onto the dfx2 dataframe
    return

def get_bigquery(args):
    from utils.Storage import Storage
    storage_client = Storage(args.google_key_path)

    q1 = """SELECT date AS ds, closing_price AS y
            FROM `pecten_dataset.stockprice_historical_collection`
            where constituent_name = 'DAIMLER AG'
            order by date"""

    query1 = pd.read_gbq(q1, project_id='igenie-project', dialect='standard')
    # print(query1)
    args.query1 = query1

def get_attack_on_civilians():
    # SQL query for attack on civilians (event code) by country code in 2018
    query = ('''
        SELECT country_code, COUNT(*) AS attacks
        FROM (SELECT Actor2CountryCode country_code
              FROM [gdelt-bq:gdeltv2.events]
              WHERE YEAR=2018 AND EventCode="170")
        GROUP BY country_code''')

    # Run query and convert to pandas data frame
    df = pd.read_gbq(query, "gdelt-201419")

    # Assign country name
    df_label = get_country_by_ISO3_code()
    df = pd.merge(df, df_label, on='country_code')
    df = df[['country_code', 'country', 'attacks']]

    return df

def api_weather():
    # locationID = 1
    # startDate = '01/01/2019'
    # endDate = '12/31/2019'
    sql_weather = f'select forecast_date, maxTempC, humidity, total_precip, avg_cloudcover, avg_windspeed, location_id ' \
                  f'from `bikeshare-303620.TripsDataset.HistoricalWeather` order by location_id, forecast_date'
    weather_df = pd.read_gbq(sql_weather, project_id=gcp_project,
                             credentials=credentials, dialect='standard')
    weather = weather_df.to_json(orient='records')
    json_loads = json.loads(weather)
    json_formatted_str = json.dumps(json_loads, indent=2)
    return json_formatted_str

def get_geohash_of_point(centroid, project_id=None):
    if not project_id:
        project_id = DEFAULT_PROJECT
    sql = '''with test as (
        SELECT
            st_contains(st_geogfromtext(wkt_geom),
                        st_geogpoint({lon}, {lat})) as grid,
            geohash
        FROM `stormwaterheatmap-hydrology.geometry.geohash`
    )
    select geohash from test
    where grid
    '''.format(lon=centroid.x, lat=centroid.y)
    return pandas.read_gbq(sql, project_id=project_id).geohash[0]

def load_product():
    print('Loading product')
    query = '''
        SELECT
            product_id,
            product_type_id,
            brand_id,
            manufacturer_id,
            product_group_id,
            team_id,
            subproduct_type_id
        FROM `coolblue-bi-platform-prod.master.product`
        '''
    return pd.read_gbq(query, 'coolblue-bi-platform-prod', dialect='standard')

def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"
    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)
    df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
              private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         private_key=_get_private_key_path())
    self.assertEqual(result['num_rows'][0], test_size)

def get_data(self, type, station_ids, n_years):
    query = """
        SELECT station_number, year, month, day, {type} as value, rain, snow
        FROM `publicdata.samples.gsod`
        WHERE station_number IN ({stns})
          AND year < 2010
          AND year >= {minyr}
    """.format(
        type=type,
        stns=','.join(station_ids),
        minyr=2010 - n_years
    )
    df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

    stations_df = pd.DataFrame({
        'location': stations.keys(),
        'station_number': [int(v) for v in stations.values()]
    })
    df = pd.merge(df, stations_df, on='station_number')
    return df

def main(group1, group2, d, sample_size, iterations, bin_size, test_type,
         experiment_id):
    project_id = "crafty-campaign-106420"
    test_data = pd.read_gbq(
        "select * from AvengersCrossEvent.v_ABTest_%s where experiment_id = %s"
        % (string.upper(test_type), experiment_id),
        project_id,
    )
    p_values, day_d, day_d2 = resampling(
        test_data, group1, group2, iterations, sample_size, d, test_type
    )  # 75.2 #161.396529913 #321.671721935
    rejected_perc = reject_rate(p_values)
    print "--------------------------------------------------------------"
    print "Summary"
    print "Out of %i simulated tests, reject null hypothesis of equal means %d%% of the time" % (
        iterations,
        rejected_perc,
    )
    plot_dist(day_d, day_d2, group1, group2, bin_size, test_type)
    print np.mean(day_d), np.mean(day_d2)

def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
    """Read data from BigQuery and split into train and validation sets.

    Args:
        full_table_path: (string) full path of the table containing training
            data in the format of [project_id.dataset_name.table_name].
        project_id: (string, Optional) Google BigQuery Account project ID.
        num_samples: (int, Optional) Number of data samples to read.

    Returns:
        pandas.DataFrame
    """
    query = metadata.BASE_QUERY.format(table=full_table_path)
    limit = ' LIMIT {}'.format(num_samples) if num_samples else ''
    query += limit

    # Use "application default credentials"
    # Use SQL syntax dialect
    data_df = pd.read_gbq(query, project_id=project_id, dialect='standard')

    return data_df

def check_ga_session_date_exist(destination_table, date, credential_path):
    # check if the destination table has data for the given date
    if not check_table_exist(destination_table):
        # the date cannot be present if the table does not exist
        return False
    query = 'SELECT count(*) FROM [%s] WHERE DATE(ga_session_date) == "%s"' % (
        destination_table, date.strftime("%Y-%m-%d"))
    return (pd.read_gbq(query, project_id=config.PROJECT_ID, verbose=False,
                        private_key=credential_path).iloc[0, 0] > 0)

return {"status": "failure", "data_size": total_row} if __name__ == "__main__": # Parse the argument to get the credential_path parser = argparse.ArgumentParser(description='Input secre_json_path and corresponding dataset') parser.add_argument('--credential_path', type=str, dest='credential_path', required=True, help='input the path of service account credential from gcp, use $gcp_service_account in jenkings') args = vars(parser.parse_args()) credential_path = args["credential_path"] # Use google_auth library to get access to google Auth = google_auth(credential_path) bigquery = Auth.get_auth('bigquery_v2') analytics = Auth.get_auth('analytics_v4') # Check if the GA_BQ_UPLOAD_STATUS_LOG table exist in gbq if check_table_exist(config.GA_BQ_UPLOAD_STATUS_LOG): ga_bq_upload_status_log = pd.read_gbq(query="SELECT * FROM [%s]" % config.GA_BQ_UPLOAD_STATUS_LOG, project_id=config.PROJECT_ID, private_key=credential_path) else: ga_bq_upload_status_log = pd.DataFrame(columns=['config_name', 'ga_session_date', 'status', 'backup_date', "uploaded_data_size"]) # Set the time region d = config.DATE_INIT.split("-") date_init = datetime.date(int(d[0]),int(d[1]),int(d[2])) date_now = datetime.datetime.now().date() for config_name in config.ga_bq_config: for date in daterange(date_init, date_now): destination_table = config.ga_bq_config[config_name]["destination_table"] print "start checking (%s, %s) pair for GA to BQ" % (config_name, date) condition = (ga_bq_upload_status_log["config_name"]==config_name) & (ga_bq_upload_status_log["ga_session_date"]==date.strftime("%Y-%m-%d")) if ga_bq_upload_status_log[condition].empty: # no such condition, totally new table-date pair print 'find no pair within the record, try to upload data with (%s, %s)' % (config_name, date) if check_ga_session_date_exist(destination_table, date, credential_path):
user = "******"+str(i) print "Analysis for <arlearn"+str(i)+"@gmail.com>" # -------------------- #/ RATINGS / #-------------------- # Collect all the ratings RTquery = "SELECT * FROM "+table_name+" WHERE " \ "origin = 'rating' AND actorID='mailto:arlearn"+str(i)+"@gmail.com' " \ " AND timestamp > PARSE_UTC_USEC('"+startDay+" 07:00:00') AND timestamp < " \ " PARSE_UTC_USEC('"+endDay+" 20:00:00') ORDER by timestamp" # Populating the dataframe RTframe = pd.read_gbq(RTquery, project_id) # Filtering the results RTdf = RTframe[['timestamp','objectId','resultResponse','lat','lng']] #Rename the columns RTdf.rename(columns={'objectId':'Indicators'}, inplace=True) RTdf.rename(columns={'resultResponse':'value'}, inplace=True) #Reshape the DataFrame RTrsh = RTdf.set_index(['timestamp','Indicators'])['value'] # In case of duplicates RTrsh = RTrsh.drop(RTrsh.index.get_duplicates()) RTrsh = RTrsh.unstack() # Fix: the index will shift -1 hr. E.g. 9:00 -> 8:00 # Indicating the rating done at 9:00 for the 8:xx activities RTrsh.index = RTrsh.index-pd.offsets.Hour(1)
def query_to_dataframe(query):
    import pandas as pd
    return pd.read_gbq(query, project_id='cloud-training-demos', dialect='standard')

def get_df(self, query):
    return pd.read_gbq(query=query, project_id=self.project_id,
                       private_key=self.credential_path, verbose=False)

def query_to_dataframe(query):
    import pandas as pd
    return pd.read_gbq(query, project_id='cloud-training-demos',
                       dialect='standard',
                       private_key='trainer/privatekey.json')

#job_id, _results = client.query('select * from tqin.v_ABRetention')
#complete = False
#while True:
#    if complete == False:
#        complete, row_count = client.check_job(job_id)
#    else:
#        break
#results = client.get_query_rows(job_id)
#retention_data = json_normalize(results)
retention_data = pd.read_gbq('select * from tqin.v_ABRetention', project_id)
#retention_data = pd.read_csv('/Users/thomasqin/Documents/python/abretention.csv')

test_groups = ['control','v1','v2','v3']

#now = datetime.now()
#current_date = now.strftime("%m-%d-%y")
#date_converter = lambda x: datetime.fromtimestamp(x)
#retention_data['install_date'] = retention_data['install_date'].apply(date_converter)
#player_id_control = retention_data_control['player_id']
#player_id_test = retention_data_test['player_id']