def insert_row(df, table, replace_val):
    if len(df.index) == 0:
        print("gbq insert records zero")
        return
    project_id = _common_conf['bigquery']['project_id']
    private_key_path = get_abspath(_common_conf['bigquery']['key_path'])
    dataset = _common_conf['bigquery']['dataset']
    # Insert 10,000 rows at a time
    full_table = "{}.{}".format(dataset, table)
    client = get_client(json_key_file=private_key_path, readonly=False,
                        swallow_results=False)
    if client.check_table(dataset, table):
        bq_limit = 10000
        q_num = len(df.index) // bq_limit
        for i in range(0, q_num + 1):
            client = get_client(json_key_file=private_key_path, readonly=False,
                                swallow_results=False)
            ins_df = df[i * bq_limit:(i + 1) * bq_limit].replace(
                np.nan, replace_val)
            row_dict = ins_df.to_dict(orient='index')
            row_data = [x for x in row_dict.values()]
            ret = client.push_rows(dataset, table, row_data)
            if 'insertErrors' in ret:
                msg = "BigQuery Insert Error:\nsample:\n{}\nerror:\n{}"
                raise Exception(msg.format(row_data[0:5], ret))
    else:
        print('{} CREATE TABLE'.format(full_table))
        gbq.to_gbq(df, full_table, project_id)
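
# A minimal usage sketch for insert_row above (not part of the original code):
# the DataFrame contents, table name, and replace_val are hypothetical, and
# the module-level imports and _common_conf that the function relies on are
# assumed to be set up as the function expects.
import pandas as pd

sample_df = pd.DataFrame([
    {"Media": "web", "Date": "2019-01-01", "Clicks": 120},
    {"Media": "app", "Date": "2019-01-01", "Clicks": None},
])

# NaN values are swapped for replace_val before the chunked push_rows calls.
insert_row(sample_df, table="daily_clicks", replace_val=0)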
def get_venue_names_from_venue_ids(venue_id_list):
    # PROJECT_ID = 'tour-miner-project'
    # SERVICE_ACCOUNT = '*****@*****.**'
    # JSON_KEY_PATH = 'tour-miner-project-8873e737b27c.json'
    # SERVICE_ACCOUNT = '*****@*****.**'
    JSON_KEY_PATH = 'tour-miner-project-2019-511090dd4bd9.json'

    # Get the BigQuery client
    client = get_client(json_key_file=JSON_KEY_PATH, readonly=True)

    where_phrase = ''
    for i in range(len(venue_id_list)):
        where_phrase += "VENUE_ID = \'" + venue_id_list[i] + "\'"
        if i < len(venue_id_list) - 1:
            # Append OR unless this is the last element
            where_phrase += ' or '

    # query = '#standardSQL\nSELECT VENUE_ID, VENUE_NAME FROM `tour-miner-project.dataset_TIST2015.JP_VENUE_DICTIONARY` WHERE ' + where_phrase
    query = '#standardSQL\nSELECT VENUE_ID, VENUE_NAME FROM `tour-miner-project-2019.dataset_TIST2015.JP_VENUE_DICTIONARY` WHERE ' + where_phrase
    # print(query)
    try:
        job_id, results = client.query(query, timeout=60)
    except BigQueryTimeoutException as e:
        print(e)
    # print(results)

    location_name_dictionary = {}
    for q in results:
        location_name_dictionary[q['VENUE_ID']] = q['VENUE_NAME']
    return location_name_dictionary
def fn3(Data):
    client = bigquery.get_client(json_key_file='bigquery.json', readonly=False)
    DATABASE = "bqdb"
    TABLE = "Songs"
    data = Data

    if not client.check_table(DATABASE, TABLE):
        print("Create table {0}.{1}".format(DATABASE, TABLE), file=sys.stderr)
        client.create_table(DATABASE, TABLE, [
            {'name': 'id', 'type': 'integer', 'description': 'mysql id'},
            {'name': 'songId', 'type': 'string', 'description': 'song id'},
            {'name': 'title', 'type': 'string', 'description': 'song title'},
            {'name': 'genre', 'type': 'string', 'description': 'genre'},
            {'name': 'albumId', 'type': 'string', 'description': 'album id'},
            {'name': 'album', 'type': 'record', 'description': 'album info', 'fields': [
                {'name': 'id', 'type': 'integer', 'description': 'mysql album id'},
                {'name': 'albumId', 'type': 'string', 'description': 'album id'},
                {'name': 'title', 'type': 'string', 'description': 'album string'},
                {'name': 'releaseDate', 'type': 'TIMESTAMP', 'description': 'album releaseDate'},
                {'name': 'score', 'type': 'numeric', 'description': 'album score'},
                {'name': 'publisher', 'type': 'string', 'description': 'album publisher'},
                {'name': 'label', 'type': 'string', 'description': 'album label'}]}
        ])

    pushResult = client.push_rows(DATABASE, TABLE, data, insert_id_key='songId')
    print("Pushed Result is", pushResult)
def get(self):
    user = users.get_current_user()
    if user:
        url = users.create_logout_url(self.request.uri)
        url_linktext = 'Logout'
        query = "SELECT FLIGHTNO, CP, FO, UA, HL, DL FROM demo.sdata WHERE CP IN (SELECT latamid FROM demo.mapping WHERE userid='%s')" % (user)
        client = get_client(json_key_file=json_key, readonly=True)
        try:
            job_id, results = client.query(query, timeout=10)
        except BigQueryTimeoutException:
            print "Timeout"
        template_values = {
            'user': user,
            'url': url,
            'url_linktext': url_linktext,
            'results': results,
        }
        template = JINJA_ENVIRONMENT.get_template('index.html')
    else:
        url = users.create_login_url(self.request.uri)
        url_linktext = 'Login'
        template_values = {
            'user': user,
            'url': url,
            'url_linktext': url_linktext
        }
        template = JINJA_ENVIRONMENT.get_template('nologin.html')
def get_post(year, month, post_id, post_type):
    from bigquery import get_client

    # JSON key provided by Google
    json_key = os.path.dirname(
        os.path.realpath(__file__)) + os.sep + 'key.json'
    client = get_client(json_key_file=json_key, readonly=True)

    if post_type == "post":
        post_or_comment = "posts"
    else:
        post_or_comment = "comments"
    table_name = "fh-bigquery:reddit_" + post_or_comment + "." + year + "_" + month

    # Submit an async query.
    query = "SELECT * FROM [" + table_name + "] WHERE id = '" + post_id + "';"
    job_id, _results = client.query(query)
    sleep(3)

    # Check if the query has finished running.
    complete, row_count = client.check_job(job_id)

    # Retrieve the results.
    results = client.get_query_rows(job_id)
    return str(results)
def get_bq_data():
    # Download the .json key from your Google Cloud account
    client = get_client(json_key_file='bq-query.json', readonly=True)
    query = """
        SELECT
            pitcherId,
            pitchTypeDescription,
            outcomeDescription,
            COUNT(pitchTypeDescription) throws
        FROM [bigquery-public-data:baseball.games_wide]
        WHERE atBatEventType = "PITCH"
        GROUP BY pitcherId, pitchTypeDescription, outcomeDescription
    """
    job_id, _results = client.query(query)

    complete = False
    while not complete:
        print('Checking for job {JOB}...'.format(JOB=job_id))
        complete, row_count = client.check_job(job_id)
        sleep(1)

    print('Job complete - downloading...')
    results = client.get_query_rows(job_id)
    print('Downloaded {N} results...'.format(N=len(results)))
    with open(FILE_NAME, 'wb') as f:
        pickle.dump(results, f)
    return results
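
# Hedged convenience sketch (not in the original): it wraps the
# check_job()/sleep() polling loop used above so the pattern can be reused;
# the helper name and poll interval are illustrative.
from time import sleep

def wait_for_query(client, job_id, poll_seconds=1):
    """Poll an async BigQuery-Python job until it completes, then return its rows."""
    complete, row_count = client.check_job(job_id)
    while not complete:
        sleep(poll_seconds)
        complete, row_count = client.check_job(job_id)
    return client.get_query_rows(job_id)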
def main():
    logger = Utils.enable_logging()

    # get client
    client = get_client(PROJECT_ID, service_account=SERVICE_ACCOUNT,
                        private_key=KEY, readonly=False)
    client.swallow_results = False
    logger.info("client: %s" % client)

    schema_str = Utils.read_file(GNIP_SCHEMA_FILE)
    schema = json.loads(schema_str)

    # initialize table mapping for default table
    table_mapping = {
        DATASET_ID + "." + TABLE_ID: [DATASET_ID, TABLE_ID]
    }

    l = BigQueryGnipListener(client, schema, table_mapping, logger=logger)

    while True:
        stream = None
        try:
            get_stream(l)
        except:
            logger.exception("Unexpected error:")

        if stream:
            stream.disconnect()

        time.sleep(SLEEP_TIME)
def post(self):
    data = json.loads(self.request.body)
    accountid = data['accountId']
    dic = data['data']
    jsondata = json.dumps(dic).encode('utf8')

    # write data to string buffer
    global output
    output.write(jsondata + "\n")
    print "Wrote data to Log file.\n"

    if accountid is not None:
        print "---data for---" + accountid + "---\n"
        print jsondata

        f = file(private_key_file, 'rb')
        key = f.read()
        f.close()
        credentials = SignedJwtAssertionCredentials(
            SERVICE_ACCOUNT_EMAIL,
            key,
            scope='https://www.googleapis.com/auth/bigquery')

        # BigQuery project id as listed in the Google Developers Console.
        table = 'prefix_' + accountid
        client = get_client(project_id, credentials=credentials)

        # Check if a table exists.
        exists = client.check_table(bq_dataset, table)
        if exists is None or exists is False:
            rec_file = file(bq_table_schema_file, 'rb')
            rec = rec_file.read()
            rec_file.close()
            schema = json.loads(rec, 'utf-8')
            client.create_table(bq_dataset, table, schema)
            print 'New Table ' + table + ' created!'

        self.response.out.write(jsondata)

        # Add the task to the default queue.
        taskqueue.add(url='/worker', params={
            'json': jsondata,
            'dataset': bq_dataset,
            'table': table
        })
        print 'Data added to Task Queue!'
def __init__(self, url):
    # super().__init__(url)
    super(IOsCrawler, self).__init__(url)
    self.done = False
    self.url = url
    if url not in self.retries:
        self.retries[url] = 3
    pq = self.pq

    self.result['name'] = pq('h1').text()
    self.check()
    self.result['store'] = 'Apple App Store'
    self.result['price'] = pq('.price').text()
    self.result['app_id'] = re.search('/(id\d+)', url).group(1)
    self.result['storeurl'] = url
    self.result['category'] = pq('span[itemprop="applicationCategory"]').text()
    self.result['content_rating'] = pq('.app-rating').text()
    self.result['icon'] = pq('img.artwork').attr('src-swap')
    self.result['screenshots'] = [x.attr('src') for x in pq('img.portrait').items()]
    self.result['description'] = pq('p[itemprop="description"]').text()
    self.result['developer'] = pq('span[itemprop="name"]').text()

    temp = pq('div.rating[aria-label*="Ratings"]')
    self.result['current_version_rating'] = temp.eq(0).attr('aria-label')
    self.result['rating'] = temp.eq(1).attr('aria-label')

    self.result['reviews'] = []
    for review in pq('.customer-review').items():
        extracted_data = {}
        extracted_data['authorname'] = review.find('span.user-info').text().split(' ')[-1]
        extracted_data['rating'] = review.find('div.rating').attr('aria-label')
        extracted_data['title'] = review.find('span.customerReviewTitle').text()
        extracted_data['text'] = review.find('p.content').text()
        self.result['reviews'].append(extracted_data)

    self.result['version'] = pq('span[itemprop="softwareVersion"]').text()
    self.result['version_history'] = pq('div.product-review > p').eq(1).text()
    self.result['updated'] = pq('span[itemprop="datePublished"]').text()
    self.result['languages'] = pq('li.language').find('span').remove().end().text()

    rows = [{
        'Name': str(self.result['name']),
        'Store': str(self.result['store']),
        'Price': self.result['price'],
        'App_Id': str(self.result['app_id']),
        'Store_Url': str(self.result['storeurl']),
        'Category': str(self.result['category']),
        'Icon': str(self.result['icon']),
        'Screenshots': str(self.result['screenshots']),
        'Description': self.result['description'],
        'Developer': str(self.result['developer']),
        'contentRating': str(self.result['content_rating']),
        'Reviews': str(self.result['reviews']),
        'Updated': str(self.result['updated']),
        'Rating': str(self.result['rating']),
        'VersionHistory': self.result['version_history'],
        'Languages': self.result['languages'],
        'CurrentVersionRating': self.result['current_version_rating'],
        'Version': str(self.result['version'])
    }]

    service_account = '*****@*****.**'  # Service account email
    json_key = 'sevice_key.json'  # JSON key provided by Google
    project_id = 'apps-1149'

    # Inserting data into table.
    client = get_client(project_id, json_key_file=json_key, readonly=False)
    self.done = client.push_rows('Temp', 'AppStoreTest', rows, 'id')
def create_dataset(dataset_name):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False)
    try:
        result = client.create_dataset(dataset_name, None, None, None)
        return result
    except Exception, err:
        print err
        return err
def get_data(file_name, subreddit, offset, limit, total_rows, test_set=False):
    json_key = 'key.json'
    client = get_client(json_key_file=json_key, readonly=True)

    training_tables = """
        [fh-bigquery:reddit_posts.2015_12],
        [fh-bigquery:reddit_posts.2016_01],
        [fh-bigquery:reddit_posts.2016_02],
        [fh-bigquery:reddit_posts.2016_03],
        [fh-bigquery:reddit_posts.2016_04],
        [fh-bigquery:reddit_posts.2016_05],
        [fh-bigquery:reddit_posts.2016_06],
        [fh-bigquery:reddit_posts.2016_07],
        [fh-bigquery:reddit_posts.2016_08],
        [fh-bigquery:reddit_posts.2016_09],
        [fh-bigquery:reddit_posts.2016_10],
        [fh-bigquery:reddit_posts.2016_11],
        [fh-bigquery:reddit_posts.2016_12],
        [fh-bigquery:reddit_posts.2017_01]
    """
    test_tables = """
        [fh-bigquery:reddit_posts.2017_02]
    """
    tables = test_tables if test_set else training_tables

    data_query = """
        SELECT gilded > 0 as gilded, title, selftext
        FROM {3}
        WHERE subreddit="{0}"
        LIMIT {1} OFFSET {2}
    """.format(subreddit, limit, offset, tables)
    # print('query: {0}'.format(data_query))

    job_id, _result = client.query(data_query)
    complete, row_count = client.check_job(job_id)
    results = client.get_query_rows(job_id)

    fields = list(
        map(
            lambda x: [
                clean_document(x['title'], x['selftext']),
                1 if bool(x['gilded']) else 0
            ], results))

    with open(file_name, 'a') as f:
        writer = csv.writer(f)
        for line in fields:
            writer.writerow(line)
def get_tables(dataset_ID):
    datasetID = dataset_ID
    try:
        client = get_client(project_id, service_account=service_account,
                            private_key_file=key, readonly=True)
        result = client.get_all_tables(datasetID)
        return result
    except Exception, err:
        print err
        raise
def __init__(self, project_id, service_account, key_location):
    self.x = 'Hello'
    # BigQuery project id as listed in the Google Developers Console.
    self.project_id = project_id.lower()
    # Service account email address as listed in the Google Developers Console.
    self.service_account = service_account
    # PKCS12 or PEM key provided by Google.
    self.key = key_location
    self.client = get_client(self.project_id,
                             service_account=self.service_account,
                             private_key_file=self.key,
                             readonly=True)
def __init__(self, json_key):
    """
    :param json_key: path to the JSON key containing BigQuery credentials
    """
    if not os.path.isfile(json_key):
        raise ValueError(
            'BigQuery JSON key is missing, cannot continue... (path={f})'.
            format(f=json_key))

    self._json_key = json_key
    self._client = get_client(json_key_file=self._json_key, readonly=False)
    self.project_id = self._extract_project_id()
def create_Table(dataset_name, table_name, schema):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False)
    datasetname = dataset_name
    tablename = table_name
    try:
        result = client.create_table(datasetname, tablename, schema)
        return result
    except Exception, err:
        print err
        return False
def from_private_key_file(cls, project_id, service_account, private_key_file,
                          readonly=False):
    client = get_client(
        project_id,
        service_account=service_account,
        private_key_file=private_key_file,
        readonly=readonly
    )
    return cls(client)
def Insert_Data(datasetname, table_name, DataObject, user_id=None, tenant=None):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False,
                        swallow_results=False)
    insertObject = DataObject
    try:
        upload_size, result = client.push_rows(datasetname, table_name, insertObject)
    except Exception, err:
        print err
        raise
def client(self):
    """
    Returns an authenticated BigQuery client object.
    """
    project = self._get_field('project')
    json_key_file = self._get_field('key_path')
    logging.info('project: %s', project)
    logging.info('json_key_file: %s', json_key_file)
    return get_client(project_id=project, json_key_file=json_key_file,
                      readonly=False)
def get_fields(dataset_name, table_name):
    fields = []
    datasetname = dataset_name
    tablename = table_name
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=True)
    results = client.get_table_schema(datasetname, tablename)
    for x in results:
        fieldtype = {'Fieldname': x['name'], 'FieldType': x['type']}
        fields.append(fieldtype)
    return fields
def from_private_key_file(cls, project_id, service_account, private_key_file,
                          readonly=False):
    with open(private_key_file, 'r') as fd:
        private_key = fd.read()
    client = get_client(
        project_id,
        service_account=service_account,
        private_key=private_key,
        readonly=readonly
    )
    return cls(client)
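
# Illustrative comparison (not in the original): the constructor directly above
# reads the key file and passes its contents via private_key=, while the earlier
# variant hands the path straight to get_client via private_key_file=. The
# wrapper class name and credentials below are placeholders.
wrapper = BigQueryWrapper.from_private_key_file(
    project_id="example-project",
    service_account="reporting@example-project.iam.gserviceaccount.com",
    private_key_file="example-key.pem",
    readonly=True,
)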
def load_ipython_extension(ipython):
    db_connections = {
        k[:-1 * len("_DATABASE_URI")].lower() + "_db": create_engine(v).connect()
        for k, v in os.environ.items() if k.endswith("_DATABASE_URI")
    }
    ipython.push(db_connections, interactive=True)

    bigquery_envs = [x[:x.index("_BQ_EMAIL")] for x in os.environ.keys() if "_BQ_EMAIL" in x]
    bigquery_connections = {}
    for env in bigquery_envs:
        key = os.environ[env + "_BQ_KEY"]
        email = os.environ[env + "_BQ_EMAIL"]
        project = os.environ[env + "_BQ_PROJECT"]
        bigquery_connections[env.lower() + "_bq"] = get_client(
            project, service_account=email, private_key=key)
    ipython.push(bigquery_connections, interactive=True)
def runReport(reportId, startDate, endDate, reportType, timeOuts=0, debug=0):
    # Update Writer
    Writer = ReportWriter.Writer()
    updateWriter(Writer, debug)

    # initialize google and bq clients
    google_credential = GoogleCredentials.get_application_default()
    google_service = googleapiclient.discovery.build(
        'storage', 'v1', credentials=google_credential)
    bq_client = bigquery.get_client(project_id='leanplum-staging',
                                    credentials=google_credential)

    # Delete backups over time range
    if (reportType == 'delete'):
        print("Removing backups over timerange")
        ReportMethods.remove_multi_table(client=bq_client,
                                         dateStart=startDate,
                                         dateEnd=endDate,
                                         dataset='email_report_backups')
        Writer.send("\tBackups removed for time range", WriterType.INFO)
        return

    # Load all the backups
    for model in ['App', 'Study', 'Experiment']:
        Writer.send("Loading " + model, WriterType.INFO)
        ReportMethods.load_multi_table(service=google_service,
                                       client=bq_client,
                                       dateStart=startDate,
                                       dateEnd=endDate,
                                       bucket='leanplum_backups',
                                       dataset='email_report_backups',
                                       model=model)
        Writer.send("\tBackups Loaded", WriterType.INFO)

    # Subject Report
    if (reportType[0] == 's'):
        SubjectReport.runSubjectReport(bq_client, reportId, reportType,
                                       startDate, endDate, timeOuts, Writer)
    # Domain Report
    elif (reportType == 'd'):
        DomainReport.runDomainReport(bq_client, reportId, startDate, endDate,
                                     timeOuts, Writer)
    # Push Report
    elif (reportType == 'p'):
        PushReport.runPushReport(bq_client, reportId, startDate, endDate,
                                 timeOuts, Writer)
def authenticate(request, filename):
    try:
        global client
        f_path = os.path.join(BASE_DIR, 'media')
        fin_path = os.path.join(f_path, filename)
        json_key = fin_path
        client = get_client(json_key_file=json_key, readonly=True)
        request.authenticated = True
        request.client = client
        return request
    except Exception as e:
        print(e)
        request.authenticated = False
        request.client = None
        return request
def get_bq_data():
    # Download the .json key from your Google Cloud account
    client = get_client(json_key_file='bq-query.json', readonly=True)
    query = """
        #standardSQL
        SELECT
            startingBalls,
            startingStrikes,
            hitterPitchCount,
            SUM(isHit) hits,
            SUM(isStrike) strikes,
            SUM(isBall) balls,
            COUNT(*) pitches
        FROM (
            SELECT
                startingBalls,
                startingStrikes,
                hitterPitchCount,
                IF(SUBSTR(outcomeId, 1, 1)='a', 1, 0) isHit,
                IF(SUBSTR(outcomeId, 1, 1)='k', 1, 0) isStrike,
                IF(SUBSTR(outcomeId, 1, 1)='b', 1, 0) isBall
            FROM `bigquery-public-data.baseball.games_wide`
            WHERE atBatEventType='PITCH') q
        GROUP BY startingBalls, startingStrikes, hitterPitchCount
    """
    job_id, _results = client.query(query)

    complete = False
    while not complete:
        print('Checking for job {JOB}...'.format(JOB=job_id))
        complete, row_count = client.check_job(job_id)
        sleep(1)

    print('Job complete - downloading...')
    results = client.get_query_rows(job_id)
    print('Downloaded {N} results...'.format(N=len(results)))
    with open(FILE_NAME, 'wb') as f:
        pickle.dump(results, f)
    return results
def main():
    logger = Utils.enable_logging()

    # get client
    client = get_client(PROJECT_ID, service_account=SERVICE_ACCOUNT,
                        private_key=KEY, readonly=False)
    client.swallow_results = False
    logger.info("client: %s" % client)

    schema_str = Utils.read_file(SCHEMA_FILE)
    schema = json.loads(schema_str)

    # create the BigQuery table
    created = client.create_table(DATASET_ID, TABLE_ID, schema)
    logger.info("created result: %s" % created)
    # if (len(created) == 0):
    #     print "failed to create table"
    #     return

    l = BigQueryListener(client, DATASET_ID, TABLE_ID, logger=logger)

    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    while True:
        stream = None
        try:
            # stream = tweepy.Stream(auth, l, headers={"Accept-Encoding": "deflate, gzip"})
            stream = tweepy.Stream(auth, l)
            # Choose stream: filtered or sample
            # stream.sample()
            stream.filter(track=TRACK_ITEMS)  # async=True
        except:
            logger.exception("Unexpected error:")

        if stream:
            stream.disconnect()

        time.sleep(60)
def get_last_bq_date(table, date_col, media=None, is_month=False):
    private_key_path = get_abspath(_common_conf['bigquery']['key_path'])
    dataset = _common_conf['bigquery']['dataset']
    client = get_client(json_key_file=private_key_path, readonly=False)
    if not client.check_table(dataset, table):
        return None

    full_table = "{}.{}".format(dataset, table)
    if media is None:
        query = """
            SELECT {0} as last_date
            FROM {1}
            ORDER BY {0} DESC
            LIMIT 1
        """.format(date_col, full_table)
    else:
        query = """
            SELECT {0} as last_date
            FROM {1}
            WHERE Media = '{2}'
            ORDER BY {0} DESC
            LIMIT 1
        """.format(date_col, full_table, media)

    try:
        job_id, results = client.query(query, timeout=80)
    except errors.BigQueryTimeoutException as e:
        raise e

    if len(results) > 0:
        ld = results[0]['last_date']
        if isinstance(ld, str):
            strs = ld.split('-')
            if is_month:
                return dt.date(int(strs[0]), int(strs[1]), 1)
            else:
                return dt.date(int(strs[0]), int(strs[1]), int(strs[2]))
        elif isinstance(ld, dt.date):
            return ld
    else:
        return None
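
# Hypothetical call sketch for get_last_bq_date above (not in the original):
# the table, column, and media names are examples. The function returns a
# datetime.date or None, which can seed an incremental load.
import datetime as dt

last_date = get_last_bq_date("daily_clicks", "Date", media="web")
start_date = dt.date(2019, 1, 1) if last_date is None else last_date + dt.timedelta(days=1)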
def main():
    logger = Utils.enable_logging(LOGGING_CONFIG)

    # get client
    client = get_client(PROJECT_ID, service_account=SERVICE_ACCOUNT,
                        private_key=KEY, readonly=False)
    client.swallow_results = False
    logger.info("BigQuery client: %s" % client)

    schema_str = Utils.read_file(SCHEMA_FILE)
    schema = json.loads(schema_str)

    # create the BigQuery table
    created = client.create_table(DATASET_ID, TABLE_ID, schema)
    logger.info("BigQuery create table result: %s" % created)
    # if (len(created) == 0):
    #     print "failed to create table"
    #     return

    l = BigQueryListener(client, DATASET_ID, TABLE_ID, logger=logger)

    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    while True:
        logger.info("Connecting to Twitter stream")
        stream = None
        try:
            # stream = tweepy.Stream(auth, l, headers={"Accept-Encoding": "deflate, gzip"})
            stream = tweepy.Stream(auth, l)
            # Choose stream: filtered or sample
            stream.sample()
            # stream.filter(track=TRACK_ITEMS)  # async=True
        except:
            logger.exception("Unexpected error")

        if stream:
            stream.disconnect()

        time.sleep(60)
def connect_bigquery(suffix='_bq', env_suffix='_BQ'):
    global bq_connections
    bq_configs = {
        k[:-1 * len(env_suffix + '_EMAIL')]: v
        for k, v in os.environ.items() if k.endswith(env_suffix + '_EMAIL')
    }
    if bq_configs:
        from bigquery import get_client
        bq_connections = {
            k.lower() + suffix: get_client(
                os.environ.get(k + env_suffix + '_PROJECT'),
                service_account=email,
                private_key=os.environ.get(k + env_suffix + '_KEY'))
            for k, email in bq_configs.items()
        }
    return bq_connections
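
# Example environment for connect_bigquery above (values are placeholders, not
# from the original): each <NAME>_BQ_EMAIL / _BQ_KEY / _BQ_PROJECT triple
# yields one client keyed "<name>_bq" in the returned dict.
import os

os.environ["ANALYTICS_BQ_EMAIL"] = "svc@example-project.iam.gserviceaccount.com"
os.environ["ANALYTICS_BQ_KEY"] = "-----BEGIN PRIVATE KEY-----..."
os.environ["ANALYTICS_BQ_PROJECT"] = "example-project"

clients = connect_bigquery()  # -> {"analytics_bq": <bigquery client>}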
def get_dates(department):
    request_departments = """SELECT DateTimeS, Quantity
        FROM [bamboo-creek-195008:test2.SalesForLastYear]
        WHERE Department = "%s"
        AND DATE(DateTimeS) >= "2017-01-01 00:00:00"
        AND DATE(DateTimeS) < "2018-01-01 00:00:00"
        AND Quantity > 0""" % (department)
    client = get_client(json_key_file=json_key, readonly=True)
    job_id, _results = client.query(request_departments)
    complete, row_count = client.check_job(job_id)
    wait_until(lambda c=client, id=job_id: client.check_job(id), 10000)
    results = client.get_query_rows(job_id)
    res = [0] * 53
    for el in results:
        res[datetime.datetime.strptime(el['DateTimeS'][:10], "%Y-%m-%d").isocalendar()[1]] += el['Quantity']
    return res
def get_departments():
    request_departments = """SELECT Department
        FROM [bamboo-creek-195008:test2.SalesForLastYear]
        WHERE DATE(DateTimeS) >= "2017-01-01 00:00:00"
        AND DATE(DateTimeS) < "2018-01-01 00:00:00"
        GROUP BY Department"""
    client = get_client(json_key_file=json_key, readonly=True)
    job_id, _results = client.query(request_departments)
    complete, row_count = client.check_job(job_id)
    wait_until(lambda c=client, id=job_id: client.check_job(id), 10000)
    results = client.get_query_rows(job_id)
    res = [el['Department'] for el in results]
    return res
def execute_query(querystate, offset=None, limit=None, user_id=None, tenant=None):
    query = querystate
    try:
        client = get_client(project_id, service_account=service_account,
                            private_key_file=key, readonly=False)
        job_id, totalBytesProcessed, statistics, download_bytes, _ = client.query(query, timeout=60)
        totalBytesBilled = statistics['query']['totalBytesBilled']
        # outputBytes = statistics['load']['outputBytes']
        usages = {'totalBytesProcessed': totalBytesProcessed,
                  'totalBytesBilled': totalBytesBilled,
                  'download_bq': download_bytes}
        obj = dre.RatingEngine(user_id, tenant, job_id, **usages)
        # Pass the bound method itself so it runs in the thread
        # (calling obj.set_usage() here would execute it immediately).
        p1 = threading.Thread(target=obj.set_usage, args=())
        p1.start()
    except Exception, err:
        print err
        raise err
def authenticate(request, filename):
    client = None
    f_path = os.path.join(BASE_DIR, 'media')
    fin_path = os.path.join(f_path, filename)  # actual path of the uploaded client key file
    try:
        json_key = fin_path
        client = get_client(json_key_file=json_key, readonly=True)  # returns a client object if the key file is valid
        os.remove(json_key)  # delete the json file from the media directory; the client object is kept in client_dict
        print(type(client))
        # request.session['clientfile'] = json_key  # client file path or json file path
        request.session['client'] = id(client)  # id of that client
        client_dict[request.session.get('client')] = client
        return request
    except Exception as e:
        # client could not be authenticated
        print(e)
        os.remove(fin_path)
        request.authenticated = False
        request.client = None
        return request
def retrieve_data():
    spinner.info('authenticating')
    bigquery_client = bigquery.get_client(json_key_file=args.token, readonly=True)

    spinner.info('submitting query')
    bigquery_request = REQUEST.format(
        date=ALL if args.all else YEAR if args.year else MONTH,
        projects=', '.join("'{:s}'".format(project) for project in args.packages),
        installers=', '.join("'{:s}'".format(client) for client in args.installers))
    job_id, _results = bigquery_client.query(bigquery_request)

    start = datetime.now()
    while True:
        duration = datetime.now() - start
        spinner.info('retrieving data ({:.2f}s)'.format(
            duration.total_seconds()))
        complete, row_count = bigquery_client.check_job(job_id)
        if complete:
            return bigquery_client.get_query_rows(job_id)
def populate(self, table):
    client = get_client(project_id, json_key_file=json_key, readonly=False)
    qry = "SELECT Name, Url FROM [%s] LIMIT 1000" % table
    try:
        job_id, results = client.query(qry, timeout=3000)
        results = client.get_query_rows(job_id)
        for row in results:
            query = """
                INSERT INTO %s (Name, URL)
                VALUES ('%s', '%s')
                ON DUPLICATE KEY UPDATE URL = '%s'
            """ % (self.scantable, row['Name'].replace('\'', ''), row['Url'], row['Url'])
            cursor = self.conn.cursor()
            cursor.execute(query)
            self.conn.commit()
    except Exception as e:
        print e
        print 'timeout'
    print "Populated Table"
def get_venue_name_from_venue_id(venue_id):
    # PROJECT_ID = 'tour-miner-project'
    # SERVICE_ACCOUNT = '*****@*****.**'
    # JSON_KEY_PATH = 'tour-miner-project-8873e737b27c.json'
    SERVICE_ACCOUNT = '*****@*****.**'
    JSON_KEY_PATH = 'tour-miner-project-2019-511090dd4bd9.json'

    # Get the BigQuery client
    client = get_client(json_key_file=JSON_KEY_PATH, readonly=True)

    query = '#standardSQL\nSELECT VENUE_NAME FROM `tour-miner-project.dataset_TIST2015.JP_VENUE_DICTIONARY` WHERE VENUE_ID = \'' + venue_id + '\' LIMIT 1'
    # print(query)
    try:
        job_id, results = client.query(query, timeout=60)
    except BigQueryTimeoutException as e:
        print(e)

    # Reworked to handle the case where no VENUE_NAME matches the venue_id,
    # which previously raised an error.
    # print(results[0]['VENUE_NAME'])
    if len(results) == 0:
        return '該当なし'  # "no match"
    else:
        return (results[0]['VENUE_NAME'])
for i in albums:
    albumid = i['album_id']
    albumids.append(albumid)

# Add an 'albumdetail' key to each song, with the album data as its value
albumdetail = {}
for i in range(len(albumids)):
    albumdetail[albumids[i]] = albums[i]

for song in songs:
    song['albumdetail'] = albumdetail[song['album_id']]

# ---------------------bigquery--------------------------------
keyfile = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
client = bigquery.get_client(json_key_file=keyfile, readonly=False)
print("identification success")

DATABASE = 'bqdb'
TABLE = 'Songs'

if not client.check_table(DATABASE, TABLE):
    print("Create table {}.{}".format(DATABASE, TABLE), file=sys.stderr)
    client.create_table(DATABASE, TABLE, [{
        'name': 'song_no',
        'type': 'string',
        'description': 'song id'
    }, {
        'name': 'title',
        'type': 'string',
import tweepy
import bigquery

# Load the Twitter credentials.
CONSUMER_KEY = os.environ['CONSUMER_KEY']
CONSUMER_SECRET = os.environ['CONSUMER_SECRET']
ACCESS_TOKEN = os.environ['ACCESS_TOKEN']
ACCESS_TOKEN_SECRET = os.environ['ACCESS_TOKEN_SECRET']
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create a BigQuery client from the BigQuery credentials file (credentials.json).
# Writes are not possible unless readonly=False is set explicitly.
client = bigquery.get_client(json_key_file='credentials.json', readonly=False)

DATASET_NAME = 'twitter'  # BigQuery dataset name
TABLE_NAME = 'tweets'     # BigQuery table name

# Create the table if it does not exist.
if not client.check_table(DATASET_NAME, TABLE_NAME):
    print('Creating table {0}.{1}'.format(DATASET_NAME, TABLE_NAME), file=sys.stderr)
    # The third argument of create_table() is the schema.
    client.create_table(DATASET_NAME, TABLE_NAME, [
        {
            'name': 'id',
            'type': 'string',
            'description': 'Tweet ID'
        },
from bigquery import get_client
import yaml
import datetime
import sys

with open("config/google.yml") as f:
    config = yaml.load(f)

client = get_client(config["auth"]["project_id"],
                    service_account=config["auth"]["service_account"],
                    private_key_file=config["auth"]["key"],
                    readonly=False)


def create_dataset():
    exists = client.check_dataset(config["bigquery"]["dataset"])
    if not exists:
        print "creating %s" % config["bigquery"]["dataset"]
        client.create_dataset(config["bigquery"]["dataset"],
                              friendly_name="sendgrid dataset",
                              description="A dataset created by me")


def create_tables():
    now = datetime.datetime.now()
    for i in xrange(0, config["bigquery"]["table_counts"]):
        date = now + datetime.timedelta(days=i)
        table_name = config["bigquery"]["table_name"] + date.strftime("%Y%m%d")
        exists = client.check_table(config["bigquery"]["dataset"], table_name)
        if not exists:
            print "creating table %s" % table_name
            created = client.create_table(config["bigquery"]["dataset"],
                                          table_name,
                                          config["bigquery"]["schemas"])
from bigquery import get_client

project_id = 'crypto-will-95713'
service_account = '*****@*****.**'
key = 'clabs-da9b4e83b6fa.p12'

client = get_client(project_id, service_account=service_account,
                    private_key_file=key, readonly=False)

fields = ['Username', 'Visits', 'Location', 'Device', 'Screensize',
          'Browser', 'Products', 'Category']

if client:
    print "Connection to big query established."
    datasets = client.get_datasets()
    print datasets

"""Code for creating a table with the fields."""
# dataset = client.create_dataset('development', friendly_name="clab_dev", description="Test database for CLABS")
# schema_1 = [
#     {'name': 'Username', 'type': 'STRING', 'mode': 'required'},
#     {'name': 'Email', 'type': 'STRING', 'mode': 'nullable'},
#     {'name': 'Visits', 'type': 'INTEGER', 'mode': 'nullable'},
#     {'name': 'Password', 'type': 'STRING', 'mode': 'nullable'},
#     {'name': 'Location', 'type': 'STRING', 'mode': 'nullable'},
#     {'name': 'Device', 'type': 'STRING', 'mode': 'nullable'},
#     {'name': 'Screensize', 'type': 'INTEGER', 'mode': 'nullable'},
#     {'name': 'Browser', 'type': 'STRING', 'mode': 'nullable'},
#     {'name': 'Products', 'type': 'STRING', 'mode': 'nullable'},
#     {'name': 'Category', 'type': 'STRING', 'mode': 'nullable'}
def get_datasets():
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False,
                        swallow_results=False)
    datasets = client.get_datasets()
    return datasets
import praw
import logging

logger = logging.getLogger()
logger.setLevel("INFO")
logging.basicConfig()

# Global helpers
c = r.connect(os.getenv("RETHINKDB_HOST", "rethinkdb.reddit-analyzer"),
              os.getenv("RETHINKDB_PORT", 28015))
UA = "/r/%s flair analyzer by /u/coffenbacher for https://charlesoffenbacher.com" % sys.argv[1]
pr = praw.Reddit(user_agent=UA)
db = r.db("reddit")
client = get_client(
    os.getenv("BIGQUERY_PROJECT_ID"),
    service_account=os.getenv("BIGQUERY_SERVICEACCOUNT_ID"),
    private_key_file=os.getenv("BIGQUERY_PRIVATEKEY_FILE"),
    readonly=False,
)


# Helper
def grouper(n, iterable, fillvalue=None):
    args = [iter(iterable)] * n
    return ([e for e in t if e != None] for t in itertools.izip_longest(*args))


def get_submissions_between_epochs(start, end, subreddit):
    logger.info(
        "Running search for range %s->%s"
        % (epoch(start).datetime.strftime("%x"), epoch(end).datetime.strftime("%x"))
    )
    query = "timestamp:%d..%d" % (start, end)
    return pr.search(query, subreddit=subreddit, sort="new", limit=1000, syntax="cloudsearch")
# BigQuery parameters
proj = args.proj
json_key = args.json_key
out_ds = args.out_ds
table_loc_ungrouped = "tmp_loc_ungrouped"
table_sc_ungrouped = "tmp_sc_ungrouped"
table_loc = args.tb_loc
table_sc = args.tb_sc
table_skip = args.tb_skip

# CLOC executable
cloc_exec = args.cloc

# Using BigQuery-Python https://github.com/tylertreat/BigQuery-Python
print('\nGetting BigQuery client\n')
bq_client = get_client(json_key_file=json_key, readonly=False)

# Delete the final output tables if they exist
delete_bq_table(bq_client, out_ds, table_loc)
delete_bq_table(bq_client, out_ds, table_sc)

# Get SHAs already in ungrouped tables or skipped
existing_sha_loc = unique_vals(bq_client, proj, out_ds, table_loc_ungrouped, "sha")
existing_sha_sc = unique_vals(bq_client, proj, out_ds, table_sc_ungrouped, "sha")
if not Counter(existing_sha_loc) == Counter(existing_sha_sc):
    print(
        "Deleting tables %s and %s because they do not contain the same SHAs. Starting over."
        % (table_loc_ungrouped, table_sc_ungrouped))
    delete_bq_table(bq_client, out_ds, table_loc_ungrouped)
def check_table(dataset, table):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False,
                        swallow_results=False)
    result = client.check_table(dataset, table)
    return result
import bigquery
import config
import pymongo
import datetime
import os

if __name__ == '__main__':
    project_id = os.environ.get('PROJECT_ID')
    service_account = os.environ.get('SERVICE_ACCOUNT')
    key = 'george-key.p12'

    client = bigquery.get_client(project_id, service_account=service_account,
                                 private_key_file=key, readonly=True)

    tables = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
              '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06',
              '2015_07', '2015_08', '2015_09']
    subreddits = ['"%s"' % item['subreddit'] for item in config.SUBREDDITS]

    for table in tables:
        print 'Querying table:', table
        query = 'SELECT * FROM [fh-bigquery:reddit_comments.{}] WHERE subreddit IN ({})'.format(table, ','.join(subreddits))
        job_id, _results = client.query(query)
        complete, row_count = client.check_job(job_id)
        if complete:
            print 'Downloading', table, row_count
            results = client.get_query_rows(job_id)
            records = [r for r in list(results)]
            for record in records:
                record['created_utc'] = datetime.datetime.utcfromtimestamp(record['created_utc'])
            db = pymongo.MongoClient()['reddit']
            if records:
def get_table(dataset_ID, table):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=True)
    result = client.get_table(dataset_ID, table)
    return result
import bigquery
import sys

client = bigquery.get_client(json_key_file='./bigquery.json', readonly=False)
DATABASE = "bqdb"
TABLE = "test"

if not client.check_table(DATABASE, TABLE):
    print("Create table {0}.{1}".format(DATABASE, TABLE), file=sys.stderr)
    client.create_table(DATABASE, TABLE, [
        {'name': 'songno', 'type': 'string', 'description': 'song id'},
        {'name': 'title', 'type': 'string', 'description': 'song title'},
        {'name': 'albumid', 'type': 'string', 'description': 'album id'}
    ])

ttt = [
    {'songno': '444', 'albumid': '444444', 'rec': {'sub1': 'abc4'}}
]

pushResult = client.push_rows(DATABASE, TABLE, ttt, insert_id_key='songno')
print("Pushed Result is", pushResult)
def __init__(self, url, num_reviews=5):
    # set num_reviews to -1 to scrape everything
    # super().__init__(url)
    super(AndroidCrawler, self).__init__(url)
    self.done = False
    self.url = url
    if url not in self.retries:
        self.retries[url] = 3

    self.result['name'] = self.pq('.document-title').text()
    self.check()
    self.result['store'] = 'Google Play'

    price = self.pq('span[itemprop="offers"] [itemprop="price"]').attr('content')
    if price == '0':
        price = 'Free'
    self.result['price'] = price

    self.result['app_id'] = url.split('id=')[1].split('&')[0]
    self.result['storeurl'] = url

    category = self.pq('a.document-subtitle.category')
    if category.length == 2:
        subcategory = category.eq(1).text()
    elif category.length == 1:
        subcategory = ''
    else:
        raise Exception('Something wrong with categories')
    self.result['category'] = category.eq(0).text()
    self.result['subcategory'] = subcategory

    self.result['icon'] = self.pq('.cover-container').children().attr('src')
    self.result['screenshots'] = [x.attr('src') for x in self.pq('.screenshot').items()]
    self.result['description'] = self.pq('.id-app-orig-desc').text()
    self.result['description'] = self.pq('div[itemprop="description"]').text()
    # print self.result['description']
    print "work is done"
    self.result['downloads'] = self.pq('div[itemprop="numDownloads"]').text()

    if not offline:
        self.result['permissions'] = self.__get_permissions(self.result['app_id'])
    else:
        self.result['permissions'] = []

    self.result['developer'] = self.pq('.primary').text()
    self.result['contentRating'] = self.pq('.meta-info.contains-text-link > .content').eq(0).text()
    try:
        self.result['developerWebsite'] = self.pq('.dev-link').attr('href').split('q=')[1].split('&')[0]
    except:
        self.result['developerWebsite'] = 'Nothing'

    siblings = self.pq('.dev-link')
    found = siblings.eq(1).text()
    if siblings.eq(2).text() == 'Privacy Policy':
        self.result['DeveloperPrivacyPolicy'] = siblings.eq(2).attr('href')
        self.result['physicalAddress'] = self.pq('div.content.physical-address').text()
    else:
        self.result['physicalAddress'] = self.pq('div.content.physical-address').text()
        self.result['DeveloperPrivacyPolicy'] = "Not Given"

    self.result['whatsNew'] = self.pq('div.recent-change').text()
    self.result['developerEmail'] = found

    if not offline:
        print "Getting Reviews"
        if num_reviews > 0:
            self.result['reviews'] = self.__get_reviews(self.result['app_id'], num_reviews)
        else:
            self.result['reviews'] = self.__get_reviews(self.result['app_id'])
    else:
        self.result['reviews'] = []

    self.result['version'] = self.pq('div[itemprop="softwareVersion"]').text()
    self.result['updated'] = self.pq('div[itemprop="datePublished"]').text()
    try:
        self.result['rating'] = self.pq('div[itemprop="ratingValue"]').text() + ' - ' + self.pq('div[itemprop="ratingCount"]').text()
    except:
        self.result['rating'] = '0'

    self.result['installations'] = self.pq('div[itemprop="numDownloads"]').text()
    self.result['LastUpdateDate'] = self.pq('div[itemprop="datePublished"]').text()
    self.result['AppSize'] = self.pq('div[itemprop="fileSize"]').text()
    self.result['MinimumOSVersion'] = self.pq('div[itemprop="operatingSystems"]').text()
    # self.result['contentRating'] = self.pq('div[itemprop="contentRating"]').text()

    ratingReason = self.pq('.meta-info.contains-text-link > .content').eq(1).text()
    if (ratingReason == 'Learn more'):
        self.result['RatingReason'] = ''
    else:
        self.result['RatingReason'] = ratingReason

    try:
        ratingText = self.pq('div[itemprop="aggregateRating"]').text().split(' ')
        self.result['ratingValue'] = ratingText[0]
        self.result['ratingCount'] = ratingText[1]
    except:
        self.result['ratingValue'] = ''
        self.result['ratingCount'] = ''

    # print self.result['AppSize']
    # print self.result['LastUpdateDate']
    # print self.result['installations']
    # print self.result['MinimumOSVersion']
    # print self.result['contentRating']
    # print self.result['developerWebsite']
    # print self.result['developerEmail']
    # print self.result['physicalAddress']
    # print self.result['whatsNew']

    isfree = False
    if self.result['price'] == 0:
        isfree = True

    project_id = 'apps-1149'
    new_reviews = self.result['reviews']
    # print new_reviews[1]['date']
    service_account = '*****@*****.**'  # Service account email
    json_key = 'sevice_key.json'  # JSON key provided by Google

    # Inserting data into table.
    client = get_client(project_id, json_key_file=json_key, readonly=False)

    price = str(self.result['price'])
    if price == "Free":
        price_in_rupees = 0
    else:
        price_in_rupees = float(price[2:])

    rows = []
    try:
        rows = [{
            'Name': str(self.result['name']),
            'Store': str(self.result['store']),
            'Price': price_in_rupees,
            'IsFree': isfree,
            'App_Id': str(self.result['app_id']),
            'Store_Url': str(self.result['storeurl']),
            'Category': str(self.result['category']),
            'subCategory': str(self.result['subcategory']),
            'Icon': str(self.result['icon']),
            'Screenshots': str(self.result['screenshots']),
            'Description': self.result['description'],
            'Downloads': str(self.result['downloads']),
            'Permissions': str(self.result['permissions']),
            'Developer': str(self.result['developer']),
            'contentRating': str(self.result['contentRating']),
            'developerWebsite': str(self.result['developerWebsite']),
            'Reviews': str(self.result['reviews']),
            'reviewDate': str(new_reviews[0]['date']),
            'reviewTitle': str(new_reviews[1]['title']),
            'reviewTexts': str(new_reviews[1]['text']),
            'Version': str(self.result['version']),
            'Updates': str(self.result['updated']),
            'Rating': str(self.result['rating']),
            'appSize': self.result['AppSize'],
            'lastUpdateDate': self.result['LastUpdateDate'],
            'installations': self.result['installations'],
            'MinimumOSVersion': self.result['MinimumOSVersion'],
            'developerEmail': self.result['developerEmail'],
            'whatsNew': self.result['whatsNew'],
            'physicalAddress': self.result['physicalAddress'],
            'RatingReason': self.result['RatingReason'],
            'RatingCount': self.result['ratingCount'],
            'RatingValue': self.result['ratingValue']
        }]
    except:
        print "Incomplete"
        rows = [{
            'Name': str(self.result['name']),
            'Store': str(self.result['store']),
            'Price': price_in_rupees,
            'IsFree': isfree,
            'App_Id': str(self.result['app_id']),
            'Store_Url': str(self.result['storeurl']),
            'Category': str(self.result['category']),
            'subCategory': str(self.result['subcategory']),
            'Icon': str(self.result['icon']),
            'Screenshots': str(self.result['screenshots']),
            'Description': self.result['description'],
            'Downloads': str(self.result['downloads']),
            'Permissions': str(self.result['permissions']),
            'Developer': str(self.result['developer']),
            'contentRating': str(self.result['contentRating']),
            'developerWebsite': str(self.result['developerWebsite']),
            'Reviews': str(self.result['reviews']),
            'Version': str(self.result['version']),
            'Updates': str(self.result['updated']),
            'Rating': str(self.result['rating']),
            'appSize': self.result['AppSize'],
            'lastUpdateDate': self.result['LastUpdateDate'],
            'installations': self.result['installations'],
            'MinimumOSVersion': self.result['MinimumOSVersion'],
            'developerEmail': self.result['developerEmail'],
            'whatsNew': self.result['whatsNew'],
            'physicalAddress': self.result['physicalAddress'],
            'RatingReason': self.result['RatingReason'],
            'RatingCount': self.result['ratingCount'],
            'RatingValue': self.result['ratingValue']
        }]

    self.done = client.push_rows('Temp', 'AndroidTest', rows, 'id')
    # print "success"
proj = args.proj
json_key = args.json_key
dataset = args.ds
table = args.table
sheet = args.sheet
gh_username = args.gh_username
gh_oauth_key = args.gh_oauth_key

# Get repo names
print("Getting repo names from spreadsheet")
repos = get_repo_names(sheet, json_key)
print("There are %s repos with use_repo = 1.\n" % len(repos))

# Using BigQuery-Python https://github.com/tylertreat/BigQuery-Python
print('\nGetting BigQuery client\n')
client = get_client(json_key_file=json_key, readonly=False, swallow_results=True)

# Check which repos are already in the table
existing_repos = unique_vals(client, proj, dataset, table, "repo_name")
if len(existing_repos) > 0:
    repos = [repo for repo in repos if repo not in existing_repos]
    print("Only getting data for %s repos not yet analyzed" % len(repos))

# Table schema
schema = [
    {'name': 'repo_name', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_sha', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_api_url', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_html_url', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_comments_url', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_message', 'type': 'STRING', 'mode': 'NULLABLE'},
# Check the BigQuery Python Setup Steps.txt for setup steps
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
from bigquery import get_client
from googleapiclient import http
from bigquery.errors import BigQueryTimeoutException
from bigquery import JOB_SOURCE_FORMAT_CSV

credentials = GoogleCredentials.get_application_default()
project_id = 'YOUR PROJECTID'
dataset = 'YOUR DATASET'
service = discovery.build('storage', 'v1', credentials=credentials)
client = get_client(project_id, credentials=credentials)

# Step 1: Upload local file to Google Cloud Storage (GCS)
filename = 'YOUR FILE PATH'
bucket = 'YOUR GCS Bucket'  # Note: this should not contain the gs:// part
destination_file_path = 'Destination file path inside the bucket'
body = {'name': destination_file_path}
req = service.objects().insert(bucket=bucket, body=body, media_body=filename)
resp = req.execute()

# Step 2: Load the file in GCS into a BigQuery table
# Drop table if exists
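
# Hedged sketch of Step 2 (not part of the original, which stops at the comment
# above): it assumes BigQuery-Python's check_table/delete_table,
# import_data_from_uris and wait_for_job helpers; the table name, schema, and
# timeout below are placeholders.
table_name = 'YOUR TABLE'
schema = [{'name': 'col1', 'type': 'STRING'}]  # placeholder schema

if client.check_table(dataset, table_name):
    client.delete_table(dataset, table_name)

job = client.import_data_from_uris(
    ['gs://' + bucket + '/' + destination_file_path],
    dataset,
    table_name,
    schema=schema,
    source_format=JOB_SOURCE_FORMAT_CSV,
    skip_leading_rows=1)

try:
    job_resource = client.wait_for_job(job, timeout=600)
    print(job_resource)
except BigQueryTimeoutException:
    print('Timeout while loading the table')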