Example #1
def insert_row(df, table, replace_val):
    if len(df.index) == 0:
        print("gbq insert records zero")
        return
    project_id = _common_conf['bigquery']['project_id']
    private_key_path = get_abspath(_common_conf['bigquery']['key_path'])
    dataset = _common_conf['bigquery']['dataset']

    # insert 10,000 rows at a time
    full_table = "{}.{}".format(dataset, table)
    client = get_client(json_key_file=private_key_path,
                        readonly=False,
                        swallow_results=False)

    if client.check_table(dataset, table):
        bq_limit = 10000
        q_num = len(df.index) // bq_limit
        for i in range(0, q_num + 1):
            client = get_client(json_key_file=private_key_path,
                                readonly=False,
                                swallow_results=False)
            ins_df = df[i * bq_limit:(i + 1) * bq_limit].replace(
                np.nan, replace_val)

            row_dict = ins_df.to_dict(orient='index')
            row_data = [x for x in row_dict.values()]
            ret = client.push_rows(dataset, table, row_data)
            if 'insertErrors' in ret:
                msg = "BigQuery Insert Error:\nsample:\n{}\nerror:\n{}"
                raise Exception(msg.format(row_data[0:5], ret))
    else:
        print('{} CREATE TABLE'.format(full_table))
        gbq.to_gbq(df, full_table, project_id)
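A minimal usage sketch for the function above, assuming _common_conf has already been loaded; the DataFrame contents and the 'daily_sales' table name are hypothetical, and NaN values are swapped for replace_val before the streaming insert:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'Date': ['2019-01-01', '2019-01-02'],
    'Sales': [1200, np.nan],   # NaN is replaced with replace_val before push_rows
})
insert_row(df, 'daily_sales', replace_val=0)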
Example #2
def get_venue_names_from_venue_ids(venue_id_list):
    #PROJECT_ID = 'tour-miner-project'
    #    SERVICE_ACCOUNT = '*****@*****.**'
    #    JSON_KEY_PATH = 'tour-miner-project-8873e737b27c.json'
    #    SERVICE_ACCOUNT ='*****@*****.**'
    JSON_KEY_PATH = 'tour-miner-project-2019-511090dd4bd9.json'

    # Get the BigQuery client
    client = get_client(json_key_file=JSON_KEY_PATH, readonly=True)

    where_phrase = ''
    for i in range(len(venue_id_list)):
        where_phrase += "VENUE_ID = \'" + venue_id_list[i] + "\'"
        if i < len(venue_id_list) - 1:  # append OR unless this is the last element
            where_phrase += ' or '
#    query = '#standardSQL\nSELECT VENUE_ID, VENUE_NAME FROM `tour-miner-project.dataset_TIST2015.JP_VENUE_DICTIONARY` WHERE ' + where_phrase
    query = '#standardSQL\nSELECT VENUE_ID, VENUE_NAME FROM `tour-miner-project-2019.dataset_TIST2015.JP_VENUE_DICTIONARY` WHERE ' + where_phrase
    #   print(query)

    try:
        job_id, results = client.query(query, timeout=60)
    except BigQueryTimeoutException as e:
        print(e)
        return {}  # the query timed out, so there are no results to read
#   print(results)
    location_name_dictionary = {}
    for q in results:
        location_name_dictionary[q['VENUE_ID']] = q['VENUE_NAME']
    return location_name_dictionary
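The OR chain above can equivalently be written as an IN list; a small sketch with a hypothetical helper name, using the same VENUE_ID column:

def build_venue_filter(venue_id_list):
    # Equivalent WHERE clause using IN (...) instead of chained ORs.
    quoted = ", ".join("'{}'".format(v) for v in venue_id_list)
    return "VENUE_ID IN ({})".format(quoted)

# build_venue_filter(['v1', 'v2']) -> "VENUE_ID IN ('v1', 'v2')"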
Example #3
def fn3(Data):
    client = bigquery.get_client(json_key_file='bigquery.json', readonly=False)

    DATABASE = "bqdb"
    TABLE = "Songs"
    data = Data
    if not client.check_table(DATABASE, TABLE):
        print("Create table {0}.{1}".format(DATABASE, TABLE), file=sys.stderr)

        client.create_table(DATABASE, TABLE, [
            {'name': 'id', 'type': 'integer', 'description': 'mysql id'},
            {'name': 'songId', 'type': 'string', 'description': 'song id'},
            {'name': 'title', 'type': 'string', 'description': 'song title'},
            {'name': 'genre', 'type': 'string', 'description': 'genre'},
            {'name': 'albumId', 'type': 'string', 'description': 'album id'},
            {'name': 'album', 'type': 'record', 'description': 'album info',
             'fields':[
                 {'name': 'id', 'type': 'integer', 'description': 'mysql album id'},
                 {'name': 'albumId', 'type': 'string', 'description': 'album id'},
                 {'name': 'title', 'type': 'string', 'description': 'album title'},
                 {'name': 'releaseDate', 'type': 'timestamp', 'description': 'album releaseDate'},
                 {'name': 'score', 'type': 'numeric', 'description': 'album score'},
                 {'name': 'publisher', 'type': 'string', 'description': 'album publisher'},
                 {'name': 'label', 'type': 'string', 'description': 'album label'}]}
        ])
 
    pushResult = client.push_rows(DATABASE, TABLE, data, insert_id_key='songId')
    print("Pushed Result is", pushResult)
Example #4
    def get(self):
        user = users.get_current_user()
        if user:
            url = users.create_logout_url(self.request.uri)
            url_linktext = 'Logout'

            query = "SELECT FLIGHTNO, CP, FO, UA, HL, DL FROM demo.sdata WHERE CP IN (SELECT latamid FROM demo.mapping WHERE userid='%s')" % (user)
            client = get_client(json_key_file=json_key, readonly=True)
            results = []
            try:
                job_id, results = client.query(query, timeout=10)
            except BigQueryTimeoutException:
                print "Timeout"

            template_values = {
                'user': user,
                'url': url,
                'url_linktext': url_linktext,
                'results': results,
            }

            template = JINJA_ENVIRONMENT.get_template('index.html')
        
        else:
            url = users.create_login_url(self.request.uri)
            url_linktext = 'Login'
            template_values = {
                'user': user,
                'url': url,
                'url_linktext': url_linktext
            }

            template = JINJA_ENVIRONMENT.get_template('nologin.html')
Example #5
File: fetch_text.py  Project: sw1001/gum
def get_post(year, month, post_id, post_type):
    from bigquery import get_client

    # JSON key provided by Google
    json_key = os.path.dirname(
        os.path.realpath(__file__)) + os.sep + 'key.json'

    client = get_client(json_key_file=json_key, readonly=True)

    if post_type == "post":
        post_or_comment = "posts"
    else:
        post_or_comment = "comments"
    table_name = "fh-bigquery:reddit_" + post_or_comment + "." + year + "_" + month

    # Submit an async query.
    query = "SELECT * FROM [" + table_name + "] WHERE id = '" + post_id + "';"
    job_id, _results = client.query(query)

    sleep(3)

    # Check if the query has finished running.
    complete, row_count = client.check_job(job_id)

    # Retrieve the results.
    results = client.get_query_rows(job_id)

    return str(results)
Example #6
def get_bq_data():
    # Download the .json key from your Google Cloud account
    client = get_client(json_key_file='bq-query.json', readonly=True)

    query = """
    SELECT
        pitcherId,
        pitchTypeDescription,
      outcomeDescription,
        COUNT(pitchTypeDescription) throws
    FROM
        [bigquery-public-data:baseball.games_wide]
    WHERE
        atBatEventType = "PITCH"
    GROUP BY 
        pitcherId,
        pitchTypeDescription,
        outcomeDescription
    """

    job_id, _results = client.query(query)
    complete = False

    while not complete:
        print('Checking for job {JOB}...'.format(JOB=job_id))
        complete, row_count = client.check_job(job_id)
        sleep(1)

    print('Job complete - downloading...')
    results = client.get_query_rows(job_id)
    print('Downloaded {N} results...'.format(N=len(results)))

    with open(FILE_NAME, 'wb') as f:
        pickle.dump(results, f)
    return results
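The submit-then-poll pattern above (and in several later examples) can be factored into a small helper. This is a sketch built only on the check_job/get_query_rows calls already used here; the helper name and the timeout policy are assumptions:

import time

def wait_for_job(client, job_id, poll_interval=1, max_wait=300):
    # Poll until the job finishes, then return its rows; give up after max_wait seconds.
    waited = 0
    while waited < max_wait:
        complete, row_count = client.check_job(job_id)
        if complete:
            return client.get_query_rows(job_id)
        time.sleep(poll_interval)
        waited += poll_interval
    raise RuntimeError('BigQuery job {} did not finish within {}s'.format(job_id, max_wait))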
Example #7
def main():
    
    logger = Utils.enable_logging()
    
    # get client
    client = get_client(PROJECT_ID, service_account=SERVICE_ACCOUNT, private_key=KEY, readonly=False)
    client.swallow_results = False
    logger.info("client: %s" % client)
    
    schema_str = Utils.read_file(GNIP_SCHEMA_FILE)
    schema = json.loads(schema_str)
    
    # initialize table mapping for default table
    table_mapping = {
         DATASET_ID + "." + TABLE_ID : [DATASET_ID, TABLE_ID]
     }
    
    l = BigQueryGnipListener(client, schema, table_mapping, logger=logger)
 
    while True:

        stream = None

        try:
            get_stream(l)
            
        except:

            logger.exception("Unexpected error:");

            if stream:
                stream.disconnect()

            time.sleep(SLEEP_TIME)
Example #8
    def post(self):

        data = json.loads(self.request.body)

        accountid = data['accountId']
        dic = data['data']

        jsondata = json.dumps(dic).encode('utf8')

        # write data to string buffer
        global output
        output.write(jsondata + "\n")

        print "Wrote data to Log file.\n"

        if accountid is not None:

            print "---data for---" + accountid + "---\n"
            print jsondata

            f = file(private_key_file, 'rb')
            key = f.read()
            f.close()

            credentials = SignedJwtAssertionCredentials(
                SERVICE_ACCOUNT_EMAIL,
                key,
                scope='https://www.googleapis.com/auth/bigquery')

            # BigQuery project id as listed in the Google Developers Console.

            table = 'prefix_' + accountid

            client = get_client(project_id, credentials=credentials)

            # Check if a table exists.
            exists = client.check_table(bq_dataset, table)

            if exists is None or exists is False:
                rec_file = file(bq_table_schema_file, 'rb')
                rec = rec_file.read()
                rec_file.close()

                schema = json.loads(rec, 'utf-8')
                client.create_table(bq_dataset, table, schema)

                print 'New Table ' + table + ' created!'

            self.response.out.write(jsondata)

            # Add the task to the default queue.
            taskqueue.add(url='/worker',
                          params={
                              'json': jsondata,
                              'dataset': bq_dataset,
                              'table': table
                          })

        print 'Data added to Task Queue!'
Example #9
    def __init__(self, url):
        #super().__init__(url)
        super(IOsCrawler, self).__init__(url)

        self.done = False
        self.url = url
        
        if url not in self.retries:
            self.retries[url] = 3

        pq = self.pq
        self.result['name'] = pq('h1').text()
        self.check()
        self.result['store'] = 'Apple App Store'
        self.result['price'] = pq('.price').text()
        self.result['app_id'] = re.search('/(id\d+)', url).group(1)
        self.result['storeurl'] = url
        self.result['category'] = pq('span[itemprop="applicationCategory"]').text()
        self.result['content_rating'] = pq('.app-rating').text()
        self.result['icon'] = pq('img.artwork').attr('src-swap')
        self.result['screenshots'] = [x.attr('src') for x in pq('img.portrait').items()]
        self.result['description'] = pq('p[itemprop="description"]').text()
        self.result['developer'] = pq('span[itemprop="name"]').text()
        temp = pq('div.rating[aria-label*="Ratings"]')
        self.result['current_version_rating'] = temp.eq(0).attr('aria-label')
        self.result['rating'] = temp.eq(1).attr('aria-label')
        self.result['reviews'] = []
        for review in pq('.customer-review').items():
            extracted_data = {}
            extracted_data['authorname'] = review.find('span.user-info').text().split(' ')[-1]
            extracted_data['rating'] = review.find('div.rating').attr('aria-label')
            extracted_data['title'] = review.find('span.customerReviewTitle').text()
            extracted_data['text'] = review.find('p.content').text()
            self.result['reviews'].append(extracted_data)
        self.result['version'] = pq('span[itemprop="softwareVersion"]').text()
        self.result['version_history'] = pq('div.product-review > p').eq(1).text()
        self.result['updated'] = pq('span[itemprop="datePublished"]').text()
        self.result['languages'] = pq('li.language').find('span').remove().end().text()
       
        rows = [
        {
        'Name': str(self.result['name']),'Store': str(self.result['store']), 'Price': self.result['price'],
        'App_Id':str(self.result['app_id']),'Store_Url':str(self.result['storeurl']),'Category':str(self.result['category']) , 
        'Icon': str(self.result['icon']), 'Screenshots': str(self.result['screenshots']), 'Description' :self.result['description'], 
        'Developer': str(self.result['developer']), 'contentRating': str(self.result['content_rating']) ,'Reviews': str(self.result['reviews']) ,
        'Updated': str(self.result['updated']), 'Rating' : str(self.result['rating']), 'VersionHistory' : self.result['version_history'],
        'Languages' : self.result['languages'], 'CurrentVersionRating' : self.result['current_version_rating'], 'Version' : str(self.result['version'])
        }
        ]

        service_account = '*****@*****.**'# Service account email       
        json_key = 'sevice_key.json'# JSON key provided by Google

        project_id = 'apps-1149'
        # Inserting data into table.
        client = get_client(project_id, json_key_file=json_key, readonly=False)
        self.done = client.push_rows('Temp', 'AppStoreTest', rows, 'id')
Example #10
def create_dataset(dataset_name):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False)
    try:
        result = client.create_dataset(dataset_name, None, None, None)
        return result
    except Exception, err:
        print err
        return err
Example #11
def get_data(file_name, subreddit, offset, limit, total_rows, test_set=False):
    json_key = 'key.json'

    client = get_client(json_key_file=json_key, readonly=True)

    training_tables = """
        [fh-bigquery:reddit_posts.2015_12],
        [fh-bigquery:reddit_posts.2016_01],
        [fh-bigquery:reddit_posts.2016_02],
        [fh-bigquery:reddit_posts.2016_03],
        [fh-bigquery:reddit_posts.2016_04],
        [fh-bigquery:reddit_posts.2016_05],
        [fh-bigquery:reddit_posts.2016_06],
        [fh-bigquery:reddit_posts.2016_07],
        [fh-bigquery:reddit_posts.2016_08],
        [fh-bigquery:reddit_posts.2016_09],
        [fh-bigquery:reddit_posts.2016_10],
        [fh-bigquery:reddit_posts.2016_11],
        [fh-bigquery:reddit_posts.2016_12],
        [fh-bigquery:reddit_posts.2017_01]
    """

    test_tables = """
        [fh-bigquery:reddit_posts.2017_02]
    """

    tables = test_tables if test_set else training_tables

    data_query = """
    SELECT
        gilded > 0 as gilded,
        title,
        selftext
    FROM
        {3}
    WHERE
        subreddit="{0}"
    LIMIT
        {1}
    OFFSET
        {2}
    """.format(subreddit, limit, offset, tables)
    # print('query: {0}'.format(data_query))
    job_id, _result = client.query(data_query)
    complete, row_count = client.check_job(job_id)
    results = client.get_query_rows(job_id)
    fields = list(
        map(
            lambda x: [
                clean_document(x['title'], x['selftext']), 1
                if bool(x['gilded']) else 0
            ], results))
    with open(file_name, 'a') as f:
        writer = csv.writer(f)
        for line in fields:
            writer.writerow(line)
Example #12
def get_tables(dataset_ID):
    datasetID = dataset_ID

    try:
        client = get_client(project_id, service_account=service_account,
                            private_key_file=key, readonly=True)
        result = client.get_all_tables(datasetID)
        return result
    except Exception, err:
        print err
        raise
Example #13
    def __init__(self, project_id, service_account, key_location):
        self.x = 'Hello'
        # BigQuery project id as listed in the Google Developers Console.
        self.project_id = project_id.lower()
        # Service account email address as listed in the Google Developers Console.
        self.service_account = service_account
        # PKCS12 or PEM key provided by Google.
        self.key = key_location

        self.client = get_client(self.project_id, service_account=self.service_account,
                            private_key_file=self.key, readonly=True)
Example #14
 def __init__(self, json_key):
     """
     :param json_key: path to the JSON key containing BigQuery credentials
     """
     if not os.path.isfile(json_key):
         raise ValueError(
             'BigQuery JSON key is missing, cannot continue... (path={f})'.
             format(f=json_key))
     self._json_key = json_key
     self._client = get_client(json_key_file=self._json_key, readonly=False)
     self.project_id = self._extract_project_id()
Example #15
def create_Table(dataset_name, table_name, schema):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False)
    datasetname = dataset_name
    tablename = table_name
    try:
        result = client.create_table(datasetname, tablename, schema)
        return result
    except Exception, err:
        print err
        return False
Example #16
    def from_private_key_file(cls, project_id, service_account,
                              private_key_file, readonly=False):

        client = get_client(
            project_id,
            service_account=service_account,
            private_key_file=private_key_file,
            readonly=readonly
        )

        return cls(client)
Example #17
def Insert_Data(datasetname, table_name, DataObject, user_id=None, tenant=None):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False, swallow_results=False)

    insertObject = DataObject
    try:
        upload_size, result = client.push_rows(datasetname, table_name, insertObject)
    except Exception, err:
        print err
        raise
Example #18
    def client(self):
        """
        Returns a BigQuery PEP 249 connection object.
        """
        project = self._get_field('project')
        json_key_file = self._get_field('key_path')

        logging.info('project: %s', project)
        logging.info('json_key_file: %s', json_key_file)
        return get_client(project_id=project,
                          json_key_file=json_key_file,
                          readonly=False)
Example #19
def get_fields(dataset_name, table_name):
    fields = []
    datasetname = dataset_name
    tablename = table_name
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=True)
    results = client.get_table_schema(datasetname, tablename)
    for x in results:
        fieldtype = {'Fieldname': x['name'],
                     'FieldType': x['type']}
        fields.append(fieldtype)
    return fields
Example #20
    def from_private_key_file(cls, project_id, service_account,
                              private_key_file, readonly=False):
        with open(private_key_file, 'r') as fd:
            private_key = fd.read()

        client = get_client(
            project_id,
            service_account=service_account,
            private_key=private_key,
            readonly=readonly
        )

        return cls(client)
Example #21
    def __init__(self, project_id, service_account, key_location):
        self.x = 'Hello'
        # BigQuery project id as listed in the Google Developers Console.
        self.project_id = project_id.lower()
        # Service account email address as listed in the Google Developers Console.
        self.service_account = service_account
        # PKCS12 or PEM key provided by Google.
        self.key = key_location

        self.client = get_client(self.project_id,
                                 service_account=self.service_account,
                                 private_key_file=self.key,
                                 readonly=True)
Example #22
def load_ipython_extension(ipython):
    db_connections = {
        k[:-1 * len("_DATABASE_URI")].lower() + "_db": create_engine(v).connect()
        for k, v in os.environ.items() if k.endswith("_DATABASE_URI")
    }

    ipython.push(db_connections, interactive=True)
    bigquery_envs = [x[:x.index("_BQ_EMAIL")] for x in os.environ.keys() if "_BQ_EMAIL" in x]
    bigquery_connections = {}

    for env in bigquery_envs:
        key = os.environ[env + "_BQ_KEY"]
        email = os.environ[env + "_BQ_EMAIL"]
        project = os.environ[env + "_BQ_PROJECT"]

        bigquery_connections[env.lower() + "_bq"] = get_client(project, service_account=email, private_key=key)
    ipython.push(bigquery_connections, interactive=True)
Example #23
def runReport(reportId, startDate, endDate, reportType, timeOuts=0, debug=0):

    # Update Writer

    Writer = ReportWriter.Writer()
    updateWriter(Writer, debug)

    # initialize google and bq clients
    google_credential = GoogleCredentials.get_application_default()
    google_service = googleapiclient.discovery.build(
        'storage', 'v1', credentials=google_credential)
    bq_client = bigquery.get_client(project_id='leanplum-staging',
                                    credentials=google_credential)

    #Delete backups over time range
    if (reportType == 'delete'):
        print("Removing backups over timerange")
        ReportMethods.remove_multi_table(client=bq_client,
                                         dateStart=startDate,
                                         dateEnd=endDate,
                                         dataset='email_report_backups')

        Writer.send("\tBackups removed for time range", WriterType.INFO)
        return

    #Load all the backups
    for model in ['App', 'Study', 'Experiment']:
        Writer.send("Loading " + model, WriterType.INFO)
        ReportMethods.load_multi_table(service=google_service,
                                       client=bq_client,
                                       dateStart=startDate,
                                       dateEnd=endDate,
                                       bucket='leanplum_backups',
                                       dataset='email_report_backups',
                                       model=model)

    Writer.send("\tBackups Loaded", WriterType.INFO)

    #Subject Report
    if (reportType[0] == 's'):
        SubjectReport.runSubjectReport(bq_client, reportId, reportType,
                                       startDate, endDate, timeOuts, Writer)
    #Domain Report
    elif (reportType == 'd'):
        DomainReport.runDomainReport(bq_client, reportId, startDate, endDate,
                                     timeOuts, Writer)
    #Push Report
    elif (reportType == 'p'):
        PushReport.runPushReport(bq_client, reportId, startDate, endDate,
                                 timeOuts, Writer)
Example #24
def authenticate(request, filename):
    try:
        global client
        f_path = os.path.join(BASE_DIR, 'media')
        fin_path = os.path.join(f_path, filename)
        json_key = fin_path
        client = get_client(json_key_file=json_key, readonly=True)
        request.authenticated = True
        request.client = client
        return request
    except Exception as e:
        print(e)
        request.authenticated = False
        request.client = None
        return request
Example #25
def get_bq_data():
    # Download the .json key from your Google Cloud account
    client = get_client(json_key_file='bq-query.json', readonly=True)

    query = """
    #standardSQL
    SELECT
      startingBalls,
      startingStrikes,
      hitterPitchCount,
      SUM(isHit) hits,
      SUM(isStrike) strikes,
      SUM(isBall) balls,
      COUNT(*) pitches
    FROM
    (SELECT
      startingBalls,
      startingStrikes,
      hitterPitchCount,
      IF(SUBSTR(outcomeId,1, 1)='a', 1, 0) isHit,
      IF(SUBSTR(outcomeId,1, 1)='k', 1, 0) isStrike,
      IF(SUBSTR(outcomeId,1, 1)='b', 1, 0) isBall
    FROM
      `bigquery-public-data.baseball.games_wide`
    WHERE
      atBatEventType='PITCH') q
    GROUP BY
      startingBalls,
      startingStrikes,
      hitterPitchCount
    """

    job_id, _results = client.query(query)
    complete = False

    while not complete:
        print('Checking for job {JOB}...'.format(JOB=job_id))
        complete, row_count = client.check_job(job_id)
        sleep(1)

    print('Job complete - downloading...')
    results = client.get_query_rows(job_id)
    print('Downloaded {N} results...'.format(N=len(results)))

    with open(FILE_NAME, 'wb') as f:
        pickle.dump(results, f)
    return results
Example #26
def main():

    logger = Utils.enable_logging()

    # get client
    client = get_client(PROJECT_ID,
                        service_account=SERVICE_ACCOUNT,
                        private_key=KEY,
                        readonly=False)
    client.swallow_results = False
    logger.info("client: %s" % client)

    schema_str = Utils.read_file(SCHEMA_FILE)
    schema = json.loads(schema_str)

    # create table BigQuery table
    created = client.create_table(DATASET_ID, TABLE_ID, schema)
    logger.info("created result: %s" % created)
    #     if (len(created) == 0):
    #         print "failed to create table"
    #         return

    l = BigQueryListener(client, DATASET_ID, TABLE_ID, logger=logger)
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

    while True:

        stream = None

        try:

            # stream = tweepy.Stream(auth, l, headers = {"Accept-Encoding": "deflate, gzip"})
            stream = tweepy.Stream(auth, l)

            # Choose stream: filtered or sample
            # stream.sample()
            stream.filter(track=TRACK_ITEMS)  # async=True

        except:

            logger.exception("Unexpected error:")

            if stream:
                stream.disconnect()

            time.sleep(60)
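BigQueryListener is defined elsewhere in that project; a rough sketch of what such a listener could look like is shown below. This is an assumption, written against the tweepy 3.x StreamListener API and the push_rows call used throughout these examples, not the project's actual class:

import tweepy

class BigQueryListener(tweepy.StreamListener):
    # Hypothetical listener: streams each tweet straight into a BigQuery table.
    def __init__(self, client, dataset_id, table_id, logger=None):
        super(BigQueryListener, self).__init__()
        self.client = client
        self.dataset_id = dataset_id
        self.table_id = table_id
        self.logger = logger

    def on_status(self, status):
        row = {'id': status.id_str,
               'text': status.text,
               'created_at': str(status.created_at)}
        self.client.push_rows(self.dataset_id, self.table_id, [row],
                              insert_id_key='id')

    def on_error(self, status_code):
        if self.logger:
            self.logger.error("Stream error: %s" % status_code)
        return True  # keep the stream alive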
Example #27
def get_last_bq_date(table, date_col, media=None, is_month=False):
    private_key_path = get_abspath(_common_conf['bigquery']['key_path'])
    dataset = _common_conf['bigquery']['dataset']

    client = get_client(json_key_file=private_key_path, readonly=False)
    if not client.check_table(dataset, table):
        return None

    full_table = "{}.{}".format(dataset, table)

    if media is None:
        query = """
            SELECT
              {0} as last_date
            FROM {1}
            ORDER BY
              {0} DESC
            LIMIT 1
        """.format(date_col, full_table)
    else:
        query = """
          SELECT
            {0} as last_date
          FROM {1}
          WHERE
            Media = '{2}'
          ORDER BY
            {0} DESC
          LIMIT 1
        """.format(date_col, full_table, media)
    try:
        job_id, results = client.query(query, timeout=80)
    except errors.BigQueryTimeoutException as e:
        raise e
    if len(results) > 0:
        ld = results[0]['last_date']
        if isinstance(ld, str):
            strs = ld.split('-')
            if is_month:
                return dt.date(int(strs[0]), int(strs[1]), 1)
            else:
                return dt.date(int(strs[0]), int(strs[1]), int(strs[2]))
        elif isinstance(ld, dt.date):
            return ld
    else:
        return None
Example #28
def main():
    
    logger = Utils.enable_logging(LOGGING_CONFIG)
    
    # get client
    client = get_client(PROJECT_ID, service_account=SERVICE_ACCOUNT, private_key=KEY, readonly=False)
    client.swallow_results = False
    logger.info("BigQuery client: %s" % client)
    
    schema_str = Utils.read_file(SCHEMA_FILE)
    schema = json.loads(schema_str)
    
    # create table BigQuery table
    created = client.create_table(DATASET_ID, TABLE_ID, schema)
    logger.info("BigQuery create table result: %s" % created)
#     if (len(created) == 0):
#         print "failed to create table"
#         return
    
    l = BigQueryListener(client, DATASET_ID, TABLE_ID, logger=logger)
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
 
    while True:

        logger.info("Connecting to Twitter stream")

        stream = None

        try:
            
            # stream = tweepy.Stream(auth, l, headers = {"Accept-Encoding": "deflate, gzip"})
            stream = tweepy.Stream(auth, l)

            # Choose stream: filtered or sample
            stream.sample()
#             stream.filter(track=TRACK_ITEMS) # async=True
            
        except:

            logger.exception("Unexpected error");

            if stream:
                stream.disconnect()

            time.sleep(60)
Example #29
def connect_bigquery(suffix='_bq', env_suffix='_BQ'):
    global bq_connections
    bq_configs = {
        k[:-1 * len(env_suffix + '_EMAIL')]: v
        for k, v in os.environ.items() if k.endswith(env_suffix + '_EMAIL')
    }

    if bq_configs:
        from bigquery import get_client
        bq_connections = {
            k.lower() + suffix:
            get_client(os.environ.get(k + env_suffix + '_PROJECT'),
                       service_account=email,
                       private_key=os.environ.get(k + env_suffix + '_KEY'))
            for k, email in bq_configs.items()
        }

    return bq_connections
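connect_bigquery() discovers connections purely from environment variables; the following sketch shows the variables it would pick up for a single hypothetical 'ANALYTICS' connection (names and values are illustrative only):

import os

# The *_BQ_EMAIL / *_BQ_KEY / *_BQ_PROJECT suffixes are what connect_bigquery() scans for.
os.environ['ANALYTICS_BQ_EMAIL'] = 'service-account@example.iam.gserviceaccount.com'
os.environ['ANALYTICS_BQ_KEY'] = open('key.pem').read()
os.environ['ANALYTICS_BQ_PROJECT'] = 'my-project'

connections = connect_bigquery()   # -> {'analytics_bq': <BigQuery client>}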
Example #30
def get_dates(department):
    request_departments = """SELECT DateTimeS, Quantity FROM [bamboo-creek-195008:test2.SalesForLastYear] 
    WHERE Department = "%s" AND DATE(DateTimeS) >= "2017-01-01 00:00:00" AND DATE(DateTimeS) < "2018-01-01 00:00:00" AND Quantity > 0""" % (department)

    client = get_client(json_key_file = json_key, readonly = True)
    
    job_id, _results = client.query(request_departments)

    complete, row_count = client.check_job(job_id)

    wait_until(lambda c=client, id=job_id: client.check_job(id), 10000)

    results = client.get_query_rows(job_id)
    res = [0] * 54  # ISO week numbers can reach 53
    for el in results:
        res[datetime.datetime.strptime(el['DateTimeS'][:10], "%Y-%m-%d").isocalendar()[1]] += el['Quantity']

    return res
Example #31
def get_departments():
    request_departments = """SELECT Department FROM [bamboo-creek-195008:test2.SalesForLastYear] 
    WHERE DATE(DateTimeS) >= "2017-01-01 00:00:00" AND DATE(DateTimeS) < "2018-01-01 00:00:00" 
    GROUP BY Department"""

    client = get_client(json_key_file = json_key, readonly = True)
    
    job_id, _results = client.query(request_departments)

    complete, row_count = client.check_job(job_id)

    wait_until(lambda c=client, id=job_id: client.check_job(id), 10000)

    results = client.get_query_rows(job_id)

    res = [el['Department'] for el in results]

    return res
Example #32
def execute_query(querystate, offset=None, limit=None, user_id=None, tenant=None):
    query = querystate
    try:
        client = get_client(project_id, service_account=service_account,
                            private_key_file=key, readonly=False)
        job_id, totalBytesProcessed, statistics, download_bytes, _ = client.query(query, timeout=60)
        totalBytesBilled = statistics['query']['totalBytesBilled']
        #outputBytes = statistics['load']['outputBytes']
        usages = {'totalBytesProcessed': totalBytesProcessed,
                  'totalBytesBilled': totalBytesBilled,
                  'download_bq': download_bytes
                  }
        obj = dre.RatingEngine(user_id, tenant, job_id, **usages)
        p1 = threading.Thread(target=obj.set_usage, args=())  # pass the method itself, do not call it
        p1.start()
    except Exception, err:
        print err
        raise err
Example #33
File: views.py  Project: jain100/datacheck
def authenticate(request,filename):
    client = None
    f_path = os.path.join(BASE_DIR, 'media')
    fin_path = os.path.join(f_path, filename) #this is the actual path of client file
    try:
        json_key = fin_path #providing name to client file
        client = get_client(json_key_file=json_key, readonly=True) #this provides client object if client file is valid
        os.remove(json_key) #deleting json file from media directory as client object is saved in client_dict
        print(type(client))
        #request.session['clientfile']=json_key #client file path or json file path
        request.session['client']=id(client)#id of that client
        client_dict[request.session.get('client')]=client
        return request
    except Exception as e: #if client is not authenticated
        print(e)
        os.remove(fin_path)
        request.authenticated = False
        request.client=None
        return request
Example #34
def retrieve_data():
    spinner.info('authenticating')
    bigquery_client = bigquery.get_client(json_key_file=args.token,
                                          readonly=True)

    spinner.info('submitting query')
    bigquery_request = REQUEST.format(
        date=ALL if args.all else YEAR if args.year else MONTH,
        projects=', '.join("'{:s}'".format(project)
                           for project in args.packages),
        installers=', '.join("'{:s}'".format(client)
                             for client in args.installers))
    job_id, _results = bigquery_client.query(bigquery_request)

    start = datetime.now()
    while True:
        duration = datetime.now() - start
        spinner.info('retrieving data ({:.2f}s)'.format(
            duration.total_seconds()))
        complete, row_count = bigquery_client.check_job(job_id)
        if complete:
            return bigquery_client.get_query_rows(job_id)
Example #35
    def populate(self, table):
        client = get_client(project_id, json_key_file=json_key, readonly=False)
        qry = "SELECT Name, Url  FROM [%s] LIMIT 1000" % table

        try:
            job_id, results = client.query(qry, timeout=3000)

            results = client.get_query_rows(job_id)

            for row in results:
                query = """
                INSERT INTO %s (Name, URL) VALUES ('%s', '%s')
                ON DUPLICATE KEY UPDATE URL = '%s'
                """ % (self.scantable, row['Name'].replace('\'', ''), row['Url'], row['Url'])
                
                cursor = self.conn.cursor()
                cursor.execute(query)

            self.conn.commit()
        except Exception as e:
            print e
            print 'timeout'

        print "Populated Table"
Example #36
def get_venue_name_from_venue_id(venue_id):
    #PROJECT_ID = 'tour-miner-project'
    #    SERVICE_ACCOUNT = '*****@*****.**'
    #    JSON_KEY_PATH = 'tour-miner-project-8873e737b27c.json'
    SERVICE_ACCOUNT = '*****@*****.**'
    JSON_KEY_PATH = 'tour-miner-project-2019-511090dd4bd9.json'

    # Get the BigQuery client
    client = get_client(json_key_file=JSON_KEY_PATH, readonly=True)

    query = '#standardSQL\nSELECT VENUE_NAME FROM `tour-miner-project.dataset_TIST2015.JP_VENUE_DICTIONARY` WHERE VENUE_ID = \'' + venue_id + '\' LIMIT 1'
    #    print(query)

    try:
        job_id, results = client.query(query, timeout=60)
    except BigQueryTimeoutException as e:
        print(e)
        return '該当なし'  # "no match": bail out instead of referencing undefined results

    # Modified because an error occurred when no VENUE_NAME matched the venue_id
    #print(results[0]['VENUE_NAME'])
    if len(results) == 0:
        return '該当なし'  # "no match"
    else:
        return (results[0]['VENUE_NAME'])
Example #37
    for i in albums:
        albumid = i['album_id']
        albumids.append(albumid)

    # Add an 'albumdetail' key to each song and fill it with the album data
    albumdetail = {}
    for i in range(len(albumids)):
        albumdetail[albumids[i]] = albums[i]

    for song in songs:
        song['albumdetail'] = albumdetail[song['album_id']]

# # ---------------------bigquery--------------------------------

keyfile = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
client = bigquery.get_client(json_key_file=keyfile, readonly=False)
print("identification success")

DATABASE = 'bqdb'
TABLE = 'Songs'

if not client.check_table(DATABASE, TABLE):
    print("Create table {}.{}".format(DATABASE, TABLE, file=sys.stderr))

    client.create_table(DATABASE, TABLE, [{
        'name': 'song_no',
        'type': 'string',
        'description': 'song id'
    }, {
        'name': 'title',
        'type': 'string',
Example #38
import tweepy
import bigquery

# Load the Twitter credentials.
CONSUMER_KEY = os.environ['CONSUMER_KEY']
CONSUMER_SECRET = os.environ['CONSUMER_SECRET']
ACCESS_TOKEN = os.environ['ACCESS_TOKEN']
ACCESS_TOKEN_SECRET = os.environ['ACCESS_TOKEN_SECRET']

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create a BigQuery client from the credentials file (credentials.json).
# Writes are not possible unless readonly=False is set explicitly.
client = bigquery.get_client(json_key_file='credentials.json', readonly=False)

DATASET_NAME = 'twitter'  # BigQuery dataset name
TABLE_NAME = 'tweets'  # BigQuery table name

# Create the table if it does not exist.
if not client.check_table(DATASET_NAME, TABLE_NAME):
    print('Creating table {0}.{1}'.format(DATASET_NAME, TABLE_NAME),
          file=sys.stderr)
    # The third argument of create_table() is the schema.
    client.create_table(DATASET_NAME, TABLE_NAME, [
        {
            'name': 'id',
            'type': 'string',
            'description': 'Tweet ID'
        },
Example #39
from bigquery import get_client
import yaml
import datetime
import sys

with open("config/google.yml") as f:
    config = yaml.load(f)

client = get_client(config["auth"]["project_id"],
        service_account=config["auth"]["service_account"],
        private_key_file=config["auth"]["key"],
        readonly=False)

def create_dataset():
    exists = client.check_dataset(config["bigquery"]["dataset"])
    if not exists:
        print "creating %s" % config["bigquery"]["dataset"]
        client.create_dataset(config["bigquery"]["dataset"],
                friendly_name="sendgrid dataset",
                description="A dataset created by me")

def create_tables():
    now = datetime.datetime.now()
    for i in xrange(0, config["bigquery"]["table_counts"]):
        date = now + datetime.timedelta(days = i)
        table_name = config["bigquery"]["table_name"] + date.strftime("%Y%m%d")
        exists = client.check_table(config["bigquery"]["dataset"], table_name)
        if not exists:
            print "creating table %s" % table_name
            created = client.create_table(config["bigquery"]["dataset"], table_name, config["bigquery"]["schemas"])
Example #40
File: query_all.py  Project: rags123/clabs
from bigquery import get_client

project_id = 'crypto-will-95713'

service_account = '*****@*****.**'

key = 'clabs-da9b4e83b6fa.p12'

client = get_client(project_id, service_account=service_account, 
    private_key_file=key, readonly=False)


fields = ['Username', 'Visits', 'Location', 'Device', 'Screensize', 'Browser', 'Products', 'Category']
if client:
    print "Connection to big query established."
    datasets = client.get_datasets()
    print datasets

    """Code for creating a table with the fields."""
    # dataset = client.create_dataset('development', friendly_name="clab_dev", description="Test database for CLABS")
    # schema_1 = [
    # {'name': 'Username', 'type': 'STRING', 'mode': 'required'},
    # {'name': 'Email', 'type': 'STRING', 'mode': 'nullable'},
    # {'name': 'Visits', 'type': 'INTEGER', 'mode': 'nullable'},
    # {'name': 'Password', 'type': 'STRING', 'mode': 'nullable'},
    # {'name': 'Location', 'type': 'STRING', 'mode': 'nullable'},
    # {'name': 'Device', 'type': 'STRING', 'mode': 'nullable'},
    # {'name': 'Screensize', 'type': 'INTEGER', 'mode': 'nullable'},
    # {'name': 'Browser', 'type': 'STRING', 'mode': 'nullable'},
    # {'name': 'Products', 'type': 'STRING', 'mode': 'nullable'},
    # {'name': 'Category', 'type': 'STRING', 'mode': 'nullable'}
Example #41
def get_datasets():
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False, swallow_results=False)
    datasets = client.get_datasets()
    return datasets
Example #42
import praw
import logging

logger = logging.getLogger()
logger.setLevel("INFO")
logging.basicConfig()

# Global helpers
c = r.connect(os.getenv("RETHINKDB_HOST", "rethinkdb.reddit-analyzer"), os.getenv("RETHINKDB_PORT", 28015))
UA = "/r/%s flair analyzer by /u/coffenbacher for https://charlesoffenbacher.com" % sys.argv[1]
pr = praw.Reddit(user_agent=UA)
db = r.db("reddit")

client = get_client(
    os.getenv("BIGQUERY_PROJECT_ID"),
    service_account=os.getenv("BIGQUERY_SERVICEACCOUNT_ID"),
    private_key_file=os.getenv("BIGQUERY_PRIVATEKEY_FILE"),
    readonly=False,
)

# Helper
def grouper(n, iterable, fillvalue=None):
    args = [iter(iterable)] * n
    return ([e for e in t if e != None] for t in itertools.izip_longest(*args))


def get_submissions_between_epochs(start, end, subreddit):
    logger.info(
        "Running search for range %s->%s" % (epoch(start).datetime.strftime("%x"), epoch(end).datetime.strftime("%x"))
    )
    query = "timestamp:%d..%d" % (start, end)
    return pr.search(query, subreddit=subreddit, sort="new", limit=1000, syntax="cloudsearch")
Example #43
# BigQuery parameters
proj = args.proj
json_key = args.json_key
out_ds = args.out_ds
table_loc_ungrouped = "tmp_loc_ungrouped"
table_sc_ungrouped = "tmp_sc_ungrouped"
table_loc = args.tb_loc
table_sc = args.tb_sc
table_skip = args.tb_skip

# CLOC executable
cloc_exec = args.cloc

# Using BigQuery-Python https://github.com/tylertreat/BigQuery-Python
print('\nGetting BigQuery client\n')
bq_client = get_client(json_key_file=json_key, readonly=False)

# Delete the final output tables if they exist
delete_bq_table(bq_client, out_ds, table_loc)
delete_bq_table(bq_client, out_ds, table_sc)

# Get SHAs already in ungrouped tables or skipped
existing_sha_loc = unique_vals(bq_client, proj, out_ds, table_loc_ungrouped,
                               "sha")
existing_sha_sc = unique_vals(bq_client, proj, out_ds, table_sc_ungrouped,
                              "sha")
if not Counter(existing_sha_loc) == Counter(existing_sha_sc):
    print(
        "Deleting tables %s and %s because they do not contain the same SHAs. Starting over."
        % (table_loc_ungrouped, table_sc_ungrouped))
    delete_bq_table(bq_client, out_ds, table_loc_ungrouped)
Example #44
def check_table(dataset, table):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False, swallow_results=False)
    result = client.check_table(dataset, table)
    return result
Example #45
import bigquery
import config
import pymongo
import datetime
import os

if __name__ == '__main__':
    project_id = os.environ.get('PROJECT_ID')
    service_account = os.environ.get('SERVICE_ACCOUNT')
    key = 'george-key.p12'
    client = bigquery.get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=True)
    tables = ['2007', '2008', '2009', '2010', '2011', '2012', 
              '2013', '2014', '2015_01', '2015_02', '2015_03', 
              '2015_04', '2015_05', '2015_06', '2015_07', 
              '2015_08', '2015_09']
    subreddits = ['"%s"' % item['subreddit'] 
                    for item in config.SUBREDDITS]
    for table in tables:
        print 'Querying table:', table
        query = 'SELECT * FROM [fh-bigquery:reddit_comments.{}] WHERE subreddit IN ({})'.format(table, ','.join(subreddits))
        job_id, _results = client.query(query)
        complete, row_count = client.check_job(job_id)
        if complete:
            print 'Downloading', table, row_count
            results = client.get_query_rows(job_id)
            records = [r for r in list(results)]
            for record in records:
                record['created_utc'] = datetime.datetime.utcfromtimestamp(record['created_utc'])
            db = pymongo.MongoClient()['reddit']
            if records:
Example #46
def get_table(dataset_ID, table):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=True)
    result = client.get_table(dataset_ID, table)
    return result
Example #47
File: pushdata.py  Project: joe753/Bigdata
import bigquery
import sys

client = bigquery.get_client(json_key_file='./bigquery.json', readonly=False)

DATABASE = "bqdb"
TABLE = "test"


if not client.check_table(DATABASE, TABLE):
    print("Create table {0}.{1}".format(DATABASE, TABLE), file=sys.stderr)

    client.create_table(DATABASE, TABLE, [
        {'name': 'songno', 'type': 'string', 'description': 'song id'},
        {'name': 'title', 'type': 'string', 'description': 'song title'},
        {'name': 'albumid', 'type': 'string', 'description': 'album id'}
    ])
            
ttt = [ {'songno': '444',  'albumid': '444444', 
        'rec': {'sub1':'abc4'}} ]

pushResult = client.push_rows(DATABASE, TABLE, ttt, insert_id_key='songno')

print("Pushed Result is", pushResult)
Example #48
    def __init__(self, url, num_reviews=5):
        # set num_reviews to -1 to scrape everything
        #super().__init__(url)
        super(AndroidCrawler, self).__init__(url)
        self.done = False
        self.url = url
        
        if url not in self.retries:
            self.retries[url] = 3

        self.result['name'] = self.pq('.document-title').text()
        self.check()
        self.result['store'] = 'Google Play'
        price = self.pq('span[itemprop="offers"] [itemprop="price"]').attr('content')
        if price == '0':
            price = 'Free'
        self.result['price'] = price
        self.result['app_id'] = url.split('id=')[1].split('&')[0]
        self.result['storeurl'] = url
        category = self.pq('a.document-subtitle.category')
        if category.length == 2:
            subcategory = category.eq(1).text()
        elif category.length == 1:
            subcategory = ''
        else:
            raise Exception('Something wrong with categories')
        self.result['category'] = category.eq(0).text()
        self.result['subcategory'] = subcategory
        self.result['icon'] = self.pq('.cover-container').children().attr('src')
        self.result['screenshots'] = [x.attr('src') for x in self.pq('.screenshot').items()]
        self.result['description'] = self.pq('.id-app-orig-desc').text()
        self.result['description'] = self.pq('div[itemprop="description"]').text()
        
        
        
        #print self.result['description'] 
        print "kam ban gya he "
        self.result['downloads'] = self.pq('div[itemprop="numDownloads"]').text()
        if not offline:
            self.result['permissions'] = self.__get_permissions(self.result['app_id'])
        else:
            self.result['permissions'] = []

        self.result['developer'] = self.pq('.primary').text()
        self.result['contentRating'] = self.pq('.meta-info.contains-text-link > .content').eq(0).text()
        try:
            self.result['developerWebsite'] = self.pq('.dev-link').attr('href').split('q=')[1].split('&')[0]
        except:
            self.result['developerWebsite'] = 'Nothing'
        siblings = self.pq('.dev-link')
        found = siblings.eq(1).text()
        
        if siblings.eq(2).text() == 'Privacy Policy' :
            self.result['DeveloperPrivacyPolicy'] = siblings.eq(2).attr('href')
            self.result['physicalAddress']=self.pq('div.content.physical-address').text()
        else :  
            self.result['physicalAddress']=self.pq('div.content.physical-address').text()
            self.result['DeveloperPrivacyPolicy']="Not Given"
        self.result['whatsNew'] = self.pq('div.recent-change').text()
        self.result['developerEmail'] = found
        
        if not offline:
            print "Getting Reviews"
            if num_reviews > 0:
                self.result['reviews'] = self.__get_reviews(self.result['app_id'], num_reviews)
            else:
                self.result['reviews'] = self.__get_reviews(self.result['app_id'])
        else:
            self.result['reviews'] = []
        self.result['version'] = self.pq('div[itemprop="softwareVersion"]').text()
        self.result['updated'] = self.pq('div[itemprop="datePublished"]').text()

        try:
            self.result['rating'] = self.pq('div[itemprop="ratingValue"]').text()+' - '+self.pq('div[itemprop="ratingCount"]').text()
        except:
            self.result['rating'] = '0'
        self.result['installations'] = self.pq('div[itemprop="numDownloads"]').text()
        self.result['LastUpdateDate'] = self.pq('div[itemprop="datePublished"]').text()
        self.result['AppSize'] = self.pq('div[itemprop="fileSize"]').text()
        self.result['MinimumOSVersion'] = self.pq('div[itemprop="operatingSystems"]').text()
        
        #self.result['contentRating'] = self.pq('div[itemprop="contentRating"]').text()
        ratingReason = self.pq('.meta-info.contains-text-link > .content').eq(1).text()
        if (ratingReason == 'Learn more'):
            self.result['RatingReason'] = ''
        else:
            self.result['RatingReason'] = ratingReason

        try:
            ratingText = self.pq('div[itemprop="aggregateRating"]').text().split(' ')
            self.result['ratingValue'] = ratingText[0]
            self.result['ratingCount'] = ratingText[1]
        except:
            self.result['ratingValue'] = ''
            self.result['ratingCount'] = ''
        #print self.result['AppSize']
        #print self.result['LastUpdateDate']
        #print self.result['installations']
        #print self.result['MinimumOSVersion']
        #print  self.result['contentRating'] 
        #print self.result['developerWebsite']
        #print self.result['developerEmail']
        #print self.result['physicalAddress']
        #print self.result['whatsNew']
        
        isfree = False
        if self.result['price'] == 'Free':
            isfree = True

        project_id = 'apps-1149'
        new_reviews= []
        new_reviews = self.result['reviews']
        #print new_reviews[1]['date']

        service_account = '*****@*****.**'# Service account email       
        json_key = 'sevice_key.json'# JSON key provided by Google


        # Inserting data into table.
        client = get_client(project_id, json_key_file=json_key, readonly=False)
        
        price = str(self.result['price'])
        if price == "Free":
            price_in_rupees = 0
        else :
            price_in_rupees= float(price[2:])
       
        
        rows = []

        try:
            rows =  [
            {
            'Name': str(self.result['name']),'Store': str(self.result['store']), 'Price': price_in_rupees, 'IsFree': isfree ,
            'App_Id':str(self.result['app_id']),'Store_Url':str(self.result['storeurl']),'Category':str(self.result['category']) , 
            'subCategory':str(self.result['subcategory']) , 'Icon': str(self.result['icon']), 'Screenshots': str(self.result['screenshots']), 
            'Description' :self.result['description'], 'Downloads':str(self.result['downloads']), 'Permissions': str(self.result['permissions']), 
            'Developer': str(self.result['developer']), 'contentRating': str(self.result['contentRating']) , 
            'developerWebsite': str(self.result['developerWebsite']) ,'Reviews': str(self.result['reviews']) ,'reviewDate':str(new_reviews[0]['date']) ,
            'reviewTitle': str(new_reviews[1]['title']) , 
            'reviewTexts' :str(new_reviews[1]['text']), 'Version' : str(self.result['version']),'Updates': str(self.result['updated']), 'Rating' : str(self.result['rating']), 
            'appSize':self.result['AppSize'], 'lastUpdateDate': self.result['LastUpdateDate'] , 'installations': self.result['installations'], 
            'MinimumOSVersion': self.result['MinimumOSVersion'], 'developerEmail': self.result['developerEmail'], 'whatsNew': self.result['whatsNew'],
            'physicalAddress': self.result['physicalAddress'], 'RatingReason' : self.result['RatingReason'], 'RatingCount' : self.result['ratingCount'],
            'RatingValue' : self.result['ratingValue']
            }
            ]
        except:
            print "Incomplete"
            rows =  [
            {
            'Name': str(self.result['name']),'Store': str(self.result['store']), 'Price': price_in_rupees, 'IsFree': isfree ,
            'App_Id':str(self.result['app_id']),'Store_Url':str(self.result['storeurl']),'Category':str(self.result['category']) , 
            'subCategory':str(self.result['subcategory']) , 'Icon': str(self.result['icon']), 'Screenshots': str(self.result['screenshots']), 
            'Description' :self.result['description'], 'Downloads':str(self.result['downloads']), 'Permissions': str(self.result['permissions']), 
            'Developer': str(self.result['developer']), 'contentRating': str(self.result['contentRating']), 
            'developerWebsite': str(self.result['developerWebsite']) ,'Reviews': str(self.result['reviews']), 
            'Version' : str(self.result['version']),'Updates': str(self.result['updated']), 'Rating' : str(self.result['rating']), 
            'appSize':self.result['AppSize'], 'lastUpdateDate': self.result['LastUpdateDate'] , 'installations': self.result['installations'], 
            'MinimumOSVersion': self.result['MinimumOSVersion'], 'developerEmail': self.result['developerEmail'], 'whatsNew': self.result['whatsNew'],
            'physicalAddress': self.result['physicalAddress'], 'RatingReason' : self.result['RatingReason'], 'RatingCount' : self.result['ratingCount'],
            'RatingValue' : self.result['ratingValue']
            }
            ]

        self.done = client.push_rows('Temp', 'AndroidTest', rows, 'id')

#print "success"
Example #49
proj = args.proj
json_key = args.json_key
dataset = args.ds
table = args.table
sheet = args.sheet
gh_username = args.gh_username
gh_oauth_key = args.gh_oauth_key
 
# Get repo names
print("Getting repo names from spreadsheet")
repos = get_repo_names(sheet, json_key)
print("There are %s repos with use_repo = 1.\n" % len(repos))

# Using BigQuery-Python https://github.com/tylertreat/BigQuery-Python
print('\nGetting BigQuery client\n')
client = get_client(json_key_file=json_key, readonly=False, swallow_results=True)
  
# Check which repos are already in the table
existing_repos = unique_vals(client, proj, dataset, table, "repo_name")
if len(existing_repos) > 0:
    repos = [repo for repo in repos if repo not in existing_repos]
    print("Only getting data for %s repos not yet analyzed" %len(repos))

# Table schema
schema = [
    {'name': 'repo_name', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_sha', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_api_url', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_html_url', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_comments_url', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'commit_message', 'type': 'STRING', 'mode': 'NULLABLE'},
Example #50
# Check the BigQuery Python Setup Steps.txt for setup steps

from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
from bigquery import get_client

from googleapiclient import http
from bigquery.errors import BigQueryTimeoutException
from bigquery import JOB_SOURCE_FORMAT_CSV

credentials = GoogleCredentials.get_application_default()

project_id = 'YOUR PROJECTID'
dataset = 'YOUR DATASET'
service = discovery.build('storage', 'v1', credentials=credentials)
client = get_client(project_id, credentials=credentials)


#Step 1: Upload local file to google cloud storage (GCS)
filename = 'YOUR FILE PATH'
bucket = 'YOUR GCS Bucket' #Note: This should not contain gs:// part

destination_file_path='Destination file path inside the bucket'
body = {'name': destination_file_path}
req = service.objects().insert(bucket=bucket, body=body, media_body=filename)
resp = req.execute()


#Step 2: Load file in GCS to a table in big query

#Drop table if exists