def write_to_bq(bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    twstring = ''
    tweet = None
    mtweet = None
    while True:
        while len(tweets) < CHUNK:
            # We'll use a blocking list pop -- it returns when there is
            # new data.
            res = r.brpop(REDIS_LIST)
            twstring = res[1]
            try:
                tweet = json.loads(res[1])
            except Exception as bqe:
                print bqe
                continue
            # First do some massaging of the raw data
            mtweet = utils.cleanup(tweet)
            # We only want to write tweets to BigQuery; we'll skip 'delete' and
            # 'limit' information.
            if 'delete' in mtweet:
                continue
            if 'limit' in mtweet:
                print mtweet
                continue
            tweets.append(mtweet)
        # try to insert the tweets into bigquery
        utils.bq_data_insert(bigquery, PROJECT_ID, os.environ['BQ_DATASET'],
                             os.environ['BQ_TABLE'], tweets)
        tweets = []
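
# Note: write_to_bq() above relies on a module-level Redis client `r` and the
# list name REDIS_LIST, which are not part of this listing. A minimal sketch
# of that setup, assuming the standard redis-py client and hypothetical
# REDIS_HOST/REDIS_PORT environment variables, might look like this:
import os
import redis

REDIS_LIST = 'tweet-queue'  # hypothetical list name, not from the original
r = redis.StrictRedis(host=os.environ.get('REDIS_HOST', 'localhost'),
                      port=int(os.environ.get('REDIS_PORT', 6379)))
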
def write_to_bq(pubsub, sub_name, bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    # If no data on the subscription, the time to sleep in seconds
    # before checking again.
    WAIT = 2
    tweet = None
    mtweet = None
    while True:
        while len(tweets) < CHUNK:
            twmessages = pull_messages(pubsub, PROJECT_ID, sub_name)
            if twmessages:
                for res in twmessages:
                    try:
                        tweet = json.loads(res)
                    except Exception as bqe:
                        print bqe
                        # Skip messages that don't parse as JSON.
                        continue
                    # First do some massaging of the raw data
                    mtweet = utils.cleanup(tweet)
                    # We only want to write tweets to BigQuery; we'll skip
                    # 'delete' and 'limit' information.
                    if 'delete' in mtweet:
                        continue
                    if 'limit' in mtweet:
                        print mtweet
                        continue
                    tweets.append(mtweet)
            else:
                # pause before checking again
                print 'sleeping...'
                time.sleep(WAIT)
        utils.bq_data_insert(bigquery, PROJECT_ID, os.environ['BQ_DATASET'],
                             os.environ['BQ_TABLE'], tweets)
        tweets = []
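
# pull_messages() is referenced by the Pub/Sub variants but is not shown in
# this listing. A rough sketch of one plausible implementation, assuming
# `pubsub` is a discovery-based Pub/Sub v1 client
# (googleapiclient.discovery.build('pubsub', 'v1')): pull a batch, ack it,
# and return the message payloads. The batch size and helper shape are
# assumptions; note that this REST API returns the 'data' field
# base64-encoded, so either the helper or the caller has to decode it.
def pull_messages(pubsub, project_id, sub_name):
    """Pull a batch of messages from a subscription and acknowledge them."""
    subscription = 'projects/%s/subscriptions/%s' % (project_id, sub_name)
    resp = pubsub.projects().subscriptions().pull(
        subscription=subscription, body={'maxMessages': 100}).execute()
    received = resp.get('receivedMessages', [])
    if not received:
        return []
    ack_ids = [m['ackId'] for m in received]
    pubsub.projects().subscriptions().acknowledge(
        subscription=subscription, body={'ackIds': ack_ids}).execute()
    return [m['message'].get('data', '') for m in received]
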
def write_to_bq(pubsub_sub, pubsub_pub, sub_name, bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    WAIT = 2  # Sleep time in seconds if no data

    while True:
        while len(tweets) < CHUNK:
            twmessages = pull_messages(pubsub_sub, PROJECT_ID, sub_name)
            if twmessages:
                for res in twmessages:
                    try:
                        tweet = json.loads(res)

                        if tweet.get('id') is None:
                            logging.error(f'Tweet Parse: Missing ID - {res}')
                            raise ValueError('Missing Tweet ID')

                        mtweet = utils.cleanup(tweet)
                        tweets.append(mtweet)
                    except Exception as bqe:
                        logging.error(f'Tweet Parse: Error - {bqe}')
            else:
                time.sleep(WAIT)

        utils.bq_data_insert(bigquery, PROJECT_ID, os.environ['BQ_DATASET'],
                             os.environ['BQ_TABLE'], tweets)

        tweets = []
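
# utils.cleanup() is used throughout but not shown, so its exact behavior is
# unknown. Judging from how its result is used (checking for 'delete'/'limit'
# keys, then streaming the row to BigQuery), it presumably flattens a raw
# tweet into a dict matching the table schema. A purely illustrative
# stand-in -- the selected fields are guesses, not the real schema:
def cleanup_sketch(tweet):
    """Reduce a raw tweet dict to a flat row (illustrative only)."""
    if not isinstance(tweet, dict):
        return None
    if 'delete' in tweet or 'limit' in tweet:
        # Pass control records through unchanged so callers can skip them.
        return tweet
    return {
        'id': tweet.get('id'),
        'created_at': tweet.get('created_at'),
        'text': tweet.get('text'),
        'user_screen_name': (tweet.get('user') or {}).get('screen_name'),
    }
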
def write_to_bq(bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    tweet = None
    mtweet = None
    count = 0
    count_max = 50000
    redis_errors = 0
    allowed_redis_errors = 3
    while count < count_max:
        while len(tweets) < CHUNK:
            # We'll use a blocking list pop -- it returns when there is
            # new data.
            res = None
            try:
                res = r.brpop(REDIS_LIST)
            except Exception:
                print 'Problem getting data from Redis.'
                redis_errors += 1
                if redis_errors > allowed_redis_errors:
                    print "Too many redis errors: exiting."
                    return
                continue
            try:
                tweet = json.loads(res[1])
            except Exception as e:
                print e
                redis_errors += 1
                if redis_errors > allowed_redis_errors:
                    print "Too many redis-related errors: exiting."
                    return
                continue
            # First do some massaging of the raw data
            mtweet = utils.cleanup(tweet)
            # We only want to write tweets to BigQuery; we'll skip 'delete' and
            # 'limit' information.
            if 'delete' in mtweet:
                continue
            if 'limit' in mtweet:
                continue
            tweets.append(mtweet)
        # try to insert the tweets into bigquery
        response = utils.bq_data_insert(bigquery, PROJECT_ID,
                                        os.environ['BQ_DATASET'],
                                        os.environ['BQ_TABLE'], tweets)
        tweets = []
        count += 1
        if count % 25 == 0:
            print("processing count: %s of %s at %s: %s" %
                  (count, count_max, datetime.datetime.now(), response))
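
# utils.bq_data_insert() is likewise not shown. Assuming `bigquery` is a
# discovery-based BigQuery v2 client, a streaming insert of the accumulated
# rows could be done with tabledata().insertAll() roughly as sketched below;
# this is one plausible implementation, not the original helper.
def bq_data_insert_sketch(bigquery, project_id, dataset, table, tweets):
    """Stream a batch of rows into the given BigQuery table."""
    body = {'rows': [{'json': row} for row in tweets]}
    response = bigquery.tabledata().insertAll(
        projectId=project_id, datasetId=dataset,
        tableId=table, body=body).execute()
    # Per-row failures, if any, are reported under 'insertErrors'.
    return response
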
def write_to_bq(pubsub, sub_name, bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    # If no data on the subscription, the time to sleep in seconds
    # before checking again.
    WAIT = 2
    tweet = None
    mtweet = None
    count = 0
    count_max = 50000
    while count < count_max:
        while len(tweets) < CHUNK:
            twmessages = pull_messages(pubsub, PROJECT_ID, sub_name)
            if twmessages:
                for res in twmessages:
                    print(res)
                    decoded_res = base64.urlsafe_b64decode(res)
                    print(decoded_res)
                    try:
                        tweet = json.loads(decoded_res)
                        print(tweet)
                    except Exception as bqe:
                        print bqe
                        # Skip messages that don't parse as JSON.
                        continue
                    # First do some massaging of the raw data
                    mtweet = utils.cleanup(tweet)
                    # We only want to write tweets to BigQuery; we'll skip
                    # 'delete' and 'limit' information.
                    if not mtweet:
                        continue
                    if 'delete' in mtweet:
                        continue
                    if 'limit' in mtweet:
                        continue
                    tweets.append(mtweet)
            else:
                # pause before checking again
                print 'sleeping...'
                time.sleep(WAIT)
        response = utils.bq_data_insert(bigquery, PROJECT_ID,
                                        os.environ['BQ_DATASET'],
                                        os.environ['BQ_TABLE'], tweets)
        tweets = []
        count += 1
        if count % 25 == 0:
            print("processing count: %s of %s at %s: %s" %
                  (count, count_max, datetime.datetime.now(), response))
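
# In the variant above, pull_messages() evidently returns the raw
# base64-encoded 'data' field of each Pub/Sub message, so the caller decodes
# it before parsing. A tiny round-trip illustration with a made-up payload:
import base64
import json

sample = {'ackId': 'abc', 'message': {'data': 'eyJpZCI6IDF9', 'messageId': '1'}}
decoded = base64.urlsafe_b64decode(sample['message']['data'])
print(json.loads(decoded.decode('utf-8')))  # {u'id': 1} on Python 2, {'id': 1} on Python 3
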
Example #8
                        print "NLP result - sentiment magnitude:", response[
                            'documentSentiment']['magnitude']
                        mtweet['sentiment_score'] = response[
                            'documentSentiment']['score']
                        mtweet['sentiment_magnitude'] = response[
                            'documentSentiment']['magnitude']
                    except Exception as e:
                        print e
                        print "Unsupported language, skipping the tweet"
                        continue
                    tweets.append(mtweet)
            else:
                # pause before checking again
                print 'sleeping...'
                time.sleep(WAIT)
        response = utils.bq_data_insert(bigquery, PROJECT_ID,
                                        os.environ['BQ_DATASET'],
                                        os.environ['BQ_TABLE'], tweets)
        tweets = []
        count += 1
        if count % 25 == 0:
            print("processing count: %s of %s at %s: %s" %
                  (count, count_max, datetime.datetime.now(), response))
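
# The fragment above reads sentiment from a `response` dict with a
# 'documentSentiment' key, which matches the shape returned by the Cloud
# Natural Language API's documents.analyzeSentiment method. The call that
# produces it is not shown; one plausible way to obtain it with the
# discovery-based client is sketched here (`nlp_service` is an assumed name):
def analyze_sentiment_sketch(nlp_service, text):
    """Call the Natural Language API and return the raw sentiment response."""
    body = {'document': {'type': 'PLAIN_TEXT', 'content': text}}
    # This raises for unsupported languages, which the caller above catches.
    return nlp_service.documents().analyzeSentiment(body=body).execute()
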


if __name__ == '__main__':
    topic_info = PUBSUB_TOPIC.split('/')
    topic_name = topic_info[-1]
    sub_name = "tweets-%s" % topic_name
    print "starting write to BigQuery...."
    credentials = utils.get_credentials()
    bigquery = utils.create_bigquery_client(credentials)
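
# utils.get_credentials() and utils.create_bigquery_client() are not shown in
# this listing. Assuming Application Default Credentials and the
# discovery-based client library used elsewhere in these examples, roughly
# equivalent helpers could look like this (a sketch, not the originals):
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

def get_credentials_sketch():
    """Fetch Application Default Credentials."""
    return GoogleCredentials.get_application_default()

def create_bigquery_client_sketch(credentials):
    """Build a discovery-based BigQuery v2 client."""
    return discovery.build('bigquery', 'v2', credentials=credentials)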