def get_tweets():
    bq_service = BigQueryService()
    print("LIMIT:", LIMIT)
    job = Job()

    tweets = []
    job.start()
    for row in bq_service.fetch_labeled_tweets_in_batches(limit=LIMIT):
        tweets.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()

    print("FETCHED TWEETS:", fmt_n(len(tweets)))
    return DataFrame(tweets)
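
# NOTE: fmt_n is not defined in this section. A minimal sketch of what such a
# number-formatting helper might look like, assuming it only adds thousands
# separators for readable console output (e.g. 1234567 -> "1,234,567").
# This is an illustrative assumption, not the project's actual implementation.
def fmt_n(number):
    return f"{number:,}"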
def download_data():
    job = Job()
    bq_service = BigQueryService()

    job.start()
    records = []
    for row in bq_service.fetch_user_details_vq(limit=LIMIT):
        #print(row)
        records.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()

    return DataFrame(records)
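
# NOTE: the Job helper used above is not shown in this section. A minimal
# sketch of the progress-tracking pattern the loops rely on (a counter,
# start/end timestamps, and periodic progress reports), assuming behavior
# from how it is called here. This is an illustration of the pattern, not
# the project's actual Job class.
from datetime import datetime

class Job:
    def __init__(self):
        self.counter = 0
        self.start_at = None

    def start(self):
        self.start_at = datetime.now()
        print("JOB STARTED AT:", self.start_at)

    def progress_report(self):
        elapsed = (datetime.now() - self.start_at).total_seconds()
        rate = self.counter / elapsed if elapsed else 0
        print(f"PROCESSED {self.counter:,} ROWS ({rate:,.0f} rows/sec)")

    def end(self):
        self.progress_report()
        print("JOB ENDED AT:", datetime.now())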
# TODO: de-dup RTs so the model will only train/test on a single RT status text (PREVENT OVERFITTING)
if os.path.exists(tweets_csv_filepath) and not DESTRUCTIVE:
    print("LOADING TWEETS...")
    statuses_df = read_csv(tweets_csv_filepath)
else:
    print("DOWNLOADING TWEETS...")
    job.start()
    statuses = []
    for row in bq_service.fetch_daily_active_tweeter_statuses(date=DATE, tweet_min=TWEET_MIN, limit=LIMIT):
        statuses.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()

    statuses_df = DataFrame(statuses)
    del statuses
    statuses_df.to_csv(tweets_csv_filepath)

print("STATUSES:", fmt_n(len(statuses_df)))

#
# MAKE GRAPH
#
# TODO: export graph as CSV format for TZ
# and optionally also construct the gpickle and json graph objects

local_nodes_csv_filepath = os.path.join(storage.local_dirpath, "active_nodes.csv")
local_graph_csv_filepath = os.path.join(storage.local_dirpath,