def __init__(self, users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, pg_destructive=PG_DESTRUCTIVE, bq_service=None):
    self.bq_service = bq_service or BigQueryService()

    if users_limit:
        self.users_limit = int(users_limit)
    else:
        self.users_limit = None
    self.tweets_limit = self.users_limit # TODO: combine with users_limit into a more generic rows_limit, since we usually run one script or the other and can reset the var between runs

    self.batch_size = batch_size
    self.pg_destructive = pg_destructive

    self.pg_engine = db
    self.pg_session = BoundSession()

    print("-------------------------")
    print("PG PIPELINE...")
    print(" USERS LIMIT:", self.users_limit)
    print(" BATCH SIZE:", self.batch_size)
    #print(" BQ SERVICE:", type(self.bq_service))
    #print(" PG SESSION:", type(self.pg_session))
    print(" PG DESTRUCTIVE:", self.pg_destructive)

def cautiously_initialized_storage_service():
    service = BigQueryService()
    print("-------------------------")
    print("BQ CONFIG...")
    print(" BIGQUERY DATASET:", service.dataset_address.upper())
    print(" DESTRUCTIVE MIGRATIONS:", service.destructive)
    print(" VERBOSE QUERIES:", service.verbose)
    print("-------------------------")
    print("WORKER CONFIG...")
    print(" MIN USER ID:", MIN_ID)
    print(" MAX USER ID:", MAX_ID)
    print(" USERS LIMIT:", LIMIT)
    print(" MAX THREADS:", MAX_THREADS)
    print(" BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")
    print("SCRAPER CONFIG...")
    print(" VERBOSE SCRAPER:", VERBOSE_SCRAPER)
    print(" MAX FRIENDS:", MAX_FRIENDS)
    print("-------------------------")
    if APP_ENV == "development":
        if input("CONTINUE? (Y/N): ").upper() != "Y":
            print("EXITING...")
            exit()
    #service.init_tables() # did this originally, but moving to a manual migration strategy to prevent accidental deletions
    return service

def __init__(self, topic=TOPIC, tweets_start_at=TWEETS_START_AT, tweets_end_at=TWEETS_END_AT,
             users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, storage_dirpath=None, bq_service=None):
    Job.__init__(self)
    GraphStorage.__init__(self, dirpath=storage_dirpath)
    self.bq_service = bq_service or BigQueryService()
    self.fetch_edges = self.bq_service.fetch_retweet_edges_in_batches_v2 # just being less verbose. feels like javascript

    # CONVERSATION PARAMS (OPTIONAL)
    self.topic = topic
    self.tweets_start_at = tweets_start_at
    self.tweets_end_at = tweets_end_at

    # PROCESSING PARAMS
    self.users_limit = users_limit
    if self.users_limit:
        self.users_limit = int(self.users_limit)
    self.batch_size = int(batch_size)

    print("-------------------------")
    print("RETWEET GRAPHER...")
    print(" USERS LIMIT:", self.users_limit)
    print(" BATCH SIZE:", self.batch_size)
    print(" DRY RUN:", DRY_RUN)
    print("-------------------------")
    print("CONVERSATION PARAMS...")
    print(" TOPIC:", self.topic)
    print(" TWEETS START:", self.tweets_start_at)
    print(" TWEETS END:", self.tweets_end_at)

    seek_confirmation()

def test_upload_in_batches():
    bq_service = BigQueryService(dataset_name="impeachment_test")

    # when inserting more than 10,000 rows, the service should overcome the
    # "too many rows present in the request, limit: 10000" error by uploading in batches:
    lots_of_rows = [{"start_date": "2020-01-01", "user_id": i, "bot_probability": .99} for i in range(1, 36000)]
    errors = bq_service.upload_daily_bot_probabilities(lots_of_rows)
    assert not any(errors)

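The assertion above depends on the upload method chunking rows to stay under BigQuery's 10,000-rows-per-insert limit. A minimal sketch of that generic chunking pattern follows; the helper name and the insert_rows_fn callable are illustrative assumptions, not the project's actual code:

# hedged sketch: generic chunking pattern; insert_rows_fn is a hypothetical stand-in
# for whatever actually performs the BigQuery insert for each partition
def upload_in_batches(rows, insert_rows_fn, batch_size=10_000):
    errors = []
    for i in range(0, len(rows), batch_size):
        batch = rows[i:i + batch_size]  # at most batch_size rows per request
        errors += insert_rows_fn(batch)
    return errors
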
def __init__(self, model_name=MODEL_NAME, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None):
    self.model_name = model_name.lower().replace(";", "") # using this model name in queries, so be super safe about SQL injection, although it's not a concern right now
    self.limit = limit
    self.batch_size = batch_size
    self.bq_service = bq_service or BigQueryService()

    print("----------------")
    print("TOXICITY SCORER...")
    print(" MODEL:", self.model_name.upper())
    print(" LIMIT:", fmt_n(self.limit))
    print(" BATCH SIZE:", fmt_n(self.batch_size))

def create_app():
    app = Flask(__name__)
    CORS(app) # CORS(app, resources={r"/api/*": {"origins": "*"}})
    app.config["SECRET_KEY"] = SECRET_KEY
    app.config["BQ_SERVICE"] = BigQueryService(cautious=False)
    #app.config.from_mapping(SECRET_KEY=SECRET_KEY, BQ_SERVICE=BigQueryService())
    app.register_blueprint(api_v0_routes)
    app.register_blueprint(api_v1_routes)
    return app

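For local development, a minimal sketch of serving the factory above; the host and port values are illustrative, and Flask's built-in server is only suitable for development:

# hypothetical entrypoint, e.g. at the bottom of the same module
if __name__ == "__main__":
    app = create_app()
    app.run(host="0.0.0.0", port=5000, debug=True)  # Flask dev server only; use a WSGI server in production
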
def __init__(self, bq=None, topic=TOPIC, pval_max=PVAL_MAX, results_csv_filepath=RESULTS_CSV_FILEPATH):
    self.topic = topic
    self.bq = bq or BigQueryService()
    self.x = []
    self.y = []
    self.pval_max = pval_max
    self.interpret_ks = interpret
    self.results_csv_filepath = results_csv_filepath

def get_tweets():
    bq_service = BigQueryService()
    print("LIMIT:", LIMIT)
    job = Job()

    tweets = []
    job.start()
    for row in bq_service.fetch_labeled_tweets_in_batches(limit=LIMIT):
        tweets.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()
    print("FETCHED TWEETS:", fmt_n(len(tweets)))
    return DataFrame(tweets)

def download_data():
    job = Job()
    bq_service = BigQueryService()

    job.start()
    records = []
    for row in bq_service.fetch_user_details_vq(limit=LIMIT):
        #print(row)
        records.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()

    return DataFrame(records)

def __init__(self, bq_service=None, twitter_service=None, user_limit=USER_LIMIT, friend_limit=FRIEND_LIMIT):
    self.bq_service = bq_service or BigQueryService()
    self.twitter_service = twitter_service or TwitterService()
    self.dataset_address = self.bq_service.dataset_address
    self.user_limit = int(user_limit)
    self.friend_limit = int(friend_limit)

    print("---------------------------")
    print("JOB: FRIEND LOOKUPS")
    print("DATASET:", self.dataset_address.upper())
    print("USER LIMIT:", self.user_limit)
    print("FRIEND LIMIT:", self.friend_limit)
    print("---------------------------")

def __init__(self, bq_service=None, twitter_service=None, user_limit=USER_LIMIT, status_limit=STATUS_LIMIT):
    self.bq_service = bq_service or BigQueryService()
    self.twitter_service = twitter_service or TwitterService()
    self.dataset_address = self.bq_service.dataset_address
    self.user_limit = int(user_limit)
    self.status_limit = int(status_limit)
    self.parse_status = parse_timeline_status

    print("---------------------------")
    print("JOB: TIMELINE LOOKUPS")
    print("DATASET:", self.dataset_address.upper())
    print("USER LIMIT:", self.user_limit)
    print("STATUS LIMIT:", self.status_limit)
    print("---------------------------")

def __init__(self, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None, model_manager=None):
    self.limit = limit
    self.batch_size = batch_size
    self.bq_service = bq_service or BigQueryService()
    self.mgr = model_manager or ModelManager()

    print("----------------")
    print("TOXICITY SCORER...")
    print(" MODEL CHECKPOINT:", self.mgr.checkpoint_name.upper(), self.mgr.checkpoint_url)
    print(" SCORES TABLE NAME:", self.scores_table_name)
    print(" LIMIT:", fmt_n(self.limit))
    print(" BATCH SIZE:", fmt_n(self.batch_size))

    self.predict = self.mgr.predict_scores # method alias

    seek_confirmation()

def __init__(self, bq_service=None, bot_min=BOT_MIN, batch_size=BATCH_SIZE, storage_dirpath=None):
    self.bq_service = bq_service or BigQueryService()
    self.bot_min = bot_min
    self.batch_size = batch_size

    Job.__init__(self)

    storage_dirpath = storage_dirpath or f"bot_follower_graphs/bot_min/{self.bot_min}"
    GraphStorage.__init__(self, dirpath=storage_dirpath)

    print("-------------------------")
    print("BOT FOLLOWER GRAPHER...")
    print(" BOT MIN:", self.bot_min)
    print(" BATCH SIZE:", self.batch_size)
    print("-------------------------")

    seek_confirmation()

def __init__(self, users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, pg_destructive=PG_DESTRUCTIVE, bq_service=None):
    self.bq_service = bq_service or BigQueryService()

    if users_limit:
        self.users_limit = int(users_limit)
    else:
        self.users_limit = None

    self.batch_size = batch_size
    self.pg_destructive = pg_destructive

    self.pg_engine = db
    self.pg_session = BoundSession()

    print("-------------------------")
    print("PG PIPELINE...")
    print(" USERS LIMIT:", self.users_limit)
    print(" BATCH SIZE:", self.batch_size)
    #print(" BQ SERVICE:", type(self.bq_service))
    #print(" PG SESSION:", type(self.pg_session))
    print(" PG DESTRUCTIVE:", self.pg_destructive)

def __init__(self, bq_service=None, week_id=WEEK_ID):
    bq_service = bq_service or BigQueryService()
    self.week_id = week_id

    print("--------------------")
    print("FETCHING WEEKS...")
    self.weeks = [RetweetWeek(row) for row in list(bq_service.fetch_retweet_weeks())]
    for week in self.weeks:
        print(" ", week.details)

    print("--------------------")
    print("SELECTING A WEEK...")
    if not self.week_id:
        # assumes you know what you're doing when setting WEEK_ID in production!
        # after running this once you'll see what all the week ids are.
        self.week_id = input("PLEASE SELECT A WEEK (E.G. '2019-52', '2020-01', ETC.): ")

    try:
        self.week = [wk for wk in self.weeks if wk.week_id == self.week_id][0]
        print(" ", self.week.details)
    except IndexError:
        print("OOPS - PLEASE CHECK WEEK ID AND TRY AGAIN...")
        exit()

    self.tweets_start_at = self.week.row.min_created
    self.tweets_end_at = self.week.row.max_created

    seek_confirmation()

    storage_service = self.init_storage_service(self.week_id)
    super().__init__(bq_service=bq_service, storage_service=storage_service)

def __init__(self, twitter_service=None, storage_env=STORAGE_ENV, bq_service=None, csv_service=None, batch_size=BATCH_SIZE):
    self.twitter_service = twitter_service or TwitterService()
    self.api = self.twitter_service.api
    self.auth = self.api.auth
    self.parse_status = parse_status

    self.storage_env = storage_env
    if self.storage_env == "local":
        self.storage_service = csv_service or LocalStorageService()
    elif self.storage_env == "remote":
        self.storage_service = bq_service or BigQueryService()
    else:
        raise ValueError("Expecting the STORAGE_ENV to be 'local' or 'remote'. Please try again...")

    self.batch_size = batch_size
    self.batch = []
    self.counter = 0

    print("-------------------------------")
    print("STREAM LISTENER...")
    print(" STORAGE ENV:", self.storage_env.upper())
    print(" STORAGE SERVICE:", type(self.storage_service))
    print(" BATCH SIZE:", self.batch_size)
    print("--------------------------------")

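To show how the batch and counter set up above are typically used, here is a hedged sketch of a flush step on a listener like this, assuming a tweepy-style on_status callback; the persistence method name is a hypothetical stand-in, not the project's actual API:

# hedged sketch; save_batch is a hypothetical method on the storage service
def on_status(self, status):
    self.counter += 1
    self.batch.append(self.parse_status(status))
    if len(self.batch) >= self.batch_size:
        self.storage_service.save_batch(self.batch)  # hypothetical persistence call
        self.batch = []
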
def __init__(self, users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, storage_service=None, bq_service=None):
    super().__init__(users_limit=users_limit, batch_size=batch_size, storage_service=storage_service)
    self.bq_service = bq_service or BigQueryService()

from app.bq_service import BigQueryService

if __name__ == "__main__":

    bq_service = BigQueryService(verbose=True)

    print("FLATTENING USER FRIENDS TABLE...")
    bq_service.destructively_migrate_user_friends_flat()

    print("BOTS ABOVE 80...")
    bq_service.destructively_migrate_bots_table()

    print("BOT FOLLOWERS ABOVE 80...")
    bq_service.destructively_migrate_bot_followers_table()

def __init__(self):
    self.twitter_api = TwitterService().api
    self.bq_service = BigQueryService()
    self.limit = STATUS_LIMIT
    self.batch_size = BATCH_SIZE

print(" TWEET_MIN:", TWEET_MIN) print(" LIMIT:", LIMIT) print(" BATCH_SIZE:", BATCH_SIZE) print(" DESTRUCTIVE:", DESTRUCTIVE) #print(" GRAPH_LIMIT:", GRAPH_LIMIT) print(" GRAPH_BATCH_SIZE:", GRAPH_BATCH_SIZE) print(" GRAPH_DESTRUCTIVE:", GRAPH_DESTRUCTIVE) print("------------------------") storage = FileStorage( dirpath=f"daily_active_friend_graphs_v4/{DATE}/tweet_min/{TWEET_MIN}") tweets_csv_filepath = os.path.join(storage.local_dirpath, "tweets.csv") bq_service = BigQueryService() job = Job() # # LOAD TWEETS # tweet_id, text, screen_name, bot, created_at # TODO: de-dup RTs so the model will only train/test on a single RT status text (PREVENT OVERFITTING) if os.path.exists(tweets_csv_filepath) and not DESTRUCTIVE: print("LOADING TWEETS...") statuses_df = read_csv(tweets_csv_filepath) else: job.start() print("DOWNLOADING TWEETS...") statuses = [] for row in bq_service.fetch_daily_active_tweeter_statuses(
def __init__(self, bq_service=None, gcs_service=None):
    super().__init__(gcs_service=gcs_service)
    self.bq_service = bq_service or BigQueryService()