def __init__(self, dirpath=None, gcs_service=None, wifi=WIFI):
    """
    Saves and loads files, using local storage and/or Google Cloud Storage.

    Params:
        dirpath (str) a subpath of the data dir
        wifi (bool) whether or not to attempt uploads
    """
    self.wifi = wifi
    self.gcs_service = gcs_service or GoogleCloudStorageService()
    self.dirpath = dirpath or DIRPATH
    self.gcs_dirpath = self.compile_gcs_dirpath(self.dirpath)
    self.local_dirpath = self.compile_local_dirpath(self.dirpath)

    #print("-------------------------")
    print("FILE STORAGE...")
    print("  DIRPATH:", self.dirpath)
    print("  GCS DIRPATH:", self.gcs_dirpath)
    print("  LOCAL DIRPATH:", os.path.abspath(self.local_dirpath))
    print("  WIFI ENABLED:", self.wifi)

    seek_confirmation()

    if not os.path.exists(self.local_dirpath):
        os.makedirs(self.local_dirpath)
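# For reference, a minimal sketch of the compile_* path helpers used above (an
# assumption inferred from the sibling GraphStorage class, which joins
# "storage/data" for GCS paths and DATA_DIR for local paths; not necessarily
# the repo's actual implementation):

def compile_gcs_dirpath(self, dirpath):
    return os.path.join("storage", "data", dirpath)  # e.g. "storage/data/my_subpath"

def compile_local_dirpath(self, dirpath):
    return os.path.join(DATA_DIR, dirpath)  # e.g. "/path/to/repo/data/my_subpath"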
def __init__(self, n_clusters=N_COMMUNITIES):
    self.n_clusters = n_clusters

    self.classifier = SpectralClustering(n_clusters=self.n_clusters, eigen_solver=None, affinity="precomputed", n_init=20)

    self.grapher = BotSimilarityGrapher()
    self.local_dirpath = os.path.join(self.grapher.local_dirpath, "n_communities", str(self.n_clusters))
    self.gcs_dirpath = os.path.join(self.grapher.gcs_dirpath, "n_communities", str(self.n_clusters))

    self.local_bot_communities_filepath = os.path.join(self.local_dirpath, "community_assignments.csv")
    self.gcs_bot_communities_filepath = os.path.join(self.gcs_dirpath, "community_assignments.csv")

    print("-----------------------")
    print("SPECTRAL CLUSTERMAKER")
    print("  N CLUSTERS:", self.n_clusters)
    print("  CLASSIFIER:", type(self.classifier))
    print("  LOCAL DIRPATH:", os.path.abspath(self.local_dirpath))
    print("  GCS DIRPATH:", self.gcs_dirpath)

    seek_confirmation()

    if not os.path.exists(self.local_dirpath):
        os.makedirs(self.local_dirpath)

    self.grapher.similarity_graph_report()  # load bot similarity graph
    self.similarity_graph = self.grapher.similarity_graph

    self.community_assignments = None
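# A sketch (assumed usage, not the repo's actual method) of how the classifier
# above can be applied: with affinity="precomputed", SpectralClustering's
# fit_predict() expects an affinity matrix rather than raw features, so the
# similarity graph is first converted to its weighted adjacency matrix.

def assign_communities(clustermaker):  # hypothetical helper name
    import networkx as nx

    affinity_matrix = nx.to_numpy_array(clustermaker.similarity_graph)  # weighted adjacency matrix
    labels = clustermaker.classifier.fit_predict(affinity_matrix)  # one community label per node
    return dict(zip(clustermaker.similarity_graph.nodes(), labels))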
def __init__(self, topic=TOPIC, tweets_start_at=TWEETS_START_AT, tweets_end_at=TWEETS_END_AT,
             users_limit=USERS_LIMIT, batch_size=BATCH_SIZE,
             storage_dirpath=None, bq_service=None):

    Job.__init__(self)
    GraphStorage.__init__(self, dirpath=storage_dirpath)

    self.bq_service = bq_service or BigQueryService()
    self.fetch_edges = self.bq_service.fetch_retweet_edges_in_batches_v2  # method alias, to keep calls less verbose

    # CONVERSATION PARAMS (OPTIONAL)
    self.topic = topic
    self.tweets_start_at = tweets_start_at
    self.tweets_end_at = tweets_end_at

    # PROCESSING PARAMS
    self.users_limit = users_limit
    if self.users_limit:
        self.users_limit = int(self.users_limit)
    self.batch_size = int(batch_size)

    print("-------------------------")
    print("RETWEET GRAPHER...")
    print("  USERS LIMIT:", self.users_limit)
    print("  BATCH SIZE:", self.batch_size)
    print("  DRY RUN:", DRY_RUN)
    print("-------------------------")
    print("CONVERSATION PARAMS...")
    print("  TOPIC:", self.topic)
    print("  TWEETS START:", self.tweets_start_at)
    print("  TWEETS END:", self.tweets_end_at)

    seek_confirmation()
def __init__(self, dirpath=None, gcs_service=None):
    """
    Saves and loads artifacts from the networkx graph compilation process,
    using local storage and/or Google Cloud Storage.

    Params:
        dirpath (str) like "graphs/my_graph/123"

    TODO: the bot probability stuff only applies to bot retweet graphs,
    and should probably be moved into a child graph storage class
    """
    self.gcs_service = gcs_service or GoogleCloudStorageService()
    self.dirpath = dirpath or DIRPATH
    self.gcs_dirpath = os.path.join("storage", "data", self.dirpath)
    self.local_dirpath = os.path.join(DATA_DIR, self.dirpath)  # TODO: for Windows compatibility, split the dirpath on "/" and re-join using os.sep

    print("-------------------------")
    print("GRAPH STORAGE...")
    print("  DIRPATH:", self.dirpath)
    print("  GCS DIRPATH:", self.gcs_dirpath)
    print("  LOCAL DIRPATH:", os.path.abspath(self.local_dirpath))
    print("  WIFI ENABLED:", WIFI_ENABLED)

    seek_confirmation()

    if not os.path.exists(self.local_dirpath):
        os.makedirs(self.local_dirpath)

    self.results = None
    self.graph = None
def __init__(self, start_date=START_DATE, k_days=K_DAYS, n_periods=N_PERIODS):
    """
    Generates a list of date ranges.

    Params:
        start_date (str) the first period start date, like "2020-01-01"
        k_days (int) number of days in each period
        n_periods (int) number of periods
    """
    self.start_date = start_date
    self.k_days = int(k_days)
    self.n_periods = int(n_periods)

    print("-------------------------")
    print("DATE RANGE GENERATOR...")
    print("  START DATE:", self.start_date)
    print("  K DAYS:", self.k_days)
    print("  N PERIODS:", self.n_periods)
    print("-------------------------")
    print("DATE RANGES...")
    self.date_ranges = self.get_date_ranges(start_date=self.start_date, k_days=self.k_days, n_periods=self.n_periods)
    pprint(self.date_ranges)
    seek_confirmation()
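# A minimal sketch of the get_date_ranges() helper used above (an assumption
# based on the docstring, not necessarily the repo's implementation): periods
# of k_days are laid out back-to-back, starting from start_date.

def get_date_ranges(start_date, k_days, n_periods):
    from datetime import datetime, timedelta

    ranges = []
    period_start = datetime.strptime(start_date, "%Y-%m-%d")
    for _ in range(n_periods):
        period_end = period_start + timedelta(days=k_days)
        ranges.append({"start_at": period_start.strftime("%Y-%m-%d"), "end_at": period_end.strftime("%Y-%m-%d")})
        period_start = period_end
    return ranges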
def __init__(self, dirpath=None, gcs_service=None):
    """
    Saves and loads artifacts from the networkx graph compilation process,
    to and from local storage and/or Google Cloud Storage.

    Params:
        dirpath (str) like "graphs/my_graph/123"
    """
    self.gcs_service = gcs_service or GoogleCloudStorageService()
    self.dirpath = dirpath or DIRPATH
    self.gcs_dirpath = os.path.join("storage", "data", self.dirpath)
    self.local_dirpath = os.path.join(DATA_DIR, self.dirpath)  # TODO: for Windows compatibility, split the dirpath on "/" and re-join using os.sep

    print("-------------------------")
    print("GRAPH STORAGE...")
    print("  DIRPATH:", self.dirpath)
    print("  GCS DIRPATH:", self.gcs_dirpath)
    print("  LOCAL DIRPATH:", os.path.abspath(self.local_dirpath))

    seek_confirmation()

    if not os.path.exists(self.local_dirpath):
        os.makedirs(self.local_dirpath)

    self.results = None
    self.graph = None
def __init__(self, project_name=PROJECT_NAME, dataset_name=DATASET_NAME, verbose=VERBOSE_QUERIES, destructive=DESTRUCTIVE_MIGRATIONS):
    self.project_name = project_name
    self.dataset_name = dataset_name
    self.dataset_address = f"{self.project_name}.{self.dataset_name}"

    self.verbose = (verbose == True)
    self.destructive = (destructive == True)

    self.client = bigquery.Client()

    print("-------------------------")
    print("BIGQUERY SERVICE...")
    print("  DATASET ADDRESS:", self.dataset_address.upper())
    print("  DESTRUCTIVE MIGRATIONS:", self.destructive)
    print("  VERBOSE QUERIES:", self.verbose)
    seek_confirmation()
def delete_temp_tables_older_than(self, days=3):
    """Deletes all tables that:
        have "temp_" in their name (a product of the batch jobs), and
        were created at least X days ago (to safely avoid deleting tables used by in-progress batch jobs)
    """
    cutoff_date = datetime.now(tz=timezone.utc) - timedelta(days=days)
    print("CUTOFF DATE:", cutoff_date)

    tables = list(self.client.list_tables(self.dataset_name))  # API call
    tables_to_delete = [t for t in tables if "temp_" in t.table_id and t.created < cutoff_date]
    print("TABLES TO DELETE:")
    pprint([t.table_id for t in tables_to_delete])
    seek_confirmation()

    print("DELETING...")
    for old_temp_table in tables_to_delete:
        print("  ", old_temp_table.table_id)
        self.client.delete_table(old_temp_table)
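# Example usage (hypothetical): the creation timestamps come from the BigQuery
# client's list_tables() metadata, so no extra bookkeeping is required.
#
#    bq_service = BigQueryService()
#    bq_service.delete_temp_tables_older_than(days=3)  # prompts for confirmation before deleting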
def __init__(self, bq_service=None, bot_min=BOT_MIN, batch_size=BATCH_SIZE, storage_dirpath=None):
    self.bq_service = bq_service or BigQueryService()
    self.bot_min = bot_min
    self.batch_size = batch_size

    Job.__init__(self)

    storage_dirpath = storage_dirpath or f"bot_follower_graphs/bot_min/{self.bot_min}"
    GraphStorage.__init__(self, dirpath=storage_dirpath)

    print("-------------------------")
    print("BOT FOLLOWER GRAPHER...")
    print("  BOT MIN:", self.bot_min)
    print("  BATCH SIZE:", self.batch_size)
    print("-------------------------")
    seek_confirmation()
def __init__(self, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None, model_manager=None):
    self.limit = limit
    self.batch_size = batch_size
    self.bq_service = bq_service or BigQueryService()
    self.mgr = model_manager or ModelManager()

    print("----------------")
    print("TOXICITY SCORER...")
    print("  MODEL CHECKPOINT:", self.mgr.checkpoint_name.upper(), self.mgr.checkpoint_url)
    print("  SCORES TABLE NAME:", self.scores_table_name)
    print("  LIMIT:", fmt_n(self.limit))
    print("  BATCH SIZE:", fmt_n(self.batch_size))

    self.predict = self.mgr.predict_scores  # method alias

    seek_confirmation()
def promote_model(self, destination=BEST_MODEL_DIRPATH):
    blobs = list(self.gcs_service.bucket.list_blobs())
    matching_blobs = [blob for blob in blobs if self.dirpath in blob.name]
    print("MODEL FILES TO PROMOTE...")
    pprint(matching_blobs)
    seek_confirmation()

    print("PROMOTING GCS MODEL FILES...")
    for blob in matching_blobs:
        file_name = blob.name.split("/")[-1]  #> 'model.gpickle'
        new_path = self.compile_gcs_dirpath(f"{destination}/{file_name}")  #> f"storage/data/{destination}/{file_name}"
        self.gcs_service.bucket.copy_blob(blob, destination_bucket=self.gcs_service.bucket, new_name=new_path)

    print("PROMOTING LOCAL MODEL FILES...")
    local_destination = self.compile_local_dirpath(destination)
    local_source = self.local_dirpath
    copytree(local_source, local_destination, dirs_exist_ok=True)
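# Example usage (hypothetical; the storage class name and dirpath are assumptions):
#
#    storage = ModelStorage(dirpath="models/toxicity/2020-08-02-1818")
#    storage.promote_model()  # copies artifacts to BEST_MODEL_DIRPATH, in both GCS and local storage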
def __init__(self, bq_service=None, week_id=WEEK_ID):
    bq_service = bq_service or BigQueryService()
    self.week_id = week_id

    print("--------------------")
    print("FETCHING WEEKS...")
    self.weeks = [RetweetWeek(row) for row in list(bq_service.fetch_retweet_weeks())]
    for week in self.weeks:
        print("  ", week.details)

    print("--------------------")
    print("SELECTING A WEEK...")
    if not self.week_id:
        # assumes you know what you're doing when setting WEEK_ID in production!
        # (run this once interactively to see what all the week ids are)
        self.week_id = input("PLEASE SELECT A WEEK (E.G. '2019-52', '2020-01', ETC.): ")

    try:
        self.week = [wk for wk in self.weeks if wk.week_id == self.week_id][0]
        print("  ", self.week.details)
    except IndexError:
        print("OOPS - PLEASE CHECK WEEK ID AND TRY AGAIN...")
        exit()

    self.tweets_start_at = self.week.row.min_created
    self.tweets_end_at = self.week.row.max_created

    seek_confirmation()

    storage_service = self.init_storage_service(self.week_id)
    super().__init__(bq_service=bq_service, storage_service=storage_service)
def __init__(self, local_dirpath=None, gcs_dirpath=None, gcs_service=None):
    """
    Saves and loads artifacts from the networkx graph compilation process
    to local storage, and optionally to Google Cloud Storage.

    Params:
        local_dirpath (str) like "/Users/USERNAME/path/to/repo/data/graphs/2020-08-02-1818"
        gcs_dirpath (str) like "storage/data/graphs/2020-08-02-1818"
    """
    self.gcs_service = gcs_service or GoogleCloudStorageService()
    self.gcs_dirpath = gcs_dirpath or os.path.join("storage", "data", "graphs", "example")
    self.local_dirpath = local_dirpath or os.path.join(DATA_DIR, "graphs", "example")

    print("----------------------")
    print("GRAPH STORAGE...")
    print("  GCS DIR:", self.gcs_dirpath)
    print("  LOCAL DIR:", self.local_dirpath)
    print("----------------------")

    seek_confirmation()

    if not os.path.exists(self.local_dirpath):
        os.makedirs(self.local_dirpath)
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default=100))  # max number of processed users to store in BQ at once (in a single insert API call). must be less than 10,000 to avoid an error.

if __name__ == "__main__":

    bq_service = BigQueryService()
    twitter_service = TwitterService()

    rows = list(bq_service.fetch_idless_screen_names())
    row_count = len(rows)
    print("-------------------------")
    print(f"FETCHED {row_count} SCREEN NAMES")
    print("BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")
    seek_confirmation()

    bq_service.migrate_user_id_lookups_table()

    batch = []
    for index, row in enumerate(rows):
        counter = index + 1

        try:
            user_id = twitter_service.get_user_id(row.screen_name)
            message = None
        except TweepError as err:
            #print(err)
            #> [{'code': 50, 'message': 'User not found.'}]
            #> [{'code': 63, 'message': 'User has been suspended.'}]
            user_id = None
            message = json.loads(err.reason.replace("'", '"'))[0]["message"]
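        # --- hypothetical continuation (a sketch, not the repo's actual code) ---
        # accumulate each result and flush to BQ whenever the batch is full or
        # the rows are exhausted; insert_user_id_lookups() is an assumed helper
        # wrapping a single insert API call
        batch.append({"screen_name": row.screen_name, "user_id": user_id, "message": message})

        if len(batch) >= BATCH_SIZE or counter == row_count:
            print(f"SAVING BATCH OF {len(batch)} (ROW {counter} OF {row_count})...")
            bq_service.insert_user_id_lookups(batch)  # assumed helper method
            batch = []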