def test_upload_in_batches():
    bq_service = BigQueryService(dataset_name="impeachment_test")

    # when inserting more than 10,000 rows,
    # is able to overcome error "too many rows present in the request, limit: 10000":
    lots_of_rows = [{"start_date": "2020-01-01", "user_id": i, "bot_probability": .99} for i in range(1, 36000)]

    errors = bq_service.upload_daily_bot_probabilities(lots_of_rows)
    assert not any(errors)
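For context on the 10,000-row insert limit exercised above, here is a minimal, hypothetical sketch of how an upload method might split records into sub-limit chunks before calling the BigQuery client; the helper name and chunk size are illustrative, not the project's actual implementation.

# Hypothetical sketch: stay under the per-request row limit by inserting in chunks.
def upload_in_batches(client, table, records, batch_size=9_000):
    errors = []
    for i in range(0, len(records), batch_size):
        chunk = records[i:i + batch_size]
        errors += client.insert_rows_json(table, chunk)  # google-cloud-bigquery streaming insert
    return errors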
def get_tweets():
    bq_service = BigQueryService()
    print("LIMIT:", LIMIT)
    job = Job()

    tweets = []
    job.start()
    for row in bq_service.fetch_labeled_tweets_in_batches(limit=LIMIT):
        tweets.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()

    print("FETCHED TWEETS:", fmt_n(len(tweets)))
    return DataFrame(tweets)
def download_data():
    job = Job()
    bq_service = BigQueryService()

    job.start()
    records = []
    for row in bq_service.fetch_user_details_vq(limit=LIMIT):
        #print(row)
        records.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()

    return DataFrame(records)
def __init__(self, users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, pg_destructive=PG_DESTRUCTIVE, bq_service=None):
    self.bq_service = bq_service or BigQueryService()

    if users_limit:
        self.users_limit = int(users_limit)
    else:
        self.users_limit = None
    self.tweets_limit = self.users_limit  # todo: combine with users_limit into a more generic rows_limit, since we usually run one script or another, so the var can be reset between runs

    self.batch_size = batch_size
    self.pg_destructive = pg_destructive

    self.pg_engine = db
    self.pg_session = BoundSession()

    print("-------------------------")
    print("PG PIPELINE...")
    print(" USERS LIMIT:", self.users_limit)
    print(" BATCH SIZE:", self.batch_size)
    #print(" BQ SERVICE:", type(self.bq_service))
    #print(" PG SESSION:", type(self.pg_session))
    print(" PG DESTRUCTIVE:", self.pg_destructive)
def __init__(self, topic=TOPIC, tweets_start_at=TWEETS_START_AT, tweets_end_at=TWEETS_END_AT,
             users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, storage_dirpath=None, bq_service=None):
    Job.__init__(self)
    GraphStorage.__init__(self, dirpath=storage_dirpath)

    self.bq_service = bq_service or BigQueryService()
    self.fetch_edges = self.bq_service.fetch_retweet_edges_in_batches_v2  # just being less verbose. feels like javascript

    # CONVERSATION PARAMS (OPTIONAL)
    self.topic = topic
    self.tweets_start_at = tweets_start_at
    self.tweets_end_at = tweets_end_at

    # PROCESSING PARAMS
    self.users_limit = users_limit
    if self.users_limit:
        self.users_limit = int(self.users_limit)
    self.batch_size = int(batch_size)

    print("-------------------------")
    print("RETWEET GRAPHER...")
    print(" USERS LIMIT:", self.users_limit)
    print(" BATCH SIZE:", self.batch_size)
    print(" DRY RUN:", DRY_RUN)
    print("-------------------------")
    print("CONVERSATION PARAMS...")
    print(" TOPIC:", self.topic)
    print(" TWEETS START:", self.tweets_start_at)
    print(" TWEETS END:", self.tweets_end_at)

    seek_confirmation()
def cautiously_initialized_storage_service():
    service = BigQueryService()
    print("-------------------------")
    print("BQ CONFIG...")
    print(" BIGQUERY DATASET:", service.dataset_address.upper())
    print(" DESTRUCTIVE MIGRATIONS:", service.destructive)
    print(" VERBOSE QUERIES:", service.verbose)
    print("-------------------------")
    print("WORKER CONFIG...")
    print(" MIN USER ID:", MIN_ID)
    print(" MAX USER ID:", MAX_ID)
    print(" USERS LIMIT:", LIMIT)
    print(" MAX THREADS:", MAX_THREADS)
    print(" BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")
    print("SCRAPER CONFIG...")
    print(" VERBOSE SCRAPER:", VERBOSE_SCRAPER)
    print(" MAX FRIENDS:", MAX_FRIENDS)
    print("-------------------------")
    if APP_ENV == "development":
        if input("CONTINUE? (Y/N): ").upper() != "Y":
            print("EXITING...")
            exit()
    #service.init_tables()  # did this originally, but moving to a manual migration strategy to prevent accidental deletions
    return service
def __init__(self, model_name=MODEL_NAME, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None):
    self.model_name = model_name.lower().replace(";", "")  # this model name is used in queries, so be super safe about SQL injection, although it's not a concern right now
    self.limit = limit
    self.batch_size = batch_size
    self.bq_service = bq_service or BigQueryService()

    print("----------------")
    print("TOXICITY SCORER...")
    print(" MODEL:", self.model_name.upper())
    print(" LIMIT:", fmt_n(self.limit))
    print(" BATCH SIZE:", fmt_n(self.batch_size))
def create_app():
    app = Flask(__name__)
    CORS(app)  # CORS(app, resources={r"/api/*": {"origins": "*"}})
    app.config["SECRET_KEY"] = SECRET_KEY
    app.config["BQ_SERVICE"] = BigQueryService(cautious=False)
    #app.config.from_mapping(SECRET_KEY=SECRET_KEY, BQ_SERVICE=BigQueryService())
    app.register_blueprint(api_v0_routes)
    app.register_blueprint(api_v1_routes)
    return app
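Because the service is stored on app.config, blueprint routes can reuse the shared instance via Flask's current_app; a minimal hypothetical route for illustration (the blueprint and endpoint names are not from the codebase):

# Hypothetical route showing how a blueprint could reuse the configured service.
from flask import Blueprint, current_app, jsonify

example_routes = Blueprint("example_routes", __name__)

@example_routes.route("/api/v1/ping")
def ping():
    bq_service = current_app.config["BQ_SERVICE"]  # shared BigQueryService instance
    return jsonify({"dataset": bq_service.dataset_address})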
def __init__(self, bq=None, topic=TOPIC, pval_max=PVAL_MAX, results_csv_filepath=RESULTS_CSV_FILEPATH):
    self.topic = topic
    self.bq = bq or BigQueryService()
    self.x = []
    self.y = []
    self.pval_max = pval_max
    self.interpret_ks = interpret
    self.results_csv_filepath = results_csv_filepath
def __init__(self, bq_service=None, twitter_service=None, user_limit=USER_LIMIT, friend_limit=FRIEND_LIMIT):
    self.bq_service = bq_service or BigQueryService()
    self.twitter_service = twitter_service or TwitterService()
    self.dataset_address = self.bq_service.dataset_address
    self.user_limit = int(user_limit)
    self.friend_limit = int(friend_limit)

    print("---------------------------")
    print("JOB: FRIEND LOOKUPS")
    print("DATASET:", self.dataset_address.upper())
    print("USER LIMIT:", self.user_limit)
    print("FRIEND LIMIT:", self.friend_limit)
    print("---------------------------")
def __init__(self, bq_service=None, twitter_service=None, user_limit=USER_LIMIT, status_limit=STATUS_LIMIT):
    self.bq_service = bq_service or BigQueryService()
    self.twitter_service = twitter_service or TwitterService()
    self.dataset_address = self.bq_service.dataset_address
    self.user_limit = int(user_limit)
    self.status_limit = int(status_limit)
    self.parse_status = parse_timeline_status

    print("---------------------------")
    print("JOB: TIMELINE LOOKUPS")
    print("DATASET:", self.dataset_address.upper())
    print("USER LIMIT:", self.user_limit)
    print("STATUS LIMIT:", self.status_limit)
    print("---------------------------")
def __init__(self, bq_service=None, bot_min=BOT_MIN, batch_size=BATCH_SIZE, storage_dirpath=None):
    self.bq_service = bq_service or BigQueryService()
    self.bot_min = bot_min
    self.batch_size = batch_size

    Job.__init__(self)

    storage_dirpath = storage_dirpath or f"bot_follower_graphs/bot_min/{self.bot_min}"
    GraphStorage.__init__(self, dirpath=storage_dirpath)

    print("-------------------------")
    print("BOT FOLLOWER GRAPHER...")
    print(" BOT MIN:", self.bot_min)
    print(" BATCH SIZE:", self.batch_size)
    print("-------------------------")

    seek_confirmation()
def __init__(self, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None, model_manager=None):
    self.limit = limit
    self.batch_size = batch_size
    self.bq_service = bq_service or BigQueryService()
    self.mgr = model_manager or ModelManager()

    print("----------------")
    print("TOXICITY SCORER...")
    print(" MODEL CHECKPOINT:", self.mgr.checkpoint_name.upper(), self.mgr.checkpoint_url)
    print(" SCORES TABLE NAME:", self.scores_table_name)
    print(" LIMIT:", fmt_n(self.limit))
    print(" BATCH SIZE:", fmt_n(self.batch_size))

    self.predict = self.mgr.predict_scores  # method alias

    seek_confirmation()
def __init__(self, users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, pg_destructive=PG_DESTRUCTIVE, bq_service=None):
    self.bq_service = bq_service or BigQueryService()

    if users_limit:
        self.users_limit = int(users_limit)
    else:
        self.users_limit = None
    self.batch_size = batch_size
    self.pg_destructive = pg_destructive

    self.pg_engine = db
    self.pg_session = BoundSession()

    print("-------------------------")
    print("PG PIPELINE...")
    print(" USERS LIMIT:", self.users_limit)
    print(" BATCH SIZE:", self.batch_size)
    #print(" BQ SERVICE:", type(self.bq_service))
    #print(" PG SESSION:", type(self.pg_session))
    print(" PG DESTRUCTIVE:", self.pg_destructive)
def __init__(self, twitter_service=None, storage_env=STORAGE_ENV, bq_service=None, csv_service=None, batch_size=BATCH_SIZE):
    self.twitter_service = twitter_service or TwitterService()
    self.api = self.twitter_service.api
    self.auth = self.api.auth
    self.parse_status = parse_status

    self.storage_env = storage_env
    if self.storage_env == "local":
        self.storage_service = csv_service or LocalStorageService()
    elif self.storage_env == "remote":
        self.storage_service = bq_service or BigQueryService()
    else:
        raise ValueError("Expecting the STORAGE_ENV to be 'local' or 'remote'. Please try again...")

    self.batch_size = batch_size
    self.batch = []
    self.counter = 0

    print("-------------------------------")
    print("STREAM LISTENER...")
    print(" STORAGE ENV:", self.storage_env.upper())
    print(" STORAGE SERVICE:", type(self.storage_service))
    print(" BATCH SIZE:", self.batch_size)
    print("--------------------------------")
def __init__(self, bq_service=None, week_id=WEEK_ID):
    bq_service = bq_service or BigQueryService()
    self.week_id = week_id

    print("--------------------")
    print("FETCHING WEEKS...")
    self.weeks = [RetweetWeek(row) for row in list(bq_service.fetch_retweet_weeks())]
    for week in self.weeks:
        print(" ", week.details)

    print("--------------------")
    print("SELECTING A WEEK...")
    if not self.week_id:
        # assumes you know what you're doing when setting WEEK_ID on production!
        # once you run this once, you'll see what all the week ids are.
        self.week_id = input("PLEASE SELECT A WEEK (E.G. '2019-52', '2020-01', ETC.): ")

    try:
        self.week = [wk for wk in self.weeks if wk.week_id == self.week_id][0]
        print(" ", self.week.details)
    except IndexError as err:
        print("OOPS - PLEASE CHECK WEEK ID AND TRY AGAIN...")
        exit()

    self.tweets_start_at = self.week.row.min_created
    self.tweets_end_at = self.week.row.max_created

    seek_confirmation()

    storage_service = self.init_storage_service(self.week_id)
    super().__init__(bq_service=bq_service, storage_service=storage_service)
import os

from tweepy.error import TweepError
from pandas import DataFrame
from dotenv import load_dotenv

from app import DATA_DIR, seek_confirmation
from app.decorators.datetime_decorators import logstamp
from app.bq_service import BigQueryService
from app.twitter_service import TwitterService

load_dotenv()

BATCH_SIZE = int(os.getenv("BATCH_SIZE", default=100))  # the max number of processed users to store in BQ at once (with a single insert API call). must be less than 10,000 to avoid error.

if __name__ == "__main__":

    bq_service = BigQueryService()
    twitter_service = TwitterService()

    rows = list(bq_service.fetch_idless_screen_names())
    row_count = len(rows)

    print("-------------------------")
    print(f"FETCHED {row_count} SCREEN NAMES")
    print("BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")

    seek_confirmation()
    bq_service.migrate_user_id_lookups_table()

    batch = []
    for index, row in enumerate(rows):
        counter = index + 1
import os

from pandas import read_csv, DataFrame

from app.file_storage import FileStorage
from app.bq_service import BigQueryService
from app.job import Job
from app.decorators.number_decorators import fmt_n

LIMIT = os.getenv("LIMIT")  # for development purposes
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default="10_000"))
DESTRUCTIVE = (os.getenv("DESTRUCTIVE", default="false") == "true")  # whether or not to re-download if a local file already exists

if __name__ == "__main__":

    bq_service = BigQueryService()
    job = Job()

    storage = FileStorage(dirpath="nodes_with_active_edges_v7")
    nodes_csv_filepath = os.path.join(storage.local_dirpath, "sn_nodes.csv")

    if os.path.exists(nodes_csv_filepath) and not DESTRUCTIVE:
        print("LOADING SN NODES...")
        nodes_df = read_csv(nodes_csv_filepath)
    else:
        job.start()
        print("DOWNLOADING SN NODES...")
        records = []
        for row in bq_service.fetch_sn_nodes_with_active_edges_v7(limit=LIMIT):
            records.append(dict(row))
            job.counter += 1
class Collector:
    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT
        self.batch_size = BATCH_SIZE

    def fetch_remaining_status_ids(self):
        sql = f"""
            SELECT DISTINCT a.status_id
            FROM `{self.bq_service.dataset_address}.all_status_ids` a
            LEFT JOIN `{self.bq_service.dataset_address}.recollected_statuses` completed ON completed.status_id = a.status_id
            WHERE completed.status_id IS NULL
            LIMIT {self.limit}
        """
        return [row["status_id"] for row in list(self.bq_service.execute_query(sql))]

    def perform(self):
        remaining_status_ids = self.fetch_remaining_status_ids()
        if any(remaining_status_ids):
            for batch_of_ids in split_into_batches(remaining_status_ids, batch_size=self.batch_size):
                self.process_batch(batch_of_ids)
        else:
            print("OH ALL DONE! SLEEPING...")
            server_sleep(10 * 60 * 60)

    def lookup_statuses(self, status_ids):
        """Fetch full status info including urls and full text.

        Max per request is 100, so batch size must be smaller than that.

        See:
            https://docs.tweepy.org/en/stable/api.html#API.statuses_lookup
            https://developer.twitter.com/en/docs/twitter-api/v1/tweets/post-and-engage/api-reference/get-statuses-lookup
        """
        return self.twitter_api.statuses_lookup(
            id_=status_ids,
            include_entities=True,  # this is where the full urls are
            trim_user=True,  # we already have this info
            include_ext_alt_text=True,  # if alt text has been added to any attached media entities, this parameter will return an ext_alt_text value in the top-level key for the media entity. if no value has been set, this will be returned as null.
            include_card_uri=False,
            map_=True,  # "Tweets that do not exist or cannot be viewed by the current user will still have their key represented but with an explicitly null value paired with it"
            tweet_mode="extended"
        )

    def process_batch(self, status_ids):
        recollected_statuses = []
        recollected_urls = []
        success_counter = 0
        for status in self.lookup_statuses(status_ids):
            # when passing param map_=True to the Twitter API, if a status is not available,
            # it will still be present, but will only have an id field
            status_id = status.id  # all statuses will have an id
            recollected_status = {
                "status_id": status_id,
                "user_id": None,
                "full_text": None,
                "created_at": None,
                "lookup_at": generate_timestamp()
            }  # represent failed lookups with null text values

            if list(status._json.keys()) != ["id"]:  # this will be the only field for empty statuses. otherwise try to parse them:
                success_counter += 1
                recollected_status["user_id"] = status.user.id
                recollected_status["full_text"] = parse_full_text(status)  # update the full text if possible
                recollected_status["created_at"] = generate_timestamp(status.created_at)

                for url in status.entities["urls"]:
                    recollected_urls.append({"status_id": status_id, "expanded_url": url["expanded_url"]})

            recollected_statuses.append(recollected_status)

        print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}", "| STATUSES:", success_counter, "| URLS:", len(recollected_urls))
        self.save_statuses(recollected_statuses)
        self.save_urls(recollected_urls)

    def save_statuses(self, recollected_statuses):
        self.bq_service.insert_records_in_batches(self.recollected_statuses_table, recollected_statuses)

    def save_urls(self, recollected_urls):
        self.bq_service.insert_records_in_batches(self.recollected_urls_table, recollected_urls)

    @property
    @lru_cache(maxsize=None)
    def recollected_statuses_table(self):
        return self.bq_service.client.get_table(f"{self.bq_service.dataset_address}.recollected_statuses")

    @property
    @lru_cache(maxsize=None)
    def recollected_urls_table(self):
        return self.bq_service.client.get_table(f"{self.bq_service.dataset_address}.recollected_status_urls")
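The split_into_batches helper used by perform() is imported from elsewhere and not shown here; a minimal sketch of what such a helper might look like, assuming it simply yields successive fixed-size slices (sized to respect the 100-status lookup limit):

# Hypothetical sketch of a batch-splitting helper (the project's actual version is not shown here).
def split_into_batches(all_items, batch_size=100):
    for i in range(0, len(all_items), batch_size):
        yield all_items[i:i + batch_size]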
def __init__(self, users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, storage_service=None, bq_service=None):
    super().__init__(users_limit=users_limit, batch_size=batch_size, storage_service=storage_service)
    self.bq_service = bq_service or BigQueryService()
from app.bq_service import BigQueryService

if __name__ == "__main__":

    bq_service = BigQueryService()
    bq_service.migrate_populate_user_details_table_v2()
    print("MIGRATION SUCCESSFUL!")
import os

from app import DATA_DIR
from app.bq_service import BigQueryService
from app.bot_communities.tokenizers import Tokenizer, SpacyTokenizer
from app.bot_communities.token_analyzer import summarize_token_frequencies
from pandas import DataFrame

if __name__ == "__main__":

    local_dirpath = os.path.join(DATA_DIR, "bot_retweet_graphs", "bot_min", str(0.8), "n_communities", str(2))

    tokenizer = Tokenizer()
    spacy_tokenizer = SpacyTokenizer()
    bq_service = BigQueryService()

    sql = f"""
        SELECT
            c.community_id
            ,b.bot_id
            -- ,b.bot_screen_name
            --,b.day_count
            --,b.avg_daily_score
            ,count(distinct t.status_id) as tweet_count
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_screen_name), ' | ') , "") as screen_names
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_name), ' | ') , "") as user_names
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_description), ' | ') , "") as user_descriptions
        FROM impeachment_production.bots_above_80 b
        JOIN impeachment_production.2_bot_communities c ON c.user_id = b.bot_id
        JOIN impeachment_production.tweets t on cast(t.user_id as int64) = b.bot_id
import os

from app import seek_confirmation
from app.job import Job
from app.bq_service import BigQueryService
from app.nlp.model_storage import ModelStorage, MODELS_DIRPATH

MODEL_NAME = os.getenv("MODEL_NAME", default="current_best")
LIMIT = os.getenv("LIMIT")
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default="100000"))

if __name__ == "__main__":

    storage = ModelStorage(dirpath=f"{MODELS_DIRPATH}/{MODEL_NAME}")
    tv = storage.load_vectorizer()
    clf = storage.load_model()

    bq_service = BigQueryService()

    print("DESTROYING PREDICTIONS TABLE???")
    seek_confirmation()
    print("DESTROYING PREDICTIONS TABLE...")
    bq_service.destructively_migrate_2_community_predictions_table()

    job = Job()
    job.start()

    ids_batch = []
    statuses_batch = []
    for row in bq_service.fetch_unlabeled_statuses_in_batches(limit=LIMIT):
        ids_batch.append(row["status_id"])
        statuses_batch.append(row["status_text"])
from app import server_sleep
from app.bq_service import BigQueryService
from app.retweet_graphs_v2.retweet_grapher import RetweetGrapher
from app.retweet_graphs_v2.k_days.generator import DateRangeGenerator

if __name__ == "__main__":

    gen = DateRangeGenerator()
    bq_service = BigQueryService()

    for date_range in gen.date_ranges:
        storage_dirpath = f"retweet_graphs_v2/k_days/{gen.k_days}/{date_range.start_date}"

        grapher = RetweetGrapher(
            storage_dirpath=storage_dirpath,
            bq_service=bq_service,
            tweets_start_at=date_range.start_at,
            tweets_end_at=date_range.end_at
        )
        grapher.save_metadata()
        grapher.start()
        grapher.perform()
        grapher.end()
        grapher.report()
        grapher.save_results()
        grapher.save_graph()

        del grapher  # clearing graph from memory
        print("\n\n\n\n")

    print("JOB COMPLETE!")
    bq_service.upload_basilica_embeddings(batch)
    #print(logstamp(), thread_name, "UPLOAD COMPLETE!")
    return len(batch)

if __name__ == "__main__":

    print("-------------------")
    print("BASILICA EMBEDDER...")
    print(" MIN PARTITION VAL:", MIN_VAL)
    print(" MAX PARTITION VAL:", MAX_VAL)
    print(" LIMIT:", LIMIT)
    print(" BATCH SIZE:", BATCH_SIZE)

    bq_service = BigQueryService()
    bas_service = BasilicaService()

    job = Job()
    job.start()
    records = list(bq_service.fetch_basilica_embedless_partitioned_statuses(min_val=MIN_VAL, max_val=MAX_VAL, limit=LIMIT))
    job.counter = len(records)
    batches = list(split_into_batches(records, BATCH_SIZE))
    print("BATCHES:", len(batches))
    job.end()

    del records

    job.start()
LIMIT = os.getenv("LIMIT")  # for development purposes
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default="100000"))
DESTRUCTIVE = (os.getenv("DESTRUCTIVE", default="false") == "true")  # whether or not to re-download if a local file already exists

NODES_LIMIT = os.getenv("NODES_LIMIT")  # for development purposes
NODES_BATCH_SIZE = int(os.getenv("NODES_BATCH_SIZE", default="5000"))
NODES_DESTRUCTIVE = (os.getenv("NODES_DESTRUCTIVE", default="false") == "true")  # whether or not to re-download if a local file already exists

if __name__ == "__main__":

    gen = DateRangeGenerator(k_days=1)
    bq_service = BigQueryService()
    job = Job()

    for dr in gen.date_ranges:
        print(dr.start_date)

        storage = FileStorage(dirpath=f"daily_active_edge_friend_graphs_v5/{dr.start_date}")
        tweets_csv_filepath = os.path.join(storage.local_dirpath, "tweets.csv")
        nodes_csv_filepath = os.path.join(storage.local_dirpath, "nodes.csv")

        if os.path.exists(tweets_csv_filepath) and not DESTRUCTIVE:
            print("LOADING TWEETS...")
            tweets_df = read_csv(tweets_csv_filepath)
        else:
            job.start()
            print("DOWNLOADING TWEETS...")
from app.bq_service import BigQueryService

if __name__ == "__main__":

    bq_service = BigQueryService()
    bq_service.migrate_daily_bot_probabilities_table()
    print("MIGRATION SUCCESSFUL!")
print(" TWEET_MIN:", TWEET_MIN) print(" LIMIT:", LIMIT) print(" BATCH_SIZE:", BATCH_SIZE) print(" DESTRUCTIVE:", DESTRUCTIVE) #print(" GRAPH_LIMIT:", GRAPH_LIMIT) print(" GRAPH_BATCH_SIZE:", GRAPH_BATCH_SIZE) print(" GRAPH_DESTRUCTIVE:", GRAPH_DESTRUCTIVE) print("------------------------") storage = FileStorage( dirpath=f"daily_active_friend_graphs_v4/{DATE}/tweet_min/{TWEET_MIN}") tweets_csv_filepath = os.path.join(storage.local_dirpath, "tweets.csv") bq_service = BigQueryService() job = Job() # # LOAD TWEETS # tweet_id, text, screen_name, bot, created_at # TODO: de-dup RTs so the model will only train/test on a single RT status text (PREVENT OVERFITTING) if os.path.exists(tweets_csv_filepath) and not DESTRUCTIVE: print("LOADING TWEETS...") statuses_df = read_csv(tweets_csv_filepath) else: job.start() print("DOWNLOADING TWEETS...") statuses = [] for row in bq_service.fetch_daily_active_tweeter_statuses(
)  # the max number of processed users to store in BQ at once (with a single insert API call)
DESTRUCTIVE = (os.getenv("DESTRUCTIVE", default="false") == "true")  # whether or not to re-download if a local file already exists

if __name__ == "__main__":

    # INIT

    file_storage = FileStorage(dirpath="bot_retweet_graphs/bot_min/0.8/n_communities/2/analysis_v2")
    local_tweets_filepath = os.path.join(file_storage.local_dirpath, "community_tweets.csv")
    #gcs_tweets_filepath = os.path.join(file_storage.gcs_dirpath, "community_tweets.csv")

    bq_service = BigQueryService()
    tokenizer = Tokenizer()

    print("------------------------------")
    print("BOT STATUS ANALYZER V2...")
    print(" LIMIT:", LIMIT)
    print(" BATCH SIZE:", BATCH_SIZE)
    print(" DESTRUCTIVE:", DESTRUCTIVE)

    seek_confirmation()

    # LOAD STATUSES

    if os.path.isfile(local_tweets_filepath) and not DESTRUCTIVE:
        print("LOADING STATUSES...")
        statuses_df = read_csv(local_tweets_filepath)
        print(statuses_df.head())