def update_user_set_config_entry_in_db(db, notifiers, user_set_id,
                                       db_credentials_fpath,
                                       twitter_credential_fpath,
                                       user_set_name,
                                       user_set_input_collection,
                                       user_set_input_collection_fpaths,
                                       user_set_description, user_set_notes,
                                       user_set_user_filter,
                                       user_set_user_filter_args,
                                       user_set_collection_modules_to_run,
                                       notifier_params, run_flags,
                                       day_interval, extra_params=None):
    """Update an existing user-set configuration row in the database.

    List/dict arguments are JSON-serialized before being written. Sends an
    "info" notification once the update has been committed.

    Args:
        db: database wrapper exposing ``execute_sql``.
        notifiers: notifier objects passed to ``notifications.notify_all``.
        user_set_id: id of the existing configuration row to update.
        extra_params: optional dict of additional parameters; an empty dict
            is used when omitted.
        (remaining arguments mirror the configuration table columns)
    """
    # Fix: `extra_params={}` was a shared mutable default argument.
    if extra_params is None:
        extra_params = {}
    dbrow = (db_credentials_fpath, twitter_credential_fpath, user_set_name,
             user_set_input_collection,
             json.dumps(user_set_input_collection_fpaths),
             user_set_description, user_set_notes, user_set_user_filter,
             json.dumps(user_set_user_filter_args),
             json.dumps(user_set_collection_modules_to_run),
             json.dumps(notifier_params), json.dumps(run_flags), day_interval,
             json.dumps(extra_params), user_set_id)
    db.execute_sql(SQL.user_set_configuration_update, args=dbrow, commit=True)
    notifications.notify_all(
        notifiers,
        "Updated user set configuration with `user_set_id = {}`".format(
            user_set_id),
        notify_type="info")
def __init__(self, user_set_id, user_set_prefix, db, notifiers=None):
    """Set up the follower-collection module and ensure its table exists.

    Args:
        user_set_id: id of the user set this module collects for.
        user_set_prefix: prefix applied to the module's table name.
        db: database wrapper exposing ``execute_sql``.
        notifiers: notifier objects for ``notifications.notify_all``;
            defaults to no notifiers.
    """
    # Fix: `notifiers=[]` was a shared mutable default argument.
    if notifiers is None:
        notifiers = []
    self.user_set_id = user_set_id
    self.name = "followers"
    self.user_set_prefix = user_set_prefix
    self.db = db
    self.notifiers = notifiers
    self.table_name = "{}_user_follower".format(user_set_prefix)
    self.table_create = """
        CREATE TABLE IF NOT EXISTS {} (
            user_set_id TEXT,
            user_id BIGINT,
            collection_bucket_ts TIMESTAMP,
            collected_ts TIMESTAMP,
            follower_id BIGINT,
            PRIMARY KEY (user_set_id, user_id, collected_ts, follower_id)
        )
    """.format(self.table_name)
    self.table_insert = """
        INSERT INTO {}
        (user_set_id, user_id, collection_bucket_ts, collected_ts, follower_id)
        VALUES (%s, %s, %s, %s, %s)
    """.format(self.table_name)
    db.execute_sql(self.table_create, commit=True)
    notifications.notify_all(
        notifiers,
        "Followers collection uploading to `{}`".format(self.table_name),
        notify_type="info")
def collect(self, api, user_ids, collection_bucket_ts):
    """Collect historic timeline tweets for every user id.

    Returns a list of row tuples ready for the timeline table insert:
    (user_set_id, user_id, tweet_id_str, collection_bucket_ts,
    collected_ts, tweet_json).
    """
    notifications.notify_all(
        self.notifiers,
        "Starting timeline collection for {} user_ids".format(
            len(user_ids)),
        notify_type="start")
    rows = []
    for uid in user_ids:
        tweets = self.get_historic_tweets(api, uid)
        # One collection timestamp per user's timeline pull.
        fetched_at = datetime.now()
        rows.extend(
            (self.user_set_id, uid, tweet["id_str"], collection_bucket_ts,
             fetched_at, json.dumps(tweet)) for tweet in tweets)
    notifications.notify_all(
        self.notifiers,
        "Finished collecting timelines for {} user_ids".format(
            len(user_ids)),
        notify_type="complete")
    return rows
def setup_userid_module(user_set_prefix, module_name, module_args, notifiers):
    """Instantiate the user-id filter class named by `module_name`.

    Args:
        user_set_prefix: unused here; kept for interface compatibility.
        module_name: name of a filter class resolvable in this module.
        module_args: keyword arguments passed to the class constructor.
        notifiers: notifier objects for ``notifications.notify_all``.

    Returns:
        The constructed filter module instance.
    """
    notifications.notify_all(notifiers,
                             "Created '{}' filter".format(module_name),
                             notify_type="info")
    # SECURITY: `eval` on module_name executes arbitrary code if the name
    # ever comes from untrusted configuration — prefer an explicit
    # name -> class registry. (The previous `eval("{}".format(module_name))`
    # was a no-op wrapper around the same call; dropped.)
    module_class = eval(module_name)
    module = module_class(**module_args)
    return module
def server_setup(host, user, password, db_name, notifiers):
    """Connect to (creating if needed) the collection database and tables.

    Args:
        host, user, password, db_name: database connection parameters.
        notifiers: notifier objects for ``notifications.notify_all``.

    Returns:
        The connected ``database.Database`` instance.

    Raises:
        Re-raises any exception from connection/table setup after sending
        error notifications.
    """
    notifications.notify_all(notifiers,
                             "Beginning data collection setup...",
                             notify_type="start")
    try:
        db = database.Database(host,
                               user,
                               password,
                               db_name,
                               create_if_not_exists=True)
        # Create the tables if they don't exist
        db.execute_sql(SQL.user_set_configuration_create, commit=True)
        db.execute_sql(SQL.user_set_metadata_create, commit=True)
        notifications.notify_all(notifiers,
                                 "Finished server setup!",
                                 notify_type="complete")
        return db
    except Exception as e:
        notifications.notify_all(notifiers,
                                 "Server setup failed!",
                                 notify_type="error")
        notifications.notify_all(notifiers, str(e), notify_type="error")
        # Fix: bare `raise` preserves the original traceback (was `raise e`,
        # which appends an extra re-raise frame).
        raise
def upload_data(self, data):
    """Batch-insert collected rows into this module's table, with
    start/complete notifications around the upload."""
    row_count = len(data)
    notifications.notify_all(self.notifiers,
                             "Starting {} data upload for {} rows".format(
                                 self.name, row_count),
                             notify_type="start")
    self.db.execute_sql(sql=self.table_insert,
                        args=data,
                        commit=True,
                        batch_insert=True)
    notifications.notify_all(self.notifiers,
                             "Uploaded {} rows to {}".format(
                                 row_count, self.table_name),
                             notify_type="complete")
def collect(self, api, user_ids, collection_bucket_ts):
    """Collect follower ids for every user id via the Twitter API.

    Returns a list of row tuples:
    (user_set_id, user_id, collection_bucket_ts, collected_ts, follower_id).

    Rate-limit errors sleep out the 15-minute window and retry; other
    Twitter API errors are recorded per-account and that account is skipped.
    """
    followerships = []
    # Keeping track of which accounts throw errors (not doing anything with
    # this right now though)
    error_accounts = []
    notifications.notify_all(
        self.notifiers,
        "Starting follower collection for {} user_ids".format(
            len(user_ids)),
        notify_type="start")

    # Hoisted out of the per-user loop (was redefined each iteration).
    def pull_followers(user_id, collected_at):
        # Appends rows directly to `followerships`; returns nothing.
        try:
            for page in tweepy.Cursor(api.followers_ids,
                                      id=user_id).pages():
                followerships.extend([
                    (self.user_set_id, user_id, collection_bucket_ts,
                     collected_at, follower_id) for follower_id in page
                ])
                # We get 15 requests per 15-window or 1 request per 60 seconds
                time.sleep(60)
        except tweepy.RateLimitError:
            time.sleep(15 * 60)
            # BUG FIX: the recursive retry previously did
            # `followerships.extend(pull_followers(...))`, but the helper
            # returns None, so every rate-limit retry raised TypeError.
            # The helper already appends to `followerships` itself.
            pull_followers(user_id, collected_at)
        except tweepy.TweepError as ex:
            error_accounts.append(
                (user_id, ex.response.status_code, ex.response.text))

    for user_id in user_ids:
        pull_followers(user_id, datetime.now())
    notifications.notify_all(
        self.notifiers,
        "Finished collecting followers for {} user_ids".format(
            len(user_ids)),
        notify_type="complete")
    return followerships
def insert_new_user_set_config_entry_in_db(db, notifiers,
                                           db_credentials_fpath,
                                           twitter_credential_fpath,
                                           user_set_name,
                                           user_set_input_collection,
                                           user_set_input_collection_fpaths,
                                           user_set_description,
                                           user_set_notes,
                                           user_set_user_filter,
                                           user_set_user_filter_args,
                                           user_set_collection_modules_to_run,
                                           notifier_params, run_flags,
                                           day_interval, extra_params=None):
    """Insert a new user-set configuration row and return its new id.

    List/dict arguments are JSON-serialized before being written; the
    creation timestamp is taken as UTC now. Sends an "info" notification
    with the freshly assigned id.

    Args:
        db: database wrapper exposing ``execute_sql``.
        notifiers: notifier objects passed to ``notifications.notify_all``.
        extra_params: optional dict of additional parameters; an empty dict
            is used when omitted.
        (remaining arguments mirror the configuration table columns)

    Returns:
        The database-assigned ``user_set_id`` of the inserted row.
    """
    # Fix: `extra_params={}` was a shared mutable default argument.
    if extra_params is None:
        extra_params = {}
    user_set_creation_ts = datetime.utcnow()
    dbrow = (db_credentials_fpath, twitter_credential_fpath, user_set_name,
             user_set_creation_ts, user_set_input_collection,
             json.dumps(user_set_input_collection_fpaths),
             user_set_description, user_set_notes, user_set_user_filter,
             json.dumps(user_set_user_filter_args),
             json.dumps(user_set_collection_modules_to_run),
             json.dumps(notifier_params), json.dumps(run_flags), day_interval,
             json.dumps(extra_params))
    user_set_id = db.execute_sql(SQL.user_set_configuration_insert,
                                 args=dbrow,
                                 commit=True,
                                 fetch=True)[0][0]
    notifications.notify_all(
        notifiers,
        "Created new user set configuration with `user_set_id = {}`".format(
            user_set_id),
        notify_type="info")
    return user_set_id
def collect_and_upload(self, api, user_ids, notifiers):
    """Run every collection module over `user_ids` in chunks of 100,
    uploading and dumping each chunk, then record per-module metadata
    for the collection bucket."""
    collection_bucket_start_ts = datetime.now()
    chunk_size = 100
    total = len(user_ids)
    # Zero-pad chunk labels so they sort lexicographically.
    pad = len(str(total // chunk_size))
    for module in self.collection_modules:
        for idx, start in enumerate(range(0, total, chunk_size)):
            print("progress=" +
                  str(int(100 * (float(start) / float(total)))))
            chunk = user_ids[start:start + chunk_size]
            chunk_label = str(idx).zfill(pad)
            collected = module.collect(api, chunk,
                                       collection_bucket_start_ts)
            module.upload_data(collected)
            module.dump_data(collected, self.user_set_name,
                             collection_bucket_start_ts, chunk_label)
    collection_bucket_end_ts = datetime.now()
    # One metadata row per module for this collection bucket.
    for module in self.collection_modules:
        self.db.execute_sql(SQL.user_set_metadata_insert,
                            args=(self.user_set_id,
                                  collection_bucket_start_ts,
                                  collection_bucket_end_ts, module.name,
                                  None),
                            commit=True)
    notifications.notify_all(notifiers,
                             "Finished collection at {}".format(
                                 str(datetime.now())),
                             notify_type="complete")
def collect(self, api, user_ids, collection_bucket_ts):
    """Collect profile data for every user id, querying the API in
    batches of 100 (the lookup endpoint's maximum)."""
    notifications.notify_all(
        self.notifiers,
        "Starting profile data collection for {} user_ids".format(
            len(user_ids)),
        notify_type="start")
    batch_size = 100
    all_rows = []
    for start in range(0, len(user_ids), batch_size):
        batch = user_ids[start:start + batch_size]
        all_rows.extend(
            pull_user_data(api, batch, self.user_set_id,
                           collection_bucket_ts))
    notifications.notify_all(
        self.notifiers,
        "Finished collecting profile data for {} user_ids".format(
            len(user_ids)),
        notify_type="complete")
    return all_rows
def __init__(self, db, user_set_id, notifiers):
    """Load an existing user-set configuration from the database and build
    its user filter module and collection modules.

    Args:
        db: database wrapper exposing ``execute_sql``.
        user_set_id: id of an existing configuration row.
        notifiers: notifier objects for ``notifications.notify_all``.

    Raises:
        ValueError: if no configuration row exists for `user_set_id`.
    """
    self.user_set_id = user_set_id
    self.db = db
    # NOTE(review): user_set_id is interpolated into the SQL via .format —
    # safe only if it is never user-controlled; confirm upstream callers.
    existing_data = db.execute_sql(
        SQL.user_set_configuration_fetch.format(user_set_id), fetch=True)
    if not bool(existing_data):
        raise ValueError(
            "user_set_id {} doesn't exist in the database. You'll need to set parameters to create a new user_set_id or change the id to an existing entry."
            .format(user_set_id))
    # Column 0 of the fetched row is the id itself; the remaining columns
    # are unpacked positionally in table order — keep in sync with the
    # configuration table schema.
    self.db_credentials_fpath, self.twitter_credential_fpath, self.user_set_name, self.user_set_creation_ts, self.user_set_input_collection, self.user_set_input_collection_fpaths, self.user_set_description, self.user_set_notes, self.user_set_user_filter, self.user_set_user_filter_args, self.user_set_collection_modules_to_run, self.notifier_params, self.run_flags, self.day_interval, self.extra_params = existing_data[
        0][1:]
    ## The prefix that will be given to all user-set specific tables
    self.user_set_prefix = "c{}_{}".format(
        self.user_set_id, self.user_set_name.replace(" ", "_"))
    notifications.notify_all(
        notifiers,
        "Pulled user set configuration for `user_set_id` = {}".format(
            user_set_id),
        notify_type="info")
    # Build the configured user-id filter module.
    self.filter_module = setup.setup_userid_module(
        self.user_set_prefix, self.user_set_user_filter,
        self.user_set_user_filter_args, notifiers)
    # Instantiate each configured collection module; each entry is a dict
    # with a "class" name and optional "args" kwargs.
    self.collection_modules = []
    for module in self.user_set_collection_modules_to_run:
        module_class = module["class"]
        module_args = module.get("args", {})
        self.collection_modules.append(
            setup.setup_collection_module(self.user_set_id,
                                          self.user_set_prefix, module_class,
                                          module_args, db, notifiers))
def __init__(self,
             user_set_id,
             user_set_prefix,
             db,
             first_timebound_type,
             timebound_arg,
             subsequent_timebound_type,
             notifiers=None):
    """Set up the historic-timeline collection module and its table.

    The timebound controls how far back collection reaches. The first run
    for a user set uses `first_timebound_type`; once rows exist for the
    user set, `subsequent_timebound_type` applies and the most recent
    stored tweet id is remembered.

    Args:
        user_set_id: id of the user set this module collects for.
        user_set_prefix: prefix applied to the module's table name.
        db: database wrapper exposing ``execute_sql``.
        first_timebound_type: 'number', 'date', or 'last_tweet'.
        timebound_arg: argument for the timebound ('date' values are parsed
            from Twitter's timestamp string format).
        subsequent_timebound_type: 'number', 'date', or 'last_tweet'.
        notifiers: notifier objects for ``notifications.notify_all``;
            defaults to no notifiers.

    Raises:
        ValueError: on an unrecognized timebound type.
    """
    # Fix: `notifiers=[]` was a shared mutable default argument.
    if notifiers is None:
        notifiers = []
    if (not first_timebound_type in ["number", "date", "last_tweet"]) or (
            not subsequent_timebound_type in ["number", "date", "last_tweet"]):
        raise ValueError(
            "Please specify `timebound_type` to be 'number', 'date', or 'last_tweet'"
        )
    self.user_set_id = user_set_id
    self.name = "timeline"
    self.user_set_prefix = user_set_prefix
    self.db = db
    self.notifiers = notifiers
    self.table_name = "{}_timeline".format(user_set_prefix)
    table_create = """
        CREATE TABLE IF NOT EXISTS {} (
            user_set_id TEXT,
            user_id BIGINT,
            tweet_id BIGINT,
            collection_bucket_ts TIMESTAMP,
            collected_ts TIMESTAMP,
            tweet_json JSON,
            PRIMARY KEY (user_set_id, user_id, tweet_id)
        )
    """.format(self.table_name)
    db.execute_sql(table_create, commit=True)
    # NOTE(review): user_set_id is interpolated into the SQL below via
    # .format — safe only if it is never user-controlled; confirm upstream.
    select_collection_exists = """
        SELECT COUNT(*) FROM {} WHERE user_set_id = '{}'
    """.format(self.table_name, user_set_id)
    collection_exists = bool(
        db.execute_sql(select_collection_exists, fetch=True)[0][0] > 0)
    if not collection_exists:
        # First collection run for this user set.
        self.timebound_type = first_timebound_type
        self.timebound_arg = timebound_arg
    else:
        # Subsequent run: remember where the last collection left off.
        select_most_recent_tweet_id = """
            SELECT MAX(tweet_id) FROM {} WHERE user_set_id = '{}'
        """.format(self.table_name, self.user_set_id)
        self.most_recent_tweet_id = db.execute_sql(
            select_most_recent_tweet_id, fetch=True)[0][0]
        self.timebound_type = subsequent_timebound_type
        self.timebound_arg = timebound_arg
    if self.timebound_type == "date":
        self.timebound_arg = utils.twitter_str_to_dt(timebound_arg)
    self.table_insert = """
        INSERT INTO {}
        (user_set_id, user_id, tweet_id, collection_bucket_ts, collected_ts,
         tweet_json)
        VALUES (%s, %s, %s, %s, %s, %s)
    """.format(self.table_name)
    notifications.notify_all(
        notifiers,
        "Historic timeline collection uploading to `{}`".format(
            self.table_name),
        notify_type="info")