Esempio n. 1
0
def update_user_set_config_entry_in_db(db,
                                       notifiers,
                                       user_set_id,
                                       db_credentials_fpath,
                                       twitter_credential_fpath,
                                       user_set_name,
                                       user_set_input_collection,
                                       user_set_input_collection_fpaths,
                                       user_set_description,
                                       user_set_notes,
                                       user_set_user_filter,
                                       user_set_user_filter_args,
                                       user_set_collection_modules_to_run,
                                       notifier_params,
                                       run_flags,
                                       day_interval,
                                       extra_params={}):
    """Overwrite the stored configuration row of an existing user set.

    The structured arguments (input collection file paths, filter args,
    collection modules, notifier params, run flags and extra params) are
    serialized with ``json.dumps``; every other value is passed to the
    database unchanged. ``user_set_id`` is bound last in the argument tuple
    given to ``SQL.user_set_configuration_update``.

    After the statement is executed with ``commit=True``, an "info"
    notification carrying the ``user_set_id`` is sent to ``notifiers``.
    """
    # NOTE: the ``{}`` default for ``extra_params`` is only serialized below,
    # never stored or mutated, so sharing it between calls is harmless here.
    input_fpaths_json = json.dumps(user_set_input_collection_fpaths)
    filter_args_json = json.dumps(user_set_user_filter_args)
    modules_json = json.dumps(user_set_collection_modules_to_run)
    notifier_params_json = json.dumps(notifier_params)
    run_flags_json = json.dumps(run_flags)
    extra_params_json = json.dumps(extra_params)

    # Positional order must match the placeholders of the update statement.
    update_args = (
        db_credentials_fpath,
        twitter_credential_fpath,
        user_set_name,
        user_set_input_collection,
        input_fpaths_json,
        user_set_description,
        user_set_notes,
        user_set_user_filter,
        filter_args_json,
        modules_json,
        notifier_params_json,
        run_flags_json,
        day_interval,
        extra_params_json,
        user_set_id,
    )
    db.execute_sql(SQL.user_set_configuration_update,
                   args=update_args,
                   commit=True)

    notifications.notify_all(
        notifiers,
        "Updated user set configuration with `user_set_id = {}`".format(
            user_set_id),
        notify_type="info")
Esempio n. 2
0
    def __init__(self, user_set_id, user_set_prefix, db, notifiers=None):
        """Set up the follower collection module and its backing table.

        Args:
            user_set_id: Identifier stored in every row this module writes.
            user_set_prefix: Prefix used to build the table name
                ``<prefix>_user_follower``.
            db: Database wrapper exposing ``execute_sql``.
            notifiers: Optional list of notifiers. When omitted, a new empty
                list is created for this instance.

        Side effects:
            Runs ``CREATE TABLE IF NOT EXISTS`` for the follower table (with
            ``commit=True``) and sends an "info" notification.
        """
        self.user_set_id = user_set_id
        self.name = "followers"
        self.user_set_prefix = user_set_prefix
        self.db = db
        # A literal ``[]`` default would be one list object shared by every
        # instance created without ``notifiers``; build a fresh one instead.
        self.notifiers = [] if notifiers is None else notifiers

        self.table_name = "{}_user_follower".format(user_set_prefix)

        # The table name is interpolated into the SQL text; the row values of
        # the insert statement stay as ``%s`` placeholders.
        self.table_create = """
            CREATE TABLE IF NOT EXISTS {} (
              user_set_id TEXT,
              user_id BIGINT,
              collection_bucket_ts TIMESTAMP,
              collected_ts TIMESTAMP,
              follower_id BIGINT,
              PRIMARY KEY (user_set_id, user_id, collected_ts, follower_id)
            )
        """.format(self.table_name)

        self.table_insert = """
            INSERT INTO {} (user_set_id, user_id, collection_bucket_ts,
            collected_ts, follower_id)
            VALUES (%s, %s, %s, %s, %s)
        """.format(self.table_name)

        db.execute_sql(self.table_create, commit=True)

        notifications.notify_all(
            self.notifiers,
            "Followers collection uploading to `{}`".format(self.table_name),
            notify_type="info")
Esempio n. 3
0
    def collect(self, api, user_ids, collection_bucket_ts):
        """Fetch timelines for ``user_ids`` and return them as insertable rows.

        For each user id, ``self.get_historic_tweets`` is called and one
        tuple is produced per returned tweet:
        ``(user_set_id, user_id, tweet["id_str"], collection_bucket_ts,
        collected_at, json.dumps(tweet))``. ``collected_at`` is taken once per
        user, right after that user's timeline has been fetched.

        A "start" notification is sent before and a "complete" notification
        after the loop.
        """
        notifications.notify_all(
            self.notifiers,
            "Starting timeline collection for {} user_ids".format(
                len(user_ids)),
            notify_type="start")

        rows = []
        for uid in user_ids:
            tweets = self.get_historic_tweets(api, uid)
            fetched_at = datetime.now()
            rows.extend(
                (self.user_set_id, uid, tweet["id_str"], collection_bucket_ts,
                 fetched_at, json.dumps(tweet)) for tweet in tweets)

        notifications.notify_all(
            self.notifiers,
            "Finished collecting timelines for {} user_ids".format(
                len(user_ids)),
            notify_type="complete")

        return rows
Esempio n. 4
0
def setup_userid_module(user_set_prefix, module_name, module_args, notifiers):
    """Instantiate the user-id filter module named by ``module_name``.

    ``module_name`` is evaluated as a Python expression to obtain the class,
    which is then called with ``module_args`` expanded as keyword arguments.
    An "info" notification is sent before the class is resolved.
    ``user_set_prefix`` is accepted but not used in this function.
    """
    notifications.notify_all(
        notifiers,
        "Created '{}' filter".format(module_name),
        notify_type="info",
    )
    # NOTE(review): eval() executes whatever expression ``module_name`` holds.
    # Confirm the name can only come from trusted configuration, or replace
    # this with an explicit name -> class registry.
    return eval("{}".format(module_name))(**module_args)
Esempio n. 5
0
def server_setup(host, user, password, db_name, notifiers):
    """Connect to the database and make sure the bookkeeping tables exist.

    Builds a ``database.Database`` with ``create_if_not_exists=True``, runs
    the ``SQL.user_set_configuration_create`` and
    ``SQL.user_set_metadata_create`` statements, and returns the database
    object.

    Notifications: "start" on entry, "complete" on success. On any exception
    two "error" notifications are sent (a fixed message, then ``str(e)``) and
    the exception is re-raised.
    """
    notifications.notify_all(notifiers,
                             "Beginning data collection setup...",
                             notify_type="start")

    try:
        connection = database.Database(host,
                                       user,
                                       password,
                                       db_name,
                                       create_if_not_exists=True)

        # Both statements are expected to be no-ops when the tables already
        # exist.
        connection.execute_sql(SQL.user_set_configuration_create, commit=True)
        connection.execute_sql(SQL.user_set_metadata_create, commit=True)

        notifications.notify_all(notifiers,
                                 "Finished server setup!",
                                 notify_type="complete")
        return connection

    except Exception as e:
        # Report the failure first, then let the caller see the exception.
        for error_message in ("Server setup failed!", str(e)):
            notifications.notify_all(notifiers,
                                     error_message,
                                     notify_type="error")
        raise e
Esempio n. 6
0
    def upload_data(self, data):
        """Insert ``data`` into this module's table as one batch.

        ``data`` is passed as ``args`` to ``self.db.execute_sql`` together
        with ``self.table_insert``, ``commit=True`` and ``batch_insert=True``.
        A "start" notification is sent before and a "complete" notification
        after the insert; both include ``len(data)``.
        """
        start_message = "Starting {} data upload for {} rows".format(
            self.name, len(data))
        notifications.notify_all(self.notifiers,
                                 start_message,
                                 notify_type="start")

        self.db.execute_sql(sql=self.table_insert,
                            args=data,
                            commit=True,
                            batch_insert=True)

        done_message = "Uploaded {} rows to {}".format(
            len(data), self.table_name)
        notifications.notify_all(self.notifiers,
                                 done_message,
                                 notify_type="complete")
Esempio n. 7
0
    def collect(self, api, user_ids, collection_bucket_ts):
        """Collect the follower ids of every account in ``user_ids``.

        Args:
            api: tweepy API object exposing ``followers_ids``.
            user_ids: Accounts whose followers are fetched.
            collection_bucket_ts: Timestamp stored with every produced row.

        Returns:
            A list of tuples ``(user_set_id, user_id, collection_bucket_ts,
            collected_at, follower_id)``, one per follower id received.

        Behavior:
            * On ``tweepy.RateLimitError`` the method sleeps 15 minutes and
              retries the *same* page, so pages already received are neither
              lost nor fetched twice.
            * On any other ``tweepy.TweepError`` the account is recorded in
              ``error_accounts`` and skipped; follower ids received before
              the error are kept.
            * After an account finishes without error, the method sleeps 60
              seconds before moving on.
        """
        followerships = []
        # Keeping track of which accounts throw errors (not doing anything
        # with this right now though)
        error_accounts = []

        notifications.notify_all(
            self.notifiers,
            "Starting follower collection for {} user_ids".format(
                len(user_ids)),
            notify_type="start")

        for user_id in user_ids:
            collected_at = datetime.now()
            pages = None
            finished_cleanly = False

            while True:
                try:
                    if pages is None:
                        pages = tweepy.Cursor(api.followers_ids,
                                              id=user_id).pages()
                    page = next(pages)
                except StopIteration:
                    finished_cleanly = True
                    break
                except tweepy.RateLimitError:
                    # The page iterator keeps its position when a request
                    # fails, so after waiting out the window the next
                    # iteration asks for the same page again.
                    time.sleep(15 * 60)
                    continue
                except tweepy.TweepError as ex:
                    # ``response`` can be None (e.g. when no HTTP response was
                    # received), so read its fields defensively.
                    response = getattr(ex, "response", None)
                    error_accounts.append(
                        (user_id, getattr(response, "status_code", None),
                         getattr(response, "text", str(ex))))
                    break

                followerships.extend(
                    (self.user_set_id, user_id, collection_bucket_ts,
                     collected_at, follower_id) for follower_id in page)

            if finished_cleanly:
                # We get 15 requests per 15-window or 1 request per 60 seconds
                time.sleep(60)

        notifications.notify_all(
            self.notifiers,
            "Finished collecting followers for {} user_ids".format(
                len(user_ids)),
            notify_type="complete")

        return followerships
Esempio n. 8
0
def insert_new_user_set_config_entry_in_db(db,
                                           notifiers,
                                           db_credentials_fpath,
                                           twitter_credential_fpath,
                                           user_set_name,
                                           user_set_input_collection,
                                           user_set_input_collection_fpaths,
                                           user_set_description,
                                           user_set_notes,
                                           user_set_user_filter,
                                           user_set_user_filter_args,
                                           user_set_collection_modules_to_run,
                                           notifier_params,
                                           run_flags,
                                           day_interval,
                                           extra_params={}):
    """Store a new user-set configuration and return its ``user_set_id``.

    The creation timestamp is ``datetime.utcnow()`` at call time. The
    structured arguments (input collection file paths, filter args,
    collection modules, notifier params, run flags and extra params) are
    serialized with ``json.dumps``; the rest are passed through unchanged.

    The id is read from the first column of the first row returned by
    ``SQL.user_set_configuration_insert``. An "info" notification carrying
    the new id is sent to ``notifiers`` before returning.
    """
    created_at = datetime.utcnow()

    # NOTE: the ``{}`` default for ``extra_params`` is only serialized below,
    # never stored or mutated, so sharing it between calls is harmless here.
    input_fpaths_json = json.dumps(user_set_input_collection_fpaths)
    filter_args_json = json.dumps(user_set_user_filter_args)
    modules_json = json.dumps(user_set_collection_modules_to_run)
    notifier_params_json = json.dumps(notifier_params)
    run_flags_json = json.dumps(run_flags)
    extra_params_json = json.dumps(extra_params)

    # Positional order must match the placeholders of the insert statement.
    insert_args = (
        db_credentials_fpath,
        twitter_credential_fpath,
        user_set_name,
        created_at,
        user_set_input_collection,
        input_fpaths_json,
        user_set_description,
        user_set_notes,
        user_set_user_filter,
        filter_args_json,
        modules_json,
        notifier_params_json,
        run_flags_json,
        day_interval,
        extra_params_json,
    )

    returned_rows = db.execute_sql(SQL.user_set_configuration_insert,
                                   args=insert_args,
                                   commit=True,
                                   fetch=True)
    user_set_id = returned_rows[0][0]

    notifications.notify_all(
        notifiers,
        "Created new user set configuration with `user_set_id = {}`".format(
            user_set_id),
        notify_type="info")

    return user_set_id
Esempio n. 9
0
    def collect_and_upload(self, api, user_ids, notifiers):
        """Run every collection module over ``user_ids`` in chunks of 100.

        For each module and each chunk: print a ``progress=<percent>`` line,
        call ``module.collect``, then ``module.upload_data`` and
        ``module.dump_data`` with the collected rows. The chunk index is
        zero-padded to the width of ``len(user_ids) // 100``.

        After all modules have run, one ``SQL.user_set_metadata_insert`` row
        is written per module with the shared start/end timestamps, and a
        "complete" notification is sent to ``notifiers``.
        """
        collection_bucket_start_ts = datetime.now()
        chunk_size = 100

        for module in self.collection_modules:
            total_users = len(user_ids)
            pad_width = len(str(total_users // chunk_size))

            chunk_starts = range(0, total_users, chunk_size)
            for chunk_index, chunk_start in enumerate(chunk_starts):
                # Progress restarts from 0 for every module.
                percent_done = int(
                    100 * (float(chunk_start) / float(total_users)))
                print("progress=" + str(percent_done))

                user_id_chunk = user_ids[chunk_start:chunk_start + chunk_size]
                chunk_label = str(chunk_index).zfill(pad_width)

                rows = module.collect(api, user_id_chunk,
                                      collection_bucket_start_ts)

                module.upload_data(rows)
                module.dump_data(rows, self.user_set_name,
                                 collection_bucket_start_ts, chunk_label)

        collection_bucket_end_ts = datetime.now()

        # One metadata row per module; all share the same bucket window.
        for module in self.collection_modules:
            metadata_row = (self.user_set_id, collection_bucket_start_ts,
                            collection_bucket_end_ts, module.name, None)
            self.db.execute_sql(SQL.user_set_metadata_insert,
                                args=metadata_row,
                                commit=True)

        finished_message = "Finished collection at {}".format(
            str(datetime.now()))
        notifications.notify_all(notifiers,
                                 finished_message,
                                 notify_type="complete")
0
    def collect(self, api, user_ids, collection_bucket_ts):
        """Collect profile rows for ``user_ids`` in batches of up to 100.

        Each batch is passed to ``pull_user_data`` together with
        ``self.user_set_id`` and ``collection_bucket_ts``; the rows it
        returns are concatenated in batch order and returned. A "start"
        notification is sent before and a "complete" notification after.
        """
        notifications.notify_all(
            self.notifiers,
            "Starting profile data collection for {} user_ids".format(
                len(user_ids)),
            notify_type="start")

        batch_size = 100
        profile_rows = []
        for batch_start in range(0, len(user_ids), batch_size):
            batch = user_ids[batch_start:batch_start + batch_size]
            batch_rows = pull_user_data(api, batch, self.user_set_id,
                                        collection_bucket_ts)
            profile_rows.extend(batch_rows)

        notifications.notify_all(
            self.notifiers,
            "Finished collecting profile data for {} user_ids".format(
                len(user_ids)),
            notify_type="complete")

        return profile_rows
Esempio n. 11
0
    def __init__(self, db, user_set_id, notifiers):
        """Load the stored configuration of ``user_set_id`` and build modules.

        Fetches the configuration row, unpacks every column after the first
        into attributes, derives ``self.user_set_prefix`` as
        ``c<user_set_id>_<user_set_name with spaces replaced by "_">``, then
        creates the filter module and one collection module per entry of
        ``self.user_set_collection_modules_to_run``.

        Raises:
            ValueError: If no configuration row exists for ``user_set_id``.
        """
        self.user_set_id = user_set_id
        self.db = db

        # NOTE(review): the id is formatted into the SQL text rather than
        # bound as a parameter -- confirm ``user_set_id`` is always trusted.
        fetch_statement = SQL.user_set_configuration_fetch.format(user_set_id)
        config_rows = db.execute_sql(fetch_statement, fetch=True)

        if not config_rows:
            raise ValueError(
                "user_set_id {} doesn't exist in the database. You'll need to set parameters to create a new user_set_id or change the id to an existing entry."
                .format(user_set_id))

        # Column 0 of the row is skipped; the remaining columns map onto the
        # attributes below in order.
        (
            self.db_credentials_fpath,
            self.twitter_credential_fpath,
            self.user_set_name,
            self.user_set_creation_ts,
            self.user_set_input_collection,
            self.user_set_input_collection_fpaths,
            self.user_set_description,
            self.user_set_notes,
            self.user_set_user_filter,
            self.user_set_user_filter_args,
            self.user_set_collection_modules_to_run,
            self.notifier_params,
            self.run_flags,
            self.day_interval,
            self.extra_params,
        ) = config_rows[0][1:]

        ## The prefix that will be given to all user-set specific tables
        table_safe_name = self.user_set_name.replace(" ", "_")
        self.user_set_prefix = "c{}_{}".format(self.user_set_id,
                                               table_safe_name)

        notifications.notify_all(
            notifiers,
            "Pulled user set configuration for `user_set_id` = {}".format(
                user_set_id),
            notify_type="info")

        self.filter_module = setup.setup_userid_module(
            self.user_set_prefix, self.user_set_user_filter,
            self.user_set_user_filter_args, notifiers)

        # Each entry needs a "class" key; "args" is optional.
        self.collection_modules = [
            setup.setup_collection_module(self.user_set_id,
                                          self.user_set_prefix,
                                          module_spec["class"],
                                          module_spec.get("args", {}), db,
                                          notifiers)
            for module_spec in self.user_set_collection_modules_to_run
        ]
Esempio n. 12
0
    def __init__(self,
                 user_set_id,
                 user_set_prefix,
                 db,
                 first_timebound_type,
                 timebound_arg,
                 subsequent_timebound_type,
                 notifiers=None):
        """Set up the timeline collection module and its backing table.

        Args:
            user_set_id: Identifier stored in every row this module writes.
            user_set_prefix: Prefix used to build the table name
                ``<prefix>_timeline``.
            db: Database wrapper exposing ``execute_sql``.
            first_timebound_type: Time bound used when the table holds no
                rows for ``user_set_id`` yet; one of ``"number"``,
                ``"date"`` or ``"last_tweet"``.
            timebound_arg: Argument for the chosen time bound. When the
                chosen type is ``"date"`` it is converted with
                ``utils.twitter_str_to_dt``.
            subsequent_timebound_type: Time bound used when rows already
                exist; same allowed values as ``first_timebound_type``.
            notifiers: Optional list of notifiers. When omitted, a new empty
                list is created for this instance.

        Raises:
            ValueError: If either time bound type is not an allowed value.

        Side effects:
            Creates the timeline table if needed, queries it for existing
            rows, and sends an "info" notification.
        """
        allowed_timebound_types = ("number", "date", "last_tweet")
        if (first_timebound_type not in allowed_timebound_types
                or subsequent_timebound_type not in allowed_timebound_types):
            raise ValueError(
                "Please specify `timebound_type` to be 'number', 'date', or 'last_tweet'"
            )

        self.user_set_id = user_set_id
        self.name = "timeline"
        self.user_set_prefix = user_set_prefix
        self.db = db
        # A literal ``[]`` default would be one list object shared by every
        # instance created without ``notifiers``; build a fresh one instead.
        self.notifiers = [] if notifiers is None else notifiers

        self.table_name = "{}_timeline".format(user_set_prefix)

        table_create = """
            CREATE TABLE IF NOT EXISTS {} (
              user_set_id TEXT,
              user_id BIGINT,
              tweet_id BIGINT,
              collection_bucket_ts TIMESTAMP,
              collected_ts TIMESTAMP,
              tweet_json JSON,
              PRIMARY KEY (user_set_id, user_id, tweet_id)
            )
            """.format(self.table_name)
        db.execute_sql(table_create, commit=True)

        # NOTE(review): ``user_set_id`` is formatted into the SQL text of the
        # two SELECTs below instead of being bound as a parameter -- confirm
        # it is always a trusted value.
        select_collection_exists = """
            SELECT COUNT(*)
            FROM {}
            WHERE user_set_id = '{}'
        """.format(self.table_name, user_set_id)
        existing_row_count = db.execute_sql(select_collection_exists,
                                            fetch=True)[0][0]
        collection_exists = existing_row_count > 0

        if collection_exists:
            select_most_recent_tweet_id = """
                SELECT MAX(tweet_id)
                FROM {}
                WHERE user_set_id = '{}'
            """.format(self.table_name, self.user_set_id)
            # Only set when earlier rows exist for this user set.
            self.most_recent_tweet_id = db.execute_sql(
                select_most_recent_tweet_id, fetch=True)[0][0]
            self.timebound_type = subsequent_timebound_type
        else:
            self.timebound_type = first_timebound_type

        if self.timebound_type == "date":
            self.timebound_arg = utils.twitter_str_to_dt(timebound_arg)
        else:
            self.timebound_arg = timebound_arg

        self.table_insert = """
            INSERT INTO {} (user_set_id, user_id, tweet_id, collection_bucket_ts, collected_ts, tweet_json)
            VALUES (%s, %s, %s, %s, %s, %s)
        """.format(self.table_name)

        notifications.notify_all(
            self.notifiers,
            "Historic timeline collection uploading to `{}`".format(
                self.table_name),
            notify_type="info")