Exemple #1
0
class Collector:
    """Re-collect full tweet details for status ids that have not been looked up yet.

    Pending ids are read from the ``all_status_ids`` table, looked up through
    the Twitter API in batches, and the results are written to the
    ``recollected_statuses`` and ``recollected_status_urls`` tables.
    """

    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT  # used as the SQL LIMIT when fetching pending ids
        self.batch_size = BATCH_SIZE  # number of ids handed to each lookup request

    def fetch_remaining_status_ids(self):
        """Return up to ``self.limit`` status ids with no row in ``recollected_statuses``."""
        sql = f"""
            SELECT DISTINCT a.status_id
            FROM `{self.bq_service.dataset_address}.all_status_ids` a
            LEFT JOIN `{self.bq_service.dataset_address}.recollected_statuses` completed ON completed.status_id = a.status_id
            WHERE completed.status_id IS NULL
            LIMIT {self.limit}
        """
        # Iterate the query result directly; no need to build a throwaway list.
        return [row["status_id"] for row in self.bq_service.execute_query(sql)]

    def perform(self):
        """Process every pending status id in batches, or sleep when none remain."""
        remaining_status_ids = self.fetch_remaining_status_ids()
        # any() (rather than plain truthiness) also treats a result holding
        # only falsy ids (e.g. NULL status ids) as "nothing left to do".
        if not any(remaining_status_ids):
            print("OH ALL DONE! SLEEPING...")
            # 10 * 60 * 60 = ten hours, assuming server_sleep takes seconds -- TODO confirm
            server_sleep(10 * 60 * 60)
            return

        for batch_of_ids in split_into_batches(remaining_status_ids,
                                               batch_size=self.batch_size):
            self.process_batch(batch_of_ids)

    def lookup_statuses(self, status_ids):
        """Fetch full status info including urls, and full text.
            Max per request is 100, so batch size must be smaller than that.
            See:
                https://docs.tweepy.org/en/stable/api.html#API.statuses_lookup
                https://developer.twitter.com/en/docs/twitter-api/v1/tweets/post-and-engage/api-reference/get-statuses-lookup
        """
        return self.twitter_api.statuses_lookup(
            id_=status_ids,
            # this is where the full urls are
            include_entities=True,
            # we already have this info
            trim_user=True,
            # If alt text has been added to any attached media entities, this
            # parameter will return an ext_alt_text value in the top-level key
            # for the media entity. If no value has been set, this will be
            # returned as null.
            include_ext_alt_text=True,
            include_card_uri=False,
            # "Tweets that do not exist or cannot be viewed by the current user
            # will still have their key represented but with an explicitly
            # null value paired with it"
            map_=True,
            tweet_mode="extended",
        )

    def process_batch(self, status_ids):
        """Look up one batch of status ids and save the resulting statuses and urls.

        Every looked-up status produces a status record; statuses that could
        not be retrieved keep ``None`` for user id, text and creation time.
        One url record is produced per url entity of each retrieved status.
        """
        recollected_statuses = []
        recollected_urls = []
        success_counter = 0
        for status in self.lookup_statuses(status_ids):
            # when passing param map_=True to Twitter API, if statuses are not
            # available, the status will be present, but will only have an id field
            status_id = status.id  # all statuses will have an id

            # represent failed lookups with null text values
            recollected_status = {
                "status_id": status_id,
                "user_id": None,
                "full_text": None,
                "created_at": None,
                "lookup_at": generate_timestamp()
            }

            # "id" is the only field for empty statuses. otherwise try to parse them:
            if status._json.keys() != {"id"}:
                success_counter += 1
                recollected_status["user_id"] = status.user.id
                # update the full text if possible
                recollected_status["full_text"] = parse_full_text(status)
                recollected_status["created_at"] = generate_timestamp(
                    status.created_at)
                recollected_urls.extend(
                    {"status_id": status_id, "expanded_url": url["expanded_url"]}
                    for url in status.entities["urls"])
            recollected_statuses.append(recollected_status)

        print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}",
              "| STATUSES:", success_counter, "| URLS:", len(recollected_urls))
        self.save_statuses(recollected_statuses)
        self.save_urls(recollected_urls)

    def save_statuses(self, recollected_statuses):
        """Insert the status records into the ``recollected_statuses`` table."""
        self.bq_service.insert_records_in_batches(
            self.recollected_statuses_table, recollected_statuses)

    def save_urls(self, recollected_urls):
        """Insert the url records into the ``recollected_status_urls`` table."""
        self.bq_service.insert_records_in_batches(self.recollected_urls_table,
                                                  recollected_urls)

    def _cached_table(self, table_name):
        """Return the table object for ``table_name``, fetching it at most once per instance.

        The cache lives in the instance dict instead of ``functools.lru_cache``:
        an ``lru_cache`` on a method is stored on the class and keyed on
        ``self``, which keeps every instance alive for the life of the process.
        """
        # setdefault keeps this working for instances created without __init__.
        cache = self.__dict__.setdefault("_table_cache", {})
        if table_name not in cache:
            cache[table_name] = self.bq_service.client.get_table(
                f"{self.bq_service.dataset_address}.{table_name}")
        return cache[table_name]

    @property
    def recollected_statuses_table(self):
        """Table object for ``recollected_statuses`` (fetched once, then cached)."""
        return self._cached_table("recollected_statuses")

    @property
    def recollected_urls_table(self):
        """Table object for ``recollected_status_urls`` (fetched once, then cached)."""
        return self._cached_table("recollected_status_urls")
Exemple #2
0
            ,b.bot_id
            -- ,b.bot_screen_name
            --,b.day_count
            --,b.avg_daily_score
            ,count(distinct t.status_id) as tweet_count
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_screen_name), ' | ') , "")   as screen_names
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_name), ' | ')        , "")   as user_names
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_description), ' | ') , "")   as user_descriptions
        FROM impeachment_production.bots_above_80 b
        JOIN impeachment_production.2_bot_communities c ON c.user_id = b.bot_id
        JOIN impeachment_production.tweets t on cast(t.user_id as int64) = b.bot_id
        GROUP BY 1,2
        ORDER BY 1,2
    """ # TODO: move me into the BQ service

    results = [dict(row) for row in list(bq_service.execute_query(sql))]
    print("PROCESSING", len(results), "RECORDS...")

    for i, row in enumerate(results):
        row["profile_tokens"] = []
        row["profile_lemmas"] = []
        row["profile_tags"] = []
        row["profile_handles"] = []

        if row["user_descriptions"]:
            #print("--------------")
            #print("COMMUNITY", row["community_id"], i, row["bot_id"], row["screen_names"])
            #print(row["user_descriptions"])

            # we want unique tokens here because otherwise someone changing their sn will have a greater influence over the counts
            tokens = list(set(tokenizer.custom_stems(