Example #1
0
    seek_confirmation()
    #exit()

    for model_name in ["logistic_regression", "multinomial_nb"]:

        storage = ModelStorage(dirpath=f"nlp_v2/models/best/{model_name}")
        tv = storage.load_vectorizer()
        clf = storage.load_model()

        print(f"DESTROY PREDICTIONS TABLE? ({model_name})")
        seek_confirmation()
        bq_service.nlp_v2_destructively_migrate_predictions_table(model_name)
        predictions_table = bq_service.nlp_v2_get_predictions_table(model_name) # API call. cache it here once.

        job.start()

        for chunk_df in read_csv(CSV_FILEPATH, chunksize=BATCH_SIZE): # FYI: this will include the last chunk even if it is not a full batch
            status_ids = chunk_df["status_id"].tolist()
            status_texts = chunk_df["status_text"].tolist()

            preds = clf.predict(tv.transform(status_texts))

            batch = [{"status_id": status_id, "prediction": pred} for status_id, pred in zip(status_ids, preds)]
            bq_service.insert_records_in_batches(predictions_table, batch)

            job.counter += len(chunk_df)
            job.progress_report()
            batch = []

        job.end()
Example #2
0
class Collector:
    """Re-collect statuses from the Twitter API and store them in BigQuery.

    Status ids that do not yet have a row in the `recollected_statuses` table
    are looked up in batches. One row per looked-up status is written to
    `recollected_statuses`, and one row per url entity is written to
    `recollected_status_urls`.
    """

    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT  # max number of ids fetched per call to perform()
        self.batch_size = BATCH_SIZE  # number of ids sent per lookup request

    def fetch_remaining_status_ids(self):
        """Return up to `self.limit` status ids that have not been recollected yet.

        An id counts as remaining when it is listed in `all_status_ids` but has
        no matching row in `recollected_statuses`.
        """
        sql = f"""
            SELECT DISTINCT a.status_id
            FROM `{self.bq_service.dataset_address}.all_status_ids` a
            LEFT JOIN `{self.bq_service.dataset_address}.recollected_statuses` completed ON completed.status_id = a.status_id
            WHERE completed.status_id IS NULL
            LIMIT {self.limit}
        """
        # iterate the query result directly; there is no need to copy it into a list first
        return [row["status_id"] for row in self.bq_service.execute_query(sql)]

    def perform(self):
        """Process every remaining status id in batches, or sleep when none remain."""
        remaining_status_ids = self.fetch_remaining_status_ids()
        # test for emptiness, not for the truthiness of the individual ids
        if not remaining_status_ids:
            print("OH ALL DONE! SLEEPING...")
            server_sleep(10 * 60 * 60)
            return

        for batch_of_ids in split_into_batches(remaining_status_ids,
                                               batch_size=self.batch_size):
            self.process_batch(batch_of_ids)

    def lookup_statuses(self, status_ids):
        """Fetch full status info including urls, and full text.
            Max per request is 100, so batch size must not exceed that.
            See:
                https://docs.tweepy.org/en/stable/api.html#API.statuses_lookup
                https://developer.twitter.com/en/docs/twitter-api/v1/tweets/post-and-engage/api-reference/get-statuses-lookup
        """
        return self.twitter_api.statuses_lookup(
            id_=status_ids,
            # this is where the full urls are
            include_entities=True,
            # we already have this info
            trim_user=True,
            # If alt text has been added to any attached media entities, this
            # parameter will return an ext_alt_text value in the top-level key
            # for the media entity. If no value has been set, this will be
            # returned as null.
            include_ext_alt_text=True,
            include_card_uri=False,
            # "Tweets that do not exist or cannot be viewed by the current user
            # will still have their key represented but with an explicitly null
            # value paired with it"
            map_=True,
            tweet_mode="extended")

    def process_batch(self, status_ids):
        """Look up one batch of status ids and save the resulting rows.

        Every status returned by the lookup produces one status record. A
        status whose payload only contains an "id" field is treated as a failed
        lookup and keeps null user / text / created_at values. Successful
        lookups additionally produce one url record per url entity.
        """
        recollected_statuses = []
        recollected_urls = []
        success_counter = 0
        for status in self.lookup_statuses(status_ids):
            # when passing param map_=True to Twitter API, if statuses are not
            # available, the status will be present, but will only have an id field
            status_id = status.id  # all statuses will have an id

            # represent failed lookups with null text values
            recollected_status = {
                "status_id": status_id,
                "user_id": None,
                "full_text": None,
                "created_at": None,
                "lookup_at": generate_timestamp()
            }
            # "id" will be the only field for empty statuses. otherwise try to parse them:
            if list(status._json) != ["id"]:
                success_counter += 1
                recollected_status["user_id"] = status.user.id
                # update the full text if possible
                recollected_status["full_text"] = parse_full_text(status)
                recollected_status["created_at"] = generate_timestamp(
                    status.created_at)
                for url in status.entities["urls"]:
                    recollected_urls.append({
                        "status_id": status_id,
                        "expanded_url": url["expanded_url"]
                    })
            recollected_statuses.append(recollected_status)

        print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}",
              "| STATUSES:", success_counter, "| URLS:", len(recollected_urls))
        self.save_statuses(recollected_statuses)
        self.save_urls(recollected_urls)

    def save_statuses(self, recollected_statuses):
        """Insert the given status records into the recollected statuses table."""
        self.bq_service.insert_records_in_batches(
            self.recollected_statuses_table, recollected_statuses)

    def save_urls(self, recollected_urls):
        """Insert the given url records into the recollected status urls table."""
        self.bq_service.insert_records_in_batches(self.recollected_urls_table,
                                                  recollected_urls)

    def _get_table(self, table_name):
        """Return the table object for `table_name`, fetched at most once per instance.

        The cache lives in the instance's own `__dict__` rather than in a
        class-level `lru_cache`, so it is released together with the instance
        instead of keeping every Collector alive.
        """
        # setdefault keeps this working for instances created without __init__
        tables = self.__dict__.setdefault("_tables", {})
        if table_name not in tables:
            tables[table_name] = self.bq_service.client.get_table(
                f"{self.bq_service.dataset_address}.{table_name}")
        return tables[table_name]

    @property
    def recollected_statuses_table(self):
        """The `recollected_statuses` table object (cached after first access)."""
        return self._get_table("recollected_statuses")

    @property
    def recollected_urls_table(self):
        """The `recollected_status_urls` table object (cached after first access)."""
        return self._get_table("recollected_status_urls")
Example #3
0
        try:
            user = twitter_service.get_user(row["user_id"])
            lookup["follower_count"] = int(user.followers_count)
            lookup["friend_count"] = int(user.friends_count)
            lookup["listed_count"] = int(user.listed_count)
            lookup["status_count"] = int(user.statuses_count)
            try:
                # it is possible that... 'User' object has no attribute 'status'
                lookup["latest_status_id"] = int(user.status.id)
            except:
                pass
        except TweepError as err:
            # see: https://developer.twitter.com/ja/docs/basics/response-codes
            # ... 63 means user has been suspended, etc.
            lookup["error_code"] = err.api_code

        print(index, lookup)
        lookups.append(lookup)

    table = bq_service.client.get_table(f"{DATASET_ADDRESS}.user_lookups")
    bq_service.insert_records_in_batches(records=lookups, table=table)

    print(bq_service.query_to_df(f"SELECT count(distinct user_id) FROM `{DATASET_ADDRESS}.user_lookups`"))

    print("JOB COMPLETE...")

    if APP_ENV=="production":
        print("SLEEPING...")
        sleep(10 * 60 * 60) # let the server rest while we have time to shut it down
        exit() # don't try to do more work