    def __init__(self,
                 bq_service=None,
                 twitter_service=None,
                 user_limit=USER_LIMIT,
                 friend_limit=FRIEND_LIMIT):
        self.bq_service = bq_service or BigQueryService()
        self.twitter_service = twitter_service or TwitterService()

        self.dataset_address = self.bq_service.dataset_address
        self.user_limit = int(user_limit)
        self.friend_limit = int(friend_limit)

        print("---------------------------")
        print("JOB: FRIEND LOOKUPS")
        print("DATASET:", self.dataset_address.upper())
        print("USER LIMIT:", self.user_limit)
        print("FRIEND LIMIT:", self.friend_limit)
        print("---------------------------")
Example 2

    def __init__(self,
                 bq_service=None,
                 twitter_service=None,
                 user_limit=USER_LIMIT,
                 status_limit=STATUS_LIMIT):
        self.bq_service = bq_service or BigQueryService()
        self.twitter_service = twitter_service or TwitterService()

        self.dataset_address = self.bq_service.dataset_address
        self.user_limit = int(user_limit)
        self.status_limit = int(status_limit)

        self.parse_status = parse_timeline_status

        print("---------------------------")
        print("JOB: TIMELINE LOOKUPS")
        print("DATASET:", self.dataset_address.upper())
        print("USER LIMIT:", self.user_limit)
        print("STATUS LIMIT:", self.status_limit)
        print("---------------------------")
Example 3
    def __init__(self, twitter_service=None, storage_env=STORAGE_ENV, bq_service=None, csv_service=None, batch_size=BATCH_SIZE):
        self.twitter_service = twitter_service or TwitterService()
        self.api = self.twitter_service.api
        self.auth = self.api.auth
        self.parse_status = parse_status

        self.storage_env = storage_env
        if self.storage_env == "local":
            self.storage_service = csv_service or LocalStorageService()
        elif self.storage_env == "remote":
            self.storage_service = bq_service or BigQueryService()
        else:
            raise ValueError("Expecting the STORAGE_ENV to be 'local' or 'remote'. Please try again...")

        self.batch_size = batch_size
        self.batch = []
        self.counter = 0

        print("-------------------------------")
        print("STREAM LISTENER...")
        print("  STORAGE ENV:", self.storage_env.upper())
        print("  STORAGE SERVICE:", type(self.storage_service))
        print("  BATCH SIZE:", self.batch_size)
        print("--------------------------------")
Example 4
import os

from pandas import DataFrame
from dotenv import load_dotenv

from app import DATA_DIR, seek_confirmation
from app.decorators.datetime_decorators import logstamp
from app.bq_service import BigQueryService
from app.twitter_service import TwitterService

load_dotenv()

BATCH_SIZE = int(os.getenv("BATCH_SIZE", default=100))  # max number of processed users to store in BQ with a single insert API call; must be less than 10,000 to avoid errors

if __name__ == "__main__":

    bq_service = BigQueryService()
    twitter_service = TwitterService()

    rows = list(bq_service.fetch_idless_screen_names())
    row_count = len(rows)
    print("-------------------------")
    print(f"FETCHED {row_count} SCREEN NAMES")
    print("BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")

    seek_confirmation()
    bq_service.migrate_user_id_lookups_table()

    batch = []
    for index, row in enumerate(rows):
        counter = index + 1
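The loop is truncated in the source listing. One plausible continuation of the accumulate-and-flush pattern it sets up (not from the original code):

        # plausible continuation, not from the source: buffer each row and
        # flush when the batch is full or the final row has been reached
        batch.append(row)
        if len(batch) >= BATCH_SIZE or counter == row_count:
            print(logstamp(), f"| SAVING BATCH OF {len(batch)} | {counter} / {row_count}")
            batch = []  # a real flush would insert the rows into BQ here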
Example 5
    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT
        self.batch_size = BATCH_SIZE
Example 6
class Collector:
    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT
        self.batch_size = BATCH_SIZE

    def fetch_remaining_status_ids(self):
        sql = f"""
            SELECT DISTINCT a.status_id
            FROM `{self.bq_service.dataset_address}.all_status_ids` a
            LEFT JOIN `{self.bq_service.dataset_address}.recollected_statuses` completed ON completed.status_id = a.status_id
            WHERE completed.status_id IS NULL
            LIMIT {self.limit}
        """
        return [
            row["status_id"]
            for row in list(self.bq_service.execute_query(sql))
        ]

    def perform(self):
        remaining_status_ids = self.fetch_remaining_status_ids()
        if any(remaining_status_ids):
            for batch_of_ids in split_into_batches(remaining_status_ids,
                                                   batch_size=self.batch_size):
                self.process_batch(batch_of_ids)
        else:
            print("OH ALL DONE! SLEEPING...")
            server_sleep(10 * 60 * 60)

    def lookup_statuses(self, status_ids):
        """Fetch full status info including urls, and full text.
            Max per request is 100, so batch size must be smaller than that.
            See:
                https://docs.tweepy.org/en/stable/api.html#API.statuses_lookup
                https://developer.twitter.com/en/docs/twitter-api/v1/tweets/post-and-engage/api-reference/get-statuses-lookup
        """
        return self.twitter_api.statuses_lookup(
            id_=status_ids,
            include_entities=True,  # this is where the full urls are
            trim_user=True,  # we already have this info
            include_ext_alt_text=True,  # returns ext_alt_text for media entities that have alt text (null otherwise)
            include_card_uri=False,
            map_=True,  # "Tweets that do not exist or cannot be viewed by the current user will still have their key represented but with an explicitly null value paired with it"
            tweet_mode="extended"
        )

    def process_batch(self, status_ids):
        recollected_statuses = []
        recollected_urls = []
        success_counter = 0
        for status in self.lookup_statuses(status_ids):
            # when passing param map_=True to Twitter API, if statuses are not available, the status will be present, but will only have an id field
            status_id = status.id  # all statuses will have an id

            recollected_status = {
                "status_id": status_id,
                "user_id": None,
                "full_text": None,
                "created_at": None,
                "lookup_at": generate_timestamp()
            }  # represent failed lookups with null text values

            if list(status._json.keys()) != ["id"]:  # "id" is the only field for empty statuses. otherwise try to parse them:
                success_counter += 1
                recollected_status["user_id"] = status.user.id
                recollected_status["full_text"] = parse_full_text(status)  # update the full text if possible
                recollected_status["created_at"] = generate_timestamp(status.created_at)
                for url in status.entities["urls"]:
                    recollected_urls.append({
                        "status_id": status_id,
                        "expanded_url": url["expanded_url"]
                    })

            recollected_statuses.append(recollected_status)

        print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}",
              "| STATUSES:", success_counter, "| URLS:", len(recollected_urls))
        self.save_statuses(recollected_statuses)
        self.save_urls(recollected_urls)

    def save_statuses(self, recollected_statuses):
        self.bq_service.insert_records_in_batches(self.recollected_statuses_table, recollected_statuses)

    def save_urls(self, recollected_urls):
        self.bq_service.insert_records_in_batches(self.recollected_urls_table, recollected_urls)

    @property
    @lru_cache(maxsize=None)
    def recollected_statuses_table(self):
        return self.bq_service.client.get_table(
            f"{self.bq_service.dataset_address}.recollected_statuses")

    @property
    @lru_cache(maxsize=None)
    def recollected_urls_table(self):
        return self.bq_service.client.get_table(
            f"{self.bq_service.dataset_address}.recollected_status_urls")
Example 7
import os

from dotenv import load_dotenv

from app import seek_confirmation
from app.bq_service import BigQueryService
from app.twitter_service import TwitterService

load_dotenv()

DATASET_ADDRESS = os.getenv("DATASET_ADDRESS", default="tweet-collector-py.disinfo_2021_development")
SEARCH_TERM = os.getenv("SEARCH_TERM", default="#WWG1WGA")
LIMIT = os.getenv("LIMIT") # None is OK

#class UserLookupJob:
#    def __init__(self):
#        pass

if __name__ == '__main__':

    bq_service = BigQueryService()
    twitter_service = TwitterService()

    print("SEARCH_TERM:", SEARCH_TERM)
    print("LIMIT:", LIMIT)
    print(bq_service.query_to_df(f"SELECT count(distinct user_id) FROM `{DATASET_ADDRESS}.user_lookups`"))

    seek_confirmation()

    sql = f"""
        SELECT DISTINCT u.user_id
        FROM (
            SELECT DISTINCT cast(user_id as INT64) as user_id
            FROM `{DATASET_ADDRESS}.tweets`
            WHERE REGEXP_CONTAINS(upper(status_text), '{SEARCH_TERM}')
        ) u
        LEFT JOIN (