Example #1
    def __init__(self, dirpath=None, gcs_service=None, wifi=WIFI):
        """
        Saves and loads files, using local storage and/or Google Cloud Storage.

        Params:
            dirpath (str) a subpath of the data dir
            gcs_service (GoogleCloudStorageService) optional, a pre-constructed storage service client
            wifi (bool) whether or not to attempt uploads

        """
        self.wifi = wifi
        self.gcs_service = gcs_service or GoogleCloudStorageService()

        self.dirpath = dirpath or DIRPATH
        self.gcs_dirpath = self.compile_gcs_dirpath(self.dirpath)
        self.local_dirpath = self.compile_local_dirpath(self.dirpath)

        #print("-------------------------")
        print("FILE STORAGE...")
        print("   DIRPATH:", self.dirpath)
        print("   GCS DIRPATH:", self.gcs_dirpath)
        print("   LOCAL DIRPATH:", os.path.abspath(self.local_dirpath))
        print("   WIFI ENABLED:", self.wifi)

        seek_confirmation()

        if not os.path.exists(self.local_dirpath):
            os.makedirs(self.local_dirpath)
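
A minimal usage sketch for this constructor; the enclosing class name (FileStorage below) is not shown above and is assumed:

# hypothetical usage; the class name FileStorage is assumed
storage = FileStorage(dirpath="graphs/my_graph/123", wifi=False)  # wifi=False skips GCS uploads
# after construction, storage.local_dirpath exists on disk and is ready for reads/writes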
Example #2
    def __init__(self, n_clusters=N_COMMUNITIES):
        self.n_clusters = n_clusters
        self.classifier = SpectralClustering(n_clusters=self.n_clusters,
                                             eigen_solver=None,
                                             affinity="precomputed",
                                             n_init=20)

        self.grapher = BotSimilarityGrapher()
        self.local_dirpath = os.path.join(self.grapher.local_dirpath,
                                          "n_communities",
                                          str(self.n_clusters))
        self.gcs_dirpath = os.path.join(self.grapher.gcs_dirpath,
                                        "n_communities", str(self.n_clusters))
        self.local_bot_communities_filepath = os.path.join(
            self.local_dirpath, "community_assignments.csv")
        self.gcs_bot_communities_filepath = os.path.join(
            self.gcs_dirpath, "community_assignments.csv")

        print("-----------------------")
        print("SPECTRAL CLUSTERMAKER")
        print("   N CLUSTERS:", self.n_clusters)
        print("   CLASSIFIER:", type(self.classifier))
        print("   LOCAL DIRPATH:", os.path.abspath(self.local_dirpath))
        print("   GCS DIRPATH:", self.gcs_dirpath)

        seek_confirmation()

        if not os.path.exists(self.local_dirpath):
            os.makedirs(self.local_dirpath)

        self.grapher.similarity_graph_report()  # load bot similarity graph
        self.similarity_graph = self.grapher.similarity_graph

        self.community_assignments = None
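
A minimal usage sketch; the class name SpectralClustermaker is inferred from the printed banner and is an assumption:

# hypothetical usage; the class name SpectralClustermaker is assumed
clustermaker = SpectralClustermaker(n_clusters=7)
# the constructor loads the bot similarity graph via its BotSimilarityGrapher
print(type(clustermaker.similarity_graph))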
Example #3
    def __init__(self, topic=TOPIC, tweets_start_at=TWEETS_START_AT, tweets_end_at=TWEETS_END_AT,
                        users_limit=USERS_LIMIT, batch_size=BATCH_SIZE,
                        storage_dirpath=None, bq_service=None):

        Job.__init__(self)
        GraphStorage.__init__(self, dirpath=storage_dirpath)
        self.bq_service = bq_service or BigQueryService()
        self.fetch_edges = self.bq_service.fetch_retweet_edges_in_batches_v2  # method alias, to keep later calls shorter

        # CONVERSATION PARAMS (OPTIONAL)

        self.topic = topic
        self.tweets_start_at = tweets_start_at
        self.tweets_end_at = tweets_end_at

        # PROCESSING PARAMS

        self.users_limit = users_limit
        if self.users_limit:
            self.users_limit = int(self.users_limit)

        self.batch_size = int(batch_size)

        print("-------------------------")
        print("RETWEET GRAPHER...")
        print("  USERS LIMIT:", self.users_limit)
        print("  BATCH SIZE:", self.batch_size)
        print("  DRY RUN:", DRY_RUN)
        print("-------------------------")
        print("CONVERSATION PARAMS...")
        print("  TOPIC:", self.topic)
        print("  TWEETS START:", self.tweets_start_at)
        print("  TWEETS END:", self.tweets_end_at)

        seek_confirmation()
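
A minimal usage sketch; the class name RetweetGrapher is inferred from the printed banner and is an assumption:

# hypothetical usage; the class name RetweetGrapher and the parameter values are illustrative
grapher = RetweetGrapher(topic="#MyTopic", users_limit=10000, batch_size=2500)
# grapher.fetch_edges is an alias for bq_service.fetch_retweet_edges_in_batches_v2 (signature not shown)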
Example #4
    def __init__(self, dirpath=None, gcs_service=None):
        """
        Saves and loads artifacts from the networkx graph compilation process, using local storage and/or Google Cloud Storage.

        Params:
            dirpath (str) like "graphs/my_graph/123"

        TODO: bot probability logic only applies to bot retweet graphs, and should probably be moved into a child graph storage class
        """

        self.gcs_service = gcs_service or GoogleCloudStorageService()

        self.dirpath = dirpath or DIRPATH
        self.gcs_dirpath = os.path.join("storage", "data", self.dirpath)
        self.local_dirpath = os.path.join(
            DATA_DIR, self.dirpath
        )  # TODO: to make compatible on windows, split the dirpath on "/" and re-join using os.sep

        print("-------------------------")
        print("GRAPH STORAGE...")
        print("   DIRPATH:", self.dirpath)
        print("   GCS DIRPATH:", self.gcs_dirpath)
        print("   LOCAL DIRPATH:", os.path.abspath(self.local_dirpath))
        print("   WIFI ENABLED:", WIFI_ENABLED)

        seek_confirmation()

        if not os.path.exists(self.local_dirpath):
            os.makedirs(self.local_dirpath)

        self.results = None
        self.graph = None
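
A minimal usage sketch; GraphStorage is the class name used by the subclasses in the other examples:

# usage sketch; the dirpath value is illustrative
storage = GraphStorage(dirpath="graphs/my_graph/123")
# storage.gcs_dirpath   #> "storage/data/graphs/my_graph/123"
# storage.local_dirpath is created under DATA_DIR if it doesn't already exist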
Example #5
    def __init__(self,
                 start_date=START_DATE,
                 k_days=K_DAYS,
                 n_periods=N_PERIODS):
        """
        Generates a list of date ranges.

        Params:
            start_date (str) the first period start date, like "2020-01-01"
            k_days (int) number of days in each period
            n_periods (int) number of periods
        """
        self.start_date = start_date
        self.k_days = int(k_days)
        self.n_periods = int(n_periods)

        print("-------------------------")
        print("DATE RANGE GENERATOR...")
        print("  START DATE:", self.start_date)
        print("  K DAYS:", self.k_days)
        print("  N PERIODS:", self.n_periods)

        print("-------------------------")
        print("DATE RANGES...")
        self.date_ranges = self.get_date_ranges(start_date=self.start_date,
                                                k_days=self.k_days,
                                                n_periods=self.n_periods)
        pprint(self.date_ranges)
        seek_confirmation()
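
A minimal usage sketch; the class name DateRangeGenerator is inferred from the printed banner and is an assumption:

# hypothetical usage; the class name DateRangeGenerator is assumed
gen = DateRangeGenerator(start_date="2020-01-01", k_days=7, n_periods=4)
# gen.date_ranges holds four consecutive 7-day ranges beginning 2020-01-01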
Example #6
    def __init__(self, dirpath=None, gcs_service=None):
        """
        Saves and loads artifacts from the networkx graph compilation process
            ...to and from local storage and/or Google Cloud Storage.

        Params:
            dirpath (str) like "graphs/my_graph/123"
        """
        self.gcs_service = gcs_service or GoogleCloudStorageService()

        self.dirpath = dirpath or DIRPATH
        self.gcs_dirpath = os.path.join("storage", "data", self.dirpath)
        self.local_dirpath = os.path.join(DATA_DIR, self.dirpath) # TODO: to make compatible on windows, split the dirpath on "/" and re-join using os.sep

        print("-------------------------")
        print("GRAPH STORAGE...")
        print("   DIRPATH:",  self.dirpath)
        print("   GCS DIRPATH:", self.gcs_dirpath)
        print("   LOCAL DIRPATH:", os.path.abspath(self.local_dirpath))

        seek_confirmation()

        if not os.path.exists(self.local_dirpath):
            os.makedirs(self.local_dirpath)

        self.results = None
        self.graph = None
Example #7
    def __init__(self,
                 project_name=PROJECT_NAME,
                 dataset_name=DATASET_NAME,
                 verbose=VERBOSE_QUERIES,
                 destructive=DESTRUCTIVE_MIGRATIONS):
        self.project_name = project_name
        self.dataset_name = dataset_name
        self.dataset_address = f"{self.project_name}.{self.dataset_name}"

        self.verbose = (verbose == True)
        self.destructive = (destructive == True)

        self.client = bigquery.Client()

        print("-------------------------")
        print("BIGQUERY SERVICE...")
        print("  DATASET ADDRESS:", self.dataset_address.upper())
        print("  DESTRUCTIVE MIGRATIONS:", self.destructive)
        print("  VERBOSE QUERIES:", self.verbose)

        seek_confirmation()
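
A minimal usage sketch; BigQueryService is the class name used throughout the other examples, and the project and dataset values are illustrative:

# usage sketch; project and dataset names are placeholders
bq_service = BigQueryService(project_name="my-project", dataset_name="my_dataset", verbose=True, destructive=False)
# bq_service.dataset_address   #> "my-project.my_dataset"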
Example #8
    def delete_temp_tables_older_than(self, days=3):
        """Deletes all tables that:
            have "temp_" in their name (product of the batch jobs), and were
            created at least X days ago (safely avoid deleting tables being used by in-progress batch jobs)
        """
        cutoff_date = datetime.now(tz=timezone.utc) - timedelta(days=days)
        print("CUTOFF DATE:", cutoff_date)

        tables = list(self.client.list_tables(self.dataset_name))  # API call
        tables_to_delete = [
            t for t in tables
            if "temp_" in t.table_id and t.created < cutoff_date
        ]
        print("TABLES TO DELETE:")
        pprint([t.table_id for t in tables_to_delete])
        seek_confirmation()

        print("DELETING...")
        for old_temp_table in tables_to_delete:
            print("  ", old_temp_table.table_id)
            self.client.delete_table(old_temp_table)
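
A usage sketch for the cleanup method above; it lists the matching tables and prompts for confirmation before deleting anything:

# remove "temp_" tables created more than a week ago
bq_service.delete_temp_tables_older_than(days=7)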
Example #9
    def __init__(self,
                 bq_service=None,
                 bot_min=BOT_MIN,
                 batch_size=BATCH_SIZE,
                 storage_dirpath=None):
        self.bq_service = bq_service or BigQueryService()
        self.bot_min = bot_min
        self.batch_size = batch_size

        Job.__init__(self)

        storage_dirpath = storage_dirpath or f"bot_follower_graphs/bot_min/{self.bot_min}"
        GraphStorage.__init__(self, dirpath=storage_dirpath)

        print("-------------------------")
        print("BOT FOLLOWER GRAPHER...")
        print("  BOT MIN:", self.bot_min)
        print("  BATCH SIZE:", self.batch_size)
        print("-------------------------")

        seek_confirmation()
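
A minimal usage sketch; the class name BotFollowerGrapher is inferred from the printed banner and is an assumption:

# hypothetical usage; the class name BotFollowerGrapher is assumed
grapher = BotFollowerGrapher(bot_min=0.8, batch_size=10000)
# its storage dirpath defaults to "bot_follower_graphs/bot_min/0.8"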
Example #10
    def __init__(self,
                 limit=LIMIT,
                 batch_size=BATCH_SIZE,
                 bq_service=None,
                 model_manager=None):
        self.limit = limit
        self.batch_size = batch_size
        self.bq_service = bq_service or BigQueryService()
        self.mgr = model_manager or ModelManager()

        print("----------------")
        print("TOXICITY SCORER...")
        print("  MODEL CHECKPOINT:", self.mgr.checkpoint_name.upper(),
              self.mgr.checkpoint_url)
        print("  SCORES TABLE NAME:", self.scores_table_name)
        print("  LIMIT:", fmt_n(self.limit))
        print("  BATCH SIZE:", fmt_n(self.batch_size))

        self.predict = self.mgr.predict_scores  # method alias

        seek_confirmation()
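
A minimal usage sketch; the class name ToxicityScorer is inferred from the printed banner and is an assumption:

# hypothetical usage; the class name ToxicityScorer is assumed
scorer = ToxicityScorer(limit=10000, batch_size=100)
# scorer.predict is an alias for scorer.mgr.predict_scores (exact signature not shown above)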
Example #11
    def promote_model(self, destination=BEST_MODEL_DIRPATH):
        blobs = list(self.gcs_service.bucket.list_blobs())
        matching_blobs = [blob for blob in blobs if self.dirpath in blob.name]
        print("MODEL FILES TO PROMOTE...")
        pprint(matching_blobs)
        seek_confirmation()

        print("PROMOTING GCS MODEL FILES...")
        for blob in matching_blobs:
            file_name = blob.name.split("/")[-1]  #> 'model.gpickle'
            new_path = self.compile_gcs_dirpath(
                f"{destination}/{file_name}"
            )  #f"storage/data/{destination}/{file_name}"
            self.gcs_service.bucket.copy_blob(
                blob,
                destination_bucket=self.gcs_service.bucket,
                new_name=new_path)

        print("PROMOTING LOCAL MODEL FILES...")
        local_destination = self.compile_local_dirpath(destination)
        local_source = self.local_dirpath
        copytree(local_source, local_destination, dirs_exist_ok=True)
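
A usage sketch for the promotion method above; the receiver (storage, an instance of a storage class like the one in Example #1) and the explicit destination path are illustrative:

# copy the current model's GCS blobs and local files into a "best model" directory
storage.promote_model()  # defaults to BEST_MODEL_DIRPATH
storage.promote_model(destination="models/best/v2")  # illustrative explicit destination

Example #12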
    def __init__(self, bq_service=None, week_id=WEEK_ID):
        bq_service = bq_service or BigQueryService()
        self.week_id = week_id

        print("--------------------")
        print("FETCHING WEEKS...")
        self.weeks = [
            RetweetWeek(row) for row in list(bq_service.fetch_retweet_weeks())
        ]
        for week in self.weeks:
            print("   ", week.details)

        print("--------------------")
        print("SELECTING A WEEK...")
        if not self.week_id:
            self.week_id = input(
                "PLEASE SELECT A WEEK (E.G. '2019-52', '2020-01', ETC.): "
            )  # assumes you know what you're doing when setting WEEK_ID in production; run this once to see all the available week ids

        try:
            self.week = [
                wk for wk in self.weeks if wk.week_id == self.week_id
            ][0]
            print("   ", self.week.details)
        except IndexError as err:
            print("OOPS - PLEASE CHECK WEEK ID AND TRY AGAIN...")
            exit()

        self.tweets_start_at = self.week.row.min_created
        self.tweets_end_at = self.week.row.max_created

        seek_confirmation()

        storage_service = self.init_storage_service(self.week_id)
        super().__init__(bq_service=bq_service,
                         storage_service=storage_service)
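
A minimal usage sketch; the class name WeeklyRetweetGrapher is not shown above and is assumed:

# hypothetical usage; the class name WeeklyRetweetGrapher is assumed
grapher = WeeklyRetweetGrapher(week_id="2020-01")
# if week_id is not set, the constructor lists the available weeks and prompts for one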
Example #13
    def __init__(self, local_dirpath=None, gcs_dirpath=None, gcs_service=None):
        """
        Saves and loads artifacts from the networkx graph compilation process to local storage, and optionally to Google Cloud Storage.

        Params:
            local_dirpath (str) like "/Users/USERNAME/path/to/repo/data/graphs/2020-08-02-1818"
            gcs_dirpath (str) like "storage/data/graphs/2020-08-02-1818"
        """
        self.gcs_service = gcs_service or GoogleCloudStorageService()
        self.gcs_dirpath = gcs_dirpath or os.path.join("storage", "data",
                                                       "graphs", "example")
        self.local_dirpath = local_dirpath or os.path.join(
            DATA_DIR, "graphs", "example")

        print("----------------------")
        print("GRAPH STORAGE...")
        print("   GCS DIR:", self.gcs_dirpath)
        print("   LOCAL DIR:", self.local_dirpath)
        print("----------------------")

        seek_confirmation()

        if not os.path.exists(self.local_dirpath):
            os.makedirs(self.local_dirpath)
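
A minimal usage sketch for this variant, which takes explicit local and GCS directory paths instead of a shared subpath; the class name GraphStorage is taken from the printed banner and is an assumption:

# usage sketch; the dirpath values mirror the examples in the docstring above
storage = GraphStorage(
    local_dirpath=os.path.join(DATA_DIR, "graphs", "2020-08-02-1818"),
    gcs_dirpath="storage/data/graphs/2020-08-02-1818",
)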
Example #14
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default=100)) # the max number of processed users to store in BQ at once (with a single insert API call). must be less than 10,000 to avoid error.

if __name__ == "__main__":

    bq_service = BigQueryService()
    twitter_service = TwitterService()

    rows = list(bq_service.fetch_idless_screen_names())
    row_count = len(rows)
    print("-------------------------")
    print(f"FETCHED {row_count} SCREEN NAMES")
    print("BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")

    seek_confirmation()
    bq_service.migrate_user_id_lookups_table()

    batch = []
    for index, row in enumerate(rows):
        counter = index + 1

        try:
            user_id = twitter_service.get_user_id(row.screen_name)
            message = None
        except TweepError as err:
            #print(err)
            #> [{'code': 50, 'message': 'User not found.'}]
            #> [{'code': 63, 'message': 'User has been suspended.'}]
            user_id = None
            message = json.loads(err.reason.replace("'", '"'))[0]["message"]
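
The remainder of the script is not shown above; a hedged sketch of how the loop might accumulate rows and flush them every BATCH_SIZE records (the save method name upsert_user_id_lookups is an assumption, not shown in the snippet):

        # hypothetical continuation: accumulate lookups and flush in batches
        batch.append({"screen_name": row.screen_name, "user_id": user_id, "message": message})
        if len(batch) >= BATCH_SIZE or counter == row_count:
            print(f"SAVING BATCH OF {len(batch)} (ROW {counter}/{row_count})...")
            bq_service.upsert_user_id_lookups(batch)  # assumed method name
            batch = []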