Example 1
def test_upload_in_batches():

    bq_service = BigQueryService(dataset_name="impeachment_test")

    # when inserting more than 10,000 rows,
    # is able to overcome error "too many rows present in the request, limit: 10000":
    lots_of_rows = [{"start_date":"2020-01-01", "user_id":i, "bot_probability": .99} for i in range(1, 36000)]
    errors = bq_service.upload_daily_bot_probabilities(lots_of_rows)
    assert not any(errors)
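
The batching behavior exercised by this test is not shown in the excerpt. A minimal sketch of the idea, assuming a hypothetical helper built on the google-cloud-bigquery client's insert_rows_json method (illustrative only, not the repo's actual code):

from google.cloud import bigquery

# Hypothetical helper: split rows into chunks that stay under BigQuery's
# 10,000-rows-per-request streaming insert limit, and collect any errors.
def insert_in_chunks(client: bigquery.Client, table_address, rows, chunk_size=5000):
    errors = []
    for i in range(0, len(rows), chunk_size):
        errors += client.insert_rows_json(table_address, rows[i:i + chunk_size])
    return errors  # an empty list means every chunk was accepted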
Example 2
def get_tweets():
    bq_service = BigQueryService()
    print("LIMIT:", LIMIT)
    job = Job()

    tweets = []
    job.start()
    for row in bq_service.fetch_labeled_tweets_in_batches(limit=LIMIT):
        tweets.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()
    print("FETCHED TWEETS:", fmt_n(len(tweets)))
    return DataFrame(tweets)
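
Examples 2 and 3 follow the same pattern: stream rows from BigQuery, collect them into a DataFrame, and print a progress report every BATCH_SIZE rows. The Job and fmt_n helpers are imported from the app package and not shown in these excerpts; a minimal sketch of the assumed behavior:

from datetime import datetime

def fmt_n(n):
    """Formats a number with thousands separators (assumed behavior), e.g. 35999 -> '35,999'."""
    return f"{n:,}"

class Job:
    """Sketch of a simple progress tracker (assumed, not the repo's implementation)."""
    def __init__(self):
        self.counter = 0

    def start(self):
        self.started_at = datetime.now()
        print("JOB STARTED AT:", self.started_at)

    def progress_report(self):
        elapsed = (datetime.now() - self.started_at).total_seconds()
        print(f"  PROCESSED {fmt_n(self.counter)} ROWS IN {round(elapsed)} SECONDS")

    def end(self):
        self.ended_at = datetime.now()
        print("JOB ENDED AT:", self.ended_at)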
Example 3
def download_data():
    job = Job()
    bq_service = BigQueryService()

    job.start()
    records = []
    for row in bq_service.fetch_user_details_vq(limit=LIMIT):
        #print(row)
        records.append(dict(row))

        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()

    return DataFrame(records)
Example 4
    def __init__(self,
                 users_limit=USERS_LIMIT,
                 batch_size=BATCH_SIZE,
                 pg_destructive=PG_DESTRUCTIVE,
                 bq_service=None):
        self.bq_service = bq_service or BigQueryService()

        if users_limit:
            self.users_limit = int(users_limit)
        else:
            self.users_limit = None
        self.tweets_limit = self.users_limit  # TODO: combine with users_limit into a more generic rows_limit, since we usually run one script or another and can reset the var between runs
        self.batch_size = batch_size
        self.pg_destructive = pg_destructive

        self.pg_engine = db
        self.pg_session = BoundSession()

        print("-------------------------")
        print("PG PIPELINE...")
        print("  USERS LIMIT:", self.users_limit)
        print("  BATCH SIZE:", self.batch_size)
        #print("  BQ SERVICE:", type(self.bq_service))
        #print("  PG SESSION:", type(self.pg_session))
        print("  PG DESTRUCTIVE:", self.pg_destructive)
Example 5
    def __init__(self, topic=TOPIC, tweets_start_at=TWEETS_START_AT, tweets_end_at=TWEETS_END_AT,
                        users_limit=USERS_LIMIT, batch_size=BATCH_SIZE,
                        storage_dirpath=None, bq_service=None):

        Job.__init__(self)
        GraphStorage.__init__(self, dirpath=storage_dirpath)
        self.bq_service = bq_service or BigQueryService()
        self.fetch_edges = self.bq_service.fetch_retweet_edges_in_batches_v2 # just being less verbose. feels like javascript

        # CONVERSATION PARAMS (OPTIONAL)

        self.topic = topic
        self.tweets_start_at = tweets_start_at
        self.tweets_end_at = tweets_end_at

        # PROCESSING PARAMS

        self.users_limit = users_limit
        if self.users_limit:
            self.users_limit = int(self.users_limit)

        self.batch_size = int(batch_size)

        print("-------------------------")
        print("RETWEET GRAPHER...")
        print("  USERS LIMIT:", self.users_limit)
        print("  BATCH SIZE:", self.batch_size)
        print("  DRY RUN:", DRY_RUN)
        print("-------------------------")
        print("CONVERSATION PARAMS...")
        print("  TOPIC:", self.topic)
        print("  TWEETS START:", self.tweets_start_at)
        print("  TWEETS END:", self.tweets_end_at)

        seek_confirmation()
Example 6
def cautiously_initialized_storage_service():
    service = BigQueryService()
    print("-------------------------")
    print("BQ CONFIG...")
    print("  BIGQUERY DATASET:", service.dataset_address.upper())
    print("  DESTRUCTIVE MIGRATIONS:", service.destructive)
    print("  VERBOSE QUERIES:", service.verbose)
    print("-------------------------")
    print("WORKER CONFIG...")
    print("  MIN USER ID:", MIN_ID)
    print("  MAX USER ID:", MAX_ID)
    print("  USERS LIMIT:", LIMIT)
    print("  MAX THREADS:", MAX_THREADS)
    print("  BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")
    print("SCRAPER CONFIG...")
    print("  VERBOSE SCRAPER:", VERBOSE_SCRAPER)
    print("  MAX FRIENDS:", MAX_FRIENDS)
    print("-------------------------")
    if APP_ENV == "development":
        if input("CONTINUE? (Y/N): ").upper() != "Y":
            print("EXITING...")
            exit()
    #service.init_tables() # did this originally, but moving to a manual migration strategy to prevent accidental deletions
    return service
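
Several examples below call seek_confirmation(), imported from the app package; it presumably wraps the same interactive check inlined above. A minimal sketch under that assumption:

import os

APP_ENV = os.getenv("APP_ENV", default="development")

def seek_confirmation():
    """Asks the operator to confirm before proceeding (assumed to mirror the inline check above)."""
    if APP_ENV == "development":
        if input("CONTINUE? (Y/N): ").upper() != "Y":
            print("EXITING...")
            exit()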
Example 7
    def __init__(self, model_name=MODEL_NAME, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None):
        self.model_name = model_name.lower().replace(";","") # using this model name in queries, so be super safe about SQL injection, although it's not a concern right now
        self.limit = limit
        self.batch_size = batch_size
        self.bq_service = bq_service or BigQueryService()

        print("----------------")
        print("TOXICITY SCORER...")
        print("  MODEL:", self.model_name.upper())
        print("  LIMIT:", fmt_n(self.limit))
        print("  BATCH SIZE:", fmt_n(self.batch_size))
Example 8
def create_app():
    app = Flask(__name__)
    CORS(app) # CORS(app, resources={r"/api/*": {"origins": "*"}})

    app.config["SECRET_KEY"] = SECRET_KEY
    app.config["BQ_SERVICE"] = BigQueryService(cautious=False)
    #app.config.from_mapping(SECRET_KEY=SECRET_KEY, BQ_SERVICE=BigQueryService())

    app.register_blueprint(api_v0_routes)
    app.register_blueprint(api_v1_routes)

    return app
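
The Flask factory above stores a shared BigQueryService in the app config so route handlers can reuse a single client. A hedged usage sketch (the endpoint below is illustrative, not one of the repo's registered routes):

from flask import Blueprint, current_app, jsonify

example_routes = Blueprint("example_routes", __name__)

@example_routes.route("/api/v0/example")
def example():
    bq_service = current_app.config["BQ_SERVICE"]  # the shared client set in create_app()
    rows = [dict(row) for row in bq_service.execute_query("SELECT 1 AS n")]  # execute_query as used in Example 20
    return jsonify(rows)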
Example 9
    def __init__(self,
                 bq=None,
                 topic=TOPIC,
                 pval_max=PVAL_MAX,
                 results_csv_filepath=RESULTS_CSV_FILEPATH):
        self.topic = topic

        self.bq = bq or BigQueryService()
        self.x = []
        self.y = []

        self.pval_max = pval_max
        self.interpret_ks = interpret
        self.results_csv_filepath = results_csv_filepath
Example 10
    def __init__(self,
                 bq_service=None,
                 twitter_service=None,
                 user_limit=USER_LIMIT,
                 friend_limit=FRIEND_LIMIT):
        self.bq_service = bq_service or BigQueryService()
        self.twitter_service = twitter_service or TwitterService()

        self.dataset_address = self.bq_service.dataset_address
        self.user_limit = int(user_limit)
        self.friend_limit = int(friend_limit)

        print("---------------------------")
        print("JOB: FRIEND LOOKUPS")
        print("DATASET:", self.dataset_address.upper())
        print("USER LIMIT:", self.user_limit)
        print("FRIEND LIMIT:", self.friend_limit)
        print("---------------------------")
Example 11
    def __init__(self,
                 bq_service=None,
                 twitter_service=None,
                 user_limit=USER_LIMIT,
                 status_limit=STATUS_LIMIT):
        self.bq_service = bq_service or BigQueryService()
        self.twitter_service = twitter_service or TwitterService()

        self.dataset_address = self.bq_service.dataset_address
        self.user_limit = int(user_limit)
        self.status_limit = int(status_limit)

        self.parse_status = parse_timeline_status

        print("---------------------------")
        print("JOB: TIMELINE LOOKUPS")
        print("DATASET:", self.dataset_address.upper())
        print("USER LIMIT:", self.user_limit)
        print("STATUS LIMIT:", self.status_limit)
        print("---------------------------")
Example 12
    def __init__(self,
                 bq_service=None,
                 bot_min=BOT_MIN,
                 batch_size=BATCH_SIZE,
                 storage_dirpath=None):
        self.bq_service = bq_service or BigQueryService()
        self.bot_min = bot_min
        self.batch_size = batch_size

        Job.__init__(self)

        storage_dirpath = storage_dirpath or f"bot_follower_graphs/bot_min/{self.bot_min}"
        GraphStorage.__init__(self, dirpath=storage_dirpath)

        print("-------------------------")
        print("BOT FOLLOWER GRAPHER...")
        print("  BOT MIN:", self.bot_min)
        print("  BATCH SIZE:", self.batch_size)
        print("-------------------------")

        seek_confirmation()
Example 13
    def __init__(self,
                 limit=LIMIT,
                 batch_size=BATCH_SIZE,
                 bq_service=None,
                 model_manager=None):
        self.limit = limit
        self.batch_size = batch_size
        self.bq_service = bq_service or BigQueryService()
        self.mgr = model_manager or ModelManager()

        print("----------------")
        print("TOXICITY SCORER...")
        print("  MODEL CHECKPOINT:", self.mgr.checkpoint_name.upper(),
              self.mgr.checkpoint_url)
        print("  SCORES TABLE NAME:", self.scores_table_name)
        print("  LIMIT:", fmt_n(self.limit))
        print("  BATCH SIZE:", fmt_n(self.batch_size))

        self.predict = self.mgr.predict_scores  # method alias

        seek_confirmation()
Example 14
    def __init__(self,
                 users_limit=USERS_LIMIT,
                 batch_size=BATCH_SIZE,
                 pg_destructive=PG_DESTRUCTIVE,
                 bq_service=None):
        self.bq_service = bq_service or BigQueryService()
        if users_limit:
            self.users_limit = int(users_limit)
        else:
            self.users_limit = None
        self.batch_size = batch_size

        self.pg_destructive = pg_destructive
        self.pg_engine = db
        self.pg_session = BoundSession()

        print("-------------------------")
        print("PG PIPELINE...")
        print("  USERS LIMIT:", self.users_limit)
        print("  BATCH SIZE:", self.batch_size)
        #print("  BQ SERVICE:", type(self.bq_service))
        #print("  PG SESSION:", type(self.pg_session))
        print("  PG DESTRUCTIVE:", self.pg_destructive)
Example 15
    def __init__(self, twitter_service=None, storage_env=STORAGE_ENV, bq_service=None, csv_service=None, batch_size=BATCH_SIZE):
        self.twitter_service = twitter_service or TwitterService()
        self.api = self.twitter_service.api
        self.auth = self.api.auth
        self.parse_status = parse_status

        self.storage_env = storage_env
        if self.storage_env == "local":
            self.storage_service = csv_service or LocalStorageService()
        elif self.storage_env == "remote":
            self.storage_service = bq_service or BigQueryService()
        else:
            raise ValueError("Expecting the STORAGE_ENV to be 'local' or 'remote'. Please try again...")

        self.batch_size = batch_size
        self.batch = []
        self.counter = 0

        print("-------------------------------")
        print("STREAM LISTENER...")
        print("  STORAGE ENV:", self.storage_env.upper())
        print("  STORAGE SERVICE:", type(self.storage_service))
        print("  BATCH SIZE:", self.batch_size)
        print("--------------------------------")
Example 16
    def __init__(self, bq_service=None, week_id=WEEK_ID):
        bq_service = bq_service or BigQueryService()
        self.week_id = week_id

        print("--------------------")
        print("FETCHING WEEKS...")
        self.weeks = [
            RetweetWeek(row) for row in list(bq_service.fetch_retweet_weeks())
        ]
        for week in self.weeks:
            print("   ", week.details)

        print("--------------------")
        print("SELECTING A WEEK...")
        if not self.week_id:
            self.week_id = input("PLEASE SELECT A WEEK (E.G. '2019-52', '2020-01', ETC.): ")  # assumes you know what you're doing when setting WEEK_ID in production! after you run this once you'll see what all the week ids are.

        try:
            self.week = [wk for wk in self.weeks if wk.week_id == self.week_id][0]
            print("   ", self.week.details)
        except IndexError:
            print("OOPS - PLEASE CHECK WEEK ID AND TRY AGAIN...")
            exit()

        self.tweets_start_at = self.week.row.min_created
        self.tweets_end_at = self.week.row.max_created

        seek_confirmation()

        storage_service = self.init_storage_service(self.week_id)
        super().__init__(bq_service=bq_service,
                         storage_service=storage_service)
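
RetweetWeek is not shown in this excerpt; based on its usage it wraps a BigQuery row of week-level aggregates and exposes week_id, details, and the underlying row. A hedged sketch (the year/week column names are assumptions):

class RetweetWeek:
    """Sketch of a week wrapper consistent with its usage above (assumed, not the repo's code)."""
    def __init__(self, row):
        self.row = row  # expected to include min_created / max_created, as referenced above

    @property
    def week_id(self):
        return f"{self.row.year}-{str(self.row.week).zfill(2)}"  # e.g. '2019-52', '2020-01'

    @property
    def details(self):
        return f"WEEK {self.week_id} | FROM {self.row.min_created} TO {self.row.max_created}"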
Example 17
import os

from tweepy.error import TweepError
from pandas import DataFrame
from dotenv import load_dotenv

from app import DATA_DIR, seek_confirmation
from app.decorators.datetime_decorators import logstamp
from app.bq_service import BigQueryService
from app.twitter_service import TwitterService

load_dotenv()

BATCH_SIZE = int(os.getenv("BATCH_SIZE", default=100)) # the max number of processed users to store in BQ at once (with a single insert API call). must be less than 10,000 to avoid error.

if __name__ == "__main__":

    bq_service = BigQueryService()
    twitter_service = TwitterService()

    rows = list(bq_service.fetch_idless_screen_names())
    row_count = len(rows)
    print("-------------------------")
    print(f"FETCHED {row_count} SCREEN NAMES")
    print("BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")

    seek_confirmation()
    bq_service.migrate_user_id_lookups_table()

    batch = []
    for index, row in enumerate(rows):
        counter = index + 1
Example 18
import os

from pandas import read_csv, DataFrame

from app.file_storage import FileStorage
from app.bq_service import BigQueryService
from app.job import Job
from app.decorators.number_decorators import fmt_n

LIMIT = os.getenv("LIMIT")  # for development purposes
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default="10_000"))
DESTRUCTIVE = (
    os.getenv("DESTRUCTIVE", default="false") == "true"
)  # whether or not to re-download if a local file already exists

if __name__ == "__main__":

    bq_service = BigQueryService()
    job = Job()

    storage = FileStorage(dirpath=f"nodes_with_active_edges_v7")

    nodes_csv_filepath = os.path.join(storage.local_dirpath, "sn_nodes.csv")
    if os.path.exists(nodes_csv_filepath) and not DESTRUCTIVE:
        print("LOADING SN NODES...")
        nodes_df = read_csv(nodes_csv_filepath)
    else:
        job.start()
        print("DOWNLOADING SN NODES...")
        records = []
        for row in bq_service.fetch_sn_nodes_with_active_edges_v7(limit=LIMIT):
            records.append(dict(row))
            job.counter += 1
Example 19
    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT
        self.batch_size = BATCH_SIZE
Example 20
class Collector:
    def __init__(self):
        self.twitter_api = TwitterService().api
        self.bq_service = BigQueryService()
        self.limit = STATUS_LIMIT
        self.batch_size = BATCH_SIZE

    def fetch_remaining_status_ids(self):
        sql = f"""
            SELECT DISTINCT a.status_id
            FROM `{self.bq_service.dataset_address}.all_status_ids` a
            LEFT JOIN `{self.bq_service.dataset_address}.recollected_statuses` completed ON completed.status_id = a.status_id
            WHERE completed.status_id IS NULL
            LIMIT {self.limit}
        """
        return [
            row["status_id"]
            for row in list(self.bq_service.execute_query(sql))
        ]

    def perform(self):
        remaining_status_ids = self.fetch_remaining_status_ids()
        if any(remaining_status_ids):
            for batch_of_ids in split_into_batches(remaining_status_ids,
                                                   batch_size=self.batch_size):
                self.process_batch(batch_of_ids)
        else:
            print("OH ALL DONE! SLEEPING...")
            server_sleep(10 * 60 * 60)

    def lookup_statuses(self, status_ids):
        """Fetch full status info including urls, and full text.
            Max per request is 100, so batch size must be smaller than that.
            See:
                https://docs.tweepy.org/en/stable/api.html#API.statuses_lookup
                https://developer.twitter.com/en/docs/twitter-api/v1/tweets/post-and-engage/api-reference/get-statuses-lookup
        """
        return self.twitter_api.statuses_lookup(
            id_=status_ids,
            include_entities=True,  # this is where the full urls are
            trim_user=True,  # we already have this info
            include_ext_alt_text=True,  # If alt text has been added to any attached media entities, this parameter will return an ext_alt_text value in the top-level key for the media entity. If no value has been set, this will be returned as null.
            include_card_uri=False,
            map_=True,  # "Tweets that do not exist or cannot be viewed by the current user will still have their key represented but with an explicitly null value paired with it"
            tweet_mode="extended")

    def process_batch(self, status_ids):
        recollected_statuses = []
        recollected_urls = []
        success_counter = 0
        for status in self.lookup_statuses(status_ids):
            # when passing param map_=True to Twitter API, if statuses are not available, the status will be present, but will only have an id field
            status_id = status.id  # all statuses will have an id

            recollected_status = {
                "status_id": status_id,
                "user_id": None,
                "full_text": None,
                "created_at": None,
                "lookup_at": generate_timestamp()
            }  # represent failed lookups with null text values
            if list(status._json.keys()) != ["id"]:  # "id" will be the only field for empty statuses. otherwise try to parse them:
                success_counter += 1
                recollected_status["user_id"] = status.user.id
                recollected_status["full_text"] = parse_full_text(status)  # update the full text if possible
                recollected_status["created_at"] = generate_timestamp(status.created_at)
                for url in status.entities["urls"]:
                    recollected_urls.append({
                        "status_id": status_id,
                        "expanded_url": url["expanded_url"]
                    })
            recollected_statuses.append(recollected_status)

        print(generate_timestamp(), f"| SAVING BATCH OF {len(status_ids)}",
              "| STATUSES:", success_counter, "| URLS:", len(recollected_urls))
        self.save_statuses(recollected_statuses)
        self.save_urls(recollected_urls)

    def save_statuses(self, recollected_statuses):
        self.bq_service.insert_records_in_batches(
            self.recollected_statuses_table, recollected_statuses)

    def save_urls(self, recollected_urls):
        self.bq_service.insert_records_in_batches(self.recollected_urls_table,
                                                  recollected_urls)

    @property
    @lru_cache(maxsize=None)
    def recollected_statuses_table(self):
        return self.bq_service.client.get_table(
            f"{self.bq_service.dataset_address}.recollected_statuses")

    @property
    @lru_cache(maxsize=None)
    def recollected_urls_table(self):
        return self.bq_service.client.get_table(
            f"{self.bq_service.dataset_address}.recollected_status_urls")
Example 21
    def __init__(self, users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, storage_service=None, bq_service=None):
        super().__init__(users_limit=users_limit, batch_size=batch_size, storage_service=storage_service)
        self.bq_service = bq_service or BigQueryService()

Example 22
from app.bq_service import BigQueryService

if __name__ == "__main__":

    bq_service = BigQueryService()

    bq_service.migrate_populate_user_details_table_v2()

    print("MIGRATION SUCCESSFUL!")
Example 23
import os

from app import DATA_DIR
from app.bq_service import BigQueryService
from app.bot_communities.tokenizers import Tokenizer, SpacyTokenizer
from app.bot_communities.token_analyzer import summarize_token_frequencies

from pandas import DataFrame

if __name__ == "__main__":

    local_dirpath = os.path.join(DATA_DIR, "bot_retweet_graphs", "bot_min",
                                 str(0.8), "n_communities", str(2))

    tokenizer = Tokenizer()
    spacy_tokenizer = SpacyTokenizer()

    bq_service = BigQueryService()

    sql = f"""
        SELECT
            c.community_id
            ,b.bot_id
            -- ,b.bot_screen_name
            --,b.day_count
            --,b.avg_daily_score
            ,count(distinct t.status_id) as tweet_count
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_screen_name), ' | ') , "")   as screen_names
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_name), ' | ')        , "")   as user_names
            ,COALESCE(STRING_AGG(DISTINCT upper(t.user_description), ' | ') , "")   as user_descriptions
        FROM impeachment_production.bots_above_80 b
        JOIN impeachment_production.2_bot_communities c ON c.user_id = b.bot_id
        JOIN impeachment_production.tweets t on cast(t.user_id as int64) = b.bot_id
Example 24
import os

from app import seek_confirmation
from app.job import Job
from app.bq_service import BigQueryService
from app.nlp.model_storage import ModelStorage, MODELS_DIRPATH

MODEL_NAME = os.getenv("MODEL_NAME", default="current_best")

LIMIT = os.getenv("LIMIT")
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default="100000"))

if __name__ == "__main__":

    storage = ModelStorage(dirpath=f"{MODELS_DIRPATH}/{MODEL_NAME}")
    tv = storage.load_vectorizer()
    clf = storage.load_model()

    bq_service = BigQueryService()

    print("DESTROYING PREDICTIONS TABLE???")
    seek_confirmation()
    print("DESTROYING PREDICTIONS TABLE...")
    bq_service.destructively_migrate_2_community_predictions_table()

    job = Job()
    job.start()

    ids_batch = []
    statuses_batch = []
    for row in bq_service.fetch_unlabeled_statuses_in_batches(limit=LIMIT):
        ids_batch.append(row["status_id"])
        statuses_batch.append(row["status_text"])
Example 25
from app import server_sleep
from app.bq_service import BigQueryService
from app.retweet_graphs_v2.retweet_grapher import RetweetGrapher
from app.retweet_graphs_v2.k_days.generator import DateRangeGenerator


if __name__ == "__main__":

    gen = DateRangeGenerator()

    bq_service = BigQueryService()

    for date_range in gen.date_ranges:
        storage_dirpath = f"retweet_graphs_v2/k_days/{gen.k_days}/{date_range.start_date}"

        grapher = RetweetGrapher(storage_dirpath=storage_dirpath, bq_service=bq_service,
            tweets_start_at=date_range.start_at, tweets_end_at=date_range.end_at
        )
        grapher.save_metadata()
        grapher.start()
        grapher.perform()
        grapher.end()
        grapher.report()
        grapher.save_results()
        grapher.save_graph()

        del grapher # clearing graph from memory
        print("\n\n\n\n")

    print("JOB COMPLETE!")
Example 26
    bq_service.upload_basilica_embeddings(batch)
    #print(logstamp(), thread_name, "UPLOAD COMPLETE!")

    return len(batch)


if __name__ == "__main__":

    print("-------------------")
    print("BASILICA EMBEDDER...")
    print("  MIN PARTITION VAL:", MIN_VAL)
    print("  MAX PARTITION VAL:", MAX_VAL)
    print("  LIMIT:", LIMIT)
    print("  BATCH SIZE:", BATCH_SIZE)

    bq_service = BigQueryService()
    bas_service = BasilicaService()
    job = Job()

    job.start()
    records = list(
        bq_service.fetch_basilica_embedless_partitioned_statuses(
            min_val=MIN_VAL, max_val=MAX_VAL, limit=LIMIT))
    job.counter = len(records)

    batches = list(split_into_batches(records, BATCH_SIZE))
    print("BATCHES:", len(batches))
    job.end()
    del records

    job.start()
LIMIT = os.getenv("LIMIT")  # for development purposes
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default="100000"))
DESTRUCTIVE = (
    os.getenv("DESTRUCTIVE", default="false") == "true"
)  # whether or not to re-download if a local file already exists

NODES_LIMIT = os.getenv("NODES_LIMIT")  # for development purposes
NODES_BATCH_SIZE = int(os.getenv("NODES_BATCH_SIZE", default="5000"))
NODES_DESTRUCTIVE = (
    os.getenv("NODES_DESTRUCTIVE", default="false") == "true"
)  # whether or not to re-download if a local file already exists

if __name__ == "__main__":

    gen = DateRangeGenerator(k_days=1)
    bq_service = BigQueryService()
    job = Job()

    for dr in gen.date_ranges:
        print(dr.start_date)
        storage = FileStorage(
            dirpath=f"daily_active_edge_friend_graphs_v5/{dr.start_date}")
        tweets_csv_filepath = os.path.join(storage.local_dirpath, "tweets.csv")
        nodes_csv_filepath = os.path.join(storage.local_dirpath, "nodes.csv")

        if os.path.exists(tweets_csv_filepath) and not DESTRUCTIVE:
            print("LOADING TWEETS...")
            tweets_df = read_csv(tweets_csv_filepath)
        else:
            job.start()
            print("DOWNLOADING TWEETS...")

Example 28
from app.bq_service import BigQueryService

if __name__ == "__main__":

    bq_service = BigQueryService()

    bq_service.migrate_daily_bot_probabilities_table()

    print("MIGRATION SUCCESSFUL!")
    print("  TWEET_MIN:", TWEET_MIN)

    print("  LIMIT:", LIMIT)
    print("  BATCH_SIZE:", BATCH_SIZE)
    print("  DESTRUCTIVE:", DESTRUCTIVE)

    #print("  GRAPH_LIMIT:", GRAPH_LIMIT)
    print("  GRAPH_BATCH_SIZE:", GRAPH_BATCH_SIZE)
    print("  GRAPH_DESTRUCTIVE:", GRAPH_DESTRUCTIVE)

    print("------------------------")
    storage = FileStorage(
        dirpath=f"daily_active_friend_graphs_v4/{DATE}/tweet_min/{TWEET_MIN}")
    tweets_csv_filepath = os.path.join(storage.local_dirpath, "tweets.csv")

    bq_service = BigQueryService()
    job = Job()

    #
    # LOAD TWEETS
    # tweet_id, text, screen_name, bot, created_at

    # TODO: de-dup RTs so the model will only train/test on a single RT status text (PREVENT OVERFITTING)
    if os.path.exists(tweets_csv_filepath) and not DESTRUCTIVE:
        print("LOADING TWEETS...")
        statuses_df = read_csv(tweets_csv_filepath)
    else:
        job.start()
        print("DOWNLOADING TWEETS...")
        statuses = []
        for row in bq_service.fetch_daily_active_tweeter_statuses(
Example 30
)  # the max number of processed users to store in BQ at once (with a single insert API call)
DESTRUCTIVE = (
    os.getenv("DESTRUCTIVE", default="false") == "true"
)  # whether or not to re-download if a local file already exists

if __name__ == "__main__":

    # INIT

    file_storage = FileStorage(
        dirpath="bot_retweet_graphs/bot_min/0.8/n_communities/2/analysis_v2")
    local_tweets_filepath = os.path.join(file_storage.local_dirpath,
                                         "community_tweets.csv")
    #gcs_tweets_filepath = os.path.join(file_storage.gcs_dirpath, "community_tweets.csv")

    bq_service = BigQueryService()
    tokenizer = Tokenizer()

    print("------------------------------")
    print("BOT STATUS ANALYZER V2...")
    print("  LIMIT:", LIMIT)
    print("  BATCH SIZE:", BATCH_SIZE)
    print("  DESTRUCTIVE:", DESTRUCTIVE)
    seek_confirmation()

    # LOAD STATUSES

    if os.path.isfile(local_tweets_filepath) and not DESTRUCTIVE:
        print("LOADING STATUSES...")
        statuses_df = read_csv(local_tweets_filepath)
        print(statuses_df.head())