Exemple #1
0
    def __init__(self,
                 users_limit=USERS_LIMIT,
                 batch_size=BATCH_SIZE,
                 pg_destructive=PG_DESTRUCTIVE,
                 bq_service=None):
        """
        Store the pipeline settings, open a PG session, and print a settings summary.

        Params:
            users_limit: cast to int when truthy; any falsy value is stored as None.
            batch_size: stored unchanged.
            pg_destructive: stored unchanged.
            bq_service: service to use; a new BigQueryService is built when falsy.
        """
        self.bq_service = bq_service or BigQueryService()

        self.users_limit = int(users_limit) if users_limit else None
        # todo: fold users_limit and tweets_limit into a single generic rows_limit;
        # only one script usually runs at a time, so the value can be reset between runs
        self.tweets_limit = self.users_limit
        self.batch_size = batch_size
        self.pg_destructive = pg_destructive

        self.pg_engine = db
        self.pg_session = BoundSession()

        print("-------------------------")
        print("PG PIPELINE...")
        print("  USERS LIMIT:", self.users_limit)
        print("  BATCH SIZE:", self.batch_size)
        print("  PG DESTRUCTIVE:", self.pg_destructive)
def cautiously_initialized_storage_service():
    """
    Build a BigQueryService, print the active configuration, and confirm before continuing.

    Prints the BigQuery, worker, and scraper settings. When APP_ENV is "development",
    prompts the user; any answer other than "Y" (case-insensitive) ends the process.

    Returns:
        The BigQueryService instance that was constructed.

    Raises:
        SystemExit: when the user declines to continue.
    """
    # local import: `sys.exit` replaces the `exit` helper, which is injected by the
    # `site` module for interactive use and is not guaranteed to exist in every runtime
    import sys

    service = BigQueryService()
    print("-------------------------")
    print("BQ CONFIG...")
    print("  BIGQUERY DATASET:", service.dataset_address.upper())
    print("  DESTRUCTIVE MIGRATIONS:", service.destructive)
    print("  VERBOSE QUERIES:", service.verbose)
    print("-------------------------")
    print("WORKER CONFIG...")
    print("  MIN USER ID:", MIN_ID)
    print("  MAX USER ID:", MAX_ID)
    print("  USERS LIMIT:", LIMIT)
    print("  MAX THREADS:", MAX_THREADS)
    print("  BATCH SIZE:", BATCH_SIZE)
    print("-------------------------")
    print("SCRAPER CONFIG...")
    print("  VERBOSE SCRAPER:", VERBOSE_SCRAPER)
    print("  MAX FRIENDS:", MAX_FRIENDS)
    print("-------------------------")
    if APP_ENV == "development":
        if input("CONTINUE? (Y/N): ").upper() != "Y":
            print("EXITING...")
            sys.exit()
    #service.init_tables() # did this originally, but moving to a manual migration strategy to prevent accidental deletions
    return service
Exemple #3
0
    def __init__(self, topic=TOPIC, tweets_start_at=TWEETS_START_AT, tweets_end_at=TWEETS_END_AT,
                        users_limit=USERS_LIMIT, batch_size=BATCH_SIZE,
                        storage_dirpath=None, bq_service=None):
        """
        Initialize the job and graph storage, store the parameters, print them, and seek confirmation.

        Params:
            topic, tweets_start_at, tweets_end_at: optional conversation params, stored unchanged.
            users_limit: cast to int when truthy; a falsy value is stored unchanged.
            batch_size: always cast to int.
            storage_dirpath: forwarded to GraphStorage as `dirpath`.
            bq_service: service to use; a new BigQueryService is built when falsy.
        """
        Job.__init__(self)
        GraphStorage.__init__(self, dirpath=storage_dirpath)

        self.bq_service = bq_service or BigQueryService()
        # shorter alias for the batched edge-fetching method
        self.fetch_edges = self.bq_service.fetch_retweet_edges_in_batches_v2

        # conversation params (optional)
        self.topic = topic
        self.tweets_start_at = tweets_start_at
        self.tweets_end_at = tweets_end_at

        # processing params
        self.users_limit = int(users_limit) if users_limit else users_limit
        self.batch_size = int(batch_size)

        print("-------------------------")
        print("RETWEET GRAPHER...")
        print("  USERS LIMIT:", self.users_limit)
        print("  BATCH SIZE:", self.batch_size)
        print("  DRY RUN:", DRY_RUN)
        print("-------------------------")
        print("CONVERSATION PARAMS...")
        print("  TOPIC:", self.topic)
        print("  TWEETS START:", self.tweets_start_at)
        print("  TWEETS END:", self.tweets_end_at)

        seek_confirmation()
def test_upload_in_batches():
    """Uploading 35,999 rows to the test dataset should report no errors."""
    service = BigQueryService(dataset_name="impeachment_test")

    # the row count deliberately exceeds 10,000, the per-request cap behind the error
    # "too many rows present in the request, limit: 10000"
    user_ids = range(1, 36000)
    records = [
        {"start_date": "2020-01-01", "user_id": user_id, "bot_probability": .99}
        for user_id in user_ids
    ]

    errors = service.upload_daily_bot_probabilities(records)
    assert not any(errors)
Exemple #5
0
    def __init__(self, model_name=MODEL_NAME, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None):
        """
        Store the scorer settings and print a summary.

        Params:
            model_name: lower-cased and stripped of semicolons before being stored.
            limit: stored unchanged.
            batch_size: stored unchanged.
            bq_service: service to use; a new BigQueryService is built when falsy.
        """
        # the model name ends up in queries, so sanitize it defensively against SQL injection
        # (not a real concern right now)
        lowercased_name = model_name.lower()
        self.model_name = lowercased_name.replace(";", "")

        self.limit = limit
        self.batch_size = batch_size
        self.bq_service = bq_service or BigQueryService()

        print("----------------")
        print("TOXICITY SCORER...")
        print("  MODEL:", self.model_name.upper())
        print("  LIMIT:", fmt_n(self.limit))
        print("  BATCH SIZE:", fmt_n(self.batch_size))
Exemple #6
0
def create_app():
    """
    Build the Flask application.

    Enables CORS on the app, stores the secret key and a non-cautious BigQueryService
    in the app config, and registers the v0 and v1 API blueprints.

    Returns:
        The configured Flask app.
    """
    app = Flask(__name__)
    # alternative, narrower form: CORS(app, resources={r"/api/*": {"origins": "*"}})
    CORS(app)

    config = app.config
    config["SECRET_KEY"] = SECRET_KEY
    config["BQ_SERVICE"] = BigQueryService(cautious=False)

    for blueprint in (api_v0_routes, api_v1_routes):
        app.register_blueprint(blueprint)

    return app
    def __init__(self,
                 bq=None,
                 topic=TOPIC,
                 pval_max=PVAL_MAX,
                 results_csv_filepath=RESULTS_CSV_FILEPATH):
        """
        Store the analysis settings and start with empty `x` and `y` lists.

        Params:
            bq: service to use; a new BigQueryService is built when falsy.
            topic: stored unchanged.
            pval_max: stored unchanged.
            results_csv_filepath: stored unchanged.
        """
        self.topic = topic
        self.bq = bq or BigQueryService()

        # both lists start empty
        self.x, self.y = [], []

        self.pval_max = pval_max
        self.interpret_ks = interpret  # alias for the `interpret` function
        self.results_csv_filepath = results_csv_filepath
def get_tweets():
    """
    Fetch labeled tweets (up to LIMIT) and return them as a DataFrame.

    Each fetched row is converted to a dict. Progress is reported through a Job
    every BATCH_SIZE rows.
    """
    bq_service = BigQueryService()
    print("LIMIT:", LIMIT)
    job = Job()

    records = []
    job.start()
    rows = bq_service.fetch_labeled_tweets_in_batches(limit=LIMIT)
    for row in rows:
        records.append(dict(row))
        job.counter += 1
        reached_batch_boundary = (job.counter % BATCH_SIZE == 0)
        if reached_batch_boundary:
            job.progress_report()
    job.end()

    print("FETCHED TWEETS:", fmt_n(len(records)))
    return DataFrame(records)
Exemple #9
0
def download_data():
    """
    Fetch user details (up to LIMIT) and return them as a DataFrame.

    Each fetched row is converted to a dict. Progress is reported through a Job
    every BATCH_SIZE rows.
    """
    job = Job()
    bq_service = BigQueryService()

    job.start()
    user_details = []
    rows = bq_service.fetch_user_details_vq(limit=LIMIT)
    for row in rows:
        user_details.append(dict(row))
        job.counter += 1
        reached_batch_boundary = (job.counter % BATCH_SIZE == 0)
        if reached_batch_boundary:
            job.progress_report()
    job.end()

    return DataFrame(user_details)
    def __init__(self,
                 bq_service=None,
                 twitter_service=None,
                 user_limit=USER_LIMIT,
                 friend_limit=FRIEND_LIMIT):
        """
        Set up the services and limits for the friend lookups job, then print a summary.

        Params:
            bq_service: service to use; a new BigQueryService is built when falsy.
            twitter_service: service to use; a new TwitterService is built when falsy.
            user_limit: cast to int.
            friend_limit: cast to int.
        """
        self.bq_service = bq_service if bq_service else BigQueryService()
        self.twitter_service = twitter_service if twitter_service else TwitterService()

        # copy of the service's dataset address, kept on the job
        self.dataset_address = self.bq_service.dataset_address
        self.user_limit = int(user_limit)
        self.friend_limit = int(friend_limit)

        print("---------------------------")
        print("JOB: FRIEND LOOKUPS")
        print("DATASET:", self.dataset_address.upper())
        print("USER LIMIT:", self.user_limit)
        print("FRIEND LIMIT:", self.friend_limit)
        print("---------------------------")
    def __init__(self,
                 bq_service=None,
                 twitter_service=None,
                 user_limit=USER_LIMIT,
                 status_limit=STATUS_LIMIT):
        """
        Set up the services and limits for the timeline lookups job, then print a summary.

        Params:
            bq_service: service to use; a new BigQueryService is built when falsy.
            twitter_service: service to use; a new TwitterService is built when falsy.
            user_limit: cast to int.
            status_limit: cast to int.
        """
        self.bq_service = bq_service if bq_service else BigQueryService()
        self.twitter_service = twitter_service if twitter_service else TwitterService()

        # copy of the service's dataset address, kept on the job
        self.dataset_address = self.bq_service.dataset_address
        self.user_limit = int(user_limit)
        self.status_limit = int(status_limit)

        # alias for the timeline status parser
        self.parse_status = parse_timeline_status

        print("---------------------------")
        print("JOB: TIMELINE LOOKUPS")
        print("DATASET:", self.dataset_address.upper())
        print("USER LIMIT:", self.user_limit)
        print("STATUS LIMIT:", self.status_limit)
        print("---------------------------")
    def __init__(self,
                 limit=LIMIT,
                 batch_size=BATCH_SIZE,
                 bq_service=None,
                 model_manager=None):
        """
        Store the scorer settings, print a summary, and seek confirmation.

        Params:
            limit: stored unchanged.
            batch_size: stored unchanged.
            bq_service: service to use; a new BigQueryService is built when falsy.
            model_manager: manager to use; a new ModelManager is built when falsy.

        NOTE(review): `self.scores_table_name` is read here but not assigned here;
        presumably it is defined elsewhere on the class. Confirm.
        """
        self.limit = limit
        self.batch_size = batch_size
        self.bq_service = bq_service if bq_service else BigQueryService()
        self.mgr = model_manager if model_manager else ModelManager()

        print("----------------")
        print("TOXICITY SCORER...")
        checkpoint_name = self.mgr.checkpoint_name.upper()
        print("  MODEL CHECKPOINT:", checkpoint_name, self.mgr.checkpoint_url)
        print("  SCORES TABLE NAME:", self.scores_table_name)
        print("  LIMIT:", fmt_n(self.limit))
        print("  BATCH SIZE:", fmt_n(self.batch_size))

        # alias for the model manager's scoring method
        self.predict = self.mgr.predict_scores

        seek_confirmation()
Exemple #13
0
    def __init__(self,
                 bq_service=None,
                 bot_min=BOT_MIN,
                 batch_size=BATCH_SIZE,
                 storage_dirpath=None):
        """
        Store the grapher settings, initialize the job and graph storage, and seek confirmation.

        Params:
            bq_service: service to use; a new BigQueryService is built when falsy.
            bot_min: stored unchanged; also used in the default storage directory path.
            batch_size: stored unchanged.
            storage_dirpath: storage directory; defaults to
                "bot_follower_graphs/bot_min/<bot_min>" when falsy.
        """
        self.bq_service = bq_service or BigQueryService()
        self.bot_min = bot_min
        self.batch_size = batch_size

        Job.__init__(self)

        if not storage_dirpath:
            storage_dirpath = f"bot_follower_graphs/bot_min/{self.bot_min}"
        GraphStorage.__init__(self, dirpath=storage_dirpath)

        print("-------------------------")
        print("BOT FOLLOWER GRAPHER...")
        print("  BOT MIN:", self.bot_min)
        print("  BATCH SIZE:", self.batch_size)
        print("-------------------------")

        seek_confirmation()
Exemple #14
0
    def __init__(self,
                 users_limit=USERS_LIMIT,
                 batch_size=BATCH_SIZE,
                 pg_destructive=PG_DESTRUCTIVE,
                 bq_service=None):
        """
        Store the pipeline settings, open a PG session, and print a settings summary.

        Params:
            users_limit: cast to int when truthy; any falsy value is stored as None.
            batch_size: stored unchanged.
            pg_destructive: stored unchanged.
            bq_service: service to use; a new BigQueryService is built when falsy.
        """
        self.bq_service = bq_service or BigQueryService()
        self.users_limit = int(users_limit) if users_limit else None
        self.batch_size = batch_size

        self.pg_destructive = pg_destructive
        self.pg_engine = db
        self.pg_session = BoundSession()

        print("-------------------------")
        print("PG PIPELINE...")
        print("  USERS LIMIT:", self.users_limit)
        print("  BATCH SIZE:", self.batch_size)
        print("  PG DESTRUCTIVE:", self.pg_destructive)
    def __init__(self, bq_service=None, week_id=WEEK_ID):
        """
        Select a retweet week, then initialize the parent class with a week-specific storage service.

        Fetches and prints all retweet weeks, resolves `week_id` to one of them (prompting
        for an id when none is configured), records the week's tweet date range, and asks
        for confirmation before continuing.

        Params:
            bq_service: service to use; a new BigQueryService is built when falsy.
            week_id: id of the week to select (e.g. '2019-52'); prompted for when falsy.

        Raises:
            SystemExit: when no fetched week has the requested id.
        """
        # local import: `sys.exit` replaces the `exit` helper, which is injected by the
        # `site` module for interactive use and is not guaranteed to exist in every runtime
        import sys

        bq_service = bq_service or BigQueryService()
        self.week_id = week_id

        print("--------------------")
        print("FETCHING WEEKS...")
        self.weeks = [RetweetWeek(row) for row in bq_service.fetch_retweet_weeks()]
        for week in self.weeks:
            print("   ", week.details)

        print("--------------------")
        print("SELECTING A WEEK...")
        if not self.week_id:
            # assumes you know what you're doing when setting WEEK_ID on production!
            # once you run this once you'll see what all the week ids are.
            self.week_id = input(
                "PLEASE SELECT A WEEK (E.G. '2019-52', '2020-01', ETC.): "
            )

        # look up the first week with a matching id; only a missing match is treated as
        # a bad week id, so errors raised while printing the details are not masked
        self.week = next(
            (wk for wk in self.weeks if wk.week_id == self.week_id), None
        )
        if self.week is None:
            print("OOPS - PLEASE CHECK WEEK ID AND TRY AGAIN...")
            sys.exit()
        print("   ", self.week.details)

        self.tweets_start_at = self.week.row.min_created
        self.tweets_end_at = self.week.row.max_created

        seek_confirmation()

        storage_service = self.init_storage_service(self.week_id)
        super().__init__(bq_service=bq_service,
                         storage_service=storage_service)
Exemple #16
0
    def __init__(self, twitter_service=None, storage_env=STORAGE_ENV, bq_service=None, csv_service=None, batch_size=BATCH_SIZE):
        """
        Set up the Twitter API handles, choose a storage service, and print a summary.

        Params:
            twitter_service: service to use; a new TwitterService is built when falsy.
            storage_env: "local" selects `csv_service` (or a new LocalStorageService);
                "remote" selects `bq_service` (or a new BigQueryService).
            bq_service: storage service used when storage_env is "remote".
            csv_service: storage service used when storage_env is "local".
            batch_size: stored unchanged.

        Raises:
            ValueError: when storage_env is neither "local" nor "remote".
        """
        self.twitter_service = twitter_service or TwitterService()
        self.api = self.twitter_service.api
        self.auth = self.api.auth
        self.parse_status = parse_status

        self.storage_env = storage_env
        if self.storage_env == "remote":
            self.storage_service = bq_service or BigQueryService()
        elif self.storage_env == "local":
            self.storage_service = csv_service or LocalStorageService()
        else:
            raise ValueError("Expecting the STORAGE_ENV to be 'local' or 'remote'. Please try again...")

        # batching state: empty batch, zeroed counter
        self.batch_size = batch_size
        self.batch = []
        self.counter = 0

        print("-------------------------------")
        print("STREAM LISTENER...")
        print("  STORAGE ENV:", self.storage_env.upper())
        print("  STORAGE SERVICE:", type(self.storage_service))
        print("  BATCH SIZE:", self.batch_size)
        print("--------------------------------")
 def __init__(self, users_limit=USERS_LIMIT, batch_size=BATCH_SIZE, storage_service=None, bq_service=None):
     """
     Initialize the parent class, then attach a BigQuery service.

     `users_limit`, `batch_size` and `storage_service` are forwarded to the parent
     initializer; a new BigQueryService is built when `bq_service` is falsy.
     """
     super().__init__(
         users_limit=users_limit,
         batch_size=batch_size,
         storage_service=storage_service,
     )
     self.bq_service = bq_service if bq_service else BigQueryService()
Exemple #18
0
from app.bq_service import BigQueryService

if __name__ == "__main__":

    bq_service = BigQueryService(verbose=True)

    # each step: the banner to print, then the destructive migration to run, in this order
    migration_steps = (
        ("FLATTENING USER FRIENDS TABLE...", bq_service.destructively_migrate_user_friends_flat),
        ("BOTS ABOVE 80...", bq_service.destructively_migrate_bots_table),
        ("BOT FOLLOWERS ABOVE 80...", bq_service.destructively_migrate_bot_followers_table),
    )
    for banner, run_migration in migration_steps:
        print(banner)
        run_migration()
Exemple #19
0
 def __init__(self):
     """Build the Twitter API client and BigQuery service, and take limits from module constants."""
     twitter_service = TwitterService()
     self.twitter_api = twitter_service.api
     self.bq_service = BigQueryService()
     # limit comes from STATUS_LIMIT, batch size from BATCH_SIZE
     self.limit, self.batch_size = STATUS_LIMIT, BATCH_SIZE
    print("  TWEET_MIN:", TWEET_MIN)

    print("  LIMIT:", LIMIT)
    print("  BATCH_SIZE:", BATCH_SIZE)
    print("  DESTRUCTIVE:", DESTRUCTIVE)

    #print("  GRAPH_LIMIT:", GRAPH_LIMIT)
    print("  GRAPH_BATCH_SIZE:", GRAPH_BATCH_SIZE)
    print("  GRAPH_DESTRUCTIVE:", GRAPH_DESTRUCTIVE)

    print("------------------------")
    storage = FileStorage(
        dirpath=f"daily_active_friend_graphs_v4/{DATE}/tweet_min/{TWEET_MIN}")
    tweets_csv_filepath = os.path.join(storage.local_dirpath, "tweets.csv")

    bq_service = BigQueryService()
    job = Job()

    #
    # LOAD TWEETS
    # tweet_id, text, screen_name, bot, created_at

    # TODO: de-dup RTs so the model will only train/test on a single RT status text (PREVENT OVERFITTING)
    if os.path.exists(tweets_csv_filepath) and not DESTRUCTIVE:
        print("LOADING TWEETS...")
        statuses_df = read_csv(tweets_csv_filepath)
    else:
        job.start()
        print("DOWNLOADING TWEETS...")
        statuses = []
        for row in bq_service.fetch_daily_active_tweeter_statuses(
 def __init__(self, bq_service=None, gcs_service=None):
     """
     Initialize the parent class with `gcs_service`, then attach a BigQuery service.

     A new BigQueryService is built when `bq_service` is falsy.
     """
     super().__init__(gcs_service=gcs_service)
     self.bq_service = bq_service if bq_service else BigQueryService()