Example #1
            batch = []
        job.end()

    seek_confirmation()
    #exit()

    for model_name in ["logistic_regression", "multinomial_nb"]:

        storage = ModelStorage(dirpath=f"nlp_v2/models/best/{model_name}")
        tv = storage.load_vectorizer()
        clf = storage.load_model()

        print(f"DESTROY PREDICTIONS TABLE? ({model_name})")
        seek_confirmation()
        bq_service.nlp_v2_destructively_migrate_predictions_table(model_name)
        predictions_table = bq_service.nlp_v2_get_predictions_table(model_name) # API call. cache it here once.

        job.start()

        for chunk_df in read_csv(CSV_FILEPATH, chunksize=BATCH_SIZE): # FYI: this will include the last chunk even if it is not a full batch
            status_ids = chunk_df["status_id"].tolist()
            status_texts = chunk_df["status_text"].tolist()

            preds = clf.predict(tv.transform(status_texts))

            batch = [{"status_id": status_id, "prediction": pred} for status_id, pred in zip(status_ids, preds)]
            bq_service.insert_records_in_batches(predictions_table, batch)

            job.counter += len(chunk_df)
            job.progress_report()
            batch = []
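
The loop above keeps memory bounded by letting pandas stream the CSV in fixed-size chunks and by sending each chunk's predictions to BigQuery as one batched insert. Below is a minimal, self-contained sketch of that chunked-read / batched-write pattern; the file path, column names, and insert_batch() helper are hypothetical stand-ins, not the project's ModelStorage / BigQueryService API.

from pandas import read_csv

CSV_FILEPATH = "example_tweets.csv"  # hypothetical input file
BATCH_SIZE = 25_000  # rows per chunk, and records per insert call


def insert_batch(records):
    """Stand-in for one batched write (e.g. a single BigQuery insert API call)."""
    print(f"inserting {len(records)} records...")


for chunk_df in read_csv(CSV_FILEPATH, chunksize=BATCH_SIZE):
    # read_csv with chunksize yields DataFrames of up to BATCH_SIZE rows,
    # including a final partial chunk if the row count is not a multiple of BATCH_SIZE
    records = chunk_df[["status_id", "status_text"]].to_dict("records")
    insert_batch(records)  # one write per chunk bounds both memory use and API call volume

Because pandas emits the final partial chunk automatically, the script can simply add len(chunk_df) to job.counter each iteration without special-casing the last batch.
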
Example #2
import os

from pandas import read_csv

from app.bq_service import BigQueryService
from app.retweet_graphs_v2.k_days.generator import DateRangeGenerator
from app.job import Job  # assumed project-internal import path
from app import DATA_DIR, seek_confirmation  # assumed project-internal import path

BATCH_SIZE = int(
    os.getenv("BATCH_SIZE", default=25000)
)  # the max number of prediction records to store in BQ at once (with a single insert API call)

if __name__ == "__main__":

    bq_service = BigQueryService()
    job = Job()

    print(f"DESTROY PREDICTIONS TABLE? (BERT)")
    seek_confirmation()
    bq_service.nlp_v2_destructively_migrate_predictions_table("bert")
    predictions_table = bq_service.nlp_v2_get_predictions_table("bert")

    job.start()

    for dr in DateRangeGenerator(start_date="2019-12-20",
                                 k_days=1,
                                 n_periods=58).date_ranges:
        print(dr.start_date)
        csv_filepath = os.path.join(DATA_DIR,
                                    "daily_active_edge_friend_graphs_v5",
                                    dr.start_date,
                                    "tweets_BERT_Impeachment_800KTweets.csv")

        #df = read_csv(csv_filepath, usecols=["status_id", "text", "logit_0", "logit_1", "opinion_tweet"], nrows=100)
        #print(df.head())
        for chunk_df in read_csv(