        batch = []

    job.end()

    seek_confirmation()
    #exit()

    for model_name in ["logistic_regression", "multinomial_nb"]:
        storage = ModelStorage(dirpath=f"nlp_v2/models/best/{model_name}")
        tv = storage.load_vectorizer()
        clf = storage.load_model()

        print(f"DESTROY PREDICTIONS TABLE? ({model_name})")
        seek_confirmation()
        bq_service.nlp_v2_destructively_migrate_predictions_table(model_name)
        predictions_table = bq_service.nlp_v2_get_predictions_table(model_name) # API call. cache it here once.

        job.start()
        for chunk_df in read_csv(CSV_FILEPATH, chunksize=BATCH_SIZE): # FYI: this will include the last chunk even if it is not a full batch
            status_ids = chunk_df["status_id"].tolist()
            status_texts = chunk_df["status_text"].tolist()

            preds = clf.predict(tv.transform(status_texts))

            batch = [{"status_id": status_id, "prediction": pred} for status_id, pred in zip(status_ids, preds)]
            bq_service.insert_records_in_batches(predictions_table, batch)

            job.counter += len(chunk_df)
            job.progress_report()
            batch = []
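
# --------------------------------------------------------------------------------
# Note (illustrative only, not part of either script): the chunked-read pattern above
# relies on pandas yielding DataFrames of at most `chunksize` rows, with the final
# chunk possibly smaller, as the "FYI" comment says. A minimal standalone sketch
# (the "example.csv" filename and its single column are hypothetical):
#
#   from pandas import DataFrame, read_csv
#
#   DataFrame({"status_id": range(10)}).to_csv("example.csv", index=False)
#   for chunk_df in read_csv("example.csv", chunksize=4):
#       print(len(chunk_df))  # prints 4, 4, 2 -- the last chunk is a partial batch
# --------------------------------------------------------------------------------
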
import os

from pandas import read_csv  # assumed import: read_csv(..., chunksize=...) below is the pandas API

from app import DATA_DIR, seek_confirmation  # assumed: shared constants/helpers from the app package, as in the other scripts
from app.job import Job  # assumed: the Job timer/progress helper used by the other scripts
from app.bq_service import BigQueryService
from app.retweet_graphs_v2.k_days.generator import DateRangeGenerator

BATCH_SIZE = int(os.getenv("BATCH_SIZE", default=25000)) # the max number of records to store in BQ at once (with a single insert API call)

if __name__ == "__main__":

    bq_service = BigQueryService()
    job = Job()

    print("DESTROY PREDICTIONS TABLE? (BERT)")
    seek_confirmation()
    bq_service.nlp_v2_destructively_migrate_predictions_table("bert")
    predictions_table = bq_service.nlp_v2_get_predictions_table("bert")

    job.start()
    for dr in DateRangeGenerator(start_date="2019-12-20", k_days=1, n_periods=58).date_ranges:
        print(dr.start_date)

        csv_filepath = os.path.join(DATA_DIR, "daily_active_edge_friend_graphs_v5", dr.start_date, "tweets_BERT_Impeachment_800KTweets.csv")
        #df = read_csv(csv_filepath, usecols=["status_id", "text", "logit_0", "logit_1", "opinion_tweet"], nrows=100)
        #print(df.head())

        for chunk_df in read_csv(