def download_and_preprocess_imdb():
    """Download and preprocess the IMDB sentiment classification dataset.

    Delegates to ``download_raw_and_preprocess`` with the ``"imdb-raw"``
    download entry and ``preprocess_imdb_data`` as the preprocessing
    function. Each raw split location is paired with the JSON file name
    its preprocessed form is written under (train and test).
    """
    # (raw input location, preprocessed output file) for each split.
    split_files = [
        ("raw/aclImdb/train", "train.json"),
        ("raw/aclImdb/test", "test.json"),
    ]
    download_raw_and_preprocess(
        dataset_name="imdb",
        download_list=["imdb-raw"],
        preprocess_fn=preprocess_imdb_data,
        preprocess_input_output_list=split_files,
    )
def download_and_preprocess_yelp():
    """Download and preprocess the Yelp dataset.

    Delegates to ``download_raw_and_preprocess`` with the ``"yelp-raw"``
    download entry and ``preprocess_yelp_data`` as the preprocessing
    function. The train and test CSV files under
    ``raw/yelp_review_polarity_csv`` are each paired with an output JSON
    file name.
    """
    # (raw CSV input, preprocessed output file) for each split.
    split_files = [
        ("raw/yelp_review_polarity_csv/train.csv", "train.json"),
        ("raw/yelp_review_polarity_csv/test.csv", "test.json"),
    ]
    download_raw_and_preprocess(
        dataset_name="yelp",
        download_list=["yelp-raw"],
        preprocess_fn=preprocess_yelp_data,
        preprocess_input_output_list=split_files,
    )
def download_and_preprocess_snli():
    """Download and preprocess the SNLI dataset.

    Delegates to ``download_raw_and_preprocess`` with the ``"snli-raw"``
    download entry and ``preprocess_snli_data`` as the preprocessing
    function. The train, dev and test JSONL files under
    ``raw/snli_1.0`` are each paired with an output JSON file name.
    """
    # (raw JSONL input, preprocessed output file) for each split.
    split_files = [
        ("raw/snli_1.0/snli_1.0_train.jsonl", "train.json"),
        ("raw/snli_1.0/snli_1.0_dev.jsonl", "dev.json"),
        ("raw/snli_1.0/snli_1.0_test.jsonl", "test.json"),
    ]
    download_raw_and_preprocess(
        dataset_name="snli",
        download_list=["snli-raw"],
        preprocess_fn=preprocess_snli_data,
        preprocess_input_output_list=split_files,
    )
def download_and_preprocess_mnli():
    """Download and preprocess the MNLI dataset.

    Delegates to ``download_raw_and_preprocess`` with the ``"mnli-raw"``
    download entry and ``preprocess_mnli_data`` as the preprocessing
    function. Three raw JSONL files under ``raw/multinli_1.0`` are
    handled: train, dev-matched and dev-mismatched.
    """
    # Raw JSONL input -> preprocessed output file. Dict insertion order
    # keeps the pairs in train, dev_matched, dev_mismatched order.
    output_for_raw = {
        "raw/multinli_1.0/multinli_1.0_train.jsonl": "train.json",
        "raw/multinli_1.0/multinli_1.0_dev_matched.jsonl": "dev_matched.json",
        "raw/multinli_1.0/multinli_1.0_dev_mismatched.jsonl":
            "dev_mismatched.json",
    }
    download_raw_and_preprocess(
        dataset_name="mnli",
        download_list=["mnli-raw"],
        preprocess_fn=preprocess_mnli_data,
        # The callee receives a list of (input, output) tuples.
        preprocess_input_output_list=list(output_for_raw.items()),
    )
def download_and_preprocess_ag():
    """Download and preprocess the AG's news dataset.

    Two dataset variants are produced from the same raw train/test CSV
    files, each through its own ``download_raw_and_preprocess`` call:

    * ``"ag"`` uses ``preprocess_ag_data`` with its default options.
    * ``"ag_no_title"`` uses ``preprocess_ag_data`` with
      ``include_title=False`` and ``include_author_media=False``.
    """
    raw_downloads = ("ag-raw-train", "ag-raw-test")
    # (raw CSV input, preprocessed output file) for each split.
    split_files = (
        ("raw/train.csv", "train.json"),
        ("raw/test.csv", "test.json"),
    )

    # Variant 1: default preprocessing. Fresh lists are built per call so
    # the two calls never share a list object.
    download_raw_and_preprocess(
        dataset_name="ag",
        download_list=list(raw_downloads),
        preprocess_fn=preprocess_ag_data,
        preprocess_input_output_list=list(split_files),
    )

    # Variant 2: same raw files, with title and author/media disabled.
    download_raw_and_preprocess(
        dataset_name="ag_no_title",
        download_list=list(raw_downloads),
        preprocess_fn=lambda inp, out: preprocess_ag_data(
            inp, out, include_title=False, include_author_media=False),
        preprocess_input_output_list=list(split_files),
    )