def get_generator(
    local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None
):
    """
    Downloads and extracts the dataset files and then returns a batch generator that
    yields pandas DataFrames.

    Args:
        local_cache_path (str, optional): Directory to cache the downloaded files to.
            Defaults to ".".
        file_split (str, optional): The subset to load. One of:
            {"train", "dev_matched", "dev_mismatched"}. Defaults to "train".
        block_size (int, optional): Size of each partition in bytes. Defaults to 10e6.
        batch_size (int, optional): Batch size. Defaults to 10e6.
        num_batches (int, optional): Number of batches to generate. If None, all
            batches are generated.

    Returns:
        Generator[pd.DataFrame, None, None]: Batch generator that yields pandas
            DataFrames read sequentially from the dataset file.
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

    loader = DaskJSONLoader(
        os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size
    )

    return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches)
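# Example usage (a minimal sketch; assumes the module-level URL and DATA_FILES
# constants point at the MultiNLI download and that dask is available for
# DaskJSONLoader):
#
#   batches = get_generator(local_cache_path=".", file_split="train", batch_size=1e4, num_batches=3)
#   for batch_df in batches:
#       print(batch_df.shape)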
def download(local_path=".data"):
    """Downloads bertsum_data.zip from Google Drive and extracts its contents
    into a ``processed_data`` subdirectory of ``local_path``.

    Args:
        local_path (str, optional): Directory to download the zip file to.
            Defaults to ".data".

    Returns:
        str: Path to the directory containing the extracted data files.
    """
    FILE_ID = "1x0d61LP9UAN389YN00z0Pv-7jQgirVg6"
    FILE_NAME = "bertsum_data.zip"

    os.makedirs(local_path, exist_ok=True)
    output_dir = os.path.join(local_path, "processed_data")
    os.makedirs(output_dir, exist_ok=True)

    maybe_download_googledrive(
        google_file_id=FILE_ID, file_name=FILE_NAME, work_directory=local_path
    )
    extract_zip(
        file_path=os.path.join(local_path, FILE_NAME),
        dest_path=output_dir,
    )
    return output_dir
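# Example usage (a minimal sketch; assumes the Google Drive file behind the
# hard-coded FILE_ID is still available):
#
#   data_dir = download(local_path=".data")
#   print(os.listdir(data_dir))  # extracted data files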
def load_pandas_df(local_cache_path=".", num_rows=None):
    """Downloads and extracts the dataset files.

    Args:
        local_cache_path (str, optional): Directory to cache the downloaded and
            extracted files to. Defaults to ".".
        num_rows (int, optional): Number of rows to load. If None, all data is loaded.

    Returns:
        pd.DataFrame: pandas DataFrame containing the loaded dataset.
    """
    zip_file = URL.split("/")[-1]
    maybe_download(URL, zip_file, local_cache_path)

    zip_file_path = os.path.join(local_cache_path, zip_file)
    csv_file_path = os.path.join(local_cache_path, zip_file.replace(".zip", ""))

    if not os.path.exists(csv_file_path):
        extract_zip(file_path=zip_file_path, dest_path=local_cache_path)

    return pd.read_csv(csv_file_path, nrows=num_rows)
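# Example usage (a minimal sketch; assumes the module-level URL constant points
# at a zip archive containing a single CSV file with the same base name):
#
#   df = load_pandas_df(local_cache_path=".", num_rows=1000)
#   print(df.head())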
def download_file_and_extract(local_cache_path: str = ".", file_split: str = "train") -> None:
    """Downloads and extracts the dataset files.

    Args:
        local_cache_path (str, optional): Directory to cache the files to.
            Defaults to the current working directory ".".
        file_split (str, optional): The subset to download and extract. One of:
            {"train", "dev_matched", "dev_mismatched"}. Defaults to "train".

    Returns:
        None
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)
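# Example usage (a minimal sketch; downloads the archive and extracts only the
# "dev_matched" split):
#
#   download_file_and_extract(local_cache_path=".", file_split="dev_matched")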
def load_pandas_df(local_cache_path=".", file_split="train"):
    """Downloads and extracts the dataset files.

    Args:
        local_cache_path (str, optional): Directory to cache the files to.
            Defaults to ".".
        file_split (str, optional): The subset to load. One of:
            {"train", "dev_matched", "dev_mismatched"}. Defaults to "train".

    Returns:
        pd.DataFrame: pandas DataFrame containing the specified MultiNLI subset.
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

    return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True)
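# Example usage (a minimal sketch; loads the full MultiNLI training split into
# memory, which can take a few GB of RAM):
#
#   train_df = load_pandas_df(local_cache_path=".", file_split="train")
#   print(train_df.head())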
def download_tsv_files_and_extract(local_cache_path: str = ".") -> None:
    """Downloads and extracts the MultiNLI dataset files in tsv format from NYU Jiant.
    The archive contains both the original and the tsv formatted data.

    Args:
        local_cache_path (str, optional): Directory to cache the files to.
            Defaults to the current working directory ".".

    Returns:
        None
    """
    try:
        folder_name = "MNLI"
        file_name = f"{folder_name}.zip"
        maybe_download(URL_JIANT_MNLI_TSV, file_name, local_cache_path)

        if not os.path.exists(os.path.join(local_cache_path, folder_name)):
            extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

        # Clean up the downloaded zip file
        if os.path.exists(os.path.join(local_cache_path, file_name)):
            os.remove(os.path.join(local_cache_path, file_name))
    except IOError as e:
        raise e

    print("Downloaded file to: ", os.path.join(local_cache_path, folder_name))
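# Example usage (a minimal sketch; assumes URL_JIANT_MNLI_TSV points at the
# NYU Jiant MNLI tsv archive):
#
#   download_tsv_files_and_extract(local_cache_path=".")
#   # the tsv files are now available under ./MNLI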
def CNNDMSummarizationDatasetOrg(local_path=".", top_n=-1, return_iterable=False, return_dev_data=False):
    """
    Downloads a version of the CNN/DailyMail dataset with minimal processing
    from https://github.com/microsoft/unilm/tree/master/unilm-v1

    This version of the CNN/DM dataset was originally downloaded from
    https://github.com/harvardnlp/sent-summary
    and preprocessed following https://github.com/abisee/cnn-dailymail.

    Args:
        local_path (str): Path to store the downloaded data. If the data file
            doesn't exist in this path, it's downloaded and unzipped.
        top_n (int): Number of lines to read. Defaults to -1, in which case the
            entire dataset is read.
        return_iterable (bool): If False, returns SummarizationDataset.
            If True, returns IterableSummarizationDataset. Defaults to False.
        return_dev_data (bool): If False, returns train and test data splits.
            If True, returns train, test, and dev data splits. Defaults to False.

    Returns:
        tuple: tuple containing train, test (, and dev) datasets.
    """
    # Download and unzip the data
    FILE_ID = "1jiDbDbAsqy_5BM79SmX6aSu5DQVCAZq1"
    FILE_NAME = "cnndm_data.zip"
    output_dir = os.path.join(local_path, "cnndm_data")
    os.makedirs(output_dir, exist_ok=True)

    # This folder contains the version of the dataset with minimal processing
    org_data_dir = os.path.join(output_dir, "org_data")

    expected_data_files = set(
        [
            "train.src",
            "org_data",
            "dev.src",
            "test.tgt",
            "train.tgt",
            "dev.tgt",
            "test.src",
        ]
    )
    expected_org_data_files = set(
        [
            "training.summary",
            "test.article",
            "dev.article",
            "training.article",
            "dev.summary",
            "test.summary",
        ]
    )

    maybe_download_googledrive(google_file_id=FILE_ID, file_name=FILE_NAME, work_directory=local_path)

    if (
        set(os.listdir(output_dir)) != expected_data_files
        or set(os.listdir(org_data_dir)) != expected_org_data_files
    ):
        extract_zip(
            file_path=os.path.join(local_path, FILE_NAME),
            dest_path=output_dir,
        )

    train_source_file = os.path.join(org_data_dir, "training.article")
    train_target_file = os.path.join(org_data_dir, "training.summary")
    test_source_file = os.path.join(org_data_dir, "test.article")
    test_target_file = os.path.join(org_data_dir, "test.summary")
    dev_source_file = os.path.join(org_data_dir, "dev.article")
    dev_target_file = os.path.join(org_data_dir, "dev.summary")

    source_preprocessing = [detokenize]
    target_preprocessing = [detokenize]

    if return_iterable:
        train_dataset = IterableSummarizationDataset(
            source_file=train_source_file,
            target_file=train_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )
        test_dataset = IterableSummarizationDataset(
            source_file=test_source_file,
            target_file=test_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )
    else:
        train_dataset = SummarizationDataset(
            source_file=train_source_file,
            target_file=train_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )
        test_dataset = SummarizationDataset(
            source_file=test_source_file,
            target_file=test_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )

    if return_dev_data:
        if return_iterable:
            dev_dataset = IterableSummarizationDataset(
                source_file=dev_source_file,
                target_file=dev_target_file,
                source_preprocessing=source_preprocessing,
                target_preprocessing=target_preprocessing,
                top_n=top_n,
            )
        else:
            dev_dataset = SummarizationDataset(
                source_file=dev_source_file,
                target_file=dev_target_file,
                source_preprocessing=source_preprocessing,
                target_preprocessing=target_preprocessing,
                top_n=top_n,
            )
        return train_dataset, test_dataset, dev_dataset
    else:
        return train_dataset, test_dataset
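# Example usage (a minimal sketch; reads only the first 1000 examples of each
# split and requests the dev split as well):
#
#   train_ds, test_ds, dev_ds = CNNDMSummarizationDatasetOrg(
#       local_path=".", top_n=1000, return_iterable=False, return_dev_data=True
#   )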
def load_pandas_df(local_cache_path=".", file_split="dev", language="zh"):
    """Downloads and extracts the dataset files.

    Dataset information can be found `here <https://www.nyu.edu/projects/bowman/xnli/>`_.

    Args:
        local_cache_path (str, optional): Path to store the data. Defaults to ".".
        file_split (str, optional): The subset to load. One of: {"train", "dev", "test"}.
            Defaults to "dev".
        language (str, optional): Language subset to read. One of:
            {"en", "fr", "es", "de", "el", "bg", "ru", "tr", "ar", "vi",
             "th", "zh", "hi", "sw", "ur"}
            Defaults to "zh" (Chinese).

    Returns:
        pd.DataFrame: pandas DataFrame containing the specified XNLI subset.
    """
    if file_split in ("dev", "test"):
        url = URL_XNLI
        sentence_1_index = 6
        sentence_2_index = 7
        label_index = 1

        zip_file_name = url.split("/")[-1]
        folder_name = ".".join(zip_file_name.split(".")[:-1])
        file_name = folder_name + "/" + ".".join(["xnli", file_split, "tsv"])
    elif file_split == "train":
        url = URL_XNLI_MT
        sentence_1_index = 0
        sentence_2_index = 1
        label_index = 2

        zip_file_name = url.split("/")[-1]
        folder_name = ".".join(zip_file_name.split(".")[:-1])
        file_name = folder_name + "/multinli/" + ".".join(["multinli", file_split, language, "tsv"])

    maybe_download(url, zip_file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, folder_name)):
        extract_zip(os.path.join(local_cache_path, zip_file_name), local_cache_path)

    with open(os.path.join(local_cache_path, file_name), "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    line_list = [line.split("\t") for line in lines]

    # Remove the column name row
    line_list.pop(0)
    if file_split != "train":
        line_list = [line for line in line_list if line[0] == language]

    # Filter out lines with an empty premise or hypothesis
    valid_lines = [bool(line[sentence_1_index] and line[sentence_2_index]) for line in line_list]
    total_line_count = len(line_list)
    line_list = [line for line, valid in zip(line_list, valid_lines) if valid]
    valid_line_count = len(line_list)

    if valid_line_count != total_line_count:
        print("{} invalid lines removed.".format(total_line_count - valid_line_count))

    label_list = [convert_to_unicode(line[label_index]) for line in line_list]
    old_contradict_label = convert_to_unicode("contradictory")
    new_contradict_label = convert_to_unicode("contradiction")

    # Normalize the "contradictory" label used in the train split to "contradiction"
    label_list = [
        new_contradict_label if label == old_contradict_label else label for label in label_list
    ]
    text_list = [
        (convert_to_unicode(line[sentence_1_index]), convert_to_unicode(line[sentence_2_index]))
        for line in line_list
    ]

    df = pd.DataFrame({"text": text_list, "label": label_list})

    return df
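# Example usage (a minimal sketch; loads the French dev split of XNLI, where
# "text" holds (premise, hypothesis) pairs and "label" the entailment label):
#
#   dev_df = load_pandas_df(local_cache_path=".", file_split="dev", language="fr")
#   print(dev_df["label"].value_counts())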