Code Example #1
File: multinli.py Project: zwbjtu123/nlp
def get_generator(
    local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None
):
    """ Downloads and extracts the dataset files and then returns a random batch generator that
    yields pandas dataframes.
    Args:
        local_cache_path ([type], optional): [description]. Defaults to None.
        file_split (str, optional): The subset to load.
            One of: {"train", "dev_matched", "dev_mismatched"}
            Defaults to "train".
        block_size (int, optional): Size of partition in bytes.
        random_seed (int, optional): Random seed. See random.seed().Defaults to None.
        num_batches (int): Number of batches to generate.
        batch_size (int]): Batch size.
    Returns:
        Generator[pd.Dataframe, None, None] : Random batch generator that yields pandas dataframes.
    """

    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

    loader = DaskJSONLoader(
        os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size
    )

    return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches)
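
A minimal usage sketch for the generator above, assuming the surrounding multinli.py module is importable as "multinli" and that its module-level URL and DATA_FILES constants (not shown in this snippet) point at the MultiNLI download:

# Hypothetical usage; the import path depends on where multinli.py lives in the project.
from multinli import get_generator

# Stream two sequential batches from the training split.
batches = get_generator(local_cache_path=".", file_split="train", batch_size=10e6, num_batches=2)
for df in batches:
    # Each item is a pandas DataFrame built from the MultiNLI jsonl records.
    print(df.shape)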
Code Example #2
def download(local_path=".data"):
    """Downloads the BertSum data archive from Google Drive and extracts it
    into a "processed_data" subdirectory of local_path, which is returned."""
    FILE_ID = "1x0d61LP9UAN389YN00z0Pv-7jQgirVg6"
    FILE_NAME = "bertsum_data.zip"
    os.makedirs(local_path, exist_ok=True)
    output_dir = os.path.join(local_path, "processed_data")
    os.makedirs(output_dir, exist_ok=True)
    maybe_download_googledrive(
        google_file_id=FILE_ID, file_name=FILE_NAME, work_directory=local_path
    )
    extract_zip(
        file_path=os.path.join(local_path, FILE_NAME), dest_path=output_dir,
    )
    return output_dir
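
A hedged usage sketch, assuming maybe_download_googledrive and extract_zip are available in the same module and the Google Drive file is still reachable:

# Hypothetical usage: fetch and extract the archive, then inspect the result.
data_dir = download(local_path=".data")
print(data_dir)              # ".data/processed_data"
print(os.listdir(data_dir))  # the extracted files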
Code Example #3
def load_pandas_df(local_cache_path=".", num_rows=None):
    """Downloads and extracts the dataset files.

    Args:
        local_cache_path (str, optional): Path to store the data files. Defaults to ".".
        num_rows (int, optional): Number of rows to load. If None, all data is loaded.

    Returns:
        pd.DataFrame: pandas DataFrame containing the loaded dataset.
    """
    zip_file = URL.split("/")[-1]
    maybe_download(URL, zip_file, local_cache_path)

    zip_file_path = os.path.join(local_cache_path, zip_file)
    csv_file_path = os.path.join(local_cache_path, zip_file.replace(".zip", ""))

    if not os.path.exists(csv_file_path):
        extract_zip(file_path=zip_file_path, dest_path=local_cache_path)
    return pd.read_csv(csv_file_path, nrows=num_rows)
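
A minimal usage sketch, assuming the module-level URL constant points at a zip archive that contains a single CSV file of the same base name, as the code above expects:

# Hypothetical usage: load only the first 1000 rows into memory.
df = load_pandas_df(local_cache_path=".", num_rows=1000)
print(df.head())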
Code Example #4
File: multinli.py Project: zuoxiaolei/nlp-recipes
def download_file_and_extract(local_cache_path: str = ".",
                              file_split: str = "train") -> None:
    """Download and extract the dataset files

    Args:
        local_cache_path (str [optional]) -- Directory to cache files to. Defaults to current working directory (default: {"."})
        file_split {str} -- [description] (default: {"train"})
    
    Returns:
        None -- Nothing is returned
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(
            os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name),
                    local_cache_path)
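
A short usage sketch, again assuming this function lives in the project's multinli.py module alongside the URL and DATA_FILES constants:

# Hypothetical usage: download and unpack only the matched dev split.
download_file_and_extract(local_cache_path=".", file_split="dev_matched")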
Code Example #5
File: multinli.py Project: zwbjtu123/nlp
def load_pandas_df(local_cache_path=".", file_split="train"):
    """Downloads and extracts the dataset files
    Args:
        local_cache_path ([type], optional): [description]. Defaults to None.
        file_split (str, optional): The subset to load.
            One of: {"train", "dev_matched", "dev_mismatched"}
            Defaults to "train".
    Returns:
        pd.DataFrame: pandas DataFrame containing the specified
            MultiNLI subset.
    """

    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)
    return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True)
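
A minimal usage sketch for the loader above, under the same assumption that multinli.py defines URL and DATA_FILES for the MultiNLI download:

# Hypothetical usage: load the matched dev split as a DataFrame.
df = load_pandas_df(local_cache_path=".", file_split="dev_matched")
print(df.shape)
print(df.head())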
Code Example #6
def download_tsv_files_and_extract(local_cache_path: str = ".") -> None:
    """Download and extract the dataset files in tsv format from NYU Jiant 
        downloads both original and tsv formatted data. 

    Args:
        local_cache_path (str [optional]) -- Directory to cache files to. Defaults to current working directory (default: {"."})
    
    Returns:
        None -- Nothing is returned
    """
    try:
        folder_name = "MNLI"
        file_name = f"{folder_name}.zip"
        maybe_download(URL_JIANT_MNLI_TSV, file_name, local_cache_path)
        if not os.path.exists(os.path.join(local_cache_path, folder_name)):
            extract_zip(os.path.join(local_cache_path, file_name),
                        local_cache_path)

        # Clean up zip download
        if os.path.exists(os.path.join(local_cache_path, file_name)):
            os.remove(os.path.join(local_cache_path, file_name))
    except IOError:
        raise
    print("Downloaded file to: ", os.path.join(local_cache_path, folder_name))
Code Example #7
def CNNDMSummarizationDatasetOrg(local_path=".",
                                 top_n=-1,
                                 return_iterable=False,
                                 return_dev_data=False):
    """
    Downloads a version of the CNN/DailyMail dataset with minimal processing
    from https://github.com/microsoft/unilm/tree/master/unilm-v1
    This version of the CNN/DM dataset was originally downloaded from
    https://github.com/harvardnlp/sent-summary
    and preprocessed following https://github.com/abisee/cnn-dailymail.

    Args:
        local_path (str): Path to store the downloaded data. If the data file
            doesn't exist in this path, it's downloaded and unzipped.
        top_n (int): Number of lines to read. Defaults to -1, in which case
            the entire dataset is read.
        return_iterable (bool): If False, returns SummarizationDataset.
            If True, returns IterableSummarizationDataset. Defaults to False.
        return_dev_data (bool): if False, returns train and test data splits.
            If True, returns train, test, and dev data splits. Defaults to False.

    Returns:
        tuple: tuple containing train, test (, and dev) datasets.
    """

    # Download and unzip the data
    FILE_ID = "1jiDbDbAsqy_5BM79SmX6aSu5DQVCAZq1"
    FILE_NAME = "cnndm_data.zip"

    output_dir = os.path.join(local_path, "cnndm_data")
    os.makedirs(output_dir, exist_ok=True)

    # This folder contains a version of the dataset with minimal processing
    org_data_dir = os.path.join(output_dir, "org_data")

    expected_data_files = set([
        "train.src",
        "org_data",
        "dev.src",
        "test.tgt",
        "train.tgt",
        "dev.tgt",
        "test.src",
    ])
    expected_org_data_files = set([
        "training.summary",
        "test.article",
        "dev.article",
        "training.article",
        "dev.summary",
        "test.summary",
    ])

    maybe_download_googledrive(google_file_id=FILE_ID,
                               file_name=FILE_NAME,
                               work_directory=local_path)

    if (set(os.listdir(output_dir)) != expected_data_files
            or set(os.listdir(org_data_dir)) != expected_org_data_files):
        extract_zip(
            file_path=os.path.join(local_path, FILE_NAME),
            dest_path=output_dir,
        )

    train_source_file = os.path.join(org_data_dir, "training.article")
    train_target_file = os.path.join(org_data_dir, "training.summary")
    test_source_file = os.path.join(org_data_dir, "test.article")
    test_target_file = os.path.join(org_data_dir, "test.summary")
    dev_source_file = os.path.join(org_data_dir, "dev.article")
    dev_target_file = os.path.join(org_data_dir, "dev.summary")

    source_preprocessing = [detokenize]
    target_preprocessing = [detokenize]

    if return_iterable:
        train_dataset = IterableSummarizationDataset(
            source_file=train_source_file,
            target_file=train_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )

        test_dataset = IterableSummarizationDataset(
            source_file=test_source_file,
            target_file=test_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )
    else:
        train_dataset = SummarizationDataset(
            source_file=train_source_file,
            target_file=train_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )

        test_dataset = SummarizationDataset(
            source_file=test_source_file,
            target_file=test_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )

    if return_dev_data:
        if return_iterable:
            dev_dataset = IterableSummarizationDataset(
                source_file=dev_source_file,
                target_file=dev_target_file,
                source_preprocessing=source_preprocessing,
                target_preprocessing=target_preprocessing,
                top_n=top_n,
            )
        else:
            dev_dataset = SummarizationDataset(
                source_file=dev_source_file,
                target_file=dev_target_file,
                source_preprocessing=source_preprocessing,
                target_preprocessing=target_preprocessing,
                top_n=top_n,
            )

        return train_dataset, test_dataset, dev_dataset
    else:
        return train_dataset, test_dataset
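
A hedged usage sketch for the function above; the Google Drive download is large, so top_n is kept small here, and SummarizationDataset / IterableSummarizationDataset are assumed to come from the same project:

# Hypothetical usage: build small train/test datasets with minimal processing.
train_ds, test_ds = CNNDMSummarizationDatasetOrg(local_path=".", top_n=100)

# With return_dev_data=True the dev split is returned as well.
train_ds, test_ds, dev_ds = CNNDMSummarizationDatasetOrg(
    local_path=".", top_n=100, return_dev_data=True
)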
Code Example #8
def load_pandas_df(local_cache_path=".", file_split="dev", language="zh"):
    """Downloads and extracts the dataset files.

    More information can be found `at the XNLI project page <https://www.nyu.edu/projects/bowman/xnli/>`_.

    Args:
        local_cache_path (str, optional): Path to store the data.
            Defaults to ".".
        file_split (str, optional): The subset to load.
            One of: {"train", "dev", "test"}
            Defaults to "dev".
        language (str, optional): language subset to read.
            One of: {"en", "fr", "es", "de", "el", "bg", "ru",
            "tr", "ar", "vi", "th", "zh", "hi", "sw", "ur"}
            Defaults to "zh" (Chinese).
    Returns:
        pd.DataFrame: pandas DataFrame containing the specified
            XNLI subset.
    """

    if file_split in ("dev", "test"):
        url = URL_XNLI
        sentence_1_index = 6
        sentence_2_index = 7
        label_index = 1

        zip_file_name = url.split("/")[-1]
        folder_name = ".".join(zip_file_name.split(".")[:-1])
        file_name = folder_name + "/" + ".".join(["xnli", file_split, "tsv"])
    elif file_split == "train":
        url = URL_XNLI_MT
        sentence_1_index = 0
        sentence_2_index = 1
        label_index = 2

        zip_file_name = url.split("/")[-1]
        folder_name = ".".join(zip_file_name.split(".")[:-1])
        file_name = folder_name + "/multinli/" + ".".join(
            ["multinli", file_split, language, "tsv"])

    maybe_download(url, zip_file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, folder_name)):
        extract_zip(os.path.join(local_cache_path, zip_file_name),
                    local_cache_path)

    with open(os.path.join(local_cache_path, file_name), "r",
              encoding="utf-8") as f:
        lines = f.read().splitlines()

    line_list = [line.split("\t") for line in lines]

    # Remove the column name row
    line_list.pop(0)
    if file_split != "train":
        line_list = [line for line in line_list if line[0] == language]

    valid_lines = [
        True if line[sentence_1_index] and line[sentence_2_index] else False
        for line in line_list
    ]
    total_line_count = len(line_list)
    line_list = [line for line, valid in zip(line_list, valid_lines) if valid]
    valid_line_count = len(line_list)

    if valid_line_count != total_line_count:
        print("{} invalid lines removed.".format(total_line_count -
                                                 valid_line_count))

    label_list = [convert_to_unicode(line[label_index]) for line in line_list]
    old_contradict_label = convert_to_unicode("contradictory")
    new_contradict_label = convert_to_unicode("contradiction")
    label_list = [
        new_contradict_label if label == old_contradict_label else label
        for label in label_list
    ]
    text_list = [(convert_to_unicode(line[sentence_1_index]),
                  convert_to_unicode(line[sentence_2_index]))
                 for line in line_list]

    df = pd.DataFrame({"text": text_list, "label": label_list})

    return df
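
A minimal usage sketch, assuming URL_XNLI, URL_XNLI_MT, maybe_download, extract_zip, and convert_to_unicode are all defined in the surrounding module:

# Hypothetical usage: load the French dev subset of XNLI.
df = load_pandas_df(local_cache_path=".", file_split="dev", language="fr")
print(df.shape)
print(df.loc[0, "text"])   # (premise, hypothesis) sentence pair
print(df.loc[0, "label"])  # e.g. "entailment", "neutral", or "contradiction"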