Example No. 1
def _setup_datasets(url, top_n=-1, local_cache_path=".data"):
    FILE_NAME = "cnndm.tar.gz"
    maybe_download(url, FILE_NAME, local_cache_path)
    dataset_tar = os.path.join(local_cache_path, FILE_NAME)
    extracted_files = extract_archive(dataset_tar)
    for fname in extracted_files:
        if fname.endswith("train.txt.src"):
            train_source_file = fname
        if fname.endswith("train.txt.tgt.tagged"):
            train_target_file = fname
        if fname.endswith("test.txt.src"):
            test_source_file = fname
        if fname.endswith("test.txt.tgt.tagged"):
            test_target_file = fname

    return (
        SummarizationDataset(
            train_source_file,
            train_target_file,
            [_clean, tokenize.sent_tokenize],
            [_clean, _remove_ttags, _target_sentence_tokenization],
            nltk.word_tokenize,
            top_n,
        ),
        SummarizationDataset(
            test_source_file,
            test_target_file,
            [_clean, tokenize.sent_tokenize],
            [_clean, _remove_ttags, _target_sentence_tokenization],
            nltk.word_tokenize,
            top_n,
        ),
    )
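A minimal usage sketch for the helper above; CNNDM_URL here is a hypothetical name standing in for the archive URL the caller supplies:

# Hypothetical usage: download and extract the CNN/DailyMail archive, keeping the
# first 1000 examples of each split (CNNDM_URL is assumed to point at cnndm.tar.gz).
train_dataset, test_dataset = _setup_datasets(url=CNNDM_URL, top_n=1000)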
Example No. 2
def get_generator(
    local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None
):
    """ Downloads and extracts the dataset files and then returns a random batch generator that
    yields pandas dataframes.
    Args:
        local_cache_path ([type], optional): [description]. Defaults to None.
        file_split (str, optional): The subset to load.
            One of: {"train", "dev_matched", "dev_mismatched"}
            Defaults to "train".
        block_size (int, optional): Size of partition in bytes.
        random_seed (int, optional): Random seed. See random.seed().Defaults to None.
        num_batches (int): Number of batches to generate.
        batch_size (int]): Batch size.
    Returns:
        Generator[pd.Dataframe, None, None] : Random batch generator that yields pandas dataframes.
    """

    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)

    loader = DaskJSONLoader(
        os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size
    )

    return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches)
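A short usage sketch, assuming the module-level URL and DATA_FILES constants referenced above are defined:

# Hypothetical usage: stream two batches of the "train" split and print their shapes.
batches = get_generator(local_cache_path=".", file_split="train", batch_size=10000, num_batches=2)
for batch_df in batches:
    print(batch_df.shape)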
Example No. 3
def test_maybe_download():
    # ToDo: Change this url when repo goes public.
    file_url = "https://raw.githubusercontent.com/Microsoft/Recommenders/master/LICENSE"
    filepath = "license.txt"
    assert not os.path.exists(filepath)
    filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
    assert os.path.exists(filepath)
    os.remove(filepath)
    with pytest.raises(IOError):
        filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
Example No. 4
def load_train_test_dfs(local_cache_path="./",
                        test_fraction=0.5,
                        random_seed=None):
    """
    Get the training and testing data frames based on test_fraction.

    Args:
        local_cache_path (str): Path to store the data. If the data file
            doesn't exist in this path, it's downloaded.
        test_fraction (float, optional): Fraction of data to use for
            testing. Since this is a small dataset, the default testing
            fraction is set to 0.5.
        random_seed (float, optional): Random seed used to shuffle the data.

    Returns:
        tuple: (train_pandas_df, test_pandas_df), each data frame contains
            two columns
            "sentence": sentences in strings.
            "labels": list of entity labels of the words in the sentence.

    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    data_file = os.path.join(local_cache_path, file_name)

    with open(data_file, "r", encoding="utf8") as file:
        text = file.read()

    sentence_list, labels_list = preprocess_conll(text)

    if random_seed is not None:
        random.seed(random_seed)
    sentence_and_labels = list(zip(sentence_list, labels_list))
    random.shuffle(sentence_and_labels)
    sentence_list[:], labels_list[:] = zip(*sentence_and_labels)

    sentence_count = len(sentence_list)
    test_sentence_count = round(sentence_count * test_fraction)
    test_sentence_list = sentence_list[:test_sentence_count]
    test_labels_list = labels_list[:test_sentence_count]
    train_sentence_list = sentence_list[test_sentence_count:]
    train_labels_list = labels_list[test_sentence_count:]

    train_df = pd.DataFrame({
        "sentence": train_sentence_list,
        "labels": train_labels_list
    })

    test_df = pd.DataFrame({
        "sentence": test_sentence_list,
        "labels": test_labels_list
    })

    return (train_df, test_df)
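A short usage sketch, assuming the module-level URL constant points at the annotated data file used above:

# Hypothetical usage: build an 80/20 train/test split with a fixed seed.
train_df, test_df = load_train_test_dfs(local_cache_path="./", test_fraction=0.2, random_seed=42)
print(train_df.columns.tolist())  # ["sentence", "labels"]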
Example No. 5
def download_snli(dest_path):
    """
    Download the SNLI dataset
    Args:
        dest_path (str): file path where SNLI dataset should be downloaded

    Returns:
        str: file path where SNLI dataset is downloaded

    """
    dirs, file = os.path.split(dest_path)
    return maybe_download(SNLI_URL, file, work_directory=dirs)
Example No. 6
def load_pandas_df(local_cache_path=".", num_rows=None):
    """Downloads and extracts the dataset files
    Args:
        local_cache_path (str, optional): Path to store the data files. Defaults to ".".
        num_rows (int): Number of rows to load. If None, all data is loaded.
    Returns:
        pd.DataFrame: pandas DataFrame containing the loaded dataset.
    """
    zip_file = URL.split("/")[-1]
    maybe_download(URL, zip_file, local_cache_path)

    zip_file_path = os.path.join(local_cache_path, zip_file)
    csv_file_path = os.path.join(local_cache_path, zip_file.replace(".zip", ""))

    if not os.path.exists(csv_file_path):
        extract_zip(file_path=zip_file_path, dest_path=local_cache_path)
    return pd.read_csv(csv_file_path, nrows=num_rows)
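A short usage sketch for the loader above, assuming the module-level URL constant points at the zipped CSV:

# Hypothetical usage: cache the archive under .data and load only the first 1000 rows.
df = load_pandas_df(local_cache_path=".data", num_rows=1000)
print(df.head())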
Example No. 7
def __init__(self, path) -> None:
    maybe_download(
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_train.csv",
        "data_train.csv", path)
    df = pandas.read_csv(os.path.join(path, "data_train.csv"))
    train = df.values.tolist()
    source = [item[0] for item in train]
    summary = [item[1] for item in train]
    self.train_source, self.test_source, self.train_summary, self.test_summary = train_test_split(
        source,
        summary,
        train_size=0.95,
        test_size=0.005,
        random_state=123)
    self.reset()
    self.no_bertscore()
    self.add_bertscore()
Example No. 8
def load_pandas_df(local_cache_path=".", file_split="train"):
    """Downloads and extracts the dataset files
    Args:
        local_cache_path (str, optional): Path to store the data. Defaults to ".".
        file_split (str, optional): The subset to load.
            One of: {"train", "dev_matched", "dev_mismatched"}
            Defaults to "train".
    Returns:
        pd.DataFrame: pandas DataFrame containing the specified
            MultiNLI subset.
    """

    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name), local_cache_path)
    return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True)
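A short usage sketch for the MultiNLI loader above:

# Hypothetical usage: load the matched dev split into a DataFrame.
dev_df = load_pandas_df(local_cache_path=".", file_split="dev_matched")
print(dev_df.shape)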
Example No. 9
def download_file_and_extract(local_cache_path: str = ".",
                              file_split: str = "train") -> None:
    """Download and extract the dataset files

    Args:
        local_cache_path (str, optional): Directory to cache files to.
            Defaults to the current working directory (".").
        file_split (str, optional): The subset to download. Defaults to "train".

    Returns:
        None
    """
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    if not os.path.exists(
            os.path.join(local_cache_path, DATA_FILES[file_split])):
        extract_zip(os.path.join(local_cache_path, file_name),
                    local_cache_path)
Example No. 10
def download_msrpc(download_dir):
    """Downloads Windows Installer for Microsoft Paraphrase Corpus.
    Args:
        download_dir (str): File path for the downloaded file

    Returns:
        str: file_path to the downloaded dataset.
    """

    url = (
        "https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B"
        "-3604ED519838/MSRParaphraseCorpus.msi")
    return maybe_download(url, work_directory=download_dir)
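A short usage sketch; the download directory name is arbitrary:

# Hypothetical usage: download the MSRPC installer and print where it was saved.
installer_path = download_msrpc(download_dir=".data")
print(installer_path)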
Example No. 11
def load_pandas_df(local_cache_path=TemporaryDirectory().name):
    """
    Downloads and extracts the dataset files

    Args:
        local_cache_path (str, optional): The local file path to save the raw file.
        Defaults to TemporaryDirectory().name.
    Returns:
        tuple: (train_pandas_df, test_pandas_df), the train and test splits
            loaded as pandas DataFrames.
    """

    zipped_file = URL.split("/")[-1]
    maybe_download(URL, zipped_file, local_cache_path)

    zipped_file_path = os.path.join(local_cache_path, zipped_file)
    with tarfile.open(zipped_file_path, "r:gz") as tar:
        tar.extractall(path=local_cache_path)

    train_csv_file_path = os.path.join(local_cache_path, "hindi-train.csv")
    test_csv_file_path = os.path.join(local_cache_path, "hindi-test.csv")

    train_df = pd.read_csv(train_csv_file_path,
                           sep="\t",
                           encoding="utf-8",
                           header=None)

    test_df = pd.read_csv(test_csv_file_path,
                          sep="\t",
                          encoding="utf-8",
                          header=None)

    train_df = train_df.fillna("")
    test_df = test_df.fillna("")

    return (train_df, test_df)
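A short usage sketch for the loader above (the cache directory name is arbitrary):

# Hypothetical usage: download, extract, and load the train/test CSVs.
train_df, test_df = load_pandas_df(local_cache_path=".data")
print(train_df.shape, test_df.shape)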
Example No. 12
def download_tsv_files_and_extract(local_cache_path: str = ".") -> None:
    """Download and extract the MNLI dataset files in TSV format from NYU Jiant.
        The download contains both the original and the TSV-formatted data.

    Args:
        local_cache_path (str, optional): Directory to cache files to.
            Defaults to the current working directory (".").

    Returns:
        None
    """
    try:
        folder_name = "MNLI"
        file_name = f"{folder_name}.zip"
        maybe_download(URL_JIANT_MNLI_TSV, file_name, local_cache_path)
        if not os.path.exists(os.path.join(local_cache_path, folder_name)):
            extract_zip(os.path.join(local_cache_path, file_name),
                        local_cache_path)

        # Clean up zip download
        if os.path.exists(os.path.join(local_cache_path, file_name)):
            os.remove(os.path.join(local_cache_path, file_name))
    except IOError:
        raise
    print("Downloaded file to: ", os.path.join(local_cache_path, folder_name))
Example No. 13
def _download_glove_vectors(download_dir, file_name="glove.840B.300d.zip"):
    """ Downloads gloVe word vectors trained on Common Crawl corpus. You can
    directly download the vectors from here:
    http://nlp.stanford.edu/data/glove.840B.300d.zip

    Args:
        download_dir (str): Directory to download the file to.
        file_name (str): Name to save the downloaded file as. Defaults to "glove.840B.300d.zip".

    Returns:
        str: file_path to the downloaded vectors.
    """

    return maybe_download(GLOVE_URL,
                          filename=file_name,
                          work_directory=download_dir)
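A short usage sketch, assuming the module-level GLOVE_URL constant referenced above is defined:

# Hypothetical usage: download the GloVe archive into a local embeddings folder.
glove_zip_path = _download_glove_vectors(download_dir=".embeddings")
print(glove_zip_path)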
Example No. 14
def _download_sts(dirpath):
    """Download and extract data from
        http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz

    Args:
        dirpath (str): Path to data directory.

    Returns:
        str: Path to extracted STS Benchmark data.
    """
    filepath = maybe_download(STS_URL, work_directory=dirpath)
    extracted_path = _extract_sts(filepath,
                                  target_dirpath=dirpath,
                                  tmode="r:gz")
    print("Data downloaded to {}".format(extracted_path))
    return extracted_path
Example No. 15
def _download_word2vec_vectors(
        download_dir, file_name="GoogleNews-vectors-negative300.bin.gz"):
    """ Downloads pretrained word vectors trained on GoogleNews corpus. You can
    directly download the vectors from here:
    https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

    Args:
        download_dir (str): Directory to download the file to.
        file_name (str): Name to save the downloaded file as. Defaults to "GoogleNews-vectors-negative300.bin.gz".

    Returns:
        str: file_path to the downloaded vectors.
    """

    return maybe_download(WORD2VEC_URL,
                          filename=file_name,
                          work_directory=download_dir)
Example No. 16
def _download_fasttext_vectors(download_dir, file_name="wiki.simple.zip"):
    """ Downloads pre-trained word vectors for English, trained on Wikipedia using
    fastText. You can directly download the vectors from here:
    https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip

    For the full version of pre-trained word vectors, change the url for
    FASTTEXT_EN_URL to https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
    in __init__.py

    Args:
        download_dir (str): Directory to download the file to.
        file_name (str): Name to save the downloaded file as. Defaults to "wiki.simple.zip".

    Returns:
        str: file_path to the downloaded vectors.
    """

    return maybe_download(
        FASTTEXT_EN_URL, filename=file_name, work_directory=download_dir
    )
Example No. 17
def load_pandas_df(local_cache_path=".", squad_version="v1.1", file_split="train"):
    """Loads the SQuAD dataset in pandas data frame.

    Args:
        local_cache_path (str, optional): Path to load the data from. If the file doesn't exist,
            download it first. Defaults to the current directory.
        squad_version (str, optional): Version of the SQuAD dataset, accepted values are: 
            "v1.1" and "v2.0". Defaults to "v1.1".
        file_split (str, optional): Dataset split to load, accepted values are: "train" and "dev".
            Defaults to "train".
    """

    if file_split not in ["train", "dev"]:
        raise ValueError("file_split should be either train or dev")

    URL = URL_DICT[squad_version][file_split]
    file_name = URL.split("/")[-1]
    maybe_download(URL, file_name, local_cache_path)

    file_path = os.path.join(local_cache_path, file_name)

    with open(file_path, "r", encoding="utf-8") as reader:
        input_data = json.load(reader)["data"]

    paragraph_text_list = []
    question_text_list = []
    answer_start_list = []
    answer_text_list = []
    qa_id_list = []
    is_impossible_list = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                answer_offset = None
                is_impossible = False

                if squad_version == "v2.0":
                    is_impossible = qa["is_impossible"]

                if file_split == "train":
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer."
                        )
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                    else:
                        orig_answer_text = ""
                else:
                    if not is_impossible:
                        orig_answer_text = []
                        answer_offset = []
                        for answer in qa["answers"]:
                            orig_answer_text.append(answer["text"])
                            answer_offset.append(answer["answer_start"])
                    else:
                        orig_answer_text = ""

                paragraph_text_list.append(paragraph_text)
                question_text_list.append(question_text)
                answer_start_list.append(answer_offset)
                answer_text_list.append(orig_answer_text)
                qa_id_list.append(qas_id)
                is_impossible_list.append(is_impossible)

    output_df = pd.DataFrame(
        {
            "doc_text": paragraph_text_list,
            "question_text": question_text_list,
            "answer_start": answer_start_list,
            "answer_text": answer_text_list,
            "qa_id": qa_id_list,
            "is_impossible": is_impossible_list,
        }
    )

    return output_df
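A short usage sketch, assuming the module-level URL_DICT constant referenced above maps SQuAD versions and splits to download URLs:

# Hypothetical usage: load the SQuAD v2.0 dev split and count unanswerable questions.
dev_df = load_pandas_df(local_cache_path=".", squad_version="v2.0", file_split="dev")
print(int(dev_df["is_impossible"].sum()), "questions are marked unanswerable")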
Example No. 18
def SwissSummarizationDataset(top_n=-1, validation=False):
    """Load the CNN/Daily Mail dataset preprocessed by harvardnlp group."""

    URLS = [
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_train.csv",
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_test.csv",
    ]
    LOCAL_CACHE_PATH = '.data'

    FILE_NAME = "data_train.csv"
    maybe_download(URLS[0], FILE_NAME, LOCAL_CACHE_PATH)
    dataset_path = os.path.join(LOCAL_CACHE_PATH, FILE_NAME)

    train = pandas.read_csv(dataset_path).values.tolist()
    if top_n != -1:
        train = train[0:top_n]
    source = [item[0] for item in train]
    summary = [item[1] for item in train]
    train_source, test_source, train_summary, test_summary = train_test_split(
        source, summary, train_size=0.95, test_size=0.05, random_state=123)
    if validation:
        train_source, validation_source, train_summary, validation_summary = train_test_split(
            train_source,
            train_summary,
            train_size=0.9,
            test_size=0.1,
            random_state=123)
        return (
            SummarizationDataset(
                source_file=None,
                source=train_source,
                target=train_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
            SummarizationDataset(
                source_file=None,
                source=validation_source,
                target=validation_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
            SummarizationDataset(
                source_file=None,
                source=test_source,
                target=test_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
        )
    else:
        return (
            SummarizationDataset(
                source_file=None,
                source=train_source,
                target=train_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
            SummarizationDataset(
                source_file=None,
                source=test_source,
                target=test_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
        )
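A short usage sketch for the dataset builder above:

# Hypothetical usage: build train/validation/test datasets from the first 1000 rows.
train_ds, validation_ds, test_ds = SwissSummarizationDataset(top_n=1000, validation=True)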
def load_pandas_df(local_cache_path=".", file_split="dev", language="zh"):
    """Downloads and extracts the dataset files.

    More information can be found `on the XNLI project page <https://www.nyu.edu/projects/bowman/xnli/>`_.

    Args:
        local_cache_path (str, optional): Path to store the data.
            Defaults to "./".
        file_split (str, optional): The subset to load.
            One of: {"train", "dev", "test"}
            Defaults to "dev".
        language (str, optional): language subset to read.
            One of: {"en", "fr", "es", "de", "el", "bg", "ru",
            "tr", "ar", "vi", "th", "zh", "hi", "sw", "ur"}
            Defaults to "zh" (Chinese).
    Returns:
        pd.DataFrame: pandas DataFrame containing the specified
            XNLI subset.
    """

    if file_split in ("dev", "test"):
        url = URL_XNLI
        sentence_1_index = 6
        sentence_2_index = 7
        label_index = 1

        zip_file_name = url.split("/")[-1]
        folder_name = ".".join(zip_file_name.split(".")[:-1])
        file_name = folder_name + "/" + ".".join(["xnli", file_split, "tsv"])
    elif file_split == "train":
        url = URL_XNLI_MT
        sentence_1_index = 0
        sentence_2_index = 1
        label_index = 2

        zip_file_name = url.split("/")[-1]
        folder_name = ".".join(zip_file_name.split(".")[:-1])
        file_name = folder_name + "/multinli/" + ".".join(
            ["multinli", file_split, language, "tsv"])

    maybe_download(url, zip_file_name, local_cache_path)

    if not os.path.exists(os.path.join(local_cache_path, folder_name)):
        extract_zip(os.path.join(local_cache_path, zip_file_name),
                    local_cache_path)

    with open(os.path.join(local_cache_path, file_name), "r",
              encoding="utf-8") as f:
        lines = f.read().splitlines()

    line_list = [line.split("\t") for line in lines]

    # Remove the column name row
    line_list.pop(0)
    if file_split != "train":
        line_list = [line for line in line_list if line[0] == language]

    valid_lines = [
        bool(line[sentence_1_index] and line[sentence_2_index])
        for line in line_list
    ]
    total_line_count = len(line_list)
    line_list = [line for line, valid in zip(line_list, valid_lines) if valid]
    valid_line_count = len(line_list)

    if valid_line_count != total_line_count:
        print("{} invalid lines removed.".format(total_line_count -
                                                 valid_line_count))

    label_list = [convert_to_unicode(line[label_index]) for line in line_list]
    old_contradict_label = convert_to_unicode("contradictory")
    new_contradict_label = convert_to_unicode("contradiction")
    label_list = [
        new_contradict_label if label == old_contradict_label else label
        for label in label_list
    ]
    text_list = [(convert_to_unicode(line[sentence_1_index]),
                  convert_to_unicode(line[sentence_2_index]))
                 for line in line_list]

    df = pd.DataFrame({"text": text_list, "label": label_list})

    return df
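A short usage sketch for the XNLI loader above, assuming the module-level URL_XNLI and URL_XNLI_MT constants are defined:

# Hypothetical usage: load the French dev subset and inspect the first few pairs.
dev_df = load_pandas_df(local_cache_path=".", file_split="dev", language="fr")
print(dev_df.head())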