Example #1
def get_parser():
    parser = argparse.ArgumentParser(
        description='Download and Prepare the BookCorpus dataset.')
    parser.add_argument('--dataset',
                        type=str,
                        choices=['gutenberg'],
                        default='gutenberg')
    parser.add_argument(
        '--mode',
        type=str,
        default='raw',
        choices=['raw', 'format'],
        help='Specify the mode for preparing the data.'
        ' "raw" means to download and extract the books into the output'
        ' folder, each file is a book and the filename is the title of the '
        'book. "format" means to format the extracted txt files for '
        'usage of pretraining.')
    parser.add_argument(
        '--save_dir',
        type=str,
        default=None,
        help='The directory to save the dataset. Defaults to a folder with'
        ' the same name as the dataset.')
    parser.add_argument(
        '--cache-path',
        type=str,
        default=os.path.join(get_data_home_dir(), 'book_corpus'),
        help='The temporary path to download the compressed dataset.')
    return parser
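A parser built this way is typically consumed at the bottom of the script; a minimal usage sketch (the driver lines are illustrative, not from the source):

if __name__ == '__main__':
    # argparse normalizes '--cache-path' to the attribute 'cache_path'
    args = get_parser().parse_args()
    print(args.dataset, args.mode, args.save_dir, args.cache_path)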
Example #2
def get_parser():
    parser = argparse.ArgumentParser(
        description='Downloading and Preprocessing'
        ' Language Modeling Datasets.')
    parser.add_argument(
        '--dataset',
        type=str,
        required=True,
        choices=['wikitext2', 'wikitext103', 'text8', 'enwik8', 'gbw'],
        help='The dataset to use.')
    parser.add_argument(
        '--save-dir',
        type=str,
        default=None,
        help='The directory to save the dataset.'
        ' By default, it will save to a folder with the same name as the '
        'dataset.')
    parser.add_argument('--overwrite',
                        action='store_true',
                        help='Whether to overwrite the saved '
                        'files.')
    parser.add_argument('--cache-path',
                        type=str,
                        default=os.path.join(get_data_home_dir(),
                                             'lm_benchmark_data'),
                        help='The temporary path to download the dataset.')
    return parser
Example #3
def get_parser():
    parser = argparse.ArgumentParser(description='Download and Prepare the BookCorpus dataset. '
                                                 'We will download and extract the books into the '
                                                 'output folder, each file is a book and the '
                                                 'filename is the title of the book.')
    parser.add_argument('--save_dir', type=str, default=None,
                        help='The directory to save the dataset. Defaults to a folder'
                             ' with the same name as the dataset.')
    parser.add_argument('--cache-path', type=str,
                        default=os.path.join(get_data_home_dir(), 'gutenberg'),
                        help='The temporary path to download the compressed dataset.')
    return parser
Example #4
def get_parser():
    parser = argparse.ArgumentParser(description='Download the raw txt BookCorpus')
    parser.add_argument("-o", "--output", default="BookCorpus",
                        help="directory for downloaded  files")
    parser.add_argument("--segment_num_worker", type=int, default=8,
                        help="process num when segmenting articles")
    parser.add_argument("--segment_sentences", action='store_true',
                        help="directory for downloaded  files")
    parser.add_argument('--cache-path', type=str,
                        default=os.path.join(get_data_home_dir(), 'bookcorpus'),
                        help='The temporary path to download the dataset.')
    return parser
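When --segment_sentences is set, the script presumably fans the per-book work out over --segment_num_worker processes; a sketch of that pattern, where segment_into_sentences is a hypothetical per-book worker, not a function from the source:

import multiprocessing

def segment_into_sentences(book_path):
    # Hypothetical worker: read one book and write it back one sentence per line.
    pass

def segment_books(book_paths, args):
    if args.segment_sentences:
        with multiprocessing.Pool(args.segment_num_worker) as pool:
            pool.map(segment_into_sentences, book_paths)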
Example #5
def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--benchmark", choices=['glue', 'superglue'],
                        default='glue', type=str)
    parser.add_argument("-d", "--data_dir", help="directory to save data to", type=str,
                        default=None)
    parser.add_argument(
        "-t",
        "--tasks",
        help="tasks to download data for as a comma separated string",
        type=str,
        default="all"
    )
    parser.add_argument('--cache-path', type=str,
                        default=os.path.join(get_data_home_dir(), 'glue'),
                        help='The temporary path to download the dataset.')
    return parser
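Because --tasks arrives as one comma-separated string, downstream code has to expand it, with 'all' standing in for every known task; a minimal sketch (the task names are illustrative):

def resolve_tasks(tasks_arg, all_tasks):
    # 'all' expands to the full registry; otherwise split on commas.
    return list(all_tasks) if tasks_arg == 'all' else tasks_arg.split(',')

args = get_parser().parse_args(['-t', 'cola,sst,mrpc'])
print(resolve_tasks(args.tasks, ['cola', 'sst', 'mrpc', 'qqp']))
# ['cola', 'sst', 'mrpc']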
Example #6
def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-t",
        "--tasks",
        help="tasks to download data for as a comma separated string",
        type=str,
        choices=list(TASK2PATH.keys()) + ['all'],
        default="all")
    parser.add_argument("-d",
                        "--data_dir",
                        help="Directory to save data to",
                        type=str,
                        default='text_classification_benchmark')
    parser.add_argument('--cache-path',
                        type=str,
                        default=os.path.join(get_data_home_dir(),
                                             'text_classification'),
                        help='The temporary path to download the dataset.')
    return parser
Example #7
import os
import argparse
import shutil
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir

_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'squad')
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'squad.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)


_CITATIONS = """
@inproceedings{rajpurkar2016squad,
  title={{SQuAD}: 100,000+ Questions for Machine Comprehension of Text},
  author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
  booktitle={EMNLP},
  year={2016}
}

@inproceedings{rajpurkar2018know,
  title={Know What You Don't Know: Unanswerable Questions for SQuAD},
  author={Rajpurkar, Pranav and Jia, Robin and Liang, Percy},
  booktitle={ACL},
  year={2018}
}

"""

_URLS = {
    # Official SQuAD v1.1 and v2.0 download locations
    '1.1': {
        'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json',
        'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json',
    },
    '2.0': {
        'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json',
        'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json',
    },
}
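The checksum table loaded above is used to verify each file as it is fetched; a sketch of that step, assuming download follows the usual url/path/sha1_hash signature and that _URL_FILE_STATS maps each URL to its SHA-1 digest:

def download_squad(version='1.1', split='train', save_dir=_BASE_DATASET_PATH):
    url = _URLS[version][split]
    os.makedirs(save_dir, exist_ok=True)
    # download() verifies the fetched file against the recorded SHA-1
    return download(url,
                    path=os.path.join(save_dir, url.split('/')[-1]),
                    sha1_hash=_URL_FILE_STATS.get(url))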
Example #8
import os
import argparse
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir, get_repo_url

_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa')
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums',
                                    'searchqa.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)

_CITATIONS = """
@article{dunn2017searchqa,
  title={{SearchQA}: A new Q\&a dataset augmented with context from a search engine},
  author={Dunn, Matthew and Sagun, Levent and Higgins, Mike and Guney, V Ugur and Cirik, Volkan and Cho, Kyunghyun},
  journal={arXiv preprint arXiv:1704.05179},
  year={2017}
}

"""

_URLS = {
    'train': get_repo_url() + 'datasets/question_answering/searchqa/train.txt',
    'val': get_repo_url() + 'datasets/question_answering/searchqa/val.txt',
    'test': get_repo_url() + 'datasets/question_answering/searchqa/test.txt'
}


def get_parser():
    parser = argparse.ArgumentParser(
        description='Downloading the SearchQA Dataset.')
    # The arguments below follow the pattern of the sibling prepare scripts;
    # the original file's exact flags are truncated in this listing.
    parser.add_argument('--save-dir', type=str, default=None,
                        help='The directory to save the dataset.')
    parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
                        help='The temporary path to download the dataset.')
    return parser
Example #9
  year={2016},
  school={Columbia University}
}

@inproceedings{hawthorne2018enabling,
  title={Enabling Factorized Piano Music Modeling and Generation with the {MAESTRO} Dataset},
  author={Curtis Hawthorne and Andriy Stasyuk and Adam Roberts and Ian Simon and Cheng-Zhi Anna Huang and Sander Dieleman and Erich Elsen and Jesse Engel and Douglas Eck},
  booktitle={International Conference on Learning Representations},
  year={2019},
  url={https://openreview.net/forum?id=r1lYRjC9F7},
}
"""


_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'music_midi_data')

_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'music_midi.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)


_URLS = {
    'lmd_full': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz',
    'lmd_matched': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz',
    'lmd_aligned': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz',
    'clean_midi': 'http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz',
    'maestro_v1': 'https://storage.googleapis.com/magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0-midi.zip',
    'maestro_v2': 'https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip',
    'geocities': 'https://archive.org/download/archiveteam-geocities-midi-collection-2009/2009.GeoCities.MIDI.ArchiveTeam.zip'
}
Example #10
import logging
import os
import argparse
import ast
import gzip
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir, get_repo_url

logger = logging.getLogger(__name__)
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'NaturalQuestions')
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums',
                                    'naturalquestions.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)

_CITATIONS = """
@article{47761,
title	= {Natural Questions: a Benchmark for Question Answering Research},
author	= {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year	= {2019},
journal	= {Transactions of the Association for Computational Linguistics}
}
"""

_URLS = {
    'train': get_repo_url() +
    'NaturalQuestions/v1.0-simplified_simplified-nq-train.jsonl.gz',
    'dev': get_repo_url() + 'NaturalQuestions/nq-dev-all.jsonl.gz'
}
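Both Natural Questions files are gzipped JSON-lines; a small reader sketch for them (json is imported here for the sketch; gzip is already imported above):

import json

def read_jsonl_gz(path, limit=None):
    # Stream one JSON record per line out of a .jsonl.gz file.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if limit is not None and i >= limit:
                break
            yield json.loads(line)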

Example #11
import os
import tarfile
import argparse
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir

_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'triviaqa')
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums',
                                    'triviaqa.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)

_CITATIONS = """
@InProceedings{JoshiTriviaQA2017,
     author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
     title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
     booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
     month = {July},
     year = {2017},
     address = {Vancouver, Canada},
     publisher = {Association for Computational Linguistics},
}

"""

_URLS = {
    'rc':
    'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz',
    'unfiltered':
    'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz'
}
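Both TriviaQA archives are tarballs, so the prepare step pairs the checksum-verified download with extraction; a sketch, again assuming download accepts a sha1_hash keyword and that _URL_FILE_STATS maps url to digest:

def download_and_extract(key, save_dir=_BASE_DATASET_PATH):
    url = _URLS[key]
    os.makedirs(save_dir, exist_ok=True)
    archive = download(url,
                       path=os.path.join(save_dir, url.split('/')[-1]),
                       sha1_hash=_URL_FILE_STATS.get(url))
    with tarfile.open(archive) as tf:
        tf.extractall(save_dir)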
Example #12
import os
import argparse
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir

_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'hotpotqa')
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums',
                                    'hotpotqa.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)

_CITATIONS = """
@inproceedings{yang2018hotpotqa,
  title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
  author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.},
  booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  year={2018}
}

"""

_URLS = {
    'train':
    'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json',
    'dev_fullwiki':
    'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json',
    'dev_distractor':
    'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json',
    'test_fullwiki':
    'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json',
}