Ejemplo n.º 1
0
import os
import argparse
import shutil
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir

# Absolute, symlink-resolved directory that contains this script.
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
# 'squad' sub-directory of the directory returned by get_data_home_dir().
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'squad')
# Path to ../url_checksums/squad.txt, resolved relative to this script.
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'squad.txt')
# Checksum statistics loaded from the file above.
# NOTE(review): presumably keyed by download URL -- confirm against
# load_checksum_stats.
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)


# BibTeX entries (rajpurkar2016squad and rajpurkar2018know) bundled with this
# script. The text is a runtime string and must be kept verbatim.
_CITATIONS = """
@inproceedings{rajpurkar2016squad,
  title={Squad: 100,000+ questions for machine comprehension of text},
  author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
  booktitle={EMNLP},
  year={2016}
}

@inproceedings{rajpurkar2018know,
  title={Know What You Don't Know: Unanswerable Questions for SQuAD},
  author={Rajpurkar, Pranav and Jia, Robin and Liang, Percy},
  booktitle={ACL},
  year={2018}
}

"""

_URLS = {
    '1.1': {
Ejemplo n.º 2
0
import os
import argparse
import pandas as pd
import shutil
import tarfile
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.base import get_data_home_dir, get_repo_url

# Absolute, symlink-resolved directory that contains this script.
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
# Checksum statistics loaded from ../url_checksums/text_classification.txt,
# resolved relative to this script.
# NOTE(review): presumably keyed by download URL -- confirm against
# load_checksum_stats.
_URL_FILE_STATS = load_checksum_stats(
    os.path.join(_CURR_DIR, '..', 'url_checksums', 'text_classification.txt'))

TASK2PATH = {
    "ag":
    get_repo_url() + "datasets/text_classification/ag_news_csv.tar.gz",
    "imdb":
    get_repo_url() + "datasets/text_classification/imdb.tar.gz",
    "dbpedia":
    get_repo_url() + "datasets/text_classification/dbpedia_csv.tar.gz",
    "yelp2":
    get_repo_url() +
    "datasets/text_classification/yelp_review_polarity_csv.tar.gz",
    "yelp5":
    get_repo_url() +
    "datasets/text_classification/yelp_review_full_csv.tar.gz",
    "amazon2":
    get_repo_url() +
    "datasets/text_classification/amazon_review_polarity_csv.tar.gz",
    "amazon5":
    get_repo_url() +
    "datasets/text_classification/amazon_review_full_csv.tar.gz",
Ejemplo n.º 3
0
  pages={3261--3275},
  year={2019}
}
"""

# Short identifiers of the GLUE tasks, including the diagnostic set.
GLUE_TASKS = [
    'cola',
    'sst',
    'mrpc',
    'qqp',
    'sts',
    'mnli',
    'snli',
    'qnli',
    'rte',
    'wnli',
    'diagnostic',
]

# Short identifiers of the SuperGLUE tasks, including both diagnostic sets.
# Note that 'rte' appears in both lists.
SUPERGLUE_TASKS = [
    'cb',
    'copa',
    'multirc',
    'rte',
    'wic',
    'wsc',
    'boolq',
    'record',
    'broadcoverage-diagnostic',
    'winogender-diagnostic',
]

# Absolute, symlink-resolved directory that contains this script.
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
# Checksum statistics loaded from ../url_checksums/glue.txt, resolved relative
# to this script.
# NOTE(review): presumably keyed by download URL -- confirm against
# load_checksum_stats.
_URL_FILE_STATS = load_checksum_stats(
    os.path.join(_CURR_DIR, '..', 'url_checksums', 'glue.txt'))
# Fold the entries from ../url_checksums/superglue.txt into the same object,
# in place, so a single lookup table covers both files.
_URL_FILE_STATS.update(
    load_checksum_stats(
        os.path.join(_CURR_DIR, '..', 'url_checksums', 'superglue.txt')))


def read_tsv_glue(tsv_file, num_skip=1, keep_column_names=False):
    out = []
    nrows = None
    if keep_column_names:
        assert num_skip == 1
    column_names = None
    with open(tsv_file, 'r') as f:
        for i, line in enumerate(f):
            line = line.strip()
            if i < num_skip:
Ejemplo n.º 4
0
  pages={3530--3534},
  year={2016}
}

@inproceedings{barrault2019findings,
  title={Findings of the 2019 conference on machine translation (wmt19)},
  author={Barrault, Lo{\"\i}c and Bojar, Ond{\v{r}}ej and Costa-juss{\`a}, Marta R and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Huck, Matthias and Koehn, Philipp and Malmasi, Shervin and others},
  booktitle={Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)},
  pages={1--61},
  year={2019}
}
"""

# Absolute, symlink-resolved directory that contains this script.
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
# 'wmt' sub-directory of the directory returned by get_data_home_dir().
_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'wmt')
# Checksum statistics loaded from ../url_checksums/wmt.txt, resolved relative
# to this script.
# NOTE(review): presumably keyed by download URL -- confirm against
# load_checksum_stats.
_URL_FILE_STATS = load_checksum_stats(os.path.join(_CURR_DIR, '..', 'url_checksums', 'wmt.txt'))


# The language keys used below follow the standard ISO 639-1 two-letter language codes.
# For more information about these codes, see
# https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
_PARA_URLS = {
    'europarl': {
        'v7': {
            'cs-en': {
                'url': 'http://www.statmt.org/europarl/v7/cs-en.tgz',
                'cs': 'europarl-v7.cs-en.cs',
                'en': 'europarl-v7.cs-en.en',
            },
            'de-en': {
                'url': 'http://www.statmt.org/europarl/v7/de-en.tgz',