class Esnli(nlp.GeneratorBasedBuilder):
    """e-SNLI: Natural Language Inference with Natural Language Explanations corpus."""

    # Version History
    # 0.0.2 Added explanation_2, explanation_3 fields which exist in the dev/test
    #       splits only.
    # 0.0.1 Initial version
    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("0.0.2"),
            description="Plain text import of e-SNLI",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "premise": nlp.Value("string"),
                    "hypothesis": nlp.Value("string"),
                    "label": nlp.features.ClassLabel(names=["entailment", "neutral", "contradiction"]),
                    "explanation_1": nlp.Value("string"),
                    "explanation_2": nlp.Value("string"),
                    "explanation_3": nlp.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://github.com/OanaMariaCamburu/e-SNLI",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        files = dl_manager.download_and_extract(
            {
                "train": [os.path.join(_URL, "esnli_train_1.csv"), os.path.join(_URL, "esnli_train_2.csv")],
                "validation": [os.path.join(_URL, "esnli_dev.csv")],
                "test": [os.path.join(_URL, "esnli_test.csv")],
            }
        )
        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"files": files["train"]}),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"files": files["validation"]}),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"files": files["test"]}),
        ]

    def _generate_examples(self, files):
        """Yields examples."""
        for filepath in files:
            with open(filepath, encoding="utf-8") as f:
                reader = csv.DictReader(f)
                for row in reader:
                    yield row["pairID"], {
                        "premise": row["Sentence1"],
                        "hypothesis": row["Sentence2"],
                        "label": row["gold_label"],
                        "explanation_1": row["Explanation_1"],
                        "explanation_2": row.get("Explanation_2", ""),
                        "explanation_3": row.get("Explanation_3", ""),
                    }

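# Minimal usage sketch (hedged): assumes this script is registered under the
# dataset id "esnli"; the id and the "plain_text" config follow the library's
# conventions but are not confirmed in this file.
import nlp

esnli = nlp.load_dataset("esnli", "plain_text")
# explanation_2/explanation_3 are empty strings in the train split.
print(esnli["validation"][0]["explanation_1"])
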
class Newsroom(nlp.GeneratorBasedBuilder):
    """NEWSROOM Dataset."""

    VERSION = nlp.Version("1.0.0")

    @property
    def manual_download_instructions(self):
        return """\
    You should download the dataset from http://lil.nlp.cornell.edu/newsroom/
    The webpage requires registration.
    To unzip the .tar file run `tar -zxvf complete.tar`.
    To unzip the .gz files run `gunzip train.json.gz`, ...
    After downloading, please put the files dev.jsonl, test.jsonl and train.jsonl
    in a directory of your choice, which will be used as the manual_dir, e.g.
    `~/.manual_dirs/newsroom`.
    Newsroom can then be loaded via:
    `nlp.load_dataset("newsroom", data_dir="~/.manual_dirs/newsroom")`.
    """

    def _info(self):
        features = {k: nlp.Value("string") for k in [_DOCUMENT, _SUMMARY] + _ADDITIONAL_TEXT_FEATURES}
        features.update({k: nlp.Value("float32") for k in _ADDITIONAL_FLOAT_FEATURES})
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(features),
            supervised_keys=(_DOCUMENT, _SUMMARY),
            homepage="http://lil.nlp.cornell.edu/newsroom/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via "
                "`nlp.load_dataset('newsroom', data_dir=...)` that includes the extracted "
                "Newsroom files. Manual download instructions: {}".format(
                    data_dir, self.manual_download_instructions
                )
            )
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"input_file": os.path.join(data_dir, "train.jsonl")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION, gen_kwargs={"input_file": os.path.join(data_dir, "dev.jsonl")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST, gen_kwargs={"input_file": os.path.join(data_dir, "test.jsonl")}
            ),
        ]

    def _generate_examples(self, input_file=None):
        """Yields examples."""
        with open(input_file) as f:
            for i, line in enumerate(f):
                d = json.loads(line)
                # Fields are "url", "archive", "title", "date", "text",
                # "compression_bin", "density_bin", "summary", "density",
                # "compression", "coverage", "coverage_bin".
                yield i, {
                    k: d[k] for k in [_DOCUMENT, _SUMMARY] + _ADDITIONAL_TEXT_FEATURES + _ADDITIONAL_FLOAT_FEATURES
                }

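# Usage sketch grounded in the manual download instructions above. Assumes
# _DOCUMENT and _SUMMARY resolve to the "text" and "summary" fields listed in
# the generator's comment; that mapping is not shown in this file.
import nlp

newsroom = nlp.load_dataset("newsroom", data_dir="~/.manual_dirs/newsroom")
first = newsroom["train"][0]
doc, summary = first["text"], first["summary"]
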
def __init__(self, features, data_url, citation, url, label_classes=("False", "True"), **kwargs):
    """BuilderConfig for SuperGLUE.

    Args:
      features: `list[string]`, list of the features that will appear in the
        feature dict. Should not include "label".
      data_url: `string`, url to download the zip file from.
      citation: `string`, citation for the data set.
      url: `string`, url for information about the data set.
      label_classes: `list[string]`, the list of classes for the label if the
        label is present as a string. Non-string labels will be cast to either
        'False' or 'True'.
      **kwargs: keyword arguments forwarded to super.
    """
    # Version history:
    # 1.0.2: Fixed non-determinism in ReCoRD.
    # 1.0.1: Change from the pre-release trial version of SuperGLUE (v1.9) to
    #        the full release (v2.0).
    # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
    # 0.0.2: Initial version.
    super(SuperGlueConfig, self).__init__(version=nlp.Version("1.0.2"), **kwargs)
    self.features = features
    self.label_classes = label_classes
    self.data_url = data_url
    self.citation = citation
    self.url = url

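# Hedged sketch of how a concrete task config might be declared with the
# constructor above. The feature names, URL, and citation below are
# illustrative placeholders, not values taken from this file.
boolq_config = SuperGlueConfig(
    name="boolq",
    description="BoolQ yes/no question answering.",
    features=["question", "passage"],  # "label" is deliberately excluded
    data_url="https://example.com/BoolQ.zip",  # placeholder URL
    citation="(citation omitted)",
    url="https://example.com/boolq",  # placeholder URL
)
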
class QaZre(nlp.GeneratorBasedBuilder):
    """QA-ZRE: Reducing relation extraction to simple reading comprehension questions."""

    VERSION = nlp.Version("0.1.0")

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "relation": nlp.Value("string"),
                    "question": nlp.Value("string"),
                    "subject": nlp.Value("string"),
                    "context": nlp.Value("string"),
                    "answers": nlp.features.Sequence(nlp.Value("string")),
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="http://nlp.cs.washington.edu/zeroshot",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        dl_dir = os.path.join(dl_dir, "relation_splits")
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={"filepaths": [os.path.join(dl_dir, "test." + str(i)) for i in range(10)]},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={"filepaths": [os.path.join(dl_dir, "dev." + str(i)) for i in range(10)]},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={"filepaths": [os.path.join(dl_dir, "train." + str(i)) for i in range(10)]},
            ),
        ]

    def _generate_examples(self, filepaths):
        """Yields examples."""
        # Example keys must be unique across all shard files, so keep a
        # running counter instead of restarting the index per file.
        example_idx = 0
        for filepath in filepaths:
            with open(filepath) as f:
                data = csv.reader(f, delimiter="\t")
                for row in data:
                    yield example_idx, {
                        "relation": row[0],
                        "question": row[1],
                        "subject": row[2],
                        "context": row[3],
                        "answers": row[4:],
                    }
                    example_idx += 1

class Boolq(nlp.GeneratorBasedBuilder):
    """BoolQ: a question answering dataset for naturally occurring yes/no questions."""

    VERSION = nlp.Version("0.1.0")

    def _info(self):
        return nlp.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "question": nlp.Value("string"),
                    "answer": nlp.Value("bool"),
                    "passage": nlp.Value("string"),
                }
            ),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage="https://github.com/google-research-datasets/boolean-questions",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # The files are fetched with a custom copy function
        # (`tf.io.gfile.copy`), presumably because they are hosted on
        # Google Cloud Storage rather than behind plain HTTP.
        urls_to_download = {
            "train": os.path.join(_URL, _TRAIN_FILE_NAME),
            "dev": os.path.join(_URL, _DEV_FILE_NAME),
        }
        downloaded_files = dl_manager.download_custom(urls_to_download, tf.io.gfile.copy)
        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""
        with open(filepath) as f:
            for id_, row in enumerate(f):
                data = json.loads(row)
                yield id_, {
                    "question": data["question"],
                    "answer": data["answer"],
                    "passage": data["passage"],
                }

class ParaCrawl(nlp.GeneratorBasedBuilder):
    """ParaCrawl machine translation dataset."""

    # Version history:
    # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
    # 0.1.0: Initial version.
    BUILDER_CONFIGS = [
        # The version below does not refer to the version of the released
        # database. It only indicates the version of the TFDS integration.
        ParaCrawlConfig(  # pylint: disable=g-complex-comprehension
            target_language=target_language,
            version=nlp.Version("1.0.0"),
        )
        for target_language in _target_languages()
    ]

    def _info(self):
        target_language = self.config.target_language
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {"translation": nlp.features.Translation(languages=("en", target_language))}
            ),
            supervised_keys=("en", target_language),
            homepage=_BENCHMARK_URL,
            citation=_CITATION,
        )

    def _vocab_text_gen(self, files, language):
        for _, ex in self._generate_examples(**files):
            # Examples are nested under the "translation" key (see
            # _generate_examples below), so index into it first.
            yield ex["translation"][language]

    def _split_generators(self, dl_manager):
        # Download the data file.
        data_file = dl_manager.download_and_extract({"data_file": self.config.data_url})
        # Return the single split of the data.
        return [nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs=data_file)]

    def _generate_examples(self, data_file):
        """This function returns the examples in the raw (text) form."""
        target_language = self.config.target_language
        with open(data_file, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                line_parts = line.strip().split("\t")
                if len(line_parts) != 2:
                    msg = (
                        "Wrong data format in line {}. The line '{}' does "
                        "not have exactly one delimiter."
                    ).format(idx, line)
                    raise ValueError(msg)
                source, target = line_parts[0].strip(), line_parts[1].strip()
                yield idx, {"translation": {"en": source, target_language: target}}

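# Hedged usage sketch: ParaCrawlConfig presumably derives each config's name
# from its target language; the exact id ("para_crawl") and name format
# ("enfr") are assumptions, not shown in this file.
import nlp

paracrawl_enfr = nlp.load_dataset("para_crawl", "enfr")
pair = paracrawl_enfr["train"][0]["translation"]  # e.g. {"en": ..., "fr": ...}
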
def __init__(self, **kwargs):
    """BuilderConfig for DoQA.

    Args:
      **kwargs: keyword arguments forwarded to super.
    """
    super(DoqaConfig, self).__init__(
        version=nlp.Version("2.1.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
    )

def __init__(self, data_url, **kwargs):
    """BuilderConfig for SearchQA.

    Args:
      data_url: `string`, url to download the dataset from.
      **kwargs: keyword arguments forwarded to super.
    """
    super(SearchQaConfig, self).__init__(
        version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
    )
    self.data_url = data_url

def __init__(self, **kwargs):
    """BuilderConfig for Art.

    Args:
      **kwargs: keyword arguments forwarded to super.
    """
    super(ArtConfig, self).__init__(
        version=nlp.Version("0.1.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
    )

def __init__(self, data_url, **kwargs):
    """BuilderConfig for Wikitext.

    Args:
      data_url: `string`, url to the dataset (word or raw level)
      **kwargs: keyword arguments forwarded to super.
    """
    super(WikitextConfig, self).__init__(version=nlp.Version("1.0.0"), **kwargs)
    self.data_url = data_url

def __init__(self, **kwargs):
    """BuilderConfig for KorNLI.

    Args:
      **kwargs: keyword arguments forwarded to super.
    """
    super(KorNLIConfig, self).__init__(version=nlp.Version("1.0.0"), **kwargs)

def __init__(self, **kwargs):
    """BuilderConfig for Assertion.

    Args:
      **kwargs: keyword arguments forwarded to super.
    """
    super(AssertionConfig, self).__init__(
        version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
    )

def __init__(self, data_url, **kwargs):
    """BuilderConfig for CrimeAndPunish.

    Args:
      data_url: `string`, url to the dataset.
      **kwargs: keyword arguments forwarded to super.
    """
    super(CrimeAndPunishConfig, self).__init__(version=nlp.Version("1.0.0"), **kwargs)
    self.data_url = data_url

class Snli(nlp.GeneratorBasedBuilder):
    """The Stanford Natural Language Inference (SNLI) Corpus."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of SNLI",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "premise": nlp.Value("string"),
                    "hypothesis": nlp.Value("string"),
                    "label": nlp.features.ClassLabel(names=["entailment", "neutral", "contradiction"]),
                }
            ),
            # No default supervised_keys (as we have to pass both premise
            # and hypothesis as input).
            supervised_keys=None,
            homepage="https://nlp.stanford.edu/projects/snli/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        data_dir = os.path.join(dl_dir, "snli_1.0")
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TEST, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_test.txt")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_dev.txt")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"filepath": os.path.join(data_dir, "snli_1.0_train.txt")}
            ),
        ]

    def _generate_examples(self, filepath):
        """This function returns the examples in the raw (text) form."""
        with open(filepath) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for idx, row in enumerate(reader):
                # "-" marks examples without a gold label; map them to -1.
                label = -1 if row["gold_label"] == "-" else row["gold_label"]
                yield idx, {
                    "premise": row["sentence1"],
                    "hypothesis": row["sentence2"],
                    "label": label,
                }

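# Hedged usage sketch: examples whose gold label was "-" are encoded as -1 by
# the generator above, so downstream code may want to drop them. The dataset
# id "snli" is assumed.
import nlp

snli = nlp.load_dataset("snli")
labeled_dev = [ex for ex in snli["validation"] if ex["label"] != -1]
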
def __init__(self, **kwargs):
    """BuilderConfig for KILTTasks.

    Args:
      **kwargs: keyword arguments forwarded to super.
    """
    super(KILTTasksConfig, self).__init__(
        version=nlp.Version("1.0.0", "KILT tasks training and evaluation data"), **kwargs
    )

def __init__(self, **kwargs):
    """BuilderConfig for KILTWikipedia.

    Args:
      **kwargs: keyword arguments forwarded to super.
    """
    super(KILTWikipediaConfig, self).__init__(
        version=nlp.Version("1.0.0", "Wikipedia pre-processed for KILT"), **kwargs
    )

def __init__(self, **kwargs):
    """BuilderConfig for XCOPA.

    Args:
      **kwargs: keyword arguments forwarded to super.
    """
    super(XcopaConfig, self).__init__(
        version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
    )

def __init__(self, data_url, **kwargs):
    """BuilderConfig for MLQA.

    Args:
      data_url: `string`, url to the dataset
      **kwargs: keyword arguments forwarded to super.
    """
    super(MlqaConfig, self).__init__(version=nlp.Version("1.0.0"), **kwargs)
    self.data_url = data_url

class CrimeAndPunish(nlp.GeneratorBasedBuilder):

    VERSION = nlp.Version("0.1.0")
    BUILDER_CONFIGS = [
        CrimeAndPunishConfig(
            name="crime-and-punish",
            data_url=_DATA_URL,
            description="word level dataset. No processing is needed other than replacing newlines with <eos> tokens.",
        ),
    ]

    def _info(self):
        return nlp.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            features=nlp.Features({"line": nlp.Value("string")}),
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            homepage=_URL,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        if self.config.name == "crime-and-punish":
            data = dl_manager.download_and_extract(self.config.data_url)
            return [
                nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"data_file": data, "split": "train"}),
            ]
        else:
            raise ValueError("{} does not exist".format(self.config.name))

    def _generate_examples(self, data_file, split):
        with open(data_file, "rb") as f:
            id_counter = 0
            add_text = False
            crime_and_punishment_occ_counter = 0
            for line in f:
                line = line.decode("UTF-8")
                # The body of the book starts at the third occurrence of the
                # title; earlier occurrences are Project Gutenberg front matter.
                if "CRIME AND PUNISHMENT" in line:
                    crime_and_punishment_occ_counter += 1
                    add_text = crime_and_punishment_occ_counter == 3
                # Stop at the Project Gutenberg license footer.
                if "End of Project" in line:
                    add_text = False
                if add_text:
                    result = {"line": line}
                    id_counter += 1
                    yield id_counter, result

class TedMultiTranslate(nlp.GeneratorBasedBuilder):
    """TED talk multilingual data set."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of multilingual TED talk translations",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "translations": nlp.features.TranslationVariableLanguages(languages=_LANGUAGES),
                    "talk_name": nlp.Value("string"),
                }
            ),
            homepage="https://github.com/neulab/word-embeddings-for-nmt",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(_DATA_URL)
        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_train.tsv")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_dev.tsv")}
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST, gen_kwargs={"data_file": os.path.join(dl_dir, "all_talks_test.tsv")}
            ),
        ]

    def _generate_examples(self, data_file):
        """This function returns the examples in the raw (text) form."""
        with open(data_file) as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for idx, row in enumerate(reader):
                # Everything in the row except for 'talk_name' is a translation.
                # Missing/incomplete translations contain the string "__NULL__"
                # or "_ _ NULL _ _".
                yield idx, {
                    "translations": {
                        lang: text
                        for lang, text in six.iteritems(row)
                        if lang != "talk_name" and _is_translation_complete(text)
                    },
                    "talk_name": row["talk_name"],
                }

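# `_is_translation_complete` is referenced above but not shown in this file.
# A minimal sketch consistent with the comment about "__NULL__" placeholders;
# the real helper may differ.
def _is_translation_complete(text):
    return bool(text) and "__NULL__" not in text and "_ _ NULL _ _" not in text
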
def __init__(self, lang, **kwargs):
    """BuilderConfig for XQuAD.

    Args:
      lang: string, language for the input text
      **kwargs: keyword arguments forwarded to super.
    """
    super(XquadConfig, self).__init__(
        version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
    )
    self.lang = lang

def __init__(self, features, **kwargs):
    """BuilderConfig for UbuntuDialogsCorpus.

    Args:
      features: the features that will appear in the feature dict.
      **kwargs: keyword arguments forwarded to super.
    """
    super(UbuntuDialogsCorpusConfig, self).__init__(version=nlp.Version("2.0.0"), **kwargs)
    self.features = features

class Gap(nlp.GeneratorBasedBuilder):
    """GAP is a gender-balanced dataset.

    It contains 8,908 coreference-labeled pairs of (ambiguous pronoun,
    antecedent name), sampled from Wikipedia.
    """

    VERSION = nlp.Version("0.1.0")

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "ID": nlp.Value("string"),
                    "Text": nlp.Value("string"),
                    "Pronoun": nlp.Value("string"),
                    "Pronoun-offset": nlp.Value("int32"),
                    "A": nlp.Value("string"),
                    "A-offset": nlp.Value("int32"),
                    "A-coref": nlp.Value("bool"),
                    "B": nlp.Value("string"),
                    "B-offset": nlp.Value("int32"),
                    "B-coref": nlp.Value("bool"),
                    "URL": nlp.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://github.com/google-research-datasets/gap-coreference",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        directory = dl_manager.download_and_extract(
            {"train": _TRAINURL, "validation": _VALIDATIONURL, "test": _TESTURL}
        )
        return [
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": directory["train"]}),
            nlp.SplitGenerator(name=nlp.Split.VALIDATION, gen_kwargs={"filepath": directory["validation"]}),
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": directory["test"]}),
        ]

    def _generate_examples(self, filepath):
        """Yields examples."""
        with open(filepath, encoding="utf-8") as tsvfile:
            reader = csv.DictReader(tsvfile, dialect="excel-tab")
            for i, row in enumerate(reader):
                # The TSV stores booleans as the strings "TRUE"/"FALSE".
                # `bool()` on any non-empty string is True, so compare against
                # "TRUE" explicitly instead of casting.
                row["A-coref"] = row["A-coref"] == "TRUE"
                row["B-coref"] = row["B-coref"] == "TRUE"
                row["A-offset"] = int(row["A-offset"])
                row["B-offset"] = int(row["B-offset"])
                row["Pronoun-offset"] = int(row["Pronoun-offset"])
                yield i, row

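# Hedged usage sketch: with the explicit string comparison above, A-coref and
# B-coref come back as real booleans. The dataset id "gap" is assumed.
import nlp

gap = nlp.load_dataset("gap")
# Examples where neither candidate is the antecedent of the pronoun.
neither = [ex for ex in gap["test"] if not ex["A-coref"] and not ex["B-coref"]]
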
def __init__(self, filename=None, **kwargs):
    """BuilderConfig for ScientificPapers.

    Args:
      filename: filename of different configs for the dataset.
      **kwargs: keyword arguments forwarded to super.
    """
    # 1.1.0 removed the sentence breakers <S> and </S> in the summary.
    super(ScientificPapersConfig, self).__init__(version=nlp.Version("1.1.1"), **kwargs)
    self.filename = filename

def __init__(self, filename=None, **kwargs):
    """BuilderConfig for Wikihow.

    Args:
      filename: filename of different configs for the dataset.
      **kwargs: keyword arguments forwarded to super.
    """
    # Version 1.1.0 removed empty document and summary strings.
    # Version 1.2.0 added the train/validation/test split plus cleaning & filtering.
    super(WikihowConfig, self).__init__(version=nlp.Version("1.2.0"), **kwargs)
    self.filename = filename

def __init__(self, summary_key=None, **kwargs):
    """BuilderConfig for RedditTifu.

    Args:
      summary_key: key string of summary in downloaded json file.
      **kwargs: keyword arguments forwarded to super.
    """
    # Version 1.1.0 removed empty document and summary strings.
    super(RedditTifuConfig, self).__init__(version=nlp.Version("1.1.0"), **kwargs)
    self.summary_key = summary_key

class DefinitePronounResolution(nlp.GeneratorBasedBuilder):
    """The Definite Pronoun Resolution Dataset."""

    BUILDER_CONFIGS = [
        nlp.BuilderConfig(
            name="plain_text",
            version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
            description="Plain text import of the Definite Pronoun Resolution Dataset.",
        )
    ]

    def _info(self):
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "sentence": nlp.Value("string"),
                    "pronoun": nlp.Value("string"),
                    "candidates": nlp.features.Sequence(nlp.Value("string"), length=2),
                    "label": nlp.features.ClassLabel(num_classes=2),
                }
            ),
            supervised_keys=("sentence", "label"),
            homepage="http://www.hlt.utdallas.edu/~vince/data/emnlp12/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        files = dl_manager.download_and_extract(
            {"train": _DATA_URL_PATTERN.format("train"), "test": _DATA_URL_PATTERN.format("test")}
        )
        return [
            nlp.SplitGenerator(name=nlp.Split.TEST, gen_kwargs={"filepath": files["test"]}),
            nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"filepath": files["train"]}),
        ]

    def _generate_examples(self, filepath):
        with open(filepath) as f:
            line_num = -1
            while True:
                line_num += 1
                # Each record spans five lines: the sentence, the pronoun, the
                # two comma-separated candidates, the correct candidate, and a
                # blank separator line.
                sentence = f.readline().strip()
                pronoun = f.readline().strip()
                candidates = [c.strip() for c in f.readline().strip().split(",")]
                correct = f.readline().strip()
                f.readline()
                if not sentence:
                    break
                yield line_num, {
                    "sentence": sentence,
                    "pronoun": pronoun,
                    "candidates": candidates,
                    "label": candidates.index(correct),
                }

def __init__(self, data_size, **kwargs):
    """BuilderConfig for Winogrande.

    Args:
      data_size: the size of the training set we want to use (xs, s, m, l, xl)
      **kwargs: keyword arguments forwarded to super.
    """
    super(WinograndeConfig, self).__init__(
        version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
    )
    self.data_size = data_size

def __init__(self, data_url, balanced=False, **kwargs):
    """BuilderConfig for Discofuse.

    Args:
      data_url: `string`, url to download the file from.
      balanced: whether to load the balanced file or the full file.
      **kwargs: keyword arguments forwarded to super.
    """
    super(DiscofuseConfig, self).__init__(
        version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs
    )
    self.balanced = balanced
    self.data_url = data_url

def _generate_builder_configs():
    """Generate configs with different subsets of mathematics dataset."""
    configs = []
    for module in sorted(set(_MODULES)):
        configs.append(
            nlp.BuilderConfig(
                name=module,
                version=nlp.Version("1.0.0"),
                description=_DESCRIPTION,
            )
        )
    return configs

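# Hedged sketch of how the helper above would typically be wired into the
# builder class: one BuilderConfig per module name. The class name below is
# illustrative, not taken from this file.
class MathematicsDataset(nlp.GeneratorBasedBuilder):  # illustrative name
    BUILDER_CONFIGS = _generate_builder_configs()
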