def snli(split,
         vocab_file,
         sequence_length=75,
         batch_size=64,
         transform=utils.identity,
         filter_fn=None,
         data_dir=None):
  """Loads the SNLI dataset."""
  tokenize = tokenize_fun(load_tokenizer(vocab_file))

  def _preprocess(d):
    """Applies tokenization."""
    hypothesis = tokenize(d['hypothesis']).flat_values
    premise = tokenize(d['premise']).flat_values
    sep = tokenize(SEP).flat_values
    tokens = tf.concat([hypothesis, sep, premise], axis=0)
    return transform({
        'inputs': tokens,
        'labels': d['label'],
        'index': tf.size(tokens),
    })

  # Load dataset.
  dset = load_tfds('snli', split, _preprocess, filter_fn, data_dir=data_dir)

  # Pad remaining examples to the sequence length.
  dset = padded_batch(dset, batch_size, sequence_length)

  return dset

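# Illustrative usage sketch (not part of the original source; the vocab path
# and split name below are placeholders). Each batch is a dict with padded
# 'inputs', integer 'labels', and 'index' holding the unpadded token count,
# assuming padded_batch pads 'inputs' out to sequence_length:
#
#   dset = snli('train', vocab_file='/path/to/vocab.subwords')
#   batch = next(iter(dset))
#   batch['inputs'].shape   # Typically (64, 75) with the default arguments.
#   batch['index']          # Number of real (non-pad) tokens per example.

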
def imdb(split,
         vocab_file,
         sequence_length=1000,
         batch_size=64,
         transform=utils.identity,
         filter_fn=None,
         data_dir=None):
  """Loads the IMDB reviews dataset."""
  tokenize = tokenize_fun(load_tokenizer(vocab_file))

  def _preprocess(d):
    """Applies tokenization."""
    tokens = tokenize(d['text']).flat_values
    preprocessed = {
        'inputs': tokens,
        'labels': d['label'],
        'index': tf.size(tokens),
    }
    return transform(preprocessed)

  # Load dataset.
  dset = load_tfds('imdb_reviews', split, _preprocess, filter_fn,
                   data_dir=data_dir)

  # Pad remaining examples to the sequence length.
  dset = padded_batch(dset, batch_size, sequence_length)

  return dset

def ag_news(split,
            vocab_file,
            sequence_length=100,
            batch_size=64,
            transform_fn=utils.identity,
            filter_fn=None,
            data_dir=None):
  """Loads the AG News dataset."""
  tokenize = tokenize_fun(load_tokenizer(vocab_file))

  def _preprocess(d):
    """Applies tokenization."""
    tokens = tokenize(d['description']).flat_values  # Note: we ignore 'title'.
    preprocessed = {
        'inputs': tokens,
        'labels': d['label'],
        'index': tf.size(tokens),
    }
    return transform_fn(preprocessed)

  # Load dataset.
  dset = load_tfds('ag_news_subset', split, _preprocess, filter_fn,
                   data_dir=data_dir)

  # Pad remaining examples to the sequence length.
  dset = padded_batch(dset, batch_size, sequence_length)

  return dset

def paracrawl(language_pair,
              vocab_files,
              sequence_length,
              batch_size=64,
              transform_fn=utils.identity,
              filter_fn=None,
              data_dir=None):
  """Loads a paracrawl translation dataset from TFDS.

  Arguments:
    language_pair: str, e.g. 'ende', specifying both languages.
    vocab_files: List[str], vocab filenames for each language.
  """
  PARACRAWL_LANGUAGE_PAIRS = [
      'enbg', 'encs', 'enda', 'ende', 'enel', 'enes', 'enet', 'enfi', 'enfr',
      'enga', 'enhr', 'enhu', 'enit', 'enlt', 'enlv', 'enmt', 'ennl', 'enpl',
      'enpt', 'enro', 'ensk', 'ensl', 'ensv'
  ]
  if language_pair not in PARACRAWL_LANGUAGE_PAIRS:
    raise ValueError(
        f'language_pair must be one of {PARACRAWL_LANGUAGE_PAIRS}')

  languages = [language_pair[:2], language_pair[2:]]
  tokenizer_list = [
      tokenize_w_punctuation(load_tokenizer(f)) for f in vocab_files
  ]
  tokenizer_dict = dict(zip(languages, tokenizer_list))

  def _preprocess(d):
    tokens = {l: tokenizer_dict[l](d[l]).flat_values for l in languages}
    for l in languages:
      tokens.update({f'{l}_index': tf.size(tokens[l])})
      tokens.update({f'{l}_orig': d[l]})
    return transform_fn(tokens)

  dataset = tfds.load(
      f'para_crawl/{language_pair}',
      split='train',  # para_crawl only has a train split.
      data_dir=data_dir)
  dset = pipeline(dataset, preprocess_fun=_preprocess, filter_fn=filter_fn)

  # Filter out examples longer than the sequence length. Bind `l` as a default
  # argument so each lambda keeps its own language rather than the loop's last.
  for l in languages:
    dset = dset.filter(lambda d, l=l: d[f'{l}_index'] <= sequence_length)

  # Each example contains, per language, the token ids, an index (unpadded
  # length), and the original string.
  padded_shapes = {}
  for l in languages:
    padded_shapes[f'{l}_index'] = ()
    padded_shapes[f'{l}_orig'] = ()
    padded_shapes[l] = (sequence_length,)

  # Pad remaining examples to the sequence length.
  dset = dset.padded_batch(batch_size, padded_shapes)

  return dset, tokenizer_dict

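# Illustrative usage sketch (not part of the original source; the vocab paths
# are placeholders). Unlike the classification loaders above, paracrawl
# returns both the batched dataset and the per-language tokenizer dict, and
# batches are keyed by language code plus '<lang>_index' / '<lang>_orig':
#
#   dset, tokenizers = paracrawl(
#       'ende', vocab_files=['/path/to/en.vocab', '/path/to/de.vocab'],
#       sequence_length=50)
#   batch = next(iter(dset))
#   batch['en'].shape   # Typically (64, 50): padded English token ids.
#   batch['de_index']   # Unpadded token counts for the German side.
#   batch['en_orig']    # Original (untokenized) English strings.

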
def test_tokenizer_fun(vocab):
  """Tests the subword tokenizer."""
  tokenizer = load_tokenizer(vocab.name)
  tokenize = datasets.tokenize_fun(tokenizer)
  actual = list(tokenize("this is a test.").flat_values.numpy())
  expected = [5, 6, 2, 6, 3, 4, 7]
  assert actual == expected

def dbpedia(split,
            num_classes,
            vocab_file,
            sequence_length=1000,
            batch_size=64,
            transform=utils.identity,
            filter_fn=None,
            data_dir=None):
  """Loads the DBPedia text classification dataset."""
  tokenize = tokenize_fun(load_tokenizer(vocab_file))

  if data_dir is None:
    raise ValueError('DBPedia dataset requires data_dir to be provided.')

  def _preprocess(d):
    """Applies tokenization and relabels the DBPedia labels according to the
    specified number of classes.

    Classes whose (relabeled, zero-indexed) label is below num_classes are
    kept; all other classes are removed. For example, num_classes=4 keeps
    classes 0, 1, 2, and 3.
    """

    def relabel(label):
      # In the DBPedia csv file, labels are given as 1, 2, ...
      if label <= num_classes:
        return label - 1
      else:
        return tf.constant(-1, dtype=tf.int64)

    tokens = tokenize(d['text']).flat_values
    preprocessed = {
        'inputs': tokens,
        'labels': relabel(d['label']),
        'index': tf.size(tokens),
    }
    return transform(preprocessed)

  # Drop examples whose label falls outside the requested classes.
  filter_fn = lambda x: x['labels'] != -1

  # Load dataset.
  dset = load_csv('dbpedia', split, _preprocess, filter_fn, data_dir=data_dir)

  # Pad remaining examples to the sequence length.
  dset = padded_batch(dset, batch_size, sequence_length)

  return dset

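# Worked example of the relabeling above (illustrative, not from the original
# source): with num_classes=4, csv label 1 maps to 0, 2 to 1, 3 to 2, 4 to 3,
# and any larger label maps to -1, which the label filter inside dbpedia()
# then drops. The resulting 'labels' therefore always lie in [0, num_classes).

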
def goemotions(split,
               vocab_file,
               sequence_length=50,
               batch_size=64,
               emotions=None,
               transform=utils.identity,
               filter_fn=None,
               data_dir=None):
  """Loads the goemotions dataset."""
  tokenize = tokenize_fun(load_tokenizer(vocab_file))

  if emotions is None:  # Use all emotions.
    emotions = ('admiration', 'amusement', 'anger', 'annoyance', 'approval',
                'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
                'disapproval', 'disgust', 'embarrassment', 'excitement',
                'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness',
                'neutral', 'optimism', 'pride', 'realization', 'relief',
                'remorse', 'sadness', 'surprise')

  def _preprocess(d):
    tokens = tokenize(d['comment_text']).flat_values
    index = tf.size(tokens)
    labels = tf.convert_to_tensor([d[e] for e in emotions], dtype=tf.int64)
    preprocessed = {
        'inputs': tokens,
        'labels': labels,
        'index': index,
    }
    return transform(preprocessed)

  # Load dataset.
  dset = load_tfds('goemotions', split, _preprocess, filter_fn,
                   data_dir=data_dir)

  # Pad remaining examples to the sequence length.
  dset = padded_batch(dset, batch_size, sequence_length,
                      label_shape=(len(emotions),))

  return dset

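# Illustrative usage sketch (not part of the original source; the vocab path
# is a placeholder). goemotions is multi-label, so 'labels' is a vector of 0/1
# indicators over the selected emotions rather than a single class id:
#
#   dset = goemotions('train', vocab_file='/path/to/vocab.subwords',
#                     emotions=('joy', 'anger', 'sadness'))
#   batch = next(iter(dset))
#   batch['labels'].shape   # Typically (64, 3): one indicator per emotion.
#   batch['inputs'].shape   # Typically (64, 50) with the default arguments.

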
def snli_sep(split,
             vocab_file,
             hypothesis_length=40,
             premise_length=40,
             batch_size=64,
             transform=utils.identity,
             filter_fn=None,
             data_dir=None):
  """Loads the SNLI dataset, with hypothesis and premise as separate fields."""
  tokenize = tokenize_fun(load_tokenizer(vocab_file))

  def _preprocess(d):
    """Applies tokenization."""
    hypothesis = tokenize(d['hypothesis']).flat_values
    premise = tokenize(d['premise']).flat_values
    return transform({
        'hypothesis': hypothesis,
        'premise': premise,
        'hypothesis_index': tf.size(hypothesis),
        'premise_index': tf.size(premise),
        'labels': d['label'],
    })

  # Load dataset.
  dset = load_tfds('snli', split, _preprocess, filter_fn, data_dir=data_dir)

  # Pad remaining examples to the sequence length.
  field_lengths = {
      'hypothesis_index': hypothesis_length,
      'premise_index': premise_length,
  }
  padded_shapes = {
      'hypothesis': (hypothesis_length,),
      'premise': (premise_length,),
      'premise_index': (),
      'hypothesis_index': (),
      'labels': (),
  }
  dset = filter_pad_batch(dset, batch_size, field_lengths, padded_shapes)

  return dset

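# Illustrative usage sketch (not part of the original source; the vocab path
# is a placeholder). Compared with snli() above, which concatenates
# hypothesis + SEP + premise into a single 'inputs' field, snli_sep keeps the
# two sentences padded and batched independently:
#
#   dset = snli_sep('validation', vocab_file='/path/to/vocab.subwords')
#   batch = next(iter(dset))
#   batch['hypothesis'].shape   # Typically (64, 40): default hypothesis_length.
#   batch['premise'].shape      # Typically (64, 40): default premise_length.

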
def amazon(split,
           num_classes,
           vocab_file,
           sequence_length=250,
           batch_size=64,
           transform=utils.identity,
           filter_fn=None,
           data_dir=None):
  """Loads the Amazon reviews text classification dataset."""
  tokenize = tokenize_fun(load_tokenizer(vocab_file))

  if data_dir is None:
    raise ValueError('Amazon dataset requires data_dir to be provided.')

  label_conversion = data_utils.sentiment_relabel(num_classes)

  def _preprocess(d):
    """Applies tokenization and relabels the Amazon labels according to the
    specified number of classes."""
    tokens = tokenize(d['text']).flat_values
    preprocessed = {
        'inputs': tokens,
        'labels': label_conversion(d['label']),
        'index': tf.size(tokens),
    }
    return transform(preprocessed)

  # Drop examples whose label falls outside the requested classes.
  filter_fn = lambda x: x['labels'] != -1

  # Load dataset.
  dset = load_csv('amazon', split, _preprocess, filter_fn, data_dir=data_dir)

  # Pad remaining examples to the sequence length.
  dset = padded_batch(dset, batch_size, sequence_length)

  return dset