def create_dataset(buffer_size, batch_size, data_format, data_dir=None):
  """Creates a tf.data Dataset.

  Args:
    buffer_size: Shuffle buffer size.
    batch_size: Batch size.
    data_format: channels_first or channels_last.
    data_dir: Directory in which to store the dataset.

  Returns:
    train dataset, test dataset, metadata
  """
  preprocess_train = Preprocess(data_format, train=True)
  preprocess_test = Preprocess(data_format, train=False)

  dataset, metadata = tfds.load(
      'cifar10', data_dir=data_dir, as_supervised=True, with_info=True)
  train_dataset, test_dataset = dataset['train'], dataset['test']

  train_dataset = train_dataset.map(
      preprocess_train, num_parallel_calls=AUTOTUNE)
  train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
  train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)

  test_dataset = test_dataset.map(
      preprocess_test, num_parallel_calls=AUTOTUNE).batch(batch_size)
  test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)
  return train_dataset, test_dataset, metadata
def create_dataset(buffer_size, batch_size):
    # `scale` and `AUTOTUNE` are assumed to be defined elsewhere in the module.
    dataset, _ = tfds.load('mnist', as_supervised=True, with_info=True)
    train_dataset = dataset['train']
    train_dataset = train_dataset.map(scale, num_parallel_calls=AUTOTUNE)
    train_dataset = train_dataset.shuffle(buffer_size).batch(batch_size)
    return train_dataset
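# `scale` is not shown in this snippet; a minimal sketch of a typical
# normalization map function (an assumption, not the original implementation):
def scale(image, label):
    return tf.cast(image, tf.float32) / 255.0, label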
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
import sys

cnn_model_name = 'cnn_text_classification.h5'
rnn_model_name = 'rnn_text_classification.h5'

print(tf.executing_eagerly())

max_features = 10000
max_len = 200
initial_epochs = 10
validation_steps = 20

print('loading data...')
data, info = tfds.load(name="imdb_reviews/subwords8k",
                       with_info=True,
                       as_supervised=True)
test_dataset = data['test']
train_dataset = data['train']
print(train_dataset)
sys.exit()

encoder = info.features['text'].encoder
print('Vocabulary size: {}'.format(encoder.vocab_size))

# imdb_builder = tfds.builder(name="imdb_reviews/subwords8k")
# imdb_builder.download_and_prepare()
# info = imdb_builder.info
# print("dataset name {} \ndataset size: {}\ndataset features: {}".format(info.name, info.splits, info.features))
# test_dataset = imdb_builder.as_dataset(split="test")
# train_dataset = imdb_builder.as_dataset(split="train")
# for train_example in train_dataset.take(1):
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If no data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
from lib.networks.segmentation import MobileDeepLabV3
from lib.networks.style_transfer import build_model
from lib.networks.style_transfer.layers import TVLoss
from lib.visualize import vis_segmentation

if __name__ == '__main__':
    INPUT_WIDTH = 512
    INPUT_HEIGHT = 256

    cityscapes = tfds.load(
        'cityscapes/semantic_segmentation',
        split='train[:2975]',
        shuffle_files=True).map(
            lambda d: {
                **d,
                'segmentation_label': d['segmentation_label'] / 255,
                'image_left': tf.image.resize_with_pad(
                    d['image_left'] / 255,
                    target_height=INPUT_HEIGHT,
                    target_width=INPUT_WIDTH)
            }).batch(4)

    wikiart = tfds.load(
        'wikiart_images',
        split='train[:2975]',
        shuffle_files=True).map(
            lambda d: {
                **d,
                'image': tf.image.resize_with_crop_or_pad(
                    d['image'] / 255,
                    target_height=INPUT_HEIGHT,
                    target_width=INPUT_WIDTH)
            }).batch(4)

    exC, exW = next(zip(iter(cityscapes), iter(wikiart)))
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds


def normalize_img(image, label):
    """Normalizes images: `uint8` -> `float32`."""
    return tf.cast(image, tf.float32) / 255.0, label


if __name__ == "__main__":
    tf.enable_v2_behavior()

    (ds_train, ds_test), ds_info = tfds.load(
        "mnist",
        split=["train", "test"],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )

    ds_train = ds_train.map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.cache()
    ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples)
    ds_train = ds_train.batch(128)
    ds_train = ds_train.prefetch(tf.data.experimental.AUTOTUNE)

    ds_test = ds_test.map(normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.batch(128)
    ds_test = ds_test.cache()
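    # A minimal sketch of consuming the pipelines above with a Keras model,
    # continuing the __main__ block; the architecture and epoch count are
    # assumptions for illustration, not part of the original snippet.
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(ds_train, epochs=6, validation_data=ds_test)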
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

# Construct a tf.data.Dataset
ds = tfds.load('mnist', split='train', shuffle_files=True)

# Build your input pipeline
ds = ds.shuffle(1024).batch(32).prefetch(tf.data.experimental.AUTOTUNE)
for example in ds.take(1):
    image, label = example["image"], example["label"]
    print("label:", label)
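# A variant of the same load (an alternative, not from the original snippet):
# with as_supervised=True the dataset yields (image, label) tuples instead of
# feature dictionaries.
ds_sup = tfds.load('mnist', split='train', as_supervised=True)
for image, label in ds_sup.take(1):
    print(image.shape, label.numpy())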
print("GPU device not found, use cpu instead!") # raise SystemError('GPU device not found') else: print('Found GPU at: {}'.format(device_name)) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True config.log_device_placement = False # Step 1: Load dataset from 102 category flower dataset with tf.Session(config=config) as sess: # Use cifar100, which has 100 categories with size 32*32 # Preproceess the images and set the hyperparameters cifar100_train, cifar100_info = tfds.load(name="cifar100", split=tfds.Split.TRAIN, as_supervised=True, with_info=True) BATCH_SIZE = 128 EPOCH = 7 INPUT_SIZE = cifar100_info.splits["train"].num_examples BUFFER_SIZE = 8000 NUM_CLASSES = cifar100_info.features['label'].num_classes iter_number = (int)(INPUT_SIZE / BATCH_SIZE) + 1 train_ds = utils.prepare_train_ds(cifar100_train, BATCH_SIZE, BUFFER_SIZE, image_size=224) # Use third party images with 102 categories flowers. # BATCH_SIZE = 128 # EPOCH = 7
# limitations under the License.
# ==============================================================================
"""A demo script to show how to train a segmentation model."""
from keras.efficientdet_keras import EfficientDetNet
import tensorflow as tf


def create_mask(pred_mask):
  pred_mask = tf.argmax(pred_mask, axis=-1)
  pred_mask = pred_mask[..., tf.newaxis]
  return pred_mask[0]


import tensorflow_datasets as tfds

dataset, info = tfds.load('oxford_iiit_pet:3.*.*', with_info=True)


def normalize(input_image, input_mask):
  input_image = tf.cast(input_image, tf.float32) / 255.0
  input_mask -= 1
  return input_image, input_mask


def load_image_train(datapoint):
  input_image = tf.image.resize(datapoint['image'], (512, 512))
  input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128))

  if tf.random.uniform(()) > 0.5:
    input_image = tf.image.flip_left_right(input_image)
    input_mask = tf.image.flip_left_right(input_mask)
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow_datasets as tfds
import tensorflow as tf

print(tf.__version__)

# Get the data
dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']
tokenizer = info.features['text'].encoder

BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(train_dataset))
test_dataset = test_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(test_dataset))

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
def get_dataset(data_dir, config, dataset_name=None):
  """The training dataset for the code model for fault localization.

  Args:
    data_dir: The data directory to use with tfds.load.
    config: The config for the model.
    dataset_name: If set, use this dataset name in place of the one from the
      config.

  Returns:
    train_dataset: The tf.data.Dataset with batched examples.
    info: The DatasetInfo object containing the feature connectors and other
      info about the dataset.
  """
  dataset_name = dataset_name or config.dataset.name
  split = get_split(config)
  version = (
      None if config.dataset.version == 'default' else config.dataset.version)

  # If in interact mode, use an interactive dataset.
  if config.runner.mode == 'interact':
    dbuilder = tfds.builder(
        dataset_name, data_dir=data_dir, version=version)
    unused_split_generators = dbuilder._split_generators(dl_manager=None)  # pylint: disable=protected-access
    info = dbuilder.info
    info._builder.set_representation(config.dataset.representation)  # pylint: disable=protected-access
    assert config.dataset.batch_size == 1
    dataset = make_interactive_dataset(info, config)
    if config.dataset.batch:
      dataset = apply_batching(dataset, info, config)
    set_task = cannot_set_task
    return DatasetInfo(
        dataset=dataset,
        info=info,
        set_task=set_task
    )

  # Load the dataset.
  if config.dataset.in_memory:
    dbuilder = tfds.builder(
        dataset_name, data_dir=data_dir, version=version)
    unused_split_generators = dbuilder._split_generators(dl_manager=None)  # pylint: disable=protected-access
    dataset, set_task = dbuilder.as_in_memory_dataset(split='all')
    info = dbuilder.info
  else:
    name = dataset_name
    if version is not None:
      name = f'{name}:{version}'
    dataset, info = tfds.load(
        name=name,
        split=split,
        data_dir=data_dir,
        # batch_size=config.dataset.batch_size,
        with_info=True)
    set_task = cannot_set_task
  info._builder.set_representation(config.dataset.representation)  # pylint: disable=protected-access

  verify_reasonable_dataset(dataset_name, info, config)
  dataset = dataset.repeat()
  dataset = apply_filtering(dataset, info, config)
  if config.dataset.batch:
    dataset = apply_batching(dataset, info, config)
  return DatasetInfo(
      dataset=dataset,
      info=info,
      set_task=set_task,
  )
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 22 18:03:44 2020

@author: jjg
"""

import tensorflow as tf
import tensorflow_datasets
from transformers import *

# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,
def create_model():
    dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
    train_dataset, test_dataset = dataset['train'], dataset['test']
    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                   'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

    num_train_example = metadata.splits['train'].num_examples
    num_test_example = metadata.splits['test'].num_examples
    print(num_train_example)
    print(num_test_example)

    # train_dataset = train_dataset.map(normalize)
    # test_dataset = test_dataset.map(normalize)

    # take 1 image and remove the color dimension by reshaping
    for image, label in test_dataset.take(1):
        break
    image = image.numpy().reshape((28, 28))

    # # plot the image
    # plt.figure()
    # plt.imshow(image, cmap=plt.cm.binary)
    # plt.colorbar()
    # plt.grid(False)
    # plt.show()
    #
    # # plot 25 images
    # plt.figure(figsize=(10, 10))
    # i = 0
    # for (image, label) in test_dataset.take(25):
    #     image = image.numpy().reshape((28, 28))
    #     plt.subplot(5, 5, i + 1)
    #     plt.xticks([])
    #     plt.yticks([])
    #     plt.grid(False)
    #     plt.imshow(image, cmap=plt.cm.binary)
    #     plt.xlabel(class_names[label])
    #     i += 1
    # plt.show()

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dense(10, activation=tf.nn.softmax)
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    BATCH_SIZE = 32
    train_dataset = train_dataset.repeat().shuffle(num_train_example).batch(BATCH_SIZE)
    test_dataset = test_dataset.batch(BATCH_SIZE)

    model.fit(train_dataset, epochs=5,
              steps_per_epoch=math.ceil(num_train_example / BATCH_SIZE))
    model.save('saved_models\\Fashion_MNIST_Classify_example_not_normalize_pixel')

    test_loss, test_accuracy = model.evaluate(test_dataset,
                                              steps=math.ceil(num_test_example / BATCH_SIZE))
    print('Accuracy on test dataset:', test_accuracy)

    # for test_images, test_labels in test_dataset.take(1):
    #     test_images = test_images.numpy()
    #     test_labels = test_labels.numpy()
    #     predictions = model.predict(test_images)
    # print(predictions.shape)
    # print(predictions[0])
    # print(np.argmax(predictions[0]))
    # print(test_labels[0])
    # i = 12
    # plt.figure(figsize=(6, 3))
    # plt.subplot(1, 2, 1)
    # plot_image(i, predictions, test_labels, test_images)
    # plt.subplot(1, 2, 2)
    # plot_value_array(i, predictions, test_labels)
    # plt.show()

    return model
# until the input size grows to 32x32
vgg = PROG_PL_VGG19(input_dims=(32, 32, 3),
                    layers_to_extract=[0, 1, 2],
                    load_weights='imagenet',
                    channel_last=True)

### DATA ###
"""
NWPU-RESISC45
This dataset requires you to download the source data manually into
download_config.manual_dir (defaults to ~/tensorflow_datasets/manual/):
Note: this dataset does not have a test/train split.
"""
# load data #
data, info = tfds.load('resisc45', split="train", with_info=True)

# visualize data #
tfds.show_examples(data, info)

# size of entire dataset #
ds_size = info.splits["train"].num_examples
image_shape = info.features['image'].shape

# manually split ds into 80:20, train & test respectively #
test_ds_size = int(ds_size * 0.20)
train_ds_size = ds_size - test_ds_size

# split #
test_ds = data.take(test_ds_size)
train_ds = data.skip(test_ds_size)

print("size of test: {}, size of train: {}".format(test_ds_size, train_ds_size))
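# Since 'resisc45' must be downloaded manually, the sketch below shows one way
# to point TFDS at the manual directory before loading. This is an assumption
# added for illustration (not part of the original script); the manual_dir
# path is hypothetical.
import tensorflow_datasets as tfds

builder = tfds.builder('resisc45')
builder.download_and_prepare(
    download_config=tfds.download.DownloadConfig(
        manual_dir='~/tensorflow_datasets/manual/'))
data, info = tfds.load('resisc45', split='train', with_info=True)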
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as datasets
import numpy as np

(train_data, test_data), info = datasets.load(
    name="imdb_reviews/subwords8k",
    split=(datasets.Split.TRAIN, datasets.Split.TEST),
    as_supervised=True,
    with_info=True
)

encoder = info.features['text'].encoder

# Exploration of the data
# for train_example, train_label in train_data.take(1):
#     print("Encoded text:", train_example[:10].numpy())
#     print("Label:", train_label.numpy())

# Data sanitization
BUFFER_SIZE = 1000

# The docs are broken on google.
# Found the solution on github to use the compat.v1.data.get_output_shapes method
train_output_shapes = tf.compat.v1.data.get_output_shapes(train_data)

train_batches = (train_data.shuffle(BUFFER_SIZE).padded_batch(32, train_output_shapes))
test_batches = (test_data.shuffle(BUFFER_SIZE).padded_batch(32, train_output_shapes))

# for example_batch, label_batch in train_batches.take(2):
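# Note: in newer TF releases (roughly 2.2 and later), padded_batch can infer
# the padded shapes itself, so the compat.v1 workaround above may not be
# needed. A hedged sketch of that simpler form:
train_batches = train_data.shuffle(BUFFER_SIZE).padded_batch(32)
test_batches = test_data.shuffle(BUFFER_SIZE).padded_batch(32)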
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if not data_args.data_dir:
        if data_args.version_2_with_negative:
            logger.warn("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically")

        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError("If no data_dir is specified, tensorflow_datasets needs to be installed.")

        tfds_examples = tfds.load("squad")
        train_examples = (
            SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=False)
            if training_args.do_train
            else None
        )
        eval_examples = (
            SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=True)
            if training_args.do_eval
            else None
        )
    else:
        processor = SquadV2Processor() if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(data_args.data_dir) if training_args.do_eval else None

    train_dataset = (
        squad_convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=True,
            return_dataset="tf",
        )
        if training_args.do_train
        else None
    )

    eval_dataset = (
        squad_convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=False,
            return_dataset="tf",
        )
        if training_args.do_eval
        else None
    )

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
def load_split(batch_size, train, data_dir, dtype=tf.float32,
               image_size=IMAGE_SIZE, cache=False):
  """Creates a split from the ImageNet dataset using TensorFlow Datasets.

  Args:
    batch_size: the batch size returned by the data pipeline.
    train: Whether to load the train or evaluation split.
    data_dir: str, directory to read/write data. Defaults to the value of the
      environment variable TFDS_DATA_DIR, if set, otherwise falls back to
      '~/tensorflow_datasets'.
    dtype: data type of the image.
    image_size: The target size of the images.
    cache: Whether to cache the dataset.

  Returns:
    A `tf.data.Dataset`.
  """
  if train:
    split_size = TRAIN_IMAGES // jax.host_count()
    start = jax.host_id() * split_size
    split = 'train[{}:{}]'.format(start, start + split_size)
  else:
    split_size = EVAL_IMAGES // jax.host_count()
    start = jax.host_id() * split_size
    split = 'validation[{}:{}]'.format(start, start + split_size)

  def decode_example(example):
    if train:
      image = preprocess_for_train(example['image'], dtype, image_size)
    else:
      image = preprocess_for_eval(example['image'], dtype, image_size)
    return {'image': image, 'label': example['label']}

  ds = tfds.load('imagenet2012:5.*.*', split=split, data_dir=data_dir,
                 decoders={
                     'image': tfds.decode.SkipDecoding(),
                 })
  options = tf.data.Options()
  options.experimental_threading.private_threadpool_size = 48
  ds = ds.with_options(options)

  if cache:
    ds = ds.cache()

  if train:
    ds = ds.repeat()
    ds = ds.shuffle(16 * batch_size, seed=0)

  ds = ds.map(decode_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  ds = ds.batch(batch_size, drop_remainder=True)

  if not train:
    ds = ds.repeat()

  ds = ds.prefetch(10)
  return ds
def load_dataset(split,
                 batch_size,
                 name,
                 use_bfloat16,
                 normalize=True,
                 drop_remainder=True,
                 proportion=1.0,
                 validation_set=False,
                 validation_proportion=0.05,
                 aug_params=None):
  """Loads CIFAR dataset for training or testing.

  Args:
    split: tfds.Split.
    batch_size: The global batch size to use.
    name: A string indicating whether it is cifar10 or cifar100.
    use_bfloat16: data type, bfloat16 precision or float32.
    normalize: Whether to apply mean-std normalization on features.
    drop_remainder: bool.
    proportion: float, the proportion of dataset to be used.
    validation_set: bool, whether to split a validation set from training data.
    validation_proportion: float, the proportion of training dataset to be used
      as the validation split, if validation_set is set to True.
    aug_params: dict, data augmentation hyper parameters.

  Returns:
    Input function which returns a locally-sharded dataset batch.
  """
  if proportion < 0. or proportion > 1.:
    raise ValueError('proportion needs to lie in the range [0, 1]')
  if validation_proportion < 0. or validation_proportion > 1.:
    raise ValueError('validation_proportion needs to lie in the range [0, 1]')
  if use_bfloat16:
    dtype = tf.bfloat16
  else:
    dtype = tf.float32
  ds_info = tfds.builder(name).info
  image_shape = ds_info.features['image'].shape
  dataset_size = ds_info.splits['train'].num_examples
  num_classes = ds_info.features['label'].num_classes
  if aug_params is None:
    aug_params = {}
  adaptive_mixup = aug_params.get('adaptive_mixup', False)
  random_augment = aug_params.get('random_augment', False)
  mixup_alpha = aug_params.get('mixup_alpha', 0)
  ensemble_size = aug_params.get('ensemble_size', 1)
  label_smoothing = aug_params.get('label_smoothing', 0.)
  if adaptive_mixup and 'mixup_coeff' not in aug_params:
    # Hard target in the first epoch!
    aug_params['mixup_coeff'] = tf.ones([ensemble_size, num_classes])
  if mixup_alpha > 0 or label_smoothing > 0:
    onehot = True
  else:
    onehot = False

  def preprocess(image, label):
    """Image preprocessing function."""
    if split == tfds.Split.TRAIN:
      image = tf.image.resize_with_crop_or_pad(
          image, image_shape[0] + 4, image_shape[1] + 4)
      image = tf.image.random_crop(image, image_shape)
      image = tf.image.random_flip_left_right(image)

      # Only random augment for now.
      if random_augment:
        count = aug_params['aug_count']
        augmenter = augment_utils.RandAugment()
        augmented = [augmenter.distort(image) for _ in range(count)]
        image = tf.stack(augmented)

    if split == tfds.Split.TRAIN and aug_params.get('augmix', False):
      augmenter = augment_utils.RandAugment()
      image = _augmix(image, aug_params, augmenter, dtype)
    elif normalize:
      image = normalize_convert_image(image, dtype)

    if split == tfds.Split.TRAIN and onehot:
      label = tf.cast(label, tf.int32)
      label = tf.one_hot(label, num_classes)
    else:
      label = tf.cast(label, dtype)
    return image, label

  if proportion == 1.0:
    if validation_set:
      new_name = '{}:3.*.*'.format(name)
      if split == 'validation':
        new_split = 'train[{}%:]'.format(
            int(100 * (1. - validation_proportion)))
        dataset = tfds.load(new_name, split=new_split, as_supervised=True)
      elif split == tfds.Split.TRAIN:
        new_split = 'train[:{}%]'.format(
            int(100 * (1. - validation_proportion)))
        dataset = tfds.load(new_name, split=new_split, as_supervised=True)
      else:
        # split == tfds.Split.TEST case
        dataset = tfds.load(name, split=split, as_supervised=True)
    else:
      dataset = tfds.load(name, split=split, as_supervised=True)
  else:
    logging.warning(
        'Subset of training dataset is being used without a validation set.')
    new_name = '{}:3.*.*'.format(name)
    if split == tfds.Split.TRAIN:
      new_split = 'train[:{}%]'.format(int(100 * proportion))
    else:
      new_split = 'test[:{}%]'.format(int(100 * proportion))
    dataset = tfds.load(new_name, split=new_split, as_supervised=True)

  if split == tfds.Split.TRAIN:
    dataset = dataset.shuffle(buffer_size=dataset_size).repeat()

  dataset = dataset.map(
      preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

  if mixup_alpha > 0 and split == tfds.Split.TRAIN:
    if adaptive_mixup:
      dataset = dataset.map(
          functools.partial(adaptive_mixup_aug, batch_size, aug_params),
          num_parallel_calls=8)
    else:
      dataset = dataset.map(
          functools.partial(mixup, batch_size, aug_params),
          num_parallel_calls=8)

  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset
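# Hypothetical usage of load_dataset above; the argument values are assumptions
# chosen for illustration, not from the original module.
train_dataset = load_dataset(
    split=tfds.Split.TRAIN,
    batch_size=128,
    name='cifar10',
    use_bfloat16=False)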
import tensorflow_datasets as tfds
import tensorflow as tf

imdb, info = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)
train_data, test_data = imdb["train"], imdb["test"]

tokenizer = info.features["text"].encoder
print(tokenizer.subwords)

sample_string = "Tensorflow, from basic to mastery"
tokenized_string = tokenizer.encode(sample_string)
print('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer.decode(tokenized_string)
print('The original string: {}'.format(original_string))

for ts in tokenized_string:
    print('{} ----> {}'.format(ts, tokenizer.decode([ts])))

embedding_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

print('Tensorflow version =', tf.__version__)

# ## Loading the dataset using TensorFlow Datasets

# In[2]:

dataset_size = 23262
dataset = tfds.load(name='cats_vs_dogs', as_supervised=True, split=["train"])[0]
label_map = {1: 'dog', 0: 'cat'}

# ## Creating train test splits

# In[3]:

test_dataset = dataset.take(3000)
train_dataset = dataset.skip(3000)

# ## Visualizing some samples from the dataset
# - This is a dataset containing images for dogs and cats
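# An alternative to take()/skip() (not used in the original notebook): the TFDS
# slicing syntax can produce the same 3000-example test split at load time.
test_dataset, train_dataset = tfds.load(
    name='cats_vs_dogs',
    as_supervised=True,
    split=['train[:3000]', 'train[3000:]'])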
def pre_processing_test(example):
    # extract image and label from example
    image = example["image"]
    label = example["label"]

    # image is cast to a float32 and normalized to [0, 1]
    # label is cast to an int32
    image = tf.math.divide(tf.dtypes.cast(image, tf.float32), DATA_NORM)
    label = tf.dtypes.cast(label, tf.int32)

    # return image and label
    return image, label


# download data and split into training and testing datasets
dataset_train, info = tfds.load("mnist", split=tfds.Split.TRAIN, with_info=True)
dataset_test, info = tfds.load("mnist", split=tfds.Split.TEST, with_info=True)

# debug - datasets
# print(dataset_train)
# <_OptionsDataset shapes: {image: (28, 28, 1), label: ()}, types: {image: tf.uint8, label: tf.int64}>
# print(dataset_test)
# <_OptionsDataset shapes: {image: (28, 28, 1), label: ()}, types: {image: tf.uint8, label: tf.int64}>

# transform training dataset
dataset_train = dataset_train.map(pre_processing_train, num_parallel_calls=4)
dataset_train = dataset_train.shuffle(buffer_size=TRAINING_SHUFFLE_BUFFER)
dataset_train = dataset_train.batch(TRAINING_BATCH_SIZE)
dataset_train = dataset_train.prefetch(buffer_size=1)

# transform testing dataset
dataset_test = dataset_test.map(pre_processing_test, num_parallel_calls=4)
dataset_test = dataset_test.batch(TRAINING_BATCH_SIZE)
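# pre_processing_train is referenced above but defined elsewhere in the original
# script; a minimal sketch (an assumption modeled on pre_processing_test):
def pre_processing_train(example):
    image = tf.math.divide(tf.dtypes.cast(example["image"], tf.float32), DATA_NORM)
    label = tf.dtypes.cast(example["label"], tf.int32)
    return image, label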
# In[1]:

import numpy as np
import tensorflow as tf
from tensorflow import keras

tf.random.set_seed(42)

# ## Loading the dataset :

# In[2]:

# importing dataset
import tensorflow_datasets as tfds

datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
print(datasets.keys())

# In[3]:

train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples
print(train_size, test_size)

# ## Exploring the dataset :

# In[4]:

for X_batch, y_batch in datasets["train"].batch(2).take(2):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review : ", review.decode("utf-8")[:200], "...")
#!/usr/bin/python3
# -*- coding: UTF-8 -*-

# TensorFlow
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # to get rid of the TF warnings
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
import tensorflow_datasets as tfds

# NLP
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

train, valid, test = tfds.load(name="imdb_reviews",
                               split=('train[:60%]', 'train[60%:]', 'test'),
                               as_supervised=True)

Nsamples = int(1e3)
Nwords = 5  # correlation words distance
Nraw = int(1e3)
# Ndim = int(1e4)

train_iter = train.__iter__()
sentences = []
for i in range(Nsamples):
    x, y = train_iter.get_next()
    sentences.append(x.numpy().decode('utf-8'))

tokenizer = Tokenizer(num_words=Nraw, oov_token='<OOV>')
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_data, valid_data = imdb['train'], imdb['test']

train_sentences = []
train_labels = []
for s, l in train_data:
    train_sentences.append(str(s.numpy()))
    train_labels.append(int(l.numpy()))

valid_sentences = []
valid_labels = []
for s, l in valid_data:
    valid_sentences.append(str(s.numpy()))
    valid_labels.append(int(l.numpy()))

train_labels = np.array(train_labels)
valid_labels = np.array(valid_labels)

vocab_size = 1000
oov_token = '<OOV>'
trun_type = 'post'
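# The snippet ends before tokenization; a minimal sketch of a typical
# continuation using the constants above (the maxlen value and the new variable
# names are assumptions for illustration):
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=120, truncating=trun_type)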
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt

from position2encoding import *

examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en',
                               with_info=True,
                               as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for _, en in train_examples), target_vocab_size=2**13
)
tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, _ in train_examples), target_vocab_size=2**13
)

sample_string = "Transformer is awesome."
tokenized_string = tokenizer_en.encode(sample_string)
print("Tokenized string is {}".format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print("The original string: {}".format(original_string))

assert original_string == sample_string

for ts in tokenized_string:
# =================================================== #
import numpy as np
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.applications import VGG16

dataset_name = 'cats_vs_dogs'
train_dataset = tfds.load(name=dataset_name, split='train[:80%]')
valid_dataset = tfds.load(name=dataset_name, split='train[80%:]')


def preprocess(data):
    x = data['image']
    y = data['label']
    # Normalize the image
    x = x / 255
    # Resize to (224, 224)
    x = tf.image.resize(x, size=(224, 224))
    return x, y


def solution_model():
    batch_size = 32
    train_data = train_dataset.map(preprocess).batch(batch_size)
inputs = [
    "Join'd to th' Ionians with their flowing robes,",  # Label: 1
    "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
    "And with loud clangor of his arms he fell.",  # Label: 0
]

predicted_scores = export_model.predict(inputs)
predicted_labels = tf.argmax(predicted_scores, axis=1)

for input, label in zip(inputs, predicted_labels):
    print("Question: ", input)
    print("Predicted label: ", label.numpy())

# Downloading more datasets using TensorFlow Datasets (TFDS)
train_ds = tfds.load("imdb_reviews",
                     split="train",
                     batch_size=BATCH_SIZE,
                     shuffle_files=True,
                     as_supervised=True)
val_ds = tfds.load("imdb_reviews",
                   split="train",
                   batch_size=BATCH_SIZE,
                   shuffle_files=True,
                   as_supervised=True)

for review_batch, label_batch in val_ds.take(1):
    for i in range(5):
        print("Review: ", review_batch[i].numpy())
        print("Label: ", label_batch[i].numpy())

vectorize_layer = TextVectorization(max_tokens=VOCAB_SIZE,
predicted_label = np.argmax(predictions_array)
if predicted_label == true_label:
    color = 'green'
else:
    color = 'red'

ax.set_xlabel("{} {:2.0f}% ({})".format(class_names[predicted_label],
                                        100 * np.max(predictions_array),
                                        class_names[true_label]),
              color=color)

(raw_test, ), metadata = tfds.load(
    'cats_vs_dogs',
    split=config.TEST_SPLIT,
    with_info=True,
    as_supervised=True,
)

test = raw_test.map(config.img_to_model_input)
test_batches = test.batch(1)

interpreter = tf.lite.Interpreter(model_path=config.TFLITE_MODEL_PATH)
interpreter.allocate_tensors()
input_index = interpreter.get_input_details()[0]["index"]
output_index = interpreter.get_output_details()[0]["index"]

predictions = []
test_labels, test_imgs = [], []
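# The prediction loop is cut off above; a minimal sketch of running the TFLite
# interpreter over the test batches (the number of batches taken and the use of
# the three lists are assumptions for illustration):
for img, label in test_batches.take(10):
    interpreter.set_tensor(input_index, img.numpy())
    interpreter.invoke()
    predictions.append(interpreter.get_tensor(output_index))
    test_labels.append(label.numpy())
    test_imgs.append(img)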
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import imageio
import glob
import os

BUFFER_SIZE = 50000
BATCH_SIZE = 128
LR = 2e-4
BETA1 = 0.5
EPOCHS = 100
NOISE_DIM = 100
NUM_FAKE_IMAGES = 16

cifar, info = tfds.load('cifar10', with_info=True, as_supervised=True)
train_data, test_data = cifar['train'], cifar['test']

train_data = train_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
test_data = test_data.batch(BATCH_SIZE)


class Generator(tf.keras.Model):
    def __init__(self):
        super(Generator, self).__init__()
        self.model = tf.keras.Sequential([
            tf.keras.layers.Dense(4 * 4 * 512, use_bias=False, input_shape=(100,)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Reshape((4, 4, 512)),
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # suppress AVX, CUDA and similar warnings

# Imports
import tensorflow as tf
import tensorflow_datasets as tfds

# Other libraries
import matplotlib.pyplot as plt  # a 2D plotting library
import numpy as np               # numerical computing extension, including math functions
import math                      # math library

# print(tf.__version__)

# 2. Download the data, both the training and the test sets
dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
train_dataset, test_dataset = dataset['train'], dataset['test']
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# 3. Data preprocessing
num_train_examples = metadata.splits['train'].num_examples  # number of training examples
num_test_examples = metadata.splits['test'].num_examples    # number of test examples; the data is stored under C:\Users\xx\tensorflow_datasets\fashion_mnist\3.0.0
print("Number of training examples: {}".format(num_train_examples))  # 60000
print("Number of test examples: {}".format(num_test_examples))       # 10000

# 4. Normalization
# Custom normalization function
def normalize(images, labels):
    images = tf.cast(images, tf.float32)
    images /= 255
from itertools import combinations
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

tf.enable_v2_behavior()


def get_train_partition(data, split):
    return data[0:int(split * len(data))]


def get_test_partition(data, split):
    return data[int(split * len(data)):len(data)]


ds_train, ds_info = tfds.load(
    'iris',
    split=['train'],
    shuffle_files=False,
    as_supervised=True,
    with_info=True,
)

ds_numpy = tfds.as_numpy(ds_train)
profile_features = []
labels = []
for ex in ds_numpy[0]:
    profile_features.append(ex[0])
    labels.append(ex[1])

print("dataset size:", len(labels))

"""## Limited Data Experiments"""

print("begin experiment")
<img src="https://tensorflow.org/images/fashion-mnist-sprite.png" alt="Fashion MNIST sprite" width="600">
</td></tr>
<tr><td align="center">
<b>Figure 1.</b> <a href="https://github.com/zalandoresearch/fashion-mnist">Fashion-MNIST samples</a> (by Zalando, MIT License).<br/>
</td></tr>
</table>

Fashion MNIST is intended as a drop-in replacement for the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset—often used as the "Hello, World" of machine learning programs for computer vision. The MNIST dataset contains images of handwritten digits (0, 1, 2, etc) in an identical format to the articles of clothing we'll use here.

This guide uses Fashion MNIST for variety, and because it's a slightly more challenging problem than regular MNIST. Both datasets are relatively small and are used to verify that an algorithm works as expected. They're good starting points to test and debug code.

We will use 60,000 images to train the network and 10,000 images to evaluate how accurately the network learned to classify images. You can access the Fashion MNIST directly from TensorFlow, using the [Datasets](https://www.tensorflow.org/datasets) API:
"""

dataset, metadata = tfds.load('fashion_mnist', as_supervised=True, with_info=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

"""Loading the dataset returns metadata as well as a *training dataset* and *test dataset*.

* The model is trained using `train_dataset`.
* The model is tested against `test_dataset`.

The images are 28 $\times$ 28 arrays, with pixel values in the range `[0, 255]`. The *labels* are an array of integers, in the range `[0, 9]`. These correspond to the *class* of clothing the image represents:

<table>
  <tr>
    <th>Label</th>
    <th>Class</th>
  </tr>
  <tr>
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from bs4 import BeautifulSoup
import string

imdb_sentences = []
imdb_train = tfds.as_numpy(tfds.load('imdb_reviews', split='train'))

# for item in imdb_train:
#     imdb_sentences.append(str(item['text']))
# tokenizer = Tokenizer(num_words=5000)
# tokenizer.fit_on_texts(imdb_sentences)
# sequences = tokenizer.texts_to_sequences(imdb_sentences)
# word_index = tokenizer.word_index
# print(word_index)

# most of the words in the index are stopwords and html tags
stopwords = ['a', ..., 'yourselves']
print(stopwords)

table = str.maketrans('', '', string.punctuation)
print(table)

for item in imdb_train:
    sentence = str(item['text'].decode('UTF-8').lower())
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()