def motif_discovery_raw(train_file, test_file):
    subset_size = 690 * 190
    x_shape = len(range(101))
    train_gen = gen_from_fasta(train_file, None)
    test_gen = gen_from_fasta(test_file, None)

    # datasets
    batch_size = 512
    prefetch = tf.data.experimental.AUTOTUNE
    output_shapes = ((), ())
    output_types = (tf.string, tf.float32)
    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    # takes about 30 seconds to skip the training data
    val_ds = train_ds.skip(subset_size).take(690 * 10).map(vectorize_text)
    train_ds = train_ds.take(subset_size).shuffle(500).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)
    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.take(subset_size).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)

    x_val, y_val = [], []
    for d in val_ds:
        x_val.append(d[0])
        y_val.append(d[1])
    x_val = tf.convert_to_tensor(x_val)
    y_val = tf.convert_to_tensor(y_val)
    validation_data = (x_val, y_val)

    return x_shape, train_ds, validation_data, test_ds

def run(data_path, image_size=160, epochs=10, batch_size=32,
        learning_rate=0.0001, output='model', dataset=None):
    img_shape = (image_size, image_size, 3)

    info('Loading Data Set')
    # load dataset
    train, test, val, labels = load_dataset(data_path, dataset)

    # training data
    train_data, train_labels = zip(*train)
    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
                            Dataset.from_tensor_slices(list(train_labels))))
    train_ds = train_ds.map(map_func=process_image, num_parallel_calls=5)
    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())
    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=5)
    train_ds = train_ds.repeat()

    # model
    info('Creating Model')
    base_model = tf.keras.applications.ResNet50(input_shape=img_shape,
                                                include_top=False,
                                                weights='imagenet')
    base_model.trainable = True
    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    # training
    info('Training')
    steps_per_epoch = math.ceil(len(train) / batch_size)
    history = model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)

    # save model
    info('Saving Model')

    # check existence of base model folder
    output = check_dir(output)

    print('Serializing into saved_model format')
    tf.saved_model.save(model, str(output))

    # add time prefix folder
    #stamp = datetime.now().strftime('%y_%m_%d_%H_%M.h5')
    #stamped = str(Path(output).joinpath(stamp))
    file_output = str(Path(output).joinpath('latest.h5'))
    #print('Serializing model to:\n{}\n{}'.format(stamped, output))
    model.save(file_output)

def make_tf_dataset(file_path='', batch_size=10):
    loaded_data = np.load(file_path)
    X_train = loaded_data['X_train']
    X_test = loaded_data['X_test']
    Y_train = loaded_data['Y_train']
    Y_test = loaded_data['Y_test']
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, flush=True)

    X_train = tf.cast(X_train, tf.float32)
    X_test = tf.cast(X_test, tf.float32)
    Y_train = tf.cast(Y_train, tf.int32)
    Y_test = tf.cast(Y_test, tf.int32)

    train_dat = Dataset.from_tensor_slices((X_train, Y_train))
    train_dat = train_dat.batch(batch_size)
    test_dat = Dataset.from_tensor_slices((X_test, Y_test))
    test_dat = test_dat.batch(batch_size)

    data_dict = {}
    iterator = Iterator.from_structure(train_dat.output_types,
                                       train_dat.output_shapes)
    data_dict['iterator'] = iterator
    data_dict['train_it_init'] = iterator.make_initializer(train_dat)
    data_dict['test_it_init'] = iterator.make_initializer(test_dat)
    #data_dict['train_it'] = train_iterator
    #data_dict['test_it'] = test_iterator
    #data_dict['train_it_init'] = train_iterator.initializer
    #data_dict['test_it_init'] = test_iterator.initializer
    return data_dict

def main():
    dataset_count = 10

    def create_dataset(i):
        return Dataset.range(4 * i, 4 * (i + 1))

    dataset = Dataset.range(dataset_count).map(create_dataset)
    for d in dataset:
        show_dataset(d)

    d = dataset.flat_map(lambda x: x)
    show_dataset(d)

    d = dataset.interleave(lambda x: x, cycle_length=2, block_length=3)
    show_dataset(d)

    # Repeat two datasets of different lengths and interleave them.
    a = Dataset.from_tensor_slices(np.arange(10)).repeat()
    b = Dataset.from_tensor_slices(100 + np.arange(17)).repeat()
    datasets = [a, b]
    n = len(datasets)
    c = Dataset.from_tensor_slices(datasets)
    d = c.interleave(lambda x: x, cycle_length=n).take(50)
    show_dataset(d)

def prepare_train_generator(self):
    image_names = glob.glob(self.dir_name + "/training_data/images/images/*.jpg")
    image_names.extend(
        glob.glob(self.dir_name + "/training_data/images/images/*.png"))
    image_names.extend(
        glob.glob(self.dir_name + "/training_data/images/images/*.bmp"))
    image_names.extend(
        glob.glob(self.dir_name + "/training_data/images/images/*.tif"))
    sample_img = cv2.imread(image_names[0])
    target_shape = (sample_img.shape[0], sample_img.shape[1])
    crop_generator = CropGenerator(self.dir_name, target_shape)

    #image_dataset = tf.data.Dataset.list_files(self.dir_name + '/training_data/images/images/*')
    total_dataset = Dataset.range(1, 8).interleave(
        lambda x: Dataset.from_generator(
            CropGenerator(self.dir_name, target_shape),
            output_types=(tf.float32, tf.float32)),
        cycle_length=8)
    total_dataset = total_dataset.shuffle(buffer_size=20)
    #total_dataset = total_dataset.cache("./data_cache.")
    total_dataset = total_dataset.repeat()
    total_dataset = total_dataset.prefetch(buffer_size=20)

    data_tf = total_dataset.make_one_shot_iterator().get_next()
    return data_tf, crop_generator()

def test_input_fn(x_test, y_test, batch_size):
    if y_test is None:
        ds = tds.from_tensor_slices({'input-features': x_test})
    else:
        ds = tds.from_tensor_slices(
            ({'input-features': x_test}, y_test.reshape(-1, 1)))
    return ds.batch(batch_size)

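# Usage sketch for test_input_fn (not part of the original source). It assumes
# `tds` is this module's alias for tf.data.Dataset and that the consuming model
# uses a feature column named 'input-features'.
import numpy as np

x_demo = np.random.rand(10, 4).astype(np.float32)
y_demo = np.random.randint(0, 2, size=(10,))

demo_ds = test_input_fn(x_demo, y_demo, batch_size=4)
for features, labels in demo_ds.take(1):
    print(features['input-features'].shape, labels.shape)
# With a tf.estimator, the same function is typically passed as:
#   estimator.predict(input_fn=lambda: test_input_fn(x_demo, None, 4))
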
def getData(mypath, config):
    # get list of filepaths
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    data_dict = [mypath + "\\" + s for s in onlyfiles]

    # create numpy datasets for each stock
    data = []
    for fname in data_dict:
        data.append(
            DataLoader(fname,
                       window=config.experiment.window,
                       threshold=config.experiment.threshold))

    # initialize numpy arrays for training and test data
    X_train = data[0].X_train_std
    Y_train = data[0].Y_train
    X_val = data[0].X_val_std
    Y_val = data[0].Y_val
    X_test = data[0].X_test_std
    Y_test = data[0].Y_test

    # add other stocks to previously initialized numpy arrays
    for i in range(1, len(data)):
        X_train = np.concatenate((X_train, data[i].X_train_std), axis=0)
        Y_train = np.concatenate((Y_train, data[i].Y_train), axis=0)
        X_val = np.concatenate((X_val, data[i].X_val_std), axis=0)
        Y_val = np.concatenate((Y_val, data[i].Y_val), axis=0)
        X_test = np.concatenate((X_test, data[i].X_test_std), axis=0)
        Y_test = np.concatenate((Y_test, data[i].Y_test), axis=0)

    # Save number of features and samples
    num_train_samples = X_train.shape[0]
    num_val_samples = X_val.shape[0]
    num_test_samples = X_test.shape[0]
    num_train_features = X_train.shape[1]

    # Generate TF dataset for Keras model
    logging.info('------Final Training and Test Datasets------')
    logging.info('Size of X_Train: %s', X_train.shape)
    logging.info('Size of Y_Train: %s', Y_train.shape)
    logging.info('Size of X_val: %s', X_val.shape)
    logging.info('Size of Y_val: %s', Y_val.shape)
    logging.info('Size of X_Test: %s', X_test.shape)
    logging.info('Size of Y_Test: %s', Y_test.shape)

    train_dataset = Dataset.from_tensor_slices((X_train, Y_train))
    train_dataset = train_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()
    val_dataset = Dataset.from_tensor_slices((X_val, Y_val))
    val_dataset = val_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()
    test_dataset = Dataset.from_tensor_slices((X_test, Y_test))
    test_dataset = test_dataset.shuffle(config.model.shuffle).batch(
        config.model.batch_size).repeat()

    return train_dataset, val_dataset, test_dataset, num_train_features, \
        num_train_samples, num_val_samples, num_test_samples

def batch_and_repeat(ds: Dataset, batch_size: int, shuffle: bool,
                     repeat: bool) -> Dataset:
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    if shuffle:
        ds = ds.shuffle(1024, seed=SEED)
    if repeat:
        ds = ds.repeat()
    if batch_size > 0:
        ds = ds.batch(batch_size, drop_remainder=False)
    return ds

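# Usage sketch for batch_and_repeat (not part of the original source). It
# assumes the module-level constants AUTOTUNE and SEED that the function
# references (e.g. AUTOTUNE = tf.data.experimental.AUTOTUNE, SEED = 42).
# A repeated dataset is normally consumed with an explicit steps_per_epoch.
import numpy as np
import tensorflow as tf

x = np.random.rand(100, 8).astype(np.float32)
y = np.random.randint(0, 2, size=(100,)).astype(np.int32)
base_ds = tf.data.Dataset.from_tensor_slices((x, y))

train_ds = batch_and_repeat(base_ds, batch_size=32, shuffle=True, repeat=True)
for features, labels in train_ds.take(2):
    print(features.shape, labels.shape)
# model.fit(train_ds, epochs=3, steps_per_epoch=100 // 32)  # hypothetical model
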
def train_test_split(data: Dataset, test_cnt: int) -> Tuple[Dataset, Dataset]:
    values, labels = next(
        data.as_numpy_iterator())  # both numpy arrays of the same length
    permuted_idxs: np.ndarray = np.random.permutation(len(values))
    pvals, plabs = (values[permuted_idxs], labels[permuted_idxs])
    train: tf.data.Dataset = Dataset.from_tensors(
        (pvals[test_cnt:], plabs[test_cnt:]))
    test: tf.data.Dataset = Dataset.from_tensors(
        (pvals[:test_cnt], plabs[:test_cnt]))
    return (train, test)

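# Usage sketch for the train_test_split above (not part of the original
# source). Dataset.from_tensors wraps each split as a single element holding
# the full (values, labels) arrays; unbatch() turns it back into per-example
# elements if that is what downstream code expects.
import numpy as np
import tensorflow as tf

values = np.random.rand(20, 3).astype(np.float32)
labels = np.random.randint(0, 2, size=(20,))
data = tf.data.Dataset.from_tensors((values, labels))

train_split, test_split = train_test_split(data, test_cnt=5)
train_examples = train_split.unbatch()  # 15 per-example elements
test_examples = test_split.unbatch()    # 5 per-example elements
print(sum(1 for _ in train_examples), sum(1 for _ in test_examples))
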
def train(self,
          train_dataset: Dataset,
          valid_dataset: Dataset = None,
          batch_size: int = 256,
          epochs: int = 16,
          checkpoints_path: Path = None):
    print("Training model...")
    ckpt = None
    manager = None
    if checkpoints_path is not None:
        checkpoints_path.mkdir(parents=True, exist_ok=True)
        ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                                   optimizer=self.optimizer,
                                   net=self.network)
        manager = tf.train.CheckpointManager(ckpt, checkpoints_path,
                                             max_to_keep=3)
        ckpt.restore(manager.latest_checkpoint)
        if manager.latest_checkpoint:
            print(f"Restored from {manager.latest_checkpoint}")
        else:
            print("Initializing from scratch.")

    # Batch the datasets
    train_dataset = train_dataset.shuffle(1024).batch(batch_size).prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)
    if valid_dataset is not None:
        valid_dataset = valid_dataset.batch(batch_size)

    # Start training the model.
    for epoch in range(1, epochs + 1):
        for images, labels in train_dataset:
            self._train_step(images, labels)

        if valid_dataset is not None:
            for valid_images, valid_labels in valid_dataset:
                self._test_step(valid_images, valid_labels)

        if checkpoints_path is not None:
            ckpt.step.assign_add(1)
            if int(ckpt.step) % 10 == 0:
                save_path = manager.save()
                print(f"💾 Saved checkpoint for step {int(ckpt.step)}: {save_path}")

        print(f"Epoch {epoch}, "
              f"Loss: {self.train_loss.result()}, Accuracy: {self.train_accuracy.result() * 100}, "
              f"Valid Loss: {self.test_loss.result()}, Valid Accuracy: {self.test_accuracy.result() * 100}")

    # Save the model.
    self.network.trainable = False
    self.network.save(self.save_path)

def make_generator(src_dir, valid_rate, input_size, batch_size):
    # Create the ImageDataGenerator instance
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       rotation_range=30,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=30,
                                       zoom_range=[0.7, 1.3],
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       validation_split=valid_rate)

    # Build the generator for the training subset
    train_generator = train_datagen.flow_from_directory(
        directory=src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')

    # Build the generator for the validation subset
    valid_generator = train_datagen.flow_from_directory(
        directory=src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap the training generator in a tf.data.Dataset
    train_ds = Dataset.from_generator(
        lambda: train_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *train_generator.image_shape],
                       [None, train_generator.num_classes]))

    # Wrap the validation generator in a tf.data.Dataset
    valid_ds = Dataset.from_generator(
        lambda: valid_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *valid_generator.image_shape],
                       [None, valid_generator.num_classes]))

    # Repeat both Datasets indefinitely
    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    return train_ds, train_generator.n, valid_ds, valid_generator.n

def create_final_datasets(X_train, X_valid, y_train, y_valid):
    train = Dataset.from_tensor_slices(
        (X_train, (y_train.identity_hate.values, y_train.insult.values,
                   y_train.obscene.values, y_train.severe_toxic.values,
                   y_train.threat.values, y_train.toxic.values))).map(
                       custom_loss.preprocess_sample).batch(
                           config.BATCH_SIZE).repeat()
    valid = Dataset.from_tensor_slices(
        (X_valid, (y_valid.identity_hate.values, y_valid.insult.values,
                   y_valid.obscene.values, y_valid.severe_toxic.values,
                   y_valid.threat.values, y_valid.toxic.values))).map(
                       custom_loss.preprocess_sample).batch(
                           config.BATCH_SIZE).repeat()
    return train, valid

def train_sa_bilstm(pad_to, lstm_hidden, da, r, lr, loss, savefigto):
    (train_x, train_y), (val_x, val_y), (test_x, test_y) = \
        load_ESOL('data/ESOL-solubility.csv',
                  'data/mol2vec_model_300dim.pkl',
                  pad_to=pad_to)
    _, _, vector_size = train_x.shape
    model = build_sa_bilstm_model(pad_to=pad_to,
                                  vector_size=vector_size,
                                  lstm_hidden=lstm_hidden,
                                  da=da,
                                  r=r)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                  loss=loss,
                  metrics=['mse'])
    print(model.summary())
    print(train_x.shape, train_y.shape)

    train_dataset = Dataset.from_tensor_slices(
        (train_x, train_y)).shuffle(buffer_size=128).batch(64, drop_remainder=True)
    val_dataset = Dataset.from_tensor_slices(
        (val_x, val_y)).batch(32, drop_remainder=True)
    test_dataset = Dataset.from_tensor_slices(
        (test_x, test_y)).batch(32, drop_remainder=True)

    # This eats huge HD space!
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir='./logs',
                                                       histogram_freq=1,
                                                       update_freq='batch')
    earlystop_callback = keras.callbacks.EarlyStopping(patience=10)
    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        f'./checkpoints/model-sa-bilstm-{pad_to}-{lstm_hidden}-{da}-{r}-{lr}-{loss}.ckpt',
        save_best_only=True)
    model.fit(train_dataset,
              epochs=100,
              validation_data=val_dataset,
              callbacks=[
                  tensorboard_callback, earlystop_callback, checkpoint_callback
              ])

    # De-normalize predictions and targets (std = 2.0965, mean = -3.058)
    predict = np.array(model.predict(test_x)).ravel() * 2.0965 - 3.058
    truth = np.array(test_y).ravel() * 2.0965 - 3.058
    plt.figure(figsize=(5, 5))
    plt.scatter(predict, truth)
    plt.plot([-8, 0], [-8, 0], 'r--')
    plt.axis([-8, 0, -8, 0])
    plt.xlabel("Prediction")
    plt.ylabel("Groundtruth")
    MSE = ((predict - truth)**2).mean()
    plt.title(f"MSE = {MSE:.3f}")
    plt.savefig(
        Path(savefigto) /
        f'solubility_sa_bilstm-{pad_to}-{lstm_hidden}-{da}-{r}-{lr}-{loss}-{MSE:.4f}.png'
    )
    plt.close()

def simple_dataset(config):
    batch_size = config["batch_size"]
    x_train, y_train = linear_dataset(size=NUM_TRAIN_SAMPLES)
    x_test, y_test = linear_dataset(size=NUM_TEST_SAMPLES)

    train_dataset = Dataset.from_tensor_slices((x_train, y_train))
    test_dataset = Dataset.from_tensor_slices((x_test, y_test))
    train_dataset = train_dataset.shuffle(NUM_TRAIN_SAMPLES).repeat().batch(
        batch_size)
    test_dataset = test_dataset.repeat().batch(batch_size)
    return train_dataset, test_dataset

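# Usage sketch for simple_dataset (not part of the original source). Both
# returned datasets repeat indefinitely, so steps_per_epoch / validation_steps
# must be given to fit(). linear_dataset, NUM_TRAIN_SAMPLES and
# NUM_TEST_SAMPLES are assumed to be defined in this module, and the tiny
# Keras model below (assuming a single scalar feature) is a hypothetical
# stand-in.
import tensorflow as tf

demo_config = {"batch_size": 32}
train_ds, test_ds = simple_dataset(demo_config)

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.compile(optimizer="sgd", loss="mse")
model.fit(train_ds,
          epochs=2,
          steps_per_epoch=NUM_TRAIN_SAMPLES // demo_config["batch_size"],
          validation_data=test_ds,
          validation_steps=NUM_TEST_SAMPLES // demo_config["batch_size"])
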
def make_generator(src_dir, valid_rate, input_size, batch_size):
    '''Create Dataset generators.

    The data flows directory -> generator -> Dataset.
    The directory names under src_dir automatically become the class names
    (flow_from_directory). Adapt the ImageDataGenerator parameters to the
    target data (e.g. road signs should not be flipped vertically, etc.).
    '''
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       rotation_range=30,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=30,
                                       zoom_range=[0.7, 1.3],
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       validation_split=valid_rate)

    # Build the data generators automatically from the directory structure and names
    train_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')
    valid_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    train_ds = Dataset.from_generator(
        lambda: train_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *train_generator.image_shape],
                       [None, train_generator.num_classes]))
    valid_ds = Dataset.from_generator(
        lambda: valid_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *valid_generator.image_shape],
                       [None, valid_generator.num_classes]))

    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    cls_info = {v: k for k, v in train_generator.class_indices.items()}
    return train_ds, train_generator.n, valid_ds, valid_generator.n, cls_info

def _input_fn(directory, config, mode):
    print("Fetching {} data...".format(mode))
    all_features = []
    all_labels = []

    if config["cloud"] == 0:
        all_files = os.listdir(directory)
        for file in all_files:
            features, labels = _load_json_file(os.path.join(directory, file),
                                               config)
            all_features += features
            all_labels += labels
    else:
        s = sagemaker.Session()
        all_files = s.list_s3_files(config["bucket"], directory)
        for file in all_files[1:]:
            features, labels = _load_json_file(
                s.read_s3_file(config["bucket"], file), config)
            all_features += features
            all_labels += labels

    num_data_points = len(all_features)
    num_batches = math.ceil(len(all_features) / config["batch_size"])

    dataset = Dataset.from_tensor_slices((all_features, all_labels))
    if mode == "train":
        dataset = dataset.batch(config["batch_size"]).shuffle(
            10000, seed=12345).repeat(config["num_epoch"])
    if mode in ("validation", "eval"):
        dataset = dataset.batch(config["batch_size"]).repeat(
            config["num_epoch"])

    iterator = dataset.make_one_shot_iterator()
    dataset_features, dataset_labels = iterator.get_next()
    return [{
        config["input_tensor_name"]: dataset_features
    }, dataset_labels, {
        "num_data_point": num_data_points,
        "num_batches": num_batches
    }]

def make_generator(src_dir, valid_rate, input_size, batch_size):
    # Create the ImageDataGenerator instance
    train_datagen = ImageDataGenerator(rescale=1 / 255,
                                       validation_split=valid_rate)

    # Build the generator that reads the training data
    # --- 250 * (1 - 0.2) = 200
    train_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')

    # Build the generator that reads the validation data
    # --- 250 * 0.2 = 50
    valid_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap the training generator in a tf.data.Dataset
    train_ds = Dataset.from_generator(
        lambda: train_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *train_generator.image_shape],
                       [None, train_generator.num_classes]))

    # Wrap the validation generator in a tf.data.Dataset
    valid_ds = Dataset.from_generator(
        lambda: valid_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *valid_generator.image_shape],
                       [None, valid_generator.num_classes]))

    # Repeat both Datasets indefinitely
    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    return train_ds, train_generator.n, valid_ds, valid_generator.n

def generate_tf_data(enc_input: list, dec_input: list, batch_size: int,
                     train_size: float, val_size: float) -> [Dataset]:
    '''Generates a TensorFlow dataset and splits it into train and validation sets.

    Problem: Feeding in three arrays containing almost two million sequences
    each requires too much main memory.
    Solution: We use a tf.data.Dataset, so the model can be fed with slices of
    the whole dataset. Also shuffles the observations.

    Args:
        enc_input: encoder input ids, token ids for each word and each sentence.
        dec_input: used for teacher forcing. Token ids for each word and each
            sentence in the target language. More specifically:
            - decoder input token sequences (index 0 in dec_input)
            - decoder target output token sequences (for teacher forcing,
              index 1 in dec_input)
        batch_size: number of observations passed to the Seq2Seq model during
            training.
        train_size: fraction of all observations reserved for training the model.
        val_size: fraction of all observations reserved for evaluating the model
            during training.

    Returns:
        train_data: encoder_input, decoder_input and decoder_target_output for
            training the model.
        val_data: encoder_input, decoder_input and decoder_target_output for
            evaluating the model.
    '''
    assert train_size + val_size == 1, "Train and validation size don't sum up to 1!"
    data_size = enc_input[0].shape[0]

    # Summarize the source-language token ids and the decoder input as: model_input
    model_input = Dataset.from_tensor_slices((enc_input[0], dec_input[0]))
    # enc_token_ids, dec_token_ids

    # Convert decoder_target_output to a tf.data.Dataset
    decoder_target_output = Dataset.from_tensor_slices((dec_input[1]))
    # dec_token_ids used as target output (shifted by one position)

    # Combine model_input and decoder_target_output into one dataset, shuffle it
    full_data = Dataset.zip(
        (model_input, decoder_target_output)).shuffle(data_size)

    # Train / validation split
    train_size = int(train_size * data_size)
    val_size = int(val_size * data_size)
    train_data = full_data.take(train_size)
    val_data = full_data.skip(train_size)
    train_data = train_data.batch(batch_size, drop_remainder=True)
    val_data = val_data.batch(batch_size, drop_remainder=True)

    return train_data, val_data

def prepare_batch_datasets(x_train, y_train, batch_size):
    logger.info('Preparing train and validation datasets for batches...')

    # Reserve the required fraction of samples for validation
    num_val_samples = int(len(x_train) * VALIDATION_DATA_SPLIT)
    x_val = x_train[-num_val_samples:]
    y_val = y_train[-num_val_samples:]

    # Prepare the training dataset with shuffling
    train_dataset = Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

    # Prepare the validation dataset
    val_dataset = Dataset.from_tensor_slices((x_val, y_val))
    val_dataset = val_dataset.batch(batch_size)

    logger.info('Completed preparing train and validation datasets for batches.')
    return x_val, y_val, train_dataset, val_dataset

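# Usage sketch for prepare_batch_datasets (not part of the original source).
# VALIDATION_DATA_SPLIT is assumed to be a fraction such as 0.2, and `logger`
# a standard logging.Logger configured elsewhere in this module.
import numpy as np

x_train_demo = np.random.rand(1000, 20).astype(np.float32)
y_train_demo = np.random.randint(0, 2, size=(1000,)).astype(np.int32)

x_val, y_val, train_ds, val_ds = prepare_batch_datasets(
    x_train_demo, y_train_demo, batch_size=64)
print(len(x_val), len(y_val))  # e.g. 200 samples with a 0.2 split
# model.fit(train_ds, validation_data=val_ds, epochs=5)  # hypothetical model
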
def augment_ds(ds: Dataset, grayscale: bool) -> Dataset:
    if not grayscale:
        ds = ds.map(
            lambda x, y: (_random_hue_saturation_brightness_contrast(x), y),
            num_parallel_calls=AUTOTUNE,
        )
    if grayscale:
        ds = ds.map(lambda x, y: (_random_crop_mnist(x), y),
                    num_parallel_calls=AUTOTUNE)
    else:
        ds = ds.map(lambda x, y: (_random_crop_cifar(x), y),
                    num_parallel_calls=AUTOTUNE)
    ds = ds.map(lambda x, y: (_random_horizontal_flip(x), y),
                num_parallel_calls=AUTOTUNE)
    return ds

def ds_rndm() -> Tuple[Dataset, Dataset, int, int, int]:
    # Hardcoded values taken from MNIST
    num_classes = 10
    m_train = 60000
    m_test = 10000

    # Random noise
    ds_image = Dataset.from_tensor_slices(
        (tf.random_uniform([m_train, 28, 28, 1], maxval=255, dtype=tf.int32)))
    ds_label = Dataset.from_tensor_slices(
        (tf.random_uniform([m_train], maxval=9, dtype=tf.int64)))
    ds_train = Dataset.zip((ds_image, ds_label))
    ds_test = ds_train.take(m_test)
    return ds_train, ds_test, num_classes, m_train, m_test

def h3(file, word_size=3, region_size=0, expand=True):
    sequences, labels = read_fasta(file)
    test_size = 0.15
    val_size = 0.15
    split_options = dict(test_size=test_size,
                         stratify=labels,
                         random_state=3264)
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        sequences, labels, **split_options)
    # normalize val_size and update options
    split_options.update(
        dict(test_size=val_size / (1 - test_size), stratify=y_train_val))
    x_train, x_val, y_train, y_val = train_test_split(x_train_val,
                                                      y_train_val,
                                                      **split_options)
    del x_train_val, y_train_val

    encode_func = encode(word_size, region_size, expand=expand)
    x_shape = encoded_shape(sequences[0], word_size, region_size, expand=expand)
    train_gen = gen_from_arrays(x_train, y_train, encode_func)
    val_gen = gen_from_arrays(x_val, y_val, encode_func)
    test_gen = gen_from_arrays(x_test, y_test, encode_func)

    # datasets
    batch_size = 32
    prefetch = tf.data.experimental.AUTOTUNE
    output_shapes = (x_shape, ())
    output_types = (tf.float32, tf.float32)
    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    train_ds = train_ds.shuffle(500).batch(batch_size).prefetch(prefetch)
    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.batch(batch_size).prefetch(prefetch)

    x_val_encode, y_val_encode = [], []
    for x, y in val_gen():
        x_val_encode.append(x)
        y_val_encode.append(y)
    x_val_encode = np.array(x_val_encode)
    y_val_encode = np.array(y_val_encode)
    validation_data = (x_val_encode, y_val_encode)

    return x_shape, train_ds, validation_data, test_ds

def simple_test():
    image_path = [
        '/home/kamerider/Documents/DataBase/1610763/10000.png',
        '/home/kamerider/Documents/DataBase/1610763/10000.png',
        '/home/kamerider/Documents/DataBase/1610763/10000.png',
        '/home/kamerider/Documents/DataBase/1610763/10000.png',
        '/home/kamerider/Documents/DataBase/1610763/10000.png'
    ]
    # labels must have the same first dimension as image_path for from_tensor_slices
    label = np.array([1, 2, 1, 2, 1])
    data = np.random.uniform(size=(12, 3))

    image_path = convert_to_tensor(image_path, dtype=dtypes.string)
    label = convert_to_tensor(label, dtype=dtypes.int32)
    dataset = Dataset.from_tensor_slices((image_path, label))
    iterator = dataset.make_one_shot_iterator()
    one_element = iterator.get_next()

    with tf.Session() as sess:
        try:
            while True:
                result = sess.run(one_element)
                #print(result[0])
                image_string = tf.read_file(result[0])
                image_decode = tf.image.decode_png(image_string, channels=3)
                image_resize = tf.image.resize_images(image_decode, [64, 64])
                print(image_resize)
        except tf.errors.OutOfRangeError:
            print("end!")
    '''
    with tf.Session() as sess:
        for i in range(3):
            print(sess.run(one_element))
    '''

def train(examples, labels, features=None, lr=1e-4, steps=100, batch_size=1,
          model=None):
    '''Create and train a linear regression model.'''
    # Create datasets.
    if not features:
        features = examples.columns
    fcs = [tf.feature_column.numeric_column(feature) for feature in features]
    ds = Ds.from_tensor_slices(
        ({feature: examples[feature] for feature in features}, labels))

    opt = tf.contrib.estimator.clip_gradients_by_norm(
        tf.train.GradientDescentOptimizer(learning_rate=lr), 5.0)
    if not model:
        model = tf.estimator.LinearRegressor(fcs, optimizer=opt)

    for _ in range(10):
        model.train(train_fn(ds, batch_size=batch_size), steps=steps // 10)

    preds = model.predict(
        lambda: ds.batch(1).make_one_shot_iterator().get_next())
    predictions = np.hstack([pred['predictions'] for pred in preds])
    print("Mean squared error: ", mse(predictions, labels))
    return model

def load_data(self):
    data = GFile(self.file_path, 'rb').read().decode(encoding='UTF-8')

    # Get a list of the unique characters in the text
    vocab = list(sorted(set(data)))
    vocab_size = len(vocab)
    chars_to_ids = StringLookup(vocabulary=vocab)
    self.ids_to_chars_layer = StringLookup(
        vocabulary=chars_to_ids.get_vocabulary(), invert=True)

    # Split the entire text by character
    chars = unicode_split(data, 'UTF-8')
    ids_of_chars = chars_to_ids(chars)

    # Group characters to form sequences (+1 since the targets are shifted by one)
    sequences_ds = Dataset.from_tensor_slices(ids_of_chars)
    sequences_ds = sequences_ds.batch(C.SEQUENCE_LENGTH + 1)

    # Batch the sequences
    ds = sequences_ds.padded_batch(C.BATCH_SIZE)
    ds = ds.map(self._to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.shuffle(C.BUFFER_SIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

def load_validation_data(data_folder, batch_size):
    x_validation = np.load(data_folder + '/validation/images.npy')
    x_validation = np.add(x_validation, -127.5, dtype=np.float32) / 127.5
    y1_validation = np.load(data_folder + '/validation/class_labels.npy')
    y2_validation = np.load(data_folder + '/validation/bounding_box_labels.npy')
    y3_validation = np.load(data_folder + '/validation/landmark_labels.npy')
    return Dataset.from_tensor_slices(
        (x_validation,
         (y1_validation, y2_validation, y3_validation))).batch(
             batch_size, drop_remainder=True)

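# Usage sketch for load_validation_data (not part of the original source).
# It writes small dummy .npy files to a temporary folder so the call can be
# exercised end to end; the array shapes are illustrative only.
import os
import tempfile
import numpy as np

tmp = tempfile.mkdtemp()
os.makedirs(os.path.join(tmp, 'validation'), exist_ok=True)
np.save(os.path.join(tmp, 'validation/images.npy'),
        np.random.randint(0, 256, size=(16, 48, 48, 3), dtype=np.uint8))
np.save(os.path.join(tmp, 'validation/class_labels.npy'),
        np.random.randint(0, 2, size=(16, 1)))
np.save(os.path.join(tmp, 'validation/bounding_box_labels.npy'),
        np.random.rand(16, 4).astype(np.float32))
np.save(os.path.join(tmp, 'validation/landmark_labels.npy'),
        np.random.rand(16, 10).astype(np.float32))

val_ds = load_validation_data(tmp, batch_size=8)
for images, (y_class, y_bbox, y_landmark) in val_ds.take(1):
    print(images.shape, y_class.shape, y_bbox.shape, y_landmark.shape)
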
def eval_dataset(params: HParams, iterator: ner_data.Generator):
    """Evaluation input function for a tf.estimator."""
    data = Dataset.from_generator(iterator.generator(), iterator.datatypes(),
                                  iterator.datashape())
    data = data.padded_batch(params.batch_size, iterator.datashape())
    return data

def __init__(self, txt_file, mode, batch_size, img_size=227, buffer_size=1000):
    self.txt_file = txt_file

    # retrieve the data from the text file
    self._read_txt_file()

    self.img_size = img_size
    # number of samples in the dataset
    self.data_size = len(self.RSAs)
    self.batch_size = batch_size

    # convert lists to TF tensors
    self.img1_paths = convert_to_tensor(self.img1_paths, dtype=dtypes.string)
    self.img2_paths = convert_to_tensor(self.img2_paths, dtype=dtypes.string)
    self.RSAs = convert_to_tensor(self.RSAs, dtype=dtypes.float32)

    # create dataset
    data = Dataset.from_tensor_slices(
        (self.img1_paths, self.img2_paths, self.RSAs))
    data = data.map(self._parse_function_train)
    data = data.batch(batch_size)
    self.data = data

def load_and_format_images_for_fitting(folder, batch_size):
    all_images, all_image_labels = get_all_images(folder)
    ds = Dataset.from_tensor_slices((all_images, all_image_labels))
    ds = ds.shuffle(buffer_size=len(all_images))
    ds = ds.batch(batch_size)
    return ds

def validate(model, examples, labels, features=None):
    '''Check the mse on the validation set.'''
    if not features:
        features = examples.columns
    ds = Ds.from_tensor_slices(
        ({feature: examples[feature] for feature in features}, labels))
    predictions = get_predictions(model, ds)

    plt.figure()
    plt.subplot(1, 2, 1)
    plt.scatter(examples['longitude'], examples['latitude'],
                cmap='coolwarm', c=labels.iloc[:, 0])
    plt.subplot(1, 2, 2)
    plt.scatter(examples['longitude'], examples['latitude'],
                cmap='coolwarm', c=predictions)

    if "classifier" in str(type(model)).casefold():
        print("Validation log loss:", log_loss(labels, predictions))
    else:
        print("Validation mse:", mse(predictions, labels))
    return predictions

def train(self,
          checkpoints_path: Path,
          train_dataset: Dataset,
          valid_dataset: Dataset = None,
          batch_size: int = 256,
          epochs: int = 16):
    checkpoints_path.mkdir(parents=True, exist_ok=True)
    ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                               optimizer=self.optimizer,
                               net=self.network)
    manager = tf.train.CheckpointManager(ckpt, checkpoints_path, max_to_keep=3)
    ckpt.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        print(f"Restored from {manager.latest_checkpoint}")
    else:
        print("Initializing from scratch.")

    # Batch the datasets
    train_dataset = train_dataset.shuffle(1024).batch(batch_size).prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)
    if valid_dataset is not None:
        valid_dataset = valid_dataset.batch(batch_size)

    # Start training the model.
    for epoch in range(1, epochs + 1):
        for images, labels in train_dataset:
            self._train_step(images, labels)

        if valid_dataset is not None:
            for valid_images, valid_labels in valid_dataset:
                self._test_step(valid_images, valid_labels)

        ckpt.step.assign_add(1)
        if int(ckpt.step) % 10 == 0:
            save_path = manager.save()
            print(f"💾 Saved checkpoint for step {int(ckpt.step)}: {save_path}")

        print(f"Epoch {epoch}, "
              f"Loss: {self.train_loss.result()}, Accuracy: {self.train_accuracy.result() * 100}, "
              f"Valid Loss: {self.test_loss.result()}, Valid Accuracy: {self.test_accuracy.result() * 100}")

    # Save the model.
    self.network.trainable = False
    self.network.save(self.save_path)