def motif_discovery_raw(train_file, test_file):
    """Build raw-text train/validation/test tf.data pipelines for the
    motif-discovery task; sequences are vectorized via ``vectorize_text``
    inside the pipeline.

    Returns:
        (x_shape, train_ds, validation_data, test_ds) where
        validation_data is an in-memory (x_val, y_val) tensor pair.
    """
    subset_size = 690 * 190
    # Sequences are fixed 101-base windows (was the obscure len(range(101))).
    x_shape = 101
    train_gen = gen_from_fasta(train_file, None)
    test_gen = gen_from_fasta(test_file, None)

    # datasets
    batch_size = 512  # fixed typo: was "bacth_size"
    prefetch = tf.data.experimental.AUTOTUNE
    output_shapes = ((), ())
    output_types = (tf.string, tf.float32)
    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    # takes about 30 seconds to skip the training data
    val_ds = train_ds.skip(subset_size).take(690 * 10).map(vectorize_text)
    train_ds = train_ds.take(subset_size).shuffle(500).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)
    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.take(subset_size).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)

    # Materialize the validation split into tensors.
    x_val, y_val = [], []
    for x, y in val_ds:
        x_val.append(x)
        y_val.append(y)
    x_val = tf.convert_to_tensor(x_val)
    y_val = tf.convert_to_tensor(y_val)
    validation_data = (x_val, y_val)
    return x_shape, train_ds, validation_data, test_ds
def make_generator(src_dir, valid_rate, input_size, batch_size):
    """Create augmented training/validation Dataset generators.

    Flow: directory -> Keras generator -> tf.data.Dataset; sub-directory
    names under ``src_dir`` become the class names.

    Returns:
        (train_ds, n_train, valid_ds, n_valid)
    """
    # Augmenting generator; validation_split reserves a validation subset.
    train_datagen = ImageDataGenerator(
        rescale=1. / 255,
        rotation_range=30,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=30,
        # BUG FIX: was [0.7, 0.3] (lower bound > upper bound); the sibling
        # implementation in this file uses [0.7, 1.3].
        zoom_range=[0.7, 1.3],
        horizontal_flip=True,
        vertical_flip=True,
        validation_split=valid_rate)

    # Training-data generator.
    train_generator = train_datagen.flow_from_directory(
        directory=src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')
    # Validation-data generator.
    valid_generator = train_datagen.flow_from_directory(
        directory=src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap the Keras generators into tf.data Datasets with batched shapes:
    # [None, H, W, C] images and [None, n_classes] one-hot labels.
    train_ds = Dataset.from_generator(
        lambda: train_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *train_generator.image_shape],
                       [None, train_generator.num_classes]))
    valid_ds = Dataset.from_generator(
        lambda: valid_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *valid_generator.image_shape],
                       [None, valid_generator.num_classes]))

    # Repeat both datasets indefinitely (the caller bounds the steps).
    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    return train_ds, train_generator.n, valid_ds, valid_generator.n
def make_generator(src_dir, valid_rate, input_size, batch_size):
    '''Create training/validation Dataset generators.

    Flow: directory -> Keras generator -> tf.data.Dataset.
    Sub-directory names under ``src_dir`` automatically become the class
    names (via ``flow_from_directory``).  Tune the ImageDataGenerator
    augmentation parameters per target domain (e.g. road signs should not
    be flipped vertically, etc.).

    Returns:
        (train_ds, n_train, valid_ds, n_valid, cls_info) where ``cls_info``
        maps class index -> class name.
    '''
    train_datagen = ImageDataGenerator(rescale=1. / 255,
                                       rotation_range=30,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=30,
                                       zoom_range=[0.7, 1.3],
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       validation_split=valid_rate)

    # Build the data generators automatically from the directory structure
    # and names.
    train_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')
    valid_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap the Keras generators into tf.data Datasets with batched shapes:
    # [None, H, W, C] images and [None, n_classes] one-hot labels.
    train_ds = Dataset.from_generator(lambda: train_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *train_generator.image_shape
                                      ], [None, train_generator.num_classes]))
    valid_ds = Dataset.from_generator(lambda: valid_generator,
                                      output_types=(tf.float32, tf.float32),
                                      output_shapes=([
                                          None, *valid_generator.image_shape
                                      ], [None, valid_generator.num_classes]))

    # Repeat both datasets indefinitely (the caller bounds the steps).
    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    # Inverse of class_indices: class index -> class name.
    cls_info = {v: k for k, v in train_generator.class_indices.items()}
    return train_ds, train_generator.n, valid_ds, valid_generator.n, cls_info
def make_generator(src_dir, valid_rate, input_size, batch_size):
    """Create rescale-only training/validation Dataset generators.

    Example split with 250 images and ``valid_rate=0.2``:
    250 * (1 - 0.2) = 200 train, 250 * 0.2 = 50 validation.

    Returns:
        (train_ds, n_train, valid_ds, n_valid)
    """
    # ImageDataGenerator instance: rescale only, no augmentation.
    train_datagen = ImageDataGenerator(rescale=1 / 255,
                                       validation_split=valid_rate)

    # Training-data generator.
    train_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='training')
    # Validation-data generator.
    valid_generator = train_datagen.flow_from_directory(
        src_dir,
        target_size=input_size,
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical',
        subset='validation')

    # Wrap the Keras generators into tf.data Datasets (renamed the local
    # from the typo "trans_ds" to train_ds for consistency with siblings).
    train_ds = Dataset.from_generator(
        lambda: train_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *train_generator.image_shape],
                       [None, train_generator.num_classes]))
    valid_ds = Dataset.from_generator(
        lambda: valid_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, *valid_generator.image_shape],
                       [None, valid_generator.num_classes]))

    # BUG FIX: the original called ``trans_ds.repeat()`` twice and never
    # repeated the validation dataset, so validation data was exhausted
    # after one pass.  Repeat each dataset exactly once, as the sibling
    # implementations do.
    train_ds = train_ds.repeat()
    valid_ds = valid_ds.repeat()

    return train_ds, train_generator.n, valid_ds, valid_generator.n
def h3(file, word_size=3, region_size=0, expand=True):
    """Build train/validation/test pipelines for the H3 dataset with
    k-mer encoding.

    Sequences are read from ``file`` (FASTA), split 70/15/15 (stratified,
    fixed seed), encoded via ``encode(word_size, region_size, expand)``
    and wrapped into tf.data Datasets.  The validation split is
    materialized into numpy arrays.

    Returns:
        (x_shape, train_ds, (x_val, y_val), test_ds)
    """
    sequences, labels = read_fasta(file)
    test_size = 0.15
    val_size = 0.15
    split_options = dict(test_size=test_size,
                         stratify=labels,
                         random_state=3264)
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        sequences, labels, **split_options)
    # normalize val_size (it is a fraction of the remaining data) and
    # update options for the second split
    split_options.update(
        dict(test_size=val_size / (1 - test_size), stratify=y_train_val))
    x_train, x_val, y_train, y_val = train_test_split(x_train_val,
                                                      y_train_val,
                                                      **split_options)
    del x_train_val, y_train_val  # free the intermediates

    encode_func = encode(word_size, region_size, expand=expand)
    # Shape of one encoded sample, derived from the first sequence.
    x_shape = encoded_shape(sequences[0], word_size, region_size,
                            expand=expand)
    train_gen = gen_from_arrays(x_train, y_train, encode_func)
    val_gen = gen_from_arrays(x_val, y_val, encode_func)
    test_gen = gen_from_arrays(x_test, y_test, encode_func)

    # datasets
    batch_size = 32
    prefetch = tf.data.experimental.AUTOTUNE
    output_shapes = (x_shape, ())
    output_types = (tf.float32, tf.float32)
    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    train_ds = train_ds.shuffle(500).batch(batch_size).prefetch(prefetch)
    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.batch(batch_size).prefetch(prefetch)

    # Materialize the validation split into in-memory arrays (val_gen is
    # iterated directly, bypassing tf.data).
    x_val_encode, y_val_encode = [], []
    for x, y in val_gen():
        x_val_encode.append(x)
        y_val_encode.append(y)
    x_val_encode = np.array(x_val_encode)
    y_val_encode = np.array(y_val_encode)
    validation_data = (x_val_encode, y_val_encode)
    return x_shape, train_ds, validation_data, test_ds
def to_tf_dataset(self, dataset, shuffle):
    """Wrap an iterable detection dataset into a batched tf.data pipeline.

    Args:
        dataset: iterable yielding dicts with at least the keys "image",
            "bboxes" and "categories"; dtypes/shapes for those keys come
            from ``self.dtypes`` / ``self.shapes``.
        shuffle: whether to shuffle (only effective when
            ``cfg.shuffle_buffer_size > 0``).

    Returns:
        The mapped, optionally shuffled, padded-batched tf.data.Dataset.
    """
    cfg = self.cfg
    keys = {"image", "bboxes", "categories"}
    _dataset = Dataset.from_generator(
        lambda: iter(dataset),
        {k: v for k, v in self.dtypes.items() if k in keys},
        {k: v for k, v in self.shapes.items() if k in keys},
    )
    # add_features derives the extra training targets (kpt/ct/wh/offset),
    # parallelized across cfg.num_workers.
    _dataset = _dataset.map(self.add_features, cfg.num_workers)
    if shuffle and cfg.shuffle_buffer_size > 0:
        _dataset = _dataset.shuffle(
            buffer_size=cfg.shuffle_buffer_size,
            seed=cfg.seed,
            reshuffle_each_iteration=cfg.reshuffle_each_iteration,
        )
    if cfg.batch_size > 1:
        # Padded batch over the full feature set, using per-key pad values;
        # the global batch covers all GPUs (batch_size * n_gpus).
        keys = {
            "image", "bboxes", "categories", "kpt", "ct", "wh", "offset"
        }
        _dataset = _dataset.padded_batch(
            cfg.batch_size * cfg.n_gpus,
            {k: v for k, v in self.shapes.items() if k in keys},
            {k: v for k, v in self.pad_values.items() if k in keys},
            drop_remainder=True,
        )
    return _dataset
def eval_dataset(params: HParams, iterator: ner_data.Generator):
    """Evaluation input_fn for the tf estimator: padded batches, no
    shuffling or prefetch."""
    ds = Dataset.from_generator(iterator.generator(), iterator.datatypes(),
                                iterator.datashape())
    return ds.padded_batch(params.batch_size, iterator.datashape())
def prepare_train_generator(self):
    """Build the training input pipeline from interleaved crop generators.

    The target crop shape is probed from the first image found under
    ``training_data/images/images``; eight CropGenerator streams are
    interleaved, shuffled, repeated and prefetched.

    Returns:
        (data_tf, gen): data_tf is the one-shot iterator's get_next op;
        gen is the result of *calling* the sample CropGenerator.
        NOTE(review): returning ``crop_generator()`` (invoked) rather than
        the instance itself looks suspicious — confirm what the caller
        expects.
    """
    image_names = glob.glob(self.dir_name +
                            "/training_data/images/images/*.jpg")
    image_names.extend(
        glob.glob(self.dir_name + "/training_data/images/images/*.png"))
    image_names.extend(
        glob.glob(self.dir_name + "/training_data/images/images/*.bmp"))
    image_names.extend(
        glob.glob(self.dir_name + "/training_data/images/images/*.tif"))
    # Probe one image for the (height, width) target shape; raises
    # IndexError if no images were found.
    sample_img = cv2.imread(image_names[0])
    target_shape = (sample_img.shape[0], sample_img.shape[1])
    crop_generator = CropGenerator(self.dir_name, target_shape)
    #image_dataset = tf.data.Dataset.list_files(self.dir_name + '/training_data/images/images/*')
    # Interleave 8 independent generator streams for throughput.
    total_dataset = Dataset.range(1, 8).interleave(
        lambda x: Dataset.from_generator(
            CropGenerator(self.dir_name, target_shape),
            output_types=(tf.float32, tf.float32)),
        cycle_length=8)
    total_dataset = total_dataset.shuffle(buffer_size=20)
    #total_dataset = total_dataset.cache("./data_cache.")
    total_dataset = total_dataset.repeat()
    total_dataset = total_dataset.prefetch(buffer_size=20)
    # TF1-style one-shot iterator.
    data_tf = total_dataset.make_one_shot_iterator().get_next()
    return data_tf, crop_generator()
def get_dataset(self):
    """Return an endlessly repeating, batched and prefetched dataset built
    from ``self.image_generator``."""
    pipeline = (Dataset.from_generator(self.image_generator, tf.float32,
                                       self.output_shape)
                .repeat()
                .batch(self.batch_size)
                .prefetch(5))
    return pipeline
def h3_raw(file):
    """Build raw-text (unencoded) pipelines for the H3 dataset.

    Like ``h3`` but sequences stay as strings and are vectorized inside
    the tf.data pipeline via ``vectorize_text``.  Splits are 70/15/15,
    stratified, with a fixed seed.

    Returns:
        (x_shape, train_ds, (x_val, y_val), test_ds)
    """
    sequences, labels = read_fasta(file)
    test_size = 0.15
    val_size = 0.15
    split_options = dict(test_size=test_size,
                         stratify=labels,
                         random_state=3264)
    x_train_val, x_test, y_train_val, y_test = train_test_split(
        sequences, labels, **split_options)
    # normalize val_size and update options
    split_options.update(
        dict(test_size=val_size / (1 - test_size), stratify=y_train_val))
    x_train, x_val, y_train, y_val = train_test_split(x_train_val,
                                                      y_train_val,
                                                      **split_options)
    del x_train_val, y_train_val
    # Raw samples are plain strings; x_shape is the sequence length.
    x_shape = len(sequences[0])
    train_gen = gen_from_arrays(x_train, y_train, None)
    val_gen = gen_from_arrays(x_val, y_val, None)
    test_gen = gen_from_arrays(x_test, y_test, None)

    # datasets
    batch_size = 32
    prefetch = tf.data.experimental.AUTOTUNE
    output_shapes = ((), ())
    output_types = (tf.string, tf.float32)
    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    train_ds = train_ds.shuffle(500).batch(batch_size).map(
        vectorize_text).prefetch(prefetch)
    val_ds = Dataset.from_generator(val_gen, output_types, output_shapes)
    # NOTE(review): val_ds is mapped per-element (not batched), unlike the
    # train/test pipelines — confirm vectorize_text handles unbatched input.
    val_ds = val_ds.map(vectorize_text).prefetch(prefetch)
    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.batch(batch_size).map(vectorize_text).prefetch(prefetch)

    # Materialize the validation split into numpy arrays.
    x_val_encode, y_val_encode = [], []
    for x, y in val_ds:
        x_val_encode.append(x)
        y_val_encode.append(y)
    x_val_encode = np.array(x_val_encode)
    y_val_encode = np.array(y_val_encode)
    validation_data = (x_val_encode, y_val_encode)
    return x_shape, train_ds, validation_data, test_ds
def train_dataset(params: HParams, iterator: ner_data.Generator):
    """Training input_fn for the tf estimator: shuffled, padded-batched
    and prefetched."""
    ds = Dataset.from_generator(iterator.generator(), iterator.datatypes(),
                                iterator.datashape())
    ds = ds.shuffle(params.shuffle_buffer_size)
    ds = ds.padded_batch(params.batch_size, iterator.datashape())
    return ds.prefetch(None)
def make_submission_file(folder_name): ''' Creates a submission file ''' # creates a dataframe with images to test pairs = pd.read_csv('sample_submission.csv', usecols=['img_pair'], squeeze=True) num_pairs = pairs.size images = pairs.str.split(pat='-', expand=True) # load model """ model = tf.keras.models.load_model('Training Plots/{}/trained_model.h5'.format(folder_name), custom_objects={'L2Norm2Prob': models.L2Norm2Prob, 'probability_logistic_loss': losses.probability_logistic_loss, 'pos_prob': losses.pos_prob, 'neg_prob': losses.neg_prob, 'pos_dist': losses.pos_dist, 'neg_dist': losses.neg_dist, 'ROC_custom_metric': losses.ROC_custom_metric}) """ model = make_facenet_based_model() model.load_weights( 'Training Plots/{}/saved_weights.h5'.format(folder_name)) def prediction_input_generator(images): for i, img_pair in images.iterrows(): img_1 = pre_processing(cv2.imread('test/' + img_pair.iloc[0])) img_2 = pre_processing(cv2.imread('test/' + img_pair.iloc[1])) # here, the 3th entry (Negative_input) is not necessary to compute the predictions, but it is needed because the model takes 3 inputs yield { 'Anchor_input': img_1, 'Positive_input': img_2, 'Negative_input': img_1 } dataset = Dataset.from_generator( lambda: prediction_input_generator(images), output_types=({ 'Anchor_input': tf.float32, 'Positive_input': tf.float32, 'Negative_input': tf.float32 })) dataset = dataset.batch(1) predictions = model.predict(x=dataset, steps=num_pairs) predictions = predictions[0::4] dataframe = pd.read_csv('sample_submission.csv') dataframe.is_related = predictions dataframe.to_csv('Training Plots/{}/submission.csv'.format(folder_name), index=False) # return every 4th element of output array, because model has 4 outputs (this was made this way because of training and training metrics) return predictions
def get_test(batch_size, shuffle=False, test_size=0.2):
    """Build the (optionally shuffled) repeating test dataset of
    512x512x3 float32 images."""
    ds = Dataset.from_generator(
        image_generator,
        output_types=(tf.float32),
        output_shapes=(tf.TensorShape((512, 512, 3))),
        args=[test_size, True],
    )
    if shuffle:
        ds = ds.shuffle(10)
    return ds.repeat().batch(batch_size)
def to_tensorflow_dataset(self):
    """Convert the flattened sample generator into a cached, shuffled,
    batched and prefetched tf.data.Dataset.

    Weighted mode adds a third per-sample component (the weight) to each
    element.
    """
    height, width = self._target_size[0], self._target_size[1]
    if self._weighted:
        elem_types = (float16, float32, float32)
        elem_shapes = ((height, width, 3), (1, ), (1, ))
    else:
        elem_types = (float16, float32)
        elem_shapes = ((height, width, 3), (1, ))
    ds = Dataset.from_generator(
        self._flattened_gen,
        output_types=elem_types,
        output_shapes=elem_shapes,
    )
    return (ds.cache()
            .shuffle(len(self._paths), reshuffle_each_iteration=True)
            .batch(self._batch_size, num_parallel_calls=AUTOTUNE)
            .prefetch(AUTOTUNE))
def motif_discovery(train_file,
                    test_file,
                    word_size=3,
                    region_size=2,
                    expand=True):
    """Build encoded train/validation/test pipelines for motif discovery.

    Sequences (101 bases) are k-mer encoded up front via
    ``encode(word_size, region_size, expand)``.  The validation split is
    carved out of the training stream and materialized into tensors.

    Returns:
        (x_shape, train_ds, (x_val, y_val), test_ds)
    """
    subset_size = 690 * 190
    x_shape = encoded_shape(range(101), word_size, region_size, expand=expand)
    encode_func = encode(word_size, region_size, expand=expand)
    train_gen = gen_from_fasta(train_file, encode_func)
    test_gen = gen_from_fasta(test_file, encode_func)

    # datasets
    batch_size = 512  # fixed typo: was "bacth_size"
    prefetch = tf.data.experimental.AUTOTUNE
    output_shapes = (x_shape, ())
    output_types = (tf.float32, tf.float32)
    train_ds = Dataset.from_generator(train_gen, output_types, output_shapes)
    # takes about 30 seconds to skip the training data
    val_ds = train_ds.skip(subset_size).take(690 * 10)
    train_ds = train_ds.take(subset_size).shuffle(500).batch(
        batch_size).prefetch(prefetch)
    test_ds = Dataset.from_generator(test_gen, output_types, output_shapes)
    test_ds = test_ds.take(subset_size).batch(batch_size).prefetch(prefetch)

    # Materialize the validation split into tensors.
    x_val, y_val = [], []
    for x, y in val_ds:
        x_val.append(x)
        y_val.append(y)
    x_val = tf.convert_to_tensor(x_val)
    y_val = tf.convert_to_tensor(y_val)
    validation_data = (x_val, y_val)
    return x_shape, train_ds, validation_data, test_ds
def file_input_fn_predict(input_files):
    """Build a prediction dataset by streaming lines from ``input_files``.

    Each line is converted into a fixed-length model item of shape
    [seq_length] (``seq_length`` comes from module scope).
    """
    # d = tf.data.Dataset.from_tensor_slices(input_files)
    # d = dataset.interleave(lambda x:tf.data.TextLineDataset(x).map(parse_line), cycle_length=4, block_length=16)
    def generate_fn():
        for input_file in input_files:
            with open(input_file, 'r') as fp:
                for line in fp:
                    # BUG FIX: the item-creation line was commented out, so
                    # the generator raised NameError on ``model_item``.
                    model_item = create_model_item(line)
                    yield model_item

    # BUG FIX: from_generator requires output_types (output_shapes alone
    # raises).  NOTE(review): tf.int32 assumed for token ids — confirm
    # against create_model_item's actual dtype.
    dataset = Dataset.from_generator(
        generate_fn,
        output_types=tf.int32,
        output_shapes=(tf.TensorShape([seq_length])))
    return dataset
def input_fn_images(image_paths, epoch=1, batch_size=1, image_shape=(512, 512)):
    """
    input function for Estimator
    :param image_paths: list: list of path of png file with images
    :param epoch: int: number of epoch
    :param batch_size: int: batch size
    :param image_shape: (int, int): the size of image we want
    :return: dataset for Estimator
    """
    # NOTE(review): output_shapes is hard-coded to 512x512x3 regardless of
    # image_shape — confirm image_generator always yields that size.
    dataset = Dataset.from_generator(generator=image_generator,
                                     output_types=(tf.float32),
                                     output_shapes=(tf.TensorShape(
                                         [512, 512, 3])),
                                     args=((image_paths, image_shape)))
    dataset = dataset.repeat(epoch).batch(batch_size)
    return dataset
def _get_dataset(self, filenames):
    """Build a repeating, batched tf.data pipeline from pickled
    transition files.

    Args:
        filenames: list of pickle files, each holding a list of
            transitions (obs dict at index 0, action/target at index 1).

    Returns:
        (dataset, num_transitions): dataset yields
        (rgb, depth, obs, gt_aux, gt_output) float32 tuples.
    """
    # get number of data points
    num_transitions = 0
    for file in tqdm(filenames, desc='Count dataset samples'):
        with open(file, 'rb') as f:
            num_transitions += len(pickle.load(f))

    # define dataset generator
    def data_generator():
        # Shuffle file order on each fresh pass through the generator.
        shuffled_filenames = random.sample(filenames, len(filenames))
        for file_ix, filename in enumerate(shuffled_filenames):
            with open(filename, 'rb') as f:
                data = pickle.load(f)
            for transition in data:
                # Image stack: RGB in the first 3 channels, depth in the
                # last one.
                rgb = transition[0]['images'][..., :3]
                d = transition[0]['images'][..., [-1]]
                if self.config['view'] < 0:
                    # get random view
                    view_ix = random.randint(0, len(d) - 1)
                else:
                    view_ix = self.config['view']
                rgb = rgb[view_ix]
                d = d[view_ix]
                # Network input: observation concatenated with the goal.
                obs = np.concatenate([
                    transition[0]['observation'].flatten(),
                    transition[0]['desired_goal'].flatten()
                ])
                # NOTE(review): assumes indices 3:6 of the flattened
                # observation hold the auxiliary target — confirm layout.
                gt_aux = transition[0]['observation'].flatten()[3:6]
                gt_output = transition[1].flatten()
                yield rgb, d, obs, gt_aux, gt_output

    model_params = self.config['model_params']
    data_shapes = tuple([
        tf.TensorShape([model_params['image_size']] * 2 + [3]),
        tf.TensorShape([model_params['image_size']] * 2 + [1]),
        tf.TensorShape([model_params['obs_dim']]),
        tf.TensorShape([3]),
        tf.TensorShape([model_params['output_dim']])
    ])
    dataset = Dataset.from_generator(data_generator,
                                     output_types=tuple([tf.float32] * 5),
                                     output_shapes=data_shapes)
    dataset = dataset.prefetch(self.config['batch_size'] * 8).repeat()
    dataset = dataset.batch(self.config['batch_size'])
    return dataset, num_transitions
def make_triplet_dataset(families, positive_relations):
    """Wrap the random triplet generator into a batched tf.data Dataset.

    Each element is a ({'Anchor_input', 'Positive_input',
    'Negative_input'}: float32 dict, int64 label) pair.
    NOTE(review): ``batch_size`` is read from module scope, not a
    parameter.
    """
    # =========================================================================
    # Dataset Generator that returns a random anchor, positive and negative
    # images each time it is called
    # =========================================================================
    dataset = Dataset.from_generator(
        lambda: make_triplet_generator(families, positive_relations),
        output_types=({
            'Anchor_input': tf.float32,
            'Positive_input': tf.float32,
            'Negative_input': tf.float32
        }, tf.int64))
    # batches the dataset
    dataset = dataset.batch(batch_size)
    return dataset
def input_fn(image_paths, watermark_paths, num_epochs=2, batch_size=5):
    """
    input function for Estimator, uses the generator for this
    :param image_paths: list: list of full paths of the images
    :param watermark_paths: list: list of full paths of the watermark masks
        (docstring corrected: it previously documented unrelated parameters)
    :param num_epochs: int: the number of epoch
    :param batch_size: int: batch size
    :return: tensorflow dataset for Estimator
    """
    # Elements: (512x512x3 image, 512x512x1 mask), both float32.
    dataset = Dataset.from_generator(generator=generator,
                                     output_types=(tf.float32, tf.float32),
                                     output_shapes=(tf.TensorShape([512, 512, 3]),
                                                    tf.TensorShape([512, 512, 1])),
                                     args=(image_paths, watermark_paths))
    dataset = dataset.repeat(num_epochs).batch(batch_size)
    return dataset
def to_tf_test_dataset(self, dataset):
    """Wrap an iterable test dataset (image + file name only) into a
    tf.data pipeline, batched when cfg.batch_size > 1."""
    cfg = self.cfg
    wanted = {"image", "file"}
    elem_types = {key: t for key, t in self.dtypes.items() if key in wanted}
    elem_shapes = {key: s for key, s in self.shapes.items() if key in wanted}
    ds = Dataset.from_generator(lambda: iter(dataset), elem_types,
                                elem_shapes)
    if cfg.batch_size > 1:
        ds = ds.batch(cfg.batch_size, drop_remainder=False)
    return ds
def input_fn(loader, ids, batch_size=None, random_state=None, mode=TRAIN_MODE):
    """Provides the input data for training, evaluation or prediction.
    Data is returned in the format used by tf.estimator.Estimator.

    Args:
        loader: The DataLoader instance to handle the loading of datapoints.
        ids (list<str>): The IDs of the datapoints to that will be needed.
        batch_size (int): The number of samples that comprise a batch.
            None if not used (in pred and eval mode)
        random_state (numpy.random.RandomState): Random state instance to
            shuffle the dataset prior to batching. Random states are used to
            enable different permutations of the training set for different
            epochs while ensuring that data is not prefetched.
        mode (int) The mode of the estimator to load the correct data.
            Defined as TRAIN_MODE = 0, EVAL_MODE = 1, PRED_MODE = 2

    Returns:
        next_batch: A nested structure of tensors that iterate over the
        dataset. Every iteration contains a batch of data.
    """
    def load():
        # Stream one datapoint per id; labels are derived from the
        # alignment sequence.
        for i in ids:
            feat_seq, align_seq, _ = loader.load(i)
            length = feat_seq.shape[0]
            label_seq = align_seqs_to_breaking_labels(align_seq, length)
            # print('<Loaded %s>' % (i,))
            if mode == TRAIN_MODE or mode == EVAL_MODE:
                yield {'features': feat_seq, 'length': length}, label_seq
            elif mode == PRED_MODE:
                yield {'features': feat_seq, 'length': length}

    dtypes = ({'features': tf.float32, 'length': tf.int32}, tf.float32)
    shapes = ({
        'features': tf.TensorShape([None, N_FEATURES]),
        'length': tf.TensorShape([])
    }, tf.TensorShape([None]))

    if mode == TRAIN_MODE and random_state is not None:
        # Pre-shuffle the id list so each epoch sees a new permutation.
        ids = shuffle(ids, random_state=random_state)
    elif mode == EVAL_MODE or mode == PRED_MODE or batch_size is None:
        # Eval/pred (or unspecified batch size): one batch with everything.
        batch_size = len(ids)
    if mode == PRED_MODE:
        # Prediction has no label component; drop it from types/shapes.
        dtypes = dtypes[0]
        shapes = shapes[0]
    dataset = Dataset.from_generator(load, dtypes, shapes)
    dataset = dataset.padded_batch(batch_size, shapes)
    # TF1-style API: return the "next batch" tensors directly.
    return dataset.make_one_shot_iterator().get_next()
def _build_dataset(self,
                   generator,
                   output_types,
                   output_shapes,
                   batch_size,
                   buffer_size,
                   num_parallel_calls,
                   take=None):
    """Build either a parallel dataset wrapper (num_parallel_calls > 1) or
    a plain batched/prefetched tf.data pipeline, optionally truncated by
    ``take``."""
    if num_parallel_calls > 1:
        return _ParallelDataset(generator=generator,
                                output_types=output_types,
                                output_shapes=output_shapes,
                                batch_size=batch_size,
                                num_parallel_calls=num_parallel_calls,
                                take=take)
    ds = tf_Dataset.from_generator(generator=generator,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
    ds = ds.batch(batch_size).prefetch(buffer_size)
    return ds if take is None else ds.take(take)
## reading data train_file = dir + 'motif_discovery-train.txt' valid_file = dir + 'motif_discovery-valid.txt' test_file = dir + 'motif_discovery-test.txt' ytrain = get_label(train_file) yval = get_label(valid_file) ytest = get_label(test_file) train_gen = customize_generator(train_file) valid_gen = customize_generator(valid_file) test_gen = customize_generator(test_file) output_types = (tf.float32, tf.float32) prefetch = tf.data.experimental.AUTOTUNE xtrain_seq = Dataset.from_generator( train_gen, output_types=output_types, output_shapes=((101, ), (101, ))).batch(batch_size).prefetch(prefetch) xval_seq = Dataset.from_generator( valid_gen, output_types=output_types, output_shapes=((101, ), (101, ))).batch(batch_size).prefetch(prefetch) xtest_seq = Dataset.from_generator( test_gen, output_types=output_types, output_shapes=((101, ), (101, ))).batch(batch_size).prefetch(prefetch) latent_size = 30 seq_len = 101 encoder = keras.Sequential([ keras.Input(shape=(seq_len, )), keras.layers.Embedding(seq_len, latent_size), keras.layers.LSTM(latent_size, return_sequences=False), ])
        image_tensor: the tensorflow sensor of the image.
        category_tensor: the tensorflow sensor of the category.
        """
        # NOTE(review): this is the tail of a generator method whose `def`
        # line is outside this chunk; indentation reconstructed.
        for data in self.segment:
            with data.open() as fp:
                # Scale the 28x28 grayscale image into [0, 1].
                image_tensor = tf.convert_to_tensor(
                    np.array(Image.open(fp)) / 255, dtype=tf.float32)
                category = self.category_to_index[
                    data.label.classification.category]
                category_tensor = tf.convert_to_tensor(category,
                                                       dtype=tf.int32)
                yield image_tensor, category_tensor


# """"""
"""Build a tensorflow dataset and run it"""
ACCESS_KEY = "Accesskey-*****"

# Wrap the MNIST "train" segment into a dataset of (image, label) pairs,
# four examples per batch.
dataset = Dataset.from_generator(
    MNISTSegment(GAS(ACCESS_KEY), "train"),
    output_signature=(
        tf.TensorSpec(shape=(28, 28), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
).batch(4)
for index, (image, label) in enumerate(dataset):
    print(f"{index}: {label}")
""""""
def main():
    """Evaluate the checkpointed LSTM intent classifier on a saved query
    set and dump the per-pattern average losses to a file."""
    path = "corti-data-manager/tests/data/"
    entity_file_path = os.path.join(path, "may10-entity.json")
    intent_file_path = os.path.join(path, "may10-intent.json")
    loss_retained = 10  # number of recent losses kept per pattern
    maxlen = 30  # max token length of a query
    pattern_dist_history = {}
    loss_dist_history = {}
    list_labels = []  # logical form predicates/actions
    embedding_dims = 300
    lstm_units = 128
    num_features = 512
    num_labels = 0

    # Data Generator
    conf = process_conf(
        "corti-data-manager/tests/data/confs/stream_data_gen.yml")
    print(conf)
    sqg = StreamingDataGenerator(conf)
    number_of_patterns = len(sqg.streaming_parser.patterns) + 5
    print("Number of patterns: ", number_of_patterns)
    print(sqg.total_patterns)
    tokenizer = get_tokenizer(sqg)
    print("Vocab Size: ", len(tokenizer.word_counts))
    vocab_size = len(tokenizer.word_counts)

    # Label encoding: string labels -> integer ids -> one-hot.
    list_labels = get_labels(intent_file_path)
    label_encoder = LabelEncoder()
    label_encoder.fit(list_labels)
    integer_encoded = label_encoder.transform(list_labels)
    num_labels = int(np.max(integer_encoded) + 1)
    print("Number of labels: ", num_labels)
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoder.fit(integer_encoded)
    # print(onehot_encoder.transform(integer_encoded))

    # Load the saved query/label/pattern-index triples.
    pdir = "test_norm"
    path1 = "data/" + pdir + "/train.json"
    path2 = "data/" + pdir + "/label.json"
    path3 = "data/" + pdir + "/index.json"
    with open(path1) as json_file:
        queries = json.load(json_file)
    with open(path2) as json_file:
        labels = json.load(json_file)
    with open(path3) as json_file:
        indexes = json.load(json_file)
    print(queries[0])
    print(len(queries))
    datagen = data_generator(tokenizer, maxlen, label_encoder,
                             onehot_encoder, queries, labels, indexes)
    checkpoint_prefix = "saved/after3/"

    # Model
    model = LSTMClassifier(vocab_size + 2, num_labels, embedding_dims,
                           lstm_units)
    optimizer = tf.train.AdamOptimizer()
    root = tf.train.Checkpoint(
        optimizer=optimizer,
        model=model,
        optimizer_step=tf.train.get_or_create_global_step())
    root.restore(tf.train.latest_checkpoint(checkpoint_prefix))

    # Uniform initial pattern distribution; avg_loss starts high (10).
    pattern_dist = tf.math.softmax(np.zeros(number_of_patterns)).numpy()
    avg_loss = np.ones_like(pattern_dist) * 10
    BUFFER_SIZE = len(queries)
    # losses: pattern-id -> ring buffer of the last `loss_retained` losses
    losses = defaultdict(lambda: [0] * loss_retained)
    counts = defaultdict(int)
    queries_shape = [None, maxlen]
    labels_shape = [None, num_labels]
    indexes_shape = [
        None,
    ]
    # One big minibatch covering the whole dataset.
    minibatch_size = len(queries)
    dataset = Dataset.from_generator(
        datagen,
        args=[minibatch_size],
        output_types=(tf.int32, tf.int32, tf.int32),
        output_shapes=(tf.TensorShape(queries_shape),
                       tf.TensorShape(labels_shape),
                       tf.TensorShape(indexes_shape))).shuffle(BUFFER_SIZE)

    for i, (queries, labels, indexes) in enumerate(dataset):
        print(i)
        predictions = model(queries)
        # print(queries[0],labels[0],predictions[0])
        print(np.argmax(labels[0]), np.argmax(predictions[0]))
        # [batch_size, num_classes]
        accu = evaluate_accuracy(labels, predictions)
        np.set_printoptions(threshold=np.inf)
        print("accuracy: ", accu)
        loss = tf.losses.softmax_cross_entropy(labels,
                                               predictions,
                                               reduction=Reduction.NONE)
        print(loss.shape)
        loss_reduced = tf.math.reduce_mean(loss)
        nplabels = labels.numpy()
        # NOTE(review): the inner loop reuses ``i`` and shadows the outer
        # batch index; the final print therefore reports the inner index.
        for i, (a_type, a_loss) in enumerate(zip(indexes, loss)):
            a_type = int(a_type.numpy())
            idx = int(counts[a_type] % loss_retained)
            max_idx = np.argmax(nplabels[i])
            losses[a_type][idx] = float(np.mean(a_loss.numpy()))
            counts[a_type] = counts.get(a_type, 0.0) + 1.0
        # Per-pattern running average over the retained losses.
        for a_type in losses:
            num = int(counts[a_type])
            if num >= loss_retained:
                avg_loss[a_type] = np.mean(losses[a_type])
            else:
                avg_loss[a_type] = np.mean(losses[a_type][:num])
        print('iter:{0} --> Average Loss:{1}'.format(i, loss_reduced.numpy()))

    # NOTE(review): indentation is ambiguous in the original — this write
    # is reconstructed as running once after the loop; confirm it was not
    # intended to run per-iteration.
    with open("output/test/pattern_losses_uniform.txt", "w") as f:
        f.write(json.dumps(list(avg_loss)))
# Top-level TF1-style setup: builds a point-cloud dataset from a generator
# and an initializable iterator pinned to GPU 0.
is_training_pl = True
bn_decay = True
n_pc = get_number_pc()
tf.reset_default_graph()
# np.random.seed(42)
tf.set_random_seed(2019)
with tf.Graph().as_default():
    with tf.device('/gpu:0'):
        # args truncates the stream to a whole number of batches.
        dataset = Dataset.from_generator(
            generator, (tf.float32),
            output_shapes=(tf.TensorShape([NUM_POINT, 3])),
            args=([BATCH_SIZE * int(n_pc / BATCH_SIZE)]))
        # dataset = Dataset.from_generator(generator, (tf.float32, tf.float32), output_shapes=(tf.TensorShape([1000]), tf.TensorShape([ NUM_POINT, 3])))
        # dataset = dataset.repeat(1)
        dataset = dataset.shuffle(100)
        dataset = dataset.batch(BATCH_SIZE)
        # iterator = dataset.make_one_shot_iterator()
        iterator = dataset.make_initializable_iterator()
        # features, pointclouds_pl = iterator.get_next()0
        pointclouds_pl = iterator.get_next()
        # Fix the static batch dimension for downstream ops.
        pointclouds_pl = tf.reshape(pointclouds_pl,
                                    (BATCH_SIZE, NUM_POINT, 3))
        print("----------------------> ", pointclouds_pl)
        # print(batch)
    # NOTE(review): these keyword arguments close a data_stream/DataStream
    # constructor call whose opening line is outside this chunk.
    test_batch_size=test_batch_size,
    train_proportion=train_proportion,
    class_proportion=class_proportion)
# Split patch indices into train/test streams.
train_index_generator, test_index_generator = data_stream.split_by_patch_id(
    features[['image']], features[['destroyed']])
train_generator = data_stream.get_train_data_generator_from_index(
    [features['image'], features['destroyed']], train_index_generator)
test_indices = list(test_index_generator)
test_generator = data_stream.get_test_data_generator_from_index(
    features['image'], test_indices)
num_batches = ceil(len(features) / space['batch_size'])
num_batches_test = len(test_indices)

# Fit model and predict
train_dataset = Dataset.from_generator(lambda: train_generator,
                                       (tf.float32, tf.int32))
print('Training with space: \n')
print(space)
model = Model(**space)
model.fit_generator(train_dataset,
                    steps_per_epoch=num_batches,
                    verbose=1,
                    **space)
test_dataset = Dataset.from_generator(lambda: test_generator, tf.float32)
predictions = model.predict_generator(test_dataset, steps=num_batches_test)
# Concatenate the per-batch index objects into one flat index.
test_indices_flattened = test_indices[0]
for index in test_indices[1:]:
    test_indices_flattened = test_indices_flattened.append(index)
def main():
    """Finetune the ONet face model.

    Builds a generator-backed training dataset (image batches plus a
    (class, 5-landmark, 11-landmark) target tuple), sets up callbacks
    (early stopping, checkpointing, LR decay, TensorBoard), loads the
    pre-trained ONet weights and runs ``model.fit``.
    """
    # Get arguments
    args = get_argument()
    models_dir = args.models_directory
    if args.log_frequency != 'epoch' and args.log_frequency != 'batch':
        log_frequency = int(args.log_frequency)
    else:
        log_frequency = args.log_frequency

    # Prepare training dataset (the generator already yields full batches,
    # hence the explicit batch dimension in the shapes).
    train_data = Dataset.from_generator(
        generator=onet_generator(args.batch_size, args.steps_per_epoch,
                                 args.data_folder),
        output_types=(tf.uint8, (tf.float32, tf.float32, tf.float32)),
        output_shapes=(tf.TensorShape([args.batch_size, 48, 48, 3]),
                       (tf.TensorShape([args.batch_size, 3]),
                        tf.TensorShape([args.batch_size, 5]),
                        tf.TensorShape([args.batch_size, 11])))
    )
    train_data = train_data.map(augment_v2)

    # Prepare validation dataset
    val_data = load_validation_data(args.data_folder, args.batch_size)

    # Stop training if no improvements are made
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=args.early_stopping,
        mode='min'
    )

    # Model checkpoints
    model_checkpoint = ModelCheckpoint(
        filepath=models_dir + '/epoch_{epoch:04d}_val_loss_{val_loss:.4f}.hdf5',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=True,
        mode='min'
    )

    # Learning rate decay
    lr_decay = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=args.lr_decay_patience,
        mode='min',
        min_delta=args.lr_decay_min_delta
    )

    # Set up Tensorboard
    tensorboard = TensorBoard(
        log_dir=models_dir + '/log',
        write_graph=False,
        profile_batch=0,
        update_freq=log_frequency
    )

    # Create and compile the model from scratch
    model = onet()

    # Load the pre-trained model for finetuning
    model.load_weights(filepath=args.onet, by_name=True)

    # Compile the model: one loss per head (class BCE, two landmark MSEs).
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
        loss=[
            BCE_with_sti(args.hard_sample_mining, args.num_back),
            MSE_with_sti(args.hard_sample_mining, args.num_back),
            MSE_with_sti(args.hard_sample_mining, args.num_back)
        ],
        metrics=[[accuracy_(), recall_()], None, None],
        loss_weights=[1, 0.5, 1]
    )

    # Create folders
    if not path.exists(models_dir):
        os.makedirs(models_dir)
    if not path.exists(models_dir + '/log'):
        os.mkdir(models_dir + '/log')

    # Train the model
    history = model.fit(
        x=train_data,
        epochs=args.num_epochs,
        callbacks=[early_stopping, model_checkpoint, lr_decay, tensorboard],
        validation_data=val_data,
        steps_per_epoch=args.steps_per_epoch
    )
def get_input_fn():
    """Return the encoded dataset, batched one example at a time.

    Types/shapes come from the enclosing scope's input pipeline; only the
    first (feature) component is used.
    """
    types, shapes = self.input_pipeline.feed_shape_type_def()
    ds = Dataset.from_generator(dataset_encoded, types[0], shapes[0])
    return ds.batch(1)