def train(model, data, gen, params):
    """Alternate discriminator and generator updates, then visualise.

    Args:
        model: object exposing the placeholders ``x``/``z`` and the
            ``loss_d``/``opt_d``/``loss_g``/``opt_g`` graph nodes.
        data: sampler for real data; ``data.sample(n)`` is called each step.
        gen: sampler for generator noise; also provides ``gen.range``.
        params: run settings (``num_steps``, ``batch_size``, ``log_every``,
            ``anim_every``, ``anim_path``).
    """
    batch = params.batch_size
    column_shape = (batch, 1)
    frames = []

    with tf.Session() as session:
        tf.local_variables_initializer().run()
        tf.global_variables_initializer().run()

        for step in range(params.num_steps + 1):
            # Discriminator update: one real batch plus one noise batch.
            real = data.sample(batch)
            noise = gen.sample(batch)
            d_feed = {
                model.x: np.reshape(real, column_shape),
                model.z: np.reshape(noise, column_shape),
            }
            loss_d, _ = session.run([model.loss_d, model.opt_d], d_feed)

            # Generator update: a fresh noise batch only.
            noise = gen.sample(batch)
            g_feed = {model.z: np.reshape(noise, column_shape)}
            loss_g, _ = session.run([model.loss_g, model.opt_g], g_feed)

            if step % params.log_every == 0:
                print('{}: {:.4f}\t{:.4f}'.format(step, loss_d, loss_g))

            if params.anim_path and (step % params.anim_every == 0):
                frames.append(
                    samples(model, session, data, gen.range, batch))

        # Either persist the collected frames or plot one final snapshot.
        if not params.anim_path:
            final = samples(model, session, data, gen.range, batch)
            plot_distributions(final, gen.range)
        else:
            save_animation(frames, params.anim_path, gen.range)
def test(model, config, prompts):
    """Restore the latest checkpoint and write audio/attention summaries.

    Args:
        model: callable building the network; invoked as
            ``model(config, batch_inputs, train=False)``.
        config: run configuration; ``r``, ``vocab_size`` and ``num_prompts``
            are set on it here, ``data_path`` and ``save_path`` are read.
        prompts: text prompts handed to ``data_input.load_prompts``.
    """
    sr = 24000 if 'blizzard' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = audio.r
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.device('/cpu:0'):
        batch_inputs = data_input.load_prompts(prompts, ivocab)

    config.num_prompts = len(prompts)

    with tf.Session() as sess:
        # Both normalisation statistics are restored from the checkpoint and
        # combined with the network output, so they must share one dtype.
        # (stft_mean was previously declared float16 while stft_std was
        # float32.)
        stft_mean = tf.get_variable(
            'stft_mean', shape=(1025*audio.r,), dtype=tf.float32)
        stft_std = tf.get_variable(
            'stft_std', shape=(1025*audio.r,), dtype=tf.float32)

        # initialize model
        net = model(config, batch_inputs, train=False)

        train_writer = tf.summary.FileWriter(
            'log/' + config.save_path + '/test', sess.graph)

        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver()

        print('restoring weights')
        latest_ckpt = tf.train.latest_checkpoint(
            'weights/' + config.save_path[:config.save_path.rfind('/')]
        )
        saver.restore(sess, latest_ckpt)

        stft_mean, stft_std = sess.run([stft_mean, stft_std])

        try:
            # Runs until the input queue raises OutOfRangeError.
            while True:
                outputs, alignments, inputs = sess.run([
                    net.output, net.alignments, batch_inputs
                ])

                print('saving samples')
                for spec, words, align in zip(
                        outputs, inputs['text'], alignments):
                    # store a sample to listen to
                    text = ''.join([ivocab[w] for w in words])
                    attention_plot = data_input.generate_attention_plot(align)
                    sample = audio.invert_spectrogram(
                        spec*stft_std + stft_mean)
                    merged = sess.run(tf.summary.merge(
                        [tf.summary.audio(text, sample[None, :], sr),
                         tf.summary.image(text, attention_plot)]
                    ))
                    train_writer.add_summary(merged, 0)
        except tf.errors.OutOfRangeError:
            # Normal termination: the prompt queue is exhausted.
            pass
        finally:
            # Always stop the queue threads, including on unexpected errors.
            coord.request_stop()
            coord.join(threads)
def test_empty_labels_and_scores_gives_nan_auc(self):
    """With zero records fed in, the histogram AUC must evaluate to NaN."""
    with self.test_session():
        empty_labels = tf.constant([], shape=[0], dtype=tf.bool)
        empty_scores = tf.constant([], shape=[0], dtype=tf.float32)
        auc, update_op = tf.contrib.metrics.auc_using_histogram(
            empty_labels, empty_scores, [0, 1.])

        tf.local_variables_initializer().run()
        update_op.run()

        auc_value = auc.eval()
        self.assertTrue(np.isnan(auc_value))
def _check_auc(self, nbins=100, desired_auc=0.75, score_range=None,
               num_records=50, frac_true=0.5, atol=0.05, num_updates=10):
    """Check auc accuracy against synthetic data.

    Args:
      nbins: nbins arg from contrib.metrics.auc_using_histogram.
      desired_auc: Number in [0, 1]. The desired auc for synthetic data.
      score_range: 2-tuple, (low, high), giving the range of the resultant
        scores. Defaults to [0, 1.].
      num_records: Positive integer. The number of records to return.
      frac_true: Number in (0, 1). Expected fraction of resultant labels that
        will be True. This is just in expectation...more or less may actually
        be True.
      atol: Absolute tolerance for final AUC estimate.
      num_updates: Update internal histograms this many times, each with a new
        batch of synthetic data, before computing final AUC.

    Raises:
      AssertionError: If resultant AUC is not within atol of theoretical AUC
        from synthetic data.
    """
    # The previous `[0, 1.] or score_range` always picked the literal list
    # (a non-empty list is truthy), silently ignoring the caller's range.
    if score_range is None:
        score_range = [0, 1.]

    with self.test_session():
        labels = tf.placeholder(tf.bool, shape=[num_records])
        scores = tf.placeholder(tf.float32, shape=[num_records])
        auc, update_op = tf.contrib.metrics.auc_using_histogram(
            labels, scores, score_range, nbins=nbins)
        tf.local_variables_initializer().run()

        # Updates, then extract auc.
        for _ in range(num_updates):
            labels_a, scores_a = synthetic_data(
                desired_auc, score_range, num_records, self.rng, frac_true)
            update_op.run(feed_dict={labels: labels_a, scores: scores_a})

        # This extra draw is not fed to the graph. It is kept because it
        # advances self.rng exactly as before, so later draws in the same
        # test are unchanged.
        synthetic_data(
            desired_auc, score_range, num_records, self.rng, frac_true)

        # Fetch current auc, and verify that fetching again doesn't change it.
        auc_eval = auc.eval()
        self.assertAlmostEqual(auc_eval, auc.eval(), places=5)

    msg = ('nbins: %s, desired_auc: %s, score_range: %s, '
           'num_records: %s, frac_true: %s, num_updates: %s') % (
               nbins, desired_auc, score_range, num_records, frac_true,
               num_updates)
    np.testing.assert_allclose(desired_auc, auc_eval, atol=atol, err_msg=msg)
def train(self, DGTrain, DGTest, saver=True):
    """Build the optimisation graph and run ``self.STEPS`` training steps.

    Args:
        DGTrain: training data generator; ``length`` and ``Batch`` are used.
        DGTest: validation data generator, passed to ``self.Validation``.
        saver: unused in this method; kept for interface compatibility.
    """
    epoch = DGTrain.length

    self.LearningRateSchedule(self.LEARNING_RATE, self.K, epoch)

    trainable_var = tf.trainable_variables()

    self.regularize_model()
    self.optimization(trainable_var)
    self.ExponentialMovingAverage(trainable_var, self.DECAY_EMA)

    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()

    self.summary_test_writer = tf.summary.FileWriter(
        self.LOG + '/test', graph=self.sess.graph)
    self.summary_writer = tf.summary.FileWriter(
        self.LOG + '/train', graph=self.sess.graph)
    merged_summary = tf.summary.merge_all()
    steps = self.STEPS

    for step in range(steps):
        batch_data, batch_labels = DGTrain.Batch(0, self.BATCH_SIZE)
        feed_dict = {self.input_node: batch_data,
                     self.train_labels_node: batch_labels}

        # self.optimizer is replaced by self.training_op for the exponential
        # moving decay
        _, l, lr, predictions, s = self.sess.run(
            [self.training_op, self.loss, self.learning_rate,
             self.train_prediction, merged_summary],
            feed_dict=feed_dict)

        if step % self.N_PRINT == 0:
            # Single-argument print(...) calls behave identically as a
            # Python 2 statement and a Python 3 function. The format operator
            # is applied inside the call; `print(...) % lr` only worked by
            # accident under Python 2.
            i = datetime.now()
            print(i.strftime('%Y/%m/%d %H:%M:%S: \n '))
            self.summary_writer.add_summary(s, step)
            error, acc, acc1, recall, prec, f1 = self.error_rate(
                predictions, batch_labels, step)
            print(' Step %d of %d' % (step, steps))
            print(' Learning rate: %.5f \n' % lr)
            print(' Mini-batch loss: %.5f \n Accuracy: %.1f%% \n acc1: %.1f%% \n recall: %1.f%% \n prec: %1.f%% \n f1 : %1.f%% \n' % (l, acc, acc1, recall, prec, f1))
            self.Validation(DGTest, step)
def main(model_config, train_config, track_config):
    """Build the inference graph and write a checkpoint into ``train_dir``.

    If ``train_dir`` already holds a checkpoint it is restored and re-saved
    at the next global step. Otherwise variables are freshly initialised
    (optionally loading a pretrained embedding) and saved at step 0.

    Args:
        model_config: model settings; ``embed_config`` is consulted here.
        train_config: training settings (``train_dir``, ``seed``,
            ``max_checkpoints_to_keep``).
        track_config: tracking settings, only written out by ``save_cfgs``.
    """
    # Create training directory
    train_dir = train_config['train_dir']
    if not tf.gfile.IsDirectory(train_dir):
        tf.logging.info('Creating training directory: %s', train_dir)
        tf.gfile.MakeDirs(train_dir)

    # Build the Tensorflow graph
    g = tf.Graph()
    with g.as_default():
        # Set fixed seed
        np.random.seed(train_config['seed'])
        tf.set_random_seed(train_config['seed'])

        # Build the model
        model = siamese_model.SiameseModel(
            model_config, train_config, mode='inference')
        model.build()

        # Save configurations for future reference
        save_cfgs(train_dir, model_config, train_config, track_config)

        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=train_config['max_checkpoints_to_keep'])

        # Dynamically allocate GPU memory
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess_config = tf.ConfigProto(gpu_options=gpu_options)

        # Use the session as a context manager so it is released even if
        # restoring or saving raises; it was previously never closed.
        with tf.Session(config=sess_config) as sess:
            model_path = tf.train.latest_checkpoint(train_config['train_dir'])

            if not model_path:
                # Initialize all variables
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                start_step = 0

                # Load pretrained embedding model if needed
                if model_config['embed_config']['embedding_checkpoint_file']:
                    model.init_fn(sess)
            else:
                logging.info(
                    'Restore from last checkpoint: {}'.format(model_path))
                sess.run(tf.local_variables_initializer())
                saver.restore(sess, model_path)
                start_step = tf.train.global_step(
                    sess, model.global_step.name) + 1

            checkpoint_path = osp.join(train_config['train_dir'], 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=start_step)
def predict(self):
    """Restore the checkpoint and write an arg-max mask per input image.

    Every ``*.tif`` under ``ORIGIN_PREDICT_DIRECTORY`` is read as grayscale,
    pushed through the network, and the result is saved to
    ``PREDICT_SAVED_DIRECTORY`` as ``<index>.jpg``.
    """
    import cv2
    import glob
    import numpy as np

    # TODO: this should not be written this way; images should be read
    # directly for prediction rather than from the tfrecord, because the
    # order changes and results can no longer be matched to inputs.
    predict_file_path = glob.glob(
        os.path.join(ORIGIN_PREDICT_DIRECTORY, '*.tif'))
    print(len(predict_file_path))

    ckpt_path = CHECK_POINT_PATH
    all_parameters_saver = tf.train.Saver()

    # Build the arg-max op once. Creating it inside the loop added a new
    # node to the graph for every image.
    predicted_classes = tf.argmax(input=self.prediction, axis=3)

    with tf.Session() as sess:  # start a session
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        all_parameters_saver.restore(sess=sess, save_path=ckpt_path)

        for index, image_path in enumerate(predict_file_path):
            image = np.reshape(
                a=cv2.imread(image_path, flags=0),
                newshape=(1, INPUT_IMG_WIDE, INPUT_IMG_HEIGHT,
                          INPUT_IMG_CHANNEL))
            predict_image = sess.run(
                predicted_classes,
                feed_dict={
                    self.input_image: image,
                    self.keep_prob: 1.0,
                    self.lamb: 0.004
                }
            )
            # Scale by 255 before writing the mask as an image.
            cv2.imwrite(
                os.path.join(PREDICT_SAVED_DIRECTORY, '%d.jpg' % index),
                predict_image[0] * 255)

    print('Done prediction')
def evaluate():
    """Eval ocr for a number of steps."""
    with tf.Graph().as_default() as graph:
        images, labels, seq_lengths = ocr.inputs()
        # NOTE(review): inference is built with train=True inside an
        # evaluation routine - confirm this is intended.
        logits, timesteps = ocr.inference(
            images, FLAGS.eval_batch_size, train=True)
        ler = ocr.create_label_error_rate(logits, labels, timesteps)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # CPU-only session configuration.
        cpu_only = tf.ConfigProto(device_count={'GPU': 0})
        # NOTE(review): this session is initialised but never handed to
        # eval_once or closed - verify against eval_once.
        sess = tf.Session(config=cpu_only)
        sess.run(init_op)

        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, graph)

        # Evaluate once, then keep re-evaluating at a fixed interval unless
        # a single run was requested.
        eval_once(saver, summary_writer, ler, summary_op)
        while not FLAGS.run_once:
            time.sleep(FLAGS.eval_interval_secs)
            eval_once(saver, summary_writer, ler, summary_op)
def main(argv):
    """Smoke-test TFRecord write/read against the GCS URL in FLAGS.

    Exits with status 1 on an invalid bucket URL, a record-count mismatch,
    or a missing OutOfRangeError when reading past the end.
    """
    del argv  # Unused.

    bucket_url = FLAGS.gcs_bucket_url

    # Sanity check on the GCS bucket URL.
    if not bucket_url or not bucket_url.startswith("gs://"):
        print("ERROR: Invalid GCS bucket URL: \"%s\"" % FLAGS.gcs_bucket_url)
        sys.exit(1)

    # Verify that writing to the records file in GCS works.
    print("\n=== Testing writing and reading of GCS record file... ===")
    example_data = create_examples(FLAGS.num_examples, 5)
    with tf.python_io.TFRecordWriter(FLAGS.gcs_bucket_url) as hf:
        for example in example_data:
            hf.write(example.SerializeToString())
        print("Data written to: %s" % FLAGS.gcs_bucket_url)

    # Verify that reading from the tfrecord file works and that
    # tf_record_iterator works.
    record_iter = tf.python_io.tf_record_iterator(FLAGS.gcs_bucket_url)
    read_count = sum(1 for _ in record_iter)
    print("Read %d records using tf_record_iterator" % read_count)

    if read_count != FLAGS.num_examples:
        print("FAIL: The number of records read from tf_record_iterator (%d) "
              "differs from the expected number (%d)" % (read_count,
                                                         FLAGS.num_examples))
        sys.exit(1)

    # Verify that running the read op in a session works.
    print("\n=== Testing TFRecordReader.read op in a session... ===")
    with tf.Graph().as_default():
        filename_queue = tf.train.string_input_producer(
            [FLAGS.gcs_bucket_url], num_epochs=1)
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            tf.train.start_queue_runners()

            for index in range(FLAGS.num_examples):
                print("Read record: %d" % index)
                sess.run(serialized_example)

            # Reading one more record should trigger an exception.
            try:
                sess.run(serialized_example)
            except tf.errors.OutOfRangeError:
                print("Successfully caught the expected OutOfRangeError while "
                      "reading one more record than is available")
            else:
                print("FAIL: Failed to catch the expected OutOfRangeError while "
                      "reading one more record than is available")
                sys.exit(1)

    create_dir_test()
    create_object_test()
def testEvaluationLoopTimeout(self):
    """The evaluation loop should stop once its timeout elapses."""
    _, update_op = slim.metrics.streaming_accuracy(
        self._predictions, self._labels)
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Create checkpoint and log directories.
    chkpt_dir = os.path.join(self.get_temp_dir(), 'tmp_logs/')
    gfile.MakeDirs(chkpt_dir)
    logdir = os.path.join(self.get_temp_dir(), 'tmp_logs2/')
    gfile.MakeDirs(logdir)

    # Save initialized variables to checkpoint directory.
    saver = tf.train.Saver()
    with self.test_session() as sess:
        init_op.run()
        saver.save(sess, os.path.join(chkpt_dir, 'chkpt'))

    # Run the evaluation loop with a timeout.
    with self.test_session() as sess:
        started_at = time.time()
        slim.evaluation.evaluation_loop(
            '', chkpt_dir, logdir,
            eval_op=update_op,
            eval_interval_secs=2.0,
            timeout=6.0)
        elapsed = time.time() - started_at

        # Check we've waited for the timeout.
        self.assertGreater(elapsed, 6.0)
        # Then the timeout kicked in and stops the loop.
        self.assertLess(elapsed, 8.0)
def test_smoke(self):
    """Smoke test for a full pipeline."""
    fd, tname = tempfile.mkstemp()
    # mkstemp hands back an open OS-level descriptor; only the path is
    # needed, so close it straight away instead of leaking it.
    os.close(fd)
    try:
        num = 100
        num_epochs = 2
        self._write_examples(
            tname, [self._random_io_data() for _ in range(num)])
        tensors = data.read_from_files(
            [tname], shuffle=True, num_epochs=num_epochs)
        batches = lin.shuffle_batch(tensors=tensors, batch_size=5)

        count = 0
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            try:
                # Consume batches until the epoch limit is reached.
                while True:
                    actual = sess.run(batches)
                    count += len(actual[0])
            except tf.errors.OutOfRangeError as ex:
                coord.request_stop(ex=ex)
            finally:
                coord.request_stop()
                coord.join(threads)

        self.assertEqual(num * num_epochs, count)
    finally:
        # Remove the temp file even when an assertion above fails.
        os.remove(tname)
def get_hit_rate_and_ndcg(self, predicted_scores_by_user, items_by_user,
                          top_k=rconst.TOP_K, match_mlperf=False):
    """Evaluate hit rate and NDCG for the given scores in a fresh graph.

    Overwrites ``rconst.TOP_K`` and ``rconst.NUM_EVAL_NEGATIVES`` as a side
    effect. Returns the evaluated second element of the HR and NDCG metric
    tuples, as a two-item list.
    """
    rconst.TOP_K = top_k
    rconst.NUM_EVAL_NEGATIVES = predicted_scores_by_user.shape[1] - 1
    batch_size = items_by_user.shape[0]

    # One row per user, with the user index repeated across all columns.
    user_column = np.arange(batch_size)[:, np.newaxis]
    users = np.repeat(user_column, rconst.NUM_EVAL_NEGATIVES + 1, axis=1)

    positives = items_by_user[:, -1:]
    negatives = items_by_user[:, :-1]
    users, items, duplicate_mask = \
        data_pipeline.BaseDataConstructor._assemble_eval_batch(
            users, positives, negatives, batch_size)

    g = tf.Graph()
    with g.as_default():
        flat_scores = predicted_scores_by_user.reshape((-1, 1))
        logits = tf.convert_to_tensor(flat_scores, tf.float32)
        zero_column = tf.zeros(logits.shape, dtype=logits.dtype)
        softmax_logits = tf.concat([zero_column, logits], axis=1)
        duplicate_mask = tf.convert_to_tensor(duplicate_mask, tf.float32)

        eval_result = neumf_model.compute_eval_loss_and_metrics(
            logits=logits,
            softmax_logits=softmax_logits,
            duplicate_mask=duplicate_mask,
            num_training_neg=NUM_TRAIN_NEG,
            match_mlperf=match_mlperf)
        metric_ops = eval_result.eval_metric_ops

        hr = metric_ops[rconst.HR_KEY]
        ndcg = metric_ops[rconst.NDCG_KEY]

        init = [tf.global_variables_initializer(),
                tf.local_variables_initializer()]

    with self.test_session(graph=g) as sess:
        sess.run(init)
        return sess.run([hr[1], ndcg[1]])
def compute_one_decoding_video_metrics(iterator, feed_dict, num_videos):
    """Computes the average of all the metric for one decoding.

    Args:
      iterator: dataset iterator.
      feed_dict: feed dict to initialize iterator.
      num_videos: number of videos.

    Returns:
      Dictionary which contains the average of each metric per frame.
    """
    output, target = iterator.get_next()
    metrics_dict = compute_metrics(output, target)
    metrics_names, metrics = zip(*six.iteritems(metrics_dict))
    means, update_ops = tf.metrics.mean_tensor(metrics)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())

        iterator_init = iterator._initializer  # pylint: disable=protected-access
        if iterator_init is not None:
            sess.run(iterator_init, feed_dict=feed_dict)

        # Compute mean over dataset
        for video_index in range(num_videos):
            print("Computing video: %d" % video_index)
            sess.run(update_ops)

        averaged_metrics = sess.run(means)
        return dict(zip(metrics_names, averaged_metrics))
def test(self, p1, p2, steps):
    """Run ``steps`` evaluation batches and return the averaged metrics.

    Returns:
        Tuple ``(loss, acc, F1, recall, precision, roc, jac, AJI)``, each
        averaged over ``steps``.
    """
    loss = roc = 0.
    acc = F1 = recall = 0.
    precision = jac = AJI = 0.

    self.sess.run(tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer()))
    self.Saver()

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    fetches = [self.loss, self.train_prediction, self.train_labels_node]
    for step in range(steps):
        batch_loss, prob, batch_labels = self.sess.run(
            fetches, feed_dict={self.is_training: False})
        loss += batch_loss

        # ComputeMetrics result is read positionally, indices 0 to 6.
        scores = ComputeMetrics(
            prob[0, :, :, 1], batch_labels[0, :, :, 0], p1, p2)
        acc += scores[0]
        roc += scores[1]
        jac += scores[2]
        recall += scores[3]
        precision += scores[4]
        F1 += scores[5]
        AJI += scores[6]

    coord.request_stop()
    coord.join(threads)

    # Average in the same three groups as before so dtypes are unchanged.
    loss, acc, F1 = np.array([loss, acc, F1]) / steps
    recall, precision, roc = np.array([recall, precision, roc]) / steps
    jac, AJI = np.array([jac, AJI]) / steps
    return loss, acc, F1, recall, precision, roc, jac, AJI
def predict(self):
    """Run inference over the input queue and save results to ``.npy`` files.

    Predictions are saved under the name ``test_results`` and the
    concatenated ground-truth columns under ``truth``.
    """
    # Single-argument print(...) produces the same output under Python 2
    # and is valid Python 3, unlike the former print statements.
    print('Running inference...')
    self.sess.run(tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer()))
    self.load_weights('/Users/shashank/TensorFlow/SPN/weights/')

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=self.sess, coord=coord)

    result = []
    truth = []
    count = 0
    try:
        while not coord.should_stop():
            print(count)
            (batch_imgs, batch_labels, batch_landmarks, batch_visibility,
             batch_pose, batch_gender) = self.sess.run(
                 [self.images, self.labels, self.land, self.vis, self.po,
                  self.gen])
            # Shift and scale pixel values before feeding the network.
            batch_imgs = (batch_imgs - 127.5) / 128.0
            net_preds = self.sess.run(
                self.net_output, feed_dict={self.X: batch_imgs})
            result.append(np.concatenate(net_preds, axis=1))
            truth.append(np.concatenate(
                [batch_labels[:, np.newaxis], batch_landmarks,
                 batch_visibility, batch_pose, batch_gender], axis=1))
            count += 1
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        coord.request_stop()
        coord.join(threads)

    np.save('test_results', np.concatenate(result, axis=0))
    np.save('truth', np.concatenate(truth, axis=0))
def initializeOrRestore(self):
    """Restore variables from the available checkpoints, initialise the rest.

    Looks for a ``vgg_16`` checkpoint and the latest ``mt`` checkpoint under
    ``<checkpoint_dir>/<dataset name>``. Variables found in a checkpoint are
    restored from it; the variables left in ``varsNotInCkpt`` and all local
    variables are then initialised. Also sets ``self.ckptDir``,
    ``self.ckptPrefix`` and ``self.saver``.
    """
    self.ckptDir = os.path.join(self.checkpoint_dir, self.dataset.name)
    self.ckptPrefix = os.path.join(self.ckptDir, self.name, self.name)
    vgg_ckpt_file = os.path.join(self.ckptDir, 'vgg_16', 'vgg_16.ckpt')
    mt_ckpt_file = layers.latest_checkpoint(os.path.join(self.ckptDir, 'mt'))
    # ckpt_file = layers.latest_checkpoint(os.path.join(self.ckptDir, 'vgg_16', 'vgg_16.ckpt'))
    globalVars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

    # NOTE(review): block nesting here was reconstructed from syntax. Each
    # `else` is assumed to pair with the outer "checkpoint exists" test -
    # confirm against the original layout.
    if vgg_ckpt_file is not None and tf.train.checkpoint_exists(vgg_ckpt_file):
        # Split the global variables by whether the VGG checkpoint has them.
        varsInCkpt, varsNotInCkpt = layers.scan_checkpoint_for_vars(vgg_ckpt_file, globalVars)
        if len(varsInCkpt) != 0:
            restorationSaver = tf.train.Saver(varsInCkpt)
            # The result of this run is discarded.
            self.sess.run(tf.report_uninitialized_variables(var_list=varsInCkpt))
            restorationSaver.restore(self.sess, vgg_ckpt_file)
    else:
        # No VGG checkpoint: every global variable still needs a value.
        varsNotInCkpt = globalVars

    if mt_ckpt_file is not None and tf.train.checkpoint_exists(mt_ckpt_file):
        # Only variables not already covered above are looked up here.
        varsInCkpt, varsNotInCkpt = layers.scan_checkpoint_for_vars(mt_ckpt_file, varsNotInCkpt)
        # presumably moves 'fc6' variables between the two lists - verify
        # against layers.replaceVarInListsByName
        varsInCkpt, varsNotInCkpt = layers.replaceVarInListsByName(varsInCkpt, varsNotInCkpt, 'fc6')
        if len(varsInCkpt) != 0:
            restorationSaver = tf.train.Saver(varsInCkpt)
            self.sess.run(tf.report_uninitialized_variables(var_list=varsInCkpt))
            restorationSaver.restore(self.sess, mt_ckpt_file)
    else:
        # NOTE(review): with the pairing assumed above, this resets the list
        # to all globals, so variables restored from the VGG checkpoint would
        # be re-initialised below - confirm this is intended.
        varsNotInCkpt = globalVars

    self.saver = tf.train.Saver()
    # Initialise whatever was not restored, plus all local variables.
    self.sess.run(tf.group(tf.variables_initializer(varsNotInCkpt), tf.local_variables_initializer()))
def parallel_acc_by_tags(model, sess, max_parallel_calcs, data_folder,
                         read_func, from_file=None, data_set="test",
                         feature="images", orientations=None):
    """Measure accuracy over one epoch and log the misclassified images.

    Prints the correct count, total and accuracy, writes one
    ``<path>, <label*90>`` line per misclassified image to
    ``<data_folder>/incorrect.txt``, then closes ``sess``.

    Args:
        model: object exposing ``correct_predictions``, ``inputs``, ``testy``
            and ``keep_probs``.
        sess: TensorFlow session to run in; closed before returning.
        max_parallel_calcs: batch size handed to the input pipeline.
        data_folder: folder used for the ``incorrect.txt`` output path.
        read_func: converts the raw image batch into model input.
        from_file: forwarded to ``input_pipeline``.
        data_set: forwarded to ``input_pipeline``.
        feature: forwarded to ``input_pipeline``.
        orientations: rotations to evaluate; defaults to [0, 90, 180, 270].
    """
    total_images = 0
    if orientations is None:
        orientations = [0, 90, 180, 270]

    # NOTE(review): `data_folder_loc` is not a parameter of this function and
    # must come from module scope - confirm it should not be `data_folder`.
    images, labels, tags = input_pipeline(
        data_folder_loc, max_parallel_calcs, data_set=data_set,
        feature=feature, num_images=None, binary_file=False,
        orientations=orientations, from_file=from_file, num_epochs=1)

    # Growing, non-trainable variables that collect the misclassified image
    # names and labels across batches.
    incorrect_images_list = tf.Variable(
        [], dtype=tf.string, trainable=False, name="Incorrect_images")
    adder_image_names = tf.placeholder(
        dtype=tf.string, shape=[None], name="Adder_images")
    new_incorrect_images_list = tf.concat(
        0, [incorrect_images_list, adder_image_names])
    add_incorrect_images = tf.assign(
        incorrect_images_list, new_incorrect_images_list,
        use_locking=True, validate_shape=False)

    incorrect_labels_list = tf.Variable(
        [], dtype=tf.int32, trainable=False, name="Incorrect_image_labels")
    adder_image_labels = tf.placeholder(
        dtype=tf.int32, shape=[None], name="Adder_image_labels")
    new_incorrect_labels_list = tf.concat(
        0, [incorrect_labels_list, adder_image_labels])
    add_incorrect_labels = tf.assign(
        incorrect_labels_list, new_incorrect_labels_list,
        use_locking=True, validate_shape=False)

    init_ops = tf.group(tf.local_variables_initializer(),
                        tf.global_variables_initializer())
    sess.run(init_ops)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    steps = 0
    try:
        print("Checking Accuracy")
        while not coord.should_stop():
            steps += 1
            raw_imgs_list, labels_list, tags_list = sess.run(
                [images, labels, tags])
            imgs_list = read_func(raw_imgs_list)
            preds = sess.run(
                model.correct_predictions,
                feed_dict={model.inputs: imgs_list,
                           model.testy: labels_list,
                           model.keep_probs: 1})
            total_images += len(preds)
            incorrect_indices = np.where(preds == 0)

            # Uses locking so we do not lose any incorrect classifications
            sess.run(add_incorrect_images,
                     feed_dict={adder_image_names:
                                tags_list[incorrect_indices]})
            sess.run(add_incorrect_labels,
                     feed_dict={adder_image_labels:
                                labels_list[incorrect_indices]})

            if steps % 100 == 0:
                print("Calculated " + str(steps*max_parallel_calcs) + " files")
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()
        coord.join(threads)

    inc_name = sess.run(incorrect_images_list)
    inc_label = sess.run(incorrect_labels_list)

    num_correct = total_images - len(inc_name)
    # Force float division so Python 2 does not truncate, and report NaN
    # rather than raising ZeroDivisionError when no image was evaluated.
    if total_images:
        accuracy = float(num_correct) / total_images
    else:
        accuracy = float('nan')

    print("Correct classifications: " + str(num_correct))
    print("Total images: " + str(total_images))
    print("Accuracy: " + str(accuracy))

    with open(os.path.join(data_folder, "incorrect.txt"), 'w') as f:
        for name, label in zip(inc_name, inc_label):
            # The label is written multiplied by 90.
            f.write(os.path.join(data_folder, name.decode('utf-8')) +
                    ', ' + str(label*90) + '\n')

    sess.close()
def test_input_pipeline(self):
    """One epoch over 100 images in batches of 10 must yield 10 batches."""
    Xs, Ys = dsu.tiny_imagenet_load()
    batch_size = 10
    image_shape = (64, 64, 3)
    batches_seen = 0

    with tf.Graph().as_default(), tf.Session() as sess:
        batch_generator = dsu.create_input_pipeline(
            Xs[:100],
            batch_size=batch_size,
            n_epochs=1,
            shape=image_shape,
            crop_shape=image_shape)
        sess.run(tf.group(tf.global_variables_initializer(),
                          tf.local_variables_initializer()))

        coord = tf.train.Coordinator()
        # Freeze the graph so the loop below cannot add ops to it.
        tf.get_default_graph().finalize()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            while not coord.should_stop():
                batch = sess.run(batch_generator)
                assert (batch.shape == (batch_size, 64, 64, 3))
                batches_seen += 1
        except tf.errors.OutOfRangeError:
            # Expected once the single epoch has been consumed.
            pass
        finally:
            coord.request_stop()
            coord.join(threads)

    assert (batches_seen == 10)
def main():
    """Read key/value records through a checkpointing queue and print them.

    File names are routed through a ``CheckpointQueueRunner`` into a second
    queue, batched, parsed with ``StandardKvParser``, and three parsed
    batches are printed.
    """
    input_files = ["data/mini/part-0", "data/mini/part-1"]
    filelist = tf.train.match_filenames_once(input_files)
    filename_queue = tf.train.string_input_producer(
        filelist, shared_name='input_file_name_queue', num_epochs=5)

    # File names pass through a second queue fed by the checkpoint-aware
    # queue runner.
    new_filename_queue = tf.FIFOQueue(1, tf.string)
    runner = CheckpointQueueRunner(filename_queue, new_filename_queue, 3)
    tf.train.add_queue_runner(runner)

    reader = user_ops.SmStandardKvReader("[dat]", "[common]")
    file_name, record = reader.read(new_filename_queue)
    batch_record = tf.train.batch(
        [record],
        batch_size=2,
        num_threads=5,
        capacity=5000,
        allow_smaller_final_batch=True)

    init_op = [tf.local_variables_initializer(),
               tf.global_variables_initializer()]

    separator = "*" * 40
    with tf.Session() as sess:
        # otherwise FIFOQueue will be closed before read
        sess.run(init_op)
        print(separator)
        print(sess.run(filelist))
        print(separator)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        standard_kv_parser = lib_parser.StandardKvParser(
            batch_record,
            "data/mini/conf/input_schema.json",
            "data/mini/conf/parse_schema.json")
        tensor_dict = standard_kv_parser.get_tensor_dict()

        for _ in range(3):
            print(sess.run(tensor_dict))

        coord.request_stop()
        coord.join(threads)
def main(output_dir, summaries_every, num_steps):
    """Train on a fixed 4-example XOR-style batch and log summaries.

    Args:
        output_dir: directory the summary writer logs to.
        summaries_every: a summary is written when the global step is a
            multiple of this value.
        num_steps: training stops once the global step reaches this value.
    """
    graph = tf.Graph()
    with graph.as_default():
        features = tf.placeholder(tf.float32, shape=[4, 2])
        labels = tf.placeholder(tf.int32, shape=[4])
        train_op, loss, gs, update_acc = make_graph(features, labels)
        init = tf.global_variables_initializer()
        init_local = tf.local_variables_initializer()
        summary_op = tf.summary.merge_all()

    writer = tf.summary.FileWriter(output_dir, graph=graph, flush_secs=1)
    try:
        with tf.Session(graph=graph) as sess:
            init.run()
            init_local.run()
            step = 0

            # np.float was removed in NumPy 1.24; it was an alias of the
            # builtin float, i.e. float64.
            xy = np.array([
                [True, False],
                [True, True],
                [False, False],
                [False, True]
            ], dtype=np.float64)
            y_ = np.array([True, False, False, True], dtype=np.int32)

            while step < num_steps:
                # `step` is refreshed from the graph's global-step tensor.
                _, _, step, loss_value, summaries = sess.run(
                    [train_op, update_acc, gs, loss, summary_op],
                    feed_dict={features: xy, labels: y_}
                )
                if step % summaries_every == 0:
                    writer.add_summary(summaries, global_step=step)
    finally:
        # Flush pending events and release the file handle.
        writer.close()
def execute_cpu(self, graph_fn, inputs):
    """Constructs the graph, executes it on CPU and returns the result.

    Args:
      graph_fn: a callable that constructs the tensorflow graph to test. The
        arguments of this function should correspond to `inputs`.
      inputs: a list of numpy arrays to feed input to the computation graph.

    Returns:
      A list of numpy arrays or a scalar returned from executing the
      tensorflow graph.
    """
    with self.test_session(graph=tf.Graph()) as sess:
        placeholders = [
            tf.placeholder_with_default(value, value.shape)
            for value in inputs
        ]
        results = graph_fn(*placeholders)

        initializers = [tf.global_variables_initializer(),
                        tf.tables_initializer(),
                        tf.local_variables_initializer()]
        sess.run(initializers)

        feed = dict(zip(placeholders, inputs))
        materialized_results = sess.run(results, feed_dict=feed)

        # Unwrap a one-element list/tuple so callers get the bare value.
        is_singleton_sequence = (
            hasattr(materialized_results, '__len__')
            and len(materialized_results) == 1
            and isinstance(materialized_results, (list, tuple)))
        if is_singleton_sequence:
            materialized_results = materialized_results[0]
    return materialized_results
def testSummariesAreFlushedToDiskWithoutGlobalStep(self):
    """Summaries must reach disk even when no global step exists."""
    output_dir = os.path.join(self.get_temp_dir(), 'flush_test_no_global_step')
    if tf.gfile.Exists(output_dir):  # For running on jenkins.
        tf.gfile.DeleteRecursively(output_dir)

    names_to_metrics, names_to_updates = self._create_names_to_metrics(
        self._predictions, self._labels)

    for k in names_to_metrics:
        v = names_to_metrics[k]
        tf.summary.scalar(k, v)

    # tf.train.SummaryWriter was removed in TensorFlow 1.0. FileWriter is
    # its replacement and belongs to the same tf.summary API already used
    # above.
    summary_writer = tf.summary.FileWriter(output_dir)

    initial_op = tf.group(tf.global_variables_initializer(),
                          tf.local_variables_initializer())
    eval_op = tf.group(*names_to_updates.values())

    with self.test_session() as sess:
        slim.evaluation.evaluation(
            sess,
            initial_op=initial_op,
            eval_op=eval_op,
            summary_op=tf.summary.merge_all(),
            summary_writer=summary_writer)

        names_to_values = {name: names_to_metrics[name].eval()
                           for name in names_to_metrics}
    self._verify_summaries(output_dir, names_to_values)
def test_batch_text_lines(self):
    """Five lines read in batches of three give [A, B, C], then [D, E]."""
    gfile.Glob = self._orig_glob
    filename = self._create_temp_file("A\nB\nC\nD\nE\n")

    with tf.Graph().as_default() as g, self.test_session(graph=g) as session:
        inputs = tf.contrib.learn.io.read_batch_examples(
            [filename],
            3,
            reader=tf.TextLineReader,
            randomize_input=False,
            num_epochs=1,
            queue_capacity=10,
            read_batch_size=10,
            name="my_batch")
        self.assertAllEqual((None,), inputs.get_shape().as_list())
        session.run(tf.local_variables_initializer())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(session, coord=coord)

        first_batch = session.run(inputs)
        self.assertAllEqual(first_batch, [b"A", b"B", b"C"])
        second_batch = session.run(inputs)
        self.assertAllEqual(second_batch, [b"D", b"E"])

        # The single epoch is exhausted, so a further read must fail.
        with self.assertRaises(errors.OutOfRangeError):
            session.run(inputs)

        coord.request_stop()
        coord.join(threads)
def run():
    """Read 1000 (image, label) examples from the TFRecords in ``data_path``.

    Returns:
        List of ``(image_bytes_as_uint8, label)`` tuples.
    """
    with tf.Session() as sess:
        print("start")
        feature_spec = {
            'image': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
        }

        # Queue of input file names, consumed for a single epoch.
        print(data_path)
        filename_queue = tf.train.string_input_producer(
            data_path, num_epochs=1)

        # Read the next serialized record and decode it.
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)
        parsed = tf.parse_single_example(
            serialized_example, features=feature_spec)

        # Image bytes become uint8 values; the label is cast to int32.
        image = tf.decode_raw(parsed['image'], tf.uint8)
        label = tf.cast(parsed['label'], tf.int32)

        sess.run([tf.global_variables_initializer(),
                  tf.local_variables_initializer()])

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        train_list = [tuple(sess.run([image, label])) for _ in range(1000)]

        coord.request_stop()
        coord.join(threads)
        return train_list
def test_keyed_read_text_lines(self):
    """Each line is returned with a ``<filename>:<line number>`` key."""
    gfile.Glob = self._orig_glob
    filename = self._create_temp_file("ABC\nDEF\nGHK\n")
    key_prefix = filename.encode("utf-8")

    with tf.Graph().as_default() as g, self.test_session(graph=g) as session:
        keys, inputs = tf.contrib.learn.io.read_keyed_batch_examples(
            filename,
            1,
            reader=tf.TextLineReader,
            randomize_input=False,
            num_epochs=1,
            queue_capacity=5,
            name="my_batch")
        self.assertAllEqual((None,), keys.get_shape().as_list())
        self.assertAllEqual((None,), inputs.get_shape().as_list())
        session.run(tf.local_variables_initializer())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(session, coord=coord)

        # Lines come back in file order, keyed by 1-based line number.
        expected_lines = [b"ABC", b"DEF", b"GHK"]
        for line_no, line in enumerate(expected_lines, 1):
            expected_key = key_prefix + (":%d" % line_no).encode("utf-8")
            self.assertAllEqual(session.run([keys, inputs]),
                                [[expected_key], [line]])

        # The single epoch is exhausted, so a further read must fail.
        with self.assertRaises(errors.OutOfRangeError):
            session.run(inputs)

        coord.request_stop()
        coord.join(threads)
def blend_images(data_folder1, data_folder2, out_folder, alpha=.5):
    """Blend each image in ``data_folder1`` with its source in ``data_folder2``.

    For every ``*.jpg`` under ``data_folder1`` the matching file in
    ``data_folder2`` is resized so its longer side is 224, rotated by the
    count taken from the file name, cropped or padded to 224x224, blended
    with the first image and saved under ``out_folder``.

    Args:
        data_folder1: root folder of the images to walk.
        data_folder2: root folder holding the second image of each pair.
        out_folder: root folder for the blended output.
        alpha: blend factor passed to ``Image.blend``.
    """
    filename_queue = tf.placeholder(dtype=tf.string)
    label = tf.placeholder(dtype=tf.int32)
    tensor_image = tf.read_file(filename_queue)

    image = tf.image.decode_jpeg(tensor_image, channels=3)

    # Scale factor that maps the longer image side to 224.
    multiplier = tf.div(
        tf.constant(224, tf.float32),
        tf.cast(tf.maximum(tf.shape(image)[0], tf.shape(image)[1]),
                tf.float32))
    x = tf.cast(
        tf.round(tf.mul(tf.cast(tf.shape(image)[0], tf.float32), multiplier)),
        tf.int32)
    y = tf.cast(
        tf.round(tf.mul(tf.cast(tf.shape(image)[1], tf.float32), multiplier)),
        tf.int32)
    image = tf.image.resize_images(image, [x, y])

    image = tf.image.rot90(image, k=label)
    image = tf.image.resize_image_with_crop_or_pad(image, 224, 224)

    sess = tf.Session()
    try:
        sess.run(tf.local_variables_initializer())
        for root, folders, files in os.walk(data_folder1):
            for each in files:
                if each.find('.jpg') >= 0:
                    img1 = Image.open(os.path.join(root, each))
                    # The name is split on "-": field 1 is the rotation
                    # count, the last field is the file name in
                    # data_folder2.
                    img2_path = os.path.join(
                        root.replace(data_folder1, data_folder2),
                        each.split("-")[-1])
                    rotation = int(each.split("-")[1])
                    img2 = sess.run(
                        image,
                        feed_dict={filename_queue: img2_path,
                                   label: rotation})
                    # Round-trip through a temp file to get a PIL image.
                    temp_path = os.path.join(os.getcwd(), "temp", "temp.jpg")
                    imsave(temp_path, img2)
                    img2 = Image.open(temp_path)
                    out_image = Image.blend(img1, img2, alpha)
                    outfile = os.path.join(
                        root.replace(data_folder1, out_folder), each)
                    if not os.path.exists(os.path.split(outfile)[0]):
                        os.makedirs(os.path.split(outfile)[0])
                    out_image.save(outfile)
                else:
                    print(each)
    finally:
        # Release the session even if an image fails to load or decode.
        sess.close()
def testRoundtrip(self, rate=0.25, count=5, n=500):
    """Tests `resample(x, weights)` and resample(resample(x, rate), 1/rate)`."""
    foo = self.get_values(count)
    bar = self.get_values(count)
    weights = self.get_weights(count)

    resampled_in, rates = tf.contrib.training.weighted_resample(
        [foo, bar], tf.constant(weights), rate, seed=123)
    resampled_back_out = tf.contrib.training.resample_at_rate(
        resampled_in, 1.0 / rates, seed=456)

    init = tf.local_variables_initializer()
    with self.test_session() as s:
        s.run(init)  # initialize

        # Tally how often each value appears after one and two resamplings.
        counts_resampled = collections.Counter()
        counts_reresampled = collections.Counter()
        for _ in range(n):
            once, twice = s.run([resampled_in, resampled_back_out])

            # Both input tensors must be resampled identically.
            self.assertAllEqual(once[0], once[1])
            self.assertAllEqual(twice[0], twice[1])

            counts_resampled.update(once[0])
            counts_reresampled.update(twice[0])

        # assert that resampling worked as expected
        self.assert_expected(weights, rate, counts_resampled, n)

        # and that re-resampling gives the approx identity.
        uniform_weights = [1.0 for _ in weights]
        self.assert_expected(uniform_weights, 1.0, counts_reresampled, n,
                             abs_delta=0.1 * n * count)
def test(self):
    """Evaluate a saved checkpoint and print accuracy, mean IoU and per-class IoU.

    Initialises all variables, loads the checkpoint
    ``<conf.modeldir>/model.ckpt-<conf.valid_step>``, then runs
    ``conf.valid_num_steps`` evaluation steps while summing the per-step
    confusion matrices. The queue-runner threads are stopped and joined even
    if evaluation raises.
    """
    self.test_setup()

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(tf.local_variables_initializer())

    # Load checkpoint.
    checkpointfile = (
        self.conf.modeldir + '/model.ckpt-' + str(self.conf.valid_step))
    self.load(self.loader, checkpointfile)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=self.coord, sess=self.sess)

    try:
        # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
        confusion_matrix = np.zeros(
            (self.conf.num_classes, self.conf.num_classes), dtype=int)
        for step in range(self.conf.valid_num_steps):
            # The update ops are fetched so that the streaming metrics
            # accumulate; only the confusion matrix is used directly here.
            _, _, _, c_matrix = self.sess.run(
                [self.pred, self.accu_update_op, self.mIou_update_op,
                 self.confusion_matrix])
            confusion_matrix += c_matrix
            if step % 100 == 0:
                print('step {:d}'.format(step))
        print('Pixel Accuracy: {:.3f}'.format(
            self.accu.eval(session=self.sess)))
        print('Mean IoU: {:.3f}'.format(self.mIoU.eval(session=self.sess)))
        self.compute_IoU_per_class(confusion_matrix)
    finally:
        # Always shut the input threads down, even on failure.
        self.coord.request_stop()
        self.coord.join(threads)
def __init__(self, WindowSize=5, FeaturesDimension=41):
    """Load normalisation statistics, build the VAD graph and restore its weights.

    Args:
        WindowSize: Forwarded to ``Configuration``.
        FeaturesDimension: Forwarded to ``Configuration``.
    """
    # All bundled resources live in the ``data`` folder next to this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))

    # Feature normalisation values.
    self.mean_vect = np.load(
        os.path.join(module_dir, 'data/TrainingSetMean.npy'))
    self.stdev_vect = np.load(
        os.path.join(module_dir, 'data/TrainingSetStDev.npy'))

    # TF graph initialisation.
    self.config = Configuration(WindowSize, FeaturesDimension)
    self.graph = tf.Graph()

    with self.graph.as_default():
        self.feat = tf.placeholder(
            dtype=tf.float32,
            shape=[1, self.config.audio_feat_dimension])

        with tf.variable_scope('model'):
            model = VAD_DNN.Model(self.feat, self.config)
            logits_prob = model.softmax

        # The speech probability is the first column of the softmax output,
        # so keep only that column.
        self.speech_prob = tf.slice(logits_prob, [0, 0], [-1, 1])

        init_op = tf.local_variables_initializer()
        saver = tf.train.Saver()

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    self.session = tf.Session(graph=self.graph, config=session_config)

    self.session.run(init_op)
    saver.restore(
        self.session,
        os.path.join(module_dir, "data/datamean_nodeltas_model_epoch13.ckpt"))
def initialize_variables(sess, saver, logdir, checkpoint=None, resume=None):
    """Initialize all variables, then restore a checkpoint when one applies.

    A checkpoint is only looked up and restored when `logdir` is given. An
    explicit `checkpoint` name is resolved relative to `logdir`; otherwise the
    most recent checkpoint recorded in `logdir` is used.

    Args:
      sess: Session to initialize variables in.
      saver: Saver to restore variables.
      logdir: Directory to search for checkpoints.
      checkpoint: Specify what checkpoint name to use; defaults to most recent.
      resume: Whether to expect recovering a checkpoint or starting a new run.

    Raises:
      ValueError: If resume expected but no log directory specified.
      RuntimeError: If no resume expected but a checkpoint was found.
    """
    init_op = tf.group(
        tf.local_variables_initializer(),
        tf.global_variables_initializer())
    sess.run(init_op)

    if resume and not (logdir or checkpoint):
        raise ValueError('Need to specify logdir to resume a checkpoint.')

    # Without a log directory there is nothing to search or restore.
    if not logdir:
        return

    state = tf.train.get_checkpoint_state(logdir)
    if checkpoint:
        checkpoint = os.path.join(logdir, checkpoint)
    elif state and state.model_checkpoint_path:
        checkpoint = state.model_checkpoint_path

    if not checkpoint:
        return

    # `resume is False` (as opposed to None) means a fresh run was requested.
    if resume is False:
        message = 'Found unexpected checkpoint when starting a new run.'
        raise RuntimeError(message)

    saver.restore(sess, checkpoint)
def __call__(self, inputs, training):
    """Build the ResNet classifier for `inputs`, run it once and return the result.

    Besides adding the classification ops to the graph, this method prints
    diagnostic information, runs the global and the local variable
    initializers in `self.sess`, and evaluates the logits with
    `self.sess.run`.

    NOTE(review): every call re-runs the variable initializers, and the
    logits are evaluated without a feed_dict, so `inputs` presumably must not
    depend on unfed placeholders -- confirm against the callers.

    Args:
      inputs: A Tensor representing a batch of input images.
      training: A boolean. Set to True to add operations required only when
        training the classifier.

    Returns:
      The value produced by `self.sess.run` for the logits, i.e. the
      evaluated output of the final dense layer with `self.num_classes`
      units (not a symbolic Tensor).
    """
    # Diagnostic output describing the model configuration.
    print("Resnet Version={}".format(self.resnet_version))
    print("data Format={}".format(self.data_format))
    print()

    with self._model_variable_scope():
        # with tf.variable_scope('resnet_model'):
        if self.data_format == 'channels_first':
            # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
            # This provides a large performance boost on GPU. See
            # https://www.tensorflow.org/performance/performance_guide#data_formats
            inputs = tf.transpose(inputs, [0, 3, 1, 2])

        inputs = conv2d_fixed_padding(inputs=inputs, filters=self.num_filters,
                                      kernel_size=self.kernel_size,
                                      strides=self.conv_stride,
                                      data_format=self.data_format)
        inputs = tf.identity(inputs, 'initial_conv')

        # We do not include batch normalization or activation functions in V2
        # for the initial conv1 because the first ResNet unit will perform these
        # for both the shortcut and non-shortcut paths as part of the first
        # block's projection. Cf. Appendix of [2].
        if self.resnet_version == 1:
            inputs = batch_norm(inputs, training, self.data_format)
            inputs = tf.nn.relu(inputs)

        # The initial max pooling is optional and skipped when
        # first_pool_size is falsy.
        if self.first_pool_size:
            inputs = tf.layers.max_pooling2d(
                inputs=inputs, pool_size=self.first_pool_size,
                strides=self.first_pool_stride, padding='SAME',
                data_format=self.data_format)
            inputs = tf.identity(inputs, 'initial_max_pool')

        # Stack the block layers; the filter count doubles with each layer.
        for i, num_blocks in enumerate(self.block_sizes):
            num_filters = self.num_filters * (2**i)
            inputs = block_layer(inputs=inputs, filters=num_filters,
                                 bottleneck=self.bottleneck,
                                 block_fn=self.block_fn, blocks=num_blocks,
                                 strides=self.block_strides[i],
                                 training=training,
                                 name='block_layer{}'.format(i + 1),
                                 data_format=self.data_format)

        # Only apply the BN and ReLU for model that does pre_activation in each
        # building/bottleneck block, eg resnet V2.
        if self.pre_activation:
            inputs = batch_norm(inputs, training, self.data_format)
            inputs = tf.nn.relu(inputs)

        # The current top layer has shape
        # `batch_size x pool_size x pool_size x final_size`.
        # ResNet does an Average Pooling layer over pool_size,
        # but that is the same as doing a reduce_mean. We do a reduce_mean
        # here because it performs better than AveragePooling2D.
        axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
        inputs = tf.reduce_mean(inputs, axes, keepdims=True)
        inputs = tf.identity(inputs, 'final_reduce_mean')

        # Drop the two averaged axes before the final dense layer.
        inputs = tf.squeeze(inputs, axes)
        inputs = tf.layers.dense(inputs=inputs, units=self.num_classes)
        inputs = tf.identity(inputs, 'final_dense')

    # Diagnostic output describing the finished graph.
    print("End of __call")
    print("Number of classes {}".format(self.num_classes))
    print("Out shape {}".format(inputs.shape))
    print()
    print("len of trainble variables")
    tvar = [v for v in tf.trainable_variables()]
    print(len(tvar))

    # Initialize every variable in the graph, then evaluate the logits so
    # that a concrete value (rather than a Tensor) is returned.
    tf_init_g = tf.global_variables_initializer()
    tf_init_l = tf.local_variables_initializer()
    self.sess.run(tf_init_g)
    self.sess.run(tf_init_l)
    inputs = self.sess.run(inputs)
    return inputs
def main(unused_argv):
    """Train the classifier returned by `model_function` on a TFRecord dataset.

    Builds the graph (loss, accuracy, RMSProp with exponentially decayed
    learning rate), restores `FLAGS.ckpt` when given or initializes the
    variables otherwise, and then loops: one optimisation step per iteration,
    a console report every 10 steps, a TensorBoard summary every 50 steps, a
    validation batch every 100 steps and a checkpoint every 500 steps.

    The data/checkpoint/log locations are hard-coded per operating system;
    on any system other than Windows or Linux the function returns early.

    Args:
        unused_argv: Ignored.
    """
    with tf.Graph().as_default():
        # Weight of the regularisation term in the loss.
        beta = 1e-5

        # Machine-specific base directory.
        if platform.system() == 'Windows':
            print('Running on Windows')
            base_dir = os.path.join('E:\\', 'Program', 'Bite')
        elif platform.system() == 'Linux':
            print('Running on Linux')
            base_dir = os.path.join('/media', 'md0', 'xt1800i', 'Bite')
        else:
            print('Running on unsupported system')
            return

        tfrecord = os.path.join(base_dir, 'datasets', 'tfrecord', f'{FLAGS.training_file}.tfrecord')
        ckpt_dir = os.path.join(base_dir, 'ckpt')

        # Both iterators read the same TFRecord file; only the training one
        # is created with aug=True.
        training_set = tfdata_generator(filename=tfrecord, batch_size=FLAGS.batch_size, aug=True).make_one_shot_iterator()
        validation_set = tfdata_generator(filename=tfrecord, batch_size=FLAGS.batch_size).make_one_shot_iterator()

        x_train = tf.placeholder(dtype=tf.float32, shape=[None, 299, 299, 3])
        # NOTE(review): the argmax below suggests 7-way one-hot labels -- confirm.
        y_label = tf.placeholder(dtype=tf.int32, shape=[None, 7])
        outputs = model_function(x_train)

        with tf.name_scope('loss'):
            # Summed (not averaged) cross entropy over the batch.
            cross_entropy = tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=tf.argmax(y_label, 1)))
            # Sum of everything registered in the "losses" collection.
            regularize = tf.add_n(tf.get_collection("losses"))
            # for var in tf.trainable_variables():
            #     print(var.name)
            # loss = tf.reduce_mean(cross_entropy + beta * regularize)
            loss = tf.add(cross_entropy, tf.multiply(beta, regularize))
            tf.summary.scalar('loss', loss)

        with tf.name_scope('accuracy'):
            correct_pred = tf.equal(tf.argmax(outputs, 1), tf.argmax(y_label, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            tf.summary.scalar('accuracy', accuracy)

        global_step = tf.Variable(0, trainable=False)
        # Staircase decay: the rate is multiplied by 0.96 every
        # int(num_image / batch_size) steps.
        learning_rate = tf.train.exponential_decay(learning_rate=0.045, global_step=global_step, staircase=True,
                                                   decay_steps=int(FLAGS.num_image / FLAGS.batch_size),
                                                   decay_rate=0.96)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss=loss, global_step=global_step)

        # NOTE(review): `variables` is only referenced by the commented-out
        # debugging code below.
        variables = tf.trainable_variables()
        # gradients = tf.gradients(loss,variables)
        # print(gradients)

        saver = tf.train.Saver()

        with tf.Session() as sess:
            merge = tf.summary.merge_all()
            writer = tf.summary.FileWriter(os.path.join(base_dir, 'logs'), sess.graph)

            if FLAGS.ckpt is not None:
                print("restore ckpt . . .")
                saver.restore(sess, os.path.join(ckpt_dir, FLAGS.ckpt))
            else:
                print("new trainer . . .")
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

            train_next_element = training_set.get_next()
            val_next_element = validation_set.get_next()

            # NOTE(review): `np` is not used anywhere in this function.
            import numpy as np

            # Accuracy accumulated since the last console report.
            train_acc = 0
            # NOTE(review): the loop has no exit condition of its own; it
            # presumably ends when an iterator is exhausted and sess.run
            # raises -- confirm whether tfdata_generator repeats the data.
            while True:
                start = time.time()
                batch_image, batch_label = sess.run(train_next_element)
                _, i, a,l = sess.run([train_op, global_step, accuracy,loss],
                                     feed_dict={x_train: batch_image, y_label: batch_label})
                train_acc += a
                print('--------------------------------------')
                # print(L2)
                # print(v[0][0][0][0][0:3])
                # print(g[0][0][0][0][0:3])

                # Console report: accuracy averaged over the last 10 steps.
                if i % 10 == 0:
                    print(f'time = {time.time() - start}, iterator = {i}, Loss = {l}, Acc = {train_acc/10}')
                    train_acc =0

                # TensorBoard summary, computed on the current training batch.
                if i % 50 == 0:
                    rs = sess.run(merge, feed_dict={x_train: batch_image, y_label: batch_label})
                    writer.add_summary(rs, i)

                # Evaluate a single validation batch.
                if i % 100 == 0:
                    val_batch_image, val_batch_label = sess.run(val_next_element)
                    val_loss, val_acc = sess.run(
                        [loss, accuracy], feed_dict={x_train: val_batch_image, y_label: val_batch_label})
                    print(f'iterator= {i}, val_Loss = {val_loss}, val_Acc ={val_acc}')

                # Periodic checkpoint.
                if i % 500 == 0:
                    saver.save(sess, os.path.join(ckpt_dir, f'model-{i}.ckpt'))
def initialize(self, config, num_classes=None):
    ''' Initialize the graph from scratch according to config.

    Creates the input placeholders, builds the encoder/decoder generator and
    the discriminator, assembles the generator and discriminator losses,
    creates a single training op that updates both, registers TensorBoard
    summaries, initializes all variables in `self.sess` and creates
    `self.saver`.

    Args:
        config: Configuration object; this method reads `image_size`,
            `channels`, `coef_adv`, `coef_patch_adv` and `coef_idt` and
            passes it to `self.setup_network_model`.
        num_classes: Forwarded to `self.setup_network_model` and
            `self.cls_adv_loss`.
    '''
    with self.graph.as_default():
        with self.sess.as_default():
            # Set up placeholders
            h, w = config.image_size
            channels = config.channels
            self.images_A = tf.placeholder(tf.float32, shape=[None, h, w, channels], name='images_A')
            self.images_B = tf.placeholder(tf.float32, shape=[None, h, w, channels], name='images_B')
            self.labels_A = tf.placeholder(tf.int32, shape=[None], name='labels_A')
            self.labels_B = tf.placeholder(tf.int32, shape=[None], name='labels_B')
            self.scales_A = tf.placeholder(tf.float32, shape=[None], name='scales_A')
            self.scales_B = tf.placeholder(tf.float32, shape=[None], name='scales_B')
            self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
            self.phase_train = tf.placeholder(tf.bool, name='phase_train')
            self.global_step = tf.Variable(0, trainable=False, dtype=tf.int32, name='global_step')

            # NOTE(review): presumably defines self.encoder, self.decoder,
            # self.discriminator, self.G_vars and self.D_vars used below --
            # confirm in setup_network_model.
            self.setup_network_model(config, num_classes)

            # Build generator
            encode_A, styles_A = self.encoder(self.images_A)
            encode_B, styles_B = self.encoder(self.images_B)
            # render_BA is returned by the decoder but not used below.
            deform_BA, render_BA, ldmark_pred, ldmark_diff = self.decoder(
                encode_B, self.scales_B, None)
            render_AA = self.decoder(encode_A, self.scales_A, styles_A, texture_only=True)
            render_BB = self.decoder(encode_B, self.scales_B, styles_B, texture_only=True)

            # Expose the generator outputs under fixed tensor names.
            self.styles_A = tf.identity(styles_A, name='styles_A')
            self.styles_B = tf.identity(styles_B, name='styles_B')
            self.deform_BA = tf.identity(deform_BA, name='deform_BA')
            self.ldmark_pred = tf.identity(ldmark_pred, name='ldmark_pred')
            self.ldmark_diff = tf.identity(ldmark_diff, name='ldmark_diff')

            # Build discriminator for the input images and for deform_BA
            patch_logits_A, logits_A = self.discriminator(self.images_A)
            patch_logits_B, logits_B = self.discriminator(self.images_B)
            patch_logits_BA, logits_BA = self.discriminator(deform_BA)

            # Show images in TensorBoard: the first sample of each
            # (input, output) pair, arranged by self.image_grid
            image_grid_A = tf.stack([self.images_A, render_AA], axis=1)[:1]
            image_grid_B = tf.stack([self.images_B, render_BB], axis=1)[:1]
            image_grid_BA = tf.stack([self.images_B, deform_BA], axis=1)[:1]
            image_grid = tf.concat(
                [image_grid_A, image_grid_B, image_grid_BA], axis=0)
            image_grid = tf.reshape(image_grid, [-1] + list(self.images_A.shape[1:]))
            image_grid = self.image_grid(image_grid, (3, 2))
            tf.summary.image('image_grid', image_grid)

            # Build all losses
            self.watch_list = {}
            loss_list_G = []
            loss_list_D = []

            # Adversarial loss for deform_BA
            loss_D, loss_G = self.cls_adv_loss(logits_A, logits_B, logits_BA,
                                               self.labels_A, self.labels_B, self.labels_B, num_classes)
            loss_D, loss_G = config.coef_adv * loss_D, config.coef_adv * loss_G
            self.watch_list['LDg'] = loss_D
            self.watch_list['LGg'] = loss_G
            loss_list_D.append(loss_D)
            loss_list_G.append(loss_G)

            # Patch adversarial loss for deform_BA
            loss_D, loss_G = self.patch_adv_loss(patch_logits_A, patch_logits_B, patch_logits_BA)
            loss_D, loss_G = config.coef_patch_adv * loss_D, config.coef_patch_adv * loss_G
            self.watch_list['LDp'] = loss_D
            self.watch_list['LGp'] = loss_G
            loss_list_D.append(loss_D)
            loss_list_G.append(loss_G)

            # Identity mapping (reconstruction) loss: mean absolute error
            # between each texture-only rendering and its input image
            loss_idt_A = tf.reduce_mean(tf.abs(render_AA - self.images_A), name='idt_loss_A')
            loss_idt_A = config.coef_idt * loss_idt_A
            loss_idt_B = tf.reduce_mean(tf.abs(render_BB - self.images_B), name='idt_loss_B')
            loss_idt_B = config.coef_idt * loss_idt_B
            self.watch_list['idtA'] = loss_idt_A
            self.watch_list['idtB'] = loss_idt_B
            loss_list_G.append(loss_idt_A + loss_idt_B)

            # Collect all losses; the regularization loss is added to both
            # the generator and the discriminator totals
            reg_loss = tf.reduce_sum(tf.get_collection(
                tf.GraphKeys.REGULARIZATION_LOSSES), name='reg_loss')
            self.watch_list['reg_loss'] = reg_loss
            loss_list_G.append(reg_loss)
            loss_list_D.append(reg_loss)

            loss_G = tf.add_n(loss_list_G, name='loss_G')
            grads_G = tf.gradients(loss_G, self.G_vars)

            loss_D = tf.add_n(loss_list_D, name='loss_D')
            grads_D = tf.gradients(loss_D, self.D_vars)

            # Training operators: a single grouped op applies the generator
            # and discriminator gradients, runs the collected UPDATE_OPS and
            # increments global_step
            train_ops = []
            opt_G = tf.train.AdamOptimizer(self.learning_rate, beta1=0.5, beta2=0.9)
            opt_D = tf.train.AdamOptimizer(self.learning_rate, beta1=0.5, beta2=0.9)
            apply_G_gradient_op = opt_G.apply_gradients(
                list(zip(grads_G, self.G_vars)))
            apply_D_gradient_op = opt_D.apply_gradients(
                list(zip(grads_D, self.D_vars)))
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            train_ops.extend([apply_G_gradient_op, apply_D_gradient_op] + update_ops)
            train_ops.append(tf.assign_add(self.global_step, 1))
            self.train_op = tf.group(*train_ops)

            # Collect TF summary
            for k, v in self.watch_list.items():
                tf.summary.scalar('losses/' + k, v)
            tf.summary.scalar('learning_rate', self.learning_rate)
            self.summary_op = tf.summary.merge_all()

            # Initialize variables
            self.sess.run(tf.local_variables_initializer())
            self.sess.run(tf.global_variables_initializer())
            # NOTE(review): the saver only covers trainable variables, so
            # global_step (trainable=False) is not part of the checkpoints --
            # confirm this is intended.
            self.saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=99)
def _evaluate_experiment(name, input_fn, data_input):
    """Run a trained experiment over an input pipeline and report flow metrics.

    Locates the experiment's config and checkpoint, rebuilds the flow network
    through `unsupervised_loss`, and evaluates it on the batches produced by
    `input_fn`. Depending on FLAGS, visualisations and benchmark flow files
    are written to ``../out/<name>``. The averaged scalar metrics are printed.

    Args:
        name: Experiment name; used to find the experiment directory and to
            name the output directory.
        input_fn: Callable returning the input tensors. The first three are
            the two images and the input shape; any further tensors are
            treated as ground truth (4 tensors: occluded/non-occluded flow
            with masks, 2 tensors: flow with mask, otherwise none).
        data_input: Data object providing `dims`, `_normalize_image` and
            `get_normalization()`.

    Returns:
        Tuple `(image_lists, image_names)`: the image results of the first
        `FLAGS.num_vis` iterations and the titles of the image slots.

    Raises:
        RuntimeError: If the experiment directory contains no checkpoint.
    """
    normalize_fn = data_input._normalize_image
    resized_h = data_input.dims[0]
    resized_w = data_input.dims[1]

    # Prefer the config and checkpoints stored in the experiment's log
    # directory and fall back to the global config / checkpoints directory.
    current_config = config_dict('../config.ini')
    exp_dir = os.path.join(current_config['dirs']['log'], 'ex', name)
    config_path = os.path.join(exp_dir, 'config.ini')
    if not os.path.isfile(config_path):
        config_path = '../config.ini'
    if not os.path.isdir(exp_dir) or not tf.train.get_checkpoint_state(exp_dir):
        exp_dir = os.path.join(current_config['dirs']['checkpoints'], name)
    config = config_dict(config_path)
    params = config['train']
    convert_input_strings(params, config_dict('../config.ini')['dirs'])
    dataset_params_name = 'train_' + FLAGS.dataset
    if dataset_params_name in config:
        params.update(config[dataset_params_name])
    ckpt = tf.train.get_checkpoint_state(exp_dir)
    if not ckpt:
        raise RuntimeError("Error: experiment must contain a checkpoint")
    ckpt_path = exp_dir + "/" + os.path.basename(ckpt.model_checkpoint_path)

    with tf.Graph().as_default():  # , tf.device('gpu:' + FLAGS.gpu):
        inputs = input_fn()
        im1, im2, input_shape = inputs[:3]
        truth = inputs[3:]

        height, width, _ = tf.unstack(tf.squeeze(input_shape), num=3, axis=0)
        im1 = resize_input(im1, height, width, resized_h, resized_w)
        im2 = resize_input(im2, height, width, resized_h, resized_w)  # TODO adapt train.py

        _, flow, flow_bw = unsupervised_loss(
            (im1, im2),
            normalization=data_input.get_normalization(),
            params=params, augment=False, return_flow=True)

        # Bring images and flows back to the original input resolution.
        im1 = resize_output(im1, height, width, 3)
        im2 = resize_output(im2, height, width, 3)
        flow = resize_output_flow(flow, height, width, 2)
        flow_bw = resize_output_flow(flow_bw, height, width, 2)

        flow_fw_int16 = flow_to_int16(flow)
        flow_bw_int16 = flow_to_int16(flow_bw)

        im1_pred = image_warp(im2, flow)
        im1_diff = tf.abs(im1 - im1_pred)

        if len(truth) == 4:
            flow_occ, mask_occ, flow_noc, mask_noc = truth
            flow_occ = resize_output_crop(flow_occ, height, width, 2)
            flow_noc = resize_output_crop(flow_noc, height, width, 2)
            mask_occ = resize_output_crop(mask_occ, height, width, 1)
            mask_noc = resize_output_crop(mask_noc, height, width, 1)

            # Occlusion predictions; currently not wired into image_slots.
            occ_pred = 1 - (1 - occlusion(flow, flow_bw)[0])
            def_pred = 1 - (1 - occlusion(flow, flow_bw)[1])
            disocc_pred = forward_warp(flow_bw) < DISOCC_THRESH
            disocc_fw_pred = forward_warp(flow) < DISOCC_THRESH

            # list of (image_op, title)
            image_slots = [((im1 * 0.5 + im2 * 0.5) / 255, 'overlay'),
                           (im1_diff / 255, 'brightness error'),
                           (flow_to_color(flow), 'flow'),
                           # (blue: correct, red: wrong, dark: occluded)
                           (flow_error_image(flow, flow_occ, mask_occ, mask_noc),
                            'flow error')
                           ]

            # list of (scalar_op, title)
            scalar_slots = [(flow_error_avg(flow_noc, flow, mask_noc), 'EPE_noc'),
                            (flow_error_avg(flow_occ, flow, mask_occ), 'EPE_all'),
                            (outlier_pct(flow_noc, flow, mask_noc), 'outliers_noc'),
                            (outlier_pct(flow_occ, flow, mask_occ), 'outliers_all')]
        elif len(truth) == 2:
            flow_gt, mask = truth
            flow_gt = resize_output_crop(flow_gt, height, width, 2)
            mask = resize_output_crop(mask, height, width, 1)

            image_slots = [((im1 * 0.5 + im2 * 0.5) / 255, 'overlay'),
                           (im1_diff / 255, 'brightness error'),
                           (flow_to_color(flow), 'flow'),
                           (flow_to_color(flow_gt, mask), 'gt'),
                           ]

            # list of (scalar_op, title)
            scalar_slots = [(flow_error_avg(flow_gt, flow, mask), 'EPE_all')]
        else:
            image_slots = [(im1 / 255, 'first image'),
                           (im1_diff / 255, 'warp error'),
                           (flow_to_color(flow), 'flow prediction')]
            scalar_slots = []

        num_ims = len(image_slots)
        image_ops = [t[0] for t in image_slots]
        scalar_ops = [t[0] for t in scalar_slots]
        image_names = [t[1] for t in image_slots]
        scalar_names = [t[1] for t in scalar_slots]
        all_ops = image_ops + scalar_ops

        image_lists = []
        averages = np.zeros(len(scalar_ops))
        sess_config = tf.ConfigProto(allow_soft_placement=True)

        # Start from an empty output directory whenever files will be written.
        exp_out_dir = os.path.join('../out', name)
        if FLAGS.output_visual or FLAGS.output_benchmark:
            if os.path.isdir(exp_out_dir):
                shutil.rmtree(exp_out_dir)
            os.makedirs(exp_out_dir)
            shutil.copyfile(config_path, os.path.join(exp_out_dir, 'config.ini'))

        with tf.Session(config=sess_config) as sess:
            saver = tf.train.Saver(tf.global_variables())
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            restore_networks(sess, params, ckpt, ckpt_path)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            # TODO adjust for batch_size > 1 (also need to change image_lists appending)
            max_iter = FLAGS.num if FLAGS.num > 0 else None

            try:
                num_iters = 0
                while not coord.should_stop() and (max_iter is None or num_iters != max_iter):
                    all_results = sess.run([flow, flow_bw, flow_fw_int16, flow_bw_int16] + all_ops)
                    flow_fw_res, flow_bw_res, flow_fw_int16_res, flow_bw_int16_res = all_results[:4]
                    all_results = all_results[4:]
                    image_results = all_results[:num_ims]
                    scalar_results = all_results[num_ims:]
                    iterstr = str(num_iters).zfill(6)
                    if FLAGS.output_visual:
                        path_col = os.path.join(exp_out_dir, iterstr + '_flow.png')
                        path_overlay = os.path.join(exp_out_dir, iterstr + '_img.png')
                        path_error = os.path.join(exp_out_dir, iterstr + '_err.png')
                        # NOTE(review): slot 1 is an error image and slot 2
                        # the flow colouring in every image_slots variant
                        # above, which does not match the '_flow'/'_err'
                        # file names -- confirm the intended mapping.
                        write_rgb_png(image_results[0] * 255, path_overlay)
                        write_rgb_png(image_results[1] * 255, path_col)
                        write_rgb_png(image_results[2] * 255, path_error)
                    if FLAGS.output_benchmark:
                        path_fw = os.path.join(exp_out_dir, iterstr)
                        if FLAGS.output_png:
                            write_rgb_png(flow_fw_int16_res, path_fw + '_10.png', bitdepth=16)
                        else:
                            write_flo(flow_fw_res, path_fw + '_10.flo')
                        if FLAGS.output_backward:
                            # The backward path used to be assigned to
                            # `path_fw` while `path_bw` was passed on, which
                            # raised a NameError.
                            path_bw = os.path.join(exp_out_dir, iterstr + '_01.png')
                            write_rgb_png(flow_bw_int16_res, path_bw, bitdepth=16)
                    if num_iters < FLAGS.num_vis:
                        image_lists.append(image_results)
                    averages += scalar_results
                    # Rewrite the progress line in place.
                    if num_iters > 0:
                        sys.stdout.write('\r')
                    num_iters += 1
                    sys.stdout.write("-- evaluating '{}': {}/{}"
                                     .format(name, num_iters, max_iter))
                    sys.stdout.flush()
                print()
            except tf.errors.OutOfRangeError:
                # The input pipeline is exhausted.
                pass

            averages /= num_iters

            coord.request_stop()
            coord.join(threads)

    for t, avg in zip(scalar_slots, averages):
        _, scalar_name = t
        print("({}) {} = {}".format(name, scalar_name, avg))

    return image_lists, image_names
output = network.outputs loss = tf.losses.softmax_cross_entropy(onehot_labels=tf_y, logits=output) train_op = tf.train.AdamOptimizer(learning_rate=LR).minimize(loss) accuracy = tf.metrics.accuracy( labels=tf.argmax(tf_y, axis=1), predictions=tf.argmax(output, axis=1), )[1] sess = tf.Session() init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init_op) for step in range(600): b_x, b_y = mnist.train.next_batch(BATCH_SIZE) _, loss_ = sess.run([train_op, loss], {tf_x: b_x, tf_y: b_y}) if step % 50 == 0: accuracy_ = sess.run(accuracy, {tf_x: test_x, tf_y: test_y}) # print('test accuracy: %.2f' , accuracy_) print('Step:', step, '| train loss: %.4f' % loss_,
def merge_checkpoint_file():
    """Average the trainable variables of several checkpoints and save the result.

    Every file in ``external.FLAGS.model_list`` (relative to
    ``external.FLAGS.train_dir``) is restored in turn into the same graph.
    Each trainable variable is then set to the element-wise mean of its
    values across those checkpoints, and the merged model is saved as
    ``external.FLAGS.merged_model`` in ``external.FLAGS.train_dir``.

    Raises:
        ValueError: If ``external.FLAGS.model_list`` is empty.
    """
    model_list = external.FLAGS.model_list
    if not model_list:
        raise ValueError('No checkpoints listed in FLAGS.model_list to merge.')

    dg = tf.Graph()
    with dg.as_default():
        x, y_ = mnist_dataset.placeholder_inputs(external.FLAGS.batch_size)

        # Build the graph for the deep net.
        y_conv, _ = mnist_deep.deepnn(x)
        # Not needed for the merge itself; kept from the original graph setup.
        cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tf.to_int64(y_), logits=y_conv))

        # The op for initializing the variables.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        variables = tf.trainable_variables()
        restore_saver = tf.train.Saver(var_list=variables)

        # The context manager closes the session even if a restore fails.
        with tf.Session() as sess:
            sess.run(init_op)

            # One list of variable values per checkpoint.
            multi_models = []
            for filename in model_list:
                checkpoint_file = os.path.join(external.FLAGS.train_dir, filename)
                restore_saver.restore(sess, checkpoint_file)
                multi_models.append(sess.run(variables))

            # Average variable by variable. The variables have different
            # shapes, so the per-checkpoint lists cannot be packed into a
            # single array (NumPy rejects such ragged input).
            averaged = [np.mean(values, axis=0) for values in zip(*multi_models)]

            for variable, mean_value in zip(variables, averaged):
                sess.run(variable.assign(mean_value))

            # Debug output of one merged variable, when the model has it.
            if len(variables) > 5:
                t = sess.run(variables[5])
                print(t)

            mean_ckpt_file = os.path.join(external.FLAGS.train_dir,
                                          external.FLAGS.merged_model)
            restore_saver.save(sess, mean_ckpt_file)
            print('Merged model saved.')
def validation():
    """Run one pass over ``./data/test/`` and collect the predictions per batch.

    Restores the latest checkpoint from ``FLAGS.checkpoint_dir`` when one
    exists, evaluates every batch of a single epoch, and logs per-batch and
    overall top-1/top-k accuracy.

    Returns:
        A list with one dict per batch, holding the batch file names under
        ``'filename'`` and the predicted top-k indices under ``'label'``.
    """
    print('validation')
    test_feeder = DataIterator(data_dir='./data/test/')

    final_predict_val = []
    final_predict_index = []
    groundtruth = []
    result = []

    with tf.Session() as sess:
        test_images, test_labels, test_names = test_feeder.input_pipeline(
            batch_size=FLAGS.batch_size, num_epochs=1)
        graph = build_graph(1)

        sess.run(tf.global_variables_initializer())
        # The local variables hold test_feeder's internal state.
        sess.run(tf.local_variables_initializer())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
        if ckpt:
            saver.restore(sess, ckpt)
            print("restore from the checkpoint {0}".format(ckpt))

        input_fetches = [test_images, test_labels, test_names]
        eval_fetches = [
            graph['labels'],
            graph['names'],
            graph['predicted_val_top_k'],
            graph['predicted_index_top_k'],
            graph['accuracy'],
            graph['accuracy_top_k'],
        ]

        logger.info('===Start validation===')
        try:
            i = 0
            acc_top_1, acc_top_k = 0.0, 0.0
            while not coord.should_stop():
                i += 1
                start_time = time.time()

                images_batch, labels_batch, names_batch = sess.run(input_fetches)
                feed_dict = {
                    graph['images']: images_batch,
                    graph['labels']: labels_batch,
                    graph['names']: names_batch,
                    graph['keep_prob']: 1.0,
                }
                batch_labels, batch_names, probs, indices, acc_1, acc_k = sess.run(
                    eval_fetches, feed_dict=feed_dict)

                final_predict_val.extend(probs.tolist())
                final_predict_index.extend(indices.tolist())
                groundtruth.extend(batch_labels.tolist())

                batch_record = {
                    'filename': batch_names.tolist(),
                    'label': indices.tolist(),
                }
                print(batch_record)
                result.append(batch_record)

                acc_top_1 += acc_1
                acc_top_k += acc_k
                end_time = time.time()
                logger.info(
                    "the batch {0} takes {1} seconds, accuracy = {2}(top_1) {3}(top_k)"
                    .format(i, end_time - start_time, acc_1, acc_k))
        except tf.errors.OutOfRangeError:
            # The single epoch is exhausted: turn the summed per-batch
            # accuracies into overall accuracies.
            logger.info(
                '==================Validation Finished================')
            acc_top_1 = acc_top_1 * FLAGS.batch_size / test_feeder.size
            acc_top_k = acc_top_k * FLAGS.batch_size / test_feeder.size
            logger.info('top 1 accuracy {0} top k accuracy {1}'.format(
                acc_top_1, acc_top_k))
        finally:
            coord.request_stop()
            coord.join(threads)

    return result
def train():
    """Train the model built by ``build_graph``, with periodic eval and checkpoints.

    Optionally restores the latest checkpoint from ``FLAGS.checkpoint_dir``
    (``FLAGS.restore``), then trains until the step exceeds
    ``FLAGS.max_steps`` or the input queue is exhausted. A test batch is
    evaluated whenever ``step % FLAGS.eval_steps == 1`` and a checkpoint is
    written whenever ``step % FLAGS.save_steps == 1``. Summaries go to
    ``FLAGS.log_dir + '/train'`` and ``FLAGS.log_dir + '/val'``.
    """
    print('Begin training')
    # NOTE(review): the training feeder reads from './data/test/', the same
    # directory as the test feeder -- confirm this is intended.
    train_feeder = DataIterator(data_dir='./data/test/')
    test_feeder = DataIterator(data_dir='./data/test/')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        train_images, train_labels, train_names = train_feeder.input_pipeline(
            batch_size=FLAGS.batch_size, aug=True)
        # Bound to its own name so that it no longer overwrites train_names.
        test_images, test_labels, test_names = test_feeder.input_pipeline(
            batch_size=FLAGS.batch_size)
        print(train_names)

        graph = build_graph(top_k=1)
        sess.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        saver = tf.train.Saver()

        train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/val')

        if FLAGS.restore:
            ckpt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
            if ckpt:
                saver.restore(sess, ckpt)
                print("restore from the checkpoint {0}".format(ckpt))

        logger.info('===Training Start===')
        try:
            while not coord.should_stop():
                start_time = time.time()
                train_images_batch, train_labels_batch = sess.run(
                    [train_images, train_labels])
                feed_dict = {
                    graph['images']: train_images_batch,
                    graph['labels']: train_labels_batch,
                    graph['keep_prob']: 0.8
                }
                _, loss_val, train_summary, step = sess.run(
                    [
                        graph['train_op'], graph['loss'],
                        graph['merged_summary_op'], graph['global_step']
                    ],
                    feed_dict=feed_dict)
                train_writer.add_summary(train_summary, step)
                end_time = time.time()
                logger.info("the step {0} takes {1} loss {2}".format(
                    step, end_time - start_time, loss_val))

                if step > FLAGS.max_steps:
                    break

                # Evaluate a single test batch.
                if step % FLAGS.eval_steps == 1:
                    test_images_batch, test_labels_batch = sess.run(
                        [test_images, test_labels])
                    feed_dict = {
                        graph['images']: test_images_batch,
                        graph['labels']: test_labels_batch,
                        graph['keep_prob']: 1.0
                    }
                    accuracy_test, test_summary = sess.run(
                        [graph['accuracy'], graph['merged_summary_op']],
                        feed_dict=feed_dict)
                    test_writer.add_summary(test_summary, step)
                    logger.info(
                        '===============Eval a batch=======================')
                    logger.info('the step {0} test accuracy: {1}'.format(
                        step, accuracy_test))
                    logger.info(
                        '===============Eval a batch=======================')

                # Periodic checkpoint.
                if step % FLAGS.save_steps == 1:
                    logger.info('Save the ckpt of {0}'.format(step))
                    saver.save(sess,
                               os.path.join(FLAGS.checkpoint_dir, 'model'),
                               global_step=graph['global_step'])
        except tf.errors.OutOfRangeError:
            # The input queue ran dry: save a final checkpoint.
            logger.info('==================Train Finished================')
            saver.save(sess,
                       os.path.join(FLAGS.checkpoint_dir, 'my-model'),
                       global_step=graph['global_step'])
        finally:
            coord.request_stop()
            coord.join(threads)
            # Flush pending summaries and release the event files.
            train_writer.close()
            test_writer.close()
def main():
    """Command-line entry point: configure, build and train a video prediction model.

    The function proceeds in this order:

    1. Parse the command-line flags and, when ``--seed`` is given, seed
       TensorFlow, NumPy and ``random``.
    2. Derive ``args.output_dir`` from the model name and hyperparameter
       string when it was not given explicitly.
    3. Load dataset/model hyperparameters from the JSON files named on the
       command line and, when a checkpoint is given, from ``options.json``,
       ``dataset_hparams.json`` and ``model_hparams.json`` next to it.
    4. Build the train/val datasets and the model, plus a second model for
       longer validation sequences when ``long_sequence_length`` differs
       from ``sequence_length``.
    5. Write the effective options and hyperparameters into ``output_dir``.
    6. Run the loop from step -1 up to ``max_steps - start_step``, recording
       summaries, printing progress and saving checkpoints at the
       configured frequencies.

    Raises:
        ValueError: if both ``--resume`` and ``--checkpoint`` are specified.
        FileNotFoundError: if the checkpoint directory does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir",
                        type=str,
                        required=True,
                        help="either a directory containing subdirectories "
                        "train, val, test, etc, or a directory containing "
                        "the tfrecords")
    parser.add_argument(
        "--val_input_dir",
        type=str,
        help="directories containing the tfrecords. default: input_dir")
    parser.add_argument("--logs_dir",
                        default='logs',
                        help="ignored if output_dir is specified")
    parser.add_argument(
        "--output_dir",
        help=
        "output directory where json files, summary, model, gifs, etc are saved. "
        "default is logs_dir/model_fname, where model_fname consists of "
        "information from model and model_hparams")
    parser.add_argument("--output_dir_postfix", default="")
    parser.add_argument(
        "--checkpoint",
        help=
        "directory with checkpoint or checkpoint name (e.g. checkpoint_dir/model-200000)"
    )
    parser.add_argument("--resume",
                        action='store_true',
                        help='resume from lastest checkpoint in output_dir.')
    parser.add_argument("--dataset", type=str, help="dataset class name")
    parser.add_argument(
        "--dataset_hparams",
        type=str,
        help="a string of comma separated list of dataset hyperparameters")
    parser.add_argument("--dataset_hparams_dict",
                        type=str,
                        help="a json file of dataset hyperparameters")
    parser.add_argument("--model", type=str, help="model class name")
    parser.add_argument(
        "--model_hparams",
        type=str,
        help="a string of comma separated list of model hyperparameters")
    parser.add_argument("--model_hparams_dict",
                        type=str,
                        help="a json file of model hyperparameters")
    parser.add_argument(
        "--summary_freq",
        type=int,
        default=1000,
        help=
        "save frequency of summaries (except for image and eval summaries) for train/validation set"
    )
    parser.add_argument(
        "--image_summary_freq",
        type=int,
        default=5000,
        help="save frequency of image summaries for train/validation set")
    parser.add_argument(
        "--eval_summary_freq",
        type=int,
        default=25000,
        help="save frequency of eval summaries for train/validation set")
    parser.add_argument(
        "--accum_eval_summary_freq",
        type=int,
        default=100000,
        help=
        "save frequency of accumulated eval summaries for validation set only")
    parser.add_argument("--progress_freq",
                        type=int,
                        default=100,
                        help="display progress every progress_freq steps")
    parser.add_argument("--save_freq",
                        type=int,
                        default=5000,
                        help="save frequence of model, 0 to disable")
    parser.add_argument(
        "--aggregate_nccl",
        type=int,
        default=0,
        help=
        "whether to use nccl or cpu for gradient aggregation in multi-gpu training"
    )
    parser.add_argument("--gpu_mem_frac",
                        type=float,
                        default=0.9,
                        help="fraction of gpu memory to use")
    parser.add_argument("--seed", type=int)
    args = parser.parse_args()

    # Seed every random number generator used here when a seed is given.
    if args.seed is not None:
        tf.set_random_seed(args.seed)
        np.random.seed(args.seed)
        random.seed(args.seed)

    if args.output_dir is None:
        # Turn 'model=<name>,<hparams>' into a file-system friendly name,
        # character by character: '=' and ',' become '.', a ',' inside
        # square brackets becomes '..', and the brackets are dropped.
        list_depth = 0
        model_fname = ''
        for t in ('model=%s,%s' % (args.model, args.model_hparams)):
            if t == '[':
                list_depth += 1
            if t == ']':
                list_depth -= 1
            if list_depth and t == ',':
                t = '..'
            if t in '=,':
                t = '.'
            if t in '[]':
                t = ''
            model_fname += t
        args.output_dir = os.path.join(args.logs_dir,
                                       model_fname) + args.output_dir_postfix

    # --resume is shorthand for using the output directory as the checkpoint.
    if args.resume:
        if args.checkpoint:
            raise ValueError('resume and checkpoint cannot both be specified')
        args.checkpoint = args.output_dir

    # Hyperparameters from the JSON files named on the command line.
    dataset_hparams_dict = {}
    model_hparams_dict = {}
    if args.dataset_hparams_dict:
        with open(args.dataset_hparams_dict) as f:
            dataset_hparams_dict.update(json.loads(f.read()))
    if args.model_hparams_dict:
        with open(args.model_hparams_dict) as f:
            model_hparams_dict.update(json.loads(f.read()))
    if args.checkpoint:
        # args.checkpoint may be a directory or a checkpoint name inside
        # one; in the latter case use the containing directory.
        checkpoint_dir = os.path.normpath(args.checkpoint)
        if not os.path.isdir(args.checkpoint):
            checkpoint_dir, _ = os.path.split(checkpoint_dir)
        if not os.path.exists(checkpoint_dir):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                    checkpoint_dir)
        with open(os.path.join(checkpoint_dir, "options.json")) as f:
            print("loading options from checkpoint %s" % args.checkpoint)
            options = json.loads(f.read())
            # Command-line values win over the ones stored with the checkpoint.
            args.dataset = args.dataset or options['dataset']
            args.model = args.model or options['model']
        # Values from the checkpoint's hparams files are applied after (and
        # therefore override) the ones loaded from the command-line JSON files.
        try:
            with open(os.path.join(checkpoint_dir,
                                   "dataset_hparams.json")) as f:
                dataset_hparams_dict.update(json.loads(f.read()))
        except FileNotFoundError:
            print(
                "dataset_hparams.json was not loaded because it does not exist"
            )
        try:
            with open(os.path.join(checkpoint_dir,
                                   "model_hparams.json")) as f:
                model_hparams_dict.update(json.loads(f.read()))
        except FileNotFoundError:
            print(
                "model_hparams.json was not loaded because it does not exist")

    print(
        '----------------------------------- Options ------------------------------------'
    )
    for k, v in args._get_kwargs():
        print(k, "=", v)
    print(
        '------------------------------------- End --------------------------------------'
    )

    VideoDataset = datasets.get_dataset_class(args.dataset)
    train_dataset = VideoDataset(args.input_dir,
                                 mode='train',
                                 hparams_dict=dataset_hparams_dict,
                                 hparams=args.dataset_hparams)
    val_dataset = VideoDataset(args.val_input_dir or args.input_dir,
                               mode='val',
                               hparams_dict=dataset_hparams_dict,
                               hparams=args.dataset_hparams)
    if val_dataset.hparams.long_sequence_length != val_dataset.hparams.sequence_length:
        # the longer dataset is only used for the accum_eval_metrics
        long_val_dataset = VideoDataset(args.val_input_dir or args.input_dir,
                                        mode='val',
                                        hparams_dict=dataset_hparams_dict,
                                        hparams=args.dataset_hparams)
        long_val_dataset.set_sequence_length(
            val_dataset.hparams.long_sequence_length)
    else:
        long_val_dataset = None

    variable_scope = tf.get_variable_scope()
    variable_scope.set_use_resource(True)

    VideoPredictionModel = models.get_model_class(args.model)
    # Model hparams start from the loaded dict; these three entries are
    # always taken from the training dataset's hparams.
    hparams_dict = dict(model_hparams_dict)
    hparams_dict.update({
        'context_frames': train_dataset.hparams.context_frames,
        'sequence_length': train_dataset.hparams.sequence_length,
        'repeat': train_dataset.hparams.time_shift,
    })
    model = VideoPredictionModel(hparams_dict=hparams_dict,
                                 hparams=args.model_hparams,
                                 aggregate_nccl=args.aggregate_nccl)

    batch_size = model.hparams.batch_size
    train_tf_dataset = train_dataset.make_dataset(batch_size)
    train_iterator = train_tf_dataset.make_one_shot_iterator()
    train_handle = train_iterator.string_handle()
    val_tf_dataset = val_dataset.make_dataset(batch_size)
    val_iterator = val_tf_dataset.make_one_shot_iterator()
    val_handle = val_iterator.string_handle()
    iterator = tf.data.Iterator.from_string_handle(
        train_handle, train_tf_dataset.output_types,
        train_tf_dataset.output_shapes)
    inputs = iterator.get_next()

    # inputs comes from the training dataset by default, unless train_handle is remapped to the val_handles
    model.build_graph(inputs)

    if long_val_dataset is not None:
        # separately build a model for the longer sequence.
        # this is needed because the model doesn't support dynamic shapes.
        long_hparams_dict = dict(hparams_dict)
        long_hparams_dict[
            'sequence_length'] = long_val_dataset.hparams.sequence_length
        # use smaller batch size for longer model to prevent running out of memory
        long_hparams_dict['batch_size'] = model.hparams.batch_size // 2
        long_model = VideoPredictionModel(
            mode="test",  # to not build the losses and discriminators
            hparams_dict=long_hparams_dict,
            hparams=args.model_hparams,
            aggregate_nccl=args.aggregate_nccl)
        tf.get_variable_scope().reuse_variables()
        # NOTE(review): the batch is made with the full `batch_size` although
        # the long model's 'batch_size' hparam was halved above -- confirm.
        long_model.build_graph(long_val_dataset.make_batch(batch_size))
    else:
        long_model = None

    # Persist the effective options and hyperparameters next to the model.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    with open(os.path.join(args.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(args), sort_keys=True, indent=4))
    with open(os.path.join(args.output_dir, "dataset_hparams.json"),
              "w") as f:
        f.write(
            json.dumps(train_dataset.hparams.values(),
                       sort_keys=True,
                       indent=4))
    with open(os.path.join(args.output_dir, "model_hparams.json"), "w") as f:
        f.write(json.dumps(model.hparams.values(), sort_keys=True, indent=4))

    with tf.name_scope("parameter_count"):
        # exclude trainable variables that are replicas (used in multi-gpu setting)
        trainable_variables = set(tf.trainable_variables()) & set(
            model.saveable_variables)
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in trainable_variables])

    saver = tf.train.Saver(var_list=model.saveable_variables, max_to_keep=2)

    # None has the special meaning of evaluating at the end, so explicitly check for non-equality to zero
    # NOTE(review): summary_writer is only bound when at least one frequency
    # is non-zero; the uses below are guarded by should()/should_eval(),
    # which are falsy for a frequency of 0.
    if (args.summary_freq != 0 or args.image_summary_freq != 0
            or args.eval_summary_freq != 0
            or args.accum_eval_summary_freq != 0):
        summary_writer = tf.summary.FileWriter(args.output_dir)

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=args.gpu_mem_frac)
    config = tf.ConfigProto(gpu_options=gpu_options,
                            allow_soft_placement=True)
    global_step = tf.train.get_or_create_global_step()
    max_steps = model.hparams.max_steps
    with tf.Session(config=config) as sess:
        print("parameter_count =", sess.run(parameter_count))

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        model.restore(sess, args.checkpoint)
        sess.run(model.post_init_ops)
        # Evaluate the validation handle once; it is fed in place of
        # train_handle whenever validation data is needed.
        val_handle_eval = sess.run(val_handle)
        sess.graph.finalize()

        start_step = sess.run(global_step)

        def should(step, freq):
            # freq None: only on the last iteration.
            # freq 0: never (the expression evaluates to the falsy 0).
            # otherwise: every `freq` steps, at step -1 and on the last
            # iteration.
            if freq is None:
                return (step + 1) == (max_steps - start_step)
            else:
                return freq and ((step + 1) % freq == 0 or
                                 (step + 1) in (0, max_steps - start_step))

        def should_eval(step, freq):
            # never run eval summaries at the beginning since it's expensive, unless it's the last iteration
            return should(step,
                          freq) and (step >= 0 or
                                     (step + 1) == (max_steps - start_step))

        # start at one step earlier to log everything without doing any training
        # step is relative to the start_step
        for step in range(-1, max_steps - start_step):
            if step == 1:
                # skip step -1 and 0 for timing purposes (for warmstarting)
                start_time = time.time()

            # Collect only what is needed this iteration; train_op is left
            # out at step -1 so that iteration performs no training.
            fetches = {"global_step": global_step}
            if step >= 0:
                fetches["train_op"] = model.train_op
            if should(step, args.progress_freq):
                fetches['d_loss'] = model.d_loss
                fetches['g_loss'] = model.g_loss
                fetches['d_losses'] = model.d_losses
                fetches['g_losses'] = model.g_losses
                if isinstance(model.learning_rate, tf.Tensor):
                    fetches["learning_rate"] = model.learning_rate
            if should(step, args.summary_freq):
                fetches["summary"] = model.summary_op
            if should(step, args.image_summary_freq):
                fetches["image_summary"] = model.image_summary_op
            if should_eval(step, args.eval_summary_freq):
                fetches["eval_summary"] = model.eval_summary_op

            run_start_time = time.time()
            # NOTE(review): prints the relative step on every iteration;
            # looks like leftover debug output -- confirm before removing.
            print(step)
            results = sess.run(fetches)
            run_elapsed_time = time.time() - run_start_time
            # Only warn for pure training steps (nothing else was fetched).
            if run_elapsed_time > 1.5 and step > 0 and set(
                    fetches.keys()) == {"global_step", "train_op"}:
                print('running train_op took too long (%0.1fs)' %
                      run_elapsed_time)

            # Re-run the same summaries on validation data by feeding the
            # validation iterator handle in place of the training one.
            if (should(step, args.summary_freq)
                    or should(step, args.image_summary_freq)
                    or should_eval(step, args.eval_summary_freq)):
                val_fetches = {"global_step": global_step}
                if should(step, args.summary_freq):
                    val_fetches["summary"] = model.summary_op
                if should(step, args.image_summary_freq):
                    val_fetches["image_summary"] = model.image_summary_op
                if should_eval(step, args.eval_summary_freq):
                    val_fetches["eval_summary"] = model.eval_summary_op
                val_results = sess.run(
                    val_fetches, feed_dict={train_handle: val_handle_eval})
                # Suffix validation tags with '_1' so they do not collide
                # with the training summaries' tags.
                for name, summary in val_results.items():
                    if name == 'global_step':
                        continue
                    val_results[name] = add_tag_suffix(summary, '_1')

            if should(step, args.summary_freq):
                print("recording summary")
                summary_writer.add_summary(results["summary"],
                                           results["global_step"])
                summary_writer.add_summary(val_results["summary"],
                                           val_results["global_step"])
                print("done")
            if should(step, args.image_summary_freq):
                print("recording image summary")
                summary_writer.add_summary(results["image_summary"],
                                           results["global_step"])
                summary_writer.add_summary(val_results["image_summary"],
                                           val_results["global_step"])
                print("done")
            if should_eval(step, args.eval_summary_freq):
                print("recording eval summary")
                summary_writer.add_summary(results["eval_summary"],
                                           results["global_step"])
                summary_writer.add_summary(val_results["eval_summary"],
                                           val_results["global_step"])
                print("done")
            if should_eval(step, args.accum_eval_summary_freq):
                val_datasets = [val_dataset]
                val_models = [model]
                if long_model is not None:
                    val_datasets.append(long_val_dataset)
                    val_models.append(long_model)
                for i, (val_dataset_,
                        val_model) in enumerate(zip(val_datasets,
                                                    val_models)):
                    sess.run(val_model.accum_eval_metrics_reset_op)
                    # traverse (roughly up to rounding based on the batch size) all the validation dataset
                    accum_eval_summary_num_updates = val_dataset_.num_examples_per_epoch(
                    ) // val_model.hparams.batch_size
                    val_fetches = {
                        "global_step": global_step,
                        "accum_eval_summary": val_model.accum_eval_summary_op
                    }
                    for update_step in range(accum_eval_summary_num_updates):
                        print(
                            'evaluating %d / %d' %
                            (update_step + 1, accum_eval_summary_num_updates))
                        val_results = sess.run(
                            val_fetches,
                            feed_dict={train_handle: val_handle_eval})
                    # Only the result of the last update is recorded.
                    # NOTE(review): if the loop above runs zero times,
                    # val_results is stale or unbound here -- confirm the
                    # validation set always holds at least one batch.
                    accum_eval_summary = add_tag_suffix(
                        val_results["accum_eval_summary"], '_%d' % (i + 1))
                    print("recording accum eval summary")
                    summary_writer.add_summary(accum_eval_summary,
                                               val_results["global_step"])
                    print("done")
            if (should(step, args.summary_freq)
                    or should(step, args.image_summary_freq)
                    or should_eval(step, args.eval_summary_freq)
                    or should_eval(step, args.accum_eval_summary_freq)):
                summary_writer.flush()
            if should(step, args.progress_freq):
                # global_step will have the correct step count if we resume from a checkpoint
                # global step is read before it's incremented
                steps_per_epoch = train_dataset.num_examples_per_epoch(
                ) / batch_size
                train_epoch = results["global_step"] / steps_per_epoch
                print("progress global step %d epoch %0.1f" %
                      (results["global_step"] + 1, train_epoch))
                if step > 0:
                    # start_time was taken at step 1, so it is bound here.
                    elapsed_time = time.time() - start_time
                    average_time = elapsed_time / step
                    images_per_sec = batch_size / average_time
                    remaining_time = (max_steps -
                                      (start_step + step + 1)) * average_time
                    print(
                        " image/sec %0.1f remaining %dm (%0.1fh) (%0.1fd)" %
                        (images_per_sec, remaining_time / 60,
                         remaining_time / 60 / 60,
                         remaining_time / 60 / 60 / 24))

                if results['d_losses']:
                    print("d_loss", results["d_loss"])
                for name, loss in results['d_losses'].items():
                    print(" ", name, loss)
                if results['g_losses']:
                    print("g_loss", results["g_loss"])
                for name, loss in results['g_losses'].items():
                    print(" ", name, loss)
                if isinstance(model.learning_rate, tf.Tensor):
                    print("learning_rate", results["learning_rate"])

            if should(step, args.save_freq):
                print("saving model to", args.output_dir)
                saver.save(sess,
                           os.path.join(args.output_dir, "model"),
                           global_step=global_step)
                print("done")
def run_training(output_dir='C:/behrouz/projects/behrouz-Rui-Gaurav-project/'
                 'excel-pbi-modeling/balanced-batch'):
    """Train the classifier on balanced batches and record validation metrics.

    Each training batch is the concatenation of one batch of positive and one
    batch of negative examples. After every ``checkpoint_rate`` iterations the
    validation set is scored; the model is saved whenever the last entry of
    the validation statistics (stored under the ``'auc'`` key) improves.
    After training, the test accuracy is printed and the loss curves and
    validation metrics are written to disk.

    Args:
        output_dir: Directory (without trailing slash) that receives
            ``checkpoint/model.ckpt``, ``train_val_loss.npy`` and
            ``val_perfs.pickle``. The default reproduces the locations that
            used to be hard-coded.
    """
    validation_file_list = [
        FLAGS.data_path + 'validation_' + '%d.tfrecords' % i
        for i in range(1, 11)
    ]
    train_file_list = [
        FLAGS.data_path + 'train_' + '%d.tfrecords' % i for i in range(1, 11)
    ]
    test_file_list = [
        FLAGS.data_path + 'test_' + '%d.tfrecords' % i for i in range(1, 11)
    ]

    # Positional arguments shared by every ``model(...)`` call below.
    net_args = (FLAGS.activation, FLAGS.h1, FLAGS.h2, FLAGS.h3, FLAGS.h4,
                FLAGS.h5, FLAGS.h6, FLAGS.h7, FLAGS.h8, FLAGS.h9, FLAGS.h10)

    # Build everything in a fresh graph instead of the default one.
    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step(graph=None)

        with tf.device('/cpu:0'):
            batch_x_pos, batch_labels_pos, batch_iterator_pos = inputs_train(
                'train_*_pos.tfrecords')
            batch_x_neg, batch_labels_neg, batch_iterator_neg = inputs_train(
                'train_*_neg.tfrecords')
            # A training batch is positives followed by negatives.
            batch_x = tf.concat([batch_x_pos, batch_x_neg], axis=0)
            batch_labels = tf.concat([batch_labels_pos, batch_labels_neg],
                                     axis=0)
            print(batch_x.get_shape(), batch_labels.get_shape())
            print(batch_labels.dtype, '====================')
            train_x, train_labels, train_iterator = inputs(
                train_file_list, FLAGS.train_size)
            validation_x, validation_labels, val_iterator = inputs(
                validation_file_list, FLAGS.valid_size)
            test_x, test_labels, test_iterator = inputs(
                test_file_list, FLAGS.test_size)
        print(batch_x.get_shape(), ', ', batch_labels.get_shape())

        with tf.device("/cpu:0"):
            # Training tower; it creates the variables (reuse=False).
            train_logits, train_accuracy, train_probs, train_preds = model(
                batch_x, batch_labels, *net_args, istrain=True, reuse=False)
            # Same weights with istrain=False on the full training set ...
            tot_train_logits, tot_train_accuracy, tot_train_probs, tot_train_preds = model(
                train_x, train_labels, *net_args, istrain=False, reuse=True)
            # ... and on the current training batch.
            train_logits_ev, train_accuracy_ev, train_probs_ev, train_preds_ev = model(
                batch_x, batch_labels, *net_args, istrain=False, reuse=True)

            cost = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=batch_labels, logits=train_logits))
            l2_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.reduce_sum(l2_loss)
            cost += l2_loss

            # cross entropy loss when drop out turned off!
            cost_ev_train = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=batch_labels, logits=train_logits_ev))
            tot_train_cost = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=train_labels, logits=tot_train_logits))

            learning_rate = tf.train.inverse_time_decay(
                FLAGS.learning_rate,
                global_step=global_step,
                decay_steps=FLAGS.decay_steps,
                decay_rate=FLAGS.decay_rate,
                staircase=True)
            # optimize_loss() uses an already-constructed optimizer as-is and
            # does not apply its own `learning_rate` argument to it, so the
            # decayed rate has to be handed to Adam directly. A bare
            # AdamOptimizer() would train with Adam's default rate.
            train_op = tf.contrib.layers.optimize_loss(
                loss=cost,
                global_step=global_step,
                learning_rate=learning_rate,
                optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
                # clip_gradients=2.0,
                name='d_optimize_loss',
                variables=tf.trainable_variables())

            # EMA weights: updated after every optimizer step.
            ema = tf.train.ExponentialMovingAverage(decay=0.999)
            with tf.control_dependencies([train_op]):
                train_op_new = ema.apply(tf.trainable_variables())

            validation_logits, validation_accuracy, validation_probs, validation_preds = model(
                validation_x, validation_labels, *net_args,
                istrain=False, reuse=True)
            test_logits, test_accuracy, test_probs, test_preds = model(
                test_x, test_labels, *net_args, istrain=False, reuse=True)
            validation_cost = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=validation_labels, logits=validation_logits))

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        # allow_growth is switched off here, i.e. GPU memory is not grown
        # on demand.
        config.gpu_options.allow_growth = False
        saver = tf.train.Saver()

        with tf.Session(config=config) as sess:
            # Order matters: performance_statistics() results are indexed
            # positionally against this list.
            metrics = [
                'pos_percision', 'pos_recall', 'pos_f1', 'neg_percision',
                'neg_recall', 'neg_f1', 'p_r_auc', 'auc'
            ]
            valid_perfs = {name: [] for name in metrics}
            sess.run(init_op)

            # Validate twice per pass over the training data, rounded down
            # to a multiple of 100 iterations.
            checkpoint_rate = int(
                0.5 * (FLAGS.train_size // FLAGS.batch_size) // 100) * 100
            n_iterations = checkpoint_rate * FLAGS.num_epochs
            print('Total iterations= ', n_iterations)

            best_validation_auc = 0
            train_per_epoch_loss = 0
            val_loss_list = []
            train_per_epoch_loss_list = []

            # pre training performance measure:
            tr_loss = sess.run(tot_train_cost)
            train_per_epoch_loss_list.append(tr_loss)
            val_probs, val_preds, val_labs, val_loss = sess.run([
                validation_probs, validation_preds, validation_labels,
                validation_cost
            ])
            val_loss_list.append(val_loss)
            validation_pref = performance_statistics(val_labs, val_preds,
                                                     val_probs)
            for m, name in enumerate(metrics):
                valid_perfs[name].append(validation_pref[m])

            # training
            for i in range(n_iterations):
                start_time = time.time()
                batch_loss, batch_accu, _ = sess.run(
                    [cost_ev_train, train_accuracy_ev, train_op_new])
                duration = time.time() - start_time
                # Accumulate exactly once per iteration so the average taken
                # at the checkpoint below is the true mean batch loss.
                train_per_epoch_loss += batch_loss
                if i % 100 == 0:
                    print(
                        'Step %d: BATCH loss = %.3f, accuracy=%.3f (%.3f sec)'
                        % (i, batch_loss, batch_accu, duration))

                if (i + 1) % checkpoint_rate == 0:
                    t1 = time.time()
                    val_probs, val_preds, val_labs, val_loss = sess.run([
                        validation_probs, validation_preds,
                        validation_labels, validation_cost
                    ])
                    validation_pref = performance_statistics(
                        val_labs, val_preds, val_probs)
                    print('Epoch end validation performance:')
                    print(validation_pref)
                    val_loss_list.append(val_loss)
                    train_per_epoch_loss_list.append(train_per_epoch_loss /
                                                     checkpoint_rate)
                    train_per_epoch_loss = 0
                    for m, name in enumerate(metrics):
                        valid_perfs[name].append(validation_pref[m])

                    # The last statistic is the one stored under 'auc'.
                    if best_validation_auc < validation_pref[-1]:
                        print('-' * 100)
                        print(
                            'validation AUC improved from {} to {} '.format(
                                best_validation_auc, validation_pref[-1]),
                            '(%.3f sec)' % (time.time() - t1))
                        best_validation_auc = validation_pref[-1]
                        print('Saving Model to file..')
                        print('-' * 100)
                        saver.save(sess,
                                   output_dir + '/checkpoint/model.ckpt')

            # NOTE(review): scored with the final weights in the session,
            # not with the best checkpoint saved above.
            test_accu = sess.run(test_accuracy)

            # Row 0: training loss per checkpoint, row 1: validation loss.
            train_val_loss = np.zeros((2, len(val_loss_list)))
            train_val_loss[0, :] = train_per_epoch_loss_list
            train_val_loss[1, :] = val_loss_list
            np.save(output_dir + '/train_val_loss.npy', train_val_loss)
            print(train_val_loss.shape)

            with open(output_dir + '/val_perfs.pickle', 'wb') as f:
                pickle.dump(valid_perfs, f)
            print('Out of Sample Accuracy= ', test_accu)