def make_model(self):
    """Build the graph (train or test ops), then initialize the session.

    Side effects: creates savers, a Coordinator, runs global/local variable
    initializers on self.sess, starts queue runners, and opens a
    tf.summary.FileWriter under a fresh timestamped summary_dir.
    """
    with tf.device(self.default_gpu):
        pr = self.pr
        # Graph construction branches on mode; everything below is shared setup.
        if self.is_training:
            self.make_train_ops()
        else:
            self.make_test_ops(reuse=False)
        self.coord = tf.train.Coordinator()
        # Two savers: one for frequent checkpoints, one kept long-term
        # (max_to_keep=1000 effectively retains every slow checkpoint).
        self.saver_fast = tf.train.Saver()
        self.saver_slow = tf.train.Saver(max_to_keep=1000)
        self.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        self.sess.run(self.init_op)
        # Input pipelines here use TF queue runners (TF 1.x style).
        tf.train.start_queue_runners(sess=self.sess, coord=self.coord)
        print 'Initializing'
        self.merged_summary = tf.summary.merge_all()
        print 'Tensorboard command:'
        # New timestamped directory per run so TensorBoard runs don't collide.
        summary_dir = ut.mkdir(pj(pr.summary_dir, ut.simple_timestamp()))
        print 'tensorboard --logdir=%s' % summary_dir
        self.sum_writer = tf.summary.FileWriter(summary_dir, self.sess.graph)
        if self.profile:
            #self.run_meta = tf.RunMetadata()
            self.profiler = tf.profiler.Profiler(self.sess.graph)
def make_train_model(self):
    """Build the multi-GPU training graph: replicated nets, averaged grads,
    stepped learning rate, and BN-update-aware train op.

    Side effects: creates the global step variable, savers, Coordinator,
    (when training) runs initializers on self.sess, starts queue runners,
    and opens a summary FileWriter.
    """
    with tf.device(self.default_gpu):
        pr = self.pr
        # steps
        self.step = tf.get_variable(
            'global_step', [], trainable = False,
            initializer = tf.constant_initializer(0), dtype = tf.int64)
        self.lr = tf.constant(pr.base_lr)
        # model
        # Staircase decay: lr = base_lr * gamma^floor(step / step_size).
        scale = pr.gamma ** tf.floor(cast_float(self.step) / float(pr.step_size))
        self.lr_step = pr.base_lr * scale
        #lr = tf.Print(lr, [lr, lr*1e3, scale])
        opt = shift.make_opt(pr.opt_method, self.lr_step, pr)
        self.inputs = read_data(pr, self.gpus)
        gpu_grads, gpu_losses = {}, {}
        # One replica per GPU; variables are shared via reuse on replicas > 0.
        for i, gpu in enumerate(self.gpus):
            with tf.device(gpu):
                reuse = (i > 0)
                ims = self.inputs[i]['ims']
                samples = self.inputs[i]['samples']
                labels = self.inputs[i]['label']
                net = make_net(ims, samples, pr, reuse = reuse, train = self.is_training)
                # NOTE(review): self.loss is overwritten each iteration, so it
                # ends up holding the last GPU's loss object — confirm intended.
                self.loss = tfu.Loss('loss')
                self.loss.add_loss(shift.slim_losses_with_prefix(None), 'reg')
                self.loss.add_loss_acc(label_loss(net.logits, labels), 'label')
                grads = opt.compute_gradients(self.loss.total_loss())
                ut.add_dict_list(gpu_grads, self.loss.name, grads)
                ut.add_dict_list(gpu_losses, self.loss.name, self.loss)
                if i == 0:
                    self.net = net
        # Average per-GPU gradients, then optionally clip by global norm.
        (gs, vs) = zip(*tfu.average_grads(gpu_grads['loss']))
        if pr.grad_clip is not None:
            gs, _ = tf.clip_by_global_norm(gs, pr.grad_clip)
        # Piggyback a periodic grad-norm print on the first gradient tensor.
        gs = [tfu.print_every(gs[0], 100, ['grad norm:', tf.global_norm(gs)])] + list(gs[1:])
        gvs = zip(gs, vs)
        #for g, v in zip(grads, vs):
        #  if g[0] is not None:
        #    tf.summary.scalar('%s_grad_norm' % v.name, tf.reduce_sum(g[0]**2)**0.5)
        #    tf.summary.scalar('%s_val_norm' % v.name, tf.reduce_sum(v**2)**0.5)
        #self.train_op = opt.apply_gradients(gvs, global_step = self.step)
        # Run batch-norm moving-average updates before applying gradients.
        bn_ups = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # self.train_op = tf.group(self.train_op, *bn_ups)
        with tf.control_dependencies(bn_ups):
            self.train_op = opt.apply_gradients(gvs, global_step = self.step)
        self.coord = tf.train.Coordinator()
        self.saver_fast = tf.train.Saver()
        self.saver_slow = tf.train.Saver(max_to_keep = 1000)
        #self.init_op = tf.global_variables_initializer()
        # Initialization is guarded: self.init_op only exists when training.
        if self.is_training:
            self.init_op = tf.group(
                tf.global_variables_initializer(), tf.local_variables_initializer())
            self.sess.run(self.init_op)
        # NOTE(review): original formatting was collapsed; queue runners are
        # assumed to start in both modes — confirm against callers.
        tf.train.start_queue_runners(sess = self.sess, coord = self.coord)
        self.merged_summary = tf.summary.merge_all()
        print 'Tensorboard command:'
        summary_dir = ut.mkdir(pj(pr.summary_dir, ut.simple_timestamp()))
        print 'tensorboard --logdir=%s' % summary_dir
        self.sum_writer = tf.summary.FileWriter(summary_dir, self.sess.graph)
        if self.profile:
            self.profiler = tf.profiler.Profiler(self.sess.graph)
def make_model(self):
    """Build the multi-GPU shift-classification graph: each example yields an
    aligned and a misaligned (sample, image) pair, both fed through a shared
    net; gradients are averaged across GPUs.

    Side effects: creates the global step, saver, Coordinator, runs
    initializers on self.sess, starts queue runners, and opens a summary
    FileWriter. When not training, also builds a one-pass test reader
    (self.test_net, self.test_labels, self.test_ytids).
    """
    with tf.device(self.default_gpu):
        pr = self.pr
        # steps
        self.step = tf.get_variable('global_step', [], trainable=False, initializer=tf.constant_initializer(0), dtype=tf.int64)
        self.lr = tf.constant(pr.base_lr)
        # model
        opt = make_opt(pr.opt_method, pr.base_lr, pr)
        self.inputs = read_data(pr, self.gpus)
        gpu_grads, gpu_losses = {}, {}
        for i, gpu in enumerate(self.gpus):
            with tf.device(gpu):
                reuse = (i > 0)
                # Label/sample selection is cheap bookkeeping; keep it on CPU.
                with tf.device('/cpu:0'):
                    ims = self.inputs[i]['ims']
                    samples_ex = self.inputs[i]['samples']
                    assert pr.both_examples
                    assert not pr.small_augment
                    # Random binary label per example; samples0 is the sample
                    # matching that label, samples1 the opposite one, so each
                    # example contributes both classes (labels1 = 1 - labels).
                    labels = tf.random_uniform([shape(ims, 0)], 0, 2, dtype=tf.int64, name='labels_sample')
                    samples0 = tf.where(tf.equal(labels, 1), samples_ex[:, 1], samples_ex[:, 0])
                    samples1 = tf.where(tf.equal(labels, 0), samples_ex[:, 1], samples_ex[:, 0])
                    labels1 = 1 - labels
                net0 = make_net(ims, samples0, pr, reuse=reuse, train=self.is_training)
                # Second pass reuses net0's image tower (im_net) and weights.
                net1 = make_net(None, samples1, pr, im_net=net0.im_net, reuse=True, train=self.is_training)
                labels = tf.concat([labels, labels1], 0)
                net = ut.Struct(
                    logits=tf.concat([net0.logits, net1.logits], 0),
                    cam=tf.concat([net0.cam, net1.cam], 0),
                    last_conv=tf.concat([net0.last_conv, net1.last_conv], 0))
                loss = mu.Loss('loss')
                loss.add_loss(slim_losses_with_prefix(None), 'reg')
                loss.add_loss_acc(sigmoid_loss(net.logits, labels), 'label')
                grads = opt.compute_gradients(loss.total_loss())
                ut.add_dict_list(gpu_grads, loss.name, grads)
                ut.add_dict_list(gpu_losses, loss.name, loss)
                #self.loss = loss
                if i == 0:
                    self.net = net
        self.loss = mu.merge_losses(gpu_losses['loss'])
        for name, val in zip(self.loss.get_loss_names(), self.loss.get_losses()):
            tf.summary.scalar(name, val)
        if not self.is_training:
            #pr_test = pr.copy()
            pr_test = self.pr_test.copy()
            pr_test.augment_ims = False
            print 'pr_test ='
            print pr_test
            # One-pass reader over the test list, pinned to CPU.
            self.test_ims, self.test_samples, self.test_ytids = mu.on_cpu(
                lambda: shift_dset.make_db_reader(
                    pr_test.test_list, pr_test, pr.test_batch,
                    ['im', 'samples', 'ytid'], one_pass=True))
            if pr_test.do_shift:
                self.test_labels = tf.random_uniform([shape(self.test_ims, 0)], 0, 2, dtype=tf.int64)
                self.test_samples = tf.where(tf.equal(self.test_labels, 1), self.test_samples[:, 1], self.test_samples[:, 0])
            else:
                # Without shifting, every test example is labeled positive.
                self.test_labels = tf.ones(shape(self.test_ims, 0), dtype=tf.int64)
                #self.test_samples = tf.where(tf.equal(self.test_labels, 1), self.test_samples[:, 1], self.test_samples[:, 0])
            print 'sample shape:', shape(self.test_samples)
            self.test_net = make_net(self.test_ims, self.test_samples, pr_test, reuse=True, train=self.is_training)
        (gs, vs) = zip(*mu.average_grads(gpu_grads['loss']))
        if pr.grad_clip is not None:
            gs, _ = tf.clip_by_global_norm(gs, pr.grad_clip)
        # Piggyback a periodic grad-norm print on the first gradient tensor.
        gs = [mu.print_every(gs[0], 100, ['grad norm:', tf.global_norm(gs)])] + list(gs[1:])
        # NOTE(review): gvs is iterated multiple times in the multipass branch;
        # this relies on Python 2 zip() returning a list (file uses Py2
        # print/xrange). Under Py3 it would be a one-shot iterator.
        gvs = zip(gs, vs)
        bn_ups = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if pr.multipass:
            ops = [opt.apply_gradients(gvs, global_step=self.step) for i in xrange(pr.multipass_count)]
            # Mutable default [0] is an intentional closure counter: each call
            # returns the next apply-gradients op, cycling round-robin.
            def op_helper(count=[0]):
                op = ops[count[0] % len(ops)]
                count[0] += 1
                return op
            # NOTE(review): multipass ops do not group bn_ups, so batch-norm
            # updates appear to be skipped in this branch — confirm intended.
            self.train_op = op_helper
        else:
            # Grouped with bn_ups so batch-norm moving averages update too.
            op = tf.group(opt.apply_gradients(gvs, global_step=self.step), *bn_ups)
            self.train_op = lambda: op
        # self.train_op is a callable returning the op in both branches.
        self.coord = tf.train.Coordinator()
        self.saver = tf.train.Saver()
        self.init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        self.sess.run(self.init_op)
        tf.train.start_queue_runners(sess=self.sess, coord=self.coord)
        self.merged_summary = tf.summary.merge_all()
        print 'Tensorboard command:'
        summary_dir = ut.mkdir(pj(pr.summary_dir, ut.simple_timestamp()))
        print 'tensorboard --logdir=%s' % summary_dir
        self.sum_writer = tf.summary.FileWriter(summary_dir, self.sess.graph)