def test_model_eval(config):
    test_files = os.path.join(
        config.base_dir, config.tfrecord_dir, config.test_tfrecords)
    errors = []
    data, labels, preds = [], [], []

    with tf.device('/cpu:0'):
        test_data, test_labels = inputs(
            tfrecord_file=test_files,
            num_epochs=1,
            batch_size=config.test_batch,
            target_data_dims=config.param_dims,
            target_label_dims=config.output_hist_dims)

    with tf.device('/gpu:0'):
        with tf.variable_scope("model") as scope:
            model = cnn_model_struct()
            model.build(test_data, config.param_dims[1:],
                        config.output_hist_dims[1:], train_mode=False)
            y_conv = model.output
            error = kl_divergence_test(
                y_conv,
                tf.reshape(test_labels,
                           [-1, np.prod(config.output_hist_dims[1:])]))

    gpuconfig = tf.ConfigProto()
    gpuconfig.gpu_options.allow_growth = True
    gpuconfig.allow_soft_placement = True
    saver = tf.train.Saver()

    with tf.Session(config=gpuconfig) as sess:
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        # restore the latest checkpoint once, before iterating over batches
        # (the original restored inside the loop on every batch)
        ckpts = tf.train.latest_checkpoint(config.model_output)
        saver.restore(sess, ckpts)
        try:
            while not coord.should_stop():
                ip, op, pred, err = sess.run(
                    [test_data, test_labels, y_conv, error])
                batch_err = np.sum(err, axis=1)
                errors.append(batch_err)
                data.append(ip)
                labels.append(op)
                preds.append(pred)
                print('{} batches complete..'.format(len(errors)))
        except tf.errors.OutOfRangeError:
            print('Epoch limit reached!')
        finally:
            coord.request_stop()
        coord.join(threads)

        # histogram of per-sample KL errors across the test set
        err_vals = np.array(errors).reshape((-1,))
        plt.hist(err_vals, bins=1000)
        plt.title('Model: %s, min error=%0.3f, max error=%0.3f' % (
            config.model_name, np.min(err_vals), np.max(err_vals)),
            fontsize=12)
        plt.gca().tick_params(axis='both', which='major', labelsize=6)
        plt.gca().tick_params(axis='both', which='minor', labelsize=6)
        plt.savefig(os.path.join(config.results_dir,
                    '{}_eval.png'.format(config.model_name)), dpi=300)
        plt.close()

        # flatten the per-batch lists into [num_samples, ...] arrays
        inp_data = np.array(data)
        inp_data = inp_data.reshape(
            (inp_data.shape[0] * inp_data.shape[1],
             inp_data.shape[2], inp_data.shape[3]))
        inp_labs = np.array(labels)
        inp_labs = inp_labs.reshape(
            (inp_labs.shape[0] * inp_labs.shape[1],
             inp_labs.shape[2], inp_labs.shape[3]))
        idx = np.argsort(err_vals)
        net_preds = np.array(preds)
        net_preds = net_preds.reshape(
            (net_preds.shape[0] * net_preds.shape[1], net_preds.shape[2]))
        net_preds = net_preds.reshape(inp_labs.shape)

        # draw a 3x3 grid with the highest-error examples for debugging
        fig, ax = plt.subplots(3, 3)
        for k in range(9):
            r, c = k // 3, k % 3
            cur_idx = idx[-1 * (k + 1)]
            parameters = np.around(inp_data[cur_idx].flatten(), decimals=2)
            err = err_vals[cur_idx]
            ax[r, c].plot(inp_labs[cur_idx], 'r', alpha=0.5)
            ax[r, c].plot(net_preds[cur_idx], '-.g', alpha=0.5)
            mystr = 'err=%0.2f' % (err)
            ax[r, c].text(
                0.9, .9,
                "\n".join(wrap('{}, params:{}'.format(mystr, parameters), 30)),
                fontsize=6, horizontalalignment='right',
                verticalalignment='center', transform=ax[r, c].transAxes)
            ax[r, c].tick_params(axis='both', which='major', labelsize=6)
            ax[r, c].tick_params(axis='both', which='minor', labelsize=6)
        plt.savefig(os.path.join(config.results_dir,
                    '{}_debug.png'.format(config.model_name)), dpi=300)
        plt.close()
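# kl_divergence_test is defined elsewhere in the repo. The eval loop above
# sums its output over axis=1 to get a per-sample error, so it presumably
# returns the per-bin KL terms. A minimal sketch under that assumption
# (the _sketch name and the epsilon smoothing are hypothetical, not the
# repo's actual implementation):
def kl_divergence_test_sketch(pred, target, eps=1e-8):
    # per-bin terms of KL(target || pred), shape [batch, bins]; both
    # arguments are assumed to be normalized histograms
    return target * (tf.log(target + eps) - tf.log(pred + eps))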
def train_reverse_model(config):
    train_files = os.path.join(config.base_dir, config.tfrecord_dir,
                               config.train_tfrecords)
    val_files = os.path.join(config.base_dir, config.tfrecord_dir,
                             config.val_tfrecords)

    # note the reversed unpacking: the reverse model maps histograms (data)
    # back to simulator parameters (labels)
    with tf.device('/cpu:0'):
        train_labels, train_data = inputs(
            tfrecord_file=train_files,
            num_epochs=config.epochs,
            batch_size=config.train_batch,
            target_data_dims=config.param_dims,
            target_label_dims=config.output_hist_dims)
        val_labels, val_data = inputs(
            tfrecord_file=val_files,
            num_epochs=config.epochs,
            batch_size=config.val_batch,
            target_data_dims=config.param_dims,
            target_label_dims=config.output_hist_dims)

    with tf.device('/gpu:0'):
        with tf.variable_scope("reversemodel") as scope:
            print("creating the model")
            model = cnn_reverse_model()
            model.build(train_data, config.output_hist_dims[1:],
                        config.param_dims[1:], train_mode=True,
                        full_cov=config.full_cov_matrix)
            y_conv = model.output
            nparams = np.prod(config.param_dims[1:])

            # Define loss and optimizer
            with tf.name_scope('loss'):
                labels = tf.reshape(train_labels, [-1, nparams])
                # depending on the config, use the appropriate loss
                if config.full_cov_matrix:
                    hke_loss, cov_sym = heteroskedastic_cov_loss(
                        y_conv, labels, nparams)
                else:
                    hke_loss = heteroskedastic_loss(y_conv, labels, nparams)

            with tf.name_scope('adam_optimizer'):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_step = tf.train.AdamOptimizer(1e-4).minimize(
                        hke_loss)

            #####
            ## VALIDATION
            #####
            print("building a validation model")
            # share weights with the training model
            scope.reuse_variables()
            val_model = cnn_reverse_model()
            val_model.build(val_data, config.output_hist_dims[1:],
                            config.param_dims[1:], train_mode=False,
                            full_cov=config.full_cov_matrix)
            val_res = val_model.output
            norm_val_labels = tf.reshape(val_labels, [-1, nparams])
            # select the loss function for the validation model as well
            if config.full_cov_matrix:
                val_loss, _ = heteroskedastic_cov_loss(
                    val_res, norm_val_labels, nparams)
            else:
                val_loss = heteroskedastic_loss(
                    val_res, norm_val_labels, nparams)

    tf.summary.scalar("loss", hke_loss)
    summary_op = tf.summary.merge_all()
    saver = tf.train.Saver(tf.global_variables())

    gpuconfig = tf.ConfigProto()
    gpuconfig.gpu_options.allow_growth = True
    gpuconfig.allow_soft_placement = True

    with tf.Session(config=gpuconfig) as sess:
        train_writer = tf.summary.FileWriter(
            os.path.join(config.base_dir, config.summary_dir,
                         config.model_name))
        train_writer.add_graph(tf.get_default_graph())
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        step = 0
        start = time.time()
        try:
            while not coord.should_stop():
                # train for a step
                if config.full_cov_matrix:
                    (_, loss, outputs, tr_data, tr_labels,
                     norm_tr_labels, cov_mat) = sess.run([
                         train_step, hke_loss, y_conv, train_data,
                         train_labels, labels, cov_sym])
                else:
                    (_, loss, outputs, tr_data, tr_labels,
                     norm_tr_labels) = sess.run([
                         train_step, hke_loss, y_conv, train_data,
                         train_labels, labels])
                step += 1

                if step % config.print_iters == 0:
                    finish = time.time()
                    print("step={}, loss={}, time_elapsed={} s/step".format(
                        step, loss,
                        (finish - start) / float(config.print_iters)))
                    start = finish
                    saver.save(sess, os.path.join(
                        config.model_output,
                        config.model_name + '_' + str(step) + '.ckpt'),
                        global_step=step)
                    if config.full_cov_matrix:
                        print(cov_mat)

                if step % config.val_iters == 0:
                    val_forward_pass_time = time.time()
                    v_data, v_labels, norm_v_labels, v_res, v_loss = sess.run([
                        val_data, val_labels, norm_val_labels,
                        val_res, val_loss])
                    summary_str = sess.run(summary_op)
                    train_writer.add_summary(summary_str, step)
                    print("\t val loss = {}, time_elapsed = {}s".format(
                        v_loss, time.time() - val_forward_pass_time))
                    # debug visualization of per-parameter predictions:
                    # color_v = ['r', 'g', 'b', 'k', 'm', 'c', 'y']
                    # for k in range(nparams):
                    #     plt.scatter(norm_v_labels[:, k], v_res[:, k],
                    #                 c=color_v[k], alpha=0.5)
                    # plt.pause(1)
                    # plt.clf()
                    if config.full_cov_matrix:
                        data_dump = {
                            'predictions': outputs,
                            'labels': norm_tr_labels,
                            'cov': cov_mat
                        }
                        pickle.dump(
                            data_dump,
                            open(os.path.join(
                                config.base_dir, config.summary_dir,
                                config.model_name,
                                'step%d.pickle' % step), 'wb'))
        except tf.errors.OutOfRangeError:
            print("Finished training for %d epochs" % config.epochs)
        finally:
            coord.request_stop()
        coord.join(threads)
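# heteroskedastic_loss and heteroskedastic_cov_loss are imported from
# elsewhere in the repo. As a rough sketch of the diagonal case, a Gaussian
# negative log-likelihood with a predicted per-parameter variance could look
# like the following; the _sketch name and the packing of means and
# log-variances into the network output are assumptions, not the repo's
# actual implementation:
def heteroskedastic_loss_sketch(preds, labels, nparams):
    # assumes preds has shape [batch, 2 * nparams]: means first, then
    # log-variances
    mu = preds[:, :nparams]
    log_var = preds[:, nparams:2 * nparams]
    # per-parameter Gaussian NLL; the +0.5*log_var term stops the model
    # from inflating its predicted uncertainty to zero out the residual
    nll = 0.5 * tf.exp(-log_var) * tf.square(labels - mu) + 0.5 * log_var
    return tf.reduce_mean(tf.reduce_sum(nll, axis=1))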
def train_model(config):
    train_files = os.path.join(
        config.base_dir, config.tfrecord_dir, config.train_tfrecords)
    val_files = os.path.join(
        config.base_dir, config.tfrecord_dir, config.val_tfrecords)

    with tf.device('/cpu:0'):
        train_data, train_labels = inputs(
            tfrecord_file=train_files,
            num_epochs=config.epochs,
            batch_size=config.train_batch,
            target_data_dims=config.param_dims,
            target_label_dims=config.output_hist_dims)
        val_data, val_labels = inputs(
            tfrecord_file=val_files,
            num_epochs=config.epochs,
            batch_size=config.val_batch,
            target_data_dims=config.param_dims,
            target_label_dims=config.output_hist_dims)

    with tf.device('/gpu:0'):
        with tf.variable_scope("model") as scope:
            print("creating the model")
            model = cnn_model_struct()
            model.build(train_data, config.param_dims[1:],
                        config.output_hist_dims[1:], train_mode=True)
            y_conv = model.output

            # Define loss and optimizer
            with tf.name_scope('loss'):
                kl_divergence_loss = kl_divergence(
                    y_conv,
                    tf.reshape(train_labels,
                               [-1, np.prod(config.output_hist_dims[1:])]))

            with tf.name_scope('adam_optimizer'):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_step = tf.train.AdamOptimizer(1e-4).minimize(
                        kl_divergence_loss)

            #####
            ## VALIDATION
            #####
            print("building a validation model")
            # the validation model reuses the training weights
            scope.reuse_variables()
            val_model = cnn_model_struct()
            val_model.build(val_data, config.param_dims[1:],
                            config.output_hist_dims[1:], train_mode=False)
            val_res = val_model.output
            val_loss = kl_divergence(
                val_res,
                tf.reshape(val_labels,
                           [-1, np.prod(config.output_hist_dims[1:])]))

    tf.summary.scalar("loss", kl_divergence_loss)
    summary_op = tf.summary.merge_all()
    saver = tf.train.Saver(tf.global_variables())

    gpuconfig = tf.ConfigProto()
    gpuconfig.gpu_options.allow_growth = True
    gpuconfig.allow_soft_placement = True

    with tf.Session(config=gpuconfig) as sess:
        train_writer = tf.summary.FileWriter(
            os.path.join(config.base_dir, config.summary_dir))
        train_writer.add_graph(tf.get_default_graph())
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        step = 0
        start = time.time()
        try:
            while not coord.should_stop():
                # train for a step
                _, loss, softmax_outputs, tr_data, tr_labels = sess.run([
                    train_step, kl_divergence_loss, y_conv,
                    train_data, train_labels])
                step += 1
                # halt early if the loss diverges
                if math.isnan(loss):
                    raise ValueError('loss diverged to NaN at step %d' % step)

                if step % config.print_iters == 0:
                    finish = time.time()
                    print("step={}, loss={}, time_elapsed={} s/step".format(
                        step, loss,
                        (finish - start) / float(config.print_iters)))
                    start = finish
                    saver.save(sess, os.path.join(
                        config.model_output,
                        config.model_name + '_' + str(step) + '.ckpt'),
                        global_step=step)

                if step % config.val_iters == 0:
                    val_forward_pass_time = time.time()
                    v_data, v_labels, v_res, v_loss = sess.run(
                        [val_data, val_labels, val_res, val_loss])
                    summary_str = sess.run(summary_op)
                    train_writer.add_summary(summary_str, step)
                    print("\t val loss = {}, time_elapsed = {}s".format(
                        v_loss, time.time() - val_forward_pass_time))
                    # quick visual check of the first validation example
                    for kk in range(1):
                        X = v_res[kk].reshape(-1, config.output_hist_dims[-1])
                        plt.plot(X, color='r', alpha=0.5, label='Predictions')
                        plt.plot(v_labels[kk], 'g', alpha=0.5, label='Data')
                        plt.legend()
                        plt.pause(1)
                        plt.clf()
        except tf.errors.OutOfRangeError:
            print("Finished training for %d epochs" % config.epochs)
        finally:
            coord.request_stop()
        coord.join(threads)
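# kl_divergence is also imported from elsewhere. A sketch consistent with
# the per-bin variant above, reduced to the scalar the optimizer needs
# (the _sketch name and the epsilon smoothing are assumptions):
def kl_divergence_sketch(pred, target, eps=1e-8):
    # mean over the batch of KL(target || pred), assuming both tensors are
    # [batch, bins] normalized histograms
    kl = target * (tf.log(target + eps) - tf.log(pred + eps))
    return tf.reduce_mean(tf.reduce_sum(kl, axis=1))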
def test_model_eval(config):
    test_data = os.path.join(config.tfrecord_dir, config.test_tfrecords)

    with tf.device('/cpu:0'):
        test_images, test_labels = inputs(
            tfrecord_file=test_data,
            num_epochs=None,
            image_target_size=config.image_target_size,
            label_shape=config.num_classes,
            batch_size=config.test_batch,
            augmentation=False)

    with tf.device('/gpu:0'):
        with tf.variable_scope("model") as scope:
            model = cnn_model_struct()
            model.build(test_images, config.num_classes, train_mode=False)
            results = tf.argmax(model.output, 1)
            # note: this is the fraction of correct predictions (accuracy),
            # despite being named "error"
            error = tf.reduce_mean(
                tf.cast(tf.equal(results, tf.cast(test_labels, tf.int64)),
                        tf.float32))

    gpuconfig = tf.ConfigProto()
    gpuconfig.gpu_options.allow_growth = True
    gpuconfig.allow_soft_placement = True
    saver = tf.train.Saver()

    with tf.Session(config=gpuconfig) as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        # no explicit init op here: all variables are restored from the
        # latest checkpoint
        ckpts = tf.train.latest_checkpoint(config.model_output)
        saver.restore(sess, ckpts)
        try:
            while not coord.should_stop():
                ims, labs, probs, err, res = sess.run([
                    test_images, test_labels, model.output, error, results])
                # drop into the debugger to inspect predictions interactively
                import ipdb
                ipdb.set_trace()
        except tf.errors.OutOfRangeError:
            print('Epoch limit reached!')
        finally:
            coord.request_stop()
        coord.join(threads)


# def get_model_predictions(config, patches):
#     input = tf.placeholder(
#         tf.float32,
#         [None, config.image_target_size[0], config.image_target_size[1],
#          config.image_target_size[2]],
#         name='ip_placeholder')
#     with tf.device('/gpu:0'):
#         with tf.variable_scope("model") as scope:
#             model = cnn_model_struct()
#             model.build(input, config.num_classes, train_mode=False)
#
#     gpuconfig = tf.ConfigProto()
#     gpuconfig.gpu_options.allow_growth = True
#     gpuconfig.allow_soft_placement = True
#     saver = tf.train.Saver()
#
#     with tf.Session(config=gpuconfig) as sess:
#         try:
#             # load the model here
#             ckpts = tf.train.latest_checkpoint(config.model_output)
#             saver.restore(sess, ckpts)
#             probs = sess.run(model.output, feed_dict={input: patches})
#         except tf.errors.OutOfRangeError:
#             print('Epoch limit reached!')
#         finally:
#             print('done')
#     return probs
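# Usage sketch for the commented-out get_model_predictions helper above,
# assuming it were re-enabled: feed a batch of patches shaped
# [N] + config.image_target_size and get back class probabilities. The
# zero-filled batch below is purely illustrative.
# patches = np.zeros([8] + list(config.image_target_size), dtype=np.float32)
# probs = get_model_predictions(config, patches)  # shape [8, num_classes]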
def train_model(config):
    train_data = os.path.join(config.tfrecord_dir, config.train_tfrecords)
    val_data = os.path.join(config.tfrecord_dir, config.val_tfrecords)

    with tf.device('/cpu:0'):
        train_images, train_labels = inputs(
            tfrecord_file=train_data,
            num_epochs=config.epochs,
            image_target_size=config.image_target_size,
            label_shape=config.num_classes,
            batch_size=config.train_batch,
            augmentation=True)
        val_images, val_labels = inputs(
            tfrecord_file=val_data,
            num_epochs=config.epochs,
            image_target_size=config.image_target_size,
            label_shape=config.num_classes,
            batch_size=config.val_batch)

    with tf.device('/gpu:0'):
        with tf.variable_scope("model") as scope:
            print("creating the model")
            model = cnn_model_struct()
            model.build(train_images, config.num_classes, train_mode=True)
            y_conv = model.output
            y_ = tf.cast(train_labels, tf.int64)
            yhat = tf.argmax(y_conv, 1)

            # Define loss and optimizer
            with tf.name_scope('loss'):
                cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                    labels=y_, logits=y_conv)
                cross_entropy = tf.reduce_mean(cross_entropy)

            with tf.name_scope('adam_optimizer'):
                train_step = tf.train.AdamOptimizer(1e-4).minimize(
                    cross_entropy)

            with tf.name_scope('accuracy'):
                correct_prediction = tf.equal(tf.argmax(y_conv, 1), y_)
                correct_prediction = tf.cast(correct_prediction, tf.float32)
                accuracy = tf.reduce_mean(correct_prediction)

            print("using validation")
            # share weights with the training model
            scope.reuse_variables()
            val_model = cnn_model_struct()
            val_model.build(val_images, config.num_classes, train_mode=False)
            val_results = tf.argmax(val_model.output, 1)
            # note: like "train error" below, this is actually accuracy
            # (fraction of correct predictions), despite the name
            val_error = tf.reduce_mean(
                tf.cast(tf.equal(val_results, tf.cast(val_labels, tf.int64)),
                        tf.float32))

    tf.summary.scalar("loss", cross_entropy)
    tf.summary.scalar("train error", accuracy)
    tf.summary.scalar("validation error", val_error)
    summary_op = tf.summary.merge_all()
    saver = tf.train.Saver(tf.global_variables())

    gpuconfig = tf.ConfigProto()
    gpuconfig.gpu_options.allow_growth = True
    gpuconfig.allow_soft_placement = True

    with tf.Session(config=gpuconfig) as sess:
        graph_location = tempfile.mkdtemp()
        print('Saving graph to: %s' % graph_location)
        train_writer = tf.summary.FileWriter(graph_location)
        train_writer.add_graph(tf.get_default_graph())
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        step = 0
        try:
            while not coord.should_stop():
                # train for a step
                (_, tr_images, tr_labels, loss, softmax_outputs,
                 pred_labels, error) = sess.run([
                     train_step, train_images, train_labels, cross_entropy,
                     y_conv, yhat, accuracy])
                print("step={}, loss={}, accuracy={}".format(
                    step, loss, error))
                step += 1

                # validate the model
                if step % 200 == 0:
                    vl_img, vl_lab, vl_res, vl_err = sess.run(
                        [val_images, val_labels, val_results, val_error])
                    print("\t val error = {}".format(vl_err))
                    summary_str = sess.run(summary_op)
                    train_writer.add_summary(summary_str, step)

                # save a model checkpoint
                if step % 1000 == 0:
                    saver.save(sess, os.path.join(
                        config.model_output,
                        config.model_name + '_' + str(step) + '.ckpt'),
                        global_step=step)
        except tf.errors.OutOfRangeError:
            print("Finished training for %d epochs" % config.epochs)
        finally:
            coord.request_stop()
        coord.join(threads)
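# The inputs helper used throughout these trainers lives in another module.
# A minimal sketch of a TF1 queue-based TFRecord reader matching this call
# signature; the _sketch name, feature keys, dtypes, and augmentation choice
# are all assumptions about the real implementation:
def inputs_sketch(tfrecord_file, num_epochs, image_target_size, label_shape,
                  batch_size, augmentation=False):
    # queue of input files; num_epochs=None loops forever, which is why the
    # eval loops catch OutOfRangeError only when an epoch count is set
    filename_queue = tf.train.string_input_producer(
        [tfrecord_file], num_epochs=num_epochs)
    reader = tf.TFRecordReader()
    _, serialized = reader.read(filename_queue)
    features = tf.parse_single_example(serialized, features={
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64)})
    image = tf.reshape(
        tf.decode_raw(features['image'], tf.float32), image_target_size)
    if augmentation:
        image = tf.image.random_flip_left_right(image)
    # shuffle_batch registers the queue runners that the training loops
    # start via tf.train.start_queue_runners
    return tf.train.shuffle_batch(
        [image, features['label']], batch_size=batch_size,
        capacity=1000 + 3 * batch_size, min_after_dequeue=1000)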
def train_classification_model(config):
    train_files = os.path.join(config.base_dir, config.tfrecord_dir,
                               'train_model_classifier.tfrecords')
    val_files = os.path.join(config.base_dir, config.tfrecord_dir,
                             'val_model_classifier.tfrecords')

    with tf.device('/cpu:0'):
        train_data, train_labels = inputs(
            tfrecord_file=train_files,
            num_epochs=config.epochs,
            batch_size=config.train_batch,
            target_data_dims=[None, 1, 256, 2],
            target_label_dims=[None, 1, 1])
        val_data, val_labels = inputs(
            tfrecord_file=val_files,
            num_epochs=config.epochs,
            batch_size=config.val_batch,
            target_data_dims=[None, 1, 256, 2],
            target_label_dims=[None, 1, 1])

    with tf.device('/gpu:0'):
        with tf.variable_scope("classmodel") as scope:
            print("creating the model")
            model = classification_model()
            model.build(train_data, [1, 256, 2], [1, 5, 1],
                        train_mode=True, full_cov=config.full_cov_matrix)
            y_conv = model.output
            nparams = np.prod(config.param_dims[1:])

            # Define loss and optimizer: 5-way softmax over one-hot labels
            with tf.name_scope('loss'):
                labels = tf.one_hot(
                    tf.cast(tf.squeeze(train_labels), dtype=tf.uint8), 5)
                loss = tf.reduce_sum(
                    tf.nn.softmax_cross_entropy_with_logits(
                        labels=labels, logits=y_conv))

            with tf.name_scope('adam_optimizer'):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

            #####
            ## VALIDATION
            #####
            print("building a validation model")
            # share weights with the training model
            scope.reuse_variables()
            val_model = classification_model()
            val_model.build(val_data, [1, 256, 2], [1, 5, 1],
                            train_mode=False,
                            full_cov=config.full_cov_matrix)
            val_res = val_model.output
            val_loss = tf.reduce_sum(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=tf.one_hot(
                        tf.cast(tf.squeeze(val_labels), dtype=tf.uint8), 5),
                    logits=val_res))

    # note: the scalar summary tracks the validation loss
    tf.summary.scalar("loss", val_loss)
    summary_op = tf.summary.merge_all()
    saver = tf.train.Saver(tf.global_variables())

    gpuconfig = tf.ConfigProto()
    gpuconfig.gpu_options.allow_growth = True
    gpuconfig.allow_soft_placement = True

    with tf.Session(config=gpuconfig) as sess:
        train_writer = tf.summary.FileWriter(
            os.path.join(config.base_dir, config.summary_dir,
                         config.model_name))
        train_writer.add_graph(tf.get_default_graph())
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        step = 0
        start = time.time()
        try:
            while not coord.should_stop():
                # train for a step
                _, loss_np, outputs, tr_data, tr_labels = sess.run(
                    [train_step, loss, y_conv, train_data, train_labels])
                step += 1

                if step % config.print_iters == 0:
                    finish = time.time()
                    print("step={}, loss={}, time_elapsed={} s/step".format(
                        step, loss_np,
                        (finish - start) / float(config.print_iters)))
                    start = finish
                    saver.save(sess, os.path.join(
                        config.model_output,
                        config.model_name + '_' + str(step) + '.ckpt'),
                        global_step=step)

                if step % config.val_iters == 0:
                    val_forward_pass_time = time.time()
                    v_data, v_labels, v_res, v_loss = sess.run(
                        [val_data, val_labels, val_res, val_loss])
                    summary_str = sess.run(summary_op)
                    train_writer.add_summary(summary_str, step)
                    print("\t val loss = {}, time_elapsed = {}s".format(
                        v_loss, time.time() - val_forward_pass_time))
        except tf.errors.OutOfRangeError:
            print("Finished training for %d epochs" % config.epochs)
        finally:
            coord.request_stop()
        coord.join(threads)
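# The validation branch in train_classification_model reports only the
# summed cross-entropy. If class accuracy is also wanted, a small helper
# built from the same tensors could be fetched alongside val_loss. This is
# an illustrative add-on, not part of the original training graph; val_res
# is assumed to hold logits and val_labels integer class ids in extra dims.
def classification_accuracy(logits, labels):
    # fraction of correctly classified examples in the batch
    preds = tf.argmax(logits, axis=1)
    targets = tf.cast(tf.squeeze(labels), tf.int64)
    return tf.reduce_mean(tf.cast(tf.equal(preds, targets), tf.float32))

# usage sketch: val_acc = classification_accuracy(val_res, val_labels),
# then fetch val_acc inside the step % config.val_iters == 0 branch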
def train_model(config):
    train_data = os.path.join(config.tfrecord_dir, config.train_tfrecords)
    val_data = os.path.join(config.tfrecord_dir, config.val_tfrecords)

    with tf.device('/cpu:0'):
        train_images, train_labels = inputs(
            tfrecord_file=train_data,
            num_epochs=config.epochs,
            image_target_size=config.image_target_size,
            label_shape=config.label_shape,
            batch_size=config.train_batch,
            augmentation=True)
        val_images, val_labels = inputs(
            tfrecord_file=val_data,
            num_epochs=config.epochs,
            image_target_size=config.image_target_size,
            label_shape=config.label_shape,
            batch_size=config.val_batch)

    with tf.device('/gpu:0'):
        with tf.variable_scope("model") as scope:
            print("creating the model")
            model = cnn_model_struct()
            model.build(train_images, config.num_classes, train_mode=True)
            y_conv = model.output

            # Define loss and optimizer
            with tf.name_scope('loss'):
                reg_loss = tf.nn.l2_loss(y_conv - train_labels)

            with tf.name_scope('adam_optimizer'):
                # an earlier experiment added weight decay on non-bias
                # variables:
                # wd_l = [v for v in tf.get_collection(
                #     tf.GraphKeys.TRAINABLE_VARIABLES)
                #     if 'biases' not in v.name]
                # loss_wd = reg_loss + (0.0005 * tf.add_n(
                #     [tf.nn.l2_loss(x) for x in wd_l]))
                # train_step = tf.train.AdamOptimizer(1e-4).minimize(loss_wd)
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_step = tf.train.AdamOptimizer(1e-4).minimize(
                        reg_loss)

            print("using validation")
            # note: unlike the other trainers, the validation model lives in
            # its own variable scope, so its weights are NOT shared with the
            # training model; scope.reuse_variables() would share them
            # scope.reuse_variables()
            with tf.variable_scope('val_model', reuse=tf.AUTO_REUSE):
                val_model = cnn_model_struct()
                val_model.build(val_images, config.num_classes,
                                train_mode=False)
                val_res = val_model.output
                val_error = tf.reduce_mean(tf.sqrt(
                    tf.reduce_sum(tf.square(val_labels - val_res))))

    tf.summary.scalar("loss", reg_loss)
    summary_op = tf.summary.merge_all()
    saver = tf.train.Saver(tf.global_variables())

    gpuconfig = tf.ConfigProto()
    gpuconfig.gpu_options.allow_growth = True
    gpuconfig.allow_soft_placement = True

    with tf.Session(config=gpuconfig) as sess:
        graph_location = tempfile.mkdtemp()
        print('Saving graph to: %s' % graph_location)
        train_writer = tf.summary.FileWriter(graph_location)
        train_writer.add_graph(tf.get_default_graph())
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        step = 0
        try:
            while not coord.should_stop():
                # train for a step
                _, tr_images, tr_labels, loss, softmax_outputs = sess.run([
                    train_step, train_images, train_labels,
                    reg_loss, y_conv])
                print("step={}, loss={}".format(step, loss))
                step += 1

                # validate the model
                if step % 200 == 0:
                    vl_img, vl_lab, vl_res, vl_err = sess.run(
                        [val_images, val_labels, val_res, val_error])
                    print("\t validating")
                    print("\t val error = {}".format(vl_err))
                    summary_str = sess.run(summary_op)
                    train_writer.add_summary(summary_str, step)

                # save a model checkpoint
                if step % 250 == 0:
                    saver.save(sess, os.path.join(
                        config.model_output,
                        config.model_name + '_' + str(step) + '.ckpt'),
                        global_step=step)
        except tf.errors.OutOfRangeError:
            print("Finished training for %d epochs" % config.epochs)
        finally:
            coord.request_stop()
        coord.join(threads)
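# Because the validation tower above is built inside its own 'val_model'
# scope, it evaluates a second, untrained copy of the network. The other
# trainers in this file share weights instead; a minimal illustration of
# that pattern (model class and build arguments as used above):
#
# with tf.variable_scope("model") as scope:
#     model = cnn_model_struct()
#     model.build(train_images, config.num_classes, train_mode=True)
#     scope.reuse_variables()          # reuse the SAME variables...
#     val_model = cnn_model_struct()   # ...for the validation tower
#     val_model.build(val_images, config.num_classes, train_mode=False)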