def train(modelParams, epochNumber):
    """Restore a saved model, freeze its graph and run it over the input pipeline.

    Despite its name this variant performs no optimisation step: it rebuilds the
    inference/loss graph, restores the checkpoint
    ``<trainLogDir>_v/model.ckpt-<epochNumber>``, writes ``model.pbtxt`` and a
    frozen ``model.pb`` next to that checkpoint, then runs ``maxSteps`` batches,
    printing per-sample argmax target/estimate pairs and forwarding every batch
    to ``data_output.output``.

    Args:
        modelParams: dict of run settings. Keys read here: ``modelName``,
            ``dataDir``, ``logDir``, ``trainLogDir``, ``phase``,
            ``classificationModel`` (optional), ``logDevicePlacement``,
            ``maxSteps``, ``activeBatchSize``, ``trainBatchSize`` and
            ``outputDir``. The whole dict is also forwarded as keyword
            arguments to the input, model and output helpers.
        epochNumber: step suffix of the checkpoint to restore.

    Raises:
        ValueError: if ``modelParams['dataDir']`` does not exist.
    """
    # import corresponding model name as model_cnn, specified at json file
    model_cnn = importlib.import_module('Model_Factory.' + modelParams['modelName'])

    if not os.path.exists(modelParams['dataDir']):
        raise ValueError("No such data directory %s" % modelParams['dataDir'])

    _setupLogging(os.path.join(modelParams['logDir'], "genlog"))

    maxSteps = modelParams['maxSteps']
    exModelDir = modelParams['trainLogDir'] + '_v'

    with tf.Graph().as_default():
        # Nothing trains here, but the variable is still created so that the
        # graph (and therefore the Saver's variable list) keeps the same set
        # of global variables as before.
        tf.get_variable('globalStep', [],
                        initializer=tf.constant_initializer(0),
                        trainable=False)

        # Get images inputs for model_cnn.
        if modelParams['phase'] == 'v':
            filename, pngTemp, targetT = data_input.inputs_vali(**modelParams)
        else:
            filename, pngTemp, targetT = data_input.inputs(**modelParams)
        print('Input ready')

        # Build a Graph that computes the predictions from the inference model.
        targetP, l2reg = model_cnn.inference_l2reg(pngTemp, **modelParams)
        print(targetP.get_shape())

        # loss model
        if modelParams.get('classificationModel'):
            print('Classification model...')
            loss = model_cnn.loss_l2reg(targetP, targetT, l2reg, **modelParams)
        else:
            print('Regression model...')
            loss = model_cnn.loss(targetP, targetT, **modelParams)
        print('Testing ready')

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        print('Saver ready')

        # The merged summary op is never run here; it is still built so the
        # exported/frozen graph contains the same nodes as before.
        tf.summary.merge_all()
        print('MergeSummary ready')

        # Start running operations on the Graph.
        config = tf.ConfigProto(
            log_device_placement=modelParams['logDevicePlacement'])
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config)
        print('Session ready')

        summaryWriter = None
        summaryValiWriter = None
        try:
            # Restore the trained weights (no initializer is run: every
            # variable is expected to come from the checkpoint).
            print('Loading Ex-Model with epoch number %d ...' % epochNumber)
            checkpointToLoad = exModelDir + '/model.ckpt-' + str(epochNumber)
            print(' ', checkpointToLoad)
            saver.restore(sess, checkpointToLoad)
            print('Ex-Model loaded')

            # Export the graph definition as text.
            tf.train.write_graph(sess.graph.as_graph_def(), '.',
                                 exModelDir + '/model.pbtxt', as_text=True)
            # Every node is listed as an output so nothing is pruned while
            # the variables are folded into constants.
            output_node_names = [
                n.name for n in tf.get_default_graph().as_graph_def().node
            ]
            frozen_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names)
            with open(exModelDir + '/model.pb', 'wb') as f:
                f.write(frozen_graph_def.SerializeToString())

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)
            print('QueueRunner started')

            summaryWriter = tf.summary.FileWriter(modelParams['logDir'], sess.graph)
            summaryValiWriter = tf.summary.FileWriter(modelParams['logDir'] + '_v',
                                                      sess.graph)

            print('Testing started')

            total_parameters = 0
            for variable in tf.trainable_variables():
                # shape is an array of tf.Dimension
                variable_parameters = 1
                for dim in variable.get_shape():
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            print('-----total parameters-------- ', total_parameters)

            durationSum = 0
            stepDurations = []  # per-step wall time, warm-up step 0 excluded
            lossValueSum = 0
            l2regValueSum = 0

            for step in xrange(0, maxSteps):
                startTime = time.time()
                npfilename, npTargetP, npTargetT, lossValue, l2regValue = sess.run(
                    [filename, targetP, targetT, loss, l2reg])
                duration = time.time() - startTime
                # Accumulate so the elapsed/remaining figures below are real.
                durationSum += duration
                if step != 0:
                    stepDurations.append(duration)
                print(duration, step, maxSteps)

                lossValueSum += lossValue
                l2regValueSum += l2regValue

                # Per-sample report: argmax of target vs. argmax of estimate.
                for ibx in range(modelParams['activeBatchSize']):
                    targetIdx = np.argmax(npTargetT[ibx])
                    estimateIdx = np.argmax(npTargetP[ibx])
                    stat = 'True' if targetIdx == estimateIdx else 'False'
                    print(npfilename[ibx].decode('ascii'), 'Target:', targetIdx,
                          'Estimate:', estimateIdx, stat)

                data_output.output(str(10000 + step), npfilename, npTargetP,
                                   npTargetT, **modelParams)

                # Print Progress Info
                if ((step % FLAGS.ProgressStepReportStep) == 0) or ((step + 1) == maxSteps):
                    print(
                        'Progress: %.2f%%, Elapsed: %.2f mins, Testing Completion in: %.2f mins --- %s'
                        % ((100.0 * step) / maxSteps, durationSum / 60,
                           (((durationSum * maxSteps) / (step + 1)) / 60) -
                           (durationSum / 60), datetime.now()))

            # Averages are only meaningful when at least one value exists.
            if stepDurations:
                print(np.array(stepDurations).mean())
            if maxSteps > 0:
                print('----- maxsteps:', maxSteps, '--- loss avg:',
                      lossValueSum / maxSteps, '--- l2regu avg:',
                      l2regValueSum / maxSteps)
                print('----- train scaled loss:',
                      (lossValueSum / maxSteps) * modelParams['trainBatchSize'])
                print('----- train scaled l2regu:',
                      (l2regValueSum / maxSteps) * modelParams['trainBatchSize'])
            print(modelParams['outputDir'])
        finally:
            # Release event files and the session even when a step fails.
            for writer in (summaryWriter, summaryValiWriter):
                if writer is not None:
                    writer.close()
            sess.close()

    # Must run outside the `with` block: TF refuses to reset nested graphs.
    tf.reset_default_graph()
def train(modelParams, epochNumber):
    """Train the configured model, tracking an L2-regularisation term.

    Builds the input pipeline, the ``inference_l2reg`` network, the loss and
    the training op, optionally restores
    ``<trainLogDir>/model.ckpt-<epochNumber>``, then runs training steps from
    ``epochNumber`` up to ``maxSteps`` while periodically logging, writing
    summaries and saving checkpoints under ``logDir``.

    Args:
        modelParams: dict of run settings. Keys read here: ``modelName``,
            ``dataDir``, ``logDir``, ``trainLogDir``, ``phase``,
            ``classificationModel`` (optional), ``logDevicePlacement``,
            ``maxSteps`` and ``activeBatchSize``. The whole dict is also
            forwarded as keyword arguments to the input and model helpers.
        epochNumber: first step of the loop; when greater than 0 the
            checkpoint with that step suffix is restored first.

    Raises:
        ValueError: if ``modelParams['dataDir']`` does not exist.
        AssertionError: if the loss becomes NaN.
    """
    # import corresponding model name as model_cnn, specified at json file
    model_cnn = importlib.import_module('Model_Factory.' + modelParams['modelName'])

    if not os.path.exists(modelParams['dataDir']):
        raise ValueError("No such data directory %s" % modelParams['dataDir'])

    _setupLogging(os.path.join(modelParams['logDir'], "genlog"))

    maxSteps = modelParams['maxSteps']
    checkpointPath = os.path.join(modelParams['logDir'], 'model.ckpt')

    with tf.Graph().as_default():
        # track the number of train calls (basically number of batches processed)
        globalStep = tf.get_variable('globalStep', [],
                                     initializer=tf.constant_initializer(0),
                                     trainable=False)

        # Get images inputs for model_cnn.
        if modelParams['phase'] == 'v':
            filename, pngTemp, targetT = data_input.inputs_vali(**modelParams)
        else:
            filename, pngTemp, targetT = data_input.inputs(**modelParams)
        print('Input ready')

        # Build a Graph that computes the predictions from the inference model.
        targetP, l2reg = model_cnn.inference_l2reg(pngTemp, **modelParams)
        print(targetP.get_shape())

        # loss model
        if modelParams.get('classificationModel'):
            print('Classification model...')
            loss = model_cnn.loss_l2reg(targetP, targetT, l2reg, **modelParams)
        else:
            print('Regression model...')
            loss = model_cnn.loss(targetP, targetT, **modelParams)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        opTrain = model_cnn.train(loss, globalStep, **modelParams)
        print('Training ready')

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        print('Saver ready')

        # Build the summary operation based on the TF collection of Summaries.
        summaryOp = tf.summary.merge_all()
        print('MergeSummary ready')

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph.
        config = tf.ConfigProto(
            log_device_placement=modelParams['logDevicePlacement'])
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config)
        print('Session ready')

        summaryWriter = None
        summaryValiWriter = None
        try:
            sess.run(init)

            # Optionally resume from an earlier checkpoint.
            if epochNumber > 0:
                print('Loading Ex-Model with epoch number %d ...' % epochNumber)
                saver.restore(sess, (modelParams['trainLogDir'] + '/model.ckpt-' +
                                     str(epochNumber)))
                print('Ex-Model loaded')

            tf.train.write_graph(sess.graph.as_graph_def(), '.',
                                 modelParams['trainLogDir'] + '/model.pbtxt',
                                 as_text=True)

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)
            print('QueueRunner started')

            summaryWriter = tf.summary.FileWriter(modelParams['logDir'], sess.graph)
            # Not written to in this variant; still created so the '_v' event
            # directory is produced as before.
            summaryValiWriter = tf.summary.FileWriter(modelParams['logDir'] + '_v',
                                                      sess.graph)

            total_parameters = 0
            for variable in tf.trainable_variables():
                # shape is an array of tf.Dimension
                variable_parameters = 1
                for dim in variable.get_shape():
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            print('-----total parameters-------- ', total_parameters)

            print('Training started')
            durationSum = 0
            logFormat = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                         'sec/batch), loss/batch = %.2f, l2reg = %.2f')

            for step in xrange(epochNumber, maxSteps):
                startTime = time.time()
                _, lossValue, l2regValue = sess.run([opTrain, loss, l2reg])
                duration = time.time() - startTime
                durationSum += duration
                assert not np.isnan(lossValue), 'Model diverged with loss = NaN'

                isLastStep = (step + 1) == maxSteps

                if step % FLAGS.printOutStep == 0:
                    numExamplesPerStep = modelParams['activeBatchSize']
                    # A coarse clock can report a zero-length step.
                    if duration > 0:
                        examplesPerSec = numExamplesPerStep / duration
                    else:
                        examplesPerSec = float('inf')
                    logging.info(logFormat, datetime.now(), step, lossValue,
                                 examplesPerSec, float(duration),
                                 lossValue / modelParams['activeBatchSize'],
                                 l2regValue)

                # merge_all() returns None when the graph declares no summaries.
                if step % FLAGS.summaryWriteStep == 0 and summaryOp is not None:
                    summaryStr = sess.run(summaryOp)
                    summaryWriter.add_summary(summaryStr, step)

                # Save the model checkpoint periodically.
                if step % FLAGS.modelCheckpointStep == 0 or isLastStep:
                    saver.save(sess, checkpointPath, global_step=step)

                # Print Progress Info
                if (step % FLAGS.ProgressStepReportStep) == 0 or isLastStep:
                    print(
                        'Progress: %.2f%%, Elapsed: %.2f mins, Training Completion in: %.2f mins --- %s'
                        % ((100.0 * step) / maxSteps, durationSum / 60,
                           (((durationSum * maxSteps) / (step + 1)) / 60) -
                           (durationSum / 60), datetime.now()))
        finally:
            # Flush event files and release the session even on failure.
            for writer in (summaryWriter, summaryValiWriter):
                if writer is not None:
                    writer.close()
            sess.close()
def train(modelParams, epochNumber):
    """Train the configured model with periodic validation and best-model export.

    Builds a training and a validation input pipeline, the network and loss on
    both, and the training op; optionally restores
    ``<trainLogDir>/model.ckpt-<epochNumber>``; then runs training steps from
    ``epochNumber`` up to ``maxSteps``. Once past ``maxSteps / 2``, every
    1000th step runs ``testMaxSteps`` validation batches; when the validation
    accuracy beats the best seen so far, that step's checkpoint files are
    copied into ``<logDir>_validation``.

    Args:
        modelParams: dict of run settings. Keys read here: ``modelName``,
            ``dataDir``, ``logDir``, ``trainLogDir``,
            ``classificationModel`` (optional), ``logDevicePlacement``,
            ``maxSteps``, ``testMaxSteps`` and ``activeBatchSize``. The whole
            dict is also forwarded as keyword arguments to the input and
            model helpers.
        epochNumber: first step of the loop; when greater than 0 the
            checkpoint with that step suffix is restored first.

    Raises:
        ValueError: if ``modelParams['dataDir']`` does not exist.
        AssertionError: if the training loss becomes NaN.
    """
    # import corresponding model name as model_cnn, specified at json file
    model_cnn = importlib.import_module('Model_Factory.' + modelParams['modelName'])

    if not os.path.exists(modelParams['dataDir']):
        raise ValueError("No such data directory %s" % modelParams['dataDir'])

    _setupLogging(os.path.join(modelParams['logDir'], "genlog"))

    maxSteps = modelParams['maxSteps']
    logDir = modelParams['logDir']
    valiDir = logDir + '_validation'
    checkpointPath = os.path.join(logDir, 'model.ckpt')

    with tf.Graph().as_default():
        # track the number of train calls (basically number of batches processed)
        globalStep = tf.get_variable('globalStep', [],
                                     initializer=tf.constant_initializer(0),
                                     trainable=False)

        # Get images inputs for model_cnn.
        filename, pngTemp, targetT = data_input.inputs(**modelParams)
        print('Input ready')
        filenamevali, pngTempvali, targetTvali = data_input.inputs_vali(
            **modelParams)

        # Build a Graph that computes the predictions from the inference model.
        targetP = model_cnn.inference(pngTemp, **modelParams)
        targetPvali = model_cnn.inference(pngTempvali, **modelParams)
        print(targetP.get_shape())

        # loss model
        if modelParams.get('classificationModel'):
            print('Classification model...')
        else:
            print('Regression model...')
        loss = model_cnn.loss(targetP, targetT, **modelParams)
        # Built for both model kinds: the validation pass below always needs
        # it, and it was previously undefined for regression runs.
        lossvali = model_cnn.loss(targetPvali, targetTvali, **modelParams)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        opTrain = model_cnn.train(loss, globalStep, **modelParams)
        print('Training ready')

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        print('Saver ready')

        # Build the summary operation based on the TF collection of Summaries.
        summaryOp = tf.summary.merge_all()
        print('MergeSummary ready')

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph.
        config = tf.ConfigProto(
            log_device_placement=modelParams['logDevicePlacement'])
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config)
        print('Session ready')

        summaryWriter = None
        summaryValiWriter = None
        try:
            sess.run(init)

            # Optionally resume from an earlier checkpoint.
            if epochNumber > 0:
                print('Loading Ex-Model with epoch number %d ...' % epochNumber)
                saver.restore(sess, (modelParams['trainLogDir'] + '/model.ckpt-' +
                                     str(epochNumber)))
                print('Ex-Model loaded')

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)
            print('QueueRunner started')

            summaryWriter = tf.summary.FileWriter(logDir, sess.graph)
            summaryValiWriter = tf.summary.FileWriter(valiDir, sess.graph)

            print('Training started')
            durationSum = 0
            prevValiSumLoss = 99999
            prevaccur = 0
            prevLossStep = 0
            # Validation only starts in the second half of training.
            prevStep = int(maxSteps / 2)
            logFormat = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                         'sec/batch), loss/batch = %.2f')

            for step in xrange(epochNumber, maxSteps):
                startTime = time.time()
                _, lossValue = sess.run([opTrain, loss])
                duration = time.time() - startTime
                durationSum += duration
                assert not np.isnan(lossValue), 'Model diverged with loss = NaN'

                isLastStep = (step + 1) == maxSteps

                if step % FLAGS.printOutStep == 0:
                    numExamplesPerStep = modelParams['activeBatchSize']
                    # A coarse clock can report a zero-length step.
                    if duration > 0:
                        examplesPerSec = numExamplesPerStep / duration
                    else:
                        examplesPerSec = float('inf')
                    logging.info(logFormat, datetime.now(), step, lossValue,
                                 examplesPerSec, float(duration),
                                 lossValue / modelParams['activeBatchSize'])

                # merge_all() returns None when the graph declares no summaries.
                if step % FLAGS.summaryWriteStep == 0 and summaryOp is not None:
                    summaryStr = sess.run(summaryOp)
                    summaryWriter.add_summary(summaryStr, step)

                # Save the model checkpoint periodically.
                checkpointSaved = False
                if step % FLAGS.modelCheckpointStep == 0 or isLastStep:
                    saver.save(sess, checkpointPath, global_step=step)
                    checkpointSaved = True

                # Print Progress Info
                if (step % FLAGS.ProgressStepReportStep) == 0 or isLastStep:
                    print(
                        'Progress: %.2f%%, Elapsed: %.2f mins, Training Completion in: %.2f mins --- %s'
                        % ((100.0 * step) / maxSteps, durationSum / 60,
                           (((durationSum * maxSteps) / (step + 1)) / 60) -
                           (durationSum / 60), datetime.now()))

                if step > prevStep and step % 1000 == 0:
                    prevStep = step
                    print(' Validation Function in progress... step ', step)
                    valiSteps = modelParams['testMaxSteps']
                    # Guards the averages below when no validation batch is run.
                    valiDivisor = max(valiSteps, 1)
                    lossvalidationsum = 0
                    numCorrect = 0
                    numSeen = 0
                    for _ in range(valiSteps):
                        lossvalsum, pvali, tvali = sess.run(
                            [lossvali, targetPvali, targetTvali])
                        lossvalidationsum += np.mean(np.array(lossvalsum))
                        # Accuracy is accumulated over every validation batch
                        # (argmax over each flattened sample), not just the
                        # final batch.
                        batchSize = pvali.shape[0]
                        predictedIdx = np.argmax(pvali.reshape(batchSize, -1), axis=1)
                        targetIdx = np.argmax(tvali.reshape(batchSize, -1), axis=1)
                        numCorrect += int(np.sum(predictedIdx == targetIdx))
                        numSeen += batchSize

                    accur = (100.0 * numCorrect / numSeen) if numSeen else 0.0
                    print(" Accuracy = ", accur)
                    print(" Prev Accuracy = ", prevaccur)
                    print(' Average loss = ', lossvalidationsum / valiDivisor)
                    print(' Prev loss = ', prevValiSumLoss / valiDivisor,
                          ' prevLossStep = ', prevLossStep)

                    if accur > prevaccur:
                        print(' Saving model')
                        # The copy needs a checkpoint for exactly this step;
                        # write one if the periodic save did not fire.
                        if not checkpointSaved:
                            saver.save(sess, checkpointPath, global_step=step)
                        # Copy every file of this checkpoint (data shards,
                        # index, meta) instead of assuming a single shard.
                        ckptFilePrefix = 'model.ckpt-' + str(step) + '.'
                        for fileName in os.listdir(logDir):
                            if fileName.startswith(ckptFilePrefix):
                                shutil.copy(os.path.join(logDir, fileName),
                                            os.path.join(valiDir, fileName))
                        prevaccur = accur
                        prevValiSumLoss = lossvalidationsum
                        prevLossStep = step

                    if summaryOp is not None:
                        summaryStr = sess.run(summaryOp)
                        summaryValiWriter.add_summary(summaryStr, step)

                # Diagnostic: more than ~1000 steps have passed since the last
                # validation point without a validation run.
                if step > prevStep and step - prevStep > 1001:
                    print(' ----------------SKIPPED')
                    print(' ----------------SKIPPED')
        finally:
            # Flush event files and release the session even on failure.
            for writer in (summaryWriter, summaryValiWriter):
                if writer is not None:
                    writer.close()
            sess.close()