Example #1
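All three examples below are variants of the same TensorFlow 1.x routine and assume module-level context that the listing omits. The following sketch reconstructs that context from the names the functions use; it is not part of the original source, and _setupLogging is a hypothetical stand-in for the repository's own helper (data_input, data_output, and FLAGS are likewise project modules/objects; FLAGS is sketched after Example #2).

# Assumed module-level context (reconstructed, not from the original source)
import importlib
import logging
import os
import shutil
import time
from datetime import datetime

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API


def _setupLogging(logPath):
    # hypothetical stand-in for the repository's logging helper
    logging.basicConfig(filename=logPath, level=logging.INFO,
                        format='%(asctime)s %(message)s')
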
def train(modelParams, epochNumber):
    # import the model module named in the JSON config file as model_cnn
    model_cnn = importlib.import_module('Model_Factory.' +
                                        modelParams['modelName'])

    if not os.path.exists(modelParams['dataDir']):
        raise ValueError("No such data directory %s" % modelParams['dataDir'])

    _setupLogging(os.path.join(modelParams['logDir'], "genlog"))

    with tf.Graph().as_default():
        # track the number of train calls (basically number of batches processed)
        globalStep = tf.get_variable('globalStep', [],
                                     initializer=tf.constant_initializer(0),
                                     trainable=False)
        # Get images inputs for model_cnn.
        if modelParams['phase'] == 'v':
            filename, pngTemp, targetT = data_input.inputs_vali(**modelParams)
        else:
            filename, pngTemp, targetT = data_input.inputs(**modelParams)
        print('Input        ready')
        #TEST###        filenametest, pngTemptest, targetTtest = data_input.inputs_test(**modelParams)

        # Build a Graph that computes the HAB predictions from the
        # inference model
        #targetP = model_cnn.inference(pngTemp, **modelParams)
        targetP, l2reg = model_cnn.inference_l2reg(pngTemp, **modelParams)
        #TEST###        targetPtest = model_cnn.inference(pngTemptest, **modelParams)
        print(targetP.get_shape())
        # loss model
        if modelParams.get('classificationModel'):
            print('Classification model...')
            # loss on last tuple
            #loss = model_cnn.loss(targetP, targetT, **modelParams)
            loss = model_cnn.loss_l2reg(targetP, targetT, l2reg, **modelParams)


#TEST###            losstest = model_cnn.loss(targetPtest, targetTtest, **modelParams)
        else:
            print('Regression model...')
            # loss on last tuple
            loss = model_cnn.loss(targetP, targetT, **modelParams)

        # No training op here: this variant only evaluates a restored model.
        #opTrain = model_cnn.train(loss, globalStep, **modelParams)
        ##############################
        print('Testing     ready')
        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        print('Saver        ready')

        # Build the summary operation based on the TF collection of Summaries.
        summaryOp = tf.summary.merge_all()
        print('MergeSummary ready')
        # No explicit variable initialization: all variables are restored
        # from the checkpoint below.

        #opCheck = tf.add_check_numerics_ops()
        # Start running operations on the Graph.
        config = tf.ConfigProto(
            log_device_placement=modelParams['logDevicePlacement'])
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config)
        print('Session      ready')

        #sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        #sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        #        sess.run(init)

        # restore a saver.
        print('Loading Ex-Model with epoch number %d ...' % epochNumber)
        print('     ',
              modelParams['trainLogDir'] + '_v/model.ckpt-' + str(epochNumber))
        saver.restore(
            sess,
            (modelParams['trainLogDir'] + '_v/model.ckpt-' + str(epochNumber)))
        #saver.restore(sess, (modelParams['trainLogDir']+'_30k/model.ckpt-29000'))
        print('Ex-Model     loaded')

        # Freeze the graph: first dump the GraphDef as text
        tf.train.write_graph(sess.graph.as_graph_def(),
                             '.',
                             modelParams['trainLogDir'] + '_v/model.pbtxt',
                             as_text=True)
        # Treat every node as an output so the whole graph is retained
        output_node_names = [
            n.name for n in tf.get_default_graph().as_graph_def().node
        ]
        # Convert variables to constants and serialize the frozen graph
        frozen_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def, output_node_names)
        with open(modelParams['trainLogDir'] + '_v/model.pb', 'wb') as f:
            f.write(frozen_graph_def.SerializeToString())

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        print('QueueRunner  started')

        summaryWriter = tf.summary.FileWriter(modelParams['logDir'],
                                              sess.graph)
        summaryValiWriter = tf.summary.FileWriter(modelParams['logDir'] + '_v',
                                                  sess.graph)
        #TEST###        summaryValiWriter = tf.summary.FileWriter(modelParams['logDir']+'_test', sess.graph)

        print('Testing     started')
        durationSum = 0
        l = list()  # per-step durations; the first (warm-up) step is excluded
        lossValueSum = 0
        l2regValueSum = 0

        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is a list of tf.Dimension
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print('-----total parameters-------- ', total_parameters)

        for step in range(0, modelParams['maxSteps']):
            startTime = time.time()
            npfilename, npTargetP, npTargetT, lossValue, l2regValue = sess.run(
                [filename, targetP, targetT, loss, l2reg])
            duration = time.time() - startTime
            durationSum += duration
            if step != 0:
                l.append(duration)  # skip the first (warm-up) step
            print(duration, step, modelParams['maxSteps'])
            lossValueSum += lossValue
            l2regValueSum += l2regValue
            ################# DEMO: per-sample target vs. estimate
            for ibx in range(modelParams['activeBatchSize']):
                stat = 'False'
                if np.argmax(npTargetT[ibx]) == np.argmax(npTargetP[ibx]):
                    stat = 'True'
                print(npfilename[ibx].decode('ascii'), 'Target:',
                      np.argmax(npTargetT[ibx]), 'Estimate:',
                      np.argmax(npTargetP[ibx]), stat)
                # (The original kept a commented-out cv2 block here that
                # re-read each PNG, min-max normalized it, and displayed it
                # with cv2.imshow for visual inspection.)
            #################

            data_output.output(str(10000 + step), npfilename, npTargetP,
                               npTargetT, **modelParams)
            # Print Progress Info
            if ((step % FLAGS.ProgressStepReportStep)
                    == 0) or ((step + 1) == modelParams['maxSteps']):
                print(
                    'Progress: %.2f%%, Elapsed: %.2f mins, Testing Completion in: %.2f mins --- %s'
                    %
                    ((100 * step) / modelParams['maxSteps'], durationSum / 60,
                     (((durationSum * modelParams['maxSteps']) /
                       (step + 1)) / 60) - (durationSum / 60), datetime.now()))
        print('----- mean step duration (s):', np.array(l).mean())
        print('----- maxsteps:', modelParams['maxSteps'], '--- loss avg:',
              lossValueSum / modelParams['maxSteps'], '--- l2regu avg:',
              l2regValueSum / modelParams['maxSteps'])
        print('----- train scaled loss:',
              (lossValueSum / modelParams['maxSteps']) *
              modelParams['trainBatchSize'])
        print('----- train scaled l2regu:',
              (l2regValueSum / modelParams['maxSteps']) *
              modelParams['trainBatchSize'])
        print(modelParams['outputDir'])

        sess.close()
    tf.reset_default_graph()
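
Example #1 freezes the evaluated graph into model.pb. As a follow-up, here is a minimal TF 1.x sketch, not from the original source, of how such a frozen graph can be loaded back for inference; the path argument is illustrative.

# Minimal sketch: load a frozen GraphDef such as the model.pb written above
import tensorflow as tf

def load_frozen_graph(pbPath):
    graphDef = tf.GraphDef()
    with tf.gfile.GFile(pbPath, 'rb') as f:
        graphDef.ParseFromString(f.read())
    graph = tf.Graph()
    with graph.as_default():
        # name='' keeps the original node names (no import prefix)
        tf.import_graph_def(graphDef, name='')
    return graph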
Example #2
def train(modelParams, epochNumber):
    # import the model module named in the JSON config file as model_cnn
    model_cnn = importlib.import_module('Model_Factory.' +
                                        modelParams['modelName'])

    if not os.path.exists(modelParams['dataDir']):
        raise ValueError("No such data directory %s" % modelParams['dataDir'])

    _setupLogging(os.path.join(modelParams['logDir'], "genlog"))

    with tf.Graph().as_default():
        # track the number of train calls (basically number of batches processed)
        globalStep = tf.get_variable('globalStep', [],
                                     initializer=tf.constant_initializer(0),
                                     trainable=False)

        # Get images inputs for model_cnn.
        if modelParams['phase'] == 'v':
            filename, pngTemp, targetT = data_input.inputs_vali(**modelParams)
        else:
            filename, pngTemp, targetT = data_input.inputs(**modelParams)
        print('Input        ready')
        #TEST###        filenametest, pngTemptest, targetTtest = data_input.inputs_test(**modelParams)

        # Build a Graph that computes the HAB predictions from the
        # inference model
        #targetP = model_cnn.inference(pngTemp, **modelParams)
        targetP, l2reg = model_cnn.inference_l2reg(pngTemp, **modelParams)
        #TEST###        targetPtest = model_cnn.inference(pngTemptest, **modelParams)
        print(targetP.get_shape())
        # loss model
        if modelParams.get('classificationModel'):
            print('Classification model...')
            # loss on last tuple
            #loss = model_cnn.loss(targetP, targetT, **modelParams)
            loss = model_cnn.loss_l2reg(targetP, targetT, l2reg, **modelParams)


#TEST###            losstest = model_cnn.loss(targetPtest, targetTtest, **modelParams)
        else:
            print('Regression model...')
            # loss on last tuple
            loss = model_cnn.loss(targetP, targetT, **modelParams)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        opTrain = model_cnn.train(loss, globalStep, **modelParams)
        ##############################
        print('Training     ready')
        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        print('Saver        ready')

        # Build the summary operation based on the TF collection of Summaries.
        summaryOp = tf.summary.merge_all()
        print('MergeSummary ready')
        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        #opCheck = tf.add_check_numerics_ops()
        # Start running operations on the Graph.
        config = tf.ConfigProto(
            log_device_placement=modelParams['logDevicePlacement'])
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config)
        print('Session      ready')

        #sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        #sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        sess.run(init)

        # restore a saver.
        if epochNumber > 0:
            print('Loading Ex-Model with epoch number %d ...' % epochNumber)
            saver.restore(sess, (modelParams['trainLogDir'] + '/model.ckpt-' +
                                 str(epochNumber)))
            #saver.restore(sess, (modelParams['trainLogDir']+'_30k/model.ckpt-29000'))
            print('Ex-Model     loaded')

        tf.train.write_graph(sess.graph.as_graph_def(),
                             '.',
                             modelParams['trainLogDir'] + '/model.pbtxt',
                             as_text=True)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        print('QueueRunner  started')

        summaryWriter = tf.summary.FileWriter(modelParams['logDir'],
                                              sess.graph)
        summaryValiWriter = tf.summary.FileWriter(modelParams['logDir'] + '_v',
                                                  sess.graph)
        #TEST###        summaryValiWriter = tf.summary.FileWriter(modelParams['logDir']+'_test', sess.graph)

        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is a list of tf.Dimension
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print('-----total parameters-------- ', total_parameters)

        print('Training     started')
        durationSum = 0
        for step in range(epochNumber, modelParams['maxSteps']):
            startTime = time.time()
            _, lossValue, l2regValue = sess.run([opTrain, loss, l2reg])
            duration = time.time() - startTime
            durationSum += duration
            assert not np.isnan(lossValue), 'Model diverged with loss = NaN'

            if step % FLAGS.printOutStep == 0:
                numExamplesPerStep = modelParams['activeBatchSize']
                examplesPerSec = numExamplesPerStep / duration
                secPerBatch = float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch), loss/batch = %.2f, l2reg = %.2f')
                logging.info(format_str %
                             (datetime.now(), step, lossValue, examplesPerSec,
                              secPerBatch, lossValue /
                              modelParams['activeBatchSize'], l2regValue))

            if step % FLAGS.summaryWriteStep == 0:
                summaryStr = sess.run(summaryOp)
                summaryWriter.add_summary(summaryStr, step)
            # Save the model checkpoint periodically.
            if step % FLAGS.modelCheckpointStep == 0 or (
                    step + 1) == modelParams['maxSteps']:
                checkpointPath = os.path.join(modelParams['logDir'],
                                              'model.ckpt')
                saver.save(sess, checkpointPath, global_step=step)
            # Print Progress Info
            if ((step % FLAGS.ProgressStepReportStep)
                    == 0) or ((step + 1) == modelParams['maxSteps']):
                print(
                    'Progress: %.2f%%, Elapsed: %.2f mins, Training Completion in: %.2f mins --- %s'
                    %
                    ((100 * step) / modelParams['maxSteps'], durationSum / 60,
                     (((durationSum * modelParams['maxSteps']) /
                       (step + 1)) / 60) - (durationSum / 60), datetime.now()))
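
The training loop above reads its reporting intervals from a module-level FLAGS object. A plausible definition using TF 1.x tf.app.flags is sketched below; the flag names match what the examples read, but the default values are illustrative assumptions.

# Hypothetical FLAGS definition (names as used above; values illustrative)
import tensorflow as tf

tf.app.flags.DEFINE_integer('printOutStep', 10,
                            'log the loss every N steps')
tf.app.flags.DEFINE_integer('summaryWriteStep', 100,
                            'write TensorBoard summaries every N steps')
tf.app.flags.DEFINE_integer('modelCheckpointStep', 1000,
                            'save a checkpoint every N steps')
tf.app.flags.DEFINE_integer('ProgressStepReportStep', 250,
                            'print progress every N steps')
FLAGS = tf.app.flags.FLAGS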
Example #3
def train(modelParams, epochNumber):
    # import the model module named in the JSON config file as model_cnn
    model_cnn = importlib.import_module('Model_Factory.' +
                                        modelParams['modelName'])

    if not os.path.exists(modelParams['dataDir']):
        raise ValueError("No such data directory %s" % modelParams['dataDir'])

    _setupLogging(os.path.join(modelParams['logDir'], "genlog"))

    with tf.Graph().as_default():
        # track the number of train calls (basically number of batches processed)
        globalStep = tf.get_variable('globalStep', [],
                                     initializer=tf.constant_initializer(0),
                                     trainable=False)

        # Get images inputs for model_cnn.
        filename, pngTemp, targetT = data_input.inputs(**modelParams)
        print('Input        ready')
        filenamevali, pngTempvali, targetTvali = data_input.inputs_vali(
            **modelParams)
        #TEST###        filenametest, pngTemptest, targetTtest = data_input.inputs_test(**modelParams)

        # Build a Graph that computes the HAB predictions from the
        # inference model
        targetP = model_cnn.inference(pngTemp, **modelParams)
        targetPvali = model_cnn.inference(pngTempvali, **modelParams)
        #TEST###        targetPtest = model_cnn.inference(pngTemptest, **modelParams)
        print(targetP.get_shape())
        # loss model
        if modelParams.get('classificationModel'):
            print('Classification model...')
            # loss on last tuple
            loss = model_cnn.loss(targetP, targetT, **modelParams)
            lossvali = model_cnn.loss(targetPvali, targetTvali, **modelParams)
#TEST###            losstest = model_cnn.loss(targetPtest, targetTtest, **modelParams)
        else:
            print('Regression model...')
            # loss on last tuple; the validation loss is also needed below
            loss = model_cnn.loss(targetP, targetT, **modelParams)
            lossvali = model_cnn.loss(targetPvali, targetTvali, **modelParams)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        opTrain = model_cnn.train(loss, globalStep, **modelParams)
        ##############################
        print('Training     ready')
        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())
        print('Saver        ready')

        # Build the summary operation based on the TF collection of Summaries.
        summaryOp = tf.summary.merge_all()
        print('MergeSummary ready')
        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        #opCheck = tf.add_check_numerics_ops()
        # Start running operations on the Graph.
        config = tf.ConfigProto(
            log_device_placement=modelParams['logDevicePlacement'])
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config)
        print('Session      ready')

        #sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        #sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        sess.run(init)

        # restore a saver.
        if epochNumber > 0:
            print('Loading Ex-Model with epoch number %d ...' % epochNumber)
            saver.restore(sess, (modelParams['trainLogDir'] + '/model.ckpt-' +
                                 str(epochNumber)))
            #saver.restore(sess, (modelParams['trainLogDir']+'_30k/model.ckpt-29000'))
            print('Ex-Model     loaded')

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        print('QueueRunner  started')

        summaryWriter = tf.summary.FileWriter(modelParams['logDir'],
                                              sess.graph)
        summaryValiWriter = tf.summary.FileWriter(
            modelParams['logDir'] + '_validation', sess.graph)
        #TEST###        summaryValiWriter = tf.summary.FileWriter(modelParams['logDir']+'_test', sess.graph)

        print('Training     started')
        durationSum = 0
        prevValiSumLoss = 99999
        prevaccur = 0
        prevLossStep = 0
        prevStep = int(modelParams['maxSteps'] / 2)
        for step in range(epochNumber, modelParams['maxSteps']):
            startTime = time.time()
            _, lossValue = sess.run([opTrain, loss])
            duration = time.time() - startTime
            durationSum += duration
            assert not np.isnan(lossValue), 'Model diverged with loss = NaN'

            if step % FLAGS.printOutStep == 0:
                numExamplesPerStep = modelParams['activeBatchSize']
                examplesPerSec = numExamplesPerStep / duration
                secPerBatch = float(duration)
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch), loss/batch = %.2f')
                logging.info(
                    format_str %
                    (datetime.now(), step, lossValue, examplesPerSec,
                     secPerBatch, lossValue / modelParams['activeBatchSize']))

            if step % FLAGS.summaryWriteStep == 0:
                summaryStr = sess.run(summaryOp)
                summaryWriter.add_summary(summaryStr, step)
            # Save the model checkpoint periodically.
            if step % FLAGS.modelCheckpointStep == 0 or (
                    step + 1) == modelParams['maxSteps']:
                checkpointPath = os.path.join(modelParams['logDir'],
                                              'model.ckpt')
                saver.save(sess, checkpointPath, global_step=step)
            # Print Progress Info
            if ((step % FLAGS.ProgressStepReportStep)
                    == 0) or ((step + 1) == modelParams['maxSteps']):
                print(
                    'Progress: %.2f%%, Elapsed: %.2f mins, Training Completion in: %.2f mins --- %s'
                    %
                    ((100 * step) / modelParams['maxSteps'], durationSum / 60,
                     (((durationSum * modelParams['maxSteps']) /
                       (step + 1)) / 60) - (durationSum / 60), datetime.now()))
            # run validation every 1000 steps once past the halfway point
            if step > prevStep and step % 1000 == 0:
                prevStep = step
                print('     Validation Function in progress... step ', step)
                lossvalidationsum = 0
                for i in range(0, modelParams['testMaxSteps']):
                    lossvalsum, pvali, tvali = sess.run(
                        [lossvali, targetPvali, targetTvali])
                    lossvalidationsum += np.mean(np.array(lossvalsum))
#TEST###                print('     Average loss = ', lossvalidationsum/modelParams['valiSteps'])

                # accuracy is computed on the last validation batch only
                pos1 = 0
                neg1 = 0
                for jacc in range(pvali.shape[0]):
                    pidx = np.argmax(pvali[jacc])
                    tidx = np.argmax(tvali[jacc])
                    if tidx == pidx:
                        pos1 += 1
                    else:
                        neg1 += 1
                accur = 100 * pos1 / (pos1 + neg1)
                print("		Accuracy	  = ", accur)
                print("		Prev Accuracy = ", prevaccur)
                print('     Average loss  = ',
                      lossvalidationsum / modelParams['testMaxSteps'])
                print('     Prev    loss  = ',
                      prevValiSumLoss / modelParams['testMaxSteps'],
                      '    prevLossStep = ', prevLossStep)
                if accur > prevaccur:
                    print('     Saving model')
                    # copy the checkpoint triplet to the _validation folder
                    for ext in ('.data-00000-of-00001', '.index', '.meta'):
                        shutil.copy(
                            modelParams['logDir'] + '/model.ckpt-' +
                            str(step) + ext,
                            modelParams['logDir'] + '_validation/model.ckpt-' +
                            str(step) + ext)
                    prevaccur = accur
                    prevValiSumLoss = lossvalidationsum
                    prevLossStep = step
                summaryStr = sess.run(summaryOp)
                summaryValiWriter.add_summary(summaryStr, step)
            if step > prevStep and step - prevStep > 1001:
                # warn when more than 1000 steps passed without validation
                print('     ----------------SKIPPED')
                print('     ----------------SKIPPED')
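
The validation block above scores accuracy by comparing argmax over the predicted and one-hot target vectors of the last validation batch. The toy NumPy snippet below, not part of the original, reproduces that computation on made-up data.

# Toy illustration of the argmax accuracy used above (made-up data)
import numpy as np

pvali = np.array([[0.1, 0.7, 0.2],   # predicted class 1
                  [0.8, 0.1, 0.1]])  # predicted class 0
tvali = np.array([[0, 1, 0],         # true class 1
                  [0, 0, 1]])        # true class 2
matches = np.argmax(pvali, axis=1) == np.argmax(tvali, axis=1)
print(100.0 * matches.sum() / len(matches))  # -> 50.0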