Example #1
def train_net():
    # ------ build the computation graph ------
    x_train, y_train, x_val, y_val = input_data.read_img(
        FILEPATH, WIDTH, HEIGHT, CHANNELS, ratio)
    x_train_batch, y_train_batch = input_data.bulid_batch(
        x_train, y_train, BATCH_SIZE)
    x_val_batch, y_val_batch = input_data.bulid_batch(x_val, y_val, BATCH_SIZE)
    batch_train_len = x_train_batch.shape[0]
    batch_val_len = x_val_batch.shape[0]

    # Define the network: x is the input placeholder, y is the label placeholder
    #image_max = tf.reduce_max(x_train, name='image_max')
    #image_min = tf.reduce_min(x_train,name='image_min')

    x = tf.placeholder(tf.float32,
                       shape=[BATCH_SIZE, HEIGHT, WIDTH, CHANNELS],
                       name='input')
    y = tf.placeholder(tf.int64, shape=[BATCH_SIZE], name='labels_placeholder')
    _, _, softmax_linear = model.build_network(x, True, False)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=softmax_linear, labels=y, name='xentropy_per_example')
    train_loss = tf.reduce_mean(cross_entropy, name='loss')

    if modeltype != 'NOQUANT':
        # Insert fake-quant ops ahead of the ops to be quantized.
        tf.contrib.quantize.create_training_graph(
            input_graph=tf.get_default_graph(), quant_delay=2000)

    train_step = trainning(train_loss, LEARNING_RATE)

    # accuracy computation
    correct = tf.nn.in_top_k(softmax_linear, y, 1)
    correct = tf.cast(correct, tf.float16)
    train_acc = tf.reduce_mean(correct)

    # ------ end of the computation graph ------

    with tf.Session() as sess:

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        valstep = 0

        #max = sess.run(image_max)
        #min = sess.run(image_min)
        # training
        try:
            ckpt = tf.train.get_checkpoint_state(TRAIN_LOGS_DIR)
            global_step = 0
            if ckpt and ckpt.model_checkpoint_path:
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                    '-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Loading success, global_step is %s' % global_step)

            for i in range(MAX_STEP + 1):

                #if_train = True
                pos = i % batch_train_len
                _, acc, loss = sess.run([train_step, train_acc, train_loss],
                                        feed_dict={
                                            x: x_train_batch[pos],
                                            y: y_train_batch[pos]
                                        })

                # Print accuracy and loss every 50 steps
                if i % 50 == 0:
                    print(
                        'Step %d, train loss = %.2f, train accuracy = %.2f%%' %
                        (i, loss, acc * 100.0))

                # Validate on the validation set every 200 steps
                if i % 200 == 0:
                    #if_train = False    # In quantization mode use a variable instead of a placeholder. Note: if exporting to tflite, do not make if_train a placeholder!
                    vpos = valstep % batch_val_len
                    val_loss, val_acc = sess.run([train_loss, train_acc],
                                                 feed_dict={
                                                     x: x_val_batch[vpos],
                                                     y: y_val_batch[vpos]
                                                 })

                    valstep = valstep + 1
                    print(
                        '**  Step %d, val loss = %.2f, val accuracy = %.2f%%  **'
                        % (i, val_loss, val_acc * 100.0))

                # Save the variables every 500 steps
                if i % 500 == 0:
                    checkpoint_path = os.path.join(TRAIN_LOGS_DIR,
                                                   'saved_model.ckpt')
                    tmpstep = i + int(global_step)
                    saver.save(sess, checkpoint_path, global_step=tmpstep)

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
        coord.join(threads)
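
# --- Added sketch (not part of the original example) ---
# The if_train/tflite note above implies the quantized graph is meant to be exported to
# TFLite. A minimal export sketch under that assumption, reusing model.build_network,
# HEIGHT/WIDTH/CHANNELS and TRAIN_LOGS_DIR from this example; the eval-mode flags passed
# to build_network, the (mean, std) input stats, and the output file name are placeholders.
def export_tflite_sketch():
    eval_graph = tf.Graph()
    with eval_graph.as_default():
        x = tf.placeholder(tf.float32, shape=[1, HEIGHT, WIDTH, CHANNELS], name='input')
        _, _, softmax_linear = model.build_network(x, False, False)
        # Insert fake-quant ops for inference, mirroring create_training_graph above.
        tf.contrib.quantize.create_eval_graph(input_graph=eval_graph)
        saver = tf.train.Saver()
        with tf.Session(graph=eval_graph) as sess:
            ckpt = tf.train.get_checkpoint_state(TRAIN_LOGS_DIR)
            saver.restore(sess, ckpt.model_checkpoint_path)
            converter = tf.lite.TFLiteConverter.from_session(sess, [x], [softmax_linear])
            converter.inference_type = tf.uint8          # fully quantized inference
            converter.quantized_input_stats = {'input': (0.0, 1.0)}  # placeholder stats
            with open('saved_model_quant.tflite', 'wb') as f:
                f.write(converter.convert())
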
Example #2
def deep_maxent_irl():

    # hyper parameters
    H = 48
    W = 128
    N_STATES = H * W
    N_ACTIONS = 8
    SHAPE = [H, W, N_STATES, N_ACTIONS]
    DISCOUNT = 1
    LEARNING_RATE_BASE = 0.001
    DECAY_STEPS = 500
    DECAY_RATE = 0.99
    GRAPH_SAVE_INTERVAL = 50
    IMG_PATH = '/home/zhuzeyu/real_datasets/DIRL_DataSets/orig_img/'
    REF_PATH = '/home/zhuzeyu/real_datasets/DIRL_DataSets/track_ref/'

    # create model directory
    MODEL_DIR = "model"
    if not tf.gfile.Exists(MODEL_DIR):
        tf.gfile.MakeDirs(MODEL_DIR)

    # placeholders
    input_img = tf.placeholder(tf.float32, [None, H, W, 3], name='input_img')
    grad_r_placeholder = tf.placeholder(tf.float32, [H*W, 1])
    # define reward and loss
    rewards = inference.inference(input_img)
    rewards_flattened = tf.reshape(rewards, [N_STATES, 1])
    theta = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    l2_loss = tf.reduce_mean([tf.nn.l2_loss(v) for v in theta])
    l2_loss = l2_loss/100000.0
    loss = tf.multiply(grad_r_placeholder, rewards_flattened)
    loss = tf.reduce_sum(loss, name='loss') #+ l2_loss
    # define training
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(
        LEARNING_RATE_BASE,
        global_step,
        DECAY_STEPS,
        DECAY_RATE
    )
    optimizer = tf.train.GradientDescentOptimizer(lr)
    train_step = optimizer.minimize(loss, global_step=global_step)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=100)

    extracts = get_queue(IMG_PATH)

    #path_orig = IMG_PATH + extracts[447]
    #path_ref = REF_PATH + extracts[447]
    #terminal, start, img, traj, ref = input_data.read_img(SHAPE, path_orig, path_ref)

    with tf.Session() as sess:
        init.run()
        for epoch in range(20):
            for iteration in range(100):
                path_orig = IMG_PATH + extracts[iteration]
                path_ref = REF_PATH + extracts[iteration]
                terminal, start, img, traj, ref = input_data.read_img(SHAPE, path_orig, path_ref)
                # get rewards
                r = sess.run(rewards, feed_dict={input_img: img})
                r_np = np.reshape(r, [-1, ])
                # get policy
                _, policy = value_iteration.value_iteration(SHAPE, r_np, DISCOUNT, terminal)
                # compute expected svf
                mu_exp = compute_state_visitation_freq2(SHAPE, traj, start, policy)
                # compute expert svf
                # mu_D = demo_svf(traj, N_STATES)
                mu_D = field_svf(ref)
                # compute loss
                grad_r = mu_exp - mu_D
                index = np.sum(np.abs(grad_r))
                grad_r = np.reshape(grad_r, [-1, 1])  # originally one-dimensional
                # train
                sess.run(train_step, feed_dict={grad_r_placeholder: grad_r, input_img: img})

                lss = sess.run(loss, feed_dict={grad_r_placeholder: grad_r, input_img: img})
                print(index)
                print(lss)
                # print(sess.run(l2_loss))

                # save graph
                if (iteration + 1) % GRAPH_SAVE_INTERVAL == 0:
                    MODEL_NAME = 'model' + str(epoch) + str(iteration) + '.ckpt'
                    saver.save(sess, os.path.join(MODEL_DIR, MODEL_NAME))
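
# --- Added sketch (not part of the original example) ---
# demo_svf / field_svf are referenced above but not shown. A minimal illustration of an
# expert state-visitation-frequency helper in the usual MaxEnt-IRL style, assuming
# `trajs` is a list of trajectories given as sequences of integer state indices.
def demo_svf_sketch(trajs, n_states):
    mu = np.zeros(n_states, dtype=np.float32)
    for traj in trajs:
        for state in traj:
            mu[state] += 1.0
    # Average the visit counts over the demonstrations so mu is comparable to mu_exp.
    return mu / len(trajs)
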
Example #3
import numpy as np
import tensorflow as tf
from input_data import read_img
from input_data import set_val
from model import inference
train_dir = 'train/'
logs_train_dir = 'save/model'

# resize all images to 100*100
w = 100
h = 100
c = 3

data, label, num_classes = read_img(train_dir)
x_train, y_train, x_val, y_val = set_val(data, label, 0.7)  # 0.7 is the train/validation split ratio

# placeholders
x = tf.placeholder(tf.float32, shape=[None, w, h, c], name='x')
y_ = tf.placeholder(tf.int32, shape=[None, ], name='y_')

regularizer = tf.contrib.layers.l2_regularizer(0.0001)  # returns a function that applies L2 regularization; adding a regularization term to the loss is an important way to prevent overfitting
logits = inference(x, False, regularizer,num_classes)

# (small trick) multiply logits by 1 and assign it to logits_eval with an explicit name, so the output tensor can be fetched by name when the model is reloaded later
b = tf.constant(value=1, dtype=tf.float32)
logits_eval = tf.multiply(logits, b, name='logits_eval')

loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_))
train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), y_)
# tf.equal returns a `Tensor` of type `bool`.
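
# --- Added sketch (not part of the original example) ---
# The snippet above stops before the training session. A minimal training-loop sketch;
# the mini-batch iterator, epoch count and batch size here are placeholders.
def minibatches_sketch(inputs, targets, batch_size, shuffle=True):
    indices = np.arange(len(inputs))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(inputs) - batch_size + 1, batch_size):
        batch = indices[start:start + batch_size]
        yield inputs[batch], targets[batch]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    for epoch in range(10):
        for x_batch, y_batch in minibatches_sketch(x_train, y_train, 64):
            _, batch_loss = sess.run([train_op, loss],
                                     feed_dict={x: x_batch, y_: y_batch})
        print('epoch %d, last batch loss %f' % (epoch, batch_loss))
    saver.save(sess, logs_train_dir)
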
Example #4
def epoch(weight=None,
          height=None,
          dimension=None,
          class_size=None,
          train=False,
          path=None,
          ratio=None,
          n_epoch=None,
          batch_size=None,
          model_path=None):

    regularizer = tf.contrib.layers.l2_regularizer(0.001)

    x = tf.placeholder(tf.float32,
                       shape=[None, weight, height, dimension],
                       name='x')
    y_ = tf.placeholder(tf.int32, shape=[
        None,
    ], name='y_')

    # TODO: choose among different CNN architectures
    logits, pred = interface(input_tensor=x,
                             regularizer=regularizer,
                             train=train,
                             class_size=class_size)
    # logits, pred = cnn(input_sensor=x, class_size=class_size)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
    train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), y_)
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # input data
    data, label = input_data.read_img(path, weight, height, dimension)
    x_train, y_train, x_val, y_val = input_data.shuffle_and_period(
        data, label, ratio)

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    for epoch in range(n_epoch):
        train_loss, train_acc, n_batch = 0, 0, 0
        for x_train_a, y_train_a in input_data.mini_batches(x_train,
                                                            y_train,
                                                            batch_size,
                                                            shuffle=True):
            _, err, ac = sess.run([train_op, loss, acc],
                                  feed_dict={
                                      x: x_train_a,
                                      y_: y_train_a
                                  })
            train_loss += err
            train_acc += ac
            n_batch += 1

        print('Epoch %d - train loss: %f' % (epoch, (train_loss / n_batch)))
        print('Epoch %d - train acc: %f' % (epoch, train_acc / n_batch))

        # validation
        val_loss, val_acc, n_batch = 0, 0, 0
        for x_val_a, y_val_a in input_data.mini_batches(x_val,
                                                        y_val,
                                                        batch_size,
                                                        shuffle=False):
            err, ac = sess.run([loss, acc],
                               feed_dict={
                                   x: x_val_a,
                                   y_: y_val_a
                               })
            val_loss += err
            val_acc += ac
            n_batch += 1
        print('Epoch %d - Validation loss: %f' % (epoch, val_loss / n_batch))
        print('Epoch %d - Validation Accuracy: %f' % (epoch,
                                                      (val_acc / n_batch)))
        if epoch % 5 == 0:
            saver.save(sess, model_path + "save_net.ckpt", epoch)
            print('Trained Model Saved.')
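
# --- Added sketch (not part of the original example) ---
# A hypothetical call to epoch() above; every path and hyperparameter value here is a
# placeholder chosen for illustration, not taken from the original code.
if __name__ == '__main__':
    epoch(weight=100, height=100, dimension=3, class_size=5,
          train=True, path='data/train/', ratio=0.8,
          n_epoch=20, batch_size=64, model_path='save/')
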
Example #5
def main(unused_argv):
    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")
    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)
    # Construct the cluster and start the server
    # Read the cluster description
    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")
    # Get the number of workers.
    num_workers = len(worker_spec)
    # Create the TensorFlow cluster description object
    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})
    # Create a TensorFlow Server object for the local task.
    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        # Create the local Server object. From this tf.train.Server definition on, each node behaves differently.
        # The job name given on the command line decides which task this process runs:
        # if the job name is ps, the process joins here as a parameter-update service and waits for workers to submit updates;
        # if the job name is worker, it runs the computation below.
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        # A parameter server only needs to be started; the process then blocks here.
        # The tf.train.replica_device_setter call below pins the variables to the ps server.
        if FLAGS.job_name == "ps":
            server.join()
    # Handle the worker nodes
    # The chief worker is the one with task_index 0
    is_chief = (FLAGS.task_index == 0)
    # If GPUs are used
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (FLAGS.task_index % FLAGS.num_gpus)
        # Run this worker on the selected GPU
        worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
    # If only the CPU is used
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        # Assign the CPU to the worker
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    # tf.train.replica_device_setter places variable ops on the parameter servers (on CPU) and non-variable ops on the workers, using the worker_device chosen above.
    # Variables defined under this with block are placed on the parameter servers automatically; with several parameter servers they are assigned round-robin.
    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           ps_device="/job:ps/cpu:0",
                                           cluster=cluster)):

        with tf.variable_scope('inputdata') as scope:
            # Load the images and labels
            train, train_label = input_data.read_img(FLAGS.train_dir)
            # Build the batches
            train_batch, train_label_batch = input_data.get_batch(
                train, train_label, FLAGS.IMG_W, FLAGS.IMG_H, FLAGS.BATCH_SIZE,
                FLAGS.CAPACITY)

        # Define the global step, initialized to 0
        global_step = tf.Variable(0, name="global_step", trainable=False)

        train_logits = model.inference(train_batch, FLAGS.BATCH_SIZE,
                                       FLAGS.N_CLASSES)

        train_loss = model.losses(train_logits, train_label_batch)

        accuracy = model.evaluation(train_logits, train_label_batch)

        # merge all summaries into a single "operation" which we can execute in a session
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()
        print("Variables initialized ...")
        # Asynchronous training: each replica applies its gradients as soon as they are computed, without coordinating with the others
        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
        # Synchronous training
        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate
            # Use SyncReplicasOptimizer in the between-graph replication setting;
            # with in-graph replication it would average all the gradients.
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")
        train_step = opt.minimize(train_loss, global_step=global_step)
        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                # Among all the computing worker nodes there is one chief worker,
                # responsible for variable initialization, model saving, and summary saving.
                local_init_op = opt.chief_init_op
            ready_for_local_init_op = opt.ready_for_local_init_op
            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()
        init_op = tf.global_variables_initializer()
        if FLAGS.sync_replicas:
            # Create a Supervisor to track information during training.
            # logdir is the path for saving and loading the model:
            # on startup the Supervisor looks in logdir for a checkpoint and restores it if one exists,
            # otherwise it initializes the variables with init_op.
            # The chief worker is responsible for initializing the model parameters;
            # the other workers wait for the chief to finish, then all start training together.
            # The global_step value is shared by all workers and is incremented automatically
            # each time the loss is minimized, so it records how many steps all workers have run in total.
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=FLAGS.logs_train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.logs_train_dir,
                                     init_op=init_op,
                                     recovery_wait_secs=1,
                                     global_step=global_step)
        # Create the session with allow_soft_placement=True:
        # ops run on the specified device (e.g. a GPU) by default,
        # and fall back to the CPU automatically when no GPU implementation exists.
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False,
                                     device_filters=[
                                         "/job:ps",
                                         "/job:worker/task:%d" %
                                         FLAGS.task_index
                                     ])
        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  FLAGS.task_index)
        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
            print("Using existing server at: %s" % server_grpc_url)
            # Create the TensorFlow session used to run the graph.
            # prepare_or_wait_for_session starts training only after the variables are initialized and the chief is ready.
            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)
        print("Worker %d: Session initialization complete." % FLAGS.task_index)
        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            global threads
            threads = sv.start_queue_runners(sess, [chief_queue_runner])
        else:
            threads = sv.start_queue_runners(sess)

        # Perform distributed training
        time_begin = time.time()
        coord = tf.train.Coordinator()
        print("Training begins @ %f" % time_begin)
        local_step = 0
        try:
            for step in np.arange(FLAGS.MAX_STEP):
                if coord.should_stop():
                    break
                _, tra_loss, tra_acc = sess.run(
                    [train_step, train_loss, accuracy])

                if step % 50 == 0:
                    print(
                        'Step %d, train loss = %.2f, train accuracy = %.2f%%' %
                        (step, tra_loss, tra_acc * 100.0))

        except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
        coord.join(threads)
        sess.close()
        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)
Example #6
import img_utils
import numpy as np
import tensorflow as tf
import input_data
import mdp.value_iteration4 as vi
import mdp.gridworld2 as gridworld

H = 24
W = 64
N_STATES = H * W
TRAJ_LEN = 4


def int_to_point(i):
    return i % W, i // W


__, img, _ = input_data.read_img(H, W)
input_img = tf.placeholder(tf.float32, [None, H, W, 3])

sess = tf.Session()

# load meta graph and restore weights
saver = tf.train.import_meta_graph(
    '/Users/David/Desktop/model/model24*64/ckpt/model119.ckpt.meta')
saver.restore(sess, '/Users/David/Desktop/model/model24*64/ckpt/model119.ckpt')

# list all node names in the graph (not strictly necessary)
graph = tf.get_default_graph()
tensor_name_list = [
    tensor.name for tensor in tf.get_default_graph().as_graph_def().node
]
print(tensor_name_list)
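
# --- Added sketch (not part of the original example) ---
# A follow-on showing how the restored graph could be evaluated on `img`. The tensor
# names 'input_img:0' and 'rewards:0' are assumptions -- pick the real names from the
# tensor_name_list printed above.
input_t = graph.get_tensor_by_name('input_img:0')
rewards_t = graph.get_tensor_by_name('rewards:0')
r = sess.run(rewards_t, feed_dict={input_t: img})
r_np = np.reshape(r, [-1])
print(r_np.shape)  # expected to flatten to N_STATES values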