Example #1
    def _train_body(self, state, next_state, action, reward, not_done,
                    discount):
        with tf.device("/cpu:0"):

            with tf.GradientTape() as tape:
                # action_val = tf.expand_dims(tf.argmax(action, axis=1), axis=1)
                # action_val = tf.cast(action_val, dtype=tf.int32)
                action = tf.cast(tf.expand_dims(action, axis=1),
                                 dtype=tf.int32)
                indices = tf.concat(values=[
                    tf.expand_dims(tf.range(state.shape[0]), axis=1), action
                ],
                                    axis=1)
                current_Q = tf.expand_dims(tf.gather_nd(
                    self.q_func(state), indices),
                                           axis=1)
                target_Q = self.q_func(next_state)
                target_Q = reward + (not_done * discount * tf.reduce_max(
                    target_Q, keepdims=True, axis=1))
                target_Q = tf.stop_gradient(target_Q)
                td_error = current_Q - target_Q

                q_func_loss = tf.reduce_mean(huber_loss(td_error, delta=2.))
                # q_func_loss = tf.reduce_mean(tf.square(td_error))

            q_func_grad = tape.gradient(q_func_loss,
                                        self.q_func.trainable_variables)
            self.q_func_optimizer.apply_gradients(
                zip(q_func_grad, self.q_func.trainable_variables))

            return td_error, q_func_loss
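
The `huber_loss` helper called above is not part of this listing. A minimal sketch of an elementwise Huber loss consistent with the call `huber_loss(td_error, delta=2.)` follows; the project's actual helper may differ.

import tensorflow as tf

def huber_loss(x, delta=1.0):
    # quadratic for |x| <= delta, linear beyond; applied elementwise
    abs_x = tf.abs(x)
    quadratic = 0.5 * tf.square(x)
    linear = delta * (abs_x - 0.5 * delta)
    return tf.where(abs_x <= delta, quadratic, linear)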
Example #2
    def _setup_loss_graph(self, s_output_tbi, s_target_tbi, s_step_size):
        """
        Connect a loss function to the graph
        See data.py for explanation of the slicing part
        """
        s_sliced_output_tbi = s_output_tbi[-s_step_size :]
        s_sliced_target_tbi = s_target_tbi[-s_step_size :]

        if self._options['loss_type'] == 'l2':
            return l2_loss(s_sliced_output_tbi, s_sliced_target_tbi)
        if self._options['loss_type'] == 'l1':
            return l1_loss(s_sliced_output_tbi, s_sliced_target_tbi)
        if self._options['loss_type'] == 'huber':
            delta = self._options['huber_delta']
            return huber_loss(s_sliced_output_tbi, s_sliced_target_tbi, delta)
        
        assert False, 'Invalid loss_type option'
        return tt.alloc(np.float32(0.))
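
`l1_loss`, `l2_loss`, and `huber_loss` are defined elsewhere in that project. A sketch of Theano helpers matching the call signatures used above (assumed implementations, not the originals):

import theano.tensor as tt

def l2_loss(s_output, s_target):
    return tt.mean(tt.sqr(s_output - s_target))

def l1_loss(s_output, s_target):
    return tt.mean(tt.abs_(s_output - s_target))

def huber_loss(s_output, s_target, delta):
    # quadratic near zero, linear in the tails, averaged over all entries
    abs_err = tt.abs_(s_output - s_target)
    return tt.mean(tt.switch(abs_err <= delta,
                             0.5 * tt.sqr(abs_err),
                             delta * (abs_err - 0.5 * delta)))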
Example #3
    def __init__(self, observation_type):
        self.weights = {
            "fc1" : fc_init_he(8, 128),
            "fc2" : fc_init_he(128, 128),
            "fc3" : fc_init_he(128, 4)
        }

        self.observation = observation_type
        self.targets = T.matrix()               # assumed to be batch_size x num_actions
        self.lr = T.scalar()

        Q = self.forward(self.observation)

        # Huber loss between the targets and the predicted Q-values
        loss = huber_loss(self.targets, Q)

        updates = RMSprop(cost = loss, params = self.get_weights(), lr = self.lr)

        self.get_Q = theano.function(inputs = [self.observation], outputs = Q)
        self.train_Q = theano.function(inputs = [self.observation, self.targets, self.lr], outputs = loss, updates = updates)
Example #4
	def __init__(self,batch_size=10,im_size=64,channels=3,dtype=tf.float32,analytics=True):
		self.analytics = analytics
		self.batch_size = batch_size

		self.x_a = tf.placeholder(dtype,[None,im_size,im_size,channels],name='xa')
		self.x_b = tf.placeholder(dtype,[None,im_size,im_size,channels],name='xb')

		#Generator Networks
		self.g_ab = utils.generator(self.x_a,name="gen_AB",im_size=im_size)
		self.g_ba = utils.generator(self.x_b,name="gen_BA",im_size=im_size)

		#Secondary generator networks, reusing params of previous two
		self.g_aba = utils.generator(self.g_ab,name="gen_BA",im_size=im_size,reuse=True)
		self.g_bab = utils.generator(self.g_ba,name="gen_AB",im_size=im_size,reuse=True)

		#Discriminator for input a
		self.disc_a_real = utils.discriminator(self.x_a,name="disc_a",im_size=im_size)
		self.disc_a_fake = utils.discriminator(self.g_ba,name="disc_a",im_size=im_size,reuse=True)

		#Discriminator for input b
		self.disc_b_real = utils.discriminator(self.x_b,name="disc_b",im_size=im_size)
		self.disc_b_fake = utils.discriminator(self.g_ab,name="disc_b",im_size=im_size,reuse=True)

		#Reconstruction loss for generators
		self.l_const_a = tf.reduce_mean(utils.huber_loss(self.g_aba,self.x_a))
		self.l_const_b = tf.reduce_mean(utils.huber_loss(self.g_bab,self.x_b))

		#Generation loss for generators 
		self.l_gan_a = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.disc_a_fake,labels=tf.ones_like(self.disc_a_fake)))
		self.l_gan_b = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.disc_b_fake,labels=tf.ones_like(self.disc_b_fake)))

		#Real example loss for discriminators
		self.l_disc_a_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.disc_a_real,labels=tf.ones_like(self.disc_a_real)))
		self.l_disc_b_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.disc_b_real,labels=tf.ones_like(self.disc_b_real)))

		#Fake example loss for discriminators
		self.l_disc_a_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.disc_a_fake,labels=tf.zeros_like(self.disc_a_fake)))
		self.l_disc_b_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.disc_b_fake,labels=tf.zeros_like(self.disc_b_fake)))

		#Combined loss for individual discriminators
		self.l_disc_a = self.l_disc_a_real + self.l_disc_a_fake
		self.l_disc_b = self.l_disc_b_real + self.l_disc_b_fake

		#Total discriminator loss
		self.l_disc = self.l_disc_a + self.l_disc_b

		#Combined loss for individual generators
		self.l_ga = self.l_gan_a + self.l_const_b
		self.l_gb = self.l_gan_b + self.l_const_a

		#Total GAN loss
		self.l_g = self.l_ga + self.l_gb

		#Parameter Lists
		self.disc_params = []
		self.gen_params = []

		for v in tf.trainable_variables():
			if 'disc' in v.name:
				self.disc_params.append(v)
			if 'gen' in v.name:
				self.gen_params.append(v)

		if self.analytics:
			self.init_analytics()

		self.gen_a_dir = 'generator a->b'
		self.gen_b_dir = 'generator b->a'
		self.rec_a_dir = 'reconstruct a'
		self.rec_b_dir = 'reconstruct b'
		self.model_directory = "models"	
	
		if not os.path.exists(self.gen_a_dir):
			os.makedirs(self.gen_a_dir)
		if not os.path.exists(self.gen_b_dir):
			os.makedirs(self.gen_b_dir)
		if not os.path.exists(self.rec_b_dir):
			os.makedirs(self.rec_b_dir)
		if not os.path.exists(self.rec_a_dir):
			os.makedirs(self.rec_a_dir)	

		self.sess = tf.Session()
		self.saver = tf.train.Saver()
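
The constructor above only builds the graph; training ops are created elsewhere. A possible way to wire them up from `l_disc`, `l_g`, and the collected parameter lists is sketched below. The optimizer choice, learning rate, and the `model`/`batch_a`/`batch_b` names are assumptions for illustration only.

import tensorflow as tf

# model is a hypothetical instance of the class above
disc_train_op = tf.train.AdamOptimizer(learning_rate=2e-4, beta1=0.5).minimize(
    model.l_disc, var_list=model.disc_params)
gen_train_op = tf.train.AdamOptimizer(learning_rate=2e-4, beta1=0.5).minimize(
    model.l_g, var_list=model.gen_params)
model.sess.run(tf.global_variables_initializer())

# one alternating update on a pair of image batches batch_a, batch_b
model.sess.run(disc_train_op, feed_dict={model.x_a: batch_a, model.x_b: batch_b})
model.sess.run(gen_train_op, feed_dict={model.x_a: batch_a, model.x_b: batch_b})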
Example #5
u1 = tf.compat.v1.get_variable('weights_1_2', initializer=tf.constant(0.0))
b1 = tf.compat.v1.get_variable('bias_1', initializer=tf.constant(0.0))

w2 = tf.compat.v1.get_variable('weights_2', initializer=tf.constant(0.0))
b2 = tf.compat.v1.get_variable('bias_2', initializer=tf.constant(0.0))

# Step 4: build the model used to predict Y
# reuse the w, X, and b defined above
Y_predicted = w * X + b
Y_predicted_1 = w1 * X * X + u1 * X + b1
Y_predicted_2 = w2 * X + b2

# Step 5: use MSE as the loss function; Huber loss can also be used
loss = tf.square(Y - Y_predicted, name='loss')
loss_1 = tf.square(Y - Y_predicted_1, name='loss_1')
loss_2 = utils.huber_loss(Y, Y_predicted_2)

# Step 6: use gradient descent with a learning rate of 0.001 to minimize the loss
optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss)
optimizer_1 = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss_1)
optimizer_2 = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss_2)

start = time.time()

# write the graph
writer = tf.summary.FileWriter('data/graphs/linear_reg', tf.compat.v1.get_default_graph())

with tf.compat.v1.Session() as sess:
    # Step 7: initialize all the variables
    sess.run(tf.compat.v1.global_variables_initializer())
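
`utils.huber_loss(Y, Y_predicted_2)` above (and in several of the later linear-regression examples) takes a label and a prediction directly. A sketch of such a two-argument helper for scalar tensors, with the default delta as an assumption:

import tensorflow as tf

def huber_loss(labels, predictions, delta=1.0):
    # quadratic penalty for small residuals, linear for large ones (scalar inputs)
    residual = tf.abs(predictions - labels)
    def f1(): return 0.5 * tf.square(residual)
    def f2(): return delta * residual - 0.5 * tf.square(delta)
    return tf.cond(residual < delta, f1, f2)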
Example #6
def dqn_worker(env,
               name,
               optimizer_spec,
               session,
               exploration,
               replay_buffer_size,
               batch_size,
               gamma,
               learn_start,
               learn_freq,
               history_frames_num,
               target_update_freq,
               grad_norm_clipping,
               stop_criterion=None):
    #################
    # build network #
    #################
    def q_net(input, act_num, scope, reuse=False):
        with tf.variable_scope(scope, reuse=reuse):
            out = input
            with tf.variable_scope('convnet'):
                out = layers.convolution2d(out,
                                           num_outputs=32,
                                           kernel_size=8,
                                           stride=4,
                                           activation_fn=tf.nn.relu)
                out = layers.convolution2d(out,
                                           num_outputs=64,
                                           kernel_size=4,
                                           stride=2,
                                           activation_fn=tf.nn.relu)
                out = layers.convolution2d(out,
                                           num_outputs=64,
                                           kernel_size=3,
                                           stride=1,
                                           activation_fn=tf.nn.relu)
            out = layers.flatten(out)
            with tf.variable_scope('action_value'):
                out = layers.fully_connected(out,
                                             num_outputs=512,
                                             activation_fn=tf.nn.relu)
                out = layers.fully_connected(out,
                                             num_outputs=act_num,
                                             activation_fn=None)
            return out

    ###################
    # build functions #
    ###################
    print('yeah')
    img_h, img_w, img_c = env.observation_size
    input_shape = (img_h, img_w, history_frames_num * img_c)
    act_num = len(env.action_space)
    # c : current
    # n : next
    ### set up placeholders ###
    obs_c_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    act_c_ph = tf.placeholder(tf.int32, [None])
    rew_c_ph = tf.placeholder(tf.float32, [None])
    obs_n_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    done_ph = tf.placeholder(tf.float32,
                             [None])  # 1.0 if the next state is terminal, else 0.0
    ### cast the observation pixel values to float ###
    obs_c_float = tf.cast(obs_c_ph, tf.float32)
    obs_n_float = tf.cast(obs_n_ph, tf.float32)
    ### compute the TD error ###
    q_c_values = q_net(obs_c_float, act_num, scope='q_net', reuse=False)
    q_c_selected = tf.reduce_sum(q_c_values * tf.one_hot(act_c_ph, act_num), 1)
    q_n_values = q_net(obs_n_float, act_num, scope='target_q_net')
    q_n_max = tf.reduce_max(q_n_values, 1)
    q_c_selected_target = rew_c_ph + gamma * (1.0 - done_ph) * q_n_max
    td_error = q_c_selected - tf.stop_gradient(q_c_selected_target)
    errors = utils.huber_loss(td_error)
    mean_error = tf.reduce_mean(errors)
    ### collect the parameters of q_net and target_q_net ###
    q_net_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope='q_net')
    target_q_net_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='target_q_net')
    ### optimization function ###
    learn_rate = tf.placeholder(tf.float32, (), name='learn_rate')
    optimizer = optimizer_spec.constructor(learning_rate=learn_rate,
                                           **optimizer_spec.kwargs)
    train_fn = utils.minimize_and_clip(optimizer,
                                       mean_error,
                                       var_list=q_net_params,
                                       clip_val=grad_norm_clipping)
    ### update target q network function ###
    update_target_fn = []
    for param, target_param in zip(
            sorted(q_net_params, key=lambda p: p.name),
            sorted(target_q_net_params, key=lambda p: p.name)):
        update_target_fn.append(target_param.assign(param))
    ### set up replay buffer ###
    replay_buffer = Replay_buffer(replay_buffer_size, history_frames_num)
    #######################
    # interaction with env #
    #######################
    ### initialization ###
    ### webdriver setting
    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=600,800")
    options.add_argument("--allow-file-access-from-files")
    options.add_argument("--disable-infobars")
    driver = webdriver.Chrome(chrome_options=options)
    driver.get('http://localhost:3000')
    driver.implicitly_wait(2)
    #print(driver.find_element_by_css_selector("#startMenuWrapper").value_of_css_property('max-height'))
    env.click_play_btn_with_name(name, driver)
    driver.implicitly_wait(2)
    time.sleep(1)
    ###
    model_initialized = False
    train_num = -1
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    episode_reward_list = []
    radius = env.get_radius(name)
    last_obs = env.process_screenshot(driver)
    LOG_EVERY_N_STEPS = 100
    init_op = tf.global_variables_initializer()
    session.run(init_op)
    ### the counter of time steps ###
    for t in itertools.count():
        if stop_criterion is not None and stop_criterion(env, t):
            break
        idx = replay_buffer.store_frame(last_obs)
        obs_c = replay_buffer.stack_recent_obs()
        explore_prob = random.random()
        ### epsilon greedy exploration policy ###
        if explore_prob < exploration.value(t):
            action = random.randrange(act_num)
        else:
            action_values = session.run(q_c_values,
                                        feed_dict={obs_c_ph: obs_c[None]})
            action = np.argmax(action_values)
        ### step to next state ###

        obs, reward, done, info = env.step(driver, name, action)
        replay_buffer.store_transition(idx, action, reward, done)
        if done:
            env.reset(driver)
            radius_ = env.get_radius(name)
            episode_reward = radius_ - radius
            episode_reward_list.append(episode_reward)
            radius = radius_
        ### train the networks ###
        if (t > learn_start and t % learn_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # 1.sample transitions #
            obs_batch,act_batch,rew_batch,next_obs_batch,done_batch\
                =replay_buffer.sample(batch_size)
            # 2.initialize the model #
            if not model_initialized:
                utils.initialize_interdependent_variables(
                    session, tf.global_variables(), {
                        obs_c_ph: obs_batch,
                        obs_n_ph: next_obs_batch,
                    })
                model_initialized = True
            # 3.train the model #
            session.run(train_fn,
                        feed_dict={
                            obs_c_ph: obs_batch,
                            act_c_ph: act_batch,
                            rew_c_ph: rew_batch,
                            obs_n_ph: next_obs_batch,
                            done_ph: done_batch,
                            learn_rate: optimizer_spec.lr_schedule.value(t)
                        })
            train_num += 1
            # 4.update target network #
            if train_num % target_update_freq == 0:
                session.run(update_target_fn)
        #######
        # log #
        #######
        if len(episode_reward_list) > 0:
            mean_episode_reward = np.mean(episode_reward_list[-100:])
        if len(episode_reward_list) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            print('########## log ##########')
            print('Timestep %d' % (t, ))
            print('mean reward (100 episodes) %f' % mean_episode_reward)
            print('best mean reward %f' % best_mean_episode_reward)
            print('episodes %d' % len(episode_reward_list))
            print('exploration %f' % exploration.value(t))
            print('learning_rate %f' % optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()  # show all information on the terminal before the next iteration begins
        frame = env.get_screenshot(driver)
        if np.mean(frame) < 50:
            env.reset(driver)
        last_obs = env.process_screenshot(driver)
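
`utils.minimize_and_clip` is not shown in this example. A sketch consistent with its call `minimize_and_clip(optimizer, mean_error, var_list=q_net_params, clip_val=grad_norm_clipping)`, clipping each gradient by norm before applying it (the original helper may differ in detail):

import tensorflow as tf

def minimize_and_clip(optimizer, objective, var_list, clip_val=10.0):
    # compute gradients w.r.t. var_list, clip each one by norm, then apply
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    clipped = []
    for grad, var in gradients:
        if grad is not None:
            grad = tf.clip_by_norm(grad, clip_val)
        clipped.append((grad, var))
    return optimizer.apply_gradients(clipped)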
Example #7
    def loss_function(self, y_true, y_pred):
        del_t = self.huber_loss_init(self.optical_flow.outputs)
        huber_loss = utils.huber_loss(del_t, self.delta)

        return K.square(y_pred - y_true) + (self.huber_weight * huber_loss)
Example #8
# Create Dataset and iterator
dataset = tf.data.Dataset.from_tensor_slices((data[:,0], data[:,1]))
iterator = dataset.make_initializable_iterator()
X, Y = iterator.get_next()

# Create weight and bias, initialized to 0
w = tf.get_variable('weights', initializer=tf.constant(0.0))
b = tf.get_variable('bias', initializer=tf.constant(0.0))

# Build model to predict Y
Y_predicted = w * X + b

# Use either squared (LOSS_TYPE == 'SQUARE') or Huber (LOSS_TYPE == 'HUBER') loss function
loss = (tf.square(Y - Y_predicted, name='loss') if LOSS_TYPE == 'SQUARE'
        else utils.huber_loss(Y, Y_predicted))

# Use gradient descent to minimize loss
optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

start = time.time()
with tf.Session() as sess:

    # Initialize variables
    sess.run(tf.global_variables_initializer())

    # Create writer for TensorBoard
    writer = tf.summary.FileWriter('./graphs/linear_reg', sess.graph)

    # Train the model for 100 epochs
    for i in range(N_EPOCHS):
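        # (sketch) The original listing stops at the epoch loop above. With the
        # Dataset/iterator setup from this example, a typical loop body
        # re-initializes the iterator and runs the optimizer until the data is
        # exhausted; the names below are illustrative, not from the original.
        sess.run(iterator.initializer)  # rewind the dataset for this epoch
        total_loss = 0
        try:
            while True:
                _, l = sess.run([optimizer, loss])
                total_loss += l
        except tf.errors.OutOfRangeError:
            pass  # end of this epoch's data
        print('Epoch {0}: {1}'.format(i, total_loss))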
Example #9
bl = tf.get_variable('bias', initializer=tf.constant(0.0))

wq = tf.get_variable('weights_1', initializer=tf.constant(0.0))
uq = tf.get_variable('weights_2', initializer=tf.constant(0.0))
bq = tf.get_variable('biasq', initializer=tf.constant(0.0))

wlh = tf.get_variable('weightsh', initializer=tf.constant(0.0))
blh = tf.get_variable('biash', initializer=tf.constant(0.0))

Y_q = wq * X * X + uq * X + bq
Y_l = wl * X + bl
Y_lh = wlh * X + blh

loss_q = tf.square(Y - Y_q, name='loss_q')
loss_l = tf.square(Y - Y_l, name='loss_l')
loss_lh = utils.huber_loss(Y, Y_lh)

optimizer_q = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss_q)
optimizer_l = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss_l)
optimizer_lh = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss_lh)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(100):
        for x, y in data:
            sess.run(optimizer_q, feed_dict={X: x, Y: y})
            sess.run(optimizer_l, feed_dict={X: x, Y: y})
            sess.run(optimizer_lh, feed_dict={X: x, Y: y})
Example #10
g_ba = utils.generator(x_b,BATCH_SIZE,name="gen_BA")

#Secondary generator networks, reusing params of previous two
g_aba = utils.generator(g_ab,BATCH_SIZE,name="gen_BA",reuse=True)
g_bab = utils.generator(g_ba,BATCH_SIZE,name="gen_AB",reuse=True)

#Discriminator for input a
disc_a_real = utils.discriminator(x_a,name="disc_a")
disc_a_fake = utils.discriminator(g_ba,name="disc_a",reuse=True)

#Discriminator for input b
disc_b_real = utils.discriminator(x_b,name="disc_b")
disc_b_fake = utils.discriminator(g_ab,name="disc_b",reuse=True)

#Reconstruction loss for generators
l_const_a = tf.reduce_mean(utils.huber_loss(g_aba,x_a))
l_const_b = tf.reduce_mean(utils.huber_loss(g_bab,x_b))

#Generation loss for generators 
l_gan_a = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_a_fake,labels=tf.ones_like(disc_a_fake)))
l_gan_b = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_b_fake,labels=tf.ones_like(disc_b_fake)))

#Real example loss for discriminators
l_disc_a_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_a_real,labels=tf.ones_like(disc_a_real)))
l_disc_b_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_b_real,labels=tf.ones_like(disc_b_real)))

#Fake example loss for discriminators
l_disc_a_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_a_fake,labels=tf.zeros_like(disc_a_fake)))
l_disc_b_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_b_fake,labels=tf.zeros_like(disc_b_fake)))

#Combined loss for individual discriminators
Example #11
# Remember both X and Y are scalars with type float
X = tf.placeholder(tf.float32, name='X')
Y = tf.placeholder(tf.float32, name='Y')

# Step 3: create weight and bias, initialized to 0.0
# Make sure to use tf.get_variable
w = tf.get_variable('weight', initializer=tf.constant(0.0))
b = tf.get_variable('bias', initializer=tf.constant(0.0))

# Step 4: build a model to predict Y
# e.g. how you would arrive at Y_predicted given X, w, and b
Y_predicted = w * X + b

# Step 5: use square error as the loss function
# loss = tf.square(Y - Y_predicted, name='loss')
loss = utils.huber_loss(Y, Y_predicted)  # custom loss function

# Step 6: use gradient descent with a learning rate of 0.001 to minimize the loss
optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss)

start = time.time()

# create a FileWriter to write the model's graph to TensorBoard
writer = tf.summary.FileWriter('./graphs/linear_reg', tf.get_default_graph())

with tf.Session() as sess:
    # Step 7: initialize the necessary variables, in this case w and b
    sess.run(tf.global_variables_initializer())

    # Step 8: train the model for 100 epochs
Example #12
# Step 2: create placeholders for input X (number of fire) and label Y (number of theft)
x = tf.placeholder(dtype=tf.float32, shape=(), name='x')
y = tf.placeholder(dtype=tf.float32, shape=(), name='y')

# Step 3: create weight and bias, initialized to 0
w = tf.Variable(initial_value=0.0, name='w')
b = tf.Variable(initial_value=0.0, name='b')

# Step 4: predict Y (number of theft) from the number of fire
# name your variable Y_predicted
y_predicted = x * w + b

# Step 5: use the square error as the loss function
#loss = tf.nn.l2_loss(y_predicted - y, name='loss')
loss = utils.huber_loss(y, y_predicted)

# Step 6: using gradient descent with learning rate of 0.01 to minimize loss
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

# Phase 2: Train our model
with tf.Session() as sess:
  # Step 7: initialize the necessary variables, in this case, w and b
  sess.run(tf.global_variables_initializer())
  writer = tf.summary.FileWriter('/tmp/linear-reg', sess.graph)

  # Step 8: train the model
  for i in range(100):  # run 100 epochs
    total_loss = 0
    for train_x, train_y in data:
      # Session runs optimizer to minimize loss and fetch the value of loss. Name the received value as l
Example #13
    def loss_func(x, y):
        return utils.huber_loss(y, prediction(x), 10.0)
Example #14
def learn_by_dqn(env,
                 q_net,
                 optimizer_spec,
                 session,
                 exploration,
                 replay_buffer_size,
                 batch_size,
                 gamma,
                 learn_start,
                 learn_freq,
                 history_frames_num,
                 target_update_freq,
                 grad_norm_clipping,
                 stop_criterion=None):
    ###################
    # build functions #
    ###################
    img_h, img_w, img_c = env.observation_space.shape
    input_shape = (img_h, img_w, history_frames_num * img_c)
    act_num = env.action_space.n
    # c : current
    # n : next
    ### set up placeholders ###
    obs_c_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    act_c_ph = tf.placeholder(tf.int32, [None])
    rew_c_ph = tf.placeholder(tf.float32, [None])
    obs_n_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    done_ph = tf.placeholder(tf.float32,
                             [None])  # 1.0 if the next state is terminal, else 0.0
    ### scale the observation pixel values to floats in [0, 1] ###
    obs_c_float = tf.cast(obs_c_ph, tf.float32) / 255.0
    obs_n_float = tf.cast(obs_n_ph, tf.float32) / 255.0
    ### compute the TD error ###
    q_c_values = q_net(obs_c_float, act_num, scope='q_net', reuse=False)
    q_c_selected = tf.reduce_sum(q_c_values * tf.one_hot(act_c_ph, act_num), 1)
    q_n_values = q_net(obs_n_float, act_num, scope='target_q_net')
    q_n_max = tf.reduce_max(q_n_values, 1)
    q_c_selected_target = rew_c_ph + gamma * (1.0 - done_ph) * q_n_max
    td_error = q_c_selected - tf.stop_gradient(q_c_selected_target)
    errors = utils.huber_loss(td_error)
    mean_error = tf.reduce_mean(errors)
    ### collect the parameters of q_net and target_q_net ###
    q_net_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope='q_net')
    target_q_net_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='target_q_net')
    ### optimization function ###
    learn_rate = tf.placeholder(tf.float32, (), name='learn_rate')
    optimizer = optimizer_spec.constructor(learning_rate=learn_rate,
                                           **optimizer_spec.kwargs)
    train_fn = utils.minimize_and_clip(optimizer,
                                       mean_error,
                                       var_list=q_net_params,
                                       clip_val=grad_norm_clipping)
    ### update target q network function ###
    update_target_fn = []
    for param, target_param in zip(
            sorted(q_net_params, key=lambda p: p.name),
            sorted(target_q_net_params, key=lambda p: p.name)):
        update_target_fn.append(target_param.assign(param))
    ### set up replay buffer ###
    replay_buffer = utils.Replay_buffer(replay_buffer_size, history_frames_num)
    #######################
    # interaction with env #
    #######################
    model_initialized = False
    train_num = -1
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    #env.render()
    LOG_EVERY_N_STEPS = 10000
    init_op = tf.global_variables_initializer()
    session.run(init_op)
    ### the counter of time steps ###
    for t in itertools.count():
        if stop_criterion is not None and stop_criterion(env, t):
            break
        idx = replay_buffer.store_frame(last_obs)
        obs_c = replay_buffer.stack_recent_obs()
        explore_prob = random.random()
        ### epsilon greedy exploration policy ###
        if explore_prob < exploration.value(t):
            action = env.action_space.sample()  # ??? #
        else:
            action_values = session.run(q_c_values,
                                        feed_dict={obs_c_ph: obs_c[None]})
            action = np.argmax(action_values)
        ### step to next state ###
        obs, reward, done, info = env.step(action)
        #env.render()
        replay_buffer.store_transition(idx, action, reward, done)
        if not done:
            last_obs = obs
        else:
            last_obs = env.reset()
        ### train the networks ###
        if (t > learn_start and t % learn_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # 1.sample transitions #
            obs_batch,act_batch,rew_batch,next_obs_batch,done_batch\
                =replay_buffer.sample(batch_size)
            # 2.initialize the model #
            if not model_initialized:
                utils.initialize_interdependent_variables(
                    session, tf.global_variables(), {
                        obs_c_ph: obs_batch,
                        obs_n_ph: next_obs_batch,
                    })
                model_initialized = True
            # 3.train the model #
            session.run(train_fn,
                        feed_dict={
                            obs_c_ph: obs_batch,
                            act_c_ph: act_batch,
                            rew_c_ph: rew_batch,
                            obs_n_ph: next_obs_batch,
                            done_ph: done_batch,
                            learn_rate: optimizer_spec.lr_schedule.value(t)
                        })
            train_num += 1
            # 4.update target network #
            if train_num % target_update_freq == 0:
                session.run(update_target_fn)
        #######
        # log #
        #######
        episode_rewards = utils.get_wrapper_by_name(
            env, "Monitor").get_episode_rewards()  # ??? #
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            print('##########log##########')
            print('Timestep %d' % (t, ))
            print('mean reward (100 episodes) %f' % mean_episode_reward)
            print('best mean reward %f' % best_mean_episode_reward)
            print('episodes %d' % len(episode_rewards))
            print('exploration %f' % exploration.value(t))
            print('learning_rate %f' % optimizer_spec.lr_schedule.value(t))
            print('#######################')
            sys.stdout.flush()  # show all information on the terminal before the next iteration begins
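
`optimizer_spec` is only used here through its `constructor`, `kwargs`, and `lr_schedule.value(t)` attributes. One plausible way to construct it, consistent with that usage; the concrete optimizer, its epsilon, and the schedule are assumptions:

from collections import namedtuple
import tensorflow as tf

OptimizerSpec = namedtuple('OptimizerSpec', ['constructor', 'kwargs', 'lr_schedule'])

class ConstantSchedule(object):
    # trivial schedule: the same learning rate at every timestep
    def __init__(self, value):
        self._value = value

    def value(self, t):
        return self._value

optimizer_spec = OptimizerSpec(constructor=tf.train.AdamOptimizer,
                               kwargs=dict(epsilon=1e-4),
                               lr_schedule=ConstantSchedule(1e-4))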
Example #15
# Step 2: create Dataset and iterator
dataset = tf.contrib.data.Dataset.from_tensor_slices((data[:, 0], data[:, 1]))

iterator = dataset.make_initializable_iterator()
X, Y = iterator.get_next()

# Step 3: create weight and bias, initialized to 0
w = tf.get_variable('weights', initializer=tf.constant(0.0))
b = tf.get_variable('bias', initializer=tf.constant(0.0))

# Step 4: build model to predict Y
Y_predicted = X * w + b

# Step 5: use the square error as the loss function
# loss = tf.square(Y - Y_predicted, name='loss')
loss = utils.huber_loss(Y, Y_predicted)

# Step 6: using gradient descent with learning rate of 0.001 to minimize loss
optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss)

start = time.time()
with tf.Session() as sess:
    # Step 7: initialize the necessary variables, in this case, w and b
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('./graphs/linear_reg', sess.graph)

    # Step 8: train the model for 100 epochs
    for i in range(100):
        sess.run(iterator.initializer)  # initialize the iterator
        total_loss = 0
w, b = tf.get_variable(initializer=tf.constant(0.0),
                       name="w"), tf.get_variable(initializer=tf.constant(0.0),
                                                  name="b")
#############################
########## TO DO ############
#############################

# Step 4: build model to predict Y
# e.g. how you would arrive at Y_predicted given X, w, and b
Y_predicted = w * X + b
#############################
########## TO DO ############
#############################

# Step 5: use the square error as the loss function
loss = utils.huber_loss(Y, Y_predicted, delta=14.0)
#############################
########## TO DO ############
#############################

# Step 6: using gradient descent with learning rate of 0.001 to minimize loss
optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=0.001).minimize(loss)

start = time.time()

# Create a filewriter to write the model's graph to TensorBoard
#############################
########## TO DO ############
#############################
writer = tf.summary.FileWriter('./graphs/linear_reg', tf.get_default_graph())
n_samples = sheet.nrows - 1

# Step 2: create placeholders for input X (number of fire) and label Y (number of theft)
X = tf.placeholder(tf.float32, name='X')
Y = tf.placeholder(tf.float32, name='Y')

# Step 3: create weight and bias, initialized to 0
w = tf.Variable(0.0, name='weights')
b = tf.Variable(0.0, name='bias')

# Step 4: build model to predict Y
Y_predicted = X * w + b 

# Step 5: use the square error as the loss function
#loss = tf.square(Y - Y_predicted, name='loss')
loss = utils.huber_loss(Y, Y_predicted)

# Step 6: using gradient descent with learning rate of 0.001 to minimize loss
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss)

with tf.Session() as sess:
	# Step 7: initialize the necessary variables, in this case, w and b
	sess.run(tf.global_variables_initializer()) 
	
	writer = tf.summary.FileWriter('./graphs/linear_reg', sess.graph)
	
	# Step 8: train the model
	for i in range(100): # train the model 100 epochs
		total_loss = 0
		for x, y in data:
			# Session runs train_op and fetch values of loss
dataset = tf.data.Dataset.from_tensor_slices((data[:,0], data[:,1]))
#iterator = dataset.make_one_shot_iterator()
iterator = dataset.make_initializable_iterator()
X, Y = iterator.get_next()

w = tf.get_variable('weights', initializer = tf.constant(0.0))
b = tf.get_variable('bias', initializer = tf.constant(0.0))

wh = tf.get_variable('weights_h', initializer = tf.constant(0.0))
bh = tf.get_variable('bias_h', initializer = tf.constant(0.0))

Y_predict = w * X + b
Y_predicth = wh * X + bh

loss = tf.square(Y - Y_predict, name = 'loss')
loss_h = utils.huber_loss(Y, Y_predicth)

optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.001).minimize(loss)
optimizerh = tf.train.GradientDescentOptimizer(learning_rate = 0.001).minimize(loss_h)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(100):
        sess.run(iterator.initializer)
    
        try:
            while True:
                sess.run([optimizer, optimizerh])
        except tf.errors.OutOfRangeError:
            pass