Example no. 1
    def policy_objective(self, batch):
        states, advantages, old_log_prob, true_speed, true_similarity = batch
        policy = self.network.policy(states, training=True)
        log_prob = policy['old_log_prob']  # log-probability of the stored batch actions under the current policy
        entropy = tf.reduce_mean(policy['entropy'])
        speed = policy['speed']
        similarity = policy['similarity']

        # Entropy bonus (weighted by the current entropy-regularization strength)
        entropy_penalty = self.entropy_strength() * entropy

        # Compute the probability ratio between the current and old policy
        ratio = tf.math.exp(log_prob - old_log_prob)
        ratio = tf.reduce_mean(ratio, axis=1)  # mean of the per-action ratios

        # Compute the clipped ratio times advantage
        clip_value = self.clip_ratio()
        min_adv = tf.where(advantages > 0.0, x=(1.0 + clip_value) * advantages, y=(1.0 - clip_value) * advantages)

        # aux losses
        speed_loss = 0.5 * tf.reduce_mean(losses.MSE(y_true=true_speed, y_pred=speed))
        similarity_loss = 0.5 * tf.reduce_mean(losses.MSE(y_true=true_similarity, y_pred=similarity))

        # total loss
        policy_loss = -tf.reduce_mean(tf.minimum(ratio * advantages, min_adv))
        total_loss = policy_loss - entropy_penalty + speed_loss + similarity_loss

        # Log stuff
        self.log(ratio=tf.reduce_mean(ratio), log_prob=tf.reduce_mean(log_prob), entropy=entropy,
                 entropy_coeff=self.entropy_strength.value, ratio_clip=clip_value, loss_speed_policy=speed_loss,
                 loss_policy=policy_loss, loss_entropy=entropy_penalty, speed_pi=tf.reduce_mean(speed),
                 loss_similarity_policy=similarity_loss, similarity_pi=tf.reduce_mean(similarity))

        return total_loss
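
The clipped-surrogate term above follows the standard PPO formulation. For reference, a minimal standalone sketch of that term (the function and argument names here are illustrative, not taken from this class):

import tensorflow as tf

def ppo_clip_loss(log_prob, old_log_prob, advantages, clip_value=0.2):
    # probability ratio between the current and old policy
    ratio = tf.exp(log_prob - old_log_prob)
    clipped = tf.clip_by_value(ratio, 1.0 - clip_value, 1.0 + clip_value)
    # pessimistic (minimum) surrogate, negated so it can be minimized
    return -tf.reduce_mean(tf.minimum(ratio * advantages, clipped * advantages))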
Example no. 2

    def optimize(self):
        # Main network-optimization routine
        # Pull the stored transitions out of the buffer and convert them to tensors
        state = tf.constant([t.state for t in self.buffer], dtype=tf.float32)
        action = tf.constant([t.action for t in self.buffer], dtype=tf.int32)
        action = tf.reshape(action, [-1, 1])
        reward = [t.reward for t in self.buffer]
        old_action_log_prob = tf.constant([t.a_log_prob for t in self.buffer],
                                          dtype=tf.float32)
        old_action_log_prob = tf.reshape(old_action_log_prob, [-1, 1])
        # Compute the Monte Carlo return R(s_t) by iterating backwards over the rewards
        R = 0
        Rs = []
        for r in reward[::-1]:
            R = r + gamma * R
            Rs.insert(0, R)
        Rs = tf.constant(Rs, dtype=tf.float32)
        # Iterate over the buffer roughly 10 times in total
        for _ in range(round(10 * len(self.buffer) / batch_size)):
            # Randomly sample `batch_size` transitions from the buffer (without replacement)
            index = np.random.choice(np.arange(len(self.buffer)),
                                     batch_size,
                                     replace=False)
            # Open the gradient-recording contexts
            with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
                # Gather the targets R(s_t), shape [b, 1]
                v_target = tf.expand_dims(tf.gather(Rs, index, axis=0), axis=1)
                # Predict v(s), i.e. the baseline b; why it is written as v is explained later
                v = self.critic(tf.gather(state, index, axis=0))
                delta = v_target - v  # advantage estimate
                advantage = tf.stop_gradient(delta)  # detach from the gradient graph
                # TF's gather_nd works differently from PyTorch's gather, so we must build
                # the coordinate tensor that gather_nd expects, indices: [b, 2]
                # (in PyTorch a single line would do: pi_a = pi.gather(1, a))
                a = tf.gather(action, index, axis=0)  # batch of actions a_t
                # action distribution pi(a|s_t) for the batch
                pi = self.actor(tf.gather(state, index, axis=0))
                indices = tf.expand_dims(tf.range(a.shape[0]), axis=1)
                indices = tf.concat([indices, a], axis=1)
                pi_a = tf.gather_nd(pi, indices)  # probability of the taken action pi(a_t|s_t), [b]
                pi_a = tf.expand_dims(pi_a, axis=1)  # [b] => [b, 1]
                # Importance-sampling ratio (note: despite its name, `old_action_log_prob`
                # stores the old action probability here, not its logarithm)
                ratio = (pi_a / tf.gather(old_action_log_prob, index, axis=0))
                surr1 = ratio * advantage
                surr2 = tf.clip_by_value(ratio, 1 - epsilon,
                                         1 + epsilon) * advantage
                # PPO clipped-surrogate loss
                policy_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
                # The value estimate v should match the MC return R(s_t) as closely as possible
                value_loss = losses.MSE(v_target, v)
            # Update the policy (actor) network
            grads = tape1.gradient(policy_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(grads, self.actor.trainable_variables))
            # Update the value (critic) network
            grads = tape2.gradient(value_loss, self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(grads, self.critic.trainable_variables))

        self.buffer = []  # clear the transitions that have just been used for training
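
The gather_nd construction used above to pick out pi(a_t|s_t) is easy to check in isolation; a small self-contained sketch with made-up tensors:

import tensorflow as tf

probs = tf.constant([[0.1, 0.9], [0.7, 0.3], [0.5, 0.5]])   # pi(a|s) for 3 states, 2 actions
acts = tf.constant([[1], [0], [1]], dtype=tf.int32)          # chosen action per state, shape [3, 1]
rows = tf.expand_dims(tf.range(acts.shape[0]), axis=1)       # row indices [[0], [1], [2]]
idx = tf.concat([rows, acts], axis=1)                        # coordinates [[0, 1], [1, 0], [2, 1]]
pi_a = tf.gather_nd(probs, idx)                              # -> [0.9, 0.7, 0.5]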
Example no. 3
def train_net(model, optimizer, gamma, epsilon, lmd, k_epoch):
    s, a, r, s_next, a_prob, done_flag = model.package_trans()
    for epo_i in range(k_epoch):
        td_target = r + gamma * model.get_critic(s_next) * done_flag
        td_error = td_target - model.get_critic(s)
        td_error = td_error.numpy()
        advantage_ls = []
        advantage = 0.
        for error in td_error[::-1]:
            advantage = gamma * lmd * advantage + error[0]
            advantage_ls.append(advantage)  # one advantage per time step, accumulated backwards
        advantage_ls.reverse()
        with tf.GradientTape() as tape:
            advantage = tf.constant(advantage_ls, dtype=tf.float32)

            policy = model.get_policy(s, 1)
            index = tf.expand_dims(tf.range(a.shape[0]), 1)
            # print(index.shape, a.shape)
            a_index = tf.concat([index, a], axis=1)
            policy = tf.gather_nd(policy, a_index)
            policy = tf.expand_dims(policy, 1)

            ratio = tf.exp(tf.math.log(policy) - tf.math.log(a_prob))
            surr1 = ratio * advantage  # use the tensor built above, not the Python list
            surr2 = tf.clip_by_value(ratio, 1 - epsilon,
                                     1 + epsilon) * advantage
            loss = -tf.minimum(surr1, surr2) + losses.MSE(
                model.get_critic(s), td_target)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
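
The backward loop over td_error is the GAE(lambda) recursion. A standalone version of that accumulation, assuming a single trajectory of per-step TD errors (a sketch, not the repository's helper):

import numpy as np

def gae_advantages(td_errors, gamma=0.99, lmd=0.95):
    # A_t = delta_t + gamma * lambda * A_{t+1}, computed backwards in time
    advantages = np.zeros(len(td_errors), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(td_errors))):
        running = td_errors[t] + gamma * lmd * running
        advantages[t] = running
    return advantages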
Example no. 4
    def value_objective(self, batch):
        states, returns, true_speed, true_similarity = batch
        prediction = self.value_predict(states)
        values, speed, similarity = prediction['value'], prediction['speed'], prediction['similarity']

        # compute normalized `value loss`:
        base_loss = tf.reduce_mean(losses.MSE(y_true=returns[:, 0], y_pred=values[:, 0]))
        exp_loss = tf.reduce_mean(losses.MSE(y_true=returns[:, 1], y_pred=values[:, 1]))
        value_loss = (0.25 * base_loss) + (exp_loss / (self.network.exp_scale ** 2))

        # auxiliary losses:
        speed_loss = tf.reduce_mean(losses.MSE(y_true=true_speed, y_pred=speed))
        similarity_loss = tf.reduce_mean(losses.MSE(y_true=true_similarity, y_pred=similarity))

        self.log(speed_v=tf.reduce_mean(speed), similarity_v=tf.reduce_mean(similarity),
                 loss_v=value_loss, loss_speed_value=speed_loss, loss_similarity_value=similarity_loss)

        return (value_loss + speed_loss + similarity_loss) * 0.25
Example no. 5
    def _update_qvalue(self, batch):
        obs1, acts, rews, obs2 = batch

        targets = (rews +
                   self.gamma * self.qvalue_targ(obs2, self.policy_targ(obs2)))

        self.qvalue_opt.minimize(
            lambda: kls.MSE(targets, self.qvalue(obs1, acts)),
            self.qvalue.variables)
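
The optimizer.minimize call with a callable loss is roughly equivalent to an explicit GradientTape step; a sketch of that equivalence (function and argument names here are illustrative):

import tensorflow as tf
from tensorflow.keras import losses as kls

def td_mse_step(qvalue, qvalue_opt, obs1, acts, targets):
    # one gradient step on the mean squared Bellman error
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(kls.MSE(targets, qvalue(obs1, acts)))
    grads = tape.gradient(loss, qvalue.trainable_variables)
    qvalue_opt.apply_gradients(zip(grads, qvalue.trainable_variables))
    return loss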
Example no. 6
def trainOp(model, optimizer, dLoader, it, train=True):
    data = dLoader.__getitem__(it)
    with tf.GradientTape() as tape:
        y_true, y_pred = model(data, training=train)
        loss = losses.MSE(y_true, y_pred)

    if train:
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss
Example no. 7
    def _update_qvalues(self, batch):
        obs1, acts, rews, obs2, done = batch
        done = tf.cast(done, tf.float32)

        targ_acts = self.policy_targ(obs2)
        noise = tf.random.normal(shape=targ_acts.shape,
                                 mean=0.0,
                                 stddev=self.targ_act_noise)
        noise = tf.clip_by_value(
            noise,
            -self.targ_act_clip,
            self.targ_act_clip,
        )
        # TD3 target-policy smoothing: perturb the target actions with the clipped noise
        targ_acts = targ_acts + noise

        targets = rews + (1. - done) * self.gamma * tf.minimum(
            self.qvalue1_targ(obs2, targ_acts),
            self.qvalue2_targ(obs2, targ_acts))

        self.qvalue_opt.minimize(
            lambda: kls.MSE(targets, self.qvalue1(obs1, acts)) + kls.MSE(
                targets, self.qvalue2(obs1, acts)),
            self.qvalue1.variables + self.qvalue2.variables)
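
Target networks such as qvalue1_targ and policy_targ are normally kept close to the online networks with a Polyak (soft) update after each training step; a minimal sketch of such an update (the tau value and the variable pairing are assumptions, not taken from this snippet):

import tensorflow as tf

def soft_update(target_vars, online_vars, tau=0.005):
    # target <- (1 - tau) * target + tau * online
    for t_var, o_var in zip(target_vars, online_vars):
        t_var.assign((1.0 - tau) * t_var + tau * o_var)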
Example no. 8
def total_loss(boxes_gt, masks_gt, input_p, box_pred, mask_pred, rel_scores):
    y1 = K.flatten(boxes_gt)
    y2 = K.flatten(masks_gt)
    y1_pred = K.flatten(box_pred)
    y2_pred = K.flatten(mask_pred)
    input_p = K.expand_dims(input_p, axis=0)

    box_loss = losses.MSE(y1, y1_pred)
    mask_loss = losses.BinaryCrossentropy(from_logits=True)(y2, y2_pred)
    cos_sim = losses.CosineSimilarity()(boxes_gt, box_pred)

    loss_predicate = losses.categorical_crossentropy(input_p, rel_scores)

    return K.mean(box_loss * 1000 + mask_loss + cos_sim + loss_predicate)
Example no. 9

    def optimize(self):
        state = tf.constant([t.state for t in self.buffer], dtype=tf.float32)
        action = tf.constant([t.action for t in self.buffer], dtype=tf.int32)
        action = tf.reshape(action,[-1,1])
        reward = [t.reward for t in self.buffer]
        old_action_log_prob = tf.constant([t.a_log_prob for t in self.buffer], dtype=tf.float32)
        old_action_log_prob = tf.reshape(old_action_log_prob, [-1,1])

        R = 0
        Rs = []
        for r in reward[::-1]:
            R = r + gamma * R
            Rs.insert(0, R)
        Rs = tf.constant(Rs, dtype=tf.float32)

        for _ in range(round(10*len(self.buffer)/batch_size)):

            index = np.random.choice(np.arange(len(self.buffer)), batch_size, replace=False)

            with tf.GradientTape() as tape1, tf.GradientTape() as tape2:

                v_target = tf.expand_dims(tf.gather(Rs, index, axis=0), axis=1)

                v = self.critic(tf.gather(state, index, axis=0))
                delta = v_target - v
                advantage = tf.stop_gradient(delta)
                a = tf.gather(action, index, axis=0)
                pi = self.actor(tf.gather(state, index, axis=0)) 
                indices = tf.expand_dims(tf.range(a.shape[0]), axis=1)
                indices = tf.concat([indices, a], axis=1)
                pi_a = tf.gather_nd(pi, indices)
                pi_a = tf.expand_dims(pi_a, axis=1)
                # Importance Sampling
                ratio = (pi_a / tf.gather(old_action_log_prob, index, axis=0))
                surr1 = ratio * advantage
                surr2 = tf.clip_by_value(ratio, 1 - epsilon, 1 + epsilon) * advantage
                policy_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))
                value_loss = losses.MSE(v_target, v)
            grads = tape1.gradient(policy_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(grads, self.actor.trainable_variables))
            grads = tape2.gradient(value_loss, self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(zip(grads, self.critic.trainable_variables))

        self.buffer = []
Example no. 10
def total_loss(boxes_gt, masks_gt, input_p, box_pred, mask_pred, rel_scores,
               loss):
    y1 = K.flatten(boxes_gt)
    y2 = K.flatten(masks_gt)
    y1_pred = K.flatten(box_pred)
    y2_pred = K.flatten(mask_pred)
    input_p = K.expand_dims(input_p, axis=0)

    if loss == 'MSE':
        box_loss = losses.MSE(y1, y1_pred)
    else:
        box_loss = losses.MAE(y1, y1_pred)
    mask_loss = losses.BinaryCrossentropy(from_logits=True)(y2, y2_pred)
    cos_sim = losses.CosineSimilarity()(boxes_gt, box_pred)

    loss_predicate = losses.categorical_crossentropy(
        input_p, K.reshape(rel_scores, input_p.shape))

    return K.mean(box_loss * 10 + 0.01 * mask_loss + 0.001 * loss_predicate)
Example no. 11

def train(model, train_db, optimizer, normed_test_data, test_labels):
    train_mae_losses = []
    test_mae_losses = []
    for epoch in range(200):
        for step, (x, y) in enumerate(train_db):

            with tf.GradientTape() as tape:
                out = model(x)
                loss = tf.reduce_mean(losses.MSE(y, out))
                mae_loss = tf.reduce_mean(losses.MAE(y, out))

            if step % 10 == 0:
                print(epoch, step, float(loss))

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        train_mae_losses.append(float(mae_loss))
        out = model(tf.constant(normed_test_data.values))
        test_mae_losses.append(tf.reduce_mean(losses.MAE(test_labels, out)))

    return train_mae_losses, test_mae_losses
Example no. 12

train_db = train_db.shuffle(100).batch(512)

# # Quick test before any training
# example_batch = normed_train_data[:10]
# example_result = model.predict(example_batch)
# example_result

train_mae_losses = []
test_mae_losses = []
for epoch in range(200):
    for step, (x, y) in enumerate(train_db):

        with tf.GradientTape() as tape:
            out = model(x)
            # MSE: mean squared error (used as the training loss)
            loss = tf.reduce_mean(losses.MSE(y, out))
            # MAE: mean absolute error (tracked as a metric)
            mae_loss = tf.reduce_mean(losses.MAE(y, out))

        if step % 10 == 0:
            print(epoch, step, float(loss))

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    train_mae_losses.append(float(mae_loss))
    # Run the test inputs through the model to get predictions
    out = model(tf.constant(normed_test_data.values))
    # Use those predictions to compute the test MAE
    test_mae_losses.append(tf.reduce_mean(losses.MAE(test_labels, out)))
Example no. 13

		# Normalizing the advantages helps to stabilize training
		advantage = normalize(advantage)

		# Calculate the Policy and Value gradients for gradient descent
		with tf.GradientTape() as policy_tape, tf.GradientTape() as value_tape:
			logits = tf.nn.log_softmax(policy_net(np.atleast_2d(np.array(states)).astype('float32')))

			"""
			Since we selected only one action out of the available ones, we need
			to identify that action using one_hot encoding
			"""
			one_hot_values = tf.squeeze(tf.one_hot(np.array(actions), env.action_space.n))
			log_probs = tf.math.reduce_sum(logits * one_hot_values, axis=1)
			policy_loss = -tf.math.reduce_mean(advantage * log_probs)
			value_loss = kls.MSE(returns,tf.squeeze(value_net(np.atleast_2d(np.array(states)).astype('float32'))))

		policy_variables = policy_net.trainable_variables
		value_variables = value_net.trainable_variables
		policy_gradients = policy_tape.gradient(policy_loss, policy_variables)
		value_gradients = value_tape.gradient(value_loss, value_variables)

		# Update the policy network weights using ADAM
		optimizer_policy_net.apply_gradients(zip(policy_gradients, policy_variables))
		"""
		Since we know the actual rewards that we got, value loss is pretty high.
		So we need to perform multiple iterations of gradient descent to achieve 
		a good performance
		"""
		for iteration in range(train_value_iterations):
			optimizer_value_net.apply_gradients(zip(value_gradients, value_variables))
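
Note that the loop above re-applies the same value_gradients on every iteration. If the intent is to take several genuine gradient steps on the value network, the loss and gradients would be recomputed inside the loop; a sketch reusing the names from this snippet (assuming states and returns are already populated):

for iteration in range(train_value_iterations):
	with tf.GradientTape() as value_tape:
		value_pred = tf.squeeze(value_net(np.atleast_2d(np.array(states)).astype('float32')))
		value_loss = tf.reduce_mean(kls.MSE(returns, value_pred))
	value_gradients = value_tape.gradient(value_loss, value_net.trainable_variables)
	optimizer_value_net.apply_gradients(zip(value_gradients, value_net.trainable_variables))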
Example no. 14
    def train(self, batch_size=64, epochs=4):
        criterion = losses.mean_squared_error
        optimizer = optim.Adam(lr=0.001)
        loops = self.index // batch_size
        df = pd.read_csv(f'{self.directory}/steering.csv')
        for e in range(epochs):
            for i in range(loops):
                B = np.random.randint(0, self.index, size=batch_size)
                X = np.zeros((batch_size, 160, 320, 3))
                S = np.zeros((batch_size, 1))
                for b in range(batch_size):
                    X[b] = Warp(
                        mpimg.imread(f'{self.directory}/img/{B[b]}.jpg'), src,
                        target) / 256

                    X = np.array(X, dtype=np.float32)

                    S[b] = df.iloc[B[b]].steering
                    if np.random.choice([True, False]):
                        X[b] = np.flip(X[b], 1)
                        S[b] = -S[b]

                    else:
                        pass
                if self.net.decode:
                    with tf.GradientTape() as t:

                        output, dec = self.net(X)

                        loss1 = losses.MSE(S, output)

                    grads = t.gradient(
                        loss1, self.net.encoder.trainable_variables +
                        self.net.predict_conv.trainable_variables +
                        self.net.predict.trainable_variables)
                    optimizer.apply_gradients(
                        zip(
                            grads, self.net.encoder.trainable_variables +
                            self.net.predict_conv.trainable_variables +
                            self.net.predict.trainable_variables))

                    with tf.GradientTape() as t:

                        output, dec = self.net(X)

                        loss2 = losses.MAE(X, dec)

                    grads = t.gradient(
                        loss2, self.net.encoder.trainable_variables +
                        self.net.decoder.trainable_variables)
                    optimizer.apply_gradients(
                        zip(
                            grads, self.net.encoder.trainable_variables +
                            self.net.decoder.trainable_variables))
                    """
                    if e+i == 0:
                        self.net.compile(optimizer='adam', loss='MSE')
                        self.net.fit(X,S, batch_size=batch_size, shuffle=False)"""
                    print(
                        f"epochs {self.net.epochs} | loss1 = {np.sum(loss1):.2f} | loss2 = {np.sum(loss2):.2f}\n"
                    )

                else:
                    with tf.GradientTape() as t:

                        output = self.net(X)

                        loss = losses.MSE(S, output)

                    grads = t.gradient(loss, self.net.trainable_variables)
                    optimizer.apply_gradients(
                        zip(grads, self.net.trainable_variables))
                    """
                    if e+i == 0:
                        self.net.compile(optimizer='adam', loss='MSE')
                        self.net.fit(X,S, batch_size=batch_size, shuffle=False)"""
                    print(
                        f"epochs {self.net.epochs} | loss1 = {np.sum(loss):.2f}\n"
                    )

            self.net.epochs += 1

            self.net.save_model('model_check_points')
Example no. 15

def step_and_eval(step):
	pass  # placeholder: the function body is not provided in this excerpt

if __name__ == "__main__":
	gamma = 0.99
	p_lr = 0.01
	v_lr = 0.001
	lam = 0.97
	epochs = 50
	delta = 0.01
	damping_coeff = 0.1
	cg_iters = 10
	backtrack_iters = 10
	backtrack_coeff = 0.8
	train_value_iterations = 80
	num_episodes = 1000
	local_steps_per_epoch = 2000

	# info_shapes = ??  ## Still to be defined

	env = gym.make('CartPole-v0')
	agent = Agent(env.action_space.n)
	Experience = namedtuple('Experience', ['states','actions', 'rewards'])
	temp_Experience = namedtuple('Experience', ['states','actions', 'rewards', 'values'])
	policy_net = Model(len(env.observation_space.sample()), [64,64], env.action_space.n, 'policy_net')
	value_net = Model(len(env.observation_space.sample()), [32], 0, 'value_net')
	memory = ReplayMemory(local_steps_per_epoch)
	temp_memory = ReplayMemory(local_steps_per_epoch)

	optimizer_policy_net = tf.optimizers.Adam(p_lr)
	optimizer_value_net = tf.optimizers.Adam(v_lr)

	# Why define the number of local iterations? We could also define the number of episodes to run
	# and then update the policy parameters.
	for epoch in range(epochs):
		state = env.reset()
		done = False
		ep_rewards = []
		returns = []
		advantage = []
		log_probs = []
		avg_rewards = []

		finished_rendering_this_epoch = False
		for t in range(local_steps_per_epoch):

			# To render the gym env once every epoch
			if (not finished_rendering_this_epoch):
				pass #env.render()

			action = agent.select_action(state, policy_net)
			#log_probs = tf.math.reduce_sum(policy_net(np.atleast_2d(np.array(state.reshape(1,-1))).astype('float32')) * tf.one_hot(np.array(action), env.action_space.n), axis=1)
			value = tf.squeeze(value_net(np.atleast_2d(np.array(state.reshape(1,-1))).astype('float32')))

			next_state, reward, done, _ = env.step(action.numpy())

			# Store the transition for the pre-step state (it matches the value computed above),
			# then advance the environment state
			memory.push(Experience(state, action, reward))
			temp_memory.push(temp_Experience(state, action, reward, value))
			ep_rewards.append(reward)
			state = next_state

			if done or (t+1 == local_steps_per_epoch):
				returns += list(memory.return_func(ep_rewards, gamma))
				temp = temp_Experience(*zip(*temp_memory.memory))
				last_val = 0 if done else tf.squeeze(value_net(np.atleast_2d(np.array(state.reshape(1,-1)).astype('float32'))))

				temp_states, temp_actions, temp_rewards, temp_values = np.asarray(temp[0]),np.asarray(temp[1]),np.asarray(temp[2]),np.asarray(temp[3])
				temp_values = np.append(temp_values, last_val)
				delta = temp_rewards + gamma * temp_values[1:] - temp_values[:-1]
				advantage += list(memory.advantage_func(delta, gamma*lam))
				temp_memory.clear_memory()

				# If the trajectory is cut off before the episode ends, we should bootstrap the remaining value
				#memory.update(last_val)
				avg_rewards.append(sum(ep_rewards))
				state, done, ep_rewards = env.reset(), False, []
				finished_rendering_this_epoch = True

		# Updating the policy and value function
		buf = Experience(*zip(*memory.memory))
		states, actions, rewards = np.asarray(buf[0]),np.asarray(buf[1]),np.asarray(buf[2])
		avg_rewards = np.mean(np.asarray(avg_rewards))

		advantage = normalize(advantage)

		for iteration in range(backtrack_iters):
			k_l, a_l_new = set_and_eval(backtrack_coeff**iteration)

			if iteration == backtrack_iters-1:
				k_l, a_l_new = set_and_eval(0.)

		# Training the value function
		with tf.GradientTape() as value_tape:
			value_loss = kls.MSE(returns,tf.squeeze(value_net(np.atleast_2d(np.array(states)).astype('float32'))))

		value_variables = value_net.trainable_variables
		value_gradients = value_tape.gradient(value_loss, value_variables)

		for iteration in range(train_value_iterations):
			optimizer_value_net.apply_gradients(zip(value_gradients, value_variables))
		
		with summary_writer.as_default():
			tf.summary.scalar('Episode_returns', sum(returns), step = epoch)
			tf.summary.scalar('Running_avg_reward', avg_rewards, step = epoch)
			tf.summary.scalar('Losses', policy_loss, step = epoch)

		if epoch%1 == 0:
			print(f"Episode: {epoch} |Losses: {policy_loss: 0.2f}| Return: {sum(returns)}| Avg_reward: {avg_rewards: 0.2f}")
			sys.stdout.flush()

	render_var = input("Do you want to render the env(Y/N) ?")
	if render_var == 'Y' or render_var == 'y':
		n_render_iter = int(input("How many episodes? "))
		
		for i in range(n_render_iter):
			state = env.reset()
			while True:
				action = agent.select_action(state, policy_net)
				env.render()
				n_state, reward, done, _ = env.step(action.numpy())
				if done:
					break
	else:
		print("Thankyou for using!")

	env.close()
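
The normalize helper applied to the advantages is not defined in this snippet; a common implementation, assumed here from how it is used, is simple mean/std standardization:

import numpy as np

def normalize(x, eps=1e-8):
    # standardize advantages to zero mean / unit variance for stabler policy updates
    x = np.asarray(x, dtype=np.float32)
    return (x - x.mean()) / (x.std() + eps)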
Example no. 16
    def imitation_objective(self, batch, validation=False):
        """Imitation learning objective with `concordance loss` (i.e. a loss that encourages the network to make
           consistent predictions among augmented and non-augmented batches of data)
        """
        states, aug_states, speed, similarity = batch

        true_actions = utils.to_float(states['action'])
        true_values = states['value']

        # prediction on NON-augmented and AUGMENTED states
        policy, value = self.network.imitation_predict(states)
        policy_aug, value_aug = self.network.imitation_predict(aug_states)

        # actions, values, speed, and similarities
        actions, actions_aug = utils.to_float(policy['actions']), utils.to_float(policy_aug['actions'])
        values, values_aug = value['value'], value_aug['value']
        pi_speed, pi_speed_aug = policy['speed'], policy_aug['speed']
        v_speed, v_speed_aug = value['speed'], value_aug['speed']
        pi_similarity, pi_similarity_aug = policy['similarity'], policy_aug['similarity']
        v_similarity, v_similarity_aug = value['similarity'], value_aug['similarity']

        if not validation:
            self.log_actions(actions_pred_imitation=actions, actions_pred_aug_imitation=actions_aug)
            self.log(values_pred_imitation=values, values_pred_aug_imitation=values_aug,
                     speed_pi=pi_speed, speed_pi_aug=pi_speed_aug, speed_v=v_speed, speed_v_aug=v_speed_aug,
                     similarity_pi=pi_similarity, similarity_pi_aug=pi_similarity_aug,
                     similarity_v=v_similarity, similarity_v_aug=v_similarity_aug)

        # loss policy = sum of per-action MAE error
        loss_policy = (tf.reduce_mean(tf.reduce_sum(tf.abs(true_actions - actions), axis=1)) +
                       tf.reduce_mean(tf.reduce_sum(tf.abs(true_actions - actions_aug), axis=1))) / 2.0

        loss_value = (tf.reduce_mean(losses.MSE(y_true=true_values, y_pred=values)) +
                      tf.reduce_mean(losses.MSE(y_true=true_values, y_pred=values_aug))) / 2.0

        loss_speed_policy = (tf.reduce_mean(losses.MSE(y_true=speed, y_pred=pi_speed)) +
                             tf.reduce_mean(losses.MSE(y_true=speed, y_pred=pi_speed_aug))) / 2.0
        loss_speed_value = (tf.reduce_mean(losses.MSE(y_true=speed, y_pred=v_speed)) +
                            tf.reduce_mean(losses.MSE(y_true=speed, y_pred=v_speed_aug))) / 2.0

        loss_similarity_policy = (tf.reduce_mean(losses.MSE(y_true=similarity, y_pred=pi_similarity)) +
                                  tf.reduce_mean(losses.MSE(y_true=similarity, y_pred=pi_similarity_aug))) / 2.0
        loss_similarity_value = (tf.reduce_mean(losses.MSE(y_true=similarity, y_pred=v_similarity)) +
                                 tf.reduce_mean(losses.MSE(y_true=similarity, y_pred=v_similarity_aug))) / 2.0

        # concordance loss: make both sets of predictions as close as possible
        concordance_policy = (tf.reduce_mean(losses.MSE(actions, actions_aug)) +
                              tf.reduce_mean(losses.MSE(pi_speed, pi_speed_aug)) +
                              tf.reduce_mean(losses.MSE(pi_similarity, pi_similarity_aug))) / 3.0

        concordance_value = (tf.reduce_mean(losses.MSE(values, values_aug)) +
                             tf.reduce_mean(losses.MSE(v_speed, v_speed_aug)) +
                             tf.reduce_mean(losses.MSE(v_similarity, v_similarity_aug))) / 3.0

        # total loss
        total_loss_policy = \
            loss_policy + self.aux * (loss_speed_policy + loss_similarity_policy) + self.delta * concordance_policy
        total_loss_value = \
            loss_value + self.aux * (loss_speed_value + loss_similarity_value) + self.eta * concordance_value

        if not validation:
            self.log(loss_policy=loss_policy, loss_value=loss_value, loss_speed_policy=loss_speed_policy,
                     loss_similarity_policy=loss_similarity_policy, loss_speed_value=loss_speed_value,
                     loss_similarity_value=loss_similarity_value,
                     loss_concordance_policy=concordance_policy, loss_concordance_value=concordance_value,
                     # loss_steer=steer_penalty, loss_throttle=throttle_penalty, loss_entropy=entropy_penalty
            )

        return total_loss_policy, total_loss_value
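
Stripped of the class context, each concordance term above is just an MSE between the clean and augmented predictions of the same head, averaged over the heads; a small illustrative sketch:

import tensorflow as tf
from tensorflow.keras import losses

def concordance_loss(clean_outputs, aug_outputs):
    # average disagreement across a list of prediction heads
    terms = [tf.reduce_mean(losses.MSE(c, a)) for c, a in zip(clean_outputs, aug_outputs)]
    return tf.add_n(terms) / float(len(terms))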
Example no. 17
    valList.append(np.mean(vLoss))

df = pd.DataFrame({"trainLoss": trainList, "valLoss": valList})
df.to_csv("RCC_DumbellXL_4x_MSE.csv", sep=",", header=True, encoding="UTF-8")

dL = dataOp.data_loader("C:/Datasets/MRI_Data/Recon_v4/Val", 1, 4, 10, False)
d = dL.__getitem__(200)

out = model.predict(d)

out = np.reshape(out, (256, 256, 2))
out = out[:, :, 0] + 1j * out[:, :, 1]

plt.figure()
plt.subplot(1, 2, 1)
plt.imshow(np.abs(out), cmap='gray')
plt.title('Magnitude')
plt.colorbar(orientation='horizontal', shrink=0.9)
plt.subplot(1, 2, 2)
plt.imshow(np.angle(out), cmap='gray')
plt.title('Phase')
plt.colorbar(orientation='horizontal', shrink=0.9)
plt.show()

y_pred = model.predict(d)

y_true = ifftConv(d[0])

loss = losses.MSE(y_true, y_pred)

print(y_pred.shape, y_true.shape, loss.shape)
Example no. 18
 def vae_loss(y_true, y_pred):
     xent_loss = losses.MSE(y_true, y_pred)
     kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))
     loss = xent_loss + kl_loss
     return loss
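
For reference, the kl_loss term above is the closed-form KL divergence between the approximate posterior N(z_mean, sigma^2) and a standard normal prior, with z_log_sigma playing the role of the log-variance: elementwise it is -0.5 * (1 + log sigma^2 - mu^2 - sigma^2). A standalone check of the two terms with made-up tensors:

import tensorflow as tf
from tensorflow.keras import backend as K, losses

y_true = tf.random.uniform((4, 8))
y_pred = tf.random.uniform((4, 8))
z_mean = tf.random.normal((4, 2))
z_log_sigma = tf.random.normal((4, 2))

xent_loss = losses.MSE(y_true, y_pred)   # per-sample reconstruction error, shape [4]
kl_loss = -0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))  # scalar
loss = xent_loss + kl_loss               # the scalar KL is broadcast over the batch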