def forward(img_a, img_b):
    """Build the two-domain translation GAN graph and publish its outputs.

    Both inputs are divided by 255, translated across domains with the
    ``atob`` / ``btoa`` generators, and translated back again to obtain the
    reconstructions ``img_aba`` / ``img_bab``. The translated images are
    scored by the per-domain discriminators ``a`` and ``b``. When
    ``env.phase`` is ``env.Phase.TRAIN`` the discriminator, generator and
    reconstruction losses (plus accuracy statistics) are built for each
    domain and accumulated into a total generator and discriminator loss.

    The function relies on ``generator``, ``discriminator``, ``dpc``, ``env``,
    ``O`` and ``re`` being available from the enclosing/module scope.

    Args:
        img_a: image batch of domain A (presumably 0-255 valued -- confirm
            against the input pipeline).
        img_b: image batch of domain B (same assumption as ``img_a``).

    Returns:
        None. Every result is registered through ``dpc.add_output``.
    """
    img_a /= 255.
    img_b /= 255.

    # Each generator/discriminator is first called with reuse=False and then
    # called again with reuse=True under the same name.
    img_ab = generator(img_a, name='atob', reuse=False)
    img_ba = generator(img_b, name='btoa', reuse=False)
    img_aba = generator(img_ab, name='btoa', reuse=True)
    img_bab = generator(img_ba, name='atob', reuse=True)

    logit_fake_a = discriminator(img_ba, name='a', reuse=False)
    logit_fake_b = discriminator(img_ab, name='b', reuse=False)
    score_fake_a = O.sigmoid(logit_fake_a)
    score_fake_b = O.sigmoid(logit_fake_b)

    # Explicit name -> tensor table instead of a locals() lookup, so the
    # published outputs do not silently depend on local variable names.
    inference_outputs = [
        ('img_a', img_a),
        ('img_b', img_b),
        ('img_ab', img_ab),
        ('img_ba', img_ba),
        ('img_aba', img_aba),
        ('img_bab', img_bab),
        ('score_fake_a', score_fake_a),
        ('score_fake_b', score_fake_b),
    ]
    for name, tensor in inference_outputs:
        dpc.add_output(tensor, name=name)

    if env.phase is not env.Phase.TRAIN:
        return

    logit_real_a = discriminator(img_a, name='a', reuse=True)
    logit_real_b = discriminator(img_b, name='b', reuse=True)
    score_real_a = O.sigmoid(logit_real_a)
    score_real_b = O.sigmoid(logit_real_b)

    all_g_loss = 0.
    all_d_loss = 0.
    # Weight of the reconstruction loss in the generator objective; the
    # adversarial generator loss gets the remaining (1 - r_loss_ratio).
    r_loss_ratio = 0.9

    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    tower_prefix = re.compile(r'^tower/\d+/')

    for pair_name, (real, fake), (logit_real, logit_fake), (score_real, score_fake) in zip(
            ['lossa', 'lossb'],
            [(img_a, img_aba), (img_b, img_bab)],
            [(logit_real_a, logit_fake_a), (logit_real_b, logit_fake_b)],
            [(score_real_a, score_fake_a), (score_real_b, score_fake_b)]):

        with env.name_scope(pair_name):
            # Discriminator: real samples labelled 1, translated samples labelled 0.
            d_loss_real = O.sigmoid_cross_entropy_with_logits(
                logits=logit_real, labels=O.ones_like(logit_real)).mean(name='d_loss_real')
            d_loss_fake = O.sigmoid_cross_entropy_with_logits(
                logits=logit_fake, labels=O.zeros_like(logit_fake)).mean(name='d_loss_fake')
            # Generator: wants translated samples to be labelled 1.
            g_loss = O.sigmoid_cross_entropy_with_logits(
                logits=logit_fake, labels=O.ones_like(logit_fake)).mean(name='g_loss')

            d_acc_real = (score_real > 0.5).astype('float32').mean(name='d_acc_real')
            d_acc_fake = (score_fake < 0.5).astype('float32').mean(name='d_acc_fake')
            g_accuracy = (score_fake > 0.5).astype('float32').mean(name='g_accuracy')

            d_accuracy = O.identity(.5 * (d_acc_real + d_acc_fake), name='d_accuracy')
            d_loss = O.identity(.5 * (d_loss_real + d_loss_fake), name='d_loss')

            # Reconstruction loss between the input and its round-trip translation.
            # Alternatives that were tried and left disabled:
            # r_loss = O.raw_l2_loss('raw_r_loss', real, fake).flatten2().sum(axis=1).mean(name='r_loss')
            # r_loss = O.raw_cross_entropy_prob('raw_r_loss', real, fake).flatten2().sum(axis=1).mean(name='r_loss')
            r_loss = O.raw_l2_loss('raw_r_loss', real, fake).mean(name='r_loss')

            # Unweighted alternative left disabled:
            # all_g_loss += g_loss + r_loss
            all_g_loss += (1 - r_loss_ratio) * g_loss + r_loss_ratio * r_loss
            all_d_loss += d_loss

        for v in [d_loss_real, d_loss_fake, g_loss, d_acc_real, d_acc_fake,
                  g_accuracy, d_accuracy, d_loss, r_loss]:
            # Strip a leading 'tower/<n>/' and the last two characters of the
            # tensor name (presumably the ':0' output suffix -- confirm).
            dpc.add_output(v, name=tower_prefix.sub('', v.name)[:-2], reduce_method='sum')

    dpc.add_output(all_g_loss, name='g_loss', reduce_method='sum')
    dpc.add_output(all_d_loss, name='d_loss', reduce_method='sum')
def make_network(env):
    """Build the discrete-action actor-critic network on ``env``.

    A convolutional feature extractor is registered on a data-parallel
    controller; on top of its ``feature`` output a fully connected layer feeds
    a softmax policy head (plus an ``explore_factor``-scaled variant) and a
    scalar value head. In the TRAIN phase the loss is assembled from the
    advantage-weighted log-likelihood, an entropy bonus scaled by
    ``entropy_beta`` and an L2 value regression, and scalar summaries are
    registered.

    While the graph is built in the TRAIN phase the slave device list of
    ``env`` is temporarily cleared; it is restored afterwards even if graph
    construction raises.

    Args:
        env: the environment object providing ``phase``, ``create_network``,
            ``create_dpcontroller`` and slave-device accessors.

    Returns:
        None. The network is attached to ``env`` via ``env.create_network()``.
    """
    is_train = env.phase is env.Phase.TRAIN

    if is_train:
        slave_devices = env.slave_devices
        env.set_slave_devices([])

    try:
        with env.create_network() as net:
            h, w, c = get_input_shape()

            dpc = env.create_dpcontroller()
            with dpc.activate():
                def inputs():
                    """Create the input placeholders of one tower."""
                    state = O.placeholder('state', shape=(None, h, w, c))
                    return [state]

                def forward(x):
                    """Convolutional feature extractor; publishes ``feature``."""
                    _ = x / 255.0
                    with O.argscope(O.conv2d, nonlin=O.relu):
                        _ = O.conv2d('conv0', _, 32, 5)
                        _ = O.max_pooling2d('pool0', _, 2)
                        _ = O.conv2d('conv1', _, 32, 5)
                        _ = O.max_pooling2d('pool1', _, 2)
                        _ = O.conv2d('conv2', _, 64, 4)
                        _ = O.max_pooling2d('pool2', _, 2)
                        _ = O.conv2d('conv3', _, 64, 3)

                    dpc.add_output(_, name='feature')

                dpc.set_input_maker(inputs).set_forward_func(forward)

            _ = dpc.outputs['feature']
            _ = O.fc('fc0', _, 512, nonlin=O.p_relu)

            policy = O.fc('fc_policy', _, get_player_nr_actions())
            value = O.fc('fc_value', _, 1)

            # Non-trainable scalar multiplying the policy logits before the softmax.
            expf = O.scalar('explore_factor', 1, trainable=False)
            policy_explore = O.softmax(policy * expf, name='policy_explore')

            policy = O.softmax(policy, name='policy')
            # Drop the unit axis of the value head output.
            value = value.remove_axis(1, name='value')

            net.add_output(policy_explore, name='policy_explore')
            net.add_output(policy, name='policy')
            net.add_output(value, name='value')

            if is_train:
                action = O.placeholder('action', shape=(None, ), dtype='int64')
                future_reward = O.placeholder('future_reward', shape=(None, ))

                # The small constant keeps the logarithm finite for zero probabilities.
                log_policy = O.log(policy + 1e-6)
                log_pi_a_given_s = (
                    log_policy * O.one_hot(action, get_player_nr_actions())).sum(axis=1)
                # The value estimate is wrapped in zero_grad so the policy term
                # does not back-propagate into the value head.
                advantage = (future_reward - O.zero_grad(value)).rename('advantage')
                policy_cost = (log_pi_a_given_s * advantage).mean(name='policy_cost')
                xentropy_cost = (-policy * log_policy).sum(axis=1).mean(name='xentropy_cost')
                value_loss = O.raw_l2_loss(
                    'raw_value_loss', future_reward, value).mean(name='value_loss')
                entropy_beta = O.scalar('entropy_beta', 0.01, trainable=False)

                # policy_cost and the entropy term are maximised, hence the negation.
                loss = O.add_n(
                    [-policy_cost, -xentropy_cost * entropy_beta, value_loss],
                    name='loss')

                net.set_loss(loss)

                for v in [
                        policy_cost, xentropy_cost, value_loss,
                        value.mean(name='predict_value'),
                        advantage.rms(name='rms_advantage'),
                        loss
                ]:
                    summary.scalar(v)
    finally:
        # Always hand the original slave device list back to the environment,
        # even when building the graph failed.
        if is_train:
            env.set_slave_devices(slave_devices)
def make_network(env):
    """Construct the Gaussian-policy actor and the critic used for PPO.

    Two separately scoped sub-networks are created on the same ``state``
    placeholder:

    * ``policy``: two 64-unit ReLU layers, a tanh mean head and a
      state-independent ``logstd`` variable; their concatenation ``theta``
      parameterises ``net.dist``, from which a clipped action is sampled.
    * ``value``: two 64-unit ReLU layers followed by a scalar head.

    In the TRAIN phase the clipped surrogate policy loss, the entropy term
    and the clipped value loss are built and summed into ``total_loss``.

    Args:
        env: environment object providing ``phase``, ``create_network`` and
            ``variable_scope``.

    Returns:
        None. Outputs and the loss are registered on the created network.
    """
    with env.create_network() as net:
        training = env.phase == env.Phase.TRAIN

        net.dist = O.distrib.GaussianDistribution(
            'policy', size=get_action_shape()[0], fixed_std=False)

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        batch_size = state.shape[0]

        # The variable scopes are declared here so that the two sub-networks
        # can be addressed separately during optimization.
        with env.variable_scope('policy'):
            hidden = O.fc('fc1', state, 64, nonlin=O.relu)
            hidden = O.fc('fc2', hidden, 64, nonlin=O.relu)
            mu = O.fc('fc_mu', hidden, net.dist.sample_size, nonlin=O.tanh)

            logstd_var = O.variable(
                'logstd',
                O.truncated_normal_initializer(stddev=0.01),
                shape=(net.dist.sample_size, ),
                trainable=True)
            # Repeat the shared log-std row once per batch element.
            logstd = O.tile(logstd_var.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            sampled = net.dist.sample(batch_size=batch_size, theta=theta, process_theta=True)
            policy = O.clip_by_value(sampled, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

            if training:
                theta_old = O.placeholder('theta_old', shape=(None, net.dist.param_size))
                action = O.placeholder('action', shape=(None, net.dist.sample_size))
                advantage = O.placeholder('advantage', shape=(None, ))
                # NOTE(review): created without an explicit trainable flag --
                # confirm the default of O.scalar is the intended one.
                entropy_beta = O.scalar('entropy_beta', g.entropy_beta)

                log_prob = net.dist.log_likelihood(action, theta, process_theta=True)
                log_prob_old = net.dist.log_likelihood(action, theta_old, process_theta=True)

                ratio = O.exp(log_prob - log_prob_old)
                epsilon = get_env('ppo.epsilon')

                # Surrogate from conservative policy iteration ...
                unclipped_obj = ratio * advantage
                # ... and its counterpart with the ratio clipped to [1 - eps, 1 + eps].
                clipped_obj = O.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
                # PPO's pessimistic surrogate (L^CLIP): element-wise minimum of the two.
                policy_loss = -O.reduce_mean(O.min(unclipped_obj, clipped_obj))

                entropy = net.dist.entropy(theta, process_theta=True).mean()
                entropy_loss = -entropy_beta * entropy

                net.add_output(policy_loss, name='policy_loss')
                net.add_output(entropy_loss, name='entropy_loss')

                summary.scalar('policy_entropy', entropy)

        with env.variable_scope('value'):
            hidden = O.fc('fc1', state, 64, nonlin=O.relu)
            hidden = O.fc('fc2', hidden, 64, nonlin=O.relu)
            value = O.fc('fcv', hidden, 1).remove_axis(1)
            net.add_output(value, name='value')

            if training:
                value_label = O.placeholder('value_label', shape=(None, ))
                value_old = O.placeholder('value_old', shape=(None, ))

                unclipped_err = O.raw_l2_loss('raw_value_loss_surr1', value, value_label)
                # Limit how far the new estimate may move away from the old one.
                value_step = O.clip_by_value(value - value_old, -epsilon, epsilon)
                value_clipped = value_old + value_step
                clipped_err = O.raw_l2_loss('raw_value_loss_surr2', value_clipped, value_label)
                value_loss = O.reduce_mean(O.max(unclipped_err, clipped_err))
                net.add_output(value_loss, name='value_loss')

        if training:
            total = policy_loss + entropy_loss + value_loss
            loss = O.identity(total, name='total_loss')
            net.set_loss(loss)
def make_network(env):
    """Construct the Gaussian-policy network (and optional critic) for TRPO.

    The policy sub-network maps ``state`` through two 64-unit ReLU layers to
    a tanh mean head; together with a state-independent ``logstd`` variable
    it forms ``theta``, the parameters of ``net.dist``. A clipped sample of
    that distribution is published as ``policy``.

    In the TRAIN phase the importance-sampled surrogate loss, the KL
    divergence to the old policy, and the KL divergence to a gradient-stopped
    copy of the current policy (``kl_self``) are published as outputs.

    The value function is either a ``LinearValueRegressor`` attached to the
    network (when the ``trpo.use_linear_vr`` setting is truthy) or a single
    fully connected layer on ``state`` trained with an L2 loss.

    Args:
        env: environment object providing ``phase``, ``create_network`` and
            ``variable_scope``.

    Returns:
        None. Outputs are registered on the created network.
    """
    use_linear_vr = get_env('trpo.use_linear_vr')

    with env.create_network() as net:
        net.dist = O.distrib.GaussianDistribution(
            'policy', size=get_action_shape()[0], fixed_std=False)
        if use_linear_vr:
            from tartist.app.rl.utils.math import LinearValueRegressor
            net.value_regressor = LinearValueRegressor()

        state = O.placeholder('state', shape=(None, ) + get_input_shape())
        # Input normalisation that was tried and left disabled:
        # state = O.moving_average(state)
        # state = O.clip_by_value(state, -10, 10)
        batch_size = state.shape[0]

        # We have to define variable scope here for later optimization.
        with env.variable_scope('policy'):
            _ = state

            with O.argscope(O.fc):
                _ = O.fc('fc1', _, 64, nonlin=O.relu)
                _ = O.fc('fc2', _, 64, nonlin=O.relu)
                mu = O.fc('fc_mu', _, net.dist.sample_size, nonlin=O.tanh)
                logstd = O.variable(
                    'logstd', O.truncated_normal_initializer(stddev=0.01),
                    shape=(net.dist.sample_size, ), trainable=True)

            # Repeat the shared log-std row once per batch element.
            logstd = O.tile(logstd.add_axis(0), [batch_size, 1])
            theta = O.concat([mu, logstd], axis=1)

            policy = net.dist.sample(batch_size=batch_size, theta=theta, process_theta=True)
            policy = O.clip_by_value(policy, -1, 1)

            net.add_output(theta, name='theta')
            net.add_output(policy, name='policy')

            if env.phase == env.Phase.TRAIN:
                theta_old = O.placeholder('theta_old', shape=(None, net.dist.param_size))
                action = O.placeholder('action', shape=(None, net.dist.sample_size))
                advantage = O.placeholder('advantage', shape=(None, ))

                log_prob = net.dist.log_likelihood(action, theta, process_theta=True)
                log_prob_old = net.dist.log_likelihood(action, theta_old, process_theta=True)

                # Importance sampling of surrogate loss (L in paper).
                ratio = O.exp(log_prob - log_prob_old)
                policy_loss = -O.reduce_mean(ratio * advantage)

                kl = net.dist.kl(theta_p=theta_old, theta_q=theta, process_theta=True).mean()
                # KL between a gradient-stopped copy of theta and theta itself.
                kl_self = net.dist.kl(
                    theta_p=O.zero_grad(theta), theta_q=theta, process_theta=True).mean()
                entropy = net.dist.entropy(theta, process_theta=True).mean()

                net.add_output(policy_loss, name='policy_loss')
                net.add_output(kl, name='kl')
                net.add_output(kl_self, name='kl_self')

                summary.scalar('policy_entropy', entropy,
                               collections=[rl.train.ACGraphKeys.POLICY_SUMMARIES])

        if not use_linear_vr:
            with env.variable_scope('value'):
                value = O.fc('fcv', state, 1)
                # The published output keeps the shape produced by the fc layer.
                net.add_output(value, name='value')

                if env.phase == env.Phase.TRAIN:
                    value_label = O.placeholder('value_label', shape=(None, ))
                    # The fc head has a single output unit while value_label is
                    # declared with shape (None,). Drop the unit axis before
                    # taking the difference; otherwise the two operands would
                    # broadcast against each other into a (batch, batch)
                    # matrix instead of a per-sample error.
                    value_loss = O.raw_l2_loss(
                        'raw_value_loss', value.remove_axis(1), value_label
                    ).mean(name='value_loss')
                    net.add_output(value_loss, name='value_loss')