def iteration(mean_and_stddev, _):
  mean, stddev = mean_and_stddev
  # Sample action proposals from the belief for each env in the batch,
  # each candidate, and each horizon step.
  normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
  # Action shape: (envs batch size, candidates amount, horizon) + action_shape.
  action = normal * stddev[:, None] + mean[:, None]
  # Reshape to the extended_batch layout: (original_batch * amount, horizon) + action_shape.
  action = tf.reshape(action, (extended_batch, horizon) + action_shape)
  if discrete_action:
    # Normalize action scores and apply the greedy policy.
    action = tf.nn.l2_normalize(action, axis=-1)
    postproc_action = greedy(action, action_shape[0])
  else:
    # Clip actions to the valid range and keep them continuous.
    action = tf.clip_by_value(action, min_action, max_action)
    postproc_action = action
  # Evaluate proposal actions.
  (_, state), _ = tf.nn.dynamic_rnn(
      cell, (0 * obs, postproc_action, use_obs), initial_state=initial_state)
  reward = objective_fn(state)
  return_ = discounted_return.discounted_return(reward, length, discount)[:, 0]
  # Reshape back to the (envs batch size, candidates amount) layout.
  return_ = tf.reshape(return_, (original_batch, amount))
  # Indices have shape (envs batch size, topk) and index the candidates
  # for each env in the batch.
  _, indices = tf.nn.top_k(return_, topk, sorted=False)
  # Offset each index so it matches `action`, whose first dim is extended_batch.
  indices += tf.range(original_batch)[:, None] * amount
  # best_actions has shape indices.shape + action.shape[1:], which is
  # (envs batch size, topk, horizon) + action_shape.
  best_actions = tf.gather(action, indices)
  # Calculate the new belief from the best actions,
  # shape: (envs batch size, horizon) + action_shape.
  mean, variance = tf.nn.moments(best_actions, 1)
  stddev = tf.sqrt(variance + 1e-6)
  return mean, stddev
def iteration(mean_and_stddev, _):
  mean, stddev = mean_and_stddev
  # Sample action proposals from belief.
  normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
  action = normal * stddev[:, None] + mean[:, None]
  action = tf.clip_by_value(action, min_action, max_action)
  # Evaluate proposal actions.
  action = tf.reshape(action, (extended_batch, horizon) + action_shape)
  (_, state), _ = tf.nn.dynamic_rnn(
      cell, (0 * obs, action, use_obs), initial_state=initial_state)
  reward = objective_fn(state)
  return_ = discounted_return.discounted_return(reward, length, discount)[:, 0]
  return_ = tf.reshape(return_, (original_batch, amount))
  # Re-fit belief to the best ones.
  _, indices = tf.nn.top_k(return_, topk, sorted=False)
  indices += tf.range(original_batch)[:, None] * amount
  best_actions = tf.gather(action, indices)
  mean, variance = tf.nn.moments(best_actions, 1)
  stddev = tf.sqrt(variance + 1e-6)
  return mean, stddev
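For reference, a minimal sketch of how an iteration function like this is usually driven, assuming it sits inside a planner that already defines original_batch, horizon, action_shape, and an iteration count (the name iterations is an illustrative assumption):

# Sketch only: assumes `iteration`, `original_batch`, `horizon`, `action_shape`,
# and `iterations` are available from the enclosing planner function.
mean = tf.zeros((original_batch, horizon) + action_shape)
stddev = tf.ones((original_batch, horizon) + action_shape)
# tf.scan threads (mean, stddev) through `iterations` refinement steps and
# stacks the per-step results along a new leading axis.
mean, stddev = tf.scan(
    iteration, tf.range(iterations), (mean, stddev), back_prop=False)
# Keep only the belief from the final CEM iteration.
mean, stddev = mean[-1], stddev[-1]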
def iteration(mean_and_stddev, _):
  mean, stddev = mean_and_stddev
  # Sample action proposals from belief.
  normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
  action = normal * stddev[:, None] + mean[:, None]
  action = tf.clip_by_value(action, min_action, max_action)
  # Evaluate proposal actions.
  action = tf.reshape(action, (extended_batch, horizon) + action_shape)
  (_, state), _ = tf.nn.dynamic_rnn(
      cell, (0 * obs, action, use_obs), initial_state=initial_state)
  # Predicted objectives, e.g. {'reward': (1000, 12), 'angular_speed_degree': (1000, 12), ...}.
  objectives = objective_fn(state)
  reward = objectives['reward']
  angular_speed = objectives['angular_speed_degree']
  forward_speed = objectives['forward_speed'] / 10.0
  collided = objectives['collided']
  intersection_offroad = objectives['intersection_offroad']
  intersection_otherlane = objectives['intersection_otherlane']

  # 1. Reward for planning (learned reward only):
  # return_ = discounted_return.discounted_return(reward, length, discount)[:, 0]
  # total_return = tf.reshape(return_, (original_batch, amount))

  if not PLANNING:
    # 2. Reward for planning: learned reward plus heading loss.
    return_ = discounted_return.discounted_return(
        reward, length, discount)[:, 0]  # shape: (1000,)
    return_ = tf.reshape(return_, (original_batch, amount))  # shape: (1, 1000)
    threshold_degree = tf.where(dist_to_intersection < 9,
                                9 * (9 - dist_to_intersection), 0)
    angular_turn_ = discounted_return.discounted_return(
        angular_speed, length, 1.0)[:, 0]  # shape: (1000,)
    # Penalize heading error relative to the goal heading when a turn command is active.
    heading_loss = -tf.abs(delta_degree(
        goal_heading_degree - (current_heading_degree + angular_turn_))) * tf.case(
            {
                tf.equal(cmd_id, 3): costn1,
                tf.equal(cmd_id, 2): costn1,
                tf.equal(cmd_id, 1): costn1
            },
            default=costn0)
    heading_loss_weighted = heading_loss * tf.where(
        heading_loss > threshold_degree - 90,
        tf.ones((amount,)) * 0.3,
        tf.ones((amount,)) * 1000.0)
    return_heading = tf.reshape(heading_loss_weighted, (original_batch, amount))
    total_return = return_ + return_heading

  if PLANNING:
    # 3. Reward for planning: hand-crafted driving reward plus heading loss.
    rewards = (forward_speed
               - 300.0 * tf.where(collided > 0.3, collided, tf.zeros_like(collided))
               - 20.0 * intersection_offroad
               - 10.0 * intersection_otherlane)
    return_ = discounted_return.discounted_return(
        rewards, length, discount)[:, 0]  # shape: (1000,)
    return_ = tf.reshape(return_, (original_batch, amount))  # shape: (1, 1000)
    threshold_degree = tf.where(dist_to_intersection < 9,
                                9 * (9 - dist_to_intersection), 0)
    angular_turn_ = discounted_return.discounted_return(
        angular_speed, length, 1.0)[:, 0]  # shape: (1000,)
    heading_loss = -tf.abs(delta_degree(
        goal_heading_degree - (current_heading_degree + angular_turn_))) * tf.case(
            {
                tf.equal(cmd_id, 3): costn1,
                tf.equal(cmd_id, 2): costn1,
                tf.equal(cmd_id, 1): costn1
            },
            default=costn0)
    heading_loss_weighted = heading_loss * tf.where(
        heading_loss > threshold_degree - 90,
        tf.ones((amount,)) * 0.3,
        tf.ones((amount,)) * 1000.0)
    return_heading = tf.reshape(heading_loss_weighted, (original_batch, amount))
    total_return = return_ + return_heading

  # Re-fit belief to the best ones.
  _, indices = tf.nn.top_k(total_return, topk, sorted=False)
  indices += tf.range(original_batch)[:, None] * amount
  best_actions = tf.gather(action, indices)
  mean, variance = tf.nn.moments(best_actions, 1)
  stddev = tf.sqrt(variance + 1e-6)
  return mean, stddev
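The heading loss depends on a delta_degree helper whose body is not shown here; a plausible sketch (an assumption, not the original implementation) that wraps an angle difference to the shortest signed rotation in degrees:

# Hypothetical helper (assumption): wrap a difference in degrees into
# [-180, 180) so the heading loss measures the shortest rotation to the goal.
def delta_degree(diff_degree):
  return tf.floormod(diff_degree + 180.0, 360.0) - 180.0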
def iteration(mean_and_stddev, _):
  mean, stddev, command = mean_and_stddev
  # mean:   (1, 12, 2)
  # stddev: (1, 12, 2)
  # Sample action proposals from belief.
  normal = tf.random_normal((original_batch, amount, horizon) + action_shape)
  action = normal * stddev[:, None] + mean[:, None]
  action = tf.clip_by_value(action, min_action, max_action)
  # Evaluate proposal actions.
  action = tf.reshape(action, (extended_batch, horizon) + action_shape)
  (_, state), _ = tf.nn.dynamic_rnn(
      cell, (0 * obs, action, use_obs), initial_state=initial_state)
  # action: Tensor("graph/collection/should_collect_carla/simulate-1/
  #   train-carla-cem-12/scan/while/simulate/scan/while/Reshape:0",
  #   shape=(1000, 12, 2), dtype=float32)
  reward = objective_fn(state)
  # Command-conditioned candidate bonuses ("bonds"): action component 1 is
  # summed as the turning term, component 0 as the forward/lane-keep term.
  bond_turn = tf.reshape(tf.reduce_sum(action[:, :, 1], axis=1), [1, 1000])
  bond_turn = tf.clip_by_value(bond_turn, -10, 10)
  bond_keep = tf.reshape(tf.reduce_sum(action[:, :, 0], axis=1), [1, 1000])
  bond_straight = tf.reshape(tf.reduce_sum(action[:, :, 0], axis=1), [1, 1000]) - \
      0.2 * tf.reshape(tf.reduce_sum(tf.abs(action[:, :, 1]), axis=1), [1, 1000])
  bond_straight = tf.clip_by_value(bond_straight, -8, 8)
  bond_keep = tf.clip_by_value(bond_keep, -8, 8)

  def f1():
    return bond_straight  # go-straight bond

  def f2():
    return bond_turn + 0.2 * bond_keep  # right-turn bond

  def f3():
    return -bond_turn + 0.2 * bond_keep  # left-turn bond

  def f4():
    return bond_keep  # lane-keep bond

  bond = tf.case(
      {
          tf.reduce_all(tf.equal(command, 2)): f2,
          tf.reduce_all(tf.equal(command, 3)): f3,
          tf.reduce_all(tf.equal(command, 4)): f4
      },
      default=f1,
      exclusive=True)
  return_ = discounted_return.discounted_return(reward, length, discount)[:, 0]
  return_ = tf.reshape(return_, (original_batch, amount))
  if PLAN_BOND:
    return_ += bond * 0.2
  # Re-fit belief to the best ones.
  _, indices = tf.nn.top_k(return_, topk, sorted=False)
  indices += tf.range(original_batch)[:, None] * amount
  best_actions = tf.gather(action, indices)
  mean, variance = tf.nn.moments(best_actions, 1)
  stddev = tf.sqrt(variance + 1e-6)
  return mean, stddev, command
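Because this variant threads the navigation command through the loop-carried state, the tf.scan initializer has to match the three-element structure the function returns; a sketch under the same assumptions as the earlier one:

# Sketch only: `command` is assumed to be the high-level navigation command
# tensor provided by the caller; other names as in the sketch above.
mean = tf.zeros((original_batch, horizon) + action_shape)
stddev = tf.ones((original_batch, horizon) + action_shape)
mean, stddev, command = tf.scan(
    iteration, tf.range(iterations), (mean, stddev, command), back_prop=False)
mean, stddev = mean[-1], stddev[-1]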