def learn(env, policy, value_fn, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002):
    """
    Trains an ACKTR model.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...)
    :param gamma: (float) The discount factor
    :param lam: (float) the GAE factor, trading off bias and variance in the advantage estimate
    :param timesteps_per_batch: (int) the number of timesteps for each batch
    :param num_timesteps: (int) the total number of timesteps to run
    :param animate: (bool) whether to render the environment
    :param callback: (function) called every step, used for logging and saving
    :param desired_kl: (float) the target Kullback-Leibler divergence used to adapt the stepsize
    """
    obfilter = ZFilter(env.observation_space.shape)
    max_pathlength = env.spec.timestep_limit

    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = tf_util.function(inputs, update_op)
    tf_util.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for queue_runner in [q_runner, value_fn.q_runner]:
        assert queue_runner is not None
        enqueue_threads.extend(
            queue_runner.create_threads(tf.get_default_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                           obfilter=obfilter)
            paths.append(path)
            timesteps_this_batch += path["reward"].shape[0]
            timesteps_so_far += path["reward"].shape[0]
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function (a standalone sketch of this GAE computation follows the function)
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = value_fn.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        value_fn.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl_loss = policy.compute_kl(ob_no, oldac_dist)
        if kl_loss > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl_loss < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean",
                              np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM",
                              np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths]))
        logger.record_tabular("EpLenMean",
                              np.mean([path["reward"].shape[0] for path in paths]))
        logger.record_tabular("KL", kl_loss)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, max_path_length, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = max_path_length stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular("n_timesteps", timesteps_so_far) logger.record_tabular( "AverageReturn", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
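# --- Illustrative sketch (not part of the original module) ---
# Both learn() variants above adjust the K-FAC stepsize by a factor of 1.5 depending on how
# the measured policy KL compares to desired_kl. A minimal, framework-free version of that
# rule is sketched below; adjust_stepsize is a hypothetical helper, not code from this repo.
import numpy as np

def adjust_stepsize(stepsize, kl, desired_kl, factor=1.5,
                    min_stepsize=np.float32(1e-8), max_stepsize=np.float32(1e0)):
    """Shrink the stepsize when KL overshoots 2x the target, grow it below 0.5x the target."""
    if kl > desired_kl * 2:
        return max(min_stepsize, stepsize / factor)   # "kl too high"
    elif kl < desired_kl / 2:
        return min(max_stepsize, stepsize * factor)   # "kl too low"
    return stepsize                                    # "kl just right!"

if __name__ == "__main__":
    step = 0.03
    for kl in [0.01, 0.0005, 0.002]:
        step = adjust_stepsize(step, kl, desired_kl=0.002)
        print(kl, step)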
def main(args): U.make_session(num_cpu=1).__enter__() set_global_seeds(args.seed) def policy_fn(name, ob_space, ac_space, reuse=False): return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) #------- Run policy for reaching ---------# play_env = robosuite.make( args.env_id, ignore_done=True, use_camera_obs=False, has_renderer=True, control_freq=100, gripper_visualization=True, #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2 #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2 box_pos=[0.23522776, 0.2287869, 0.82162434], #shift3 box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3 #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4 #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4 #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5 #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6 #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6 #box_pos = [0.43522776, -0.367869, 0.82162434], #shift7 #box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7 ) play_env = GymWrapper(play_env, keys=None, generalized_goal=False) # false for loading prevs #Weights are loaded from reach model grasp_strange #play_env.viewer.set_camera(camera_id=2) # Switch views for eval # Setup network # ---------------------------------------- ob_space = play_env.observation_space ac_space = play_env.action_space pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False) # Hack for loading policies using tensorflow init_op = tf.compat.v1.global_variables_initializer() saver = tf.compat.v1.train.Saver(max_to_keep=5) with tf.compat.v1.Session() as sess: sess.run(init_op) # Load Checkpoint #ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #ckpt_path = './reach_and_grasp_weights/reach_shift2/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' ckpt_path = './reach_and_grasp_weights/reach_3_almost/trpo_gail.transition_limitation_1100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' # problem child 2 #ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #problem child 1 #ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path) saver.restore(sess, ckpt.model_checkpoint_path) # Create the playback environment _, _, last_ob, last_jpos, obs_array, jpos_array = runner_1_traj( play_env, pi_reach, None, timesteps_per_batch=3500, number_trajs=1, stochastic_policy=args.stochastic_policy, save=False) # Gripping load + setting up the last observation play_ob_dim = play_env.observation_space.shape[0] play_ac_dim = play_env.action_space.shape[0] grip_policy = GaussianMlpPolicy(play_ob_dim, 
play_ac_dim) grip_vf = NeuralNetValueFunction(play_ob_dim, play_ac_dim) grip_saver = tf.compat.v1.train.Saver(max_to_keep=5) unchanged_ob = np.float32(np.zeros(play_ob_dim)) with tf.compat.v1.Session() as sess2: sess2.run(init_op) # Load Checkpoint #ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #grip_ckpt_path = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift' grip_ckpt_path = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/' #3rd grasp #ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' # problem child 2 #ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #problem child 1 #ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' grip_ckpt = tf.compat.v1.train.get_checkpoint_state(grip_ckpt_path) #print(grip_ckpt) grip_saver.restore(sess2, grip_ckpt.model_checkpoint_path) tt = 0 cum_rew = 0 #ob = last_ob #prev_ob = np.float32(np.zeros(ob.shape)) # check if indeed starts at all zeros obfilter = ZFilter(play_env.observation_space.shape) #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2, is a problem? #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2 statsu = np.load( "./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz" ) # shift 3 #statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz") # shift 4 #statsu = np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz") #shift 5 #statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz") #shift 6 #statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz") #shift 7 print("load n: ", statsu["n"]) print("load M: ", statsu["M"]) print("load S: ", statsu["S"]) obfilter.rs._n = statsu["n"] obfilter.rs._M = statsu["M"] obfilter.rs._S = statsu["S"] print("obf n: ", obfilter.rs._n) print("obf M: ", obfilter.rs._M) print("obf S: ", obfilter.rs._S) #env.set_robot_joint_positions(last_jpos) #ob = np.concatenate((last_ob,env.box_end),axis=0) ob = last_ob # Will this change the behavior of loading? play_env.set_robot_joint_positions(last_jpos) prev_ob = np.float32(np.zeros( ob.shape)) # check if indeed starts at all zeros unchanged_ob = ob ob = obfilter(ob) while True: s = np.concatenate([ob, prev_ob], -1) ac, _, _ = grip_policy.act(s) prev_ob = np.copy(ob) scaled_ac = play_env.action_space.low + (ac + 1.) 
* 0.5 * ( play_env.action_space.high - play_env.action_space.low) scaled_ac = np.clip(scaled_ac, play_env.action_space.low, play_env.action_space.high) ob, rew, new, _ = play_env.step(scaled_ac) unchanged_ob = ob ob = obfilter(ob) cum_rew += rew play_env.render() # check the running in for the first part #logger.log("rendering for reach policy") if new or tt >= 200: break tt += 1 print("Cumulative reward over session: " + str(cum_rew)) #obs_array = None #jpos_array = None #print(last_jpos) #print(last_ob) last_joint_after_grip = play_env._joint_positions last_after_grip = unchanged_ob # Change environment box position to start from the last position on playenv #env.model.box_pos_array = np.array(play_env.sim.data.body_xpos[play_env.cube_body_id]) #env.model.box_quat_array = convert_quat( # np.array(play_env.sim.data.body_xquat[play_env.cube_body_id]), to="xyzw" # ) #env.box_pos = play_env.model.box_pos_array #env.box_quat = play_env.model.box_quat_array # set up the environment for loading or training env = robosuite.make( args.env_id, ignore_done=True, use_camera_obs=False, has_renderer=True, control_freq=100, gripper_visualization=True, reward_shaping=True, rob_init=last_joint_after_grip, box_pos=np.array( play_env.sim.data.body_xpos[play_env.cube_body_id]), #shift3 box_quat=convert_quat(np.array( play_env.sim.data.body_xquat[play_env.cube_body_id]), to="xyzw"), #shift3 #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2 #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2 #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3 #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3 box_end=[0.3, 0.1, 1.0] # shift 3 #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4, try to increase traj limit to 2000 #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4 #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5 #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6 #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6 #box_pos = [0.43522776, -0.367869, 0.82162434], #shift7 #box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7 ) # Switch from gym to robosuite, also add reward shaping to see reach goal env = GymWrapper(env, keys=None, generalized_goal=True) # wrap in the gym environment task = 'train' #task = 'evaluate' #task = 'cont_training' #env = bench.Monitor(env, logger.get_dir() and # osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True) # Note: taking away the bench monitor wrapping allows rendering #env.seed(args.seed) # Sawyer does not have seed gym.logger.setLevel(logging.WARN) task_name = get_task_name(args) args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) args.log_dir = osp.join(args.log_dir, task_name) logger.log("log_directories: ", args.log_dir) logger.log("environment action space range: ", env.action_space) #logging the action space if task == 'train': play_env.close() #init_op2 = tf.compat.v1.global_variables_initializer() #sess2 = tf.compat.v1.Session(config=tf.ConfigProto()) #with tf.compat.v1.Session(config=tf.ConfigProto()) as sess2: #sess2.run(init_op) ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] lift_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr") lift_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr") #sess2.run(init_op2) old_acktr_learn( env, policy=lift_policy, vf=lift_vf, gamma=0.99, lam=0.97, timesteps_per_batch=1500, desired_kl=0.001, 
num_timesteps=args.num_timesteps, save_per_iter=args.save_per_iter, ckpt_dir=args.checkpoint_dir, traj_limitation=args.traj_limitation, last_ob=[last_after_grip], last_jpos=[last_joint_after_grip], animate=True, pi_name="lift_pi_aktr", ) env.close() elif task == 'cont_training': play_env.close() # with tf.compat.v1.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] #with tf.compat.v1.variable_scope("vf_aktr"): cont_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr") #with tf.compat.v1.variable_scope("pi_aktr"): cont_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr") ckpt_path_cont = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift' stat_cont = "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_19713000.npz" old_acktr_learn( env, policy=cont_policy, vf=cont_vf, gamma=0.99, lam=0.97, timesteps_per_batch=1500, desired_kl=0.001, num_timesteps=args.num_timesteps, save_per_iter=args.save_per_iter, ckpt_dir=args.checkpoint_dir, traj_limitation=args.traj_limitation, last_ob=obs_array, last_jpos=jpos_array, animate=True, cont_training=True, load_path=ckpt_path_cont, obfilter_load=stat_cont, pi_name="lift_pi_aktr", ) env.close() elif task == 'evaluate': play_env.close() # with tf.compat.v1.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] eval_lift_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr") eval_lift_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr") saver_2 = tf.compat.v1.train.Saver(max_to_keep=5) #sess2 = tf.compat.v1.Session(config=tf.ConfigProto()) #with tf.compat.v1.Session(config=tf.ConfigProto()) as sess3: with tf.compat.v1.Session() as sess3: sess3.run(init_op) ckpt_path_2 = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #ckpt_path_2 = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift' # shift 2 #ckpt_path_2 = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift' # shift 3 #ckpt_path_2 = './reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift' # shift 4 #ckpt_path_2 = './reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift' # shift 5 #ckpt_path_2 = './reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #shift 6 #ckpt_path_2 = './reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #shift 7 ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2) saver_2.restore(sess3, ckpt_2.model_checkpoint_path) tt = 0 cum_rew = 0 #ob = last_ob #prev_ob = np.float32(np.zeros(ob.shape)) # check if indeed starts at all zeros obfilter = ZFilter(env.observation_space.shape) statsu = np.load( "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_23493000.npz" ) #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2 #statsu = np.load("./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz") # shift 3 #statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz") # shift 4 #statsu = 
np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz") #shift 5 #statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz") #shift 6 #statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz") #shift 7 print("load n: ", statsu["n"]) print("load M: ", statsu["M"]) print("load S: ", statsu["S"]) obfilter.rs._n = statsu["n"] obfilter.rs._M = statsu["M"] obfilter.rs._S = statsu["S"] print("obf n: ", obfilter.rs._n) print("obf M: ", obfilter.rs._M) print("obf S: ", obfilter.rs._S) env.set_robot_joint_positions(last_jpos) ob = np.concatenate((last_ob, env.box_end), axis=0) prev_ob = np.float32(np.zeros( ob.shape)) # check if indeed starts at all zeros ob = obfilter(ob) while True: s = np.concatenate([ob, prev_ob], -1) ac, _, _ = eval_lift_policy.act(s) prev_ob = np.copy(ob) scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * ( env.action_space.high - env.action_space.low) scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high) ob, rew, new, _ = env.step(scaled_ac) ob = obfilter(ob) cum_rew += rew env.render() # check the running in for the first part #logger.log("rendering for reach policy") if new or tt >= args.traj_limitation: break tt += 1 print("Cumulative reward over session: " + str(cum_rew)) env.close()
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002, lr=0.03, momentum=0.9): ob_dim, ac_dim = policy.ob_dim, policy.ac_dim dbpi = GaussianMlpPolicy(ob_dim, ac_dim, 'dbp') oldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'oe') dboldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'doi') # with tf.variable_scope('dbp'): # with tf.variable_scope('oe'): # with tf.variable_scope('doi'): pi = policy do_std = U.function([], [pi.std_1a, pi.logstd_1a]) kloldnew = oldpi.pd.kl(pi.pd) dbkloldnew = dboldpi.pd.kl(dbpi.pd) dist = meankl = tf.reduce_mean(kloldnew) dbkl = tf.reduce_mean(dbkloldnew) obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(lr)), name='stepsize') inputs, loss, loss_sampled = policy.update_info var_list = [v for v in tf.global_variables() if "pi" in v.name] db_var_list = [v for v in tf.global_variables() if "dbp" in v.name] old_var_list = [v for v in tf.global_variables() if "oe" in v.name] db_old_var_list = [v for v in tf.global_variables() if "doi" in v.name] print(len(var_list), len(db_var_list), len(old_var_list), len(db_old_var_list)) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(old_var_list, var_list) ]) assign_db = U.function( [], [], updates=[ tf.assign(db, o) for (db, o) in zipsame(db_var_list, var_list) ] + [ tf.assign(dbold, dbnew) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list) ]) assign_old_eq_newr = U.function( [], [], updates=[ tf.assign(newv, oldv) for (oldv, newv) in zipsame(old_var_list, var_list) ]) # assign_dbr = U.function([], [], updates= # [tf.assign(o, db) for (db, o) in zipsame(db_var_list, var_list)] + # [tf.assign(dbnew, dbold) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)]) klgrads = tf.gradients(dist, var_list) dbklgrads = tf.gradients(dbkl, db_var_list) p_grads = [tf.ones_like(v) for v in dbklgrads] get_flat = U.GetFlat(var_list) get_old_flat = U.GetFlat(old_var_list) set_from_flat = U.SetFromFlat(var_list) flat_tangent2 = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan2") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents2 = [] for shape in shapes: sz = U.intprod(shape) tangents2.append(tf.reshape(flat_tangent2[start:start + sz], shape)) start += sz gvp2 = tf.add_n([ tf.reduce_sum(g * tangent2) for (g, tangent2) in zipsame(dbklgrads, tangents2) ]) gvp2_grads = tf.gradients(gvp2, db_var_list) neg_term = tf.add_n([ tf.reduce_sum(g * tangent2) for (g, tangent2) in zipsame(gvp2_grads, tangents2) ]) / 2. 
ng1 = tf.gradients(neg_term, db_var_list) ng2 = tf.gradients(neg_term, db_old_var_list) neg_term_grads = [ a + b for (a, b) in zip(tf.gradients(neg_term, db_var_list), tf.gradients(neg_term, db_old_var_list)) ] neg_term = neg_term_grads # neg_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in neg_term_grads]) pos_term = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(gvp2_grads, p_grads) ]) pos_term_grads = [ a + b for (a, b) in zip(tf.gradients(pos_term, db_var_list), tf.gradients(pos_term, db_old_var_list)) ] pos_term_sum = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(pos_term_grads, tangents2) ]) pos_term_grads = tf.gradients(pos_term_sum, p_grads) pos_term = pos_term_grads # pos_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in pos_term_grads]) geo_term = [(p - n) * 0.5 for p, n in zip(pos_term, neg_term)] optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=momentum, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) grads = optim.compute_gradients(loss, var_list=pi_var_list) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) geo_term = [g1 + g2[0] for g1, g2 in zip(geo_term, grads)] geo_grads = list(zip(geo_term, var_list)) update_geo_op, q_runner_geo = optim.apply_gradients(geo_grads) do_update = U.function(inputs, update_op) inputs_tangent = list(inputs) + [flat_tangent2] do_update_geo = U.function(inputs_tangent, update_geo_op) do_get_geo_term = U.function(inputs_tangent, [ng1, ng2]) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner, q_runner_geo]: assert (qr != None) enqueue_threads.extend( qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) assign_old_eq_new() # set old parameter values to new parameter values assign_db() # Policy update do_update(ob_no, action_na, standardized_adv_n) # ft2 = get_flat() - get_old_flat() # assign_old_eq_newr() # assign back # gnp = do_get_geo_term(ob_no, action_na, standardized_adv_n, ft2) # def check_nan(bs): # return 
[~np.isnan(b).all() for b in bs] # print(gnp[0]) # print('.....asdfasdfadslfkadsjfaksdfalsdkfjaldskf') # print(gnp[1]) # do_update_geo(ob_no, action_na, standardized_adv_n, ft2) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) # if kl > desired_kl * 2: # logger.log("kl too high") # tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() # elif kl < desired_kl / 2: # logger.log("kl too low") # tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() # else: # logger.log("kl just right!") logger.record_tabular( "EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular( "EpRewSEM", np.std([ path["reward"].sum() / np.sqrt(len(paths)) for path in paths ])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) print(do_std()) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
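# --- Illustrative sketch (not part of the original module) ---
# The geometric-term construction above repeatedly uses the double-backprop trick: contract a
# gradient with a fixed tangent vector, then differentiate the resulting scalar again to get a
# Hessian-vector product without ever forming the Hessian. The sketch below shows the pattern
# on a tiny quadratic f(x) = 0.5 * x^T A x, whose exact Hessian-vector product is A v.
# It assumes the TF1-style graph API via tf.compat.v1, as used elsewhere in this codebase.
import numpy as np
import tensorflow as tf

tf1 = tf.compat.v1
tf1.disable_eager_execution()

a_np = np.array([[2.0, 0.5], [0.5, 1.0]], dtype=np.float32)          # symmetric A
x = tf1.get_variable("x_hvp_demo", initializer=np.array([1.0, -2.0], dtype=np.float32))
v = tf1.placeholder(tf.float32, shape=[2], name="tangent")

f = 0.5 * tf.reduce_sum(x * tf.linalg.matvec(a_np, x))               # f(x) = 0.5 x^T A x
(grad,) = tf1.gradients(f, [x])                                      # df/dx = A x
gvp = tf.reduce_sum(grad * v)                                        # scalar <grad, v>
(hvp,) = tf1.gradients(gvp, [x])                                     # d<grad, v>/dx = A v

with tf1.Session() as sess:
    sess.run(tf1.global_variables_initializer())
    print(sess.run(hvp, feed_dict={v: np.array([1.0, 0.0], dtype=np.float32)}))  # expect [2.0, 0.5]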
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002,
          save_model_with_prefix=None, restore_model_from_file=None,
          outdir="/tmp/rosrl/experiments/continuous/acktr/"):
    obfilter = ZFilter(env.observation_space.shape)
    # Risto change
    max_pathlength = env.max_episode_steps

    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    """
    Here we add a possibility to resume from a previously saved model if a model file is provided
    """
    if restore_model_from_file:
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), restore_model_from_file)
        logger.log("Loaded model from {}".format(restore_model_from_file))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    if save_model_with_prefix:
        # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/'
        summary_writer = tf.summary.FileWriter(outdir, graph=tf.get_default_graph())

    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular("EpRewMean",
                              np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular("EpRewSEM",
                              np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()

        """
        Save the model at every iteration
        """
        if save_model_with_prefix:
            if np.mean([path["reward"].sum() for path in paths]) > -50.0:
                # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/'
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="EpRewMean",
                                     simple_value=np.mean([path["reward"].sum() for path in paths]))
                ])
                summary_writer.add_summary(summary, i)
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                modelF = outdir + '/' + save_model_with_prefix + "_afterIter_" + str(i) + ".model"
                U.save_state(modelF)
                logger.log("Saved model to file: {}".format(modelF))
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
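# --- Illustrative sketch (not part of the original module) ---
# The ROS variant above logs EpRewMean to TensorBoard with a manually built Summary proto and
# checkpoints the graph once the mean return clears a threshold. The minimal sketch below
# reproduces that pattern with tf.compat.v1; outdir and the -50.0 threshold mirror the values
# used above, while save_if_good is a hypothetical helper name.
import os
import numpy as np
import tensorflow as tf

tf1 = tf.compat.v1
tf1.disable_eager_execution()

def save_if_good(session, saver, writer, ep_rew_mean, iteration, outdir, threshold=-50.0):
    """Write a scalar summary and checkpoint the session when the return passes the threshold."""
    if ep_rew_mean > threshold:
        summary = tf1.Summary(value=[tf1.Summary.Value(tag="EpRewMean", simple_value=ep_rew_mean)])
        writer.add_summary(summary, iteration)
        os.makedirs(outdir, exist_ok=True)
        saver.save(session, os.path.join(outdir, "model_afterIter_%d.ckpt" % iteration))

if __name__ == "__main__":
    outdir = "/tmp/acktr_summary_sketch"
    os.makedirs(outdir, exist_ok=True)
    _ = tf1.get_variable("dummy_param", initializer=np.float32(0.0))  # something to checkpoint
    saver = tf1.train.Saver()
    writer = tf1.summary.FileWriter(outdir, graph=tf1.get_default_graph())
    with tf1.Session() as sess:
        sess.run(tf1.global_variables_initializer())
        save_if_good(sess, saver, writer, ep_rew_mean=-10.0, iteration=0, outdir=outdir)
    writer.close()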
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
          animate=False, callback=None, desired_kl=0.002, fname='./training.ckpt'):
    mean_logger = setup_logger("Mean Logger", "log/episode_mean.txt")
    # print("Filter shape: ", env.observation_space.shape)
    space = (env.observation_space.shape[0] * 2, )
    obfilter = ZFilter(space)
    max_pathlength = env.spec.timestep_limit

    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize')  # 0.03
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9,
                               kfac_update=2, epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                               weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    # changes: resume from a previously saved checkpoint if one exists
    if fname is not None and tf.train.checkpoint_exists(fname):
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), fname)
        logger.log("Model loaded from file {}".format(fname))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None, "QR is None"
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    total_reward = float()
    while True:
        print("Timestep Number: %d of %d" % (timesteps_so_far, num_timesteps))
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Save model every 100 iterations
        if fname is not None and (i % 100 == 0):
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)
            logger.log("Model saved to file {}".format(fname))
            env.seed()

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        terminal_rew = []
        while True:
            path, temp_rew = rollout(env, policy, max_pathlength,
                                     animate=(len(paths) == 0 and (i % 10 == 0) and animate),
                                     obfilter=obfilter)
            paths.append(path)
            terminal_rew.append(np.array(temp_rew))
            n = pathlength(path)
            timesteps_this_batch += n
            if timesteps_this_batch > timesteps_per_batch:
                break
            timesteps_so_far += 1

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        terminal_rew = np.array(terminal_rew)
        rew_mean = np.mean([path.sum() for path in terminal_rew])
        rew_sem = np.std([path.sum() / np.sqrt(len(terminal_rew)) for path in terminal_rew])
        len_mean = np.mean([path.shape[0] for path in terminal_rew])

        # rewList = []
        # for path in paths:
        #     trew = []
        #     rew_i = 0
        #     while True:
        #         trew.append(path["reward"][rew_i])
        #         rew_i += 11
        #         if rew_i > (len(path["reward"]) - 1):
        #             break
        #     rewList.append(np.array(trew))
        # rewList = np.array(rewList)
        # rew_mean = np.mean([path.sum() for path in rewList])
        # rew_sem = np.std([path.sum()/np.sqrt(len(rewList)) for path in rewList])
        # len_mean = np.mean([path.shape[0] for path in rewList])
        # rew_mean = np.mean([path["reward"].sum() for path in paths])
        # rew_sem = np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])
        # len_mean = np.mean([pathlength(path) for path in paths])

        total_reward += rew_mean
        logger.record_tabular("EpRewMean", rew_mean)
        logger.record_tabular("EpRewSEM", rew_sem)
        logger.record_tabular("EpLenMean", len_mean)
        logger.record_tabular("TotalRewardMean", total_reward)
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        mean_logger.info(
            "Result for episode {} of {}: Sum: {}, Average: {}, Length: {}".format(
                timesteps_so_far, num_timesteps, rew_mean, rew_sem, len_mean))
        i += 1

    if fname is not None:
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), fname)
        logger.log("Model saved to file {}".format(fname))
        env.seed()

    coord.request_stop()
    coord.join(enqueue_threads)
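# --- Illustrative sketch (not part of the original module) ---
# The evaluation scripts above restore observation-normalisation statistics from a
# filter_stats_*.npz file into obfilter.rs._n / _M / _S. The minimal running-statistics filter
# below mimics that interface (Welford-style count, mean, and sum of squared deviations) so the
# save/load round trip can be seen end to end; it is a stand-in for the repo's ZFilter, whose
# exact implementation may differ.
import numpy as np

class RunningStat:
    def __init__(self, shape):
        self._n = 0
        self._M = np.zeros(shape)  # running mean
        self._S = np.zeros(shape)  # running sum of squared deviations

    def push(self, x):
        self._n += 1
        if self._n == 1:
            self._M[...] = x
        else:
            old_m = self._M.copy()
            self._M[...] = old_m + (x - old_m) / self._n
            self._S[...] = self._S + (x - old_m) * (x - self._M)

    @property
    def std(self):
        var = self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
        return np.sqrt(var)

class SimpleZFilter:
    """Normalise observations to roughly zero mean and unit variance."""
    def __init__(self, shape):
        self.rs = RunningStat(shape)

    def __call__(self, x):
        self.rs.push(x)
        return (x - self.rs._M) / (self.rs.std + 1e-8)

if __name__ == "__main__":
    obfilter = SimpleZFilter((3,))
    for ob in np.random.randn(100, 3):
        obfilter(ob)
    # save the filter statistics, mirroring the filter_stats_*.npz files loaded above
    np.savez("/tmp/filter_stats_demo.npz", n=obfilter.rs._n, M=obfilter.rs._M, S=obfilter.rs._S)

    stats = np.load("/tmp/filter_stats_demo.npz")
    restored = SimpleZFilter((3,))
    restored.rs._n = int(stats["n"])
    restored.rs._M = stats["M"]
    restored.rs._S = stats["S"]
    print(restored(np.zeros(3)))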