def aux(nom1, nom2, g, e, sequences, tree):
    if nom1 not in tree:  # nom1 is a leaf
        seq1 = [sequences[int(nom1)]]
    else:
        g1, g2 = tree[nom1][0], tree[nom1][1]
        seq1 = aux(g1, g2, g, e, sequences, tree)
    if nom2 not in tree:  # nom2 is a leaf
        seq2 = [sequences[int(nom2)]]
    else:
        g1, g2 = tree[nom2][0], tree[nom2][1]
        seq2 = aux(g1, g2, g, e, sequences, tree)
    traceback = TD3.NW_affine_multi(seq1, seq2, g, e, f)[1]
    l1, l2 = TD3.affiche_multi(seq1, seq2, traceback)
    alignments = l1 + l2
    return alignments
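The scoring function f passed to TD3.NW_affine_multi above is defined outside this snippet; a later fragment on this page wraps TD3.cout_blosum, so a minimal sketch of the wiring (an assumption about the surrounding code) is:

import TD3

def make_scorer(g):
    # f scores a pair of residues with the BLOSUM-based cost from the TD3 module.
    def f(a, b):
        return TD3.cout_blosum(a, b, g=g)
    return f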
Example #2
def test(test_epoch):
    env = gym.make('FetchReach-v1')
    
    #state_dim = env.observation_space.shape[0]
    state_dim = env.observation_space["desired_goal"].shape[0] + env.observation_space["observation"].shape[0]
    action_dim = env.action_space.shape[0] 
    max_action = float(env.action_space.high[0])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    policy = TD3.TD3(state_dim, action_dim, max_action)
    #loadModel(policy,"models/model_36021.pt")
    filename = 'TD3_FetchReach-v1_311204_500000.0'
    policy.load(filename, './pytorch_models')

    for _ in range(1000):

        tr_obser = env.reset()
        total_reward = 0
        step_count = 0

        while True:
            env.render()

            tr_obser = np.concatenate((tr_obser["observation"], tr_obser["desired_goal"]), axis=0)
            tr_action = policy.select_action(tr_obser)
            tr_obser, tr_reward, is_terminal, _ = env.step(tr_action)

            total_reward += tr_reward
            step_count += 1

            if is_terminal:
                break
    env.close()
    return
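A small helper, sketched here for clarity (it is not part of the original snippet), that makes the dict-to-vector conversion in the loop above explicit; the concatenation order is an assumption and must match whatever ordering the policy was trained with.

import numpy as np

def flatten_goal_obs(obs_dict):
    # Same ordering as the loop above: observation first, then desired_goal.
    return np.concatenate((obs_dict["observation"], obs_dict["desired_goal"]), axis=0)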
Example #3
def play(env):
    p_td3 = TD3.TD3(env.observation_space, env.action_space, 1)
    p_td3.load("TD3_Tennis_12", directory="pytorch_models")
    p_ddpg = mDDPG.DDPG(env.observation_space, env.action_space, 1)
    p_ddpg.load("mDDPG_Tennis_12", directory="pytorch_models")
    policies = [p_ddpg, p_td3]

    scores = []
    for _ in range(100):
        obs = env.reset(train_mode=True)
        done = False
        episode_score = np.zeros(env.num_agents, dtype=np.float64)
        while not done:
            action = [
                policy.select_action(np.array(observation))
                for policy, observation in zip(policies, obs)
            ]
            obs, reward, done = env.step(action)
            episode_score += reward
            done = np.any(done)
        print("Scored: {:.2f} {:.2f}".format(episode_score[0],
                                             episode_score[1]))
        scores.append(episode_score.max())
    scores = np.array(scores)
    print("Mean score {:0.2f} over {}".format(scores.mean(), len(scores)))
Example #4
def eval(args):
    file_name = f"{args.policy}_{args.domain_name}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.domain_name}, Seed: {args.seed}")
    print("---------------------------------------")

    log_path = safe_path(
        os.path.join(args.log_root,
                     '{}_{}_base'.format(args.domain_name, args.task_name)))
    result_path = safe_path(os.path.join(log_path, 'results'))
    model_path = safe_path(os.path.join(log_path, 'models_bak'))

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=0,
                       visualize_reward=False,
                       from_pixels=False,
                       height=256,
                       width=256,
                       frame_skip=args.frame_skip)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(os.path.join(model_path, '{}'.format(policy_file)))

    # Evaluate untrained policy
    eval_policy(policy, env, args.seed)
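safe_path is called above but not shown; a plausible minimal implementation (an assumption, not the project's actual helper) simply creates the directory if it does not exist and returns it.

import os

def safe_path(path):
    # Create the directory tree if it is missing, then hand the path back.
    os.makedirs(path, exist_ok=True)
    return path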
def main():
    """
    Train the TD3 actor-critic model.
    :return:
    """
    # 1. Create the environment
    env = gym.make('Pendulum-v0')
    # 2. Make the experiment reproducible
    env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    tf.random.set_seed(RANDOMSEED)
    # 3. Train with TD3
    td3 = TD3.TD3(env)
    td3.train()
def run_policy(env_name):
    test_rewards = []
    # the env_name argument is overridden here with a hard-coded environment
    # env_name = "MountainCarContinuous-v0"
    env_name = "ContinuousCartPoleEnv"
    random_seed = 0
    n_episodes = 10
    lr = 0.002
    max_timesteps = 2000
    render = True
    save_gif = True

    filename = "TD3_{}_{}".format(env_name, random_seed)
    filename += '_solved'

    env = gym.make(env_name)
    directory = "./preTrained/"+str(env_name)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    policy = TD3(lr, state_dim, action_dim, max_action)

    policy.load_actor(directory, filename)

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = policy.select_action(state)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
                if save_gif:
                    img = env.render(mode='rgb_array')
                    img = Image.fromarray(img)
                    img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        test_rewards.append(ep_reward)
        print('Evaluation Episode: {}\tEpisode Reward: {}'.format(ep, int(ep_reward)))
        #ep_reward = 0
    # close the environment once, after all evaluation episodes
    env.close()
    plot(test_rewards)
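plot is also not shown in this snippet; a minimal matplotlib stand-in (an assumption about what the original helper does):

import matplotlib.pyplot as plt

def plot(rewards):
    # One point per evaluation episode.
    plt.plot(range(1, len(rewards) + 1), rewards)
    plt.xlabel("Evaluation episode")
    plt.ylabel("Episode reward")
    plt.show()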
Example #7
def eval(args):
    file_name = f"{args.policy}_{args.env}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
    print("---------------------------------------")

    log_path = safe_path(os.path.join(args.log_root, '{}_3leg'.format(args.env)))
    model_path = safe_path(os.path.join(log_path, 'models'))

    env = gym.make(args.env)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(os.path.join(model_path, '{}'.format(policy_file)))

    # Evaluate untrained policy
    imgpath = safe_path(os.path.join(model_path, 'eval_imgs'))
    eval_policy(policy, args.env, args.seed, 100, model_path, imgpath)
def aux(nom1, nom2, g, e, sequences, tree):
    if nom1 not in tree:  # nom1 is a leaf
        seq1 = [sequences[int(nom1)]]
        desc1 = [desc[int(nom1)]]
    else:
        g1, g2 = tree[nom1][0], tree[nom1][1]
        seq1, desc1 = aux(g1, g2, g, e, sequences, tree)
    if nom2 not in tree:  # nom2 is a leaf
        seq2 = [sequences[int(nom2)]]
        desc2 = [desc[int(nom2)]]
    else:
        g1, g2 = tree[nom2][0], tree[nom2][1]
        seq2, desc2 = aux(g1, g2, g, e, sequences, tree)
    traceback = NW_affine_multi_structure(seq1, seq2, g, e,
                                          cout_structural, desc)[1]
    l1, l2 = TD3.affiche_multi(seq1, seq2, traceback)
    alignments = l1 + l2
    new_desc = desc1 + desc2
    return alignments, new_desc
Example #9
File: show.py Project: llfl/TD3
def moduleShow(args):
    env = gym.make(args.env_name)
    state_dim = env.observation_space["observation"].shape[
        0] + env.observation_space["desired_goal"].shape[0]
    #state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]
    obs = env.reset()
Example #10
def play():
    unity = UnityEnvironment(file_name=executable(), no_graphics=False)
    env = UnityWrapper(unity, train_mode=False)

    policy = TD3.TD3(env.observation_space, env.action_space, 1)
    policy.load("TD3_Reacher-v2_3", directory="pytorch_models")

    for _ in range(5):
        obs = env.reset()
        done = False
        episode_score = np.zeros(20, dtype=np.float64)
        while not done:
            action = [
                policy.select_action(np.array(observation))
                for observation in obs
            ]
            obs, reward, done, _ = env.step(action)
            episode_score += reward
            done = np.any(done)
        print("Scored: {:.2f}".format(episode_score.mean()))

    unity.close()
Example #11
    env = gym.make(args.env_name)
    eval_env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy and buffer
    policy = TD3.TD3(state_dim,
                     action_dim,
                     max_action,
                     device,
                     K=args.K,
                     rpf=args.rpf)
    if args.priority:
        replay_buffer = utils.PriorityReplayBuffer(
            timesteps=args.max_timesteps,
            alpha=args.alpha,
            beta=args.beta,
            eps=args.eps)
    else:
        replay_buffer = utils.ReplayBuffer()
    total_timesteps = 0
    total_episodes = 0
    episode_timesteps = 0
    done = True
Example #12
	if args.save_models and not os.path.exists("./pytorch_models"):
		os.makedirs("./pytorch_models")

	env = gym.make(args.env_name)

	# Set seeds
	env.seed(args.seed)
	torch.manual_seed(args.seed)
	np.random.seed(args.seed)
	
	state_dim = env.observation_space.shape[0]
	action_dim = env.action_space.shape[0] 
	max_action = float(env.action_space.high[0])

	# Initialize policy
	if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action)
	elif args.policy_name == "OurDDPG": policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
	elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action)

	replay_buffer = utils.ReplayBuffer()
	
	# Evaluate untrained policy
	evaluations = [evaluate_policy(policy)] 

	total_timesteps = 0
	timesteps_since_eval = 0
	episode_num = 0
	done = True 

	while total_timesteps < args.max_timesteps:
		
Example #13
#if not os.path.exists("./data"):
#        os.makedirs("./data")

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action, args.seed)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action, args.seed)

    # Load model
    policy.load(filename, './pytorch_models/')
    #policy.load(filename, './pre_models/')

    # Start evaluation
    _ = evaluate_policy(policy,
                        filename,
                        eval_episodes=args.eval_episodes,
                        visualize=args.visualize)
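filename is referenced above but defined outside this fragment; elsewhere on this page checkpoints follow a "<policy>_<env>_<seed>" pattern, so a reasonable stand-in (a hypothetical reconstruction, not the original definition) is:

# Hypothetical reconstruction of the checkpoint name used by policy.load above.
filename = "%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed))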
Example #14
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": config['gamma'],
    "tau": config['tau'],
    "lr": config['lr'],
    "hidden_size": config['hidden_size'],
    'cuda': config['cuda'],
    'parameter_noise_mean': config['param_noise_mean'],
    'parameter_noise_std': config['param_noise_std']
}

# Target policy smoothing is scaled wrt the action scale
kwargs["policy_noise"] = config['policy_noise'] * max_action
kwargs["noise_clip"] = config['noise_clip'] * max_action
kwargs["policy_freq"] = config['policy_freq']
agent = TD3.TD3(**kwargs)

# Memory
device = torch.device('cuda:' +
                      str(config['cuda'])) if torch.cuda.is_available(
                      ) and config['cuda'] >= 0 else torch.device('cpu')
memory = ReplayMemory(state_dim, action_dim, config['replay_size'], device)

# Training Loop
total_numsteps = 0
updates = 0

# make model save path
from os import path
import time
current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
Example #15
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)

    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            eval_env.render('rgb_array')
            avg_reward += reward

    avg_reward /= eval_episodes
    eval_env.close()  #VVI

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward


env_name = "Pendulum-v0"
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
model = TD3.TD3(state_dim, action_dim, max_action)
path = "models\TD3_Pendulum-v0"
model.load(path)

eval_policy(model, env_name, 0, 1)
        env = dm_control2gym.make(domain_name=domain, task_name=task)
        env_max_steps = 1000
    else:
        env = gym.make(args.env_name)
        env_max_steps = env._max_episode_steps

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    if args.policy_name == 'TD3':
        policy = TD3.load('policy', 'results/{}'.format(args.name))
    elif args.policy_name == 'EmbeddedTD3':
        policy = EmbeddedTD3.load('policy', 'results/{}'.format(args.name))
    elif args.policy_name == 'random':
        if args.decoder:
            decoder = load_decoder(args.env_name, args.decoder)
            policy = RandomEmbeddedPolicy(1, decoder, None)
        elif args.dummy_decoder:
            decoder = DummyDecoder(action_dim, args.dummy_traj_len,
                                   env.action_space)
            policy = RandomEmbeddedPolicy(1, decoder, 1)
        else:
            policy = RandomPolicy(env.action_space)
    elif args.policy_name == 'constant':
        policy = ConstantPolicy(env.action_space)
    else:
Example #17
def main(args):
        file_name = f"{args.policy}_{args.env}_{args.seed}"
        print("---------------------------------------")
        print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
        print("---------------------------------------")

        log_path = safe_path(os.path.join(args.log_root, '{}_base'.format(args.env)))
        result_path = safe_path(os.path.join(log_path, 'results'))
        model_path = safe_path(os.path.join(log_path, 'models'))
        
        '''
        ### s2r hacks
        s2r_parser = argparse.ArgumentParser()
        s2r_parser.add_argument("--encoder_type", default="mlp")
        s2r_parser.add_argument("--end_effector", default=True)
        s2r_parser.add_argument("--screen_width", type=int, default=480)
        s2r_parser.add_argument("--screen_height", type=int, default=480)
        s2r_parser.add_argument("--action_repeat", type=int, default=1)
        s2r_parser.add_argument("--puck_friction", type=float, default=2.0)
        s2r_parser.add_argument("--puck_mass", type=float, default=0.01)
        s2r_parser.add_argument("--unity",  default=False)
        s2r_parser.add_argument("--unity_editor", default=False)
        s2r_parser.add_argument("--virtual_display",  default=None)
        s2r_parser.add_argument("--port", default=1050)
        s2r_parser.add_argument("--absorbing_state", default=False)
        s2r_parser.add_argument("--dr", default=False)
        s2r_parser.add_argument("--env", default=None)
        s2r_args = s2r_parser.parse_args()
        import ipdb;ipdb.set_trace()
        env = make_s2r_env(args.env, s2r_args, env_type="real")
        '''
        env = gym.make(args.env)
        if "SawyerPush" in args.env:
            env = SawyerECWrapper(env, args.env)
            env._max_episode_steps = 70
        # Set seeds
        env.seed(args.seed)
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

        try:
            state_dim = env.observation_space.shape[0]
        except:
            state_dim = 16 #env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        kwargs = {
                "state_dim": state_dim,
                "action_dim": action_dim,
                "max_action": max_action,
                "discount": args.discount,
                "tau": args.tau,
        }

        # Initialize policy
        if args.policy == "TD3":
                # Target policy smoothing is scaled wrt the action scale
                kwargs["policy_noise"] = args.policy_noise * max_action
                kwargs["noise_clip"] = args.noise_clip * max_action
                kwargs["policy_freq"] = args.policy_freq
                policy = TD3.TD3(**kwargs)

        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

        # Evaluate untrained policy
        evaluations = [eval_policy(policy, args.env, args.seed)]

        state, done = env.reset(), False
        episode_reward = 0
        episode_timesteps = 0
        episode_num = 0
        success = False
        reach_reward = 0
        push_reward = 0
        cylinder_to_target = 100
        for t in range(int(args.max_timesteps)):
                state = flatten_state(state)
                episode_timesteps += 1

                # Select action randomly or according to policy
                if t < args.start_timesteps:
                        action = env.action_space.sample()
                else:
                        action = (
                                        policy.select_action(np.array(state))
                                        + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
                        ).clip(-max_action, max_action)

                # Perform action
                next_state, reward, done, info = env.step(action)
                next_state = flatten_state(next_state)
                done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

                if ("first_success" in info.keys() and info["first_success"]):
                    success = True

                # reach_reward += info["reward_reach"]
                # push_reward += info["reward_push"]
                # cylinder_to_target = min(cylinder_to_target, info["cylinder_to_target"])

                # Store data in replay buffer
                replay_buffer.add(state, action, next_state, reward, done_bool)

                state = next_state
                episode_reward += reward

                # Train agent after collecting sufficient data
                if t >= args.start_timesteps:
                        policy.train(replay_buffer, args.batch_size)

                if done:
                        # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
                        # reach_reward /= episode_timesteps
                        # push_reward /= episode_timesteps
                        #  Reach Reward: {reach_reward:.3f} Push Reward: {push_reward:.3f} cylinder_to_target: {cylinder_to_target:.3f}
                        print(
                                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Success: {success}")
                        # Reset environment
                        success = False
                        state, done = env.reset(), False
                        episode_reward = 0
                        reach_reward, push_reward = 0, 0
                        cylinder_to_target = 100
                        episode_timesteps = 0
                        episode_num += 1

                # Evaluate episode
                if (t + 1) % args.eval_freq == 0:
                        evaluations.append(eval_policy(policy, args.env, args.seed))
                        np.save(os.path.join(result_path, '{}'.format(file_name)), evaluations)
                        if args.save_model: policy.save(os.path.join(model_path, '{}'.format(file_name)))
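flatten_state is called throughout the loop above but not defined in the snippet; a plausible minimal version (an assumption) passes plain arrays through and flattens goal-based dict observations.

import numpy as np

def flatten_state(state):
    # Goal-based envs return dict observations; flatten them into one vector.
    if isinstance(state, dict):
        return np.concatenate([np.ravel(state["observation"]),
                               np.ravel(state["desired_goal"])])
    return np.ravel(state)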
Example #18
    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": float(args.tau),
        "learning_rate": float(args.learning_rate),
        "epsilon": float(epsilon)
    }

    # Initialize policy

    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    if args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    if args.policy == "newDDPG":
        policy = newDDPG.DDPG(**kwargs)
    if args.policy == "newTD3":
        policy = newTD3.TD3(**kwargs)
    if args.policy == "A2C":
        policy = A2C.A2C(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim=action_dim)
Example #19
def main():
	
	parser = argparse.ArgumentParser()
	parser.add_argument("--policy_name", default="TD3")							# Policy name
	parser.add_argument("--env_name", default="Pendulum-v0")					# OpenAI gym environment name
	parser.add_argument("--replay_buffer", default="prioritized")				# Replay Buffer type
	parser.add_argument("--replay_buffer_size", default=5e4, type=int)			# Replay Buffer capacity
	parser.add_argument("--replay_buffer_alpha", default=0.6, type=float)		# Replay Buffer prioritization weight
	parser.add_argument("--seed", default=0, type=int)							# Sets Gym, PyTorch and Numpy seeds
	parser.add_argument("--start_timesteps", default=1e4, type=int)				# How many time steps purely random policy is run for
	parser.add_argument("--eval_freq", default=1e3, type=float)					# How often (time steps) we evaluate
	parser.add_argument("--max_timesteps", default=5e4, type=float)				# Max time steps to run environment for
	parser.add_argument("--save_models", default="True", type=bool)				# Whether or not models are saved
	parser.add_argument("--expl_noise", default=0.1, type=float)				# Std of Gaussian exploration noise
	parser.add_argument("--batch_size", default=100, type=int)					# Batch size for both actor and critic
	parser.add_argument("--discount", default=0.99, type=float)					# Discount factor
	parser.add_argument("--tau", default=0.005, type=float)						# Target network update rate
	parser.add_argument("--policy_noise", default=0.2, type=float)				# Noise added to target policy during critic update
	parser.add_argument("--noise_clip", default=0.5, type=float)				# Range to clip target policy noise
	parser.add_argument("--policy_freq", default=2, type=int)					# Frequency of delayed policy updates
	parser.add_argument("--lr_actor", default=0.001, type=float)				# Learning rate of actor
	parser.add_argument("--lr_critic", default=0.001, type=float)				# Learning rate of critic
	parser.add_argument("--prioritized_replay_eps", default=1e-3, type=float)	# Replay Buffer epsilon (PRE)
	parser.add_argument("--prioritized_replay_beta0", default=0.4, type=float)	# Replay Buffer initial beta (PRE)
	args = parser.parse_args()

#Training kwargs
	kwargs = {  "policy_name": args.policy_name,
				"env_name": args.env_name,
				"replay_buffer": args.replay_buffer,
				"replay_buffer_size": args.replay_buffer_size,
				"replay_buffer_alpha": args.replay_buffer_alpha,
				"seed": args.seed,
				"start_timesteps": args.start_timesteps,
				"eval_freq": args.eval_freq,
				"max_timesteps": args.max_timesteps,
				"save_models": args.save_models,
				"expl_noise": args.expl_noise,
				"batch_size": args.batch_size,
				"discount": args.discount,
				"tau": args.tau,
				"policy_noise": args.policy_noise,
				"noise_clip": args.noise_clip,
				"policy_freq": args.policy_freq,
				"lr_actor": args.lr_actor,
				"prioritized_replay_eps": args.prioritized_replay_eps,
				"prioritized_replay_beta0": args.prioritized_replay_beta0
         }

	# cls
	os.system('cls' if os.name == 'nt' else 'clear')

	if not os.path.exists("./results"):
    		os.makedirs("./results")
	if args.save_models and not os.path.exists("./pytorch_models"):
		os.makedirs("./pytorch_models")

	# Time stamp for repeated test names
	ts = time.time()
	ts = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H-%M-%S')

	test_name = "%s_%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed), ts)
	plot_name = "%s_%s_%s_%s_plot.png" % (args.policy_name, args.env_name, str(args.seed), ts)
	kwargs_name = "%s_%s_%s_%s_kwargs.csv" % (args.policy_name, args.env_name, str(args.seed), ts)
	scores_name = "%s_%s_%s_%s_scores.csv" % (args.policy_name, args.env_name, str(args.seed), ts)

	print("---------------------------------------")
	print("Settings: %s" % (test_name))
	utils.save_kwargs(kwargs, "./results/%s" % (kwargs_name))
	print("---------------------------------------")

	# Environment and Agent instantiation

	env = gym.make(args.env_name)

	# Set seeds
	env.seed(args.seed)
	torch.manual_seed(args.seed)
	np.random.seed(args.seed)
	
	state_dim = env.observation_space.shape[0]
	action_dim = env.action_space.shape[0] 
	max_action = float(env.action_space.high[0])

	# Instantiate Replay Buffer	
	if args.replay_buffer == "vanilla": 
		replay_buffer = rb.ReplayBuffer(size = args.replay_buffer_size)
		PER = False
	elif args.replay_buffer == "prioritized": 
		replay_buffer = rb.PrioritizedReplayBuffer(size = int(np.round(np.sqrt(args.replay_buffer_size))), 
												   alpha = args.replay_buffer_alpha)
		PER = True
		prioritized_replay_beta_iters = args.max_timesteps
		prioritized_replay_beta0 = args.prioritized_replay_beta0
		beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p = prioritized_replay_beta0,
                                       final_p = 1.0)

	# Instantiate policy
	if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps)
	elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action, args.lr_actor, args.lr_critic, PER, args.prioritized_replay_eps)

	# Evaluate untrained policy
	evaluations = [evaluate_policy(env, policy)] 

	# Training loop #######################################

	total_timesteps = 0
	timesteps_since_eval = 0
	episode_num = 0
	episode_rewards = []
	done = True 

	while total_timesteps < args.max_timesteps:
		
		if done: 

			if total_timesteps != 0: 
				print('Total T: {} Episode Num: {} Episode T: {} Reward: {}'.format(total_timesteps, episode_num, episode_timesteps, episode_reward))
				episode_rewards.append(episode_reward)
				
				# PER Beta scheduled update 
				if PER: beta = beta_schedule.value(total_timesteps)
				else: beta = 0.
				# Policy update step
				if args.policy_name == "TD3":
					policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau, args.policy_noise, args.noise_clip, args.policy_freq, beta)
				else: 
					policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau, beta)
			
			# Evaluate episode
			if timesteps_since_eval >= args.eval_freq:
				timesteps_since_eval %= args.eval_freq
				evaluations.append(evaluate_policy(env, policy))
				
				# save evaluation
				#if args.save_models: policy.save(test_name, directory="./pytorch_models")
				#np.save("./results/%s" % (test_name), evaluations) 
			
			# Reset environment
			obs = env.reset()
			done = False
			episode_reward = 0
			episode_timesteps = 0
			episode_num += 1 
		
		# Select action randomly or according to policy
		if total_timesteps < args.start_timesteps:
			action = env.action_space.sample()
		else:
			action = policy.select_action(np.array(obs))
			if args.expl_noise != 0: 
				action = (action + np.random.normal(0, args.expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

		# Perform action
		new_obs, reward, done, _ = env.step(action) 
		done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
		episode_reward += reward

		# Push experience into replay buffer
		experience = (obs, action, reward, new_obs, done_bool)
		replay_buffer.add(experience)

		obs = new_obs

		episode_timesteps += 1
		total_timesteps += 1
		timesteps_since_eval += 1
		
	# Final evaluation 
	evaluations.append(evaluate_policy(env, policy))
	
	# Save results
	if args.save_models: policy.save("%s" % (test_name), directory="./pytorch_models")
	#np.save("./results/%s" % (evaluations_file), evaluations)  
	#np.save("./results/%s" % ('rewards.txt'), episode_rewards) 
	utils.save_scores(episode_rewards, "./results/%s" % (scores_name))
	utils.plot(episode_rewards, "./results/%s" % (plot_name), 1)
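LinearSchedule drives the PER beta annealing above, but its import is not shown; a minimal sketch in the style of the OpenAI Baselines helper (an assumption about the original dependency):

class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Linearly interpolate from initial_p to final_p over schedule_timesteps.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)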
Example #20
def load_policy(load_from):
    # Initialize policy
    start_step = 0
    if args.policy == "TD3":
        import TD3
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * kwargs['max_action']
        kwargs["noise_clip"] = args.noise_clip * kwargs['max_action']
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        import OurDDPG
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        import DDPG
        policy = DDPG.DDPG(**kwargs)

    # create experiment directory (may not be used)
    exp_cnt = 0
    load_model_path = ''
    results_dir = os.path.join(args.savedir, args.exp_name+'%02d'%exp_cnt)
    while os.path.exists(results_dir):
        exp_cnt+=1
        results_dir = os.path.join(args.savedir, args.exp_name+'%02d'%exp_cnt)

    # load model if necessary
    if load_from != "":
        if os.path.isdir(load_from):
            print("loading latest model from dir: {}".format(load_from))
            # find last file
            search_path = os.path.join(load_from, '*.pt')
            model_files = glob(search_path)
            if not len(model_files):
                raise FileNotFoundError(
                    'could not find model exp files at {}'.format(search_path))
            else:
                load_model_path = sorted(model_files)[-1]
        else:
            load_model_path = load_from
            print("loading model from file: {}".format(load_model_path))
        policy.load(load_model_path)
        # TODO 
        # utils.load_info_dict(load_model_base)
        try:
            start_step = int(load_model_path[-13:-3])
        except:
            try:
                start_step = policy.step
            except:
                print('unable to get start step from name - set it manually')

        # store in old dir
        if not args.continue_in_new_dir:
            results_dir = os.path.split(load_model_path)[0]
            print("continuing in loaded directory")
            print(results_dir)
        else:
            print("resuming in new directory")
            print(results_dir)
    else:
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
            print('storing results in: {}'.format(results_dir))
    return policy, start_step, results_dir, load_model_path
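The int(load_model_path[-13:-3]) parse above assumes checkpoint files end in a zero-padded ten-digit step count before the ".pt" suffix, e.g. (hypothetical file name):

# 'model_0000500000.pt'[-13:-3] == '0000500000'  ->  start_step == 500000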
def f(a, b):
    return TD3.cout_blosum(a, b, g=g)
def experiment(variant):
    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {"state_dim": state_dim, "action_dim": action_dim, "max_action": max_action,
              "discount": variant['discount'], "tau": variant['tau'],
              'network_class': NETWORK_CLASSES[variant['network_class']]}

    # custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                  hidden_dim=variant['hidden_dim'],
                                  fourier_dim=variant['fourier_dim'],
                                  sigma=variant['sigma'],
                                  concatenate_fourier=variant['concatenate_fourier'],
                                  train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])
    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise * max_action']
        kwargs["noise_clip"] = variant['noise_clip * max_action']
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant['automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']
        # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [float(env.action_space.low.min()), float(env.action_space.high.max())]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = variant['critic_freq']
            kwargs['fr_weight'] = variant['fr_weight']
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        raise RuntimeError

    # load replay buffer
    replay_buffer = torch.load(os.path.join(variant['replay_buffer_folder'], 'generated_replay_buffer.pt'))

    policy_optimizer = torch.optim.Adam(policy.actor.parameters(), lr=variant['lr'])
    qf_optimizer = torch.optim.Adam(policy.critic.Q1.parameters(), lr=variant['lr'])

    # split into train and val for both action and q_value
    indices = np.arange(replay_buffer.max_size)
    random.shuffle(indices)
    train_indices = indices[:int(0.9 * len(indices))]
    val_indices = indices[int(0.9 * len(indices)):]
    train_dataset = torch.utils.data.TensorDataset(torch.tensor(replay_buffer.state[train_indices]).float(),
                                                   torch.tensor(replay_buffer.action[train_indices]).float(),
                                                   torch.tensor(replay_buffer.correct_action[train_indices]).float(),
                                                   torch.tensor(replay_buffer.q_value[train_indices]).float())
    val_dataset = torch.utils.data.TensorDataset(torch.tensor(replay_buffer.state[val_indices]).float(),
                                                 torch.tensor(replay_buffer.action[val_indices]).float(),
                                                 torch.tensor(replay_buffer.correct_action[val_indices]).float(),
                                                 torch.tensor(replay_buffer.q_value[val_indices]).float())

    # train a network on it
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=variant['batch_size'], shuffle=True,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=variant['batch_size'], shuffle=True,
                                             pin_memory=True)

    train_q_losses = []
    train_policy_losses = []
    val_q_losses = []
    val_policy_losses = []
    for _ in trange(variant['n_train_epochs']):
        total_q_loss = 0
        total_policy_loss = 0
        for (state, action, correct_action, q) in train_loader:
            state = state.to(DEVICE)
            action = action.to(DEVICE)
            correct_action = correct_action.to(DEVICE)
            q = q.to(DEVICE)
            q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
            policy_preds = policy.actor(state).mean
            q_loss = F.mse_loss(q_preds, q)
            policy_loss = F.mse_loss(policy_preds, correct_action)
            qf_optimizer.zero_grad()
            policy_optimizer.zero_grad()
            q_loss.backward()
            policy_loss.backward()
            qf_optimizer.step()
            policy_optimizer.step()
            total_q_loss += q_loss.item()
            total_policy_loss += policy_loss.item()

        # get validation stats
        total_val_q_loss = 0
        total_val_policy_loss = 0
        with torch.no_grad():
            for (state, action, correct_action, q) in val_loader:
                state = state.to(DEVICE)
                action = action.to(DEVICE)
                correct_action = correct_action.to(DEVICE)
                q = q.to(DEVICE)
                q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
                policy_preds = policy.actor(state).mean
                q_loss = F.mse_loss(q_preds, q)
                policy_loss = F.mse_loss(policy_preds, correct_action)
                total_val_q_loss += q_loss.item()
                total_val_policy_loss += policy_loss.item()

        train_q_losses.append(total_q_loss / len(train_loader))
        train_policy_losses.append(total_policy_loss / len(train_loader))
        val_q_losses.append(total_val_q_loss / len(val_loader))
        val_policy_losses.append(total_val_policy_loss / len(val_loader))
        print(f'train: qf loss: {train_q_losses[-1]:.4f}, policy loss: {train_policy_losses[-1]:.4f}')
        print(f'val: qf loss: {val_q_losses[-1]:.4f}, policy loss: {val_policy_losses[-1]:.4f}')

    # evaluate the resulting policy for 100 episodes
    eval_return = eval_policy(policy, variant['env'], variant['seed'], eval_episodes=variant['eval_episodes'])

    # save the results
    to_save = dict(
        train_q_losses=train_q_losses,
        train_policy_losses=train_policy_losses,
        val_q_losses=val_q_losses,
        val_policy_losses=val_policy_losses,
        eval_return=eval_return,
        qf=policy.critic.Q1.state_dict(),
        policy=policy.actor.state_dict()
    )
    torch.save(to_save, os.path.join(variant['replay_buffer_folder'], f'{variant["network_class"]}_distillation.pt'))
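DEVICE is used in the training and validation loops above but never defined in the snippet; the usual definition (an assumption) would be:

import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")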
Example #23
    def __init__(self):
        self.computer = 'kelsey'
        super().__init__('walk_node_{}'.format(
            self.computer))  #Remember to use ros_bridge
        self.pkl_folder = 'pkl_walk_vanilla'
        self.pause_on_nn = False
        self.score = 0
        self.reward = 0.0
        self.distance_new = 0.0
        self.distance_old = 0.0
        self.reward_total = 0.0
        self.joint_states_init = [
            0.0 - hip_x_min_temp, 0.0 - hip_y_min_temp, 0.0 - knee_min_temp,
            0.0 - ankle_y_min_temp, 0.0 - ankle_x_min_temp,
            0.0 - hip_x_min_temp, 0.0 - hip_y_min_temp, 0.0 - knee_min_temp,
            0.0 - ankle_y_min_temp, 0.0 - ankle_x_min_temp,
            0.0 - hip_z_min_temp, 0.0 - hip_z_min_temp
        ]  #, [0.0, 0.0, 0.0, 0.0]]#, [0.0], [0.0], [0.0, 0.0, 0.0]]
        self.joint_states = self.joint_states_init
        self.true_joint_states_init = [
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0
        ]
        self.true_joint_states = self.true_joint_states_init
        self.score_hist = []
        self.distance_hist_long = []

        self.fallen_status = 0
        self.vel_init = [0, 0, 0]
        self.positions_init = [0, 0.80]
        self.rpy_init = [0., 0., math.pi]
        self.rpy_vel_init = [0, 0, 0]
        self.y_pos_init = [0.0]
        self.s = self.s1 = classes.State(self.joint_states, self.rpy_init,
                                         self.rpy_vel_init,
                                         self.positions_init, self.vel_init,
                                         self.true_joint_states,
                                         self.y_pos_init)

        self.gamma = 0.99  # discount for future rewards
        self.batch_size = 128  # num of transitions sampled from replay buffer
        self.num_actions = 12
        self.action_init = [
            (hip_x_min_temp / (hip_x_min_temp - hip_x_max_temp)),
            (hip_y_min_temp / (hip_y_min_temp - hip_y_max_temp)),
            (knee_min_temp / (knee_min_temp - knee_max_temp)),
            (ankle_y_min_temp / (ankle_y_min_temp - ankle_y_max_temp)),
            (ankle_x_min_temp / (ankle_x_min_temp - ankle_x_max_temp)),
            (hip_x_min_temp / (hip_x_min_temp - hip_x_max_temp)),
            (hip_y_min_temp / (hip_y_min_temp - hip_y_max_temp)),
            (knee_min_temp / (knee_min_temp - knee_max_temp)),
            (ankle_y_min_temp / (ankle_y_min_temp - ankle_y_max_temp)),
            (ankle_x_min_temp / (ankle_x_min_temp - ankle_x_max_temp)),
            (hip_z_min_temp / (hip_z_min_temp - hip_z_max_temp)),
            (hip_z_min_temp / (hip_z_min_temp - hip_z_max_temp))
        ]
        self.action_init = np.array([(x * 2) - 1 for x in self.action_init])

        self.exploration_noise_init = 0.08  #.10, 0.05
        self.exploration_noise = self.exploration_noise_init
        self.polyak = 0.995  # target policy update parameter (1-tau)
        self.policy_noise = 0.12  #.20, .10          # target policy smoothing noise
        self.noise_clip = 0.5
        self.policy_delay = 2  # delayed policy updates parameter

        self.testing = False
        if self.testing == True:
            self.policy_noise = 0.0
            self.exploration_noise_init = 0.0
            self.exploration_noise = 0.0
        self.last_saved_index = 150000  #minimax at 1165000; lowered effort to 3.92 and renamed i to 1000 at 1419000; instant at 8000
        # policy .05, exp .04 @ 137000; reverted noise to policy .12 and exp .08 @ 200000
        self.distance_hist = []

        if self.last_saved_index > 0:
            self.read_pkl = True
        else:
            self.read_pkl = False

        self.i = self.last_saved_index

        v = True
        self.j = 0
        lr = .0001
        self.num_states = 22
        remove_states = []
        load_weights = True
        read_replay_buffer = True
        add_num_states = 0
        add_actions = 0
        layer_height = 250

        if self.read_pkl == True:

            #agent = NewAgent.load_model('./pkl/agent_{}.pkl'.format(i))

            if load_weights == True:
                print('reading weights...')
                self.agent = TD3.TD3(lr=lr,
                                     state_dim=self.num_states +
                                     add_num_states - len(remove_states),
                                     action_dim=self.num_actions + add_actions,
                                     max_action=1.0,
                                     layer_height=layer_height)

                self.agent.load('./{}'.format(self.pkl_folder),
                                self.i,
                                additional_dims=add_num_states,
                                additional_actions=add_actions,
                                remove_dimensions_=remove_states)
                self.num_actions = self.num_actions + add_actions
                #print('STATES:{}'.format(agent.))
                #agent.state_dim += add_num_states
                #if add_state > 0:

            else:
                print('WARNING: LOADING FULL AGENT')
                self.agent = TD3.TD3.load_model('./{}/agent_{}.pkl'.format(
                    self.pkl_folder, self.i))
                self.agent.use_scheduler = False
            if read_replay_buffer == True:
                print('reading replay buffer...')
                self.replay_buffer = pickle.load(
                    open('./{}/replay_{}.pkl'.format(self.pkl_folder, self.i),
                         'rb'))
            else:
                self.replay_buffer = ReplayBuffer()
        else:
            print('creating agent')
            #agent = NewAgent(alpha=0.000005, beta=0.00001, input_dims=[3], gamma=1.01, layer1_size=30, layer2_size=30, n_outputs=1, n_actions=26) # 26=13*2
            #agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[19], tau=0.001, env='dummy', sigma=.5,
            #          batch_size=100,  layer1_size=200, layer2_size=250, n_actions=12, max_size=100000)
            self.agent = TD3.TD3(lr=lr,
                                 state_dim=self.num_states,
                                 action_dim=self.num_actions,
                                 max_action=1.0,
                                 layer_height=layer_height)
            self.replay_buffer = ReplayBuffer()
        self.state_sub = self.create_subscription(
            State, '/tori_state_{}'.format(self.computer),
            self.state_callback)  # listens for state updates
        self.reward_sub = self.create_subscription(
            State, '/tori_state_{}'.format(self.computer),
            self.reward_callback)  #
        self.replay_sub = self.create_subscription(
            Replay, '/replay',
            self.replay_callback)  # listens for replay messages
        self.replay_pub = self.create_publisher(
            Replay, '/replay',
            qos_profile=1)  # publishes replay to this/other computers
        self.joint_angles_pub = self.create_publisher(
            ToriJointAngles, '/tori_joint_command_{}'.format(self.computer)
        )  # tells control_motion the desired joint positions
        self.checkpoint_sub = self.create_subscription(
            Float64, '/checkpoint_{}'.format(self.computer),
            self.checkpoint_callback)
        self.checkpoit_pub = self.create_publisher(Float64,
                                                   '/checkpoint_{}'.format(
                                                       self.computer),
                                                   qos_profile=0)

        # start training
        self.state_pub = self.create_publisher(State,
                                               '/tori_state_{}'.format(
                                                   self.computer),
                                               qos_profile=1)
        state = State()
        state.fallen_status = float(self.fallen_status)
        state.orientation = self.rpy_init
        state.pos = [0., 0., 0.80
                     ]  #TODO: get position_y_spine, not necessarily minimin
        state.distance_minimum = -.02  #TODO: check this number
        state.rpy_vel = [0., 0., 0.]
        state.vel = [0., 0., 0.]
        state.sim_time = 0.0
        self.state_pub.publish(state)
        print('published!')
Example #24
def train(config, args):
    if not os.path.exists("./results"):
        os.makedirs("./results")

    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    import pybulletgym
    warnings.filterwarnings("ignore")
    eps_bounds = args.reacher_epsilon_bounds      # just aliasing with shorter variable name
    utils_object = utils.GeneralUtils(args)

    if args.tune_run:
        if args.prioritized_replay:
            args.alpha = float(config["alpha"])
            args.beta = float(config["beta"])
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
        elif args.custom_env and args.use_hindsight:
            eps_bounds = [float(config["epsilons"][0]), float(config["epsilons"][1])]
            args.seed = int(config["seed"])
        else:
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
    
    if args.custom_env:
        gym.envs.register(
            id='OurReacher-v0',
            entry_point='our_reacher_env:OurReacherEnv',
            max_episode_steps=50,
            reward_threshold=100.0,
        )

        # this is assuming we only use epsilon for custom env or fetch reach, where episode tsteps is 50 !!!!
        max_episode_steps = 50

        # retrieve epsilon range
        [a, b] = eps_bounds
        epsilons = utils_object.epsilon_calc(a, b, max_episode_steps)
        env = gym.make('OurReacher-v0', epsilon=epsilons[0], render=False)
    else:
        env = gym.make(args.env)

    if utils_object.fetch_reach and utils_object.args.fetch_reach_dense:
        env.env.reward_type = "dense"

    # Set seeds
    env.seed(int(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if utils_object.fetch_reach:
        state_dim = env.reset()["observation"].shape[0]
    else:
        state_dim = env.observation_space.shape[0]
    if args.use_hindsight:          # include both current state and goal state
        if args.custom_env:
            state_dim += 2          # reacher nonsense; goal = (x, y)
        elif utils_object.fetch_reach:
            state_dim += 3          # include fetchreach goal state (x,y,z position)
        else:
            state_dim *= 2

    action_dim = env.action_space.shape[0] 
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        kwargs["prioritized_replay"] = args.prioritized_replay
        kwargs["use_rank"] = args.use_rank
        kwargs["use_hindsight"] = args.use_hindsight
        
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    exp_descriptors = [
        args.policy, 'CustomReacher' if args.custom_env else args.env,
        f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '', 
        'HER' if args.use_hindsight else '',
        f"{args.decay_type}decay-eps{f'{eps_bounds[0]}-{eps_bounds[1]}' if eps_bounds[0] != eps_bounds[1] else f'{eps_bounds[0]}'}" if args.custom_env else "",
        f"k{args.k}",
        datetime.now().strftime('%Y%m%d%H%M')
    ]
    if args.tune_run:
        # fudgy: assumes tune_run for non-HER experiments
        exp_descriptors = [
            args.policy, 'CustomReacher' if args.custom_env else args.env,
            f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '', 
            f"tau{args.tau}", f"discount{args.discount}",
            f"alpha{args.alpha}" if args.prioritized_replay else '',
            f"beta{args.beta}" if args.prioritized_replay else '',
            f"k{args.k}",
            datetime.now().strftime('%Y%m%d%H%M')
        ]

    exp_descriptors = [x for x in exp_descriptors if len(x) > 0]
    file_name = "_".join(exp_descriptors)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    if args.prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim,
                                                      args.max_timesteps, args.start_timesteps,
                                                      alpha=args.alpha, beta=args.beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)
    
    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed, utils_object=utils_object)]
 
    state, done = env.reset(), False

    original_episode_reward = 0
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    trajectory = []

    for t in range(int(args.max_timesteps)):
        
        episode_timesteps += 1
        x, goal = utils_object.compute_x_goal(state, env)
        
        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(x))
                + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        if args.use_hindsight:
            if utils_object.fetch_reach:
                goal = state["desired_goal"]
                next_x = np.concatenate([np.array(next_state["observation"]), goal])
            else:
                # env.set_goal(goal)
                next_x = np.concatenate([np.array(next_state), goal])
        elif utils_object.fetch_reach:
            next_x = np.array(next_state["observation"])
        else:
            next_x = next_state

        # Store data in replay buffer
        if not args.use_hindsight:
            replay_buffer.add(x, action, next_x, reward, done_bool)

        trajectory.append((state, action, next_state, reward, done_bool))

        state = next_state
        episode_reward += reward
        if args.custom_env:
            original_episode_reward += env.original_rewards

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            if args.use_hindsight:
                replay_buffer.add_hindsight(trajectory, goal, env, k=args.k, fetch_reach=utils_object.fetch_reach)
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Original Reward: {original_episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            original_episode_reward = 0
            episode_timesteps = 0
            episode_num += 1
            if args.custom_env:
                epsilon = epsilons[episode_num]
                env.set_epsilon(epsilon)

            trajectory = []

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaled_policy = eval_policy(policy, args.env, args.seed, utils_object=utils_object)
            evaluations.append(evaled_policy)
            np.save(f"./results/{file_name}", evaluations)
            if args.save_model:
                policy.save(f"./models/{file_name}")
            if args.plot:
                plotter.plot(file_name, args.custom_env)
            if args.tune_run:
                tune.report(episode_reward_mean=evaled_policy[0])
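The HER branch above hands the whole episode to replay_buffer.add_hindsight, whose implementation is not shown here. Below is a minimal sketch of the usual "future"-strategy relabeling such a method typically performs, assuming goal-based dict observations as in FetchReach; achieved_goal, flat_obs, and reward_fn are hypothetical helpers, with reward_fn standing in for something like env.compute_reward.

import numpy as np

def achieved_goal(state):
    # assumption: goal-based dict observations expose 'achieved_goal'
    return np.asarray(state["achieved_goal"])

def flat_obs(state):
    # assumption: the policy consumes the flat 'observation' vector
    return np.asarray(state["observation"])

def add_hindsight_sketch(replay_buffer, trajectory, k, reward_fn):
    """Hypothetical 'future'-strategy HER relabeling, not the repo's exact add_hindsight.

    trajectory: list of (state, action, next_state, reward, done) with raw env states.
    reward_fn(achieved, desired) recomputes the reward under a substituted goal.
    """
    horizon = len(trajectory)
    for t, (state, action, next_state, _, done) in enumerate(trajectory):
        # sample k goals that were actually achieved later in the same episode
        for idx in np.random.randint(t, horizon, size=k):
            new_goal = achieved_goal(trajectory[idx][2])
            new_reward = reward_fn(achieved_goal(next_state), new_goal)
            x = np.concatenate([flat_obs(state), new_goal])
            next_x = np.concatenate([flat_obs(next_state), new_goal])
            replay_buffer.add(x, action, next_x, new_reward, done)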
Example #25
0
def train(config, start_timesteps, max_timesteps, policy_noise, expl_noise,
          noise_clip, policy_freq, batch_size, seed, policy,
          prioritized_replay, env_name, eval_freq, discount, tau, use_rank):
    if prioritized_replay:
        alpha = float(config["alpha"])
        beta = float(config["beta"])
    else:
        discount = float(config["discount"])
        tau = float(config["tau"])

    import pybulletgym
    warnings.filterwarnings("ignore")
    env = gym.make(env_name)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": discount,
        "tau": tau,
    }

    # Initialize policy
    if policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = policy_noise * max_action
        kwargs["noise_clip"] = noise_clip * max_action
        kwargs["policy_freq"] = policy_freq
        kwargs["prioritized_replay"] = prioritized_replay
        kwargs["use_rank"] = use_rank
        policy = TD3.TD3(**kwargs)
    elif policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim,
                                                      action_dim,
                                                      max_timesteps,
                                                      start_timesteps,
                                                      alpha=alpha,
                                                      beta=beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env_name, seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(max_timesteps)):

        episode_timesteps += 1
        # Select action randomly or according to policy
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = (policy.select_action(np.array(state)) + np.random.normal(
                0, max_action * expl_noise, size=action_dim)).clip(
                    -max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(
            done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= start_timesteps:
            policy.train(replay_buffer, batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            avg_reward = eval_policy(policy, env_name, seed)
            tune.report(episode_reward_mean=avg_reward)
            evaluations.append(avg_reward)
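The train function above is written as a Ray Tune function trainable: it reads alpha/beta (or tau/discount) from config and calls tune.report. A minimal launch sketch follows, assuming standard Ray Tune APIs; the frozen argument values, the search ranges, and the pybulletgym env id are illustrative only.

from functools import partial

from ray import tune

# freeze the non-tuned arguments of train(); every value here is illustrative only
trainable = partial(
    train,
    start_timesteps=25_000, max_timesteps=1_000_000,
    policy_noise=0.2, expl_noise=0.1, noise_clip=0.5, policy_freq=2,
    batch_size=256, seed=0, policy="TD3", prioritized_replay=True,
    env_name="HalfCheetahMuJoCoEnv-v0", eval_freq=5_000,
    discount=0.99, tau=0.005, use_rank=False,
)

# grid over the PER hyperparameters that train() reads from config
analysis = tune.run(
    trainable,
    config={"alpha": tune.grid_search([0.4, 0.6, 0.8]),
            "beta": tune.grid_search([0.4, 0.6, 1.0])},
    metric="episode_reward_mean",
    mode="max",
)
print(analysis.best_config)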
Example #26
0
def generate_video(args):

    total_time = args.video_length * 100
    exp_path = os.path.join(DATA_DIR, "EXP_{:04d}".format(args.expID))
    if not os.path.exists(exp_path):
        raise FileNotFoundError('checkpoint does not exist')
    print('*** folder fetched: {} ***'.format(exp_path))
    os.makedirs(VIDEO_DIR, exist_ok=True)

    # Retrieve MuJoCo XML files for visualizing ========================================
    env_names = []
    args.graphs = dict()
    # existing envs
    if not args.custom_xml:
        for morphology in args.morphologies:
            env_names += [
                name[:-4] for name in os.listdir(XML_DIR)
                if '.xml' in name and morphology in name
            ]
        for name in env_names:
            args.graphs[name] = utils.getGraphStructure(
                os.path.join(XML_DIR, '{}.xml'.format(name)))
    # custom envs
    else:
        if os.path.isfile(args.custom_xml):
            assert '.xml' in os.path.basename(
                args.custom_xml), 'No XML file found.'
            name = os.path.basename(args.custom_xml)
            env_names.append(name[:-4])  # truncate the .xml suffix
            args.graphs[name[:-4]] = utils.getGraphStructure(args.custom_xml)
        elif os.path.isdir(args.custom_xml):
            for name in os.listdir(args.custom_xml):
                if '.xml' in name:
                    env_names.append(name[:-4])
                    args.graphs[name[:-4]] = utils.getGraphStructure(
                        os.path.join(args.custom_xml, name))
    env_names.sort()

    # Set up env and policy ================================================
    args.limb_obs_size, args.max_action = utils.registerEnvs(
        env_names, args.max_episode_steps, args.custom_xml)
    # determine the maximum number of children in all the envs
    if args.max_children is None:
        args.max_children = utils.findMaxChildren(env_names, args.graphs)
    # setup agent policy
    policy = TD3.TD3(args)

    try:
        cp.load_model_only(exp_path, policy)
    except Exception:
        raise Exception(
            'policy loading failed; check policy params (hint 1: max_children must match the trained policy; '
            'hint 2: did the trained policy use torchfold? if so, consider passing --disable_fold)'
        )

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # visualize ===========================================================
    for env_name in env_names:
        # create env
        env = utils.makeEnvWrapper(env_name, seed=args.seed,
                                   obs_max_len=None)()
        policy.change_morphology(args.graphs[env_name])

        # create unique temp frame dir
        count = 0
        frame_dir = os.path.join(
            VIDEO_DIR, "frames_{}_{}_{}".format(args.expID, env_name, count))
        while os.path.exists(frame_dir):
            count += 1
            frame_dir = "{}/frames_{}_{}_{}".format(VIDEO_DIR, args.expID,
                                                    env_name, count)
        os.makedirs(frame_dir)
        # create video name without overwriting previously generated videos
        count = 0
        video_name = "%04d_%s_%d" % (args.expID, env_name, count)
        while os.path.exists("{}/{}.mp4".format(VIDEO_DIR, video_name)):
            count += 1
            video_name = "%04d_%s_%d" % (args.expID, env_name, count)

        # init env vars
        done = True
        print("-" * 50)
        time_step_counter = 0
        printProgressBar(0, total_time)

        while time_step_counter < total_time:
            printProgressBar(time_step_counter + 1,
                             total_time,
                             prefix=env_name)
            if done:
                obs = env.reset()
                done = False
                episode_reward = 0
            action = policy.select_action(np.array(obs))
            # perform action in the environment
            new_obs, reward, done, _ = env.step(action)
            episode_reward += reward
            # draw image of current frame
            image_data = env.sim.render(VIDEO_RESOLUATION[0],
                                        VIDEO_RESOLUATION[1],
                                        camera_name="track")
            img = Image.fromarray(image_data, "RGB")
            draw = ImageDraw.Draw(img)
            font = ImageFont.truetype('./misc/sans-serif.ttf', 24)
            draw.text((200, 10),
                      "Instant Reward: " + str(reward), (255, 0, 0),
                      font=font)
            draw.text((200, 35),
                      "Episode Reward: " + str(episode_reward), (255, 0, 0),
                      font=font)
            img.save(
                os.path.join(frame_dir, "frame-%.10d.png" % time_step_counter))

            obs = new_obs
            time_step_counter += 1

        # redirect output so output does not show on window
        FNULL = open(os.devnull, 'w')
        # create video
        subprocess.call([
            'ffmpeg', '-framerate', '50', '-y', '-i',
            os.path.join(frame_dir, 'frame-%010d.png'), '-r', '30', '-pix_fmt',
            'yuv420p',
            os.path.join(VIDEO_DIR, '{}.mp4'.format(video_name))
        ],
                        stdout=FNULL,
                        stderr=subprocess.STDOUT)
        subprocess.call(['rm', '-rf', frame_dir])
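The video loop above writes one PNG per frame and then shells out to the ffmpeg binary. As a design alternative, frames can be streamed straight into an MP4 writer, which avoids the temporary frame directory; this is a hedged sketch assuming the imageio and imageio-ffmpeg packages, not what the project itself does.

import numpy as np
import imageio

def write_video_sketch(frames, path, fps=30):
    """Illustrative only: encode an iterable of HxWx3 uint8 frames into an MP4."""
    with imageio.get_writer(path, fps=fps) as writer:
        for frame in frames:
            writer.append_data(np.asarray(frame, dtype=np.uint8))

# rough usage with the rendering loop above:
#     frames.append(env.sim.render(width, height, camera_name="track"))
#     ...
#     write_video_sketch(frames, "0001_walker_0.mp4", fps=50)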
Example #27
0
File: main.py Project: yasasa/TD3
        os.makedirs("./pytorch_models")

    env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "BNNTD3":
        policy = BNNTD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "BootstrapTD3":
        if args.actor_branches > 0:
            actor_branches = args.actor_branches
        else:
            actor_branches = args.branches
        policy = BootstrapTD3.TD3(state_dim, action_dim, max_action, args.branches, actor_branches)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()
Example #28
0
def main(args):
    file_name = f"{args.policy}_{args.env}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
    print("---------------------------------------")

    log_path = safe_path(
        os.path.join(args.log_root, '{}_base'.format(args.env)))
    result_path = safe_path(os.path.join(log_path, 'results'))
    model_path = safe_path(os.path.join(log_path, 'models'))

    env = gym.make(args.env)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env, args.seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(args.max_timesteps)):

        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (policy.select_action(np.array(state)) + np.random.normal(
                0, max_action * args.expl_noise, size=action_dim)).clip(
                    -max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(
            done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaluations.append(eval_policy(policy, env, args.seed))
            np.save(os.path.join(result_path, '{}'.format(file_name)),
                    evaluations)
            if args.save_model:
                policy.save(os.path.join(model_path, '{}'.format(file_name)))
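Several of these examples call eval_policy without showing it. The sketch below follows the common TD3-reference pattern: average return over a fixed number of episodes on a separately constructed, offset-seeded env. The exact signature is an assumption, and the variant used earlier with utils_object apparently returns extra statistics (it is indexed as evaled_policy[0]) rather than a single float.

import gym
import numpy as np

def eval_policy(policy, env_name, seed, eval_episodes=10):
    # assumption: a fresh env with an offset seed, so evaluation does not
    # disturb the training env's RNG stream
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)

    avg_reward = 0.0
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward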
Example #29
0
    print("---------------------------------------")
    print("Settings: %s" % file_name)
    print("---------------------------------------")

    env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = \
            TD3.TD3(state_dim, action_dim, max_action, actor_lr=args.actor_lr, is_ro=args.is_ro)
    elif args.policy_name == "OurDDPG":
        policy = \
            OurDDPG.DDPG(state_dim, action_dim, max_action, actor_lr=args.actor_lr, is_ro=args.is_ro)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    policy.load(
        "%s_%s_%s.pth" % (args.policy_name, args.env_name, str(args.seed)),
        "pytorch_models")

    evaluate_policy(policy, args.eval_episodes)
Example #30
0
def train(args):

    # Set up directories ===========================================================
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(BUFFER_DIR, exist_ok=True)
    exp_name = "EXP_%04d" % (args.expID)
    exp_path = os.path.join(DATA_DIR, exp_name)
    rb_path = os.path.join(BUFFER_DIR, exp_name)
    os.makedirs(exp_path, exist_ok=True)
    os.makedirs(rb_path, exist_ok=True)
    # save arguments
    with open(os.path.join(exp_path, 'args.txt'), 'w+') as f:
        json.dump(args.__dict__, f, indent=2)

    # Retrieve MuJoCo XML files for training ========================================
    agent_name = args.agent_name
    envs_train_names = [agent_name]
    args.graphs = dict()
    # existing envs
    if not args.custom_xml:
        args.graphs[agent_name] = utils.getGraphStructure(
            os.path.join(XML_DIR, '{}.xml'.format(agent_name)))
    # custom envs

    num_envs_train = len(envs_train_names)
    print("#" * 50 + '\ntraining envs: {}\n'.format(envs_train_names) +
          "#" * 50)

    # Set up training env and policy ================================================
    args.limb_obs_size, args.max_action = utils.registerEnvs(
        envs_train_names, args.max_episode_steps, args.custom_xml)
    max_num_limbs = max(
        [len(args.graphs[env_name]) for env_name in envs_train_names])
    # create vectorized training env
    obs_max_len = max(
        [len(args.graphs[env_name])
         for env_name in envs_train_names]) * args.limb_obs_size
    envs_train = [
        utils.makeEnvWrapper(name, obs_max_len, args.seed)
        for name in envs_train_names
    ]
    # envs_train = SubprocVecEnv(envs_train)  # vectorized env
    # set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # determine the maximum number of children in all the training envs
    if args.max_children is None:
        args.max_children = utils.findMaxChildren(envs_train_names,
                                                  args.graphs)
    # setup agent policy
    policy = TD3.LifeLongTD3(args)

    # Create new training instance or load previous checkpoint ========================
    if cp.has_checkpoint(exp_path, rb_path):
        print("*** loading checkpoint from {} ***".format(exp_path))
        total_timesteps, episode_num, replay_buffer, num_samples, loaded_path = cp.load_checkpoint(
            exp_path, rb_path, policy, args)
        print("*** checkpoint loaded from {} ***".format(loaded_path))
    else:
        print("*** training from scratch ***")
        # init training vars
        total_timesteps = 0
        episode_num = 0
        num_samples = 0
        # different replay buffer for each env; avoid using too much memory if there are too many envs

    # Initialize training variables ================================================
    writer = SummaryWriter("%s/%s/" % (DATA_DIR, exp_name))
    s = time.time()
    # TODO: may have to move the following code into the loop
    timesteps_since_saving = 0
    this_training_timesteps = 0
    episode_timesteps = 0
    episode_reward = 0
    episode_reward_buffer = 0
    done = True

    # Start training ===========================================================
    for env_handle, env_name in zip(envs_train, envs_train_names):
        env = env_handle()
        obs = env.reset()
        replay_buffer = utils.ReplayBuffer(max_size=args.rb_max)
        policy.change_morphology(args.graphs[env_name])
        policy.graph = args.graphs[env_name]
        task_timesteps = 0
        done = False
        episode_timesteps = 0
        episode_reward = 0
        episode_reward_buffer = 0
        while task_timesteps < args.max_timesteps:
            # train and log after one episode for each env
            if done:
                # log updates and train policy
                if this_training_timesteps != 0:
                    policy.train(replay_buffer,
                                 episode_timesteps,
                                 args.batch_size,
                                 args.discount,
                                 args.tau,
                                 args.policy_noise,
                                 args.noise_clip,
                                 args.policy_freq,
                                 graphs=args.graphs,
                                 env_name=env_name)
                    # add to tensorboard display

                    writer.add_scalar('{}_episode_reward'.format(env_name),
                                      episode_reward, task_timesteps)
                    writer.add_scalar('{}_episode_len'.format(env_name),
                                      episode_timesteps, task_timesteps)
                    # print to console
                    print(
                        "-" * 50 +
                        "\nExpID: {}, FPS: {:.2f}, TotalT: {}, EpisodeNum: {}, SampleNum: {}, ReplayBSize: {}"
                        .format(args.expID, this_training_timesteps /
                                (time.time() -
                                 s), total_timesteps, episode_num, num_samples,
                                len(replay_buffer.storage)))
                    print("{} === EpisodeT: {}, Reward: {:.2f}".format(
                        env_name, episode_timesteps, episode_reward))
                    this_training_timesteps = 0
                    s = time.time()

                # save model and replay buffers
                if timesteps_since_saving >= args.save_freq:
                    print("*** saving checkpoint ***")
                    timesteps_since_saving = 0
                    model_saved_path = cp.save_model(exp_path, policy,
                                                     total_timesteps,
                                                     episode_num, num_samples,
                                                     {env_name: replay_buffer},
                                                     envs_train_names, args)
                    print("*** model saved to {} ***".format(model_saved_path))
                    rb_saved_path = cp.save_replay_buffer(
                        rb_path, {env_name: replay_buffer})
                    print("*** replay buffers saved to {} ***".format(
                        rb_saved_path))

                # reset training variables
                obs = env.reset()
                done = False
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1
                # create reward buffer to store reward for one sub-env when it is not done
                episode_reward_buffer = 0

            # start sampling ===========================================================
            # sample action randomly for sometime and then according to the policy
            if task_timesteps < args.start_timesteps:
                action = np.random.uniform(low=env.action_space.low[0],
                                           high=env.action_space.high[0],
                                           size=max_num_limbs)
            else:
                # remove 0 padding of obs before feeding into the policy (trick for vectorized env)
                obs = np.array(obs[:args.limb_obs_size *
                                   len(args.graphs[env_name])])
                policy_action = policy.select_action(obs)
                if args.expl_noise != 0:
                    policy_action = (policy_action + np.random.normal(
                        0, args.expl_noise, size=policy_action.size)).clip(
                            env.action_space.low[0], env.action_space.high[0])
                # add 0-padding to ensure that size is the same for all envs
                action = np.append(
                    policy_action,
                    np.array([
                        0 for i in range(max_num_limbs - policy_action.size)
                    ]))

            # perform action in the environment
            new_obs, reward, done, _ = env.step(action)

            # record if each env has ever been 'done'

            # add the instant reward to the cumulative buffer
            # if any sub-env is done at the moment, set the episode reward to the value accumulated in the buffer
            episode_reward_buffer += reward
            if done and episode_reward == 0:
                episode_reward = episode_reward_buffer
                episode_reward_buffer = 0
            writer.add_scalar('{}_instant_reward'.format(env_name), reward,
                              task_timesteps)
            done_bool = float(done)
            if episode_timesteps + 1 == args.max_episode_steps:
                done_bool = 0
                done = True
            # remove 0 padding before storing in the replay buffer (trick for vectorized env)
            num_limbs = len(args.graphs[env_name])
            obs = np.array(obs[:args.limb_obs_size * num_limbs])
            new_obs = np.array(new_obs[:args.limb_obs_size * num_limbs])
            action = np.array(action[:num_limbs])
            # insert transition in the replay buffer
            replay_buffer.add((obs, new_obs, action, reward, done_bool))
            num_samples += 1
            # do not increment episode_timesteps if the sub-env has been 'done'
            if not done:
                episode_timesteps += 1
                total_timesteps += 1
                task_timesteps += 1
                this_training_timesteps += 1
                timesteps_since_saving += 1

            obs = new_obs
        policy.next_task()

    # save checkpoint after training ===========================================================
    model_saved_path = cp.save_model(exp_path, policy, total_timesteps,
                                     episode_num, num_samples,
                                     {envs_train_names[-1]: replay_buffer},
                                     envs_train_names, args)
    print("*** training finished and model saved to {} ***".format(
        model_saved_path))
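Examples #27 and #30 construct utils.ReplayBuffer with no dimensions, push whole transition tuples via add((obs, new_obs, action, reward, done_bool)), and read len(replay_buffer.storage). A minimal buffer with that interface might look like the following sketch; uniform sampling and ring-buffer overwrite are assumptions, not necessarily what utils.ReplayBuffer does.

import numpy as np

class ReplayBuffer:
    """Hypothetical tuple-storing buffer matching the add()/storage usage above."""

    def __init__(self, max_size=1_000_000):
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        # transition is (obs, new_obs, action, reward, done_bool)
        if len(self.storage) < self.max_size:
            self.storage.append(transition)
        else:
            # once full, overwrite the oldest entries ring-buffer style
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size):
        idx = np.random.randint(0, len(self.storage), size=batch_size)
        obs, next_obs, actions, rewards, dones = zip(*(self.storage[i] for i in idx))
        return (np.array(obs), np.array(next_obs), np.array(actions),
                np.array(rewards).reshape(-1, 1), np.array(dones).reshape(-1, 1))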