Example #1
def main():

	# Hyper-parameters for the DDPG agent (the DDPG class is provided by the
	# surrounding project).
	params = {
		'actor_learning_rate': 1e-4,   # learning rate of the actor network
		'critic_learning_rate': 1e-3,  # learning rate of the critic network
		'gamma': 0.99,                 # discount factor
		'tau': 0.001,                  # soft target-update coefficient
		'sigma': 0.2,                  # std of the exploration noise
		'num_epochs': 275,
		'num_episodes': 20,
		'replay_size': 1000000,        # replay buffer capacity
		'num_train_steps': 1,
		'replay_init_size': 1000,      # transitions collected before training starts
		'batch_size': 64,
		'render_train': False,         # render the environment during training
		'restore': False,              # restore a saved checkpoint
		'env': 'Hopper-v2_kirkiles_train1step_noise_norm_bufsize1Mi1k'
	}

	agent = DDPG(params)
	agent.train()
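
For context, here is a minimal sketch of how a DDPG implementation typically uses the gamma, tau and sigma hyper-parameters above (soft target updates and Gaussian exploration noise). The function and argument names below are illustrative assumptions, not part of the DDPG class used in this example.

import numpy as np
import torch


def soft_update(net, target_net, tau):
    # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


def exploration_action(actor, state, sigma, low=-1.0, high=1.0):
    # Deterministic actor output plus Gaussian noise with std sigma, clipped to
    # the valid action range. gamma itself only enters the critic's TD target:
    # y = r + gamma * (1 - done) * Q_target(s', actor_target(s')).
    with torch.no_grad():
        action = actor(state).cpu().numpy()
    return np.clip(action + np.random.normal(0.0, sigma, size=action.shape), low, high)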
Example #2
def train_agent(args, param):
    """Train the agent for a single run.

    Args:
        args: parsed command-line arguments with the training configuration.
        param: random seed used for this run.
    """

    # A CNN encoder converts the [1, 3, 84, 84] image observation into a
    # [1, 200] feature vector.

    use_gym = False
    # use the passed-in value as the seed, e.g. for repeated seed experiments
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    #args.repeat_opt = repeat_opt
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = (f"{args.locexp}/{args.env_name}-agent-{args.policy}"
                f"_batch_size_{args.batch_size}"
                f"_update_freq: {args.target_update_freq}"
                f"num_q_target_{args.num_q_target}_seed_{args.seed}"
                "_actor_300_200")
    text = "Star_training target_update_freq: {}  num_q_target: {}  use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )
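        # Pixel-based setup: 84x84 RGB-D frames from the 'agentview' camera;
        # object-state observations are disabled, so the agent learns from
        # images only.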

        # the CNN encoder outputs a 200-dimensional state feature
        state_dim = 200
        print("State dim, ", state_dim)
        action_dim = env.dof
        print("action_dim ", action_dim)
        max_action = 1
        args.max_episode_steps = 200

    policy = DDPG(state_dim, action_dim, max_action, args)
    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (3, 84, 84)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    replay_buffer = ReplayBuffer(obs_shape, action_shape,
                                 int(args.buffer_size), args.image_pad,
                                 args.device)
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            #env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
                writer.flush()
            # If we are not at the very beginning, log the statistics of the finished episode
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))

                print(text)
                write_into_file(pathname, text)
                #policy.train(replay_buffer, writer, episode_timesteps)
            # After the initial exploration phase, train the policy at the end of each episode
            if total_timesteps > args.start_timesteps:
                policy.train(replay_buffer, writer, 200)
            # Periodically evaluate the policy and save a checkpoint
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # The episode is over, so we reset the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before args.start_timesteps, we play random actions for exploration
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:  # After args.start_timesteps, we switch to the learned policy
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise,
                        size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) +
                          np.random.normal(
                              0, max_action * args.expl_noise,
                              size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        done = float(done)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args,
                                                    state_buffer, policy)
        # We check if the episode is done
        #done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(
            done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True
        # Scale the reward and add it to the episode return
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer obs ", obs.shape)
            print("add to buffer next_obs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 0)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations
    evaluations.append(
        evaluate_policy(policy, writer, total_timesteps, args, env))
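
A possible driver for train_agent, assuming an argparse configuration that defines the attributes accessed above (env_name, policy, batch_size, start_timesteps and so on); the flag names and defaults below are assumptions for illustration, not taken from the original repository.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Only a few of the attributes used by train_agent are shown; the names
    # and defaults are assumed, not confirmed by the snippet above.
    parser.add_argument("--env_name", default="SawyerLift", type=str)
    parser.add_argument("--policy", default="DDPG", type=str)
    parser.add_argument("--batch_size", default=128, type=int)
    parser.add_argument("--start_timesteps", default=10000, type=int)
    parser.add_argument("--max_timesteps", default=1000000, type=int)
    parser.add_argument("--locexp", default="results", type=str)
    args = parser.parse_args()

    # Run the same configuration with several seeds; the second argument of
    # train_agent is used as the seed.
    for seed in (0, 1, 2):
        train_agent(args, seed)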