Example #1
def train(env):
    if env == "MountainCar-v0":
        # MountainCar-v0: 2-dimensional state, 3 discrete actions
        agent = DQN(2, 3)
        play(env, agent, until=195, ckpt=True)
    else:
        # image observations (210x160 RGB) with 14 discrete actions
        agent = DQN(state_dim=(210, 160, 3), n_actions=14)
        play(env, agent)
    agent.save_model(env)
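The DQN class and the play helper used in Example #1 are not shown. Below is a minimal sketch of what such a play loop might look like, assuming a hypothetical agent with choose_action, memory.push, and update methods, and treating until as a reward threshold and ckpt as a checkpointing flag; these are assumptions for illustration, not the original implementation.

import gym

def play(env_id, agent, until=None, ckpt=False, max_episodes=1000):
    # Hypothetical training loop consistent with the call sites in Example #1.
    env = gym.make(env_id)
    for episode in range(max_episodes):
        state = env.reset()
        ep_reward, done = 0.0, False
        while not done:
            action = agent.choose_action(state)            # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()                                 # one gradient step per env step
            state = next_state
            ep_reward += reward
        # stop (and optionally checkpoint) once the reward threshold is reached
        if until is not None and ep_reward >= until:
            if ckpt:
                agent.save_model(env_id)
            break
    env.close()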
Example #2
def train(cfg):
    print('Start to train!')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
    env = gym.make('CartPole-v0').unwrapped  # unwrapped exposes the raw env; usually not needed here
    env.seed(1)  # set the env random seed for reproducibility
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
                epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps+1):
        state = env.reset()  # reset the environment and get the initial state
        ep_reward = 0
        for i_step in range(1, cfg.train_steps+1):
            action = agent.choose_action(state)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)  # store the transition in replay memory
            state = next_state  # move to the next state
            agent.update()  # update the network at every step
            if done:
                break
        # update the target network: copy all weights and biases from the policy net
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:', i_episode, 'Reward: %i' % int(ep_reward),
              'n_steps:', i_step, 'done:', done, 'Explore: %.2f' % agent.epsilon)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # exponential moving average of the reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
        writer.add_scalar('steps_of_each_episode',
                          ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    ''' save the model '''
    if not os.path.exists(SAVED_MODEL_PATH):  # create the folder if it does not exist
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth')
    print('model saved!')
    ''' save rewards and related results '''
    save_results(rewards, moving_average_rewards, ep_steps, tag='train', result_path=RESULT_PATH)
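Example #2 calls agent.update() at every environment step but does not show the update itself. A minimal sketch of such a DQN update follows, assuming memory.sample returns batches of the same (state, action, reward, next_state, done) tuples pushed above and that the agent holds an optimizer attribute; both are assumptions, not the original class.

import torch
import torch.nn.functional as F

def update(self):
    # Hypothetical DQN update step, mirroring the attributes used in Example #2.
    if len(self.memory) < self.batch_size:
        return  # wait until enough transitions have been collected
    states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)
    states = torch.tensor(states, dtype=torch.float, device=self.device)
    actions = torch.tensor(actions, dtype=torch.long, device=self.device).unsqueeze(1)
    rewards = torch.tensor(rewards, dtype=torch.float, device=self.device)
    next_states = torch.tensor(next_states, dtype=torch.float, device=self.device)
    dones = torch.tensor(dones, dtype=torch.float, device=self.device)
    # Q(s, a) from the policy net; bootstrapped target from the (frozen) target net
    q_values = self.policy_net(states).gather(1, actions).squeeze(1)
    next_q = self.target_net(next_states).max(1)[0].detach()
    expected_q = rewards + self.gamma * next_q * (1 - dones)
    loss = F.mse_loss(q_values, expected_q)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

Note that in Example #2 the target network is only refreshed every cfg.target_update episodes, which keeps expected_q stable between those syncs.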
Example #3
def main():
    env = retro.make(game='Frogger-Genesis', use_restricted_actions=retro.Actions.DISCRETE)
    gamma = 0.99
    copy_step = 25
    num_actions = env.action_space.n
    num_states = len(env.observation_space.sample())
    hidden_units = [200, 200]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    lr = 1e-2
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    # For stable targets, train one network and copy its weights to TargetNet every copy_step steps
    TrainNet = DQN(num_actions=num_actions, gamma=gamma, max_experiences=max_experiences,
                   min_experiences=min_experiences, batch_size=batch_size, lr=lr,
                   hidden_units=hidden_units, num_states=num_states)
    TargetNet = DQN(num_actions=num_actions, gamma=gamma, max_experiences=max_experiences,
                    min_experiences=min_experiences, batch_size=batch_size, lr=lr,
                    hidden_units=hidden_units, num_states=num_states)

    # Loading check
    while True:
        if os.path.exists(save_dir):
            if input("\n\nWould you like to load the previous network weights? (y/n) ") == 'y':
                # load weights and copy to train net
                TargetNet.load_model(save_path)
                TrainNet.copy_weights(TargetNet)
                print("Loaded model weights...")
                break

            elif input("\nWould you like to delete the old checkpoints and start again? (y/n) ") == 'y':
                shutil.rmtree(save_dir)
                print("Removed old checkpoint...")
                break
        else:
            break

    N = 50000
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1

    # play N games
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()

        with summary_writer.as_default():
            tf.summary.scalar("episode reward", total_reward, step=n)
            tf.summary.scalar("running avg reward(100)", avg_rewards, step=n)

        if n % 100 == 0:
            print("episode:", n, "episode reward:", total_reward,
                  "eps:", epsilon, "avg reward (last 100):", avg_rewards)

            # save the model weights
            TargetNet.save_model(save_path)

    print("avg reward for last 100 episodes:", avg_rewards)

    if create_video:
        make_video(env, TrainNet)

    env.close()
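The play_game helper and the DQN class in Example #3 are external to this snippet. Below is a sketch of what a single-episode play_game rollout might look like, assuming the network objects expose get_action, add_experience, and train methods; these names are hypothetical, and only copy_weights appears in the code above.

def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    # Hypothetical single-episode rollout matching the call in Example #3.
    observations = env.reset()
    rewards = 0
    iters = 0
    done = False
    while not done:
        action = TrainNet.get_action(observations, epsilon)  # epsilon-greedy action (assumed method)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        # store the transition and take one training step on TrainNet (assumed methods)
        TrainNet.add_experience({'s': prev_observations, 'a': action,
                                 'r': reward, 's2': observations, 'done': done})
        TrainNet.train(TargetNet)
        iters += 1
        if iters % copy_step == 0:
            TargetNet.copy_weights(TrainNet)  # sync the target network every copy_step steps
    return rewards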