Ejemplo n.º 1
0
def train(rank, device, args):
    """Run the DDPG training loop for one worker.

    Args:
        rank: Worker index; used only to namespace the log directory.
        device: Device handle passed through to the agent (presumably a
            torch device — TODO confirm against DDPGAgent).
        args: Config namespace; this function reads env, log_dir, pri,
            buffer_size, model_dir, max_eps, obs, start_learning, eval_eps.
    """
    current_time = datetime.now().strftime('%b%d_%H-%M')
    LOGGER_DIR = os.path.join(args.log_dir, args.env, current_time, 'Agent:{}'.format(rank))
    writer = SummaryWriter(LOGGER_DIR)
    MODEL_DIR = os.path.join(LOGGER_DIR, 'models')
    # exist_ok avoids the check-then-create race of the original exists()/makedirs() pair.
    os.makedirs(MODEL_DIR, exist_ok=True)

    env = create_env(args.env, args)

    # Prioritized vs. uniform experience replay, selected by config.
    if args.pri:
        ram = PrioMemoryBuffer(args.buffer_size)
    else:
        ram = MemoryBuffer(args.buffer_size)

    player = DDPGAgent(env.observation_space, env.action_space, ram, writer, device, args)
    if args.model_dir is not None:
        player.load_models(args.model_dir)

    steps_done = 0
    episode_rewards = []
    # fix: -9999 was a fragile sentinel; -inf guarantees the first average is saved as "best".
    max_score = -float('inf')
    count_eps = 0
    # NOTE(review): range(1, max_eps) runs max_eps - 1 episodes; kept as-is for compatibility.
    for _ep in range(1, args.max_eps):
        observation = env.reset()
        total_reward = 0
        count_eps += 1
        episode_length = 0
        for t in range(10000):  # hard per-episode step cap
            # Image observations get a batch axis; vector observations are cast to float32.
            if 'img' in args.obs:
                state = np.expand_dims(observation, axis=0)
            else:
                state = np.float32(observation)
            action, action_rescale = player.get_exploration_action(state)
            new_observation, reward, done, info = env.step(action_rescale)
            steps_done += 1
            total_reward += reward
            episode_length = t + 1  # fix: original logged the 0-based index, off by one
            ram.add(observation, np.expand_dims(action, axis=0), reward, new_observation)
            observation = new_observation
            # Optimize only after the warm-up period has filled the buffer.
            if steps_done > args.start_learning:
                player.optimize()
            if done:
                break

        # Per-episode TensorBoard logging, keyed by global step count.
        writer.add_scalar('episode/reward', total_reward, steps_done)
        writer.add_scalar('episode/length', episode_length, steps_done)
        episode_rewards.append(total_reward)
        if _ep % args.eval_eps == 0:
            reward_ave = np.array(episode_rewards).mean()
            print('Train, episode %d, steps: %d reward: %.3f,ave_reward: %.3f' % (count_eps, steps_done, episode_rewards[-1], reward_ave))
            # Checkpoint: 'best' when the windowed average improves, 'new' otherwise.
            if reward_ave > max_score:
                player.save_models(os.path.join(MODEL_DIR, 'best'))
                max_score = reward_ave
                print('Save Best!')
            else:
                player.save_models(os.path.join(MODEL_DIR, 'new'))
            episode_rewards = []
        # Reclaim per-episode garbage to bound memory growth over long runs.
        gc.collect()
Ejemplo n.º 2
0
def main():
    """Entry point: spawn collector/test processes and run the DDPG learner loop.

    Child processes push transitions onto a shared queue; this process drains
    the queue into replay memory, runs learning steps, and checkpoints models.
    """
    mp.set_start_method('spawn')
    config = Config()
    # 1. Initialize the environment.
    env = NormalizedEnv(gym.make('Pendulum-v0'))

    # 2. Initialize the agent; the target actor's weights are shared with children.
    agent = DDPGAgent(env=env,
                      seed=config.seed,
                      batch_size=config.batch_size,
                      learning_rate_actor=config.learning_rate_actor,
                      learning_rate_critic=config.learning_rate_critic,
                      weight_decay=config.weight_decay)
    agent.target_actor.share_memory()
    # 3. Initialize replay memory.
    memory = ReplayMemory(config.capacity)

    q = mp.Queue(10)

    process_collect_list = []
    for i in range(config.agent_num):
        process_name = "collect_process_" + str(i)
        process = mp.Process(name=process_name,
                             target=collect_porcess,
                             args=(i, q, agent.target_actor))
        process.start()
        process_collect_list.append(process)

    steps = mp.Value('d', 0)
    test_p = mp.Process(name="test_process",
                        target=test_process,
                        args=(config, steps, agent.target_actor))
    test_p.start()
    process_collect_list.append(test_p)

    try:
        while True:
            # Drain whatever transitions the collectors have queued so far.
            # fix: the original bound this count to `len`, shadowing the builtin.
            pending = q.qsize()
            while pending:
                mem = q.get()
                memory.push(mem[0], mem[1], mem[2], mem[3], mem[4])
                pending -= 1
            # 4. Learn once the buffer holds more than one batch.
            if memory.len > config.batch_size:
                agent.learning(memory)
            # Periodically checkpoint the models.
            if steps.value > 1 and steps.value % config.save_steps == 0:
                agent.save_models(steps.value / config.save_steps)
            steps.value += 1
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop; the original's bare `except:` after
        # `except Exception` could only ever catch BaseExceptions like this one.
        pass
    except Exception as e:
        print(e)
    finally:
        # fix: the original joined workers only on the bare-except path, so an
        # ordinary Exception orphaned the child processes. Always join and close.
        # NOTE(review): assumes children exit on interrupt/queue teardown — confirm.
        for process in process_collect_list:
            process.join()
            print(process.name + " stop ")
        env.close()