Example 1
    def run(self):
        for e_i in range(self.args.episodes):
            ob = self._env.reset()
            state = torch.tensor(ob, dtype=torch.float32).unsqueeze(0).cuda()

            total_reward = 0
            done = False
            num_steps = 0
            while not done:
                self._env.render()
                # Get current action
                action, _ = self._agent(state)

                # Perform action in environment
                ob, reward, done, _ = self._env.step(action)
                total_reward += reward

                # Add memory step
                if done:
                    next_state = None
                else:
                    next_state = torch.tensor(
                        ob, dtype=torch.float32).unsqueeze(0).cuda()

                if next_state is None:
                    e_t = Experience(state.clone(), action, reward, next_state)
                else:
                    e_t = Experience(state.clone(), action, reward,
                                     next_state.clone())

                self._agent.add_ex(e_t)
                state = next_state

                replay_len = self._agent.replay_len()
                if (replay_len > self.args.min_init_state
                        and replay_len > self.args.batch_size
                        and (num_steps + 1) % self.args.update_steps == 0):
                    self._agent.train()

                num_steps += 1

            self._env.reset()
            self._agent.save()
            if self._agent.replay_len() > self.args.min_init_state:
                self.total_ep_reward.append(total_reward)
            print("total_reward", total_reward)
            print("Episode length", num_steps)
            if e_i % self.args.save_iter == 0:
                self._agent.save()
                self.save_results()

            # for i in range(8):
            #     self._agent.train()

        self._env.close()
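
This example assumes an Experience container holding (state, action, reward, next_state) plus an agent-side buffer with add_ex and replay_len. A minimal sketch of those pieces, purely as an assumption about what the snippet relies on (none of these definitions come from the snippet itself):

import random
from collections import deque, namedtuple

# Hypothetical transition record matching how Example 1 builds e_t.
Experience = namedtuple('Experience', ('state', 'action', 'reward', 'next_state'))

class ReplayMemory:
    """Fixed-size buffer of Experience tuples (assumed helper, not from the source)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add_ex(self, experience):
        self.buffer.append(experience)

    def replay_len(self):
        return len(self.buffer)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)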
Example 2
def run_train_episode(env, agent, rpm):
    total_reward = 0
    all_cost = []
    obs = env.reset()
    steps = 0
    while True:
        steps += 1
        context = rpm.recent_obs()
        context.append(obs)
        context = np.stack(context, axis=0)
        action = agent.sample(context)
        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        # start training
        if rpm.size() > MEMORY_WARMUP_SIZE:
            if steps % UPDATE_FREQ == 0:
                batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
                    args.batch_size)
                batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
                batch_next_obs = batch_all_obs[:, 1:, :, :]
                cost = agent.learn(batch_obs, batch_action, batch_reward,
                                   batch_next_obs, batch_isOver)
                all_cost.append(float(cost))
        total_reward += reward
        obs = next_obs
        if isOver:
            break
    if all_cost:
        logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
            total_reward, np.mean(all_cost)))
    return total_reward, steps, (np.mean(all_cost) if all_cost else None)
Example 3
def run_train_episode(env, agent, rpm):
    total_reward = 0
    all_cost = []
    obs = env.reset()
    steps = 0
    while True:
        steps += 1
        context = rpm.recent_obs()
        context.append(obs)
        context = np.stack(context, axis=0)
        action = agent.sample(context)
        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        if rpm.size() > MEMORY_WARMUP_SIZE:
            if steps % UPDATE_FREQ == 0:
                batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
                    args.batch_size)
                batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
                batch_next_obs = batch_all_obs[:, 1:, :, :]
                cost = agent.learn(batch_obs, batch_action, batch_reward,
                                   batch_next_obs, batch_isOver)
                all_cost.append(cost)
        total_reward += reward
        obs = next_obs
        if isOver:
            mean_loss = np.mean(all_cost) if all_cost else None
            return total_reward, steps, mean_loss
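
The slicing in Examples 2 and 3 assumes sample_batch returns CONTEXT_LEN + 1 consecutive frames per transition, so the first CONTEXT_LEN frames form the current stacked observation and the last CONTEXT_LEN frames form the next one. A quick shape check under that assumption (the sizes and the zero array are illustrative only):

import numpy as np

CONTEXT_LEN = 4                      # assumed frame-stack length
batch_size, H, W = 32, 84, 84        # illustrative dimensions

# sample_batch is assumed to return CONTEXT_LEN + 1 consecutive frames per item.
batch_all_obs = np.zeros((batch_size, CONTEXT_LEN + 1, H, W), dtype=np.uint8)

batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]    # frames t-3 .. t
batch_next_obs = batch_all_obs[:, 1:, :, :]         # frames t-2 .. t+1

assert batch_obs.shape == (batch_size, CONTEXT_LEN, H, W)
assert batch_next_obs.shape == (batch_size, CONTEXT_LEN, H, W)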
Example 4
def collect_exp(env, rpm, agent):
    obs = env.reset()
    # collect transitions to fill the replay memory
    for i in tqdm(range(MEMORY_SIZE)):
        context = rpm.recent_obs()
        context.append(obs)
        context = np.stack(context, axis=0)
        action = agent.sample(context)

        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        obs = next_obs
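
During this warm-up phase agent.sample(context) is expected to act epsilon-greedily so the replay memory is filled with reasonably diverse transitions. A minimal sketch of such a sampler, with the function name and epsilon handling as assumptions rather than the agent's actual API:

import numpy as np

def epsilon_greedy_sample(q_values, epsilon):
    """Return a random action with probability epsilon, else the greedy one.

    q_values: 1-D array of action values for the current stacked observation.
    """
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))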
Example 5
def run_evaluate_episode(env, agent, rpm):
    state, _, __ = env.reset('test')
    total_reward = 0
    step = 0
    while True:
        context = rpm.recent_state()
        context.append(resizeBirdrToAtari(state))
        context = np.stack(context, axis=0)
        action = agent.predict(context)
        next_state, reward, isOver, _ = env.step(action)
        step += 1
        rpm.appendForTest(
            Experience(resizeBirdrToAtari(state), action, reward, isOver))
        total_reward += reward
        state = next_state
        if isOver or step >= MAX_Step_Limit:
            time.sleep(2)
            break
    return total_reward
Example 6
def run_train_episode(env, agent, rpm):
    global trainEpisode
    global meanReward
    total_reward = 0
    all_cost = []
    # reset the environment
    state, _, __ = env.reset()
    step = 0
    # loop over each step of the episode
    while True:
        context = rpm.recent_state()
        context.append(resizeBirdrToAtari(state))
        context = np.stack(context, axis=0)
        # pick an action with an epsilon-greedy policy
        action = agent.sample(context)
        # execute the action in the environment
        next_state, reward, isOver, _ = env.step(action)
        step += 1
        # store the transition in the replay buffer
        rpm.append(
            Experience(resizeBirdrToAtari(state), action, reward, isOver))
        if rpm.size() > MEMORY_WARMUP_SIZE:
            if step % UPDATE_FREQ == 0:
                # sample a random mini-batch from the replay buffer
                batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
                    batchSize)
                batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
                batch_next_state = batch_all_state[:, 1:, :, :]
                # run one SGD step to train the parameters θ
                cost = agent.learn(batch_state, batch_action, batch_reward,
                                   batch_next_state, batch_isOver)
                all_cost.append(float(cost))
        total_reward += reward
        state = next_state
        if isOver or step >= MAX_Step_Limit:
            break
    if all_cost:
        trainEpisode += 1
        # report the average reward as a running mean
        meanReward = meanReward + (total_reward - meanReward) / trainEpisode
        print('\n trainEpisode:{}, total_reward:{:.2f}, meanReward:{:.2f}, mean_cost:{:.3f}'
              .format(trainEpisode, total_reward, meanReward, np.mean(all_cost)))
    return total_reward, step
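
The meanReward update above is the standard incremental mean: after the n-th counted episode, mean_n = mean_{n-1} + (x_n - mean_{n-1}) / n, which equals the plain average of all rewards seen so far. A quick check of that identity with made-up reward values:

rewards = [10.0, -3.0, 7.5, 1.0]

mean_reward = 0.0
for n, r in enumerate(rewards, start=1):
    mean_reward = mean_reward + (r - mean_reward) / n

# Matches the direct average of everything seen so far.
assert abs(mean_reward - sum(rewards) / len(rewards)) < 1e-9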
Example 7
def ddqn_train(env, scheduler, optimizer_constructor, model_type, batch_size,
               rp_start, rp_size, exp_frame, exp_initial, exp_final, gamma,
               target_update_steps, frames_per_epoch, frames_per_state,
               output_directory, last_checkpoint, envo):
    """
	Implementation of the training algorithm for DDQN. 
	"""

    gym.undo_logger_setup()
    logging.basicConfig(filename=envo + '_' + model_type + 'ddqn_training.log',
                        level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()

    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize action value and target network with the same weights
    model = DQN(num_actions, use_bn=False)
    target = DQN(num_actions, use_bn=False)

    if use_cuda:
        model.cuda()
        target.cuda()

    exp_replay = None
    episodes_count = 1

    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')

        exp_replay = initialize_replay_resume(env, rp_start, rp_size,
                                              frames_per_state, model)
        episodes_count = get_index_from_checkpoint_path(last_checkpoint)

    else:
        exp_replay = initialize_replay(env, rp_start, rp_size,
                                       frames_per_state)

    target.load_state_dict(model.state_dict())

    # scheduler = Scheduler(exp_frame, exp_initial, exp_final)
    optimizer = optimizer_constructor.type(
        model.parameters(),
        lr=optimizer_constructor.kwargs['lr'],
        alpha=optimizer_constructor.kwargs['alpha'],
        eps=optimizer_constructor.kwargs['eps'])

    frames_count = 1
    frames_per_episode = 1
    episodes_durations = []
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    current_state, _, _, _ = play_game(env, frames_per_state)
    print('Starting training...')

    count = 0

    while True:

        epsilon = scheduler.anneal_linear(frames_count)
        choice = random.uniform(0, 1)

        # epsilon greedy algorithm
        if choice <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])

        else:
            action = get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = play_game(env, frames_per_state,
                                              action[0][0])

        rewards_per_episode += reward
        reward = Tensor([reward])

        exp_replay.push(current_state, action, reward, curr_obs)

        current_state = curr_obs

        # sample a random mini-batch once the replay buffer is large enough
        if len(exp_replay) >= batch_size:
            obs_sample = exp_replay.sample(batch_size)
            # unpack the batch into states, actions, rewards and next_states
            batch = Experience(*zip(*obs_sample))

            # compute the DDQN targets and update the online network
            loss = ddqn_compute_y(batch, batch_size, model, target, gamma)
            optimizer.zero_grad()
            loss.backward()

            for param in model.parameters():
                param.grad.data.clamp_(-1, 1)

            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy())

        frames_count += 1
        frames_per_episode += frames_per_state

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            frames_per_episode = 1
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = 'Episodes {} to {} have an average reward of {} and total loss of {}'.format(
                    episodes_count - 99, episodes_count, avg_episode_reward,
                    sum(loss_per_epoch))
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update weights of target network for every TARGET_UPDATE_FREQ steps
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())
            # print('weights updated at frame no. ', frames_count)

        #Save weights every 250k frames
        if frames_count % 250000 == 0:
            save_dir = output_directory + envo + '_' + model_type + '/'
            util.make_sure_path_exists(save_dir)
            torch.save(model.state_dict(),
                       save_dir + 'weights_' + str(frames_count) + '.pth')

        #Print frame count for every 1000000 (one million) frames:
        if frames_count % 1000000 == 0:
            training_update = 'frame count: {}, episode count: {}, epsilon: {}'.format(
                frames_count, episodes_count, epsilon)
            print(training_update)
            logging.info(training_update)
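
ddqn_compute_y is not shown in this example; the Double DQN rule it presumably implements selects the next action with the online network and evaluates it with the target network. A minimal PyTorch sketch under that assumption (function name, tensor layout, and the terminal-state masking are guesses, not the source code):

import torch
import torch.nn.functional as F

def double_dqn_loss(states, actions, rewards, next_states, dones,
                    model, target, gamma):
    """Double DQN loss: the online net picks a*, the target net evaluates it.

    actions is expected as a (batch, 1) LongTensor; rewards and dones as
    (batch, 1) float tensors, with dones = 1.0 for terminal transitions.
    """
    q_sa = model(states).gather(1, actions)                  # Q(s, a)
    with torch.no_grad():
        next_actions = model(next_states).argmax(1, keepdim=True)
        next_q = target(next_states).gather(1, next_actions)
        y = rewards + gamma * next_q * (1.0 - dones)         # zero out terminal next states
    return F.smooth_l1_loss(q_sa, y)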
Example 8
    # Steps loop
    steps = 0
    while not environment_manager.done:

        if render:
            environment_manager.render()

        action = agent.select_action(state, policy_net)
        experience = environment_manager.take_action(action)
        # unpack the (state, action, next_state, reward) transition
        state, action, next_state, reward = experience
        steps += 1
        memory.push(Experience(state, action, next_state, reward))
        max_episode_reward += reward
        state = next_state

        if memory.can_provide_sample(batch_size):
            experiences_batch = memory.sample(batch_size)
            states = np.zeros((batch_size, environment_manager.final_reshape))
            next_states = np.zeros(
                (batch_size, environment_manager.final_reshape))
            actions, rewards = [], []

            # Prepare data batch
            for i in range(batch_size):
                states[i] = experiences_batch[i][0]
                actions.append(experiences_batch[i][1])
                next_states[i] = experiences_batch[i][2]