Code Example #1
File: evaluate.py  Project: jinPrelude/rl_algorithms
def main(args):
    env = gym.make(args['env_name'])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.n
    state_dim = env.observation_space.shape[0]

    # Build the Double DQN agent and load a trained checkpoint (hard-coded path).
    dqn = Double_DQN_Cnn(args, state_dim, action_dim, device)
    dqn.model.load_state_dict(
        torch.load('./SaveModel/BreakoutDeterministic-v4_dqn_3200'))

    # Evaluation loop: play episodes indefinitely with the trained policy, rendering each step.
    while True:
        state = env.reset()
        state = init_state(state)
        while True:
            select = dqn.get_real_action(state)
            next_state, reward, done, info = env.step(select)
            env.render()
            time.sleep(0.03)

            # Stack the newest frame onto the current state (see the sketch after this example).
            next_state = preprocess(state, next_state)
            state = next_state

            if done:
                break
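
Note: init_state and preprocess are imported from elsewhere in the file and are not part of this excerpt. Below is a minimal frame-stacking sketch of what they might do, inferred from how they are called here and in Code Example #2; it is an assumption, not the project's actual implementation.

import cv2
import numpy as np

def _to_gray84(frame):
    # Grayscale and resize a raw RGB frame to 84x84 (a common, assumed choice).
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)

def init_state(obs):
    # Start the episode with the first frame repeated 4 times: shape (1, 4, 84, 84).
    frame = _to_gray84(obs)
    return np.repeat(frame[np.newaxis, np.newaxis, :, :], 4, axis=1)

def preprocess(state, next_frame):
    # Push the newest frame onto the stack and drop the oldest one.
    frame = _to_gray84(next_frame)[np.newaxis, np.newaxis, :, :]
    return np.append(frame, state[:, :3, :, :], axis=1)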
Code Example #2
File: td3_main.py  Project: jinPrelude/rl_algorithms
def main(args):
    env = gym.make(args['env_name'])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]


    # Build the TD3 agent and a TensorBoard writer for training logs.
    td3 = TD3(args, action_dim, max_action, state_dim, device)
    summary = tensorboardX.SummaryWriter('./log/{}_td3_{}'.format(args['env_name'], args['noise_type']))

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        state = utils.init_state(state)

        while True:
            # Warm-up: sample random actions until 'random_action_timestep' steps,
            # then switch to the TD3 policy.
            if timestep < args['random_action_timestep']:
                select = env.action_space.sample()
                action = utils.carRace_action_to_output(select)
            else:
                action = td3.get_action(state)
                select = utils.carRace_output_to_action(action)

            # Frame skip: repeat the chosen action for 4 frames and sum the rewards.
            tmp_reward = 0
            for i in range(4):
                tmp_next_state, reward, done, info = env.step(select)
                tmp_reward += reward
                if done:
                    break  # stop repeating the action once the episode ends

            # Stack the newest preprocessed frame with the previous 3 frames.
            tmp_next_state = utils.preprocess(tmp_next_state)
            tmp_next_state = tmp_next_state[np.newaxis, np.newaxis, :, :]
            next_state = np.append(tmp_next_state, state[:, :3, :, :], axis=1)

            # show_state(next_state)
            # Store the transition in the replay buffer.
            td3.save(state, action[0], tmp_reward, next_state, int(done))
            episode_reward += tmp_reward
            state = next_state.copy()
            timestep += 1

            # Train every other timestep once enough transitions have been collected.
            if timestep > args['train_start_timestep']:
                if timestep % 2 == 0:
                    td3.train(summary, timestep)

            if done:
                print('episode: ', episode, '   reward : %.3f'%(episode_reward), '    timestep :', timestep)
                summary.add_scalar('reward/timestep', episode_reward, timestep)

                break

        # Periodically checkpoint the actor network.
        if episode % args['save_freq'] == 0:
            if not os.path.exists('./SaveModel') :
                os.mkdir('./SaveModel')
            torch.save(td3.actor.state_dict(), './SaveModel/{}_td3_{}_{}'.format(args['env_name'], args['noise_type'], episode))
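
main() here is driven by a plain dict of hyperparameters. The keys below are the ones read in this excerpt (the TD3 constructor likely expects additional keys that are not visible here); the values are illustrative placeholders, not the project's actual settings.

args = {
    'env_name': 'CarRacing-v0',        # assumed from the carRace_* helpers
    'noise_type': 'normal',            # only used in log/checkpoint names in this excerpt
    'max_episode': 1000,               # placeholder
    'random_action_timestep': 10000,   # act randomly until this many steps
    'train_start_timestep': 10000,     # begin td3.train() calls after this many steps
    'save_freq': 10,                   # checkpoint the actor every N episodes
}
main(args)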
Code Example #3
    def run_episode(self):
        """ Game loop """
        # reset
        obs = self.env.reset()
        s = init_state(obs)
        R = 0  # total reward this episode
        self.debug_log = []

        while True:
            time.sleep(self.config['THREAD_DELAY'])  # yield

            if self.render:
                self.env.render()

            a = self.agent.act(s)
            obs, r, done, info = self.env.step(a)
            sp = update_state(s, obs)

            if done:  # terminal state
                sp = None

            self.agent.train(s, a, r, sp)
            if self.debug:
                self.debug_log.append([s, a, r, sp, done])

            s = sp
            R += r

            if done or self.stop_signal:
                Environment.scores.append(R)
                self.episode_number += 1
                # Keep only the most recent 100 episode scores.
                Environment.scores = Environment.scores[-100:]

                if self.debug:  # Save logs
                    # TODO: folder restructure
                    save_pickle(
                        self.debug_log,
                        os.path.join(
                            'debug_logs',
                            "ENV{}_EPISODE{}".format(self.id,
                                                     self.episode_number)))
                if self.render:  # Demo mode
                    print("ENV_{} INFO: total reward this episode: {}".format(
                        self.id, R))
                break
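
run_episode only assumes an environment plus an agent that exposes act(state) and train(s, a, r, sp), where sp is None on the terminal step. The stub below is a hypothetical stand-in that satisfies this interface, for illustration only; the project's actual agent is a learning policy and is not shown here.

import random

class RandomAgent:
    # Hypothetical stand-in illustrating the interface run_episode expects.
    def __init__(self, n_actions):
        self.n_actions = n_actions

    def act(self, state):
        # A real agent would query its policy here.
        return random.randrange(self.n_actions)

    def train(self, s, a, r, sp):
        # Receives one transition per step; sp is None once the episode has ended.
        pass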
Code Example #4
File: evaluate.py  Project: jinPrelude/rl_algorithms
def main(args):
    env = gym.make(args['env_name'])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    # Build the TD3 agent and load the trained actor weights for evaluation.
    td3 = TD3(args, action_dim, max_action, state_dim, device)
    trained_actor = torch.load(args['model_directory'])
    td3.actor.load_state_dict(trained_actor)

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        state = utils.init_state(state)

        while True:
            action = td3.get_action(state)
            action = utils.carRace_output_to_action(action)

            # Frame skip: repeat the action for 4 frames and sum the rewards.
            tmp_reward = 0
            for i in range(4):
                tmp_next_state, reward, done, info = env.step(action)
                tmp_reward += reward
                if done:
                    break  # stop repeating the action once the episode ends
            env.render()

            # Stack the newest preprocessed frame with the previous 3 frames.
            tmp_next_state = utils._preprocess(tmp_next_state)
            tmp_next_state = tmp_next_state[np.newaxis, np.newaxis, :, :]
            state = np.append(tmp_next_state, state[:, :3, :, :], axis=1)
            episode_reward += tmp_reward
            timestep += 1

            if done:
                print('episode: ', episode,
                      '   reward : %.3f' % (episode_reward), '    timestep :',
                      timestep)
                break
Code Example #5
File: main.py  Project: jinPrelude/rl_algorithms
def main(args):
    env = gym.make(args['env_name'])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.n
    state_dim = env.observation_space.shape[0]

    # Build the Double DQN agent and a TensorBoard writer for training logs.
    dqn = Double_DQN_Cnn(args, state_dim, action_dim, device)
    summary = tensorboardX.SummaryWriter('./log/{}_{}'.format(
        args['env_name'], 'double_dqn'))

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        state = init_state(state)

        while True:
            # Warm-up: sample random actions until 'random_action_timestep' steps.
            if args['random_action_timestep'] > timestep:
                select = env.action_space.sample()
            else:
                select = dqn.get_action(state)

            # Frame skip: repeat the action for 4 frames, stacking each new frame
            # onto the running state.
            tmp_state = state.copy()
            for i in range(4):
                next_state, reward, done, info = env.step(select)
                next_state = preprocess(tmp_state, next_state)
                tmp_state = next_state
                if done:
                    break  # stop repeating the action once the episode ends
            # env.render()

            # Store the transition; note that only the reward from the final
            # skipped frame is saved and accumulated here.
            dqn.save(state, select, reward, next_state, int(done))
            episode_reward += reward
            state = next_state
            timestep += 1

            # Sync the target network with the online network every 10 timesteps.
            if timestep % 10 == 0:
                dqn.update_target()

            # Start training once more than 'replay_start_size' timesteps have
            # elapsed (i.e. above BATCH_SIZE, 64), then train every 'skip' timesteps.
            if timestep > args['replay_start_size']:
                if timestep % args['skip'] == 0:
                    dqn.train()

            if done:
                if episode % 1 == 0:
                    print('episode: ', episode,
                          '   reward : %.3f' % (episode_reward),
                          '    timestep :', timestep, '  epsilon :',
                          dqn.epsilon)

                summary.add_scalar('reward/timestep', episode_reward, timestep)
                break

        # Periodically checkpoint the Q-network.
        if episode % args['save_freq'] == 0:
            if not os.path.exists('./SaveModel'):
                os.mkdir('./SaveModel')
            torch.save(
                dqn.model.state_dict(),
                './SaveModel/{}_{}_{}'.format(args['env_name'], 'dqn',
                                              episode))
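
As in Code Example #2, this main() takes a dict of hyperparameters. The keys below are the ones read in this excerpt (Double_DQN_Cnn itself may expect more); the values are illustrative placeholders, except env_name, which matches the checkpoint name loaded in Code Example #1, and replay_start_size, which the original comment ties to a batch size of 64.

args = {
    'env_name': 'BreakoutDeterministic-v4',  # matches the checkpoint in Code Example #1
    'max_episode': 10000,              # placeholder
    'random_action_timestep': 10000,   # act randomly until this many steps
    'replay_start_size': 64,           # start training once past the batch size
    'skip': 4,                         # call dqn.train() every 'skip' timesteps
    'save_freq': 100,                  # checkpoint every N episodes
}
main(args)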