def main(args):
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    action_dim = env.action_space.n
    state_dim = env.observation_space.shape[0]

    dqn = Double_DQN_Cnn(args, state_dim, action_dim, device)
    dqn.model.load_state_dict(
        torch.load('./SaveModel/BreakoutDeterministic-v4_dqn_3200'))

    while True:
        state = env.reset()
        state = init_state(state)  # build the initial stacked-frame state
        while True:
            select = dqn.get_real_action(state)  # greedy action from the trained network
            next_state, reward, done, info = env.step(select)
            env.render()
            time.sleep(0.03)  # slow the loop down so the rendering is watchable
            next_state = preprocess(state, next_state)  # slide the new frame into the stack
            state = next_state
            if done:
                break
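# The demo above relies on init_state / preprocess helpers that are not shown
# here. A minimal sketch of what they might look like for Breakout, assuming a
# standard grayscale 84x84, 4-frame stack (shapes, ordering and resize values
# are assumptions, not the repository's actual implementation):
import cv2
import numpy as np

def _to_gray_84(obs):
    """Convert an RGB Atari frame to a normalised 84x84 grayscale array."""
    gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0

def init_state(obs):
    """Build the initial (1, 4, 84, 84) stack by repeating the first frame."""
    frame = _to_gray_84(obs)
    return np.repeat(frame[np.newaxis, np.newaxis, :, :], 4, axis=1)

def preprocess(state, obs):
    """Slide the newest frame into the stack and drop the oldest one."""
    frame = _to_gray_84(obs)[np.newaxis, np.newaxis, :, :]
    return np.append(frame, state[:, :3, :, :], axis=1)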
def main(args):
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    td3 = TD3(args, action_dim, max_action, state_dim, device)
    summary = tensorboardX.SummaryWriter('./log/{}_td3_{}'.format(args['env_name'], args['noise_type']))

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        state = utils.init_state(state)
        while True:
            # warm up the replay buffer with random actions before using the actor
            if timestep < args['random_action_timestep']:
                select = env.action_space.sample()
                action = utils.carRace_action_to_output(select)
            else:
                action = td3.get_action(state)
                select = utils.carRace_output_to_action(action)

            # frame skip: repeat the chosen action for 4 environment steps
            tmp_reward = 0
            for i in range(4):
                tmp_next_state, reward, done, info = env.step(select)
                tmp_reward += reward

            # keep only the last frame of the skip and slide it into the 4-frame stack
            tmp_next_state = utils.preprocess(tmp_next_state)
            tmp_next_state = tmp_next_state[np.newaxis, np.newaxis, :, :]
            next_state = np.append(tmp_next_state, state[:, :3, :, :], axis=1)
            # show_state(next_state)

            td3.save(state, action[0], tmp_reward, next_state, int(done))
            episode_reward += tmp_reward
            state = next_state.copy()
            timestep += 1

            if timestep > args['train_start_timestep']:
                if timestep % 2 == 0:  # train every other environment step
                    td3.train(summary, timestep)

            if done:
                print('episode: ', episode,
                      ' reward : %.3f' % (episode_reward),
                      ' timestep :', timestep)
                summary.add_scalar('reward/timestep', episode_reward, timestep)
                break

        if episode % args['save_freq'] == 0:
            if not os.path.exists('./SaveModel'):
                os.mkdir('./SaveModel')
            torch.save(td3.actor.state_dict(),
                       './SaveModel/{}_td3_{}_{}'.format(args['env_name'], args['noise_type'], episode))
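# A hypothetical sketch of the utils.carRace_action_to_output /
# utils.carRace_output_to_action conversions assumed by the training loop above.
# CarRacing expects [steer in [-1, 1], gas in [0, 1], brake in [0, 1]], while a
# tanh actor emits a (1, 3) array in [-1, 1]^3; the exact scaling is an assumption.
import numpy as np

def carRace_output_to_action(output):
    """Map a (1, 3) actor output in [-1, 1] to a CarRacing action vector."""
    out = np.asarray(output).reshape(-1)
    steer = out[0]                   # already in [-1, 1]
    gas = (out[1] + 1.0) / 2.0       # rescale to [0, 1]
    brake = (out[2] + 1.0) / 2.0     # rescale to [0, 1]
    return np.array([steer, gas, brake], dtype=np.float32)

def carRace_action_to_output(action):
    """Inverse mapping, used when a random env action is stored in the replay buffer."""
    steer, gas, brake = action
    return np.array([[steer, gas * 2.0 - 1.0, brake * 2.0 - 1.0]], dtype=np.float32)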
def run_episode(self):
    """ Game loop """
    # reset
    obs = self.env.reset()
    s = init_state(obs)
    R = 0  # total reward this episode
    self.debug_log = []

    while True:
        time.sleep(self.config['THREAD_DELAY'])  # yield

        if self.render:
            self.env.render()

        a = self.agent.act(s)
        obs, r, done, info = self.env.step(a)
        sp = update_state(s, obs)

        if done:  # terminal state
            sp = None

        self.agent.train(s, a, r, sp)

        if self.debug:
            self.debug_log.append([s, a, r, sp, done])

        s = sp
        R += r

        if done or self.stop_signal:
            Environment.scores.append(R)
            self.episode_number += 1

            # Static purge for now
            Environment.scores = Environment.scores[-100:]

            if self.debug:
                # Save logs
                # TODO: folder restructure
                save_pickle(
                    self.debug_log,
                    os.path.join(
                        'debug_logs',
                        "ENV{}_EPISODE{}".format(self.id, self.episode_number)))

            if self.render:
                # Demo mode
                print("ENV_{} INFO: total reward this episode: {}".format(
                    self.id, R))
            break
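# Hypothetical sketches of the update_state and save_pickle helpers used above;
# the frame layout in update_state and the on-disk format of save_pickle are
# assumptions rather than the project's actual code.
import os
import pickle
import numpy as np

def update_state(s, obs):
    """Drop the oldest entry of the stacked state and append the new observation
    (assumes obs is already in the per-frame format produced by init_state)."""
    return np.concatenate([s[1:], np.asarray(obs)[np.newaxis, ...]], axis=0)

def save_pickle(obj, path):
    """Pickle obj to path, creating the target directory if needed."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)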
def main(args):
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    td3 = TD3(args, action_dim, max_action, state_dim, device)
    trained_actor = torch.load(args['model_directory'])
    td3.actor.load_state_dict(trained_actor)  # evaluate a previously trained actor

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        state = utils.init_state(state)
        while True:
            action = td3.get_action(state)
            action = utils.carRace_output_to_action(action)

            # frame skip: repeat the chosen action for 4 environment steps
            tmp_reward = 0
            for i in range(4):
                tmp_next_state, reward, done, info = env.step(action)
                tmp_reward += reward
                env.render()

            # keep only the last frame of the skip and slide it into the 4-frame stack
            tmp_next_state = utils._preprocess(tmp_next_state)
            tmp_next_state = tmp_next_state[np.newaxis, np.newaxis, :, :]
            state = np.append(tmp_next_state, state[:, :3, :, :], axis=1)

            episode_reward += tmp_reward
            timestep += 1

            if done:
                print('episode: ', episode,
                      ' reward : %.3f' % (episode_reward),
                      ' timestep :', timestep)
                break
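# Example invocation of the evaluation script above. Only env_name,
# model_directory and max_episode are read directly in main(); the gym id and
# checkpoint path are hypothetical placeholders, and the TD3 class may expect
# further keys in args.
if __name__ == '__main__':
    args = {
        'env_name': 'CarRacing-v0',
        'model_directory': './SaveModel/your_td3_actor_checkpoint',  # hypothetical path
        'max_episode': 10,
    }
    main(args)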
def main(args):
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    action_dim = env.action_space.n
    state_dim = env.observation_space.shape[0]

    dqn = Double_DQN_Cnn(args, state_dim, action_dim, device)
    summary = tensorboardX.SummaryWriter('./log/{}_{}'.format(
        args['env_name'], 'double_dqn'))

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        state = init_state(state)
        while True:
            # act randomly until enough timesteps have been collected
            if args['random_action_timestep'] > timestep:
                select = env.action_space.sample()
            else:
                select = dqn.get_action(state)

            # frame skip: repeat the action for 4 steps, stacking intermediate frames;
            # the last frame is stacked after the loop
            tmp_state = state.copy()
            for i in range(4):
                next_state, reward, done, info = env.step(select)
                if i == 3:
                    break
                next_state = preprocess(tmp_state, next_state)
                tmp_state = next_state
                # env.render()
            next_state = preprocess(tmp_state, next_state)

            # note: only the reward from the final frame of the skip is stored
            dqn.save(state, select, reward, next_state, int(done))
            episode_reward += reward
            state = next_state
            timestep += 1

            if timestep % 10 == 0:
                dqn.update_target()

            # start training once the replay buffer holds more than BATCH_SIZE (64) samples
            if timestep > args['replay_start_size']:
                if timestep % args['skip'] == 0:
                    dqn.train()

            if done:
                print('episode: ', episode,
                      ' reward : %.3f' % (episode_reward),
                      ' timestep :', timestep,
                      ' epsilon :', dqn.epsilon)
                summary.add_scalar('reward/timestep', episode_reward, timestep)
                break

        if episode % args['save_freq'] == 0:
            if not os.path.exists('./SaveModel'):
                os.mkdir('./SaveModel')
            torch.save(
                dqn.model.state_dict(),
                './SaveModel/{}_{}_{}'.format(args['env_name'], 'dqn', episode))
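# Example of the argument dictionary consumed by the training loop above. The
# key names are the ones the function reads; the values are illustrative, and
# Double_DQN_Cnn may expect additional keys not listed here.
if __name__ == '__main__':
    args = {
        'env_name': 'BreakoutDeterministic-v4',
        'max_episode': 10000,
        'random_action_timestep': 10000,
        'replay_start_size': 10000,
        'skip': 4,
        'save_freq': 100,
    }
    main(args)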