def main(args):
    """Train a SAC agent on a gym environment and periodically checkpoint the actor.

    Args:
        args: dict-like config with keys 'env_name', 'noise_type',
              'max_episode', 'batch_size', 'save_freq'.

    Side effects: writes TensorBoard scalars under ./log/ and actor
    state_dicts under ./SaveModel/.
    """
    env = gym.make(args['env_name'])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]
    sac = SAC(args, action_dim, max_action, state_dim, device)
    summary = tensorboardX.SummaryWriter(
        './log/{}_sac_{}'.format(args['env_name'], args['noise_type']))
    timestep = 0
    start_time = time.time()  # NOTE(review): recorded but never read below
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()
        while True:
            action = sac.get_action(state)
            # Old gym API: step returns (obs, reward, done, info).
            next_state, reward, done, info = env.step(action)
            # Store the transition in the replay buffer.
            sac.save(state, action, reward, next_state, int(done))
            episode_reward += reward
            state = next_state
            timestep += 1
            # Start training once the buffer holds more than one batch
            # (batch_size, e.g. 64) of transitions.
            if sac.memory_counter > args['batch_size']:
                sac.train()
            if done:
                print('episode: ', episode,
                      ' reward : %.3f' % (episode_reward),
                      ' timestep :', timestep)
                summary.add_scalar('reward/episode', episode_reward, episode)
                break
        if episode % args['save_freq'] == 0:
            # exist_ok avoids the check-then-create race of the original
            # os.path.exists() + os.mkdir() pair.
            os.makedirs('./SaveModel', exist_ok=True)
            torch.save(sac.actor.state_dict(),
                       './SaveModel/{}_sac_{}_{}'.format(
                           args['env_name'], args['noise_type'], episode))
from sac import SAC import itertools import torch ACTIONS = [x for x in itertools.product([-1, 0, 1], [-1, 0, 1])] n_episodes = 1000 n_steps = 10000 rews = [] env = CarEnv() model = SAC(9, 19, 0.7, 0.45, 0.001) for e in range(n_episodes): obs, _ = env.reset() episode_reward = 0 for s in range(n_steps): act_idx, _, _ = model.get_action( torch.tensor(obs, device=torch.device('cuda'))) act = ACTIONS[act_idx] obs_, rew = env.step(act) episode_reward += rew transition = (torch.as_tensor(obs, dtype=torch.double, device=torch.device('cuda')), torch.as_tensor(ACTIONS.index(act), dtype=torch.long, device=torch.device('cuda')), torch.as_tensor(rew, dtype=torch.double, device=torch.device('cuda')), torch.as_tensor(obs_, dtype=torch.double, device=torch.device('cuda')),