            format(get_elapsed_time_str(), reward_sum, episode_length))
        # reset episode statistics and wait before the next evaluation run
        reward_sum = 0
        episode_length = 0
        actions.clear()
        state = env.reset()
        time.sleep(60)
        state = torch.from_numpy(state)


if __name__ == '__main__':
    env = create_atari_env(args.rom)
    # torch.manual_seed(SEED)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()
    # print(shared_model.conv1._parameters['weight'].data.is_cuda)

    optimizer = SharedAdam(shared_model.parameters(), lr=0.0001)
    optimizer.share_memory()

    if args.play:
        if os.path.isfile(args.play):
            print("=> loading checkpoint '{}'".format(args.play))
            checkpoint = torch.load(args.play)
            # args.start_epoch = checkpoint['epoch']
            # best_prec1 = checkpoint['best_prec1']
            shared_model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.play, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.play))
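# The entry points above and below all build a SharedAdam optimizer and call
# share_memory() on it. The block below is a minimal sketch of how such an
# optimizer is commonly written (an assumption -- the projects' own utils modules
# may differ): Adam's per-parameter state is allocated eagerly and moved into
# shared memory, so every worker process updates the same moment estimates.
import math

import torch


class SharedAdam(torch.optim.Adam):
    """Adam whose per-parameter state lives in shared memory (illustrative sketch)."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # allocate state before the workers are forked/spawned
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # a plain Adam update written against the shared state created above
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss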
if __name__ == "__main__":
    mp.set_start_method('spawn')

    args = get_args()
    args.save_dir = 'a3c-{}/'.format(args.env.lower())
    if args.test:
        args.processes = 1
        args.lr = 0  # no learning during a test run
    args.num_actions = gym.make(args.env).action_space.n
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    torch.manual_seed(args.seed)
    shared_model = ActorCritic(num_actions=args.num_actions).share_memory()
    shared_optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)

    # running statistics shared across worker processes
    info = {k: torch.DoubleTensor([0]).share_memory_()
            for k in ['run_epr', 'run_loss', 'episodes', 'frames']}
    info['frames'] += shared_model.try_load(args.save_dir) * 1e6
    if int(info['frames'].item()) == 0:
        printlog(args.save_dir, '', mode='w')  # start a fresh log file

    processes = []
    for rank in range(args.processes):
        p = mp.Process(target=train, args=(shared_model, shared_optimizer, rank, args, info))
        p.start()
        processes.append(p)
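# The launcher above reports progress through printlog() and spawns one training
# process per worker. A hedged sketch of printlog follows (assumption: it echoes
# the message to stdout and appends it to a log.txt inside save_dir; the real
# helper may differ). The usual follow-up to such a launcher is to join the
# workers, e.g. `for p in processes: p.join()`.
def printlog(save_dir, s, end='\n', mode='a'):
    print(s, end=end)                      # mirror the message on stdout
    with open(save_dir + 'log.txt', mode) as f:
        f.write(s + '\n')                  # and persist it next to the checkpoints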
import torch as th
from utils import SharedAdam
from model import Worker, Net
import gym
import torch.multiprocessing as mp

env = gym.make('CartPole-v0')
n_action = env.action_space.n
n_state = env.observation_space.shape[0]

global_net = Net(n_state, n_action)
global_net.share_memory()
optA = SharedAdam(global_net.policy.parameters(), lr=1e-4, betas=(0.92, 0.999))
optC = SharedAdam(global_net.v.parameters(), lr=1e-4, betas=(0.92, 0.999))

workers = [Worker(global_net, optA, optC, str(i)) for i in range(8)]
[w.start() for w in workers]
[w.join() for w in workers]
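# The driver above optimizes global_net.policy and global_net.v with separate
# optimizers, so Net exposes an actor head and a critic head as sub-modules.
# Below is a minimal sketch of such a network (an assumption -- the real model.Net
# and Worker may be structured differently).
import torch.nn as nn


class Net(nn.Module):
    def __init__(self, n_state, n_action, n_hidden=128):
        super(Net, self).__init__()
        self.policy = nn.Sequential(            # actor: state -> action logits
            nn.Linear(n_state, n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, n_action))
        self.v = nn.Sequential(                 # critic: state -> scalar value
            nn.Linear(n_state, n_hidden), nn.ReLU(),
            nn.Linear(n_hidden, 1))

    def forward(self, s):
        return self.policy(s), self.v(s)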
def train(shared_model, shared_optimizer, weights, rank, args, info):
    env = gym.make(args.env)
    env.seed(args.seed + rank)
    torch.manual_seed(args.seed + rank)

    # progressive net: load a pretrained column, freeze it, then add a fresh column
    model = PNN(num_actions=args.num_actions)
    model.new_task()
    model.load(weights)
    model.freeze_column()
    model.new_task()
    shared_optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)

    state = torch.tensor(prepro(env.reset()))
    start_time = last_disp_time = time.time()
    episode_length, episode_reward, episode_loss = 0, 0, 0
    done = True

    while info['frames'][0] <= 8e7 or args.test:
        model.load_state_dict(shared_model.state_dict())  # sync with the shared weights

        hx = torch.zeros(1, 256) if done else hx.detach()  # reset recurrent state at episode end
        values, logps, actions, rewards = [], [], [], []

        for step in range(args.steps):
            episode_length += 1
            value, logit, hx = model((state.view(1, 1, 80, 80), hx))
            logp = F.log_softmax(logit, dim=-1)

            action = torch.exp(logp).multinomial(num_samples=1).data[0]
            state, reward, done, _ = env.step(action.numpy()[0])
            # if args.test:
            #     env.render()

            state = torch.tensor(prepro(state))
            episode_reward += reward
            reward = np.clip(reward, -1, 1)
            done = done or episode_length >= 1e4

            info['frames'].add_(1)
            num_frames = int(info['frames'].item())
            if num_frames % 4e6 == 0:  # periodic checkpoint every 4M frames
                torch.save(shared_model.state_dict(),
                           args.save_dir + 'model.{:.0f}.tar'.format(num_frames / 1e6))

            if done:  # update the shared running statistics
                info['episodes'] += 1
                interp = 1 if info['episodes'][0] == 1 else 1 - args.horizon
                info['run_epr'].mul_(1 - interp).add_(interp * episode_reward)
                info['run_loss'].mul_(1 - interp).add_(interp * episode_loss)

            if rank == 0 and time.time() - last_disp_time > 60:  # print progress once a minute
                elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                printlog(args.save_dir,
                         'time {}, episodes {:.0f}, frames {:.1f}M, mean episode_reward {:.2f}, run loss {:.2f}'
                         .format(elapsed, info['episodes'].item(), num_frames / 1e6,
                                 info['run_epr'].item(), info['run_loss'].item()))
                last_disp_time = time.time()

            if done:  # start a new episode
                episode_length, episode_reward, episode_loss = 0, 0, 0
                state = torch.tensor(prepro(env.reset()))

            values.append(value)
            logps.append(logp)
            actions.append(action)
            rewards.append(reward)

        # bootstrap the return from the value of the last state (zero if terminal)
        next_value = torch.zeros(1, 1) if done else model((state.unsqueeze(0), hx))[0]
        values.append(next_value.detach())

        loss = cost_func(args, torch.cat(values), torch.cat(logps), torch.cat(actions),
                         np.asarray(rewards))
        episode_loss += loss.item()

        shared_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        # copy local gradients onto the shared model, then take a shared Adam step
        for param, shared_param in zip(model.parameters(), shared_model.parameters()):
            if shared_param.grad is None:
                shared_param._grad = param.grad
        shared_optimizer.step()
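# The loss above comes from cost_func, which is not shown in this excerpt. The
# sketch below is a hedged reconstruction of a standard A3C objective matching the
# call signature used above: a policy-gradient term weighted by generalized
# advantage estimates, a value-regression term, and an entropy bonus. args.gamma,
# args.tau and the 0.5 / 0.01 weights are assumptions, not the project's values.
import numpy as np
import torch
from scipy.signal import lfilter


def discount(x, gamma):
    # discounted cumulative sum, computed right to left
    return lfilter([1], [1, -gamma], x[::-1])[::-1]


def cost_func(args, values, logps, actions, rewards):
    np_values = values.view(-1).data.numpy()

    # policy term: log-prob of the taken actions weighted by GAE advantages
    delta_t = np.asarray(rewards) + args.gamma * np_values[1:] - np_values[:-1]
    advantages = discount(delta_t, args.gamma * args.tau)
    logpys = logps.gather(1, actions.view(-1, 1))
    policy_loss = -(logpys.view(-1) * torch.FloatTensor(advantages.copy())).sum()

    # value term: regress predicted values onto bootstrapped discounted returns
    rewards = np.asarray(rewards, dtype=np.float64)
    rewards[-1] += args.gamma * np_values[-1]
    discounted_r = torch.FloatTensor(discount(rewards, args.gamma).copy())
    value_loss = 0.5 * (discounted_r - values[:-1].view(-1)).pow(2).sum()

    # entropy bonus keeps the policy from collapsing too early
    entropy_loss = (-logps * torch.exp(logps)).sum()
    return policy_loss + 0.5 * value_loss - 0.01 * entropy_loss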
import gym
import torch.multiprocessing as mp

from agent import ActorCritic
from run_loop import run_loop
# SeparateNetwork, SharedAdam and init_weights are assumed to come from the
# project's own modules; their imports are not part of this excerpt.

env = gym.make('CartPole-v0')
N_FEATURES = env.observation_space.shape[0]
N_ACTIONS = env.action_space.n

LR = 5e-4
N_STEPS = 8
NUM_WORKERS = 8
MAX_STEPS = 30000

global_net = SeparateNetwork(N_FEATURES, N_ACTIONS)
global_net.share_memory()
init_weights(global_net)
optimizer = SharedAdam(global_net.parameters(), lr=LR)
optimizer.share_memory()

# Shared data
eps_counter = mp.Value('i', 0)

# Hogwild!-style update: every worker writes gradients into the shared model
worker_list = []
for i in range(NUM_WORKERS):
    agent = ActorCritic(
        wid=i,
        shared_model=global_net,
        model=SeparateNetwork(N_FEATURES, N_ACTIONS),
        optimizer=optimizer,
        n_steps=N_STEPS,
    )
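# init_weights is referenced above but not shown in this excerpt. A common choice
# for small actor-critic MLPs is orthogonal initialization of the linear layers;
# the sketch below is an assumption about its behaviour, not the project's code.
import torch.nn as nn


def init_weights(net):
    for m in net.modules():
        if isinstance(m, nn.Linear):
            nn.init.orthogonal_(m.weight, gain=1.0)   # orthogonal weight matrices
            nn.init.zeros_(m.bias)                    # zero biases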