Example No. 1
                  format(get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)


if __name__ == '__main__':
    env = create_atari_env(args.rom)
    # torch.manual_seed(SEED)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()
    # print (shared_model.conv1._parameters['weight'].data.is_cuda)
    optimizer = SharedAdam(shared_model.parameters(), lr=0.0001)
    optimizer.share_memory()

    if args.play:
        if os.path.isfile(args.play):
            print("=> loading checkpoint '{}'".format(args.play))
            checkpoint = torch.load(args.play)
            #            args.start_epoch = checkpoint['epoch']
            #            best_prec1 = checkpoint['best_prec1']
            shared_model.load_state_dict(checkpoint['state_dict'])
            #optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.play))
Example No. 2

if __name__ == "__main__":
    mp.set_start_method('spawn')
    args = get_args()
    args.save_dir = 'a3c-{}/'.format(args.env.lower())
    if args.test:
        args.processes = 1
        args.lr = 0
    args.num_actions = gym.make(args.env).action_space.n
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    torch.manual_seed(args.seed)
    shared_model = ActorCritic(num_actions=args.num_actions).share_memory()
    shared_optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)

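    # shared training statistics, stored as shared-memory tensors so every worker process can update them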
    info = {
        k: torch.DoubleTensor([0]).share_memory_()
        for k in ['run_epr', 'run_loss', 'episodes', 'frames']
    }
    info['frames'] += shared_model.try_load(args.save_dir) * 1e6
    if int(info['frames'].item()) == 0:
        printlog(args.save_dir, '', mode='w')

    processes = []
    for rank in range(args.processes):
        p = mp.Process(target=train,
                       args=(shared_model, shared_optimizer, rank, args, info))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()  # wait for every worker process to finish
Example No. 3
import torch as th
from utils import SharedAdam
from model import Worker, Net
import gym
import torch.multiprocessing as mp

env = gym.make('CartPole-v0')
n_action = env.action_space.n
n_state = env.observation_space.shape[0]

global_net = Net(n_state, n_action)
global_net.share_memory()
optA = SharedAdam(global_net.policy.parameters(), lr=1e-4, betas=(0.92, 0.999))
optC = SharedAdam(global_net.v.parameters(), lr=1e-4, betas=(0.92, 0.999))
workers = [Worker(global_net, optA, optC, str(i)) for i in range(8)]
# start all workers, then wait for them to finish
for w in workers:
    w.start()
for w in workers:
    w.join()
Example No. 4
def train(shared_model, shared_optimizer, weights, rank, args, info):
    env = gym.make(args.env)
    env.seed(args.seed + rank)

    torch.manual_seed(args.seed + rank)
    model = PNN(num_actions=args.num_actions)
    model.new_task()
    model.load(weights)
    model.freeze_column()
    model.new_task()
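    # note: the next line rebinds the shared_optimizer argument, so each worker
    # builds its own optimizer over the shared model's parameters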
    shared_optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)

    state = torch.tensor(prepro(env.reset()))

    start_time = last_disp_time = time.time()
    episode_length = 0
    episode_reward = 0
    episode_loss = 0
    done = True

    while info['frames'][0] <= 8e7 or args.test:
        model.load_state_dict(shared_model.state_dict())

        if done:
            hx = torch.zeros(1, 256)
        else:
            hx = hx.detach()
        values = []
        logps = []
        actions = []
        rewards = []

        for step in range(args.steps):
            episode_length += 1
            value, logit, hx = model((state.view(1, 1, 80, 80), hx))
            logp = F.log_softmax(logit, dim=-1)

            action = torch.exp(logp).multinomial(num_samples=1).data[0]
            state, reward, done, _ = env.step(action.numpy()[0])
            # if args.test:
            #     env.render()

            state = torch.tensor(prepro(state))
            episode_reward += reward
            reward = np.clip(reward, -1, 1)
            done = done or episode_length >= 1e4

            info['frames'].add_(1)
            num_frames = int(info['frames'].item())
            if num_frames % 4e6 == 0:
                torch.save(
                    shared_model.state_dict(), args.save_dir +
                    'model.{:.0f}.tar'.format(num_frames / 1e6))

            if done:
                info['episodes'] += 1
                if info['episodes'][0] == 1:
                    interp = 1
                else:
                    interp = 1 - args.horizon
                info['run_epr'].mul_(1 - interp).add_(interp * episode_reward)
                info['run_loss'].mul_(1 - interp).add_(interp * episode_loss)

            if rank == 0 and time.time() - last_disp_time > 60:
                elapsed = time.strftime("%Hh %Mm %Ss",
                                        time.gmtime(time.time() - start_time))
                printlog(
                    args.save_dir,
                    'time {}, episodes {:.0f}, frames {:.1f}M, mean episode_reward {:.2f}, run loss {:.2f}'
                    .format(elapsed, info['episodes'].item(), num_frames / 1e6,
                            info['run_epr'].item(), info['run_loss'].item()))
                last_disp_time = time.time()

            if done:
                episode_length, episode_reward, episode_loss = 0, 0, 0
                state = torch.tensor(prepro(env.reset()))

            values.append(value)
            logps.append(logp)
            actions.append(action)
            rewards.append(reward)

        if done:
            next_value = torch.zeros(1, 1)
        else:
            next_value = model((state.unsqueeze(0), hx))[0]
        values.append(next_value.detach())

        loss = cost_func(args, torch.cat(values), torch.cat(logps),
                         torch.cat(actions), np.asarray(rewards))
        episode_loss += loss.item()
        shared_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

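        # transfer this worker's gradients to the shared model so the shared
        # optimizer can apply them (asynchronous, Hogwild-style update)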
        for param, shared_param in zip(model.parameters(),
                                       shared_model.parameters()):
            if shared_param.grad is None:
                shared_param._grad = param.grad
        shared_optimizer.step()
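
Example No. 4 calls cost_func without showing it. The following is a minimal sketch of an A3C loss in the style this code appears to follow, assuming args provides gamma and tau and that rewards arrives as a NumPy array; the names and coefficients are illustrative, not the project's exact code.

import numpy as np
import torch
from scipy.signal import lfilter


def discount(x, gamma):
    # Discounted cumulative sum, computed right-to-left with an IIR filter.
    return lfilter([1], [1, -gamma], x[::-1])[::-1]


def cost_func(args, values, logps, actions, rewards):
    np_values = values.view(-1).data.numpy()

    # Generalized advantage estimation for the policy-gradient term.
    delta_t = np.asarray(rewards) + args.gamma * np_values[1:] - np_values[:-1]
    logpys = logps.gather(1, actions.view(-1, 1))
    gen_adv_est = discount(delta_t, args.gamma * args.tau)
    policy_loss = -(logpys.view(-1) * torch.FloatTensor(gen_adv_est.copy())).sum()

    # n-step discounted return as the value target, bootstrapped from the last value.
    rewards[-1] += args.gamma * np_values[-1]
    discounted_r = discount(np.asarray(rewards), args.gamma)
    discounted_r = torch.tensor(discounted_r.copy(), dtype=torch.float32)
    value_loss = 0.5 * (discounted_r - values[:-1, 0]).pow(2).sum()

    # Entropy bonus encourages exploration.
    entropy_loss = (-logps * torch.exp(logps)).sum()
    return policy_loss + 0.5 * value_loss - 0.01 * entropy_loss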
Example No. 5
File: main.py  Project: JIElite/A3C
import gym
import torch.multiprocessing as mp

from agent import ActorCritic
from run_loop import run_loop
# SeparateNetwork, SharedAdam and init_weights are provided by this project's
# own modules (their import lines are not shown in the snippet)

env = gym.make('CartPole-v0')
N_FEATURES = env.observation_space.shape[0]
LR = 5e-4
N_ACTIONS = env.action_space.n
N_STEPS = 8
NUM_WORKERS = 8
MAX_STEPS = 30000

global_net = SeparateNetwork(N_FEATURES, N_ACTIONS)
global_net.share_memory()
init_weights(global_net)

optimizer = SharedAdam(global_net.parameters(), lr=LR)
optimizer.share_memory()

# Shared Data
eps_counter = mp.Value('i', 0)

# Hogwild! style update
worker_list = []
for i in range(NUM_WORKERS):
    agent = ActorCritic(
        wid=i,
        shared_model=global_net,
        model=SeparateNetwork(N_FEATURES, N_ACTIONS),
        optimizer=optimizer,
        n_steps=N_STEPS,
    )