def worker(worker_id, master_end, worker_end):
    master_end.close()  # Forbid worker to use the master end for messaging
    env = retro.make("StreetFighterIISpecialChampionEdition-Genesis", state='rm-easy')
    env = Discretizer(env)
    env = wrap_deepmind(env, scale=True)
    env.seed(worker_id)
    while True:
        cmd, data = worker_end.recv()
        if cmd == 'step':
            ob, reward, done, info = env.step(data)
            if done:
                ob = env.reset()
            worker_end.send((ob, reward, done, info))
        elif cmd == 'reset':
            ob = env.reset()
            worker_end.send(ob)
        elif cmd == 'close':
            worker_end.close()
            break
        elif cmd == 'get_total_rewards':
            episode_rewards = env.get_total_rewards()
            worker_end.send(episode_rewards)
        else:
            raise NotImplementedError
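# A minimal master-side sketch of how a worker like the one above is usually
# driven over a multiprocessing Pipe. The Process/Pipe wiring here is an
# assumption for illustration; only the 'reset'/'step'/'close' command
# protocol comes from the worker itself.
import multiprocessing as mp

if __name__ == '__main__':
    master_end, worker_end = mp.Pipe()
    proc = mp.Process(target=worker, args=(0, master_end, worker_end))
    proc.start()
    worker_end.close()  # the master only messages through its own end

    master_end.send(('reset', None))
    ob = master_end.recv()
    master_end.send(('step', 0))  # an index into the Discretizer's action set
    ob, reward, done, info = master_end.recv()
    master_end.send(('close', None))
    proc.join()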
def make_atari_env(name, seed):
    from gym.envs.atari.atari_env import AtariEnv
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = monitor(env, name)
    env = wrap_deepmind(env)
    env.seed(seed)
    return env
def play_game(env=None, agent=None, skipframe=4, th=0, maxstep=5000,
              render=False, memory=None):
    # Build the defaults inside the body: default arguments are evaluated once
    # at import time, so an env or ReplayMemory in the signature would be
    # created eagerly and shared across every call.
    if env is None:
        env = wrap_deepmind(gym.make("Pong-v0"), frame_stack=True)
    if memory is None:
        memory = ReplayMemory(50000)
    cum_reward = 0.0
    render_frames = []
    state = env.reset()
    for i in range(maxstep):
        # take action:
        action = agent(state, th=th)
        reward = 0
        for _ in range(skipframe):
            next_state, r, ended, info = env.step(action)
            reward += r
            if ended:
                break
        cum_reward += float(reward)
        # push to replay buffer:
        memory.push(state, action, next_state, reward, ended)
        state = next_state
        if render:
            render_frames.append(
                torch.from_numpy(env.render(mode="rgb_array")).unsqueeze(0))
        if ended:
            break
    out = {'cum_reward': cum_reward, 'steps': i}
    if render:
        out['frames'] = torch.cat(render_frames).permute(3, 0, 1, 2).unsqueeze(0)
    return out
def env_fn():
    env = gym.make('{}NoFrameskip-v4'.format(args.env_name))
    env.seed(args.seed + rank)
    env = Monitor(
        env, osp.join(args.log_dir, "{}.monitor.json".format(rank)))
    gym.logger.setLevel(logging.WARN)
    return wrap_deepmind(env, num_skips=args.num_skips)
def train_model(num_frames):
    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env, episode_life=True, frame_stack=True)
    train_results = results.results(globals())
    cumulative_frames = 0
    best_score = -50
    games = 0
    full_loss = []
    rewards = []
    while True:
        state = env.reset()
        done = False
        cum_reward = 0
        cum_loss = []
        while not done:
            action = select_action(
                torch.tensor(np.array(state).reshape(-1, 4, HEIGHT, WIDTH)).to(device),
                cumulative_frames)
            next_state, reward, done, _ = env.step(action)
            # Store the done flag as the last element (not the reward twice)
            memory.add(state, action, reward, next_state, done)
            state = next_state
            if cumulative_frames % TRAIN_FREQUENCY == 0 and cumulative_frames > LEARNING_STARTS:
                loss = optimize_model(cumulative_frames)
                cum_loss.append(loss)
            cum_reward += reward
            cumulative_frames += 1
            if cumulative_frames % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())
        if best_score < cum_reward:
            best_score = cum_reward
        if len(cum_loss) == 0:
            full_loss.append(0)
        else:
            full_loss.append(np.mean(cum_loss))
        rewards.append(cum_reward)
        games += 1
        if games % 10 == 0:
            print("=============================================")
            print("Game: {} | Frame {}".format(games, cumulative_frames))
            print("Final reward: {}".format(cum_reward))
            print("Epsilon after: {}".format(EPSILON))
            print("Best High Score: {}".format(best_score))
            print("Avg Loss Last 100 games: {}".format(np.mean(full_loss[-100:])))
            print("Avg Reward Last 100 games: {}".format(np.mean(rewards[-100:])))
        train_results.record(cumulative_frames, games, EPSILON, cum_reward, full_loss[-1])
        if np.mean(rewards[-100:]) >= 18 and cumulative_frames > LEARNING_STARTS:
            break
    torch.save(target_net.state_dict(), PATH)
    train_results.close()
def make_atari_env(name, seed):
    from gym.wrappers.monitor import Monitor
    from gym.envs.atari.atari_env import AtariEnv
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = Monitor(env, 'videos/', force=True, video_callable=lambda e: False)
    env = wrappers.wrap_deepmind(env)
    env.seed(seed)
    return env
def make_atari_env(name, history_len):
    from gym.envs.atari.atari_env import AtariEnv
    from gym.wrappers.monitor import Monitor
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = Monitor(env, 'videos/', force=True, video_callable=lambda e: False)
    env = wrappers.wrap_deepmind(env)
    env = wrappers.HistoryWrapper(env, history_len)
    env.seed(utils.random_seed())
    return env
def create_env(num):
    # Split actors across fight states: actors with num % 4 in {0, 1} get the
    # 'rm' state, the rest get the easier 'rm-easy' state.
    if num % 4 == 0 or num % 4 == 1:
        state = 'rm'
    else:
        state = 'rm-easy'
    env = retro.make("StreetFighterIISpecialChampionEdition-Genesis", state=state)
    env = Discretizer(env)
    env = wrap_deepmind(env, scale=True)
    return env
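# Discretizer is not defined in these snippets. A common implementation,
# modeled on the gym-retro examples, maps a small list of button combos onto a
# Discrete action space; the combos below are placeholders for illustration,
# not the ones the Street Fighter agent actually used.
import gym
import numpy as np

class Discretizer(gym.ActionWrapper):
    """Turn a retro MultiBinary button space into Discrete button combos."""

    def __init__(self, env, combos=(['LEFT'], ['RIGHT'], ['DOWN'], ['B'], ['A'])):
        super(Discretizer, self).__init__(env)
        buttons = env.unwrapped.buttons  # names of the console's buttons
        self._actions = []
        for combo in combos:
            arr = np.array([False] * env.action_space.n)
            for button in combo:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, act):
        # Decode a discrete action index back into a button array
        return self._actions[act].copy()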
def create_super_mario_env():
    import gym
    from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
    import gym_super_mario_bros
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
    env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    # env = wrappers.MaxAndSkipEnv(env, skip=4)
    env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False,
                                 frame_stack=True, scale=True)
    return env
def create_env(config):
    env_name = config.env
    env = gym.make(env_name)
    expt_dir = config.log_path + '/monitor'
    env = gym.wrappers.Monitor(env, expt_dir, force=True, video_callable=False)
    env = wrap_deepmind(env, config.episode_life, config.preprocess,
                        config.max_and_skip, config.clip_rewards,
                        config.no_op_reset, config.scale)
    if config.num_actors > 1:
        env = ParallelEnv(num_processes=config.num_actors, env=env)
    return env
def inference(episodes, model, env_name):
    env = make_atari(env_name)
    env = wrap_deepmind(env, episode_life=True, frame_stack=True)
    for _ in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            time.sleep(0.05)
            env.render()
            observation = torch.tensor(
                np.array(observation).reshape(-1, 4, HEIGHT, WIDTH)).to(device)
            with torch.no_grad():
                action = model(observation).max(1)[1].item()
            observation, reward, done, _ = env.step(action)
            if reward != 0:
                print(reward)
# run python -i test.py for testing stuff in shell
import torch
import numpy as np
import gym
from wrappers import make_atari, wrap_deepmind
from utils import LinearSchedule, Replay

env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v4'))
state = env.reset()
state = np.array(state)
r = Replay(50, 3, False)
for i in range(100):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    r.add(state, action, reward, next_state, done)
    state = next_state
s, a, r, ns, d = r.sample_tensor()
# The snippet begins mid-`push`; the class header, __init__, and the first
# lines of push below are a reconstruction consistent with how `sample` and
# `ReplayMemory(100000)` are used.
from collections import deque

class ReplayMemory(object):
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Add a leading batch axis so sample() can np.concatenate the states
        state = np.expand_dims(state, 0)
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(
            *random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def __len__(self):
        return len(self.buffer)


env = make_atari("PongNoFrameskip-v4")
env = wrap_deepmind(env, frame_stack=True, pytorch_img=True)
memory = ReplayMemory(100000)


class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
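# A quick shape check for the buffer above -- hypothetical usage, assuming the
# reconstructed ReplayMemory and the channel-first 84x84 frames that
# wrap_deepmind(..., frame_stack=True, pytorch_img=True) produces.
state = env.reset()
for _ in range(64):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    memory.push(np.array(state), action, reward, np.array(next_state), done)
    state = env.reset() if done else next_state

states, actions, rewards, next_states, dones = memory.sample(32)
print(states.shape)  # e.g. (32, 4, 84, 84): expand_dims + concatenate builds the batch axis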
def main():
    # Make OpenAI gym environment + wrappers
    date_time = now.strftime("_%H:%M:%S_%m-%d-%Y")
    env = gym.make("PongNoFrameskip-v4")
    env = gym.wrappers.Monitor(env, './data_dqn_ataripong' + date_time)
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)  # skip 4 frames & max over last_obs
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    # obs shape = num_channels x width x height
    obs_space_shape = env.observation_space.shape[0]
    action_space_shape = env.action_space.n

    # Set random seeds
    seed = 6582
    torch.manual_seed(seed)
    if torch.cuda.is_available():  # note the call: the bare attribute is always truthy
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    # Initialize Replay Memory (Line 1)
    replay_memory = ReplayMemory(max_size=100000)

    # Make Q-Network and Target Q-Network (Lines 2 & 3)
    qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet.load_state_dict(qnet.state_dict())

    # Training Parameters (Changes from Mnih et al. outlined in README.md)
    optimizer = optim.Adam(qnet.parameters())
    num_frames = 1400000
    gamma = 0.99
    replay_start_size = 50000
    target_network_update_freq = 10000

    # Train
    obs = env.reset()
    num_episodes = 0
    for t in range(1, num_frames + 1):
        epsilon = epsilon_at_t(t)

        # -------------------------------------------------------------------
        # Take one step in the environment & add to Replay Memory (Lines 7-11)
        # -------------------------------------------------------------------
        torch.set_grad_enabled(False)
        # Select action with epsilon-greedy exploration (Lines 7, 8)
        if random.random() > epsilon:
            ts_obs = torch.from_numpy(obs.astype(np.float32)).unsqueeze(0).to(device)
            ts_qvals = qnet(ts_obs)
            action = ts_qvals.max(-1)[1].item()
        else:
            action = random.randrange(action_space_shape)
        torch.set_grad_enabled(True)

        # Execute action and get reward + next_obs (Lines 9, 10)
        next_obs, reward, done, _ = env.step(action)

        # Store transition in Replay Memory (Line 11)
        replay_memory.add(obs, next_obs, action, reward, done)
        obs = next_obs
        if done:
            obs = env.reset()
            num_episodes += 1

        # Populate Replay Memory with <replay_start_size> experiences before learning
        if t > replay_start_size:
            # ---------------------------------------------------------------
            # Sample batch & compute loss & update network (Lines 12 - 15)
            # ---------------------------------------------------------------
            obs_minibatch, next_obs_minibatch, actions_minibatch, \
                rewards_minibatch, done_minibatch = replay_memory.sample()

            ts_obs, ts_rewards, ts_next_obs, ts_done = map(
                lambda x: torch.FloatTensor(x).to(device),
                [obs_minibatch, rewards_minibatch, next_obs_minibatch, done_minibatch])
            ts_actions = torch.LongTensor(actions_minibatch).to(device)

            torch.set_grad_enabled(False)
            # Compute Target Values (as per Double-DQN update rule)
            ts_next_qvals_outer = qnet(ts_next_obs)         # (32, 2) (outer Qnet, evaluates value)
            ts_next_qvals_inner = target_qnet(ts_next_obs)  # (32, 2) (inner Qnet, evaluates action)
            ts_next_action_inner = ts_next_qvals_inner.argmax(-1, keepdim=True)  # (32, 1)
            ts_next_action_qvals_outer = ts_next_qvals_outer.gather(
                -1, ts_next_action_inner).view(-1)  # (32,) (use inner actions to evaluate outer Q values)
            ts_target_q = ts_rewards + gamma * ts_next_action_qvals_outer * (1 - ts_done)
            torch.set_grad_enabled(True)

            # Compute predicted Q values
            ts_pred_q = qnet(ts_obs).gather(-1, ts_actions).view(-1)  # (32,)

            # Calculate Loss & perform gradient descent (Line 14)
            loss = F.smooth_l1_loss(ts_pred_q, ts_target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Update target network every <target_network_update_freq> steps (Line 15)
        if t % target_network_update_freq == 0:
            target_qnet.load_state_dict(qnet.state_dict())

        # Log to terminal; unwrap down to the Monitor wrapper for episode rewards
        episode_rewards = env.env.env.env.env.env.env.env.get_episode_rewards()
        print('Timesteps', t, 'Episode', num_episodes,
              'Mean Reward', np.mean(episode_rewards[-100:]))

    env.env.close()
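# A self-contained sketch of the target computation above, with dummy tensors
# so the argmax/gather mechanics can be checked in isolation; shapes and
# values are made up. Note the snippet selects the action with the target net
# and evaluates it with the online net, which mirrors (rather than matches)
# the canonical Double-DQN rule, where the online net picks the action and
# the target net evaluates it.
import torch

batch, n_actions, gamma = 4, 6, 0.99
q_online = torch.randn(batch, n_actions)        # stands in for qnet(next_obs)
q_target = torch.randn(batch, n_actions)        # stands in for target_qnet(next_obs)
rewards = torch.randn(batch)
done = torch.tensor([0., 0., 1., 0.])

a_next = q_target.argmax(-1, keepdim=True)      # (4, 1) action choice
q_next = q_online.gather(-1, a_next).view(-1)   # (4,)  value of the chosen action
target = rewards + gamma * q_next * (1 - done)  # terminal transitions don't bootstrap
print(target.shape)                             # torch.Size([4])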
    if render:
        time_to_sleep = wait_time - (time.time() - start_time)
        if time_to_sleep > 0:
            time.sleep(time_to_sleep)
    return total_reward


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Render on graphics card (cuda:0).")
    parser.add_argument("--env", default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    parser.add_argument("-m", "--model", help="Path to the trained DQN weights to load.")
    args = parser.parse_args()

    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")
    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False,
                                 frame_stack=True)
    net = model.DQN(4, env.action_space.n).to(device)
    net.load_state_dict(torch.load(args.model))
    score = play(env, net, True, device)
    print(f"Score: {score}")
def train_dqn(env_name, save_path, double=False, dueling=False, notebook=False):
    env = wrap_deepmind(make_atari(env_name))
    num_actions = env.action_space.n
    print('Num actions: {}'.format(num_actions))

    if dueling:
        model = DuelingNet(out_size=num_actions)
        target_model = DuelingNet(out_size=num_actions)
    else:
        model = DQN(out_size=num_actions)
        target_model = DQN(out_size=num_actions)
    criterion = nn.SmoothL1Loss()
    print('Created models')

    cuda = False
    if torch.cuda.is_available():
        cuda = True
        model = model.cuda()
        target_model = target_model.cuda()
        print('GPU: {}'.format(torch.cuda.get_device_name(0)))

    model.apply(init_weights)
    target_model.apply(init_weights)
    optimizer = optim.Adam(model.parameters())  # , lr=0.00001)
    print('Initialized models')

    schedule = LinearSchedule(P.start_eps, P.end_eps, P.steps_eps)
    replay = Replay(P.replay_size, P.batch_size, cuda)

    state = env.reset()
    num_updates = 0
    eps_reward = 0
    rewards = []
    losses = []

    # Populate replay with a random policy
    print('Populating replay')
    for i in tqdm(range(P.replay_start_size), desc='Populating replay'):
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        replay.add(state, action, reward, next_state, done)
        state = next_state
        if done:
            state = env.reset()

    print('Starting training')
    state = env.reset()
    # Initialize once, before the loop: resetting it every step (as the
    # original did) makes the per-window averages cover the whole history.
    last_eps = 0
    for i in tqdm(range(P.num_steps), desc='Total steps'):
        if schedule.choose_random():
            action = env.action_space.sample()
        else:
            model_input = torch.from_numpy(np.array(state)[None, :]).type(
                torch.FloatTensor)
            if cuda:
                model_input = model_input.cuda()
            q_values = model(model_input)
            action = int(q_values.argmax(1)[0])

        next_state, reward, done, _ = env.step(action)
        eps_reward += reward
        replay.add(state, action, reward, next_state, done)
        state = next_state

        if i % P.update_freq == 0:
            loss = compute_loss(replay, optimizer, model, target_model,
                                P.gamma, criterion, double)
            num_updates += 1
            if num_updates % P.target_update_freq == 0:
                target_model.load_state_dict(model.state_dict())

        if done:
            rewards.append(eps_reward)
            losses.append(loss.item())
            eps_reward = 0
            state = env.reset()

        if i % P.print_every == 0 and i > 0:
            print('Step: {}'.format(i))
            print('Average episode reward: {}'.format(
                sum(rewards[last_eps:]) / len(rewards[last_eps:])))
            print('Loss: {}'.format(
                sum(losses[last_eps:]) / len(losses[last_eps:])))
            last_eps = len(losses)

        if i % P.plot_every == 0 and i > 0:
            plot(i, rewards, losses, notebook, save_path)

        if i % P.save_every == 0 and i > 0:
            torch.save(model, 'experiments/{}/{}_model'.format(save_path, i))
            pickle.dump(
                losses,
                open("experiments/{}/{}_losses.p".format(save_path, i), "wb"))
            pickle.dump(
                rewards,
                open("experiments/{}/{}_rewards.p".format(save_path, i), "wb"))
epsilon_decay = 30000
num_frames = 1000000
batch_size = 32
learning_rate = 0.0001

# create environment
# env_id = "PongNoFrameskip-v4"
# env_id = 'SpaceInvadersNoFrameskip-v4'
# env_id = 'MsPacmanNoFrameskip-v4'
# env_id = 'VideoPinballNoFrameskip-v4'
# env_id = 'MontezumaRevengeNoFrameskip-v4'
# env_id = 'QbertNoFrameskip-v4'
env_id = sys.argv[1]
env = make_atari(env_id)
# env = gym.wrappers.Monitor(env, 'stats', video_callable=lambda episode_id: False, force=True, resume=False)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

# create networks
current_model = CnnDQN(env.observation_space.shape, env.action_space.n)
target_model = CnnDQN(env.observation_space.shape, env.action_space.n)

if USE_CUDA:
    current_model = current_model.cuda()
    target_model = target_model.cuda()

# setup optimizer
optimizer = optim.Adam(current_model.parameters(), lr=learning_rate)

# initialize replay memory
replay_buffer = ReplayBuffer(100000)
# The snippet begins mid-dict; earlier keys, including the 'env' id that
# gym.make reads below, are elided in the source.
param = {
    'batch_size': 32,
    'lr': 0.0001,
    'GAMMA': 0.95,
    'replay_buffer': 500000,
    'end_eps': 0.1,
    'exp_length': 2000000}
param['version'] = ", ".join(
    ["{}:{}".format(key, val) for key, val in param.items()]
) + " " + str(datetime.datetime.now())[:16]
print(param['version'])

memory = utils.ReplayMemory(param['replay_buffer'])
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
eps = utils.EpsilonDecay(start_eps=1.0, end_eps=param['end_eps'],
                         length=param['exp_length'])
writer = SummaryWriter(log_dir="tensorboard/" + param['version'])
checkpoint = utils.CheckpointIfBetter(param, device)

env = wrap_deepmind(gym.make(param['env']), frame_stack=True)
dqn = model.DQN(num_actions=env.action_space.n).to(device)
target_dqn = copy.deepcopy(dqn)


def dqn_epsilon_agent(state, net=dqn, th=0.05):
    # Act greedily with probability 1 - th, otherwise explore at random
    if random.random() > th:
        yhat = net(default_states_preprocessor(state))
        return int(yhat.argmax().cpu().numpy())
    return env.action_space.sample()


optimizer = optim.Adam(dqn.parameters(), lr=param['lr'])

# Warmup buffer
for _ in range(5):
    game = utils.play_game(env, agent=dqn_epsilon_agent, th=eps.get(0),
                           memory=memory)
def get_env():
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)
    return env
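# Hypothetical usage of get_env above. The DeepMind wrappers resize frames to
# 84x84 grayscale and wrap_pytorch reorders them channel-first, so the
# observations feed a Conv2d stack without further transposing; the exact
# channel count depends on whether frame stacking is enabled.
env = get_env()
obs = env.reset()
print(env.observation_space.shape)  # e.g. (1, 84, 84)
obs, reward, done, info = env.step(env.action_space.sample())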
import gym
import tensorflow as tf
from wrappers import wrap_deepmind
from agent import Agent
import time

env = gym.make('Breakout-v0')
env = wrap_deepmind(env, frame_stack=True, scale=True)
action_size = env.action_space.n

# Reset the graph
tf.reset_default_graph()

# Create our agent
agent = Agent(action_size)

count = 0
with tf.Session() as sess:
    total_test_rewards = []
    saver = tf.train.Saver()
    # Load the model
    saver.restore(sess, "./model.ckpt")
    for episode in range(10):
        total_rewards = 0
        state = env.reset()
        print("****************************************************")
        print("EPISODE ", episode)
def get_env(env_id, frame_stack):
    env = make_atari(env_id)
    # Pass frame_stack by keyword: positionally it would land on episode_life
    env = wrap_deepmind(env, frame_stack=frame_stack)
    env = wrap_pytorch(env)
    return env
def main():
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    observation_space = env.observation_space.shape
    action_space = env.action_space.n
    model = CnnDQN(observation_space, action_space)
    if USE_CUDA:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters())
    replay_buffer = ReplayBuffer(1000)

    batch_size = 32
    gamma = 0.99
    replay_initial = 100
    num_frames = 14000

    losses = []
    all_rewards = []
    x_axis1 = []
    x_axis2 = []
    episode_reward = 0

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000
    # The exploration rate decays as the frame index grows
    epsilon_by_frame = lambda frame_idx: epsilon_final + \
        (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    state = env.reset()
    for frame_idx in range(1, num_frames + 1):
        # Render the animation
        env.render()
        epsilon = epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            x_axis1.append(frame_idx)
            all_rewards.append(episode_reward)
            episode_reward = 0

        if frame_idx + 1 > replay_initial:
            loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size)
            x_axis2.append(frame_idx)
            losses.append(np.array(loss.data.cpu()))

        if frame_idx % 100 == 0:
            plt.figure(1)
            plt.subplot(121)
            plt.plot(x_axis1, all_rewards)
            plt.subplot(122)
            plt.plot(x_axis2, losses)
            plt.show()

    env.close()
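# A quick check of the exponential schedule above: epsilon decays from 1.0
# toward 0.01 with a time constant of epsilon_decay = 30000 frames.
import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 30000
eps = lambda f: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * f / epsilon_decay)
print(eps(0))      # 1.0
print(eps(14000))  # ~0.63 -- with num_frames = 14000 the schedule barely decays
print(eps(30000))  # ~0.37 after one time constant
print(eps(90000))  # ~0.06 after three time constants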
    expected_state_action_values = expected_state_action_values.float()
    predicted_state_action_values = predicted_state_action_values.float()
    return predicted_state_action_values, expected_state_action_values


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    parser.add_argument("--env", default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    args = parser.parse_args()

    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")
    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, episode_life=False, frame_stack=True)

    exp_buffer = ExperienceBuffer(REPLAY_MEMORY_SIZE)
    agent = Agent(env, exp_buffer)

    net = model.DQN(AGENT_HIST_LENGTH, env.action_space.n).to(device)
    tgt_net = model.DQN(AGENT_HIST_LENGTH, env.action_space.n).to(device)
    tgt_net.load_state_dict(net.state_dict())

    criterion = nn.MSELoss()
    optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE,
                              momentum=GRAD_MOMENTUM, eps=MIN_SQ_GRAD)
    writer = SummaryWriter(comment="-" + args.env)

    remaining_time_buffer = collections.deque(maxlen=100)
    last_100_rewards_training = collections.deque(maxlen=100)