def __init__(self):
    self.params = HYPERPARAMS
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.env = gym.make('BeamRider-v4')
    self.env = wrap_dqn(self.env)
    self.policy_net = DQN(self.env.observation_space.shape,
                          self.env.action_space.n).to(self.device)
    if torch.cuda.device_count() > 1:
        print('using %s gpus' % torch.cuda.device_count())
        self.policy_net = nn.DataParallel(self.policy_net)
    self.target_net = copy.deepcopy(self.policy_net)
    self.epsilon_tracker = EpsilonTracker(self.params)
    self.optimizer = optim.Adam(self.policy_net.parameters(),
                                lr=self.params['learning_rate'])
    self.reward_tracker = RewardTracker()
    self.transition = namedtuple(
        'Transition', ('state', 'action', 'reward', 'next_state', 'done'))
    self.memory = ReplayBuffer(self.params['replay_size'])
    # self.memory = Memory(self.params['replay_size'], 0.6)
    self.beta_scheduler = LinearScheduler(
        0.4, 1.0, timespan=self.params['epsilon_frames'])
    self.state = self.preprocess(self.env.reset())
    self.score = 0
    self.batch_size = self.params['batch_size']
    self.tb_writer = SummaryWriter('results')
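# For context: beta_scheduler anneals the importance-sampling exponent beta
# used by the (commented-out) prioritized replay Memory from 0.4 to 1.0.
# LinearScheduler itself is not shown in the snippet, so this is a minimal
# sketch of the behavior its constructor arguments imply, not the repo's code:
class LinearScheduler:
    def __init__(self, start, end, timespan):
        self.start = start
        self.end = end
        self.timespan = timespan

    def value(self, step):
        # clamp progress to [0, 1] so the schedule saturates at `end`
        fraction = min(max(step / self.timespan, 0.0), 1.0)
        return self.start + fraction * (self.end - self.start)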
def playback(self, path):
    import time

    target_net = torch.load(path, map_location='cpu')
    env = gym.make('PongNoFrameskip-v4')
    env = wrap_dqn(env)
    state = self.preprocess(env.reset())
    done = False
    score = 0
    while not done:
        time.sleep(0.015)  # slow the rollout down so it is watchable
        with torch.no_grad():  # inference only; no gradients needed
            action = torch.argmax(target_net(state), dim=1).to(self.device)
        state, reward, done, _ = env.step(action.item())
        state = self.preprocess(state)
        score += reward
    print("Score: ", score)
def __init__(self):
    self.params = HYPERPARAMS
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.env = gym.make('PongNoFrameskip-v4')
    self.env = wrap_dqn(self.env)
    self.policy_net = DQN(self.env.observation_space.shape,
                          self.env.action_space.n,
                          self.device).to(self.device)
    self.target_net = copy.deepcopy(self.policy_net)
    self.epsilon_tracker = EpsilonTracker(self.params)
    self.optimizer = optim.Adam(self.policy_net.parameters(),
                                lr=self.params['learning_rate'])
    self.reward_tracker = RewardTracker()
    self.transition = namedtuple('Transition',
                                 ('state', 'action', 'reward', 'next_state'))
    self.memory = ReplayMemory(self.params['replay_size'], self.transition)
    self.episode = 0
    self.state = self.preprocess(self.env.reset())
    self.score = 0
    self.batch_size = self.params['batch_size']
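# For context: a minimal sketch of the update step these pieces are wired up
# for. The actual method is not shown in the snippet, so the memory's sample
# API, the None-for-terminal next_state convention, the 'gamma' hyperparameter
# key, and the `torch.nn.functional as F` import are all assumptions:
def optimize_model(self):
    transitions = self.memory.sample(self.batch_size)
    batch = self.transition(*zip(*transitions))

    states = torch.cat(batch.state).to(self.device)
    actions = torch.tensor(batch.action, device=self.device).unsqueeze(1)
    rewards = torch.tensor(batch.reward, device=self.device,
                           dtype=torch.float32)

    # terminal transitions are assumed to store next_state=None
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=self.device, dtype=torch.bool)
    non_final_next = torch.cat(
        [s for s in batch.next_state if s is not None]).to(self.device)

    # Q(s, a) from the online net, max_a' Q(s', a') from the frozen target net
    q_values = self.policy_net(states).gather(1, actions).squeeze(1)
    next_q = torch.zeros(self.batch_size, device=self.device)
    with torch.no_grad():
        next_q[non_final_mask] = self.target_net(non_final_next).max(1)[0]

    loss = F.smooth_l1_loss(q_values, rewards + self.params['gamma'] * next_q)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()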
def __init__(self, n_action, is_render=True, is_load=False):
    self.sess = tf.Session()
    self.batch_size = 32
    self.model = DQN(self.sess, n_action, self.batch_size)
    self.model_name = "DQN"
    self.env = wrappers.wrap_dqn(gym.make("BreakoutDeterministic-v4"))
    self.is_render = is_render
    self.EPISODE = 600
    # epsilon parameters
    self.epsilon_s = 1.0
    self.epsilon_e = 0.1
    self.epsilon_decay = 100000
    self.epsilon = self.epsilon_s
    # training parameters
    self.train_start = 5000
    self.update_target_rate = 5000
    self.n_action = n_action
    self.loss = 0
    # info
    self.total_q_max, self.total_loss = 0., 0.
    # save parameters
    self.save_episode_rate = 5
    # load parameters
    self.is_load = is_load
    # saved_model = "./save/{}/{}_episode20.ckpt-{}".format("20180613-132735", self.model_name, "3741")
    self.saved_model = tf.train.latest_checkpoint("./save/20180614-180138")
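# For context: the three epsilon_* fields above describe a linear decay from
# epsilon_s to epsilon_e over epsilon_decay steps. A minimal sketch of the
# per-step update they imply (the actual update code is not shown here):
def update_epsilon(self):
    if self.epsilon > self.epsilon_e:
        self.epsilon -= (self.epsilon_s - self.epsilon_e) / self.epsilon_decay
        self.epsilon = max(self.epsilon, self.epsilon_e)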
parser.add_argument(
    "--reward",
    type=float,
    default=STOP_REWARD,
    help="Mean reward boundary for stopping training, default=%.2f" % STOP_REWARD)
args = parser.parse_args()

device = torch.device("cuda")  # assumes a CUDA-capable GPU is available

cp_dir = 'checkpoints/'
runs_dir = 'runs/'
os.makedirs(cp_dir, exist_ok=True)
os.makedirs(runs_dir, exist_ok=True)

env = gym.make(DEFAULT_ENV_NAME)
env = wrappers.wrap_dqn(env)

net = dqn_model.RainbowDQN(env.observation_space.shape,
                           env.action_space.n).to(device)
tgt_net = dqn_model.RainbowDQN(env.observation_space.shape,
                               env.action_space.n).to(device)

date_time = datetime.datetime.now().strftime('%d-%b-%Y_%X_%f')
run_name = f'{DEFAULT_ENV_NAME}_{date_time}'
writer = SummaryWriter(runs_dir + run_name)

# quantile fractions tau_i = i / N for quantile-regression DQN
quantile_tau = [i / N_QUANTILES for i in range(1, N_QUANTILES + 1)]
quantile_tau = torch.tensor(quantile_tau).to(device)

agent = Agent(net, device=device)
# NOTE: the original line breaks off after `env,`; the remaining arguments
# below are an assumption based on ptan's ExperienceSourceFirstLast signature,
# and GAMMA is a hyperparameter constant not shown in this snippet.
exp_source = experience.ExperienceSourceFirstLast(env, agent,
                                                  gamma=GAMMA, steps_count=1)
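# For context: quantile_tau feeds the quantile-regression Huber loss. That
# loss is not shown in the snippet, so the sketch below is an assumption:
# names, shapes, and the kappa threshold are illustrative, not the repo's code.
def quantile_huber_loss(pred_quantiles, target_quantiles, kappa=1.0):
    # pred_quantiles:   (batch, N_QUANTILES) quantiles for the taken actions
    # target_quantiles: (batch, N_QUANTILES) Bellman targets from tgt_net
    # pairwise TD errors: td_error[b, i, j] = target_j - pred_i
    td_error = target_quantiles.unsqueeze(1) - pred_quantiles.unsqueeze(2)
    huber = torch.where(td_error.abs() <= kappa,
                        0.5 * td_error.pow(2),
                        kappa * (td_error.abs() - 0.5 * kappa))
    # asymmetric weight |tau_i - 1{td_error < 0}| makes each output head
    # converge to its own quantile of the return distribution
    weight = (quantile_tau.view(1, -1, 1) - (td_error.detach() < 0).float()).abs()
    return (weight * huber).mean()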
        self.advantage_head = nn.Linear(512, n_actions)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = x.view(x.size(0), -1)
        value = self.value_head(F.relu(self.value(x)))
        advantage = self.advantage_head(F.relu(self.advantage(x)))
        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
        return value + (advantage - advantage.mean())


env = wrap_dqn(gym.make('PongNoFrameskip-v4'))
q_func = Net(6)
q_func.load_state_dict(torch.load(sys.argv[1]))
q_func.cuda()


def var(x):
    # torch.autograd.Variable is deprecated in modern PyTorch; kept here
    # to match the original code
    x = np.array(x).reshape(1, 4, 84, 84)
    x = torch.from_numpy(x)
    return Variable(x).type(torch.FloatTensor).cuda()


def select_action(x):
    if random.random() < 0.02:
        return env.action_space.sample()
    # (the original snippet is truncated here; a greedy branch along these
    # lines is assumed)
    return q_func(var(x)).max(1)[1].item()
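# Note on the aggregation above: advantage.mean() averages over the whole
# tensor, which coincides with the per-sample mean only because this playback
# script runs with batch size 1. For training on batches, the usual
# batch-safe form takes the mean over the action dimension:
#     return value + (advantage - advantage.mean(dim=1, keepdim=True))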
'''
obse = np.resize(state_grey, (84 * 84))
stacked = np.stack((state_grey, state_grey, state_grey, state_grey), axis=0)
print(stacked, stacked.shape)
# print(state, state.shape)
# cv2.imshow('image', state_grey)
# cv2.waitKey(0)
'''
'''
output = tf.image.rgb_to_grayscale(state)
output = tf.image.crop_to_bounding_box(output, 34, 0, 160, 160)
output = tf.image.resize_images(output, [84, 84],
                                method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
output_2 = tf.squeeze(output)
print(output_2.numpy(), output.numpy().shape)
cv2.imshow('image', output.numpy())
cv2.waitKey(0)
'''

env = wp.wrap_dqn(gym.make('BreakoutDeterministic-v4'))
state = env.reset()
done = False
for _ in range(50):
    if not done:
        next_state, reward, done, info = env.step(1)
        print('state', next_state, 'done', done)
        time.sleep(0.1)
def trainer(MINIBATCH_SIZE=32,
            GAMMA=0.99,
            load=True,
            save=True,
            epsilon=1.0,
            min_epsilon=0.1,
            BUFFER_SIZE=500000,
            train_indicator=True,
            render=True):
    with tf.Session() as sess:
        # configure the random seeds
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        # set environment
        # robot = gym_environment('FrozenLakeNonskid4x4-v3', False, False, False)
        # Breakout
        # env = gym.make('BreakoutDeterministic-v4')
        env = wp.wrap_dqn(gym.make('BreakoutDeterministic-v4'))
        # Pong
        # env = wp.wrap_dqn(gym.make('PongDeterministic-v4'))

        agent = Network(sess, SIZE_FRAME, N_ACTIONS, LEARNING_RATE, DEVICE)

        # TensorFlow: init session
        sess.run(tf.global_variables_initializer())

        # initialize target network weights
        agent.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()
        print('buffer size is now', replay_buffer.count)

        # this is for loading the net
        if load:
            try:
                agent.recover()
                print('********************************')
                print('models restored successfully')
                print('********************************')
            except tf.errors.NotFoundError:
                print('********************************')
                print('Failed to restore models')
                print('********************************')

        total_frames_counter = 0
        frames_number = 0
        frames_to_save = 0

        while total_frames_counter < 10000000:

            if frames_to_save > 10000:
                agent.save()
                frames_to_save = 0

            if frames_number > 10000:
                agent.update_target_network()
                frames_number = 0
                print('update_target_network')
                # agent.save()
                # replay_buffer.save()

            state = env.reset()
            q0 = np.zeros(N_ACTIONS)
            ep_reward = 0.
            done = False
            step = 0
            total_loss = deque()
            loss = 0.

            while not done:
                frames_number = frames_number + 1
                frames_to_save = frames_to_save + 1
                total_frames_counter = total_frames_counter + 1

                # anneal epsilon linearly once the warm-up phase is over
                if total_frames_counter > 20000:
                    epsilon -= 0.00000085
                    epsilon = np.maximum(min_epsilon, epsilon)
                    train_indicator = True
                else:
                    train_indicator = False  # True

                # for visualization
                # numpy_horizontal = np.hstack((np.array(state)[:, :, 0], np.array(state)[:, :, 1],
                #                               np.array(state)[:, :, 2], np.array(state)[:, :, 3]))
                # cv2.imshow('image', numpy_horizontal)
                # cv2.waitKey(1)
                # time.sleep(0.05)

                # 1. get action with e-greedy
                if np.random.random_sample() < epsilon:
                    # explore!
                    action = np.random.randint(0, N_ACTIONS)
                else:
                    # just stick to what you know: exploit the Q estimates
                    q0, X = agent.predict(
                        np.reshape(
                            np.array(state).astype(np.uint8),
                            [-1, SIZE_FRAME, SIZE_FRAME, 4]))
                    action = np.argmax(q0)

                # 2. act in the environment
                next_state, reward, done, info = env.step(action)
                # env.render()
                # state = observation

                if train_indicator:
                    # keep adding experience to the memory until
                    # there are at least minibatch-size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        q_eval = agent.predict_target(s2_batch)
                        q_target = np.zeros(MINIBATCH_SIZE)
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                q_target[k] = r_batch[k]
                            else:
                                q_target[k] = r_batch[k] + GAMMA * np.max(q_eval[k])

                        # 5.3 train agent!
                        loss, _ = agent.train(
                            np.reshape(a_batch, (MINIBATCH_SIZE, 1)),
                            np.reshape(q_target, (MINIBATCH_SIZE, 1)),
                            s_batch)

                        # in case you want to understand the inner workings of this:
                        # target_final, q_acted, delta, loss, optimize = agent.train_v2(
                        #     np.reshape(a_batch, (MINIBATCH_SIZE, 1)),
                        #     np.reshape(q_target, (MINIBATCH_SIZE, 1)), s_batch)
                        # print('target_final', target_final, 'q_acted', q_acted,
                        #       'delta', delta, 'loss', loss)

                # 3. save in replay buffer
                replay_buffer.add(state, action, reward, done, next_state)

                state = next_state
                ep_reward = ep_reward + reward
                step += 1
                total_loss.append(loss)

            print('th', total_frames_counter + 1, 'Step', step,
                  'Reward:', ep_reward, 'epsilon', round(epsilon, 3),
                  np.mean(total_loss))
            # print('the reward at the end of the episode,', reward)

        print('*************************')
        print('now we save the model')
        agent.save()
        # replay_buffer.save()
        print('model saved successfully')
        print('*************************')
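# For reference: the per-sample loop above computes the standard DQN target
# y_k = r_k + GAMMA * max_a Q_target(s'_k, a), dropping the bootstrap term on
# terminal transitions. A vectorized sketch of the same computation, assuming
# t_batch is a boolean/0-1 done flag and np is the NumPy module used above:
def dqn_targets(r_batch, t_batch, q_eval, gamma=0.99):
    not_done = 1.0 - np.asarray(t_batch, dtype=np.float32)
    return np.asarray(r_batch) + gamma * np.max(q_eval, axis=1) * not_done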