def run(self):
    for e_i in range(self.args.episodes):
        ob = self._env.reset()
        state = torch.tensor(ob, dtype=torch.float32).unsqueeze(0).cuda()
        total_reward = 0
        done = False
        num_steps = 0
        while not done:
            self._env.render()
            # Get current action
            action, _ = self._agent(state)
            # Perform action in environment
            ob, reward, done, _ = self._env.step(action)
            total_reward += reward
            # Add memory step; terminal transitions store None as the next state
            if done:
                next_state = None
            else:
                next_state = torch.tensor(
                    ob, dtype=torch.float32).unsqueeze(0).cuda()
            e_t = Experience(state.clone(), action, reward,
                             None if next_state is None else next_state.clone())
            self._agent.add_ex(e_t)
            state = next_state
            # Train once the replay buffer is warm and holds a full batch
            if (self._agent.replay_len() > self.args.min_init_state
                    and self._agent.replay_len() > self.args.batch_size
                    and (num_steps + 1) % self.args.update_steps == 0):
                self._agent.train()
            num_steps += 1
        self._env.reset()
        if self._agent.replay_len() > self.args.min_init_state:
            self.total_ep_reward.append(total_reward)
        print("total_reward", total_reward)
        print("Episode length", num_steps)
        if e_i % self.args.save_iter == 0:
            self._agent.save()
            self.save_results()
    self._env.close()
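# Every loop in this section builds Experience records before pushing them
# into a replay buffer. For reference, a minimal sketch of that container,
# assuming the usual namedtuple pattern; the field order below matches the
# snippet above, while later snippets store (obs, action, reward, isOver).
from collections import namedtuple

Experience = namedtuple('Experience',
                        ('state', 'action', 'reward', 'next_state'))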
def run_train_episode(env, agent, rpm):
    total_reward = 0
    all_cost = []
    obs = env.reset()
    steps = 0
    while True:
        steps += 1
        context = rpm.recent_obs()
        context.append(obs)
        context = np.stack(context, axis=0)
        action = agent.sample(context)
        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        # start training once the replay memory holds enough warm-up samples
        if rpm.size() > MEMORY_WARMUP_SIZE:
            if steps % UPDATE_FREQ == 0:
                batch_all_obs, batch_action, batch_reward, batch_isOver = \
                    rpm.sample_batch(args.batch_size)
                batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
                batch_next_obs = batch_all_obs[:, 1:, :, :]
                cost = agent.learn(batch_obs, batch_action, batch_reward,
                                   batch_next_obs, batch_isOver)
                all_cost.append(float(cost))
        total_reward += reward
        obs = next_obs
        if isOver:
            break
    if all_cost:
        logger.info('[Train] total_reward: {}, mean_cost: {}'.format(
            total_reward, np.mean(all_cost)))
    # guard against an empty cost list (no update happened this episode)
    return total_reward, steps, np.mean(all_cost) if all_cost else None
# Variant of run_train_episode above that returns the mean training loss
# (None before any update has happened) instead of logging it.
def run_train_episode(env, agent, rpm):
    total_reward = 0
    all_cost = []
    obs = env.reset()
    steps = 0
    while True:
        steps += 1
        context = rpm.recent_obs()
        context.append(obs)
        context = np.stack(context, axis=0)
        action = agent.sample(context)
        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        if rpm.size() > MEMORY_WARMUP_SIZE:
            if steps % UPDATE_FREQ == 0:
                batch_all_obs, batch_action, batch_reward, batch_isOver = \
                    rpm.sample_batch(args.batch_size)
                batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
                batch_next_obs = batch_all_obs[:, 1:, :, :]
                cost = agent.learn(batch_obs, batch_action, batch_reward,
                                   batch_next_obs, batch_isOver)
                all_cost.append(cost)
        total_reward += reward
        obs = next_obs
        if isOver:
            mean_loss = np.mean(all_cost) if all_cost else None
            return total_reward, steps, mean_loss
def collect_exp(env, rpm, agent):
    obs = env.reset()
    # collect transitions until the replay memory is full
    for i in tqdm(range(MEMORY_SIZE)):
        context = rpm.recent_obs()
        context.append(obs)
        context = np.stack(context, axis=0)
        action = agent.sample(context)
        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        # restart the environment at episode boundaries
        obs = env.reset() if isOver else next_obs
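# The three functions above assume a replay memory (rpm) exposing
# recent_obs(), append(), size(), and sample_batch(), where each sampled
# row carries CONTEXT_LEN + 1 stacked frames so the caller can slice out
# batch_obs and batch_next_obs. A minimal sketch of such a buffer,
# assuming Experience holds (obs, action, reward, isOver); it ignores
# episode boundaries when stacking, which a production buffer must handle.
import collections
import random

import numpy as np


class ReplayMemory:
    def __init__(self, max_size, context_len, obs_shape):
        self.context_len = context_len
        self.obs_shape = obs_shape
        self.frames = collections.deque(maxlen=max_size)
        self.others = collections.deque(maxlen=max_size)  # (action, reward, isOver)

    def append(self, exp):
        self.frames.append(exp.obs)
        self.others.append((exp.action, exp.reward, exp.isOver))

    def size(self):
        return len(self.frames)

    def recent_obs(self):
        # the CONTEXT_LEN - 1 most recent frames, zero-padded early on
        recent = list(self.frames)[-(self.context_len - 1):]
        pad = self.context_len - 1 - len(recent)
        return [np.zeros(self.obs_shape, dtype=np.uint8)] * pad + recent

    def sample_batch(self, batch_size):
        # pick end indices that leave room for a (CONTEXT_LEN + 1)-frame window
        idxs = [random.randint(self.context_len, len(self.frames) - 1)
                for _ in range(batch_size)]
        all_obs = np.stack([
            np.stack([self.frames[i - self.context_len + k]
                      for k in range(self.context_len + 1)], axis=0)
            for i in idxs])
        actions = np.array([self.others[i][0] for i in idxs])
        rewards = np.array([self.others[i][1] for i in idxs], dtype=np.float32)
        isOver = np.array([self.others[i][2] for i in idxs])
        return all_obs, actions, rewards, isOver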
def run_evaluate_episode(env, agent, rpm):
    state, _, _ = env.reset('test')
    total_reward = 0
    step = 0
    while True:
        frame = resizeBirdrToAtari(state)
        context = rpm.recent_state()
        context.append(frame)
        context = np.stack(context, axis=0)
        action = agent.predict(context)
        next_state, reward, isOver, _ = env.step(action)
        step += 1
        rpm.appendForTest(Experience(frame, action, reward, isOver))
        total_reward += reward
        state = next_state
        if isOver or step >= MAX_Step_Limit:
            time.sleep(2)
            break
    return total_reward
def run_train_episode(env, agent, rpm):
    global trainEpisode
    global meanReward
    total_reward = 0
    all_cost = []
    # reset the environment
    state, _, _ = env.reset()
    step = 0
    # loop over the steps of one episode
    while True:
        context = rpm.recent_state()
        context.append(resizeBirdrToAtari(state))
        context = np.stack(context, axis=0)
        # pick an action with the epsilon-greedy policy
        action = agent.sample(context)
        # execute the action
        next_state, reward, isOver, _ = env.step(action)
        step += 1
        # store the transition in the replay buffer
        rpm.append(Experience(resizeBirdrToAtari(state), action, reward, isOver))
        if rpm.size() > MEMORY_WARMUP_SIZE:
            if step % UPDATE_FREQ == 0:
                # sample a random batch from the replay buffer
                batch_all_state, batch_action, batch_reward, batch_isOver = \
                    rpm.sample_batch(batchSize)
                batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
                batch_next_state = batch_all_state[:, 1:, :, :]
                # run one SGD step on the network parameters theta
                cost = agent.learn(batch_state, batch_action, batch_reward,
                                   batch_next_state, batch_isOver)
                all_cost.append(float(cost))
        total_reward += reward
        state = next_state
        if isOver or step >= MAX_Step_Limit:
            break
    if all_cost:
        trainEpisode += 1
        # report the episode reward as a running mean
        meanReward = meanReward + (total_reward - meanReward) / trainEpisode
        print('\ntrainEpisode: {}, total_reward: {:.2f}, '
              'meanReward: {:.2f}, mean_cost: {:.3f}'.format(
                  trainEpisode, total_reward, meanReward, np.mean(all_cost)))
    return total_reward, step
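# Both Flappy Bird loops above funnel raw frames through resizeBirdrToAtari
# before stacking; the helper itself is not shown. A plausible sketch,
# assuming standard Atari-style preprocessing (grayscale, 84x84) with
# OpenCV; the exact crop and dtype in the original may differ.
import cv2
import numpy as np


def resizeBirdrToAtari(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)  # drop color channels
    small = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return small.astype(np.uint8)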
def ddqn_train(env, scheduler, optimizer_constructor, model_type, batch_size,
               rp_start, rp_size, exp_frame, exp_initial, exp_final, gamma,
               target_update_steps, frames_per_epoch, frames_per_state,
               output_directory, last_checkpoint, envo):
    """Training loop for DDQN."""
    gym.undo_logger_setup()
    logging.basicConfig(
        filename=envo + '_' + model_type + 'ddqn_training.log',
        level=logging.INFO)
    num_actions = env.action_space.n
    env.reset()
    print('No. of actions: ', num_actions)
    print(env.unwrapped.get_action_meanings())

    # initialize the action-value and target networks with the same weights
    model = DQN(num_actions, use_bn=False)
    target = DQN(num_actions, use_bn=False)
    if use_cuda:
        model.cuda()
        target.cuda()

    episodes_count = 1
    if last_checkpoint:
        model.load_state_dict(torch.load(last_checkpoint))
        print(last_checkpoint)
        print('weights loaded...')
        exp_replay = initialize_replay_resume(env, rp_start, rp_size,
                                              frames_per_state, model)
        episodes_count = get_index_from_checkpoint_path(last_checkpoint)
    else:
        exp_replay = initialize_replay(env, rp_start, rp_size, frames_per_state)

    target.load_state_dict(model.state_dict())

    optimizer = optimizer_constructor.type(
        model.parameters(),
        lr=optimizer_constructor.kwargs['lr'],
        alpha=optimizer_constructor.kwargs['alpha'],
        eps=optimizer_constructor.kwargs['eps'])

    frames_count = 1
    frames_per_episode = 1
    rewards_per_episode = 0
    rewards_duration = []
    loss_per_epoch = []

    current_state, _, _, _ = play_game(env, frames_per_state)
    print('Starting training...')

    while True:
        epsilon = scheduler.anneal_linear(frames_count)

        # epsilon-greedy action selection
        if random.uniform(0, 1) <= epsilon:
            action = LongTensor([[random.randrange(num_actions)]])
        else:
            action = get_greedy_action(model, current_state)

        curr_obs, reward, done, _ = play_game(env, frames_per_state,
                                              action[0][0])
        rewards_per_episode += reward
        reward = Tensor([reward])
        exp_replay.push(current_state, action, reward, curr_obs)
        current_state = curr_obs

        # once enough transitions are stored, sample a random mini-batch,
        # unpack it into states, actions, rewards, and next states, and
        # take one optimization step on the DDQN loss
        if len(exp_replay) >= batch_size:
            obs_sample = exp_replay.sample(batch_size)
            batch = Experience(*zip(*obs_sample))
            loss = ddqn_compute_y(batch, batch_size, model, target, gamma)
            optimizer.zero_grad()
            loss.backward()
            for param in model.parameters():
                param.grad.data.clamp_(-1, 1)  # gradient clipping
            optimizer.step()
            loss_per_epoch.append(loss.data.cpu().numpy())

        frames_count += 1
        frames_per_episode += frames_per_state

        if done:
            rewards_duration.append(rewards_per_episode)
            rewards_per_episode = 0
            frames_per_episode = 1
            episodes_count += 1
            env.reset()
            current_state, _, _, _ = play_game(env, frames_per_state)

            if episodes_count % 100 == 0:
                avg_episode_reward = sum(rewards_duration) / 100.0
                avg_reward_content = (
                    'Episodes {} to {}: average reward {}, total loss {}'.format(
                        episodes_count - 99, episodes_count,
                        avg_episode_reward, sum(loss_per_epoch)))
                print(avg_reward_content)
                logging.info(avg_reward_content)
                rewards_duration = []
                loss_per_epoch = []

        # update the target network weights every target_update_steps frames
        if frames_count % target_update_steps == 0:
            target.load_state_dict(model.state_dict())

        # save weights every 250k frames
        if frames_count % 250000 == 0:
            save_dir = output_directory + '/' + envo + '_' + model_type + '/'
            util.make_sure_path_exists(save_dir)
            torch.save(model.state_dict(),
                       save_dir + 'weights_' + str(frames_count) + '.pth')

        # log progress every 1,000,000 frames
        if frames_count % 1000000 == 0:
            training_update = 'frame count: {}, episode count: {}, epsilon: {}'.format(
                frames_count, episodes_count, epsilon)
            print(training_update)
            logging.info(training_update)
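# The loop above delegates the loss to ddqn_compute_y, whose body is not
# shown. A sketch of the Double DQN target it names, assuming the batch
# fields are concatenable tensors and Experience uses the field names
# (state, action, reward, next_state); batch_size is kept only to match
# the call site. No terminal mask appears because the replay above does
# not store a done flag.
import torch
import torch.nn.functional as F


def ddqn_compute_y(batch, batch_size, model, target, gamma):
    states = torch.cat(batch.state)            # (B, C, H, W)
    actions = torch.cat(batch.action)          # (B, 1), long
    rewards = torch.cat(batch.reward)          # (B,)
    next_states = torch.cat(batch.next_state)  # (B, C, H, W)

    # Q(s, a) for the actions actually taken
    q_values = model(states).gather(1, actions).squeeze(1)

    with torch.no_grad():
        # Double DQN: the online network picks the greedy next action ...
        next_actions = model(next_states).argmax(dim=1, keepdim=True)
        # ... and the target network evaluates it
        next_q = target(next_states).gather(1, next_actions).squeeze(1)
        y = rewards + gamma * next_q

    # Huber loss keeps the update robust to large TD errors
    return F.smooth_l1_loss(q_values, y)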
# Steps loop
steps = 0
while not environment_manager.done:
    if render:
        environment_manager.render()
    action = agent.select_action(state, policy_net)
    # take_action returns a (state, action, next_state, reward) tuple
    state, action, next_state, reward = environment_manager.take_action(action)
    steps += 1
    memory.push(Experience(state, action, next_state, reward))
    max_episode_reward += reward
    state = next_state

    if memory.can_provide_sample(batch_size):
        experiences_batch = memory.sample(batch_size)
        states = np.zeros((batch_size, environment_manager.final_reshape))
        next_states = np.zeros((batch_size, environment_manager.final_reshape))
        actions, rewards = [], []

        # Prepare data batch
        for i in range(batch_size):
            states[i] = experiences_batch[i][0]
            actions.append(experiences_batch[i][1])
            next_states[i] = experiences_batch[i][2]
            rewards.append(experiences_batch[i][3])
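# The fragment above assumes an agent with select_action(state, policy_net)
# and a memory with can_provide_sample(); neither is defined here. A
# hypothetical sketch of the epsilon-greedy selector, with the exponential
# decay schedule as an assumption.
import math
import random

import torch


class EpsilonGreedyAgent:
    def __init__(self, num_actions, eps_start=1.0, eps_end=0.01,
                 eps_decay=0.001):
        self.num_actions = num_actions
        self.eps_start, self.eps_end, self.eps_decay = eps_start, eps_end, eps_decay
        self.current_step = 0

    def select_action(self, state, policy_net):
        # exponentially decayed exploration rate
        epsilon = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-self.eps_decay * self.current_step)
        self.current_step += 1
        if random.random() < epsilon:
            return random.randrange(self.num_actions)  # explore
        with torch.no_grad():
            state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            return policy_net(state_t).argmax(dim=1).item()  # exploit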