def main():
    env = gym.make('CartPole-v1')
    replay_memory = ReplayMemory()
    agent = QNetwork(env.observation_space, env.action_space)
    get_training_batch = make_training_transformer(env.observation_space.shape, agent)

    frame = 0
    acc_loss = 0
    acc_state_value = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            if take_random_action(frame):
                action = env.action_space.sample()  # explore: pick a random action
            else:
                action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            if done:
                # CartPole does not return a negative reward on termination,
                # so penalize the terminal transition explicitly.
                reward *= -1
            experience = (state, action, reward, next_state, done)
            replay_memory.append(experience)
            frame += 1

            experience_samples = replay_memory.sample(BATCH_SIZE)
            state_batch, qs_batch = get_training_batch(experience_samples)
            acc_state_value += np.mean(qs_batch)
            loss = agent.training_step(state_batch, qs_batch)
            acc_loss += loss

            if frame % FRAMES_PER_SAVE == 0:
                model_filename = f"ckpt-loss={loss:.4f}"
                agent.save_model(model_filename)
            if frame % FRAMES_PER_PRINT == 0:
                print(f"Frame: {frame}")
                avg_loss = acc_loss / FRAMES_PER_PRINT
                avg_state_value = acc_state_value / FRAMES_PER_PRINT
                print(f"avg loss: {avg_loss:.4f}; avg value: {avg_state_value:.2f}")
                acc_loss = 0
                acc_state_value = 0
            if done or frame == MAX_NUM_FRAMES:
                break
            state = next_state
    env.close()
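# `take_random_action` is called above but not defined in this snippet. A minimal
# sketch follows, assuming it implements a linearly decaying epsilon-greedy schedule;
# EPSILON_START, EPSILON_MIN, and EPSILON_DECAY_FRAMES are hypothetical constants.
import random

EPSILON_START = 1.0
EPSILON_MIN = 0.05
EPSILON_DECAY_FRAMES = 100_000

def take_random_action(frame):
    """Return True if the agent should explore at this frame (hypothetical helper)."""
    # Linearly anneal epsilon from EPSILON_START to EPSILON_MIN over EPSILON_DECAY_FRAMES.
    fraction = min(frame / EPSILON_DECAY_FRAMES, 1.0)
    epsilon = EPSILON_START + fraction * (EPSILON_MIN - EPSILON_START)
    return random.random() < epsilon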
def loaded_replay(self):
    loaded_replay = ReplayMemory(30, [1, 1, 1], 5, 200, np.random.RandomState(123))
    state = 1
    loaded_replay.append(0, np.nan, False, np.array([[[state]]]), state, True)
    start = False
    terminal = False
    for i in range(2, 40):
        state += 1
        if i in (12, 20, 30, 35):
            terminal = True
        loaded_replay.append(state, state, terminal, np.array([[[state]]]), state, state == 1)
        if terminal:
            start = True
            state = 0
            terminal = False
    return loaded_replay
def trainNetwork(godmode):
    game = Game()
    replay_memory = ReplayMemory(5000, 32)
    neural_net = NeuralNet()
    r_0, x_t, terminal = game.run_action(0)
    s_t = np.stack((x_t, x_t), axis=2)
    random_action_generator = RandomActionGenerator()
    keyboard_action = KeyboardAction()
    for t in range(1, 1000):
        if godmode:
            action_index = keyboard_action.action()
        else:
            action_index = np.argmax(neural_net.predict(s_t))
            action_index = random_action_generator.adapt_action(action_index)
        r_t, x_t1, terminal = game.run_action(action_index)
        print("TIMESTEP", t, "/ ACTION", action_index, "/ REWARD", r_t, neural_net.state())
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :1], axis=2)
        replay_memory.append({
            'state': s_t,
            'action': action_index,
            'reward': r_t,
            'next_state': s_t1,
            'terminal': terminal
        })
        s_t = s_t1
    replay_memory.save()
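# `RandomActionGenerator.adapt_action` is used above but not defined here. A minimal
# sketch under the assumption that it performs epsilon-greedy exploration with a
# decaying epsilon; the class body and constants below are hypothetical.
import numpy as np

class RandomActionGenerator:
    def __init__(self, num_actions=2, epsilon=1.0, epsilon_min=0.05, decay=0.999):
        self.num_actions = num_actions
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.decay = decay

    def adapt_action(self, action_index):
        """Replace the greedy action with a random one with probability epsilon."""
        if np.random.rand() < self.epsilon:
            action_index = np.random.randint(self.num_actions)
        # Decay epsilon towards its floor after every call.
        self.epsilon = max(self.epsilon * self.decay, self.epsilon_min)
        return action_index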
class Agent(object): def __init__(self, save_path, **kwargs): self.step = 0 self.save_path = save_path self.n_act = kwargs['n_act'] self.n_z = kwargs['n_z'] self.epsilon_period = kwargs['epsilon_period'] self.min_epsilon = kwargs['min_epsilon'] self.exp_eps_decay = kwargs['exp_eps_decay'] self.burnin = kwargs['burnin'] self.test = False self.test_epsilon = kwargs['test_epsilon'] self.state = None self.track_repeats = kwargs.get('track_repeats', False) self.freeze_weights = kwargs.get('freeze_weights', False) seed = kwargs.get('seed', None) if seed is not None: self.rng = np.random.RandomState(seed) else: self.rng = np.random.RandomState() self.summary_variables = { 'reward': 0, 'nonzero_rewards': 0, 'episode_reward': 0 } if self.track_repeats: self.hashed_obs = set() self.episode_hashed_obs = set() self.hashed_obs_matches = 0. self.hashed_obs_checks = 0. self.final_init(kwargs) def final_init(self, params): batch_size = params['minibatch_size'] * params['concurrent_batches'] replay_length = params['hist_len'] + 1 self.replay_memory = ReplayMemory(params['max_replay_size'], params['obs_size'], replay_length, batch_size, self.rng) self.index_vec = 2**np.arange(self.n_z, dtype='uint64') self.delete_old_episodes = params['delete_old_episodes'] self.summary_variables['num_sweeps'] = 0 self.lookup = Lookup(params['n_z'], self.n_act, params['discount'], params['init_capacity'], params['pri_cutoff'], self.save_path, self.rng) def get_states(self, zs): if len(zs.shape) == 1: zs.shape = [self.n_z, 2] else: zs.shape = [zs.shape[0], self.n_z, 2] keys = np.argmax(zs, -1).astype(np.bool) return np.dot(keys, self.index_vec) def get_epsilon(self): if self.test: return self.test_epsilon step = self.step - self.burnin if step < 0: return 1. if self.epsilon_period == 0: return self.min_epsilon if self.exp_eps_decay: return (1 - self.min_epsilon) * np.exp( -step / float(self.epsilon_period)) + self.min_epsilon return max( (1 - self.min_epsilon) * (1 - step / float(self.epsilon_period)), 0) + self.min_epsilon def get_action(self, model): if self.step <= self.burnin: return self.rng.choice(range(self.n_act)) epsilon = self.get_epsilon() if self.rng.rand() < epsilon: action = self.rng.choice(range(self.n_act)) else: logging.debug("Estimating action for state %s" % str(self.state)) action = self.lookup.estimate_max_action(self.state) return action def init_episode(self, obs, model): self.reset_episode_summary() zs = model.encode(obs) self.state = self.get_states(zs) if (not self.test) and (self.replay_memory is not None): pop_eps = self.replay_memory.append(0, 0, False, obs, self.state, start=True) if self.delete_old_episodes and (pop_eps is not None): self.lookup.delete_transition(*pop_eps) if self.track_repeats: self.check_repeated(obs) def observe(self, action, reward, terminal, obs_next, model): zs = model.encode(obs_next) state_next = self.get_states(zs) table_state_next = None if terminal else state_next if not self.test: self.step += 1 self.lookup.add_transition(self.state, action, reward, table_state_next) pop_eps = self.replay_memory.append(action, reward, terminal, obs_next, state_next) if self.delete_old_episodes and (pop_eps is not None): self.lookup.delete_transition(*pop_eps) if self.track_repeats: self.check_repeated(obs_next) self.state = table_state_next self.update_summary_variables(reward) def train_and_update(self, model, summary_writer): if self.test or (self.step < self.burnin) or self.freeze_weights: return None batch = self.replay_memory.get_minibatch() zs, wait_time = 
model.train(self.step, summary_writer, batch) self.summary_variables['train_wait_time'] = wait_time self.update_transitions(zs) self.replay_memory.minibatch_inds = batch['inds'] def finish_update(self, model): if self.test or (self.step < self.burnin) or self.freeze_weights: return None zs, wait_time = model.finish_training() self.summary_variables['train_wait_time'] = wait_time self.update_transitions(zs) def update_transitions(self, zs): logging.debug("Updating transitions.") if zs is None: return None states = self.get_states(zs) updated_transitions, pc_changed = self.replay_memory.get_updated_transitions( states) for updated_transition in updated_transitions: self.lookup.update_transition(*updated_transition) self.summary_variables['reassigned_states'] = pc_changed def update_summary_variables(self, reward): prefix = '' if self.test: prefix = 'eval_' self.summary_variables[prefix + 'reward'] += reward self.summary_variables[prefix + 'nonzero_rewards'] += (reward != 0) self.summary_variables[prefix + 'episode_reward'] += reward self.summary_variables[prefix + 'episode_length'] += 1 def get_summary_variables(self): self.summary_variables['epsilon'] = self.get_epsilon() max_q, avg_q, table_size, num_sweeps, p_size, lookup_dist, lookup_val = self.lookup.get_summary_variables( ) self.summary_variables['table_size'] = table_size self.summary_variables['max_q'] = max_q self.summary_variables['avg_q'] = avg_q self.summary_variables['lookup_distance'] = lookup_dist self.summary_variables['lookup_values'] = lookup_val self.summary_variables['num_sweeps'] = num_sweeps self.summary_variables['priority_queue_size'] = p_size if self.track_repeats: self.summary_variables['obs_repeats'] = float( self.hashed_obs_matches) / self.hashed_obs_checks return self.summary_variables def reset_summary_variables(self): if self.test: self.summary_variables['eval_reward'] = 0. self.summary_variables['eval_nonzero_rewards'] = 0. else: self.summary_variables['reward'] = 0. self.summary_variables['nonzero_rewards'] = 0. self.lookup.reset_summary_variables() def reset_episode_summary(self): if self.test: self.summary_variables['eval_episode_reward'] = 0. self.summary_variables['eval_episode_length'] = 0. else: self.summary_variables['episode_reward'] = 0. self.summary_variables['episode_length'] = 0. if self.track_repeats: self.hashed_obs.update(self.episode_hashed_obs) self.episode_hashed_obs = set() #self.hashed_obs_matches = 0. #self.hashed_obs_checks = 0. def check_repeated(self, obs): hash_obs = hash(obs.tostring()) self.episode_hashed_obs.add(hash_obs) if hash_obs in self.hashed_obs: self.hashed_obs_matches += 1. self.hashed_obs_checks += 1. def save(self): old_steps = get_max_steps(self.save_path, 'agent') if old_steps is not None: os.remove("%s/agent.ckpt-%s" % (self.save_path, old_steps)) data = self.replay_memory.save_and_export(self.save_path, self.step, old_steps) lookup = self.lookup self.lookup = None agent_save_path = "%s/agent.ckpt-%s" % (self.save_path, self.step) with open(agent_save_path, "wb") as handle: pickle.dump(self, handle, pickle.HIGHEST_PROTOCOL) self.replay_memory.load_memory(data) lookup.save(self.step, old_steps) self.lookup = lookup
class SimulatorServer(simulator_pb2_grpc.SimulatorServicer): class ClientState(object): def __init__(self): self.memory = [] # list of Experience self.ident = None self.model_idx = np.random.randint(args.ensemble_num) self.last_target_changed = 0 self.target_change_times = 0 def reset(self): self.last_target_changed = 0 self.memory = [] self.model_idx = np.random.randint(args.ensemble_num) self.target_change_times = 0 def update_last_target_changed(self): self.last_target_changed = len(self.memory) def __init__(self): self.rpm = ReplayMemory(int(2e6), OBS_DIM, ACT_DIM) # Need acquire lock when model learning or predicting self.locks = [] for i in range(args.ensemble_num): self.locks.append(threading.Lock()) models = [] for i in range(args.ensemble_num): models.append(OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM, model_id=i)) hyperparas = { 'gamma': GAMMA, 'tau': TAU, 'ensemble_num': args.ensemble_num } alg = MultiHeadDDPG(models, hyperparas) self.agent = OpenSimAgent(alg, OBS_DIM, ACT_DIM, args.ensemble_num) self.scalars_manager = ScalarsManager(logger.get_dir()) # add lock when appending data to rpm or writing scalars to tensorboard self.MEMORY_LOCK = threading.Lock() self.clients = defaultdict(self.ClientState) self.ready_client_queue = queue.Queue() self.noiselevel = 0.5 self.global_step = 0 # thread to keep training t = threading.Thread(target=self.keep_training) t.start() def _new_ready_client(self): """ The client is ready to start new episode, but blocking until training thread call client_ready_event.set() """ client_ready_event = threading.Event() self.ready_client_queue.put(client_ready_event) logger.info( "[new_ready_client] approximate size of ready clients:{}".format( self.ready_client_queue.qsize())) client_ready_event.wait() def Send(self, request, context): """ Implement Send function in SimulatorServicer Everytime a request comming, will create a new thread to handle """ ident, obs, reward, done, info = request.id, request.observation, request.reward, request.done, request.info client = self.clients[ident] info = json.loads(info) if 'first' in info: # Waiting training thread to allow start new episode self._new_ready_client() obs = np.array(obs, dtype=np.float32) self._process_msg(ident, obs, reward, done, info) if done: # Waiting training thread to allow start new episode self._new_ready_client() action = self.pred_batch(obs, client.model_idx) step = len(client.memory) - client.last_target_changed # whether to add noise depends on the ensemble_num if args.ensemble_num == 1: current_noise = self.noiselevel * (0.98**(step - 1)) noise = np.zeros((ACT_DIM, ), dtype=np.float32) if ident % 3 == 0: if step % 5 == 0: noise = np.random.randn(ACT_DIM) * current_noise elif ident % 3 == 1: if step % 5 == 0: noise = np.random.randn(ACT_DIM) * current_noise * 2 action += noise action = np.clip(action, -1, 1) client.memory[-1].action = action extra_info = {} return simulator_pb2.Reply(action=action, extra=json.dumps(extra_info)) def _process_msg(self, ident, obs, reward, done, info): client = self.clients[ident] reward_scale = (1 - GAMMA) info['shaping_reward'] *= reward_scale if len(client.memory) > 0: client.memory[-1].reward = reward info['target_change_times'] = client.target_change_times client.memory[-1].info = info if info['target_changed']: client.target_change_times = min( client.target_change_times + 1, 3) # re-sample model_idx after target was changed client.model_idx = np.random.randint(args.ensemble_num) if done: assert 'last_obs' in info self._parse_memory(client, ident, 
info['last_obs']) client.memory.append( TransitionExperience(obs=obs, action=None, reward=None, info=None)) if 'target_changed' in info and info['target_changed']: client.update_last_target_changed() return False def _parse_memory(self, client, ident, last_obs): mem = client.memory n = len(mem) # debug info if ident == 1: for i, exp in enumerate(mem): logger.info( "[step:{}] obs:{} action:{} reward:{} shaping_reward:{}". format(i, np.sum(mem[i].obs), np.sum(mem[i].action), mem[i].reward, mem[i].info['shaping_reward'])) episode_rpm = [] for i in range(n - 1): if not mem[i].info['target_changed']: episode_rpm.append([ mem[i].obs, mem[i].action, mem[i].info['shaping_reward'], mem[i + 1].obs, False, mem[i].info['target_change_times'] ]) if not mem[-1].info['target_changed']: episode_rpm.append([ mem[-1].obs, mem[-1].action, mem[-1].info['shaping_reward'], last_obs, not mem[-1].info['timeout'], mem[i].info['target_change_times'] ]) indicators_dict = calc_indicators(mem) indicators_dict['free_client_num'] = self.ready_client_queue.qsize() indicators_dict['noiselevel'] = self.noiselevel with self.MEMORY_LOCK: self.add_episode_rpm(episode_rpm) self.scalars_manager.record(indicators_dict, self.global_step) self.global_step += 1 if self.global_step >= 50: self.noiselevel = self.noiselevel * NOISE_DECAY client.reset() def learn(self): result_q = queue.Queue() th_list = [] for j in range(args.ensemble_num): t = threading.Thread( target=self.train_single_model, args=(j, result_q)) th_list.append(t) start_time = time.time() for t in th_list: t.start() for t in th_list: t.join() logger.info("[learn] {} heads, time consuming:{}".format( args.ensemble_num, time.time() - start_time)) for t in th_list: result = result_q.get() for critic_loss in result: self.scalars_manager.feed_critic_loss(critic_loss) def train_single_model(self, model_idx, result_q): logger.info("[train_single_model] model_idx:{}".format(model_idx)) critic_loss_list = [] lock = self.locks[model_idx] memory = self.rpm actor_lr = ACTOR_LR * (1.0 - 0.05 * model_idx) critic_lr = CRITIC_LR * (1.0 + 0.1 * model_idx) for T in range(TRAIN_TIMES): [states, actions, rewards, new_states, dones] = memory.sample_batch(BATCH_SIZE) lock.acquire() critic_loss = self.agent.learn(states, actions, rewards, new_states, dones, actor_lr, critic_lr, model_idx) lock.release() critic_loss_list.append(critic_loss) result_q.put(critic_loss_list) def keep_training(self): episode_count = 1000000 for T in range(episode_count): if self.rpm.size() > BATCH_SIZE * args.warm_start_batchs: self.learn() logger.info( "[keep_training/{}] trying to acq a new env".format(T)) # Keep training and predicting balance # After training, waiting for a ready client, and set the client start new episode ready_client_event = self.ready_client_queue.get() ready_client_event.set() if np.mod(T, 100) == 0: logger.info("saving models") self.save(T) if np.mod(T, 10000) == 0: logger.info("saving rpm") self.save_rpm() def save_rpm(self): save_path = os.path.join(logger.get_dir(), "rpm.npz") self.rpm.save(save_path) def restore_rpm(self, rpm_dir): self.rpm.load(rpm_dir) def save(self, T): save_path = os.path.join(logger.get_dir(), 'model_every_100_episodes/step-{}'.format(T)) self.agent.save_params(save_path) def restore(self, model_path, restore_from_one_head): logger.info('restore model from {}'.format(model_path)) self.agent.load_params(model_path, restore_from_one_head) def add_episode_rpm(self, episode_rpm): for x in episode_rpm: self.rpm.append( obs=x[0], act=x[1], reward=x[2], next_obs=x[3], 
terminal=x[4]) def pred_batch(self, obs, model_idx=None): assert model_idx is not None batch_obs = np.expand_dims(obs, axis=0) self.locks[model_idx].acquire() action = self.agent.predict(batch_obs, model_idx) self.locks[model_idx].release() action = np.squeeze(action, axis=0) return action
def main(): parser = argparse.ArgumentParser() parser.add_argument('--num-envs', type=int, default=32) parser.add_argument('--t-max', type=int, default=1) parser.add_argument('--learning-rate', type=float, default=0.0002) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--steps-per-epoch', type=int, default=100000) parser.add_argument('--testing', type=int, default=0) parser.add_argument('--continue-training', type=int, default=0) parser.add_argument('--epoch-num', type=int, default=40) parser.add_argument('--start-epoch', type=int, default=20) parser.add_argument('--testing-epoch', type=int, default=0) parser.add_argument('--save-log', type=str, default='basic/log') parser.add_argument('--signal-num', type=int, default=4) parser.add_argument('--toxin', type=int, default=0) parser.add_argument('--a1-AC-folder', type=str, default='basic/a1_Qnet') parser.add_argument('--a2-AC-folder', type=str, default='basic/a2_Qnet') parser.add_argument('--a1-CDPG-folder', type=str, default='basic/a1_CDPG') parser.add_argument('--a2-CDPG-folder', type=str, default='basic/a2_CDPG') parser.add_argument('--eps-start', type=float, default=0.1) parser.add_argument('--replay-start-size', type=int, default=50) parser.add_argument('--decay-rate', type=int, default=50000) parser.add_argument('--replay-memory-size', type=int, default=1000000) parser.add_argument('--eps-min', type=float, default=0.05) args = parser.parse_args() config = Config(args) t_max = args.t_max q_ctx = config.ctx steps_per_epoch = args.steps_per_epoch np.random.seed(args.seed) start_epoch = args.start_epoch testing_epoch = args.testing_epoch save_log = args.save_log epoch_num = args.epoch_num epoch_range = range(epoch_num) signal_num = args.signal_num toxin = args.toxin a1_Qnet_folder = args.a1_AC_folder a2_Qnet_folder = args.a2_AC_folder freeze_interval = 1000 update_interval = 5 replay_memory_size = args.replay_memory_size discount = 0.99 replay_start_size = args.replay_start_size history_length = 1 eps_start = args.eps_start eps_min = args.eps_min eps_decay = (eps_start - eps_min) / args.decay_rate eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 testing = args.testing testing = True if testing == 1 else False continue_training = args.continue_training continue_training = True if continue_training == 1 else False rewards = { "positive": 1.0, "negative": -1.0, "tick": -0.002, "loss": -2.0, "win": 2.0 } game = HunterWorld(width=256, height=256, num_preys=10, draw=False, num_hunters=2, num_toxins=toxin) env = PLE(game, fps=30, force_fps=True, display_screen=False, reward_values=rewards, resized_rows=80, resized_cols=80, num_steps=2) action_set = env.get_action_set() action_map1 = [] for action in action_set[0].values(): action_map1.append(action) action_map2 = [] for action in action_set[1].values(): action_map2.append(action) action_num = len(action_map1) replay_memory1 = ReplayMemory(state_dim=(2, 74 + signal_num), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size, state_dtype='float64') replay_memory2 = ReplayMemory(state_dim=(2, 74 + signal_num), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size, state_dtype='float64') a1_target1 = Qnetwork(actions_num=action_num, signal_num=signal_num, q_ctx=q_ctx, isTrain=False, batch_size=1, dir=dir, folder=a1_Qnet_folder) a1_target32 = Qnetwork(actions_num=action_num, signal_num=signal_num, q_ctx=q_ctx, isTrain=False, batch_size=32, dir=dir, 
folder=a1_Qnet_folder) a1_Qnet = Qnetwork(actions_num=action_num, signal_num=signal_num, q_ctx=q_ctx, isTrain=True, batch_size=32, dir=dir, folder=a1_Qnet_folder) a1_Qnet_last = Qnetwork(actions_num=action_num, signal_num=signal_num, q_ctx=q_ctx, isTrain=True, batch_size=32, dir=dir, folder=a1_Qnet_folder) a2_target1 = Qnetwork(actions_num=action_num, signal_num=signal_num, q_ctx=q_ctx, isTrain=False, batch_size=1, dir=dir, folder=a2_Qnet_folder) a2_target32 = Qnetwork(actions_num=action_num, signal_num=signal_num, q_ctx=q_ctx, isTrain=False, batch_size=32, dir=dir, folder=a2_Qnet_folder) a2_Qnet = Qnetwork(actions_num=action_num, signal_num=signal_num, q_ctx=q_ctx, isTrain=True, batch_size=32, dir=dir, folder=a2_Qnet_folder) a2_Qnet_last = Qnetwork(actions_num=action_num, signal_num=signal_num, q_ctx=q_ctx, isTrain=True, batch_size=32, dir=dir, folder=a1_Qnet_folder) training_steps = 0 total_steps = 0 if testing: env.force_fps = False env.game.draw = True env.display_screen = True a1_Qnet.load_params(testing_epoch) a2_Qnet.load_params(testing_epoch) elif continue_training: epoch_range = range(start_epoch, epoch_num + start_epoch) a1_Qnet.load_params(start_epoch - 1) a2_Qnet.load_params(start_epoch - 1) # logging_config(logging, dir, save_log, file_name) # else: # logging_config(logging, dir, save_log, file_name) copyTargetQNetwork(a1_Qnet.model, a1_target1.model) copyTargetQNetwork(a1_Qnet.model, a1_target32.model) copyTargetQNetwork(a2_Qnet.model, a2_target1.model) copyTargetQNetwork(a2_Qnet.model, a2_target32.model) logging.info('args=%s' % args) logging.info('config=%s' % config.__dict__) print_params(logging, a1_Qnet.model) zero_gradient_4 = mx.nd.array(np.zeros((32, signal_num)), ctx=q_ctx) zero_gradient_1 = mx.nd.array(np.zeros((32,)), ctx=q_ctx) for epoch in epoch_range: steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() env.reset_game() while steps_left > 0: episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 episode_reward = 0 episode_step = 0 collisions = 0.0 time_episode_start = time.time() env.reset_game() signal_buffer1 = np.zeros((signal_num,)) signal_buffer2 = np.zeros((signal_num,)) next_ob = np.zeros((2, 74)) while not env.game_over(): if replay_memory1.size >= history_length and replay_memory1.size > replay_start_size: do_exploration = (np.random.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action1 = np.random.randint(action_num) action2 = np.random.randint(action_num) signal1 = np.zeros((signal_num,)) signal2 = np.zeros((signal_num,)) else: current_state1 = replay_memory1.latest_slice()[0] current_state2 = replay_memory2.latest_slice()[0] a1_target1.model.forward( mx.io.DataBatch( [nd.array(current_state1[:, 0:-4], ctx=q_ctx), nd.array(signal_buffer2.reshape(1, 4), ctx=q_ctx)], [])) signal1, q_value1 = a1_target1.model.get_outputs() signal1 = signal1.asnumpy() q_value1 = q_value1.asnumpy() a2_target1.model.forward( mx.io.DataBatch( [nd.array(current_state2[:, 0:-4], ctx=q_ctx), nd.array(signal_buffer1.reshape(1, 4), ctx=q_ctx)], [])) signal2, q_value2 = a2_target1.model.get_outputs() signal2 = signal2.asnumpy() q_value2 = q_value2.asnumpy() action1 = numpy.argmax(q_value1) action2 = numpy.argmax(q_value2) episode_q_value += q_value1[0, action1] episode_q_value += q_value2[0, action2] episode_action_step += 1 else: signal1 = np.zeros((signal_num,)) signal2 = np.zeros((signal_num,)) action1 = np.random.randint(action_num) action2 = 
np.random.randint(action_num) ob1 = [] ob2 = [] ob1.append(np.append(next_ob[0].copy(), signal_buffer2)) ob2.append(np.append(next_ob[1].copy(), signal_buffer1)) next_ob, reward, terminal_flag = env.act([action_map1[action1], action_map2[action2]]) signal_buffer1 = signal1.copy() signal_buffer2 = signal2.copy() ob1.append(np.append(next_ob[0].copy(), signal_buffer2)) ob2.append(np.append(next_ob[1].copy(), signal_buffer1)) replay_memory1.append(ob1, action1, reward[0], terminal_flag) replay_memory2.append(ob2, action2, reward[1], terminal_flag) total_steps += 1 sum_reward = sum(reward) episode_reward += sum_reward if sum_reward < 0: collisions += 1 episode_step += 1 if total_steps % update_interval == 0 and replay_memory1.size > replay_start_size: training_steps += 1 state_batch1, actions1, rewards1, nextstate_batch1, terminate_flags1 = replay_memory1.sample( batch_size=minibatch_size) state_batch2, actions2, rewards2, nextstate_batch2, terminate_flags2 = replay_memory2.sample( batch_size=minibatch_size) actions_batch1 = nd.array(actions1, ctx=q_ctx) reward_batch1 = nd.array(rewards1, ctx=q_ctx) terminate_flags1 = nd.array(terminate_flags1, ctx=q_ctx) a1_next_batch1 = nextstate_batch1[:, 0, 1, :-4] a1_state_batch1 = state_batch1[:, 0, 1, :-4] a1_last_batch1 = state_batch1[:, 0, 0, :-4] a2_next_signal1 = nd.array(nextstate_batch1[:, 0, 1, -4:], ctx=q_ctx) a2_signal_batch1 = nd.array(state_batch1[:, 0, 1, -4:], ctx=q_ctx) a2_last_signal1 = nd.array(state_batch1[:, 0, 0, -4:], ctx=q_ctx) actions_batch2 = nd.array(actions2, ctx=q_ctx) reward_batch2 = nd.array(rewards2, ctx=q_ctx) terminate_flags2 = nd.array(terminate_flags2, ctx=q_ctx) a2_next_batch2 = nextstate_batch2[:, 0, 1, :-4] a2_state_batch2 = state_batch2[:, 0, 1, :-4] a2_last_batch2 = state_batch2[:, 0, 0, :-4] a1_next_signal2 = nd.array(nextstate_batch2[:, 0, 1, -4:], ctx=q_ctx) a1_signal_batch2 = nd.array(state_batch2[:, 0, 1, -4:], ctx=q_ctx) a1_last_signal2 = nd.array(state_batch2[:, 0, 0, -4:], ctx=q_ctx) a1_target32.model.forward( mx.io.DataBatch([nd.array(a1_next_batch1, ctx=q_ctx), a2_next_signal1], [])) next_Qnet1 = a1_target32.model.get_outputs()[1] a2_target32.model.forward( mx.io.DataBatch([nd.array(a2_next_batch2, ctx=q_ctx), a1_next_signal2], [])) next_Qnet2 = a2_target32.model.get_outputs()[1] y_batch1 = reward_batch1 + nd.choose_element_0index(next_Qnet1, nd.argmax_channel(next_Qnet1)) * ( 1.0 - terminate_flags1) * discount y_batch2 = reward_batch2 + nd.choose_element_0index(next_Qnet2, nd.argmax_channel(next_Qnet2)) * ( 1.0 - terminate_flags2) * discount a1_Qnet.model.forward( mx.io.DataBatch( [nd.array(a1_state_batch1, ctx=q_ctx), a2_signal_batch1, actions_batch1, y_batch1], []), is_train=True) a2_Qnet.model.forward( mx.io.DataBatch( [nd.array(a2_state_batch2, ctx=q_ctx), a1_signal_batch2, actions_batch2, y_batch2], []), is_train=True) copyTargetQNetwork(a1_Qnet.model, a1_Qnet_last.model) copyTargetQNetwork(a2_Qnet.model, a2_Qnet_last.model) a1_Qnet.model.backward([zero_gradient_4]) a2_Qnet.model.backward([zero_gradient_4]) grads_buffer1 = a1_Qnet.model._exec_group.execs[0].grad_dict['signal'][:] grads_buffer2 = a2_Qnet.model._exec_group.execs[0].grad_dict['signal'][:] a1_Qnet_last.model.forward( mx.io.DataBatch( [nd.array(a1_last_batch1, ctx=q_ctx), a2_last_signal1, actions_batch1, zero_gradient_1], []), is_train=True) a2_Qnet_last.model.forward( mx.io.DataBatch( [nd.array(a2_last_batch2, ctx=q_ctx), a1_last_signal2, actions_batch2, zero_gradient_1], []), is_train=True) a1_Qnet_last.model.backward([grads_buffer2]) 
a2_Qnet_last.model.backward([grads_buffer1]) a1_last_grads_dict = a1_Qnet_last.model._exec_group.execs[0].grad_dict a1_grads_dict = a1_Qnet.model._exec_group.execs[0].grad_dict a2_last_grads_dict = a2_Qnet_last.model._exec_group.execs[0].grad_dict a2_grads_dict = a2_Qnet.model._exec_group.execs[0].grad_dict # for name in a1_last_grads_dict.keys(): a1_grads_dict[name] += a1_last_grads_dict[name] a2_grads_dict[name] += a2_last_grads_dict[name] a1_Qnet.model.update() a2_Qnet.model.update() if training_steps % 10 == 0: loss1 = 0.5 * nd.square( nd.choose_element_0index(a1_Qnet.model.get_outputs()[1], actions_batch1) - y_batch1) loss2 = 0.5 * nd.square( nd.choose_element_0index(a2_Qnet.model.get_outputs()[1], actions_batch2) - y_batch2) episode_loss += nd.sum(loss1).asnumpy() episode_loss += nd.sum(loss2).asnumpy() episode_update_step += 1 if training_steps % freeze_interval == 0: copyTargetQNetwork(a1_Qnet.model, a1_target1.model) copyTargetQNetwork(a1_Qnet.model, a1_target32.model) copyTargetQNetwork(a2_Qnet.model, a2_target1.model) copyTargetQNetwork(a2_Qnet.model, a2_target32.model) steps_left -= episode_step time_episode_end = time.time() epoch_reward += episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, episode_step, steps_per_epoch, episode_reward, episode_step / (time_episode_end - time_episode_start), eps_curr) info_str += ", Collision:%f/%d " % (collisions / episode_step, collisions) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step, episode_update_step * 10) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d " % (episode_q_value / episode_action_step, episode_action_step) if episode % 1 == 0: logging.info(info_str) print info_str end = time.time() fps = steps_per_epoch / (end - start) a1_Qnet.save_params(epoch) a2_Qnet.save_params(epoch) logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" % (epoch, fps, epoch_reward / float(episode), episode))
class Agent(object): def __init__(self, env): # hyperparameter self.frame_size = 84 self.batch_size = 32 self.discount_factor = 0.99 self.target_network_update_frequency = 5 self.agent_history_length = 4 self.action_repeat = 4 self.update_frequency = 4 # environment self.env = env # state dimension self.state_dim = env.observation_space.shape[0] # action dimension self.action_dim = env.action_space.n # self.action_dim = 10 # replay memory self.replay_memory_size = 50 self.replay_start_size = 25000 // self.replay_memory_size self.max_files_num = 500000 // self.replay_memory_size self.replay_memory = ReplayMemory(self.replay_memory_size, self.frame_size, self.agent_history_length, self.max_files_num) # Q function self.q = DQN(self.action_dim) self.target_q = DQN(self.action_dim) # total reward of a episode self.save_epi_reward = [] self.save_mean_q_value = [] # self.stop_train = 30 def preprocess(self, frame): frame = np.reshape( cv2.resize(frame[0:188, 23:136, :], dsize=(self.frame_size, self.frame_size))[..., 0], (1, self.frame_size, self.frame_size, 1)) return np.array(frame, dtype=np.float32) / 255 def train(self, episodes): train_ep = 0 # repeat episode for e in range(episodes): # if stop_train_count > self.stop_train: # self.q.save_weights('./save_weights/boxing_dqn.h5') # print("이제 잘하네!") # break # initialize frames, episode_reward, done repeated_action, frames, done = 0, 0, False sum_q_value = 0 episode_reward = 0 # reset env and observe initial state initial_frame = self.env.reset() seq = [self.preprocess(initial_frame)] for _ in range(self.agent_history_length - 1): obs, _, _, _ = self.env.step(0) seq.append(self.preprocess(obs)) seq = np.stack(seq, axis=3) seq = np.reshape(seq, (1, self.frame_size, self.frame_size, self.agent_history_length)) while not done: frames += 1 # renderprint(idx, end='\r') action = self.q.get_action(seq) # reapted action for each 4 frames if frames % self.action_repeat != 0: self.env.step(repeated_action) continue repeated_action = action # observe next frame observation, reward, done, info = self.env.step(action) # modify reward if reward > 0: print('hit!') reward = np.clip(reward, -1, 1) # preprocess for next sequence next_seq = np.append(self.preprocess(observation), seq[..., :3], axis=3) # store transition in replay memory self.replay_memory.append(seq, action, reward, next_seq, done) # # check what the agent see # test_img = np.reshape(next_seq, (84, 84, 4)) # test_img = cv2.resize(test_img, dsize=(300, 300), interpolation=cv2.INTER_AREA) # cv2.imshow('obs', test_img) # if cv2.waitKey(25)==ord('q') or done: # cv2.destroyAllWindows() # wait for fill data in replay memory if len(os.listdir( './replay_data/seqs')) < self.replay_start_size: seq = next_seq continue # sample batch seqs, actions, rewards, next_seqs, dones = self.replay_memory.sample( self.batch_size) # argmax action from current q a_next_action = self.q.model(next_seqs)[1] argmax_action = np.argmax(a_next_action, axis=1) argmax_action = tf.one_hot(argmax_action, self.action_dim) # calculate Q(s', a') target_vs, target_as = self.target_q.model(next_seqs) target_qs = target_as \ + (target_vs - tf.reshape(tf.reduce_mean(target_as, axis=1), shape=(len(target_as), 1))) # Double dqn targets = rewards + (1 - dones) * ( self.discount_factor * tf.reduce_sum(target_qs * argmax_action, axis=1)) # train input_states = np.reshape( seqs, (self.batch_size, self.frame_size, self.frame_size, self.agent_history_length)) input_actions = tf.one_hot(actions, self.action_dim) self.q.train(input_states, 
input_actions, targets) seq = next_seq v, a = self.q.model(seq) q = v + (a - tf.reduce_mean(a)) sum_q_value += np.mean(q) # total reward episode_reward += reward if done: train_ep += 1 if train_ep > 0: mean_q_value = sum_q_value / (frames // 4) if train_ep % self.target_network_update_frequency == 0: self.target_q.model.set_weights(self.q.model.get_weights()) print('episode: {}, Reward: {}, Epsilon: {:.5f}, Q-value: {}'. format(train_ep, episode_reward, self.q.epsilon, mean_q_value)) self.save_epi_reward.append(episode_reward) self.save_mean_q_value.append(mean_q_value) if train_ep % 100 == 0: self.q.save_weights('./save_weights/dqn_boxing_' + str(train_ep) + 'epi.h5') np.savetxt('.save_weights/pendulum_epi_reward.txt', self.save_epi_reward) np.savetxt('.save_weights/pendulum_epi_reward.txt', self.save_mean_q_value) def test(self, path): train_ep = 0 # initialize sequence episode_reward, done = 0, False # reset env and observe initial state initial_frame = self.env.reset() seq = [self.preprocess(initial_frame)] for _ in range(self.agent_history_length - 1): obs, _, _, _ = self.env.step(0) seq.append(self.preprocess(obs)) seq = np.stack(seq, axis=3) seq = np.reshape( seq, (1, self.frame_size, self.frame_size, self.agent_history_length)) # init done, total reward, frames, action frames = 0 mean_q_value = 0 self.q.train(seq, 0, 0) self.q.model.load_weights(path) while not done: time.sleep(0.05) frames += 1 # # render # self.env.render() # get action action = np.argmax(self.q.model(seq)[1]) # observe next frame observation, reward, done, info = self.env.step(action) # preprocess for next sequence next_seq = np.append(self.preprocess(observation), seq[..., :3], axis=3) # store transition in replay memory seq = next_seq # check what the agent see # test_img = np.reshape(seq, (84, 84, 4)) test_img = cv2.resize(test_img, dsize=(300, 300), interpolation=cv2.INTER_AREA) cv2.imshow('obs', test_img) if cv2.waitKey(25) == ord('q') or done: cv2.destroyAllWindows() # graph episodes and rewards def plot_result(self): plt.subplot(211) plt.plot(self.save_epi_reward) plt.subplot(212) plt.plot(self.save_mean_q_value) plt.savefig('reward_meanQ.png') plt.show()
class DQN(object):
    """Deep Q Network implementation for usage with CircleEnv.

    Attributes:
        observation_size: Size of the observation space.
        action_size: Size of the action space.
        model: The Q function approximator.
        target_model: Separate target function approximator for fixed TD targets.
        memory_size: Replay memory capacity.
        memory: Experience replay memory buffer.
        batch_size: Size of minibatches used in training.
        gamma: Discount rate for return.
        epsilon: Random action probability.
        epsilon_min: Minimum random action probability.
        epsilon_decay: Decay rate for random action probability.
    """

    def __init__(self, observation_size, action_size, model, target_model,
                 memory_size, batch_size, gamma, epsilon, epsilon_min,
                 epsilon_decay):
        self.observation_size = observation_size
        self.action_size = action_size
        self.model = model
        self.target_model = target_model
        self.memory_size = memory_size
        self.memory = ReplayMemory(size=self.memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.training_data = []

    def act(self, state):
        """Choose an action based on an epsilon-greedy policy."""
        # Must reshape state to feed into the Keras model.
        state = np.reshape(state, (1, self.observation_size))
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        else:
            q_values = self.model.predict(state)
            return np.argmax(q_values[0])

    def exploit(self, state):
        state = np.reshape(state, (1, self.observation_size))
        q_values = self.model.predict(state)
        return int(np.argmax(q_values[0]))

    def replay(self):
        if len(self.memory) >= self.batch_size:
            minibatch = self.memory.sample(self.batch_size)
            for experience in minibatch:
                state, action, reward, next_state, episode, step, done = experience
                state = np.reshape(state, (1, self.observation_size))
                next_state = np.reshape(next_state, (1, self.observation_size))
                target = reward
                if not done:
                    target = reward + self.gamma * np.amax(
                        self.target_model.predict(next_state)[0])
                target_f = self.model.predict(state)
                target_f[0][action] = target
                self.model.fit(state, target_f, epochs=1, verbose=0)
                self.training_data.append(
                    (state[0][0], state[0][1], action, reward, next_state[0][0],
                     next_state[0][1], episode, step, done))

    def remember(self, experience):
        """Stores experiences in the experience replay buffer."""
        self.memory.append(experience)

    def save_training_data(self, file_name):
        data = pd.DataFrame(self.training_data,
                            columns=[
                                'state_x', 'state_y', 'action', 'reward',
                                'next_state_x', 'next_state_y', 'episode',
                                'step', 'done'
                            ])
        data.to_pickle(file_name)

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def replay_offline(self, experience):
        state_x, state_y, action, reward, next_state_x, next_state_y, episode, step, done = experience
        # Reshape to (1, 2) to feed into the Keras model.
        state = np.array([[state_x, state_y]])
        next_state = np.array([[next_state_x, next_state_y]])
        # The action is stored as a float on disk, so convert back to int.
        action = int(action)
        target = reward
        if not done:
            target = reward + self.gamma * np.amax(
                self.target_model.predict(next_state)[0])
        target_f = self.model.predict(state)
        target_f[0][action] = target
        self.model.fit(state, target_f, epochs=1, verbose=0)
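# A minimal usage sketch of the DQN class above, assuming a gym-style environment
# with a 2-D observation (as in CircleEnv) and two small Keras models produced by a
# hypothetical `build_model` helper; all hyperparameter values are illustrative only.
def train_dqn(env, build_model, episodes=500, max_steps=200):
    agent = DQN(observation_size=2,
                action_size=env.action_space.n,
                model=build_model(),
                target_model=build_model(),
                memory_size=10000,
                batch_size=32,
                gamma=0.95,
                epsilon=1.0,
                epsilon_min=0.01,
                epsilon_decay=0.995)
    for episode in range(episodes):
        state = env.reset()
        for step in range(max_steps):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # Experiences carry episode/step indices, matching DQN.replay's unpacking.
            agent.remember((state, action, reward, next_state, episode, step, done))
            agent.replay()  # fit on a minibatch once enough experiences are stored
            state = next_state
            if done:
                break
        agent.update_target_model()  # refresh the fixed TD-target network
    return agent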
def main(): parser = argparse.ArgumentParser() parser.add_argument('--num-envs', type=int, default=32) parser.add_argument('--t-max', type=int, default=1) parser.add_argument('--learning-rate', type=float, default=0.0002) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--steps-per-epoch', type=int, default=100000) parser.add_argument('--testing', type=int, default=0) parser.add_argument('--continue-training', type=int, default=0) parser.add_argument('--epoch-num', type=int, default=40) parser.add_argument('--start-epoch', type=int, default=20) parser.add_argument('--testing-epoch', type=int, default=0) parser.add_argument('--save-log', type=str, default='basic/log') parser.add_argument('--signal-num', type=int, default=4) parser.add_argument('--toxin', type=int, default=0) parser.add_argument('--a1-AC-folder', type=str, default='basic/a1_Qnet') parser.add_argument('--a2-AC-folder', type=str, default='basic/a2_Qnet') parser.add_argument('--a1-CDPG-folder', type=str, default='basic/a1_CDPG') parser.add_argument('--a2-CDPG-folder', type=str, default='basic/a2_CDPG') parser.add_argument('--eps-start', type=float, default=1.0) parser.add_argument('--replay-start-size', type=int, default=50000) parser.add_argument('--decay-rate', type=int, default=500000) parser.add_argument('--replay-memory-size', type=int, default=1000000) parser.add_argument('--eps-min', type=float, default=0.05) args = parser.parse_args() config = Config(args) t_max = args.t_max q_ctx = config.ctx steps_per_epoch = args.steps_per_epoch np.random.seed(args.seed) start_epoch = args.start_epoch testing_epoch = args.testing_epoch save_log = args.save_log epoch_num = args.epoch_num epoch_range = range(epoch_num) signal_num = args.signal_num toxin = args.toxin a1_Qnet_folder = args.a1_AC_folder a2_Qnet_folder = args.a2_AC_folder a1_CDPG_folder = args.a1_CDPG_folder a2_CDPG_folder = args.a2_CDPG_folder freeze_interval = 10000 update_interval = 5 replay_memory_size = args.replay_memory_size discount = 0.99 replay_start_size = args.replay_start_size history_length = 1 eps_start = args.eps_start eps_min = args.eps_min eps_decay = (eps_start - eps_min) / args.decay_rate eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 testing = args.testing testing = True if testing == 1 else False continue_training = args.continue_training continue_training = True if continue_training == 1 else False rewards = { "positive": 1.0, "negative": -1.0, "tick": -0.002, "loss": -2.0, "win": 2.0 } game = HunterWorld(width=256, height=256, num_preys=10, draw=False, num_hunters=2, num_toxins=toxin) env = PLE(game, fps=30, force_fps=True, display_screen=False, reward_values=rewards, resized_rows=80, resized_cols=80, num_steps=2) action_set = env.get_action_set() action_map1 = [] for action in action_set[0].values(): action_map1.append(action) action_map2 = [] for action in action_set[1].values(): action_map2.append(action) action_num = len(action_map1) replay_memory1 = ReplayMemory(state_dim=(2, 74), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size, state_dtype='float64') replay_memory2 = ReplayMemory(state_dim=(2, 74), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size, state_dtype='float64') a1_CDPG = CDPG(state_dim=74, signal_num=signal_num, dir=dir, folder=a1_CDPG_folder, config=config) a1_CDPG_target = CDPG(state_dim=74, signal_num=signal_num, dir=dir, folder=a1_CDPG_folder, config=config) a1_Qnet = 
QNet(state_dim=74, signal_num=signal_num, act_space=action_num, dir=dir, folder=a1_Qnet_folder, config=config) a1_Qnet_target = QNet(state_dim=74, signal_num=signal_num, act_space=action_num, dir=dir, folder=a1_Qnet_folder, config=config) a2_CDPG = CDPG(state_dim=74, signal_num=signal_num, dir=dir, folder=a2_CDPG_folder, config=config) a2_CDPG_target = CDPG(state_dim=74, signal_num=signal_num, dir=dir, folder=a2_CDPG_folder, config=config) a2_Qnet = QNet(state_dim=74, signal_num=signal_num, act_space=action_num, dir=dir, folder=a2_Qnet_folder, config=config) a2_Qnet_target = QNet(state_dim=74, signal_num=signal_num, act_space=action_num, dir=dir, folder=a2_Qnet_folder, config=config) training_steps = 0 total_steps = 0 if testing: env.force_fps = False env.game.draw = True env.display_screen = True a1_Qnet.load_params(testing_epoch) a2_Qnet.load_params(testing_epoch) a1_CDPG.load_params(testing_epoch) a2_CDPG.load_params(testing_epoch) elif continue_training: epoch_range = range(start_epoch, epoch_num + start_epoch) a1_Qnet.load_params(start_epoch - 1) a2_Qnet.load_params(start_epoch - 1) a1_CDPG.load_params(start_epoch - 1) a2_CDPG.load_params(start_epoch - 1) logging_config(logging, dir, save_log, file_name) else: logging_config(logging, dir, save_log, file_name) copy_params_to(a1_Qnet, a1_Qnet_target) copy_params_to(a1_CDPG, a1_CDPG_target) copy_params_to(a2_Qnet, a2_Qnet_target) copy_params_to(a2_CDPG, a2_CDPG_target) logging.info('args=%s' % args) logging.info('config=%s' % config.__dict__) print_params(logging, a1_Qnet.model) print_params(logging, a1_CDPG.model) for epoch in epoch_range: steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() env.reset_game() while steps_left > 0: episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 episode_reward = 0 episode_step = 0 collisions = 0.0 time_episode_start = time.time() env.reset_game() while not env.game_over(): if replay_memory1.size >= history_length and replay_memory1.size > replay_start_size: do_exploration = (np.random.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action1 = np.random.randint(action_num) action2 = np.random.randint(action_num) else: current_state1 = replay_memory1.latest_slice() current_state2 = replay_memory2.latest_slice() a1_current_state = current_state1[:, 0] a2_current_state = current_state2[:, 1] signal1 = a1_CDPG_target.forward(a2_current_state, is_train=False)[0] signal2 = a2_CDPG_target.forward(a1_current_state, is_train=False)[0] q_value1 = a1_Qnet_target.forward( a1_current_state, signal1, is_train=False)[0].asnumpy() q_value2 = a2_Qnet_target.forward( a2_current_state, signal2, is_train=False)[0].asnumpy() action1 = numpy.argmax(q_value1) action2 = numpy.argmax(q_value2) episode_q_value += q_value1[:, action1] episode_q_value += q_value2[:, action2] episode_action_step += 1 else: action1 = np.random.randint(action_num) action2 = np.random.randint(action_num) next_ob, reward, terminal_flag = env.act( [action_map1[action1], action_map2[action2]]) replay_memory1.append(next_ob, action1, reward[0], terminal_flag) replay_memory2.append(next_ob, action2, reward[1], terminal_flag) total_steps += 1 sum_reward = sum(reward) episode_reward += sum_reward if sum_reward < 0: collisions += 1 episode_step += 1 if total_steps % update_interval == 0 and replay_memory1.size > replay_start_size: training_steps += 1 state_batch1, actions1, rewards1, nextstate_batch1, terminate_flags1 = replay_memory1.sample( 
batch_size=minibatch_size) state_batch2, actions2, rewards2, nextstate_batch2, terminate_flags2 = replay_memory2.sample( batch_size=minibatch_size) actions_batch1 = nd.array(actions1, ctx=q_ctx) reward_batch1 = nd.array(rewards1, ctx=q_ctx) terminate_flags1 = nd.array(terminate_flags1, ctx=q_ctx) actions_batch2 = nd.array(actions2, ctx=q_ctx) reward_batch2 = nd.array(rewards2, ctx=q_ctx) terminate_flags = nd.array(terminate_flags2, ctx=q_ctx) a1_signal_target = \ a1_CDPG_target.forward(nextstate_batch1[:, :, 1].reshape(32, 74), is_train=False)[0] next_Qnet1 = \ a1_Qnet_target.forward(nextstate_batch1[:, :, 0].reshape(32, 74), a1_signal_target, is_train=False)[ 0] a2_signal_target = \ a2_CDPG_target.forward(nextstate_batch2[:, :, 0].reshape(32, 74), is_train=False)[0] next_Qnet2 = \ a2_Qnet_target.forward(nextstate_batch2[:, :, 1].reshape(32, 74), a2_signal_target, is_train=False)[ 0] y_batch1 = reward_batch1 + nd.choose_element_0index( next_Qnet1, nd.argmax_channel(next_Qnet1)) * ( 1.0 - terminate_flags1) * discount y_batch2 = reward_batch2 + nd.choose_element_0index( next_Qnet2, nd.argmax_channel(next_Qnet2)) * ( 1.0 - terminate_flags) * discount a1_signal = a1_CDPG.forward(state_batch1[:, :, 1].reshape( 32, 74), is_train=True)[0] Qnet1 = a1_Qnet.forward(state_batch1[:, :, 0].reshape(32, 74), a1_signal, is_train=True)[0] a2_signal = a2_CDPG.forward(state_batch2[:, :, 0].reshape( 32, 74), is_train=True)[0] Qnet2 = a2_Qnet.forward(state_batch2[:, :, 1].reshape(32, 74), a2_signal, is_train=True)[0] grads1 = np.zeros(Qnet1.shape) tmp1 = (nd.choose_element_0index(Qnet1, actions_batch1) - y_batch1).asnumpy() grads1[np.arange(grads1.shape[0]), actions1] = np.clip(tmp1, -1, 1) grads1 = mx.nd.array(grads1, ctx=q_ctx) grads2 = np.zeros(Qnet2.shape) tmp2 = (nd.choose_element_0index(Qnet2, actions_batch2) - y_batch2).asnumpy() grads2[np.arange(grads2.shape[0]), actions2] = np.clip(tmp2, -1, 1) grads2 = mx.nd.array(grads2, ctx=q_ctx) a1_Qnet.model.backward(out_grads=[grads1]) a1_CDPG.model.backward(out_grads=[ a1_Qnet.model._exec_group.execs[0].grad_dict['signal'] [:] ]) a1_Qnet.model.update() a1_CDPG.model.update() a2_Qnet.model.backward(out_grads=[grads2]) a2_CDPG.model.backward(out_grads=[ a2_Qnet.model._exec_group.execs[0].grad_dict['signal'] [:] ]) a2_Qnet.model.update() a2_CDPG.model.update() if training_steps % 10 == 0: loss1 = 0.5 * nd.square(grads1) loss2 = 0.5 * nd.square(grads2) episode_loss += nd.sum(loss1).asnumpy() episode_loss += nd.sum(loss2).asnumpy() episode_update_step += 1 if training_steps % freeze_interval == 0: copy_params_to(a1_Qnet, a1_Qnet_target) copy_params_to(a1_CDPG, a1_CDPG_target) copy_params_to(a2_Qnet, a2_Qnet_target) copy_params_to(a2_CDPG, a2_CDPG_target) steps_left -= episode_step time_episode_end = time.time() epoch_reward += episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, episode_step, steps_per_epoch, episode_reward, episode_step / (time_episode_end - time_episode_start), eps_curr) info_str += ", Collision:%f/%d " % (collisions / episode_step, collisions) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % (episode_loss / episode_update_step, episode_update_step * 10) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d " % ( episode_q_value / episode_action_step, episode_action_step) if episode % 1 == 0: logging.info(info_str) print info_str end = time.time() fps = steps_per_epoch / (end - start) a1_Qnet.save_params(epoch) a1_CDPG.save_params(epoch) 
        a2_Qnet.save_params(epoch)
        a2_CDPG.save_params(epoch)
        logging.info("Epoch:%d, FPS:%f, Avg Reward: %f/%d" %
                     (epoch, fps, epoch_reward / float(episode), episode))
class Learner(object): def __init__(self, args): if machine_info.is_gpu_available(): assert get_gpu_count() == 1, 'Only support training in single GPU,\ Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` .' else: cpu_num = os.environ.get('CPU_NUM') assert cpu_num is not None and cpu_num == '1', 'Only support training in single CPU,\ Please set environment variable: `export CPU_NUM=1`.' model = OpenSimModel(OBS_DIM, VEL_DIM, ACT_DIM) algorithm = parl.algorithms.DDPG( model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) self.agent = OpenSimAgent(algorithm, OBS_DIM, ACT_DIM) self.rpm = ReplayMemory(args.rpm_size, OBS_DIM, ACT_DIM) if args.restore_rpm_path is not None: self.rpm.load(args.restore_rpm_path) if args.restore_model_path is not None: self.restore(args.restore_model_path) # add lock between training and predicting self.model_lock = threading.Lock() # add lock when appending data to rpm or writing scalars to summary self.memory_lock = threading.Lock() self.ready_actor_queue = queue.Queue() self.total_steps = 0 self.noiselevel = 0.5 self.critic_loss_stat = WindowStat(500) self.env_reward_stat = WindowStat(500) self.shaping_reward_stat = WindowStat(500) self.max_env_reward = 0 # thread to keep training learn_thread = threading.Thread(target=self.keep_training) learn_thread.setDaemon(True) learn_thread.start() self.create_actors() def create_actors(self): """Connect to the cluster and start sampling of the remote actor. """ parl.connect(args.cluster_address, ['official_obs_scaler.npz']) for i in range(args.actor_num): logger.info('Remote actor count: {}'.format(i + 1)) remote_thread = threading.Thread(target=self.run_remote_sample) remote_thread.setDaemon(True) remote_thread.start() # There is a memory-leak problem in osim-rl package. # So we will dynamically add actors when remote actors killed due to excessive memory usage. time.sleep(10 * 60) parl_client = get_global_client() while True: if parl_client.actor_num < args.actor_num: logger.info( 'Dynamic adding acotr, current actor num:{}'.format( parl_client.actor_num)) remote_thread = threading.Thread(target=self.run_remote_sample) remote_thread.setDaemon(True) remote_thread.start() time.sleep(5) def _new_ready_actor(self): """ The actor is ready to start new episode, but blocking until training thread call actor_ready_event.set() """ actor_ready_event = threading.Event() self.ready_actor_queue.put(actor_ready_event) logger.info( "[new_avaliabe_actor] approximate size of ready actors:{}".format( self.ready_actor_queue.qsize())) actor_ready_event.wait() def run_remote_sample(self): remote_actor = Actor( difficulty=args.difficulty, vel_penalty_coeff=args.vel_penalty_coeff, muscle_penalty_coeff=args.muscle_penalty_coeff, penalty_coeff=args.penalty_coeff, only_first_target=args.only_first_target) actor_state = ActorState() while True: obs = remote_actor.reset() actor_state.reset() while True: actor_state.memory.append( TransitionExperience( obs=obs, action=None, reward=None, info=None, timestamp=time.time())) action = self.pred_batch(obs) # For each target, decay noise as the steps increase. 
step = len( actor_state.memory) - actor_state.last_target_changed_steps current_noise = self.noiselevel * (0.98**(step - 1)) noise = np.zeros((ACT_DIM, ), dtype=np.float32) if actor_state.ident % 3 == 0: if step % 5 == 0: noise = np.random.randn(ACT_DIM) * current_noise elif actor_state.ident % 3 == 1: if step % 5 == 0: noise = np.random.randn(ACT_DIM) * current_noise * 2 action += noise action = np.clip(action, -1, 1) obs, reward, done, info = remote_actor.step(action) reward_scale = (1 - GAMMA) info['shaping_reward'] *= reward_scale actor_state.memory[-1].reward = reward actor_state.memory[-1].info = info actor_state.memory[-1].action = action if 'target_changed' in info and info['target_changed']: actor_state.update_last_target_changed() if done: self._parse_memory(actor_state, last_obs=obs) break self._new_ready_actor() def _parse_memory(self, actor_state, last_obs): mem = actor_state.memory n = len(mem) episode_shaping_reward = np.sum( [exp.info['shaping_reward'] for exp in mem]) episode_env_reward = np.sum([exp.info['env_reward'] for exp in mem]) episode_time = time.time() - mem[0].timestamp episode_rpm = [] for i in range(n - 1): episode_rpm.append([ mem[i].obs, mem[i].action, mem[i].info['shaping_reward'], mem[i + 1].obs, False ]) episode_rpm.append([ mem[-1].obs, mem[-1].action, mem[-1].info['shaping_reward'], last_obs, not mem[-1].info['timeout'] ]) with self.memory_lock: self.total_steps += n self.add_episode_rpm(episode_rpm) if actor_state.ident % 3 == 2: # trajectory without noise self.env_reward_stat.add(episode_env_reward) self.shaping_reward_stat.add(episode_shaping_reward) self.max_env_reward = max(self.max_env_reward, episode_env_reward) if self.env_reward_stat.count > 500: summary.add_scalar('recent_env_reward', self.env_reward_stat.mean, self.total_steps) summary.add_scalar('recent_shaping_reward', self.shaping_reward_stat.mean, self.total_steps) if self.critic_loss_stat.count > 500: summary.add_scalar('recent_critic_loss', self.critic_loss_stat.mean, self.total_steps) summary.add_scalar('episode_length', n, self.total_steps) summary.add_scalar('max_env_reward', self.max_env_reward, self.total_steps) summary.add_scalar('ready_actor_num', self.ready_actor_queue.qsize(), self.total_steps) summary.add_scalar('episode_time', episode_time, self.total_steps) self.noiselevel = self.noiselevel * NOISE_DECAY def learn(self): start_time = time.time() for T in range(args.train_times): [states, actions, rewards, new_states, dones] = self.rpm.sample_batch(BATCH_SIZE) with self.model_lock: critic_loss = self.agent.learn(states, actions, rewards, new_states, dones) self.critic_loss_stat.add(critic_loss) logger.info( "[learn] time consuming:{}".format(time.time() - start_time)) def keep_training(self): episode_count = 1000000 for T in range(episode_count): if self.rpm.size() > BATCH_SIZE * args.warm_start_batchs: self.learn() logger.info( "[keep_training/{}] trying to acq a new env".format(T)) # Keep training and predicting balance # After training, wait for a ready actor, and make the actor start new episode ready_actor_event = self.ready_actor_queue.get() ready_actor_event.set() if np.mod(T, 100) == 0: logger.info("saving models") self.save(T) if np.mod(T, 10000) == 0: logger.info("saving rpm") self.save_rpm() def save_rpm(self): save_path = os.path.join(logger.get_dir(), "rpm.npz") self.rpm.save(save_path) def save(self, T): save_path = os.path.join( logger.get_dir(), 'model_every_100_episodes/episodes-{}'.format(T)) self.agent.save(save_path) def restore(self, model_path): 
        logger.info('restore model from {}'.format(model_path))
        self.agent.restore(model_path)

    def add_episode_rpm(self, episode_rpm):
        for x in episode_rpm:
            self.rpm.append(
                obs=x[0], act=x[1], reward=x[2], next_obs=x[3], terminal=x[4])

    def pred_batch(self, obs):
        batch_obs = np.expand_dims(obs, axis=0)
        with self.model_lock:
            action = self.agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action, axis=0)
        return action
class DeepQLearner(object):
    def __init__(self, lib, env, working_dir, record_video_every=50):
        self.lib = lib
        self.total_steps = lib.get_number_of_steps_done()
        self.epsilon_decay_schedule = EpsilonDecaySchedule(1.0, 0.1, 1e6)
        self.q_estimator = lib.get_q_estimator()
        self.target_estimator = lib.get_target_estimator()
        self.state_processor = lib.get_state_processor()
        self.policy = make_epsilon_greedy_policy(self.q_estimator, len(VALID_ACTIONS))
        self.replay_memory = ReplayMemory()
        self.replay_memory.init_replay_memory(
            env, self.policy, self.state_processor,
            lambda: self.epsilon_decay_schedule.next_epsilon(self.total_steps))
        self.env = Monitor(
            env,
            directory=os.path.join(working_dir, "monitor"),
            resume=True,
            video_callable=lambda count: count % record_video_every == 0)
        self.env = DownsampleFrameWrapper(self.env, self.state_processor.process)
        self._set_up_logging(working_dir)

    def _set_up_logging(self, base_dir):
        self.log = Log()
        self.log.add_logger(ConsoleLogger())
        self.log.add_logger(TFBoardLogger(base_dir))

    def reset_env(self):
        state = self.env.reset()
        state = np.stack([state] * 4, axis=2)
        return state

    def step(self, state, action):
        next_state, reward, done, _ = self.env.step(VALID_ACTIONS[action])
        next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, 2), axis=2)
        return next_state, reward, done

    def run(self,
            episodes_to_run,
            update_target_estimator_every=10000,
            discount_factor=0.99,
            batch_size=32,
            save_model_every=25):
        # TODO: Shorten this function as much as possible
        for episode in range(episodes_to_run):
            if episode % save_model_every == 0:
                self.lib.save()
            state = self.reset_env()
            loss = None
            episode_reward = 0
            episode_length = 0
            for timestep in itertools.count():
                epsilon = self.epsilon_decay_schedule.next_epsilon(self.total_steps)
                self.log.log_epsilon(epsilon, self.total_steps)
                if self.total_steps % update_target_estimator_every == 0:
                    self.target_estimator.copy_parameters_from(self.q_estimator)
                self.log.log_step(timestep, self.total_steps, episode,
                                  episodes_to_run, loss)
                action = self.policy(state, epsilon)
                next_state, reward, done = self.step(state, action)
                self.replay_memory.append(
                    Transition(state, action, reward, next_state, done))

                # Sample a minibatch from the replay memory
                samples = self.replay_memory.sample(batch_size)
                states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                    np.array, zip(*samples))

                # Calculate q values and targets (Double DQN)
                q_values_next = self.q_estimator.predict(next_states_batch)
                best_actions = np.argmax(q_values_next, axis=1)
                q_values_next_target = self.target_estimator.predict(next_states_batch)
                targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                    discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

                # Perform gradient descent update
                states_batch = np.array(states_batch)
                loss = self.q_estimator.update(states_batch, action_batch, targets_batch)

                episode_reward += reward
                episode_length += 1
                self.total_steps += 1
                if done:
                    break
                state = next_state
            self.log.log_episode(episode, episodes_to_run, episode_length,
                                 episode_reward, self.total_steps)
        self.env.monitor.close()
class Agent: def __init__(self, config, env, state_dim, action_dim): # Get Config self.cf = config # Setting Environment self.env = env self.state_dim = state_dim self.action_dim = action_dim # Setting Replay Memory self.rm = ReplayMemory(self.cf.REPLAY_MEMORY_SIZE, self.cf.FRAME_SIZE, self.cf.AGENT_HISTORY_LENGHTH) # Build Model self.q = build_model(self.cf.FRAME_SIZE, self.action_dim, self.cf.AGENT_HISTORY_LENGHTH) self.target_q = build_model(self.cf.FRAME_SIZE, self.action_dim, self.cf.AGENT_HISTORY_LENGHTH) # Optimizer and Loss for Training self.optimizer = tf.keras.optimizers.Adam( learning_rate=self.cf.LEARNING_RATE, clipnorm=10.) self.loss = tf.keras.losses.Huber() self.q.summary() # Save Logs # wandb.init( # project="fully_conv_layer_test", # name='vanilla_DQN_'+ str(env)[20:-3], # config=self.cf.WANDB) def get_action(self, state): """ Epsilon Greedy """ q = self.q(state)[0] return (np.argmax(q), q) if self.cf.epsilon < np.random.rand() else ( np.random.randint(self.action_dim), q) def model_train(self): # Sample From Replay Memory states, actions, rewards, next_states, dones = self.rm.sample( self.cf.BATCH_SIZE) # Epsilon Decay (linear annealing) if self.cf.epsilon > self.cf.FINAL_EXPLORATION: self.cf.epsilon -= (1 - self.cf.FINAL_EXPLORATION ) / self.cf.FINAL_EXPLORATION_FRAME # Update Weights with tf.GradientTape() as g: # Maximum q value of next state from target q function max_next_q = np.max(self.target_q(normalize(next_states)), axis=1) # Calculate Targets targets = rewards + (1 - dones) * (self.cf.DISCOUNT_FACTOR * max_next_q) predicts = self.q(normalize(states)) predicts = tf.reduce_sum(predicts * tf.one_hot(actions, self.action_dim), axis=1) loss = self.loss(targets, predicts) g_theta = g.gradient(loss, self.q.trainable_weights) self.optimizer.apply_gradients(zip(g_theta, self.q.trainable_weights)) def run(self, max_frame, game_name, render=False): # For the Logs sum_mean_q, episodic_rewards, new_record = 0, 0, -999 # Initializing episode = 0 frames, action = 0, 0 initial_state = self.env.reset() state = np.stack( [preprocess(initial_state, frame_size=self.cf.FRAME_SIZE)] * 4, axis=3) state = np.reshape(state, state.shape[:-1]) # No Ops for _ in range(self.cf.NO_OPS): next_state, _, _, _ = self.env.step(0) next_state = np.append(state[..., 1:], preprocess(next_state, frame_size=self.cf.FRAME_SIZE), axis=3) state = next_state while frames < max_frame: # if render: # self.env.render() # Interact with Environment (action, q) = self.get_action(normalize(state)) next_state, reward, done, _ = self.env.step(action) reward = np.clip(reward, -1, 1) next_state = np.append(state[..., 1:], preprocess(next_state, frame_size=self.cf.FRAME_SIZE), axis=3) # Append To Replay Memory self.rm.append(state, action, reward, next_state, done) # Start Training After Collecting Enough Samples if self.rm.crt_idx < self.cf.REPLAY_START_SIZE and not self.rm.is_full( ): state = next_state continue # Training self.model_train() state = next_state episodic_rewards += reward sum_mean_q += np.mean(q) frames += 1 # Update Target Q if frames % self.cf.TARGET_NETWORK_UPDATE_FREQUENCY == 0: self.target_q.set_weights(self.q.get_weights()) if done: episodic_mean_q = sum_mean_q / frames * (self.cf.SKIP_FRAMES + 1) episode += 1 # Update Logs print( f'Epi : {episode}, Reward : {episodic_rewards}, Q : {episodic_mean_q}' ) # wandb.log({ # 'Reward':episodic_rewards, # 'Q value':episodic_mean_q, # 'Epsilon':self.cf.epsilon, # }) # Save Model if new_record < episodic_rewards: new_record = episodic_rewards try:
self.q.save_weights( f'../save_weights/{game_name}/{game_name}_{str(int(new_record))}.h5' ) except: os.makedirs(f'../save_weights/{game_name}/') self.q.save_weights( f'../save_weights/{game_name}/{game_name}_{str(int(new_record))}.h5' ) # wandb.save( # f'../save_weights/{game_name}/{game_name}_{str(int(new_record))}.h5', # policy='live') # Initializing sum_mean_q, episodic_rewards = 0, 0 initial_state = self.env.reset() state = np.stack( [preprocess(initial_state, frame_size=self.cf.FRAME_SIZE) ] * 4, axis=3) state = np.reshape(state, state.shape[:-1]) # No Ops for _ in range(self.cf.NO_OPS): next_state, _, _, _ = self.env.step(0) next_state = np.append(state[..., 1:], preprocess( next_state, frame_size=self.cf.FRAME_SIZE), axis=3) state = next_state
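The preprocess() helper used by Agent.run is not shown. A plausible stand-in (hypothetical, not the author's implementation) that returns a (1, H, W, 1) frame so it can be stacked and rolled along the last axis as in the loop above:

import cv2
import numpy as np

def preprocess(frame, frame_size=(84, 84)):
    # Hypothetical sketch: grayscale, resize, and add leading/trailing axes
    # so frames can be stacked with np.stack(..., axis=3) and appended with
    # np.append(state[..., 1:], ..., axis=3) as done in Agent.run.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, frame_size, interpolation=cv2.INTER_AREA)
    return resized[np.newaxis, :, :, np.newaxis].astype(np.uint8)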
def main(): sess = tf.Session() K.set_session(sess) env = gym.make("MountainCarContinuous-v0") #Parameters memory_size = 100000 batch_size = 32 tau = 0.001 lr_actor = 0.0001 lr_critic = 0.001 discount_factor = 0.99 episodes = 1001 time_steps = 501 collect_experience = 50000 save_frequency = 250 ep_reward = [] training = False #Noise object noise = OUNoise(env.action_space) #Initialize actor and critic objects actor = Actor(env, sess, lr_actor, tau) #Uncomment the following line to save the actor model architecture as a json file. It only needs to be saved #once # actor.save_model_architecture("Actor_model_architecture.json") critic = Critic(env, sess, lr_critic, tau, discount_factor) #Initialize replay memory of size defined by memory_size replay_memory = ReplayMemory(memory_size) #Toggle between true and false for debugging purposes. For training it is always true run = True if run: #Loop over the number of episodes. At each new episode reset the environment, reset the noise #state and set the total episode reward to 0 for episode in range(episodes): state = env.reset() noise.reset() episode_reward = 0 #Loop over the number of steps in an episode for time in range(time_steps): #Uncomment the following line if you want to visualize the mountain car during training. #Can also be trained without visualization for the case where we are using #position and velocities as state variables. # env.render() #Predict an action from the actor model using the current state action = actor.predict_action(state.reshape((1, 2)))[0] #Add Ornstein-Uhlenbeck noise to the predicted action to encourage exploration of the environment exploratory_action = noise.get_action(action, time) #Take the noisy action to enter the next state next_state, reward, done, _ = env.step(exploratory_action) #Predict the action to be taken given the next_state. This next state action is predicted #using the actor's target model next_action = actor.predict_next_action( next_state.reshape((1, 2)))[0] #Append this experience sample to the replay memory replay_memory.append(state, exploratory_action, reward, next_state, next_action, done) #Only start training when there are a minimum number of experience samples available in #memory if replay_memory.count() == collect_experience: training = True print('Start training') #When training: if training: # 1) first draw a random batch of samples from the replay memory batch = replay_memory.sample(batch_size) # 2) using this sample calculate dQ/dA from the critic model grads = critic.calc_grads(batch) # 3) calculate dA/dTheta from the actor using the same batch # 4) multiply dA/dTheta by negative dQ/dA to get dJ/dTheta # 5) Update actor weights such that dJ/dTheta is maximized # 6) The above operation is easily performed by minimizing the value obtained in (4) t_grads = actor.train(batch, grads) # update critic weights by minimizing the bellman loss.
Use actor target to compute # next action in the next state (already computed and stored in replay memory) # in order to compute the TD target critic.train(batch) #After each weight update of the actor and critic online models perform soft updates # of their targets so that they can smoothly and slowly track the online models' #weights actor.update_target() critic.update_target() #Add each step reward to the episode reward episode_reward += reward #Set the current state to the next state state = next_state #If the target is reached before the max allowed time steps, break the inner for loop if done: break #Store episode reward ep_reward.append([episode, episode_reward]) #Print info for each episode to track training progress print( "Completed in {} steps.... episode: {}/{}, episode reward: {} " .format(time, episode, episodes, episode_reward)) #Save the model's weights and episode rewards every save_frequency episodes if training and (episode % save_frequency) == 0: print('Data saved at episode:', episode) actor.save_weights( './Model/DDPG_actor_model_{}.h5'.format(episode)) pickle.dump( ep_reward, open('./Rewards/rewards_{}.dump'.format(episode), 'wb')) # Close the mountain car environment env.close()
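The OUNoise class used for exploration above is not included here. A common Ornstein-Uhlenbeck implementation with the same reset()/get_action(action, t) interface might look like the following (a sketch under those assumptions, not necessarily the author's exact class):

import numpy as np

class OUNoise:
    # Ornstein-Uhlenbeck process mixed into the actor's action and clipped
    # to the action-space bounds; sigma is annealed over the episode.
    def __init__(self, action_space, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.05, decay_period=100000):
        self.mu, self.theta = mu, theta
        self.max_sigma, self.min_sigma, self.decay_period = max_sigma, min_sigma, decay_period
        self.dim = action_space.shape[0]
        self.low, self.high = action_space.low, action_space.high
        self.reset()

    def reset(self):
        self.state = np.ones(self.dim) * self.mu

    def evolve_state(self):
        dx = self.theta * (self.mu - self.state) + self.max_sigma * np.random.randn(self.dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        noise = self.evolve_state() * (sigma / self.max_sigma)
        return np.clip(action + noise, self.low, self.high)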
def fit_nash(): suffix = 'Nash_{}_RC_{}_AttackMode_{}_RewardMode_{}'.format(args.NashMode, RC, args.AttackMode, args.RewardMode) # reward_file = open('reward' + suffix + '.txt', 'w') # attack_file = open('attacker_action' + suffix + '.txt', 'w') # weight_file = open('vehicle_weight' + suffix + '.txt', 'w') # distance_file = open('Distance' + suffix + '.txt', 'w') # reward_file.write(""" # Environment Initializing... # The initial head car velocity is {} # The initial safe distance is {} # The Nash Eq* Factor RC is {} # The Reward Calculation Mode is {} # The Attack Mode is {} # The Nash Mode is {} # """.format(env.v_head, env.d0, RC, env.reward_mode, env.attack_mode, args.Nash)) # reward_file.close() # attack_file.close() # weight_file.close() # distance_file.close() agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size, env.observation_space, env.vehicle_action_space, 'veh') agent_attacker = NAF(args.gamma, args.tau, args.hidden_size, env.observation_space, env.attacker_action_space, 'att') try: agent_vehicle.load_model('models/vehicle_' + suffix) print('Load vehicle RL model successfully') except: print('No existed vehicle RL model') try: agent_attacker.load_model('models/attacker_' + suffix) print('Load attacker RL model successfully') except: print('No existed attacker RL model') try: policy_vehicle = load_model('models/vehicle_' + suffix + '.h5') print('Load vehicle SL model successfully') except: policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space, 'vehicle') try: policy_attacker = load_model('models/attacker_' + suffix + '.h5') print('Load attacker SL model successfully') except: policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space, 'attacker') print('*'*20, '\n\n\n') memory_vehicle = ReplayMemory(100000) memory_attacker = ReplayMemory(100000) memory_SL_vehicle = ReplayMemory(400000) memory_SL_attacker = ReplayMemory(400000) ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None res_data = pd.DataFrame(columns=['Weight', 'Attack', 'Eva_distance']) reward_data = pd.DataFrame(columns=['Reward']) rewards = [] total_numsteps = 0 for i_episode in range(args.num_episodes): if i_episode % 100 == 0 and i_episode != 0: print('Writing to CSV files...') reward_data.to_csv(suffix + '.csv', index=False) res_data.to_csv(suffix + '.csv', index=False) if args.NashMode == 0: ETA = 0 elif args.NashMode == 1: ETA = 0.5 elif args.NashMode == 2: ETA = 0.1 - i_episode/args.num_episodes * 0.1 print('No.{} episode starts... 
ETA is {}'.format(i_episode, ETA)) # reward_file = open('reward' + suffix + '.txt', 'a') # attack_file = open('attacker_action' + suffix + '.txt', 'a') # weight_file = open('vehicle_weight' + suffix + '.txt', 'a') # distance_file = open('Distance' + suffix + '.txt', 'a') local_steps = 0 state = env.reset() state_record = [np.array([state])] episode_steps = 0 while len(state_record) < 20: a, b = env.random_action() s, _, _ = env.step(np.array([a]), np.zeros(4)) local_steps += 1 state_record.append(s) if args.ou_noise: ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale ounoise_vehicle.reset() ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale ounoise_attacker.reset() episode_reward = 0 local_steps = 0 while True: sigma = random.random() if sigma > ETA: # print(state_record[-20:]) # print('rl', torch.Tensor(state_record[-20:]).shape) action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :] # print('rl', action_vehicle.shape) action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :] # print('rl', action_vehicle.shape) else: action_vehicle = torch.Tensor( [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict( state_record[-1].reshape(-1, 4)).sum()])[0] action_attacker = torch.Tensor( [policy_attacker.predict(state_record[-1].reshape(-1, 4)) / policy_attacker.predict( state_record[-1].reshape(-1, 4)).sum()])[0] # constrain the vehicle weights so they sum to 1 action_vehicle = action_vehicle.numpy()[0]/(action_vehicle.numpy()[0].sum()) action_attacker = action_attacker.numpy()[0] next_state, reward, done = env.step(action_vehicle, action_attacker) res_data = res_data.append([{'Attack':env.action_attacker, 'Weight':action_vehicle, 'Eva_distance':env.d}]) # copy the processed attack value back to the original variable action_attacker = env.action_attacker total_numsteps += 1 episode_reward += reward state_record.append(next_state) local_steps += 1 episode_steps += 1 if sigma > ETA: memory_SL_vehicle.append(state_record[-1], action_vehicle) memory_SL_attacker.append(state_record[-1], action_attacker) action_vehicle = torch.Tensor(action_vehicle.reshape(1,4)) action_attacker = torch.Tensor(action_attacker.reshape(1,4)) mask = torch.Tensor([not done]) prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1) next_state = torch.Tensor([next_state]) reward_vehicle = torch.Tensor([reward]) reward_attacker = torch.Tensor([RC - reward]) memory_vehicle.push(prev_state, torch.Tensor(action_vehicle), mask, next_state, reward_vehicle) memory_attacker.push(prev_state, torch.Tensor(action_attacker), mask, next_state, reward_attacker) if done: rewards.append(episode_reward) print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward)) reward_data = reward_data.append([{'Reward': episode_reward}]) # reward_file.write('Episode {} ends, instant reward is {:.2f}\n'.format(i_episode, episode_reward)) break if min(len(memory_vehicle), len(memory_SL_vehicle)) > args.batch_size: # start training for _ in range(args.updates_per_step): transitions_vehicle = memory_vehicle.sample(args.batch_size) batch_vehicle = Transition(*zip(*transitions_vehicle)) transitions_attacker = memory_attacker.sample(args.batch_size) batch_attacker = Transition(*zip(*transitions_attacker)) trans_veh =
memory_SL_vehicle.sample(args.batch_size) trans_att = memory_SL_attacker.sample(args.batch_size) states_veh = [] actions_veh = [] states_att = [] actions_att = [] for sample in trans_veh: state_veh, act_veh = sample states_veh.append(state_veh) actions_veh.append(act_veh) for sample in trans_att: state_att, act_att = sample states_att.append(state_att) actions_att.append(act_att) states_veh = np.reshape(states_veh, (-1, env.observation_space)) states_att = np.reshape(states_att, (-1, env.observation_space)) actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space)) actions_att = np.reshape(actions_att, (-1, env.attacker_action_space)) policy_vehicle.fit(states_veh, actions_veh, verbose=False) policy_attacker.fit(states_att, actions_att, verbose=False) agent_vehicle.update_parameters(batch_vehicle) agent_attacker.update_parameters(batch_attacker) # writer.add_scalar('loss/value', value_loss, updates) # writer.add_scalar('loss/policy', policy_loss, updates) if i_episode % 10 == 0 and i_episode != 0: eva_res_data = pd.DataFrame(columns=['Eva_reward', 'Eva_distance']) # distance_file.write('{} episode starts, recording distance...\n'.format(i_episode)) state = env.reset() state_record = [np.array([state])] evaluate_reward = 0 while len(state_record) < 20: a, b = env.random_action() s, _, _ = env.step(np.array([a]), np.zeros(4)) local_steps += 1 state_record.append(s) while True: if random.random() < ETA: action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :] # print('rl', action_vehicle.shape) action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :] else: action_vehicle = torch.Tensor( [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict( state_record[-1].reshape(-1, 4)).sum()])[0] action_attacker = torch.Tensor( [policy_attacker.predict(state_record[-1].reshape(-1, 4))])[0] action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum() action_attacker = action_attacker.numpy()[0] next_state, reward, done = env.step(action_vehicle, action_attacker, attack_mode=2) eva_res_data = eva_res_data.append([{'Eva_reward':evaluate_reward, 'Eva_distance':env.d}]) evaluate_reward += reward if done: print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps, evaluate_reward, np.mean(rewards[-10:]))) # reward_file.write("Episode: {}, total numsteps: {}, reward: {}, average reward: {}\n".format(i_episode, # total_numsteps, # evaluate_reward, # np.mean(rewards[-10:]))) break # # writer.add_scalar('reward/test', episode_reward, i_episode) # reward_file.close() # attack_file.close() # weight_file.close() # distance_file.close() env.close() reward_data.to_csv(suffix+'_reward.csv', index=False) res_data.to_csv(suffix+'.csv', index=False) eva_res_data.to_csv(suffix+'_eva.csv', index=False) # save model agent_vehicle.save_model('vehicle_'+suffix) agent_attacker.save_model('attacker_'+suffix) policy_attacker.save('models/attacker_'+suffix+'.h5') policy_vehicle.save('models/vehicle_'+suffix+'.h5')
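fit_nash relies on the NAF agents' own target-network tracking, parameterized by args.tau. For reference, Polyak averaging in PyTorch is typically implemented along these lines (a generic sketch, not the NAF class's code):

import torch

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target,
    # applied after each gradient step so the target tracks slowly.
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)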
class L0_Learner: def __init__(self, sess, abstraction_scope, visual_scope, num_actions, num_abstract_actions, num_abstract_states, gamma=0.99, learning_rate=0.00025, replay_start_size=5000, epsilon_start=1.0, epsilon_end=0.1, epsilon_steps=1000000, update_freq=4, target_copy_freq=10000, replay_memory_size=1000000, frame_history=1, batch_size=32, error_clip=1, abstraction_function=None, max_episode_steps=-1, base_network_file=None): self.sess = sess self.num_abstract_actions = num_abstract_actions self.num_abstract_states = num_abstract_states self.num_actions = num_actions self.batch_size = batch_size self.gamma = gamma self.frame_history = frame_history self.replay_buffer = ReplayMemory((84, 84), 'uint8', replay_memory_size, frame_history) self.abstraction_scope = abstraction_scope self.abstraction_function = abstraction_function self.inp_frames = tf.placeholder(tf.uint8, [None, 84, 84, self.frame_history]) self.inp_sp_frames = tf.placeholder(tf.uint8, [None, 84, 84, self.frame_history]) self.inp_terminated = tf.placeholder(tf.bool, [None]) self.inp_reward = tf.placeholder(tf.float32, [None]) self.inp_mask = tf.placeholder(tf.uint8, [None, frame_history]) self.inp_sp_mask = tf.placeholder(tf.uint8, [None, frame_history]) self.inp_actions = tf.placeholder(tf.float32, [None, num_actions]) # onehot vector #self.inp_sigma = tf.placeholder(tf.float32, [None, self.num_abstract_states]) self.reward_matrix = -np.ones( (num_abstract_states, num_abstract_states, num_abstract_actions), dtype=np.float32) # make self transitions 0 for i in range(num_abstract_states): self.reward_matrix[i, i, :] = 0 # make goal transitions have reward 1 for a in range(num_abstract_actions): i, j = flat_actions_to_state_pairs(a, num_abstract_states) self.reward_matrix[i, j, a] = 1 self.actions_for_sigma = np.zeros( (num_abstract_states, num_abstract_actions), dtype=np.float32) for a in range(num_abstract_actions): i, j = flat_actions_to_state_pairs(a, num_abstract_states) self.actions_for_sigma[i, a] = 1 # mask stuff here mask = tf.reshape(self.inp_mask, [-1, 1, 1, 1]) masked_input = self.inp_frames * mask l0_vis_scope = 'l0_vis' with tf.variable_scope(l0_vis_scope): self.visual_output_base = hook_visual(masked_input, self.frame_history) self.visual_output = tf.stop_gradient(self.visual_output_base) with tf.variable_scope('online_base'): self.q_online_base = hook_base(self.visual_output_base, self.num_actions) with tf.variable_scope('online_1'): self.q_online_1 = hook_l0(self.visual_output, 1, self.num_actions) with tf.variable_scope('online_2'): self.q_online_2 = hook_l0(self.visual_output, 1, self.num_actions) self.q_online = tf.concat(1, [self.q_online_1, self.q_online_2]) mask_sp = tf.reshape(self.inp_sp_mask, [-1, 1, 1, 1]) masked_input_sp = self.inp_sp_frames * mask_sp l0_target_vis_scope = 'l0_target_vis' with tf.variable_scope(l0_target_vis_scope): self.visual_output_sp = hook_visual(masked_input_sp, self.frame_history) with tf.variable_scope('target_base'): self.q_target_base = hook_base(self.visual_output_sp, self.num_actions) with tf.variable_scope('target_1'): self.q_target_1 = hook_l0(self.visual_output_sp, 1, self.num_actions) with tf.variable_scope('target_2'): self.q_target_2 = hook_l0(self.visual_output_sp, 1, self.num_actions) self.q_target = tf.concat(1, [self.q_target_1, self.q_target_2]) # with tf.variable_scope(visual_scope, reuse=True): # # mask stuff here # mask = tf.reshape(self.inp_mask, [-1, 1, 1, 1]) # masked_input = self.inp_frames * mask # self.visual_output = hook_visual(masked_input, 
self.frame_history) # # mask_sp = tf.reshape(self.inp_sp_mask, [-1, 1, 1, 1]) # masked_input_sp = self.inp_sp_frames * mask_sp # self.visual_output_sp = hook_visual(masked_input_sp, self.frame_history) # # with tf.variable_scope('online'): # self.q_online = hook_l0(self.visual_output, self.num_abstract_actions, self.num_actions) # with tf.variable_scope('target'): # self.q_target = hook_l0(self.visual_output_sp, self.num_abstract_actions, self.num_actions) # TODO set up double dqn for later experiments. # Q matrix is (num_abstract_actions, num_actions), results in vector with max-q for each abstract action. self.maxQ = tf.reduce_max(self.q_target, reduction_indices=2) with tf.variable_scope(visual_scope, reuse=True): self.l1_visual_output = hook_visual(masked_input, self.frame_history) self.l1_visual_output_sp = hook_visual(masked_input_sp, self.frame_history) with tf.variable_scope(self.abstraction_scope, reuse=True): self.sigma = tf.stop_gradient( hook_abstraction(self.l1_visual_output, num_abstract_states, batch_size)[0]) self.sigma_p = tf.stop_gradient( hook_abstraction(self.l1_visual_output_sp, num_abstract_states, batch_size)[0]) self.sigma_query, self.sigma_query_probs = hook_abstraction( self.l1_visual_output, self.num_abstract_states, 1) self.r = tf.reduce_sum( tf.reshape(self.sigma_p, [-1, 1, num_abstract_states, 1]) * \ tf.reshape(self.sigma, [-1, num_abstract_states, 1, 1]) * \ tf.reshape(self.reward_matrix, [1, num_abstract_states, num_abstract_states, num_abstract_actions]), reduction_indices=[1, 2]) # Give a reward of -1 if reached a terminal state self.r = (self.r * tf.reshape(tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32), [-1, 1])) +\ tf.reshape(tf.cast(self.inp_terminated, dtype=tf.float32) * -1, [-1, 1]) self.use_backup = tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32) * tf.reduce_sum( self.sigma_p * self.sigma, reduction_indices=1) self.y = tf.stop_gradient(self.r + tf.reshape(self.use_backup, [-1, 1]) * gamma * self.maxQ) self.delta = tf.reduce_sum( tf.reshape(self.inp_actions, [-1, 1, num_actions]) * self.q_online, reduction_indices=2) - self.y valid_actions_mask = valid_actions_for_sigma(self.actions_for_sigma, self.sigma, self.num_abstract_actions) self.masked_delta = self.delta * valid_actions_mask self.error = tf.select( tf.abs(self.masked_delta) < error_clip, 0.5 * tf.square(self.masked_delta), error_clip * tf.abs(self.masked_delta)) # base dqn self.maxQ_base = tf.reduce_max(self.q_target_base, reduction_indices=1) self.r_base = tf.sign(self.inp_reward) use_backup_base = tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32) self.y_base = tf.stop_gradient(self.r_base + use_backup_base * gamma * self.maxQ_base) self.delta_base = tf.reduce_sum(self.inp_actions * self.q_online_base, reduction_indices=1) - self.y_base self.error_base = tf.select( tf.abs(self.delta_base) < error_clip, 0.5 * tf.square(self.delta_base), error_clip * tf.abs(self.delta_base)) self.loss = tf.reduce_sum(self.error) + tf.reduce_sum(self.error_base) self.g = tf.gradients(self.loss, self.q_online) optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.95, centered=True, epsilon=0.01) self.train_op = optimizer.minimize(self.loss, var_list=th.get_vars( 'online_1', 'online_2', 'online_base', l0_vis_scope)) self.copy_op = [ th.make_copy_op('online_1', 'target_1'), th.make_copy_op('online_2', 'target_2'), th.make_copy_op(l0_vis_scope, l0_target_vis_scope), th.make_copy_op('online_base', 'target_base') ] self.replay_buffer = 
L1ReplayMemory((84, 84), 'uint8', replay_memory_size, frame_history) self.frame_history = frame_history self.replay_start_size = replay_start_size self.epsilon = epsilon_start self.epsilon_min = epsilon_end self.epsilon_steps = epsilon_steps self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps self.update_freq = update_freq self.target_copy_freq = target_copy_freq self.action_ticker = 1 self.max_episode_steps = max_episode_steps self.num_actions = num_actions self.batch_size = batch_size self.base_network_saver = tf.train.Saver( var_list=th.get_vars('online_base', l0_vis_scope)) # @profile def run_learning_episode(self, initial_sigma, l1_action, environment): assert flat_actions_to_state_pairs( np.argmax(l1_action), self.num_abstract_states)[0] == np.argmax(initial_sigma) R = 0 episode_steps = 0 sigma_p = initial_sigma while not environment.is_current_state_terminal(): if 0 <= self.max_episode_steps <= episode_steps: #sigma_p = 1 - initial_sigma #R = -1 break state = environment.get_current_state() if np.random.uniform(0, 1) < self.epsilon: action = np.random.choice( environment.get_actions_for_state(state)) else: action = self.get_action(state, l1_action) if self.replay_buffer.size() > self.replay_start_size: self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_delta) s, a, r, sp, t = environment.perform_action(action) sigma_p = self.get_abstract_state(sp) self.replay_buffer.append(s[-1], np.argmax(initial_sigma), a, r, sp[-1], np.argmax(sigma_p), t) R += r # TODO: discount? # R = R * self.gamma + r if (self.replay_buffer.size() > self.replay_start_size) and ( self.action_ticker % self.update_freq == 0): loss = self.update_q_values() if (self.action_ticker - self.replay_start_size) % self.target_copy_freq == 0: self.sess.run(self.copy_op) self.action_ticker += 1 episode_steps += 1 if np.sum(np.abs(initial_sigma - sigma_p)) > 0.1: break if self.action_ticker % 1000000 == 0: self.base_network_saver.save(self.sess, 'base_net.ckpt') return initial_sigma, l1_action, R, sigma_p, environment.is_current_state_terminal( ), episode_steps # @profile def get_abstract_state(self, l0_state): if self.abstraction_function is None: [sigma] = self.sess.run( [self.sigma_query], feed_dict={ self.inp_frames: np.reshape(l0_state, [1, 84, 84, 1]), self.inp_mask: np.ones((1, self.frame_history), dtype=np.float32) }) return sigma[0] else: return self.abstraction_function() def update_q_values(self): S1, Sigma1, A, R, S2, Sigma2, T, M1, M2 = self.replay_buffer.sample( self.batch_size) Aonehot = np.zeros((self.batch_size, self.num_actions), dtype=np.float32) Aonehot[list(range(len(A))), A] = 1 if self.abstraction_function is None: [ _, loss, q_online, maxQ, q_target, r, y, error, delta, g, q_online_base ] = self.sess.run( [ self.train_op, self.loss, self.q_online, self.maxQ, self.q_target, self.r, self.y, self.error, self.delta, self.g, self.q_online_base ], feed_dict={ self.inp_frames: S1, self.inp_actions: Aonehot, self.inp_sp_frames: S2, self.inp_reward: R, self.inp_terminated: T, self.inp_mask: M1, self.inp_sp_mask: M2 }) else: onehot_sigma = np.zeros((self.batch_size, 2)) onehot_sigma[list(range(len(Sigma1))), Sigma1] = 1 onehot_sigma_p = np.zeros((self.batch_size, 2)) onehot_sigma_p[list(range(len(Sigma2))), Sigma2] = 1 [ _, loss, q_online, maxQ, q_target, r, y, error, delta, g, use_backup ] = self.sess.run( [ self.train_op, self.loss, self.q_online, self.maxQ, self.q_target, self.r, self.y, self.error, self.delta, self.g, self.use_backup ], feed_dict={ self.inp_frames: S1, 
self.inp_actions: Aonehot, self.inp_sp_frames: S2, self.inp_reward: R, self.inp_terminated: T, self.inp_mask: M1, self.inp_sp_mask: M2, self.sigma: onehot_sigma, self.sigma_p: onehot_sigma_p }) return loss def get_action(self, state, l1_action): # [q_values] = self.sess.run([self.q_online], # feed_dict={self.inp_frames: np.reshape(state, [1, 84, 84, 1]), # self.inp_mask: np.ones((1, self.frame_history), dtype=np.float32)}) # q_values_l0_for_l1 = np.sum(q_values[0] * np.reshape(l1_action, [self.num_abstract_actions, 1]), axis=0) # return np.argmax(q_values_l0_for_l1) [q_values] = self.sess.run( [self.q_online_base], feed_dict={ self.inp_frames: np.reshape(state, [1, 84, 84, 1]), self.inp_mask: np.ones((1, self.frame_history), dtype=np.float32) }) return np.argmax(q_values)
class AtariGame(Game): def __init__(self, rom_path=_default_rom_path, frame_skip=4, history_length=4, resize_mode='clip', resized_rows=84, resized_cols=84, crop_offset=8, display_screen=False, max_null_op=30, replay_memory_size=1000000, replay_start_size=100, death_end_episode=True): super(AtariGame, self).__init__() self.rng = get_numpy_rng() self.ale = ale_load_from_rom(rom_path=rom_path, display_screen=display_screen) self.start_lives = self.ale.lives() self.action_set = self.ale.getMinimalActionSet() self.resize_mode = resize_mode self.resized_rows = resized_rows self.resized_cols = resized_cols self.crop_offset = crop_offset self.frame_skip = frame_skip self.history_length = history_length self.max_null_op = max_null_op self.death_end_episode = death_end_episode self.screen_buffer_length = 2 self.screen_buffer = numpy.empty( (self.screen_buffer_length, self.ale.getScreenDims()[1], self.ale.getScreenDims()[0]), dtype='uint8') self.replay_memory = ReplayMemory(state_dim=(resized_rows, resized_cols), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size) self.start() def start(self): self.ale.reset_game() null_op_num = self.rng.randint( self.screen_buffer_length, max(self.max_null_op + 1, self.screen_buffer_length + 1)) for i in range(null_op_num): self.ale.act(0) self.ale.getScreenGrayscale( self.screen_buffer[i % self.screen_buffer_length, :, :]) self.total_reward = 0 self.episode_reward = 0 self.episode_step = 0 self.max_episode_step = DEFAULT_MAX_EPISODE_STEP self.start_lives = self.ale.lives() def force_restart(self): self.start() self.replay_memory.clear() def begin_episode(self, max_episode_step=DEFAULT_MAX_EPISODE_STEP): """ Begin an episode of a game instance. We can play the game for a maximum of `max_episode_step` and after that, we are forced to restart """ if self.episode_step > self.max_episode_step or self.ale.game_over(): self.start() else: for i in range(self.screen_buffer_length): self.ale.act(0) self.ale.getScreenGrayscale( self.screen_buffer[i % self.screen_buffer_length, :, :]) self.max_episode_step = max_episode_step self.start_lives = self.ale.lives() self.episode_reward = 0 self.episode_step = 0 @property def episode_terminate(self): termination_flag = self.ale.game_over( ) or self.episode_step >= self.max_episode_step if self.death_end_episode: return (self.ale.lives() < self.start_lives) or termination_flag else: return termination_flag @property def state_enabled(self): return self.replay_memory.size >= self.replay_memory.history_length def get_observation(self): image = self.screen_buffer.max(axis=0) if 'crop' == self.resize_mode: original_rows, original_cols = image.shape new_resized_rows = int( round( float(original_rows) * self.resized_cols / original_cols)) resized = cv2.resize(image, (self.resized_cols, new_resized_rows), interpolation=cv2.INTER_LINEAR) crop_y_cutoff = new_resized_rows - self.crop_offset - self.resized_rows img = resized[crop_y_cutoff:crop_y_cutoff + self.resized_rows, :] return img else: # plt.imshow(image, cmap='gray') # plt.show() return cv2.resize(image, (self.resized_cols, self.resized_rows), interpolation=cv2.INTER_LINEAR) def play(self, a): assert not self.episode_terminate,\ "Warning, the episode seems to have terminated. " \ "We need to call either game.begin_episode(max_episode_step) to continue a new " \ "episode or game.start() to force restart." 
self.episode_step += 1 reward = 0.0 action = self.action_set[int(a)] for i in range(self.frame_skip): reward += self.ale.act(action) self.ale.getScreenGrayscale( self.screen_buffer[i % self.screen_buffer_length, :, :]) self.total_reward += reward self.episode_reward += reward ob = self.get_observation() # plt.imshow(ob, cmap="gray") # plt.show() terminate_flag = self.episode_terminate self.replay_memory.append(ob, a, numpy.clip(reward, -1, 1), terminate_flag) return reward, terminate_flag
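The ReplayMemory that AtariGame.play appends to stores single preprocessed frames with their (action, reward, terminal) flags and reassembles history_length-frame states at sampling time. A minimal ring-buffer sketch of the append side, assuming the same constructor arguments (the real class also provides sample() and latest_slice()):

import numpy as np

class MinimalReplayMemory(object):
    def __init__(self, state_dim, history_length, memory_size, replay_start_size):
        self.states = np.zeros((memory_size,) + tuple(state_dim), dtype='uint8')
        self.actions = np.zeros(memory_size, dtype='int32')
        self.rewards = np.zeros(memory_size, dtype='float32')
        self.terminals = np.zeros(memory_size, dtype='bool')
        self.history_length = history_length
        self.memory_size = memory_size
        self.replay_start_size = replay_start_size
        self.top = 0
        self.size = 0

    def append(self, obs, action, reward, terminal):
        # Overwrite the oldest slot once the buffer is full.
        self.states[self.top] = obs
        self.actions[self.top] = action
        self.rewards[self.top] = reward
        self.terminals[self.top] = terminal
        self.top = (self.top + 1) % self.memory_size
        self.size = min(self.size + 1, self.memory_size)

    def clear(self):
        self.top = 0
        self.size = 0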
def main(): parser = argparse.ArgumentParser() parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--t-max', type=int, default=1) parser.add_argument('--learning-rate', type=float, default=0.0002) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--steps-per-epoch', type=int, default=100000) parser.add_argument('--testing', type=int, default=0) parser.add_argument('--continue-training', type=int, default=0) parser.add_argument('--epoch-num', type=int, default=40) parser.add_argument('--start-epoch', type=int, default=20) parser.add_argument('--testing-epoch', type=int, default=3) parser.add_argument('--save-log', type=str, default='basic/log') parser.add_argument('--signal-num', type=int, default=4) parser.add_argument('--toxin', type=int, default=0) parser.add_argument('--a1-AC-folder', type=str, default='basic/a1_Qnet') parser.add_argument('--eps-start', type=float, default=1.0) parser.add_argument('--replay-start-size', type=int, default=50000) parser.add_argument('--decay-rate', type=int, default=500000) parser.add_argument('--replay-memory-size', type=int, default=1000000) parser.add_argument('--eps-min', type=float, default=0.05) rewards = { "positive": 1.0, "negative": -1.0, "tick": -0.002, "loss": -2.0, "win": 2.0 } args = parser.parse_args() config = Config(args) q_ctx = config.ctx steps_per_epoch = args.steps_per_epoch np.random.seed(args.seed) start_epoch = args.start_epoch testing_epoch = args.testing_epoch save_log = args.save_log epoch_num = args.epoch_num epoch_range = range(epoch_num) toxin = args.toxin a1_Qnet_folder = args.a1_AC_folder freeze_interval = 10000 update_interval = 5 replay_memory_size = args.replay_memory_size discount = 0.99 replay_start_size = args.replay_start_size history_length = 1 eps_start = args.eps_start eps_min = args.eps_min eps_decay = (eps_start - eps_min) / args.decay_rate eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 testing = args.testing testing = True if testing == 1 else False continue_training = args.continue_training continue_training = True if continue_training == 1 else False game = HunterWorld(width=256, height=256, num_preys=10, draw=False, num_hunters=2, num_toxins=toxin) env = PLE(game, fps=30, force_fps=True, display_screen=False, reward_values=rewards, resized_rows=80, resized_cols=80, num_steps=2) replay_memory = ReplayMemory(state_dim=(148, ), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size, state_dtype='float32') action_set = env.get_action_set() action_map = [] for action1 in action_set[0].values(): for action2 in action_set[1].values(): action_map.append([action1, action2]) action_map = np.array(action_map) action_num = action_map.shape[0] target1 = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=False, batch_size=1, dir=dir, folder=a1_Qnet_folder) target32 = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=False, batch_size=32, dir=dir, folder=a1_Qnet_folder) Qnet = Qnetwork(actions_num=action_num, q_ctx=q_ctx, isTrain=True, batch_size=32, dir=dir, folder=a1_Qnet_folder) if testing: env.force_fps = False env.game.draw = True env.display_screen = True Qnet.load_params(testing_epoch) elif continue_training: epoch_range = range(start_epoch, epoch_num + start_epoch) Qnet.load_params(start_epoch - 1) logging_config(logging, dir, save_log, file_name) else: logging_config(logging, dir, save_log, file_name) copyTargetQNetwork(Qnet.model, target1.model) copyTargetQNetwork(Qnet.model, 
target32.model) logging.info('args=%s' % args) logging.info('config=%s' % config.__dict__) print_params(logging, Qnet.model) training_steps = 0 total_steps = 0 for epoch in epoch_range: steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() env.reset_game() while steps_left > 0: episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 episode_reward = 0 episode_step = 0 collisions = 0.0 time_episode_start = time.time() env.reset_game() while not env.game_over(): if replay_memory.size >= history_length and replay_memory.size > replay_start_size: do_exploration = (np.random.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action = np.random.randint(action_num) else: current_state = replay_memory.latest_slice() state = nd.array( current_state.reshape((1, ) + current_state.shape), ctx=q_ctx) target1.model.forward(mx.io.DataBatch([state], [])) q_value = target1.model.get_outputs()[0].asnumpy()[0] action = numpy.argmax(q_value) episode_q_value += q_value[action] episode_action_step += 1 else: action = np.random.randint(action_num) next_ob, reward, terminal_flag = env.act(action_map[action]) reward = np.sum(reward) replay_memory.append( np.array(next_ob).flatten(), action, reward, terminal_flag) total_steps += 1 episode_reward += reward if reward < 0: collisions += 1 episode_step += 1 if total_steps % update_interval == 0 and replay_memory.size > replay_start_size: training_steps += 1 state_batch, actions, rewards, nextstate_batch, terminate_flags = replay_memory.sample( batch_size=minibatch_size) state_batch = nd.array(state_batch, ctx=q_ctx) actions_batch = nd.array(actions, ctx=q_ctx) reward_batch = nd.array(rewards, ctx=q_ctx) terminate_flags = nd.array(terminate_flags, ctx=q_ctx) target32.model.forward( mx.io.DataBatch([nd.array(nextstate_batch, ctx=q_ctx)], [])) Qvalue = target32.model.get_outputs()[0] y_batch = reward_batch + nd.choose_element_0index( Qvalue, nd.argmax_channel(Qvalue)) * ( 1.0 - terminate_flags) * discount Qnet.model.forward(mx.io.DataBatch( [state_batch, actions_batch, y_batch], []), is_train=True) Qnet.model.backward() Qnet.model.update() if training_steps % 10 == 0: loss1 = 0.5 * nd.square( nd.choose_element_0index( Qnet.model.get_outputs()[0], actions_batch) - y_batch) episode_loss += nd.sum(loss1).asnumpy() episode_update_step += 1 if training_steps % freeze_interval == 0: copyTargetQNetwork(Qnet.model, target1.model) copyTargetQNetwork(Qnet.model, target32.model) steps_left -= episode_step time_episode_end = time.time() epoch_reward += episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, episode_step, steps_per_epoch, episode_reward, episode_step / (time_episode_end - time_episode_start), eps_curr) info_str += ", Collision:%f/%d " % (collisions / episode_step, collisions) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % ( episode_loss / episode_update_step, episode_update_step) if episode_action_step > 0: info_str += ", Avg Q Value:%f/%d " % ( episode_q_value / episode_action_step, episode_action_step) if episode % 1 == 0: logging.info(info_str) print info_str end = time.time() fps = steps_per_epoch / (end - start) Qnet.save_params(epoch) print "Epoch:%d, FPS:%f, Avg Reward: %f/%d" % ( epoch, fps, epoch_reward / float(episode), episode)
class AtariGame(Game): def __init__(self, rom_path=_default_rom_path, frame_skip=4, history_length=4, resize_mode='scale', resized_rows=84, resized_cols=84, crop_offset=8, display_screen=False, max_null_op=30, replay_memory_size=1000000, replay_start_size=100, death_end_episode=True): super(AtariGame, self).__init__() self.rng = get_numpy_rng() self.ale = ale_load_from_rom(rom_path=rom_path, display_screen=display_screen) self.start_lives = self.ale.lives() self.action_set = self.ale.getMinimalActionSet() self.resize_mode = resize_mode self.resized_rows = resized_rows self.resized_cols = resized_cols self.crop_offset = crop_offset self.frame_skip = frame_skip self.history_length = history_length self.max_null_op = max_null_op self.death_end_episode = death_end_episode self.screen_buffer_length = 2 self.screen_buffer = numpy.empty((self.screen_buffer_length, self.ale.getScreenDims()[1], self.ale.getScreenDims()[0]), dtype='uint8') self.replay_memory = ReplayMemory(state_dim=(resized_rows, resized_cols), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size) self.start() def start(self): self.ale.reset_game() null_op_num = self.rng.randint(self.screen_buffer_length, max(self.max_null_op + 1, self.screen_buffer_length + 1)) for i in range(null_op_num): self.ale.act(0) self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :]) self.total_reward = 0 self.episode_reward = 0 self.episode_step = 0 self.max_episode_step = DEFAULT_MAX_EPISODE_STEP self.start_lives = self.ale.lives() def force_restart(self): self.start() self.replay_memory.clear() def begin_episode(self, max_episode_step=DEFAULT_MAX_EPISODE_STEP): """ Begin an episode of a game instance. We can play the game for a maximum of `max_episode_step` and after that, we are forced to restart """ if self.episode_step > self.max_episode_step or self.ale.game_over(): self.start() else: for i in range(self.screen_buffer_length): self.ale.act(0) self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :]) self.max_episode_step = max_episode_step self.start_lives = self.ale.lives() self.episode_reward = 0 self.episode_step = 0 @property def episode_terminate(self): termination_flag = self.ale.game_over() or self.episode_step >= self.max_episode_step if self.death_end_episode: return (self.ale.lives() < self.start_lives) or termination_flag else: return termination_flag @property def state_enabled(self): return self.replay_memory.size >= self.replay_memory.history_length def get_observation(self): image = self.screen_buffer.max(axis=0) if 'crop' == self.resize_mode: original_rows, original_cols = image.shape new_resized_rows = int(round( float(original_rows) * self.resized_cols / original_cols)) resized = cv2.resize(image, (self.resized_cols, new_resized_rows), interpolation=cv2.INTER_LINEAR) crop_y_cutoff = new_resized_rows - self.crop_offset - self.resized_rows img = resized[crop_y_cutoff: crop_y_cutoff + self.resized_rows, :] return img else: return cv2.resize(image, (self.resized_cols, self.resized_rows), interpolation=cv2.INTER_LINEAR) def play(self, a): assert not self.episode_terminate,\ "Warning, the episode seems to have terminated. " \ "We need to call either game.begin_episode(max_episode_step) to continue a new " \ "episode or game.start() to force restart." 
self.episode_step += 1 reward = 0.0 action = self.action_set[a] for i in range(self.frame_skip): reward += self.ale.act(action) self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :]) self.total_reward += reward self.episode_reward += reward ob = self.get_observation() terminate_flag = self.episode_terminate self.replay_memory.append(ob, a, numpy.clip(reward, -1, 1), terminate_flag) return reward, terminate_flag
def train(self, args): # Decaying learning rate and epsilon global_step = tf.Variable(0, trainable=False, name='global_step') learning_rate = tf.train.exponential_decay(args.base_lr, global_step, args.lr_decay_steps, args.lr_decay_rate, staircase=True, name='lr') learning_rate = tf.maximum(learning_rate, args.lr_clip) train_epsilon = tf.train.polynomial_decay(args.init_epsilon, global_step, args.epsilon_decay_steps, args.final_epsilon) # Set up trainer trainer = tf.train.AdamOptimizer(learning_rate) grad = trainer.compute_gradients(self.loss, var_list=tf.trainable_variables(scope='q_pred')) if args.grad_clip: # Clip the gradients grad = [(tf.clip_by_value(grad, -args.grad_clip, args.grad_clip), var) for grad, var in grad] train_op = trainer.apply_gradients(grad, global_step) # Summary for tensorboard loss_summary = tf.summary.scalar('loss', self.loss) lr_summary = tf.summary.scalar('learning_rate', learning_rate) epsilon_summary = tf.summary.scalar('epsilon', train_epsilon) train_summary = tf.summary.merge([loss_summary, lr_summary, epsilon_summary]) episode_length = tf.placeholder(tf.int32, shape=(), name='episode_length') length_summary = tf.summary.scalar('episode_length', episode_length) avg_reward = tf.placeholder(tf.float32, shape=(), name='avg_reward') reward_summary = tf.summary.scalar('average reward', avg_reward) # Set up saving and logging saver = tf.train.Saver(max_to_keep=10) save_path = os.path.join(args.log_dir, 'checkpoints', 'model') steps_per_save = args.max_iter // 9 if args.restore: self.restore(args.checkpoint) else: self.sess.run(tf.global_variables_initializer()) if args.fix_target: self.sess.run(self.update_q_target) if os.path.exists(args.log_dir): delete_key = input('%s exists. Delete? [y (or enter)/N]' % args.log_dir) if delete_key == 'y' or delete_key == "": os.system('rm -rf %s/*' % args.log_dir) os.makedirs(os.path.join(args.log_dir, 'checkpoints'), exist_ok=True) saver.save(self.sess, save_path, global_step) writer = tf.summary.FileWriter(args.log_dir, self.sess.graph) # Burn in some transitions using the randomly initialized agent into the replay memory if args.replay: replay = ReplayMemory(args.memory_size) epsilon = self.sess.run(train_epsilon) i = 0 while i < args.burn_in: done = False state = self.env.reset() while not done and i < args.burn_in: action = self.policy(state, epsilon) next_state, reward, done, info = self.env.step(action) replay.append((state, action, reward, next_state, done)) state = next_state i += 1 # Training i = self.sess.run(global_step) episode_start = i state = self.env.reset() while i < args.max_iter: # Obtain experience epsilon = self.sess.run(train_epsilon) action = self.policy(state, epsilon) next_state, reward, is_terminal, info = self.env.step(action) if args.replay: replay.append((state, action, reward, next_state, is_terminal)) states, actions, rewards, next_states, is_terminals = replay.sample(args.batch_size) else: states = [state] actions = [action] rewards = [reward] next_states = [next_state] is_terminals = [is_terminal] # Set up target and update prediction network if args.fix_target: feed_dict = {self.state: states, self.action: actions, self.reward: rewards, self.next_state: next_states, self.is_terminal: is_terminals} else: q_target = self.sess.run(self.q_pred, feed_dict={self.state: next_states}) feed_dict = {self.state: states, self.action: actions, self.reward: rewards, self.is_terminal: is_terminals, self.q_target: q_target} if args.double_q: q_next = self.sess.run(self.q_pred, feed_dict={self.state: 
next_states}) feed_dict.update({self.q_next: q_next}) _, loss, summary = self.sess.run([train_op, self.loss, train_summary], feed_dict=feed_dict) # Logging i += 1 writer.add_summary(summary, i) # Checking for end of episode if is_terminal: summary = self.sess.run(length_summary, feed_dict={episode_length: i - episode_start}) writer.add_summary(summary, i) episode_start = i state = self.env.reset() else: state = next_state # Evaluation if i % args.steps_per_eval == 0: rewards = self.evaluate(args.eval_episodes, args.final_epsilon) summary = self.sess.run(reward_summary, feed_dict={avg_reward: rewards.mean()}) writer.add_summary(summary, i) print('Step: %d Average reward: %f Loss: %f' % (i, rewards.mean(), loss)) np.set_printoptions(precision=10) print('Q values:', self.sess.run(self.q_pred, feed_dict={self.state: [state]})) print('Rewards:', rewards) episode_start = i state = self.env.reset() # Update target network if args.fix_target and i % args.steps_per_update == 0: self.sess.run(self.update_q_target) # Save model if i % steps_per_save == 0: saver.save(self.sess, save_path, global_step)
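The evaluate() method called in the loop above is not shown. A hypothetical sketch that is compatible with how its return value is used (rewards.mean() and printing the array), assuming the same self.policy and self.env members:

import numpy as np

def evaluate(self, num_episodes, epsilon):
    # Hypothetical: run the near-greedy policy for a few episodes and
    # return the per-episode returns as a NumPy array.
    returns = []
    for _ in range(num_episodes):
        state = self.env.reset()
        done, total = False, 0.0
        while not done:
            action = self.policy(state, epsilon)
            state, reward, done, _ = self.env.step(action)
            total += reward
        returns.append(total)
    return np.array(returns)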
class DQLearner(interfaces.LearningAgent): def __init__(self, dqn, num_actions, gamma=0.99, learning_rate=0.00025, replay_start_size=50000, epsilon_start=1.0, epsilon_end=0.01, epsilon_steps=1000000, update_freq=4, target_copy_freq=30000, replay_memory_size=1000000, frame_history=4, batch_size=32, error_clip=1, restore_network_file=None, double=True): self.dqn = dqn config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) self.inp_actions = tf.placeholder(tf.float32, [None, num_actions]) inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history] inp_dtype = self.dqn.get_input_dtype() assert type(inp_dtype) is str self.inp_frames = tf.placeholder(inp_dtype, inp_shape) self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape) self.inp_terminated = tf.placeholder(tf.bool, [None]) self.inp_reward = tf.placeholder(tf.float32, [None]) self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history]) self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history]) self.gamma = gamma with tf.variable_scope('online'): mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [ frame_history ] mask = tf.reshape(self.inp_mask, mask_shape) masked_input = self.inp_frames * mask self.q_online = self.dqn.construct_q_network(masked_input) with tf.variable_scope('target'): mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [ frame_history ] sp_mask = tf.reshape(self.inp_sp_mask, mask_shape) masked_sp_input = self.inp_sp_frames * sp_mask self.q_target = self.dqn.construct_q_network(masked_sp_input) if double: with tf.variable_scope('online', reuse=True): self.q_online_prime = self.dqn.construct_q_network( masked_sp_input) self.maxQ = tf.gather_nd( self.q_target, tf.transpose([ tf.range(0, 32, dtype=tf.int32), tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32) ], [1, 0])) else: self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1) self.r = tf.sign(self.inp_reward) use_backup = tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32) self.y = self.r + use_backup * gamma * self.maxQ self.delta = tf.reduce_sum(self.inp_actions * self.q_online, reduction_indices=1) - self.y self.error = tf.where( tf.abs(self.delta) < error_clip, 0.5 * tf.square(self.delta), error_clip * tf.abs(self.delta)) self.loss = tf.reduce_sum(self.error) self.g = tf.gradients(self.loss, self.q_online) optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.95, centered=True, epsilon=0.01) self.train_op = optimizer.minimize(self.loss, var_list=th.get_vars('online')) self.copy_op = th.make_copy_op('online', 'target') self.saver = tf.train.Saver(var_list=th.get_vars('online')) self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(), self.dqn.get_input_dtype(), replay_memory_size, frame_history) self.frame_history = frame_history self.replay_start_size = replay_start_size self.epsilon = epsilon_start self.epsilon_min = epsilon_end self.epsilon_steps = epsilon_steps self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps self.update_freq = update_freq self.target_copy_freq = target_copy_freq self.action_ticker = 1 self.num_actions = num_actions self.batch_size = batch_size self.sess.run(tf.initialize_all_variables()) if restore_network_file is not None: self.saver.restore(self.sess, restore_network_file) print('Restored network from file') self.sess.run(self.copy_op) def update_q_values(self): S1, A, R, S2, T, M1, M2 = self.replay_buffer.sample(self.batch_size) Aonehot = np.zeros((self.batch_size, 
self.num_actions), dtype=np.float32) Aonehot[list(range(len(A))), A] = 1 [_, loss, q_online, maxQ, q_target, r, y, error, delta, g] = self.sess.run( [ self.train_op, self.loss, self.q_online, self.maxQ, self.q_target, self.r, self.y, self.error, self.delta, self.g ], feed_dict={ self.inp_frames: S1, self.inp_actions: Aonehot, self.inp_sp_frames: S2, self.inp_reward: R, self.inp_terminated: T, self.inp_mask: M1, self.inp_sp_mask: M2 }) return loss def run_learning_episode(self, environment, max_episode_steps=100000): episode_steps = 0 total_reward = 0 for steps in range(max_episode_steps): if environment.is_current_state_terminal(): break state = environment.get_current_state() if np.random.uniform(0, 1) < self.epsilon: action = np.random.choice( environment.get_actions_for_state(state)) else: action = self.get_action(state) if self.replay_buffer.size() > self.replay_start_size: self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_delta) state, action, reward, next_state, is_terminal = environment.perform_action( action) total_reward += reward self.replay_buffer.append(state[-1], action, reward, next_state[-1], is_terminal) if (self.replay_buffer.size() > self.replay_start_size) and ( self.action_ticker % self.update_freq == 0): loss = self.update_q_values() if (self.action_ticker - self.replay_start_size) % self.target_copy_freq == 0: self.sess.run(self.copy_op) self.action_ticker += 1 episode_steps += 1 return episode_steps, total_reward def get_action(self, state): size = list(np.array(list(range(len(self.dqn.get_input_shape())))) + 1) state_input = np.transpose(state, size + [0]) [q_values] = self.sess.run( [self.q_online], feed_dict={ self.inp_frames: [state_input], self.inp_mask: np.ones((1, self.frame_history), dtype=np.float32) }) return np.argmax(q_values[0]) def save_network(self, file_name): self.saver.save(self.sess, file_name)
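The error term built with tf.where in DQLearner is the standard Huber-style clipped error: quadratic near zero, linear beyond error_clip, which bounds the per-sample gradient magnitude. Its NumPy analogue, for reference:

import numpy as np

def clipped_error(delta, error_clip=1.0):
    # Matches the tf.where expression used above: 0.5*delta^2 inside the
    # clip region, error_clip*|delta| outside it.
    abs_delta = np.abs(delta)
    return np.where(abs_delta < error_clip, 0.5 * np.square(delta), error_clip * abs_delta)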
def train(agent): senv = ShapeNetEnv(FLAGS) replay_mem = ReplayMemory(FLAGS) #### for debug #a = np.array([[1,0,1],[0,0,0]]) #b = np.array([[1,0,1],[0,1,0]]) #print('IoU: {}'.format(replay_mem.calu_IoU(a, b))) #sys.exit() #### for debug log_string('====== Starting burning in memories ======') burn_in(senv, replay_mem) log_string('====== Done. {} trajectories burnt in ======'.format( FLAGS.burn_in_length)) #epsilon = FLAGS.init_eps K_single = np.asarray([[420.0, 0.0, 112.0], [0.0, 420.0, 112.0], [0.0, 0.0, 1]]) K_list = np.tile(K_single[None, None, ...], (1, FLAGS.max_episode_length, 1, 1)) for i_idx in range(FLAGS.max_iter): state, model_id = senv.reset(True) actions = [] RGB_temp_list = np.zeros( (FLAGS.max_episode_length, FLAGS.resolution, FLAGS.resolution, 3), dtype=np.float32) R_list = np.zeros((FLAGS.max_episode_length, 3, 4), dtype=np.float32) vox_temp = np.zeros((FLAGS.voxel_resolution, FLAGS.voxel_resolution, FLAGS.voxel_resolution), dtype=np.float32) RGB_temp_list[0, ...], _ = replay_mem.read_png_to_uint8( state[0][0], state[1][0], model_id) R_list[0, ...] = replay_mem.get_R(state[0][0], state[1][0]) vox_temp_list = replay_mem.get_vox_pred(RGB_temp_list, R_list, K_list, 0) vox_temp = np.squeeze(vox_temp_list[0, ...]) ## run simulations and get memories for e_idx in range(FLAGS.max_episode_length - 1): agent_action = select_action(agent, RGB_temp_list[e_idx], vox_temp) actions.append(agent_action) state, next_state, done, model_id = senv.step(actions[-1]) RGB_temp_list[e_idx + 1, ...], _ = replay_mem.read_png_to_uint8( next_state[0], next_state[1], model_id) R_list[e_idx + 1, ...] = replay_mem.get_R(next_state[0], next_state[1]) ## TODO: update vox_temp vox_temp_list = replay_mem.get_vox_pred(RGB_temp_list, R_list, K_list, e_idx + 1) vox_temp = np.squeeze(vox_temp_list[e_idx + 1, ...]) if done: traj_state = state traj_state[0] += [next_state[0]] traj_state[1] += [next_state[1]] rewards = replay_mem.get_seq_rewards(RGB_temp_list, R_list, K_list, model_id) temp_traj = trajectData(traj_state, actions, rewards, model_id) replay_mem.append(temp_traj) break rgb_batch, vox_batch, reward_batch, action_batch = replay_mem.get_batch( FLAGS.batch_size) #print 'reward_batch: {}'.format(reward_batch) #print 'rewards: {}'.format(rewards) feed_dict = { agent.is_training: True, agent.rgb_batch: rgb_batch, agent.vox_batch: vox_batch, agent.reward_batch: reward_batch, agent.action_batch: action_batch } opt_train, merge_summary, loss = agent.sess.run( [agent.opt, agent.merged_train, agent.loss], feed_dict=feed_dict) log_string( '+++++Iteration: {}, loss: {:.4f}, mean_reward: {:.4f}+++++'. format(i_idx, loss, np.mean(rewards))) tf_util.save_scalar(i_idx, 'episode_total_reward', np.sum(rewards[:]), agent.train_writer) agent.train_writer.add_summary(merge_summary, i_idx) if i_idx % FLAGS.save_every_step == 0 and i_idx > 0: save(agent, i_idx, i_idx, i_idx) if i_idx % FLAGS.test_every_step == 0 and i_idx > 0: eval_r_mean, eval_IoU_mean, eval_loss_mean = evaluate( agent, FLAGS.test_episode_num, replay_mem) tf_util.save_scalar(i_idx, 'eval_mean_reward', eval_r_mean, agent.train_writer) tf_util.save_scalar(i_idx, 'eval_mean_IoU', eval_IoU_mean, agent.train_writer) tf_util.save_scalar(i_idx, 'eval_mean_loss', eval_loss_mean, agent.train_writer)
class Agent(object): def __init__(self, state_size, action_size, seed, **kwargs): # Setup default parameters for learning self.batch_size = kwargs['batch_size'] if 'batch_size' in kwargs else BATCH_SIZE self.gamma = kwargs['gamma'] if 'gamma' in kwargs else GAMMA self.tau = kwargs['tau'] if 'tau' in kwargs else TAU self.update_every = kwargs['update_every'] if 'update_every' in kwargs else UPDATE_EVERY buffer_size = kwargs['buffer_size'] if 'buffer_size' in kwargs else BUFFER_SIZE lr = kwargs['lr'] if 'lr' in kwargs else LR self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Create model and target Q-Network self.Q = QNetwork(action_size, state_size, seed).to(device) self.Q_target = QNetwork(action_size, state_size, seed).to(device) self.optimizer = optim.Adam(self.Q.parameters(), lr=lr) # Setup replay memory self.memory = ReplayMemory(buffer_size, self.batch_size, seed) # Initialize time step to track updates self.t_step = 0 def step(self, state, action, reward, next_state, done): # Add new experience self.memory.append(state, action, reward, next_state, done) # update the time step and check whether # we have enough samples to learn from self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # Check if we have enough samples if len(self.memory) > self.batch_size: experiences = self.memory.sample() # sample from memory self.learn(experiences) def act(self, state, eps=0.): state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.Q.eval() with torch.no_grad(): action_values = self.Q(state) self.Q.train() # Select action using Epsilon-greedy if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # Get the maximum predicted Q value for the next state Q_targets_next = self.Q_target(next_states).detach().max(1)[0].unsqueeze(1) # Calculate the Q target for the current state; the done flag forces # the second term to zero for terminal states. Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Get the expected Q values using the model Q-Network, so we can compute the loss Q_expected = self.Q(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss and propagate the gradient to # update the network weights. self.optimizer.zero_grad() loss.backward() self.optimizer.step() # Update the target Q-Network self.soft_update() def soft_update(self): for target_param, local_param in zip(self.Q_target.parameters(), self.Q.parameters()): target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
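A minimal driver loop for the Agent above, assuming a discrete-action Gym environment; the environment id and epsilon schedule are illustrative placeholders, not part of the original code:

import gym
import numpy as np

env = gym.make('LunarLander-v2')  # placeholder environment
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n, seed=0)
eps, eps_min, eps_decay = 1.0, 0.01, 0.995  # illustrative schedule
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(eps_min, eps * eps_decay)
env.close()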