def __init__(self):
    self.parms = Parameters()
    self.results_dir = os.path.join(self.parms.results_path)
    self.dataset_path = os.path.join(self.parms.results_path, 'dataset/')
    os.makedirs(self.dataset_path, exist_ok=True)
    self.metrics = {
        'steps': [],
        'episodes': [],
        'train_rewards': [],
        'predicted_rewards': [],
        'test_episodes': [],
        'test_rewards': [],
        'observation_loss': [],
        'reward_loss': [],
        'kl_loss': [],
        'regularizer_loss': []
    }
    os.makedirs(self.results_dir, exist_ok=True)

    # Set CUDA options
    if torch.cuda.is_available() and self.parms.use_cuda:
        self.parms.device = torch.device('cuda')
        torch.cuda.set_device(self.parms.gpu_id)
        print("Using gpu: ", torch.cuda.current_device())
    else:
        self.parms.device = torch.device('cpu')
        self.use_cuda = False
    print("Work on: ", self.parms.device)

    # Initialise the environment and the experience replay buffer
    self.env = ControlSuiteEnv(self.parms.env_name, self.parms.seed,
                               self.parms.max_episode_length, self.parms.bit_depth)
    self.D = ExperienceReplay(self.parms.ex_replay_buff_size, self.env.observation_size,
                              self.env.action_size, self.parms.bit_depth, self.parms.device)

    if self.parms.seed > 0:
        self.set_seed()

    self.trainer = Trainer(self.parms, self.D, self.metrics, self.results_dir, self.env)
    self.init_exp_rep()

    # Start training
    print("Total training episodes: ", self.parms.training_episodes,
          " Buffer sampling: ", self.parms.collect_interval)
    self.trainer.train_models()
    print("END.")
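# The constructor above calls self.set_seed(), which is not shown in this snippet.
# A minimal sketch of such a helper, assuming it only needs to seed Python, NumPy and
# PyTorch (the standalone signature below is an illustration, not the project's method):
import random

import numpy as np
import torch


def set_seed(seed: int, use_cuda: bool = False) -> None:
    """Seed all RNGs used here; seed CUDA as well when it is in play."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if use_cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)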
def setup_replay(args: argparse.Namespace, env: Env) -> ExperienceReplay:
    D = ExperienceReplay(
        args.experience_size, env.observation_size, env.action_size, args.device
    )
    # Initialise dataset D with random seed episodes
    for _ in range(1, args.seed_episodes + 1):
        observation, done = env.reset(), False
        while not done:
            action = env.sample_random_action()
            next_observation, _, done, info = env.step(action)
            D.append(observation, action, info["reward_dist"], info["reward_coll"], done)
            observation = next_observation
    return D
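# ExperienceReplay itself is not shown in this snippet.  For orientation, here is a
# minimal, generic FIFO buffer with uniform sampling; the real class used above
# preallocates tensors, applies bit-depth preprocessing in some variants and samples
# whole sequence chunks, so treat this strictly as a sketch.
import random
from collections import deque


class MinimalReplay:
    def __init__(self, capacity: int):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are evicted first

    def append(self, *transition):
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        batch = random.sample(self.buffer, batch_size)
        return tuple(zip(*batch))  # group by field: (observations, actions, rewards, ...)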
class Agent: def __init__(self, model, memory=None, memory_size=100, nb_frames=None): assert len( model.output_shape ) == 2, "Model's output shape should be (nb_samples, nb_actions)." if memory: self.memory = memory else: self.memory = ExperienceReplay(memory_size) if not nb_frames and not model.input_shape[1]: raise Exception("Missing argument : nb_frames not provided") elif not nb_frames: nb_frames = model.input_shape[1] elif model.input_shape[ 1] and nb_frames and model.input_shape[1] != nb_frames: raise Exception( "Dimension mismatch : time dimension of model should be equal to nb_frames." ) self.model = model self.nb_frames = nb_frames # model input shape, 24 self.frames = None @property def memory_size(self): return self.memory.memory_size @memory_size.setter def memory_size(self, value): self.memory.memory_size = value def reset_memory(self): self.exp_replay.reset_memory() def check_game_compatibility(self, game): game_output_shape = (1, None) + game.get_frame().shape #game_output_shape = (None, game.get_frame().shape) if len(game_output_shape) != len(self.model.input_shape): raise Exception( 'Dimension mismatch. Input shape of the model should be compatible with the game.' ) else: for i in range(len(self.model.input_shape)): if self.model.input_shape[i] and game_output_shape[ i] and self.model.input_shape[i] != game_output_shape[ i]: raise Exception( 'Dimension mismatch. Input shape of the model should be compatible with the game.' ) if len(self.model.output_shape ) != 2 or self.model.output_shape[1] != game.nb_actions: raise Exception( 'Output shape of model should be (nb_samples, nb_actions).') def get_game_data(self, game): # returns scaled frame = game.get_frame() # candidate to return scaled if self.frames is None: self.frames = [frame] * self.nb_frames else: self.frames.append(frame) self.frames.pop(0) return np.expand_dims(self.frames, 0) def clear_frames(self): self.frames = None def train(self, game, nb_epoch=1000, batch_size=50, gamma=0.9, epsilon=[1., .1], epsilon_rate=0.5, reset_memory=False, observe=0, checkpoint=None): self.check_game_compatibility(game) if type(epsilon) in {tuple, list}: delta = ((epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate)) final_epsilon = epsilon[1] epsilon = epsilon[0] else: final_epsilon = epsilon save = Save() model = self.model nb_actions = model.output_shape[-1] win_count = 0 for epoch in range(nb_epoch): loss = 0. q = np.zeros(3) game.reset() self.clear_frames() if reset_memory: self.reset_memory() game_over = False S = self.get_game_data(game) # S must be scaled i = 0 while not game_over: i = i + 1 if np.random.random() < epsilon or epoch < observe: a = int(np.random.randint(game.nb_actions)) print('>'), else: # S must be scaled q = model.predict(S) # ! a = int(np.argmax(q[0])) game.play(a) r = game.get_score(a) S_prime = self.get_game_data(game) # S_prime must be scaled game_over = game.is_over() # S, a, S_prime, must be scaled # reward, game over is not scaled in catch/snake transition = [S, a, r, S_prime, game_over] # ! 
self.memory.remember(*transition) S = S_prime if epoch >= observe: batch = self.memory.get_batch(model=model, batch_size=batch_size, gamma=gamma) if batch: inputs, targets = batch # scaled loss += float(model.train_on_batch(inputs, targets)) #if checkpoint and ((epoch + 1 - observe) % checkpoint == 0 or epoch + 1 == nb_epoch): #model.save_weights('4kweights.dat') save.log(game, epoch) if game.is_won(): win_count += 1 if epsilon > final_epsilon and epoch >= observe: epsilon -= delta print(' ') print( "Epoch {:03d}/{:03d} | Loss {:.4f} | Epsilon {:.2f} | Win count {} | loss Avg {:.4f}" .format(epoch + 1, nb_epoch, loss, epsilon, win_count, loss / i)) if ((epoch % 10) == 0): save.save_model(model, Config.f_model) save.log_epoch(loss, win_count, loss / i) def play(self, game, nb_epoch=10, epsilon=0., visualize=True): self.check_game_compatibility(game) model = self.model win_count = 0 frames = [] save = Save() for epoch in range(nb_epoch): game.reset() self.clear_frames() S = self.get_game_data(game) # S must be scaled if visualize: frames.append(game.draw()) game_over = False while not game_over: if np.random.rand() < epsilon: print("random") action = int(np.random.randint(0, game.nb_actions)) else: # S must be scaled q = model.predict(S)[0] # ! possible_actions = game.get_possible_actions() q = [q[i] for i in possible_actions] action = possible_actions[np.argmax(q)] game.play(action) S = self.get_game_data(game) ''' if visualize: frames.append(game.draw()) game_over = game.is_over() ''' save.log(game, nb_epoch) if game.is_won(): win_count += 1 print("Accuracy {} %".format(100. * win_count / nb_epoch)) '''
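# The train() loop above anneals epsilon linearly: it computes a fixed delta from
# (epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate) and subtracts it once per epoch
# after the observation phase.  A self-contained restatement of that schedule
# (parameter defaults here are illustrative):
def linear_epsilon(epoch: int, eps_start: float = 1.0, eps_final: float = 0.1,
                   nb_epoch: int = 1000, epsilon_rate: float = 0.5) -> float:
    """Value of epsilon after `epoch` epochs under the linear schedule used above."""
    delta = (eps_start - eps_final) / (nb_epoch * epsilon_rate)
    return max(eps_final, eps_start - delta * epoch)


# With the defaults, epsilon is ~0.55 after 250 epochs and bottoms out at 0.1 halfway through:
print(round(linear_epsilon(250), 2), round(linear_epsilon(500), 2))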
class Plan(object): def __init__(self): self.results_dir = os.path.join( 'results', '{}_seed_{}_{}_action_scale_{}_no_explore_{}_pool_len_{}_optimisation_iters_{}_top_planning-horizon' .format(args.env, args.seed, args.algo, args.action_scale, args.pool_len, args.optimisation_iters, args.top_planning_horizon)) args.results_dir = self.results_dir args.MultiGPU = True if torch.cuda.device_count( ) > 1 and args.MultiGPU else False self.__basic_setting() self.__init_sample() # Sampleing The Init Data # Initialise model parameters randomly self.transition_model = TransitionModel( args.belief_size, args.state_size, self.env.action_size, args.hidden_size, args.embedding_size, args.dense_activation_function).to(device=args.device) self.observation_model = ObservationModel( args.symbolic_env, self.env.observation_size, args.belief_size, args.state_size, args.embedding_size, args.cnn_activation_function).to(device=args.device) self.reward_model = RewardModel( args.belief_size, args.state_size, args.hidden_size, args.dense_activation_function).to(device=args.device) self.encoder = Encoder( args.symbolic_env, self.env.observation_size, args.embedding_size, args.cnn_activation_function).to(device=args.device) print("We Have {} GPUS".format(torch.cuda.device_count()) ) if args.MultiGPU else print("We use CPU") self.transition_model = nn.DataParallel( self.transition_model.to(device=args.device) ) if args.MultiGPU else self.transition_model self.observation_model = nn.DataParallel( self.observation_model.to(device=args.device) ) if args.MultiGPU else self.observation_model self.reward_model = nn.DataParallel( self.reward_model.to( device=args.device)) if args.MultiGPU else self.reward_model # encoder = nn.DataParallel(encoder.cuda()) # actor_model = nn.DataParallel(actor_model.cuda()) # value_model = nn.DataParallel(value_model.cuda()) # share the global parameters in multiprocessing self.encoder.share_memory() self.observation_model.share_memory() self.reward_model.share_memory() # Set all_model/global_actor_optimizer/global_value_optimizer self.param_list = list(self.transition_model.parameters()) + list( self.observation_model.parameters()) + list( self.reward_model.parameters()) + list( self.encoder.parameters()) self.model_optimizer = optim.Adam( self.param_list, lr=0 if args.learning_rate_schedule != 0 else args.model_learning_rate, eps=args.adam_epsilon) def update_belief_and_act(self, args, env, belief, posterior_state, action, observation, explore=False): # Infer belief over current state q(s_t|o≤t,a<t) from the history # print("action size: ",action.size()) torch.Size([1, 6]) belief, _, _, _, posterior_state, _, _ = self.upper_transition_model( posterior_state, action.unsqueeze(dim=0), belief, self.encoder(observation).unsqueeze(dim=0), None) if hasattr(env, "envs"): belief, posterior_state = list( map(lambda x: x.view(-1, args.test_episodes, x.shape[2]), [x for x in [belief, posterior_state]])) belief, posterior_state = belief.squeeze( dim=0), posterior_state.squeeze( dim=0) # Remove time dimension from belief/state action = self.algorithms.get_action(belief, posterior_state, explore) if explore: action = torch.clamp( Normal(action, args.action_noise).rsample(), -1, 1 ) # Add gaussian exploration noise on top of the sampled action # action = action + args.action_noise * torch.randn_like(action) # Add exploration noise ε ~ p(ε) to the action next_observation, reward, done = env.step( action.cpu() if isinstance(env, EnvBatcher) else action[0].cpu( )) # Perform environment step (action repeats 
handled internally) return belief, posterior_state, action, next_observation, reward, done def run(self): if args.algo == "dreamer": print("DREAMER") from algorithms.dreamer import Algorithms self.algorithms = Algorithms(self.env.action_size, self.transition_model, self.encoder, self.reward_model, self.observation_model) elif args.algo == "p2p": print("planing to plan") from algorithms.plan_to_plan import Algorithms self.algorithms = Algorithms(self.env.action_size, self.transition_model, self.encoder, self.reward_model, self.observation_model) elif args.algo == "actor_pool_1": print("async sub actor") from algorithms.actor_pool_1 import Algorithms_actor self.algorithms = Algorithms_actor(self.env.action_size, self.transition_model, self.encoder, self.reward_model, self.observation_model) elif args.algo == "aap": from algorithms.asynchronous_actor_planet import Algorithms self.algorithms = Algorithms(self.env.action_size, self.transition_model, self.encoder, self.reward_model, self.observation_model) else: print("planet") from algorithms.planet import Algorithms # args.MultiGPU = False self.algorithms = Algorithms(self.env.action_size, self.transition_model, self.reward_model) if args.test: self.test_only() self.global_prior = Normal( torch.zeros(args.batch_size, args.state_size, device=args.device), torch.ones(args.batch_size, args.state_size, device=args.device)) # Global prior N(0, I) self.free_nats = torch.full( (1, ), args.free_nats, device=args.device) # Allowed deviation in KL divergence # Training (and testing) # args.episodes = 1 for episode in tqdm(range(self.metrics['episodes'][-1] + 1, args.episodes + 1), total=args.episodes, initial=self.metrics['episodes'][-1] + 1): losses = self.train() # self.algorithms.save_loss_data(self.metrics['episodes']) # Update and plot loss metrics self.save_loss_data(tuple( zip(*losses))) # Update and plot loss metrics self.data_collection(episode=episode) # Data collection # args.test_interval = 1 if episode % args.test_interval == 0: self.test(episode=episode) # Test model self.save_model_data(episode=episode) # save model self.env.close() # Close training environment def train_env_model(self, beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs, observations, actions, rewards, nonterminals): # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?) 
if args.worldmodel_LogProbLoss: observation_dist = Normal( bottle(self.observation_model, (beliefs, posterior_states)), 1) observation_loss = -observation_dist.log_prob( observations[1:]).sum( dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1)) else: observation_loss = F.mse_loss( bottle(self.observation_model, (beliefs, posterior_states)), observations[1:], reduction='none').sum( dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1)) if args.worldmodel_LogProbLoss: reward_dist = Normal( bottle(self.reward_model, (beliefs, posterior_states)), 1) reward_loss = -reward_dist.log_prob(rewards[:-1]).mean(dim=(0, 1)) else: reward_loss = F.mse_loss(bottle(self.reward_model, (beliefs, posterior_states)), rewards[:-1], reduction='none').mean(dim=(0, 1)) # transition loss div = kl_divergence(Normal(posterior_means, posterior_std_devs), Normal(prior_means, prior_std_devs)).sum(dim=2) kl_loss = torch.max(div, self.free_nats).mean( dim=(0, 1) ) # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out if args.global_kl_beta != 0: kl_loss += args.global_kl_beta * kl_divergence( Normal(posterior_means, posterior_std_devs), self.global_prior).sum(dim=2).mean(dim=(0, 1)) # Calculate latent overshooting objective for t > 0 if args.overshooting_kl_beta != 0: overshooting_vars = [ ] # Collect variables for overshooting to process in batch for t in range(1, args.chunk_size - 1): d = min(t + args.overshooting_distance, args.chunk_size - 1) # Overshooting distance t_, d_ = t - 1, d - 1 # Use t_ and d_ to deal with different time indexing for latent states seq_pad = ( 0, 0, 0, 0, 0, t - d + args.overshooting_distance ) # Calculate sequence padding so overshooting terms can be calculated in one batch # Store (0) actions, (1) nonterminals, (2) rewards, (3) beliefs, (4) prior states, (5) posterior means, (6) posterior standard deviations and (7) sequence masks overshooting_vars.append( (F.pad(actions[t:d], seq_pad), F.pad(nonterminals[t:d], seq_pad), F.pad(rewards[t:d], seq_pad[2:]), beliefs[t_], prior_states[t_], F.pad(posterior_means[t_ + 1:d_ + 1].detach(), seq_pad), F.pad(posterior_std_devs[t_ + 1:d_ + 1].detach(), seq_pad, value=1), F.pad( torch.ones(d - t, args.batch_size, args.state_size, device=args.device), seq_pad)) ) # Posterior standard deviations must be padded with > 0 to prevent infinite KL divergences overshooting_vars = tuple(zip(*overshooting_vars)) # Update belief/state using prior from previous belief/state and previous action (over entire sequence at once) beliefs, prior_states, prior_means, prior_std_devs = self.upper_transition_model( torch.cat(overshooting_vars[4], dim=0), torch.cat(overshooting_vars[0], dim=1), torch.cat(overshooting_vars[3], dim=0), None, torch.cat(overshooting_vars[1], dim=1)) seq_mask = torch.cat(overshooting_vars[7], dim=1) # Calculate overshooting KL loss with sequence mask kl_loss += ( 1 / args.overshooting_distance ) * args.overshooting_kl_beta * torch.max((kl_divergence( Normal(torch.cat(overshooting_vars[5], dim=1), torch.cat(overshooting_vars[6], dim=1)), Normal(prior_means, prior_std_devs) ) * seq_mask).sum(dim=2), self.free_nats).mean(dim=(0, 1)) * ( args.chunk_size - 1 ) # Update KL loss (compensating for extra average over each overshooting/open loop sequence) # Calculate overshooting reward prediction loss with sequence mask if args.overshooting_reward_scale != 0: reward_loss += ( 1 / args.overshooting_distance ) * args.overshooting_reward_scale * F.mse_loss( bottle(self.reward_model, (beliefs, 
prior_states)) * seq_mask[:, :, 0], torch.cat(overshooting_vars[2], dim=1), reduction='none' ).mean(dim=(0, 1)) * ( args.chunk_size - 1 ) # Update reward loss (compensating for extra average over each overshooting/open loop sequence) # Apply linearly ramping learning rate schedule if args.learning_rate_schedule != 0: for group in self.model_optimizer.param_groups: group['lr'] = min( group['lr'] + args.model_learning_rate / args.model_learning_rate_schedule, args.model_learning_rate) model_loss = observation_loss + reward_loss + kl_loss # Update model parameters self.model_optimizer.zero_grad() model_loss.backward() nn.utils.clip_grad_norm_(self.param_list, args.grad_clip_norm, norm_type=2) self.model_optimizer.step() return observation_loss, reward_loss, kl_loss def train(self): # Model fitting losses = [] print("training loop") # args.collect_interval = 1 for s in tqdm(range(args.collect_interval)): # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags) observations, actions, rewards, nonterminals = self.D.sample( args.batch_size, args.chunk_size) # Transitions start at time t = 0 # Create initial belief and state for time t = 0 init_belief, init_state = torch.zeros( args.batch_size, args.belief_size, device=args.device), torch.zeros(args.batch_size, args.state_size, device=args.device) # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once) obs = bottle(self.encoder, (observations[1:], )) beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.upper_transition_model( prev_state=init_state, actions=actions[:-1], prev_belief=init_belief, obs=obs, nonterminals=nonterminals[:-1]) # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?) 
observation_loss, reward_loss, kl_loss = self.train_env_model( beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs, observations, actions, rewards, nonterminals) # Dreamer implementation: actor loss calculation and optimization with torch.no_grad(): actor_states = posterior_states.detach().to( device=args.device).share_memory_() actor_beliefs = beliefs.detach().to( device=args.device).share_memory_() # if not os.path.exists(os.path.join(os.getcwd(), 'tensor_data/' + args.results_dir)): os.mkdir(os.path.join(os.getcwd(), 'tensor_data/' + args.results_dir)) torch.save( actor_states, os.path.join(os.getcwd(), args.results_dir + '/actor_states.pt')) torch.save( actor_beliefs, os.path.join(os.getcwd(), args.results_dir + '/actor_beliefs.pt')) # [self.actor_pipes[i][0].send(1) for i, w in enumerate(self.workers_actor)] # Parent_pipe send data using i'th pipes # [self.actor_pipes[i][0].recv() for i, _ in enumerate(self.actor_pool)] # waitting the children finish self.algorithms.train_algorithm(actor_states, actor_beliefs) losses.append( [observation_loss.item(), reward_loss.item(), kl_loss.item()]) # if self.algorithms.train_algorithm(actor_states, actor_beliefs) is not None: # merge_actor_loss, merge_value_loss = self.algorithms.train_algorithm(actor_states, actor_beliefs) # losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item(), merge_actor_loss.item(), merge_value_loss.item()]) # else: # losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item()]) return losses def data_collection(self, episode): print("Data collection") with torch.no_grad(): observation, total_reward = self.env.reset(), 0 belief, posterior_state, action = torch.zeros( 1, args.belief_size, device=args.device), torch.zeros( 1, args.state_size, device=args.device), torch.zeros(1, self.env.action_size, device=args.device) pbar = tqdm(range(args.max_episode_length // args.action_repeat)) for t in pbar: # print("step",t) belief, posterior_state, action, next_observation, reward, done = self.update_belief_and_act( args, self.env, belief, posterior_state, action, observation.to(device=args.device)) self.D.append(observation, action.cpu(), reward, done) total_reward += reward observation = next_observation if args.render: self.env.render() if done: pbar.close() break # Update and plot train reward metrics self.metrics['steps'].append(t + self.metrics['steps'][-1]) self.metrics['episodes'].append(episode) self.metrics['train_rewards'].append(total_reward) Save_Txt(self.metrics['episodes'][-1], self.metrics['train_rewards'][-1], 'train_rewards', args.results_dir) # lineplot(metrics['episodes'][-len(metrics['train_rewards']):], metrics['train_rewards'], 'train_rewards', results_dir) def test(self, episode): print("Test model") # Set models to eval mode self.transition_model.eval() self.observation_model.eval() self.reward_model.eval() self.encoder.eval() self.algorithms.train_to_eval() # self.actor_model_g.eval() # self.value_model_g.eval() # Initialise parallelised test environments test_envs = EnvBatcher( Env, (args.env, args.symbolic_env, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth), {}, args.test_episodes) with torch.no_grad(): observation, total_rewards, video_frames = test_envs.reset( ), np.zeros((args.test_episodes, )), [] belief, posterior_state, action = torch.zeros( args.test_episodes, args.belief_size, device=args.device), torch.zeros( args.test_episodes, args.state_size, device=args.device), 
torch.zeros(args.test_episodes, self.env.action_size, device=args.device) pbar = tqdm(range(args.max_episode_length // args.action_repeat)) for t in pbar: belief, posterior_state, action, next_observation, reward, done = self.update_belief_and_act( args, test_envs, belief, posterior_state, action, observation.to(device=args.device)) total_rewards += reward.numpy() if not args.symbolic_env: # Collect real vs. predicted frames for video video_frames.append( make_grid(torch.cat([ observation, self.observation_model(belief, posterior_state).cpu() ], dim=3) + 0.5, nrow=5).numpy()) # Decentre observation = next_observation if done.sum().item() == args.test_episodes: pbar.close() break # Update and plot reward metrics (and write video if applicable) and save metrics self.metrics['test_episodes'].append(episode) self.metrics['test_rewards'].append(total_rewards.tolist()) Save_Txt(self.metrics['test_episodes'][-1], self.metrics['test_rewards'][-1], 'test_rewards', args.results_dir) # Save_Txt(np.asarray(metrics['steps'])[np.asarray(metrics['test_episodes']) - 1], metrics['test_rewards'],'test_rewards_steps', results_dir, xaxis='step') # lineplot(metrics['test_episodes'], metrics['test_rewards'], 'test_rewards', results_dir) # lineplot(np.asarray(metrics['steps'])[np.asarray(metrics['test_episodes']) - 1], metrics['test_rewards'], 'test_rewards_steps', results_dir, xaxis='step') if not args.symbolic_env: episode_str = str(episode).zfill(len(str(args.episodes))) write_video(video_frames, 'test_episode_%s' % episode_str, args.results_dir) # Lossy compression save_image( torch.as_tensor(video_frames[-1]), os.path.join(args.results_dir, 'test_episode_%s.png' % episode_str)) torch.save(self.metrics, os.path.join(args.results_dir, 'metrics.pth')) # Set models to train mode self.transition_model.train() self.observation_model.train() self.reward_model.train() self.encoder.train() # self.actor_model_g.train() # self.value_model_g.train() self.algorithms.eval_to_train() # Close test environments test_envs.close() def test_only(self): # Set models to eval mode self.transition_model.eval() self.reward_model.eval() self.encoder.eval() with torch.no_grad(): total_reward = 0 for _ in tqdm(range(args.test_episodes)): observation = self.env.reset() belief, posterior_state, action = torch.zeros( 1, args.belief_size, device=args.device), torch.zeros( 1, args.state_size, device=args.device), torch.zeros(1, self.env.action_size, device=args.device) pbar = tqdm( range(args.max_episode_length // args.action_repeat)) for t in pbar: belief, posterior_state, action, observation, reward, done = self.update_belief_and_act( args, self.env, belief, posterior_state, action, observation.to(evice=args.device)) total_reward += reward if args.render: self.env.render() if done: pbar.close() break print('Average Reward:', total_reward / args.test_episodes) self.env.close() quit() def __basic_setting(self): args.overshooting_distance = min( args.chunk_size, args.overshooting_distance ) # Overshooting distance cannot be greater than chunk size print(' ' * 26 + 'Options') for k, v in vars(args).items(): print(' ' * 26 + k + ': ' + str(v)) print("torch.cuda.device_count() {}".format(torch.cuda.device_count())) os.makedirs(args.results_dir, exist_ok=True) np.random.seed(args.seed) torch.manual_seed(args.seed) # Set Cuda if torch.cuda.is_available() and not args.disable_cuda: print("using CUDA") args.device = torch.device('cuda') torch.cuda.manual_seed(args.seed) else: print("using CPU") args.device = torch.device('cpu') self.summary_name = 
args.results_dir + "/{}_{}_log" self.writer = SummaryWriter(self.summary_name.format( args.env, args.id)) self.env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth) self.metrics = { 'steps': [], 'episodes': [], 'train_rewards': [], 'test_episodes': [], 'test_rewards': [], 'observation_loss': [], 'reward_loss': [], 'kl_loss': [], 'merge_actor_loss': [], 'merge_value_loss': [] } def __init_sample(self): if args.experience_replay is not '' and os.path.exists( args.experience_replay): self.D = torch.load(args.experience_replay) self.metrics['steps'], self.metrics['episodes'] = [ self.D.steps ] * self.D.episodes, list(range(1, self.D.episodes + 1)) elif not args.test: self.D = ExperienceReplay(args.experience_size, args.symbolic_env, self.env.observation_size, self.env.action_size, args.bit_depth, args.device) # Initialise dataset D with S random seed episodes print( "Start Multi Sample Processing -------------------------------" ) start_time = time.time() data_lists = [ Manager().list() for i in range(1, args.seed_episodes + 1) ] # Set Global Lists pipes = [Pipe() for i in range(1, args.seed_episodes + 1) ] # Set Multi Pipe workers_init_sample = [ Worker_init_Sample(child_conn=child, id=i + 1) for i, [parent, child] in enumerate(pipes) ] for i, w in enumerate(workers_init_sample): w.start() # Start Single Process pipes[i][0].send( data_lists[i]) # Parent_pipe send data using i'th pipes [w.join() for w in workers_init_sample] # wait sub_process done for i, [parent, child] in enumerate(pipes): # datas = parent.recv() for data in list(parent.recv()): if isinstance(data, tuple): assert len(data) == 4 self.D.append(data[0], data[1], data[2], data[3]) elif isinstance(data, int): t = data self.metrics['steps'].append(t * args.action_repeat + ( 0 if len(self.metrics['steps']) == 0 else self.metrics['steps'][-1])) self.metrics['episodes'].append(i + 1) else: print( "The Recvive Data Have Some Problems, Need To Fix") end_time = time.time() print("the process times {} s".format(end_time - start_time)) print( "End Multi Sample Processing -------------------------------") def upper_transition_model(self, prev_state, actions, prev_belief, obs, nonterminals): actions = torch.transpose(actions, 0, 1) if args.MultiGPU else actions nonterminals = torch.transpose(nonterminals, 0, 1).to( device=args.device ) if args.MultiGPU and nonterminals is not None else nonterminals obs = torch.transpose(obs, 0, 1).to( device=args.device) if args.MultiGPU and obs is not None else obs temp_val = self.transition_model(prev_state.to(device=args.device), actions.to(device=args.device), prev_belief.to(device=args.device), obs, nonterminals) return list( map( lambda x: torch.cat(x.chunk(torch.cuda.device_count(), 0), 1) if x.shape[1] != prev_state.shape[0] else x, [x for x in temp_val])) def save_loss_data(self, losses): self.metrics['observation_loss'].append(losses[0]) self.metrics['reward_loss'].append(losses[1]) self.metrics['kl_loss'].append(losses[2]) self.metrics['merge_actor_loss'].append( losses[3]) if losses.__len__() > 3 else None self.metrics['merge_value_loss'].append( losses[4]) if losses.__len__() > 3 else None Save_Txt(self.metrics['episodes'][-1], self.metrics['observation_loss'][-1], 'observation_loss', args.results_dir) Save_Txt(self.metrics['episodes'][-1], self.metrics['reward_loss'][-1], 'reward_loss', args.results_dir) Save_Txt(self.metrics['episodes'][-1], self.metrics['kl_loss'][-1], 'kl_loss', args.results_dir) Save_Txt(self.metrics['episodes'][-1], 
self.metrics['merge_actor_loss'][-1], 'merge_actor_loss', args.results_dir) if losses.__len__() > 3 else None Save_Txt(self.metrics['episodes'][-1], self.metrics['merge_value_loss'][-1], 'merge_value_loss', args.results_dir) if losses.__len__() > 3 else None # lineplot(metrics['episodes'][-len(metrics['observation_loss']):], metrics['observation_loss'], 'observation_loss', results_dir) # lineplot(metrics['episodes'][-len(metrics['reward_loss']):], metrics['reward_loss'], 'reward_loss', results_dir) # lineplot(metrics['episodes'][-len(metrics['kl_loss']):], metrics['kl_loss'], 'kl_loss', results_dir) # lineplot(metrics['episodes'][-len(metrics['actor_loss']):], metrics['actor_loss'], 'actor_loss', results_dir) # lineplot(metrics['episodes'][-len(metrics['value_loss']):], metrics['value_loss'], 'value_loss', results_dir) def save_model_data(self, episode): # writer.add_scalar("train_reward", metrics['train_rewards'][-1], metrics['steps'][-1]) # writer.add_scalar("train/episode_reward", metrics['train_rewards'][-1], metrics['steps'][-1]*args.action_repeat) # writer.add_scalar("observation_loss", metrics['observation_loss'][0][-1], metrics['steps'][-1]) # writer.add_scalar("reward_loss", metrics['reward_loss'][0][-1], metrics['steps'][-1]) # writer.add_scalar("kl_loss", metrics['kl_loss'][0][-1], metrics['steps'][-1]) # writer.add_scalar("actor_loss", metrics['actor_loss'][0][-1], metrics['steps'][-1]) # writer.add_scalar("value_loss", metrics['value_loss'][0][-1], metrics['steps'][-1]) # print("episodes: {}, total_steps: {}, train_reward: {} ".format(metrics['episodes'][-1], metrics['steps'][-1], metrics['train_rewards'][-1])) # Checkpoint models if episode % args.checkpoint_interval == 0: # torch.save({'transition_model': transition_model.state_dict(), # 'observation_model': observation_model.state_dict(), # 'reward_model': reward_model.state_dict(), # 'encoder': encoder.state_dict(), # 'actor_model': actor_model_g.state_dict(), # 'value_model': value_model_g.state_dict(), # 'model_optimizer': model_optimizer.state_dict(), # 'actor_optimizer': actor_optimizer_g.state_dict(), # 'value_optimizer': value_optimizer_g.state_dict() # }, os.path.join(results_dir, 'models_%d.pth' % episode)) if args.checkpoint_experience: torch.save( self.D, os.path.join(args.results_dir, 'experience.pth') ) # Warning: will fail with MemoryError with large memory sizes
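# train_env_model above clamps the transition KL from below with `free_nats` (the
# "free nats" trick: no gradient pressure while the per-step KL is already within the
# allowance).  A minimal, self-contained illustration of that clamp with toy shapes:
import torch
from torch.distributions import Normal, kl_divergence

posterior = Normal(torch.zeros(8, 4), torch.ones(8, 4))
prior = Normal(torch.full((8, 4), 0.1), torch.ones(8, 4))
free_nats = torch.full((1,), 3.0)

div = kl_divergence(posterior, prior).sum(dim=1)  # KL per batch element, summed over state dims
kl_loss = torch.max(div, free_nats).mean()        # never reported below the free-nats allowance
print(kl_loss.item())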
class Agent: def __init__(self, model, memory=None, memory_size=1000, nb_frames=None): assert len( model.output_shape ) == 2, "Model's output shape should be (nb_samples, nb_actions)." if memory: self.memory = memory else: self.memory = ExperienceReplay(memory_size) if not nb_frames and not model.input_shape: raise Exception("Missing argument : nb_frames not provided") elif not nb_frames: nb_frames = model.input_shape[1] elif model.input_shape[ 1] and nb_frames and model.input_shape[1] != nb_frames: raise Exception( "Dimension mismatch : time dimension of model should be equal to nb_frames." ) self.model = model self.nb_frames = nb_frames self.frames = None @property def memory_size(self): return self.memory.memory_size @memory_size.setter def memory_size(self, value): self.memory.memory_size = value def reset_memory(self): self.exp_replay.reset_memory() def check_game_compatibility(self, game): game_output_shape = (1, None) + game.get_frame().shape if len(game_output_shape) != len(self.model.input_shape): raise Exception( 'Dimension mismatch. Input shape of the model should be compatible with the game.' ) else: for i in range(len(self.model.input_shape)): if self.model.input_shape[i] and game_output_shape[ i] and self.model.input_shape[i] != game_output_shape[ i]: raise Exception( 'Dimension mismatch. Input shape of the model should be compatible with the game.' ) if len(self.model.output_shape ) != 2 or self.model.output_shape[1] != game.nb_actions: raise Exception( 'Output shape of model should be (nb_samples, nb_actions).') def get_game_data(self, game): frame = game.get_frame() if self.frames is None: self.frames = [frame] * self.nb_frames else: self.frames.append(frame) self.frames.pop(0) return np.expand_dims(self.frames, 0) def clear_frames(self): self.frames = None def train(self, game, nb_epoch=1000, batch_size=50, gamma=0.9, epsilon=[1., .1], epsilon_rate=0.5, reset_memory=False): self.check_game_compatibility(game) if type(epsilon) in {tuple, list}: delta = ((epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate)) final_epsilon = epsilon[1] epsilon = epsilon[0] else: final_epsilon = epsilon model = self.model nb_actions = model.output_shape[-1] win_count = 0 for epoch in range(nb_epoch): loss = 0. 
game.reset() self.clear_frames() if reset_memory: self.reset_memory() game_over = False S = self.get_game_data(game) while not game_over: if np.random.random() < epsilon: a = int(np.random.randint(game.nb_actions)) else: q = model.predict(S) a = int(np.argmax(q[0])) game.play(a) r = game.get_score() S_prime = self.get_game_data(game) game_over = game.is_over() transition = [S, a, r, S_prime, game_over] self.memory.remember(*transition) S = S_prime inputs, targets = self.memory.get_batch(model=model, batch_size=batch_size, gamma=gamma) loss += model.train_on_batch(inputs, targets)[0] if game.is_won(): win_count += 1 if epsilon > final_epsilon: epsilon -= delta print( "Epoch {:03d}/{:03d} | Loss {:.4f} | Epsilon {:.2f} | Win count {}" .format(epoch + 1, nb_epoch, loss, epsilon, win_count)) def play(self, game, nb_epoch=10, epsilon=0., visualize=True): self.check_game_compatibility(game) model = self.model win_count = 0 frames = [] for epoch in range(nb_epoch): game.reset() self.clear_frames() S = self.get_game_data(game) if visualize: frames.append(game.draw()) game_over = False while not game_over: if np.random.rand() < epsilon: print("random") action = int(np.random.randint(0, game.nb_actions)) else: q = model.predict(S) action = int(np.argmax(q[0])) game.play(action) S = self.get_game_data(game) if visualize: frames.append(game.draw()) game_over = game.is_over() if game.is_won(): win_count += 1 print("Accuracy {} %".format(100. * win_count / nb_epoch)) if visualize: if 'images' not in os.listdir('.'): os.mkdir('images') for i in range(len(frames)): plt.imshow(frames[i], interpolation='none') plt.savefig("images/" + game.name + str(i) + ".png")
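# get_batch (not shown here) is where the Q-learning targets come from.  A minimal
# NumPy sketch of that step under plain-DQN assumptions: the taken action's target is
# r + gamma * max_a' Q(s', a') (just r on terminal transitions), and the remaining
# actions keep the network's own predictions so they contribute no error.
import numpy as np


def build_targets(model, S, actions, rewards, S_prime, game_over, gamma=0.9):
    """S, S_prime: batched model inputs; actions, rewards, game_over: 1-D arrays."""
    targets = model.predict(S)                   # start from the current estimates
    next_q = model.predict(S_prime).max(axis=1)  # max_a' Q(s', a')
    idx = np.arange(len(actions))
    targets[idx, actions] = rewards + gamma * next_q * (1.0 - game_over.astype(np.float32))
    return S, targets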
results_dir = os.path.join('results', args.id)
os.makedirs(results_dir, exist_ok=True)
logdir = os.path.join(results_dir, "logs")
os.makedirs(logdir, exist_ok=True)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.cuda.manual_seed(args.seed)
else:
    device = torch.device('cpu')

# Initialise training environment and experience replay memory
env = Env(args.env, args.seed, args.max_episode_length, args.action_repeat)
D = ExperienceReplay(args.experience_size, env.observation_size, env.action_size, device)

# Initialise dataset D with S random seed episodes
for s in range(1, args.seed_episodes + 1):
    observation, done, t = env.reset(), False, 0
    epdata = []
    while not done:
        action = env.sample_random_action()
        next_observation, reward, done = env.step(action)
        D.append(observation, action, reward, done)
        epdata.append(next_observation)
        observation = next_observation
        t += 1
    epdata = np.concatenate(epdata)
    frames = torch.FloatTensor(epdata[:, :3, :, :]) / 255.
    write_video(frames, "Episode" + str(s), logdir)
    print(epdata.shape)
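# write_video is imported from elsewhere in this code base and is not shown.  A rough,
# hypothetical equivalent using imageio (an assumption, not the project's implementation):
# it expects frames shaped (T, 3, H, W) with values in [0, 1], as produced above.
import os

import imageio
import torch


def write_video_sketch(frames: torch.Tensor, name: str, path: str, fps: int = 30) -> None:
    clip = (frames.clamp(0, 1) * 255).byte().permute(0, 2, 3, 1).numpy()  # -> (T, H, W, C) uint8
    imageio.mimsave(os.path.join(path, name + '.mp4'), list(clip), fps=fps)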
class Agent: def __init__(self, game, mode=SIMPLE, nb_epoch=10000, memory_size=1000, batch_size=50, nb_frames=4, epsilon=1., discount=.9, learning_rate=.1, model=None): self.game = game self.mode = mode self.target_model = None self.rows, self.columns = game.field_shape() self.nb_epoch = nb_epoch self.nb_frames = nb_frames self.nb_actions = game.nb_actions() if mode == TEST: print('Training Mode: Loading model...') self.model = load_model(model) elif mode == SIMPLE: print('Using Plain DQN: Building model...') self.model = self.build_model() elif mode == DOUBLE: print('Using Double DQN: Building primary and target model...') self.model = self.build_model() self.target_model = self.build_model() self.update_target_model() # Trades off the importance of sooner versus later rewards. # A factor of 0 means it rather prefers immediate rewards # and it will mostly consider current rewards. A factor of 1 # will make it strive for a long-term high reward. self.discount = discount # The learning rate or step size determines to what extent the newly # acquired information will override the old information. A factor # of 0 will make the agent not learn anything, while a factor of 1 # would make the agent consider only the most recent information self.learning_rate = learning_rate # Use epsilon-greedy exploration as our policy. # Epsilon determines the probability for choosing random actions. # This factor will decrease linear by the number of epoches. So we choose # a random action by the probability 'eps'. Without this policy the network # is greedy and it will it settles with the first effective strategy it finds. # Hence, we introduce certain randomness. # Epislon reaches its minimum at 1/2 of the games epsilon_end = self.nb_epoch - (self.nb_epoch / 2) self.policy = EpsGreedyPolicy(self.model, epsilon_end, self.nb_actions, epsilon, .1) # Create new experience replay memory. Without this optimization # the training takes extremely long even on a GPU and most # importantly the approximation of Q-values using non-linear # functions, that is used for our NN, is not very stable. 
self.memory = ExperienceReplay(self.model, self.target_model, self.nb_actions, memory_size, batch_size, self.discount, self.learning_rate) self.frames = None def build_model(self): model = Sequential() model.add(Conv2D(32, (2, 2), activation='relu', input_shape=(self.nb_frames, self.rows, self.columns), data_format="channels_first")) model.add(Conv2D(64, (2, 2), activation='relu')) model.add(Conv2D(64, (3, 3), activation='relu')) model.add(Flatten()) model.add(Dropout(0.1)) model.add(Dense(512, activation='relu')) model.add(Dense(self.nb_actions)) model.compile(Adam(), 'MSE') return model def update_target_model(self): self.target_model.set_weights(self.model.get_weights()) def get_frames(self): frame = self.game.get_state() if self.frames is None: self.frames = [frame] * self.nb_frames else: self.frames.append(frame) self.frames.pop(0) # Expand frames to match the input shape for the CNN (4D) # 1D = # batches # 2D = # frames per batch # 3D / 4D = game board return np.expand_dims(self.frames, 0) def clear_frames(self): self.frames = None def print_stats(self, data, y_label, x_label='Epoch', marker='-'): data = np.array(data) x, y = data.T p = np.polyfit(x, y, 3) fig = plt.figure() plt.plot(x, y, marker) plt.plot(x, np.polyval(p, x), 'r:') plt.xlabel(x_label) plt.ylabel(y_label) words = y_label.split() file_name = '_'.join(map(lambda x: x.lower(), words)) path = './plots/{name}_{size}x{size}_{timestamp}' fig.savefig(path.format(size=self.game.grid_size, name=file_name, timestamp=int(time()))) def train(self, update_freq=10): total_steps = 0 max_steps = self.game.grid_size**2 * 3 loops = 0 nb_wins = 0 cumulative_reward = 0 duration_buffer = [] reward_buffer = [] steps_buffer = [] wins_buffer = [] for epoch in range(self.nb_epoch): loss = 0. duration = 0 steps = 0 self.game.reset() self.clear_frames() done = False # Observe the initial state state_t = self.get_frames() start_time = time() while(not done): # Explore or Exploit action = self.policy.select_action(state_t, epoch) # Act on the environment _, reward, done, is_victory = self.game.act(action) state_tn = self.get_frames() cumulative_reward += reward steps += 1 total_steps += 1 if steps == max_steps and not done: loops += 1 done = True # Build transition and remember it (Experience Replay) transition = [state_t, action, reward, state_tn, done] self.memory.remember(*transition) state_t = state_tn # Get batch of batch_size samples # A batch generally approximates the distribution of the input data # better than a single input. The larger the batch, the better the # approximation. However, larger batches take longer to process. 
batch = self.memory.get_batch() if batch: inputs, targets = batch loss += float(self.model.train_on_batch(inputs, targets)) if self.game.is_victory(): nb_wins += 1 if done: duration = utils.get_time_difference(start_time, time()) if self.mode == DOUBLE and self.target_model is not None and total_steps % (update_freq) == 0: self.update_target_model() current_epoch = epoch + 1 reward_buffer.append([current_epoch, cumulative_reward]) duration_buffer.append([current_epoch, duration]) steps_buffer.append([current_epoch, steps]) wins_buffer.append([current_epoch, nb_wins]) summary = 'Epoch {:03d}/{:03d} | Loss {:.4f} | Epsilon {:.2f} | Time(ms) {:3.3f} | Steps {:.2f} | Wins {} | Loops {}' print(summary.format(current_epoch, self.nb_epoch, loss, self.policy.get_eps(), duration, steps, nb_wins, loops)) # Generate plots self.print_stats(reward_buffer, 'Cumulative Reward') self.print_stats(duration_buffer, 'Duration per Game') self.print_stats(steps_buffer, 'Steps per Game') self.print_stats(wins_buffer, 'Wins') path = './models/model_{mode}_{size}x{size}_{epochs}_{timestamp}.h5' mode = 'dqn' if self.mode == SIMPLE else 'ddqn' self.model.save(path.format(mode=mode, size=self.game.grid_size, epochs=self.nb_epoch, timestamp=int(time()))) def play(self, nb_games=5, interval=.7): nb_wins = 0 accuracy = 0 summary = '{}\n\nAccuracy {:.2f}% | Game {}/{} | Wins {}' for epoch in range(nb_games): self.game.reset() self.clear_frames() done = False state_t = self.get_frames() self.print_state(summary, state_t[:,-1], accuracy, epoch, nb_games, nb_wins, 0) while(not done): q = self.model.predict(state_t) action = np.argmax(q[0]) _, _, done, is_victory = self.game.act(action) state_tn = self.get_frames() state_t = state_tn if is_victory: nb_wins += 1 accuracy = 100. * nb_wins / nb_games self.print_state(summary, state_t[:,-1], accuracy, epoch, nb_games, nb_wins, interval) def print_state(self, summary, state, accuracy, epoch, nb_games, nb_wins, interval): utils.clear_screen() print(summary.format(state, accuracy, epoch + 1, nb_games, nb_wins)) sleep(interval)
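# In DOUBLE mode the replay memory above is handed both networks because the Double DQN
# target decouples selection from evaluation: the online model picks the argmax action,
# the periodically synced target model scores it.  A minimal NumPy sketch of that target
# (names here are illustrative, not the ExperienceReplay internals):
import numpy as np


def double_dqn_targets(model, target_model, states, actions, rewards,
                       next_states, dones, discount=0.9):
    q = model.predict(states)                               # current estimates
    best_next = model.predict(next_states).argmax(axis=1)   # online net selects...
    next_q = target_model.predict(next_states)              # ...target net evaluates
    idx = np.arange(len(actions))
    q[idx, actions] = rewards + discount * next_q[idx, best_next] * (1.0 - dones)
    return q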
    'observation_loss': [],
    'reward_loss': [],
    'kl_loss': []
}

print("Initializing environment!")
# Initialise training environment and experience replay memory
env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length,
          args.action_repeat, args.bit_depth)
if args.load_experience:
    D = torch.load(os.path.join(results_dir, 'experience.pth'))
    metrics['steps'], metrics['episodes'] = [D.steps] * D.episodes, list(range(1, D.episodes + 1))
else:
    D = ExperienceReplay(args.experience_size, args.symbolic_env, env.observation_size,
                         env.action_size, args.bit_depth, args.device)
    # Initialise dataset D with S random seed episodes
    for s in range(1, args.seed_episodes + 1):
        observation, done, t = env.reset(), False, 0
        while not done:
            action = env.sample_random_action()
            next_observation, reward, done = env.step(action)
            D.append(observation, action, reward, done)
            observation = next_observation
            t += 1
        metrics['steps'].append(t * args.action_repeat +
                                (0 if len(metrics['steps']) == 0 else metrics['steps'][-1]))
        metrics['episodes'].append(s)

print("Initializing model parameters!")
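# The bit_depth argument handed to ExperienceReplay above refers to the PlaNet-style
# pixel preprocessing: observations are quantised to `bit_depth` bits, centred around
# zero and perturbed with uniform dequantisation noise.  A sketch of that transform
# (the actual helper lives elsewhere in these code bases):
import torch


def preprocess_observation(observation: torch.Tensor, bit_depth: int) -> torch.Tensor:
    """observation: uint8 image tensor; returns a float tensor roughly in [-0.5, 0.5]."""
    obs = observation.float()
    obs = torch.floor(obs / 2 ** (8 - bit_depth)) / 2 ** bit_depth - 0.5  # quantise and centre
    obs = obs + torch.rand_like(obs) / 2 ** bit_depth                     # dequantisation noise
    return obs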
    'actor_loss': [],
    'value_loss': []
}
summary_name = results_dir + "/{}_{}_log"

# Initialise training environment and experience replay memory
env = Env(args.env, args.symbolic, args.seed, args.max_episode_length,
          args.action_repeat, args.bit_depth)
args.observation_size, args.action_size = env.observation_size, env.action_size

# Initialise agent
agent = Dreamer(args)
D = ExperienceReplay(args.experience_size, args.symbolic, env.observation_size,
                     env.action_size, args.bit_depth, args.device)

# Initialise dataset D with S random seed episodes
for s in range(1, args.seed_episodes + 1):
    observation, done, t = env.reset(), False, 0
    while not done:
        action = env.sample_random_action()
        next_observation, reward, done = env.step(action)
        D.append(next_observation, action.cpu(), reward, done)  # here use the next_observation
        observation = next_observation
        t += 1
    metrics['env_steps'].append(t * args.action_repeat + (
        0 if len(metrics['env_steps']) == 0 else metrics['env_steps'][-1]))
    metrics['episodes'].append(s)

print("(random)episodes: {}, total_env_steps: {} ".format(
class Agent: def __init__(self, model, memory=None, memory_size=1000, nb_frames=None): assert len(model.output_shape) == 2, "Model's output shape should be (nb_samples, nb_actions)." if memory: self.memory = memory else: self.memory = ExperienceReplay(memory_size) if not nb_frames and not model.input_shape: raise Exception("Missing argument : nb_frames not provided") elif not nb_frames: nb_frames = model.input_shape[1] elif model.input_shape[1] and nb_frames and model.input_shape[1] != nb_frames: raise Exception("Dimension mismatch : time dimension of model should be equal to nb_frames.") self.model = model self.nb_frames = nb_frames self.frames = None @property def memory_size(self): return self.memory.memory_size @memory_size.setter def memory_size(self, value): self.memory.memory_size = value def reset_memory(self): self.exp_replay.reset_memory() def check_game_compatibility(self, game): game_output_shape = (1, None) + game.get_frame().shape if len(game_output_shape) != len(self.model.input_shape): raise Exception('Dimension mismatch. Input shape of the model should be compatible with the game.') else: for i in range(len(self.model.input_shape)): if self.model.input_shape[i] and game_output_shape[i] and self.model.input_shape[i] != game_output_shape[i]: raise Exception('Dimension mismatch. Input shape of the model should be compatible with the game.') if len(self.model.output_shape) != 2 or self.model.output_shape[1] != game.nb_actions: raise Exception('Output shape of model should be (nb_samples, nb_actions).') def get_game_data(self, game): frame = game.get_frame() if self.frames is None: self.frames = [frame] * self.nb_frames else: self.frames.append(frame) self.frames.pop(0) return np.expand_dims(self.frames, 0) def clear_frames(self): self.frames = None def action_count(self, game): #print "game.get_action_count: ", game.get_action_count return game.get_action_count # SET WHICH RUNS TO PRINT OUT HERE ***************************************************************** def report_action(self, game): return ((self.action_count(game) % self.report_freq) == 0) # and ((self.action_count(game) % self.report_freq) < 20) #% 10000) == 0 # def train(self, game, nb_epoch=1000, batch_size=50, gamma=0.9, epsilon=[1., .1], epsilon_rate=0.5, reset_memory=False, id=""): txt_store_path = "./txtstore/run_1000e_b50_15r_reg_lr1/junk/" printing = False record_weights = False self.max_moves = game.get_max_moves() self.report_freq = self.max_moves #50 '''fo_A = open(txt_store_path + "A.txt", "rw+") fo_G = open(txt_store_path + "G.txt", "rw+") fo_Gb = open(txt_store_path + "Gb.txt", "rw+") fo_I = open(txt_store_path + "I.txt", "rw+") fo_Q = open(txt_store_path + "Q.txt", "rw+") fo_R = open(txt_store_path + "R.txt", "rw+") fo_S = open(txt_store_path + "S.txt", "rw+") fo_T = open(txt_store_path + "T.txt", "rw+") fo_W = open(txt_store_path + "W.txt", "rw+") fo_Wb = open(txt_store_path + "Wb.txt", "rw+")''' self.check_game_compatibility(game) if type(epsilon) in {tuple, list}: delta = ((epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate)) final_epsilon = epsilon[1] epsilon = epsilon[0] else: final_epsilon = epsilon model = self.model nb_actions = model.output_shape[-1] win_count = 0 scores = np.zeros((nb_epoch,self.max_moves/self.report_freq)) losses = np.zeros((nb_epoch,self.max_moves/self.report_freq)) for epoch in range(nb_epoch): #ipdb.set_trace(context=9) # TRACING HERE ********************************************* loss = 0. 
game.reset() self.clear_frames() if reset_memory: self.reset_memory() game_over = False S = self.get_game_data(game) no_last_S = True plot_showing = False while not game_over: if np.random.random() < epsilon: a = int(np.random.randint(game.nb_actions)) #if (self.action_count(game) % 100000) == 0: '''if self.report_action(game): if printing: print "random", q = model.predict(S)''' q = model.predict(S) expected_action = (a == int(np.argmax(q[0]))) else: expected_action = True q = model.predict(S) #print q.shape #print q[0] # ************************************** CATCHING NANS '''if (q[0,0] != q[0,0]): ipdb.set_trace(context=9) # TRACING HERE ********************************************* ''' a = int(np.argmax(q[0])) #if (self.action_count(game) % 100000) == 0: prob = epsilon/game.nb_actions if expected_action: prob = 1 - epsilon + prob game.play(a, self.report_action(game)) r = game.get_score() #ipdb.set_trace(context=9) # TRACING HERE ********************************************* # PRINTING S HERE ****************************************************************** ''' if plot_showing: plt.clf() plt.imshow(np.reshape(S,(6,6))) plt.draw() plt.show(block=False) plot_showing = True print "hi" ''' # PRINTING S HERE ****************************************************************** S_prime = self.get_game_data(game) '''if self.report_action(game): if printing: print "S: ", S #if no_last_S: # last_S = S # no_last_S = False #else: # print "dS:", S - last_S # print " ==> Q(lS):", model.predict(last_S) #print print " ==> Q(S): ", q, " ==> A: ", a, " ==> R: %f" % r #print " ==> Q(S'):", model.predict(S_prime) #print fo_S.seek(0,2) np.savetxt(fo_S, S[0], fmt='%4.4f') # fo_Q.seek(0,2) np.savetxt(fo_Q, q, fmt='%4.4f') # fo_A.seek(0,2) fo_A.write(str(a)+"\n") #savetxt(fo, S[0], fmt='%4.4f') # fo_R.seek(0,2) fo_R.write(str(r)+"\n") ''' #ipdb.set_trace(context=9) # TRACING HERE ********************************************* #last_S = S game_over = game.is_over() transition = [S, a, r, S_prime, game_over, prob] self.memory.remember(*transition) S = S_prime batch = self.memory.get_batch(model=model, batch_size=batch_size, gamma=gamma, ruql=True) #, print_it=False) #self.report_action(game)) if batch: inputs, targets, probs = batch #print("model.total_loss: ", model.total_loss) '''if record_weights: weights_pre = model.get_weights() # GOT WEIGHTS ************************* #print "weights_pre" #print weights_pre if self.report_action(game): fo_W.seek(0,2) np.savetxt(fo_W, weights_pre[0], fmt='%4.4f') # fo_W.write("\n") fo_Wb.seek(0,2) np.savetxt(fo_Wb, weights_pre[1], fmt='%4.4f') # fo_Wb.write("\n")''' #output = model.train_on_batch(inputs, targets) #loss += float(output[0]) #model.train_on_batch(inputs, targets)) '''print "myAgent" print 'inputs: ', type(inputs), "; ", inputs.shape print 'targets: ', type(targets), "; ", targets.shape print 'probs: ', type(probs), "; ", probs.shape''' loss += float(model.train_on_batch(inputs, targets, probs=probs)) #if self.report_action(game): # #print output # #fo_G.seek(0,2) # #np.savetxt(fo_G, output[1], fmt='%4.4f') # # #fo_G.write("\n") # #fo_Gb.seek(0,2) # #np.savetxt(fo_Gb, output[2], fmt='%4.4f') # # #fo_Gb.write("\n") #weights_post = model.get_weights() # GOT WEIGHTS ******************************** #print "weights_post" #print weights_post #ipdb.set_trace() # TRACING HERE ********************************************* #print("action_count PRE: ", action_count) if self.report_action(game): action_count = self.action_count(game) #print("action_count/self.report_freq: 
", action_count/self.report_freq) #print("action_count: ", action_count) #print("self.report_freq: ", self.report_freq) #print("scores so far: ", scores) #print("scores.shape: ", scores.shape)''' while (action_count/self.report_freq > scores.shape[1]): scores = np.append(scores, np.zeros((nb_epoch,1)), 1) losses = np.append(losses, np.zeros((nb_epoch,1)), 1) scores[epoch, action_count/self.report_freq-1] = game.get_total_score() losses[epoch, action_count/self.report_freq-1] = loss #print ("running a batch (of %d): 1: %d; 2: %d" % (len(batch), batch[0].size, \ # batch[1].size)) #print "memory size: ", self.memory_size #print "using memory\n", inputs, "; tgt: ", targets #fo_I.seek(0,2) #np.savetxt(fo_I, inputs[0], fmt='%4.4f') # #fo_T.seek(0,2) #np.savetxt(fo_T, targets, fmt='%4.4f') # #fo_T.write("\n") if game.is_won(): win_count += 1 if epsilon > final_epsilon: epsilon -= delta if (epoch % 50) == 0: print("Epoch {:03d}/{:03d} | Loss {:.4f} | Epsilon {:.2f} | Win count {}".format(epoch + 1, nb_epoch, loss, epsilon, win_count)) pickle.dump(scores, open(txt_store_path + "score" + id + ".p", "wb" ) ) pickle.dump(losses, open(txt_store_path + "loss" + id + ".p", "wb" ) ) ''' fo_A.close() fo_G.close() fo_Gb.close() fo_I.close() fo_Q.close() fo_R.close() fo_S.close() fo_T.close() fo_W.close() fo_Wb.close()''' average_taken_over = 10 last_col = self.max_moves/self.report_freq -1 fo_log = open("log.txt", "rw+") fo_log.seek(0,2) average_score = np.mean(scores[-average_taken_over:nb_epoch, last_col]) average_error = np.mean(losses[-average_taken_over:nb_epoch, last_col]) fo_log.write("\n{:20}|{:^12}|{:^10}|{:^10}|{:^6}|{:^12}|{:^12}|{:^12}{:^6}{:^6}|{:^10}|{:^20}|{:^10}|{:^6}".format(" ", "game moves", "avg score", "error", "WC", "epochs", "batch size", "epsiln frm", ".. to", ".. by", "lr", "desciption", "timer", "reg")) fo_log.write("\n{:<20}|{:^12d}|{:^10.2f}|{:^10.2f}|{:^6d}|".format(time.strftime("%d/%m/%Y %H:%M"), self.max_moves, \ average_score, average_error, win_count)) #average_taken_over, fo_log.close() def play(self, game, nb_epoch=1, epsilon=0., visualize=False): self.check_game_compatibility(game) model = self.model win_count = 0 frames = [] for epoch in range(nb_epoch): game.reset() self.clear_frames() S = self.get_game_data(game) if visualize: frames.append(game.draw()) game_over = False while not game_over: if np.random.rand() < epsilon: print("random") action = int(np.random.randint(0, game.nb_actions)) else: q = model.predict(S) action = int(np.argmax(q[0])) game.play(action) S = self.get_game_data(game) if visualize: frames.append(game.draw()) game_over = game.is_over() if game.is_won(): win_count += 1 print("Accuracy {} %".format(100. * win_count / nb_epoch)) if visualize: if 'images' not in os.listdir('.'): os.mkdir('images') for i in range(len(frames)): plt.imshow(frames[i], interpolation='none') plt.savefig("images/" + game.name + str(i) + ".png")
    'test_rewards': [],
    'test_Qs': []
}

# Environment
env = AtariEnv(args)
env.train()

# Agent and memory
if args.algorithm == 'MFEC':
    agent = MFECAgent(args, env.observation_space.shape, env.action_space.n, env.hash_space.shape[0])
elif args.algorithm == 'NEC':
    agent = NECAgent(args, env.observation_space.shape, env.action_space.n, env.hash_space.shape[0])
mem = ExperienceReplay(args.memory_capacity, env.observation_space.shape, args.device)

# Construct validation memory
val_mem = ExperienceReplay(args.evaluation_size, env.observation_space.shape, args.device)

T, done, states = 0, True, []
# Store transition data in episodic buffers
while T < args.evaluation_size:
    if done:
        state, done = env.reset(), False
    states.append(state.cpu().numpy())  # Append transition data to episodic buffers
    state, _, done = env.step(env.action_space.sample())
    T += 1
val_mem.append_batch(np.stack(states),
                     np.zeros((args.evaluation_size, ), dtype=np.int64),
                     np.zeros((args.evaluation_size, ), dtype=np.float32))
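# --- Added sketch (assumption, not part of the original script) --------------
# `val_mem` above is filled with states gathered under a random policy. A common
# use for such a fixed held-out set is to track the average max-Q over it as a
# learning-progress signal, since the states themselves never change. A minimal
# sketch under that assumption; `q_network` is a hypothetical callable mapping a
# batch of states to Q-values.
import torch

@torch.no_grad()
def average_max_q(q_network, states, batch_size=256):
    """Average of max_a Q(s, a) over a fixed tensor of held-out states."""
    total, count = 0.0, 0
    for start in range(0, len(states), batch_size):
        batch = states[start:start + batch_size]
        q = q_network(batch)                      # shape: [batch, n_actions]
        total += q.max(dim=1).values.sum().item()
        count += len(batch)
    return total / max(count, 1)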
def train(args: argparse.Namespace, env: Env, D: ExperienceReplay, models: Tuple[nn.Module, nn.Module, nn.Module, nn.Module], optimizer: Tuple[optim.Optimizer, optim.Optimizer], param_list: List[nn.parameter.Parameter], planner: nn.Module): # auxilliary tensors global_prior = Normal( torch.zeros(args.batch_size, args.state_size, device=args.device), torch.ones(args.batch_size, args.state_size, device=args.device) ) # Global prior N(0, I) # Allowed deviation in KL divergence free_nats = torch.full((1, ), args.free_nats, dtype=torch.float32, device=args.device) summary_writter = SummaryWriter(args.tensorboard_dir) # unpack models transition_model, observation_model, reward_model, encoder = models transition_optimizer, reward_optimizer = optimizer for idx_episode in trange(args.episodes, leave=False, desc="Episode"): for idx_train in trange(args.collect_interval, leave=False, desc="Training"): # Draw sequence chunks {(o[t], a[t], r[t+1], z[t+1])} ~ D uniformly at random from the dataset # The first two dimensions of the tensors are L (chunk size) and n (batch size) # We want to use o[t+1] to correct the error of the transition model, # so we need to convert the sequence to {(o[t+1], a[t], r[t+1], z[t+1])} observations, actions, rewards_dist, rewards_coll, nonterminals = D.sample(args.batch_size, args.chunk_size) # Create initial belief and state for time t = 0 init_belief = torch.zeros(args.batch_size, args.belief_size, device=args.device) init_state = torch.zeros(args.batch_size, args.state_size, device=args.device) # Transition model forward # deterministic: h[t+1] = f(h[t], a[t]) # prior: s[t+1] ~ Prob(s|h[t+1]) # posterior: s[t+1] ~ Prob(s|h[t+1], o[t+1]) beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model( init_state, actions[:-1], init_belief, bottle(encoder, (observations[1:], )), nonterminals[:-1] ) # observation loss predictions = bottle(observation_model, (beliefs, posterior_states)) visual_loss = F.mse_loss( predictions[:, :, :3*64*64], observations[1:, :, :3*64*64] ).mean() symbol_loss = F.mse_loss( predictions[:, :, 3*64*64:], observations[1:, :, 3*64*64:] ).mean() observation_loss = visual_loss + symbol_loss # KL divergence loss. Minimize the difference between posterior and prior kl_loss = torch.max( kl_divergence( Normal(posterior_means, posterior_std_devs), Normal(prior_means, prior_std_devs) ).sum(dim=2), free_nats ).mean(dim=(0, 1)) # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out if args.global_kl_beta != 0: kl_loss += args.global_kl_beta * kl_divergence( Normal(posterior_means, posterior_std_devs), global_prior ).sum(dim=2).mean(dim=(0, 1)) # overshooting loss if args.overshooting_kl_beta != 0: overshooting_vars = [] # Collect variables for overshooting to process in batch for t in range(1, args.chunk_size - 1): d = min(t + args.overshooting_distance, args.chunk_size - 1) # Overshooting distance # Use t_ and d_ to deal with different time indexing for latent states t_, d_ = t - 1, d - 1 # Calculate sequence padding so overshooting terms can be calculated in one batch seq_pad = (0, 0, 0, 0, 0, t - d + args.overshooting_distance) # Store # * a[t:d], # * z[t+1:d+1] # * r[t+1:d+1] # * h[t] # * s[t] prior # * E[s[t:d]] posterior # * Var[s[t:d]] posterior # * mask: # the last few sequences do not have enough length, # so we pad it with 0 to the same length as previous sequence for batch operation, # and use mask to indicate invalid variables. 
overshooting_vars.append( (F.pad(actions[t:d], seq_pad), F.pad(nonterminals[t:d], seq_pad), F.pad(rewards_dist[t:d], seq_pad[2:]), beliefs[t_], prior_states[t_], F.pad(posterior_means[t_ + 1:d_ + 1].detach(), seq_pad), F.pad(posterior_std_devs[t_ + 1:d_ + 1].detach(), seq_pad, value=1), F.pad(torch.ones(d - t, args.batch_size, args.state_size, device=args.device), seq_pad) ) ) # Posterior standard deviations must be padded with > 0 to prevent infinite KL divergences overshooting_vars = tuple(zip(*overshooting_vars)) # Update belief/state using prior from previous belief/state and previous action (over entire sequence at once) beliefs, prior_states, prior_means, prior_std_devs = transition_model( torch.cat(overshooting_vars[4], dim=0), torch.cat(overshooting_vars[0], dim=1), torch.cat(overshooting_vars[3], dim=0), None, torch.cat(overshooting_vars[1], dim=1) ) seq_mask = torch.cat(overshooting_vars[7], dim=1) # Calculate overshooting KL loss with sequence mask kl_loss += (1 / args.overshooting_distance) * args.overshooting_kl_beta * torch.max( (kl_divergence( Normal(torch.cat(overshooting_vars[5], dim=1), torch.cat(overshooting_vars[6], dim=1)), Normal(prior_means, prior_std_devs) ) * seq_mask).sum(dim=2), free_nats ).mean(dim=(0, 1)) * (args.chunk_size - 1) # Update KL loss (compensating for extra average over each overshooting/open loop sequence) # TODO: add learning rate schedule # Update model parameters transition_optimizer.zero_grad() loss = observation_loss * 200 + kl_loss loss.backward() nn.utils.clip_grad_norm_(param_list, args.grad_clip_norm, norm_type=2) transition_optimizer.step() # reward loss rewards_dist_predict, rewards_coll_predict = bottle(reward_model.raw, (beliefs.detach(), posterior_states.detach())) reward_loss = F.mse_loss( rewards_dist_predict, rewards_dist[:-1], reduction='mean' ) + F.binary_cross_entropy( rewards_coll_predict, rewards_coll[:-1], reduction='mean' ) reward_optimizer.zero_grad() reward_loss.backward() reward_optimizer.step() # add tensorboard log global_step = idx_train + idx_episode * args.collect_interval summary_writter.add_scalar("observation_loss", observation_loss, global_step) summary_writter.add_scalar("reward_loss", reward_loss, global_step) summary_writter.add_scalar("kl_loss", kl_loss, global_step) for idx_collect in trange(1, leave=False, desc="Collecting"): experience = collect_experience(args, env, models, planner, True, desc="Collecting experience {}".format(idx_collect)) T = len(experience["observation"]) for idx_step in range(T): D.append(experience["observation"][idx_step], experience["action"][idx_step], experience["reward_dist"][idx_step], experience["reward_coll"][idx_step], experience["done"][idx_step]) # Checkpoint models if (idx_episode + 1) % args.checkpoint_interval == 0: record_path = os.path.join(args.checkpoint_dir, "checkpoint") checkpoint_path = os.path.join(args.checkpoint_dir, 'models_%d.pth' % (idx_episode+1)) torch.save( { 'transition_model': transition_model.state_dict(), 'observation_model': observation_model.state_dict(), 'reward_model': reward_model.state_dict(), 'encoder': encoder.state_dict(), 'transition_optimizer': transition_optimizer.state_dict(), 'reward_optimizer': reward_optimizer.state_dict() }, checkpoint_path) with open(record_path, "w") as f: f.write('models_%d.pth' % (idx_episode+1)) planner.save(os.path.join(args.torchscript_dir, "mpc_planner.pth")) transition_model.save(os.path.join(args.torchscript_dir, "transition_model.pth")) reward_model.save(os.path.join(args.torchscript_dir, 
"reward_model.pth")) observation_model.save(os.path.join(args.torchscript_dir, "observation_decoder.pth")) encoder.save(os.path.join(args.torchscript_dir, "observation_encoder.pth")) summary_writter.close()
class Initializer(): def __init__(self): self.parms = Parameters() self.results_dir = os.path.join(self.parms.results_path) self.dataset_path = os.path.join(self.parms.results_path, 'dataset/') os.makedirs(self.dataset_path, exist_ok=True) self.metrics = { 'steps': [], 'episodes': [], 'train_rewards': [], 'predicted_rewards': [], 'test_episodes': [], 'test_rewards': [], 'observation_loss': [], 'reward_loss': [], 'kl_loss': [], 'regularizer_loss': [] } os.makedirs(self.results_dir, exist_ok=True) ## Setting cuda options if torch.cuda.is_available() and self.parms.use_cuda: self.parms.device = torch.device('cuda') torch.cuda.set_device(self.parms.gpu_id) print("Using gpu: ", torch.cuda.current_device()) else: self.parms.device = torch.device('cpu') self.use_cuda = False print("Work on: ", self.parms.device) # Initilize buffer experience replay self.env = ControlSuiteEnv(self.parms.env_name, self.parms.seed, self.parms.max_episode_length, self.parms.bit_depth) self.D = ExperienceReplay(self.parms.ex_replay_buff_size, self.env.observation_size, self.env.action_size, self.parms.bit_depth, self.parms.device) if self.parms.seed > 0: self.set_seed() self.trainer = Trainer(self.parms, self.D, self.metrics, self.results_dir, self.env) self.init_exp_rep() # Start Training print("Total training episodes: ", self.parms.training_episodes, " Buffer sampling: ", self.parms.collect_interval) self.trainer.train_models() print("END.") def set_seed(self): print("Setting seed") os.environ['PYTHONHASHSEED'] = str(self.parms.seed) random.seed(self.parms.seed) np.random.seed(self.parms.seed) torch.manual_seed(self.parms.seed) if self.parms.use_cuda: torch.cuda.manual_seed(self.parms.seed) #torch.backends.cudnn.enabled=False # This makes the training slower #torch.backends.cudnn.deterministic=True # This makes the training slower # Init buffer experience replay def init_exp_rep(self): print("Starting initialization buffer.") for s in tqdm(range(1, self.parms.num_init_episodes + 1)): observation, done, t = self.env.reset(), False, 0 while not done: action = self.env.sample_random_action() next_observation, reward, done = self.env.step(action) self.D.append(observation, action, reward, done) observation = next_observation t += 1 self.metrics['steps'].append(t * self.env.action_repeat + (0 if len(self.metrics['steps']) == 0 else self.metrics['steps'][-1])) self.metrics['episodes'].append(s)
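# --- Added sketch (PlaNet-style observation preprocessing) -------------------
# `bit_depth` above controls how raw 8-bit pixel observations are quantised
# before being stored in the replay buffer; the same recipe appears, commented
# out, in the Dreamer code further below. A minimal sketch of that
# quantise/dequantise step (a non-in-place variant, for illustration only).
import torch

def preprocess_observation(obs, bit_depth=5):
    """Quantise an image from 8 bits to `bit_depth` bits and centre it to [-0.5, 0.5].

    obs: float tensor with values in [0, 255]. Uniform noise of one quantisation
    bin is added so the discrete pixels approximately match the density of a
    continuous image model.
    """
    obs = obs.div(2 ** (8 - bit_depth)).floor().div(2 ** bit_depth).sub(0.5)
    return obs + torch.rand_like(obs).div(2 ** bit_depth)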
class Dreamer(Agent): # The agent has its own replay buffer, update, act def __init__(self, args): """ All paras are passed by args :param args: a dict that includes parameters """ super().__init__() self.args = args # Initialise model parameters randomly self.transition_model = TransitionModel( args.belief_size, args.state_size, args.action_size, args.hidden_size, args.embedding_size, args.dense_act).to(device=args.device) self.observation_model = ObservationModel( args.symbolic, args.observation_size, args.belief_size, args.state_size, args.embedding_size, activation_function=(args.dense_act if args.symbolic else args.cnn_act)).to(device=args.device) self.reward_model = RewardModel(args.belief_size, args.state_size, args.hidden_size, args.dense_act).to(device=args.device) self.encoder = Encoder(args.symbolic, args.observation_size, args.embedding_size, args.cnn_act).to(device=args.device) self.actor_model = ActorModel( args.action_size, args.belief_size, args.state_size, args.hidden_size, activation_function=args.dense_act, fix_speed=args.fix_speed, throttle_base=args.throttle_base).to(device=args.device) self.value_model = ValueModel(args.belief_size, args.state_size, args.hidden_size, args.dense_act).to(device=args.device) self.value_model2 = ValueModel(args.belief_size, args.state_size, args.hidden_size, args.dense_act).to(device=args.device) self.pcont_model = PCONTModel(args.belief_size, args.state_size, args.hidden_size, args.dense_act).to(device=args.device) self.target_value_model = deepcopy(self.value_model) self.target_value_model2 = deepcopy(self.value_model2) for p in self.target_value_model.parameters(): p.requires_grad = False for p in self.target_value_model2.parameters(): p.requires_grad = False # setup the paras to update self.world_param = list(self.transition_model.parameters())\ + list(self.observation_model.parameters())\ + list(self.reward_model.parameters())\ + list(self.encoder.parameters()) if args.pcont: self.world_param += list(self.pcont_model.parameters()) # setup optimizer self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr) self.actor_optimizer = optim.Adam(self.actor_model.parameters(), lr=args.actor_lr) self.value_optimizer = optim.Adam(list(self.value_model.parameters()) + list(self.value_model2.parameters()), lr=args.value_lr) # setup the free_nat to self.free_nats = torch.full( (1, ), args.free_nats, dtype=torch.float32, device=args.device) # Allowed deviation in KL divergence # TODO: change it to the new replay buffer, in buffer.py self.D = ExperienceReplay(args.experience_size, args.symbolic, args.observation_size, args.action_size, args.bit_depth, args.device) if self.args.auto_temp: # setup for learning of alpha term (temp of the entropy term) self.log_temp = torch.zeros(1, requires_grad=True, device=args.device) self.target_entropy = -np.prod( args.action_size if not args.fix_speed else self.args. action_size - 1).item() # heuristic value from SAC paper self.temp_optimizer = optim.Adam( [self.log_temp], lr=args.value_lr) # use the same value_lr # TODO: print out the param used in Dreamer # var_counts = tuple(count_vars(module) for module in [self., self.ac.q1, self.ac.q2]) # print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # def process_im(self, image, image_size=None, rgb=None): # # Resize, put channel first, convert it to a tensor, centre it to [-0.5, 0.5] and add batch dimenstion. 
# # def preprocess_observation_(observation, bit_depth): # # Preprocesses an observation inplace (from float32 Tensor [0, 255] to [-0.5, 0.5]) # observation.div_(2 ** (8 - bit_depth)).floor_().div_(2 ** bit_depth).sub_( # 0.5) # Quantise to given bit depth and centre # observation.add_(torch.rand_like(observation).div_( # 2 ** bit_depth)) # Dequantise (to approx. match likelihood of PDF of continuous images vs. PMF of discrete images) # # image = image[40:, :, :] # clip the above 40 rows # image = torch.tensor(cv2.resize(image, (40, 40), interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1), # dtype=torch.float32) # Resize and put channel first # # preprocess_observation_(image, self.args.bit_depth) # return image.unsqueeze(dim=0) def process_im(self, images, image_size=None, rgb=None): images = cv2.resize(images, (40, 40)) images = np.dot(images, [0.299, 0.587, 0.114]) obs = torch.tensor(images, dtype=torch.float32).div_(255.).sub_(0.5).unsqueeze( dim=0) # shape [1, 40, 40], range:[-0.5,0.5] return obs.unsqueeze(dim=0) # add batch dimension def append_buffer(self, new_traj): # append new collected trajectory, not implement the data augmentation # shape of new_traj: [(o, a, r, d) * steps] for state in new_traj: observation, action, reward, done = state self.D.append(observation, action.cpu(), reward, done) def _compute_loss_world(self, state, data): # unpackage data beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = state observations, rewards, nonterminals = data # observation_loss = F.mse_loss( # bottle(self.observation_model, (beliefs, posterior_states)), # observations[1:], # reduction='none').sum(dim=2 if self.args.symbolic else (2, 3, 4)).mean(dim=(0, 1)) # # reward_loss = F.mse_loss( # bottle(self.reward_model, (beliefs, posterior_states)), # rewards[1:], # reduction='none').mean(dim=(0,1)) observation_loss = F.mse_loss( bottle(self.observation_model, (beliefs, posterior_states)), observations, reduction='none').sum( dim=2 if self.args.symbolic else (2, 3, 4)).mean(dim=(0, 1)) reward_loss = F.mse_loss(bottle(self.reward_model, (beliefs, posterior_states)), rewards, reduction='none').mean(dim=(0, 1)) # TODO: 5 # transition loss kl_loss = torch.max( kl_divergence( Independent(Normal(posterior_means, posterior_std_devs), 1), Independent(Normal(prior_means, prior_std_devs), 1)), self.free_nats).mean(dim=(0, 1)) # print("check the reward", bottle(pcont_model, (beliefs, posterior_states)).shape, nonterminals[:-1].shape) if self.args.pcont: pcont_loss = F.binary_cross_entropy( bottle(self.pcont_model, (beliefs, posterior_states)), nonterminals) # pcont_pred = torch.distributions.Bernoulli(logits=bottle(self.pcont_model, (beliefs, posterior_states))) # pcont_loss = -pcont_pred.log_prob(nonterminals[1:]).mean(dim=(0, 1)) return observation_loss, self.args.reward_scale * reward_loss, kl_loss, ( self.args.pcont_scale * pcont_loss if self.args.pcont else 0) def _compute_loss_actor(self, imag_beliefs, imag_states, imag_ac_logps=None): # reward and value prediction of imagined trajectories imag_rewards = bottle(self.reward_model, (imag_beliefs, imag_states)) imag_values = bottle(self.value_model, (imag_beliefs, imag_states)) imag_values2 = bottle(self.value_model2, (imag_beliefs, imag_states)) imag_values = torch.min(imag_values, imag_values2) with torch.no_grad(): if self.args.pcont: pcont = bottle(self.pcont_model, (imag_beliefs, imag_states)) else: pcont = self.args.discount * torch.ones_like(imag_rewards) pcont = pcont.detach() if 
imag_ac_logps is not None: imag_values[ 1:] -= self.args.temp * imag_ac_logps # add entropy here returns = cal_returns(imag_rewards[:-1], imag_values[:-1], imag_values[-1], pcont[:-1], lambda_=self.args.disclam) discount = torch.cumprod( torch.cat([torch.ones_like(pcont[:1]), pcont[:-2]], 0), 0) discount = discount.detach() assert list(discount.size()) == list(returns.size()) actor_loss = -torch.mean(discount * returns) return actor_loss def _compute_loss_critic(self, imag_beliefs, imag_states, imag_ac_logps=None): with torch.no_grad(): # calculate the target with the target nn target_imag_values = bottle(self.target_value_model, (imag_beliefs, imag_states)) target_imag_values2 = bottle(self.target_value_model2, (imag_beliefs, imag_states)) target_imag_values = torch.min(target_imag_values, target_imag_values2) imag_rewards = bottle(self.reward_model, (imag_beliefs, imag_states)) if self.args.pcont: pcont = bottle(self.pcont_model, (imag_beliefs, imag_states)) else: pcont = self.args.discount * torch.ones_like(imag_rewards) # print("check pcont", pcont) if imag_ac_logps is not None: target_imag_values[1:] -= self.args.temp * imag_ac_logps returns = cal_returns(imag_rewards[:-1], target_imag_values[:-1], target_imag_values[-1], pcont[:-1], lambda_=self.args.disclam) target_return = returns.detach() value_pred = bottle(self.value_model, (imag_beliefs, imag_states))[:-1] value_pred2 = bottle(self.value_model2, (imag_beliefs, imag_states))[:-1] value_loss = F.mse_loss(value_pred, target_return, reduction="none").mean(dim=(0, 1)) value_loss2 = F.mse_loss(value_pred2, target_return, reduction="none").mean(dim=(0, 1)) value_loss += value_loss2 return value_loss def _latent_imagination(self, beliefs, posterior_states, with_logprob=False): # Rollout to generate imagined trajectories chunk_size, batch_size, _ = list( posterior_states.size()) # flatten the tensor flatten_size = chunk_size * batch_size posterior_states = posterior_states.detach().reshape(flatten_size, -1) beliefs = beliefs.detach().reshape(flatten_size, -1) imag_beliefs, imag_states, imag_ac_logps = [beliefs ], [posterior_states], [] for i in range(self.args.planning_horizon): imag_action, imag_ac_logp = self.actor_model( imag_beliefs[-1].detach(), imag_states[-1].detach(), deterministic=False, with_logprob=with_logprob, ) imag_action = imag_action.unsqueeze(dim=0) # add time dim # print(imag_states[-1].shape, imag_action.shape, imag_beliefs[-1].shape) imag_belief, imag_state, _, _ = self.transition_model( imag_states[-1], imag_action, imag_beliefs[-1]) imag_beliefs.append(imag_belief.squeeze(dim=0)) imag_states.append(imag_state.squeeze(dim=0)) if with_logprob: imag_ac_logps.append(imag_ac_logp.squeeze(dim=0)) imag_beliefs = torch.stack(imag_beliefs, dim=0).to( self.args.device ) # shape [horizon+1, (chuck-1)*batch, belief_size] imag_states = torch.stack(imag_states, dim=0).to(self.args.device) if with_logprob: imag_ac_logps = torch.stack(imag_ac_logps, dim=0).to( self.args.device) # shape [horizon, (chuck-1)*batch] return imag_beliefs, imag_states, imag_ac_logps if with_logprob else None def update_parameters(self, gradient_steps): loss_info = [] # used to record loss for s in tqdm(range(gradient_steps)): # get state and belief of samples observations, actions, rewards, nonterminals = self.D.sample( self.args.batch_size, self.args.chunk_size) # print("check sampled rewrads", rewards) init_belief = torch.zeros(self.args.batch_size, self.args.belief_size, device=self.args.device) init_state = torch.zeros(self.args.batch_size, 
self.args.state_size, device=self.args.device) # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once) # beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model( # init_state, # actions[:-1], # init_belief, # bottle(self.encoder, (observations[1:], )), # nonterminals[:-1]) beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model( init_state, actions, init_belief, bottle(self.encoder, (observations, )), nonterminals) # TODO: 4 # update paras of world model world_model_loss = self._compute_loss_world( state=(beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs), data=(observations, rewards, nonterminals)) observation_loss, reward_loss, kl_loss, pcont_loss = world_model_loss self.world_optimizer.zero_grad() (observation_loss + reward_loss + kl_loss + pcont_loss).backward() nn.utils.clip_grad_norm_(self.world_param, self.args.grad_clip_norm, norm_type=2) self.world_optimizer.step() # freeze params to save memory for p in self.world_param: p.requires_grad = False for p in self.value_model.parameters(): p.requires_grad = False for p in self.value_model2.parameters(): p.requires_gard = False # latent imagination imag_beliefs, imag_states, imag_ac_logps = self._latent_imagination( beliefs, posterior_states, with_logprob=self.args.with_logprob) # update temp if self.args.auto_temp: temp_loss = -( self.log_temp * (imag_ac_logps[0] + self.target_entropy).detach()).mean() self.temp_optimizer.zero_grad() temp_loss.backward() self.temp_optimizer.step() self.args.temp = self.log_temp.exp() # update actor actor_loss = self._compute_loss_actor(imag_beliefs, imag_states, imag_ac_logps=imag_ac_logps) self.actor_optimizer.zero_grad() actor_loss.backward() nn.utils.clip_grad_norm_(self.actor_model.parameters(), self.args.grad_clip_norm, norm_type=2) self.actor_optimizer.step() for p in self.world_param: p.requires_grad = True for p in self.value_model.parameters(): p.requires_grad = True for p in self.value_model2.parameters(): p.requires_grad = True # update critic imag_beliefs = imag_beliefs.detach() imag_states = imag_states.detach() critic_loss = self._compute_loss_critic( imag_beliefs, imag_states, imag_ac_logps=imag_ac_logps) self.value_optimizer.zero_grad() critic_loss.backward() nn.utils.clip_grad_norm_(self.value_model.parameters(), self.args.grad_clip_norm, norm_type=2) nn.utils.clip_grad_norm_(self.value_model2.parameters(), self.args.grad_clip_norm, norm_type=2) self.value_optimizer.step() loss_info.append([ observation_loss.item(), reward_loss.item(), kl_loss.item(), pcont_loss.item() if self.args.pcont else 0, actor_loss.item(), critic_loss.item() ]) # finally, update target value function every #gradient_steps with torch.no_grad(): self.target_value_model.load_state_dict( self.value_model.state_dict()) with torch.no_grad(): self.target_value_model2.load_state_dict( self.value_model2.state_dict()) return loss_info def infer_state(self, observation, action, belief=None, state=None): """ Infer belief over current state q(s_t|o≤t,a<t) from the history, return updated belief and posterior_state at time t returned shape: belief/state [belief/state_dim] (remove the time_dim) """ # observation is obs.to(device), action.shape=[act_dim] (will add time dim inside this fn), belief.shape belief, _, _, _, posterior_state, _, _ = 
self.transition_model( state, action.unsqueeze(dim=0), belief, self.encoder(observation).unsqueeze( dim=0)) # Action and observation need extra time dimension belief, posterior_state = belief.squeeze( dim=0), posterior_state.squeeze( dim=0) # Remove time dimension from belief/state return belief, posterior_state def select_action(self, state, deterministic=False): # get action with the inputs get from fn: infer_state; return a numpy with shape [batch, act_size] belief, posterior_state = state action, _ = self.actor_model(belief, posterior_state, deterministic=deterministic, with_logprob=False) if not deterministic and not self.args.with_logprob: print("e") action = Normal(action, self.args.expl_amount).rsample() # clip the angle action[:, 0].clamp_(min=self.args.angle_min, max=self.args.angle_max) # clip the throttle if self.args.fix_speed: action[:, 1] = self.args.throttle_base else: action[:, 1].clamp_(min=self.args.throttle_min, max=self.args.throttle_max) print("action", action) # return action.cup().numpy() return action # this is a Tonsor.cuda def import_parameters(self, params): # only import or export the parameters used when local rollout self.encoder.load_state_dict(params["encoder"]) self.actor_model.load_state_dict(params["policy"]) self.transition_model.load_state_dict(params["transition"]) def export_parameters(self): """ return the model paras used for local rollout """ params = { "encoder": self.encoder.cpu().state_dict(), "policy": self.actor_model.cpu().state_dict(), "transition": self.transition_model.cpu().state_dict() } self.encoder.to(self.args.device) self.actor_model.to(self.args.device) self.transition_model.to(self.args.device) return params
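# --- Added sketch (assumed behaviour of cal_returns) -------------------------
# The actor and critic losses above rely on `cal_returns` to compute
# Dreamer-style lambda-returns over imagined trajectories. The body below is an
# assumption about what that helper does (the usual recursion
# R_t = r_t + pcont_t * ((1 - lambda) * v_{t+1} + lambda * R_{t+1})), not a copy
# of the original implementation.
import torch

def lambda_returns(rewards, values, bootstrap, pcont, lambda_=0.95):
    """Lambda-returns over an imagined rollout.

    rewards, values, pcont: [horizon, batch]; bootstrap: [batch].
    The bootstrap value closes the recursion at the imagination horizon.
    """
    next_values = torch.cat([values[1:], bootstrap.unsqueeze(0)], dim=0)
    inputs = rewards + pcont * next_values * (1.0 - lambda_)
    last = bootstrap
    outputs = []
    for t in reversed(range(rewards.shape[0])):
        last = inputs[t] + pcont[t] * lambda_ * last
        outputs.append(last)
    return torch.stack(list(reversed(outputs)), dim=0)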
class Agent: def __init__(self, model, memory=None, memory_size=500, nb_frames=None): assert len( model.get_output_shape_at(0) ) == 2, "Model's output shape should be (nb_samples, nb_actions)." if memory: self.memory = memory else: self.memory = ExperienceReplay(memory_size) if not nb_frames and not model.get_input_shape_at(0)[1]: raise Exception("Missing argument : nb_frames not provided") elif not nb_frames: nb_frames = model.get_input_shape_at(0)[1] elif model.get_input_shape_at( 0 )[1] and nb_frames and model.get_input_shape_at(0)[1] != nb_frames: raise Exception( "Dimension mismatch : time dimension of model should be equal to nb_frames." ) self.model = model self.nb_frames = nb_frames self.frames = None @property def memory_size(self): return self.memory.memory_size @memory_size.setter def memory_size(self, value): self.memory.memory_size = value def reset_memory(self): self.exp_replay.reset_memory() def check_game_compatibility(self, game): #if len(self.model.input_layers_node_indices) != 1: #raise Exception('Multi node input is not supported.') game_output_shape = (1, None) + game.get_frame().shape if len(game_output_shape) != len(self.model.get_input_shape_at(0)): raise Exception( 'Dimension mismatch. Input shape of the model should be compatible with the game.' ) else: for i in range(len(self.model.get_input_shape_at(0))): if self.model.get_input_shape_at(0)[i] and game_output_shape[ i] and self.model.get_input_shape_at( 0)[i] != game_output_shape[i]: raise Exception( 'Dimension mismatch. Input shape of the model should be compatible with the game.' ) if len( self.model.get_output_shape_at(0) ) != 2 or self.model.get_output_shape_at(0)[1] != game.nb_actions: raise Exception( 'Output shape of model should be (nb_samples, nb_actions).') def get_game_data(self, game): frame = game.get_frame() if self.frames is None: self.frames = [frame] * self.nb_frames else: self.frames.append(frame) self.frames.pop(0) return np.expand_dims(self.frames, 0) def clear_frames(self): self.frames = None def train(self, game, nb_epoch=1000, batch_size=50, gamma=0.9, epsilon=[1., .1], epsilon_rate=0.5, reset_memory=False, observe=0, checkpoint=None, total_sessions=0, session_id=1): self.check_game_compatibility(game) ts = int(time.time()) #fn = "gold-{}.csv".format(ts) #fn = "9nyc-250-1000-epr8-heat-adam.csv" #fn = "400-rl-nopool.csv" fn = "3-normal.csv" fn2 = "heat.csv" #advice_type = "OA" advice_type = "OA" meta_advice_type = "HFHA" #meta_feedback_frequency = 0.1 #meta_feedback_frequency = 0.5 #HF!!! meta_feedback_frequency = 0.1 #LF!!! 
heatmap = [[0] * 20 for i in range(20)] if session_id == 1: advice_type = "OA" if session_id == 2: advice_type = "NA" if session_id == 3: advice_type = "RL" # print(heatmap) # with open("dummyheat.csv",'a') as f2: # csvWriter = csv.writer(f2,delimiter=',') # csvWriter.writerows(heatmap) # if ( session_id >= 3 and session_id < 5 ): # print("Switching to HFLA") # meta_advice_type = "HFLA" # #meta_feedback_frequency = 0.1 # elif ( session_id >= 5 and session_id < 7 ): # print("Switching to LFHA") # meta_feedback_frequency = 0.1 # meta_advice_type = "LFHA" # elif ( session_id >= 7 and session_id < 9 ): # print("Switching to LFLA") # meta_advice_type = "LFLA" # elif ( session_id >= 9 and session_id < 11 ): # advice_type = "OA" # print("Switching to NA HFLA") # meta_advice_type = "HFLA" # meta_feedback_frequency = 0.5 # elif ( session_id >= 11 and session_id < 13 ): # print("Switching to NA HFLA") # meta_advice_type = "HFLA" # #meta_feedback_frequency = 0.1 # elif ( session_id >= 13 and session_id < 15 ): # print("Switching to NA LFHA") # meta_feedback_frequency = 0.1 # meta_advice_type = "LFHA" # elif ( session_id >= 15 and session_id < 17 ): # print("Switching to NA LFLA") # meta_advice_type = "LFLA" # if ( session_id >= 2 and session_id < 3 ): # meta_feedback_frequency = 0.1 # print("Switching to LFHA") # advice_type = "OA" # meta_advice_type = "LFHA" # meta_feedback_frequency = 0.1 # elif ( session_id >= 3 and session_id < 4 ): # advice_type = "NA" # print("Switching to NA LFHA") # meta_feedback_frequency = 0.1 # meta_advice_type = "LFHA" # elif ( session_id >= 4 and session_id < 5 ): # print("Switching to NA LFLA") # meta_feedback_frequency = 0.1 # advice_type = "NA" # meta_advice_type = "LFLA" # elif ( session_id >= 5 and session_id < 6 ): # advice_type = "OA" # print("Switching to OA HFHA") # meta_advice_type = "HFHA" # meta_feedback_frequency = 0.5 # elif ( session_id >= 6 and session_id < 7 ): # advice_type = "NA" # meta_feedback_frequency = 0.5 # print("Switching to NA HFHA") # meta_advice_type = "HFHA" # meta_feedback_frequency = 0.5 # elif ( session_id >= 7 and session_id < 8 ): # advice_type = "NA" # print("Switching to NA HFLA") # meta_feedback_frequency = 0.5 # meta_advice_type = "HFLA" # elif ( session_id >= 8 and session_id < 9 ): # advice_type = "OA" # meta_feedback_frequency = 0.5 # print("Switching to OA HFLA") # meta_advice_type = "HFLA" # if ( session_id >= 4 and session_id < 7 ): # #print("Switching to LFLA") # advice_type = "RL" # #meta_advice_type = "LFLA" # elif ( session_id >= 7 and session_id < 10 ): # # with open("1RLheat.csv",'a') as f2: # # csvWriter = csv.writer(f2,delimiter=',') # # csvWriter.writerows(heatmap) # # heatmap = [ [0]*20 for i in range(20)] # advice_type = "NA" # #print("Switching to LFHA") # #meta_feedback_frequency = 0.1 # #meta_advice_type = "LFHA" # elif ( session_id >= 10 ): # # with open("1NAheat.csv",'a') as f2: # # csvWriter = csv.writer(f2,delimiter=',') # # csvWriter.writerows(heatmap) # # heatmap = [ [0]*20 for i in range(20)] # #print("Switching to LFLA") # #meta_advice_type = "LFLA" # advice_type = "NA" # with open(fn,'w') as f: # f.write('session_id,advice_type,time,epoch,frames,score,win_perc,loss'+'\n') # f.flush() # f.close() with open(fn,'a') as f: with open(fn, 'a') as f: total_frames = 0 #f.write('session_id,advice_type,time,epoch,frames,score,win_perc,loss'+'\n') #f.flush() if type(epsilon) in {tuple, list}: delta = ((epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate)) final_epsilon = epsilon[1] epsilon = epsilon[0] else: 
final_epsilon = epsilon model = self.model nb_actions = model.get_output_shape_at(0)[-1] win_count = 0 rolling_win_window = [] max_obs_loss = -99999999999999999 m_loss = -99999999 for epoch in range(nb_epoch): lastAdviceStep = 0 adviceGiven = 0 adviceAttempts = 0 modelActions = 0 print(heatmap) loss = 0. game.reset() self.clear_frames() if reset_memory: self.reset_memory() game_over = False S = self.get_game_data(game) savedModel = False while not game_over: a = 0 if advice_type == "RL": if np.random.random() < epsilon or epoch < observe: a = int(np.random.randint(game.nb_actions)) #print("Random Action") else: q = model.predict( S ) #use the prediction confidence to determine whether to ask the player for help qs = model.predict_classes(S) #a = int(np.argmax(qs[0])) #highest_conf = np.amax(q) #print("Game Grid: {}".format(game.get_grid())) #print("Highest MSE Confidence = {}".format(highest_conf)) #a = int(np.argmax(q[0])) a = int(np.argmax(qs[0])) if advice_type == "OA": if np.random.random() < epsilon or epoch < observe: a = int(np.random.randint(game.nb_actions)) #print("Random Action") else: q = model.predict( S ) #use the prediction confidence to determine whether to ask the player for help qs = model.predict_classes(S) #print(qs) #print(q) highest_loss = abs(np.amax(q)) #added ABS lowest_loss = abs(np.amin(q)) #print(highest_loss) #print("HighestLoss:{}".format(highest_loss)) if highest_loss > max_obs_loss and highest_loss != 0: max_obs_loss = highest_loss #print("MaxLoss:{}".format(highest_loss)) #inn = highest_loss / max_obs_loss relative_cost = np.power( lowest_loss / max_obs_loss, 0.5) #print("RelCostA:{}".format(relative_cost)) if relative_cost < 1e-20: relative_cost = 1e-20 relative_cost = -1 / (np.log(relative_cost) - 1) #print("RelCostB:{}".format(relative_cost)) confidence_score_max = 1 confidence_score_min = 0.01 feedback_chance = confidence_score_min + ( confidence_score_max - confidence_score_min) * relative_cost if feedback_chance < 0.01: feedback_chance = 0.01 #if feedback_chance < 0.1: giveAdvice = False if (random.random() < meta_feedback_frequency): giveAdvice = True adviceAttempts = adviceAttempts + 1 if (relative_cost <= 0.25 and game.stepsTaken >= (lastAdviceStep + 10)) or giveAdvice == False: #print("HC: {}".format(max_obs_loss)) modelActions = modelActions + 1 #print("Highest Loss: {} RC: {} POS: Q0:{}".format(highest_loss, relative_cost, q[0])) a = int(np.argmax(qs[0])) else: if random.random() < .5 and ( meta_advice_type == "HFLA" or meta_advice_type == "LFLA"): lastAdviceStep = game.stepsTaken a = int(np.random.randint(game.nb_actions)) adviceGiven = adviceGiven + 1 #print("Taking BAD Player Action") else: lastAdviceStep = game.stepsTaken adviceGiven = adviceGiven + 1 x = game.location[0] z = game.location[1] yaw = game.location[2] a = -1 #print(yaw) if z <= 6: if x < 12: #print("Segment1") if yaw == 270: a = 0 if yaw == 180: a = 1 if yaw == 90: a = 3 if yaw == 0: a = 2 elif x > 15: #print("Segment2") if yaw == 90: a = 0 if yaw == 180: a = 2 if yaw == 0: a = 1 if yaw == 270: a = 3 else: #print("Segment3") if yaw == 0: a = 0 if yaw == 270: a = 1 if yaw == 90: a = 2 if yaw == 180: a = 3 elif (x >= 7) and ((z == 7) or (z == 8) or (z == 9) or (z == 10) or (z == 11) or (z == 12)): #print("Segment4") if yaw == 90: a = 0 if yaw == 180: a = 2 if yaw == 0: a = 1 if yaw == 270: a = 3 elif ((x < 7) and (x > 3)) and ( (z == 7) or (z == 8) or (z == 9) or (z == 10) or (z == 11) or (z == 12)): if yaw == 0: a = 0 if yaw == 270: a = 1 if yaw == 90: a = 2 if yaw == 180: a = 3 
elif ((x < 3)) and ((z == 7) or (z == 8) or (z == 9) or (z == 10) or (z == 11) or (z == 12)): if yaw == 0: a = 2 if yaw == 270: a = 0 if yaw == 180: a = 1 if yaw == 90: a = 3 elif (z == 14) or (z == 15): if yaw == 0: a = 0 if yaw == 270: a = 1 if yaw == 90: a = 2 if yaw == 180: a = 3 elif (z == 17) or (z == 16): #print("Segment6") if yaw == 270: a = 0 if yaw == 180: a = 1 if yaw == 0: a = 2 if yaw == 90: a = 3 elif (z > 17): #print("Segment6") if yaw == 270: a = 2 if yaw == 180: a = 0 if yaw == 0: a = 3 if yaw == 90: a = 1 else: a = int( np.random.randint(game.nb_actions)) if a == -1: a = int( np.random.randint(game.nb_actions)) # if z < 6 and x < 13: # print("Segment1") # if yaw == 270: # a = 0 # else: # a = 1 # elif z < 8 and x >= 13: # print("Segment2") # if yaw == 0: # a = 0 # else: # a = 1 # elif z >= 8 and x == 13: # print("Segment3") # if yaw == 90: # a = 0 # else: # a = 1 # elif z >= 8 and z<= 17 and x < 6: # print("Segment4") # if yaw == 0: # a = 0 # else: # a = 1 # elif z > 18 and x < 18: # print("Segment5") # if yaw == 270: # a = 0 # else: # a = 1 # else: # a = int(np.argmax(q[0])) #print("Game Grid: {}".format(game.get_grid())) #print("Highest MSE Confidence = {}".format(highest_conf)) if advice_type == "NA": if np.random.random() < epsilon or epoch < observe: a = int(np.random.randint(game.nb_actions)) game.play(a) heatmap[game.location[0]][ game.location[1]] = heatmap[game.location[0]][ game.location[1]] + 1 #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) #f2.flush() r = game.get_score() S_prime = self.get_game_data(game) game_over = game.is_over() transition = [S, a, r, S_prime, game_over] self.memory.remember(*transition) S = S_prime #print("Random Action") else: q = model.predict( S ) #use the prediction confidence to determine whether to ask the player for help qs = model.predict_classes(S) highest_loss = abs(np.amax(q)) #added ABS lowest_loss = abs(np.amin(q)) #print("HighestLoss:{}".format(highest_loss)) if highest_loss > max_obs_loss and highest_loss != 0: max_obs_loss = highest_loss #print("MaxLoss:{}".format(highest_loss)) #inn = highest_loss / max_obs_loss relative_cost = np.power( lowest_loss / max_obs_loss, 0.5) #print("RelCostA:{}".format(relative_cost)) if relative_cost < 1e-20: relative_cost = 1e-20 relative_cost = -1 / (np.log(relative_cost) - 1) #print("RelCostB:{}".format(relative_cost)) confidence_score_max = 1 confidence_score_min = 0.01 feedback_chance = confidence_score_min + ( confidence_score_max - confidence_score_min) * relative_cost #feedback_chance = random.random() #print("Feedback Chance: {}".format(feedback_chance)) if feedback_chance < 0.01: feedback_chance = 0.01 #if feedback_chance > meta_feedback_frequency: #if feedback_chance < 0.1: #print(relative_cost) giveAdvice = False if (random.random() < meta_feedback_frequency): giveAdvice = True adviceAttempts = adviceAttempts + 1 if (relative_cost <= 0.25 and game.stepsTaken >= (lastAdviceStep + 10)) or giveAdvice == False: #print("Taking Model Action") #print("HC: {}".format(max_obs_loss)) #print("Confidence: {} RC: {}".format(feedback_chance, relative_cost)) modelActions = modelActions + 1 #a = int(np.argmin(q[0])) a = int(np.argmax(qs[0])) game.play(a) heatmap[game.location[0]][ game.location[1]] = heatmap[ game.location[0]][game.location[1]] + 1 #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) #f2.flush() r = game.get_score() S_prime = self.get_game_data(game) game_over = game.is_over() transition = [S, a, r, S_prime, 
game_over] self.memory.remember(*transition) S = S_prime else: #print("Taking Player Action") if random.random() < .5 and ( meta_advice_type == "HFLA" or meta_advice_type == "LFLA"): a = int(np.random.randint(game.nb_actions)) adviceGiven = adviceGiven + 1 game.play(a) heatmap[game.location[0]][game.location[ 1]] = heatmap[game.location[0]][ game.location[1]] + 1 lastAdviceStep = game.stepsTaken #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) #f2.flush() r = game.get_score() S_prime = self.get_game_data(game) game_over = game.is_over() transition = [S, a, r, S_prime, game_over] self.memory.remember(*transition) S = S_prime if game_over == False: #game.play(checkForBestMove(game.location[0],game.location[1],game.location[2])) a = int( np.random.randint(game.nb_actions)) game.play(a) heatmap[game.location[0]][ game.location[1]] = heatmap[ game.location[0]][ game.location[1]] + 1 #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) #f2.flush() r = game.get_score() S_prime = self.get_game_data(game) game_over = game.is_over() transition = [ S, a, r, S_prime, game_over ] self.memory.remember(*transition) S = S_prime # if game_over == False: # game.play(checkForBestMove(game.location[0],game.location[1],game.location[2])) # heatmap[game.location[0]][game.location[1]] = heatmap[game.location[0]][game.location[1]] + 1 # #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) # #f2.flush() # r = game.get_score() # S_prime = self.get_game_data(game) # game_over = game.is_over() # transition = [S, a, r, S_prime, game_over] # self.memory.remember(*transition) # S = S_prime #print("Taking BAD Player Action") else: adviceGiven = adviceGiven + 1 lastAdviceStep = game.stepsTaken x = game.location[0] z = game.location[1] yaw = game.location[2] #print(x) #print(z) a = -1 #print(yaw) if z <= 6: if x < 12: #print("Segment1") if yaw == 270: a = 0 if yaw == 180: a = 1 if yaw == 90: a = 3 if yaw == 0: a = 2 elif x > 15: #print("Segment2") if yaw == 90: a = 0 if yaw == 180: a = 2 if yaw == 0: a = 1 if yaw == 270: a = 3 else: #print("Segment3") if yaw == 0: a = 0 if yaw == 270: a = 1 if yaw == 90: a = 2 if yaw == 180: a = 3 elif (x >= 7) and ((z == 7) or (z == 8) or (z == 9) or (z == 10) or (z == 11) or (z == 12)): #print("Segment4") if yaw == 90: a = 0 if yaw == 180: a = 2 if yaw == 0: a = 1 if yaw == 270: a = 3 elif ((x < 7) and (x > 3)) and ( (z == 7) or (z == 8) or (z == 9) or (z == 10) or (z == 11) or (z == 12)): if yaw == 0: a = 0 if yaw == 270: a = 1 if yaw == 90: a = 2 if yaw == 180: a = 3 elif ((x < 3)) and ((z == 7) or (z == 8) or (z == 9) or (z == 10) or (z == 11) or (z == 12)): if yaw == 0: a = 2 if yaw == 270: a = 0 if yaw == 180: a = 1 if yaw == 90: a = 3 elif (z == 14) or (z == 15): if yaw == 0: a = 0 if yaw == 270: a = 1 if yaw == 90: a = 2 if yaw == 180: a = 3 elif (z == 17) or (z == 16): #print("Segment6") if yaw == 270: a = 0 if yaw == 180: a = 1 if yaw == 0: a = 2 if yaw == 90: a = 3 elif (z > 17): #print("Segment6") if yaw == 270: a = 2 if yaw == 180: a = 0 if yaw == 0: a = 3 if yaw == 90: a = 1 else: a = int( np.random.randint(game.nb_actions)) if a == -1: a = int( np.random.randint(game.nb_actions)) # #print(yaw) # if z < 6 and x < 13: # #print("Segment1") # if yaw == 270: # a = 0 # else: # a = 1 # elif z < 8 and x >= 13: # #print("Segment2") # if yaw == 0: # a = 0 # else: # a = 1 # elif z >= 8 and x == 13: # #print("Segment3") # if yaw == 90: # a = 0 # else: # a = 1 # elif z >= 8 and z<= 17 and x 
< 6: # #print("Segment4") # if yaw == 0: # a = 0 # else: # a = 1 # elif z > 18 and x < 18: # #print("Segment5") # if yaw == 270: # a = 0 # else: # a = 1 # else: # a = int(np.argmax(q[0])) #Play an extra 2 times (for NA friction) game.play(a) heatmap[game.location[0]][ game.location[1]] = heatmap[ game.location[0]][game.location[1]] + 1 #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) #f2.flush() r = game.get_score() S_prime = self.get_game_data(game) game_over = game.is_over() transition = [S, a, r, S_prime, game_over] self.memory.remember(*transition) S = S_prime if game_over == False: game.play( checkForBestMove( game.location[0], game.location[1], game.location[2])) heatmap[game.location[0]][game.location[ 1]] = heatmap[game.location[0]][ game.location[1]] + 1 #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) #f2.flush() r = game.get_score() S_prime = self.get_game_data(game) game_over = game.is_over() transition = [S, a, r, S_prime, game_over] self.memory.remember(*transition) S = S_prime # if game_over == False: # game.play(checkForBestMove(game.location[0],game.location[1],game.location[2])) # heatmap[game.location[0]][game.location[1]] = heatmap[game.location[0]][game.location[1]] + 1 # #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) # #f2.flush() # r = game.get_score() # S_prime = self.get_game_data(game) # game_over = game.is_over() # transition = [S, a, r, S_prime, game_over] # self.memory.remember(*transition) # S = S_prime if game_over == False: if advice_type != "NA": game.play(a) heatmap[game.location[0]][ game.location[1]] = heatmap[game.location[0]][ game.location[1]] + 1 #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 )) #f2.flush() r = game.get_score() S_prime = self.get_game_data(game) game_over = game.is_over() transition = [S, a, r, S_prime, game_over] self.memory.remember(*transition) S = S_prime if epoch >= observe: batch = self.memory.get_batch(model=model, batch_size=batch_size, gamma=gamma) if batch: inputs, targets = batch mtob = model.train_on_batch(inputs, targets) if mtob > m_loss: m_loss = mtob loss += float(mtob) #print( "LOSS: {} CULM_LOSS: {}".format(mtob,loss)) if checkpoint and (savedModel == False) and ( (epoch + 1 - observe) % checkpoint == 0 or epoch + 1 == nb_epoch): #model.save_weights('weights.dat') print("Checkpoint... 
saving model..") if advice_type == "OA": model.save('oa_model.h5') if advice_type == "NA": model.save('na_model.h5') if advice_type == "RL": model.save('rl_model.h5') # model_json = model.to_json() # with open("model.json", "w") as json_file: # json_file.write(model_json) # #serialize weights to HDF5 # model.save_weights("model.h5") savedModel = True if game.is_won(): win_count += 1 rolling_win_window.insert(0, 1) else: rolling_win_window.insert(0, 0) if epsilon > final_epsilon and epoch >= observe: epsilon -= delta percent_win = 0 cdt = datetime.datetime.now() if sum(rolling_win_window) != 0: percent_win = sum(rolling_win_window) / 4 total_frames = total_frames + game.stepsTaken f.write( '{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n'.format( session_id, advice_type, meta_advice_type, str(cdt), (epoch + 1), total_frames, game.score, percent_win, epsilon, loss, game.stepsTaken, adviceGiven, adviceAttempts, modelActions)) f.flush() print( "Session: {} | Time: {} | Epoch {:03d}/{:03d} | Steps {:.4f} | Epsilon {:.2f} | Score {} | Loss {}" .format(session_id, str(cdt), epoch + 1, nb_epoch, game.stepsTaken, epsilon, game.score, loss)) if len(rolling_win_window) > 4: rolling_win_window.pop() time.sleep(1.0) if advice_type == "OA": with open("{}OAheatxtues.csv".format(session_id), 'w+') as f2: csvWriter = csv.writer(f2, delimiter=',') csvWriter.writerows(heatmap) #heatmap = [ [0]*20 for i in range(20)] if advice_type == "RL": with open("{}RLheatxtues.csv".format(session_id), 'w+') as f2: csvWriter = csv.writer(f2, delimiter=',') csvWriter.writerows(heatmap) #heatmap = [ [0]*20 for i in range(20)] if advice_type == "NA": with open("{}NAheatxtues.csv".format(session_id), 'w+') as f2: csvWriter = csv.writer(f2, delimiter=',') csvWriter.writerows(heatmap) #heatmap = [ [0]*20 for i in range(20)] def play(self, game, nb_epoch=10, epsilon=0., visualize=False): self.check_game_compatibility(game) model = self.model win_count = 0 frames = [] for epoch in range(nb_epoch): print("Playing") game.reset() self.clear_frames() S = self.get_game_data(game) if visualize: frames.append(game.draw()) game_over = False while not game_over: if np.random.rand() < epsilon: print("random") action = int(np.random.randint(0, game.nb_actions)) else: q = model.predict(S)[0] possible_actions = game.get_possible_actions() q = [q[i] for i in possible_actions] action = possible_actions[np.argmax(q)] print(action) game.play(action) S = self.get_game_data(game) if visualize: frames.append(game.draw()) game_over = game.is_over() if game.is_won(): win_count += 1 print("Accuracy {} %".format(100. * win_count / nb_epoch)) #Visualizing/printing images is currently super slow if visualize: if 'images' not in os.listdir('.'): os.mkdir('images') for i in range(len(frames)): plt.imshow(frames[i], interpolation='none') plt.savefig("images/" + game.name + str(i) + ".png")
''' Constants '''
nb_actions = 6
memory_size = 100
observe = 0
batch_size = 50
epsilon = (1.0, 0.1)
epsilon_rate = 0.5
delta = ((epsilon[0] - epsilon[1]) / (iterations * epsilon_rate))
final_epsilon = epsilon[1]
epsilon = epsilon[0]
win_count = 0

''' Memory and Model '''
memory = ExperienceReplay(memory_size)
model = build_model()

''' Agent Code '''
initial_state = [
    '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.33', '1.0', '1.0',
    '1.0', '1.0', '0.0', '0.0', '1.0', '1.0', '1.0', '0.0', '1.0', '1.0',
    '0.0', '0.0', '0.0', '1.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0',
    '1.0', '1.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0', '1.0',
    '1.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0',
    '0.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0'
]

if sys.platform == "win32":
    loop = asyncio.ProactorEventLoop()  # for subprocess' pipes on Windows
    asyncio.set_event_loop(loop)
else: