Example #1
    def __init_sample(self):
        if args.experience_replay != '' and os.path.exists(
                args.experience_replay):
            self.D = torch.load(args.experience_replay)
            self.metrics['steps'], self.metrics['episodes'] = [
                self.D.steps
            ] * self.D.episodes, list(range(1, self.D.episodes + 1))
        elif not args.test:
            self.D = ExperienceReplay(args.experience_size, args.symbolic_env,
                                      self.env.observation_size,
                                      self.env.action_size, args.bit_depth,
                                      args.device)

            # Initialise dataset D with S random seed episodes
            print(
                "Start Multi Sample Processing -------------------------------"
            )
            start_time = time.time()
            data_lists = [
                Manager().list() for i in range(1, args.seed_episodes + 1)
            ]  # Shared lists, one per seed episode
            pipes = [Pipe() for i in range(1, args.seed_episodes + 1)
                     ]  # One parent/child pipe pair per seed episode
            workers_init_sample = [
                Worker_init_Sample(child_conn=child, id=i + 1)
                for i, [parent, child] in enumerate(pipes)
            ]

            for i, w in enumerate(workers_init_sample):
                w.start()  # Start each worker process
                pipes[i][0].send(
                    data_lists[i])  # Parent end sends the shared list through the i-th pipe
            [w.join() for w in workers_init_sample]  # Wait for all subprocesses to finish

            for i, [parent, child] in enumerate(pipes):
                # datas = parent.recv()
                for data in list(parent.recv()):
                    if isinstance(data, tuple):
                        assert len(data) == 4
                        self.D.append(data[0], data[1], data[2], data[3])
                    elif isinstance(data, int):
                        t = data
                        self.metrics['steps'].append(t * args.action_repeat + (
                            0 if len(self.metrics['steps']) ==
                            0 else self.metrics['steps'][-1]))
                        self.metrics['episodes'].append(i + 1)
                    else:
                        print(
                            "Received data has an unexpected type and needs to be fixed")
            end_time = time.time()
            print("the process times {} s".format(end_time - start_time))
            print(
                "End Multi Sample Processing -------------------------------")
Example #2
  def __init__(self, game, mode=SIMPLE, nb_epoch=10000, memory_size=1000, batch_size=50, nb_frames=4, epsilon=1., discount=.9, learning_rate=.1, model=None):

    self.game = game
    self.mode = mode
    self.target_model = None
    self.rows, self.columns = game.field_shape()
    self.nb_epoch = nb_epoch
    self.nb_frames = nb_frames
    self.nb_actions = game.nb_actions()

    if mode == TEST:
      print('Test Mode: Loading model...')
      self.model = load_model(model)
    elif mode == SIMPLE:
      print('Using Plain DQN: Building model...')
      self.model = self.build_model()
    elif mode == DOUBLE:
      print('Using Double DQN: Building primary and target model...')
      self.model = self.build_model()
      self.target_model = self.build_model()
      self.update_target_model()

    # Trades off the importance of sooner versus later rewards.
    # A factor of 0 makes the agent consider only immediate rewards,
    # while a factor of 1 makes it strive for a long-term high reward.
    self.discount = discount

    # The learning rate or step size determines to what extent the newly
    # acquired information will override the old information. A factor
    # of 0 will make the agent not learn anything, while a factor of 1
    # would make the agent consider only the most recent information
    self.learning_rate = learning_rate

    # Use epsilon-greedy exploration as our policy.
    # Epsilon determines the probability of choosing a random action.
    # It decreases linearly with the number of epochs, so we pick a random
    # action with probability 'eps'. Without this policy the network is
    # greedy and settles on the first effective strategy it finds, hence
    # we introduce some randomness.
    # Epsilon reaches its minimum halfway through the games.
    epsilon_end = self.nb_epoch - (self.nb_epoch / 2)
    self.policy = EpsGreedyPolicy(self.model, epsilon_end, self.nb_actions, epsilon, .1)

    # Create a new experience replay memory. Without this optimization
    # training takes extremely long even on a GPU and, most importantly,
    # the approximation of Q-values with a non-linear function (our NN)
    # is not very stable.
    self.memory = ExperienceReplay(self.model, self.target_model, self.nb_actions, memory_size, batch_size, self.discount, self.learning_rate)

    self.frames = None
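
The constructor comments above describe three knobs: the discount factor, the learning rate, and a linearly decaying epsilon. The sketch below shows how those quantities interact in plain Q-learning; it is an illustration of the idea only, not the project's EpsGreedyPolicy or ExperienceReplay implementation.

import random


def q_update(q_old, reward, q_next_max, discount=0.9, learning_rate=0.1):
    # Move the old estimate toward the target r + discount * max_a' Q(s', a')
    return q_old + learning_rate * (reward + discount * q_next_max - q_old)


def linear_epsilon(epoch, nb_epoch, eps_start=1.0, eps_min=0.1):
    # Decays linearly and reaches its minimum halfway through training
    decay_epochs = max(nb_epoch // 2, 1)
    return max(eps_min, eps_start - (eps_start - eps_min) * epoch / decay_epochs)


def choose_action(q_values, epsilon):
    # Epsilon-greedy: random action with probability epsilon, otherwise greedy
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])


print(q_update(q_old=0.5, reward=1.0, q_next_max=2.0))  # 0.73
print(linear_epsilon(epoch=2500, nb_epoch=10000))       # 0.55
print(choose_action([0.1, 0.7, 0.2], epsilon=0.0))      # 1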
Example #3
    def __init__(self):
        self.parms = Parameters()
        self.results_dir = os.path.join(self.parms.results_path)
        self.dataset_path = os.path.join(self.parms.results_path, 'dataset/')
        os.makedirs(self.dataset_path, exist_ok=True)
        self.metrics = {
            'steps': [],
            'episodes': [],
            'train_rewards': [],
            'predicted_rewards': [],
            'test_episodes': [],
            'test_rewards': [],
            'observation_loss': [],
            'reward_loss': [],
            'kl_loss': [],
            'regularizer_loss': []
        }
        os.makedirs(self.results_dir, exist_ok=True)

        ## Setting cuda options
        if torch.cuda.is_available() and self.parms.use_cuda:
            self.parms.device = torch.device('cuda')
            torch.cuda.set_device(self.parms.gpu_id)
            print("Using gpu: ", torch.cuda.current_device())
        else:
            self.parms.device = torch.device('cpu')
            self.use_cuda = False
            print("Work on: ", self.parms.device)

        # Initialize the experience replay buffer
        self.env = ControlSuiteEnv(self.parms.env_name, self.parms.seed,
                                   self.parms.max_episode_length,
                                   self.parms.bit_depth)
        self.D = ExperienceReplay(self.parms.ex_replay_buff_size,
                                  self.env.observation_size,
                                  self.env.action_size, self.parms.bit_depth,
                                  self.parms.device)

        if self.parms.seed > 0:
            self.set_seed()

        self.trainer = Trainer(self.parms, self.D, self.metrics,
                               self.results_dir, self.env)
        self.init_exp_rep()

        # Start Training
        print("Total training episodes: ", self.parms.training_episodes,
              " Buffer sampling: ", self.parms.collect_interval)
        self.trainer.train_models()
        print("END.")
Example #4
	def __init__(self, model, memory=None, memory_size=1000, nb_frames=None):
		assert len(model.output_shape) == 2, "Model's output shape should be (nb_samples, nb_actions)."
		if memory:
			self.memory = memory
		else:
			self.memory = ExperienceReplay(memory_size)
		if not nb_frames and not model.input_shape:
			raise Exception("Missing argument : nb_frames not provided")
		elif not nb_frames:
			nb_frames = model.input_shape[1]
		elif model.input_shape[1] and nb_frames and model.input_shape[1] != nb_frames:
			raise Exception("Dimension mismatch : time dimension of model should be equal to nb_frames.")
		self.model = model
		self.nb_frames = nb_frames
		self.frames = None
Example #5
def setup_replay(args: argparse.Namespace, env: Env) -> ExperienceReplay:
    D = ExperienceReplay(
        args.experience_size,
        env.observation_size,
        env.action_size,
        args.device
    )
    # Initialise dataset D with random seed episodes
    for _ in range(1, args.seed_episodes + 1):
        observation, done = env.reset(), False
        while not done:
            action = env.sample_random_action()
            next_observation, _, done, info = env.step(action)
            D.append(observation, action, info["reward_dist"], info["reward_coll"], done)
            observation = next_observation

    return D
Example #6
class Agent:
    def __init__(self, model, memory=None, memory_size=100, nb_frames=None):
        assert len(model.output_shape) == 2, \
            "Model's output shape should be (nb_samples, nb_actions)."
        if memory:
            self.memory = memory
        else:
            self.memory = ExperienceReplay(memory_size)
        if not nb_frames and not model.input_shape[1]:
            raise Exception("Missing argument : nb_frames not provided")
        elif not nb_frames:
            nb_frames = model.input_shape[1]
        elif model.input_shape[1] and nb_frames and model.input_shape[1] != nb_frames:
            raise Exception(
                "Dimension mismatch : time dimension of model should be equal to nb_frames."
            )
        self.model = model
        self.nb_frames = nb_frames  # model input shape, 24
        self.frames = None

    @property
    def memory_size(self):
        return self.memory.memory_size

    @memory_size.setter
    def memory_size(self, value):
        self.memory.memory_size = value

    def reset_memory(self):
        self.memory.reset_memory()

    def check_game_compatibility(self, game):
        game_output_shape = (1, None) + game.get_frame().shape
        #game_output_shape = (None, game.get_frame().shape)
        if len(game_output_shape) != len(self.model.input_shape):
            raise Exception(
                'Dimension mismatch. Input shape of the model should be compatible with the game.'
            )
        else:
            for i in range(len(self.model.input_shape)):
                if self.model.input_shape[i] and game_output_shape[i] and \
                        self.model.input_shape[i] != game_output_shape[i]:
                    raise Exception(
                        'Dimension mismatch. Input shape of the model should be compatible with the game.'
                    )
        if len(self.model.output_shape) != 2 or self.model.output_shape[1] != game.nb_actions:
            raise Exception(
                'Output shape of model should be (nb_samples, nb_actions).')

    def get_game_data(self, game):  # returns scaled
        frame = game.get_frame()  # candidate to return scaled
        if self.frames is None:
            self.frames = [frame] * self.nb_frames
        else:
            self.frames.append(frame)
            self.frames.pop(0)
        return np.expand_dims(self.frames, 0)

    def clear_frames(self):
        self.frames = None

    def train(self,
              game,
              nb_epoch=1000,
              batch_size=50,
              gamma=0.9,
              epsilon=[1., .1],
              epsilon_rate=0.5,
              reset_memory=False,
              observe=0,
              checkpoint=None):
        self.check_game_compatibility(game)
        if type(epsilon) in {tuple, list}:
            delta = ((epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate))
            final_epsilon = epsilon[1]
            epsilon = epsilon[0]
        else:
            final_epsilon = epsilon
        save = Save()
        model = self.model
        nb_actions = model.output_shape[-1]
        win_count = 0
        for epoch in range(nb_epoch):
            loss = 0.
            q = np.zeros(3)
            game.reset()
            self.clear_frames()
            if reset_memory:
                self.reset_memory()
            game_over = False
            S = self.get_game_data(game)  # S must be scaled
            i = 0
            while not game_over:
                i = i + 1
                if np.random.random() < epsilon or epoch < observe:
                    a = int(np.random.randint(game.nb_actions))
                    print('>', end='')
                else:
                    # S must be scaled
                    q = model.predict(S)  # !
                    a = int(np.argmax(q[0]))
                game.play(a)
                r = game.get_score(a)
                S_prime = self.get_game_data(game)  # S_prime must be scaled
                game_over = game.is_over()
                # S, a, S_prime, must be scaled
                # reward, game over is not scaled in catch/snake
                transition = [S, a, r, S_prime, game_over]  # !
                self.memory.remember(*transition)
                S = S_prime
                if epoch >= observe:
                    batch = self.memory.get_batch(model=model,
                                                  batch_size=batch_size,
                                                  gamma=gamma)
                    if batch:
                        inputs, targets = batch  # scaled
                        loss += float(model.train_on_batch(inputs, targets))
                #if checkpoint and ((epoch + 1 - observe) % checkpoint == 0 or epoch + 1 == nb_epoch):
                #model.save_weights('4kweights.dat')

                save.log(game, epoch)

            if game.is_won():
                win_count += 1
            if epsilon > final_epsilon and epoch >= observe:
                epsilon -= delta
            print(' ')
            print(
                "Epoch {:03d}/{:03d} | Loss {:.4f} | Epsilon {:.2f} | Win count {} | loss Avg {:.4f}"
                .format(epoch + 1, nb_epoch, loss, epsilon, win_count,
                        loss / i))

            if ((epoch % 10) == 0):
                save.save_model(model, Config.f_model)
            save.log_epoch(loss, win_count, loss / i)

    def play(self, game, nb_epoch=10, epsilon=0., visualize=True):
        self.check_game_compatibility(game)
        model = self.model
        win_count = 0
        frames = []
        save = Save()
        for epoch in range(nb_epoch):
            game.reset()
            self.clear_frames()
            S = self.get_game_data(game)  # S must be scaled
            if visualize:
                frames.append(game.draw())
            game_over = False
            while not game_over:
                if np.random.rand() < epsilon:
                    print("random")
                    action = int(np.random.randint(0, game.nb_actions))
                else:
                    # S must be scaled
                    q = model.predict(S)[0]  # !
                    possible_actions = game.get_possible_actions()
                    q = [q[i] for i in possible_actions]
                    action = possible_actions[np.argmax(q)]

                game.play(action)

                S = self.get_game_data(game)
                '''
				if visualize:
					frames.append(game.draw())
				game_over = game.is_over()
				'''
                save.log(game, nb_epoch)

            if game.is_won():
                win_count += 1
        print("Accuracy {} %".format(100. * win_count / nb_epoch))
        '''
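
The training loop above relies on memory.get_batch(model=model, batch_size=batch_size, gamma=gamma) to turn stored transitions into (inputs, targets) pairs, but that method's body is not shown here. The sketch below illustrates how such DQN batches are commonly assembled from (S, a, r, S_prime, game_over) tuples, using a stand-in predict callable instead of a real Keras model; it is an assumption about the shape of the computation, not the project's ExperienceReplay code.

import numpy as np


def build_batch(transitions, predict, gamma=0.9):
    # transitions: list of (S, a, r, S_prime, game_over) with S of shape (1, state_dim)
    # predict: callable mapping a state batch to Q-values of shape (batch, nb_actions)
    inputs = np.concatenate([t[0] for t in transitions], axis=0)
    targets = predict(inputs)  # start from the current Q estimates
    next_q = predict(np.concatenate([t[3] for t in transitions], axis=0))
    for i, (_, a, r, _, game_over) in enumerate(transitions):
        # Bellman target; terminal transitions keep only the immediate reward
        targets[i, a] = r if game_over else r + gamma * np.max(next_q[i])
    return inputs, targets


# Toy usage: 2-dimensional states, 3 actions, a random "model"
rng = np.random.default_rng(0)
fake_predict = lambda states: rng.random((len(states), 3))
transitions = [(rng.random((1, 2)), 1, 1.0, rng.random((1, 2)), False) for _ in range(4)]
inputs, targets = build_batch(transitions, fake_predict)
print(inputs.shape, targets.shape)  # (4, 2) (4, 3)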
Example #7
class Plan(object):
    def __init__(self):

        self.results_dir = os.path.join(
            'results',
            '{}_seed_{}_{}_action_scale_{}_no_explore_{}_pool_len_{}_optimisation_iters_{}_top_planning-horizon'
            .format(args.env, args.seed, args.algo, args.action_scale,
                    args.pool_len, args.optimisation_iters,
                    args.top_planning_horizon))

        args.results_dir = self.results_dir
        args.MultiGPU = bool(torch.cuda.device_count() > 1 and args.MultiGPU)

        self.__basic_setting()
        self.__init_sample()  # Sample the initial seed data

        # Initialise model parameters randomly
        self.transition_model = TransitionModel(
            args.belief_size, args.state_size, self.env.action_size,
            args.hidden_size, args.embedding_size,
            args.dense_activation_function).to(device=args.device)
        self.observation_model = ObservationModel(
            args.symbolic_env, self.env.observation_size, args.belief_size,
            args.state_size, args.embedding_size,
            args.cnn_activation_function).to(device=args.device)
        self.reward_model = RewardModel(
            args.belief_size, args.state_size, args.hidden_size,
            args.dense_activation_function).to(device=args.device)
        self.encoder = Encoder(
            args.symbolic_env, self.env.observation_size, args.embedding_size,
            args.cnn_activation_function).to(device=args.device)

        print("We Have {} GPUS".format(torch.cuda.device_count())
              ) if args.MultiGPU else print("We use CPU")
        self.transition_model = nn.DataParallel(
            self.transition_model.to(device=args.device)
        ) if args.MultiGPU else self.transition_model
        self.observation_model = nn.DataParallel(
            self.observation_model.to(device=args.device)
        ) if args.MultiGPU else self.observation_model
        self.reward_model = nn.DataParallel(
            self.reward_model.to(
                device=args.device)) if args.MultiGPU else self.reward_model

        # encoder = nn.DataParallel(encoder.cuda())
        # actor_model = nn.DataParallel(actor_model.cuda())
        # value_model = nn.DataParallel(value_model.cuda())

        # share the global parameters in multiprocessing
        self.encoder.share_memory()
        self.observation_model.share_memory()
        self.reward_model.share_memory()

        # Set all_model/global_actor_optimizer/global_value_optimizer
        self.param_list = list(self.transition_model.parameters()) + list(
            self.observation_model.parameters()) + list(
                self.reward_model.parameters()) + list(
                    self.encoder.parameters())
        self.model_optimizer = optim.Adam(
            self.param_list,
            lr=0
            if args.learning_rate_schedule != 0 else args.model_learning_rate,
            eps=args.adam_epsilon)

    def update_belief_and_act(self,
                              args,
                              env,
                              belief,
                              posterior_state,
                              action,
                              observation,
                              explore=False):
        # Infer belief over current state q(s_t|o≤t,a<t) from the history
        # print("action size: ",action.size()) torch.Size([1, 6])
        belief, _, _, _, posterior_state, _, _ = self.upper_transition_model(
            posterior_state, action.unsqueeze(dim=0), belief,
            self.encoder(observation).unsqueeze(dim=0), None)
        if hasattr(env, "envs"):
            belief, posterior_state = list(
                map(lambda x: x.view(-1, args.test_episodes, x.shape[2]),
                    [x for x in [belief, posterior_state]]))

        # Remove time dimension from belief/state
        belief = belief.squeeze(dim=0)
        posterior_state = posterior_state.squeeze(dim=0)
        action = self.algorithms.get_action(belief, posterior_state, explore)

        if explore:
            action = torch.clamp(
                Normal(action, args.action_noise).rsample(), -1, 1
            )  # Add gaussian exploration noise on top of the sampled action
            # action = action + args.action_noise * torch.randn_like(action)  # Add exploration noise ε ~ p(ε) to the action
        next_observation, reward, done = env.step(
            action.cpu() if isinstance(env, EnvBatcher) else action[0].cpu(
            ))  # Perform environment step (action repeats handled internally)
        return belief, posterior_state, action, next_observation, reward, done

    def run(self):
        if args.algo == "dreamer":
            print("DREAMER")
            from algorithms.dreamer import Algorithms
            self.algorithms = Algorithms(self.env.action_size,
                                         self.transition_model, self.encoder,
                                         self.reward_model,
                                         self.observation_model)
        elif args.algo == "p2p":
            print("planing to plan")
            from algorithms.plan_to_plan import Algorithms
            self.algorithms = Algorithms(self.env.action_size,
                                         self.transition_model, self.encoder,
                                         self.reward_model,
                                         self.observation_model)
        elif args.algo == "actor_pool_1":
            print("async sub actor")
            from algorithms.actor_pool_1 import Algorithms_actor
            self.algorithms = Algorithms_actor(self.env.action_size,
                                               self.transition_model,
                                               self.encoder, self.reward_model,
                                               self.observation_model)
        elif args.algo == "aap":
            from algorithms.asynchronous_actor_planet import Algorithms
            self.algorithms = Algorithms(self.env.action_size,
                                         self.transition_model, self.encoder,
                                         self.reward_model,
                                         self.observation_model)
        else:
            print("planet")
            from algorithms.planet import Algorithms
            # args.MultiGPU = False
            self.algorithms = Algorithms(self.env.action_size,
                                         self.transition_model,
                                         self.reward_model)

        if args.test: self.test_only()

        self.global_prior = Normal(
            torch.zeros(args.batch_size, args.state_size, device=args.device),
            torch.ones(args.batch_size, args.state_size,
                       device=args.device))  # Global prior N(0, I)
        self.free_nats = torch.full(
            (1, ), args.free_nats,
            device=args.device)  # Allowed deviation in KL divergence

        # Training (and testing)
        # args.episodes = 1
        for episode in tqdm(range(self.metrics['episodes'][-1] + 1,
                                  args.episodes + 1),
                            total=args.episodes,
                            initial=self.metrics['episodes'][-1] + 1):
            losses = self.train()
            # self.algorithms.save_loss_data(self.metrics['episodes']) # Update and plot loss metrics
            self.save_loss_data(tuple(
                zip(*losses)))  # Update and plot loss metrics
            self.data_collection(episode=episode)  # Data collection
            # args.test_interval = 1
            if episode % args.test_interval == 0:
                self.test(episode=episode)  # Test model
            self.save_model_data(episode=episode)  # save model

        self.env.close()  # Close training environment

    def train_env_model(self, beliefs, prior_states, prior_means,
                        prior_std_devs, posterior_states, posterior_means,
                        posterior_std_devs, observations, actions, rewards,
                        nonterminals):
        # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
        if args.worldmodel_LogProbLoss:
            observation_dist = Normal(
                bottle(self.observation_model, (beliefs, posterior_states)), 1)
            observation_loss = -observation_dist.log_prob(
                observations[1:]).sum(
                    dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
        else:
            observation_loss = F.mse_loss(
                bottle(self.observation_model, (beliefs, posterior_states)),
                observations[1:],
                reduction='none').sum(
                    dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
        if args.worldmodel_LogProbLoss:
            reward_dist = Normal(
                bottle(self.reward_model, (beliefs, posterior_states)), 1)
            reward_loss = -reward_dist.log_prob(rewards[:-1]).mean(dim=(0, 1))
        else:
            reward_loss = F.mse_loss(bottle(self.reward_model,
                                            (beliefs, posterior_states)),
                                     rewards[:-1],
                                     reduction='none').mean(dim=(0, 1))

        # transition loss
        div = kl_divergence(Normal(posterior_means, posterior_std_devs),
                            Normal(prior_means, prior_std_devs)).sum(dim=2)
        kl_loss = torch.max(div, self.free_nats).mean(
            dim=(0, 1)
        )  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
        if args.global_kl_beta != 0:
            kl_loss += args.global_kl_beta * kl_divergence(
                Normal(posterior_means, posterior_std_devs),
                self.global_prior).sum(dim=2).mean(dim=(0, 1))
        # Calculate latent overshooting objective for t > 0
        if args.overshooting_kl_beta != 0:
            overshooting_vars = [
            ]  # Collect variables for overshooting to process in batch
            for t in range(1, args.chunk_size - 1):
                d = min(t + args.overshooting_distance,
                        args.chunk_size - 1)  # Overshooting distance
                t_, d_ = t - 1, d - 1  # Use t_ and d_ to deal with different time indexing for latent states
                seq_pad = (
                    0, 0, 0, 0, 0, t - d + args.overshooting_distance
                )  # Calculate sequence padding so overshooting terms can be calculated in one batch
                # Store (0) actions, (1) nonterminals, (2) rewards, (3) beliefs, (4) prior states, (5) posterior means, (6) posterior standard deviations and (7) sequence masks
                overshooting_vars.append(
                    (F.pad(actions[t:d],
                           seq_pad), F.pad(nonterminals[t:d], seq_pad),
                     F.pad(rewards[t:d],
                           seq_pad[2:]), beliefs[t_], prior_states[t_],
                     F.pad(posterior_means[t_ + 1:d_ + 1].detach(), seq_pad),
                     F.pad(posterior_std_devs[t_ + 1:d_ + 1].detach(),
                           seq_pad,
                           value=1),
                     F.pad(
                         torch.ones(d - t,
                                    args.batch_size,
                                    args.state_size,
                                    device=args.device), seq_pad))
                )  # Posterior standard deviations must be padded with > 0 to prevent infinite KL divergences
            overshooting_vars = tuple(zip(*overshooting_vars))
            # Update belief/state using prior from previous belief/state and previous action (over entire sequence at once)
            beliefs, prior_states, prior_means, prior_std_devs = self.upper_transition_model(
                torch.cat(overshooting_vars[4], dim=0),
                torch.cat(overshooting_vars[0], dim=1),
                torch.cat(overshooting_vars[3], dim=0), None,
                torch.cat(overshooting_vars[1], dim=1))
            seq_mask = torch.cat(overshooting_vars[7], dim=1)
            # Calculate overshooting KL loss with sequence mask
            kl_loss += (
                1 / args.overshooting_distance
            ) * args.overshooting_kl_beta * torch.max((kl_divergence(
                Normal(torch.cat(overshooting_vars[5], dim=1),
                       torch.cat(overshooting_vars[6], dim=1)),
                Normal(prior_means, prior_std_devs)
            ) * seq_mask).sum(dim=2), self.free_nats).mean(dim=(0, 1)) * (
                args.chunk_size
                - 1
            )  # Update KL loss (compensating for extra average over each overshooting/open loop sequence)
            # Calculate overshooting reward prediction loss with sequence mask
            if args.overshooting_reward_scale != 0:
                reward_loss += (
                    1 / args.overshooting_distance
                ) * args.overshooting_reward_scale * F.mse_loss(
                    bottle(self.reward_model,
                           (beliefs, prior_states)) * seq_mask[:, :, 0],
                    torch.cat(overshooting_vars[2], dim=1),
                    reduction='none'
                ).mean(dim=(0, 1)) * (
                    args.chunk_size - 1
                )  # Update reward loss (compensating for extra average over each overshooting/open loop sequence)
        # Apply linearly ramping learning rate schedule
        if args.learning_rate_schedule != 0:
            for group in self.model_optimizer.param_groups:
                group['lr'] = min(
                    group['lr'] + args.model_learning_rate /
                    args.model_learning_rate_schedule,
                    args.model_learning_rate)
        model_loss = observation_loss + reward_loss + kl_loss
        # Update model parameters
        self.model_optimizer.zero_grad()
        model_loss.backward()
        nn.utils.clip_grad_norm_(self.param_list,
                                 args.grad_clip_norm,
                                 norm_type=2)
        self.model_optimizer.step()
        return observation_loss, reward_loss, kl_loss

    def train(self):
        # Model fitting
        losses = []
        print("training loop")
        # args.collect_interval = 1
        for s in tqdm(range(args.collect_interval)):

            # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
            observations, actions, rewards, nonterminals = self.D.sample(
                args.batch_size,
                args.chunk_size)  # Transitions start at time t = 0
            # Create initial belief and state for time t = 0
            init_belief, init_state = torch.zeros(
                args.batch_size, args.belief_size,
                device=args.device), torch.zeros(args.batch_size,
                                                 args.state_size,
                                                 device=args.device)
            # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
            obs = bottle(self.encoder, (observations[1:], ))
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.upper_transition_model(
                prev_state=init_state,
                actions=actions[:-1],
                prev_belief=init_belief,
                obs=obs,
                nonterminals=nonterminals[:-1])

            # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
            observation_loss, reward_loss, kl_loss = self.train_env_model(
                beliefs, prior_states, prior_means, prior_std_devs,
                posterior_states, posterior_means, posterior_std_devs,
                observations, actions, rewards, nonterminals)

            # Dreamer implementation: actor loss calculation and optimization
            with torch.no_grad():
                actor_states = posterior_states.detach().to(
                    device=args.device).share_memory_()
                actor_beliefs = beliefs.detach().to(
                    device=args.device).share_memory_()

            # if not os.path.exists(os.path.join(os.getcwd(), 'tensor_data/' + args.results_dir)): os.mkdir(os.path.join(os.getcwd(), 'tensor_data/' + args.results_dir))
            torch.save(
                actor_states,
                os.path.join(os.getcwd(),
                             args.results_dir + '/actor_states.pt'))
            torch.save(
                actor_beliefs,
                os.path.join(os.getcwd(),
                             args.results_dir + '/actor_beliefs.pt'))

            # [self.actor_pipes[i][0].send(1) for i, w in enumerate(self.workers_actor)]  # Parent_pipe send data using i'th pipes
            # [self.actor_pipes[i][0].recv() for i, _ in enumerate(self.actor_pool)]  # waitting the children finish

            self.algorithms.train_algorithm(actor_states, actor_beliefs)
            losses.append(
                [observation_loss.item(),
                 reward_loss.item(),
                 kl_loss.item()])

            # if self.algorithms.train_algorithm(actor_states, actor_beliefs) is not None:
            #   merge_actor_loss, merge_value_loss = self.algorithms.train_algorithm(actor_states, actor_beliefs)
            #   losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item(), merge_actor_loss.item(), merge_value_loss.item()])
            # else:
            #   losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item()])

        return losses

    def data_collection(self, episode):
        print("Data collection")
        with torch.no_grad():
            observation, total_reward = self.env.reset(), 0
            belief, posterior_state, action = torch.zeros(
                1, args.belief_size, device=args.device), torch.zeros(
                    1, args.state_size,
                    device=args.device), torch.zeros(1,
                                                     self.env.action_size,
                                                     device=args.device)
            pbar = tqdm(range(args.max_episode_length // args.action_repeat))
            for t in pbar:
                # print("step",t)
                belief, posterior_state, action, next_observation, reward, done = self.update_belief_and_act(
                    args, self.env, belief, posterior_state, action,
                    observation.to(device=args.device))
                self.D.append(observation, action.cpu(), reward, done)
                total_reward += reward
                observation = next_observation
                if args.render: self.env.render()
                if done:
                    pbar.close()
                    break

            # Update and plot train reward metrics
            self.metrics['steps'].append(t + self.metrics['steps'][-1])
            self.metrics['episodes'].append(episode)
            self.metrics['train_rewards'].append(total_reward)

            Save_Txt(self.metrics['episodes'][-1],
                     self.metrics['train_rewards'][-1], 'train_rewards',
                     args.results_dir)
            # lineplot(metrics['episodes'][-len(metrics['train_rewards']):], metrics['train_rewards'], 'train_rewards', results_dir)

    def test(self, episode):
        print("Test model")
        # Set models to eval mode
        self.transition_model.eval()
        self.observation_model.eval()
        self.reward_model.eval()
        self.encoder.eval()
        self.algorithms.train_to_eval()
        # self.actor_model_g.eval()
        # self.value_model_g.eval()
        # Initialise parallelised test environments
        test_envs = EnvBatcher(
            Env, (args.env, args.symbolic_env, args.seed,
                  args.max_episode_length, args.action_repeat, args.bit_depth),
            {}, args.test_episodes)

        with torch.no_grad():
            observation, total_rewards, video_frames = test_envs.reset(
            ), np.zeros((args.test_episodes, )), []
            belief, posterior_state, action = torch.zeros(
                args.test_episodes, args.belief_size,
                device=args.device), torch.zeros(
                    args.test_episodes, args.state_size,
                    device=args.device), torch.zeros(args.test_episodes,
                                                     self.env.action_size,
                                                     device=args.device)
            pbar = tqdm(range(args.max_episode_length // args.action_repeat))
            for t in pbar:
                belief, posterior_state, action, next_observation, reward, done = self.update_belief_and_act(
                    args, test_envs, belief, posterior_state, action,
                    observation.to(device=args.device))
                total_rewards += reward.numpy()
                if not args.symbolic_env:  # Collect real vs. predicted frames for video
                    video_frames.append(
                        make_grid(torch.cat([
                            observation,
                            self.observation_model(belief,
                                                   posterior_state).cpu()
                        ],
                                            dim=3) + 0.5,
                                  nrow=5).numpy())  # Decentre
                observation = next_observation
                if done.sum().item() == args.test_episodes:
                    pbar.close()
                    break

        # Update and plot reward metrics (and write video if applicable) and save metrics
        self.metrics['test_episodes'].append(episode)
        self.metrics['test_rewards'].append(total_rewards.tolist())

        Save_Txt(self.metrics['test_episodes'][-1],
                 self.metrics['test_rewards'][-1], 'test_rewards',
                 args.results_dir)
        # Save_Txt(np.asarray(metrics['steps'])[np.asarray(metrics['test_episodes']) - 1], metrics['test_rewards'],'test_rewards_steps', results_dir, xaxis='step')

        # lineplot(metrics['test_episodes'], metrics['test_rewards'], 'test_rewards', results_dir)
        # lineplot(np.asarray(metrics['steps'])[np.asarray(metrics['test_episodes']) - 1], metrics['test_rewards'], 'test_rewards_steps', results_dir, xaxis='step')
        if not args.symbolic_env:
            episode_str = str(episode).zfill(len(str(args.episodes)))
            write_video(video_frames, 'test_episode_%s' % episode_str,
                        args.results_dir)  # Lossy compression
            save_image(
                torch.as_tensor(video_frames[-1]),
                os.path.join(args.results_dir,
                             'test_episode_%s.png' % episode_str))

        torch.save(self.metrics, os.path.join(args.results_dir, 'metrics.pth'))

        # Set models to train mode
        self.transition_model.train()
        self.observation_model.train()
        self.reward_model.train()
        self.encoder.train()
        # self.actor_model_g.train()
        # self.value_model_g.train()
        self.algorithms.eval_to_train()
        # Close test environments
        test_envs.close()

    def test_only(self):
        # Set models to eval mode
        self.transition_model.eval()
        self.reward_model.eval()
        self.encoder.eval()
        with torch.no_grad():
            total_reward = 0
            for _ in tqdm(range(args.test_episodes)):
                observation = self.env.reset()
                belief, posterior_state, action = torch.zeros(
                    1, args.belief_size, device=args.device), torch.zeros(
                        1, args.state_size,
                        device=args.device), torch.zeros(1,
                                                         self.env.action_size,
                                                         device=args.device)
                pbar = tqdm(
                    range(args.max_episode_length // args.action_repeat))
                for t in pbar:
                    belief, posterior_state, action, observation, reward, done = self.update_belief_and_act(
                        args, self.env, belief, posterior_state, action,
                        observation.to(device=args.device))
                    total_reward += reward
                    if args.render: self.env.render()
                    if done:
                        pbar.close()
                        break
        print('Average Reward:', total_reward / args.test_episodes)
        self.env.close()
        quit()

    def __basic_setting(self):
        args.overshooting_distance = min(
            args.chunk_size, args.overshooting_distance
        )  # Overshooting distance cannot be greater than chunk size
        print(' ' * 26 + 'Options')
        for k, v in vars(args).items():
            print(' ' * 26 + k + ': ' + str(v))

        print("torch.cuda.device_count() {}".format(torch.cuda.device_count()))
        os.makedirs(args.results_dir, exist_ok=True)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        # Set Cuda
        if torch.cuda.is_available() and not args.disable_cuda:
            print("using CUDA")
            args.device = torch.device('cuda')
            torch.cuda.manual_seed(args.seed)
        else:
            print("using CPU")
            args.device = torch.device('cpu')

        self.summary_name = args.results_dir + "/{}_{}_log"
        self.writer = SummaryWriter(self.summary_name.format(
            args.env, args.id))
        self.env = Env(args.env, args.symbolic_env, args.seed,
                       args.max_episode_length, args.action_repeat,
                       args.bit_depth)
        self.metrics = {
            'steps': [],
            'episodes': [],
            'train_rewards': [],
            'test_episodes': [],
            'test_rewards': [],
            'observation_loss': [],
            'reward_loss': [],
            'kl_loss': [],
            'merge_actor_loss': [],
            'merge_value_loss': []
        }

    def __init_sample(self):
        if args.experience_replay != '' and os.path.exists(
                args.experience_replay):
            self.D = torch.load(args.experience_replay)
            self.metrics['steps'], self.metrics['episodes'] = [
                self.D.steps
            ] * self.D.episodes, list(range(1, self.D.episodes + 1))
        elif not args.test:
            self.D = ExperienceReplay(args.experience_size, args.symbolic_env,
                                      self.env.observation_size,
                                      self.env.action_size, args.bit_depth,
                                      args.device)

            # Initialise dataset D with S random seed episodes
            print(
                "Start Multi Sample Processing -------------------------------"
            )
            start_time = time.time()
            data_lists = [
                Manager().list() for i in range(1, args.seed_episodes + 1)
            ]  # Shared lists, one per seed episode
            pipes = [Pipe() for i in range(1, args.seed_episodes + 1)
                     ]  # One parent/child pipe pair per seed episode
            workers_init_sample = [
                Worker_init_Sample(child_conn=child, id=i + 1)
                for i, [parent, child] in enumerate(pipes)
            ]

            for i, w in enumerate(workers_init_sample):
                w.start()  # Start each worker process
                pipes[i][0].send(
                    data_lists[i])  # Parent end sends the shared list through the i-th pipe
            [w.join() for w in workers_init_sample]  # Wait for all subprocesses to finish

            for i, [parent, child] in enumerate(pipes):
                # datas = parent.recv()
                for data in list(parent.recv()):
                    if isinstance(data, tuple):
                        assert len(data) == 4
                        self.D.append(data[0], data[1], data[2], data[3])
                    elif isinstance(data, int):
                        t = data
                        self.metrics['steps'].append(t * args.action_repeat + (
                            0 if len(self.metrics['steps']) ==
                            0 else self.metrics['steps'][-1]))
                        self.metrics['episodes'].append(i + 1)
                    else:
                        print(
                            "Received data has an unexpected type and needs to be fixed")
            end_time = time.time()
            print("the process times {} s".format(end_time - start_time))
            print(
                "End Multi Sample Processing -------------------------------")

    def upper_transition_model(self, prev_state, actions, prev_belief, obs,
                               nonterminals):
        actions = torch.transpose(actions, 0, 1) if args.MultiGPU else actions
        nonterminals = torch.transpose(nonterminals, 0, 1).to(
            device=args.device
        ) if args.MultiGPU and nonterminals is not None else nonterminals
        obs = torch.transpose(obs, 0, 1).to(
            device=args.device) if args.MultiGPU and obs is not None else obs
        temp_val = self.transition_model(prev_state.to(device=args.device),
                                         actions.to(device=args.device),
                                         prev_belief.to(device=args.device),
                                         obs, nonterminals)

        return list(
            map(
                lambda x: torch.cat(x.chunk(torch.cuda.device_count(), 0), 1)
                if x.shape[1] != prev_state.shape[0] else x,
                [x for x in temp_val]))

    def save_loss_data(self, losses):
        self.metrics['observation_loss'].append(losses[0])
        self.metrics['reward_loss'].append(losses[1])
        self.metrics['kl_loss'].append(losses[2])
        if len(losses) > 3:
            self.metrics['merge_actor_loss'].append(losses[3])
            self.metrics['merge_value_loss'].append(losses[4])

        Save_Txt(self.metrics['episodes'][-1],
                 self.metrics['observation_loss'][-1], 'observation_loss',
                 args.results_dir)
        Save_Txt(self.metrics['episodes'][-1], self.metrics['reward_loss'][-1],
                 'reward_loss', args.results_dir)
        Save_Txt(self.metrics['episodes'][-1], self.metrics['kl_loss'][-1],
                 'kl_loss', args.results_dir)
        if len(losses) > 3:
            Save_Txt(self.metrics['episodes'][-1],
                     self.metrics['merge_actor_loss'][-1], 'merge_actor_loss',
                     args.results_dir)
            Save_Txt(self.metrics['episodes'][-1],
                     self.metrics['merge_value_loss'][-1], 'merge_value_loss',
                     args.results_dir)

        # lineplot(metrics['episodes'][-len(metrics['observation_loss']):], metrics['observation_loss'], 'observation_loss', results_dir)
        # lineplot(metrics['episodes'][-len(metrics['reward_loss']):], metrics['reward_loss'], 'reward_loss', results_dir)
        # lineplot(metrics['episodes'][-len(metrics['kl_loss']):], metrics['kl_loss'], 'kl_loss', results_dir)
        # lineplot(metrics['episodes'][-len(metrics['actor_loss']):], metrics['actor_loss'], 'actor_loss', results_dir)
        # lineplot(metrics['episodes'][-len(metrics['value_loss']):], metrics['value_loss'], 'value_loss', results_dir)

    def save_model_data(self, episode):
        # writer.add_scalar("train_reward", metrics['train_rewards'][-1], metrics['steps'][-1])
        # writer.add_scalar("train/episode_reward", metrics['train_rewards'][-1], metrics['steps'][-1]*args.action_repeat)
        # writer.add_scalar("observation_loss", metrics['observation_loss'][0][-1], metrics['steps'][-1])
        # writer.add_scalar("reward_loss", metrics['reward_loss'][0][-1], metrics['steps'][-1])
        # writer.add_scalar("kl_loss", metrics['kl_loss'][0][-1], metrics['steps'][-1])
        # writer.add_scalar("actor_loss", metrics['actor_loss'][0][-1], metrics['steps'][-1])
        # writer.add_scalar("value_loss", metrics['value_loss'][0][-1], metrics['steps'][-1])
        # print("episodes: {}, total_steps: {}, train_reward: {} ".format(metrics['episodes'][-1], metrics['steps'][-1], metrics['train_rewards'][-1]))

        # Checkpoint models
        if episode % args.checkpoint_interval == 0:
            # torch.save({'transition_model': transition_model.state_dict(),
            #             'observation_model': observation_model.state_dict(),
            #             'reward_model': reward_model.state_dict(),
            #             'encoder': encoder.state_dict(),
            #             'actor_model': actor_model_g.state_dict(),
            #             'value_model': value_model_g.state_dict(),
            #             'model_optimizer': model_optimizer.state_dict(),
            #             'actor_optimizer': actor_optimizer_g.state_dict(),
            #             'value_optimizer': value_optimizer_g.state_dict()
            #             }, os.path.join(results_dir, 'models_%d.pth' % episode))
            if args.checkpoint_experience:
                torch.save(
                    self.D, os.path.join(args.results_dir, 'experience.pth')
                )  # Warning: will fail with MemoryError with large memory sizes
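
train_env_model above clamps the KL term with free nats: torch.max(div, self.free_nats) lets the posterior deviate from the prior by up to free_nats nats before the term contributes gradient. Below is a minimal sketch of just that clamp with toy tensors; the shapes and the value 3.0 are illustrative, not this script's actual hyperparameters.

import torch
from torch.distributions import Normal, kl_divergence

T, B, S = 10, 4, 8  # toy (time, batch, state_size)
posterior = Normal(torch.randn(T, B, S), torch.ones(T, B, S))
prior = Normal(torch.zeros(T, B, S), torch.ones(T, B, S))

free_nats = torch.full((1,), 3.0)                     # allowed slack in nats
div = kl_divergence(posterior, prior).sum(dim=2)      # per-(t, b) divergence, shape (T, B)
kl_loss = torch.max(div, free_nats).mean(dim=(0, 1))  # clamp from below, then average
print(kl_loss)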
Example #8
class Agent:
    def __init__(self, model, memory=None, memory_size=1000, nb_frames=None):
        assert len(model.output_shape) == 2, \
            "Model's output shape should be (nb_samples, nb_actions)."
        if memory:
            self.memory = memory
        else:
            self.memory = ExperienceReplay(memory_size)
        if not nb_frames and not model.input_shape:
            raise Exception("Missing argument : nb_frames not provided")
        elif not nb_frames:
            nb_frames = model.input_shape[1]
        elif model.input_shape[1] and nb_frames and model.input_shape[1] != nb_frames:
            raise Exception(
                "Dimension mismatch : time dimension of model should be equal to nb_frames."
            )
        self.model = model
        self.nb_frames = nb_frames
        self.frames = None

    @property
    def memory_size(self):
        return self.memory.memory_size

    @memory_size.setter
    def memory_size(self, value):
        self.memory.memory_size = value

    def reset_memory(self):
        self.memory.reset_memory()

    def check_game_compatibility(self, game):
        game_output_shape = (1, None) + game.get_frame().shape
        if len(game_output_shape) != len(self.model.input_shape):
            raise Exception(
                'Dimension mismatch. Input shape of the model should be compatible with the game.'
            )
        else:
            for i in range(len(self.model.input_shape)):
                if self.model.input_shape[i] and game_output_shape[i] and \
                        self.model.input_shape[i] != game_output_shape[i]:
                    raise Exception(
                        'Dimension mismatch. Input shape of the model should be compatible with the game.'
                    )
        if len(self.model.output_shape) != 2 or self.model.output_shape[1] != game.nb_actions:
            raise Exception(
                'Output shape of model should be (nb_samples, nb_actions).')

    def get_game_data(self, game):
        frame = game.get_frame()
        if self.frames is None:
            self.frames = [frame] * self.nb_frames
        else:
            self.frames.append(frame)
            self.frames.pop(0)
        return np.expand_dims(self.frames, 0)

    def clear_frames(self):
        self.frames = None

    def train(self,
              game,
              nb_epoch=1000,
              batch_size=50,
              gamma=0.9,
              epsilon=[1., .1],
              epsilon_rate=0.5,
              reset_memory=False):
        self.check_game_compatibility(game)
        if type(epsilon) in {tuple, list}:
            delta = ((epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate))
            final_epsilon = epsilon[1]
            epsilon = epsilon[0]
        else:
            final_epsilon = epsilon
        model = self.model
        nb_actions = model.output_shape[-1]
        win_count = 0
        for epoch in range(nb_epoch):
            loss = 0.
            game.reset()
            self.clear_frames()
            if reset_memory:
                self.reset_memory()
            game_over = False
            S = self.get_game_data(game)
            while not game_over:
                if np.random.random() < epsilon:
                    a = int(np.random.randint(game.nb_actions))
                else:
                    q = model.predict(S)
                    a = int(np.argmax(q[0]))
                game.play(a)
                r = game.get_score()
                S_prime = self.get_game_data(game)
                game_over = game.is_over()
                transition = [S, a, r, S_prime, game_over]
                self.memory.remember(*transition)
                S = S_prime
                inputs, targets = self.memory.get_batch(model=model,
                                                        batch_size=batch_size,
                                                        gamma=gamma)
                loss += model.train_on_batch(inputs, targets)[0]
            if game.is_won():
                win_count += 1
            if epsilon > final_epsilon:
                epsilon -= delta
            print(
                "Epoch {:03d}/{:03d} | Loss {:.4f} | Epsilon {:.2f} | Win count {}"
                .format(epoch + 1, nb_epoch, loss, epsilon, win_count))

    def play(self, game, nb_epoch=10, epsilon=0., visualize=True):
        self.check_game_compatibility(game)
        model = self.model
        win_count = 0
        frames = []
        for epoch in range(nb_epoch):
            game.reset()
            self.clear_frames()
            S = self.get_game_data(game)
            if visualize:
                frames.append(game.draw())
            game_over = False
            while not game_over:
                if np.random.rand() < epsilon:
                    print("random")
                    action = int(np.random.randint(0, game.nb_actions))
                else:
                    q = model.predict(S)
                    action = int(np.argmax(q[0]))
                game.play(action)
                S = self.get_game_data(game)
                if visualize:
                    frames.append(game.draw())
                game_over = game.is_over()
            if game.is_won():
                win_count += 1
        print("Accuracy {} %".format(100. * win_count / nb_epoch))
        if visualize:
            if 'images' not in os.listdir('.'):
                os.mkdir('images')
            for i in range(len(frames)):
                plt.imshow(frames[i], interpolation='none')
                plt.savefig("images/" + game.name + str(i) + ".png")
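
get_game_data above keeps a rolling window of the last nb_frames frames and feeds the model a batch of shape (1, nb_frames, rows, columns). The same idea as a small standalone helper, with random toy frames in place of game.get_frame() output:

import numpy as np


def stack_frames(frames, new_frame, nb_frames):
    # First call: repeat the initial frame; afterwards drop the oldest, append the newest
    if frames is None:
        frames = [new_frame] * nb_frames
    else:
        frames = frames[1:] + [new_frame]
    # Batch dimension in front, matching what model.predict() expects
    return frames, np.expand_dims(frames, 0)


frames = None
for step in range(3):
    frame = np.random.rand(4, 4)  # toy stand-in for game.get_frame()
    frames, batch = stack_frames(frames, frame, nb_frames=4)
print(batch.shape)  # (1, 4, 4, 4)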
Example #9
results_dir = os.path.join('results', args.id)
os.makedirs(results_dir, exist_ok=True)
logdir = os.path.join(results_dir, "logs")
os.makedirs(logdir, exist_ok=True)

np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
  device = torch.device('cuda')
  torch.cuda.manual_seed(args.seed)
else:
  device = torch.device('cpu')

# Initialise training environment and experience replay memory
env = Env(args.env, args.seed, args.max_episode_length, args.action_repeat)
D = ExperienceReplay(args.experience_size, env.observation_size, env.action_size, device)
# Initialise dataset D with S random seed episodes
for s in range(1, args.seed_episodes + 1):
  observation, done, t = env.reset(), False, 0
  epdata = []
  while not done:
    action = env.sample_random_action()
    next_observation, reward, done = env.step(action)
    D.append(observation, action, reward, done)
    epdata.append(next_observation)
    observation = next_observation
    t += 1
  epdata = np.concatenate(epdata)
  frames = torch.FloatTensor(epdata[:,:3,:,:]) / 255.
  write_video(frames, "Episode"+str(s), logdir)
  print(epdata.shape)
Example #10
class Agent:
  def __init__(self, game, mode=SIMPLE, nb_epoch=10000, memory_size=1000, batch_size=50, nb_frames=4, epsilon=1., discount=.9, learning_rate=.1, model=None):

    self.game = game
    self.mode = mode
    self.target_model = None
    self.rows, self.columns = game.field_shape()
    self.nb_epoch = nb_epoch
    self.nb_frames = nb_frames
    self.nb_actions = game.nb_actions()

    if mode == TEST:
      print('Test Mode: Loading model...')
      self.model = load_model(model)
    elif mode == SIMPLE:
      print('Using Plain DQN: Building model...')
      self.model = self.build_model()
    elif mode == DOUBLE:
      print('Using Double DQN: Building primary and target model...')
      self.model = self.build_model()
      self.target_model = self.build_model()
      self.update_target_model()

    # Trades off the importance of sooner versus later rewards.
    # A factor of 0 makes the agent prefer immediate rewards and mostly
    # consider current rewards, while a factor of 1 makes it strive for a
    # long-term high reward.
    self.discount = discount

    # The learning rate or step size determines to what extent the newly
    # acquired information will override the old information. A factor
    # of 0 will make the agent not learn anything, while a factor of 1
    # will make the agent consider only the most recent information.
    self.learning_rate = learning_rate
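    # Illustrative sketch (not part of this class): with these two
    # hyperparameters, a tabular Q-learning update would look roughly like
    #   Q[s, a] += learning_rate * (r + discount * max(Q[s_next]) - Q[s, a])
    # i.e. learning_rate scales how strongly new information overrides the old
    # estimate, and discount weighs future against immediate rewards.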

    # Use epsilon-greedy exploration as our policy.
    # Epsilon determines the probability for choosing random actions.
    # This factor decreases linearly with the number of epochs, so we choose
    # a random action with probability 'eps'. Without this policy the network
    # is purely greedy and settles on the first effective strategy it finds.
    # Hence, we introduce a certain amount of randomness.
    # Epsilon reaches its minimum at 1/2 of the games.
    epsilon_end = self.nb_epoch - (self.nb_epoch / 2)
    self.policy = EpsGreedyPolicy(self.model, epsilon_end, self.nb_actions, epsilon, .1)
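    # Assumed behaviour of EpsGreedyPolicy (a sketch, not its actual code):
    #   if np.random.rand() < eps: a = np.random.randint(nb_actions)   # explore
    #   else: a = int(np.argmax(model.predict(state)[0]))              # exploit
    # with eps annealed linearly from `epsilon` down to .1 over `epsilon_end` epochs.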

    # Create new experience replay memory. Without this optimization
    # the training takes extremely long even on a GPU and most
    # importantly the approximation of Q-values using non-linear
    # functions, that is used for our NN, is not very stable.
    self.memory = ExperienceReplay(self.model, self.target_model, self.nb_actions, memory_size, batch_size, self.discount, self.learning_rate)
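    # Mechanism sketch (assumed, not this class's exact API): the replay memory
    # stores transitions (s, a, r, s', done) and training samples random
    # minibatches from it, which decorrelates consecutive steps, reuses past
    # experience and stabilises the non-linear Q-value approximation.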

    self.frames = None

  def build_model(self):
    model = Sequential()
    model.add(Conv2D(32, (2, 2), activation='relu', input_shape=(self.nb_frames, self.rows, self.columns), data_format="channels_first"))
    model.add(Conv2D(64, (2, 2), activation='relu'))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(Flatten())
    model.add(Dropout(0.1))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(self.nb_actions))
    model.compile(Adam(), 'MSE')

    return model

  def update_target_model(self):
    self.target_model.set_weights(self.model.get_weights())

  def get_frames(self):
    frame = self.game.get_state()
    if self.frames is None:
      self.frames = [frame] * self.nb_frames
    else:
      self.frames.append(frame)
      self.frames.pop(0)

    # Expand frames to match the input shape for the CNN (4D)
    # 1D      = # batches
    # 2D      = # frames per batch
    # 3D / 4D = game board
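    # Example (assuming nb_frames=4 and a 10x10 board): the returned array has
    # shape (1, 4, 10, 10), matching the Conv2D input_shape in build_model().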
    return np.expand_dims(self.frames, 0)

  def clear_frames(self):
    self.frames = None

  def print_stats(self, data, y_label, x_label='Epoch', marker='-'):
    data = np.array(data)
    x, y = data.T
    p = np.polyfit(x, y, 3)

    fig = plt.figure()

    plt.plot(x, y, marker)
    plt.plot(x, np.polyval(p, x), 'r:')
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    words = y_label.split()
    file_name = '_'.join(map(lambda x: x.lower(), words))
    path = './plots/{name}_{size}x{size}_{timestamp}'
    fig.savefig(path.format(size=self.game.grid_size, name=file_name, timestamp=int(time())))

  def train(self, update_freq=10):
    total_steps = 0
    max_steps = self.game.grid_size**2 * 3
    loops = 0
    nb_wins = 0
    cumulative_reward = 0
    duration_buffer = []
    reward_buffer = []
    steps_buffer = []
    wins_buffer = []

    for epoch in range(self.nb_epoch):
      loss = 0.
      duration = 0
      steps = 0

      self.game.reset()
      self.clear_frames()
      done = False

      # Observe the initial state
      state_t = self.get_frames()

      start_time = time()

      while(not done):
        # Explore or Exploit
        action = self.policy.select_action(state_t, epoch)

        # Act on the environment
        _, reward, done, is_victory = self.game.act(action)
        state_tn = self.get_frames()

        cumulative_reward += reward
        steps += 1
        total_steps += 1

        if steps == max_steps and not done:
          loops += 1
          done = True

        # Build transition and remember it (Experience Replay)
        transition = [state_t, action, reward, state_tn, done]
        self.memory.remember(*transition)
        state_t = state_tn

        # Get batch of batch_size samples
        # A batch generally approximates the distribution of the input data
        # better than a single input. The larger the batch, the better the
        # approximation. However, larger batches take longer to process.
        batch = self.memory.get_batch()

        if batch:
          inputs, targets = batch
          loss += float(self.model.train_on_batch(inputs, targets))
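          # The targets here are assumed to be Q-value regression targets built
          # by ExperienceReplay.get_batch(), i.e. roughly
          #   target[a] = r + discount * max_a' Q(s', a')   (no bootstrap if done),
          # using the target network in DOUBLE mode and the online network otherwise.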

        if self.game.is_victory():
          nb_wins += 1

        if done:
          duration = utils.get_time_difference(start_time, time())

        if self.mode == DOUBLE and self.target_model is not None and total_steps % update_freq == 0:
          self.update_target_model()

      current_epoch = epoch + 1
      reward_buffer.append([current_epoch, cumulative_reward])
      duration_buffer.append([current_epoch, duration])
      steps_buffer.append([current_epoch, steps])
      wins_buffer.append([current_epoch, nb_wins])

      summary = 'Epoch {:03d}/{:03d} | Loss {:.4f} | Epsilon {:.2f} | Time(ms) {:3.3f} | Steps {:.2f} | Wins {} | Loops {}'
      print(summary.format(current_epoch, self.nb_epoch, loss, self.policy.get_eps(), duration, steps, nb_wins, loops))

    # Generate plots
    self.print_stats(reward_buffer, 'Cumulative Reward')
    self.print_stats(duration_buffer, 'Duration per Game')
    self.print_stats(steps_buffer, 'Steps per Game')
    self.print_stats(wins_buffer, 'Wins')

    path = './models/model_{mode}_{size}x{size}_{epochs}_{timestamp}.h5'
    mode = 'dqn' if self.mode == SIMPLE else 'ddqn'
    self.model.save(path.format(mode=mode, size=self.game.grid_size, epochs=self.nb_epoch, timestamp=int(time())))

  def play(self, nb_games=5, interval=.7):
    nb_wins = 0
    accuracy = 0
    summary = '{}\n\nAccuracy {:.2f}% | Game {}/{} | Wins {}'

    for epoch in range(nb_games):
      self.game.reset()
      self.clear_frames()
      done = False

      state_t = self.get_frames()

      self.print_state(summary, state_t[:,-1], accuracy, epoch, nb_games, nb_wins, 0)

      while(not done):
        q = self.model.predict(state_t)
        action = np.argmax(q[0])

        _, _, done, is_victory = self.game.act(action)
        state_tn = self.get_frames()

        state_t = state_tn

        if is_victory:
          nb_wins += 1

        accuracy = 100. * nb_wins / nb_games

        self.print_state(summary, state_t[:,-1], accuracy, epoch, nb_games, nb_wins, interval)

  def print_state(self, summary, state, accuracy, epoch, nb_games, nb_wins, interval):
    utils.clear_screen()
    print(summary.format(state, accuracy, epoch + 1, nb_games, nb_wins))
    sleep(interval)
Example #11
0
    'observation_loss': [],
    'reward_loss': [],
    'kl_loss': []
}

print("Initializing environment!")
# Initialise training environment and experience replay memory
env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length,
          args.action_repeat, args.bit_depth)
if args.load_experience:
    D = torch.load(os.path.join(results_dir, 'experience.pth'))
    metrics['steps'], metrics['episodes'] = [D.steps] * D.episodes, list(
        range(1, D.episodes + 1))
else:
    D = ExperienceReplay(args.experience_size, args.symbolic_env,
                         env.observation_size, env.action_size, args.bit_depth,
                         args.device)
    # Initialise dataset D with S random seed episodes
    for s in range(1, args.seed_episodes + 1):
        observation, done, t = env.reset(), False, 0
        while not done:
            action = env.sample_random_action()
            next_observation, reward, done = env.step(action)
            D.append(observation, action, reward, done)
            observation = next_observation
            t += 1
        metrics['steps'].append(t * args.action_repeat + (
            0 if len(metrics['steps']) == 0 else metrics['steps'][-1]))
        metrics['episodes'].append(s)

print("Initializing model parameters!")
Example #12
0
    'actor_loss': [],
    'value_loss': []
}

summary_name = results_dir + "/{}_{}_log"

# Initialise training environment and experience replay memory
env = Env(args.env, args.symbolic, args.seed, args.max_episode_length,
          args.action_repeat, args.bit_depth)

args.observation_size, args.action_size = env.observation_size, env.action_size

# Initialise agent
agent = Dreamer(args)

D = ExperienceReplay(args.experience_size, args.symbolic, env.observation_size,
                     env.action_size, args.bit_depth, args.device)

# Initialise dataset D with S random seed episodes
for s in range(1, args.seed_episodes + 1):
    observation, done, t = env.reset(), False, 0
    while not done:
        action = env.sample_random_action()
        next_observation, reward, done = env.step(action)
        D.append(next_observation, action.cpu(), reward,
                 done)  # here use the next_observation
        observation = next_observation
        t += 1
    metrics['env_steps'].append(t * args.action_repeat + (
        0 if len(metrics['env_steps']) == 0 else metrics['env_steps'][-1]))
    metrics['episodes'].append(s)
    print("(random)episodes: {}, total_env_steps: {} ".format(
Example #13
0
class Agent:

	def __init__(self, model, memory=None, memory_size=1000, nb_frames=None):
		assert len(model.output_shape) == 2, "Model's output shape should be (nb_samples, nb_actions)."
		if memory:
			self.memory = memory
		else:
			self.memory = ExperienceReplay(memory_size)
		if not nb_frames and not model.input_shape:
			raise Exception("Missing argument : nb_frames not provided")
		elif not nb_frames:
			nb_frames = model.input_shape[1]
		elif model.input_shape[1] and nb_frames and model.input_shape[1] != nb_frames:
			raise Exception("Dimension mismatch : time dimension of model should be equal to nb_frames.")
		self.model = model
		self.nb_frames = nb_frames
		self.frames = None

	@property
	def memory_size(self):
		return self.memory.memory_size

	@memory_size.setter
	def memory_size(self, value):
		self.memory.memory_size = value

	def reset_memory(self):
		self.exp_replay.reset_memory()

	def check_game_compatibility(self, game):
		game_output_shape = (1, None) + game.get_frame().shape
		if len(game_output_shape) != len(self.model.input_shape):
			raise Exception('Dimension mismatch. Input shape of the model should be compatible with the game.')
		else:
			for i in range(len(self.model.input_shape)):
				if self.model.input_shape[i] and game_output_shape[i] and self.model.input_shape[i] != game_output_shape[i]:
					raise Exception('Dimension mismatch. Input shape of the model should be compatible with the game.')
		if len(self.model.output_shape) != 2 or self.model.output_shape[1] != game.nb_actions:
			raise Exception('Output shape of model should be (nb_samples, nb_actions).')

	def get_game_data(self, game):
		frame = game.get_frame()
		if self.frames is None:
			self.frames = [frame] * self.nb_frames
		else:
			self.frames.append(frame)
			self.frames.pop(0)
		return np.expand_dims(self.frames, 0)

	def clear_frames(self):
		self.frames = None

	def action_count(self, game):
		#print "game.get_action_count: ", game.get_action_count
		return game.get_action_count

	# SET WHICH RUNS TO PRINT OUT HERE *****************************************************************
	def report_action(self, game):
		return ((self.action_count(game) % self.report_freq) == 0) # and ((self.action_count(game) % self.report_freq) < 20) #% 10000) == 0 #

	def train(self, game, nb_epoch=1000, batch_size=50, gamma=0.9, epsilon=[1., .1], epsilon_rate=0.5, reset_memory=False, id=""):

		txt_store_path = "./txtstore/run_1000e_b50_15r_reg_lr1/junk/"
		printing = False
		record_weights = False
		self.max_moves = game.get_max_moves()
		self.report_freq  = self.max_moves #50

		'''fo_A = open(txt_store_path + "A.txt", "rw+")
		fo_G = open(txt_store_path + "G.txt", "rw+")
		fo_Gb = open(txt_store_path + "Gb.txt", "rw+")
		fo_I = open(txt_store_path + "I.txt", "rw+")
		fo_Q = open(txt_store_path + "Q.txt", "rw+")
		fo_R = open(txt_store_path + "R.txt", "rw+")
		fo_S = open(txt_store_path + "S.txt", "rw+")
		fo_T = open(txt_store_path + "T.txt", "rw+")
		fo_W = open(txt_store_path + "W.txt", "rw+")
		fo_Wb = open(txt_store_path + "Wb.txt", "rw+")'''

		self.check_game_compatibility(game)
		if type(epsilon)  in {tuple, list}:
			delta =  ((epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate))
			final_epsilon = epsilon[1]
			epsilon = epsilon[0]
		else:
			final_epsilon = epsilon
		model = self.model
		nb_actions = model.output_shape[-1]
		win_count = 0

		scores = np.zeros((nb_epoch, self.max_moves // self.report_freq))
		losses = np.zeros((nb_epoch, self.max_moves // self.report_freq))


		for epoch in range(nb_epoch):
			#ipdb.set_trace(context=9)	# TRACING HERE *********************************************
			loss = 0.
			game.reset()
			self.clear_frames()
			if reset_memory:
				self.reset_memory()
			game_over = False
			S = self.get_game_data(game)
			no_last_S = True

			plot_showing = False

			while not game_over:
				if np.random.random() < epsilon:
					a = int(np.random.randint(game.nb_actions))
					#if (self.action_count(game) % 100000) == 0:
					'''if self.report_action(game):
						if printing:
							print "random",
						q = model.predict(S)'''
					q = model.predict(S)
					expected_action = (a == int(np.argmax(q[0])))
				else:
					expected_action = True
					q = model.predict(S)
					#print q.shape
					#print q[0]
					# ************************************** CATCHING NANS
					'''if (q[0,0] != q[0,0]):
						ipdb.set_trace(context=9)	# TRACING HERE *********************************************
					'''
					a = int(np.argmax(q[0]))
					#if (self.action_count(game) % 100000) == 0:
				prob = epsilon/game.nb_actions
				if expected_action:
					prob = 1 - epsilon + prob
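				# Note (assumed intent): `prob` is the probability of the chosen action
				# under the epsilon-greedy behaviour policy -- eps/|A| for any action,
				# plus (1 - eps) when the greedy action was taken -- and is stored with
				# the transition for the probability-weighted update (ruql=True) below.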
				game.play(a, self.report_action(game))
				r = game.get_score()
				#ipdb.set_trace(context=9)	# TRACING HERE *********************************************


				# PRINTING S HERE ******************************************************************

				''' if plot_showing:
					plt.clf()
				plt.imshow(np.reshape(S,(6,6)))
				plt.draw()
				plt.show(block=False)
				plot_showing = True
				print "hi" '''

				# PRINTING S HERE ******************************************************************

				S_prime = self.get_game_data(game)



				'''if self.report_action(game):
					if printing:
						print "S: ", S
						#if no_last_S:
						#	last_S = S
						#	no_last_S = False
						#else:
						#	print "dS:", S - last_S
						#	print "    ==>  Q(lS):", model.predict(last_S)
						#print
						print "    ==>  Q(S): ", q, "    ==>  A: ", a, "    ==> R: %f" % r
						#print "    ==>  Q(S'):", model.predict(S_prime)
						#print
					fo_S.seek(0,2)
					np.savetxt(fo_S, S[0], fmt='%4.4f') #
					fo_Q.seek(0,2)
					np.savetxt(fo_Q, q, fmt='%4.4f') #
					fo_A.seek(0,2)
					fo_A.write(str(a)+"\n") #savetxt(fo, S[0], fmt='%4.4f') #
					fo_R.seek(0,2)
					fo_R.write(str(r)+"\n")
				'''

				#ipdb.set_trace(context=9)	# TRACING HERE *********************************************


				#last_S = S

				game_over = game.is_over()
				transition = [S, a, r, S_prime, game_over, prob]
				self.memory.remember(*transition)
				S = S_prime
				batch = self.memory.get_batch(model=model, batch_size=batch_size, gamma=gamma, ruql=True) #, print_it=False) #self.report_action(game))
				if batch:
					inputs, targets, probs = batch

					#print("model.total_loss: ", model.total_loss)
					'''if record_weights:

						weights_pre = model.get_weights() # GOT WEIGHTS *************************
						#print "weights_pre"
						#print weights_pre

						if self.report_action(game):
							fo_W.seek(0,2)
							np.savetxt(fo_W, weights_pre[0], fmt='%4.4f') #
							fo_W.write("\n")
							fo_Wb.seek(0,2)
							np.savetxt(fo_Wb, weights_pre[1], fmt='%4.4f') #
							fo_Wb.write("\n")'''

					#output = model.train_on_batch(inputs, targets)
					#loss += float(output[0]) #model.train_on_batch(inputs, targets))
					'''print "myAgent"
					print 'inputs: ', type(inputs), "; ", inputs.shape 
					print 'targets: ', type(targets), "; ", targets.shape
					print 'probs: ', type(probs), "; ", probs.shape'''
					loss += float(model.train_on_batch(inputs, targets, probs=probs))

					#if self.report_action(game):
					#	#print output
					#	#fo_G.seek(0,2)
					#	#np.savetxt(fo_G, output[1], fmt='%4.4f') #
					#	#fo_G.write("\n")
					#	#fo_Gb.seek(0,2)
					#	#np.savetxt(fo_Gb, output[2], fmt='%4.4f') #
					#	#fo_Gb.write("\n")

					#weights_post = model.get_weights() # GOT WEIGHTS ********************************
					#print "weights_post"
					#print weights_post
					#ipdb.set_trace()	# TRACING HERE *********************************************

					#print("action_count PRE: ", action_count)
					if self.report_action(game):
						action_count = self.action_count(game)
						#print("action_count/self.report_freq: ", action_count/self.report_freq)
						#print("action_count: ", action_count)
						#print("self.report_freq: ", self.report_freq)
						#print("scores so far: ", scores)
						#print("scores.shape: ", scores.shape)'''
						while (action_count // self.report_freq > scores.shape[1]):
							scores = np.append(scores, np.zeros((nb_epoch, 1)), 1)
							losses = np.append(losses, np.zeros((nb_epoch, 1)), 1)
						scores[epoch, action_count // self.report_freq - 1] = game.get_total_score()
						losses[epoch, action_count // self.report_freq - 1] = loss

						#print ("running a batch (of %d): 1: %d; 2: %d" % (len(batch), batch[0].size, \
						#	batch[1].size))
						#print "memory size: ", self.memory_size
						#print "using memory\n", inputs, "; tgt: ", targets
						#fo_I.seek(0,2)
						#np.savetxt(fo_I, inputs[0], fmt='%4.4f') #
						#fo_T.seek(0,2)
						#np.savetxt(fo_T, targets, fmt='%4.4f') #
					#fo_T.write("\n")
			if game.is_won():
				win_count += 1
			if epsilon > final_epsilon:
				epsilon -= delta
			if (epoch % 50) == 0:
				print("Epoch {:03d}/{:03d} | Loss {:.4f} | Epsilon {:.2f} | Win count {}".format(epoch + 1, nb_epoch, loss, epsilon, win_count))
		pickle.dump(scores, open(txt_store_path + "score" + id + ".p", "wb" ) )
		pickle.dump(losses, open(txt_store_path + "loss" + id + ".p", "wb" ) )
		'''
		fo_A.close()
		fo_G.close()
		fo_Gb.close()
		fo_I.close()
		fo_Q.close()
		fo_R.close()
		fo_S.close()
		fo_T.close()
		fo_W.close()
		fo_Wb.close()'''

		average_taken_over = 10
		last_col = self.max_moves // self.report_freq - 1

		fo_log = open("log.txt", "a+")
		fo_log.seek(0,2)

		average_score = np.mean(scores[-average_taken_over:nb_epoch, last_col])
		average_error = np.mean(losses[-average_taken_over:nb_epoch, last_col])

		fo_log.write("\n{:20}|{:^12}|{:^10}|{:^10}|{:^6}|{:^12}|{:^12}|{:^12}{:^6}{:^6}|{:^10}|{:^20}|{:^10}|{:^6}".format(" ", "game moves", "avg score", "error", "WC", "epochs", "batch size", "epsiln frm", ".. to", ".. by", "lr", "desciption", "timer", "reg"))
		fo_log.write("\n{:<20}|{:^12d}|{:^10.2f}|{:^10.2f}|{:^6d}|".format(time.strftime("%d/%m/%Y %H:%M"), self.max_moves, \
			average_score, average_error, win_count)) #average_taken_over,
		fo_log.close()


	def play(self, game, nb_epoch=1, epsilon=0., visualize=False):
		self.check_game_compatibility(game)
		model = self.model
		win_count = 0
		frames = []
		for epoch in range(nb_epoch):
			game.reset()
			self.clear_frames()
			S = self.get_game_data(game)
			if visualize:
				frames.append(game.draw())
			game_over = False
			while not game_over:
				if np.random.rand() < epsilon:
					print("random")
					action = int(np.random.randint(0, game.nb_actions))
				else:
					q = model.predict(S)
					action = int(np.argmax(q[0]))
				game.play(action)
				S = self.get_game_data(game)
				if visualize:
					frames.append(game.draw())
				game_over = game.is_over()
			if game.is_won():
				win_count += 1
		print("Accuracy {} %".format(100. * win_count / nb_epoch))
		if visualize:
			if 'images' not in os.listdir('.'):
				os.mkdir('images')
			for i in range(len(frames)):
				plt.imshow(frames[i], interpolation='none')
				plt.savefig("images/" + game.name + str(i) + ".png")
Example #14
0
File: main.py Project: Kaixhin/EC
    'test_rewards': [],
    'test_Qs': []
}

# Environment
env = AtariEnv(args)
env.train()

# Agent and memory
if args.algorithm == 'MFEC':
    agent = MFECAgent(args, env.observation_space.shape, env.action_space.n,
                      env.hash_space.shape[0])
elif args.algorithm == 'NEC':
    agent = NECAgent(args, env.observation_space.shape, env.action_space.n,
                     env.hash_space.shape[0])
    mem = ExperienceReplay(args.memory_capacity, env.observation_space.shape,
                           args.device)

# Construct validation memory
val_mem = ExperienceReplay(args.evaluation_size, env.observation_space.shape,
                           args.device)
T, done, states = 0, True, []  # Store transition data in episodic buffers
while T < args.evaluation_size:
    if done:
        state, done = env.reset(), False
    states.append(
        state.cpu().numpy())  # Append transition data to episodic buffers
    state, _, done = env.step(env.action_space.sample())
    T += 1
val_mem.append_batch(np.stack(states),
                     np.zeros((args.evaluation_size, ), dtype=np.int64),
                     np.zeros((args.evaluation_size, ), dtype=np.float32))
Example #15
0
def train(args: argparse.Namespace,
          env: Env,
          D: ExperienceReplay,
          models: Tuple[nn.Module, nn.Module, nn.Module, nn.Module],
          optimizer: Tuple[optim.Optimizer, optim.Optimizer],
          param_list: List[nn.parameter.Parameter],
          planner: nn.Module):
    # auxiliary tensors
    global_prior = Normal(
        torch.zeros(args.batch_size, args.state_size, device=args.device),
        torch.ones(args.batch_size, args.state_size, device=args.device)
    )  # Global prior N(0, I)
    # Allowed deviation in KL divergence
    free_nats = torch.full((1, ), args.free_nats, dtype=torch.float32, device=args.device)
    summary_writter = SummaryWriter(args.tensorboard_dir)

    # unpack models
    transition_model, observation_model, reward_model, encoder = models
    transition_optimizer, reward_optimizer = optimizer

    for idx_episode in trange(args.episodes, leave=False, desc="Episode"):
        for idx_train in trange(args.collect_interval, leave=False, desc="Training"):
            # Draw sequence chunks {(o[t], a[t], r[t+1], z[t+1])} ~ D uniformly at random from the dataset
            # The first two dimensions of the tensors are L (chunk size) and n (batch size)
            # We want to use o[t+1] to correct the error of the transition model,
            # so we need to convert the sequence to {(o[t+1], a[t], r[t+1], z[t+1])}
            observations, actions, rewards_dist, rewards_coll, nonterminals = D.sample(args.batch_size, args.chunk_size)
            # Create initial belief and state for time t = 0
            init_belief = torch.zeros(args.batch_size, args.belief_size, device=args.device)
            init_state = torch.zeros(args.batch_size, args.state_size, device=args.device)
            # Transition model forward
            # deterministic: h[t+1] = f(h[t], a[t])
            # prior:         s[t+1] ~ Prob(s|h[t+1])
            # posterior:     s[t+1] ~ Prob(s|h[t+1], o[t+1])
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
                init_state,
                actions[:-1],
                init_belief,
                bottle(encoder, (observations[1:], )),
                nonterminals[:-1]
            )
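            # Assumed tensor shapes (L = chunk size, n = batch size): beliefs and
            # the prior/posterior tensors come back as [L-1, n, belief_size] and
            # [L-1, n, state_size], aligned with observations[1:] used below.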

            # observation loss
            predictions = bottle(observation_model, (beliefs, posterior_states))
            visual_loss = F.mse_loss(
                predictions[:, :, :3*64*64],
                observations[1:, :, :3*64*64]
            ).mean()
            symbol_loss = F.mse_loss(
                predictions[:, :, 3*64*64:],
                observations[1:, :, 3*64*64:]
            ).mean()
            observation_loss = visual_loss + symbol_loss

            # KL divergence loss. Minimize the difference between posterior and prior
            kl_loss = torch.max(
                kl_divergence(
                    Normal(posterior_means, posterior_std_devs),
                    Normal(prior_means, prior_std_devs)
                ).sum(dim=2),
                free_nats
            ).mean(dim=(0, 1))  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
            if args.global_kl_beta != 0:
                kl_loss += args.global_kl_beta * kl_divergence(
                    Normal(posterior_means, posterior_std_devs),
                    global_prior
                ).sum(dim=2).mean(dim=(0, 1))

            # overshooting loss
            if args.overshooting_kl_beta != 0:
                overshooting_vars = []  # Collect variables for overshooting to process in batch
                for t in range(1, args.chunk_size - 1):
                    d = min(t + args.overshooting_distance, args.chunk_size - 1)  # Overshooting distance
                    # Use t_ and d_ to deal with different time indexing for latent states
                    t_, d_ = t - 1, d - 1
                    # Calculate sequence padding so overshooting terms can be calculated in one batch
                    seq_pad = (0, 0, 0, 0, 0, t - d + args.overshooting_distance)
                    # Store
                    # * a[t:d],
                    # * z[t+1:d+1]
                    # * r[t+1:d+1]
                    # * h[t]
                    # * s[t] prior
                    # * E[s[t:d]] posterior
                    # * Var[s[t:d]] posterior
                    # * mask:
                    #       the last few sequences do not have enough length,
                    #       so we pad it with 0 to the same length as previous sequence for batch operation,
                    #       and use mask to indicate invalid variables.
                    overshooting_vars.append(
                        (F.pad(actions[t:d], seq_pad),
                         F.pad(nonterminals[t:d], seq_pad),
                         F.pad(rewards_dist[t:d], seq_pad[2:]),
                         beliefs[t_],
                         prior_states[t_],
                         F.pad(posterior_means[t_ + 1:d_ + 1].detach(), seq_pad),
                         F.pad(posterior_std_devs[t_ + 1:d_ + 1].detach(), seq_pad, value=1),
                         F.pad(torch.ones(d - t, args.batch_size, args.state_size, device=args.device), seq_pad)
                         )
                    )  # Posterior standard deviations must be padded with > 0 to prevent infinite KL divergences

                overshooting_vars = tuple(zip(*overshooting_vars))
                # Update belief/state using prior from previous belief/state and previous action (over entire sequence at once)
                beliefs, prior_states, prior_means, prior_std_devs = transition_model(
                    torch.cat(overshooting_vars[4], dim=0),
                    torch.cat(overshooting_vars[0], dim=1),
                    torch.cat(overshooting_vars[3], dim=0),
                    None,
                    torch.cat(overshooting_vars[1], dim=1)
                )
                seq_mask = torch.cat(overshooting_vars[7], dim=1)
                # Calculate overshooting KL loss with sequence mask
                kl_loss += (1 / args.overshooting_distance) * args.overshooting_kl_beta * torch.max(
                    (kl_divergence(
                        Normal(torch.cat(overshooting_vars[5], dim=1), torch.cat(overshooting_vars[6], dim=1)),
                        Normal(prior_means, prior_std_devs)
                    ) * seq_mask).sum(dim=2),
                    free_nats
                ).mean(dim=(0, 1)) * (args.chunk_size - 1)  # Update KL loss (compensating for extra average over each overshooting/open loop sequence)

            # TODO: add learning rate schedule
            # Update model parameters
            transition_optimizer.zero_grad()
            loss = observation_loss * 200 + kl_loss
            loss.backward()
            nn.utils.clip_grad_norm_(param_list, args.grad_clip_norm, norm_type=2)
            transition_optimizer.step()

            # reward loss
            rewards_dist_predict, rewards_coll_predict = bottle(reward_model.raw, (beliefs.detach(), posterior_states.detach()))
            reward_loss = F.mse_loss(
                rewards_dist_predict,
                rewards_dist[:-1],
                reduction='mean'
            ) + F.binary_cross_entropy(
                rewards_coll_predict,
                rewards_coll[:-1],
                reduction='mean'
            )
            reward_optimizer.zero_grad()
            reward_loss.backward()
            reward_optimizer.step()

            # add tensorboard log
            global_step = idx_train + idx_episode * args.collect_interval
            summary_writter.add_scalar("observation_loss", observation_loss, global_step)
            summary_writter.add_scalar("reward_loss", reward_loss, global_step)
            summary_writter.add_scalar("kl_loss", kl_loss, global_step)

        for idx_collect in trange(1, leave=False, desc="Collecting"):
            experience = collect_experience(args, env, models, planner, True, desc="Collecting experience {}".format(idx_collect))
            T = len(experience["observation"])
            for idx_step in range(T):
                D.append(experience["observation"][idx_step],
                         experience["action"][idx_step],
                         experience["reward_dist"][idx_step],
                         experience["reward_coll"][idx_step],
                         experience["done"][idx_step])

        # Checkpoint models
        if (idx_episode + 1) % args.checkpoint_interval == 0:
            record_path = os.path.join(args.checkpoint_dir, "checkpoint")
            checkpoint_path = os.path.join(args.checkpoint_dir, 'models_%d.pth' % (idx_episode+1))
            torch.save(
                {
                    'transition_model': transition_model.state_dict(),
                    'observation_model': observation_model.state_dict(),
                    'reward_model': reward_model.state_dict(),
                    'encoder': encoder.state_dict(),
                    'transition_optimizer': transition_optimizer.state_dict(),
                    'reward_optimizer': reward_optimizer.state_dict()
                },
                checkpoint_path)
            with open(record_path, "w") as f:
                f.write('models_%d.pth' % (idx_episode+1))
            planner.save(os.path.join(args.torchscript_dir, "mpc_planner.pth"))
            transition_model.save(os.path.join(args.torchscript_dir, "transition_model.pth"))
            reward_model.save(os.path.join(args.torchscript_dir, "reward_model.pth"))
            observation_model.save(os.path.join(args.torchscript_dir, "observation_decoder.pth"))
            encoder.save(os.path.join(args.torchscript_dir, "observation_encoder.pth"))

    summary_writter.close()
Example #16
0
class Initializer():
    def __init__(self):
        self.parms = Parameters()
        self.results_dir = os.path.join(self.parms.results_path)
        self.dataset_path = os.path.join(self.parms.results_path, 'dataset/')
        os.makedirs(self.dataset_path, exist_ok=True)
        self.metrics = {
            'steps': [],
            'episodes': [],
            'train_rewards': [],
            'predicted_rewards': [],
            'test_episodes': [],
            'test_rewards': [],
            'observation_loss': [],
            'reward_loss': [],
            'kl_loss': [],
            'regularizer_loss': []
        }
        os.makedirs(self.results_dir, exist_ok=True)

        ## Setting cuda options
        if torch.cuda.is_available() and self.parms.use_cuda:
            self.parms.device = torch.device('cuda')
            torch.cuda.set_device(self.parms.gpu_id)
            print("Using gpu: ", torch.cuda.current_device())
        else:
            self.parms.device = torch.device('cpu')
            self.use_cuda = False
            print("Work on: ", self.parms.device)

        # Initialise the experience replay buffer
        self.env = ControlSuiteEnv(self.parms.env_name, self.parms.seed,
                                   self.parms.max_episode_length,
                                   self.parms.bit_depth)
        self.D = ExperienceReplay(self.parms.ex_replay_buff_size,
                                  self.env.observation_size,
                                  self.env.action_size, self.parms.bit_depth,
                                  self.parms.device)

        if self.parms.seed > 0:
            self.set_seed()

        self.trainer = Trainer(self.parms, self.D, self.metrics,
                               self.results_dir, self.env)
        self.init_exp_rep()

        # Start Training
        print("Total training episodes: ", self.parms.training_episodes,
              " Buffer sampling: ", self.parms.collect_interval)
        self.trainer.train_models()
        print("END.")

    def set_seed(self):
        print("Setting seed")
        os.environ['PYTHONHASHSEED'] = str(self.parms.seed)
        random.seed(self.parms.seed)
        np.random.seed(self.parms.seed)
        torch.manual_seed(self.parms.seed)
        if self.parms.use_cuda:
            torch.cuda.manual_seed(self.parms.seed)
            #torch.backends.cudnn.enabled=False      # This makes the training slower
            #torch.backends.cudnn.deterministic=True # This makes the training slower

    # Initialise the experience replay buffer
    def init_exp_rep(self):
        print("Starting initialization buffer.")
        for s in tqdm(range(1, self.parms.num_init_episodes + 1)):
            observation, done, t = self.env.reset(), False, 0
            while not done:
                action = self.env.sample_random_action()
                next_observation, reward, done = self.env.step(action)
                self.D.append(observation, action, reward, done)
                observation = next_observation
                t += 1
            self.metrics['steps'].append(t * self.env.action_repeat +
                                         (0 if len(self.metrics['steps']) ==
                                          0 else self.metrics['steps'][-1]))
            self.metrics['episodes'].append(s)
Example #17
0
class Dreamer(Agent):
    # The agent has its own replay buffer, update, act
    def __init__(self, args):
        """
    All paras are passed by args
    :param args: a dict that includes parameters
    """
        super().__init__()
        self.args = args
        # Initialise model parameters randomly
        self.transition_model = TransitionModel(
            args.belief_size, args.state_size, args.action_size,
            args.hidden_size, args.embedding_size,
            args.dense_act).to(device=args.device)

        self.observation_model = ObservationModel(
            args.symbolic,
            args.observation_size,
            args.belief_size,
            args.state_size,
            args.embedding_size,
            activation_function=(args.dense_act if args.symbolic else
                                 args.cnn_act)).to(device=args.device)

        self.reward_model = RewardModel(args.belief_size, args.state_size,
                                        args.hidden_size,
                                        args.dense_act).to(device=args.device)

        self.encoder = Encoder(args.symbolic, args.observation_size,
                               args.embedding_size,
                               args.cnn_act).to(device=args.device)

        self.actor_model = ActorModel(
            args.action_size,
            args.belief_size,
            args.state_size,
            args.hidden_size,
            activation_function=args.dense_act,
            fix_speed=args.fix_speed,
            throttle_base=args.throttle_base).to(device=args.device)

        self.value_model = ValueModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=args.device)

        self.value_model2 = ValueModel(args.belief_size, args.state_size,
                                       args.hidden_size,
                                       args.dense_act).to(device=args.device)

        self.pcont_model = PCONTModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=args.device)

        self.target_value_model = deepcopy(self.value_model)
        self.target_value_model2 = deepcopy(self.value_model2)

        for p in self.target_value_model.parameters():
            p.requires_grad = False
        for p in self.target_value_model2.parameters():
            p.requires_grad = False

        # setup the paras to update
        self.world_param = list(self.transition_model.parameters())\
                          + list(self.observation_model.parameters())\
                          + list(self.reward_model.parameters())\
                          + list(self.encoder.parameters())
        if args.pcont:
            self.world_param += list(self.pcont_model.parameters())

        # setup optimizer
        self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr)
        self.actor_optimizer = optim.Adam(self.actor_model.parameters(),
                                          lr=args.actor_lr)
        self.value_optimizer = optim.Adam(list(self.value_model.parameters()) +
                                          list(self.value_model2.parameters()),
                                          lr=args.value_lr)

        # set up the free-nats threshold
        self.free_nats = torch.full(
            (1, ), args.free_nats, dtype=torch.float32,
            device=args.device)  # Allowed deviation in KL divergence

        # TODO: change it to the new replay buffer, in buffer.py
        self.D = ExperienceReplay(args.experience_size, args.symbolic,
                                  args.observation_size, args.action_size,
                                  args.bit_depth, args.device)

        if self.args.auto_temp:
            # setup for learning of alpha term (temp of the entropy term)
            self.log_temp = torch.zeros(1,
                                        requires_grad=True,
                                        device=args.device)
            self.target_entropy = -np.prod(
                args.action_size if not args.fix_speed else self.args.
                action_size - 1).item()  # heuristic value from SAC paper
            self.temp_optimizer = optim.Adam(
                [self.log_temp], lr=args.value_lr)  # use the same value_lr

        # TODO: print out the param used in Dreamer
        # var_counts = tuple(count_vars(module) for module in [self., self.ac.q1, self.ac.q2])
        # print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts)

    # def process_im(self, image, image_size=None, rgb=None):
    #   # Resize, put channel first, convert it to a tensor, centre it to [-0.5, 0.5] and add batch dimenstion.
    #
    #   def preprocess_observation_(observation, bit_depth):
    #     # Preprocesses an observation inplace (from float32 Tensor [0, 255] to [-0.5, 0.5])
    #     observation.div_(2 ** (8 - bit_depth)).floor_().div_(2 ** bit_depth).sub_(
    #       0.5)  # Quantise to given bit depth and centre
    #     observation.add_(torch.rand_like(observation).div_(
    #       2 ** bit_depth))  # Dequantise (to approx. match likelihood of PDF of continuous images vs. PMF of discrete images)
    #
    #   image = image[40:, :, :]  # clip the above 40 rows
    #   image = torch.tensor(cv2.resize(image, (40, 40), interpolation=cv2.INTER_LINEAR).transpose(2, 0, 1),
    #                         dtype=torch.float32)  # Resize and put channel first
    #
    #   preprocess_observation_(image, self.args.bit_depth)
    #   return image.unsqueeze(dim=0)
    def process_im(self, images, image_size=None, rgb=None):
        images = cv2.resize(images, (40, 40))
        images = np.dot(images, [0.299, 0.587, 0.114])
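        # The dot product with [0.299, 0.587, 0.114] is the standard RGB-to-grayscale
        # luminance conversion (this assumes `images` arrives as an RGB array).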
        obs = torch.tensor(images,
                           dtype=torch.float32).div_(255.).sub_(0.5).unsqueeze(
                               dim=0)  # shape [1, 40, 40], range:[-0.5,0.5]
        return obs.unsqueeze(dim=0)  # add batch dimension

    def append_buffer(self, new_traj):
        # append a newly collected trajectory (no data augmentation is applied)
        # shape of new_traj: [(o, a, r, d) * steps]
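        # e.g. (hypothetical call): agent.append_buffer([(obs, act, rew, done), ...])
        # with one tuple per environment step of the newly collected trajectory.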
        for state in new_traj:
            observation, action, reward, done = state
            self.D.append(observation, action.cpu(), reward, done)

    def _compute_loss_world(self, state, data):
        # unpackage data
        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = state
        observations, rewards, nonterminals = data

        # observation_loss = F.mse_loss(
        #   bottle(self.observation_model, (beliefs, posterior_states)),
        #   observations[1:],
        #   reduction='none').sum(dim=2 if self.args.symbolic else (2, 3, 4)).mean(dim=(0, 1))
        #
        # reward_loss = F.mse_loss(
        #   bottle(self.reward_model, (beliefs, posterior_states)),
        #   rewards[1:],
        #   reduction='none').mean(dim=(0,1))

        observation_loss = F.mse_loss(
            bottle(self.observation_model, (beliefs, posterior_states)),
            observations,
            reduction='none').sum(
                dim=2 if self.args.symbolic else (2, 3, 4)).mean(dim=(0, 1))

        reward_loss = F.mse_loss(bottle(self.reward_model,
                                        (beliefs, posterior_states)),
                                 rewards,
                                 reduction='none').mean(dim=(0, 1))  # TODO: 5

        # transition loss
        kl_loss = torch.max(
            kl_divergence(
                Independent(Normal(posterior_means, posterior_std_devs), 1),
                Independent(Normal(prior_means, prior_std_devs), 1)),
            self.free_nats).mean(dim=(0, 1))

        # print("check the reward", bottle(pcont_model, (beliefs, posterior_states)).shape, nonterminals[:-1].shape)
        if self.args.pcont:
            pcont_loss = F.binary_cross_entropy(
                bottle(self.pcont_model, (beliefs, posterior_states)),
                nonterminals)
            # pcont_pred = torch.distributions.Bernoulli(logits=bottle(self.pcont_model, (beliefs, posterior_states)))
            # pcont_loss = -pcont_pred.log_prob(nonterminals[1:]).mean(dim=(0, 1))

        return observation_loss, self.args.reward_scale * reward_loss, kl_loss, (
            self.args.pcont_scale * pcont_loss if self.args.pcont else 0)

    def _compute_loss_actor(self,
                            imag_beliefs,
                            imag_states,
                            imag_ac_logps=None):
        # reward and value prediction of imagined trajectories
        imag_rewards = bottle(self.reward_model, (imag_beliefs, imag_states))
        imag_values = bottle(self.value_model, (imag_beliefs, imag_states))
        imag_values2 = bottle(self.value_model2, (imag_beliefs, imag_states))
        imag_values = torch.min(imag_values, imag_values2)

        with torch.no_grad():
            if self.args.pcont:
                pcont = bottle(self.pcont_model, (imag_beliefs, imag_states))
            else:
                pcont = self.args.discount * torch.ones_like(imag_rewards)
        pcont = pcont.detach()

        if imag_ac_logps is not None:
            imag_values[
                1:] -= self.args.temp * imag_ac_logps  # add entropy here

        returns = cal_returns(imag_rewards[:-1],
                              imag_values[:-1],
                              imag_values[-1],
                              pcont[:-1],
                              lambda_=self.args.disclam)
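        # cal_returns is assumed to compute Dreamer-style lambda-returns, roughly
        #   V[t] = r[t] + pcont[t] * ((1 - lambda) * v[t+1] + lambda * V[t+1]),
        # bootstrapping from imag_values[-1] at the imagination horizon.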

        discount = torch.cumprod(
            torch.cat([torch.ones_like(pcont[:1]), pcont[:-2]], 0), 0)
        discount = discount.detach()

        assert list(discount.size()) == list(returns.size())
        actor_loss = -torch.mean(discount * returns)
        return actor_loss

    def _compute_loss_critic(self,
                             imag_beliefs,
                             imag_states,
                             imag_ac_logps=None):

        with torch.no_grad():
            # calculate the target with the target nn
            target_imag_values = bottle(self.target_value_model,
                                        (imag_beliefs, imag_states))
            target_imag_values2 = bottle(self.target_value_model2,
                                         (imag_beliefs, imag_states))
            target_imag_values = torch.min(target_imag_values,
                                           target_imag_values2)
            imag_rewards = bottle(self.reward_model,
                                  (imag_beliefs, imag_states))

            if self.args.pcont:
                pcont = bottle(self.pcont_model, (imag_beliefs, imag_states))
            else:
                pcont = self.args.discount * torch.ones_like(imag_rewards)

        # print("check pcont", pcont)
            if imag_ac_logps is not None:
                target_imag_values[1:] -= self.args.temp * imag_ac_logps

        returns = cal_returns(imag_rewards[:-1],
                              target_imag_values[:-1],
                              target_imag_values[-1],
                              pcont[:-1],
                              lambda_=self.args.disclam)
        target_return = returns.detach()

        value_pred = bottle(self.value_model, (imag_beliefs, imag_states))[:-1]
        value_pred2 = bottle(self.value_model2,
                             (imag_beliefs, imag_states))[:-1]

        value_loss = F.mse_loss(value_pred, target_return,
                                reduction="none").mean(dim=(0, 1))
        value_loss2 = F.mse_loss(value_pred2, target_return,
                                 reduction="none").mean(dim=(0, 1))
        value_loss += value_loss2

        return value_loss

    def _latent_imagination(self,
                            beliefs,
                            posterior_states,
                            with_logprob=False):
        # Rollout to generate imagined trajectories

        chunk_size, batch_size, _ = list(
            posterior_states.size())  # flatten the tensor
        flatten_size = chunk_size * batch_size

        posterior_states = posterior_states.detach().reshape(flatten_size, -1)
        beliefs = beliefs.detach().reshape(flatten_size, -1)

        imag_beliefs, imag_states, imag_ac_logps = [beliefs
                                                    ], [posterior_states], []

        for i in range(self.args.planning_horizon):
            imag_action, imag_ac_logp = self.actor_model(
                imag_beliefs[-1].detach(),
                imag_states[-1].detach(),
                deterministic=False,
                with_logprob=with_logprob,
            )
            imag_action = imag_action.unsqueeze(dim=0)  # add time dim

            # print(imag_states[-1].shape, imag_action.shape, imag_beliefs[-1].shape)
            imag_belief, imag_state, _, _ = self.transition_model(
                imag_states[-1], imag_action, imag_beliefs[-1])
            imag_beliefs.append(imag_belief.squeeze(dim=0))
            imag_states.append(imag_state.squeeze(dim=0))
            if with_logprob:
                imag_ac_logps.append(imag_ac_logp.squeeze(dim=0))

        imag_beliefs = torch.stack(imag_beliefs, dim=0).to(
            self.args.device
        )  # shape [horizon+1, (chunk-1)*batch, belief_size]
        imag_states = torch.stack(imag_states, dim=0).to(self.args.device)
        if with_logprob:
            imag_ac_logps = torch.stack(imag_ac_logps, dim=0).to(
                self.args.device)  # shape [horizon, (chunk-1)*batch]

        return imag_beliefs, imag_states, imag_ac_logps if with_logprob else None

    def update_parameters(self, gradient_steps):
        loss_info = []  # used to record loss
        for s in tqdm(range(gradient_steps)):
            # get state and belief of samples
            observations, actions, rewards, nonterminals = self.D.sample(
                self.args.batch_size, self.args.chunk_size)
            # print("check sampled rewrads", rewards)
            init_belief = torch.zeros(self.args.batch_size,
                                      self.args.belief_size,
                                      device=self.args.device)
            init_state = torch.zeros(self.args.batch_size,
                                     self.args.state_size,
                                     device=self.args.device)

            # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
            # beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model(
            #   init_state,
            #   actions[:-1],
            #   init_belief,
            #   bottle(self.encoder, (observations[1:], )),
            #   nonterminals[:-1])

            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model(
                init_state, actions, init_belief,
                bottle(self.encoder, (observations, )),
                nonterminals)  # TODO: 4

            # update paras of world model
            world_model_loss = self._compute_loss_world(
                state=(beliefs, prior_states, prior_means, prior_std_devs,
                       posterior_states, posterior_means, posterior_std_devs),
                data=(observations, rewards, nonterminals))
            observation_loss, reward_loss, kl_loss, pcont_loss = world_model_loss
            self.world_optimizer.zero_grad()
            (observation_loss + reward_loss + kl_loss + pcont_loss).backward()
            nn.utils.clip_grad_norm_(self.world_param,
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.world_optimizer.step()

            # freeze params to save memory
            for p in self.world_param:
                p.requires_grad = False
            for p in self.value_model.parameters():
                p.requires_grad = False
            for p in self.value_model2.parameters():
                p.requires_grad = False

            # latent imagination
            imag_beliefs, imag_states, imag_ac_logps = self._latent_imagination(
                beliefs, posterior_states, with_logprob=self.args.with_logprob)

            # update temp
            if self.args.auto_temp:
                temp_loss = -(
                    self.log_temp *
                    (imag_ac_logps[0] + self.target_entropy).detach()).mean()
                self.temp_optimizer.zero_grad()
                temp_loss.backward()
                self.temp_optimizer.step()
                self.args.temp = self.log_temp.exp()

            # update actor
            actor_loss = self._compute_loss_actor(imag_beliefs,
                                                  imag_states,
                                                  imag_ac_logps=imag_ac_logps)

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            nn.utils.clip_grad_norm_(self.actor_model.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.actor_optimizer.step()

            for p in self.world_param:
                p.requires_grad = True
            for p in self.value_model.parameters():
                p.requires_grad = True
            for p in self.value_model2.parameters():
                p.requires_grad = True

            # update critic
            imag_beliefs = imag_beliefs.detach()
            imag_states = imag_states.detach()

            critic_loss = self._compute_loss_critic(
                imag_beliefs, imag_states, imag_ac_logps=imag_ac_logps)

            self.value_optimizer.zero_grad()
            critic_loss.backward()
            nn.utils.clip_grad_norm_(self.value_model.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            nn.utils.clip_grad_norm_(self.value_model2.parameters(),
                                     self.args.grad_clip_norm,
                                     norm_type=2)
            self.value_optimizer.step()

            loss_info.append([
                observation_loss.item(),
                reward_loss.item(),
                kl_loss.item(),
                pcont_loss.item() if self.args.pcont else 0,
                actor_loss.item(),
                critic_loss.item()
            ])

        # finally, update target value function every #gradient_steps
        with torch.no_grad():
            self.target_value_model.load_state_dict(
                self.value_model.state_dict())
        with torch.no_grad():
            self.target_value_model2.load_state_dict(
                self.value_model2.state_dict())

        return loss_info

    def infer_state(self, observation, action, belief=None, state=None):
        """ Infer belief over current state q(s_t|o≤t,a<t) from the history,
        return updated belief and posterior_state at time t
        returned shape: belief/state [belief/state_dim] (remove the time_dim)
    """
        # observation is obs.to(device), action.shape=[act_dim] (will add time dim inside this fn), belief.shape
        belief, _, _, _, posterior_state, _, _ = self.transition_model(
            state, action.unsqueeze(dim=0), belief,
            self.encoder(observation).unsqueeze(
                dim=0))  # Action and observation need extra time dimension

        belief, posterior_state = belief.squeeze(
            dim=0), posterior_state.squeeze(
                dim=0)  # Remove time dimension from belief/state

        return belief, posterior_state

    def select_action(self, state, deterministic=False):
        # get action with the inputs get from fn: infer_state; return a numpy with shape [batch, act_size]
        belief, posterior_state = state
        action, _ = self.actor_model(belief,
                                     posterior_state,
                                     deterministic=deterministic,
                                     with_logprob=False)
        if not deterministic and not self.args.with_logprob:
            print("e")
            action = Normal(action, self.args.expl_amount).rsample()

            # clip the angle
            action[:, 0].clamp_(min=self.args.angle_min,
                                max=self.args.angle_max)
            # clip the throttle
            if self.args.fix_speed:
                action[:, 1] = self.args.throttle_base
            else:
                action[:, 1].clamp_(min=self.args.throttle_min,
                                    max=self.args.throttle_max)
        print("action", action)
        # return action.cpu().numpy()
        return action  # this is a CUDA tensor

    def import_parameters(self, params):
        # only import or export the parameters used when local rollout
        self.encoder.load_state_dict(params["encoder"])
        self.actor_model.load_state_dict(params["policy"])
        self.transition_model.load_state_dict(params["transition"])

    def export_parameters(self):
        """ return the model paras used for local rollout """
        params = {
            "encoder": self.encoder.cpu().state_dict(),
            "policy": self.actor_model.cpu().state_dict(),
            "transition": self.transition_model.cpu().state_dict()
        }

        self.encoder.to(self.args.device)
        self.actor_model.to(self.args.device)
        self.transition_model.to(self.args.device)

        return params
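
# --- Usage sketch (not part of the original example) ---
# A minimal illustration of how export_parameters / import_parameters could be
# used to sync weights from a learner agent to a local rollout agent before
# calling infer_state / select_action each environment step.  `learner_agent`
# and `rollout_agent` are assumed instances of the class above; nothing here is
# taken from the original script.
def _sync_for_rollout(learner_agent, rollout_agent):
    params = learner_agent.export_parameters()   # encoder/policy/transition state_dicts on CPU
    rollout_agent.import_parameters(params)      # loaded onto the rollout agent's device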
Ejemplo n.º 18
0
    def __init__(self, args):
        """
    All paras are passed by args
    :param args: a dict that includes parameters
    """
        super().__init__()
        self.args = args
        # Initialise model parameters randomly
        self.transition_model = TransitionModel(
            args.belief_size, args.state_size, args.action_size,
            args.hidden_size, args.embedding_size,
            args.dense_act).to(device=args.device)

        self.observation_model = ObservationModel(
            args.symbolic,
            args.observation_size,
            args.belief_size,
            args.state_size,
            args.embedding_size,
            activation_function=(args.dense_act if args.symbolic else
                                 args.cnn_act)).to(device=args.device)

        self.reward_model = RewardModel(args.belief_size, args.state_size,
                                        args.hidden_size,
                                        args.dense_act).to(device=args.device)

        self.encoder = Encoder(args.symbolic, args.observation_size,
                               args.embedding_size,
                               args.cnn_act).to(device=args.device)

        self.actor_model = ActorModel(
            args.action_size,
            args.belief_size,
            args.state_size,
            args.hidden_size,
            activation_function=args.dense_act,
            fix_speed=args.fix_speed,
            throttle_base=args.throttle_base).to(device=args.device)

        self.value_model = ValueModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=args.device)

        self.value_model2 = ValueModel(args.belief_size, args.state_size,
                                       args.hidden_size,
                                       args.dense_act).to(device=args.device)

        self.pcont_model = PCONTModel(args.belief_size, args.state_size,
                                      args.hidden_size,
                                      args.dense_act).to(device=args.device)

        self.target_value_model = deepcopy(self.value_model)
        self.target_value_model2 = deepcopy(self.value_model2)

        for p in self.target_value_model.parameters():
            p.requires_grad = False
        for p in self.target_value_model2.parameters():
            p.requires_grad = False

        # collect the world-model parameters to update
        self.world_param = list(self.transition_model.parameters())\
                          + list(self.observation_model.parameters())\
                          + list(self.reward_model.parameters())\
                          + list(self.encoder.parameters())
        if args.pcont:
            self.world_param += list(self.pcont_model.parameters())

        # set up the optimizers
        self.world_optimizer = optim.Adam(self.world_param, lr=args.world_lr)
        self.actor_optimizer = optim.Adam(self.actor_model.parameters(),
                                          lr=args.actor_lr)
        self.value_optimizer = optim.Adam(list(self.value_model.parameters()) +
                                          list(self.value_model2.parameters()),
                                          lr=args.value_lr)

        # set up free nats: the allowed deviation in the KL divergence before it is penalised
        self.free_nats = torch.full(
            (1, ), args.free_nats, dtype=torch.float32,
            device=args.device)  # Allowed deviation in KL divergence

        # TODO: change it to the new replay buffer, in buffer.py
        self.D = ExperienceReplay(args.experience_size, args.symbolic,
                                  args.observation_size, args.action_size,
                                  args.bit_depth, args.device)

        if self.args.auto_temp:
            # setup for learning of alpha term (temp of the entropy term)
            self.log_temp = torch.zeros(1,
                                        requires_grad=True,
                                        device=args.device)
            self.target_entropy = -np.prod(
                args.action_size if not args.fix_speed else self.args.
                action_size - 1).item()  # heuristic value from SAC paper
            self.temp_optimizer = optim.Adam(
                [self.log_temp], lr=args.value_lr)  # use the same value_lr
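
# --- Free-nats sketch (not part of the original example) ---
# The update step that consumes `self.free_nats` is not shown in this snippet;
# in PlaNet/Dreamer-style code the tensor is usually applied as a lower clamp on
# the summed KL term, roughly as below.  The [time, batch, state_size] shape of
# the KL output is an assumption.
def _kl_with_free_nats(posterior_dist, prior_dist, free_nats):
    # Clamp the KL from below so there is no gradient once the divergence
    # is already smaller than the allowed "free" nats.
    kl = torch.distributions.kl.kl_divergence(posterior_dist, prior_dist).sum(dim=2)
    return torch.max(kl, free_nats).mean()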
Ejemplo n.º 19
0
class Agent:
    def __init__(self, model, memory=None, memory_size=500, nb_frames=None):
        assert len(
            model.get_output_shape_at(0)
        ) == 2, "Model's output shape should be (nb_samples, nb_actions)."
        if memory:
            self.memory = memory
        else:
            self.memory = ExperienceReplay(memory_size)
        if not nb_frames and not model.get_input_shape_at(0)[1]:
            raise Exception("Missing argument : nb_frames not provided")
        elif not nb_frames:
            nb_frames = model.get_input_shape_at(0)[1]
        elif model.get_input_shape_at(
                0
        )[1] and nb_frames and model.get_input_shape_at(0)[1] != nb_frames:
            raise Exception(
                "Dimension mismatch : time dimension of model should be equal to nb_frames."
            )
        self.model = model
        self.nb_frames = nb_frames
        self.frames = None

    @property
    def memory_size(self):
        return self.memory.memory_size

    @memory_size.setter
    def memory_size(self, value):
        self.memory.memory_size = value

    def reset_memory(self):
        self.memory.reset_memory()  # note: the attribute is self.memory, not self.exp_replay

    def check_game_compatibility(self, game):
        #if len(self.model.input_layers_node_indices) != 1:
        #raise Exception('Multi node input is not supported.')
        game_output_shape = (1, None) + game.get_frame().shape
        if len(game_output_shape) != len(self.model.get_input_shape_at(0)):
            raise Exception(
                'Dimension mismatch. Input shape of the model should be compatible with the game.'
            )
        else:
            for i in range(len(self.model.get_input_shape_at(0))):
                if self.model.get_input_shape_at(0)[i] and game_output_shape[
                        i] and self.model.get_input_shape_at(
                            0)[i] != game_output_shape[i]:
                    raise Exception(
                        'Dimension mismatch. Input shape of the model should be compatible with the game.'
                    )
        if len(
                self.model.get_output_shape_at(0)
        ) != 2 or self.model.get_output_shape_at(0)[1] != game.nb_actions:
            raise Exception(
                'Output shape of model should be (nb_samples, nb_actions).')

    def get_game_data(self, game):
        frame = game.get_frame()
        if self.frames is None:
            self.frames = [frame] * self.nb_frames
        else:
            self.frames.append(frame)
            self.frames.pop(0)
        return np.expand_dims(self.frames, 0)

    def clear_frames(self):
        self.frames = None

    def train(self,
              game,
              nb_epoch=1000,
              batch_size=50,
              gamma=0.9,
              epsilon=[1., .1],
              epsilon_rate=0.5,
              reset_memory=False,
              observe=0,
              checkpoint=None,
              total_sessions=0,
              session_id=1):
        self.check_game_compatibility(game)

        ts = int(time.time())
        #fn = "gold-{}.csv".format(ts)

        #fn = "9nyc-250-1000-epr8-heat-adam.csv"
        #fn = "400-rl-nopool.csv"
        fn = "3-normal.csv"
        fn2 = "heat.csv"
        #advice_type = "OA"
        advice_type = "OA"
        meta_advice_type = "HFHA"
        #meta_feedback_frequency = 0.1
        #meta_feedback_frequency = 0.5 #HF!!!
        meta_feedback_frequency = 0.1  #LF!!!

        heatmap = [[0] * 20 for i in range(20)]

        if session_id == 1:
            advice_type = "OA"
        if session_id == 2:
            advice_type = "NA"
        if session_id == 3:
            advice_type = "RL"
        # print(heatmap)
        # with open("dummyheat.csv",'a') as f2:
        # 	csvWriter = csv.writer(f2,delimiter=',')
        # 	csvWriter.writerows(heatmap)
        # if ( session_id >= 3 and session_id < 5 ):
        # 	print("Switching to HFLA")
        # 	meta_advice_type = "HFLA"
        # 	#meta_feedback_frequency = 0.1
        # elif ( session_id >= 5 and session_id < 7 ):
        # 	print("Switching to LFHA")
        # 	meta_feedback_frequency = 0.1
        # 	meta_advice_type = "LFHA"
        # elif ( session_id >= 7 and session_id < 9 ):
        # 	print("Switching to LFLA")
        # 	meta_advice_type = "LFLA"
        # elif ( session_id >= 9 and session_id < 11 ):
        # 	advice_type = "OA"
        # 	print("Switching to NA HFLA")
        # 	meta_advice_type = "HFLA"
        # 	meta_feedback_frequency = 0.5
        # elif ( session_id >= 11 and session_id < 13 ):
        # 	print("Switching to NA HFLA")
        # 	meta_advice_type = "HFLA"
        # 	#meta_feedback_frequency = 0.1
        # elif ( session_id >= 13 and session_id < 15 ):
        # 	print("Switching to NA LFHA")
        # 	meta_feedback_frequency = 0.1
        # 	meta_advice_type = "LFHA"
        # elif ( session_id >= 15 and session_id < 17 ):
        # 	print("Switching to NA LFLA")
        # 	meta_advice_type = "LFLA"

        # if ( session_id >= 2 and session_id < 3 ):
        # 	meta_feedback_frequency = 0.1
        # 	print("Switching to LFHA")
        # 	advice_type = "OA"
        # 	meta_advice_type = "LFHA"
        # 	meta_feedback_frequency = 0.1
        # elif ( session_id >= 3 and session_id < 4 ):
        # 	advice_type = "NA"
        # 	print("Switching to NA LFHA")
        # 	meta_feedback_frequency = 0.1
        # 	meta_advice_type = "LFHA"
        # elif ( session_id >= 4 and session_id < 5 ):
        # 	print("Switching to NA LFLA")
        # 	meta_feedback_frequency = 0.1
        # 	advice_type = "NA"
        # 	meta_advice_type = "LFLA"
        # elif ( session_id >= 5 and session_id < 6 ):
        # 	advice_type = "OA"
        # 	print("Switching to OA HFHA")
        # 	meta_advice_type = "HFHA"
        # 	meta_feedback_frequency = 0.5
        # elif ( session_id >= 6 and session_id < 7 ):
        # 	advice_type = "NA"
        # 	meta_feedback_frequency = 0.5
        # 	print("Switching to NA HFHA")
        # 	meta_advice_type = "HFHA"
        # 	meta_feedback_frequency = 0.5
        # elif ( session_id >= 7 and session_id < 8 ):
        # 	advice_type = "NA"
        # 	print("Switching to NA HFLA")
        # 	meta_feedback_frequency = 0.5
        # 	meta_advice_type = "HFLA"
        # elif ( session_id >= 8 and session_id < 9 ):
        # 	advice_type = "OA"
        # 	meta_feedback_frequency = 0.5
        # 	print("Switching to OA HFLA")
        # 	meta_advice_type = "HFLA"

        # if ( session_id >= 4 and session_id < 7 ):
        # 	#print("Switching to LFLA")
        # 	advice_type = "RL"
        # 	#meta_advice_type = "LFLA"
        # elif ( session_id >= 7 and session_id < 10 ):
        # 	# with open("1RLheat.csv",'a') as f2:
        # 	# 	csvWriter = csv.writer(f2,delimiter=',')
        # 	# 	csvWriter.writerows(heatmap)
        # 	# 	heatmap = [ [0]*20 for i in range(20)]
        # 	advice_type = "NA"
        # 	#print("Switching to LFHA")
        # 	#meta_feedback_frequency = 0.1
        # 	#meta_advice_type = "LFHA"
        # elif ( session_id >= 10 ):
        # 	# with open("1NAheat.csv",'a') as f2:
        # 	# 	csvWriter = csv.writer(f2,delimiter=',')
        # 	# 	csvWriter.writerows(heatmap)
        # 	# 	heatmap = [ [0]*20 for i in range(20)]
        # 	#print("Switching to LFLA")

        # 	#meta_advice_type = "LFLA"
        # 	advice_type = "NA"

        # with open(fn,'w') as f:
        # 	f.write('session_id,advice_type,time,epoch,frames,score,win_perc,loss'+'\n')
        # 	f.flush()
        # 	f.close()
        with open(fn, 'a') as f:
            total_frames = 0
            #f.write('session_id,advice_type,time,epoch,frames,score,win_perc,loss'+'\n')
            #f.flush()
            if type(epsilon) in {tuple, list}:
                delta = ((epsilon[0] - epsilon[1]) / (nb_epoch * epsilon_rate))
                final_epsilon = epsilon[1]
                epsilon = epsilon[0]
            else:
                final_epsilon = epsilon
            model = self.model
            nb_actions = model.get_output_shape_at(0)[-1]
            win_count = 0
            rolling_win_window = []
            max_obs_loss = -99999999999999999
            m_loss = -99999999
            for epoch in range(nb_epoch):
                lastAdviceStep = 0
                adviceGiven = 0
                adviceAttempts = 0
                modelActions = 0
                print(heatmap)
                loss = 0.
                game.reset()
                self.clear_frames()
                if reset_memory:
                    self.reset_memory()
                game_over = False
                S = self.get_game_data(game)
                savedModel = False
                while not game_over:
                    a = 0
                    if advice_type == "RL":
                        if np.random.random() < epsilon or epoch < observe:
                            a = int(np.random.randint(game.nb_actions))
                            #print("Random Action")
                        else:
                            q = model.predict(
                                S
                            )  #use the prediction confidence to determine whether to ask the player for help
                            qs = model.predict_classes(S)
                            #a = int(np.argmax(qs[0]))
                            #highest_conf = np.amax(q)
                            #print("Game Grid: {}".format(game.get_grid()))
                            #print("Highest MSE Confidence = {}".format(highest_conf))
                            #a = int(np.argmax(q[0]))
                            a = int(np.argmax(qs[0]))
                    if advice_type == "OA":
                        if np.random.random() < epsilon or epoch < observe:
                            a = int(np.random.randint(game.nb_actions))
                            #print("Random Action")
                        else:
                            q = model.predict(
                                S
                            )  #use the prediction confidence to determine whether to ask the player for help
                            qs = model.predict_classes(S)
                            #print(qs)
                            #print(q)
                            highest_loss = abs(np.amax(q))  #added ABS
                            lowest_loss = abs(np.amin(q))
                            #print(highest_loss)
                            #print("HighestLoss:{}".format(highest_loss))
                            if highest_loss > max_obs_loss and highest_loss != 0:
                                max_obs_loss = highest_loss
                                #print("MaxLoss:{}".format(highest_loss))
                            #inn = highest_loss / max_obs_loss
                            relative_cost = np.power(
                                lowest_loss / max_obs_loss, 0.5)
                            #print("RelCostA:{}".format(relative_cost))
                            if relative_cost < 1e-20:
                                relative_cost = 1e-20
                            relative_cost = -1 / (np.log(relative_cost) - 1)
                            #print("RelCostB:{}".format(relative_cost))
                            confidence_score_max = 1
                            confidence_score_min = 0.01
                            feedback_chance = confidence_score_min + (
                                confidence_score_max -
                                confidence_score_min) * relative_cost

                            if feedback_chance < 0.01:
                                feedback_chance = 0.01
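                            # relative_cost maps the normalised Q-spread
                            # (lowest |Q| / max observed |Q|, square-rooted, then
                            # passed through -1/(log(x) - 1)) into (0, 1];
                            # feedback_chance rescales that into [0.01, 1].  Note
                            # that the decision below keys off relative_cost
                            # (<= 0.25) and the random feedback-frequency gate
                            # rather than feedback_chance itself.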
                            #if feedback_chance < 0.1:
                            giveAdvice = False
                            if (random.random() < meta_feedback_frequency):
                                giveAdvice = True
                                adviceAttempts = adviceAttempts + 1
                            if (relative_cost <= 0.25 and game.stepsTaken >=
                                (lastAdviceStep + 10)) or giveAdvice == False:
                                #print("HC: {}".format(max_obs_loss))
                                modelActions = modelActions + 1
                                #print("Highest Loss: {} RC: {} POS: Q0:{}".format(highest_loss, relative_cost, q[0]))
                                a = int(np.argmax(qs[0]))
                            else:
                                if random.random() < .5 and (
                                        meta_advice_type == "HFLA"
                                        or meta_advice_type == "LFLA"):
                                    lastAdviceStep = game.stepsTaken
                                    a = int(np.random.randint(game.nb_actions))
                                    adviceGiven = adviceGiven + 1
                                    #print("Taking BAD Player Action")
                                else:
                                    lastAdviceStep = game.stepsTaken
                                    adviceGiven = adviceGiven + 1
                                    x = game.location[0]
                                    z = game.location[1]
                                    yaw = game.location[2]
                                    a = -1
                                    #print(yaw)
                                    if z <= 6:
                                        if x < 12:
                                            #print("Segment1")
                                            if yaw == 270:
                                                a = 0
                                            if yaw == 180:
                                                a = 1
                                            if yaw == 90:
                                                a = 3
                                            if yaw == 0:
                                                a = 2
                                        elif x > 15:
                                            #print("Segment2")
                                            if yaw == 90:
                                                a = 0
                                            if yaw == 180:
                                                a = 2
                                            if yaw == 0:
                                                a = 1
                                            if yaw == 270:
                                                a = 3
                                        else:
                                            #print("Segment3")
                                            if yaw == 0:
                                                a = 0
                                            if yaw == 270:
                                                a = 1
                                            if yaw == 90:
                                                a = 2
                                            if yaw == 180:
                                                a = 3
                                    elif (x >= 7) and ((z == 7) or (z == 8) or
                                                       (z == 9) or (z == 10) or
                                                       (z == 11) or (z == 12)):
                                        #print("Segment4")
                                        if yaw == 90:
                                            a = 0
                                        if yaw == 180:
                                            a = 2
                                        if yaw == 0:
                                            a = 1
                                        if yaw == 270:
                                            a = 3
                                    elif ((x < 7) and (x > 3)) and (
                                        (z == 7) or (z == 8) or (z == 9) or
                                        (z == 10) or (z == 11) or (z == 12)):
                                        if yaw == 0:
                                            a = 0
                                        if yaw == 270:
                                            a = 1
                                        if yaw == 90:
                                            a = 2
                                        if yaw == 180:
                                            a = 3
                                    elif ((x < 3)) and ((z == 7) or (z == 8) or
                                                        (z == 9) or
                                                        (z == 10) or
                                                        (z == 11) or
                                                        (z == 12)):
                                        if yaw == 0:
                                            a = 2
                                        if yaw == 270:
                                            a = 0
                                        if yaw == 180:
                                            a = 1
                                        if yaw == 90:
                                            a = 3
                                    elif (z == 14) or (z == 15):
                                        if yaw == 0:
                                            a = 0
                                        if yaw == 270:
                                            a = 1
                                        if yaw == 90:
                                            a = 2
                                        if yaw == 180:
                                            a = 3
                                    elif (z == 17) or (z == 16):
                                        #print("Segment6")
                                        if yaw == 270:
                                            a = 0
                                        if yaw == 180:
                                            a = 1
                                        if yaw == 0:
                                            a = 2
                                        if yaw == 90:
                                            a = 3
                                    elif (z > 17):
                                        #print("Segment6")
                                        if yaw == 270:
                                            a = 2
                                        if yaw == 180:
                                            a = 0
                                        if yaw == 0:
                                            a = 3
                                        if yaw == 90:
                                            a = 1
                                    else:
                                        a = int(
                                            np.random.randint(game.nb_actions))

                                    if a == -1:
                                        a = int(
                                            np.random.randint(game.nb_actions))
                                    # if z < 6 and x < 13:
                                    # 	print("Segment1")
                                    # 	if yaw == 270:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # elif z < 8 and x >= 13:
                                    # 	print("Segment2")
                                    # 	if yaw == 0:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # elif z >= 8 and x == 13:
                                    # 	print("Segment3")
                                    # 	if yaw == 90:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # elif z >= 8 and z<= 17 and x < 6:
                                    # 	print("Segment4")
                                    # 	if yaw == 0:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # elif z > 18 and x < 18:
                                    # 	print("Segment5")
                                    # 	if yaw == 270:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # else:
                                    # 	a = int(np.argmax(q[0]))

                                #print("Game Grid: {}".format(game.get_grid()))
                                #print("Highest MSE Confidence = {}".format(highest_conf))

                    if advice_type == "NA":
                        if np.random.random() < epsilon or epoch < observe:
                            a = int(np.random.randint(game.nb_actions))
                            game.play(a)
                            heatmap[game.location[0]][
                                game.location[1]] = heatmap[game.location[0]][
                                    game.location[1]] + 1
                            #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                            #f2.flush()
                            r = game.get_score()
                            S_prime = self.get_game_data(game)
                            game_over = game.is_over()
                            transition = [S, a, r, S_prime, game_over]
                            self.memory.remember(*transition)
                            S = S_prime
                            #print("Random Action")
                        else:
                            q = model.predict(
                                S
                            )  #use the prediction confidence to determine whether to ask the player for help
                            qs = model.predict_classes(S)
                            highest_loss = abs(np.amax(q))  #added ABS
                            lowest_loss = abs(np.amin(q))
                            #print("HighestLoss:{}".format(highest_loss))
                            if highest_loss > max_obs_loss and highest_loss != 0:
                                max_obs_loss = highest_loss
                                #print("MaxLoss:{}".format(highest_loss))
                            #inn = highest_loss / max_obs_loss
                            relative_cost = np.power(
                                lowest_loss / max_obs_loss, 0.5)
                            #print("RelCostA:{}".format(relative_cost))
                            if relative_cost < 1e-20:
                                relative_cost = 1e-20
                            relative_cost = -1 / (np.log(relative_cost) - 1)
                            #print("RelCostB:{}".format(relative_cost))
                            confidence_score_max = 1
                            confidence_score_min = 0.01
                            feedback_chance = confidence_score_min + (
                                confidence_score_max -
                                confidence_score_min) * relative_cost
                            #feedback_chance = random.random()
                            #print("Feedback Chance: {}".format(feedback_chance))
                            if feedback_chance < 0.01:
                                feedback_chance = 0.01
                            #if feedback_chance > meta_feedback_frequency:
                            #if feedback_chance < 0.1:
                            #print(relative_cost)
                            giveAdvice = False
                            if (random.random() < meta_feedback_frequency):
                                giveAdvice = True
                                adviceAttempts = adviceAttempts + 1
                            if (relative_cost <= 0.25 and game.stepsTaken >=
                                (lastAdviceStep + 10)) or giveAdvice == False:
                                #print("Taking Model Action")
                                #print("HC: {}".format(max_obs_loss))
                                #print("Confidence: {} RC: {}".format(feedback_chance, relative_cost))
                                modelActions = modelActions + 1
                                #a = int(np.argmin(q[0]))
                                a = int(np.argmax(qs[0]))
                                game.play(a)
                                heatmap[game.location[0]][
                                    game.location[1]] = heatmap[
                                        game.location[0]][game.location[1]] + 1
                                #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                                #f2.flush()
                                r = game.get_score()
                                S_prime = self.get_game_data(game)
                                game_over = game.is_over()
                                transition = [S, a, r, S_prime, game_over]
                                self.memory.remember(*transition)
                                S = S_prime
                            else:
                                #print("Taking Player Action")
                                if random.random() < .5 and (
                                        meta_advice_type == "HFLA"
                                        or meta_advice_type == "LFLA"):
                                    a = int(np.random.randint(game.nb_actions))
                                    adviceGiven = adviceGiven + 1
                                    game.play(a)
                                    heatmap[game.location[0]][game.location[
                                        1]] = heatmap[game.location[0]][
                                            game.location[1]] + 1
                                    lastAdviceStep = game.stepsTaken
                                    #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                                    #f2.flush()
                                    r = game.get_score()
                                    S_prime = self.get_game_data(game)
                                    game_over = game.is_over()
                                    transition = [S, a, r, S_prime, game_over]
                                    self.memory.remember(*transition)
                                    S = S_prime
                                    if game_over == False:
                                        #game.play(checkForBestMove(game.location[0],game.location[1],game.location[2]))
                                        a = int(
                                            np.random.randint(game.nb_actions))
                                        game.play(a)
                                        heatmap[game.location[0]][
                                            game.location[1]] = heatmap[
                                                game.location[0]][
                                                    game.location[1]] + 1
                                        #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                                        #f2.flush()
                                        r = game.get_score()
                                        S_prime = self.get_game_data(game)
                                        game_over = game.is_over()
                                        transition = [
                                            S, a, r, S_prime, game_over
                                        ]
                                        self.memory.remember(*transition)
                                        S = S_prime
                                        # if game_over == False:
                                        # 	game.play(checkForBestMove(game.location[0],game.location[1],game.location[2]))
                                        # 	heatmap[game.location[0]][game.location[1]] = heatmap[game.location[0]][game.location[1]] + 1
                                        # 	#f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                                        # 	#f2.flush()
                                        # 	r = game.get_score()
                                        # 	S_prime = self.get_game_data(game)
                                        # 	game_over = game.is_over()
                                        # 	transition = [S, a, r, S_prime, game_over]
                                        # 	self.memory.remember(*transition)
                                        # 	S = S_prime
                                    #print("Taking BAD Player Action")
                                else:
                                    adviceGiven = adviceGiven + 1
                                    lastAdviceStep = game.stepsTaken
                                    x = game.location[0]
                                    z = game.location[1]
                                    yaw = game.location[2]
                                    #print(x)
                                    #print(z)
                                    a = -1
                                    #print(yaw)
                                    if z <= 6:
                                        if x < 12:
                                            #print("Segment1")
                                            if yaw == 270:
                                                a = 0
                                            if yaw == 180:
                                                a = 1
                                            if yaw == 90:
                                                a = 3
                                            if yaw == 0:
                                                a = 2
                                        elif x > 15:
                                            #print("Segment2")
                                            if yaw == 90:
                                                a = 0
                                            if yaw == 180:
                                                a = 2
                                            if yaw == 0:
                                                a = 1
                                            if yaw == 270:
                                                a = 3
                                        else:
                                            #print("Segment3")
                                            if yaw == 0:
                                                a = 0
                                            if yaw == 270:
                                                a = 1
                                            if yaw == 90:
                                                a = 2
                                            if yaw == 180:
                                                a = 3
                                    elif (x >= 7) and ((z == 7) or (z == 8) or
                                                       (z == 9) or (z == 10) or
                                                       (z == 11) or (z == 12)):
                                        #print("Segment4")
                                        if yaw == 90:
                                            a = 0
                                        if yaw == 180:
                                            a = 2
                                        if yaw == 0:
                                            a = 1
                                        if yaw == 270:
                                            a = 3
                                    elif ((x < 7) and (x > 3)) and (
                                        (z == 7) or (z == 8) or (z == 9) or
                                        (z == 10) or (z == 11) or (z == 12)):
                                        if yaw == 0:
                                            a = 0
                                        if yaw == 270:
                                            a = 1
                                        if yaw == 90:
                                            a = 2
                                        if yaw == 180:
                                            a = 3
                                    elif ((x < 3)) and ((z == 7) or (z == 8) or
                                                        (z == 9) or
                                                        (z == 10) or
                                                        (z == 11) or
                                                        (z == 12)):
                                        if yaw == 0:
                                            a = 2
                                        if yaw == 270:
                                            a = 0
                                        if yaw == 180:
                                            a = 1
                                        if yaw == 90:
                                            a = 3
                                    elif (z == 14) or (z == 15):
                                        if yaw == 0:
                                            a = 0
                                        if yaw == 270:
                                            a = 1
                                        if yaw == 90:
                                            a = 2
                                        if yaw == 180:
                                            a = 3
                                    elif (z == 17) or (z == 16):
                                        #print("Segment6")
                                        if yaw == 270:
                                            a = 0
                                        if yaw == 180:
                                            a = 1
                                        if yaw == 0:
                                            a = 2
                                        if yaw == 90:
                                            a = 3
                                    elif (z > 17):
                                        #print("Segment6")
                                        if yaw == 270:
                                            a = 2
                                        if yaw == 180:
                                            a = 0
                                        if yaw == 0:
                                            a = 3
                                        if yaw == 90:
                                            a = 1
                                    else:
                                        a = int(
                                            np.random.randint(game.nb_actions))

                                    if a == -1:
                                        a = int(
                                            np.random.randint(game.nb_actions))
                                    # #print(yaw)
                                    # if z < 6 and x < 13:
                                    # 	#print("Segment1")
                                    # 	if yaw == 270:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # elif z < 8 and x >= 13:
                                    # 	#print("Segment2")
                                    # 	if yaw == 0:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # elif z >= 8 and x == 13:
                                    # 	#print("Segment3")
                                    # 	if yaw == 90:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # elif z >= 8 and z<= 17 and x < 6:
                                    # 	#print("Segment4")
                                    # 	if yaw == 0:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # elif z > 18 and x < 18:
                                    # 	#print("Segment5")
                                    # 	if yaw == 270:
                                    # 		a = 0
                                    # 	else:
                                    # 		a = 1
                                    # else:
                                    # 	a = int(np.argmax(q[0]))

                                # Play the chosen action, then (if the game is not over) one extra best-move step (for NA friction)
                                game.play(a)
                                heatmap[game.location[0]][
                                    game.location[1]] = heatmap[
                                        game.location[0]][game.location[1]] + 1
                                #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                                #f2.flush()
                                r = game.get_score()
                                S_prime = self.get_game_data(game)
                                game_over = game.is_over()
                                transition = [S, a, r, S_prime, game_over]
                                self.memory.remember(*transition)
                                S = S_prime
                                if game_over == False:
                                    game.play(
                                        checkForBestMove(
                                            game.location[0], game.location[1],
                                            game.location[2]))
                                    heatmap[game.location[0]][game.location[
                                        1]] = heatmap[game.location[0]][
                                            game.location[1]] + 1
                                    #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                                    #f2.flush()
                                    r = game.get_score()
                                    S_prime = self.get_game_data(game)
                                    game_over = game.is_over()
                                    transition = [S, a, r, S_prime, game_over]
                                    self.memory.remember(*transition)
                                    S = S_prime
                                    # if game_over == False:
                                    # 	game.play(checkForBestMove(game.location[0],game.location[1],game.location[2]))
                                    # 	heatmap[game.location[0]][game.location[1]] = heatmap[game.location[0]][game.location[1]] + 1
                                    # 	#f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                                    # 	#f2.flush()
                                    # 	r = game.get_score()
                                    # 	S_prime = self.get_game_data(game)
                                    # 	game_over = game.is_over()
                                    # 	transition = [S, a, r, S_prime, game_over]
                                    # 	self.memory.remember(*transition)
                                    # 	S = S_prime
                    if game_over == False:
                        if advice_type != "NA":
                            game.play(a)
                            heatmap[game.location[0]][
                                game.location[1]] = heatmap[game.location[0]][
                                    game.location[1]] + 1
                            #f2.write('{},{},{},{}\n'.format(advice_type,game.location[0],game.location[1],1 ))
                            #f2.flush()
                            r = game.get_score()
                            S_prime = self.get_game_data(game)
                            game_over = game.is_over()
                            transition = [S, a, r, S_prime, game_over]
                            self.memory.remember(*transition)
                            S = S_prime
                    if epoch >= observe:
                        batch = self.memory.get_batch(model=model,
                                                      batch_size=batch_size,
                                                      gamma=gamma)
                        if batch:
                            inputs, targets = batch
                            mtob = model.train_on_batch(inputs, targets)
                            if mtob > m_loss:
                                m_loss = mtob
                            loss += float(mtob)
                            #print( "LOSS: {} CULM_LOSS: {}".format(mtob,loss))
                    if checkpoint and (savedModel == False) and (
                        (epoch + 1 - observe) % checkpoint == 0
                            or epoch + 1 == nb_epoch):
                        #model.save_weights('weights.dat')
                        print("Checkpoint... saving model..")
                        if advice_type == "OA":
                            model.save('oa_model.h5')
                        if advice_type == "NA":
                            model.save('na_model.h5')
                        if advice_type == "RL":
                            model.save('rl_model.h5')
                        # model_json = model.to_json()
                        # with open("model.json", "w") as json_file:
                        #    json_file.write(model_json)
                        # #serialize weights to HDF5
                        # model.save_weights("model.h5")
                        savedModel = True
                if game.is_won():
                    win_count += 1
                    rolling_win_window.insert(0, 1)
                else:
                    rolling_win_window.insert(0, 0)
                if epsilon > final_epsilon and epoch >= observe:
                    epsilon -= delta
                    percent_win = 0
                    cdt = datetime.datetime.now()
                    if sum(rolling_win_window) != 0:
                        percent_win = sum(rolling_win_window) / 4
                    total_frames = total_frames + game.stepsTaken
                    f.write(
                        '{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n'.format(
                            session_id, advice_type, meta_advice_type,
                            str(cdt), (epoch + 1), total_frames, game.score,
                            percent_win, epsilon, loss, game.stepsTaken,
                            adviceGiven, adviceAttempts, modelActions))
                    f.flush()
                    print(
                        "Session: {} | Time: {} | Epoch {:03d}/{:03d} | Steps {:.4f} | Epsilon {:.2f} | Score {} | Loss {}"
                        .format(session_id, str(cdt), epoch + 1, nb_epoch,
                                game.stepsTaken, epsilon, game.score, loss))
                    if len(rolling_win_window) > 4:
                        rolling_win_window.pop()
                    time.sleep(1.0)

            if advice_type == "OA":
                with open("{}OAheatxtues.csv".format(session_id), 'w+') as f2:
                    csvWriter = csv.writer(f2, delimiter=',')
                    csvWriter.writerows(heatmap)
                #heatmap = [ [0]*20 for i in range(20)]
            if advice_type == "RL":
                with open("{}RLheatxtues.csv".format(session_id), 'w+') as f2:
                    csvWriter = csv.writer(f2, delimiter=',')
                    csvWriter.writerows(heatmap)
                #heatmap = [ [0]*20 for i in range(20)]
            if advice_type == "NA":
                with open("{}NAheatxtues.csv".format(session_id), 'w+') as f2:
                    csvWriter = csv.writer(f2, delimiter=',')
                    csvWriter.writerows(heatmap)
                #heatmap = [ [0]*20 for i in range(20)]

    def play(self, game, nb_epoch=10, epsilon=0., visualize=False):
        self.check_game_compatibility(game)
        model = self.model
        win_count = 0
        frames = []
        for epoch in range(nb_epoch):
            print("Playing")
            game.reset()
            self.clear_frames()
            S = self.get_game_data(game)
            if visualize:
                frames.append(game.draw())
            game_over = False
            while not game_over:
                if np.random.rand() < epsilon:
                    print("random")
                    action = int(np.random.randint(0, game.nb_actions))
                else:
                    q = model.predict(S)[0]
                    possible_actions = game.get_possible_actions()
                    q = [q[i] for i in possible_actions]
                    action = possible_actions[np.argmax(q)]
                print(action)
                game.play(action)
                S = self.get_game_data(game)
                if visualize:
                    frames.append(game.draw())
                game_over = game.is_over()
            if game.is_won():
                win_count += 1
        print("Accuracy {} %".format(100. * win_count / nb_epoch))
        #Visualizing/printing images is currently super slow
        if visualize:
            if 'images' not in os.listdir('.'):
                os.mkdir('images')
            for i in range(len(frames)):
                plt.imshow(frames[i], interpolation='none')
                plt.savefig("images/" + game.name + str(i) + ".png")
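
# --- Usage sketch (not part of the original example) ---
# A minimal illustration of how this Agent is typically driven.  `build_model()`
# and the `game` object (exposing get_frame(), nb_actions, play(), is_over(),
# is_won(), stepsTaken, ...) are assumed to be defined elsewhere in the original
# script; the hyperparameter values below are placeholders.
def _run_training(game):
    model = build_model()                         # assumed model factory
    agent = Agent(model=model, memory_size=1000, nb_frames=4)
    agent.train(game, nb_epoch=1000, batch_size=50, gamma=0.9,
                epsilon=[1., .1], epsilon_rate=0.5, checkpoint=250, session_id=1)
    agent.play(game, nb_epoch=10, epsilon=0.05)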
Ejemplo n.º 20
0
''' Constants '''
nb_actions = 6
memory_size = 100
observe = 0
batch_size = 50

epsilon = (1.0, 0.1)
epsilon_rate = 0.5

delta = ((epsilon[0] - epsilon[1]) / (iterations * epsilon_rate))  # note: 'iterations' (the total number of training epochs) must be defined earlier in the script
final_epsilon = epsilon[1]
epsilon = epsilon[0]

win_count = 0
''' Memory and Model '''
memory = ExperienceReplay(memory_size)
model = build_model()
''' Agent Code '''
initial_state = [
    '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.33', '1.0', '1.0',
    '1.0', '1.0', '0.0', '0.0', '1.0', '1.0', '1.0', '0.0', '1.0', '1.0',
    '0.0', '0.0', '0.0', '1.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0',
    '1.0', '1.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0', '1.0',
    '1.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0', '1.0', '1.0',
    '0.0', '1.0', '1.0', '0.0', '1.0', '1.0', '0.0'
]

if sys.platform == "win32":
    loop = asyncio.ProactorEventLoop()  # for subprocess' pipes on Windows
    asyncio.set_event_loop(loop)
else: