Example #1
    def __init__(self, model, args):
        self.model = model
        self.args = args
        self.iteration = 0
        self.memory = Memory()

        if self.args.override or not os.path.isdir(
                self.args.output_dir) or self.args.output_dir == 'tmp':
            mkdir(self.args.output_dir, wipe=True)

        # initialize logging and model saving
        if self.args.output_dir is not None:
            self.logger = Logger(
                os.path.join(self.args.output_dir, 'train_log.json'))
        else:
            self.logger = Logger()
Example #2
 def __init__(
         self, n_states, n_actions, hidden_size=128, alr=2e-3, clr=2e-3,
         gamma=0.99, epochs=4, eps_clip=0.2, atype='cat'):
     self.gamma = gamma
     self.eps_clip = eps_clip
     self.epochs = epochs
     self.atype = atype
     self.memory = Memory()
     if atype == 'cat':
         self.actor = CatActor(n_states, n_actions, hidden_size)
     else:
         self.actor = NumActor(n_states, n_actions, hidden_size)
     self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=alr)
     self.critic = V_Critic(n_states, 1, hidden_size)
     self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=clr)
     self.MseLoss = torch.nn.MSELoss()
     self.step = 0
Example #3
 def __init__(self, path, date_format, start_date, positive_window_size, #manufacturer, \
         disk_model, columns, features, label, forget_type, bl_delay=False, \
         dropna=False, negative_window_size=6, validation_window=6, \
         bl_regression=False, label_days=None, bl_transfer=False, bl_ssd=False):
     super().__init__()
     self.memory = Memory(path, start_date, positive_window_size,  #manufacturer,\
         disk_model, columns, features, label, forget_type, dropna, bl_delay, \
         negative_window_size, bl_regression, label_days, bl_transfer, date_format, bl_ssd)
     if not bl_transfer:
         self.memory.buffering()
         self.data = self.memory.ret_df.drop(['model', 'date'], axis=1)
     else:
         self.data = self.memory.ret_df.drop(['model', 'date'], axis=1)
     self.data = self.data.reset_index(drop=True)
     self.class_name = label[0]
     self.num_classes = 2
     self.bl_delay = bl_delay
     self.validation_window = validation_window
Example #4
 def __init__(self, paramsManager):
     self.paramsManager = paramsManager
     self.memory = Memory(
         self.paramsManager.get_params()["agent"]["GOOD_MEMORIES_SIZE"],
         self.paramsManager.get_params()["agent"]["BAD_MEMORIES_SIZE"],
         self.paramsManager.get_params()["agent"]["MINI_BATCH_SIZE"],
         self.paramsManager.get_params()["environment"]
         ["FRAME_PROCESSED_WIDTH"],
         self.paramsManager.get_params()["environment"]
         ["FRAME_PROCESSED_HEIGHT"],
         self.paramsManager.get_params()["environment"]
         ["NUMBER_OF_FRAMES_TO_STACK_ON_STATE"])
     print("[i] Creating main convolutional neural network")
     self.main_cnn = CNN()
     print("[i] Creating target convolutional neural network")
     self.target_cnn = copy.deepcopy(self.main_cnn)
     print("[!] Creating the agent")
     self.main_cnn.cuda()
     self.target_cnn.cuda()
     self.agent = Agent(
         self.main_cnn, self.target_cnn,
         self.paramsManager.get_params()["agent"]["EPSILON_MAX"],
         self.paramsManager.get_params()["agent"]
         ["NUMBER_OF_FRAMES_WITH_CONSTANT_EPSILON"],
         self.paramsManager.get_params()["agent"]["FIRST_EPSILON_DECAY"],
         self.paramsManager.get_params()["agent"]
         ["FRAMES_TO_FIRST_EPSILON_DECAY"],
         self.paramsManager.get_params()["agent"]["FINAL_EPSILON_VALUE"],
         self.paramsManager.get_params()["agent"]
         ["FRAMES_TO_FINAL_EPSILON"],
         self.paramsManager.get_params()["agent"]
         ["EXPLORATION_PROBABILITY_DURING_EVALUATION"],
         self.paramsManager.get_params()["agent"]["LEARNING_RATE"])
     self.breakout_wrapper = BreakoutWrapper(
         self.paramsManager.get_params()["environment"]["NAME"],
         self.paramsManager.get_params()["agent"]["NO_OP_STEPS"],
         self.paramsManager.get_params()["environment"]
         ["NUMBER_OF_FRAMES_TO_STACK_ON_STATE"],
         self.paramsManager.get_params()["environment"]
         ["FRAME_PROCESSED_WIDTH"],
         self.paramsManager.get_params()["environment"]
         ["FRAME_PROCESSED_HEIGHT"],
         self.paramsManager.get_params()["environment"]["RENDER"])
Example #5
    def sample_worker(self, pid, queue, min_batch_size):
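        # draw pid random samples so each worker advances the global RNG state differently (cheap per-worker de-correlation)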
        torch.randn(pid)
        if hasattr(self.env, 'np_random'):
            self.env.np_random.rand(pid)
        memory = Memory()
        logger = LoggerRL()

        while logger.num_steps < min_batch_size:
            state = self.env.reset()
            if self.running_state is not None:
                state = self.running_state(state)
            logger.start_episode(self.env)
            self.pre_episode()

            for t in range(10000):
                state_var = tensor(state).unsqueeze(0)
                vs_out = self.trans_policy(state_var)
                mean_action = self.mean_action or np.random.binomial(
                    1, 1 - self.noise_rate)
                action = self.policy_net.select_action(vs_out,
                                                       mean_action)[0].numpy()
                action = int(
                    action
                ) if self.policy_net.type == 'discrete' else action.astype(
                    np.float64)
                next_state, env_reward, done, info = self.env.step(action)
                if self.running_state is not None:
                    next_state = self.running_state(next_state)
                if self.custom_reward is not None:
                    c_reward, c_info = self.custom_reward(
                        self.env, state, action, info)
                    reward = c_reward
                else:
                    c_reward, c_info = 0.0, np.array([0.0])
                    reward = env_reward
                logger.step(self.env, env_reward, c_reward, c_info)

                mask = 0 if done else 1
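                # exp flags exploratory transitions: 1 when the action was sampled, 0 when the deterministic mean action was used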
                exp = 1 - mean_action
                self.push_memory(memory, state, action, mask, next_state,
                                 reward, exp)

                if pid == 0 and self.render:
                    self.env.render()
                if done:
                    break
                state = next_state

            logger.end_episode(self.env)
        logger.end_sampling()

        if queue is not None:
            queue.put([pid, memory, logger])
        else:
            return memory, logger
Example #6
class Simulate(AbstractPredict):
    def __init__(self, path, date_format, start_date, positive_window_size, #manufacturer, \
            disk_model, columns, features, label, forget_type, bl_delay=False, \
            dropna=False, negative_window_size=6, validation_window=6, \
            bl_regression=False, label_days=None, bl_transfer=False, bl_ssd=False):
        super().__init__()
        self.memory = Memory(path, start_date, positive_window_size,  #manufacturer,\
            disk_model, columns, features, label, forget_type, dropna, bl_delay, \
            negative_window_size, bl_regression, label_days, bl_transfer, date_format, bl_ssd)
        if not bl_transfer:
            self.memory.buffering()
            self.data = self.memory.ret_df.drop(['model', 'date'], axis=1)
        else:
            self.data = self.memory.ret_df.drop(['model', 'date'], axis=1)
        self.data = self.data.reset_index(drop=True)
        self.class_name = label[0]
        self.num_classes = 2
        self.bl_delay = bl_delay
        self.validation_window = validation_window

    def load(self):
        # Load Data from Memory class and backtracking delayed instances
        self.memory.data_management(self.keep_delay, self.bl_delay)

        self.data = self.memory.ret_df.drop(['model', 'date'], axis=1)
        self.data = self.data.reset_index(drop=True)

    def delay_evaluate(self):
        pop_sn = []
        i = 0
        for sn, instances in self.keep_delay.items():
            instances.dequeue()
            if len(instances.queue) == 0:
                pop_sn.append(sn)
            i += 1
        for sn in pop_sn:
            self.keep_delay.pop(sn)

    def run(self):
        self.inspect(self.data, self.class_name, self.num_classes,
                     self.memory.new_inst_start_index, self.validation_window)
Example #7
    def perform_rollout(self, theta, inner=False):
        memory = Memory(self.hp)
        (s1, s2), _ = self.env.reset()
        for t in range(self.hp.len_rollout):
            a1, lp1 = self.act(s1, self.theta)
            a2, lp2 = self.act_opp(s2, theta)
            if self.id > 0:
                (s2, s1), (r2, r1), _, _ = self.env.step((a2, a1))
            else:
                (s1, s2), (r1, r2), _, _ = self.env.step((a1, a2))

            r1 = torch.Tensor(r1)
            r2 = torch.Tensor(r2)

            if inner:
                memory.add(lp2, lp1, r2)
            else:
                memory.add(lp1, lp2, r1)

        return memory
Example #8
                                                            axis=0), mask)

        if done:
            print(step_no, 'episode reward- ' + str(eps_reward))
            eps_reward = 0
            state = env.reset()
            state_list = []
            for i in range(args.past_frames - 1):
                state_list.append(initial[0])
            state_list.append(state[0])

        if len(memory) < args.batch_size:
            batch = memory.sample()
        else:
            batch = memory.sample(size=args.batch_size)

        update_dqn(model, batch, args, criterion, optimizer, device)
        step_no += 1


inp_channels = 4
env = gym.make(args.env_name)
env = AtariRescale105x80(env)
num_actions = env.action_space.n
dqn = DQN(inp_channels, num_actions)
memory = Memory(limit=args.replay_size)
device = torch.device('cuda')
dqn.to(device)

train_model(dqn, env, memory, args, device)
Example #9
 def setup_memory(self) -> None:
     columns = [
         "states", "next_states", "actions", "log_probs", "rewards", "done"
     ]
     self.episode_memory = Memory(columns)
     self.epoch_memory = Memory(columns)
Example #10
class PolicyGradient(Agent):
    def __init__(self,
                 env: Env,
                 lr: float,
                 gamma: float = 0.99,
                 layers=(128, 128),
                 verbose=False,
                 model_path=None,
                 save=False):
        super().__init__(env, verbose, save)
        self.gamma = gamma
        self.model_path = model_path
        if self.action_space.discrete:
            head = nn.Softmax(dim=-1)
        else:
            head = nn.Tanh()

        self.model = MLP(self.state_space.shape[0], self.action_space.shape[0],
                         layers, head)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.reset()

    def setup_memory(self) -> None:
        columns = ["states", "next_states", "actions", "log_probs", "rewards"]
        self.episode_memory = Memory(columns)
        self.epoch_memory = Memory(columns)

    def act(self, state: List, train: bool = True) -> Tuple:
        state = torch.from_numpy(state).type(torch.FloatTensor)
        action_probs = self.model(state)

        distribution = self.action_space.distribution(action_probs)
        action = distribution.sample()
        if train:
            return action.data.numpy(), distribution.log_prob(action)
        else:
            return torch.argmax(action_probs).data.numpy(),

    def update(self) -> None:
        self.optimizer.zero_grad()
        loss, = self.epoch_memory.get_columns(["loss"])
        loss = torch.mean(torch.stack(loss))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()
        print(f"Value Loss: {loss.item()}")
        self.reset()

    def save_model(self) -> None:
        torch.save(self.model.state_dict(), self.model_path)

    def load_model(self, model_path: str) -> None:
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()

    def setup_schedulers(self, n_epochs: int) -> None:
        scheduler = CosineAnnealingLR(self.optimizer, n_epochs)
        self.schedulers.append(scheduler)

    def cumulate_rewards(self) -> None:
        cumulated_reward = 0
        cumulated_rewards = []
        rewards, log_probs = self.episode_memory.get_columns(
            ["rewards", "log_probs"])
        for i in range(len(rewards) - 1, -1, -1):
            cumulated_reward = self.gamma * cumulated_reward + rewards[i]
            cumulated_rewards.append(cumulated_reward)

        cumulated_rewards = cumulated_rewards[::-1]
        loss = -torch.sum(
            torch.mul(torch.stack(log_probs), torch.Tensor(cumulated_rewards)))
        self.episode_memory.append_column("loss", loss)
        self.episode_memory.extend_column("cumulated_rewards",
                                          cumulated_rewards)
Example #11
 def __init__(self):
     super().__init__()
     self.memory = Memory(1000)
Example #12
class PPO:
    def __init__(
            self, n_states, n_actions, hidden_size=128, alr=2e-3, clr=2e-3,
            gamma=0.99, epochs=4, eps_clip=0.2, atype='cat'):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.epochs = epochs
        self.atype = atype
        self.memory = Memory()
        if atype == 'cat':
            self.actor = CatActor(n_states, n_actions, hidden_size)
        else:
            self.actor = NumActor(n_states, n_actions, hidden_size)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=alr)
        self.critic = V_Critic(n_states, 1, hidden_size)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=clr)
        self.MseLoss = torch.nn.MSELoss()
        self.step = 0

    def store_transition(self, s, a, r, s_, done, p):
        self.memory.states.append(s)
        self.memory.rewards.append(r)
        self.memory.actions.append(a)
        self.memory.probs.append(p)
        self.memory.next_states.append(s_)
        self.memory.is_terminals.append(done)

    def update(self, batch_size=None):
        if len(self.memory.actions) < batch_size: return
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(
                self.memory.rewards[::-1], self.memory.is_terminals[::-1]):
            if is_terminal: discounted_reward = 0
            discounted_reward = reward + self.gamma * discounted_reward
            rewards.insert(0, discounted_reward)
        rewards = torch.tensor(rewards).float()

        old_states = torch.FloatTensor(self.memory.states).detach()
        old_actions = torch.stack(self.memory.actions).detach()
        old_probs = torch.stack(self.memory.probs).detach()
        rewards = (rewards - rewards.mean()) / (rewards.std()+1e-7)

        split_res = self.memory.split(batch_size)
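        # split_res is assumed to hold (start, end) index pairs that cover the buffer in mini-batches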

        for _ in range(self.epochs):
            for idxs in split_res:
                split_old_states = old_states[idxs[0]:idxs[1]]
                split_old_actions = old_actions[idxs[0]:idxs[1]]
                split_old_probs = old_probs[idxs[0]:idxs[1]]
                split_rewards = rewards[idxs[0]:idxs[1]]

                dist = self.actor.choose_action(split_old_states, True)
                log_probs = dist.log_prob(split_old_actions.squeeze())
                # diff = log_probs.squeeze() - split_old_probs.squeeze()
                # ratios = torch.exp(log_probs.squeeze()) / torch.exp(split_old_probs.squeeze())
                ratios = torch.exp(log_probs.squeeze() - split_old_probs.squeeze())

                state_values = self.critic(split_old_states).squeeze()
                advantages = split_rewards - state_values.detach()
                surr1 = ratios * advantages
                surr2 = ratios.clamp(1-self.eps_clip, 1+self.eps_clip) * advantages
                aloss = -torch.min(surr1, surr2) - 0.01 * dist.entropy()
                
                self.actor_opt.zero_grad()
                aloss = aloss.mean()
                aloss.backward()
                clip_gradient(self.actor_opt, 0.1)
                writer.add_scalar('actor_loss', aloss.item(), self.step)
                self.actor_opt.step()

                closs = 0.5*self.MseLoss(split_rewards, state_values).mean()
                self.critic_opt.zero_grad()
                closs.backward()
                clip_gradient(self.critic_opt, 0.1)
                writer.add_scalar('critic_loss', closs.item(), self.step)
                self.critic_opt.step()
                self.step += 1
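
For reference, the discounted-return loop at the top of update() above can be read in isolation. Below is a minimal standalone sketch of the same reward-to-go computation; the function name and the example call are illustrative and not part of the original example.

import torch

def discounted_returns(rewards, is_terminals, gamma=0.99):
    # Walk the trajectory backwards, resetting the running return at episode
    # boundaries, mirroring the loop in PPO.update above.
    returns = []
    discounted = 0.0
    for reward, done in zip(reversed(rewards), reversed(is_terminals)):
        if done:
            discounted = 0.0
        discounted = reward + gamma * discounted
        returns.insert(0, discounted)
    return torch.tensor(returns, dtype=torch.float32)

# Three steps of reward 1.0 in a single episode -> tensor([2.9701, 1.9900, 1.0000])
print(discounted_returns([1.0, 1.0, 1.0], [False, False, True]))
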
Example #13
    args = parser.parse_args()
    args.output = get_output_folder(args.output, args.env)
    args.use_cuda = USE_CUDA
    with open(args.output + "/parameters.txt", 'w') as file:
        for key, value in vars(args).items():
            file.write("{} = {}\n".format(key, value))

    # Environment
    env = gym.make(args.env)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = int(env.action_space.high[0])

    # Memory
    memory = Memory(args.mem_size, state_dim, action_dim, args)

    # Algorithm
    drla = NASTD3(state_dim, action_dim, max_action, args)

    # Action noise
    a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma)

    # Logger
    fields = ["eval_score", "critic_loss", "actor_loss", "total_steps"]
    logger = Logger(args.output, fields)

    # Train
    ite = 0
    K = 5000
    total_steps = 0
Example #14
def main(cfg):
    random.seed(cfg.exp.seed)
    np.random.seed(cfg.exp.seed)
    torch.manual_seed(cfg.exp.seed)
    torch.backends.cudnn.deterministic = cfg.exp.torch_deterministic

    # so that the environment automatically resets
    env = SyncVectorEnv([
        lambda: RecordEpisodeStatistics(gym.make('CartPole-v1'))
    ])

    actor, critic = Actor(), Critic()
    actor_optim = Adam(actor.parameters(), eps=1e-5, lr=cfg.params.actor_lr)
    critic_optim = Adam(critic.parameters(), eps=1e-5, lr=cfg.params.critic_lr)
    memory = Memory(mini_batch_size=cfg.params.mini_batch_size, batch_size=cfg.params.batch_size)
    obs = env.reset()
    global_rewards = []

    NUM_UPDATES = (cfg.params.total_timesteps // cfg.params.batch_size) * cfg.params.epochs
    cur_timestep = 0

    def calc_factor(cur_timestep: int) -> float:
        """Calculates the factor to be multiplied with the learning rate to update it."""
        update_number = cur_timestep // cfg.params.batch_size
        total_updates = cfg.params.total_timesteps // cfg.params.batch_size
        fraction = 1.0 - update_number / total_updates
        return fraction

    actor_scheduler = LambdaLR(actor_optim, lr_lambda=calc_factor, verbose=True)
    critic_scheduler = LambdaLR(critic_optim, lr_lambda=calc_factor, verbose=True)

    while cur_timestep < cfg.params.total_timesteps:
        # keep playing the game
        obs = torch.as_tensor(obs, dtype=torch.float32)
        with torch.no_grad():
            dist = actor(obs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            value = critic(obs)
        action = action.cpu().numpy()
        value = value.cpu().numpy()
        log_prob = log_prob.cpu().numpy()
        obs_, reward, done, info = env.step(action)
        
        if done[0]:
            tqdm.write(f'Reward: {info[0]["episode"]["r"]}, Avg Reward: {np.mean(global_rewards[-10:]):.3f}')
            global_rewards.append(info[0]['episode']['r'])
            wandb.log({'Avg_Reward': np.mean(global_rewards[-10:]), 'Reward': info[0]['episode']['r']})

        memory.remember(obs.squeeze(0).cpu().numpy(), action.item(), log_prob.item(), reward.item(), done.item(), value.item())
        obs = obs_
        cur_timestep += 1

        # if the current timestep is a multiple of the batch size, then we need to update the model
        if cur_timestep % cfg.params.batch_size == 0:
            for epoch in tqdm(range(cfg.params.epochs), desc=f'Num updates: {cfg.params.epochs * (cur_timestep // cfg.params.batch_size)} / {NUM_UPDATES}'):
                # sample a batch from memory of experiences
                old_states, old_actions, old_log_probs, old_rewards, old_dones, old_values, batch_indices = memory.sample()
                old_log_probs = torch.tensor(old_log_probs, dtype=torch.float32)
                old_actions = torch.tensor(old_actions, dtype=torch.float32)
                advantage = calculate_advantage(old_rewards, old_values, old_dones, gae_gamma=cfg.params.gae_gamma, gae_lambda=cfg.params.gae_lambda)
                
                advantage = torch.tensor(advantage, dtype=torch.float32)
                old_rewards = torch.tensor(old_rewards, dtype=torch.float32)
                old_values = torch.tensor(old_values, dtype=torch.float32)

                # for each mini batch from batch, calculate advantage using GAE
                for mini_batch_index in batch_indices:
                    # remember: Normalization of advantage is done on mini batch, not the entire batch
                    advantage[mini_batch_index] = (advantage[mini_batch_index] - advantage[mini_batch_index].mean()) / (advantage[mini_batch_index].std() + 1e-8)

                    dist = actor(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0))
                    # actions = dist.sample()
                    log_probs = dist.log_prob(old_actions[mini_batch_index]).squeeze(0)
                    entropy = dist.entropy().squeeze(0)

                    log_ratio = log_probs - old_log_probs[mini_batch_index]
                    ratio = torch.exp(log_ratio)

                    with torch.no_grad():
                        # approx_kl = ((ratio-1)-log_ratio).mean()
                        approx_kl = ((old_log_probs[mini_batch_index] - log_probs)**2).mean()
                        wandb.log({'Approx_KL': approx_kl})

                    actor_loss = -torch.min(
                        ratio * advantage[mini_batch_index],
                        torch.clamp(ratio, 1 - cfg.params.actor_loss_clip, 1 + cfg.params.actor_loss_clip) * advantage[mini_batch_index]
                    ).mean()

                    values = critic(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0)).squeeze(-1)
                    returns = old_values[mini_batch_index] + advantage[mini_batch_index]

                    critic_loss = torch.max(
                        (values - returns)**2,
                        (old_values[mini_batch_index] + torch.clamp(
                            values - old_values[mini_batch_index], -cfg.params.critic_loss_clip, cfg.params.critic_loss_clip
                            ) - returns
                        )**2
                    ).mean()
                    # critic_loss = F.mse_loss(values, returns)

                    wandb.log({'Actor_Loss': actor_loss.item(), 'Critic_Loss': critic_loss.item(), 'Entropy': entropy.mean().item()})
                    loss = actor_loss + 0.25 * critic_loss - 0.01 * entropy.mean()
                    actor_optim.zero_grad()
                    critic_optim.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(actor.parameters(), cfg.params.max_grad_norm)
                    nn.utils.clip_grad_norm_(critic.parameters(), cfg.params.max_grad_norm)

                    actor_optim.step()
                    critic_optim.step()

            memory.reset()
            actor_scheduler.step(cur_timestep)
            critic_scheduler.step(cur_timestep)

            y_pred, y_true = old_values.cpu().numpy(), (old_values + advantage).cpu().numpy()
            var_y = np.var(y_true)
            explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
            wandb.log({'Explained_Var': explained_var})

    if cfg.exp.save_weights:
        torch.save(actor.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/actor.pth'))
        torch.save(critic.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/critic.pth'))
Example #15
def train():
    numAgent = 10  # multiple agents are running synchronously.
    # each agent has a different type with different properties.
    # Only one network is created; each agent gets its
    # own behavior according to the embedding input.
    numGame = 20  # multiple games running simultaneously.
    print('agent count:', numAgent)
    print('Env num:', numGame)

    env = {}
    for game in range(numGame):
        env[game] = miniDotaEnv(args, numAgent)

    # initialize the neural networks.
    # use a single network to share the knowledge.
    net = ac(args)
    if not args.cpuSimulation:
        net = net.to(device)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model',
                                       str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        net.load_state_dict(ckpt['net'])

    observations, lastDone = {}, {}
    for game in range(numGame):
        observations[game] = env[game].reset(0)[
            'observations']  # get initial state.
        lastDone[game] = [
            False
        ] * 10  # to record whether game is done at the previous step.

    optimizer = optim.Adam(net.parameters(), lr=args.lr)

    for iteration in range(args.max_iter):  # playing-training iteration.
        start = time.time()
        print()
        print('Start iteration %d ..' % iteration)
        if args.cpuSimulation:
            net = net.cpu()
        net.eval()  # switch to evaluation mode.
        memory = []
        for i in range(numGame):
            memory.append([Memory() for j in range(numAgent)])
            # memory is cleared at every iter so only the current iteration's samples are used in training.
            # the separation of memory according to game is necessary as they
            # need to be processed separate for each game.

        steps = 0
        teamscore = 0  # only for game 0.
        record = []  # record the states for visualization.
        gameEnd = np.zeros(numGame).astype(bool)

        while steps <= args.time_horizon:  # loop for one game.
            if np.all(gameEnd):
                break
            steps += 1
            stateList = []
            for game in range(numGame):
                for agent in range(numAgent):
                    stateList.append(
                        np.expand_dims(observations[game][agent], axis=0))
            stateCombined = np.concatenate(stateList, axis=0)
            # concatenate the states of all games and process them by the network together.
            with torch.no_grad():
                actionDistr = net(to_tensor(stateCombined, args.cpuSimulation))
            actions = get_action(actionDistr)

            for game in range(numGame):
                if not gameEnd[game]:
                    # the following random-action branch does not work, because random actions have too small a probability density value,
                    # leading to strange bugs.
                    #                    sample = random.random()
                    #                    if sample > args.randomActionRatio * (1 - min(1, iteration/1000) ):
                    #                        thisGameAction = actions[10*game:10*(game+1), :] # contain actions from all agents.
                    #                        check(thisGameAction)
                    #                    else:
                    #                        actionmove = np.random.randint(0, 3, size=(10,3))
                    #                        target = np.random.randint(0, 12, size=(10,1))
                    #                        thisGameAction = np.concatenate([actionmove, target], axis=1)
                    thisGameAction = actions[10 * game:10 * (
                        game + 1
                    ), :]  # select the actions from all agents of this env.
                    envInfo = env[game].step(
                        thisGameAction
                    )  # environment runs one step given the action.
                    nextObs = envInfo['observations']  # get the next state.
                    if game == 0:
                        record.append(
                            np.concatenate([
                                env[game].getState(),
                                actions[0:10, :].reshape(-1)
                            ]))
                    rewards = envInfo['rewards']
                    dones = envInfo['local_done']
                    #                    masks = list(~dones) # cut the return calculation at the done point.
                    masks = [
                        True
                    ] * numAgent  # no need to mask out the last state-action pair,
                    # because the last reward is useful to us.

                    for i in range(numAgent):
                        if not lastDone[game][i]:
                            memory[game][i].push(observations[game][i],
                                                 thisGameAction[i], rewards[i],
                                                 masks[i])
                    lastDone[game] = dones
                    if game == 0:
                        teamscore += sum(
                            [rewards[x] for x in env[game].getTeam0()])
                    observations[game] = nextObs

                    gameEnd[game] = np.all(dones)
                    if gameEnd[game]:
                        if game == 0:
                            print('Game 0 score: %f' % teamscore)


#                            recordMat = np.stack(record)# stack will expand the dimension before concatenate.
#                            draw(recordMat, iteration, env[game].getUnitRange(), 10)
                        observations[game] = env[game].reset(iteration +
                                                             1)['observations']
                        lastDone[game] = [False] * 10

        simEnd = time.time()
        print('Simulation time: %.f' % (simEnd - start))

        net.train()  # switch to training mode.
        net = net.cuda()

        sts, ats, returns, advants, old_policy, old_value = [], [], [], [], [], []

        for game in range(numGame):
            for i in range(numAgent):
                batch = memory[game][i].sample()
                st, at, rt, adv, old_p, old_v = process_memory(
                    net, batch, args)
                sts.append(st)
                ats.append(at)
                returns.append(rt)
                advants.append(adv)
                old_policy.append(old_p)
                old_value.append(old_v)

        sts = torch.cat(sts)
        ats = torch.cat(ats)
        returns = torch.cat(returns)
        advants = torch.cat(advants)
        old_policy = torch.cat(old_policy)
        old_value = torch.cat(old_value)

        train_model(net, optimizer, sts, ats, returns, advants, old_policy,
                    old_value, args)
        # training is based on the state-action pairs from all games of the current iteration.

        trainEnd = time.time()
        print('Training time: %.f' % (trainEnd - simEnd))

        if iteration % 10 == 0:
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path,
                                     'ckpt_%.3f.pth.tar' % teamscore)

            save_checkpoint(
                {
                    'net': net.state_dict(),
                    'args': args,
                    'score': teamscore
                },
                filename=ckpt_path)
Example #16
class BreakOutPlayer:
    def __init__(self, paramsManager):
        self.paramsManager = paramsManager
        self.memory = Memory(
            self.paramsManager.get_params()["agent"]["GOOD_MEMORIES_SIZE"],
            self.paramsManager.get_params()["agent"]["BAD_MEMORIES_SIZE"],
            self.paramsManager.get_params()["agent"]["MINI_BATCH_SIZE"],
            self.paramsManager.get_params()["environment"]
            ["FRAME_PROCESSED_WIDTH"],
            self.paramsManager.get_params()["environment"]
            ["FRAME_PROCESSED_HEIGHT"],
            self.paramsManager.get_params()["environment"]
            ["NUMBER_OF_FRAMES_TO_STACK_ON_STATE"])
        print("[i] Creating main convolutional neural network")
        self.main_cnn = CNN()
        print("[i] Creating target convolutional neural network")
        self.target_cnn = copy.deepcopy(self.main_cnn)
        print("[!] Creating the agent")
        self.main_cnn.cuda()
        self.target_cnn.cuda()
        self.agent = Agent(
            self.main_cnn, self.target_cnn,
            self.paramsManager.get_params()["agent"]["EPSILON_MAX"],
            self.paramsManager.get_params()["agent"]
            ["NUMBER_OF_FRAMES_WITH_CONSTANT_EPSILON"],
            self.paramsManager.get_params()["agent"]["FIRST_EPSILON_DECAY"],
            self.paramsManager.get_params()["agent"]
            ["FRAMES_TO_FIRST_EPSILON_DECAY"],
            self.paramsManager.get_params()["agent"]["FINAL_EPSILON_VALUE"],
            self.paramsManager.get_params()["agent"]
            ["FRAMES_TO_FINAL_EPSILON"],
            self.paramsManager.get_params()["agent"]
            ["EXPLORATION_PROBABILITY_DURING_EVALUATION"],
            self.paramsManager.get_params()["agent"]["LEARNING_RATE"])
        self.breakout_wrapper = BreakoutWrapper(
            self.paramsManager.get_params()["environment"]["NAME"],
            self.paramsManager.get_params()["agent"]["NO_OP_STEPS"],
            self.paramsManager.get_params()["environment"]
            ["NUMBER_OF_FRAMES_TO_STACK_ON_STATE"],
            self.paramsManager.get_params()["environment"]
            ["FRAME_PROCESSED_WIDTH"],
            self.paramsManager.get_params()["environment"]
            ["FRAME_PROCESSED_HEIGHT"],
            self.paramsManager.get_params()["environment"]["RENDER"])

    def train(self):
        frame_number = 0
        rewards = []
        # Stores the mean rewards of each epoch
        epochs_means = []
        # While we are training
        while frame_number < self.paramsManager.get_params(
        )["agent"]["MAX_FRAMES"]:
            #########################
            ####### TRAINING ########
            #########################
            # Epoch counter
            epoch_counter = 0
            # Stores the epoch rewards
            epoch_rewards = []
            # While we aren't in evaluation
            while epoch_counter < self.paramsManager.get_params(
            )["agent"]["EVAL_FREQUENCY"]:
                # Resetting the env
                done_life_lost = self.breakout_wrapper.reset(evaluation=False)
                # Other params
                total_episode_reward = 0
                current_ale_lives = 5
                perform_fire = True
                for i in range(self.paramsManager.get_params()["agent"]
                               ["MAX_EPISODE_LENGTH"]):
                    # Print the separator defined in the JSON config
                    print(self.paramsManager.get_params()["environment"]
                          ["SEPARATOR"])
                    # If it's necessary to FIRE
                    if perform_fire:
                        chosen_action = 1
                    else:
                        chosen_action = self.agent.get_action(
                            frame_number,
                            self.breakout_wrapper.actual_state,
                            evaluation=False)
                    # We take the step. A dying penalty is added by the breakout_wrapper
                    processed_new_frame, reward, done, done_life_lost, _, info = self.breakout_wrapper.step(
                        chosen_action,
                        self.paramsManager.get_params()["agent"]
                        ["DYING_REWARD"], current_ale_lives)
                    print("[i] Action performed: ", chosen_action,
                          ". Reward: ", reward, ".Frame number: ",
                          frame_number)
                    # If we already have rewards:
                    if len(rewards) != 0:
                        print("[i] Mean Training Reward: %.3f" %
                              (sum(rewards) / len(rewards)))
                    if len(epoch_rewards) != 0:
                        print("[i] Mean Epoch Reward: %.3f" %
                              (sum(epoch_rewards) / len(epoch_rewards)))
                    frame_number += 1
                    epoch_counter += 1
                    total_episode_reward += reward
                    if self.paramsManager.get_params()["agent"]["CLIP_REWARD"]:
                        self.memory.store(processed_new_frame, chosen_action,
                                          self.clip_reward(reward),
                                          done_life_lost)
                    else:
                        self.memory.store(processed_new_frame, chosen_action,
                                          reward, done_life_lost)
                    # If it's time to learn
                    if frame_number % self.paramsManager.get_params()["agent"][
                            "UPDATE_FREQUENCY"] == 0 and frame_number > self.paramsManager.get_params(
                            )["agent"]["REPLAY_MEMORY_START_FRAME"]:
                        losses = self.agent.learn(
                            self.memory,
                            self.paramsManager.get_params()["agent"]["GAMMA"],
                            self.paramsManager.get_params()["agent"]
                            ["MINI_BATCH_SIZE"])
                    if frame_number % self.paramsManager.get_params()["agent"][
                            "NETWORK_UPDATE_FREQ"] == 0 and frame_number > self.paramsManager.get_params(
                            )["agent"]["REPLAY_MEMORY_START_FRAME"]:
                        self.agent.updateNetworks()
                    if info["ale.lives"] < current_ale_lives:
                        perform_fire = True
                        current_ale_lives = info["ale.lives"]
                    elif info["ale.lives"] == current_ale_lives:
                        perform_fire = False
                    if done:
                        done = False
                        perform_fire = True
                        break
                rewards.append(total_episode_reward)
                epoch_rewards.append(total_episode_reward)
            #########################
            ####### SAVE INFO #######
            #########################
            epochs_means.append(sum(epoch_rewards) / len(epoch_rewards))
            file = open("results.txt", "w")
            print("============ EPOCH %d FINISHED ============" %
                  len(epochs_means))
            for idx, mean in enumerate(epochs_means):
                print("Epoch number: %d. Mean reward: %.3f" % (idx, mean))
                file.write("Epoch number: %d. Mean reward: %.3f\n" %
                           (idx, mean))
            file.close()
            time.sleep(10)

    def clip_reward(self, r):
        if r > 0:
            return 1
        elif r == 0:
            return 0
        else:
            return -1
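
A side note: clip_reward above is equivalent to the sign function for scalar rewards. A quick illustrative check (not part of the original example):

import numpy as np

# r > 0 -> 1, r == 0 -> 0, r < 0 -> -1, i.e. np.sign for real-valued rewards
for r in (3.5, 0.0, -2.0):
    assert int(np.sign(r)) == (1 if r > 0 else 0 if r == 0 else -1)
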
Example #17
                           critic_layer_sizes,
                           grayscale=grayscale).to(device)

# Create AE Hashing model and optimizers
ae_hash = AEHash(len_AE_hashcode,
                 4 if stacked else 1,
                 noise_scale,
                 saturating_weight,
                 device=device).to(device)
ae_hash_optim = optim.Adam(ae_hash.parameters())

# Create SimHash
sim_hash = SimHash(len_AE_hashcode, len_SimHash_hashcode)

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA,
                'cpu')  # Put memory on cpu to save space

# Set up pixel observation preprocessing
transform = Compose([
    ToPILImage(),
    Grayscale(num_output_channels=1),  # Turn frame into grayscale
    Resize((52, 52)),
    ToTensor()
])

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
Example #18
            }
            for i in range(n_actor):
                agent.store_policy('Pendulum-v0', score=fs[i], index=i)
            n += 1

        # printing iteration resume
        if debug:
            prPurple('Iteration#{}: Total steps:{} \n'.format(n, total_steps))


if __name__ == "__main__":

    # The environment
    env = gym.make("Pendulum-v0")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = int(env.action_space.high[0])

    # replay buffer
    memory = Memory(100000, state_dim, action_dim)

    # agent
    agent = DTD3(state_dim, action_dim, max_action, memory, n_actor=5)
    print("starting")
    train(agent,
          n_episodes=1000,
          max_steps=1000000,
          debug=True,
          n_eval=100,
          n_actor=5)
Example #19
class ExperienceReplayDQNAgent(DQNAgent):
    def __init__(self):
        super().__init__()
        self.memory = Memory(1000)

    def remember(self, state, action, reward, next_state, done):
        self.memory.store((state, action, reward, next_state, done))

    def replay_new(self, memory):
        idx, minibatch, ISWeights = memory.sample(1000)
        for sample in minibatch:
            state = sample[0][0]
            action = sample[0][1]
            reward = sample[0][2]
            next_state = sample[0][3]
            done = sample[0][4]
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(
                    self.model.predict(np.array([next_state]))[0])
            target_f = self.model.predict(np.array([state]))
            memory.batch_update(idx, np.abs(target_f[0] - target))
            target_f[0][np.argmax(action)] = target
            self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)

    def train_short_memory(self, state, action, reward, next_state, done):
        target = reward
        if not done:
            target = reward + self.gamma * np.amax(
                self.model.predict(next_state.reshape((1, 11)))[0])
        target_f = self.model.predict(state.reshape((1, 11)))
        target_f[0][np.argmax(action)] = target
        self.model.fit(state.reshape((1, 11)), target_f, epochs=1, verbose=0)

    def run(self, mode_file):
        pygame.init()
        counter_games = 0
        score_plot = []
        counter_plot = []
        record = 0
        while counter_games < 200:
            # Initialize classes
            game = Game(440, 440, mode_file)
            player1 = game.player
            food1 = game.food

            # Perform first move
            initialize_game(player1, game, food1, self)
            if game_settings['display_option']:
                display(player1, food1, game, record)

            while not game.crash:
                # agent.epsilon is set to give randomness to actions
                self.epsilon = 80 - counter_games

                # get old state
                state_old = self.get_state(game, player1, food1)

                # perform random actions based on agent.epsilon, or choose the action
                if randint(0, 200) < self.epsilon:
                    final_move = to_categorical(randint(0, 2), num_classes=3)
                else:
                    # predict action based on the old state
                    prediction = self.model.predict(state_old.reshape((1, 11)))
                    final_move = to_categorical(np.argmax(prediction[0]),
                                                num_classes=3)

                # perform new move and get new state
                player1.do_move(final_move, player1.x, player1.y, game, food1,
                                self)
                state_new = self.get_state(game, player1, food1)

                # set reward for the new state
                reward = self.set_reward(player1, game.crash)

                # train short memory based on the new action and state
                self.train_short_memory(state_old, final_move, reward,
                                        state_new, game.crash)

                # store the new data into a long term memory
                self.remember(state_old, final_move, reward, state_new,
                              game.crash)
                record = get_record(game.score, record)
                if game_settings['display_option']:
                    display(player1, food1, game, record)
                    pygame.time.wait(game_settings['speed'])

            self.replay_new(self.memory)
            counter_games += 1
            print('Game', counter_games, '      Score:', game.score)
            score_plot.append(game.score)
            counter_plot.append(counter_games)
        self.model.save_weights('weights.hdf5')
        # from google.colab import files
        # files.download("weights.hdf5")
        plot_seaborn(counter_plot, score_plot)
Example #20
class Trainer:
    def __init__(self, model, args):
        self.model = model
        self.args = args
        self.iteration = 0
        self.memory = Memory()

        if self.args.override or not os.path.isdir(
                self.args.output_dir) or self.args.output_dir == 'tmp':
            mkdir(self.args.output_dir, wipe=True)

        # initialize logging and model saving
        if self.args.output_dir is not None:
            self.logger = Logger(
                os.path.join(self.args.output_dir, 'train_log.json'))
        else:
            self.logger = Logger()

    # a wrapper for model.forward to feed inputs as list and get outputs as a list
    def evaluate_model(self, inputs):
        output = self.model.get_model().forward(*inputs)
        return list(output) if isinstance(output, tuple) else [output]

    def train(self):
        # load after a forward call for dynamic models
        batched_data, _, _ = load_samples(self.model.get_loader(),
                                          self.model.cuda,
                                          self.args.batch_size)
        self.evaluate_model(batched_data)
        self.iteration = load(self.args.output_dir, self.model.get_model(),
                              self.iteration, self.model.get_optimizer())

        for i in range(self.iteration, self.iteration + self.args.iterations):
            #################### LOAD INPUTS ############################
            # TODO, make separate timer class if more complex timings arise
            t0 = time.time()
            batched_data, batched_targets, sample_array = load_samples(
                self.model.get_loader(), self.model.cuda, self.args.batch_size)
            self.logger.set('timing.input_loading_time', time.time() - t0)
            #############################################################

            #################### FORWARD ################################
            t1 = time.time()
            outputs = self.evaluate_model(batched_data)
            self.logger.set('timing.foward_pass_time', time.time() - t1)
            #############################################################

            #################### BACKWARD AND SGD  #####################
            t2 = time.time()
            loss = self.model.get_lossfn()(*(outputs + batched_targets))
            self.model.get_optimizer().zero_grad()
            loss.backward()
            self.model.get_optimizer().step()
            self.logger.set('timing.loss_backward_update_time',
                            time.time() - t2)
            #############################################################

            #################### LOGGING, VIZ and SAVE ###################
            print('iteration: {0} loss: {1}'.format(self.iteration,
                                                    loss.data.item()))

            if self.args.compute_graph and i == self.iteration:
                compute_graph(
                    loss,
                    output_file=os.path.join(self.args.output_dir,
                                             self.args.compute_graph))

            if self.iteration % self.args.save_iter == 0:
                save(self.model.get_model(), self.model.get_optimizer(),
                     self.iteration, self.args.output_dir)

            self.logger.set('time', time.time())
            self.logger.set('date', str(datetime.now()))
            self.logger.set('loss', loss.data.item())
            self.logger.set('iteration', self.iteration)
            self.logger.set('resident_memory',
                            str(self.memory.resident(scale='mB')) + 'mB')
            self.logger.dump_line()
            self.iteration += 1

            if self.args.visualize_iter > 0 and self.iteration % self.args.visualize_iter == 0:
                Batcher.debatch_outputs(sample_array, outputs)
                for sample in sample_array:
                    sample.visualize({'title': random_str(5)})
                ImageVisualizer().dump_image(
                    os.path.join(
                        self.args.output_dir,
                        'visualizations_{0:08d}.svg'.format(self.iteration)))
Example #21
class ActorCritic(Agent):
    def __init__(self,
                 env: Env,
                 policy_lr: float,
                 value_lr: float,
                 gamma: float = 0.99,
                 value_iter=50,
                 policy_layers=(128, 128),
                 value_layers=(128, 128),
                 verbose=False,
                 save=True,
                 policy_path=None,
                 value_path=None):
        super().__init__(env, verbose, save)
        self.gamma = gamma

        if self.action_space.discrete:
            policy_head = nn.Softmax(dim=-1)
        else:
            policy_head = nn.Tanh()

        self.policy_path = policy_path
        self.value_path = value_path
        self.policy_model = MLP(self.state_space.shape[0],
                                self.action_space.shape[0], policy_layers,
                                policy_head)
        self.value_model = MLP(self.state_space.shape[0], 1, value_layers,
                               None)
        self.policy_optimizer = optim.Adam(self.policy_model.parameters(),
                                           lr=policy_lr)
        self.value_optimizer = optim.Adam(self.value_model.parameters(),
                                          lr=value_lr)
        self.value_loss = nn.MSELoss()
        self.reset()
        self.counter = 0
        self.value_iter = value_iter

    def setup_memory(self) -> None:
        columns = [
            "states", "next_states", "actions", "log_probs", "rewards", "done"
        ]
        self.episode_memory = Memory(columns)
        self.epoch_memory = Memory(columns)

    def act(self, state: List, train=True) -> Tuple:
        state = torch.from_numpy(state).type(torch.FloatTensor)
        action_probs = self.policy_model(state)
        distribution = self.action_space.distribution(action_probs)
        action = distribution.sample()
        if train:
            return action.data.numpy(), distribution.log_prob(action)
        else:
            return torch.argmax(action_probs).data.numpy(),

    def update(self) -> None:
        states, next_states, rewards, cumulated_rewards, log_probs, done = self.epoch_memory.get_columns(
            [
                "states", "next_states", "rewards", "cumulated_rewards",
                "log_probs", "done"
            ])
        # Compute the advantage for the previous value function
        with torch.no_grad():
            advantages = torch.Tensor(rewards) + (
                self.gamma * (1 - torch.tensor(done, dtype=int)) *
                self.value_model(torch.Tensor(next_states)).squeeze() -
                self.value_model(torch.Tensor(states)).squeeze())

        # Train the value function for a certain number of iterations
        for _ in range(int(self.value_iter) + 1):
            values = self.value_model(torch.Tensor(states)).squeeze()
            value_loss = self.value_loss(values,
                                         torch.Tensor(cumulated_rewards))
            self.value_optimizer.zero_grad()
            value_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.value_model.parameters(), 1)
            self.value_optimizer.step()
        self.value_iter *= 0.95
        print(f"Value Loss: {value_loss.item()}")
        # Compute the policy loss using the previous value function
        policy_loss = -torch.sum(torch.mul(torch.stack(log_probs),
                                           advantages)) / self.counter
        print(f"Policy Loss: {policy_loss.item()}")
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_model.parameters(), 1)
        self.policy_optimizer.step()
        self.reset()

    def save_model(self) -> None:
        torch.save(self.policy_model.state_dict(), self.policy_path)
        torch.save(self.value_model.state_dict(), self.value_path)

    def load_model(self, policy_path: str, value_path: str) -> None:
        self.policy_model.load_state_dict(torch.load(policy_path))
        self.value_model.load_state_dict(torch.load(value_path))
        self.policy_model.eval()
        self.value_model.eval()

    def setup_schedulers(self, n_epochs: int) -> None:
        policy_scheduler = ExponentialLR(self.policy_optimizer, 0.97)
        value_scheduler = ExponentialLR(self.value_optimizer, 0.97)
        self.schedulers.append(policy_scheduler)
        self.schedulers.append(value_scheduler)

    def cumulate_rewards(self) -> None:
        cumulated_reward = 0
        cumulated_rewards = []
        rewards, = self.episode_memory.get_columns(["rewards"])
        for i in range(len(rewards) - 1, -1, -1):
            cumulated_reward = self.gamma * cumulated_reward + rewards[i]
            cumulated_rewards.append(cumulated_reward)
        self.episode_memory.extend_column("cumulated_rewards",
                                          cumulated_rewards[::-1])
Example #22
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters())

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0
}
Example #23
def Game(max_ep_len=1000, num_frames=4):
    global exit_game
    global actions

    env = gym.make('CarRacing-v0')
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape
    print(f"State: {state_dim}")
    print(f"Action: {action_dim}")

    # set interrupts
    env.reset()
    env.viewer.window.on_key_press = key_press
    env.viewer.window.on_key_release = key_release

    # make global actions array 
    actions = np.zeros(4, dtype=np.float32)

    # mem
    memory = Memory()
    memory.create(state_dim, action_dim)

    # logger
    ep_ret_log = []

    # init environment
    obs, ep_ret, ep_len, epoch = env.reset(), 0, 0, 0
    obs = np.expand_dims(obs, axis=0)
    state_stack = np.repeat(obs, num_frames, axis=0)
    print(state_stack.shape)
    print(state_stack.dtype)

    # main loop
    while exit_game == False:
        # render window
        env.render()

        # take action
        obs2, r, d, _ = env.step(actions[:3])
        obs2 = np.expand_dims(obs2, axis=0)
        state_stack = np.append(state_stack[1:], obs2, axis=0)

        # statistics
        ep_ret += r
        ep_len += 1

        # Ignore the 'done' signal
        d = False if ep_len == max_ep_len else d

        # store in memory
        memory.add(state_stack, np.array(actions[:3]), r, d)
        
        # End of episode
        if d or (ep_len == max_ep_len):
            print(f"Epoch: {epoch}, EpRet: {ep_ret}, EpLen: {ep_len}, ReplayBuff: {len(memory)}")

            # if exists statistical data
            if len(ep_ret_log) > 0:
                log = np.array(ep_ret_log)
                print("AvgEpRet:", log.mean())
                print("StdEpRet:", log.std())
                print("MaxEpRet:", log.max())
                print("MinEpRet:", log.min())
            
            print()

            ep_ret_log.append(ep_ret)

            obs, ep_ret, ep_len = env.reset(), 0, 0
            obs = np.expand_dims(obs, axis=0)
            state_stack = np.repeat(obs, num_frames, axis=0)

            epoch += 1
    
    print('\n')

    # save the dataset
    memory.save()
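
The Game loop keeps a rolling stack of the last num_frames observations: the first frame is repeated to seed the stack, then each new frame replaces the oldest one. A minimal sketch of that pattern in isolation (the observation shape is illustrative, not necessarily CarRacing's):

# Minimal frame-stacking sketch; shapes are illustrative
import numpy as np

num_frames = 4
obs = np.zeros((96, 96, 3), dtype=np.uint8)                    # a single observation
state_stack = np.repeat(obs[np.newaxis], num_frames, axis=0)   # seed: (4, 96, 96, 3)

new_obs = np.ones((96, 96, 3), dtype=np.uint8)
state_stack = np.append(state_stack[1:], new_obs[np.newaxis], axis=0)  # drop oldest, append newest
print(state_stack.shape)  # (4, 96, 96, 3)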
Beispiel #24
0
def main():
    # initialize the game
    env = gym.make('Pendulum-v0').unwrapped
    print(env.observation_space)
    print(env.observation_space.high)
    print(env.observation_space.low)
    print(env.action_space)
    # import hyper parameters
    args = init_hyper_para()
    # randomly initialize the critic network
    state_dim = env.reset().shape[0]
    action_dim = env.action_space.shape[0]
    # if we have a saved model, load it
    if os.path.exists(
            '/home/likang/PycharmProjects/myddpg/bin/Models/critic.ckpt'):
        critic_net = torch.load(
            '/home/likang/PycharmProjects/myddpg/bin/Models/critic.ckpt')
    else:  # initialize the model
        critic_net = net.CriticNetwork(
            state_dim=state_dim, action_dim=action_dim).to(
                device)  # need to init paras according to the gym game
    # randomly initialize the actor network (also called the policy network)
    if os.path.exists(
            '/home/likang/PycharmProjects/myddpg/bin/Models/actor.ckpt'):
        actor_net = torch.load(
            '/home/likang/PycharmProjects/myddpg/bin/Models/actor.ckpt')
    else:
        actor_net = net.ActorNetwork(state_dim=state_dim,
                                     action_dim=action_dim).to(device)
    # initialize
    optimizer_critic = opt.Adam(critic_net.parameters(), lr=0.001)
    optimizer_actor = opt.Adam(actor_net.parameters(), lr=0.001)
    # initialize the target critic network as a copy of the critic network
    target_critic_net = copy.deepcopy(critic_net)
    # initialize the target actor network as a copy of the actor network
    target_actor_net = copy.deepcopy(actor_net)
    # init the memory buffer
    memory = Memory(args.capacity)
    # initialize a random process N for action exploration
    ounoise = OUNoise(env.action_space.shape[0])  # init random process
    # enter circle of training process
    for ep in range(args.num_ep):
        print(["ep: ", ep])
        # reset random process
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end -
            ep) / args.exploration_end + args.final_noise_scale
        ounoise.reset()
        # initialize a state s1
        state = env.reset()  # here the state is initialized as a 2-D tensor
        state = torch.tensor([state], dtype=torch.float32).to(device)
        for t in range(MAX_STEP):
            print(['time step: ', t])
            # select an action according to the actor network (also called the policy network)
            action = actor_net.select_action(state, ounoise)
            # execute the action and get the new state s_{i+1}
            # and a reward from the environment
            next_state, reward, done, _ = env.step([action.item()])
            # store the transition {s_i, a_i, r_i, s_i+1} into memory
            next_state = torch.tensor([next_state],
                                      device=device,
                                      dtype=torch.float32)
            reward = torch.tensor([[reward]],
                                  device=device,
                                  dtype=torch.float32)
            memory.push(state, action, reward, next_state)
            state = next_state
            # print([state, action, reward, next_state])
            del action, reward, next_state
            # sample batch_size transitions:
            # (s_i, a_i, r_i, s_{i+1}) in Algorithm 1 of DDPG
            transitions = memory.sample(args.batch_size)
            s1 = torch.cat([tran.state for tran in transitions])
            s2 = torch.cat([tran.next_state for tran in transitions])
            r1 = torch.cat([tran.reward for tran in transitions])
            a1 = torch.cat([tran.action for tran in transitions])
            update_critic_net(s1, s2, r1, a1, target_actor_net,
                              target_critic_net, critic_net, optimizer_critic,
                              args)
            # update actor policy network
            update_actor_net(s1, actor_net, critic_net, optimizer_actor)
            # update target critic network
            # theta^{Q'}, see algorithm1 of DDPG
            for target_param, source_param in zip(
                    target_critic_net.parameters(), critic_net.parameters()):
                target_param.data.copy_(args.tau * source_param +
                                        (1 - args.tau) * target_param)
            # update target actor network
            # theta^{mu'}, see algorithm1 of DDPG
            for target_param, source_param in zip(
                    target_actor_net.parameters(), actor_net.parameters()):
                target_param.data.copy_(args.tau * source_param +
                                        (1 - args.tau) * target_param)
            # show image
            plt.imshow(env.render('rgb_array'))
            time.sleep(0.001)
            # finish
            if done:
                break
            del transitions
        gc.collect()

        if ep % 10 == 0:  # save model
            torch.save(critic_net, './Models/' + 'critic.ckpt')
            torch.save(actor_net, './Models/' + 'actor.ckpt')
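
The two parameter-copy loops near the end of the training step are the soft target update from DDPG, theta_target <- tau * theta + (1 - tau) * theta_target. A small helper expressing the same update (the name soft_update is an illustrative choice, not part of the example):

# Minimal sketch of the Polyak / soft target update used above
import torch

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

# usage, assuming the networks and args.tau from the example:
# soft_update(target_critic_net, critic_net, args.tau)
# soft_update(target_actor_net, actor_net, args.tau)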
Beispiel #25
0
# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)  # Value network

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters())
valuenet_optimizer = optim.Adam(value_net.parameters())

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0,
    "value net loss": []
Beispiel #26
0
        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    states = running_state(env_info.vector_observations)

    actor_optim = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optim = optim.Adam(critic.parameters(),
                              lr=args.critic_lr,
                              weight_decay=args.l2_rate)

    scores = []
    score_avg = 0

    for iter in range(args.max_iter):
        actor.eval(), critic.eval()
        memory = [Memory() for _ in range(num_agent)]

        steps = 0
        score = 0

        while steps < args.time_horizon:
            steps += 1

            mu, std, _ = actor(to_tensor(states))
            actions = get_action(mu, std)
            env_info = env.step(actions)[default_brain]

            next_states = running_state(env_info.vector_observations)
            rewards = env_info.rewards
            dones = env_info.local_done
            masks = list(~(np.array(dones)))
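
The last line turns the per-agent done flags into continuation masks (1 while an episode continues, 0 at termination), which are typically used later to cut off bootstrapping when returns or advantages are accumulated. A minimal sketch with explicit values (the numbers are illustrative):

# Minimal sketch: done flags -> continuation masks (values are illustrative)
import numpy as np

dones = np.array([False, False, True])
masks = (~dones).astype(np.float32)   # -> [1., 1., 0.]
# when accumulating returns, the bootstrapped term is multiplied by the mask,
# so everything after a terminal step contributes nothing:
#   G_t = r_t + gamma * mask_t * G_{t+1}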
Beispiel #27
0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model. Two value net
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network
value_net_ex = ValueNet(input_size).to(
    device)  # Value network for extrinsic reward
value_net_in = ValueNet(input_size + 1 + output_size).to(
    device)  # One additional input unit to indicate trajectory number

# Set up optimizer
valuenet_in_optimizer = optim.Adam(value_net_in.parameters())
valuenet_ex_optimizer = optim.Adam(value_net_ex.parameters())

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, device=device)


# Define observation normalization function. Normalize state vector values to range [-1., 1.]
def state_nomalize(s):
    # Obtain environment observation space limit
    high = env.observation_space.high
    low = env.observation_space.low
    return ((s - low) / (high - low)) * 2 - 1


# Create Hashing function
simhash = SimHash(input_size,
                  len_hashcode,
                  preprocessor=state_nomalize if use_preprocessor else None)
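
SimHash is used here for count-based exploration: states are first squashed into [-1, 1] by state_nomalize, then mapped to a short binary code so that visit counts can be kept per code and turned into a bonus. A minimal sketch of the usual random-projection construction; the class below is an illustrative stand-in, not the project's SimHash implementation:

# Illustrative SimHash-style counting sketch (not the project's SimHash class)
import numpy as np
from collections import defaultdict

class CountingSimHash:
    def __init__(self, state_dim, code_len, seed=0):
        rng = np.random.default_rng(seed)
        self.A = rng.standard_normal((code_len, state_dim))  # fixed random projection
        self.counts = defaultdict(int)

    def bonus(self, state):
        code = tuple((self.A @ np.asarray(state) > 0).astype(np.int8))  # sign bits
        self.counts[code] += 1
        return 1.0 / np.sqrt(self.counts[code])  # bonus decays as the code is revisited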
Beispiel #28
0
def marl_test(config):

    experiment_name = config.setdefault("experiment_name", "")
    time_slots = config.setdefault("time_slots", 10000)
    simulations = config.setdefault("simulations", 3)

    memory_size = config.setdefault("memory_size", 1200)
    pretrain_length = config.setdefault("pretrain_length", 6)
    step_size = config.setdefault("step_size", 5)
    save_freq = config.setdefault("save_freq", 1000)
    save_results = config.setdefault("save_results", True)
    save_model = config.setdefault("save_model", False)
    load_model = config.setdefault("load_model", False)
    load_slot = config.setdefault("load_slot", 4999)
    training = config.setdefault("training", False)
    episode_interval = config.setdefault("episode_interval", 25)
    explore_step = config.setdefault("explore", 2000)
    greedy_step = config.setdefault("greedy", 20000)
    training_stop = config.setdefault("training_stop", 20000)  # Stop training after this many time slots.
    train_after_episode = config.setdefault("train_after_episode", False)  # Train after each episode instead of after each time slot.
    global_reward_avg = config.setdefault("global_reward_avg", False)  # Add the average global reward to each user's reward.
    save_positions = config.setdefault("save_positions", False)  # Log the user positions at every time slot.
    enable_channel = config.setdefault("enable_channel", False)  # Use the channel-aware step function (my_step_ch).

    batch_size = config["RLAgent"]["batch_size"]
    ia_penalty_enable = config.setdefault("ia_penalty_enable", False)
    ia_averaging = config.setdefault("ia_averaging", False)


    for simulation in range(simulations):
        print("-=-=-=-=-=-=-=-=-=-=-= experiment_name: " + experiment_name + " SIMULATION " + str(simulation + 1) + " =-=-=-=-=-=-=-=-=-=-=-")
        # Initialize the env.
        env = TestEnv(**config["EnvironmentTest"])

        if ia_penalty_enable:
            ia_penalty_threshold = config.setdefault("ia_penalty_threshold", 5)
            ia_penalty_value = config.setdefault("ia_penalty_value", -10)
            ia_penalty_counter = {}
            previous_actions = {}  # store the previous taken action by the UE.
            num_users = env.get_total_users()
            for user in range(num_users):
                ia_penalty_counter[user] = 0
                previous_actions[user] = -1

        # Initialize the agent
        mainDRQN = DRQN(env, name=experiment_name, total_episodes=time_slots/episode_interval, **config["RLAgent"])
        #mainDRQN = DeepRecurrentQNetwork(env=env, name=experiment_name, **config["RLAgent"])
        if load_model:
            print("Load model DRQN time step " + str(load_slot))
            save_dir = "save_model/" + "test/"
            mainDRQN.load_model(save_dir, load_slot)

        # Experience replay buffer (deque) from which each training batch is sampled and fed to the neural network
        memory = Memory(max_size=memory_size)

        log_reward_slot = []
        log_actions_slot = []
        log_ia_slot = []
        sum_ia_prev = 0

        log_x_positions = []
        start_time = time.time()
        episode = 0  # Used to update the greediness of the algorithm
        # cumulative reward
        cum_r = [0]
        cum_r_slots = [0]

        # cumulative collision
        cum_collision = [0]
        cum_collision_slots = [0]
        # this is our input buffer which will be used for  predicting next Q-values
        history_input = deque(maxlen=step_size)
       # env.network.reset_ia()
        # to sample random actions for each user
        action = env.sample()

        #obs = env.step(action)
        obs, rews = env.my_step(action, 0)
        rews = list(rews)
        state = env.obtain_state(obs, action, rews)
        # reward = [i[1] for i in obs[:num_users]]
        num_users = env.get_total_users()
        num_channels = env.get_action_space()
        ##############################################
        for ii in range(pretrain_length*step_size*5):
            action = env.sample()
            if enable_channel:
                obs, reward = env.my_step_ch(action, 0)  # obs is a list of tuples: [(ACK, REW) for each user, (CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
            else:
                #obs, reward = env.my_step(
                #    action, 0)  # obs is a list of tuple with [(ACK,REW) for each user ,(CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
                obs, reward = env.my_step_design(action, 0)

            # obs is a list of tuples: [[(ACK, REW) for each user], CHANNEL_RESIDUAL_CAPACITY_VECTOR]
            next_state = env.obtain_state(obs, action, reward)
            #next_state = env.state_generator(action, obs)
            memory.add((state, action, reward, next_state))
            state = next_state
            history_input.append(state)

            ##############################################
        # TODO: now load the positions
        env.load_saved_positions()
        for time_step in range(time_slots):
            #initializing action vector
            action = np.zeros([num_users], dtype=np.int32)

            # converting the input history into a numpy array
            # TODO: enable below for lstm
            state_vector = np.array(history_input)  #  LSTM
            #  state_vector = state  #  DQN
            for each_user in range(num_users):
                #action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, time_slot=time_step)
                if time_step < explore_step and not load_model: # and 0:
                    action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, episode=episode,
                                                              policy="explore")

                elif time_step < greedy_step and not load_model: # and 0:
                    action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, episode=episode)
                else:
                    action[each_user] = mainDRQN.infer_action(each_user, state_vector=state_vector, episode=episode, policy="greedy")

            # taking the action predicted from the Q-values and receiving the observation from the environment
            # obs = env.step(action)           # obs is a list of tuple with [(ACK,REW) for each user ,(CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
            if save_positions:
                user_pos = env.get_x_pos()
                log_x_positions.append(user_pos)
            if enable_channel:
                obs, reward = env.my_step_ch(action, time_step)  # obs is a list of tuples: [(ACK, REW) for each user, (CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
            else:
                obs, reward = env.my_step(action, time_step)  # obs is a list of tuples: [(ACK, REW) for each user, (CHANNEL_RESIDUAL_CAPACITY_VECTOR)]
                #obs, reward = env.my_step_design(action, time_step)
                # TODO: update the env topology after each step.
            log_actions_slot.append(action)
            ia = env.network.get_information_age(time_step)
            ia_sum = calculate_ia_penalty(ia)
            log_ia_slot.append(ia)
            if ia_averaging:  # ia based penalty to the reward
                ia_penalty = 0
                if ia_sum > sum_ia_prev:
                    ia_penalty = -1
                elif ia_sum < sum_ia_prev:
                    ia_penalty = 1

                sum_ia_prev = ia_sum

            # Generate next state from action and observation
            # next_state = env.state_generator(action, obs)  used for DQN
            next_state = env.obtain_state(obs, action, reward, episode, mainDRQN.get_eps())
            #	print (next_state)

            # reward for all users given by environment
            #reward = [i[1] for i in obs[:num_users]]

            # calculating sum of rewards
            sum_r = np.sum(reward)

            #calculating cumulative reward
            cum_r.append(cum_r[-1] + sum_r)
            cum_r_slots.append(cum_r_slots[-1] + sum_r)

            # If NUM_CHANNELS = 2, the total possible reward is 2, therefore collision = (2 - sum_r), i.e. (num_channels - sum_r)
            collision = num_channels - sum_r

            #calculating cumulative collision
            cum_collision.append(cum_collision[-1] + collision)
            cum_collision_slots.append(cum_collision_slots[-1] + collision)
            #############################
            #  for a co-operative policy we would give the summed reward to each user who contributed
            #  to the co-operative play, and 0 to the rest
            # NOTE: I think I do not need that part since I already use positive and negative rewards.

            for i in range(len(reward)):  # for each user we have this.
                #if reward[i] > 0:
                if ia_averaging:
                    # add penalty based on the direction of the Information age.
                    reward[i] += ia_penalty

                if ia_penalty_enable:
                    if reward[i] < 1 and action[i] == previous_actions[i]:
                        ia_penalty_counter[i] += 1
                    else:
                        ia_penalty_counter[i] = 0

                    if ia_penalty_counter[i] > ia_penalty_threshold:
                        reward[i] = ia_penalty_value

                    previous_actions[i] = action[i]

                if global_reward_avg:
                    reward[i] = reward[i] + sum_r/len(reward)  # Add the average total reward to each UE.

            #############################
            #reward = reward*2  # Add the average total reward to each UE.
            log_reward_slot.append(sum_r)
            #	print (reward)
            #	print("EPOCH " + str(time_step))

            # add new experiences into the memory buffer as (state, action , reward , next_state) for training
            memory.add((state, action, reward, next_state))

            state = next_state
            #add new experience to generate input-history sequence for next state
            history_input.append(state)

            #  Start training.
            if not train_after_episode:
                if time_step < training_stop and training: #and not load_model:
                    mainDRQN.train(memory, time_step)

            if time_step%(episode_interval) == episode_interval-1:
                print("Time step " + str(time_step) + " epsilon " + str(mainDRQN.get_eps())
                      + " cum Collison " + str(cum_collision[episode_interval]) + " sum reward " + str(cum_r[episode_interval]) + " total time " + str(time.time()-start_time) )
                cum_r = [0]
                cum_collision = [0]
                episode += 1
                # Updates the velocity of the vehicles if activated
                env.update_velocity()
               # ia = env.network.get_information_age(time_step)
                if train_after_episode and time_step > (batch_size+10) and training:
                    mainDRQN.train(memory, time_step)

            if time_step%save_freq == save_freq-1:
                # Save the collisions
                if save_results:
                    print("save results for timestep ", time_step + 1)
                    save_dir = "save_results/" + "test/"
                    save_dir = save_dir + experiment_name
                    if not os.path.isdir(save_dir):
                        os.makedirs(save_dir)
                   # filename = save_dir + "/collisions" + "_" + str(time_step) +"_sim"+str(simulation)
                   # np.save(filename, np.asarray(cum_collision_slots))
                    filename = save_dir + "/rewards" + "_sim"+str(simulation)
                    np.save(filename, np.asarray(log_reward_slot))
                    filename = save_dir + "/actions" + "_sim"+str(simulation)
                    np.save(filename, np.asarray(log_actions_slot))
                  #  filename = save_dir + "/time_step" + "_" + str(time_step)+"_sim"+str(simulation)
                  #  np.save(filename, np.asarray(str(time.time()-start_time)))
                    filename = save_dir + "/positions" + "_sim"+str(simulation)
                    np.save(filename, np.asarray(log_x_positions))
                    #filename = save_dir + "/ia" + "_sim"+str(simulation)
                    #np.save(filename, np.asarray(log_ia_slot))
                    #"_" + str(time_step)+

                if save_model:
                    print("save model for timestep ", time_step + 1)
                    save_dir = "save_model/" + "test/"
                    #save_dir = save_dir
                    mainDRQN.save_model(save_dir, time_step, simulation)
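
marl_test only relies on two operations of Memory: add() to append a (state, action, reward, next_state) tuple, and sampling batches inside DRQN.train. A deque-based sketch of such a buffer, assuming uniform sampling; the project's Memory class may differ:

# Minimal replay-buffer sketch with the interface used above (uniform sampling assumed)
import random
from collections import deque

class SimpleReplayMemory:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)   # oldest experiences are dropped automatically

    def add(self, experience):
        # experience: (state, action, reward, next_state)
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)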