Example #1
def generate_memory(size, game='Pendulum'):
    """Fill a replay memory with `size` single random-action transitions."""
    if game.startswith('Pendulum'):
        env = PendulumWrapper()
    elif game.startswith('LunarLander'):
        env = LunarWrapper()
    else:
        raise ValueError('Unsupported game: {}'.format(game))

    memory = ReplayMemory(100000)

    for _ in range(size):
        # Each transition starts from a freshly reset state and takes one random action.
        s = env.reset()
        a = env.action_space.sample()
        s_, r, d, _ = env.step(a)

        # The last field is a "not done" flag (1 while the episode continues).
        memory.push(s, a, r, s_, 1 - int(d))

    return memory
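
Every example on this page assumes a ReplayMemory class (and, in most cases, a Transition namedtuple) defined elsewhere in the source project. A minimal sketch in the style of the PyTorch DQN tutorial follows; the field names and the arity of push() vary between the examples below, so treat this only as an illustration:

import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Store a transition, overwriting the oldest one once capacity is reached.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)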
Example #2
class TransitionSaver:
    def __init__(self):
        self.processor = PreprocessImage(None)
        self.memory = ReplayMemory()
        self.transitions = []
        self.index = 0
        self.nsteps = 10

    def new_episode(self, first_state):
        self.state = self.processor._observation(first_state)

    def add_transition(self, action, next_state, reward, done):
        if not done and self.index < self.nsteps:
            next_state = self.processor._observation(next_state)
            # The newest transition goes to the front with an n-step return of zero.
            self.transitions.insert(
                0,
                Transition(self.state, self.add_noop(action), next_state,
                           torch.FloatTensor([reward]), torch.zeros(1)))

            # Fold the new reward into every buffered transition, discounted by
            # its distance from the current step (gamma^0, gamma^1, ...).
            transitions = []
            gamma = 1
            for trans in self.transitions:
                transitions.append(
                    trans._replace(n_reward=trans.n_reward + gamma * reward))
                gamma = gamma * GAMMA
            self.transitions = transitions
        else:
            # Episode ended (or the n-step window is full): flush to replay memory.
            for trans in self.transitions:
                self.memory.push(trans)
            self.transitions = []
        self.state = next_state
    
    def add_noop(self, actions):
        # Prepend a no-op flag that is active only when no other action flag is
        # set, then return the index of the selected action.
        actions.insert(0, 0)
        actions = torch.LongTensor(actions)
        actions[0] = (1 - actions[1:].max(0)[0])[0]
        return actions.max(0)[1]

    def save(self, fname):
        with open(fname, 'wb') as memory_file:
            pickle.dump(self.memory, memory_file)
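
add_transition builds n-step returns incrementally: each new reward is folded into every transition already buffered, discounted by its distance from the current step. A standalone numeric sketch of that accumulation (hypothetical helper, discount factor assumed to be 0.9):

GAMMA = 0.9  # assumed discount factor

def fold_reward(running_returns, reward, gamma=GAMMA):
    # running_returns[0] belongs to the newest transition, running_returns[-1] to the oldest.
    return [ret + (gamma ** i) * reward for i, ret in enumerate(running_returns)]

returns = []
for r in [1.0, 0.0, 2.0]:   # rewards observed over three consecutive steps
    returns = fold_reward([0.0] + returns, r)
print(returns)              # [2.0, 1.8, 2.62]; the oldest entry equals 1 + 0.9*0 + 0.81*2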
Example #3
    env.reset()

    episode_record = []  # use this to record temporarily for one episode
    # for t in count():
    for t in range(2999):
        steps_done += 1
        # Select and perform an action
        # print(state.shape)
        action = select_action(torch.tensor(state).to(device))
        # print(action.item())
        next_state, reward, terminal, _ = env.step([action.item()])
        episode_record.append((next_state, reward))
        # print(next_state.shape)
        reward = torch.tensor([reward], device=device)
        # Store the transition in memory
        memory.push(torch.tensor([state]), torch.tensor([action]),
                    torch.tensor([next_state]), reward)
        # print("reward",reward)
        # Move to the next state
        state = next_state
        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if terminal:
            print('terminal')
            episode_durations.append(t + 1)
            break
        # Update the target network, copying all weights and biases in DQN
        if steps_done % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
    average_reward = evaluate_episode(episode_record)
    print("episode:", i_episode, 'average reward:', average_reward)
    torch.save(target_net.state_dict(),
Example #4
         rad = np.linalg.norm(s_next - kwargs["emb_goal"], 2)
         threshold = 3.5
         kwargs["emb_threshold"] = threshold
     else:
         rad = np.linalg.norm(ts_next - goal.reshape(-1), 2)
         threshold = 0.5
     r = -1
     if rad < threshold:
         count += 1
         # print(ts_next)
         r = 0
         s_next = None
     if is_shapedreward:
         r -= rad
     if not is_image:
         memory.push(ts, a, ts_next, r)
     else:
         memory.push(s, a, s_next, r)
 print("Number of goals reached in transitions: %d" % count)
 """
 Training Q-function
 """
 n_iters = len(transitions) // BATCH_SIZE
 for epoch in range(N_EPOCHS):
     loss = 0
     for it in range(n_iters):
         loss += optimize_model(memory, policy_net, target_net, optimizer,
                                GAMMA, BATCH_SIZE)
         if it % TARGET_UPDATE == 0:
             target_net.load_state_dict(policy_net.state_dict())
     pred_v, real_dist, emb_dist, reward, emb_reward = eval_task(
Example #5
def run_dq_pole(num_episodes):
    logg = logging.getLogger(f"c.{__name__}.run_dq_pole")
    logg.debug(f"Start run_dq_pole")

    env = gym.make("CartPole-v0").unwrapped

    plt.ion()

    # if gpu is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logg.debug(f"Using {device} as device")

    #  show_frame(env)

    # hyperparameters
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10

    env.reset()
    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen()
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    steps_done = 0

    # main training loop. At the beginning we reset the environment and
    # initialize the state Tensor. Then, we sample an action, execute it,
    # observe the next screen and the reward (always 1), and optimize our model
    # once. When the episode ends (our model fails), we restart the loop.

    #  num_episodes = 50
    episode_durations = []

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(
                state,
                n_actions,
                steps_done,
                device,
                policy_net,
                EPS_START,
                EPS_END,
                EPS_DECAY,
            )
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model(BATCH_SIZE, memory, device, policy_net, target_net,
                           GAMMA, optimizer)
            if done:
                episode_durations.append(t + 1)
                plot_durations(episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("Complete")
    env.render()
    # remember to close the env, avoid sys.meta_path undefined
    env.close()
    plt.ioff()
    plt.show()
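
run_dq_pole relies on a select_action helper that is not shown here. A sketch of the usual implementation implied by the EPS_START/EPS_END/EPS_DECAY hyperparameters (epsilon-greedy with exponentially decaying epsilon, as in the PyTorch DQN tutorial; the signature is inferred from the call site above):

import math
import random

import torch

def select_action(state, n_actions, steps_done, device, policy_net,
                  eps_start, eps_end, eps_decay):
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Greedy action: index of the largest predicted Q-value.
            return policy_net(state).max(1)[1].view(1, 1)
    # Exploratory action.
    return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)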
Example #6
class DQNAgent(Agent):
    def __init__(self, model, env, **kwargs):
        Agent.__init__(self, **kwargs)
        self.update_step = 0
        self.eps = self.EPS_START
        self.global_step = 0
        self.model = model
        self.target_model = copy.deepcopy(model)
        self.in_size = model.in_size
        self.out_size = model.out_size
        self.memory = ReplayMemory(self.REPLAY_CAPACITY)
        self.opt = torch.optim.Adam(self.model.parameters(), lr=self.LR)
        self.env = env
        self.container = Container(self.model.SAVE_MODEL_NAME)

    def select_action(self, state):
        if self.is_training:
            self.global_step += 1
            self.eps = self.EPS_START - (self.EPS_START - self.EPS_END
                                         ) / self.EPS_DECAY * self.global_step
            if self.eps < self.EPS_END:
                self.eps = self.EPS_END

        if self.is_training and np.random.rand() < self.eps:
            return LongTensor([[np.random.randint(self.out_size)]])
        else:
            var = Variable(state).type(FloatTensor)
            out = self.model(var)
            return out.max(1)[1].data.view(1, 1)

    def _DQ_loss(self, y_pred, reward_batch, non_final_mask,
                 non_final_next_states):
        q_next = Variable(torch.zeros(self.BATCH_SIZE).type(FloatTensor))
        target_q = self.target_model(non_final_next_states)
        if self.DOUBLE_DQN:
            max_act = self.model(non_final_next_states).max(1)[1].view(-1, 1)
            q_next[non_final_mask] = target_q.gather(1, max_act).data.view(
                target_q.gather(1, max_act).data.shape[0])
        else:
            q_next[non_final_mask] = target_q.max(1)[0].data

        # next_state_values.volatile = False
        y = q_next * self.GAMMA + reward_batch
        loss = nn.functional.mse_loss(y_pred, y)
        return loss

    def _calc_loss(self):
        batch = self.memory.sample(self.BATCH_SIZE)
        non_final_mask = ByteTensor(
            tuple([s is not None for s in batch.next_state]))
        non_final_next_states = Variable(
            torch.cat([s for s in batch.next_state if s is not None]))

        state_batch = Variable(
            torch.cat([s for s in batch.state if s is not None]))
        action_batch = Variable(
            torch.cat([s for s in batch.action if s is not None]))
        reward_batch = Variable(
            torch.cat([s for s in batch.reward if s is not None]))

        y_pred = self.model(state_batch).gather(1, action_batch).squeeze()
        loss = self._DQ_loss(y_pred, reward_batch, non_final_mask,
                             non_final_next_states)
        self.container.add("y_pred", torch.mean(y_pred.data))
        self.container.add("loss", loss.data.item())
        return loss

    def update_policy(self):
        loss = self._calc_loss()
        self.opt.zero_grad()
        loss.backward()
        if self.GRADIENT_CLIPPING:
            for param in self.model.parameters():
                param.grad.data.clamp_(-self.GRADIENT_CLIPPING,
                                       self.GRADIENT_CLIPPING)
        self.opt.step()

    def update_target_network(self):
        if not self.SOFT_UPDATE:
            self.update_step = (self.update_step + 1) % self.TARGET_UPDATE_FREQ
            if self.update_step == 0:
                state_dict = self.model.state_dict()
                self.target_model.load_state_dict(copy.deepcopy(state_dict))
        else:
            tw = self.target_model.state_dict().values()
            sw = self.model.state_dict().values()
            for t, s in zip(tw, sw):
                t.add_(self.TARGET_UPDATE_FREQ * (s - t))

    def _forward(self, obs, is_train, update_memory):
        if self.state_processor:
            state = self.state_processor(obs)
        else:
            temp = obs[None, :] if len(obs.shape) == 1 else obs[None, None, :]
            state = torch.from_numpy(temp).type(FloatTensor)

        if self.GET_DEMO:
            action = self.rule_processor(obs)
        else:
            action = self.select_action(state)

        act = action.numpy().squeeze()
        if self.VERBOSE:
            print("action: {}".format(act))
        action_step = self.ACTION_REPEAT
        reward = 0
        done = False
        while action_step > 0:
            action_step -= 1
            next_obs, r, done, _ = self.env.step(act)

            # CartPole reward
            # x, x_dot, theta, theta_dot = next_obs
            # r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8
            # r2 = (self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5
            # r = r1 + r2

            # MountainCar reward
            # position, velocity = next_obs
            # r = abs(position - (-0.5))

            reward += r
            if done:
                break

        self.reward_episode += reward
        if update_memory:
            reward = FloatTensor([reward])
            self.memory.push(state, action, reward)
            if done:
                self.memory.push(None, None, None)

        if len(self.memory) >= self.REPLAY_START and is_train:
            self.update_policy()
            self.update_target_network()

        if self.is_render:
            self.env.render()

        return next_obs, done

    def fit(self,
            is_train,
            update_memory=True,
            num_step=np.inf,
            num_episode=np.inf,
            max_episode_length=np.inf,
            is_render=False):
        if num_step == np.inf and num_episode == np.inf:
            raise Exception("Either num_step or num_episode must be finite")
        if num_step != np.inf and num_episode != np.inf:
            raise Exception("Specify only one of num_step and num_episode")

        self.is_render = is_render
        while self.i_episode < num_episode and self.i_step < num_step:
            self.i_episode += 1
            print("------------------------")
            print("episode: {}, step: {}".format(self.i_episode, self.i_step))
            obs = self.env.reset()
            self.reward_episode = 0
            episode_step = 0
            while episode_step < max_episode_length:
                episode_step += 1
                self.i_step += 1
                obs, done = self._forward(obs, is_train, update_memory)
                if done:
                    self.reward_step_pairs.push(self.reward_episode,
                                                self.i_step)
                    if self.is_test:
                        self.container.add("reward", self.reward_episode,
                                           self.record_i_step)
                    self.print(is_train)
                    break

    def train(self, **kwargs):
        self.is_training = True
        if kwargs.pop("clear", True):
            self.i_episode = 0
            self.i_step = 0
            self.reward_step_pairs.reset()
        print("Training starts...")
        self.fit(True, **kwargs)
        # self.model.save()
        self.container.save()

    def run(self, **kwargs):
        self.is_training = False
        if kwargs.pop("clear", True):
            self.i_episode = 0
            self.i_step = 0
            self.reward_step_pairs.reset()
        print("Running starts...")
        self.fit(False, **kwargs)

    def _test(self, num_step):
        self.record_i_episode = self.i_episode
        self.record_i_step = self.i_step
        self.is_test = True
        self.run(num_step=num_step)
        self.i_episode = self.record_i_episode
        self.i_step = self.record_i_step
        self.is_test = False

    def train_test(self, num_step, test_period=1000, test_step=100):
        self.i_episode = 0
        self.i_step = 0
        while self.i_step < num_step:
            self._test(test_step)
            self.train(num_step=self.record_i_step + test_period, clear=False)
        self._test(test_step)

    def print(self, is_train):
        print("reward_episode {}".format(self.reward_episode))
        print("eps {}".format(self.eps))
        if is_train:
            print("loss_episode {}".format(self.container.get("loss")))
            print("y_pred_episode {}".format(self.container.get("y_pred")))
Example #7
class DQNagent(object):
    def __init__(self, filename='dqn0'):
        self.filename = './trained_agents/' + filename
        self.policy_net = DQN(self.filename + '.cfg')
        self.target_net = DQN(self.filename + '.cfg')
        self.memory = ReplayMemory(16384)
        self.gamma = 0.999

    def select_action(self, state, epsilon):
        if np.random.rand() < epsilon:
            idx = LongTensor([[random.randrange(self.policy_net.output_size)]])
        else:
            idx = self.policy_net(
                Variable(state,
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1)
        return idx

    def update(self, batch_size=16):
        if len(self.memory.memory) < batch_size:
            batch_size = len(self.memory.memory)

        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))

        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = Variable(torch.zeros(batch_size).type(Tensor))
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0]

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        expected_state_action_values = Variable(
            expected_state_action_values.data)

        loss = F.mse_loss(state_action_values, expected_state_action_values)

        old_params = freeze_as_np_dict(self.policy_net.state_dict())
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            logging.debug(param.grad.data.sum())
            param.grad.data.clamp_(-1., 1.)
        self.optimizer.step()

        new_params = freeze_as_np_dict(self.policy_net.state_dict())
        check_params_changed(old_params, new_params)
        return loss.data[0]

    def train(self,
              env,
              n_epochs=30,
              epsilon_init=1.,
              epsilon_schedule='exp',
              eps_decay=None,
              lr=0.001,
              batch_size=32):
        if epsilon_schedule == 'linear':
            eps_range = np.linspace(epsilon_init, 0., n_epochs)
        elif epsilon_schedule == 'constant':
            eps_range = [epsilon_init for _ in range(n_epochs)]
        elif epsilon_schedule == 'exp':
            if not eps_decay:
                eps_decay = n_epochs // 4
            eps_range = [
                epsilon_init * math.exp(-1. * i / eps_decay)
                for i in range(n_epochs)
            ]
        else:
            raise ValueError('Unknown epsilon_schedule: {}'.format(epsilon_schedule))

        history_file = open(self.filename + 'history', mode='a+')
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

        losses, rewards, change_history = [], [], []

        for epoch in range(n_epochs):
            env.reset()
            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen
            done = False
            epoch_losses = []
            epoch_rewards = []
            video = []

            while not done:
                if epoch % 10 == 1:
                    video.append(last_screen)
                action = self.select_action(state, eps_range[epoch])

                _, reward, done, _ = env.step(action[0, 0])

                last_screen = current_screen
                current_screen = get_screen(env)

                reward = Tensor([reward])
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None

                self.memory.push(state, action, next_state, reward)
                state = next_state
                loss = self.update(batch_size=batch_size)

                epoch_losses.append(loss)
                epoch_rewards.append(reward)

            history_file.write(
                'Epoch {}: loss= {}, reward= {}, duration= {}\n'.format(
                    epoch, np.mean(epoch_losses), np.sum(epoch_rewards),
                    len(epoch_rewards)))

            losses.append(np.mean(epoch_losses))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 10 == 1:
                self.target_net.load_state_dict(self.policy_net.state_dict())
                self.save(ext=str(epoch))
                self.make_video(video, ext='_train_' + str(epoch))

                with open(self.filename + '.train_losses', 'a+') as f:
                    for l in losses:
                        f.write(str(l) + '\n')
                losses = []
                with open(self.filename + '.train_rewards', 'a+') as f:
                    for r in rewards:
                        f.write(str(r) + '\n')
                rewards = []
        self.save()

    def test(self, env, n_epochs=30, verbose=False):
        rewards = []
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()

        for epoch in range(n_epochs):
            env.reset()
            done = False
            epoch_rewards = []
            video = []

            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen

            while not done:
                if epoch % 5 == 0:
                    video.append(last_screen)
                action = self.select_action(state, 0.)

                _, reward, done, _ = env.step(action[0, 0])
                last_screen = current_screen
                current_screen = get_screen(env)

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None

                epoch_rewards.append(reward)
                reward = Tensor([reward])
                state = next_state

                logging.debug(
                    'Test epoch {} :  reward= {}, duration= {}'.format(
                        epoch, np.sum(epoch_rewards), len(epoch_rewards)))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 5 == 0:
                self.make_video(video, ext='_test_' + str(epoch))

            logging.info('Performance estimate : {} pm {}'.format(
                np.mean(rewards), np.std(rewards)))

    def make_video(self, replay, ext=''):
        n_frames = len(replay)
        b_s, n_channels, n_w, n_h = replay[0].shape
        writer = VideoWriter(self.filename + ext + '.mp4')
        for i in range(n_frames):
            writer.writeFrame(replay[i][0][[1, 2, 0]] * 255)
        writer.close()

    def save(self, ext=''):
        torch.save(self.policy_net.state_dict(),
                   self.filename + ext + '.pol.ckpt')
        torch.save(self.target_net.state_dict(),
                   self.filename + ext + '.tgt.ckpt')

    def load(self, filename):
        self.policy_net.load_state_dict(
            torch.load('./trained_agents/' + filename + '.pol.ckpt'))
        self.target_net.load_state_dict(
            torch.load('./trained_agents/' + filename + '.tgt.ckpt'))
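
update() above calls freeze_as_np_dict and check_params_changed, which are not shown; they snapshot the network weights before the optimizer step and verify that something actually moved. Plausible sketches (assumed, since the originals are not included in this snippet):

import logging

import numpy as np

def freeze_as_np_dict(state_dict):
    # Detach every tensor to a numpy copy so later updates cannot mutate it.
    return {k: v.cpu().numpy().copy() for k, v in state_dict.items()}

def check_params_changed(old, new):
    # Log any parameter tensor that did not change at all during the update.
    for key in old:
        if np.array_equal(old[key], new[key]):
            logging.debug('Parameter %s unchanged after optimizer step', key)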
Example #8
def train(agent, env, num_episode=50, test_interval=25, num_test=20, num_iteration=200, iteration_cutoff=0,
          BATCH_SIZE=128, num_sample=50, action_space=[-1,1], debug=True, memory=None, seed=2020,
          update_mode=UPDATE_PER_ITERATION, reward_mode=FUTURE_REWARD_NO, gamma=0.99, 
          loss_history=[], loss_historyA=[], lr_history=[], lr_historyA=[], reward_mean_var=(0,-1),
          save_sim_intv=50, save_sim_fnames=[], imdir='screencaps/', useVid=False, save_intm_models=False,
          not_use_rand_in_action=False, not_use_rand_in_test=True, 
         return_memory=False):
    test_hists = []
    steps = 0
    if memory is None:
        ### Update 11/05: changed memory size based on the number of agents
        memory = ReplayMemory(1000 * env.N)
    if iteration_cutoff <= 0:
        iteration_cutoff = num_iteration # Save all iterations into the memory
    
    # Values that would be useful
    N = env.N
    # Note that the seed only controls the numpy random, which affects the environment.
    # To affect pytorch, refer to further documentations: https://github.com/pytorch/pytorch/issues/7068
    np.random.seed(seed)
#     torch.manual_seed(seed)
    test_seeds = np.random.randint(0, 5392644, size=int(num_episode // test_interval)+1)
    
#     rmean = 0
#     rvar = -1
    (rmean, rvar) = reward_mean_var

    for e in range(num_episode):
        steps = 0
        state = env.reset()
        if agent.centralized:
            state = env.state
        state = torch.from_numpy(state).float()
        state = Variable(state)
        if debug:
            env.render()
        # Train History
        state_pool = []
        action_pool = []
        reward_pool = []
        next_state_pool = []
        loss_history.append([])
        loss_historyA.append([])

        for t in range(num_iteration):
#             agent.net.train()
            agent.set_train(True)
            # Try to pick an action, react, and store the resulting behavior in the pool here
            if agent.centralized:
                action = agent.select_action(state, **{
                        'steps_done':t, 'num_sample':50, 'action_space':action_space, 'rand':not_use_rand_in_action
                    }).T
            else:
                actions = []
                for i in range(N):
                    action = agent.select_action(state[i], **{
                        'steps_done':t, 'num_sample':50, 'action_space':action_space, 'rand':not_use_rand_in_action
                    })
                    actions.append(action)
                if torch.is_tensor(action):
                    action = torch.cat(actions).view(-1,env.N)#.T
                else:
                    action = np.array(actions).T # Shape would become (2,N)

            if torch.is_tensor(action):
                next_state, reward, done, _ = env.step(action.detach().numpy())
            else:
                next_state, reward, done, _ = env.step(action)
                
            if agent.centralized:
                next_state = env.state
            next_state = Variable(torch.from_numpy(next_state).float()) # The float() probably avoids bug in net.forward()
            action = action.T # Turn shape back to (N,2)

            if agent.needsExpert:
                # If we need to use expert input during training, then we consult it and get the best action for this state
                actions = env.controller()
                action = actions.T # Shape should already be (2,N), so we turn it into (N,2)
            
            if not(agent.centralized):
                # if reward_mode & FUTURE_REWARD_YES == 0:
                #     # Push everything directly inside if we don't use future discounts
                #     for i in range(N):
                #         memory.push(state[i], action[i], next_state[i], reward[i])
                # else:
                #     # Store and push them outside the loop
                #     state_pool.append(state)
                #     action_pool.append(action)
                #     reward_pool.append(reward)
                #     next_state_pool.append(next_state)
                pass
            else:
                # if reward_mode & FUTURE_REWARD_YES == 0:
                #     # Push everything directly inside if we don't use future discounts
                #     memory.push(state, action, next_state, reward)
                # else:
                #     # Store and push them outside the loop
                #     state_pool.append(state)
                #     action_pool.append(action)
                #     reward_pool.append(reward)
                #     next_state_pool.append(next_state)
                # Centralized training should directly use the real states, instead of observations
                reward = np.sum(reward)

            # Update 1028: Moved this training step outside the loop
            if update_mode == UPDATE_PER_ITERATION:
                # Added 1214: Push the samples to memory if no need for extra processing
                if reward_mode & FUTURE_REWARD_YES == 0 and reward_mode & FUTURE_REWARD_NORMALIZE == 0:
                    if agent.centralized:
                        memory.push(state, action, next_state, reward, reward)
                    else:
                        for i in range(N):
                            memory.push(state[i], action[i], next_state[i], reward[i], reward[i])
                # Learn
                if len(memory) >= BATCH_SIZE:
                    transitions = memory.sample(BATCH_SIZE)
                    batch = Transition(*zip(*transitions))
                    agent.optimize_model(batch, **{'B':BATCH_SIZE})
                elif len(memory) > 0:
                    transitions = memory.sample(len(memory))
                    batch = Transition(*zip(*transitions))
                    agent.optimize_model(batch, **{'B':len(memory)})
                loss_history[-1].append(agent.losses[:])
#                 print(e,t,agent.losses)
                agent.losses=[]
                # Also record scheduler history for learning rate. If the scheduler is a Plateau one, then
                # we can know from the learning rate if we're in a flatter area.
                # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2
                # The scheduler requires the validation loss - can I just use the average training loss instead?
#                 try:
#                     agent.scheduler.step(np.mean(loss_history[-1]))
#                     lr_history.append(agent.optimizer.param_groups[0]['lr'])
#                 except:
#                     agent.schedulerC.step(np.mean(loss_history[-1]))
#                     lr_history.append(agent.optimizerC.param_groups[0]['lr'])
                try:
                    loss_historyA[-1].append(agent.lossesA[:])
                    agent.lossesA=[]
#                     agent.schedulerA.step(np.mean(loss_historyA[-1]))
#                     lr_historyA.append(agent.optimizerA.param_groups[0]['lr'])
                except:
                    pass
            elif update_mode == UPDATE_ON_POLICY:
                # This case would ditch sampling, and just update by the current thing.
                # Note that methods that use future cumulative reward would be highly incompatible with this...
                if not(agent.centralized) or reward_mode & FUTURE_REWARD_YES != 0:
                    print("Error: Update-on-policy might be incompatible with decentralized planning or cumulative reward")
                    return None
                if rvar == -1 and rmean == 0 and reward_mode & FUTURE_REWARD_NORMALIZE != 0:
                    rvar = np.abs(reward)
                    rmean = reward
                reward = (reward - rmean) / rvar
                
                batch = Transition(state, action, next_state, [[reward]], [[reward]])
                agent.optimize_model(batch, **{'B':1})
#                 batch = Transition(state, action, next_state, reward, reward)
# #                 transitions = [batch,batch]
# #                 agent.optimize_model(Transition(*zip(*transitions)), **{'B':2})
#                 transitions = [batch,batch]
#                 agent.optimize_model(batch, **{'B':1})
                loss_history[-1].append(agent.losses[:])
                agent.losses=[]
                try:
                    loss_historyA[-1].append(agent.lossesA[:])
                    agent.lossesA=[]
                except:
                    pass
                
            else:
                # Store and push them outside the loop
                state_pool.append(state)
                if torch.is_tensor(action):
                    action_pool.append(action.detach().numpy())
                else:
                    action_pool.append(action)
                reward_pool.append(reward)
                next_state_pool.append(next_state)
                    
            state = next_state
            steps += 1

            if debug:
                env.render()

            if debug and done:
                print("Took ", t, " steps to converge")
                break
        
        # Now outside the iteration loop - prepare for per-episode trainings
        if update_mode == UPDATE_ON_POLICY:
            pass
        elif update_mode == UPDATE_PER_EPISODE:
            inst_reward = torch.tensor(reward_pool)
            if reward_mode & FUTURE_REWARD_YES != 0:
                for j in range(len(reward_pool)):  ### Note: an earlier version mistakenly iterated over "reward" here, a bug that may have affected results
                    if j > 0:
                        reward_pool[-j-1] += gamma * reward_pool[-j]
            reward_pool = torch.tensor(reward_pool)
            if reward_mode & FUTURE_REWARD_NORMALIZE != 0:
                if rvar == -1 and rmean == 0:
                    rmean = reward_pool.mean()
                    rvar = reward_pool.std()
                    print("Updated mean and stdev: {0} and {1}".format(rmean.numpy(), rvar.numpy()))
                reward_pool = (reward_pool - rmean) / rvar
                inst_reward = (inst_reward - rmean) / rvar

            # Update: 0106 added option to only push the first few iterations into the memory.
            # if agent.centralized:
            # #             print(state_pool[0].shape, action_pool[0].shape)
            #     for j in range(len(reward_pool)):
            #         memory.push(state_pool[-j-1], action_pool[-j-1], 
            #                     next_state_pool[-j-1], reward_pool[-j-1], inst_reward[-j-1])
            # else:
            #     for j in range(len(reward_pool)):
            #         for i in range(N):
            #             memory.push(state_pool[-j-1][i], action_pool[-j-1][i], 
            #                         next_state_pool[-j-1][i], reward_pool[-j-1][i], inst_reward[-j-1][i])
            if agent.centralized:
                for j in range(iteration_cutoff):
                    print(j, len(reward_pool))
                    memory.push(state_pool[j], action_pool[j], 
                                next_state_pool[j], reward_pool[j], inst_reward[j])
            else:
                for j in range(iteration_cutoff):
                    for i in range(N):
                        memory.push(state_pool[j][i], action_pool[j][i], 
                                    next_state_pool[j][i], reward_pool[j][i], inst_reward[j][i])
            

        if update_mode == UPDATE_PER_EPISODE:
            if len(memory) >= BATCH_SIZE:
                transitions = memory.sample(BATCH_SIZE)
                batch = Transition(*zip(*transitions))
                agent.optimize_model(batch, **{'B':BATCH_SIZE})
            elif len(memory) > 0:
                transitions = memory.sample(len(memory))
                batch = Transition(*zip(*transitions))
                agent.optimize_model(batch, **{'B':len(memory)})
            loss_history[-1].append(agent.losses[:])
            agent.losses=[]
            # Also record scheduler history for learning rate. If the scheduler is a Plateau one, then
            # we can know from the learning rate if we're in a flatter area.
            # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2
#             try:
#                 agent.scheduler.step(np.mean(loss_history[-1]))
#                 lr_history.append(agent.optimizer.param_groups[0]['lr'])
#             except:
#                 agent.schedulerC.step(np.mean(loss_history[-1]))
#                 lr_history.append(agent.optimizerC.param_groups[0]['lr'])
            try:
                loss_historyA[-1].append(agent.lossesA[:])
                agent.lossesA=[]
#                 agent.schedulerA.step(np.mean(loss_historyA[-1]))
#                 lr_historyA.append(agent.optimizerA.param_groups[0]['lr'])
            except:
                pass
        
        if debug:
            print("Episode ", e, " finished; t = ", t)
        
        if e % test_interval == 0:
            print("Test result at episode ", e, ": ")
            test_hist = test(agent, env, num_test, num_iteration, num_sample, action_space, 
                             seed=test_seeds[int(e/test_interval)], debug=debug, not_use_rand_in_action=not_use_rand_in_test)
            test_hists.append(test_hist)
        
        # Save demos of simulation if wanted
        if e % save_sim_intv == (save_sim_intv-1) and e > 0:
            try:
                fnames = [f+'_{0}'.format(e) for f in save_sim_fnames]
                plot_test(agent, env, fnames=fnames,
                    num_iteration=num_iteration, action_space=action_space, imdir=imdir,
                    debug=debug, useVid=useVid, not_use_rand=not_use_rand_in_test)
                for f in fnames:
                    os.system('ffmpeg -y -pattern_type glob -i "'+imdir+f+'*.jpg" '+f+'.gif')
            except:
                print("Failed to save simulation at e={0}".format(e))
            if save_intm_models and len(save_sim_fnames) > 0:
                agent.save_model(save_sim_fnames[0]+'_{0}'.format(e))
    if return_memory:
        return test_hists, memory
    else:
        return test_hists
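
Unlike the 4-field Transition used by most of the other examples, this trainer pushes five values per transition (the discounted return and the instantaneous reward are stored separately), so the Transition it expects presumably looks like the sketch below:

from collections import namedtuple

# Field names are assumptions based on how memory.push is called above.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'inst_reward'))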
Example #9
class Agent(object):
    def __init__(self,
                 state_space,
                 n_actions,
                 replay_buffer_size=50000,
                 batch_size=32,
                 hidden_size=12,
                 gamma=0.98):
        self.n_actions = n_actions
        self.state_space_dim = state_space
        self.policy_net = DQN(state_space, n_actions, hidden_size)
        self.target_net = DQN(state_space, n_actions, hidden_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-3)
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Compute the expected Q values (Bellman target)
        expected_state_action_values = reward_batch + self.gamma * next_state_values

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values.squeeze(),
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        sample = random.random()
        if sample > epsilon:
            with torch.no_grad():
                state = torch.from_numpy(state).float()
                q_values = self.policy_net(state)
                return torch.argmax(q_values).item()
        else:
            return random.randrange(self.n_actions)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
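
A minimal driver loop for this Agent, assuming the classic Gym API in which reset() returns only the observation and step() returns four values ('CartPole-v1' is used purely as an illustration):

import gym

env = gym.make('CartPole-v1')
agent = Agent(state_space=env.observation_space.shape[0],
              n_actions=env.action_space.n)

for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state, epsilon=0.05)
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, next_state, reward, done)
        agent.update_network()
        state = next_state
    if episode % 10 == 0:
        agent.update_target_network()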
Example #10
class DDPG_Agent:
    def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
        self.args = args
        self.alow = alow
        self.ahigh = ahigh
        self.policy = Policy_net(ob_sp, act_sp)
        self.policy_targ = Policy_net(ob_sp, act_sp)
        self.qnet = Q_net(ob_sp, act_sp)
        self.qnet_targ = Q_net(ob_sp, act_sp)

        self.policy.to(device)
        self.qnet.to(device)
        self.policy_targ.to(device)
        self.qnet_targ.to(device)
        self.MSE_loss = nn.MSELoss()
        self.noise = OUNoise(1, 1)

        hard_update(self.policy_targ, self.policy)
        hard_update(self.qnet_targ, self.qnet)

        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
        self.memory = ReplayMemory(int(1e6))
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS,
                                                FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer

    def get_action(self, state):
        if self.args.use_ounoise:
            noise = self.noise.sample()[0]
        else:
            noise = np.random.normal(
                0, self.epsilon_scheduler.value(self.n_steps))
        st = torch.from_numpy(state).view(1, -1).float()
        action = self.policy(st)
        action_with_noise = np.clip(action.item() + noise, self.alow,
                                    self.ahigh)
        if self.args.use_writer:
            self.writer.add_scalar("action mean", action.item(), self.n_steps)
            self.writer.add_scalar("action noise", noise, self.n_steps)
            self.writer.add_scalar("epsilon",
                                   self.epsilon_scheduler.value(self.n_steps),
                                   self.n_steps)
            self.writer.add_scalar("action", action_with_noise, self.n_steps)
        self.n_steps += 1
        return action_with_noise

    def store_transition(self, state, action, reward, next_state, done):

        self.memory.push(torch.from_numpy(state), torch.tensor(action),
                         torch.tensor(reward), torch.from_numpy(next_state),
                         torch.tensor(done))

    def reset(self):
        self.noise.reset()

    def train(self):
        batch = self.memory.sample(min(BATCH_SIZE, len(self.memory)))
        b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))]
        states, actions, rewards, next_states, dones = \
            b_dict[0], b_dict[1].view(-1, 1), \
            b_dict[2].view(-1, 1).float().to(device), b_dict[3], \
            b_dict[4].view(-1, 1).float().to(device)

        # CRITIC LOSS: minimize (Q(s, a) - (r + gamma * Q'(s', π'(s'))))^2
        # inputs computation
        inputs_critic = self.qnet(states, actions)
        # targets are constants for the critic, so compute them without tracking gradients
        with torch.no_grad():
            policy_acts = self.policy_targ(next_states)
            targ_values = self.qnet_targ(next_states, policy_acts)
        targets_critics = rewards + GAMMA * (1 - dones) * targ_values
        loss_critic = self.MSE_loss(inputs_critic, targets_critics)
        self.q_optimizer.zero_grad()
        loss_critic.backward()
        # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP)
        self.q_optimizer.step()

        # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø
        actor_loss = -self.qnet(states, self.policy(states)).mean()
        self.p_optimizer.zero_grad()
        actor_loss.backward()
        # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP)
        self.p_optimizer.step()
        soft_update(self.policy_targ, self.policy, TAU)
        soft_update(self.qnet_targ, self.qnet, TAU)
        if self.args.use_writer:
            self.writer.add_scalar("critic_loss", loss_critic.item(),
                                   self.n_updates)
            self.writer.add_scalar("actor_loss", actor_loss.item(),
                                   self.n_updates)
        self.n_updates += 1
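
hard_update and soft_update are used above but not shown. They are the standard DDPG parameter-copy helpers; a sketch under that assumption:

import torch

def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    target.load_state_dict(source.state_dict())

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- (1 - tau) * theta_target + tau * theta_source
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)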
Example #11
class Agent(nn.Module):
    def __init__(self, q_models, target_model, hyperbolic, k, gamma,
                 model_params, replay_buffer_size, batch_size, inp_dim, lr):
        super(Agent, self).__init__()
        if hyperbolic:
            self.q_models = torch.nn.ModuleList(q_models)
            self.target_models = torch.nn.ModuleList(target_model)
        else:
            self.q_models = q_models
            self.target_models = target_model
        self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr)
        self.hyperbolic = hyperbolic
        self.n_actions = model_params.act_space
        self.k = k
        self.gamma = gamma
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.inp_dim = inp_dim

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    @staticmethod
    def get_hyperbolic_train_coeffs(k, num_models):
        coeffs = []
        gamma_intervals = np.linspace(0, 1, num_models + 2)
        for i in range(1, num_models + 1):
            coeffs.append(((gamma_intervals[i + 1] - gamma_intervals[i]) *
                           (1 / k) * gamma_intervals[i]**((1 / k) - 1)))
        return torch.tensor(coeffs) / sum(coeffs)

    def get_action(self, state_batch, epsilon=0.05):
        # Epsilon-greedy: explore with probability epsilon, otherwise act
        # greedily on the (hyperbolically weighted) Q-values.
        if random.random() < epsilon:
            return random.randrange(self.n_actions)
        with torch.no_grad():
            state_batch = torch.tensor(state_batch,
                                       dtype=torch.float32).view(
                                           -1, self.inp_dim)
            if self.hyperbolic:
                model_outputs = []
                for mdl in self.q_models:
                    model_outputs.append(mdl(state_batch))
                coeff = self.get_hyperbolic_train_coeffs(
                    self.k, len(self.q_models))
                model_outputs = torch.cat(model_outputs, 1).reshape(
                    -1, len(self.q_models))
                # Combine the heads with their hyperbolic coefficients.
                model_outputs = (model_outputs * coeff).sum(dim=1)
                return torch.argmax(model_outputs).item()
            else:
                return torch.argmax(self.q_models(state_batch)).item()

    def get_state_act_vals(self, state_batch, action_batch=None):
        if self.hyperbolic:
            model_outputs = []
            for ind, mdl in enumerate(self.q_models):
                model_outputs.append(mdl(state_batch).gather(1, action_batch))
            model_outputs = torch.cat(model_outputs,
                                      1).reshape(-1, len(self.q_models))
            coeffs = self.get_hyperbolic_train_coeffs(self.k,
                                                      len(self.q_models))
            model_outputs = model_outputs * coeffs
            return model_outputs.sum(dim=1).reshape(-1, 1)
        else:
            model_output = self.q_models(state_batch).gather(1, action_batch)
            return model_output

    def get_max_next_state_vals(self, non_final_mask, non_final_next_states):
        if self.hyperbolic:
            target_outputs = []
            gammas = torch.tensor(np.linspace(0, 1,
                                              len(self.q_models) + 1),
                                  dtype=torch.float)[1:]
            for ind, mdl in enumerate(self.target_models):
                next_state_values = torch.zeros(self.batch_size)
                next_state_values[non_final_mask] = mdl(
                    non_final_next_states).max(1)[0].detach()
                target_outputs.append(next_state_values)
            # Stack the per-head values into shape (batch_size, n_heads), then
            # apply each head's discount factor.
            target_outputs = torch.stack(target_outputs, dim=1)
            target_outputs = target_outputs * gammas
            return target_outputs

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.get_state_act_vals(state_batch,
                                                      action_batch)
        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        state_action_values = state_action_values.view(-1, 1).repeat(
            1, len(self.q_models))
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        expected_state_action_values = next_state_values + reward_batch.view(
            -1, 1).repeat(1, len(self.q_models))
        loss = (state_action_values - expected_state_action_values)**2
        coefs = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models))
        loss = torch.sum(loss * coefs)
        # loss = F.smooth_l1_loss(state_action_values.squeeze(),
        #                         expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        self.target_models.load_state_dict(self.q_models.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
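
The coefficients returned by get_hyperbolic_train_coeffs are a normalized Riemann-sum approximation of the integral that expresses a hyperbolic discount 1/(1 + k*t) as a mixture of exponential discounts, one per Q-head. A quick sanity check (k and the head count are arbitrary here):

coeffs = Agent.get_hyperbolic_train_coeffs(k=0.1, num_models=5)
print(coeffs, coeffs.sum())   # five positive weights that sum to 1.0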
Example #12
class Agent:
    
    def __init__(self, args):

        # which Gym environment to load (Atari Pong with frame-skipping disabled)
        self.env_id = "PongNoFrameskip-v4"
        # create the environment
        self.env = Environment(self.env_id)

        # part of the q-value formula
        self.discount_factor = 0.99
        self.batch_size = 64
        # how often to update the network (backpropagation)
        self.update_frequency = 4
        # how often to synchronize with the target network
        self.target_network_update_freq = 1000

        # keeps track of the frames for training, and retrieves them in batches 
        self.agent_history_length = 4
        self.memory = ReplayMemory(capacity=10000, batch_size=self.batch_size)

        # two neural networks. One for main and one for target
        self.main_network = PongNetwork(num_actions=self.env.get_action_space_size(), agent_history_length=self.agent_history_length)
        self.target_network = PongNetwork(num_actions=self.env.get_action_space_size(), agent_history_length=self.agent_history_length)
        
        # adam optimizer. just a standard procedure
        self.optimizer = Adam(learning_rate=1e-4, epsilon=1e-6)
        # we start with a high exploration rate then slowly decrease it
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.replay_start_size = 10000

        # loss function and logging metrics
        self.loss = tf.keras.losses.Huber()
        # running mean of the training loss
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        # running mean of the predicted Q-values
        self.q_metric = tf.keras.metrics.Mean(name="Q_value")

        # maximum number of frames to train for (training usually stops well before this)
        self.training_frames = int(1e7)

        # path to save the checkpoints, logs and the weights
        self.checkpoint_path = "./checkpoints/" + args.run_name
        self.tensorboard_writer = tf.summary.create_file_writer(self.checkpoint_path + "/runs/")
        self.print_log_interval = 10
        self.save_weight_interval = 10
        self.env.reset()
           

     # calculate the network loss on the replay buffer (Q-learning)
    def update_main_q_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
       
        with tf.GradientTape() as tape:
            ## THIS IS WHERE THE MAGIC HAPPENS!
            ## loss = Huber(Q(s, a), r + discount_factor * max_a' Q_target(s', a'))
            next_state_q = self.target_network(next_state_batch)
            next_state_max_q = tf.math.reduce_max(next_state_q, axis=1)
            expected_q = reward_batch + self.discount_factor * next_state_max_q * (1.0 - tf.cast(terminal_batch, tf.float32))
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)

        return loss

    
     # calculate the network loss on the replay buffer (Double Q-learning)
    def update_main_dq_network(self, state_batch, action_batch, reward_batch, next_state_batch, terminal_batch):
        
        with tf.GradientTape() as tape:
            # THIS IS WHERE THE MAGIC HAPPENS!
            ## Double Q-learning: the online network selects the best next action, the target network evaluates it
            q_online = self.main_network(next_state_batch)  # Use q values from online network
            action_q_online = tf.math.argmax(q_online, axis=1)  # optimal actions from the q_online
            q_target = self.target_network(next_state_batch)  # q values from the target network
            ddqn_q = tf.reduce_sum(q_target * tf.one_hot(action_q_online, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            expected_q = reward_batch + self.discount_factor * ddqn_q * (1.0 - tf.cast(terminal_batch, tf.float32))  # Corresponds to equation (4) in ddqn paper
            main_q = tf.reduce_sum(self.main_network(state_batch) * tf.one_hot(action_batch, self.env.get_action_space_size(), 1.0, 0.0), axis=1)
            loss = self.loss(tf.stop_gradient(expected_q), main_q)

        gradients = tape.gradient(loss, self.main_network.trainable_variables)
        clipped_gradients = [tf.clip_by_norm(grad, 10) for grad in gradients]
        self.optimizer.apply_gradients(zip(clipped_gradients, self.main_network.trainable_variables))

        self.loss_metric.update_state(loss)
        self.q_metric.update_state(main_q)

        return loss
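    # Illustrative (not from the original code): the only difference from the method
    # above is how the next-state value is formed. Given next-state Q values of
    #   online:  [[1.0, 3.0]]   target:  [[2.0, 0.5]]
    # vanilla DQN would use max(target) = 2.0, while Double DQN picks the online
    # argmax (action 1) and evaluates it with the target network, giving 0.5.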



    # get the next action index based on the state (84,84,4) and exploration rate
    def get_action(self, state, exploration_rate):
        recent_state = tf.expand_dims(state, axis=0)
        if tf.random.uniform((), minval=0, maxval=1, dtype=tf.float32) < exploration_rate:
            action = tf.random.uniform((), minval=0, maxval=self.env.get_action_space_size(), dtype=tf.int32)
        else:
            q_value = self.main_network(tf.cast(recent_state, tf.float32))
            action = tf.cast(tf.squeeze(tf.math.argmax(q_value, axis=1)), dtype=tf.int32)
        return action
        
    
    # get the epsilon value for the current step. Similar to https://openai.com/blog/openai-baselines-dqn/
    def get_eps(self, current_step, terminal_eps=0.01, terminal_frame_factor=25):
    
        terminal_eps_frame = self.final_explr_frame * terminal_frame_factor

        if current_step < self.replay_start_size:
            eps = self.init_explr
        elif self.replay_start_size <= current_step and current_step < self.final_explr_frame:
            eps = (self.final_explr - self.init_explr) / (self.final_explr_frame - self.replay_start_size) * (current_step - self.replay_start_size) + self.init_explr
        elif self.final_explr_frame <= current_step and current_step < terminal_eps_frame:
            eps = (terminal_eps - self.final_explr) / (terminal_eps_frame - self.final_explr_frame) * (current_step - self.final_explr_frame) + self.final_explr
        else:
            eps = terminal_eps
        return eps
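    # Illustrative (not from the original code): with the constants set in __init__
    # (init_explr=1.0, final_explr=0.1, final_explr_frame=1e6, replay_start_size=1e4)
    # and the defaults above (terminal_eps=0.01, terminal_frame_factor=25), epsilon is
    #   1.0   for the first 10,000 steps,
    #   ~0.55 at step 505,000 (halfway through the first linear segment),
    #   0.1   at step 1,000,000,
    # and then decays linearly to 0.01 by step 25,000,000, staying there afterwards.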
    
        
    # copy over the weights between the main and target network to synchronize
    def update_target_network(self):
        main_vars = self.main_network.trainable_variables
        target_vars = self.target_network.trainable_variables
        for main_var, target_var in zip(main_vars, target_vars):
            target_var.assign(main_var)

    def train(self, algorithm='q'):
    
        total_step = 0
        episode = 0
        latest_mean_score = -99.99
        latest_100_score = deque(maxlen=100)
        # somewhat arbitrary, but a well-trained agent typically reaches an average score of about 20 in Pong
        max_reward = 20.0

        # train until the mean reward reaches 20
        while latest_mean_score < max_reward:
            
            # reset the variable for the upcoming episode
            state = self.env.reset()
            episode_step = 0
            episode_score = 0.0
            done = False


            while not done:
                # while the episode is not done, calculate the epsilon and get the next action
                eps = self.get_eps(tf.constant(total_step, tf.float32))
                action = self.get_action(tf.constant(state), tf.constant(eps, tf.float32))
            
                next_state, reward, done, info = self.env.step(action)
                episode_score += reward

                self.memory.push(state, action, reward, next_state, done)
                state = next_state

                # update the network
                if (total_step % self.update_frequency == 0) and (total_step > self.replay_start_size):
                    indices = self.memory.get_minibatch_indices()
                    state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.generate_minibatch_samples(indices)
                    if algorithm == 'q':
                        self.update_main_q_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)
                    else:
                        self.update_main_dq_network(state_batch, action_batch, reward_batch, next_state_batch, terminal_batch)

                if (total_step % self.target_network_update_freq == 0) and (total_step > self.replay_start_size):
                    self.update_target_network()
                
                total_step += 1
                episode_step += 1

                if done:
                    latest_100_score.append(episode_score)
                    # update the running mean every episode so the stopping condition reacts promptly
                    latest_mean_score = np.mean(latest_100_score)
                    self.write_summary(episode, latest_100_score, episode_score, total_step, eps)
                    episode += 1

                    if episode % self.print_log_interval == 0:
                        print("Episode: ", episode)
                        print("Latest 100 avg: {:.4f}".format(latest_mean_score))
                        print("Progress: {} / {} ( {:.2f} % )".format(
                            total_step, self.training_frames,
                            total_step / self.training_frames * 100))

                    if episode % self.save_weight_interval == 0:
                        print("Saving weights...")
                        self.main_network.save_weights(self.checkpoint_path + "/weights/episode_{}".format(episode))


    # write the summaries back to the tensorboard
    def write_summary(self, episode, latest_100_score, episode_score, total_step, eps):

        with self.tensorboard_writer.as_default():
            tf.summary.scalar("Reward", episode_score, step=episode)
            tf.summary.scalar("Latest 100 avg rewards", np.mean(latest_100_score), step=episode)
            tf.summary.scalar("Loss", self.loss_metric.result(), step=episode)
            tf.summary.scalar("Average Q", self.q_metric.result(), step=episode)
            tf.summary.scalar("Total Frames", total_step, step=episode)
            tf.summary.scalar("Epsilon", eps, step=episode)

        self.loss_metric.reset_states()
        self.q_metric.reset_states()
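
# Illustrative, self-contained sketch (not part of the snippet above): the same
# "masked target -> Huber loss -> clipped gradients -> Adam step" pattern used in the
# update methods, applied to a tiny dummy network. All names here are made up.
import tensorflow as tf

dummy_net = tf.keras.Sequential([tf.keras.layers.Dense(8, activation="relu"),
                                 tf.keras.layers.Dense(2)])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
huber = tf.keras.losses.Huber()

states = tf.random.uniform((4, 3))
actions = tf.constant([0, 1, 1, 0])
targets = tf.constant([1.0, 0.5, -0.2, 0.0])  # stand-in for r + gamma * max Q_target(s', a')

with tf.GradientTape() as tape:
    # Q values of the actions actually taken, selected via a one-hot mask
    q_taken = tf.reduce_sum(dummy_net(states) * tf.one_hot(actions, 2), axis=1)
    loss = huber(tf.stop_gradient(targets), q_taken)

grads = tape.gradient(loss, dummy_net.trainable_variables)
grads = [tf.clip_by_norm(g, 10) for g in grads]
optimizer.apply_gradients(zip(grads, dummy_net.trainable_variables))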
Example #13
0
class Agent:
    """Definition of the Agent that will interact with the environment.

    Attributes:
        REPLAY_MEM_SIZE (:obj:`int`): max capacity of Replay Memory

        BATCH_SIZE (:obj:`int`): Batch size. Default is 40 as specified in the paper.

        GAMMA (:obj:`float`): The discount, should be a constant between 0 and 1
            that ensures the sum converges. It also controls the importance of future
            expected reward.

        EPS_START(:obj:`float`): initial value for epsilon of the e-greedy action
            selection

        EPS_END(:obj:`float`): final value for epsilon of the e-greedy action
            selection

        LEARNING_RATE(:obj:`float`): learning rate of the optimizer
            (Adam)

        INPUT_DIM (:obj:`int`): input dimensionality without considering batch size.

        HIDDEN_DIM (:obj:`int`): hidden layer dimensionality (for Linear models only)

        ACTION_NUMBER (:obj:`int`): dimensionality of the output layer of the Q network

        TARGET_UPDATE (:obj:`int`): period of Q target network updates

        MODEL (:obj:`string`): type of the model.

        DOUBLE (:obj:`bool`): Type of Q function computation.
    """
    def __init__(self,
                 REPLAY_MEM_SIZE=10000,
                 BATCH_SIZE=40,
                 GAMMA=0.98,
                 EPS_START=1,
                 EPS_END=0.12,
                 EPS_STEPS=300,
                 LEARNING_RATE=0.001,
                 INPUT_DIM=24,
                 HIDDEN_DIM=120,
                 ACTION_NUMBER=3,
                 TARGET_UPDATE=10,
                 MODEL='ddqn',
                 DOUBLE=True):

        self.REPLAY_MEM_SIZE = REPLAY_MEM_SIZE
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_STEPS = EPS_STEPS
        self.LEARNING_RATE = LEARNING_RATE
        self.INPUT_DIM = INPUT_DIM
        self.HIDDEN_DIM = HIDDEN_DIM
        self.ACTION_NUMBER = ACTION_NUMBER
        self.TARGET_UPDATE = TARGET_UPDATE
        self.MODEL = MODEL  # deep q network (dqn) or Dueling deep q network (ddqn)
        self.DOUBLE = DOUBLE  # whether to use the 'Double' DQN update (reduces Q-value overestimation)
        self.TRAINING = True  # when False (testing), epsilon is fixed to EPS_END instead of decaying
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("Agent is using device:\t" + str(self.device))
        '''elif self.MODEL == 'lin_ddqn':
            self.policy_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
        elif self.MODEL == 'lin_dqn':
            self.policy_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
        '''

        if self.MODEL == 'ddqn':
            self.policy_net = ConvDuelingDQN(
                self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = ConvDuelingDQN(
                self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
        elif self.MODEL == 'dqn':
            self.policy_net = ConvDQN(self.INPUT_DIM,
                                      self.ACTION_NUMBER).to(self.device)
            self.target_net = ConvDQN(self.INPUT_DIM,
                                      self.ACTION_NUMBER).to(self.device)

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.LEARNING_RATE)
        self.memory = ReplayMemory(self.REPLAY_MEM_SIZE)
        self.steps_done = 0
        self.training_cumulative_reward = []

    def select_action(self, state):
        """ the epsilon-greedy action selection"""
        state = state.unsqueeze(0).unsqueeze(1)
        sample = random.random()
        if self.TRAINING:
            if self.steps_done > self.EPS_STEPS:
                eps_threshold = self.EPS_END
            else:
                eps_threshold = self.EPS_START
        else:
            eps_threshold = self.EPS_END

        self.steps_done += 1
        # [Exploitation] pick the best action according to current Q approx.
        if sample > eps_threshold:
            with torch.no_grad():
                # Return the number of the action with highest non normalized probability
                # TODO: decide if diverge from paper and normalize probabilities with
                # softmax or at least compare the architectures
                return torch.tensor([self.policy_net(state).argmax()],
                                    device=self.device,
                                    dtype=torch.long)

        # [Exploration]  pick a random action from the action space
        else:
            return torch.tensor([random.randrange(self.ACTION_NUMBER)],
                                device=self.device,
                                dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            # return without doing anything if there is not enough data to sample
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        # Transition is the named tuple defined above.
        batch = Transition(*zip(*transitions))
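        # e.g. [Transition(s1, a1, ns1, r1), Transition(s2, a2, ns2, r2)]
        #  ->  Transition(state=(s1, s2), action=(a1, a2),
        #                 next_state=(ns1, ns2), reward=(r1, r2))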

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        #
        # non_final_mask is a boolean vector marking which sampled transitions have a non-final next state
        # non_final_next_states contains all the non-final states sampled
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        nfns = [s for s in batch.next_state if s is not None]
        non_final_next_states = torch.cat(nfns).view(len(nfns), -1)
        non_final_next_states = non_final_next_states.unsqueeze(1)

        state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1)
        state_batch = state_batch.unsqueeze(1)
        action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1)
        reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        # detach removes the tensor from the graph -> no gradient computation is
        # required
        next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        next_state_values = next_state_values.view(self.BATCH_SIZE, -1)

        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch
        # print("expected_state_action_values.shape:\t%s"%str(expected_state_action_values.shape))

        # Compute MSE loss
        loss = F.mse_loss(state_action_values, expected_state_action_values
                          )  # expected_state_action_values.unsqueeze(1)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def optimize_double_dqn_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            # return without doing anything if there is not enough data to sample
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        # Transition is the named tuple defined above.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        #
        # non_final_mask is a boolean vector marking which sampled transitions have a non-final next state
        # non_final_next_states contains all the non-final states sampled
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        nfns = [s for s in batch.next_state if s is not None]
        non_final_next_states = torch.cat(nfns).view(len(nfns), -1)
        non_final_next_states = non_final_next_states.unsqueeze(1)

        state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1)
        state_batch = state_batch.unsqueeze(1)
        action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1)
        reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1)
        # print("state_batch shape: %s\nstate_batch[0]:%s\nactionbatch shape: %s\nreward_batch shape: %s"%(str(state_batch.view(40,-1).shape),str(state_batch.view(40,-1)[0]),str(action_batch.shape),str(reward_batch.shape)))

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # ---------- Double DQN extra step ----------
        # Select the greedy action in each non-final next state with the online
        # (policy) network; the target network evaluates it below.
        _, next_state_action = self.policy_net(non_final_next_states).max(
            1, keepdim=True)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the actions selected by policy_net and evaluated by target_net.
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        # detach removes the tensor from the graph -> no gradient computation is
        # required
        next_state_values = torch.zeros(self.BATCH_SIZE,
                                        device=self.device).view(
                                            self.BATCH_SIZE, -1)

        out = self.target_net(non_final_next_states).detach()
        next_state_values[non_final_mask] = out.gather(1, next_state_action)
        # next_state_values = next_state_values.view(self.BATCH_SIZE, -1)
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch

        # Compute MSE loss
        loss = F.mse_loss(state_action_values, expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self, env, path, num_episodes=40):
        self.TRAINING = True
        cumulative_reward = [0 for t in range(num_episodes)]
        print("Training:")
        for i_episode in tqdm(range(num_episodes)):
            # Initialize the environment and state
            env.reset()  # reset the env so that it starts at the beginning of the time series
            self.steps_done = 0
            state = env.get_state()
            for t in range(len(env.data)):  # while not env.done

                # Select and perform an action
                action = self.select_action(state)
                reward, done, _ = env.step(action)

                cumulative_reward[i_episode] += reward.item()

                # Observe new state: it will be None if env.done = True. It is the next
                # state since env.step() has been called two rows above.
                next_state = env.get_state()

                # Store the transition in memory
                self.memory.push(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                # Perform one step of the optimization (on the policy network): note that
                # it will return without doing anything if we do not have enough data to sample

                if self.DOUBLE:
                    self.optimize_double_dqn_model()
                else:
                    self.optimize_model()

                if done:
                    break

            # Update the target network, copying all weights and biases of policy_net
            if i_episode % self.TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

        # save the model without overriding existing ones
        if self.DOUBLE:
            base_name = env.reward_f + '_reward_double_' + self.MODEL + '_model'
        else:
            base_name = env.reward_f + '_reward_' + self.MODEL + '_model'

        model_name = base_name
        count = 0
        while os.path.exists(path + model_name):  # avoid overriding existing models
            count += 1
            model_name = base_name + "_" + str(count)

        torch.save(self.policy_net.state_dict(), path + model_name)

        return cumulative_reward

    def test(self, env_test, model_name=None, path=None):
        self.TRAINING = False
        cumulative_reward = [0 for t in range(len(env_test.data))]
        reward_list = [0 for t in range(len(env_test.data))]

        if model_name is None:
            pass
        elif path is not None:
            if re.match(".*_dqn_.*", model_name):
                self.policy_net = ConvDQN(self.INPUT_DIM,
                                          self.ACTION_NUMBER).to(self.device)
                if str(self.device) == "cuda":
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name))
                else:
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name,
                                   map_location=torch.device('cpu')))
            elif re.match(".*_ddqn_.*", model_name):
                self.policy_net = ConvDuelingDQN(
                    self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
                if str(self.device) == "cuda":
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name))
                else:
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name,
                                   map_location=torch.device('cpu')))
            else:
                raise RuntimeError(
                    "Please Provide a valid model name or valid path.")
        else:
            raise RuntimeError(
                'Path can not be None if model Name is not None.')

        env_test.reset()  # reset the env so that it starts at the beginning of the time series
        state = env_test.get_state()
        for t in tqdm(range(len(env_test.data))):  # while not env.done

            # Select and perform an action
            action = self.select_action(state)

            reward, done, _ = env_test.step(action)

            cumulative_reward[t] += reward.item(
            ) + cumulative_reward[t - 1 if t - 1 > 0 else 0]
            reward_list[t] = reward

            # Observe new state: it will be None if env.done = True. It is the next
            # state since env.step() has been called two rows above.
            next_state = env_test.get_state()

            # Move to the next state
            state = next_state

            if done:
                break

        return cumulative_reward, reward_list
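
# Illustrative usage sketch (not from the original snippet): instantiating the trading
# agent above and running training/testing. `TradingEnv`, its data arguments, and the
# path are hypothetical placeholders.
#
#   agent = Agent(MODEL='dqn', DOUBLE=False, REPLAY_MEM_SIZE=10000)
#   train_rewards = agent.train(TradingEnv(train_data), path='./models/', num_episodes=40)
#   test_rewards, per_step_rewards = agent.test(TradingEnv(test_data), model_name=None)
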
class Agent(object):
    def __init__(self,
                 num_actions,
                 gamma=0.98,
                 memory_size=5000,
                 batch_size=32):
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        # Draw some samples from the observation range and initialize the scaler
        obs_limit = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-obs_limit, obs_limit,
                                    (1000, obs_limit.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)

        # Initialize the RBF featurizer
        self.featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ])
        self.featurizer.fit(self.scaler.transform(samples))

        # Create a value approximator for each action
        self.q_functions = [
            SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
            for _ in range(self.num_actions)
        ]

        # Initialize it to whatever values; implementation detail
        for q_a in self.q_functions:
            q_a.partial_fit(self.featurize(samples),
                            np.zeros((samples.shape[0], )))

    def featurize(self, state):
        """ Test two different features for state representations
        """
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        # Task 1a: TODO: Use (s, abs(s)) as features # handcrafted feature vector: s = [1, -2, 3, -4], then (s, abs(s)) = [1, -2, 3, -4, 1, 2, 3, 4] (see slack discussion)
        #return np.concatenate((state, abs(state)), axis=1)
        # Task 1b: RBF features # radial basis function representations
        return self.featurizer.transform(self.scaler.transform(state))
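        # (Illustrative note, not from the original code: the FeatureUnion above maps each
        #  4-dimensional state to 100 + 80 + 50 = 230 RBF features, so a batch of shape
        #  (N, 4) becomes an array of shape (N, 230).)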

    def get_action(self, state, epsilon=0.0):
        if np.random.random() < epsilon:
            a = int(np.random.random() * self.num_actions)
            return a
        else:
            featurized = self.featurize(state)
            qs = [q.predict(featurized)[0] for q in self.q_functions]
            qs = np.array(qs)
            a = np.argmax(qs, axis=0)
            return a

    def single_update(self, state, action, next_state, reward, done):
        # Calculate feature representations of the current and the next state
        # Task 1: TODO: Set the feature state and feature next state
        featurized_state = self.featurize(state)
        featurized_next_state = self.featurize(next_state)

        # Task 1:  TODO Get Q(s', a) for the next state
        predictions = []
        for q_func in self.q_functions:  # one function approximator for each of the two actions
            predictions.append(
                q_func.predict(featurized_next_state)
            )  # calculate prediction for every function approximator q_function
        next_qs = np.max(predictions)  # choose the highest predicted value

        # Calculate the updated target Q- values
        # Task 1: TODO: Calculate target based on rewards and next_qs
        if done:  # terminal state
            target = [reward + self.gamma * 0]
        else:  # not terminal state
            target = [reward + self.gamma * next_qs]

        # Update Q-value estimation
        self.q_functions[action].partial_fit(
            featurized_state,
            target)  # partial_fit() for mini-batch learning (see sklearn docs)

    def update_estimator(self):
        if len(self.memory) < self.batch_size:
            # Use the whole memory
            samples = self.memory.memory
        else:
            # Sample some data
            samples = self.memory.sample(
                self.batch_size
            )  # random sample of batch_size (default 32) transitions

        # Task 2: TODO: Reformat data in the minibatch
        states = np.array(
            [sample.state for sample in samples]
        )  # gather all states from the sampled batch
        action = np.array([
            sample.action for sample in samples
        ])  # array with batch_size elements
        next_states = np.array([sample.next_state for sample in samples])
        rewards = np.array([sample.reward for sample in samples])
        dones = np.array([sample.done for sample in samples])

        # Task 2: TODO: Calculate Q(s', a)
        featurized_next_states = self.featurize(next_states)
        # we need to do the same for next_qs as in single_update but for every sample in the batch
        next_qs = []  # 32x1 (#samples x #functions)
        for s in featurized_next_states:
            arr = np.array([q.predict([s]) for q in self.q_functions])
            next_qs.append(np.max(arr))
        next_qs = np.array(next_qs)

        # Calculate the updated target values
        # Task 2: TODO: Calculate target based on rewards and next_qs
        targets = rewards + self.gamma * next_qs * (1 - dones)

        # Calculate featurized states
        featurized_states = self.featurize(states)

        # Get new weights for each action separately
        for a in range(self.num_actions):
            # Find states where a was taken
            idx = action == a

            # Update only if action a is present in the batch; otherwise skip to the next action
            if np.any(idx):
                act_states = featurized_states[idx]
                act_targets = targets[idx]

                # Perform a single SGD step on the Q-function params
                self.q_functions[a].partial_fit(act_states, act_targets)

    def store_transition(self, *args):
        self.memory.push(*args)
class DQNagent:

    def __init__(self, mem_size, epsilon, mini_batch_size, learning_rate, gamma):

        self.epsilon = epsilon
        self.mini_batch_size = mini_batch_size
        self.gamma = gamma

        self.update_counter = 0

        self.net = nn.Sequential(
            nn.Linear(2, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 3)
        ).float()

        self.net_target = copy.deepcopy(self.net)

        self.net = self.net.cuda()
        self.net_target = self.net_target.cuda()

        # self.net_target = nn.Sequential(
        #     nn.Linear(2, 128),
        #     nn.ReLU(),
        #     nn.Linear(128, 128),
        #     nn.ReLU(),
        #     nn.Linear(128, 3)
        # ).float()

        self.replay_memory = ReplayMemory(max_size=mem_size)

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
    
    def get_action(self, obs, mode='e-greedy'):
        if mode == 'random':
            action = random.choice([0, 1, 2])
        elif mode == 'greedy':
            obs = torch.tensor(obs, dtype=torch.float).cuda()
            with torch.no_grad():
                action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        elif mode == 'e-greedy':
            action = random.choice([0, 1, 2])
            if random.random() >= self.epsilon:
                obs = torch.tensor(obs, dtype=torch.float).cuda()
                with torch.no_grad():
                    action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        # if not explore and random.random() >= self.epsilon:
        #     obs = torch.tensor(obs, dtype=torch.float).cuda()
        #     with torch.no_grad():
        #         action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        
        assert type(action) == int
        return action
    
    def store_transition(self, obs, action, reward, new_obs, done):
        self.replay_memory.push(obs, action, reward, new_obs, done)
    
    def update(self):
        
        if len(self.replay_memory) < self.mini_batch_size:
            return

        obs_batch, action_batch, reward_batch, new_obs_batch, done_batch = self.replay_memory.sample(self.mini_batch_size)

        new_obs_batch = torch.tensor(new_obs_batch, dtype=torch.float).cuda()
        # print(new_obs_batch.shape)
        # time.sleep(5)
        with torch.no_grad():
            target_batch = torch.tensor(reward_batch, dtype=torch.float).cuda()
            # print(target_batch.shape)
            # time.sleep(5)
            vals_new_obs = torch.max(self.net_target(new_obs_batch), dim=1)[0]
            # print(vals_new_obs.shape)
            # time.sleep(5)
            for i in range(self.mini_batch_size):
                if not done_batch[i]:
                    target_batch[i] += self.gamma * vals_new_obs[i]
            # target_batch = target_batch + self.gamma * vals_new_obs
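            # (Illustrative note, not from the original code: a vectorized equivalent of the
            #  loop above would be
            #    done_mask = torch.tensor(done_batch, dtype=torch.float).cuda()
            #    target_batch += self.gamma * vals_new_obs * (1 - done_mask)
            #  which zeroes out the bootstrap term for terminal transitions.)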
        
        obs_batch = torch.tensor(obs_batch, dtype=torch.float).cuda()
        pred_batch = self.net(obs_batch)
        # print(pred_batch[:5])
        # print(pred_batch.size(0))
        # print(action_batch)
        # pred_batch_ = pred_batch[torch.arange(pred_batch.size(0)), action_batch]
        action_batch = torch.tensor(action_batch, dtype=torch.long).cuda()
        # print(action_batch[:5])
        pred_batch_ = pred_batch.gather(1, action_batch.unsqueeze(1)).squeeze(1)
        # print(pred_batch_[:5])
        # time.sleep(5)

        loss = self.criterion(pred_batch_, target_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_counter += 1
        if self.update_counter%20 == 0:
            self.update_counter = 0
            for target_param, param in zip(self.net_target.parameters(), self.net.parameters()):
                target_param.data.copy_(param)
Example #16
0
class Agent(object):
    def __init__(self,
                 env_name,
                 state_space,
                 n_actions,
                 replay_buffer_size=500000,
                 batch_size=32,
                 hidden_size=64,
                 gamma=0.99):
        self.env_name = env_name
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.train_device = device
        self.n_actions = n_actions
        self.state_space_dim = state_space
        if "CartPole" in self.env_name:
            self.policy_net = CartpoleDQN(state_space, n_actions, 4)
            self.target_net = CartpoleDQN(state_space, n_actions, 4)
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.target_net.eval()
            self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)
        elif "WimblepongVisualSimpleAI-v0" in self.env_name:
            self.policy_net = Policy(state_space, n_actions, 4)
            self.target_net = Policy(state_space, n_actions, 4)
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.target_net.eval()
            self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4)
        else:
            raise ValueError(
                "Wrong environment. An agent has not been specified for %s" %
                env_name)
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8).to(
            self.train_device)
        non_final_mask = non_final_mask.type(torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states).to(
            self.train_device)
        state_batch = torch.stack(batch.state).to(self.train_device)
        action_batch = torch.cat(batch.action).to(self.train_device)
        reward_batch = torch.cat(batch.reward).to(self.train_device)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch).to(self.train_device)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Task 4: TODO: Compute the expected Q values
        expected_state_action_values = reward_batch + self.gamma * next_state_values

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values.squeeze(),
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        #print('initial get action',state.shape)

        #print('final get action',state.shape)
        sample = random.random()
        if sample > epsilon:
            with torch.no_grad():
                #print('a',state)
                state = torch.from_numpy(state)
                #print('b',state)
                state = state.unsqueeze(0)
                q_values = self.policy_net(state)
                return torch.argmax(q_values).item()
        else:
            return random.randrange(3)

    def preprocessing(self, observation):
        """ Preprocess the received information: 1) Grayscaling 2) Reducing quality (resizing)
        Params:
            observation: image of pong
        """
        # Grayscaling
        #img_gray = rgb2gray(observation)
        img_gray = np.dot(observation,
                          [0.2989, 0.5870, 0.1140]).astype(np.uint8)

        # Normalize pixel values
        img_norm = img_gray / 255.0

        # Downsampling: we receive squared image (e.g. 200x200) and downsample by x2.5 to (80x80)
        img_resized = cv2.resize(img_norm, dsize=(80, 80))
        #img_resized = img_norm[::2.5,::2.5]
        return img_resized

    def stack_images(self, observation, img_collection, timestep):
        """ Stack up to four frames together
        """
        # image preprocessing
        img_preprocessed = self.preprocessing(observation)

        if (timestep == 0):  # start of new episode
            # img_collection get filled with zeros again
            img_collection = deque(
                [np.zeros((80, 80), dtype=int) for i in range(4)], maxlen=4)
            # fill img_collection 4x with the first frame
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            # Stack the images in img_collection
            img_stacked = np.stack(img_collection, axis=2)
        else:
            # Delete first/oldest entry and append new image
            #img_collection.pop(0)
            img_collection.append(img_preprocessed)

            # Stack the images in img_collection
            img_stacked = np.stack(img_collection,
                                   axis=2)  # TODO: right axis??

        return img_stacked, img_collection
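        # (Illustrative note, not from the original code: each preprocessed frame is 80x80,
        #  so the stacked observation returned here has shape (80, 80, 4), matching the
        #  four-frame history the network expects.)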

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long().to(self.train_device)
        reward = torch.tensor([reward],
                              dtype=torch.float32).to(self.train_device)
        next_state = torch.from_numpy(next_state).float().to(self.train_device)
        state = torch.from_numpy(state).float().to(self.train_device)
        self.memory.push(state, action, next_state, reward, done)

    def load_model(self):
        #load_path = '/home/isaac/codes/autonomous_driving/highway-env/data/2020_09_03/Intersection_egoattention_dqn_ego_attention_1_22:00:25/models'
        #policy.load_state_dict(torch.load("./model50000ep_WimblepongVisualSimpleAI-v0_0.mdl"))
        """ Load already created model
        return:
            none
        """
        weights = torch.load("FROM2100v2WimblepongVisualSimpleAI-v0_1900.mdl",
                             map_location=self.train_device)
        self.policy_net.load_state_dict(weights, strict=False)

    def get_name(self):
        """ Interface function to retrieve the agents name
        """
        return self.name

    def reset(self):
        """ Resets the agent’s state after an episode is finished
Example #17
0
def train(eps_decay, gamma, lr, network, seed=131):
    id = 'LunarLander-v2'
    env = gym.make(id).unwrapped
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]
    # set seed
    random.seed(seed)
    env.seed(seed)

    # initiate the network
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if network not in NETWORK.keys():
        raise ValueError('Network key does not exist!')

    fc1_unit, fc2_unit = NETWORK.get(network)
    policy_net = DQN(state_size=n_states,
                     action_size=n_actions,
                     fc1_unit=fc1_unit,
                     fc2_unit=fc2_unit,
                     seed=131).to(device)
    target_net = DQN(state_size=n_states,
                     action_size=n_actions,
                     fc1_unit=fc1_unit,
                     fc2_unit=fc2_unit,
                     seed=1).to(device)
    target_net.load_state_dict(policy_net.state_dict())

    # initialize the replay memory and the optimizer
    memory = ReplayMemory(MEMORY_CAPACITY)
    # optimizer = optim.RMSprop(policy_net.parameters())
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # initiate the global steps
    steps_done = 0
    # Here my watch started
    rewards = []
    for i_episode in range(N_EPISODES):
        cumulative_reward = 0
        state = env.reset()
        state = torch.tensor([state])
        for t in count():
            if t > N_STEPS_TIMEOUT:
                break
            action, steps_done = select_action(state=state,
                                               policy_net=policy_net,
                                               n_actions=n_actions,
                                               steps_done=steps_done,
                                               device=device,
                                               eps_end=EPS_END,
                                               eps_start=EPS_START,
                                               eps_decay=eps_decay)

            state_next, reward, done, _ = env.step(action.item())
            # env.render()
            cumulative_reward = cumulative_reward + reward
            # convert it to tensor
            state_next = torch.tensor([state_next], device=device)
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            memory.push(state, action, state_next, reward)
            state = state_next

            # every step update the weights in the policy net
            optimize_model(memory=memory,
                           batch_size=BATCH_SIZE,
                           device=device,
                           policy_net=policy_net,
                           target_net=target_net,
                           optimizer=optimizer,
                           gamma=gamma)

            if done:
                break

        rewards.append(cumulative_reward)

        # update the target net after a while
        if i_episode % TARGET_UPDATE == 0:
            # If want the soft update the weights
            #         soft_update(local_model=policy_net, target_model=target_net, tau=TAU)
            target_net.load_state_dict(policy_net.state_dict())

        if np.min(rewards[-5:]) >= 200:
            break

    # save the rewards
    rewards_path = 'training_rewards_{lr}_{eps_decay}_{gamma}_{network}.pkl'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_rewards(rewards=rewards, path=rewards_path, option='training_rewards')

    # save the policy net
    model_path = 'model_{lr}_{eps_decay}_{gamma}_{network}.pt'.format(
        lr=lr, eps_decay=eps_decay, gamma=gamma, network=network)
    save_model(model=policy_net, path=model_path)
    print("Finished parameter combo: {params}".format(
        params=[eps_decay, gamma, lr, network]))
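# Illustrative note (not from the original code): with, for example, lr=0.0005,
# eps_decay=0.995, gamma=0.99 and network='medium', the files written above would be
# named 'training_rewards_0.0005_0.995_0.99_medium.pkl' and 'model_0.0005_0.995_0.99_medium.pt'.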
Example #18
0
class Agent:
    def __init__(self,
                 state_space,
                 n_actions,
                 replay_buffer_size=50000,
                 batch_size=32,
                 hidden_size=64,
                 gamma=0.99):
        self.n_actions = n_actions
        self.state_space_dim = state_space
        self.policy_net = GenericNetwork(state_space,
                                         n_actions,
                                         hidden_size,
                                         name='dqn_network_')
        self.target_net = GenericNetwork(state_space,
                                         n_actions,
                                         hidden_size,
                                         name='target_dqn_network_')
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.action = {}
        self.j = 0

    def learn(self):
        """
        Learning function
        :return:
        """
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = 1 - T.tensor(batch.done, dtype=T.uint8)

        # resample if every transition in the batch is terminal (the mask would be all zeros)
        test_tensor = T.zeros(self.batch_size)
        while T.all(T.eq(test_tensor, non_final_mask)).item():
            transitions = self.memory.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = 1 - T.tensor(batch.done, dtype=T.uint8)

        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = T.stack(non_final_next_states)
        state_batch = T.stack(batch.state)
        action_batch = T.cat(batch.action)
        reward_batch = T.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = T.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        # Compute mse loss
        loss = F.mse_loss(state_action_values.squeeze(),
                          expected_state_action_values)
        # Optimize the model
        self.policy_net.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.policy_net.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        """
        Used to select actions
        :param state:
        :param epsilon:
        :return: action
        """
        sample = random.random()
        if sample > epsilon:
            with T.no_grad():
                state = T.from_numpy(state).float()
                q_values = self.policy_net(state)
                self.action[self.j] = {
                    'list_of_actions': q_values,
                    'max': T.argmax(q_values).item()
                }
                self.j += 1
                return T.argmax(q_values).item() + 1
        else:
            action = random.randrange(self.n_actions)
            return action + 1

    def update_target_network(self):
        """
        Used to update target networks
        :return:
        """
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, reward, next_state, done):
        """
        Used for memory replay purposes
        :param state:
        :param action:
        :param reward:
        :param next_state:
        :param done:
        :return:
        """
        action = T.Tensor([[action]]).long()
        reward = T.tensor([reward], dtype=T.float32)
        next_state = T.from_numpy(next_state).float()
        state = T.from_numpy(state).float()
        self.memory.push(state, action, reward, next_state, done)

    def save_models(self):
        """
        Used to save models
        :return:
        """
        self.policy_net.save_checkpoint()
        self.target_net.save_checkpoint()

    def load_models(self):
        """
        Used to load models
        :return:
        """
        self.policy_net.load_checkpoint()
Example #19
0
class Agent(nn.Module):
    def __init__(self,
                 q_models,
                 target_model,
                 hyperbolic,
                 k,
                 gamma,
                 model_params,
                 replay_buffer_size,
                 batch_size,
                 inp_dim,
                 lr,
                 no_models,
                 act_space,
                 hidden_size,
                 loss_type,
                 target_update=False):
        super(Agent, self).__init__()
        if hyperbolic:
            self.q_models = DQN(state_space_dim=inp_dim,
                                action_space_dim=act_space,
                                hidden=hidden_size,
                                no_models=no_models)
            self.target_models = DQN(state_space_dim=inp_dim,
                                     action_space_dim=act_space,
                                     hidden=hidden_size,
                                     no_models=no_models)
            self.target_models.load_state_dict(self.q_models.state_dict())
            self.target_models.eval()
        else:
            self.q_models = q_models
        self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr)
        self.hyperbolic = hyperbolic
        self.n_actions = model_params.act_space
        self.k = k
        # self.gammas = torch.tensor(np.linspace(0, 1, self.q_models.no_models + 1), dtype=torch.float)[1:]
        self.gammas = np.sort(
            np.random.uniform(0, 1, self.q_models.no_models + 1))
        self.gammas = np.append(self.gammas, 0.98)
        self.gammas = torch.tensor(np.sort(self.gammas))
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.inp_dim = inp_dim
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.target_models.to(self.device)
        self.q_models.to(self.device)
        self.gammas = self.gammas.to(self.device)
        self.loss_type = loss_type
        self.criterion = nn.MSELoss()
        self.use_target_network = target_update

    def update_network(self, updates=1):
        for _ in range(updates):
            loss = self._do_network_update()
        return loss

    def get_hyperbolic_train_coeffs(self, k, num_models):
        coeffs = []
        for i in range(1, num_models + 1):
            coeffs.append(((self.gammas[i + 1] - self.gammas[i]) * (1 / k) *
                           self.gammas[i]**((1 / k) - 1)))
        return torch.tensor(coeffs).to(self.device) / sum(coeffs)
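    # Illustrative note (not from the original code): each coefficient above is a
    # Riemann-sum weight (gamma_{i+1} - gamma_i) * (1/k) * gamma_i**(1/k - 1), so the
    # normalized coefficients approximate the integral that expresses the hyperbolic
    # discount 1 / (1 + k*t) as a mixture of exponential discounts gamma**t.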

    def get_action(self, state_batch, epsilon=0.05, get_among_last=False):
        # epsilon gets smaller as time goes by.
        # (glie_a/(glie_a + eps)) with eps in range(0, no_episodes)
        take_random_action = random.random()
        if take_random_action < epsilon:
            return random.randrange(self.n_actions)
        elif get_among_last:
            state_batch = torch.tensor(state_batch,
                                       dtype=torch.float32,
                                       device=self.device).view(
                                           -1, self.inp_dim)
            model_outputs = self.q_models(state_batch).reshape(
                2, self.q_models.no_models)
            return torch.argmax(model_outputs[:, -10].view(-1)).item()
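            # (Note: the lines below are unreachable because of the return above; they
            #  would instead pick the action with the largest hyperbolically weighted
            #  sum of Q values.)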
            model_outputs = model_outputs * self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models)
            actions = torch.argmax(torch.sum(model_outputs, dim=1))
            return actions.item()
        elif self.hyperbolic:
            with torch.no_grad():
                state_batch = torch.tensor(state_batch,
                                           dtype=torch.float32,
                                           device=self.device).view(
                                               -1, self.inp_dim)
                model_outputs = self.q_models(state_batch.double()).reshape(
                    -1, 2)
                coeffs = self.get_hyperbolic_train_coeffs(
                    self.k, self.q_models.no_models).reshape(-1, 1)
                model_outputs = model_outputs * coeffs
                actions = torch.argmax(torch.sum(model_outputs, dim=0))
            return actions.item()

    def get_state_act_vals(self, state_batch, action_batch=None):
        if self.hyperbolic:
            action_batch = action_batch.repeat(
                1, self.q_models.no_models).reshape(-1, 1)
            model_outputs = self.q_models(state_batch.to(self.device).double())
            model_outputs = model_outputs.reshape(-1, self.n_actions)
            model_outputs = model_outputs.gather(1, action_batch)
            # .reshape(self.q_models.no_models * state_batch.shape[0],
            #          2).gather(1, action_batch.reshape(-1))
            return model_outputs
        else:
            model_output = self.q_models(state_batch).gather(1, action_batch)
            return model_output

    def get_max_next_state_vals(self, non_final_mask, non_final_next_states):
        if self.hyperbolic:
            with torch.no_grad():
                next_state_values = torch.zeros(self.batch_size).to(
                    self.device)
                # doing it like this, the model_no will come first and then the batch_no (b1m1, b1m2, b1m3..., b2m1,
                # ...b10m1, b10m2...
                # if False in non_final_mask:
                #     print(non_final_mask)
                #     print(len(non_final_next_states))
                non_final_mask = non_final_mask.reshape(-1, 1).repeat(
                    1, self.q_models.no_models).view(-1)
                # if False in non_final_mask:
                #     print([nf for nf in non_final_mask])
                next_state_values = next_state_values.view(-1, 1).repeat(
                    1, self.q_models.no_models).view(-1)
                if self.use_target_network:
                    # [b1m1o1, b1m1o2], -> max -> [b1m1]
                    # [b1m2o1, b1m2o2],           [b1m2]
                    # [b1m3o1, b1m3o3],           [b1m3]
                    # ...                         ...
                    #
                    next_state_values[non_final_mask] = \
                        self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0]
                    # if False in non_final_mask:
                    #     print("first", self.target_models(non_final_next_states.to(self.device)))
                    #     print("after reshaping", self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions))
                    #     print(self.target_models(non_final_next_states.to(self.device)).shape)
                    #     print("next_state_values", next_state_values)
                else:
                    next_state_values[non_final_mask] = \
                        self.q_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0]
                target_outputs = next_state_values
                return target_outputs * self.gammas[2:].repeat(self.batch_size)

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal
        ]
        non_final_next_states = torch.stack(non_final_next_states).to(
            self.device)
        state_batch = torch.stack(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        state_action_values = self.get_state_act_vals(state_batch,
                                                      action_batch).view(-1)
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        # TD target: reward plus the (already discounted) max next-state value, per model
        expected_state_action_values = next_state_values + \
                                       reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1)
        # print(reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1).shape)
        if self.loss_type == "weighted_loss":
            loss = (state_action_values - expected_state_action_values)**2
            hyp_coef = self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models).repeat(self.batch_size)
            loss = loss.view(-1) * hyp_coef
            loss = torch.mean(loss)
        elif self.loss_type == "separate_summarized_loss":
            loss = F.smooth_l1_loss(state_action_values,
                                    expected_state_action_values).double()
            # loss = (state_action_values - expected_state_action_values) ** 2
            # loss = torch.sum(loss)
        elif self.loss_type == "one_output_loss":
            hyp_coef = self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models)
            state_action_values = state_action_values.reshape(
                self.batch_size, -1) * hyp_coef
            state_action_values = torch.sum(state_action_values, dim=1)
            expected_state_action_values = expected_state_action_values.reshape(
                self.batch_size, -1) * hyp_coef
            expected_state_action_values = torch.sum(
                expected_state_action_values, dim=1)
            loss = self.criterion(state_action_values,
                                  expected_state_action_values)

        loss_item = loss.item()
        # print(hyp_coef.repeat(self.batch_size).shape)
        # print(loss.shape)
        # loss = (state_action_values - expected_state_action_values) ** 2 * self.get_hyperbolic_train_coeffs(self.k,
        #                                                                                                     self.q_models.no_models).repeat(
        #     self.batch_size)
        # # loss = torch.sum(loss)
        # loss = F.smooth_l1_loss(state_action_values.squeeze(),
        #                         expected_state_action_values)
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.q_models.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()
        return loss_item

    def update_target_network(self):
        self.target_models.load_state_dict(self.q_models.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
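The agent above calls a get_hyperbolic_train_coeffs helper and indexes a self.gammas buffer, neither of which appears in the snippet. The sketch below shows one plausible construction, assuming the per-model weights come from a Riemann approximation of the hyperbolic discount 1/(1 + k*t) as a mixture of exponential discounts; the original helper may differ.

import torch

def get_hyperbolic_train_coeffs(k, no_models):
    # Sketch only: Riemann weights for approximating the hyperbolic discount
    # 1 / (1 + k * t) by a finite mixture of exponential discounts gamma_i ** t.
    gammas = torch.linspace(0.0, 1.0, no_models + 2)[1:-1]  # interior points of (0, 1)
    weights = (1.0 / k) * gammas ** (1.0 / k - 1.0) / (no_models + 1)
    return weights / weights.sum()  # normalize so the models form a convex combination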
Example #20
0
        observation, reward, done, _ = env.step(action.item())
        env.render()

        # record reward
        running_reward += reward
        reward = torch.tensor([reward], device=device)

        if not done:
            next_state = torch.tensor([observation],
                                      device=device,
                                      dtype=torch.float32)
        else:
            next_state = None

        # Store the transition in memory
        memory.push(current_state, action, next_state, reward)
        training_info["memory"] = memory

        # Compute the TD loss of the current transition and add it to the episode loss
        if not done:
            current_q = policy_net(current_state)[:, action].squeeze()
            target_q = policy_net(next_state).max() + reward.squeeze()
            target_q = torch.tensor(target_q.item(), device=device)
            trans_loss = F.smooth_l1_loss(current_q, target_q).item()
            # Record the TD loss
            running_episode_loss += trans_loss
            if trans_loss > training_info["max TD loss recorded"]:
                training_info["max TD loss recorded"] = trans_loss

        # Move to the next state
        current_state = next_state
Example #21
0
def main():
    parser = argparse.ArgumentParser(description='DQN Breakout Script')
    parser.add_argument('--use-cuda',
                        action='store_true',
                        default=False,
                        help='whether to use CUDA (default: False)')
    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        metavar='M',
                        help='batch size (default: 128)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.999,
                        metavar='M',
                        help='gamma (default: 0.999)')
    parser.add_argument('--eps-start',
                        type=float,
                        default=0.9,
                        metavar='M',
                        help='eps start (default: 0.9)')
    parser.add_argument('--eps-end',
                        type=float,
                        default=0.05,
                        metavar='M',
                        help='eps end (default: 0.05)')
    parser.add_argument('--eps-decay',
                        type=int,
                        default=200,
                        metavar='M',
                        help='eps decay (default: 200)')
    parser.add_argument('--num-obs-in-state',
                        type=int,
                        default=4,
                        metavar='M',
                        help='num observations in state (default: 4)')
    parser.add_argument('--replay-memory-capacity',
                        type=int,
                        default=10000,
                        metavar='M',
                        help='replay memory capacity (default: 10000)')
    parser.add_argument('--num-episodes',
                        type=int,
                        default=10,
                        metavar='M',
                        help='num of episodes (default: 10)')
    parser.add_argument('--reset-period',
                        type=int,
                        default=5,
                        metavar='M',
                        help='period to reset target network (default: 5)')
    parser.add_argument('--atari-env',
                        type=str,
                        default='Breakout-v0',
                        metavar='M',
                        help='Atari environment to use (default: Breakout-v0)')
    args = parser.parse_args()

    env = gym.envs.make(args.atari_env)

    model = DQN(args.num_obs_in_state, (84, 84), env.action_space.n)
    model_target = DQN(args.num_obs_in_state, (84, 84),
                       env.action_space.n)

    if args.use_cuda:
        model.cuda()
        model_target.cuda()

    optimizer = optim.RMSprop(model.parameters())
    memory = ReplayMemory(args.replay_memory_capacity)

    epsilons = np.linspace(args.eps_start, args.eps_end, args.eps_decay)
    step_idx = 1
    reset_idx = 1

    tfs = get_transforms()

    episode_reward = 0.
    episode_length = 0

    for i_episode in range(args.num_episodes):
        # Initialize the environment and state
        obs = env.reset()
        state_processor = StateProcessor(args.num_obs_in_state, tfs, obs)
        state = state_processor.get_state()

        while True:
            episode_length += 1
            if step_idx < args.eps_decay:
                eps = epsilons[step_idx]
            else:
                eps = args.eps_end

            action = select_action(model, state, env.action_space.n,
                                   eps, args.use_cuda)
            # print('%d %d' % (episode_length, action[0,0]))
            next_obs, reward, done, info = env.step(action[0, 0])
            episode_reward += reward
            reward = torch.Tensor([reward])
            if args.use_cuda:
                reward = reward.cuda()

            if not done:
                state_processor.push_obs(next_obs)
                next_state = state_processor.get_state()
            else:
                next_state = None  # None next_state marks done

            memory.push(state, action, next_state, reward)

            # optimize
            optimize_model(optimizer, memory, model, model_target,
                           args.batch_size, args.gamma, args.use_cuda)

            step_idx += 1
            reset_idx += 1
            if reset_idx == args.reset_period:
                reset_idx = 1
                model_target.load_state_dict(model.state_dict())

            if done:
                break

        print(episode_reward)
        print(episode_length)
        episode_reward = 0.
        episode_length = 0
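The loop above relies on an optimize_model helper that is not part of the snippet. A minimal sketch in the standard DQN style follows; the Transition namedtuple and the convention that a None next_state marks a terminal transition are assumptions taken from how memory.push is called in main().

import torch
import torch.nn.functional as F
from collections import namedtuple

# Assumed to mirror the fields pushed into ReplayMemory above
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

def optimize_model(optimizer, memory, model, model_target,
                   batch_size, gamma, use_cuda):
    # Sketch of a single DQN update matching the call in main() above
    if len(memory) < batch_size:
        return
    device = torch.device('cuda' if use_cuda else 'cpu')
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # A None next_state marks a terminal transition
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None]).to(device)
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device)

    # Q(s, a) for the actions that were actually taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a'), zero for terminal states
    next_state_values = torch.zeros(batch_size, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = model_target(
            non_final_next_states).max(1)[0]
    expected_state_action_values = reward_batch + gamma * next_state_values

    loss = F.smooth_l1_loss(state_action_values.squeeze(1),
                            expected_state_action_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()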
Example #22
0
            action, steps_done = select_action(state=state,
                                               policy_net=policy_net,
                                               n_actions=n_actions,
                                               steps_done=steps_done,
                                               device=device,
                                               eps_end=EPS_END,
                                               eps_start=EPS_START,
                                               eps_decay=EPS_DECAY)

            state_next, reward, done, _ = env.step(action.item())
            # env.render()
            cumulative_reward = cumulative_reward + reward
            # convert it to tensor
            state_next = torch.tensor([state_next], device=device)
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            memory.push(state, action, state_next, reward)
            state = state_next

            # every step update the weights in the policy net
            optimize_model(memory=memory,
                           batch_size=BATCH_SIZE,
                           device=device,
                           policy_net=policy_net,
                           target_net=target_net,
                           optimizer=optimizer,
                           gamma=GAMMA)

            if done:
                break

        rewards.append(cumulative_reward)
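The select_action helper used here (which threads steps_done through for epsilon decay) is not shown. The sketch below assumes the usual exponentially decaying epsilon-greedy rule and matches the keyword signature at the call site; the actual decay schedule may differ.

import math
import random
import torch

def select_action(state, policy_net, n_actions, steps_done,
                  device, eps_end, eps_start, eps_decay):
    # Sketch only: epsilon-greedy selection with exponentially decaying epsilon
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)
    steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # pick the greedy action from the policy network
            action = policy_net(state.to(device)).max(1)[1].view(1, 1)
    else:
        # otherwise explore uniformly at random
        action = torch.tensor([[random.randrange(n_actions)]],
                              device=device, dtype=torch.long)
    return action, steps_done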
Example #23
0
class RaLLy():
    def __init__(self, name, env):
        self.name = name
        self.env = env
        self.eps = 0.005
        self.max_timesteps = 10000
        self.explore_noise = 0.5
        self.batch_size = 32
        self.discount = 0.99
        self.tau = 0.005
        self.max_episode_steps = 200
        self.memory = ReplayMemory(10000)

    def train(self):
        policy = DDPGTrainer()
        total_timesteps = 0
        episode_timesteps = 0
        episode_num = 0
        episode_done = True
        episode_reward = 0

        while total_timesteps < self.max_timesteps:
            if episode_done:
                if total_timesteps != 0:
                    print(
                        f"Total steps: {total_timesteps:12} | Episodes: {episode_num:3} | Total reward: {episode_reward}"
                    )
                    # TODO: get training stats
                    policy.train(self.memory, episode_timesteps,
                                 self.batch_size, self.discount, self.tau)

                # Reset environment
                episode_done = False
                episode_num += 1
                episode_timesteps = 0
                episode_reward = 0
                obs = self.env.reset()

            control, jump, boost, handbrake = policy.actor(torch.tensor(obs))
            action = torch.cat([control, jump, boost, handbrake])

            if self.explore_noise != 0:
                noise = np.random.normal(0, self.explore_noise, size=1)
                noise = torch.clamp(torch.Tensor(noise), -1, 1)
                noise = torch.cat([noise, torch.zeros(3)])
                action = action + noise
                action = torch.clamp(action, -1, 1)

            print(action)

            # Perform action
            new_obs, reward, done, _ = self.env.step(action.detach())
            episode_done = True if episode_timesteps + 1 == self.max_episode_steps else done
            done_bool = float(done)
            episode_reward += reward

            # Store data in replay buffer
            self.memory.push((obs, new_obs, action, reward, done_bool))

            obs = new_obs
            episode_timesteps += 1
            total_timesteps += 1
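DDPGTrainer.train is not shown, so how the tau hyperparameter above is consumed is not visible here. It typically drives a Polyak (soft) update of the target networks; a small sketch of that step, offered as an assumption rather than the trainer's actual code:

import torch

def soft_update(target_net, source_net, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(),
                                    source_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)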
Example #24
0
class Agent(object):
    def __init__(self,
                 num_actions,
                 gamma=0.98,
                 memory_size=5000,
                 batch_size=32):
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        # Draw some samples from the observation range and initialize the scaler
        obs_limit = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-obs_limit, obs_limit,
                                    (1000, obs_limit.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)

        # Initialize the RBF featurizer
        self.featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ])
        self.featurizer.fit(self.scaler.transform(samples))

        # Create a value approximator for each action
        self.q_functions = [
            SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
            for _ in range(self.num_actions)
        ]

        # Initialize each regressor with a dummy partial_fit so predict() can be called before training
        for q_a in self.q_functions:
            q_a.partial_fit(self.featurize(samples),
                            np.zeros((samples.shape[0], )))

    def featurize(self, state):
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        # Task 1: TODO: Use (s, abs(s)) as features
        #return np.concatenate((state, np.abs(state)), axis=1)
        # RBF features
        return self.featurizer.transform(self.scaler.transform(state))

    def get_action(self, state, epsilon=0.0):
        if np.random.random() < epsilon:
            a = int(np.random.random() * self.num_actions)
            return a
        else:
            featurized = self.featurize(state)
            qs = [q.predict(featurized)[0] for q in self.q_functions]
            qs = np.array(qs)
            a = np.argmax(qs, axis=0)
            return a

    def single_update(self, state, action, next_state, reward, done):
        # Calculate feature representations of the current and next states
        # Task 1: TODO: Set the feature state and feature next state

        featurized_state = self.featurize(state)
        featurized_next_state = self.featurize(next_state)

        # Task 1:  TODO Get Q(s', a) for the next state
        next_qs = [
            q.predict(featurized_next_state)[0] for q in self.q_functions
        ]

        # Calculate the updated target Q-values
        # Task 1: TODO: Calculate target based on rewards and next_qs
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(next_qs)
        # Update Q-value estimation
        self.q_functions[action].partial_fit(featurized_state, [target])

    def update_estimator(self):
        if len(self.memory) < self.batch_size:
            # Use the whole memory
            samples = self.memory.memory
        else:
            # Sample some data
            samples = self.memory.sample(self.batch_size)
        # Task 2: TODO: Reformat data in the minibatch
        states = []
        action = []
        next_states = []
        rewards = []
        dones = []
        for s in samples:
            states.append(s.state)
            action.append(s.action)
            next_states.append(s.next_state)
            rewards.append(s.reward)
            dones.append(s.done)
        states = np.array(states)
        next_states = np.array(next_states)
        action = np.array(action)
        rewards = np.array(rewards)
        dones = np.array(dones)

        # Task 2: TODO: Calculate Q(s', a)
        featurized_next_states = self.featurize(next_states)
        next_qs = np.max(np.array(
            [q.predict(featurized_next_states) for q in self.q_functions]).T,
                         axis=1)

        # Calculate the updated target values
        # Task 2: TODO: Calculate target based on rewards and next_qs
        targets = rewards + self.gamma * next_qs * np.invert(dones)

        # Calculate featurized states
        featurized_states = self.featurize(states)
        # Get new weights for each action separately
        for a in range(self.num_actions):
            # Find states where a was taken
            idx = action == a

            # If a is not present in the batch, skip it and move to the next action
            if np.any(idx):
                act_states = featurized_states[idx]
                act_targets = targets[idx]
                # Perform a single SGD step on the Q-function params
                self.q_functions[a].partial_fit(act_states, act_targets)

    def store_transition(self, *args):
        self.memory.push(*args)
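The ReplayMemory this agent uses exposes a raw .memory list, sample(), len(), and a push() taking (state, action, next_state, reward, done). A minimal buffer with exactly that interface, in the style of the standard PyTorch DQN tutorial, is sketched below as an assumption about the missing class.

import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory:
    # Sketch of a ring-buffer replay memory matching how the agent above uses it
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # overwrite the oldest transition once capacity is reached
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)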
Example #25
0
        test_step = 0
        test_reward = 0
        done = False
        test_memory = ReplayMemory(10000, verbose=False)

        while not done:

            frames.append(test_env.render())

            action = get_action(net, tf.constant(state, tf.float32),
                                tf.constant(0.0, tf.float32))

            next_state, reward, done, info = test_env.step(action)
            test_reward += reward

            test_memory.push(state, action, reward, next_state, done)
            state = next_state

            test_step += 1

            if done and (info["ale.lives"] != 0):
                test_env.reset()
                test_step = 0
                done = False

        reward_set.append(test_reward)
        frame_set.append(frames)

    best_score = np.max(reward_set)
    print("Best score of current network ({} trials): {}".format(
        trial, best_score))
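The get_action helper called in the evaluation loop above takes the network, the state, and an epsilon of 0.0 as TensorFlow constants; it is not shown in the snippet. A hedged sketch, assuming net maps a batched state to per-action Q-values:

import tensorflow as tf

def get_action(net, state, epsilon):
    # Sketch only: epsilon-greedy selection; with epsilon = 0.0 (as in the test
    # loop above) this reduces to the greedy action
    q_values = net(tf.expand_dims(state, axis=0))[0]
    if tf.random.uniform(()) < epsilon:
        n_actions = int(tf.shape(q_values)[0])
        return int(tf.random.uniform((), maxval=n_actions, dtype=tf.int32))
    return int(tf.argmax(q_values))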
Example #26
0
def test_arb(arb_env, modules_list, n_epi=250, max_steps=500):
    s_dim, a_dim = 16, 4
    n_modules = len(modules_list)

    pi_tensors = get_pi(modules_list)
    arb = Arbitrator().to(device)
    returns = []
    all_rets = []
    memory = ReplayMemory(10000)
    for epi in range(n_epi):
        arb_env.reset()
        r_list = []
        steps = 0
        while steps < max_steps:
            state = get_state_vector(arb_env.cur_state)
            coeff = arb(state)
            pi_k = torch.zeros(s_dim, a_dim)
            for m in range(n_modules):
                pi_k += coeff[0][m] * pi_tensors[m]
            a = np.random.choice(
                4, p=pi_k[arb_env.cur_state].detach().cpu().numpy())
            s, a, s_, r, done = arb_env.step(a)
            r_list.append(r)
            reward = torch.tensor([r], dtype=torch.float32, device=device)
            next_state = get_state_vector(s_)
            steps += 1
            memory.push(state, torch.tensor([a], dtype=torch.float32, device=device),
                        next_state, reward)

            if done:
                state = get_state_vector(arb_env.cur_state)
                coeff = arb(state)
                pi_k = torch.zeros(s_dim, a_dim)
                for m in range(n_modules):
                    pi_k += coeff[0][m] * pi_tensors[m]

                a = np.random.choice(
                    4, p=pi_k[arb_env.cur_state].detach().cpu().numpy())
                # state = get_state_vector(arb_env.cur_state)
                next_state = state
                r = 100.
                steps += 1
                reward = torch.tensor([r], dtype=torch.float32, device=device)
                r_list.append(r)
                memory.push(state, torch.tensor([a], dtype=torch.float32, device=device),
                            next_state, reward)
                break

        rets = []
        return_so_far = 0
        for t in range(len(r_list) - 1, -1, -1):
            return_so_far = r_list[t] + 0.9 * return_so_far
            rets.append(return_so_far)
        # The returns are stored backwards in time, so we need to reverse them
        rets = list(reversed(rets))
        all_rets.extend(rets)
        print("epi {} over".format(epi))
        if epi % 7 == 0:
            arb.optimize(memory, pi_tensors, torch.FloatTensor(all_rets))
            all_rets = []
            memory = ReplayMemory(10000)
        returns.append(sum(r_list))

    return returns
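The backward accumulation of discounted returns inside the episode loop can be factored into a small helper. The sketch below is equivalent to the inline computation above (same fixed discount of 0.9) and is provided only as an illustration.

def discounted_returns(rewards, gamma=0.9):
    # Accumulate returns backwards in time, then reverse so that
    # rets[t] is the discounted return from step t onward
    rets = []
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        rets.append(g)
    return list(reversed(rets))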
Example #27
0
    for i_step in tqdm(range(STEPS_PER_EPOCH)):

        # Take purely random (explorative) actions for an amount of steps,
        # then switch to the noisy policy action
        if i_episode * STEPS_PER_EPOCH + i_step < START_STEPS:
            action = torch.randn(
                env.action_space())  # should be implemented as actionspace BOX
        else:
            action = select_action(observation, ACTION_NOISE)

        # Stepping the Environment
        obs_prime, reward, done, _ = env.step(action)
        episode_reward += reward

        if done:
            print("Got one")

        # push the transition (state, action, reward, next state, done) into the cache
        cache.push(observation.unsqueeze(0), action.unsqueeze(0),
                   reward.unsqueeze(0).float(), obs_prime.unsqueeze(0),
                   done.unsqueeze(0).float())

        # Update to the most recent observation
        observation = obs_prime
        status = optimize_model()

    if status:
        test_policy()

print('Complete')
plt.show()