Example #1
class ConnectionInterface:
  def __init__(self, n_inputs, n_actions, batch_size=128, train_frequency=10, memory_size=10000):
    self.model = Model.get_instance(n_inputs, n_actions)
    self.model.to(device)
    self.memory = ReplayMemory(memory_size)

    self.BATCH_SIZE = batch_size
    self.train_frequency = train_frequency

    self.tick = 0

  def get_action(self, s):
    state = torch.Tensor(s).to(device)
    action = self.model.get_action(state).item()

    return action

  def add_transition(self, s, a, r, ns):
    state = torch.Tensor(s).to(device)
    action = torch.LongTensor([[a]]).to(device)
    reward = torch.Tensor([r]).to(device)
    next_state = torch.Tensor(ns).to(device)

    self.memory.push(state, action, next_state, reward)

    if len(self.memory) >= self.BATCH_SIZE and self.tick % self.train_frequency == 0:
      print('Training')
      batch = self.memory.sample(self.BATCH_SIZE)
      self.model.optimise(batch)

    self.tick = self.tick + 1
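
Every snippet on this page pushes transitions into and samples from a ReplayMemory whose definition is not shown. As a point of reference only, here is a minimal sketch of the interface these examples appear to assume (a fixed-capacity cyclic buffer with push, sample and __len__, in the style of the PyTorch DQN tutorial); the Transition fields are illustrative and each project's real class may differ.

import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    """Fixed-capacity cyclic buffer of transitions (illustrative sketch)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Grow until capacity is reached, then overwrite the oldest transition.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)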
Example #2
class BasePolicy:
    # base class for policy implementation
    def __init__(self, buffer_size, gamma, model, actions_space: gym.Space,
                 summary_writer: SummaryWriter, lr):
        self.gamma = gamma
        self.writer = summary_writer  # use this to log your information to tensorboard
        self.model = model
        self.memory = ReplayMemory(
            capacity=buffer_size
        )  # example for using this memory - in q_policy.py
        self.action_space = actions_space  # you can sample a random action from here. example in q_policy.py

    def select_action(self, state, epsilon, global_step=None):
        # 'global_step' might be used as time-index for tensorboard recordings.
        raise NotImplementedError()

    def optimize(self, batch_size, global_step=None):
        raise NotImplementedError()

    def record(self, state, action, next_state, reward):
        self.memory.push(state, action, next_state, reward)

    def eval(self):
        self.model = self.model.eval()

    def train(self):
        self.model = self.model.train()
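
The comments above point to a q_policy.py that is not included on this page. Purely as an illustration of how the abstract methods are meant to be filled in (this is not the project's actual q_policy.py; the update details are assumptions), an epsilon-greedy Q-policy subclass could look roughly like this:

import random
import torch

class EpsilonGreedyQPolicy(BasePolicy):
    """Illustrative subclass; names and update details are assumptions."""

    def __init__(self, buffer_size, gamma, model, actions_space, summary_writer, lr):
        super().__init__(buffer_size, gamma, model, actions_space, summary_writer, lr)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state, epsilon, global_step=None):
        # Explore with probability epsilon, otherwise take the greedy action.
        if random.random() < epsilon:
            return self.action_space.sample()
        with torch.no_grad():
            return self.model(state).argmax(dim=-1).item()

    def optimize(self, batch_size, global_step=None):
        if len(self.memory) < batch_size:
            return
        batch = self.memory.sample(batch_size)
        # ...unpack the batch, form targets r + gamma * max_a Q(s', a),
        # regress Q(s, a) onto them and step self.optimizer.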
Example #3
def train():
    policy_net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    
    optimizer = RMSprop(policy_net.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    memory = ReplayMemory(MEMORY_SIZE)

    env = Game(N_PLAYERS, LARGEST_CARD, HAND_SIZE, N_ROUNDS)
    select_action = generate_action_selector()

    rewards = []
    for episode in trange(N_EPISODES):
        total_reward = 0
        observation = env.reset()
        done = False

        while not done:
            state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            action = select_action(policy_net, state, observation.hand)

            observation, reward, done, info = env.step(action.item())
            total_reward += reward
            
            if not done:
                next_state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            else:
                next_state = None
            reward = torch.tensor([reward], device=device)
            memory.push(state, action, next_state, reward)
            state = next_state
            
            optimize_model(policy_net, target_net, optimizer, memory)
            if done:
                rewards.append(total_reward)
                break
        
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
        if episode % SAVE_INTERVAL == 0:
            torch.save(target_net.state_dict(), f'models/model_{episode}.pth')
        if episode % 100 == 0:
            plot_rewards(np.cumsum(rewards), baseline=np.zeros(len(rewards)))

    return rewards
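
This loop relies on optimize_model, create_state and generate_action_selector defined elsewhere in that project. For orientation only, a conventional DQN optimize_model consistent with how transitions are stored here (next_state is None on episode end) might look like the sketch below; BATCH_SIZE and GAMMA are assumed hyperparameters, not values from the original code.

import torch
import torch.nn.functional as F

BATCH_SIZE = 128  # assumed
GAMMA = 0.99      # assumed

def optimize_model(policy_net, target_net, optimizer, memory):
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    states, actions, next_states, rewards = zip(*transitions)

    state_batch = torch.cat(states)
    action_batch = torch.cat(actions).view(-1, 1)
    reward_batch = torch.cat(rewards)

    # Q(s, a) for the actions that were actually taken.
    q_values = policy_net(state_batch).gather(1, action_batch).squeeze(1)

    # max_a' Q_target(s', a'); terminal transitions (next_state is None) contribute 0.
    non_final_mask = torch.tensor([s is not None for s in next_states],
                                  dtype=torch.bool, device=state_batch.device)
    next_values = torch.zeros(len(transitions), device=state_batch.device)
    non_final_next = [s for s in next_states if s is not None]
    if non_final_next:
        with torch.no_grad():
            next_values[non_final_mask] = target_net(torch.cat(non_final_next)).max(1)[0]

    targets = reward_batch + GAMMA * next_values
    loss = F.smooth_l1_loss(q_values, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()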
Example #4
File: main.py Project: dt1483/SnFFT
def main(hparams):
    if not os.path.exists(hparams['logdir']):
        os.makedirs(hparams['logdir'])
    savedir = get_logdir(hparams['logdir'], hparams['savename'])
    os.makedirs(savedir)
    sumdir = os.path.join(savedir, 'logs')
    os.makedirs(sumdir)
    logfile = os.path.join(savedir, 'log.txt')
    logger = SummaryWriter(sumdir)

    with open(os.path.join(savedir, 'args.json'), 'w') as f:
        json.dump(hparams, f, indent=4)

    log = get_logger(logfile)
    log.debug('Saving in {}'.format(savedir))
    log.debug('hparams: {}'.format(hparams))

    torch.manual_seed(hparams['seed'])
    random.seed(hparams['seed'])

    alpha = eval(hparams['alpha'])
    parts = eval(hparams['parts'])
    log.info('alpha: {} | parts: {}'.format(alpha, parts))
    size = IRREP_SIZE[(alpha, parts)]
    pol_net = IrrepLinreg(size * size)
    targ_net = IrrepLinreg(size * size)

    if not hparams['init']:
        log.info('Loading fourier')
        pol_net.loadnp(NP_IRREP_FMT.format(str(alpha), str(parts)))
        targ_net.loadnp(NP_IRREP_FMT.format(str(alpha), str(parts)))
    else:
        pol_net.init(hparams['init'])
        targ_net.init(hparams['init'])
        log.info('Init model using mode: {}'.format(hparams['init']))

    if hparams['noise']:
        log.info('Adding noise: {}'.format(hparams['noise']))
        mu = torch.zeros(pol_net.wr.size())
        std = torch.zeros(pol_net.wr.size()) + hparams['noise']
        wr_noise = torch.normal(mu, std)
        wi_noise = torch.normal(mu, std)
        pol_net.wr.data.add_(wr_noise)
        pol_net.wi.data.add_(wi_noise)

        wr_noise = torch.normal(mu, std)
        wi_noise = torch.normal(mu, std)
        targ_net.wr.data.add_(wr_noise)
        targ_net.wi.data.add_(wi_noise)

    env = Cube2IrrepEnv(alpha, parts, solve_rew=hparams['solve_rew'])
    log.info('env solve reward: {}'.format(env.solve_rew))
    if hparams['opt'] == 'sgd':
        log.info('Using sgd')
        optimizer = torch.optim.SGD(pol_net.parameters(),
                                    lr=hparams['lr'],
                                    momentum=hparams['momentum'])
    elif hparams['opt'] == 'rms':
        log.info('Using rmsprop')
        optimizer = torch.optim.RMSprop(pol_net.parameters(),
                                        lr=hparams['lr'],
                                        momentum=hparams['momentum'])
    memory = ReplayMemory(hparams['capacity'])
    if hparams['meminit']:
        init_memory(memory, env)
    niter = 0
    nupdates = 0
    totsolved = 0
    solved_lens = []
    rewards = np.zeros(hparams['logint'])

    log.info('Before any training:')
    val_avg, val_prop, val_time, solve_lens = val_model(pol_net, env, hparams)
    log.info(
        'Validation | avg solve length: {:.4f} | solve prop: {:.4f} | time: {:.2f}s'
        .format(val_avg, val_prop, val_time))
    log.info(
        'Validation | LQ: {:.3f} | MQ: {:.3f} | UQ: {:.3f} | Max: {}'.format(
            np.percentile(solve_lens, 25), np.percentile(solve_lens, 50),
            np.percentile(solve_lens, 75), max(solve_lens)))
    scramble_lens = []
    for e in range(hparams['epochs']):
        if hparams['curric']:
            dist = curriculum_dist(hparams['max_dist'], e, hparams['epochs'])
        else:
            dist = hparams['max_dist']
        state = env.reset_fixed(max_dist=dist)
        epoch_rews = 0
        scramble_lens.append(dist)

        for i in range(hparams['maxsteps']):
            if hparams['norandom']:
                action = get_action(env, pol_net, state)
            elif random.random() < explore_rate(
                    e, hparams['epochs'] * hparams['explore_proportion'],
                    hparams['eps_min']):
                action = random.randint(0, env.action_space.n - 1)
            else:
                action = get_action(env, pol_net, state)

            ns, rew, done, _ = env.step(action, irrep=False)
            memory.push(state, action, ns, rew, done)
            epoch_rews += rew
            state = ns
            niter += 1

            if (not hparams['noupdate']
                ) and niter > 0 and niter % hparams['update_int'] == 0:
                sample = memory.sample(hparams['batch_size'])
                _loss = update(env, pol_net, targ_net, sample, optimizer,
                               hparams, logger, nupdates)
                logger.add_scalar('loss', _loss, nupdates)
                nupdates += 1

            if done:
                solved_lens.append(i + 1)
                totsolved += 1
                break

        rewards[e % len(rewards)] = epoch_rews
        logger.add_scalar('reward', epoch_rews, e)

        if e % hparams['logint'] == 0 and e > 0:
            val_avg, val_prop, val_time, _ = val_model(pol_net, env, hparams)
            logger.add_scalar('last_{}_solved'.format(hparams['logint']),
                              len(solved_lens) / hparams['logint'], e)
            if len(solved_lens) > 0:
                logger.add_scalar(
                    'last_{}_solved_len'.format(hparams['logint']),
                    np.mean(solved_lens), e)
            logger.add_scalar('val_solve_avg', val_avg, e)
            logger.add_scalar('val_prop', val_prop, e)
            log.info(
                '{:7} | dist: {:4.1f} | avg rew: {:5.2f} | solve prop: {:5.3f}, len: {:5.2f} | exp: {:.2f} | ups {:7} | val avg {:.3f} prop {:.3f}'
                .format(
                    e,
                    np.mean(scramble_lens),
                    np.mean(rewards),
                    len(solved_lens) / hparams['logint'],
                    0 if len(solved_lens) == 0 else np.mean(solved_lens),
                    explore_rate(
                        e, hparams['epochs'] * hparams['explore_proportion'],
                        hparams['eps_min']),
                    nupdates,
                    val_avg,
                    val_prop,
                ))
            solved_lens = []
            scramble_lens = []

        if e % hparams['updatetarget'] == 0 and e > 0:
            targ_net.load_state_dict(pol_net.state_dict())

    log.info('Total updates: {}'.format(nupdates))
    log.info('Total solved: {:8} | Prop solved: {:.4f}'.format(
        totsolved, totsolved / hparams['epochs']))
    logger.export_scalars_to_json(os.path.join(savedir, 'summary.json'))
    logger.close()
    torch.save(pol_net, os.path.join(savedir, 'model.pt'))
    check_memory()

    hparams['val_size'] = 10 * hparams['val_size']
    val_avg, val_prop, val_time, _ = val_model(pol_net, env, hparams)
    log.info(
        'Validation avg solve length: {:.4f} | solve prop: {:.4f} | time: {:.2f}s'
        .format(val_avg, val_prop, val_time))
Example #5
# should be unified when running in the server: which pkl file
memory = ReplayMemory(n_episode * n_agents * max_steps)

use_cuda = pt.cuda.is_available()

for i in range(n_episode):
    data1 = pickle.load(pkl_file)
    data2 = pickle.load(pkl_file)
    data3 = pickle.load(pkl_file)
    print('episode is %d' % (i))
    for j in range(max_steps):
        #for k in range(n_agents):
        tmp_whole_obs = data1[j]
        tmp_whole_act = data2[j]
        memory.push(tmp_whole_obs, tmp_whole_act, '', '', '')

loss_func = pt.nn.MSELoss().cuda()


class meta_actor(pt.nn.Module):
    def __init__(self, dim_observation, dim_action):
        # print('model.dim_action',dim_action)
        super(meta_actor, self).__init__()
        self.FC1 = pt.nn.Linear(dim_observation, 500)
        self.FC2 = pt.nn.Linear(500, 128)
        self.FC3 = pt.nn.Linear(128, dim_action)

    def forward(self, obs):
        result = F.relu(self.FC1(obs))
        result = F.relu(self.FC2(result))
        result = self.FC3(result)  # FC3 maps the hidden features to the action dimension
        return result

def main():
    # training loop
    # s_memory = ReplayMemory(capacity)
    memory = ReplayMemory(capacity)
    states = env.reset()
    episode = 0
    prev_states = np.concatenate([np.zeros([16, 112]),
                                  np.zeros([16, 112])]).reshape(-1, 4, 112)
    prev_reward = np.concatenate([np.zeros([16]),
                                  np.zeros([16])]).reshape(-1, 4, 1)
    prev_action_striker = np.zeros([16])
    prev_action_goalie = np.zeros([16])
    prev_action_striker = prev_action_striker.reshape(-1, 2, 1)
    prev_action_goalie = prev_action_goalie.reshape(-1, 2, 1)
    prev_action = np.concatenate([prev_action_striker, prev_action_goalie],
                                 axis=1)

    while episode < max_episodes:

        action_striker = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        action_goalie = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

        t1 = time.time()
        # if episode < 20:
        # action_striker = np.random.randint(7, size = [16])
        # action_goalie = np.random.randint(5, size = [16])
        # action_striker = np.array(action_striker)
        # action_goalie = np.array(action_goalie)
        # else:
        action_striker, action_goalie = Maddpg_.select_action(
            states[0], states[1])
        action_striker = np.argmax(action_striker.cpu().detach().numpy(),
                                   axis=1)
        action_goalie = np.argmax(action_goalie.cpu().detach().numpy(), axis=1)
        t2 = time.time()
        print(action_striker)
        print('action require: %f s' % (t2 - t1))
        states, reward, done, _ = env.step(action_striker,
                                           action_goalie,
                                           order="field")

        states_temp = deepcopy(states)
        states_temp[0] = states_temp[0].reshape(-1, 2, 112)
        states_temp[1] = states_temp[1].reshape(-1, 2, 112)
        states_temp = np.concatenate([states_temp[0], states_temp[1]], axis=1)

        memory.push(prev_states, states_temp, prev_action, prev_reward)
        t1 = time.time()
        loss_a, loss_c = Maddpg_.update_policy(memory)
        t2 = time.time()
        print(loss_a, loss_c)
        print('Update require: %f s' % (t2 - t1))

        prev_states, prev_reward, prev_action_striker, prev_action_goalie = states, reward, action_striker, action_goalie

        arg_done = np.argwhere(done[0] == True)
        prev_states[0][arg_done] = np.zeros([112])
        prev_states[1][arg_done] = np.zeros([112])
        prev_reward[0][arg_done] = 0
        prev_reward[1][arg_done] = 0
        prev_action_striker[arg_done] = 0
        prev_action_goalie[arg_done] = 0

        prev_states[0] = prev_states[0].reshape(-1, 2, 112)
        prev_states[1] = prev_states[1].reshape(-1, 2, 112)
        prev_states = np.concatenate([prev_states[0], prev_states[1]], axis=1)

        prev_reward[0] = prev_reward[0].reshape(-1, 2, 1)
        prev_reward[1] = prev_reward[1].reshape(-1, 2, 1)
        prev_reward = np.concatenate([prev_reward[0], prev_reward[1]], axis=1)

        prev_action_striker = prev_action_striker.reshape(-1, 2, 1)
        prev_action_goalie = prev_action_goalie.reshape(-1, 2, 1)
        prev_action = np.concatenate([prev_action_striker, prev_action_goalie],
                                     axis=1)

        if True in env.done_goalie:
            #     print("episode: ", episode, "*" * 10)
            #     # print(reward)
            #     # arg_done_goalie = np.argwhere(done_goa == True)
            #     if len(arg_done_goalie) == 2:
            #         print("arg_done_goalie", arg_done_goalie)

            #     for i in arg_done_goalie:
            #         # print("goalie %d"%(i[0]))
            #         # print("action", env.act_goalie_hist[i[0]])
            #         # print("Observation", env.observation_goalie_hist[i[0]])
            #         # print("reword", env.episode_goalie_rewards[i][0])
            #         pass
            #     arg_done_str = np.argwhere(done_goa == True)
            #     if len(arg_done_goalie) == 2:
            #         print("arg_done_str", arg_done_str)

            #     for i in arg_done_str:
            #         # print("str %d"%(i[0]))
            #         # print("action", env.act_striker_hist[i[0]])
            #         # print("Observation", env.observation_striker_hist[i[0]])
            #         # print("reword", env.episode_striker_rewards[i][0])
            #         pass
            #     # env.reset_some_agents(arg_done_str, arg_done_goalie)
            episode += 1
Example #7
"""
randomize state push in memory
before main loop start
"""
global_count = 0
episode = 0
while True:

    episode += 1
    T = 0
    state = env.reset()
    while T < args.max_step:
        action = random.randrange(0, args.action_space)
        next_state, reward, done, _ = env.step(action)
        memory.push([state, action, reward, next_state, done])
        state = next_state
        T += 1
        global_count += 1
        if done:
            break
    print("\r push : %d/%d  " % (global_count, args.learn_start),
          end='\r',
          flush=True)
    #    print("\r push : ",global_count,'/',args.learn_start,end='\r',flush=True)

    if global_count > args.learn_start:
        break

print('')
"""
Example #8
class Agent:
    def __init__(self, env, logger, gamma, start_learning, memory_size,
                 batch_size, target_update_step, policy_update_step,
                 max_episode_step, init_epsilon, epsilon_minimum,
                 epsilon_decay_rate, epsilon_decay_step, learning_rate,
                 n_episodes, n_actions, hidden_dim, print_interval,
                 policy_path, start_date):

        self.env = env
        self.gamma = gamma
        self.start_learning = start_learning
        self.batch_size = batch_size
        self.target_update_step = target_update_step
        self.policy_update_step = policy_update_step
        self.max_episode_step = max_episode_step
        self.epsilon_decay_rate = epsilon_decay_rate
        self.epsilon_decay_step = epsilon_decay_step
        self.n_episodes = n_episodes
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.n_actions = n_actions
        self.print_interval = print_interval
        self.start_date = start_date

        if policy_path:
            self.policy_net = torch.load(policy_path)
        else:
            self.policy_net = MLPPolicy(hidden_dim, n_actions,
                                        env.state_shape).float().to(self.device)
        self.target_net = MLPPolicy(hidden_dim, n_actions,
                                    env.state_shape).float().to(self.device)
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(),
                                          lr=learning_rate)
        self.memory = ReplayMemory(memory_size, env.state_shape)
        self.logger = logger
        self.epsilon = init_epsilon
        self.epsilon_minimum = epsilon_minimum

        self.memory_cache = ReplayMemory(self.max_episode_step,
                                         env.state_shape)

    def experience_replay(self, DEBUG=False):
        # Skip training DQN model if there are not enough saved transitions in the memory buffer
        # to give a input batch.
        if len(self.memory) < self.batch_size:
            # Return a loss value = 0 to notice that training is not yet started (only for logging)
            return torch.FloatTensor([0])

        # state batch shape: (B, N_STATES)
        # action batch shape: (B, 1)
        # reward batch shape: (B)
        state_batch, action_batch, reward_batch, next_state_batch = self.memory.sample(
            self.batch_size)

        # shape: (B)
        if DEBUG:
            print("State batch: \n", state_batch, "type: ",
                  state_batch.type())  # # torch.FloatTensor
            print("Action batch: \n", action_batch, "type: ",
                  action_batch.type())  # torch.LongTensor
            print("Reward batch: \n", reward_batch, "type: ",
                  reward_batch.type())  # torch.FloatTensor
            print("-----")

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch).view(self.batch_size)
        if DEBUG:
            print("Predicted Q values (LHS) = Q(s,a)")
            print("= ", state_action_values)
            print("type: ", state_action_values.type())  # torch.FloatTensor

        # RHS: r + gamma * max_a'( Q(s',a') )
        next_state_values = self.target_net(
            torch.FloatTensor(next_state_batch).to(self.device))
        if torch.isnan(next_state_values).any():
            next_state_values = torch.nan_to_num(next_state_values)
        next_state_values = torch.max(next_state_values, dim=1).values
        next_state_values = next_state_values.view(self.batch_size)
        # breakpoint()

        # expected_state_action_values :
        #     target Q values = r + gamma * max_a'( Q(s',a') )
        expected_state_action_values = (reward_batch +
                                        (self.gamma * next_state_values)).view(
                                            self.batch_size)
        if DEBUG:
            print("Target Q values (RHS) = r + gamma * max_a'( Q(s',a') )")
            print("= ", expected_state_action_values)
            print("type: ",
                  expected_state_action_values.type())  # torch.FloatTensor

        # Update
        loss = F.mse_loss(state_action_values, expected_state_action_values)
        if torch.isnan(loss):
            breakpoint()
        # Update of DQN network weights
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            # Gradients are clipped within range [-1,1], to prevent exploding magnitude of gradients
            # and failure of training.
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if DEBUG:
            print("Loss: ", loss)
            print("===== End of Experience Replay =====")
        # Return the computed loss value (for logging outside this function)
        return loss

    def get_epsilon(self, global_step):
        if global_step <= self.epsilon_decay_step and self.epsilon > self.epsilon_minimum:
            self.epsilon *= self.epsilon_decay_rate

    def select_action(self, state):
        """
        Input(s) :
        - policy_net: Policy DQN for predicting Q values (for Exploitation)
        - state: current state for predicting Q values (for Exploitation)
        - epsilon: exploration probability
        - params: dictionary of global parameters, expecting:
                  - params["N_ACTIONS"]: number of possible actions
        Output(s) :
        - action: action to be taken, a tensor with type long and shape (1,1)
        """
        while True:
            if random.random() <= self.epsilon:
                # With prob. epsilon
                action = random.randrange(0, self.n_actions, 1)
                action = torch.LongTensor([[action]]).to(self.device)

            else:
                # With prob. 1 - epsilon,
                # (Exploitation) select action with max predicted Q-Values of current state.

                with torch.no_grad():
                    action = torch.argmax(
                        self.policy_net(state)).unsqueeze(0).unsqueeze(0).to(
                            self.device)

            # The agent can only sell stocks when it is holding some;
            # Similarly, it can only buy stocks when it's holding nothing
            # action = 2 >> buy, action = 1 >> no sell no buy, action = 0 >> sell
            # Only valid actions can be returned.
            if self.env.holding_stocks and action in [0, 1]:
                break
            elif not self.env.holding_stocks and action in [1, 2]:
                break

        return action

    def train(self):
        self.policy_net.train()  # Set Policy DQN model as train mode
        start_time = time()  # Timer
        global_steps = 0
        for episode in range(self.n_episodes):
            # Initialize the environment, get initial state
            # you can change the beginning date here
            state = self.env.reset(date=self.start_date)
            # preprocess state
            state = preprocess_state(state, self.device)

            # Logging for current episode
            done = None  # To mark if current episode is done
            episode_reward = 0  # Sum of rewards received in current episode
            episode_step = 0  # Cumulative steps in current episode
            loss_meter = AverageMeter()

            # Loop till end of episode (done = True or when step reaches max)
            while not done and episode_step < self.max_episode_step:
                self.get_epsilon(global_steps)

                action = self.select_action(state)

                next_state, reward, done = self.env.step(action[0][0].item())

                if not done:
                    # preprocess next_state
                    next_state = preprocess_state(next_state, self.device)
                else:
                    next_state = [None]

                self.memory_cache.push(state, action, [reward], next_state)

                if reward is not None:
                    self.memory_cache.process_reward()
                    push_length = self.memory_cache.position
                    self.memory.push(
                        self.memory_cache.state[:push_length],
                        self.memory_cache.action[:push_length],
                        self.memory_cache.reward[:push_length],
                        self.memory_cache.next_state[:push_length])
                    self.memory_cache.reset()

                    loss = self.experience_replay(DEBUG=False)

                    loss_meter.update(loss.item())

                if global_steps % self.target_update_step == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

                # Update training results at the end of episode.
                state = next_state
                global_steps += 1
                episode_step += 1
                if reward:
                    episode_reward += reward

            # Logging after an episode
            end_time = time()

            self.logger.record({
                'reward': episode_reward,
                'loss': loss_meter.avg
            })

            # Print out logging messages
            if episode % self.print_interval == 0:
                print("====================")
                print(f"Episode {episode}")
                print("Time: ", end_time - start_time)
                print("Global Steps: ", global_steps)
                print("Epsilon: ", self.epsilon)
                print("Loss: ", loss_meter.avg)
                print("Reward: ", episode_reward)
                print("====================")

        avg_reward = self.logger.get_avg_reward()
        self.logger.save_model(self.policy_net)
        return avg_reward
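
A hypothetical way to wire this agent up (every constructor argument and the start date below are illustrative placeholders, and env, logger and MLPPolicy come from the surrounding project, so none of these values are taken from the original code):

agent = Agent(env=env, logger=logger, gamma=0.99, start_learning=1000,
              memory_size=100000, batch_size=64, target_update_step=500,
              policy_update_step=1, max_episode_step=500, init_epsilon=1.0,
              epsilon_minimum=0.05, epsilon_decay_rate=0.995,
              epsilon_decay_step=10000, learning_rate=1e-3, n_episodes=300,
              n_actions=3, hidden_dim=128, print_interval=10,
              policy_path=None, start_date='2015-01-01')
avg_reward = agent.train()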
Example #9
class Initializer():
    def __init__(self): 
        self.seed = 2
        self.use_cuda = True
        self.replay_size = 1000000
        self.gamma = 0.99
        self.tau = 1e-3
        self.device = torch.device('cuda')
        self.max_iters = 10000000
        self.batch_size = 256+1 
        self.results_path = 'placeholder'
        self.statistic_dir = os.path.join(self.results_path, 'statistics/')
        self.gpu_id = 0
        
        torch.cuda.set_device(self.gpu_id)

        #if folder do not exists, create it
        os.makedirs(self.statistic_dir, exist_ok=True)

        self.metrics = {'steps': [], 'episodes': [], 'train_rewards': [], 'test_rewards': [], 'actor_loss': [], 'critic_loss': [], 'test_episodes': []} 
        

    
    def start(self):
        self.set_seed()
        self.env = ControlSuite('walker-walk', 2, 1000)
        self.max_iters = 1000
        
        self.agent = DDPG(self.gamma, self.tau,self.env.state_space(),self.env,self.device, self.results_path)
        # Initialize replay memory
        self.memory = ReplayMemory(int(self.replay_size))
        self.list_total_rewards = []
        self.list_iter = []
        self.step = 0
        self.current_episode = 0
        self.checkpoint_interval = 100
        self.train()

    
    def train(self):
        for episode in tqdm(range(self.max_iters) ):
            self.metrics['episodes'].append(self.current_episode)
            self.explore_and_collect(self.current_episode)

            if (self.current_episode % self.checkpoint_interval) == 0:
                self.test(self.current_episode)
                self.save_checkpoint()

            self.current_episode += 1
    

    def explore_and_collect(self, iter):
        state = torch.Tensor([self.env.reset()]).cpu()
        done = False
        total_reward = 0

        while not done:
            self.metrics['steps'] = self.step
            self.step += 1
            action = self.agent.get_action(state,iter, action_noise=False)
            next_state, reward, done, _ = self.env.step(action.cpu().numpy()[0])

            mask = torch.Tensor([done]).to(self.device)
            reward = torch.Tensor([reward]).to(self.device)
            next_state = torch.Tensor([next_state]).cpu()
            total_reward += reward

            self.memory.push(state, action, mask, next_state, reward)
            state = next_state

            if len(self.memory) > self.batch_size:
                self.fit_buffer()
            
            if (self.step%100) == 0:
                self.agent.hard_swap()

        #print("iter: ", iter, " total_reward: ", total_reward)
        #self.list_iter.append(iter)
        #self.list_total_rewards.append(total_reward.cpu())
        #plt.plot(self.list_iter, self.list_total_rewards)
        #plt.show()
        #plt.savefig('reward.png')
        self.metrics['train_rewards'].append(total_reward.item())
        self.lineplot(self.metrics['episodes'][-len(self.metrics['train_rewards']):], self.metrics['train_rewards'], 'train_rewards', self.statistic_dir)
        self.lineplot(self.metrics['episodes'][-len(self.metrics['actor_loss']):], self.metrics['actor_loss'], 'actor_loss', self.statistic_dir)
        self.lineplot(self.metrics['episodes'][-len(self.metrics['critic_loss']):], self.metrics['critic_loss'], 'critic_loss', self.statistic_dir)
        torch.save(self.metrics, os.path.join(self.statistic_dir , 'metrics.pth'))

    def save_checkpoint(self):
        self.agent.store_model()
        
    def load_checkpoint(self):
        self.agent.load_model()    
        self.metrics = torch.load(os.path.join(self.statistic_dir, 'metrics.pth'))
        self.current_episode = self.metrics['episodes'][-1]


    def fit_buffer(self):
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Update actor and critic according to the batch
        actor_loss, critic_loss = self.agent.update_params(batch)
        self.metrics['actor_loss'].append(actor_loss)
        self.metrics['critic_loss'].append(critic_loss)

    def test(self, episode):
        
        state = self.env.reset()
        state = torch.Tensor([state]).to(self.device)
        total_reward = 0
        done = False
        i = 0 
        while not done:
            action = self.agent.get_action(state, episode, action_noise=False)
            next_state, reward, done, _ = self.env.step(action.cpu().numpy()[0])

            mask = torch.Tensor([done]).to(self.device)
            reward = torch.Tensor([reward]).to(self.device)
            next_state = torch.Tensor([next_state]).to(self.device)
            total_reward += reward
            state = next_state
            i +=1

        print("Result of test: ", total_reward)
        #self.agent.train_mode()
        self.metrics['test_rewards'].append(total_reward.item())
        self.metrics['test_episodes'].append(episode)
        self.lineplot(self.metrics['test_episodes'][-len(self.metrics['test_rewards']):], self.metrics['test_rewards'], 'test_rewards', self.statistic_dir)


    # Plots min, max and mean + standard deviation bars of a population over time
    def lineplot(self, xs, ys_population, title, path='', xaxis='episode'):
        max_colour, mean_colour, std_colour, transparent = 'rgb(0, 132, 180)', 'rgb(0, 172, 237)', 'rgba(29, 202, 255, 0.2)', 'rgba(0, 0, 0, 0)'

        if isinstance(ys_population[0], (list, tuple)):
            ys = np.asarray(ys_population, dtype=np.float32)
            ys_min, ys_max, ys_mean, ys_std, ys_median = ys.min(1), ys.max(1), ys.mean(1), ys.std(1), np.median(ys, 1)
            ys_upper, ys_lower = ys_mean + ys_std, ys_mean - ys_std

            trace_max = Scatter(x=xs, y=ys_max, line=Line(color=max_colour, dash='dash'), name='Max')
            trace_upper = Scatter(x=xs, y=ys_upper, line=Line(color=transparent), name='+1 Std. Dev.', showlegend=False)
            trace_mean = Scatter(x=xs, y=ys_mean, fill='tonexty', fillcolor=std_colour, line=Line(color=mean_colour), name='Mean')
            trace_lower = Scatter(x=xs, y=ys_lower, fill='tonexty', fillcolor=std_colour, line=Line(color=transparent), name='-1 Std. Dev.', showlegend=False)
            trace_min = Scatter(x=xs, y=ys_min, line=Line(color=max_colour, dash='dash'), name='Min')
            trace_median = Scatter(x=xs, y=ys_median, line=Line(color=max_colour), name='Median')
            data = [trace_upper, trace_mean, trace_lower, trace_min, trace_max, trace_median]
        else:
            data = [Scatter(x=xs, y=ys_population, line=Line(color=mean_colour))]
        plotly.offline.plot({
            'data': data,
            'layout': dict(title=title, xaxis={'title': xaxis}, yaxis={'title': title})
        }, filename=os.path.join(path, title + '.html'), auto_open=False)

    def set_seed(self):
        print("Setting seed")
        os.environ['PYTHONHASHSEED']=str(self.seed)
        random.seed(self.seed)
        #torch.random.seed()
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
Example #10
class QAgent(Agent):
    def __init__(self):
        self.fex = Extractor()
        self.net = DQN()
        try:
            self.net.load_state_dict(torch.load('model.pth', map_location=torch.device('cpu')))
        except FileNotFoundError:
            # No saved checkpoint yet; keep the freshly initialised weights.
            print("Starting with new weights")
        self.net.eval()
        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.net.parameters())
        self.memory = ReplayMemory()
        self.training = False

        self.s = None
        self.a = None
        self.score = None

    def registerInitialState(self, state):
        self.s = None
        self.a = None
        self.score = None

    def getAction(self, game_state):
        legal = game_state.getLegalPacmanActions()
        if Directions.STOP in legal: legal.remove(Directions.STOP)
        state = self.fex(game_state)
        if self.training:
            state = state.cuda()
        with torch.no_grad():
            scores = self.net(state)
        scores = list(zip(ACTIONS, scores))
        legal_scores = [p for p in scores if p[0] in legal]
        action = max(legal_scores, key = lambda p: p[1])[0]

        if self.training:
            if random.random() < EPSILON:
                action = random.choice(legal)
            if self.s is not None:
                reward = game_state.getScore() - self.score
                reward = process_reward(self.s, state, reward)
                next_legals = game_state.getLegalActions()
                if Directions.STOP in next_legals: next_legals.remove(Directions.STOP)
                next_legals = (ACTION_MAP[d] for d in next_legals)
                self.memory.push(self.s, self.a, reward, state, next_legals)
            self.s = state
            self.a = ACTION_MAP[action]
            self.score = game_state.getScore()
        return action

    def final(self, state):
        if self.training:
            reward = state.getScore() - self.score
            reward = -10
            self.memory.push(self.s, self.a, reward, None, [])


    def train(self):
        global EPSILON
        self.training = True
        self.net.cuda()
        runners, names = load_runners()

        for epoch in range(EPOCHS):
            for t in self.net.parameters():
                print(t.data)
            if epoch <= 4:
                EPSILON = [0.8, 0.5, 0.3, 0.1, 0.01][epoch]
            print('Epoch {} | EPSILON {}'.format(epoch, EPSILON))
            g_dict = {}

            for runner, name in zip(runners, names):
                games = []
                for game_idx in range(GAMES_PER_EPOCH):
                    game = runner.run_game(self)
                    games.append(game)
                    for _ in range(SAMPLES_PER_GAME):
                        self.training_iteration()

                avg = np.mean([game.state.getScore() for game in games])
                wins = sum([game.state.isWin() for game in games])
                #print(f'{name}: {avg:0.2f} | {wins}/{GAMES_PER_EPOCH}')
                print('{}: {} | {}/{}'.format(name,avg, wins, GAMES_PER_EPOCH))
            print()
            torch.save(self.net.state_dict(), 'model.pth')


    def training_iteration(self):
        # sample mini-batch
        sarsl = self.memory.sample()
        if sarsl is None:
            return
        else:
            states, actions, rewards, next_states, next_state_legals = sarsl

        # replace deaths (None) with zeros
        for i, s in enumerate(next_states):
            if s is None:
                next_states[i] = self.fex.empty()
        next_states = torch.stack(next_states) 
        # get max Q(s',a'); deaths get value 0
        with torch.no_grad():
            next_actions_values = self.net(next_states)
            best_actions_values = []
            for next_legals, action_vals in zip(next_state_legals, next_actions_values):
                legal_vals = [v for (idx,v) in enumerate(action_vals) if idx in next_legals]
                if legal_vals == []:
                    legal_vals = [0]
                best_actions_values.append(max(legal_vals))
            best_actions_values = torch.tensor(best_actions_values).cuda()
        
            # compute target values
            targets = rewards + GAMMA*best_actions_values

        # compute current action values
        actions = actions.reshape(len(actions), 1)
        self.net.train()
        action_values = self.net(states).gather(1, actions).reshape(-1)
        self.net.eval()
        
        # compute loss and backpropagate it
        loss = self.criterion(targets, action_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def play(self, path):
        runner = LocalPacmanGameRunner(layout_path=path,
                                       random_ghosts=True,
                                       show_window=True,
                                       zoom_window=1.0,
                                       frame_time=0.1,
                                       timeout=-1000)
        game = runner.run_game(self)
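
A hypothetical driver for this agent (the layout path is a placeholder; train() assumes a CUDA device because the network is moved to the GPU):

agent = QAgent()
agent.train()                            # self-play over the loaded runners
agent.play('layouts/mediumClassic.lay')  # watch one game with the trained weights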
Example #11
class PPOAgent(object):
    def __init__(self, env, lr, hist_size=8, train_step=1024, trainable=True):

        self.filters1 = 16
        self.filters2 = 32
        self.filters3 = 64
        self.lr = lr
        self.hist_size = hist_size
        self.train_step = train_step
        self.clip_param = 0.1
        self.clip_param_end = 0.03
        self.clip_param_schedule = 1000000
        self.eps_denom = 1e-8
        self.episodes = 10000000
        self.save_frame = 50000
        self.evaluation_reward_length = 100
        self.epochs = 3
        self.num_epochs_trained = 0
        self.discount_factor = 0.99
        self.lam = 0.95
        self.batch_size = 32

        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_schedule = 1000000

        self.env = env
        nonspatial_act_size, spatial_act_depth = env.action_space
        self.nonspatial_act_size, self.spatial_act_depth = env.action_space
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.net = models.GraphConvNet(nonspatial_act_size, spatial_act_depth,
                                       self.device).to(self.device)
        self.target_net = models.GraphConvNet(nonspatial_act_size,
                                              spatial_act_depth,
                                              self.device).to(self.device)

        self.memory = ReplayMemory(self.train_step, self.hist_size,
                                   self.batch_size)
        self.optimizer = optim.Adam(params=self.net.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()

        self.c1 = 1.0
        self.c2 = 0.2

        ### scaling constants for spatial and nonspatial entropy
        self.c3 = 0.1
        self.c4 = 1.0

        self.averages = []

    def update_target_net(self):
        self.target_net.load_state_dict(self.net.state_dict())

    def load_saved_model(self):
        self.net.load_state_dict(
            torch.load("save_model/Starcraft2" + self.env.map + "PPO"))
        self.update_target_net()

    def train(self, training=True):

        evaluation_reward = deque(maxlen=self.evaluation_reward_length)

        ### Keep track of average episode rewards, episode values
        rewards, episodes = [], []

        ### Keeps track of number of frames seen by agent training
        frame = 0

        for e in range(self.episodes):

            done = False
            score = 0

            ### Stores previous output of LSTM
            LSTM_hidden = self.net.init_hidden(1, use_torch=False)

            ### Keeps track of length of current game
            step = 0
            score = 0

            state, reward, done, info = self.env.reset()
            action = [np.array([[0, 0], [0, 0]]), 0]
            value = 0
            r = 0
            G, X, avail_actions = state
            _select_next = True

            while not done:
                epsilon = self.epsilon_min + max(
                    0, (self.epsilon_max - self.epsilon_min) *
                    (1 - (frame / self.epsilon_schedule)))
                # Handle selection, edge cases

                if (not info['friendly_units_present']):
                    print("hello")
                    state, reward, done, info = self.env.step(0)
                    continue

                step += 1
                frame += 1

                prev_LSTM = LSTM_hidden
                prev_action = utils.action_to_onehot(
                    action, GraphConvConfigMinigames.action_space,
                    GraphConvConfigMinigames.spatial_width)

                ### Select action, value

                _, _, value, LSTM_hidden, action = self.net(
                    np.expand_dims(G, 1),
                    np.expand_dims(X, 1),
                    avail_actions,
                    LSTM_hidden,
                    np.expand_dims(prev_action, 1),
                    epsilon=epsilon,
                    choosing=True)
                value = value.cpu().data.numpy().item()
                LSTM_hidden = LSTM_hidden.cpu().data.numpy()

                spatial_action, nonspatial_action = action

                #print(action)
                ### Env step

                state, reward, done, info = self.env.step(
                    nonspatial_action, spatial_action[0], spatial_action[1])
                G, X, avail_actions = state
                action = [np.array(spatial_action), nonspatial_action]
                score += reward
                ### Append state to history
                #history.append(state)

                push_state = [G, X, avail_actions, prev_LSTM]

                ### Store transition in memory
                if (score == 0 and done):
                    reward -= 100
                    score -= 100
                self.memory.push(push_state, action, reward, done, value, 0, 0,
                                 step)

                ### Start training after random sample generation

                if (frame % self.train_step == 0 and frame != 0 and training):
                    prev_action = utils.action_to_onehot(
                        action, GraphConvConfigMinigames.action_space,
                        GraphConvConfigMinigames.spatial_width)
                    _, _, frame_next_val, _, _ = self.net(
                        np.expand_dims(G, 1), np.expand_dims(X, 1),
                        avail_actions, LSTM_hidden,
                        np.expand_dims(prev_action, 1))
                    frame_next_val = frame_next_val.cpu().data.numpy().item()
                    clip_param = self.clip_param_end + (
                        self.clip_param - self.clip_param_end) * max(
                            0, 1 - (frame / self.clip_param_schedule))
                    self.train_policy_net_ppo(frame, frame_next_val, epsilon,
                                              clip_param)

                    self.update_target_net()

                ### Save model, print time, record information
                if (frame % self.save_frame == 0):
                    #print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    plt.plot(episodes, rewards, 'r')
                    plt.savefig("save_model/Starcraft2" + self.env.map +
                                "PPOgraph.png")
                    torch.save(self.net.state_dict(),
                               "save_model/Starcraft2" + self.env.map + "PPO")

                ### Handle end of game logic
                if done:
                    evaluation_reward.append(score)
                    print("episode:", e, "  score:", score, "  steps:", step,
                          "  evaluation reward:", np.mean(evaluation_reward))
                    #state, reward, done, _ = self.env.reset()
                    self.averages.append(np.mean(evaluation_reward))
                    self.plot_results()

                G, X, avail_actions = state

    ### Main training logic
    def train_policy_net_ppo(self, frame, frame_next_val, epsilon, clip_param):

        for param_group in self.optimizer.param_groups:
            curr_lr = param_group['lr']
        print(
            "\n\n ------- Training network. lr: %f. clip: %f. epsilon: %f ------- \n\n"
            % (curr_lr, clip_param, epsilon))

        ### Compute value targets and advantage for all frames
        self.memory.compute_vtargets_adv(self.discount_factor, self.lam,
                                         frame_next_val)

        ### number of iterations of batches of size self.batch_size. Should divide evenly
        num_iters = int(len(self.memory) / self.batch_size)
        device = self.device
        ### Do multiple epochs
        for i in range(self.epochs):

            pol_loss = 0.0
            vf_loss = 0.0
            ent_total = 0.0

            self.num_epochs_trained += 1

            for j in range(num_iters):

                mini_batch = self.memory.sample_mini_batch(
                    frame, self.hist_size)
                mini_batch = np.array(mini_batch).transpose()

                states = np.stack(mini_batch[0], axis=0)
                G_states = np.stack(states[:, 0], axis=0)
                X_states = np.stack(states[:, 1], axis=0)
                avail_states = np.stack(states[:, 2], axis=0)
                hidden_states = np.concatenate(states[:, 3], axis=2)
                prev_actions = np.stack(states[:, 4], axis=0)
                relevant_states = np.stack(states[:, 5], axis=0)

                n = states.shape[0]

                actions = np.array(list(mini_batch[1]))
                spatial_actions = np.stack(actions[:, 0], 0)
                first_spatials = spatial_actions[:, 0]
                second_spatials = spatial_actions[:, 1]
                nonspatial_acts = np.array(actions[:, 1]).astype(np.int64)

                rewards = np.array(list(mini_batch[2]))
                dones = mini_batch[3]
                v_returns = mini_batch[5].astype(np.float32)
                advantages = mini_batch[6].astype(np.float32)

                first_spatials = torch.from_numpy(first_spatials).to(device)
                second_spatials = torch.from_numpy(second_spatials).to(device)
                nonspatial_acts = torch.from_numpy(nonspatial_acts).to(device)
                nonspatial_acts = nonspatial_acts.unsqueeze(1)

                rewards = torch.from_numpy(rewards).to(device)
                dones = torch.from_numpy(np.uint8(dones)).to(device)
                v_returns = torch.from_numpy(v_returns).to(device)
                advantages = torch.from_numpy(advantages).to(device)

                advantages = (advantages - advantages.mean()) / (torch.clamp(
                    advantages.std(), self.eps_denom))

                spatial_probs, nonspatial_probs, values, _, _ = self.net(
                    G_states,
                    X_states,
                    avail_states,
                    hidden_states,
                    prev_actions,
                    relevant_frames=relevant_states)
                old_spatial_probs, old_nonspatial_probs, old_values, _, _ = self.target_net(
                    G_states,
                    X_states,
                    avail_states,
                    hidden_states,
                    prev_actions,
                    relevant_frames=relevant_states)

                #print(nonspatial_probs.shape, self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials).shape, (nonspatial_acts < 2).shape)
                #print(nonspatial_probs.shape, nonspatial_acts.shape)
                #print(nonspatial_probs[range(self.batch_size),nonspatial_acts].shape)

                gathered_nonspatials = nonspatial_probs.gather(
                    1, nonspatial_acts).squeeze(1)
                old_gathered_nonspatials = old_nonspatial_probs.gather(
                    1, nonspatial_acts).squeeze(1)
                first_spatial_mask = (nonspatial_acts < 3).to(
                    self.device).float().squeeze(1)
                second_spatial_mask = (nonspatial_acts == 0).to(
                    self.device).float().squeeze(1)

                numerator = torch.log(
                    gathered_nonspatials + self.eps_denom) + torch.log(
                        self.index_spatial_probs(spatial_probs[:, 0, :, :],
                                                 first_spatials) +
                        self.eps_denom) * first_spatial_mask + (torch.log(
                            self.index_spatial_probs(spatial_probs[:, 1, :, :],
                                                     second_spatials) +
                            self.eps_denom) * second_spatial_mask)
                denom = torch.log(
                    old_gathered_nonspatials + self.eps_denom) + torch.log(
                        self.index_spatial_probs(old_spatial_probs[:, 0, :, :],
                                                 first_spatials) +
                        self.eps_denom) * first_spatial_mask + (torch.log(
                            self.index_spatial_probs(
                                old_spatial_probs[:, 1, :, :], second_spatials)
                            + self.eps_denom) * second_spatial_mask)
                """
                denom = old_gathered_nonspatials
                print(nonspatial_probs.shape)
                print(denom.shape)
                print((nonspatial_acts < 3).shape)
                print(((self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials)) * (nonspatial_acts < 3).to(self.device).float()).shape)
                denom[nonspatial_acts < 3] = denom[nonspatial_acts < 3] * self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials)
                denom[nonspatial_acts == 0] = denom[nonspatial_acts == 0] * self.index_spatial_probs(old_spatial_probs[:,1,:,:], second_spatials)
                
                denom = torch.log( torch.clamp( denom, self.eps_denom ) )
                """

                ratio = torch.exp(numerator - denom)
                ratio_adv = ratio * advantages.detach()
                bounded_adv = torch.clamp(
                    ratio, 1 - clip_param,
                    1 + clip_param) * advantages.detach()
                """
                print("ratio: ", ratio, "\n\n")
                print("numerator: ", numerator, "\n\n")
                print("denominator: ", denom, "\n\n")
                """

                pol_avg = -((torch.min(ratio_adv, bounded_adv)).mean())

                value_loss = self.loss(values.squeeze(1), v_returns.detach())

                ent = self.entropy(spatial_probs, nonspatial_probs)

                total_loss = pol_avg + self.c1 * value_loss - self.c2 * ent
                self.optimizer.zero_grad()
                total_loss.backward()
                self.optimizer.step()

                pol_loss += pol_avg.detach().item()
                vf_loss += value_loss.detach().item()
                ent_total += ent.detach().item()

            pol_loss /= num_iters
            vf_loss /= num_iters
            ent_total /= num_iters
            print(
                "Iteration %d: Policy loss: %f. Value loss: %f. Entropy: %f" %
                (self.num_epochs_trained, pol_loss, vf_loss, ent_total))

        print("\n\n ------- Training sequence ended ------- \n\n")

    def index_spatial_probs(self, spatial_probs, indices):
        index_tuple = torch.meshgrid(
            [torch.arange(x) for x in spatial_probs.size()[:-2]]) + (
                indices[:, 0],
                indices[:, 1],
            )
        output = spatial_probs[index_tuple]
        return output

    def get_recent_hist(self, hist):
        length = min(len(hist), self.hist_size)
        if (length == 0):
            return []
        else:
            return hist[-length:]

    def entropy(self, spatial_probs, nonspatial_probs):
        ent = -self.c3 * (torch.mean(
            torch.sum(
                spatial_probs[:, 0, :, :] *
                torch.log(spatial_probs[:, 0, :, :] + self.eps_denom),
                dim=(1, 2))) + self.c4 * torch.mean(
                    torch.sum(nonspatial_probs *
                              torch.log(nonspatial_probs + self.eps_denom),
                              dim=1)))
        return ent

    def clip_gradients(self, clip):

        ### Clip the gradients of self.policy_net
        for param in self.net.parameters():
            if param.grad is None:
                continue
            #print(torch.max(param.grad.data), torch.min(param.grad.data))
            param.grad.data = param.grad.data.clamp(-clip, clip)

    def plot_results(self):
        plt.figure(1)
        plt.clf()
        plt.suptitle('Select-Move PPO')
        plt.title('Agent trained by Ray Sun, David Long, Michael McGuire',
                  fontsize=7)
        plt.xlabel('Training iteration - DefeatRoaches')
        plt.ylabel('Average score')
        plt.plot(self.averages)
        plt.pause(0.001)  # pause a bit so that plots are updated
Example #12
            action_index = 0  # so the data isn't relevant to learn
        observation, reward, done, info = env.step(actions[action_index])
        last_screen = current_screen
        on_grass, current_screen = transform_obs(observation)
        # Change of the reward to add penalty when the agent isn't on the road
        if (reward < 0):
            if (on_grass and t > 50):
                reward = float(-1)
            if (not on_grass and t > 50):
                reward = float(0.1)
        if (t <= 50):
            reward = float(0)
        reward = torch.tensor([reward], device=device)

        # Store the transition in memory
        memory.push(last_screen, action_index, current_screen, reward)

        # Move to the next state
        state = current_screen

        # Perform one step of the optimization (on the target network)
        optimize_model()
        tot_reward += reward
        if done:
            break

    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
    torch.save(policy_net.state_dict(), './models/model')  # Save the model
Example #13
class DQN_agent:
    def __init__(self,env,policy,target,n_action=18,capacity=100000,batch_size=32,lr=2.5e-4,gamma=0.99,burn_in=50000,C=1000,eps_decay=1000000):
        self.env=env
        self.n_action=n_action
        self.memory=ReplayMemory(capacity)
        self.device="cuda"
        self.policy=policy
        self.target=target
        self.batch_size=batch_size
        self.gamma=gamma
        self.lr=lr
        self.opt= optim.Adam(self.policy.parameters(), lr=self.lr)
        self.burn_in=burn_in
        self.C=C
        self.eps_decay=eps_decay
        self.loss=nn.MSELoss()
    def get_state(self,obs):
        state=torch.FloatTensor(np.array(obs).transpose(2,0,1)).unsqueeze(0)
        return(state)
    def get_action(self,state,eps):
        x=random.random()
        if x<eps:
            return(torch.tensor([[random.randrange(self.n_action)]], dtype=torch.long))
        else:
            with torch.no_grad():
                return(self.policy(state.to("cuda")).max(1)[1].view(1,1))
    def update_policy(self):
        state,action,reward,next_state,done=self.memory.sample(self.batch_size)
        state=state.to("cuda")
        action=action.to("cuda")
        next_state=next_state.to("cuda")
        reward=reward.to("cuda")
        done=done.to("cuda")
        q=self.policy(state).gather(1,action.unsqueeze(1)).squeeze(1)
        # bootstrap from the target network; detach so no gradients flow into it
        q_max=self.target(next_state).max(1)[0].detach()
        y=(reward+self.gamma*q_max)*(1-done)+reward*done
        loss=self.loss(q,y)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return
    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())
    def train(self,episodes):
        steps=0
        reward_list=[]
        for episode in range(episodes):
            obs=self.env.reset()
            state=self.get_state(obs)
            reward_episode=0
            done=False
            while not done:
                steps+=1
                # linear epsilon anneal from 1.0 to 0.1 over eps_decay steps, then held at 0.1
                test_eps=int(steps>self.eps_decay)
                eps=(1-steps*(1-0.1)/self.eps_decay)*(1-test_eps)+0.1*test_eps
                action=self.get_action(state,eps)
                obs,reward,done,info=self.env.step(action.item())
                reward_episode+=reward
                next_state=self.get_state(obs)
                reward = torch.tensor([reward], device="cpu", dtype=torch.float)
                action = torch.tensor([action.item()], device="cpu", dtype=torch.long)
                done_flag = torch.tensor([int(done)], device="cpu", dtype=torch.float)
                self.memory.push(state,action,reward,next_state,done_flag)
                if steps>self.burn_in:
                    self.update_policy()
                if steps>self.burn_in and steps%self.C==0:
                    self.update_target()
                state=next_state
            reward_list.append(reward_episode)
            if episode%100 == 0:
                print('Total steps: {} \t Episode: {}/{} \t Mean reward (last 100): {}'.format(steps, episode, episodes, np.mean(reward_list[-100:])))
            if episode%500==0:
                print(reward_list)
        self.env.close()
        print(reward_list)
        return(reward_list)
    def save_model(self,name):
        torch.save(self.policy,name)
        return
    def load_model(self,name):
        self.policy=torch.load(name)
    def test(self,n_episodes):
        test_reward=[]
        for episode in range(n_episodes):
            obs = self.env.reset()
            state = self.get_state(obs)
            reward_episode = 0.0
            done=False
            while not done:
                with torch.no_grad():
                    action=self.policy(state.to("cuda")).max(1)[1].view(1,1)
                obs,reward,done,info=self.env.step(action.item())
                reward_episode+=reward
                state=self.get_state(obs)
                if done:
                    print("Finished Episode {} with reward {}".format(episode, reward_episode))
            test_reward.append(reward_episode)
        # close the environment once all test episodes are finished
        self.env.close()
        return(test_reward)
Example #14
0
class Agent:
    def __init__(self,
                 env,
                 exploration_rate=1,
                 exploration_decay=0.9999,
                 explore=True):
        self.action_space = env.action_space.n
        self.memory = ReplayMemory(MEMORY_SIZE)
        self.memory.fill_memory(env)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)
        self.dqn = DQN(4, self.action_space).float().to(self.device)
        self.env = env
        self.episode_rewards = []
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.explore = explore
        self.model_optim = optim.Adam(self.dqn.parameters(), lr=1e-4)
        self.episodes = 0

    def get_action(self, obs):
        if self.exploration_rate > random.random() and self.explore:
            action = random.randint(0, self.action_space - 1)
        else:
            obs = torch.tensor(obs, device=self.device).reshape(1, 4, 80,
                                                                80).float()
            action = self.dqn(obs).argmax().tolist()
        return action

    def train(self, num_episodes):
        num_steps = 0
        running_loss = 0
        loss = nn.MSELoss()

        episode_rewards = []
        for episode in tqdm(range(num_episodes)):
            obs = rgb2gray(self.env.reset()).reshape(1, 80, 80)
            for i in range(3):
                obs = np.append(obs, rgb2gray(self.env.step(0)[0]), 0)

            terminal = False
            episode_reward = 0
            while not terminal:
                action = self.get_action(obs)
                result = self.env.step(action)

                terminal = result[2]
                new_obs = np.append(obs[1:], rgb2gray(result[0]), 0)
                reward = result[1]
                if reward > 0:
                    print(episode, reward)
                episode_reward += reward

                self.memory.push(obs, action, new_obs, reward, terminal)
                batch = self.memory.sample(BATCH_SIZE)
                observations, y = self.process_batch(batch)
                num_steps += 1

                outputs = self.dqn(observations)
                episode_loss = loss(outputs, y)
                self.model_optim.zero_grad()
                episode_loss.backward()
                self.model_optim.step()
                running_loss += episode_loss.item()

                if num_steps % 1000 == 0:  # progress marker every 1000 steps
                    print(num_steps)

            episode_rewards.append(episode_reward)
            if self.exploration_rate > 0.1:
                self.exploration_rate *= self.exploration_decay
        self.episodes += num_episodes
        self.save(str(self.episodes) + '_model')
        self.episode_rewards += episode_rewards
        np.save(str(self.episodes) + '_rewards', self.episode_rewards)
        return episode_rewards

    def process_batch(self, batch):
        observations = [batch[i][0] for i in range(len(batch))]
        observations = torch.tensor(np.array(observations)).reshape(
            (BATCH_SIZE, 4, 80, 80)).float().to(self.device)

        next_observations = [batch[i][2] for i in range(len(batch))]
        next_observations = torch.tensor(np.array(next_observations)).reshape(
            (BATCH_SIZE, 4, 80, 80)).float().to(self.device)

        # bootstrap values: max_a Q(s', a); detach so the targets do not backpropagate
        maxs = self.dqn(next_observations)
        maxs = maxs.max(1).values.float().detach().to(self.device)

        rewards = [batch[i][3] for i in range(len(batch))]
        rewards = torch.tensor(rewards).float().to(self.device)

        # 1.0 for non-terminal transitions, 0.0 for terminal ones
        non_terminals = [not batch[i][4] for i in range(len(batch))]
        non_terminals = torch.tensor(non_terminals).float().to(self.device)

        # zero the bootstrap term for terminal states
        maxs = maxs * non_terminals

        # start from the current predictions and overwrite only the taken actions,
        # so the untaken actions contribute zero loss
        y = self.dqn(observations).detach()
        Qs = rewards + GAMMA * maxs

        for i in range(len(batch)):
            y[i, batch[i][1]] = Qs[i]

        return observations, y

    def load_dqn(self, path):
        self.dqn = torch.load(path)

    def save(self, path):
        torch.save(self.dqn, path)
Example #15
0
ep_durations = [0]  # used for plotting
returns = [0]
last_state_values = [0]
first_state_values = [0]

for i_episode in range(INIT_RM):
    if not TRAIN:
        break
    cur_state = env.reset()
    while True:
        # fill the replay memory with random experience before training starts
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        if done:
            reward = -1
            memory.push(FloatTensor([cur_state]), LongTensor([action]), None,
                        FloatTensor([reward]))
        else:
            # tensors of shape (1, state_dim) for states, (1,) for the action and reward
            memory.push(FloatTensor([cur_state]), LongTensor([action]),
                        FloatTensor([next_state]), FloatTensor([reward]))

        cur_state = next_state

        if done:
            break

start_time = time.time()
frames = 0
i_episode = 0

while frames < N_FRAMES:  #start of training
Example #16
0
class DQN(object):
    def __init__(self,
                 config,
                 env,
                 doubleDQN=False,
                 duelingDQN=False,
                 NoisyDQN=False,
                 N_stepDQN=False,
                 Prioritized=False):
        self.device = config.device

        self.doubleDQN = doubleDQN
        self.duelingDQN = duelingDQN
        self.NoisyDQN = NoisyDQN
        self.N_stepDQN = N_stepDQN
        self.Prioritized = Prioritized

        self.gamma = config.gamma  # discount factor
        self.learning_rate = config.learning_rate  # learning rate
        self.replace_target_iter = config.replace_target_iter  # target-network update frequency
        self.replay_size = config.replay_size  # replay buffer capacity
        self.batch_size = config.batch_size  # batch size
        self.priority_alpha = config.priority_alpha
        self.priority_beta_start = config.priority_beta_start
        self.priority_beta_frames = config.priority_beta_frames

        self.epsilon = config.epsilon  # initial epsilon (probability of acting randomly)
        self.epsilon_final = config.epsilon_final  # minimum value of epsilon
        self.epsilon_decay = config.epsilon_decay  # epsilon decay rate

        self.num_states = env.observation_space.shape[0]  # state-space dimension
        self.num_actions = env.action_space.n  # number of actions

        self.learn_start = self.batch_size * 3  # threshold that controls when learning starts

        self.learn_step_counter = 0  # total number of learning steps taken

        self.N_step = config.N_step  # number of steps for multi-step (N-step) learning

        self.N_step_buffer = []

        if self.Prioritized:
            self.memory = PrioritizedReplayMemory(
                self.replay_size, self.priority_alpha,
                self.priority_beta_start, self.priority_beta_frames)  # initialize replay buffer
        else:
            self.memory = ReplayMemory(self.replay_size)  # initialize replay buffer

        if self.duelingDQN:
            # initialize the evaluation (online) network
            self.eval_net = DuelingDQNNet(self.num_states,
                                          self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = DuelingDQNNet(self.num_states,
                                            self.num_actions).to(self.device)
        elif self.NoisyDQN:
            # initialize the evaluation (online) network
            self.eval_net = NoisyNet(self.num_states,
                                     self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = NoisyNet(self.num_states,
                                       self.num_actions).to(self.device)
        else:
            self.eval_net = DQNNet(self.num_states,
                                   self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = DQNNet(self.num_states,
                                     self.num_actions).to(self.device)

        # the target network starts with the same parameters as the evaluation network
        self.target_net.load_state_dict(self.eval_net.state_dict())

        # optimizer for training
        self.optimizer = optim.Adam(self.eval_net.parameters(),
                                    lr=self.learning_rate)

        # mean-squared-error loss
        self.loss_func = nn.MSELoss()

    # store a transition in memory
    def store_transition(self, state, action, reward, next_state, done):
        if self.N_stepDQN:
            # put the current transition into the N-step buffer
            self.N_step_buffer.append(
                (state, action, reward, next_state, done))

            # if the buffer does not yet hold N transitions, return
            if len(self.N_step_buffer) < self.N_step:
                return

            # compute the N-step return R = sum_i gamma^i * r_i
            R = sum([
                self.N_step_buffer[i][2] * (self.gamma**i)
                for i in range(self.N_step)
            ])
            state, action, _, _, _ = self.N_step_buffer.pop(0)

            # store (s_t, a_t, R, s_{t+N}, done)
            self.memory.push((state, action, R, next_state, done))
        else:
            self.memory.push((state, action, reward, next_state, done))

    # epsilon-greedy action selection
    def choose_action(self, s):
        with torch.no_grad():
            if np.random.random(
                    1) >= self.epsilon:  # if the sample is >= epsilon, take the greedy (max-Q) action
                X = torch.tensor([s], device=self.device, dtype=torch.float)
                a = self.eval_net(X).max(1)[1].view(1, 1)  # Q values from the eval network
                return a.item()
            else:  # otherwise act randomly
                return np.random.randint(0, self.num_actions)

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, indices, weights = self.memory.sample(
            self.batch_size)  # batch of transitions

        # unzip the batch of transitions,
        # e.g. zipped = [(1, 4), (2, 5), (3, 6)] unzips via zip(*zipped) to [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
            *transitions)

        # convert the samples to tensors
        batch_state = torch.tensor(batch_state,
                                   device=self.device,
                                   dtype=torch.float)
        batch_action = torch.tensor(batch_action,
                                    device=self.device,
                                    dtype=torch.long).squeeze().view(
                                        -1, 1)  # view as a column tensor
        batch_reward = torch.tensor(batch_reward,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state,
                                        device=self.device,
                                        dtype=torch.float)
        batch_done = torch.tensor(batch_done,
                                  device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("state:", batch_state.shape)  # 128, 4
        # print("action:", batch_action.shape)
        # print("reward:", batch_reward.shape)
        # print("done:", batch_done.shape)
        #
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights

    # one learning step
    def learn(self):
        # periodically update the target network
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        # sample a batch
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights = self.get_batch(
        )

        # print("state:", batch_state)
        # print("action:", batch_action)
        # print("done:", batch_done)

        # compute q(s,a;θ)
        if self.NoisyDQN:
            self.eval_net.sample_noise()
        q_s_a = self.eval_net(batch_state).gather(1, batch_action)
        # print("q_s_a:", q_s_a.shape)

        # compute the target yj = rj + (1 - done) * gamma * max(q(s',a;θ'))
        with torch.no_grad():
            if self.NoisyDQN:
                self.target_net.sample_noise()
            if self.doubleDQN:
                # Double DQN: the eval network selects a', the target network evaluates it
                next_max_action = self.eval_net(batch_next_state).max(
                    dim=1)[1].view(-1, 1)
                q_target = batch_reward + (
                    1. - batch_done) * self.gamma * self.target_net(
                        batch_next_state).gather(1, next_max_action)
                # print("q_target:", q_target)
                # print("q_target.shape:", q_target.shape)
            else:
                next_q = self.target_net(batch_next_state)
                # print("next_q:", next_q)
                max_next_q_a = next_q.max(1)[0].view(-1, 1)
                # print("max_next_q_a:", max_next_q_a)
                # print("max_next_q_a.shape:", max_next_q_a.shape)
                q_target = batch_reward + (
                    1. - batch_done) * self.gamma * max_next_q_a
                # print("q_target:", q_target)
                # print("q_target.shape:", q_target.shape)

        # update priorities (if prioritized replay) and the network parameters
        if self.Prioritized:
            diff = (q_target - q_s_a)
            self.memory.update_priorities(
                indices,
                diff.detach().squeeze().abs().cpu().numpy().tolist())
        loss = self.loss_func(q_target, q_s_a)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # increment the learning-step counter
        self.learn_step_counter += 1

    # save the model
    def save(self):
        if self.duelingDQN:
            torch.save(self.eval_net, 'duelingDQN.pkl')
        elif self.NoisyDQN:
            torch.save(self.eval_net, 'NoisyDQN.pkl')
        elif self.N_stepDQN:
            torch.save(self.eval_net, 'N_stepDQN.pkl')
        elif self.Prioritized:
            torch.save(self.eval_net, 'PriorityReplayDQN.pkl')
        else:
            torch.save(self.eval_net, 'DQN.pkl')

    # load the model
    def load(self):
        if self.duelingDQN:
            self.eval_net = torch.load('duelingDQN.pkl')
        elif self.NoisyDQN:
            self.eval_net = torch.load('NoisyDQN.pkl')
        elif self.N_stepDQN:
            self.eval_net = torch.load('N_stepDQN.pkl')
        elif self.Prioritized:
            self.eval_net = torch.load('PriorityReplayDQN.pkl')
        else:
            self.eval_net = torch.load('DQN.pkl')
Example #17
0
class MADDPG_Agent:
    def __init__(self, n_agents, dim_obs, dim_act, batch_size, 
                        capacity, eps_b_train):
        """ Initialize an Agent object.

        Params
        =======
            n_agents (int)   : number of agents
            dim_obs (int)    : dimension of each state
            dim_act (int)    : dimension of each action
            batch_size (int) : batch size
            capacity (int)   : replay buffer capacity
            eps_b_train (int): number of episodes before training starts
        """

        self.n_agents = n_agents
        self.dim_obs = dim_obs
        self.dim_act = dim_act
        self.batch_size = batch_size
        self.capacity = capacity
        self.eps_b_train = eps_b_train
        self.memory = ReplayMemory(capacity, RANDOM_SEED)
        self.cuda_on = th.cuda.is_available()
        self.var = [1.0 for i in range(n_agents)]
        self.checkpoint_dir = 'checkpoints/'
        self.seed = random.seed(RANDOM_SEED)

        # Actor Network with Target Network
        self.actors = [Actor(dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)]
        self.actors_target = [Actor(dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)] #deepcopy(self.actors)
        self.actor_optimizer = [Adam(x.parameters(), lr=LR_ACTOR) for x in self.actors]

        # Critic Network with Target Network
        self.critics = [Critic(n_agents,dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)]
        self.critics_target = [Critic(n_agents,dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)] #deepcopy(self.critics)
        self.critic_optimizer = [Adam(x.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) for x in self.critics]

        # Noise process
        self.noise = [OUNoise(dim_act, RANDOM_SEED) for i in range(n_agents)]

        # Enable the use of CUDA
        if self.cuda_on:
            for m in [self.actors, self.critics, self.actors_target, self.critics_target]:
                for x in m:
                    x.cuda()

        self.steps_done = 0
        self.eps_done = 0


    def step(self, states,actions, rewards, next_states, dones, add_noise=True):
        """Save experience in replay memory, and use random sample for buffer to learn."""
        self.memory.push(states, actions, next_states, rewards)
        #print("memory size = ",len(self.memory))

        # Learn, if enough samples are available in memory
        if self.eps_done % NUM_STEPS_TO_UPDATE == 0:
            for i in range(NUM_STEPS_TO_UPDATE):
                c_loss,a_loss = self.learn()

    def act2(self, state):
        actions = th.zeros(
            self.n_agents,
            self.dim_act)
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        for i in range(self.n_agents):
            sb = state[i, :].detach()
            self.actors[i].eval()
            with th.no_grad():
                act = self.actors[i](sb.unsqueeze(0)).squeeze()
            self.actors[i].train()
            act += th.from_numpy(self.noise[i].sample()).type(FloatTensor)

            act = th.clamp(act, -1, 1)

            actions[i, :] = act
        self.steps_done += 1

        return actions
       

    def act(self, state):
        actions = th.zeros(
            self.n_agents,
            self.dim_act)
        #FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        for i in range(self.n_agents):
            self.actors[i].eval()
            sb = state[i, :].detach()
            with th.no_grad():
                act = self.actors[i](sb.unsqueeze(0)).squeeze()
            self.actors[i].train()

            act = self.add_noise2(act, i)
            act = th.clamp(act, -1.0, 1.0)

            actions[i, :] = act
        self.steps_done += 1

        return actions

    def act3(self, state):
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        actions = th.zeros(
            self.n_agents,
            self.dim_act)
        for i in range(self.n_agents):
            self.actors[i].eval()
            with th.no_grad():
                sb = state[i, :].detach()
                act = self.actors[i](sb.unsqueeze(0)).squeeze()

                act += th.from_numpy(self.noise[i].sample()).type(FloatTensor)
            act = th.clamp(act, -1, 1)

            actions[i, :] = act
        self.steps_done += 1
        return actions




    def add_noise(self, action, i):
        epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
                                  np.exp(-1. * self.steps_done / EPSILON_DECAY)
        # add noise
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        noise = th.from_numpy(np.random.randn(self.dim_act) * epsilon).type(FloatTensor)
        action += noise
        return action

    def add_noise2(self, action, i):
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        action += th.from_numpy(
        np.random.randn(2) * self.var[i]).type(FloatTensor)

        if self.eps_done > self.eps_b_train and self.var[i] > 0.05:
            self.var[i] *= 0.999998
        #action = th.clamp(action, -1.0, 1.0)

        return action


    def reset(self):
        for i in range(self.n_agents):
            self.noise[i].reset()

    def learn(self):
        """ Update policy and value parameters using given batch of experience tuples"""
        if self.eps_done <= self.eps_b_train:
            return None, None

        if self.eps_done == (self.eps_b_train + 1):
            print("========== Training now =========")

        ByteTensor = th.cuda.ByteTensor if self.cuda_on else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor

        c_loss = []
        a_loss = []

        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))

            non_final_mask = ByteTensor(list(map(lambda s: s is not None,
                                                 batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = th.stack(batch.states).type(FloatTensor)
                  
            reward_batch = th.stack(batch.rewards).type(FloatTensor)
            action_batch = th.stack(batch.actions).type(FloatTensor)
            #pdb.set_trace()
            # : (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = th.stack(
                [s for s in batch.next_states
                 if s is not None]).type(FloatTensor)

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.critics[agent](whole_state, whole_action)

            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:,
                                                            i,
                                                            :]) for i in range(
                                                                self.n_agents)]
            non_final_next_actions = th.stack(non_final_next_actions)
            non_final_next_actions = (
                non_final_next_actions.transpose(0,
                                                 1).contiguous())

            target_Q = th.zeros(
                self.batch_size).type(FloatTensor)
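            # Centralized target: evaluate the target critic on all agents' next
            # states and the target actors' next actions; rows for terminal
            # transitions are left at zero.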

            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.dim_obs),
                non_final_next_actions.view(-1,
                                            self.n_agents * self.dim_act)
            ).squeeze()
            # scale_reward: to scale reward in Q functions

            target_Q = (target_Q.unsqueeze(1) * GAMMA) + (
                reward_batch[:, agent].unsqueeze(1) * SCALE_REWARD)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)


        #if self.steps_done % NUM_STEPS_TO_UPDATE == 0 and self.steps_done > 0:
            #for i in range(self.n_agents):
            soft_update(self.critics_target[agent], self.critics[agent], TAU)
            soft_update(self.actors_target[agent], self.actors[agent], TAU)

        return c_loss, a_loss


    def save_checkpoint(self, episode_num, reward, is_best=False):

        checkpointName = self.checkpoint_dir + 'ep{}.pth'.format(episode_num)
        checkpoint = {
            'episode': episode_num,
            'actor1': self.actors[0].state_dict(),
            'actor2': self.actors[1].state_dict(),
            'critic1': self.critics[0].state_dict(),
            'critic2': self.critics[1].state_dict(),
            'targetActor1': self.actors_target[0].state_dict(),
            'targetActor2': self.actors_target[1].state_dict(),
            'targetCritic1': self.critics_target[0].state_dict(),
            'targetCritic2': self.critics_target[1].state_dict(),
            'actorOpt1': self.actor_optimizer[0].state_dict(),
            'actorOpt2': self.actor_optimizer[1].state_dict(),
            'criticOpt1': self.critic_optimizer[0].state_dict(),
            'criticOpt2': self.critic_optimizer[1].state_dict(),
            'replayBuffer': self.memory,
            'reward': reward
            
        } 
        th.save(checkpoint, checkpointName)

    def printModelArch(self,model):
        print(model.state_dict())
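

# Note: learn() above relies on a soft_update(target, source, tau) helper that is
# not included in this snippet. A minimal sketch of the usual Polyak update,
# assuming that argument order, could look like this:
def soft_update(target, source, tau):
    """theta_target <- (1 - tau) * theta_target + tau * theta_source."""
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_((1.0 - tau) * target_param.data +
                                tau * source_param.data)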
Example #18
0
class DRRN_Agent:
    def __init__(self, args):
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.accummulate_step = args.accummulate_step

        self.network = DRRN().to(device)
        self.memory = ReplayMemory(args.memory_size)
        self.save_path = args.output_dir
        self.clip = args.clip
        self.optimizer = torch.optim.Adam(self.network.parameters(),
                                          lr=args.learning_rate)

        # self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=args.warmup_steps,
        #                                                  num_training_steps=args.max_steps)

    def observe(self, state, act, rew, next_state, next_acts, done, history):
        self.memory.push(state, act, rew, next_state, next_acts, done, history)

    def build_state(self, obs, infos):
        """ Returns a state representation built from various info sources. """
        # obs_ids = [self.network.str_to_token_ids(o, self.network.state_max_length) for o in obs]
        # look_ids = [self.network.str_to_token_ids(info['look'], self.network.look_max_length) for info in infos]
        # inv_ids = [self.network.str_to_token_ids(info['inv'], self.network.inv_max_length) for info in infos]
        # return [State(ob, lk, inv) for ob, lk, inv in zip(obs_ids, look_ids, inv_ids)]
        states = []
        for ob, info in zip(obs, infos):
            state = ob + info['look'] + info['inv']
            states.append(state)
        return states

    def encode(self, act_list):
        """ Encode a list of actions """
        # return [self.network.str_to_token_ids(o, self.network.act_max_length) for o in act_list]
        return act_list

    def act(self, states, poss_acts, history, sample=True, return_all=False):
        """ Returns a string action from poss_acts. """
        idxs, values = self.network.act(states, poss_acts, history, sample,
                                        return_all)

        if return_all:
            return None, idxs, values

        act_ids = [poss_acts[batch][idx] for batch, idx in enumerate(idxs)]
        return act_ids, idxs, values

    def update(self):
        if len(self.memory) < self.batch_size:
            return

        batch_loss = None
        num_per_step = int(self.batch_size / self.accummulate_step)
        for _ in range(self.accummulate_step):

            transitions = self.memory.sample(num_per_step)
            batch = Transition(*zip(*transitions))

            # Compute Q(s', a') for all a'
            # TODO: Use a target network???
            next_history = []
            for act, history in zip(batch.act, batch.history):
                next_history.append(history + [act])
            next_qvals = self.network(batch.next_state, batch.next_acts,
                                      next_history)
            # Take the max over next q-values
            next_qvals = torch.tensor([vals.max() for vals in next_qvals],
                                      device=device)
            # Zero all the next_qvals that are done
            next_qvals = next_qvals * (
                1 - torch.tensor(batch.done, dtype=torch.float, device=device))
            targets = torch.tensor(batch.reward,
                                   dtype=torch.float,
                                   device=device) + self.gamma * next_qvals

            # Next compute Q(s, a)
            # Nest each action in a list - so that it becomes the only admissible cmd
            nested_acts = tuple([[a] for a in batch.act])
            qvals = self.network(batch.state, nested_acts, batch.history)
            # Combine the qvals: Maybe just do a greedy max for generality
            qvals = torch.cat(qvals)

            loss = F.smooth_l1_loss(qvals, targets.detach())

            # Compute Huber loss
            if batch_loss is None:
                batch_loss = loss
            else:
                batch_loss += loss

        # each mini-batch loss is already a mean, so average over the accumulation steps
        batch_loss /= self.accummulate_step

        self.optimizer.zero_grad()
        batch_loss.backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
        self.optimizer.step()
        # self.scheduler.step()

        return batch_loss.item()

    def load(self):
        try:
            self.memory = pickle.load(
                open(pjoin(self.save_path, 'memory.pkl'), 'rb'))
            self.network = torch.load(pjoin(self.save_path, 'model.pt'))
        except Exception as e:
            print("Error saving model.")
            logging.error(traceback.format_exc())

    def save(self):
        try:
            pickle.dump(self.memory,
                        open(pjoin(self.save_path, 'memory.pkl'), 'wb'))
            torch.save(self.network, pjoin(self.save_path, 'model.pt'))
        except Exception as e:
            print("Error saving model.")
            logging.error(traceback.format_exc())
Example #19
0
    state = current_screen - last_screen
    #print state
    for t in count():
        action = select_action(state)
        _, reward, done, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)

        last_screen = current_screen
        current_screen = get_screen(env, device)

        if not done:
            next_state = current_screen - last_screen
        else:
            next_state = None

        memory.push(state, action, next_state, reward)

        state = next_state
        #if done:
        #    print "Episode Done"
        #else:
        #    print state.size()
        optimize_model(policy_net, optimizer)
        if done:
            episode_durations.append(t + 1)
            plot_durations(episode_durations, AVERAGE_SIZE)
            break

    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
Example #20
0
class DQN(HyperParam):
    def __init__(self, n_actions, device, batch_norm=False):
        self.device = device
        self.n_actions = n_actions
        self._memory_init()
        self._net_init(n_actions, batch_norm)
        self.epsilon = LinearAnneal(self.EPS_INIT, self.EPS_END,
                                    self.EXPLORE_STEP)
        self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                       lr=self.LR)

    def _memory_init(self):
        self.memory = ReplayMemory(self.MEMORY_SIZE)

    def _net_init(self, n_actions, batch_norm):
        """
        Initialization of two neural network

        policy net - a function return the all q values corresponding to each action
                     given the input state. This network is used to compute expected
                     q vlue and will be optimized during each iteration
        target net - a function which will be updated from policy net after N optimization
                     step (N is a hyperparameter). This network is used to compute
                     expected q value based on next state
        """
        self.policy_net = Net(n_actions, batch_norm).to(self.device)
        self.target_net = Net(n_actions, batch_norm).to(self.device)
        self._update_target()
        self.target_net.eval()

    def _choose_action(self, state):
        """
        epsilon - greedy policy to decide next action

        the value of epsilon will anneal linearly
        """
        sample = random.random()
        if sample > self.epsilon.anneal():
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            action = random.randrange(self.n_actions)
            return torch.tensor([[action]],
                                device=self.device,
                                dtype=torch.long)

    def _q(self, states, actions):
        return self.policy_net(states).gather(1, actions)

    def _expected_q(self, next_states, rewards):
        """
        Calculation of expected q value

        based on bellman equation: q = r + gamma * q_next
        """
        # only bootstrap from next states that are not the end of the game
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, next_states)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in next_states if s is not None])

        # evaluate the target network on the non-final next states and keep the max Q value per state
        q_next = torch.zeros(self.BATCH_SIZE, device=self.device)
        q_next[non_final_mask] = self.target_net(non_final_next_states).max(
            1)[0].detach()
        expected_q = rewards + self.GAMMA * q_next

        return expected_q.unsqueeze(1)

    def _optimize(self):
        if len(self.memory) < self.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)

        # calculate q value and expected q value
        q = self._q(states, actions)
        expected_q = self._expected_q(batch.next_state, rewards)
        loss = F.smooth_l1_loss(q, expected_q)

        # optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def _update_target(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save(self, file_name):
        torch.save(self.policy_net.state_dict(), file_name)

    def load(self, model):
        self.policy_net.load_state_dict(torch.load(model))
        self.policy_net.eval()

    def train(self, env, logger):
        """Main part for training the agent"""
        processor = StateProcessor()
        optim_cnt = 0
        for i_episode in range(self.N_EPISODE):
            total_reward = 0
            state = processor.to_tensor(env.reset()).to(self.device)
            for t in itertools.count():
                # Select and perform an action
                action = self._choose_action(state)
                next_state, reward, done, _ = env.step(action.item())
                # Sum up total reward for one episode, convert reward to tensor
                total_reward += reward
                reward = torch.tensor([reward],
                                      dtype=torch.float32,
                                      device=self.device)

                if done:
                    self.memory.push(state, action, None, reward)
                    self._optimize()
                    break
                else:
                    next_state = processor.to_tensor(next_state).to(
                        self.device)
                    self.memory.push(state, action, next_state, reward)
                    self._optimize()

                state = next_state
            optim_cnt += t
            score = env.unwrapped.game.get_score()
            logger.info(
                f"{i_episode},{optim_cnt},{total_reward:.1f},{score},{self.epsilon.p:.6f}"
            )

            if i_episode % self.TARGET_UPDATE == 0:
                self._update_target()
                self.save(f"model_{i_episode}.pkl")

    def test(self, env):
        while True:
            processor = StateProcessor()
            state = processor.to_tensor(env.reset()).to(self.device)
            while True:
                with torch.no_grad():
                    action = self.policy_net(state).max(1)[1].view(1, 1)
                next_state, _, done, _ = env.step(action.item())

                if done:
                    break
                next_state = processor.to_tensor(next_state).to(self.device)
                state = next_state
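

# Note: the LinearAnneal helper used above is not shown in this snippet. A minimal
# sketch that is consistent with how it is used here (.anneal() returns the current
# epsilon and advances the schedule, .p holds the current value) might look like:
class LinearAnneal:
    def __init__(self, start, end, steps):
        self.p = start
        self.end = end
        self.step_size = (start - end) / steps

    def anneal(self):
        # return the current epsilon, then decay it linearly toward the floor
        current = self.p
        if self.p > self.end:
            self.p = max(self.end, self.p - self.step_size)
        return current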
Example #21
0
def train_dqn(settings):
    required_settings = [
        "batch_size",
        "checkpoint_frequency",
        "device",
        "eps_start",
        "eps_end",
        "eps_cliff",
        "eps_decay",
        "gamma",
        "log_freq",
        "logs_dir",
        "lr",
        "max_steps",
        "memory_size",
        "model_name",
        "num_episodes",
        "out_dir",
        "target_net_update_freq",
    ]
    if not settings_is_valid(settings, required_settings):
        raise Exception(
            f"Settings object {settings} missing some required settings.")

    batch_size = settings["batch_size"]
    checkpoint_frequency = settings["checkpoint_frequency"]
    device = settings["device"]
    eps_start = settings["eps_start"]
    eps_end = settings["eps_end"]
    eps_cliff = settings["eps_cliff"]
    # eps_decay = settings["eps_decay"]
    gamma = settings["gamma"]
    logs_dir = settings["logs_dir"]
    log_freq = settings["log_freq"]
    lr = settings["lr"]
    max_steps = settings["max_steps"]
    memory_size = settings["memory_size"]
    model_name = settings["model_name"]
    num_episodes = settings["num_episodes"]
    out_dir = settings["out_dir"]
    target_net_update_freq = settings["target_net_update_freq"]

    # Initialize environment
    env = gym.make("StarGunner-v0")

    # Initialize model
    num_actions = env.action_space.n
    settings["num_actions"] = num_actions
    policy_net = DQN(settings).to(device)
    target_net = DQN(settings).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Initialize memory
    logging.info("Initializing memory.")
    memory = ReplayMemory(memory_size)
    memory.init_with_random((1, 3, 84, 84), num_actions)
    logging.info("Finished initializing memory.")

    # Initialize other model ingredients
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # Initialize tensorboard
    writer = SummaryWriter(logs_dir)

    # Loop over episodes
    policy_net.train()
    steps_done = 0
    log_reward_acc = 0.0
    log_steps_acc = 0
    for episode in tqdm(range(num_episodes)):
        state = process_state(env.reset()).to(device)
        reward_acc = 0.0
        loss_acc = 0.0

        # Loop over steps in episode
        for t in range(max_steps):
            with torch.no_grad():
                Q = policy_net(state.type(torch.float))

            # Get best predicted action and perform it
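            # Linear epsilon anneal from eps_start down to eps_end over the first
            # eps_cliff steps, then held constant at eps_end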
            if steps_done < eps_cliff:
                epsilon = -(eps_start -
                            eps_end) / eps_cliff * steps_done + eps_start
            else:
                epsilon = eps_end

            if random.random() < epsilon:
                predicted_action = torch.tensor([env.action_space.sample()
                                                 ]).to(device)
            else:
                predicted_action = torch.argmax(Q, dim=1)
            next_state, raw_reward, done, info = env.step(
                predicted_action.item())
            # Note that next state could also be a difference
            next_state = process_state(next_state)
            reward = torch.tensor([clamp_reward(raw_reward)])

            # Save to memory
            memory.push(state.to("cpu"), predicted_action.to("cpu"),
                        next_state, reward)

            # Move to next state
            state = next_state.to(device)

            # Sample from memory
            batch = Transition(*zip(*memory.sample(batch_size)))

            # Mask non-final states (adapted from https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)
            non_final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_state)),
                device=device,
                dtype=torch.bool,
            )
            # print("NON_FINAL_MASK", non_final_mask.shape)
            state_batch = torch.cat(batch.state).type(torch.float).to(device)
            next_state_batch = torch.cat(batch.next_state).type(
                torch.float).to(device)
            action_batch = torch.cat(batch.action).to(device)
            reward_batch = torch.cat(batch.reward).to(device)

            # print("STATE_BATCH SHAPE", state_batch.shape)
            # print("STATE_BATCH", state_batch[4, :, 100])
            # print("ACTION_BATCH SHAPE", action_batch.shape)
            # print("ACTION_BATCH", action_batch)
            # print("REWARD_BATCH SHAPE", reward_batch.shape)

            # Compute Q
            # Q_next = torch.zeros((batch_size, num_actions))
            # print("MODEL STATE BATCH SHAPE", model(state_batch).shape)
            Q_actual = policy_net(state_batch).gather(
                1, action_batch.view(action_batch.shape[0], 1))
            Q_next_pred = target_net(next_state_batch)
            Q_max = torch.max(Q_next_pred, dim=1)[0].detach()
            # print("Q_MAX shape", Q_max.shape)
            target = reward_batch + gamma * Q_max * non_final_mask.to(Q_max.dtype)
            # print("TARGET SIZE", target.shape)

            # Calculate loss
            loss = F.smooth_l1_loss(Q_actual, target.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()

            # Clamp gradient to avoid gradient explosion
            for param in policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            optimizer.step()

            # Store stats
            loss_acc += loss.item()
            reward_acc += raw_reward
            steps_done += 1

            if steps_done % target_net_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())

            # Exit if in terminal state
            if done:
                logging.debug(
                    f"Episode {episode} finished after {t} timesteps with reward {reward_acc}."
                )
                break

        logging.debug(f"Loss: {loss_acc / t}")

        # Save model checkpoint
        if (episode != 0) and (episode % checkpoint_frequency == 0):
            save_model_checkpoint(
                policy_net,
                optimizer,
                episode,
                loss,
                f"{out_dir}/checkpoints/{model_name}_{episode}",
            )

        # Log to tensorboard
        log_reward_acc += reward_acc
        log_steps_acc += t
        writer.add_scalar("Loss / Timestep", loss_acc / t, episode)
        if episode % log_freq == 0:
            writer.add_scalar("Reward", log_reward_acc / log_freq, episode)
            writer.add_scalar("Reward / Timestep",
                              log_reward_acc / log_steps_acc, episode)
            writer.add_scalar("Duration", log_steps_acc / log_freq, episode)
            writer.add_scalar("Steps", log_reward_acc / log_steps_acc,
                              steps_done)
            log_reward_acc = 0.0
            log_steps_acc = 0

    # Save model
    save_model(policy_net, f"{out_dir}/{model_name}.model")

    # Report final stats
    logging.info(f"Steps Done: {steps_done}")

    env.close()
    return policy_net
Example #22
0
n_agents = 4
length_lstm = 10
pkl_file = open('data_saq.pkl', 'rb')

# TODO: unify which pkl file is used when running on the server
memory = ReplayMemory(n_episode * n_agents * max_steps + 100)

use_cuda = pt.cuda.is_available()

for i in range(n_episode):
    data1 = pickle.load(pkl_file)
    data2 = pickle.load(pkl_file)
    data3 = pickle.load(pkl_file)
    print('episode is %d' % (i))
    for j in range(max_steps):
        memory.push(data1[j], data2[j], '', '', '')

loss_func = pt.nn.MSELoss().cuda()


class meta_actor(pt.nn.Module):
    def __init__(self, dim_observation, dim_action):
        # print('model.dim_action',dim_action)
        super(meta_actor, self).__init__()
        self.FC1 = pt.nn.Linear(dim_observation, 500)
        self.FC2 = pt.nn.Linear(500, 128)
        self.FC3 = pt.nn.Linear(128, dim_action)

    def forward(self, obs):
        result = F.relu(self.FC1(obs))
        result = F.relu(self.FC2(result))
        result = self.FC3(result)  # output layer (any output activation is not shown in the original snippet)
        return result
Example #23
0
class DRRN_Agent:
    def __init__(self, args):
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(args.spm_path)
        self.network = DRRN(len(self.sp), args.embedding_dim,
                            args.hidden_dim).to(device)
        self.memory = ReplayMemory(args.memory_size)
        self.save_path = args.output_dir
        self.clip = args.clip
        self.optimizer = torch.optim.Adam(self.network.parameters(),
                                          lr=args.learning_rate)

    def observe(self, state, act, rew, next_state, next_acts, done):
        self.memory.push(state, act, rew, next_state, next_acts, done)

    def build_state(self, obs, infos):
        """ Returns a state representation built from various info sources. """
        obs_ids = [self.sp.EncodeAsIds(o) for o in obs]
        look_ids = [self.sp.EncodeAsIds(info['look']) for info in infos]
        inv_ids = [self.sp.EncodeAsIds(info['inv']) for info in infos]
        return [
            State(ob, lk, inv)
            for ob, lk, inv in zip(obs_ids, look_ids, inv_ids)
        ]

    def encode(self, obs_list):
        """ Encode a list of observations """
        return [self.sp.EncodeAsIds(o) for o in obs_list]

    def act(self, states, poss_acts, sample=True):
        """ Returns a string action from poss_acts. """
        idxs, values = self.network.act(states, poss_acts, sample)
        act_ids = [poss_acts[batch][idx] for batch, idx in enumerate(idxs)]
        return act_ids, idxs, values

    def update(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Compute Q(s', a') for all a'
        # TODO: Use a target network???
        next_qvals = self.network(batch.next_state, batch.next_acts)
        # Take the max over next q-values
        next_qvals = torch.tensor([vals.max() for vals in next_qvals],
                                  device=device)
        # Zero all the next_qvals that are done
        next_qvals = next_qvals * (
            1 - torch.tensor(batch.done, dtype=torch.float, device=device))
        targets = torch.tensor(batch.reward, dtype=torch.float,
                               device=device) + self.gamma * next_qvals

        # Next compute Q(s, a)
        # Nest each action in a list - so that it becomes the only admissible cmd
        nested_acts = tuple([[a] for a in batch.act])
        qvals = self.network(batch.state, nested_acts)
        # Combine the qvals: Maybe just do a greedy max for generality
        qvals = torch.cat(qvals)

        # Compute Huber loss
        loss = F.smooth_l1_loss(qvals, targets.detach())
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
        self.optimizer.step()
        return loss.item()

    def load(self):
        try:
            self.memory = pickle.load(
                open(pjoin(self.save_path, 'memory.pkl'), 'rb'))
            self.network = torch.load(pjoin(self.save_path, 'model.pt'))
        except Exception as e:
            print("Error saving model.")
            logging.error(traceback.format_exc())

    def save(self):
        try:
            pickle.dump(self.memory,
                        open(pjoin(self.save_path, 'memory.pkl'), 'wb'))
            torch.save(self.network, pjoin(self.save_path, 'model.pt'))
        except Exception as e:
            print("Error saving model.")
            logging.error(traceback.format_exc())
Example #24
0
class SAC(object):
    def __init__(self, config, env):
        self.device = config.device

        self.gamma = config.gamma  # discount factor

        self.tau = config.tau

        # learning rates
        self.value_lr = config.value_lr
        self.soft_q_lr = config.soft_q_lr
        self.policy_lr = config.policy_lr

        self.replace_target_iter = config.replace_target_iter  # target-network update frequency
        self.replay_size = config.replay_size  # replay buffer capacity
        self.batch_size = config.batch_size  # batch size

        self.num_states = env.observation_space.shape[0]  # state-space dimension
        self.num_actions = env.action_space.shape[0]  # action-space dimension

        self.learn_start = self.batch_size * 3  # threshold that controls when learning starts

        self.learn_step_counter = 0  # total number of learning steps taken

        self.memory = ReplayMemory(self.replay_size)  # initialize replay buffer

        # initialize the V network
        self.value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # initialize the target V network
        self.target_value_net = ValueNetwork(self.num_states,
                                             256).to(self.device)

        # the target V network starts with the same parameters as the V network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        # initialize the soft Q network
        self.soft_q_net = SoftQNetwork(self.num_states, self.num_actions,
                                       256).to(self.device)

        # initialize the policy network
        self.policy_net = PolicyNetwork(self.num_states, self.num_actions,
                                        256).to(self.device)

        # optimizers for training
        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=self.value_lr)
        self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(),
                                           lr=self.soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=self.policy_lr)

        # mean-squared-error loss functions
        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion = nn.MSELoss()

    # store a transition in memory
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.push((state, action, reward, next_state, done))

    # select an action
    def choose_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        mean, log_std = self.policy_net(s)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        action = action.detach().cpu().numpy()
        return action[0]

    # sample an action and its log-probability
    def get_action_log_prob(self, s, epsilon=1e-6):
        mean, log_std = self.policy_net(s)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
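        # tanh-squashing correction: log pi(a|s) = log N(z; mean, std) - sum log(1 - tanh(z)^2 + eps)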

        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)
        # log_prob = Normal(mean, std).log_prob(mean + std * z.to(self.device)) - torch.log(1 - action.pow(2) + epsilon)  # reparameterization

        return action, log_prob, z, mean, log_std

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, _, _ = self.memory.sample(self.batch_size)  # batch of transitions

        # unzip the batch of transitions,
        # e.g. zipped = [(1, 4), (2, 5), (3, 6)] unzips via zip(*zipped) to [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
            *transitions)

        # convert the samples to tensors
        batch_state = torch.tensor(batch_state,
                                   device=self.device,
                                   dtype=torch.float)
        batch_action = torch.tensor(batch_action,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(
                                        -1, 1)  # view as a column tensor
        batch_reward = torch.tensor(batch_reward,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state,
                                        device=self.device,
                                        dtype=torch.float)
        batch_done = torch.tensor(batch_done,
                                  device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("state:", batch_state.shape)  # 128, 4
        # print("action:", batch_action.shape)
        # print("reward:", batch_reward.shape)
        # print("done:", batch_done.shape)
        #
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _

    # one learning step
    def learn(self):
        # sample a batch
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _ = self.get_batch(
        )

        # print("state:", batch_state)
        # print("action:", batch_action)
        # print("done:", batch_done)

        expected_q_value = self.soft_q_net(batch_state, batch_action)  # q(s,a)
        expected_value = self.value_net(batch_state)  # v(s)
        new_action, log_prob, z, mean, log_std = self.get_action_log_prob(
            batch_state)  # a~, log pi(a~|s), z, mean, std

        target_value = self.target_value_net(batch_next_state)  # vtar(s')
        next_q_value = batch_reward + (
            1 -
            batch_done) * self.gamma * target_value  # r + gamma*(1-d)*vtar(s')
        q_value_loss = self.soft_q_criterion(expected_q_value,
                                             next_q_value.detach()).mean()

        expected_new_q_value = self.soft_q_net(batch_state,
                                               new_action)  # q(s,a~)
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value,
                                          next_value.detach()).mean()

        log_prob_target = expected_new_q_value - expected_value  # q(s,a) - v(s)
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        self.soft_q_optimizer.zero_grad()
        q_value_loss.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

        # Increment the learning step counter
        self.learn_step_counter += 1

    # Save the models
    def save(self):
        torch.save(self.soft_q_net, 'sac1_q.pkl')
        torch.save(self.value_net, 'sac1_v.pkl')
        torch.save(self.policy_net, 'sac1_policy.pkl')

    # Load the models
    def load(self):
        self.soft_q_net = torch.load('sac1_q.pkl')
        self.value_net = torch.load('sac1_v.pkl')
        self.policy_net = torch.load('sac1_policy.pkl')
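
The SAC agent above pushes a single transition tuple and unpacks three values from memory.sample(); the ReplayMemory itself is not shown. A minimal uniform-sampling buffer compatible with that interface might look like the sketch below; the class and method names simply mirror the calls above, and the real buffer (e.g. a prioritized one) may differ. The extra indices/weights return values are placeholders, since get_batch() discards them.

import random


class ReplayMemory:
    """Minimal ring buffer matching the interface used above:
    push(transition_tuple), sample(batch_size) -> (transitions, indices, weights)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, transition):
        # Overwrite the oldest entry once the buffer is full.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        indices = random.sample(range(len(self.buffer)), batch_size)
        transitions = [self.buffer[i] for i in indices]
        # Uniform sampling: weights are all ones; both extras are ignored by get_batch().
        weights = [1.0] * batch_size
        return transitions, indices, weights

    def __len__(self):
        return len(self.buffer)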
Example #25
0
class DDQN(object):
    def __init__(self, n_states, n_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.n_states = n_states
        self.n_actions = n_actions

        # create agent network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.agent = Learner(self.n_states, self.n_actions, **net_cfg)
        self.target = Learner(self.n_states, self.n_actions, **net_cfg)
        self.agent_optim = Adam(self.agent.parameters(), lr=args.lr)

        self.update_target_steps = args.update_target_timing

        hard_update(self.target, self.agent)

        # create replay memory
        self.memory = ReplayMemory(capacity=args.rmsize)

        # hyper parameters
        self.batch_size = args.bsize
        self.discount_rate = args.discount_rate
        self.decay_epsilon = 1 / args.decay_epsilon
        self.min_epsilon = args.min_epsilon
        
        self.epsilon = 1.0
        
        if USE_CUDA: self.cuda()

    def update(self, step):
        state_batch, action_batch, next_state_batch, reward_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)
        q_predict = self.agent(to_tensor(state_batch))
        n_q_predict = self.agent(to_tensor(next_state_batch))
        q_batch = torch.zeros(self.batch_size, 1)
        next_q_value = torch.zeros(self.batch_size, 1)

        for n in range(self.batch_size):
            # Q(s, a) of the action actually taken, under the online network
            q_batch[n] = q_predict[n][action_batch[n]]
            # Double DQN: the online network picks the next action ...
            n_act = torch.argmax(n_q_predict[n])
            # ... and the target network evaluates it
            next_q_value[n] = self.target(to_tensor(next_state_batch[n]))[n_act]

        # Vanilla-DQN alternative:
        # next_q_value = torch.max(self.target(to_tensor(next_state_batch)), 1)[0].reshape(self.batch_size, 1)

        # y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a))
        target_q_batch = to_tensor(reward_batch).reshape(self.batch_size, 1) + self.discount_rate * next_q_value * to_tensor(1 - terminal_batch.astype(np.float32).reshape(self.batch_size, 1))

        value_loss = criterion(q_batch, target_q_batch)
        self.agent.zero_grad()
        value_loss.backward()
        self.agent_optim.step()

        if step % self.update_target_steps == 0:
            # print("update target")
            self.update_target()

    def update_target(self):
        hard_update(self.target, self.agent)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.n_actions)
        # self.a_t = action
        
        action = np.argmax(action)

        # idx = np.where(action == max(action))

        # action = np.random.choice(idx[0])
        # print(action)
        return action

    def select_action(self, s_t, decay_epsilon=True):
        if np.random.random() < self.epsilon:
            action = self.random_action()
        else:
            action = to_numpy(
                self.agent(to_tensor(np.array([s_t])))
            ).squeeze(0)
            # print("action:{}".format(action))
            action = np.argmax(action)
            # idx = np.where(action == max(action))
            # action = np.random.choice(idx[0])
            
            # print("action:{}" .format(action))
            # action = np.clip(action, -1, 1)

        if self.epsilon > self.min_epsilon and decay_epsilon:
            self.epsilon = max(self.min_epsilon, self.epsilon - self.decay_epsilon)    

        return action

    def observe(self, obs, act, new_obs, rew, done):
        # dtype=object keeps heterogeneous entries (arrays and scalars) in a single row
        items = np.asarray([obs, act, new_obs, rew, done], dtype=object)
        self.memory.push(items)

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
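
Example #25 relies on several helpers that are defined elsewhere (hard_update, to_tensor, to_numpy and a module-level criterion). The sketch below shows plausible minimal implementations consistent with how they are called above; the exact originals may differ.

import numpy as np
import torch
import torch.nn as nn

USE_CUDA = torch.cuda.is_available()
# update() references a module-level `criterion`; MSE is a common choice here.
criterion = nn.MSELoss()


def to_tensor(ndarray, dtype=torch.float32):
    # Convert a numpy array (or nested list) to a float tensor, moving it to GPU if available.
    t = torch.as_tensor(np.asarray(ndarray, dtype=np.float32), dtype=dtype)
    return t.cuda() if USE_CUDA else t


def to_numpy(tensor):
    # Detach and move to CPU before converting back to numpy.
    return tensor.detach().cpu().numpy()


def hard_update(target, source):
    # Copy the source network's parameters into the target network verbatim.
    target.load_state_dict(source.state_dict())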
Example #26
0
class AbstractDQNAgent(AbstractStochasticAgent, ABC):
    def __init__(self, env, config=None):
        super(AbstractDQNAgent, self).__init__(config)
        self.env = env
        assert isinstance(
            env.action_space,
            spaces.Discrete), "Only compatible with Discrete action spaces."
        self.memory = ReplayMemory(self.config)
        self.exploration_policy = exploration_factory(
            self.config["exploration"], self.env.action_space)
        self.training = True
        self.previous_state = None
        self.previous_past_pose = None

        self.step = 0

    @classmethod
    def default_config(cls):
        return dict(model=dict(
            encoder=dict(in_channels=5, in_height=112, in_width=112)),
                    optimizer=dict(type="ADAM", lr=5e-4, weight_decay=0, k=5),
                    rl_lossfunction="l2",
                    predict_lossfunction='l2',
                    memory_capacity=15000,
                    batch_size=32,
                    gamma=0.80,
                    device="cuda:0",
                    exploration=dict(method="EpsilonGreedy"),
                    target_update=50,
                    double=True)

    def record(self, current_state, current_future_pos, current_past_pos,
               action, reward, next_state, next_future_pos, next_past_pos,
               done, info):
        """
            Record a transition by performing a Deep Q-Network iteration

            - push the transition into memory
            - sample a minibatch
            - compute the bellman residual loss over the minibatch
            - perform one gradient descent step
            - slowly track the policy network with the target network
        :param state: a state
        :param action: an action
        :param reward: a reward
        :param next_state: a next state
        :param done: whether state is terminal
        """
        if not self.training:
            return

        self.memory.push(current_state, current_future_pos, current_past_pos,
                         action, reward, next_state, next_future_pos,
                         next_past_pos, done, info)

        batch = self.sample_minibatch()
        if batch:
            loss, _, _ = self.compute_bellman_residual(batch)
            self.step_optimizer(loss)
            self.update_target_network()

            self.step += 1

    def act(self, current_state, current_past_pos):
        """
            Act according to the state-action value model and an exploration policy
        :param state: current state
        :return: an action
        """
        self.previous_state = current_state
        self.previous_past_pose = current_past_pos
        values = self.get_state_action_values(current_state, current_past_pos)
        self.exploration_policy.update(values, step_time=True)
        return self.exploration_policy.sample()

    def sample_minibatch(self):
        if len(self.memory) < self.config["batch_size"]:
            return None
        transitions = self.memory.sample(self.config["batch_size"])
        return Transition(*zip(*transitions))

    def update_target_network(self):
        # record() advances self.step; refresh the target net every `target_update` steps
        if self.step % self.config["target_update"] == 0:
            self.target_net.load_state_dict(self.value_net.state_dict())

    @abstractmethod
    def compute_bellman_residual(self, batch, target_state_action_value=None):
        """
            Compute the Bellman Residual Loss over a batch
        :param batch: batch of transitions
        :param target_state_action_value: if provided, acts as a target (s,a)-value
                                          if not, it will be computed from batch and model (Double DQN target)
        :return: the loss over the batch, and the computed target
        """
        raise NotImplementedError

    @abstractmethod
    def get_batch_state_values(self, states):
        """
        Get the state values of several states
        :param states: [s1; ...; sN] an array of states
        :return: values, actions:
                 - [V1; ...; VN] the array of the state values for each state
                 - [a1*; ...; aN*] the array of corresponding optimal action indexes for each state
        """
        raise NotImplementedError

    @abstractmethod
    def get_batch_state_action_values(self, current_state, current_past_pos):
        """
        Get the state-action values of several states
        :param states: [s1; ...; sN] an array of states
        :return: values:[[Q11, ..., Q1n]; ...] the array of all action values for each state
        """
        raise NotImplementedError

    def get_state_value(self, state):
        """
        :param state: s, an environment state
        :return: V, its state-value
        """
        values, actions = self.get_batch_state_values([state])
        return values[0], actions[0]

    def get_state_action_values(self, current_state, current_past_pos):
        """
        :param state: s, an environment state
        :return: [Q(a1,s), ..., Q(an,s)] the array of its action-values for each actions
        """
        return self.get_batch_state_action_values([current_state],
                                                  [current_past_pos])[0]

    def step_optimizer(self, loss):
        raise NotImplementedError

    def seed(self, seed=None):
        return self.exploration_policy.seed(seed)

    def reset(self):
        pass

    def set_writer(self, writer):
        super().set_writer(writer)
        try:
            self.exploration_policy.set_writer(writer)
        except AttributeError:
            pass

    def action_distribution(self, state):
        self.previous_state = state
        # get_state_action_values also needs the past pose; reuse the last one recorded in act()
        values = self.get_state_action_values(state, self.previous_past_pose)
        self.exploration_policy.update(values, step_time=False)
        return self.exploration_policy.get_distribution()

    def set_time(self, time):
        self.exploration_policy.set_time(time)

    def eval(self):
        self.training = False
        self.config['exploration']['method'] = "Greedy"
        self.exploration_policy = exploration_factory(
            self.config["exploration"], self.env.action_space)
Example #27
0
class ModelServer(SocketServer):
    HOST = 'localhost'
    PORT = 5600

    def __init__(self, *args, **kwargs):
        self.model = BasketballModel()
        self.handler = TrainingHandler()
        self.status = 0
        self.last_connection_amount = 0
        self.running_time = datetime.now()
        self.memory = ReplayMemory(100000)
        self.csv = CSVFile()
        super(ModelServer, self).__init__(self.HOST, self.PORT)

    def on_message_received(self, sock: socket, data, received_data: str,
                            addr: Tuple[str, int]) -> None:
        request = json.loads(received_data)
        print('Received {} from {}'.format(request, addr))
        if is_correct_message(request):
            host, prt = addr
            conn = self.handler.get_connection(prt)
            if is_result(request):
                res_throw = float(request['throw'])
                res_force = float(request['force'])
                res_distance = float(request['distance'])
                self.csv.add_observation(res_throw, res_force, res_distance,
                                         (datetime.now() -
                                          self.running_time).total_seconds())
                self.memory.push(res_throw, res_force, res_distance)
                conn.result = res_distance
            elif is_request(request):
                conn.distance = float(request['distance'])

    def on_step(self):
        # If all the results from the throws are in,
        if self.handler.all_results_are_in():
            # Then let us learn from all the results
            self.model.learn(self.handler.predictions,
                             self.handler.get_all_results())
            # Clear the results so that we can receive fresh results
            self.handler.clear_results()
            del self.handler.predictions
            self.status = 0
        # If all the distances are in
        if self.handler.all_distances_are_in():
            # Then we can predict the force and height
            throws = self.model.throw(self.handler.get_all_distances())
            # PyTorch tries to be clever, but we need it in the right dimensions
            if len(throws.shape) <= 1:
                throws = throws.unsqueeze(0)
            # Add the predictions to the training handler for later
            self.handler.predictions = throws
            # And send them to all the connected clients
            for conn, throw in zip(self.handler.get_connections(), throws):
                # In order to send the tensor data over the network,
                # we must first convert the tensor to simple python
                # data types and then we can access them as normal.
                t = throw[0].tolist()
                # t = random.uniform(0.2, 1)
                self.send_prediction_to_connection(conn, t, t)
            # Clear distances afterwards
            self.handler.clear_distances()
            self.status = 1

    def on_connection_closed(self, addr: Tuple[str, int]):
        host, port = addr
        self.handler.remove_connection(port)

    def on_accept_connection(self, sock: socket, addr: Tuple[str, int],
                             data: SimpleNamespace):
        host, port = addr
        self.handler.add_connection(Connection(sock, host, port, data))

    def send_prediction_to_connection(self, conn: Connection, force: float,
                                      height: float) -> None:
        prediction = {'Type': 'prediction', 'Force': force, 'Height': height}
        self.send_message(conn.data, prediction)

    def ask_for_distances(self, conn: Connection) -> None:
        request = {'Type': 'request'}
        self.send_message(conn.data, request)
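
The server above depends on is_correct_message, is_result and is_request, which are not shown. Given that outgoing messages use a 'Type' field and result messages carry 'throw', 'force' and 'distance', plausible predicates might look like the sketch below; the 'result' type value and the use of 'Type' on incoming messages are assumptions.

def is_correct_message(request: dict) -> bool:
    # Assumed convention: every valid message carries a 'Type' field.
    return isinstance(request, dict) and 'Type' in request


def is_result(request: dict) -> bool:
    # Result messages report the outcome of a throw: its throw angle, force and distance.
    return request.get('Type') == 'result'


def is_request(request: dict) -> bool:
    # Request messages ask the server for a new prediction at a given distance.
    return request.get('Type') == 'request'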
Example #28
0
for i in range(n_episode):
    data1 = pickle.load(pkl_file)
    data2 = pickle.load(pkl_file)
    data3 = pickle.load(pkl_file)
    print('episode is %d' % (i))
    for j in range(max_steps):
        for k in range(n_agents):
            tmp_state = Variable(pt.zeros(5, 22).type(FloatTensor))
            tmp_action = Variable(pt.zeros(5, 2).type(FloatTensor))
            tmp_state[0:4, :] = data1[j]
            tmp_state[4, :] = data1[j][k, :]
            tmp_action[0:4, :] = data2[j]
            tmp_action[4, :] = data2[j][k, :]

            memory.push(tmp_state, tmp_action, '', data3[j][k].cpu(), '')

loss_func = pt.nn.MSELoss().cuda()


class meta_critic(pt.nn.Module):
    def __init__(self, n_agent, dim_observation, dim_action):
        super(meta_critic, self).__init__()
        self.n_agent = n_agent
        self.dim_observation = dim_observation
        self.dim_action = dim_action
        obs_dim = self.dim_observation * n_agent
        act_dim = self.dim_action * n_agent

        self.FC1 = pt.nn.Linear(obs_dim, 1024)
        self.FC2 = pt.nn.Linear(1024 + act_dim, 512)
Example #29
0
    class Model(object):
        def __init__(self):
            self.Rewards = []
            self.eval_net = DQN(N_C, arg.h, arg.w, N_A).to(device)
            if (arg.Reload_net):
                print('========== Reload net! ==========')
                self.eval_net = torch.load('policy_net.pkl')

            self.target_net = DQN(N_C, arg.h, arg.w, N_A).to(device)
            self.target_net.load_state_dict(self.eval_net.state_dict())
            self.target_net.eval()

            self.memory_counter = 0  # for storing memory
            self.learn_step_counter = 0  # for target updating
            self.memory = ReplayMemory(MEMORY_CAPACITY)  # initialize memory
            self.loss_func = nn.MSELoss()
            self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                              lr=LR)

        def choose_action(self, x):
            self.eval_net.eval()

            N_ACTIONS = N_A
            x = process_x(x)

            # input only one sample
            if np.random.uniform() < EPSILON:  # greedy
                actions_value = self.eval_net.forward(x)
                action = torch.max(actions_value, 1)[1].data.numpy()
                action = action[0]
            else:  # random
                action = np.random.randint(0, N_ACTIONS)
                # action = action if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE)
            return action

        def store_transition(self, s, a, r, info, s_):

            # transition = np.hstack((s, a, r, info, s_))
            # transition = (s, a, r, info, s_)
            self.memory.push(s, a, r, info, s_)
            self.memory_counter += 1

        def learn(self):
            self.eval_net.train()
            # target parameter update
            if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
                print('------- replacing target network -------',
                      self.learn_step_counter)
                self.target_net.load_state_dict(self.eval_net.state_dict())
            self.learn_step_counter += 1

            # sample batch transitions
            memory = self.memory
            transitions = memory.sample(BATCH_SIZE)

            batch = Transition(*zip(*transitions))
            batch_s, batch_a, batch_r, batch_info, batch_s_ = batch

            info_array = np.array(batch_info)
            batch_position, batch_press_shift, batch_pos_passed = (
                info_array[:, 0], info_array[:, 1], info_array[:, 2])

            batch_s = torch.FloatTensor(batch_s)
            batch_s_ = torch.FloatTensor(batch_s_)
            batch_a = list_tensor(batch_a, 'long')
            batch_r = list_tensor(batch_r)

            q_eval = self.eval_net(batch_s).gather(1, batch_a)
            q_next = self.target_net(batch_s_).max(1)[0].view(
                BATCH_SIZE,
                1).detach()  # detach from graph, don't backpropagate
            q_target = batch_r + GAMMA * q_next

            # loss = self.loss_func(q_eval, q_target)
            loss = F.smooth_l1_loss(q_eval, q_target)

            if arg.print_loss:
                print('---- loss ----> {:6.3f},  ---- mse ----> {:6.3f}'.format(
                    float(loss.data.numpy()),
                    float(self.loss_func(q_eval, q_target).data.numpy())))

            self.optimizer.zero_grad()
            loss.backward()

            # tmp = 0
            # for param in self.eval_net.parameters():
            #     max_g = param.grad.data.numpy()
            #     mx = np.max(max_g)
            #     if(mx >tmp):
            #         tmp = mx
            # print(tmp)
            # param.grad.data.clamp_(-1, 1)

            if (arg.plot_net): plot_net(self.eval_net, 0)
            self.optimizer.step()
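
The learn() method above converts batches with a list_tensor helper that is not part of the snippet. A minimal version consistent with its usage (long column tensor for action indices, float column tensor for rewards) could be the following; the exact original may differ.

import torch


def list_tensor(values, kind='float'):
    # Turn a batch of scalars into a column tensor: LongTensor for action
    # indices (so it can be used with gather), FloatTensor otherwise.
    if kind == 'long':
        return torch.LongTensor(values).view(-1, 1)
    return torch.FloatTensor(values).view(-1, 1)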
Example #30
0
class Agent:
	def __init__(self, state_size=14, T=96, is_eval=True):
		self.state_size = state_size # normalized previous days
		self.action_size = 3
		self.memory = ReplayMemory(10000)
		self.inventory = []
		self.is_eval = is_eval
		self.T = T

		self.gamma = 0.99
		self.epsilon = 1.0
		self.epsilon_min = 0.01
		self.epsilon_decay = 0.995
		self.batch_size = 16
		if os.path.exists('models/target_model'):
			self.policy_net = torch.load('models/policy_model', map_location=device)
			self.target_net = torch.load('models/target_model', map_location=device)
		else:
			self.policy_net = DQN(state_size, self.action_size).to(device)
			self.target_net = DQN(state_size, self.action_size).to(device)

			for param_p in self.policy_net.parameters(): 
				weight_init.normal_(param_p)

		self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=0.00025)
		
	def act(self, state):
		if not self.is_eval and np.random.rand() <= self.epsilon:
			return random.randrange(self.action_size) - 1

		tensor = torch.FloatTensor(state).to(device)
		tensor = tensor.unsqueeze(0)
		options = self.target_net(tensor)
		# options = self.policy_net(tensor)
		return (np.argmax(options[-1].detach().cpu().numpy()) - 1)
		# return (np.argmax(options[0].detach().numpy()) - 1)

	def store(self, state, actions, new_states, rewards, action, step):
		if step < 1000: # first 1000 steps: store the transition of every candidate action
			for n in range(len(actions)):
				self.memory.push(state, actions[n], new_states[n], rewards[n])
		else:
			for n in range(len(actions)):
				if actions[n] == action:
					self.memory.push(state, actions[n], new_states[n], rewards[n])
					break

	def optimize(self, step):
		# print(len(self.memory))
		if len(self.memory) < self.batch_size * 10:
			return
		transitions = self.memory.sample(self.batch_size)
		# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
		# detailed explanation). This converts batch-array of Transitions
		# to Transition of batch-arrays.
		batch = Transition(*zip(*transitions))

		# Compute a mask of non-final states and concatenate the batch elements
		# (a final state would've been the one after which simulation ended).
		# Note: next_state is already a dense float tensor here, so the mask below is
		# all-True; this code assumes no stored next_state is ever None.
		next_state = torch.FloatTensor(batch.next_state).to(device)
		non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state)))
		non_final_next_states = torch.cat([s for s in next_state if s is not None])

		state_batch = torch.FloatTensor(batch.state).to(device)
		action_batch = torch.LongTensor(torch.add(torch.tensor(batch.action), torch.tensor(1))).to(device)
		reward_batch = torch.FloatTensor(batch.reward).to(device)

		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
		# columns of actions taken. These are the actions which would've been taken
		# for each batch state according to policy_net
		# The policy net outputs one row per timestep; [95:l:96] keeps only the last
		# timestep of each T = 96 window in the batch
		l = self.policy_net(state_batch).size(0)
		state_action_values = self.policy_net(state_batch)[95:l:96].gather(1, action_batch.reshape((self.batch_size, 1)))
		state_action_values = state_action_values.squeeze(-1)

		# Compute V(s_{t+1}) for all next states.
		# Expected values of actions for non_final_next_states are computed based
		# on the "older" target_net; selecting their best reward with max(1)[0].
		# This is merged based on the mask, such that we'll have either the expected
		# state value or 0 in case the state was final.
		next_state_values = torch.zeros(self.batch_size, device=device)
		next_state_values[non_final_mask] = self.target_net(next_state)[95:l:96].max(1)[0].detach()
		# Compute the expected Q values
		expected_state_action_values = (next_state_values * self.gamma) + reward_batch

		# Compute the loss
		loss = torch.nn.MSELoss()(expected_state_action_values, state_action_values)

		# Optimize the model
		self.optimizer.zero_grad()  # clear gradients accumulated from the previous update
		loss.backward()
		for param in self.policy_net.parameters():
				param.grad.data.clamp_(-1, 1)
		
		self.optimizer.step()
		
		if step % self.T == 0:
			# Soft (Polyak) update of the target network; `gamma` here is the
			# interpolation rate (often called tau), not the discount factor.
			gamma = 0.001
			target_update = copy.deepcopy(self.target_net.state_dict())
			for k in target_update.keys():
				target_update[k] = self.target_net.state_dict()[k] * (1 - gamma) + self.policy_net.state_dict()[k] * gamma
			self.target_net.load_state_dict(target_update)
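
The tail of optimize() performs the soft (Polyak) target update inline over the state_dict. Factoring it into a helper makes the interpolation rate explicit; the sketch below reproduces the same blend (the rate is conventionally called tau) and is only an illustration, not part of the original agent.

def soft_update(target_net, policy_net, tau=0.001):
    # target <- (1 - tau) * target + tau * policy, applied to every state_dict entry,
    # mirroring the loop at the end of optimize() above.
    target_state = target_net.state_dict()
    policy_state = policy_net.state_dict()
    for key in target_state:
        target_state[key] = target_state[key] * (1.0 - tau) + policy_state[key] * tau
    target_net.load_state_dict(target_state)

# usage inside optimize(): soft_update(self.target_net, self.policy_net, tau=0.001)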