Example #1
class ConnectionInterface:
  def __init__(self, n_inputs, n_actions, batch_size=128, train_frequency=10, memory_size=10000):
    self.model = Model.get_instance(n_inputs, n_actions)
    self.model.to(device)
    self.memory = ReplayMemory(memory_size)

    self.BATCH_SIZE = batch_size
    self.train_frequency = train_frequency

    self.tick = 0

  def get_action(self, s):
    state = torch.Tensor(s).to(device)
    action = self.model.get_action(state).item()

    return action

  def add_transition(self, s, a, r, ns):
    state = torch.Tensor(s).to(device)
    action = torch.LongTensor([[a]]).to(device)
    reward = torch.Tensor([r]).to(device)
    next_state = torch.Tensor(ns).to(device)

    self.memory.push(state, action, next_state, reward)

    if len(self.memory) >= self.BATCH_SIZE and self.tick % self.train_frequency == 0:
      print('Training')
      batch = self.memory.sample(self.BATCH_SIZE)
      self.model.optimise(batch)

    self.tick = self.tick + 1
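
Every snippet on this page pushes transitions into and samples from a ReplayMemory whose definition is not shown. As a point of reference only, here is a minimal sketch of the interface these examples appear to assume (a fixed-capacity cyclic buffer with push, sample and __len__, in the style of the PyTorch DQN tutorial); the Transition fields are illustrative and each project's real class may differ.

import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    """Fixed-capacity cyclic buffer of transitions (illustrative sketch)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Grow until capacity is reached, then overwrite the oldest transition.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)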
Example #2
class BasePolicy:
    # base class for policy implementation
    def __init__(self, buffer_size, gamma, model, actions_space: gym.Space,
                 summary_writer: SummaryWriter, lr):
        self.gamma = gamma
        self.writer = summary_writer  # use this to log your information to tensorboard
        self.model = model
        self.memory = ReplayMemory(
            capacity=buffer_size
        )  # example for using this memory - in q_policy.py
        self.action_space = actions_space  # you can sample a random action from here. example in q_policy.py

    def select_action(self, state, epsilon, global_step=None):
        # 'global_step' might be used as time-index for tensorboard recordings.
        raise NotImplementedError()

    def optimize(self, batch_size, global_step=None):
        raise NotImplementedError()

    def record(self, state, action, next_state, reward):
        self.memory.push(state, action, next_state, reward)

    def eval(self):
        self.model = self.model.eval()

    def train(self):
        self.model = self.model.train()
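
The comments above point to a q_policy.py that is not included on this page. Purely as an illustration of how the abstract methods are meant to be filled in (this is not the project's actual q_policy.py; the update details are assumptions), an epsilon-greedy Q-policy subclass could look roughly like this:

import random
import torch

class EpsilonGreedyQPolicy(BasePolicy):
    """Illustrative subclass; names and update details are assumptions."""

    def __init__(self, buffer_size, gamma, model, actions_space, summary_writer, lr):
        super().__init__(buffer_size, gamma, model, actions_space, summary_writer, lr)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def select_action(self, state, epsilon, global_step=None):
        # Explore with probability epsilon, otherwise take the greedy action.
        if random.random() < epsilon:
            return self.action_space.sample()
        with torch.no_grad():
            return self.model(state).argmax(dim=-1).item()

    def optimize(self, batch_size, global_step=None):
        if len(self.memory) < batch_size:
            return
        batch = self.memory.sample(batch_size)
        # ...unpack the batch, form targets r + gamma * max_a Q(s', a),
        # regress Q(s, a) onto them and step self.optimizer.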
Example #3
def train():
    policy_net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    
    optimizer = RMSprop(policy_net.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    memory = ReplayMemory(MEMORY_SIZE)

    env = Game(N_PLAYERS, LARGEST_CARD, HAND_SIZE, N_ROUNDS)
    select_action = generate_action_selector()

    rewards = []
    for episode in trange(N_EPISODES):
        total_reward = 0
        observation = env.reset()
        done = False

        while not done:
            state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            action = select_action(policy_net, state, observation.hand)

            observation, reward, done, info = env.step(action.item())
            total_reward += reward
            
            if not done:
                next_state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            else:
                next_state = None
            reward = torch.tensor([reward], device=device)
            memory.push(state, action, next_state, reward)
            state = next_state
            
            optimize_model(policy_net, target_net, optimizer, memory)
            if done:
                rewards.append(total_reward)
                break
        
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
        if episode % SAVE_INTERVAL == 0:
            torch.save(target_net.state_dict(), f'models/model_{episode}.pth')
        if episode % 100 == 0:
            plot_rewards(np.cumsum(rewards), baseline=np.zeros(len(rewards)))

    return rewards
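
This loop relies on optimize_model, create_state and generate_action_selector defined elsewhere in that project. For orientation only, a conventional DQN optimize_model consistent with how transitions are stored here (next_state is None on episode end) might look like the sketch below; BATCH_SIZE and GAMMA are assumed hyperparameters, not values from the original code.

import torch
import torch.nn.functional as F

BATCH_SIZE = 128  # assumed
GAMMA = 0.99      # assumed

def optimize_model(policy_net, target_net, optimizer, memory):
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    states, actions, next_states, rewards = zip(*transitions)

    state_batch = torch.cat(states)
    action_batch = torch.cat(actions).view(-1, 1)
    reward_batch = torch.cat(rewards)

    # Q(s, a) for the actions that were actually taken.
    q_values = policy_net(state_batch).gather(1, action_batch).squeeze(1)

    # max_a' Q_target(s', a'); terminal transitions (next_state is None) contribute 0.
    non_final_mask = torch.tensor([s is not None for s in next_states],
                                  dtype=torch.bool, device=state_batch.device)
    next_values = torch.zeros(len(transitions), device=state_batch.device)
    non_final_next = [s for s in next_states if s is not None]
    if non_final_next:
        with torch.no_grad():
            next_values[non_final_mask] = target_net(torch.cat(non_final_next)).max(1)[0]

    targets = reward_batch + GAMMA * next_values
    loss = F.smooth_l1_loss(q_values, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()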
Example #4
File: main.py Project: dt1483/SnFFT
def main(hparams):
    if not os.path.exists(hparams['logdir']):
        os.makedirs(hparams['logdir'])
    savedir = get_logdir(hparams['logdir'], hparams['savename'])
    os.makedirs(savedir)
    sumdir = os.path.join(savedir, 'logs')
    os.makedirs(sumdir)
    logfile = os.path.join(savedir, 'log.txt')
    logger = SummaryWriter(sumdir)

    with open(os.path.join(savedir, 'args.json'), 'w') as f:
        json.dump(hparams, f, indent=4)

    log = get_logger(logfile)
    log.debug('Saving in {}'.format(savedir))
    log.debug('hparams: {}'.format(hparams))

    torch.manual_seed(hparams['seed'])
    random.seed(hparams['seed'])

    alpha = eval(hparams['alpha'])
    parts = eval(hparams['parts'])
    log.info('alpha: {} | parts: {}'.format(alpha, parts))
    size = IRREP_SIZE[(alpha, parts)]
    pol_net = IrrepLinreg(size * size)
    targ_net = IrrepLinreg(size * size)

    if not hparams['init']:
        log.info('Loading fourier')
        pol_net.loadnp(NP_IRREP_FMT.format(str(alpha), str(parts)))
        targ_net.loadnp(NP_IRREP_FMT.format(str(alpha), str(parts)))
    else:
        pol_net.init(hparams['init'])
        targ_net.init(hparams['init'])
        log.info('Init model using mode: {}'.format(hparams['init']))

    if hparams['noise']:
        log.info('Adding noise: {}'.format(hparams['noise']))
        mu = torch.zeros(pol_net.wr.size())
        std = torch.zeros(pol_net.wr.size()) + hparams['noise']
        wr_noise = torch.normal(mu, std)
        wi_noise = torch.normal(mu, std)
        pol_net.wr.data.add_(wr_noise)
        pol_net.wi.data.add_(wi_noise)

        wr_noise = torch.normal(mu, std)
        wi_noise = torch.normal(mu, std)
        targ_net.wr.data.add_(wr_noise)
        targ_net.wi.data.add_(wi_noise)

    env = Cube2IrrepEnv(alpha, parts, solve_rew=hparams['solve_rew'])
    log.info('env solve reward: {}'.format(env.solve_rew))
    if hparams['opt'] == 'sgd':
        log.info('Using sgd')
        optimizer = torch.optim.SGD(pol_net.parameters(),
                                    lr=hparams['lr'],
                                    momentum=hparams['momentum'])
    elif hparams['opt'] == 'rms':
        log.info('Using rmsprop')
        optimizer = torch.optim.RMSprop(pol_net.parameters(),
                                        lr=hparams['lr'],
                                        momentum=hparams['momentum'])
    memory = ReplayMemory(hparams['capacity'])
    if hparams['meminit']:
        init_memory(memory, env)
    niter = 0
    nupdates = 0
    totsolved = 0
    solved_lens = []
    rewards = np.zeros(hparams['logint'])

    log.info('Before any training:')
    val_avg, val_prop, val_time, solve_lens = val_model(pol_net, env, hparams)
    log.info(
        'Validation | avg solve length: {:.4f} | solve prop: {:.4f} | time: {:.2f}s'
        .format(val_avg, val_prop, val_time))
    log.info(
        'Validation | LQ: {:.3f} | MQ: {:.3f} | UQ: {:.3f} | Max: {}'.format(
            np.percentile(solve_lens, 25), np.percentile(solve_lens, 50),
            np.percentile(solve_lens, 75), max(solve_lens)))
    scramble_lens = []
    for e in range(hparams['epochs']):
        if hparams['curric']:
            dist = curriculum_dist(hparams['max_dist'], e, hparams['epochs'])
        else:
            dist = hparams['max_dist']
        state = env.reset_fixed(max_dist=dist)
        epoch_rews = 0
        scramble_lens.append(dist)

        for i in range(hparams['maxsteps']):
            if hparams['norandom']:
                action = get_action(env, pol_net, state)
            elif random.random() < explore_rate(
                    e, hparams['epochs'] * hparams['explore_proportion'],
                    hparams['eps_min']):
                action = random.randint(0, env.action_space.n - 1)
            else:
                action = get_action(env, pol_net, state)

            ns, rew, done, _ = env.step(action, irrep=False)
            memory.push(state, action, ns, rew, done)
            epoch_rews += rew
            state = ns
            niter += 1

            if (not hparams['noupdate']
                ) and niter > 0 and niter % hparams['update_int'] == 0:
                sample = memory.sample(hparams['batch_size'])
                _loss = update(env, pol_net, targ_net, sample, optimizer,
                               hparams, logger, nupdates)
                logger.add_scalar('loss', _loss, nupdates)
                nupdates += 1

            if done:
                solved_lens.append(i + 1)
                totsolved += 1
                break

        rewards[e % len(rewards)] = epoch_rews
        logger.add_scalar('reward', epoch_rews, e)

        if e % hparams['logint'] == 0 and e > 0:
            val_avg, val_prop, val_time, _ = val_model(pol_net, env, hparams)
            logger.add_scalar('last_{}_solved'.format(hparams['logint']),
                              len(solved_lens) / hparams['logint'], e)
            if len(solved_lens) > 0:
                logger.add_scalar(
                    'last_{}_solved_len'.format(hparams['logint']),
                    np.mean(solved_lens), e)
            logger.add_scalar('val_solve_avg', val_avg, e)
            logger.add_scalar('val_prop', val_prop, e)
            log.info(
                '{:7} | dist: {:4.1f} | avg rew: {:5.2f} | solve prop: {:5.3f}, len: {:5.2f} | exp: {:.2f} | ups {:7} | val avg {:.3f} prop {:.3f}'
                .format(
                    e,
                    np.mean(scramble_lens),
                    np.mean(rewards),
                    len(solved_lens) / hparams['logint'],
                    0 if len(solved_lens) == 0 else np.mean(solved_lens),
                    explore_rate(
                        e, hparams['epochs'] * hparams['explore_proportion'],
                        hparams['eps_min']),
                    nupdates,
                    val_avg,
                    val_prop,
                ))
            solved_lens = []
            scramble_lens = []

        if e % hparams['updatetarget'] == 0 and e > 0:
            targ_net.load_state_dict(pol_net.state_dict())

    log.info('Total updates: {}'.format(nupdates))
    log.info('Total solved: {:8} | Prop solved: {:.4f}'.format(
        totsolved, totsolved / hparams['epochs']))
    logger.export_scalars_to_json(os.path.join(savedir, 'summary.json'))
    logger.close()
    torch.save(pol_net, os.path.join(savedir, 'model.pt'))
    check_memory()

    hparams['val_size'] = 10 * hparams['val_size']
    val_avg, val_prop, val_time, _ = val_model(pol_net, env, hparams)
    log.info(
        'Validation avg solve length: {:.4f} | solve prop: {:.4f} | time: {:.2f}s'
        .format(val_avg, val_prop, val_time))
Example #5
# should be unified when running in the server: which pkl file
memory = ReplayMemory(n_episode * n_agents * max_steps)

use_cuda = pt.cuda.is_available()

for i in range(n_episode):
    data1 = pickle.load(pkl_file)
    data2 = pickle.load(pkl_file)
    data3 = pickle.load(pkl_file)
    print('episode is %d' % (i))
    for j in range(max_steps):
        #for k in range(n_agents):
        tmp_whole_obs = data1[j]
        tmp_whole_act = data2[j]
        memory.push(tmp_whole_obs, tmp_whole_act, '', '', '')

loss_func = pt.nn.MSELoss().cuda()


class meta_actor(pt.nn.Module):
    def __init__(self, dim_observation, dim_action):
        # print('model.dim_action',dim_action)
        super(meta_actor, self).__init__()
        self.FC1 = pt.nn.Linear(dim_observation, 500)
        self.FC2 = pt.nn.Linear(500, 128)
        self.FC3 = pt.nn.Linear(128, dim_action)

    def forward(self, obs):
        result = F.relu(self.FC1(obs))
        result = F.relu(self.FC2(result))
        result = self.FC3(result)  # FC3 maps the hidden features to the action dimension
        return result

def main():
    # training loop
    # s_memory = ReplayMemory(capacity)
    memory = ReplayMemory(capacity)
    states = env.reset()
    episode = 0
    prev_states = np.concatenate([np.zeros([16, 112]),
                                  np.zeros([16, 112])]).reshape(-1, 4, 112)
    prev_reward = np.concatenate([np.zeros([16]),
                                  np.zeros([16])]).reshape(-1, 4, 1)
    prev_action_striker = np.zeros([16])
    prev_action_goalie = np.zeros([16])
    prev_action_striker = prev_action_striker.reshape(-1, 2, 1)
    prev_action_goalie = prev_action_goalie.reshape(-1, 2, 1)
    prev_action = np.concatenate([prev_action_striker, prev_action_goalie],
                                 axis=1)

    while episode < max_episodes:

        action_striker = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        action_goalie = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

        t1 = time.time()
        # if episode < 20:
        # action_striker = np.random.randint(7, size = [16])
        # action_goalie = np.random.randint(5, size = [16])
        # action_striker = np.array(action_striker)
        # action_goalie = np.array(action_goalie)
        # else:
        action_striker, action_goalie = Maddpg_.select_action(
            states[0], states[1])
        action_striker = np.argmax(action_striker.cpu().detach().numpy(),
                                   axis=1)
        action_goalie = np.argmax(action_goalie.cpu().detach().numpy(), axis=1)
        t2 = time.time()
        print(action_striker)
        print('action require: %f s' % (t2 - t1))
        states, reward, done, _ = env.step(action_striker,
                                           action_goalie,
                                           order="field")

        states_temp = deepcopy(states)
        states_temp[0] = states_temp[0].reshape(-1, 2, 112)
        states_temp[1] = states_temp[1].reshape(-1, 2, 112)
        states_temp = np.concatenate([states_temp[0], states_temp[1]], axis=1)

        memory.push(prev_states, states_temp, prev_action, prev_reward)
        t1 = time.time()
        loss_a, loss_c = Maddpg_.update_policy(memory)
        t2 = time.time()
        print(loss_a, loss_c)
        print('Update require: %f s' % (t2 - t1))

        prev_states, prev_reward, prev_action_striker, prev_action_goalie = states, reward, action_striker, action_goalie

        arg_done = np.argwhere(done[0] == True)
        prev_states[0][arg_done] = np.zeros([112])
        prev_states[1][arg_done] = np.zeros([112])
        prev_reward[0][arg_done] = 0
        prev_reward[1][arg_done] = 0
        prev_action_striker[arg_done] = 0
        prev_action_goalie[arg_done] = 0

        prev_states[0] = prev_states[0].reshape(-1, 2, 112)
        prev_states[1] = prev_states[1].reshape(-1, 2, 112)
        prev_states = np.concatenate([prev_states[0], prev_states[1]], axis=1)

        prev_reward[0] = prev_reward[0].reshape(-1, 2, 1)
        prev_reward[1] = prev_reward[1].reshape(-1, 2, 1)
        prev_reward = np.concatenate([prev_reward[0], prev_reward[1]], axis=1)

        prev_action_striker = prev_action_striker.reshape(-1, 2, 1)
        prev_action_goalie = prev_action_goalie.reshape(-1, 2, 1)
        prev_action = np.concatenate([prev_action_striker, prev_action_goalie],
                                     axis=1)

        if True in env.done_goalie:
            #     print("episode: ", episode, "*" * 10)
            #     # print(reward)
            #     # arg_done_goalie = np.argwhere(done_goa == True)
            #     if len(arg_done_goalie) == 2:
            #         print("arg_done_goalie", arg_done_goalie)

            #     for i in arg_done_goalie:
            #         # print("goalie %d"%(i[0]))
            #         # print("action", env.act_goalie_hist[i[0]])
            #         # print("Observation", env.observation_goalie_hist[i[0]])
            #         # print("reword", env.episode_goalie_rewards[i][0])
            #         pass
            #     arg_done_str = np.argwhere(done_goa == True)
            #     if len(arg_done_goalie) == 2:
            #         print("arg_done_str", arg_done_str)

            #     for i in arg_done_str:
            #         # print("str %d"%(i[0]))
            #         # print("action", env.act_striker_hist[i[0]])
            #         # print("Observation", env.observation_striker_hist[i[0]])
            #         # print("reword", env.episode_striker_rewards[i][0])
            #         pass
            #     # env.reset_some_agents(arg_done_str, arg_done_goalie)
            episode += 1
Example #7
"""
randomize state push in memory
before main loop start
"""
global_count = 0
episode = 0
while True:

    episode += 1
    T = 0
    state = env.reset()
    while T < args.max_step:
        action = random.randrange(0, args.action_space)
        next_state, reward, done, _ = env.step(action)
        memory.push([state, action, reward, next_state, done])
        state = next_state
        T += 1
        global_count += 1
        if done:
            break
    print("\r push : %d/%d  " % (global_count, args.learn_start),
          end='\r',
          flush=True)
    #    print("\r push : ",global_count,'/',args.learn_start,end='\r',flush=True)

    if global_count > args.learn_start:
        break

print('')
"""
Example #8
class Agent:
    def __init__(self, env, logger, gamma, start_learning, memory_size,
                 batch_size, target_update_step, policy_update_step,
                 max_episode_step, init_epsilon, epsilon_minimum,
                 epsilon_decay_rate, epsilon_decay_step, learning_rate,
                 n_episodes, n_actions, hidden_dim, print_interval,
                 policy_path, start_date):

        self.env = env
        self.gamma = gamma
        self.start_learning = start_learning
        self.batch_size = batch_size
        self.target_update_step = target_update_step
        self.policy_update_step = policy_update_step
        self.max_episode_step = max_episode_step
        self.epsilon_decay_rate = epsilon_decay_rate
        self.epsilon_decay_step = epsilon_decay_step
        self.n_episodes = n_episodes
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.n_actions = n_actions
        self.print_interval = print_interval
        self.start_date = start_date

        if policy_path:
            self.policy_net = torch.load(policy_path)
        else:
            self.policy_net = MLPPolicy(hidden_dim, n_actions,
                                        env.state_shape).float().to(self.device)
        self.target_net = MLPPolicy(hidden_dim, n_actions,
                                    env.state_shape).float().to(self.device)
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(),
                                          lr=learning_rate)
        self.memory = ReplayMemory(memory_size, env.state_shape)
        self.logger = logger
        self.epsilon = init_epsilon
        self.epsilon_minimum = epsilon_minimum

        self.memory_cache = ReplayMemory(self.max_episode_step,
                                         env.state_shape)

    def experience_replay(self, DEBUG=False):
        # Skip training DQN model if there are not enough saved transitions in the memory buffer
        # to give a input batch.
        if len(self.memory) < self.batch_size:
            # Return a loss value = 0 to notice that training is not yet started (only for logging)
            return torch.FloatTensor([0])

        # state batch shape: (B, N_STATES)
        # action batch shape: (B, 1)
        # reward batch shape: (B)
        state_batch, action_batch, reward_batch, next_state_batch = self.memory.sample(
            self.batch_size)

        # shape: (B)
        if DEBUG:
            print("State batch: \n", state_batch, "type: ",
                  state_batch.type())  # # torch.FloatTensor
            print("Action batch: \n", action_batch, "type: ",
                  action_batch.type())  # torch.LongTensor
            print("Reward batch: \n", reward_batch, "type: ",
                  reward_batch.type())  # torch.FloatTensor
            print("-----")

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch).view(self.batch_size)
        if DEBUG:
            print("Predicted Q values (LHS) = Q(s,a)")
            print("= ", state_action_values)
            print("type: ", state_action_values.type())  # torch.FloatTensor

        # RHS: r + gamma * max_a'( Q(s',a') )
        next_state_values = self.target_net(
            torch.FloatTensor(next_state_batch).to(self.device))
        if torch.isnan(next_state_values).any():
            next_state_values = torch.nan_to_num(next_state_values)
        next_state_values = torch.max(next_state_values, dim=1).values
        next_state_values = next_state_values.view(self.batch_size)
        # breakpoint()

        # expected_state_action_values :
        #     target Q values = r + gamma * max_a'( Q(s',a') )
        expected_state_action_values = (reward_batch +
                                        (self.gamma * next_state_values)).view(
                                            self.batch_size)
        if DEBUG:
            print("Target Q values (RHS) = r + gamma * max_a'( Q(s',a') )")
            print("= ", expected_state_action_values)
            print("type: ",
                  expected_state_action_values.type())  # torch.FloatTensor

        # Update
        loss = F.mse_loss(state_action_values, expected_state_action_values)
        if torch.isnan(loss):
            breakpoint()
        # Update of DQN network weights
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            # Gradients are clipped within range [-1,1], to prevent exploding magnitude of gradients
            # and failure of training.
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if DEBUG:
            print("Loss: ", loss)
            print("===== End of Experience Replay =====")
        # Return the computed loss value (for logging outside this function)
        return loss

    def get_epsilon(self, global_step):
        if global_step <= self.epsilon_decay_step and self.epsilon > self.epsilon_minimum:
            self.epsilon *= self.epsilon_decay_rate

    def select_action(self, state):
        """
        Input(s) :
        - policy_net: Policy DQN for predicting Q values (for Exploitation)
        - state: current state for predicting Q values (for Exploitation)
        - epsilon: exploration probability
        - params: dictionary of global parameters, expecting:
                  - params["N_ACTIONS"]: number of possible actions
        Output(s) :
        - action: action to be taken, a tensor with type long and shape (1,1)
        """
        while True:
            if random.random() <= self.epsilon:
                # With prob. epsilon
                action = random.randrange(0, self.n_actions, 1)
                action = torch.LongTensor([[action]]).to(self.device)

            else:
                # With prob. 1 - epsilon,
                # (Exploitation) select action with max predicted Q-Values of current state.

                with torch.no_grad():
                    action = torch.argmax(
                        self.policy_net(state)).unsqueeze(0).unsqueeze(0).to(
                            self.device)

            # The agent can only sell stocks when it is holding some;
            # Similarly, it can only buy stocks when it's holding nothing
            # action = 2 >> buy, action = 1 >> no sell no buy, action = 0 >> sell
            # Only valid actions can be returned.
            if self.env.holding_stocks and action in [0, 1]:
                break
            elif not self.env.holding_stocks and action in [1, 2]:
                break

        return action

    def train(self):
        self.policy_net.train()  # Set Policy DQN model as train mode
        start_time = time()  # Timer
        global_steps = 0
        for episode in range(self.n_episodes):
            # Initialize the environment, get initial state
            # you can change the beginning date here
            state = self.env.reset(date=self.start_date)
            # preprocess state
            state = preprocess_state(state, self.device)

            # Logging for current episode
            done = None  # To mark if current episode is done
            episode_reward = 0  # Sum of rewards received in current episode
            episode_step = 0  # Cumulative steps in current episode
            loss_meter = AverageMeter()

            # Loop till end of episode (done = True or when step reaches max)
            while not done and episode_step < self.max_episode_step:
                self.get_epsilon(global_steps)

                action = self.select_action(state)

                next_state, reward, done = self.env.step(action[0][0].item())

                if not done:
                    # preprocess next_state
                    next_state = preprocess_state(next_state, self.device)
                else:
                    next_state = [None]

                self.memory_cache.push(state, action, [reward], next_state)

                if reward is not None:
                    self.memory_cache.process_reward()
                    push_length = self.memory_cache.position
                    self.memory.push(
                        self.memory_cache.state[:push_length],
                        self.memory_cache.action[:push_length],
                        self.memory_cache.reward[:push_length],
                        self.memory_cache.next_state[:push_length])
                    self.memory_cache.reset()

                    loss = self.experience_replay(DEBUG=False)

                    loss_meter.update(loss.item())

                if global_steps % self.target_update_step == 0:
                    self.target_net.load_state_dict(
                        self.policy_net.state_dict())

                # Update training results at the end of episode.
                state = next_state
                global_steps += 1
                episode_step += 1
                if reward:
                    episode_reward += reward

            # Logging after an episode
            end_time = time()

            self.logger.record({
                'reward': episode_reward,
                'loss': loss_meter.avg
            })

            # Print out logging messages
            if episode % self.print_interval == 0:
                print("====================")
                print(f"Episode {episode}")
                print("Time: ", end_time - start_time)
                print("Global Steps: ", global_steps)
                print("Epsilon: ", self.epsilon)
                print("Loss: ", loss_meter.avg)
                print("Reward: ", episode_reward)
                print("====================")

        avg_reward = self.logger.get_avg_reward()
        self.logger.save_model(self.policy_net)
        return avg_reward
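
A hypothetical way to wire this agent up (every constructor argument and the start date below are illustrative placeholders, and env, logger and MLPPolicy come from the surrounding project, so none of these values are taken from the original code):

agent = Agent(env=env, logger=logger, gamma=0.99, start_learning=1000,
              memory_size=100000, batch_size=64, target_update_step=500,
              policy_update_step=1, max_episode_step=500, init_epsilon=1.0,
              epsilon_minimum=0.05, epsilon_decay_rate=0.995,
              epsilon_decay_step=10000, learning_rate=1e-3, n_episodes=300,
              n_actions=3, hidden_dim=128, print_interval=10,
              policy_path=None, start_date='2015-01-01')
avg_reward = agent.train()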
Example #9
class Initializer():
    def __init__(self): 
        self.seed = 2
        self.use_cuda = True
        self.replay_size = 1000000
        self.gamma = 0.99
        self.tau = 1e-3
        self.device = torch.device('cuda')
        self.max_iters = 10000000
        self.batch_size = 256+1 
        self.results_path = 'placeholder'
        self.statistic_dir = os.path.join(self.results_path, 'statistics/')
        self.gpu_id = 0
        
        torch.cuda.set_device(self.gpu_id)

        #if folder do not exists, create it
        os.makedirs(self.statistic_dir, exist_ok=True)

        self.metrics = {'steps': [], 'episodes': [], 'train_rewards': [], 'test_rewards': [], 'actor_loss': [], 'critic_loss': [], 'test_episodes': []} 
        

    
    def start(self):
        self.set_seed()
        self.env = ControlSuite('walker-walk', 2, 1000)
        self.max_iters = 1000
        
        self.agent = DDPG(self.gamma, self.tau,self.env.state_space(),self.env,self.device, self.results_path)
        # Initialize replay memory
        self.memory = ReplayMemory(int(self.replay_size))
        self.list_total_rewards = []
        self.list_iter = []
        self.step = 0
        self.current_episode = 0
        self.checkpoint_interval = 100
        self.train()

    
    def train(self):
        for episode in tqdm(range(self.max_iters) ):
            self.metrics['episodes'].append(self.current_episode)
            self.explore_and_collect(self.current_episode)

            if (self.current_episode % self.checkpoint_interval) == 0:
                self.test(self.current_episode)
                self.save_checkpoint()

            self.current_episode += 1
    

    def explore_and_collect(self, iter):
        state = torch.Tensor([self.env.reset()]).cpu()
        done = False
        total_reward = 0

        while not done:
            self.metrics['steps'] = self.step
            self.step += 1
            action = self.agent.get_action(state,iter, action_noise=False)
            next_state, reward, done, _ = self.env.step(action.cpu().numpy()[0])

            mask = torch.Tensor([done]).to(self.device)
            reward = torch.Tensor([reward]).to(self.device)
            next_state = torch.Tensor([next_state]).cpu()
            total_reward += reward

            self.memory.push(state, action, mask, next_state, reward)
            state = next_state

            if len(self.memory) > self.batch_size:
                self.fit_buffer()
            
            if (self.step%100) == 0:
                self.agent.hard_swap()

        #print("iter: ", iter, " total_reward: ", total_reward)
        #self.list_iter.append(iter)
        #self.list_total_rewards.append(total_reward.cpu())
        #plt.plot(self.list_iter, self.list_total_rewards)
        #plt.show()
        #plt.savefig('reward.png')
        self.metrics['train_rewards'].append(total_reward.item())
        self.lineplot(self.metrics['episodes'][-len(self.metrics['train_rewards']):], self.metrics['train_rewards'], 'train_rewards', self.statistic_dir)
        self.lineplot(self.metrics['episodes'][-len(self.metrics['actor_loss']):], self.metrics['actor_loss'], 'actor_loss', self.statistic_dir)
        self.lineplot(self.metrics['episodes'][-len(self.metrics['critic_loss']):], self.metrics['critic_loss'], 'critic_loss', self.statistic_dir)
        torch.save(self.metrics, os.path.join(self.statistic_dir , 'metrics.pth'))

    def save_checkpoint(self):
        self.agent.store_model()
        
    def load_checkpoint(self):
        self.agent.load_model()    
        self.metrics = torch.load(os.path.join(self.statistic_dir, 'metrics.pth'))
        self.current_episode = self.metrics['episodes'][-1]


    def fit_buffer(self):
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Update actor and critic according to the batch
        actor_loss, critic_loss = self.agent.update_params(batch)
        self.metrics['actor_loss'].append(actor_loss)
        self.metrics['critic_loss'].append(critic_loss)

    def test(self, episode):
        
        state = self.env.reset()
        state = torch.Tensor([state]).to(self.device)
        total_reward = 0
        done = False
        i = 0 
        while not done:
            action = self.agent.get_action(state, episode, action_noise=False)
            next_state, reward, done, _ = self.env.step(action.cpu().numpy()[0])

            mask = torch.Tensor([done]).to(self.device)
            reward = torch.Tensor([reward]).to(self.device)
            next_state = torch.Tensor([next_state]).to(self.device)
            total_reward += reward
            state = next_state
            i +=1

        print("Result of test: ", total_reward)
        #self.agent.train_mode()
        self.metrics['test_rewards'].append(total_reward.item())
        self.metrics['test_episodes'].append(episode)
        self.lineplot(self.metrics['test_episodes'][-len(self.metrics['test_rewards']):], self.metrics['test_rewards'], 'test_rewards', self.statistic_dir)


    # Plots min, max and mean + standard deviation bars of a population over time
    def lineplot(self, xs, ys_population, title, path='', xaxis='episode'):
        max_colour, mean_colour, std_colour, transparent = 'rgb(0, 132, 180)', 'rgb(0, 172, 237)', 'rgba(29, 202, 255, 0.2)', 'rgba(0, 0, 0, 0)'

        if isinstance(ys_population[0], (list, tuple)):
            ys = np.asarray(ys_population, dtype=np.float32)
            ys_min, ys_max, ys_mean, ys_std, ys_median = ys.min(1), ys.max(1), ys.mean(1), ys.std(1), np.median(ys, 1)
            ys_upper, ys_lower = ys_mean + ys_std, ys_mean - ys_std

            trace_max = Scatter(x=xs, y=ys_max, line=Line(color=max_colour, dash='dash'), name='Max')
            trace_upper = Scatter(x=xs, y=ys_upper, line=Line(color=transparent), name='+1 Std. Dev.', showlegend=False)
            trace_mean = Scatter(x=xs, y=ys_mean, fill='tonexty', fillcolor=std_colour, line=Line(color=mean_colour), name='Mean')
            trace_lower = Scatter(x=xs, y=ys_lower, fill='tonexty', fillcolor=std_colour, line=Line(color=transparent), name='-1 Std. Dev.', showlegend=False)
            trace_min = Scatter(x=xs, y=ys_min, line=Line(color=max_colour, dash='dash'), name='Min')
            trace_median = Scatter(x=xs, y=ys_median, line=Line(color=max_colour), name='Median')
            data = [trace_upper, trace_mean, trace_lower, trace_min, trace_max, trace_median]
        else:
            data = [Scatter(x=xs, y=ys_population, line=Line(color=mean_colour))]
        plotly.offline.plot({
            'data': data,
            'layout': dict(title=title, xaxis={'title': xaxis}, yaxis={'title': title})
        }, filename=os.path.join(path, title + '.html'), auto_open=False)

    def set_seed(self):
        print("Setting seed")
        os.environ['PYTHONHASHSEED']=str(self.seed)
        random.seed(self.seed)
        #torch.random.seed()
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
Example #10
class QAgent(Agent):
    def __init__(self):
        self.fex = Extractor()
        self.net = DQN()
        try:
            self.net.load_state_dict(torch.load('model.pth', map_location=torch.device('cpu')))
        except FileNotFoundError:
            # No saved checkpoint yet; keep the freshly initialised weights.
            print("Starting with new weights")
        self.net.eval()
        self.criterion = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.net.parameters())
        self.memory = ReplayMemory()
        self.training = False

        self.s = None
        self.a = None
        self.score = None

    def registerInitialState(self, state):
        self.s = None
        self.a = None
        self.score = None

    def getAction(self, game_state):
        legal = game_state.getLegalPacmanActions()
        if Directions.STOP in legal: legal.remove(Directions.STOP)
        state = self.fex(game_state)
        if self.training:
            state = state.cuda()
        with torch.no_grad():
            scores = self.net(state)
        scores = list(zip(ACTIONS, scores))
        legal_scores = [p for p in scores if p[0] in legal]
        action = max(legal_scores, key = lambda p: p[1])[0]

        if self.training:
            if random.random() < EPSILON:
                action = random.choice(legal)
            if self.s is not None:
                reward = game_state.getScore() - self.score
                reward = process_reward(self.s, state, reward)
                next_legals = game_state.getLegalActions()
                if Directions.STOP in next_legals: next_legals.remove(Directions.STOP)
                next_legals = (ACTION_MAP[d] for d in next_legals)
                self.memory.push(self.s, self.a, reward, state, next_legals)
            self.s = state
            self.a = ACTION_MAP[action]
            self.score = game_state.getScore()
        return action

    def final(self, state):
        if self.training:
            reward = state.getScore() - self.score
            reward = -10
            self.memory.push(self.s, self.a, reward, None, [])


    def train(self):
        global EPSILON
        self.training = True
        self.net.cuda()
        runners, names = load_runners()

        for epoch in range(EPOCHS):
            for t in self.net.parameters():
                print(t.data)
            if epoch <= 4:
                EPSILON = [0.8, 0.5, 0.3, 0.1, 0.01][epoch]
            print('Epoch {} | EPSILON {}'.format(epoch, EPSILON))
            g_dict = {}

            for runner, name in zip(runners, names):
                games = []
                for game_idx in range(GAMES_PER_EPOCH):
                    game = runner.run_game(self)
                    games.append(game)
                    for _ in range(SAMPLES_PER_GAME):
                        self.training_iteration()

                avg = np.mean([game.state.getScore() for game in games])
                wins = sum([game.state.isWin() for game in games])
                #print(f'{name}: {avg:0.2f} | {wins}/{GAMES_PER_EPOCH}')
                print('{}: {} | {}/{}'.format(name,avg, wins, GAMES_PER_EPOCH))
            print()
            torch.save(self.net.state_dict(), 'model.pth')


    def training_iteration(self):
        # sample mini-batch
        sarsl = self.memory.sample()
        if sarsl is None:
            return
        else:
            states, actions, rewards, next_states, next_state_legals = sarsl

        # replace deaths (None) with zeros
        for i, s in enumerate(next_states):
            if s is None:
                next_states[i] = self.fex.empty()
        next_states = torch.stack(next_states) 
        # get max Q(s',a'); deaths get value 0
        with torch.no_grad():
            next_actions_values = self.net(next_states)
            best_actions_values = []
            for next_legals, action_vals in zip(next_state_legals, next_actions_values):
                legal_vals = [v for (idx,v) in enumerate(action_vals) if idx in next_legals]
                if legal_vals == []:
                    legal_vals = [0]
                best_actions_values.append(max(legal_vals))
            best_actions_values = torch.tensor(best_actions_values).cuda()
        
            # compute target values
            targets = rewards + GAMMA*best_actions_values

        # compute current action values
        actions = actions.reshape(len(actions), 1)
        self.net.train()
        action_values = self.net(states).gather(1, actions).reshape(-1)
        self.net.eval()
        
        # compute loss and backpropagate it
        loss = self.criterion(targets, action_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def play(self, path):
        runner = LocalPacmanGameRunner(layout_path=path,
                                       random_ghosts=True,
                                       show_window=True,
                                       zoom_window=1.0,
                                       frame_time=0.1,
                                       timeout=-1000)
        game = runner.run_game(self)
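
A hypothetical driver for this agent (the layout path is a placeholder; train() assumes a CUDA device because the network is moved to the GPU):

agent = QAgent()
agent.train()                            # self-play over the loaded runners
agent.play('layouts/mediumClassic.lay')  # watch one game with the trained weights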
Example #11
class PPOAgent(object):
    def __init__(self, env, lr, hist_size=8, train_step=1024, trainable=True):

        self.filters1 = 16
        self.filters2 = 32
        self.filters3 = 64
        self.lr = lr
        self.hist_size = hist_size
        self.train_step = train_step
        self.clip_param = 0.1
        self.clip_param_end = 0.03
        self.clip_param_schedule = 1000000
        self.eps_denom = 1e-8
        self.episodes = 10000000
        self.save_frame = 50000
        self.evaluation_reward_length = 100
        self.epochs = 3
        self.num_epochs_trained = 0
        self.discount_factor = 0.99
        self.lam = 0.95
        self.batch_size = 32

        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_schedule = 1000000

        self.env = env
        nonspatial_act_size, spatial_act_depth = env.action_space
        self.nonspatial_act_size, self.spatial_act_depth = env.action_space
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.net = models.GraphConvNet(nonspatial_act_size, spatial_act_depth,
                                       self.device).to(self.device)
        self.target_net = models.GraphConvNet(nonspatial_act_size,
                                              spatial_act_depth,
                                              self.device).to(self.device)

        self.memory = ReplayMemory(self.train_step, self.hist_size,
                                   self.batch_size)
        self.optimizer = optim.Adam(params=self.net.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()

        self.c1 = 1.0
        self.c2 = 0.2

        ### scaling constants for spatial and nonspatial entropy
        self.c3 = 0.1
        self.c4 = 1.0

        self.averages = []

    def update_target_net(self):
        self.target_net.load_state_dict(self.net.state_dict())

    def load_saved_model(self):
        self.net.load_state_dict(
            torch.load("save_model/Starcraft2" + self.env.map + "PPO"))
        self.update_target_net()

    def train(self, training=True):

        evaluation_reward = deque(maxlen=self.evaluation_reward_length)

        ### Keep track of average episode rewards, episode values
        rewards, episodes = [], []

        ### Keeps track of number of frames seen by agent training
        frame = 0

        for e in range(self.episodes):

            done = False
            score = 0

            ### Stores previous output of LSTM
            LSTM_hidden = self.net.init_hidden(1, use_torch=False)

            ### Keeps track of length of current game
            step = 0
            score = 0

            state, reward, done, info = self.env.reset()
            action = [np.array([[0, 0], [0, 0]]), 0]
            value = 0
            r = 0
            G, X, avail_actions = state
            _select_next = True

            while not done:
                epsilon = self.epsilon_min + max(
                    0, (self.epsilon_max - self.epsilon_min) *
                    (1 - (frame / self.epsilon_schedule)))
                # Handle selection, edge cases

                if (not info['friendly_units_present']):
                    print("hello")
                    state, reward, done, info = self.env.step(0)
                    continue

                step += 1
                frame += 1

                prev_LSTM = LSTM_hidden
                prev_action = utils.action_to_onehot(
                    action, GraphConvConfigMinigames.action_space,
                    GraphConvConfigMinigames.spatial_width)

                ### Select action, value

                _, _, value, LSTM_hidden, action = self.net(
                    np.expand_dims(G, 1),
                    np.expand_dims(X, 1),
                    avail_actions,
                    LSTM_hidden,
                    np.expand_dims(prev_action, 1),
                    epsilon=epsilon,
                    choosing=True)
                value = value.cpu().data.numpy().item()
                LSTM_hidden = LSTM_hidden.cpu().data.numpy()

                spatial_action, nonspatial_action = action

                #print(action)
                ### Env step

                state, reward, done, info = self.env.step(
                    nonspatial_action, spatial_action[0], spatial_action[1])
                G, X, avail_actions = state
                action = [np.array(spatial_action), nonspatial_action]
                score += reward
                ### Append state to history
                #history.append(state)

                push_state = [G, X, avail_actions, prev_LSTM]

                ### Store transition in memory
                if (score == 0 and done):
                    reward -= 100
                    score -= 100
                self.memory.push(push_state, action, reward, done, value, 0, 0,
                                 step)

                ### Start training after random sample generation

                if (frame % self.train_step == 0 and frame != 0 and training):
                    prev_action = utils.action_to_onehot(
                        action, GraphConvConfigMinigames.action_space,
                        GraphConvConfigMinigames.spatial_width)
                    _, _, frame_next_val, _, _ = self.net(
                        np.expand_dims(G, 1), np.expand_dims(X, 1),
                        avail_actions, LSTM_hidden,
                        np.expand_dims(prev_action, 1))
                    frame_next_val = frame_next_val.cpu().data.numpy().item()
                    clip_param = self.clip_param_end + (
                        self.clip_param - self.clip_param_end) * max(
                            0, 1 - (frame / self.clip_param_schedule))
                    self.train_policy_net_ppo(frame, frame_next_val, epsilon,
                                              clip_param)

                    self.update_target_net()

                ### Save model, print time, record information
                if (frame % self.save_frame == 0):
                    #print('now time : ', datetime.now())
                    rewards.append(np.mean(evaluation_reward))
                    episodes.append(e)
                    plt.plot(episodes, rewards, 'r')
                    plt.savefig("save_model/Starcraft2" + self.env.map +
                                "PPOgraph.png")
                    torch.save(self.net.state_dict(),
                               "save_model/Starcraft2" + self.env.map + "PPO")

                ### Handle end of game logic
                if done:
                    evaluation_reward.append(score)
                    print("episode:", e, "  score:", score, "  steps:", step,
                          "  evaluation reward:", np.mean(evaluation_reward))
                    #state, reward, done, _ = self.env.reset()
                    self.averages.append(np.mean(evaluation_reward))
                    self.plot_results()

                G, X, avail_actions = state

    ### Main training logic
    def train_policy_net_ppo(self, frame, frame_next_val, epsilon, clip_param):

        for param_group in self.optimizer.param_groups:
            curr_lr = param_group['lr']
        print(
            "\n\n ------- Training network. lr: %f. clip: %f. epsilon: %f ------- \n\n"
            % (curr_lr, clip_param, epsilon))

        ### Compute value targets and advantage for all frames
        self.memory.compute_vtargets_adv(self.discount_factor, self.lam,
                                         frame_next_val)

        ### number of iterations of batches of size self.batch_size. Should divide evenly
        num_iters = int(len(self.memory) / self.batch_size)
        device = self.device
        ### Do multiple epochs
        for i in range(self.epochs):

            pol_loss = 0.0
            vf_loss = 0.0
            ent_total = 0.0

            self.num_epochs_trained += 1

            for j in range(num_iters):

                mini_batch = self.memory.sample_mini_batch(
                    frame, self.hist_size)
                mini_batch = np.array(mini_batch).transpose()

                states = np.stack(mini_batch[0], axis=0)
                G_states = np.stack(states[:, 0], axis=0)
                X_states = np.stack(states[:, 1], axis=0)
                avail_states = np.stack(states[:, 2], axis=0)
                hidden_states = np.concatenate(states[:, 3], axis=2)
                prev_actions = np.stack(states[:, 4], axis=0)
                relevant_states = np.stack(states[:, 5], axis=0)

                n = states.shape[0]

                actions = np.array(list(mini_batch[1]))
                spatial_actions = np.stack(actions[:, 0], 0)
                first_spatials = spatial_actions[:, 0]
                second_spatials = spatial_actions[:, 1]
                nonspatial_acts = np.array(actions[:, 1]).astype(np.int64)

                rewards = np.array(list(mini_batch[2]))
                dones = mini_batch[3]
                v_returns = mini_batch[5].astype(np.float32)
                advantages = mini_batch[6].astype(np.float32)

                first_spatials = torch.from_numpy(first_spatials).to(device)
                second_spatials = torch.from_numpy(second_spatials).to(device)
                nonspatial_acts = torch.from_numpy(nonspatial_acts).to(device)
                nonspatial_acts = nonspatial_acts.unsqueeze(1)

                rewards = torch.from_numpy(rewards).to(device)
                dones = torch.from_numpy(np.uint8(dones)).to(device)
                v_returns = torch.from_numpy(v_returns).to(device)
                advantages = torch.from_numpy(advantages).to(device)

                advantages = (advantages - advantages.mean()) / (torch.clamp(
                    advantages.std(), self.eps_denom))

                spatial_probs, nonspatial_probs, values, _, _ = self.net(
                    G_states,
                    X_states,
                    avail_states,
                    hidden_states,
                    prev_actions,
                    relevant_frames=relevant_states)
                old_spatial_probs, old_nonspatial_probs, old_values, _, _ = self.target_net(
                    G_states,
                    X_states,
                    avail_states,
                    hidden_states,
                    prev_actions,
                    relevant_frames=relevant_states)

                #print(nonspatial_probs.shape, self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials).shape, (nonspatial_acts < 2).shape)
                #print(nonspatial_probs.shape, nonspatial_acts.shape)
                #print(nonspatial_probs[range(self.batch_size),nonspatial_acts].shape)

                gathered_nonspatials = nonspatial_probs.gather(
                    1, nonspatial_acts).squeeze(1)
                old_gathered_nonspatials = old_nonspatial_probs.gather(
                    1, nonspatial_acts).squeeze(1)
                first_spatial_mask = (nonspatial_acts < 3).to(
                    self.device).float().squeeze(1)
                second_spatial_mask = (nonspatial_acts == 0).to(
                    self.device).float().squeeze(1)

                numerator = torch.log(
                    gathered_nonspatials + self.eps_denom) + torch.log(
                        self.index_spatial_probs(spatial_probs[:, 0, :, :],
                                                 first_spatials) +
                        self.eps_denom) * first_spatial_mask + (torch.log(
                            self.index_spatial_probs(spatial_probs[:, 1, :, :],
                                                     second_spatials) +
                            self.eps_denom) * second_spatial_mask)
                denom = torch.log(
                    old_gathered_nonspatials + self.eps_denom) + torch.log(
                        self.index_spatial_probs(old_spatial_probs[:, 0, :, :],
                                                 first_spatials) +
                        self.eps_denom) * first_spatial_mask + (torch.log(
                            self.index_spatial_probs(
                                old_spatial_probs[:, 1, :, :], second_spatials)
                            + self.eps_denom) * second_spatial_mask)
                """
                denom = old_gathered_nonspatials
                print(nonspatial_probs.shape)
                print(denom.shape)
                print((nonspatial_acts < 3).shape)
                print(((self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials)) * (nonspatial_acts < 3).to(self.device).float()).shape)
                denom[nonspatial_acts < 3] = denom[nonspatial_acts < 3] * self.index_spatial_probs(spatial_probs[:,0,:,:], first_spatials)
                denom[nonspatial_acts == 0] = denom[nonspatial_acts == 0] * self.index_spatial_probs(old_spatial_probs[:,1,:,:], second_spatials)
                
                denom = torch.log( torch.clamp( denom, self.eps_denom ) )
                """

                ratio = torch.exp(numerator - denom)
                ratio_adv = ratio * advantages.detach()
                bounded_adv = torch.clamp(
                    ratio, 1 - clip_param,
                    1 + clip_param) * advantages.detach()
                """
                print("ratio: ", ratio, "\n\n")
                print("numerator: ", numerator, "\n\n")
                print("denominator: ", denom, "\n\n")
                """

                pol_avg = -((torch.min(ratio_adv, bounded_adv)).mean())

                value_loss = self.loss(values.squeeze(1), v_returns.detach())

                ent = self.entropy(spatial_probs, nonspatial_probs)

                total_loss = pol_avg + self.c1 * value_loss - self.c2 * ent
                self.optimizer.zero_grad()
                total_loss.backward()
                self.optimizer.step()

                pol_loss += pol_avg.detach().item()
                vf_loss += value_loss.detach().item()
                ent_total += ent.detach().item()

            pol_loss /= num_iters
            vf_loss /= num_iters
            ent_total /= num_iters
            print(
                "Iteration %d: Policy loss: %f. Value loss: %f. Entropy: %f" %
                (self.num_epochs_trained, pol_loss, vf_loss, ent_total))

        print("\n\n ------- Training sequence ended ------- \n\n")

    def index_spatial_probs(self, spatial_probs, indices):
        index_tuple = torch.meshgrid(
            [torch.arange(x) for x in spatial_probs.size()[:-2]]) + (
                indices[:, 0],
                indices[:, 1],
            )
        output = spatial_probs[index_tuple]
        return output

    def get_recent_hist(self, hist):
        length = min(len(hist), self.hist_size)
        if (length == 0):
            return []
        else:
            return hist[-length:]

    def entropy(self, spatial_probs, nonspatial_probs):
        ent = -self.c3 * (torch.mean(
            torch.sum(
                spatial_probs[:, 0, :, :] *
                torch.log(spatial_probs[:, 0, :, :] + self.eps_denom),
                dim=(1, 2))) + self.c4 * torch.mean(
                    torch.sum(nonspatial_probs *
                              torch.log(nonspatial_probs + self.eps_denom),
                              dim=1)))
        return ent

    def clip_gradients(self, clip):

        ### Clip the gradients of self.policy_net
        for param in self.net.parameters():
            if param.grad is None:
                continue
            #print(torch.max(param.grad.data), torch.min(param.grad.data))
            param.grad.data = param.grad.data.clamp(-clip, clip)

    def plot_results(self):
        plt.figure(1)
        plt.clf()
        plt.suptitle('Select-Move PPO')
        plt.title('Agent trained by Ray Sun, David Long, Michael McGuire',
                  fontsize=7)
        plt.xlabel('Training iteration - DefeatRoaches')
        plt.ylabel('Average score')
        plt.plot(self.averages)
        plt.pause(0.001)  # pause a bit so that plots are updated
Example #12
            action_index = 0  # so the data isn't relevant to learn
        observation, reward, done, info = env.step(actions[action_index])
        last_screen = current_screen
        on_grass, current_screen = transform_obs(observation)
        # Change of the reward to add penalty when the agent isn't on the road
        if (reward < 0):
            if (on_grass and t > 50):
                reward = float(-1)
            if (not on_grass and t > 50):
                reward = float(0.1)
        if (t <= 50):
            reward = float(0)
        reward = torch.tensor([reward], device=device)

        # Store the transition in memory
        memory.push(last_screen, action_index, current_screen, reward)

        # Move to the next state
        state = current_screen

        # Perform one step of the optimization (on the target network)
        optimize_model()
        tot_reward += reward
        if done:
            break

    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
    torch.save(policy_net.state_dict(), './models/model')  # Save the model
Example #13
class DQN_agent:
    def __init__(self,env,policy,target,n_action=18,capacity=100000,batch_size=32,lr=2.5e-4,gamma=0.99,burn_in=50000,C=1000,eps_decay=1000000):
        self.env=env
        self.n_action=n_action
        self.memory=ReplayMemory(capacity)
        self.device="cuda"
        self.policy=policy
        self.target=target
        self.batch_size=batch_size
        self.gamma=gamma
        self.lr=lr
        self.opt= optim.Adam(self.policy.parameters(), lr=self.lr)
        self.burn_in=burn_in
        self.C=C
        self.eps_decay=eps_decay
        self.loss=nn.MSELoss()
    def get_state(self,obs):
        state=torch.FloatTensor(np.array(obs).transpose(2,0,1)).unsqueeze(0)
        return(state)
    def get_action(self,state,eps):
        x=random.random()
        if x<eps:
            return(torch.tensor([[random.randrange(self.n_action)]], dtype=torch.long))
        else:
            with torch.no_grad():
                return(self.policy(state.to("cuda")).max(1)[1].view(1,1))
    def update_policy(self):
        state,action,reward,next_state,done=self.memory.sample(self.batch_size)
        state=state.to("cuda")
        action=action.to("cuda")
        next_state=next_state.to("cuda")
        reward=reward.to("cuda")
        done=done.to("cuda")
        q=self.policy(state).gather(1,action.unsqueeze(1)).squeeze(1)
        # bootstrap from the target network; detach so no gradients flow into it
        q_max=self.target(next_state).max(1)[0].detach()
        y=(reward+self.gamma*q_max)*(1-done)+reward*done
        loss=self.loss(q,y)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return
    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())
    def train(self,episodes):
        steps=0
        reward_list=[]
        for episode in range(episodes):
            obs=self.env.reset()
            state=self.get_state(obs)
            reward_episode=0
            done=False
            while not done:
                steps+=1
                # linear epsilon anneal from 1.0 to 0.1 over eps_decay steps, then held at 0.1
                test_eps=int(steps>self.eps_decay)
                eps=(1-steps*(1-0.1)/self.eps_decay)*(1-test_eps)+0.1*test_eps
                action=self.get_action(state,eps)
                obs,reward,done,info=self.env.step(action.item())
                reward_episode+=reward
                next_state=self.get_state(obs)
                reward = torch.tensor([reward], device="cpu", dtype=torch.float)
                action = torch.tensor([action.item()], device="cpu", dtype=torch.long)
                done_flag = torch.tensor([int(done)], device="cpu", dtype=torch.float)
                self.memory.push(state,action,reward,next_state,done_flag)
                if steps>self.burn_in:
                    self.update_policy()
                if steps>self.burn_in and steps%self.C==0:
                    self.update_target()
                state=next_state
            reward_list.append(reward_episode)
            if episode%100 == 0:
                print('Total steps: {} \t Episode: {}/{} \t Mean reward (last 100): {}'.format(steps, episode, episodes, np.mean(reward_list[-100:])))
            if episode%500==0:
                print(reward_list)
        self.env.close()
        print(reward_list)
        return(reward_list)
    def save_model(self,name):
        torch.save(self.policy,name)
        return
    def load_model(self,name):
        self.policy=torch.load(name)
    def test(self,n_episodes):
        test_reward=[]
        for episode in range(n_episodes):
            obs = self.env.reset()
            state = self.get_state(obs)
            reward_episode = 0.0
            done=False
            while not done:
                with torch.no_grad():
                    action=self.policy(state.to("cuda")).max(1)[1].view(1,1)
                obs,reward,done,info=self.env.step(action.item())
                reward_episode+=reward
                state=self.get_state(obs)
                if done:
                    print("Finished Episode {} with reward {}".format(episode, reward_episode))
            test_reward.append(reward_episode)
        # close the environment once all test episodes are finished
        self.env.close()
        return(test_reward)
Example #14
0
class Agent:
    def __init__(self,
                 env,
                 exploration_rate=1,
                 exploration_decay=0.9999,
                 explore=True):
        self.action_space = env.action_space.n
        self.memory = ReplayMemory(MEMORY_SIZE)
        self.memory.fill_memory(env)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)
        self.dqn = DQN(4, self.action_space).float().to(self.device)
        self.env = env
        self.episode_rewards = []
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.explore = explore
        self.model_optim = optim.Adam(self.dqn.parameters(), lr=1e-4)
        self.episodes = 0

    def get_action(self, obs):
        if self.exploration_rate > random.random() and self.explore:
            action = random.randint(0, self.action_space - 1)
        else:
            obs = torch.tensor(obs, device=self.device).reshape(1, 4, 80,
                                                                80).float()
            action = self.dqn(obs).argmax().tolist()
        return action

    def train(self, num_episodes):
        num_steps = 0
        running_loss = 0
        loss = nn.MSELoss()

        episode_rewards = []
        for episode in tqdm(range(num_episodes)):
            obs = rgb2gray(self.env.reset()).reshape(1, 80, 80)
            for i in range(3):
                obs = np.append(obs, rgb2gray(self.env.step(0)[0]), 0)

            terminal = False
            episode_reward = 0
            while not terminal:
                action = self.get_action(obs)
                result = self.env.step(action)

                terminal = result[2]
                new_obs = np.append(obs[1:], rgb2gray(result[0]), 0)
                reward = result[1]
                if reward > 0:
                    print(episode, reward)
                episode_reward += reward

                self.memory.push(obs, action, new_obs, reward, terminal)
                batch = self.memory.sample(BATCH_SIZE)
                observations, y = self.process_batch(batch)
                num_steps += 1

                outputs = self.dqn(observations)
                episode_loss = loss(outputs, y)
                self.model_optim.zero_grad()
                episode_loss.backward()
                self.model_optim.step()
                running_loss += episode_loss.item()

                if num_steps % 1000 == 0:  # progress marker every 1000 steps
                    print(num_steps)

            episode_rewards.append(episode_reward)
            if self.exploration_rate > 0.1:
                self.exploration_rate *= self.exploration_decay
        self.episodes += num_episodes
        self.save(str(self.episodes) + '_model')
        self.episode_rewards += episode_rewards
        np.save(str(self.episodes) + '_rewards', self.episode_rewards)
        return episode_rewards

    def process_batch(self, batch):
        observations = [batch[i][0] for i in range(len(batch))]
        observations = torch.tensor(np.array(observations)).reshape(
            (BATCH_SIZE, 4, 80, 80)).float().to(self.device)

        next_observations = [batch[i][2] for i in range(len(batch))]
        next_observations = torch.tensor(np.array(next_observations)).reshape(
            (BATCH_SIZE, 4, 80, 80)).float().to(self.device)

        # bootstrap values: max_a Q(s', a); detach so the targets do not backpropagate
        maxs = self.dqn(next_observations)
        maxs = maxs.max(1).values.float().detach().to(self.device)

        rewards = [batch[i][3] for i in range(len(batch))]
        rewards = torch.tensor(rewards).float().to(self.device)

        # 1.0 for non-terminal transitions, 0.0 for terminal ones
        non_terminals = [not batch[i][4] for i in range(len(batch))]
        non_terminals = torch.tensor(non_terminals).float().to(self.device)

        # zero the bootstrap term for terminal states
        maxs = maxs * non_terminals

        # start from the current predictions and overwrite only the taken actions,
        # so the untaken actions contribute zero loss
        y = self.dqn(observations).detach()
        Qs = rewards + GAMMA * maxs

        for i in range(len(batch)):
            y[i, batch[i][1]] = Qs[i]

        return observations, y

    def load_dqn(self, path):
        self.dqn = torch.load(path)

    def save(self, path):
        torch.save(self.dqn, path)
Example #15
0
ep_durations = [0]  # used for plotting
returns = [0]
last_state_values = [0]
first_state_values = [0]

for i_episode in range(INIT_RM):
    if not TRAIN:
        break
    cur_state = env.reset()
    while True:
        # fill the replay memory with random experience before training starts
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)

        if done:
            reward = -1
            memory.push(FloatTensor([cur_state]), LongTensor([action]), None,
                        FloatTensor([reward]))
        else:
            # tensors of shape (1, state_dim) for states, (1,) for the action and reward
            memory.push(FloatTensor([cur_state]), LongTensor([action]),
                        FloatTensor([next_state]), FloatTensor([reward]))

        cur_state = next_state

        if done:
            break

start_time = time.time()
frames = 0
i_episode = 0

while frames < N_FRAMES:  #start of training
Example #16
0
class DQN(object):
    def __init__(self,
                 config,
                 env,
                 doubleDQN=False,
                 duelingDQN=False,
                 NoisyDQN=False,
                 N_stepDQN=False,
                 Prioritized=False):
        self.device = config.device

        self.doubleDQN = doubleDQN
        self.duelingDQN = duelingDQN
        self.NoisyDQN = NoisyDQN
        self.N_stepDQN = N_stepDQN
        self.Prioritized = Prioritized

        self.gamma = config.gamma  # discount factor
        self.learning_rate = config.learning_rate  # learning rate
        self.replace_target_iter = config.replace_target_iter  # target-network update frequency
        self.replay_size = config.replay_size  # replay buffer capacity
        self.batch_size = config.batch_size  # batch size
        self.priority_alpha = config.priority_alpha
        self.priority_beta_start = config.priority_beta_start
        self.priority_beta_frames = config.priority_beta_frames

        self.epsilon = config.epsilon  # initial epsilon (probability of acting randomly)
        self.epsilon_final = config.epsilon_final  # minimum value of epsilon
        self.epsilon_decay = config.epsilon_decay  # epsilon decay rate

        self.num_states = env.observation_space.shape[0]  # state-space dimension
        self.num_actions = env.action_space.n  # number of actions

        self.learn_start = self.batch_size * 3  # threshold that controls when learning starts

        self.learn_step_counter = 0  # total number of learning steps taken

        self.N_step = config.N_step  # number of steps for multi-step (N-step) learning

        self.N_step_buffer = []

        if self.Prioritized:
            self.memory = PrioritizedReplayMemory(
                self.replay_size, self.priority_alpha,
                self.priority_beta_start, self.priority_beta_frames)  # initialize replay buffer
        else:
            self.memory = ReplayMemory(self.replay_size)  # initialize replay buffer

        if self.duelingDQN:
            # initialize the evaluation (online) network
            self.eval_net = DuelingDQNNet(self.num_states,
                                          self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = DuelingDQNNet(self.num_states,
                                            self.num_actions).to(self.device)
        elif self.NoisyDQN:
            # initialize the evaluation (online) network
            self.eval_net = NoisyNet(self.num_states,
                                     self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = NoisyNet(self.num_states,
                                       self.num_actions).to(self.device)
        else:
            self.eval_net = DQNNet(self.num_states,
                                   self.num_actions).to(self.device)
            # initialize the target network
            self.target_net = DQNNet(self.num_states,
                                     self.num_actions).to(self.device)

        # the target network starts with the same parameters as the evaluation network
        self.target_net.load_state_dict(self.eval_net.state_dict())

        # optimizer for training
        self.optimizer = optim.Adam(self.eval_net.parameters(),
                                    lr=self.learning_rate)

        # mean-squared-error loss
        self.loss_func = nn.MSELoss()

    # store a transition in memory
    def store_transition(self, state, action, reward, next_state, done):
        if self.N_stepDQN:
            # put the current transition into the N-step buffer
            self.N_step_buffer.append(
                (state, action, reward, next_state, done))

            # if the buffer does not yet hold N transitions, return
            if len(self.N_step_buffer) < self.N_step:
                return

            # compute the N-step return R = sum_i gamma^i * r_i
            R = sum([
                self.N_step_buffer[i][2] * (self.gamma**i)
                for i in range(self.N_step)
            ])
            state, action, _, _, _ = self.N_step_buffer.pop(0)

            # store (s_t, a_t, R, s_{t+N}, done)
            self.memory.push((state, action, R, next_state, done))
        else:
            self.memory.push((state, action, reward, next_state, done))

    # epsilon-greedy action selection
    def choose_action(self, s):
        with torch.no_grad():
            if np.random.random(
                    1) >= self.epsilon:  # if the sample is >= epsilon, take the greedy (max-Q) action
                X = torch.tensor([s], device=self.device, dtype=torch.float)
                a = self.eval_net(X).max(1)[1].view(1, 1)  # Q values from the eval network
                return a.item()
            else:  # otherwise act randomly
                return np.random.randint(0, self.num_actions)

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, indices, weights = self.memory.sample(
            self.batch_size)  # batch of transitions

        # unzip the batch of transitions,
        # e.g. zipped = [(1, 4), (2, 5), (3, 6)] unzips via zip(*zipped) to [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
            *transitions)

        # convert the samples to tensors
        batch_state = torch.tensor(batch_state,
                                   device=self.device,
                                   dtype=torch.float)
        batch_action = torch.tensor(batch_action,
                                    device=self.device,
                                    dtype=torch.long).squeeze().view(
                                        -1, 1)  # view as a column tensor
        batch_reward = torch.tensor(batch_reward,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state,
                                        device=self.device,
                                        dtype=torch.float)
        batch_done = torch.tensor(batch_done,
                                  device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("state:", batch_state.shape)  # 128, 4
        # print("action:", batch_action.shape)
        # print("reward:", batch_reward.shape)
        # print("done:", batch_done.shape)
        #
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights

    # one learning step
    def learn(self):
        # periodically update the target network
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        # sample a batch
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, indices, weights = self.get_batch(
        )

        # print("state:", batch_state)
        # print("action:", batch_action)
        # print("done:", batch_done)

        # compute q(s,a;θ)
        if self.NoisyDQN:
            self.eval_net.sample_noise()
        q_s_a = self.eval_net(batch_state).gather(1, batch_action)
        # print("q_s_a:", q_s_a.shape)

        # compute the target yj = rj + (1 - done) * gamma * max(q(s',a;θ'))
        with torch.no_grad():
            if self.NoisyDQN:
                self.target_net.sample_noise()
            if self.doubleDQN:
                # Double DQN: the eval network selects a', the target network evaluates it
                next_max_action = self.eval_net(batch_next_state).max(
                    dim=1)[1].view(-1, 1)
                q_target = batch_reward + (
                    1. - batch_done) * self.gamma * self.target_net(
                        batch_next_state).gather(1, next_max_action)
                # print("q_target:", q_target)
                # print("q_target.shape:", q_target.shape)
            else:
                next_q = self.target_net(batch_next_state)
                # print("next_q:", next_q)
                max_next_q_a = next_q.max(1)[0].view(-1, 1)
                # print("max_next_q_a:", max_next_q_a)
                # print("max_next_q_a.shape:", max_next_q_a.shape)
                q_target = batch_reward + (
                    1. - batch_done) * self.gamma * max_next_q_a
                # print("q_target:", q_target)
                # print("q_target.shape:", q_target.shape)

        # update priorities (if prioritized replay) and the network parameters
        if self.Prioritized:
            diff = (q_target - q_s_a)
            self.memory.update_priorities(
                indices,
                diff.detach().squeeze().abs().cpu().numpy().tolist())
        loss = self.loss_func(q_target, q_s_a)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # increment the learning-step counter
        self.learn_step_counter += 1

    # save the model
    def save(self):
        if self.duelingDQN:
            torch.save(self.eval_net, 'duelingDQN.pkl')
        elif self.NoisyDQN:
            torch.save(self.eval_net, 'NoisyDQN.pkl')
        elif self.N_stepDQN:
            torch.save(self.eval_net, 'N_stepDQN.pkl')
        elif self.Prioritized:
            torch.save(self.eval_net, 'PriorityReplayDQN.pkl')
        else:
            torch.save(self.eval_net, 'DQN.pkl')

    # load the model
    def load(self):
        if self.duelingDQN:
            self.eval_net = torch.load('duelingDQN.pkl')
        elif self.NoisyDQN:
            self.eval_net = torch.load('NoisyDQN.pkl')
        elif self.N_stepDQN:
            self.eval_net = torch.load('N_stepDQN.pkl')
        elif self.Prioritized:
            self.eval_net = torch.load('PriorityReplayDQN.pkl')
        else:
            self.eval_net = torch.load('DQN.pkl')
Example #17
0
class MADDPG_Agent:
    def __init__(self, n_agents, dim_obs, dim_act, batch_size, 
                        capacity, eps_b_train):
        """ Initialize an Agent object.

        Params
        =======
            n_agents (int)   : number of agents
            dim_obs (int)    : dimension of each state
            dim_act (int)    : dimension of each action
            batch_size (int) : batch size
            capacity (int)   : replay buffer capacity
            eps_b_train (int): number of episodes before training starts
        """

        self.n_agents = n_agents
        self.dim_obs = dim_obs
        self.dim_act = dim_act
        self.batch_size = batch_size
        self.capacity = capacity
        self.eps_b_train = eps_b_train
        self.memory = ReplayMemory(capacity, RANDOM_SEED)
        self.cuda_on = th.cuda.is_available()
        self.var = [1.0 for i in range(n_agents)]
        self.checkpoint_dir = 'checkpoints/'
        self.seed = random.seed(RANDOM_SEED)

        # Actor Network with Target Network
        self.actors = [Actor(dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)]
        self.actors_target = [Actor(dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)] #deepcopy(self.actors)
        self.actor_optimizer = [Adam(x.parameters(), lr=LR_ACTOR) for x in self.actors]

        # Critic Network with Target Network
        self.critics = [Critic(n_agents,dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)]
        self.critics_target = [Critic(n_agents,dim_obs, dim_act, RANDOM_SEED) for i in range(n_agents)] #deepcopy(self.critics)
        self.critic_optimizer = [Adam(x.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) for x in self.critics]

        # Noise process
        self.noise = [OUNoise(dim_act, RANDOM_SEED) for i in range(n_agents)]

        # Enable the use of CUDA
        if self.cuda_on:
            for m in [self.actors, self.critics, self.actors_target, self.critics_target]:
                for x in m:
                    x.cuda()

        self.steps_done = 0
        self.eps_done = 0


    def step(self, states,actions, rewards, next_states, dones, add_noise=True):
        """Save experience in replay memory, and use random sample for buffer to learn."""
        self.memory.push(states, actions, next_states, rewards)
        #print("memory size = ",len(self.memory))

        # Learn, if enough samples are available in memory
        if self.eps_done % NUM_STEPS_TO_UPDATE == 0:
            for i in range(NUM_STEPS_TO_UPDATE):
                c_loss,a_loss = self.learn()

    def act2(self, state):
        actions = th.zeros(
            self.n_agents,
            self.dim_act)
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        for i in range(self.n_agents):
            sb = state[i, :].detach()
            self.actors[i].eval()
            with th.no_grad():
                act = self.actors[i](sb.unsqueeze(0)).squeeze()
            self.actors[i].train()
            act += th.from_numpy(self.noise[i].sample()).type(FloatTensor)

            act = th.clamp(act, -1, 1)

            actions[i, :] = act
        self.steps_done += 1

        return actions
       

    def act(self, state):
        actions = th.zeros(
            self.n_agents,
            self.dim_act)
        #FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        for i in range(self.n_agents):
            self.actors[i].eval()
            sb = state[i, :].detach()
            with th.no_grad():
                act = self.actors[i](sb.unsqueeze(0)).squeeze()
            self.actors[i].train()

            act = self.add_noise2(act, i)
            act = th.clamp(act, -1.0, 1.0)

            actions[i, :] = act
        self.steps_done += 1

        return actions

    def act3(self, state):
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        actions = th.zeros(
            self.n_agents,
            self.dim_act)
        for i in range(self.n_agents):
            self.actors[i].eval()
            with th.no_grad():
                sb = state[i, :].detach()
                act = self.actors[i](sb.unsqueeze(0)).squeeze()

                act += th.from_numpy(self.noise[i].sample()).type(FloatTensor)
            act = th.clamp(act, -1, 1)

            actions[i, :] = act
        self.steps_done += 1
        return actions




    def add_noise(self, action, i):
        epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
                                  np.exp(-1. * self.steps_done / EPSILON_DECAY)
        # add noise
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        noise = th.from_numpy(np.random.randn(self.dim_act) * epsilon).type(FloatTensor)
        action += noise
        return action

    def add_noise2(self, action, i):
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor
        action += th.from_numpy(
        np.random.randn(2) * self.var[i]).type(FloatTensor)

        if self.eps_done > self.eps_b_train and self.var[i] > 0.05:
            self.var[i] *= 0.999998
        #action = th.clamp(action, -1.0, 1.0)

        return action


    def reset(self):
        for i in range(self.n_agents):
            self.noise[i].reset()

    def learn(self):
        """ Update policy and value parameters using given batch of experience tuples"""
        if self.eps_done <= self.eps_b_train:
            return None, None

        if self.eps_done == (self.eps_b_train + 1):
            print("========== Training now =========")

        ByteTensor = th.cuda.ByteTensor if self.cuda_on else th.ByteTensor
        FloatTensor = th.cuda.FloatTensor if self.cuda_on else th.FloatTensor

        c_loss = []
        a_loss = []

        for agent in range(self.n_agents):
            transitions = self.memory.sample(self.batch_size)
            batch = Experience(*zip(*transitions))

            non_final_mask = ByteTensor(list(map(lambda s: s is not None,
                                                 batch.next_states)))
            # state_batch: batch_size x n_agents x dim_obs
            state_batch = th.stack(batch.states).type(FloatTensor)
                  
            reward_batch = th.stack(batch.rewards).type(FloatTensor)
            action_batch = th.stack(batch.actions).type(FloatTensor)
            #pdb.set_trace()
            # : (batch_size_non_final) x n_agents x dim_obs
            non_final_next_states = th.stack(
                [s for s in batch.next_states
                 if s is not None]).type(FloatTensor)

            # for current agent
            whole_state = state_batch.view(self.batch_size, -1)
            whole_action = action_batch.view(self.batch_size, -1)
            self.critic_optimizer[agent].zero_grad()
            current_Q = self.critics[agent](whole_state, whole_action)

            non_final_next_actions = [
                self.actors_target[i](non_final_next_states[:,
                                                            i,
                                                            :]) for i in range(
                                                                self.n_agents)]
            non_final_next_actions = th.stack(non_final_next_actions)
            non_final_next_actions = (
                non_final_next_actions.transpose(0,
                                                 1).contiguous())

            target_Q = th.zeros(
                self.batch_size).type(FloatTensor)
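            # Centralized target: evaluate the target critic on all agents' next
            # states and the target actors' next actions; rows for terminal
            # transitions are left at zero.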

            target_Q[non_final_mask] = self.critics_target[agent](
                non_final_next_states.view(-1, self.n_agents * self.dim_obs),
                non_final_next_actions.view(-1,
                                            self.n_agents * self.dim_act)
            ).squeeze()
            # scale_reward: to scale reward in Q functions

            target_Q = (target_Q.unsqueeze(1) * GAMMA) + (
                reward_batch[:, agent].unsqueeze(1) * SCALE_REWARD)

            loss_Q = nn.MSELoss()(current_Q, target_Q.detach())
            loss_Q.backward()
            self.critic_optimizer[agent].step()

            self.actor_optimizer[agent].zero_grad()
            state_i = state_batch[:, agent, :]
            action_i = self.actors[agent](state_i)
            ac = action_batch.clone()
            ac[:, agent, :] = action_i
            whole_action = ac.view(self.batch_size, -1)
            actor_loss = -self.critics[agent](whole_state, whole_action)
            actor_loss = actor_loss.mean()
            actor_loss.backward()
            self.actor_optimizer[agent].step()
            c_loss.append(loss_Q)
            a_loss.append(actor_loss)


        #if self.steps_done % NUM_STEPS_TO_UPDATE == 0 and self.steps_done > 0:
            #for i in range(self.n_agents):
            soft_update(self.critics_target[agent], self.critics[agent], TAU)
            soft_update(self.actors_target[agent], self.actors[agent], TAU)

        return c_loss, a_loss


    def save_checkpoint(self, episode_num, reward, is_best=False):

        checkpointName = self.checkpoint_dir + 'ep{}.pth'.format(episode_num)
        checkpoint = {
            'episode': episode_num,
            'actor1': self.actors[0].state_dict(),
            'actor2': self.actors[1].state_dict(),
            'critic1': self.critics[0].state_dict(),
            'critic2': self.critics[1].state_dict(),
            'targetActor1': self.actors_target[0].state_dict(),
            'targetActor2': self.actors_target[1].state_dict(),
            'targetCritic1': self.critics_target[0].state_dict(),
            'targetCritic2': self.critics_target[1].state_dict(),
            'actorOpt1': self.actor_optimizer[0].state_dict(),
            'actorOpt2': self.actor_optimizer[1].state_dict(),
            'criticOpt1': self.critic_optimizer[0].state_dict(),
            'criticOpt2': self.critic_optimizer[1].state_dict(),
            'replayBuffer': self.memory,
            'reward': reward
            
        } 
        th.save(checkpoint, checkpointName)

    def printModelArch(self,model):
        print(model.state_dict())
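

# Note: learn() above relies on a soft_update(target, source, tau) helper that is
# not included in this snippet. A minimal sketch of the usual Polyak update,
# assuming that argument order, could look like this:
def soft_update(target, source, tau):
    """theta_target <- (1 - tau) * theta_target + tau * theta_source."""
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_((1.0 - tau) * target_param.data +
                                tau * source_param.data)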
Example #18
0
class DRRN_Agent:
    def __init__(self, args):
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.accummulate_step = args.accummulate_step

        self.network = DRRN().to(device)
        self.memory = ReplayMemory(args.memory_size)
        self.save_path = args.output_dir
        self.clip = args.clip
        self.optimizer = torch.optim.Adam(self.network.parameters(),
                                          lr=args.learning_rate)

        # self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=args.warmup_steps,
        #                                                  num_training_steps=args.max_steps)

    def observe(self, state, act, rew, next_state, next_acts, done, history):
        self.memory.push(state, act, rew, next_state, next_acts, done, history)

    def build_state(self, obs, infos):
        """ Returns a state representation built from various info sources. """
        # obs_ids = [self.network.str_to_token_ids(o, self.network.state_max_length) for o in obs]
        # look_ids = [self.network.str_to_token_ids(info['look'], self.network.look_max_length) for info in infos]
        # inv_ids = [self.network.str_to_token_ids(info['inv'], self.network.inv_max_length) for info in infos]
        # return [State(ob, lk, inv) for ob, lk, inv in zip(obs_ids, look_ids, inv_ids)]
        states = []
        for ob, info in zip(obs, infos):
            state = ob + info['look'] + info['inv']
            states.append(state)
        return states

    def encode(self, act_list):
        """ Encode a list of actions """
        # return [self.network.str_to_token_ids(o, self.network.act_max_length) for o in act_list]
        return act_list

    def act(self, states, poss_acts, history, sample=True, return_all=False):
        """ Returns a string action from poss_acts. """
        idxs, values = self.network.act(states, poss_acts, history, sample,
                                        return_all)

        if return_all:
            return None, idxs, values

        act_ids = [poss_acts[batch][idx] for batch, idx in enumerate(idxs)]
        return act_ids, idxs, values

    def update(self):
        if len(self.memory) < self.batch_size:
            return

        batch_loss = None
        num_per_step = int(self.batch_size / self.accummulate_step)
        for _ in range(self.accummulate_step):

            transitions = self.memory.sample(num_per_step)
            batch = Transition(*zip(*transitions))

            # Compute Q(s', a') for all a'
            # TODO: Use a target network???
            next_history = []
            for act, history in zip(batch.act, batch.history):
                next_history.append(history + [act])
            next_qvals = self.network(batch.next_state, batch.next_acts,
                                      next_history)
            # Take the max over next q-values
            next_qvals = torch.tensor([vals.max() for vals in next_qvals],
                                      device=device)
            # Zero all the next_qvals that are done
            next_qvals = next_qvals * (
                1 - torch.tensor(batch.done, dtype=torch.float, device=device))
            targets = torch.tensor(batch.reward,
                                   dtype=torch.float,
                                   device=device) + self.gamma * next_qvals

            # Next compute Q(s, a)
            # Nest each action in a list - so that it becomes the only admissible cmd
            nested_acts = tuple([[a] for a in batch.act])
            qvals = self.network(batch.state, nested_acts, batch.history)
            # Combine the qvals: Maybe just do a greedy max for generality
            qvals = torch.cat(qvals)

            loss = F.smooth_l1_loss(qvals, targets.detach())

            # Compute Huber loss
            if batch_loss is None:
                batch_loss = loss
            else:
                batch_loss += loss

        # each mini-batch loss is already a mean, so average over the accumulation steps
        batch_loss /= self.accummulate_step

        self.optimizer.zero_grad()
        batch_loss.backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
        self.optimizer.step()
        # self.scheduler.step()

        return batch_loss.item()

    def load(self):
        try:
            self.memory = pickle.load(
                open(pjoin(self.save_path, 'memory.pkl'), 'rb'))
            self.network = torch.load(pjoin(self.save_path, 'model.pt'))
        except Exception as e:
            print("Error saving model.")
            logging.error(traceback.format_exc())

    def save(self):
        try:
            pickle.dump(self.memory,
                        open(pjoin(self.save_path, 'memory.pkl'), 'wb'))
            torch.save(self.network, pjoin(self.save_path, 'model.pt'))
        except Exception as e:
            print("Error saving model.")
            logging.error(traceback.format_exc())
Example #19
0
    state = current_screen - last_screen
    #print state
    for t in count():
        action = select_action(state)
        _, reward, done, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)

        last_screen = current_screen
        current_screen = get_screen(env, device)

        if not done:
            next_state = current_screen - last_screen
        else:
            next_state = None

        memory.push(state, action, next_state, reward)

        state = next_state
        #if done:
        #    print "Episode Done"
        #else:
        #    print state.size()
        optimize_model(policy_net, optimizer)
        if done:
            episode_durations.append(t + 1)
            plot_durations(episode_durations, AVERAGE_SIZE)
            break

    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
Example #20
0
class DQN(HyperParam):
    def __init__(self, n_actions, device, batch_norm=False):
        self.device = device
        self.n_actions = n_actions
        self._memory_init()
        self._net_init(n_actions, batch_norm)
        self.epsilon = LinearAnneal(self.EPS_INIT, self.EPS_END,
                                    self.EXPLORE_STEP)
        self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                       lr=self.LR)

    def _memory_init(self):
        self.memory = ReplayMemory(self.MEMORY_SIZE)

    def _net_init(self, n_actions, batch_norm):
        """
        Initialization of two neural network

        policy net - a function return the all q values corresponding to each action
                     given the input state. This network is used to compute expected
                     q vlue and will be optimized during each iteration
        target net - a function which will be updated from policy net after N optimization
                     step (N is a hyperparameter). This network is used to compute
                     expected q value based on next state
        """
        self.policy_net = Net(n_actions, batch_norm).to(self.device)
        self.target_net = Net(n_actions, batch_norm).to(self.device)
        self._update_target()
        self.target_net.eval()

    def _choose_action(self, state):
        """
        epsilon - greedy policy to decide next action

        the value of epsilon will anneal linearly
        """
        sample = random.random()
        if sample > self.epsilon.anneal():
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            action = random.randrange(self.n_actions)
            return torch.tensor([[action]],
                                device=self.device,
                                dtype=torch.long)

    def _q(self, states, actions):
        return self.policy_net(states).gather(1, actions)

    def _expected_q(self, next_states, rewards):
        """
        Calculation of expected q value

        based on bellman equation: q = r + gamma * q_next
        """
        # only bootstrap from next states that are not the end of the game
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, next_states)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in next_states if s is not None])

        # evaluate the target network on the non-final next states and keep the max Q value per state
        q_next = torch.zeros(self.BATCH_SIZE, device=self.device)
        q_next[non_final_mask] = self.target_net(non_final_next_states).max(
            1)[0].detach()
        expected_q = rewards + self.GAMMA * q_next

        return expected_q.unsqueeze(1)

    def _optimize(self):
        if len(self.memory) < self.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)

        # calculate q value and expected q value
        q = self._q(states, actions)
        expected_q = self._expected_q(batch.next_state, rewards)
        loss = F.smooth_l1_loss(q, expected_q)

        # optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def _update_target(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def save(self, file_name):
        torch.save(self.policy_net.state_dict(), file_name)

    def load(self, model):
        self.policy_net.load_state_dict(torch.load(model))
        self.policy_net.eval()

    def train(self, env, logger):
        """Main part for training the agent"""
        processor = StateProcessor()
        optim_cnt = 0
        for i_episode in range(self.N_EPISODE):
            total_reward = 0
            state = processor.to_tensor(env.reset()).to(self.device)
            for t in itertools.count():
                # Select and perform an action
                action = self._choose_action(state)
                next_state, reward, done, _ = env.step(action.item())
                # Sum up total reward for one episode, convert reward to tensor
                total_reward += reward
                reward = torch.tensor([reward],
                                      dtype=torch.float32,
                                      device=self.device)

                if done:
                    self.memory.push(state, action, None, reward)
                    self._optimize()
                    break
                else:
                    next_state = processor.to_tensor(next_state).to(
                        self.device)
                    self.memory.push(state, action, next_state, reward)
                    self._optimize()

                state = next_state
            optim_cnt += t
            score = env.unwrapped.game.get_score()
            logger.info(
                f"{i_episode},{optim_cnt},{total_reward:.1f},{score},{self.epsilon.p:.6f}"
            )

            if i_episode % self.TARGET_UPDATE == 0:
                self._update_target()
                self.save(f"model_{i_episode}.pkl")

    def test(self, env):
        while True:
            processor = StateProcessor()
            state = processor.to_tensor(env.reset()).to(self.device)
            while True:
                with torch.no_grad():
                    action = self.policy_net(state).max(1)[1].view(1, 1)
                next_state, _, done, _ = env.step(action.item())

                if done:
                    break
                next_state = processor.to_tensor(next_state).to(self.device)
                state = next_state
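

# Note: the LinearAnneal helper used above is not shown in this snippet. A minimal
# sketch that is consistent with how it is used here (.anneal() returns the current
# epsilon and advances the schedule, .p holds the current value) might look like:
class LinearAnneal:
    def __init__(self, start, end, steps):
        self.p = start
        self.end = end
        self.step_size = (start - end) / steps

    def anneal(self):
        # return the current epsilon, then decay it linearly toward the floor
        current = self.p
        if self.p > self.end:
            self.p = max(self.end, self.p - self.step_size)
        return current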
Example #21
0
def train_dqn(settings):
    required_settings = [
        "batch_size",
        "checkpoint_frequency",
        "device",
        "eps_start",
        "eps_end",
        "eps_cliff",
        "eps_decay",
        "gamma",
        "log_freq",
        "logs_dir",
        "lr",
        "max_steps",
        "memory_size",
        "model_name",
        "num_episodes",
        "out_dir",
        "target_net_update_freq",
    ]
    if not settings_is_valid(settings, required_settings):
        raise Exception(
            f"Settings object {settings} missing some required settings.")

    batch_size = settings["batch_size"]
    checkpoint_frequency = settings["checkpoint_frequency"]
    device = settings["device"]
    eps_start = settings["eps_start"]
    eps_end = settings["eps_end"]
    eps_cliff = settings["eps_cliff"]
    # eps_decay = settings["eps_decay"]
    gamma = settings["gamma"]
    logs_dir = settings["logs_dir"]
    log_freq = settings["log_freq"]
    lr = settings["lr"]
    max_steps = settings["max_steps"]
    memory_size = settings["memory_size"]
    model_name = settings["model_name"]
    num_episodes = settings["num_episodes"]
    out_dir = settings["out_dir"]
    target_net_update_freq = settings["target_net_update_freq"]

    # Initialize environment
    env = gym.make("StarGunner-v0")

    # Initialize model
    num_actions = env.action_space.n
    settings["num_actions"] = num_actions
    policy_net = DQN(settings).to(device)
    target_net = DQN(settings).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Initialize memory
    logging.info("Initializing memory.")
    memory = ReplayMemory(memory_size)
    memory.init_with_random((1, 3, 84, 84), num_actions)
    logging.info("Finished initializing memory.")

    # Initialize other model ingredients
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # Initialize tensorboard
    writer = SummaryWriter(logs_dir)

    # Loop over episodes
    policy_net.train()
    steps_done = 0
    log_reward_acc = 0.0
    log_steps_acc = 0
    for episode in tqdm(range(num_episodes)):
        state = process_state(env.reset()).to(device)
        reward_acc = 0.0
        loss_acc = 0.0

        # Loop over steps in episode
        for t in range(max_steps):
            with torch.no_grad():
                Q = policy_net(state.type(torch.float))

            # Get best predicted action and perform it
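            # Linear epsilon anneal from eps_start down to eps_end over the first
            # eps_cliff steps, then held constant at eps_end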
            if steps_done < eps_cliff:
                epsilon = -(eps_start -
                            eps_end) / eps_cliff * steps_done + eps_start
            else:
                epsilon = eps_end

            if random.random() < epsilon:
                predicted_action = torch.tensor([env.action_space.sample()
                                                 ]).to(device)
            else:
                predicted_action = torch.argmax(Q, dim=1)
            next_state, raw_reward, done, info = env.step(
                predicted_action.item())
            # Note that next state could also be a difference
            next_state = process_state(next_state)
            reward = torch.tensor([clamp_reward(raw_reward)])

            # Save to memory
            memory.push(state.to("cpu"), predicted_action.to("cpu"),
                        next_state, reward)

            # Move to next state
            state = next_state.to(device)

            # Sample from memory
            batch = Transition(*zip(*memory.sample(batch_size)))

            # Mask non-final states (adapted from https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)
            non_final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_state)),
                device=device,
                dtype=torch.bool,
            )
            # print("NON_FINAL_MASK", non_final_mask.shape)
            state_batch = torch.cat(batch.state).type(torch.float).to(device)
            next_state_batch = torch.cat(batch.next_state).type(
                torch.float).to(device)
            action_batch = torch.cat(batch.action).to(device)
            reward_batch = torch.cat(batch.reward).to(device)

            # print("STATE_BATCH SHAPE", state_batch.shape)
            # print("STATE_BATCH", state_batch[4, :, 100])
            # print("ACTION_BATCH SHAPE", action_batch.shape)
            # print("ACTION_BATCH", action_batch)
            # print("REWARD_BATCH SHAPE", reward_batch.shape)

            # Compute Q
            # Q_next = torch.zeros((batch_size, num_actions))
            # print("MODEL STATE BATCH SHAPE", model(state_batch).shape)
            Q_actual = policy_net(state_batch).gather(
                1, action_batch.view(action_batch.shape[0], 1))
            Q_next_pred = target_net(next_state_batch)
            Q_max = torch.max(Q_next_pred, dim=1)[0].detach()
            # print("Q_MAX shape", Q_max.shape)
            target = reward_batch + gamma * Q_max * non_final_mask.to(Q_max.dtype)
            # print("TARGET SIZE", target.shape)

            # Calculate loss
            loss = F.smooth_l1_loss(Q_actual, target.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()

            # Clamp gradient to avoid gradient explosion
            for param in policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            optimizer.step()

            # Store stats
            loss_acc += loss.item()
            reward_acc += raw_reward
            steps_done += 1

            if steps_done % target_net_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())

            # Exit if in terminal state
            if done:
                logging.debug(
                    f"Episode {episode} finished after {t} timesteps with reward {reward_acc}."
                )
                break

        logging.debug(f"Loss: {loss_acc / t}")

        # Save model checkpoint
        if (episode != 0) and (episode % checkpoint_frequency == 0):
            save_model_checkpoint(
                policy_net,
                optimizer,
                episode,
                loss,
                f"{out_dir}/checkpoints/{model_name}_{episode}",
            )

        # Log to tensorboard
        log_reward_acc += reward_acc
        log_steps_acc += t
        writer.add_scalar("Loss / Timestep", loss_acc / t, episode)
        if episode % log_freq == 0:
            writer.add_scalar("Reward", log_reward_acc / log_freq, episode)
            writer.add_scalar("Reward / Timestep",
                              log_reward_acc / log_steps_acc, episode)
            writer.add_scalar("Duration", log_steps_acc / log_freq, episode)
            writer.add_scalar("Steps", log_reward_acc / log_steps_acc,
                              steps_done)
            log_reward_acc = 0.0
            log_steps_acc = 0

    # Save model
    save_model(policy_net, f"{out_dir}/{model_name}.model")

    # Report final stats
    logging.info(f"Steps Done: {steps_done}")

    env.close()
    return policy_net
Example #22
0
n_agents = 4
length_lstm = 10
pkl_file = open('data_saq.pkl', 'rb')

# TODO: unify which pkl file is used when running on the server
memory = ReplayMemory(n_episode * n_agents * max_steps + 100)

use_cuda = pt.cuda.is_available()

for i in range(n_episode):
    data1 = pickle.load(pkl_file)
    data2 = pickle.load(pkl_file)
    data3 = pickle.load(pkl_file)
    print('episode is %d' % (i))
    for j in range(max_steps):
        memory.push(data1[j], data2[j], '', '', '')

loss_func = pt.nn.MSELoss().cuda()


class meta_actor(pt.nn.Module):
    def __init__(self, dim_observation, dim_action):
        # print('model.dim_action',dim_action)
        super(meta_actor, self).__init__()
        self.FC1 = pt.nn.Linear(dim_observation, 500)
        self.FC2 = pt.nn.Linear(500, 128)
        self.FC3 = pt.nn.Linear(128, dim_action)

    def forward(self, obs):
        result = F.relu(self.FC1(obs))
        result = F.relu(self.FC2(result))
        result = self.FC3(result)  # output layer (any output activation is not shown in the original snippet)
        return result
Example #23
0
class DRRN_Agent:
    def __init__(self, args):
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(args.spm_path)
        self.network = DRRN(len(self.sp), args.embedding_dim,
                            args.hidden_dim).to(device)
        self.memory = ReplayMemory(args.memory_size)
        self.save_path = args.output_dir
        self.clip = args.clip
        self.optimizer = torch.optim.Adam(self.network.parameters(),
                                          lr=args.learning_rate)

    def observe(self, state, act, rew, next_state, next_acts, done):
        self.memory.push(state, act, rew, next_state, next_acts, done)

    def build_state(self, obs, infos):
        """ Returns a state representation built from various info sources. """
        obs_ids = [self.sp.EncodeAsIds(o) for o in obs]
        look_ids = [self.sp.EncodeAsIds(info['look']) for info in infos]
        inv_ids = [self.sp.EncodeAsIds(info['inv']) for info in infos]
        return [
            State(ob, lk, inv)
            for ob, lk, inv in zip(obs_ids, look_ids, inv_ids)
        ]

    def encode(self, obs_list):
        """ Encode a list of observations """
        return [self.sp.EncodeAsIds(o) for o in obs_list]

    def act(self, states, poss_acts, sample=True):
        """ Returns a string action from poss_acts. """
        idxs, values = self.network.act(states, poss_acts, sample)
        act_ids = [poss_acts[batch][idx] for batch, idx in enumerate(idxs)]
        return act_ids, idxs, values

    def update(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Compute Q(s', a') for all a'
        # TODO: Use a target network???
        next_qvals = self.network(batch.next_state, batch.next_acts)
        # Take the max over next q-values
        next_qvals = torch.tensor([vals.max() for vals in next_qvals],
                                  device=device)
        # Zero all the next_qvals that are done
        next_qvals = next_qvals * (
            1 - torch.tensor(batch.done, dtype=torch.float, device=device))
        targets = torch.tensor(batch.reward, dtype=torch.float,
                               device=device) + self.gamma * next_qvals

        # Next compute Q(s, a)
        # Nest each action in a list - so that it becomes the only admissible cmd
        nested_acts = tuple([[a] for a in batch.act])
        qvals = self.network(batch.state, nested_acts)
        # Combine the qvals: Maybe just do a greedy max for generality
        qvals = torch.cat(qvals)

        # Compute Huber loss
        loss = F.smooth_l1_loss(qvals, targets.detach())
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
        self.optimizer.step()
        return loss.item()

    def load(self):
        try:
            self.memory = pickle.load(
                open(pjoin(self.save_path, 'memory.pkl'), 'rb'))
            self.network = torch.load(pjoin(self.save_path, 'model.pt'))
        except Exception as e:
            print("Error saving model.")
            logging.error(traceback.format_exc())

    def save(self):
        try:
            pickle.dump(self.memory,
                        open(pjoin(self.save_path, 'memory.pkl'), 'wb'))
            torch.save(self.network, pjoin(self.save_path, 'model.pt'))
        except Exception as e:
            print("Error saving model.")
            logging.error(traceback.format_exc())
Example #24
0
class SAC(object):
    def __init__(self, config, env):
        self.device = config.device

        self.gamma = config.gamma  # discount factor

        self.tau = config.tau

        # learning rates
        self.value_lr = config.value_lr
        self.soft_q_lr = config.soft_q_lr
        self.policy_lr = config.policy_lr

        self.replace_target_iter = config.replace_target_iter  # target-network update frequency
        self.replay_size = config.replay_size  # replay buffer capacity
        self.batch_size = config.batch_size  # batch size

        self.num_states = env.observation_space.shape[0]  # state-space dimension
        self.num_actions = env.action_space.shape[0]  # action-space dimension

        self.learn_start = self.batch_size * 3  # threshold that controls when learning starts

        self.learn_step_counter = 0  # total number of learning steps taken

        self.memory = ReplayMemory(self.replay_size)  # initialize replay buffer

        # initialize the V network
        self.value_net = ValueNetwork(self.num_states, 256).to(self.device)
        # initialize the target V network
        self.target_value_net = ValueNetwork(self.num_states,
                                             256).to(self.device)

        # the target V network starts with the same parameters as the V network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param.data)

        # initialize the soft Q network
        self.soft_q_net = SoftQNetwork(self.num_states, self.num_actions,
                                       256).to(self.device)

        # initialize the policy network
        self.policy_net = PolicyNetwork(self.num_states, self.num_actions,
                                        256).to(self.device)

        # optimizers for training
        self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                          lr=self.value_lr)
        self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(),
                                           lr=self.soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=self.policy_lr)

        # mean-squared-error loss functions
        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion = nn.MSELoss()

    # store a transition in memory
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.push((state, action, reward, next_state, done))

    # select an action
    def choose_action(self, s):
        s = torch.FloatTensor(s).to(self.device)
        mean, log_std = self.policy_net(s)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        action = action.detach().cpu().numpy()
        return action[0]

    # sample an action and its log-probability
    def get_action_log_prob(self, s, epsilon=1e-6):
        mean, log_std = self.policy_net(s)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
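        # tanh-squashing correction: log pi(a|s) = log N(z; mean, std) - sum log(1 - tanh(z)^2 + eps)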

        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)
        # log_prob = Normal(mean, std).log_prob(mean + std * z.to(self.device)) - torch.log(1 - action.pow(2) + epsilon)  # reparameterization

        return action, log_prob, z, mean, log_std

    # sample a batch from the replay buffer
    def get_batch(self):
        transitions, _, _ = self.memory.sample(self.batch_size)  # batch of transitions

        # unzip the batch of transitions,
        # e.g. zipped = [(1, 4), (2, 5), (3, 6)] unzips via zip(*zipped) to [(1, 2, 3), (4, 5, 6)]
        batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
            *transitions)

        # convert the samples to tensors
        batch_state = torch.tensor(batch_state,
                                   device=self.device,
                                   dtype=torch.float)
        batch_action = torch.tensor(batch_action,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(
                                        -1, 1)  # view as a column tensor
        batch_reward = torch.tensor(batch_reward,
                                    device=self.device,
                                    dtype=torch.float).squeeze().view(-1, 1)
        batch_next_state = torch.tensor(batch_next_state,
                                        device=self.device,
                                        dtype=torch.float)
        batch_done = torch.tensor(batch_done,
                                  device=self.device,
                                  dtype=torch.float).squeeze().view(-1, 1)
        # print("state:", batch_state.shape)  # 128, 4
        # print("action:", batch_action.shape)
        # print("reward:", batch_reward.shape)
        # print("done:", batch_done.shape)
        #
        return batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _

    # one learning step
    def learn(self):
        # sample a batch
        batch_state, batch_action, batch_reward, batch_next_state, batch_done, _, _ = self.get_batch(
        )

        # print("state:", batch_state)
        # print("action:", batch_action)
        # print("done:", batch_done)

        expected_q_value = self.soft_q_net(batch_state, batch_action)  # q(s,a)
        expected_value = self.value_net(batch_state)  # v(s)
        new_action, log_prob, z, mean, log_std = self.get_action_log_prob(
            batch_state)  # a~, log pi(a~|s), z, mean, std

        target_value = self.target_value_net(batch_next_state)  # vtar(s')
        next_q_value = batch_reward + (
            1 -
            batch_done) * self.gamma * target_value  # r + gamma*(1-d)*vtar(s')
        q_value_loss = self.soft_q_criterion(expected_q_value,
                                             next_q_value.detach()).mean()

        expected_new_q_value = self.soft_q_net(batch_state,
                                               new_action)  # q(s,a~)
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value,
                                          next_value.detach()).mean()

        log_prob_target = expected_new_q_value - expected_value  # q(s,a) - v(s)
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        self.soft_q_optimizer.zero_grad()
        q_value_loss.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

        # Increment the learning step counter
        self.learn_step_counter += 1

    # Save the models
    def save(self):
        torch.save(self.soft_q_net, 'sac1_q.pkl')
        torch.save(self.value_net, 'sac1_v.pkl')
        torch.save(self.policy_net, 'sac1_policy.pkl')

    # Load the models
    def load(self):
        self.soft_q_net = torch.load('sac1_q.pkl')
        self.value_net = torch.load('sac1_v.pkl')
        self.policy_net = torch.load('sac1_policy.pkl')
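
The SAC agent above pushes a single transition tuple and unpacks three values from memory.sample(); the ReplayMemory itself is not shown. A minimal uniform-sampling buffer compatible with that interface might look like the sketch below; the class and method names simply mirror the calls above, and the real buffer (e.g. a prioritized one) may differ. The extra indices/weights return values are placeholders, since get_batch() discards them.

import random


class ReplayMemory:
    """Minimal ring buffer matching the interface used above:
    push(transition_tuple), sample(batch_size) -> (transitions, indices, weights)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, transition):
        # Overwrite the oldest entry once the buffer is full.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        indices = random.sample(range(len(self.buffer)), batch_size)
        transitions = [self.buffer[i] for i in indices]
        # Uniform sampling: weights are all ones; both extras are ignored by get_batch().
        weights = [1.0] * batch_size
        return transitions, indices, weights

    def __len__(self):
        return len(self.buffer)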
Example #25
0
class DDQN(object):
    def __init__(self, n_states, n_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.n_states = n_states
        self.n_actions = n_actions

        # create agent network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.agent = Learner(self.n_states, self.n_actions, **net_cfg)
        self.target = Learner(self.n_states, self.n_actions, **net_cfg)
        self.agent_optim = Adam(self.agent.parameters(), lr=args.lr)

        self.update_target_steps = args.update_target_timing

        hard_update(self.target, self.agent)

        # create replay memory
        self.memory = ReplayMemory(capacity=args.rmsize)

        # hyper parameters
        self.batch_size = args.bsize
        self.discount_rate = args.discount_rate
        self.decay_epsilon = 1 / args.decay_epsilon
        self.min_epsilon = args.min_epsilon
        
        self.epsilon = 1.0
        
        if USE_CUDA: self.cuda()

    def update(self, step):
        state_batch, action_batch, next_state_batch, reward_batch, terminal_batch = self.memory.sample_and_split(self.batch_size)
        q_predict = self.agent(to_tensor(state_batch))
        n_q_predict = self.agent(to_tensor(next_state_batch))
        q_batch = torch.zeros(self.batch_size, 1)
        next_q_value = torch.zeros(self.batch_size, 1)

        for n in range(self.batch_size):
            # Q(s, a) of the action actually taken, under the online network
            q_batch[n] = q_predict[n][action_batch[n]]
            # Double DQN: the online network picks the next action ...
            n_act = torch.argmax(n_q_predict[n])
            # ... and the target network evaluates it
            next_q_value[n] = self.target(to_tensor(next_state_batch[n]))[n_act]

        # Vanilla-DQN alternative:
        # next_q_value = torch.max(self.target(to_tensor(next_state_batch)), 1)[0].reshape(self.batch_size, 1)

        # y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_online(s', a))
        target_q_batch = to_tensor(reward_batch).reshape(self.batch_size, 1) + self.discount_rate * next_q_value * to_tensor(1 - terminal_batch.astype(np.float32).reshape(self.batch_size, 1))

        value_loss = criterion(q_batch, target_q_batch)
        self.agent.zero_grad()
        value_loss.backward()
        self.agent_optim.step()

        if step % self.update_target_steps == 0:
            # print("update target")
            self.update_target()

    def update_target(self):
        hard_update(self.target, self.agent)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.n_actions)
        # self.a_t = action
        
        action = np.argmax(action)

        # idx = np.where(action == max(action))

        # action = np.random.choice(idx[0])
        # print(action)
        return action

    def select_action(self, s_t, decay_epsilon=True):
        if np.random.random() < self.epsilon:
            action = self.random_action()
        else:
            action = to_numpy(
                self.agent(to_tensor(np.array([s_t])))
            ).squeeze(0)
            # print("action:{}".format(action))
            action = np.argmax(action)
            # idx = np.where(action == max(action))
            # action = np.random.choice(idx[0])
            
            # print("action:{}" .format(action))
            # action = np.clip(action, -1, 1)

        if self.epsilon > self.min_epsilon and decay_epsilon:
            self.epsilon = max(self.min_epsilon, self.epsilon - self.decay_epsilon)    

        return action

    def observe(self, obs, act, new_obs, rew, done):
        # dtype=object keeps heterogeneous entries (arrays and scalars) in a single row
        items = np.asarray([obs, act, new_obs, rew, done], dtype=object)
        self.memory.push(items)

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
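
Example #25 relies on several helpers that are defined elsewhere (hard_update, to_tensor, to_numpy and a module-level criterion). The sketch below shows plausible minimal implementations consistent with how they are called above; the exact originals may differ.

import numpy as np
import torch
import torch.nn as nn

USE_CUDA = torch.cuda.is_available()
# update() references a module-level `criterion`; MSE is a common choice here.
criterion = nn.MSELoss()


def to_tensor(ndarray, dtype=torch.float32):
    # Convert a numpy array (or nested list) to a float tensor, moving it to GPU if available.
    t = torch.as_tensor(np.asarray(ndarray, dtype=np.float32), dtype=dtype)
    return t.cuda() if USE_CUDA else t


def to_numpy(tensor):
    # Detach and move to CPU before converting back to numpy.
    return tensor.detach().cpu().numpy()


def hard_update(target, source):
    # Copy the source network's parameters into the target network verbatim.
    target.load_state_dict(source.state_dict())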
Example #26
0
class AbstractDQNAgent(AbstractStochasticAgent, ABC):
    def __init__(self, env, config=None):
        super(AbstractDQNAgent, self).__init__(config)
        self.env = env
        assert isinstance(
            env.action_space,
            spaces.Discrete), "Only compatible with Discrete action spaces."
        self.memory = ReplayMemory(self.config)
        self.exploration_policy = exploration_factory(
            self.config["exploration"], self.env.action_space)
        self.training = True
        self.previous_state = None
        self.previous_past_pose = None

        self.step = 0

    @classmethod
    def default_config(cls):
        return dict(model=dict(
            encoder=dict(in_channels=5, in_height=112, in_width=112)),
                    optimizer=dict(type="ADAM", lr=5e-4, weight_decay=0, k=5),
                    rl_lossfunction="l2",
                    predict_lossfunction='l2',
                    memory_capacity=15000,
                    batch_size=32,
                    gamma=0.80,
                    device="cuda:0",
                    exploration=dict(method="EpsilonGreedy"),
                    target_update=50,
                    double=True)

    def record(self, current_state, current_future_pos, current_past_pos,
               action, reward, next_state, next_future_pos, next_past_pos,
               done, info):
        """
            Record a transition by performing a Deep Q-Network iteration

            - push the transition into memory
            - sample a minibatch
            - compute the bellman residual loss over the minibatch
            - perform one gradient descent step
            - slowly track the policy network with the target network
        :param state: a state
        :param action: an action
        :param reward: a reward
        :param next_state: a next state
        :param done: whether state is terminal
        """
        if not self.training:
            return

        self.memory.push(current_state, current_future_pos, current_past_pos,
                         action, reward, next_state, next_future_pos,
                         next_past_pos, done, info)

        batch = self.sample_minibatch()
        if batch:
            loss, _, _ = self.compute_bellman_residual(batch)
            self.step_optimizer(loss)
            self.update_target_network()

            self.step += 1

    def act(self, current_state, current_past_pos):
        """
            Act according to the state-action value model and an exploration policy
        :param state: current state
        :return: an action
        """
        self.previous_state = current_state
        self.previous_past_pose = current_past_pos
        values = self.get_state_action_values(current_state, current_past_pos)
        self.exploration_policy.update(values, step_time=True)
        return self.exploration_policy.sample()

    def sample_minibatch(self):
        if len(self.memory) < self.config["batch_size"]:
            return None
        transitions = self.memory.sample(self.config["batch_size"])
        return Transition(*zip(*transitions))

    def update_target_network(self):
        # record() advances self.step; refresh the target net every `target_update` steps
        if self.step % self.config["target_update"] == 0:
            self.target_net.load_state_dict(self.value_net.state_dict())

    @abstractmethod
    def compute_bellman_residual(self, batch, target_state_action_value=None):
        """
            Compute the Bellman Residual Loss over a batch
        :param batch: batch of transitions
        :param target_state_action_value: if provided, acts as a target (s,a)-value
                                          if not, it will be computed from batch and model (Double DQN target)
        :return: the loss over the batch, and the computed target
        """
        raise NotImplementedError

    @abstractmethod
    def get_batch_state_values(self, states):
        """
        Get the state values of several states
        :param states: [s1; ...; sN] an array of states
        :return: values, actions:
                 - [V1; ...; VN] the array of the state values for each state
                 - [a1*; ...; aN*] the array of corresponding optimal action indexes for each state
        """
        raise NotImplementedError

    @abstractmethod
    def get_batch_state_action_values(self, current_state, current_past_pos):
        """
        Get the state-action values of several states
        :param states: [s1; ...; sN] an array of states
        :return: values:[[Q11, ..., Q1n]; ...] the array of all action values for each state
        """
        raise NotImplementedError

    def get_state_value(self, state):
        """
        :param state: s, an environment state
        :return: V, its state-value
        """
        values, actions = self.get_batch_state_values([state])
        return values[0], actions[0]

    def get_state_action_values(self, current_state, current_past_pos):
        """
        :param state: s, an environment state
        :return: [Q(a1,s), ..., Q(an,s)] the array of its action-values for each actions
        """
        return self.get_batch_state_action_values([current_state],
                                                  [current_past_pos])[0]

    def step_optimizer(self, loss):
        raise NotImplementedError

    def seed(self, seed=None):
        return self.exploration_policy.seed(seed)

    def reset(self):
        pass

    def set_writer(self, writer):
        super().set_writer(writer)
        try:
            self.exploration_policy.set_writer(writer)
        except AttributeError:
            pass

    def action_distribution(self, state):
        self.previous_state = state
        # get_state_action_values also needs the past pose; reuse the last one recorded in act()
        values = self.get_state_action_values(state, self.previous_past_pose)
        self.exploration_policy.update(values, step_time=False)
        return self.exploration_policy.get_distribution()

    def set_time(self, time):
        self.exploration_policy.set_time(time)

    def eval(self):
        self.training = False
        self.config['exploration']['method'] = "Greedy"
        self.exploration_policy = exploration_factory(
            self.config["exploration"], self.env.action_space)
Example #27
0
class ModelServer(SocketServer):
    HOST = 'localhost'
    PORT = 5600

    def __init__(self, *args, **kwargs):
        self.model = BasketballModel()
        self.handler = TrainingHandler()
        self.status = 0
        self.last_connection_amount = 0
        self.running_time = datetime.now()
        self.memory = ReplayMemory(100000)
        self.csv = CSVFile()
        super(ModelServer, self).__init__(self.HOST, self.PORT)

    def on_message_received(self, sock: socket, data, received_data: str,
                            addr: Tuple[str, int]) -> None:
        request = json.loads(received_data)
        print('Received {} from {}'.format(request, addr))
        if is_correct_message(request):
            host, prt = addr
            conn = self.handler.get_connection(prt)
            if is_result(request):
                res_throw = float(request['throw'])
                res_force = float(request['force'])
                res_distance = float(request['distance'])
                self.csv.add_observation(res_throw, res_force, res_distance,
                                         (datetime.now() -
                                          self.running_time).total_seconds())
                self.memory.push(res_throw, res_force, res_distance)
                conn.result = res_distance
            elif is_request(request):
                conn.distance = float(request['distance'])

    def on_step(self):
        # If all the results from the throws are in,
        if self.handler.all_results_are_in():
            # Then let us learn from all the results
            self.model.learn(self.handler.predictions,
                             self.handler.get_all_results())
            # Clear the results so that we can receive fresh results
            self.handler.clear_results()
            del self.handler.predictions
            self.status = 0
        # If all the distances are in
        if self.handler.all_distances_are_in():
            # Then we can predict the force and height
            throws = self.model.throw(self.handler.get_all_distances())
            # PyTorch tries to be clever, but we need it in the right dimensions
            if len(throws.shape) <= 1:
                throws = throws.unsqueeze(0)
            # Add the predictions to the training handler for later
            self.handler.predictions = throws
            # And send them to all the connected clients
            for conn, throw in zip(self.handler.get_connections(), throws):
                # In order to send the tensor data over the network,
                # we must first convert the tensor to simple python
                # data types and then we can access them as normal.
                t = throw[0].tolist()
                # t = random.uniform(0.2, 1)
                self.send_prediction_to_connection(conn, t, t)
            # Clear distances afterwards
            self.handler.clear_distances()
            self.status = 1

    def on_connection_closed(self, addr: Tuple[str, int]):
        host, port = addr
        self.handler.remove_connection(port)

    def on_accept_connection(self, sock: socket, addr: Tuple[str, int],
                             data: SimpleNamespace):
        host, port = addr
        self.handler.add_connection(Connection(sock, host, port, data))

    def send_prediction_to_connection(self, conn: Connection, force: float,
                                      height: float) -> None:
        prediction = {'Type': 'prediction', 'Force': force, 'Height': height}
        self.send_message(conn.data, prediction)

    def ask_for_distances(self, conn: Connection) -> None:
        request = {'Type': 'request'}
        self.send_message(conn.data, request)
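
The server above depends on is_correct_message, is_result and is_request, which are not shown. Given that outgoing messages use a 'Type' field and result messages carry 'throw', 'force' and 'distance', plausible predicates might look like the sketch below; the 'result' type value and the use of 'Type' on incoming messages are assumptions.

def is_correct_message(request: dict) -> bool:
    # Assumed convention: every valid message carries a 'Type' field.
    return isinstance(request, dict) and 'Type' in request


def is_result(request: dict) -> bool:
    # Result messages report the outcome of a throw: its throw angle, force and distance.
    return request.get('Type') == 'result'


def is_request(request: dict) -> bool:
    # Request messages ask the server for a new prediction at a given distance.
    return request.get('Type') == 'request'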
Example #28
0
for i in range(n_episode):
    data1 = pickle.load(pkl_file)
    data2 = pickle.load(pkl_file)
    data3 = pickle.load(pkl_file)
    print('episode is %d' % (i))
    for j in range(max_steps):
        for k in range(n_agents):
            tmp_state = Variable(pt.zeros(5, 22).type(FloatTensor))
            tmp_action = Variable(pt.zeros(5, 2).type(FloatTensor))
            tmp_state[0:4, :] = data1[j]
            tmp_state[4, :] = data1[j][k, :]
            tmp_action[0:4, :] = data2[j]
            tmp_action[4, :] = data2[j][k, :]

            memory.push(tmp_state, tmp_action, '', data3[j][k].cpu(), '')

loss_func = pt.nn.MSELoss().cuda()


class meta_critic(pt.nn.Module):
    def __init__(self, n_agent, dim_observation, dim_action):
        super(meta_critic, self).__init__()
        self.n_agent = n_agent
        self.dim_observation = dim_observation
        self.dim_action = dim_action
        obs_dim = self.dim_observation * n_agent
        act_dim = self.dim_action * n_agent

        self.FC1 = pt.nn.Linear(obs_dim, 1024)
        self.FC2 = pt.nn.Linear(1024 + act_dim, 512)
Example #29
0
    class Model(object):
        def __init__(self):
            self.Rewards = []
            self.eval_net = DQN(N_C, arg.h, arg.w, N_A).to(device)
            if (arg.Reload_net):
                print('========== Reload net! ==========')
                self.eval_net = torch.load('policy_net.pkl')

            self.target_net = DQN(N_C, arg.h, arg.w, N_A).to(device)
            self.target_net.load_state_dict(self.eval_net.state_dict())
            self.target_net.eval()

            self.memory_counter = 0  # for storing memory
            self.learn_step_counter = 0  # for target updating
            self.memory = ReplayMemory(MEMORY_CAPACITY)  # initialize memory
            self.loss_func = nn.MSELoss()
            self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                              lr=LR)

        def choose_action(self, x):
            self.eval_net.eval()

            N_ACTIONS = N_A
            x = process_x(x)

            # input only one sample
            if np.random.uniform() < EPSILON:  # greedy
                actions_value = self.eval_net.forward(x)
                action = torch.max(actions_value, 1)[1].data.numpy()
                action = action[0]
            else:  # random
                action = np.random.randint(0, N_ACTIONS)
                # action = action if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE)
            return action

        def store_transition(self, s, a, r, info, s_):

            # transition = np.hstack((s, a, r, info, s_))
            # transition = (s, a, r, info, s_)
            self.memory.push(s, a, r, info, s_)
            self.memory_counter += 1

        def learn(self):
            self.eval_net.train()
            # target parameter update
            if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
                print('------- replacing target network -------',
                      self.learn_step_counter)
                self.target_net.load_state_dict(self.eval_net.state_dict())
            self.learn_step_counter += 1

            # sample batch transitions
            memory = self.memory
            transitions = memory.sample(BATCH_SIZE)

            batch = Transition(*zip(*transitions))
            batch_s, batch_a, batch_r, batch_info, batch_s_ = batch

            info_array = np.array(batch_info)
            batch_position, batch_press_shift, batch_pos_passed = (
                info_array[:, 0], info_array[:, 1], info_array[:, 2])

            batch_s = torch.FloatTensor(batch_s)
            batch_s_ = torch.FloatTensor(batch_s_)
            batch_a = list_tensor(batch_a, 'long')
            batch_r = list_tensor(batch_r)

            q_eval = self.eval_net(batch_s).gather(1, batch_a)
            q_next = self.target_net(batch_s_).max(1)[0].view(
                BATCH_SIZE,
                1).detach()  # detach from graph, don't backpropagate
            q_target = batch_r + GAMMA * q_next

            # loss = self.loss_func(q_eval, q_target)
            loss = F.smooth_l1_loss(q_eval, q_target)

            if arg.print_loss:
                print('---- loss ----> {:6.3f},  ---- mse ----> {:6.3f}'.format(
                    float(loss.data.numpy()),
                    float(self.loss_func(q_eval, q_target).data.numpy())))

            self.optimizer.zero_grad()
            loss.backward()

            # tmp = 0
            # for param in self.eval_net.parameters():
            #     max_g = param.grad.data.numpy()
            #     mx = np.max(max_g)
            #     if(mx >tmp):
            #         tmp = mx
            # print(tmp)
            # param.grad.data.clamp_(-1, 1)

            if (arg.plot_net): plot_net(self.eval_net, 0)
            self.optimizer.step()
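
The learn() method above converts batches with a list_tensor helper that is not part of the snippet. A minimal version consistent with its usage (long column tensor for action indices, float column tensor for rewards) could be the following; the exact original may differ.

import torch


def list_tensor(values, kind='float'):
    # Turn a batch of scalars into a column tensor: LongTensor for action
    # indices (so it can be used with gather), FloatTensor otherwise.
    if kind == 'long':
        return torch.LongTensor(values).view(-1, 1)
    return torch.FloatTensor(values).view(-1, 1)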
Example #30
0
class Agent:
	def __init__(self, state_size=14, T=96, is_eval=True):
		self.state_size = state_size # normalized previous days
		self.action_size = 3
		self.memory = ReplayMemory(10000)
		self.inventory = []
		self.is_eval = is_eval
		self.T = T

		self.gamma = 0.99
		self.epsilon = 1.0
		self.epsilon_min = 0.01
		self.epsilon_decay = 0.995
		self.batch_size = 16
		if os.path.exists('models/target_model'):
			self.policy_net = torch.load('models/policy_model', map_location=device)
			self.target_net = torch.load('models/target_model', map_location=device)
		else:
			self.policy_net = DQN(state_size, self.action_size).to(device)
			self.target_net = DQN(state_size, self.action_size).to(device)

			for param_p in self.policy_net.parameters(): 
				weight_init.normal_(param_p)

		self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=0.00025)
		
	def act(self, state):
		if not self.is_eval and np.random.rand() <= self.epsilon:
			return random.randrange(self.action_size) - 1

		tensor = torch.FloatTensor(state).to(device)
		tensor = tensor.unsqueeze(0)
		options = self.target_net(tensor)
		# options = self.policy_net(tensor)
		return (np.argmax(options[-1].detach().cpu().numpy()) - 1)
		# return (np.argmax(options[0].detach().numpy()) - 1)

	def store(self, state, actions, new_states, rewards, action, step):
		if step < 1000: # first 1000 steps: store the transition of every candidate action
			for n in range(len(actions)):
				self.memory.push(state, actions[n], new_states[n], rewards[n])
		else:
			for n in range(len(actions)):
				if actions[n] == action:
					self.memory.push(state, actions[n], new_states[n], rewards[n])
					break

	def optimize(self, step):
		# print(len(self.memory))
		if len(self.memory) < self.batch_size * 10:
			return
		transitions = self.memory.sample(self.batch_size)
		# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
		# detailed explanation). This converts batch-array of Transitions
		# to Transition of batch-arrays.
		batch = Transition(*zip(*transitions))

		# Compute a mask of non-final states and concatenate the batch elements
		# (a final state would've been the one after which simulation ended).
		# Note: next_state is already a dense float tensor here, so the mask below is
		# all-True; this code assumes no stored next_state is ever None.
		next_state = torch.FloatTensor(batch.next_state).to(device)
		non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state)))
		non_final_next_states = torch.cat([s for s in next_state if s is not None])

		state_batch = torch.FloatTensor(batch.state).to(device)
		action_batch = torch.LongTensor(torch.add(torch.tensor(batch.action), torch.tensor(1))).to(device)
		reward_batch = torch.FloatTensor(batch.reward).to(device)

		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
		# columns of actions taken. These are the actions which would've been taken
		# for each batch state according to policy_net
		# The policy net outputs one row per timestep; [95:l:96] keeps only the last
		# timestep of each T = 96 window in the batch
		l = self.policy_net(state_batch).size(0)
		state_action_values = self.policy_net(state_batch)[95:l:96].gather(1, action_batch.reshape((self.batch_size, 1)))
		state_action_values = state_action_values.squeeze(-1)

		# Compute V(s_{t+1}) for all next states.
		# Expected values of actions for non_final_next_states are computed based
		# on the "older" target_net; selecting their best reward with max(1)[0].
		# This is merged based on the mask, such that we'll have either the expected
		# state value or 0 in case the state was final.
		next_state_values = torch.zeros(self.batch_size, device=device)
		next_state_values[non_final_mask] = self.target_net(next_state)[95:l:96].max(1)[0].detach()
		# Compute the expected Q values
		expected_state_action_values = (next_state_values * self.gamma) + reward_batch

		# Compute the loss
		loss = torch.nn.MSELoss()(expected_state_action_values, state_action_values)

		# Optimize the model
		self.optimizer.zero_grad()  # clear gradients accumulated from the previous update
		loss.backward()
		for param in self.policy_net.parameters():
				param.grad.data.clamp_(-1, 1)
		
		self.optimizer.step()
		
		if step % self.T == 0:
			# Soft (Polyak) update of the target network; `gamma` here is the
			# interpolation rate (often called tau), not the discount factor.
			gamma = 0.001
			target_update = copy.deepcopy(self.target_net.state_dict())
			for k in target_update.keys():
				target_update[k] = self.target_net.state_dict()[k] * (1 - gamma) + self.policy_net.state_dict()[k] * gamma
			self.target_net.load_state_dict(target_update)
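
The tail of optimize() performs the soft (Polyak) target update inline over the state_dict. Factoring it into a helper makes the interpolation rate explicit; the sketch below reproduces the same blend (the rate is conventionally called tau) and is only an illustration, not part of the original agent.

def soft_update(target_net, policy_net, tau=0.001):
    # target <- (1 - tau) * target + tau * policy, applied to every state_dict entry,
    # mirroring the loop at the end of optimize() above.
    target_state = target_net.state_dict()
    policy_state = policy_net.state_dict()
    for key in target_state:
        target_state[key] = target_state[key] * (1.0 - tau) + policy_state[key] * tau
    target_net.load_state_dict(target_state)

# usage inside optimize(): soft_update(self.target_net, self.policy_net, tau=0.001)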