Example No. 1
    def __init__(self, opt, q_trace, learner):
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner

        # game environment
        self.env = None
        # s_channel = self.env.observation_space.shape[0]
        # a_space = self.env.action_space

        # network
        self.behaviour = ActorCritic(opt).to(device)
Example No. 2
File: ppo.py Project: CAiM-lab/PPO
    def __init__(self, args):
        """"Constructor which allows the PPO class to initialize the attributes of the class"""
        self.args = args
        self.random_seed()
        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        # Initialize the actor critic class
        self.actor_critic = ActorCritic(
            self.args.nb_states, self.args.nb_actions,
            self.args.hidden_layer_size).to(self.device)
        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training, multiple instances of the env are needed (Shoulder model)
        self.envs = [self.make_env() for i in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        # To validate the intermediate learning process one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        # Lists for TensorBoard to visualize the learning process during training
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []

        self.lr.append(self.args.lr)

        # Dump bin files
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)
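For orientation, a minimal, illustrative sketch of the argument namespace this constructor expects; the attribute names come from the snippet above, while the concrete values and the use of SimpleNamespace are assumptions, not project code.

# Hypothetical args container for the PPO constructor above (values are assumed).
from types import SimpleNamespace
import torch.optim as optim

args = SimpleNamespace(
    nb_states=6, nb_actions=3, hidden_layer_size=64,  # network sizes (assumed)
    optimizer=optim.Adam, lr=3e-4,                    # optimizer class and learning rate
    num_envs=8, env=None,                             # a real env exposing seed()/set_scaling() goes here
    seed=0, output_scaling=1.0, play=True)            # play=True skips the SummaryWriter branch
# agent = PPO(args)  # would also require SubprocVecEnv-compatible envs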
Example No. 3
#!/usr/bin/python3

from pendulum import Pendulum
from network import ActorCritic

import numpy as np
import pickle
import os.path
import random

actorCritic = ActorCritic(Pendulum.state_size, Pendulum.action_size)

experiences = []
if os.path.exists('experiences.p'):
    experiences = pickle.load(open("experiences.p", "rb"))
print('experiences', len(experiences))

pendulum = Pendulum(Pendulum.random_theta())
round = 0
score = 1
iteration = 0
cumulative_iterations = 0
action0 = False

while round < 27:

    state0 = pendulum.state()

    actions = actorCritic.run_actor([state0])
    if random.random() < 0.25:
        action1 = np.random.choice(Pendulum.action_size, 1)[0]
Example No. 4
parser.add_argument('--gamma',
                    default=0.99,
                    type=float,
                    help='rewards discount factor')
parser.add_argument('--entropy_weight', default=0.0001, type=float)
parser.add_argument('--alpha', default=0.95, type=float)
parser.add_argument('--type', default='notrpo', type=str, help='iftrpo')
parser.add_argument('--render', action='store_true', help='render')

args = parser.parse_args()
# print(args)
torch.manual_seed(args.seed)

env = gym.make("CartPole-v0")
replay_buffer = ReplayBuffer(args.capacity, args.max_episode_length)
model = ActorCritic(env.observation_space.shape[0], env.action_space.n).cuda()
average_model = ActorCritic(env.observation_space.shape[0],
                            env.action_space.n).cuda()
optimizer = optim.Adam(model.parameters())

frame_idx = 0
test_rewards = []
episode_count = 0
step_count = 0
state = env.reset()

running_rew = 0
plotcount = 0

while frame_idx < args.max_frames:
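Inside this training loop, average_model is typically maintained as a slowly moving copy of model (an ACER-style average policy network). A minimal sketch of such a soft (Polyak) update, under the assumption that this is how the loop refreshes it:

# Illustrative soft update; the coefficient and the fact that the training
# loop calls this are assumptions, not shown in the snippet.
import torch

def soft_update(average_model, model, beta=0.99):
    with torch.no_grad():
        for avg_p, p in zip(average_model.parameters(), model.parameters()):
            avg_p.mul_(beta).add_(p, alpha=1.0 - beta)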
Example No. 5
                    'action0': action0,
                    'state1': state1,
                    'action1': action1,
                    'score1': score1
                }
                experiences.append(experience)
            action0 = action1

            # print(action1, actions, state1[Pendulum.state_size - 1])

            cumulative_score_run += score1
            iterations += 1

        print('score final ', score, ' average ',
              cumulative_score_run / iterations, ' initial theta ',
              pendulum.initial_theta, ' iterations ', iterations)
        cumulative_score += score1
        cumulative_iterations += iterations

        pendulum = Pendulum(Pendulum.random_theta())

    return cumulative_score / count, cumulative_iterations / count


actorCritic = ActorCritic(Pendulum.state_size, Pendulum.action_size)
score, iterations = run_test(27, actorCritic)

print('score', score, 'iterations', iterations)

pickle.dump(experiences, open("experiences.p", "wb"))
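As a side note, the pickle round-trip above can be done with context managers so the file handles are closed deterministically; a small sketch with invented helper names, not project code:

# Hypothetical helpers for the experience file handling shown above.
import os.path
import pickle

def load_experiences(path="experiences.p"):
    if not os.path.exists(path):
        return []
    with open(path, "rb") as f:
        return pickle.load(f)

def save_experiences(experiences, path="experiences.p"):
    with open(path, "wb") as f:
        pickle.dump(experiences, f)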
Example No. 6
File: ppo.py Project: CAiM-lab/PPO
class PPO(object):
    """Main PPO class"""
    def __init__(self, args):
        """"Constructor which allows the PPO class to initialize the attributes of the class"""
        self.args = args
        self.random_seed()
        # Check if GPU is available via CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        # Initialize the actor critic class
        self.actor_critic = ActorCritic(
            self.args.nb_states, self.args.nb_actions,
            self.args.hidden_layer_size).to(self.device)
        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(),
                                             self.args.lr)

        # For training, multiple instances of the env are needed (Shoulder model)
        self.envs = [self.make_env() for i in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        # To validate the intermediate learning process one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        # Lists for TensorBoard to visualize the learning process during training
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []

        self.lr.append(self.args.lr)

        # Dump bin files
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)

        #self.delta = (self.args.lr-self.args.lr_end)/1e6

    def train(self):
        """Main training function"""
        frame_idx = 0
        state = self.envs.reset()
        mean_100_reward = -np.inf
        self.info()

        while frame_idx < self.args.max_frames:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = self.args.entropy

            for _ in range(self.args.nb_steps):
                state = torch.FloatTensor(state).to(self.device)
                dist, value = self.actor_critic(state)
                action = dist.sample()
                # Make sure the action is moved to the CPU (not GPU) before stepping the envs
                next_state, reward, done, _ = self.envs.step(
                    action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(
                    torch.FloatTensor(reward).unsqueeze(1).to(self.device))
                masks.append(
                    torch.FloatTensor(1 - done).unsqueeze(1).to(self.device))

                states.append(state)
                actions.append(action)
                state = next_state
                frame_idx += 1
                #self.scheduler()

                # Evaluate training process and write data to tensorboard
                if frame_idx % 1000 == 0:
                    test_reward = np.mean(
                        [self.test_env(self.args.vis) for _ in range(10)])
                    self.test_rewards.append(test_reward)

                    if self.args.play is False:
                        print("Mean reward: ",
                              np.round(np.mean(self.test_rewards[-101:-1]), 0))
                        if mean_100_reward < np.round(
                                np.mean(self.test_rewards[-101:-1]), 0):
                            mean_100_reward = np.round(
                                np.mean(self.test_rewards[-101:-1]), 0)
                            self.save_network(mean_100_reward)
                        if len(self.test_rewards) >= 10:
                            self.writer.add_scalar(
                                'data/reward',
                                np.mean(self.test_rewards[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/ppo_loss', np.mean(self.loss[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/nb_actions_outside_range',
                                np.mean(self.action_bang_bang[-11:-1]),
                                frame_idx * self.args.num_envs)

                    # if test_reward > threshold_reward: early_stop = True

            next_state = torch.FloatTensor(next_state).to(self.device)
            _, next_value = self.actor_critic(next_state)
            returns = self.calc_gae(next_value, rewards, masks, values,
                                    self.args.gamma, self.args.tau)

            # detach() removes these tensors from the graph, i.e. these operations are ignored in gradient calculations
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values
            self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size,
                            states, actions, log_probs, returns, advantage,
                            self.args.clip)

    def make_env(self):
        # Private thunk (deferred env constructor) passed to SubprocVecEnv
        def _trunk():
            env = self.args.env  # in this simple case the TestEnv() instance is used (see OpenAI Gym for more envs)
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env

        return _trunk

    def test_env(self, vis=False):
        state = self.env_test.reset()
        if vis:
            self.env_test.render()
        done = False
        total_reward = 0
        action_bang_bang = 0
        step = 0
        while not done:
            step += 1
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.actor_critic(state)
            action = dist.sample().cpu().numpy()[0]
            force = action * self.args.output_scaling
            next_state, reward, done, _ = self.env_test.step(action)
            if force > 0.5 or force < -0.5:
                action_bang_bang += 1
            state = next_state
            if vis:
                self.env_test.render()
            total_reward += reward
        self.action_bang_bang.append(action_bang_bang / step)
        return total_reward

    # Static methods: plain functions that can be called from either an instance or the class
    @staticmethod
    def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[
                step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    @staticmethod
    def ppo_iter(mini_batch_size, states, actions, log_probs, returns,
                 advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[
                rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

    def ppo_update(self,
                   ppo_epochs,
                   mini_batch_size,
                   states,
                   actions,
                   log_probs,
                   returns,
                   advantages,
                   clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns,
                    advantages):
                dist, value = self.actor_critic(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param,
                                    1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
                self.loss.append(loss.item())
                # Important step:
                self.optimizer.zero_grad()
                #pdb.set_trace()
                loss.backward()
                if self.args.grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                             self.args.grad_norm)
                self.optimizer.step()

    def save_network(self, reward):
        network_path = self.output_path + "/network" + str(reward)
        pickle.dump(self.actor_critic.state_dict(), open(network_path, "wb"))

    def load_network(self, path):
        network_new = pickle.load(open(path, "rb"))
        self.actor_critic.load_state_dict(network_new)

    def random_seed(self):
        torch.manual_seed(self.args.seed)
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)

    def scheduler(self):
        for g in self.optimizer.param_groups:
            lr = g["lr"]
            if self.args.lr_end > lr:
                lr = self.args.lr_end
            else:
                lr -= self.delta
            self.lr.append(lr)
            g["lr"] = lr

    def info(self):
        fhandler = logging.FileHandler(filename=self.output_path +
                                       '/mylog.log',
                                       mode='a')
        logger.addHandler(fhandler)
        logger.info("--- INFO ---")
        logger.info("args: {}".format(self.args))
Example No. 7
    argparse.add_argument('--update_intervals', type=int)
    argparse.add_argument('--gifs_save_interval', type=int)
    argparse.add_argument('--gradient_clipping', type=float)
    argparse.add_argument('--render', action='store_true')
    argparse.add_argument('--critic_coefficient', type=float)

    args = argparse.parse_args()

    print('Creating {} environments for parallel processing'.format(
        args.threads))
    args.environments = [
        gym.make(args.environment) for _ in range(args.threads)
    ]

    args.optimizer = tf.keras.optimizers.SGD(args.learning_rate)
    args.actor_critic = ActorCritic(
        args.environments[0].action_space.n,
        input_shape=args.environments[0].observation_space.sample().shape)
    args.actor_critic.set_threads(args.threads)

    sample_input = process_screen(
        args.environments[0].observation_space.sample())
    args.actor_critic(sample_input, 0)
    args.actor_critic.reset_thread_states(0)
    if args.checkpoint_path is not None:
        args.actor_critic.load_weights(args.checkpoint_path)

    args.summary_writer = tf.summary.create_file_writer(args.log_dir)

    run_training_procedure(args)
Example No. 8
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch
        self.network = ActorCritic(opt).to(device)
        self.optimizer = Adam(self.network.parameters(), lr=opt.lr)
        self.network.share_memory()
Example No. 9
class Learner(object):
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch
        self.network = ActorCritic(opt).to(device)
        self.optimizer = Adam(self.network.parameters(), lr=opt.lr)
        self.network.share_memory()

    def learning(self):
        torch.manual_seed(self.opt.seed)
        coef_hat = torch.Tensor([[self.opt.coef_hat]]).to(device)
        rho_hat = torch.Tensor([[self.opt.rho_hat]]).to(device)
        while True:
            # batch-trace
            # s[batch, n_step+1, 3, width, height]
            # a[batch, n_step, a_space]
            # rew[batch, n_step]
            # a_prob[batch, n_step, a_space]
            s, a, rew, prob = self.q_batch.get(block=True)
            ###########################
            # variables we need later #
            ###########################
            v, coef, rho, entropies, log_prob = [], [], [], [], []
            cx = torch.zeros(self.opt.batch_size, 256).to(device)
            hx = torch.zeros(self.opt.batch_size, 256).to(device)
            for step in range(s.size(1)):
                # value[batch]
                # logit[batch, 12]
                value, logit, (hx, cx) = self.network((s[:, step, ...], (hx, cx)))
                v.append(value)
                # note: s has n_step+1 entries while a has only n_step;
                # the loop runs n_step+1 times because v at step n_step+1 is needed
                if step >= a.size(1):
                    break

                # π/μ[batch]
                # TODO: cumprod might produce runtime problem
                logit_a = a[:, step, :] * logit.detach() + (
                    1 - a[:, step, :]) * (1 - logit.detach())
                prob_a = a[:, step, :] * prob[:, step, :] + (
                    1 - a[:, step, :]) * (1 - prob[:, step, :])
                is_rate = torch.cumprod(logit_a / (prob_a + 1e-6), dim=1)[:, -1]
                coef.append(torch.min(coef_hat, is_rate))
                rho.append(torch.min(rho_hat, is_rate))

                # enpy_aspace[batch, 12]
                # calculating the entropy[batch, 1]
                # more precisely there are [a_space] entropies per batch element; sum over them here
                # note: do not use detach here
                enpy_aspace = -torch.log(logit) * logit - torch.log(
                    1 - logit) * (1 - logit)
                enpy = (enpy_aspace).sum(dim=1, keepdim=True)
                entropies.append(enpy)

                # calculate the probability that the action is taken by the target policy,
                # i.e. prob_pi_a[batch, 12] and its log_prob[batch, 1]
                # note: do not use detach here
                prob_pi_a = (a[:, step, :] *
                             logit) + (1 - a[:, step, :]) * (1 - logit)
                log_prob_pi_a = torch.log(prob_pi_a).sum(dim=1, keepdim=True)
                log_prob.append(log_prob_pi_a)
                # prob_pi_a = torch.cumprod(prob_pi_a, dim=1)[:, -1:]
                # log_prob_pi_a = torch.log(prob_pi_a)

            ####################
            # calculating loss #
            ####################
            policy_loss = 0
            value_loss = 0
            # gae = torch.zeros(self.opt.batch_size, 1)
            for rev_step in reversed(range(s.size(1) - 1)):
                # compute v_(s+1)[batch] for policy gradient
                fix_vp = rew[:, rev_step] + self.opt.gamma * (
                    v[rev_step + 1] + value_loss) - v[rev_step]

                # value_loss[batch]
                td = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                value_loss = (self.opt.gamma * coef[rev_step] * value_loss
                              + rho[rev_step] * td)

                # policy_loss = policy_loss - log_probs[i] * Variable(gae)
                # the td must be detached from the value network

                # # delta_t[batch]
                # delta_t = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                # gae = gae * self.opt.gamma + delta_t.detach()

                policy_loss = policy_loss \
                              - rho[rev_step] * log_prob[rev_step] * fix_vp.detach() \
                              - self.opt.entropy_coef * entropies[rev_step]

            self.optimizer.zero_grad()
            policy_loss = policy_loss.sum()
            value_loss = value_loss.sum()
            loss = policy_loss + self.opt.value_loss_coef * value_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                           self.opt.max_grad_norm)
            print("v_loss {:.3f} p_loss {:.3f}".format(value_loss.item(),
                                                       policy_loss.item()))
            self.optimizer.step()
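The learner above clips the importance ratios with coef_hat (the c̄ truncation) and rho_hat (the ρ̄ truncation), mirroring the V-trace correction from IMPALA. As a reference point, a minimal sketch of the textbook V-trace target recursion on toy tensors; the function name and signature are illustrative, not the project's API:

# Illustrative V-trace target computation (assumed helper, not project code):
#   v_s - V(x_s) = delta_s + gamma * c_s * (v_{s+1} - V(x_{s+1})),
#   delta_s = rho_s * (r_s + gamma * V(x_{s+1}) - V(x_s))
import torch

def vtrace_targets(values, next_value, rewards, rhos, cs, gamma=0.99):
    # values, rewards, rhos, cs: [T]; next_value: scalar tensor
    values_ext = torch.cat([values, next_value.view(1)])
    vs_minus_v = torch.zeros_like(values)
    acc = torch.zeros(())
    for t in reversed(range(rewards.size(0))):
        delta = rhos[t] * (rewards[t] + gamma * values_ext[t + 1] - values_ext[t])
        acc = delta + gamma * cs[t] * acc
        vs_minus_v[t] = acc
    return values + vs_minus_v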
Example No. 10
class Actor(object):
    def __init__(self, opt, q_trace, learner):
        self.opt = opt
        self.q_trace = q_trace
        self.learner = learner

        # game environment
        self.env = None
        # s_channel = self.env.observation_space.shape[0]
        # a_space = self.env.action_space

        # network
        self.behaviour = ActorCritic(opt).to(device)

    def performing(self, rank):
        torch.manual_seed(self.opt.seed)
        # each worker initializes its own environment
        self.env = retro.make(game=self.opt.env)
        self.env.seed(self.opt.seed + rank)

        s = self.env.reset()
        s = transform(s).unsqueeze(dim=0).to(device)
        episode_length = 0
        r_sum = 0.
        done = True
        while True:
            # sync the behaviour policy with the learner's latest weights
            # print(type(self.learner.network.state_dict()))
            self.behaviour.load_state_dict(self.learner.network.state_dict())
            # LSTM
            if done:
                cx = torch.zeros(1, 256).to(device)
                hx = torch.zeros(1, 256).to(device)
            else:
                cx = cx.detach()
                hx = hx.detach()

            trace_s, trace_a, trace_rew, trace_aprob = [], [], [], []
            # collect n-step
            for n in range(self.opt.n_step):
                episode_length += 1
                #  add to trace - 0
                trace_s.append(s)
                value, logit, (hx, cx) = self.behaviour((s, (hx, cx)))
                logit = logit.detach()
                action = torch.bernoulli(logit)

                s, rew, done, info = self.env.step(
                    action.squeeze().to("cpu").numpy().astype(np.int8))
                r_sum += rew
                s = transform(s).unsqueeze(dim=0).to(device)
                rew = torch.Tensor([rew]).to(device)
                done = done or episode_length >= self.opt.max_episode_length

                #  add to trace - 1
                trace_a.append(action)
                trace_rew.append(rew)
                trace_aprob.append(logit)
                if done:
                    print("over, reward {}".format(r_sum))
                    r_sum = 0
                    episode_length = 0
                    # game over punishment
                    trace_rew[-1] = torch.Tensor([-200.]).to(device)
                    break
            # add to trace - 2
            trace_s.append(s)
            # stack n-step
            # s[n_step+1, 3, width, height]
            # a[n_step, a_space]
            # rew[n_step]
            # a_prob[n_step]
            trace_s = torch.cat(tuple(trace_s), dim=0)
            zeros = torch.zeros((self.opt.n_step + 1, ) +
                                trace_s.size()[1:]).to(device)  # expand
            zeros[:trace_s.size(0)] += trace_s
            trace_s = zeros

            trace_a = torch.cat(tuple(trace_a), dim=0)
            zeros = torch.zeros((self.opt.n_step, ) + trace_a.size()[1:]).to(
                device)  # expand
            zeros[:trace_a.size(0)] += trace_a
            trace_a = zeros

            trace_rew = torch.cat(tuple(trace_rew), dim=0)
            zeros = torch.zeros(self.opt.n_step).to(device)  # expand
            zeros[:trace_rew.size(0)] += trace_rew
            trace_rew = zeros

            trace_aprob = torch.cat(tuple(trace_aprob), dim=0)
            zeros = torch.zeros((self.opt.n_step, ) +
                                trace_aprob.size()[1:]).to(device)  # expand
            zeros[:trace_aprob.size(0)] += trace_aprob
            trace_aprob = zeros

            # submit trace to queue
            self.q_trace.put((trace_s.to("cpu"), trace_a.to("cpu"),
                              trace_rew.to("cpu"), trace_aprob.to("cpu")),
                             block=True)

            if done:
                s = self.env.reset()
                s = transform(s).unsqueeze(dim=0).to(device)
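The Actor above pulls fresh weights from learner.network and pushes n-step traces into q_trace. How the pieces are launched is not shown in the snippets; a hypothetical wiring with torch.multiprocessing (queue sizes, actor count, and the trace-to-batch step are assumptions):

# Hypothetical launcher for the Actor/Learner classes above; not project code.
import torch.multiprocessing as mp

def launch(opt, num_actors=4):
    q_trace = mp.Queue(maxsize=32)   # actors -> batching step (not shown in the snippets)
    q_batch = mp.Queue(maxsize=8)    # batches -> learner
    learner = Learner(opt, q_batch)
    actors = [Actor(opt, q_trace, learner) for _ in range(num_actors)]
    procs = [mp.Process(target=actor.performing, args=(rank,))
             for rank, actor in enumerate(actors)]
    for p in procs:
        p.start()
    learner.learning()               # the learner runs in the main process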
Example No. 11
#!/usr/bin/python3

from pendulum import Pendulum
from network import ActorCritic

import numpy as np
import pickle
import os.path
import random

actorCritic = ActorCritic(Pendulum.state_size, Pendulum.action_size)

experiences = []
if os.path.exists('experiences.p'):
    experiences = pickle.load(open("experiences.p", "rb"))
print('experiences ', len(experiences))

pendulum = Pendulum(Pendulum.random_theta())
round = 0
iteration = 0
action0 = False

while round < 27:

    state0 = pendulum.state()

    actions = actorCritic.run_actor([state0])
    if random.random() < 0.5:
        action1 = np.random.choice(Pendulum.action_size, 1)[0]
    else:
        action1 = np.argmax(actions)
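Both pendulum scripts mix uniform random exploration with the actor's greedy choice (with probability 0.25 in one script and 0.5 in the other). A tiny helper capturing that epsilon-greedy rule, with an invented name for illustration:

# Illustrative helper only; name and signature are not from the project.
import random
import numpy as np

def epsilon_greedy(action_scores, epsilon):
    """With probability epsilon pick a uniformly random action index,
    otherwise take the index of the highest-scoring action."""
    if random.random() < epsilon:
        return np.random.choice(len(action_scores))
    return int(np.argmax(action_scores))

# e.g. action1 = epsilon_greedy(actions, 0.5) reproduces the branch above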
Example No. 12
        def _func():
            # lock used to pause training during validation
            if self.val_mode:
                # the main thread holds this lock; it is released when validation is done
                self.val_lock.acquire()
                self.val_lock.release()

            # signal that we are entering a section that modifies the network
            with self.val_counter.get_lock():
                self.val_counter.value += 1

            #get batch
            this_x, this_y, this_comma, this_punctuation, this_addMax, mask, seqs, q_a_words = self.train_generator(
            )

            if self.mode == 1:
                return_mode = 1
                pred_loss, pred, entropy_loss = self.sampler.consume_sample_full_read(
                    this_x, seqs, this_y, self.scale_pred, q_a_words)

                prediction_correct, reward_pred = self.prediction_correct(
                    pred, this_y)

                # signal that we are leaving the section that modifies the network
                with self.val_counter.get_lock():
                    self.val_counter.value -= 1

                return pred_loss, prediction_correct, np.zeros(1,dtype=np.float32),np.zeros(1,dtype=np.float32),\
                       np.zeros(1,dtype=np.float32),entropy_loss,np.zeros(1,dtype=np.float32)

            elif self.mode == 2:
                return_mode = 2
                batch_size = this_x.shape[1]

                self.i += 1
                if self.i % self.update_after == 0:
                    with self.sync_lock:
                        self.sampler.sync()

                read_words, action_agent_1, value_agent_1, action_agent_2, value_agent_2, predictions, is_not_done,\
                    probs_agent_1, probs_agent_2= \
                    self.sampler.get_sample(this_x, seqs, this_comma, this_punctuation, q_a_words)

                # construct new input to the training network, consisting only of the read words
                is_not_done = np.append(is_not_done,
                                        np.zeros((1, batch_size),
                                                 dtype=np.bool),
                                        axis=0)
                time_length_reduced_input = np.argmin(is_not_done, axis=0)
                max_len = np.max(time_length_reduced_input)
                # make them one step longer to avoid out-of-bounds errors when looping in the LSTM
                reduced_x = np.zeros(shape=(max_len + 1, batch_size),
                                     dtype=np.int32)
                reduced_action_1 = np.zeros(shape=(max_len, batch_size),
                                            dtype=np.int32)
                reduced_value_1 = np.zeros(shape=(max_len, batch_size),
                                           dtype=np.int32)
                reduced_action_2 = np.zeros(shape=(max_len, batch_size),
                                            dtype=np.int32)
                reduced_value_2 = np.zeros(shape=(max_len, batch_size),
                                           dtype=np.int32)
                reduced_probs_1 = np.zeros(shape=(max_len, batch_size),
                                           dtype=np.float32)
                reduced_probs_2 = np.zeros(shape=(max_len, batch_size),
                                           dtype=np.float32)

                number_skips = np.zeros(shape=(batch_size), dtype=np.int)

                for (i, max) in enumerate(time_length_reduced_input):
                    reduced_x[:max, i] = this_x[read_words[:max, i], i]
                    reduced_action_1[:max, i] = action_agent_1[:max, i]
                    reduced_value_1[:max, i] = value_agent_1[:max, i]
                    reduced_action_2[:max, i] = action_agent_2[:max, i]
                    reduced_value_2[:max, i] = value_agent_2[:max, i]
                    reduced_probs_1[:max, i] = probs_agent_1[:max, i]
                    reduced_probs_2[:max, i] = probs_agent_2[:max, i]
                    number_skips[i] = np.sum(
                        action_agent_1[:max, i] == ActorCritic.agent_1_skip())

                #compute the reward

                prediction_correct, reward_pred = self.prediction_correct(
                    predictions, this_y)
                rolling_reward_agent_1, rolling_reward_agent_2, reward_at_end, is_not_done = self.rolling_reward(
                    reduced_action_1, reduced_action_2,
                    time_length_reduced_input, seqs, prediction_correct)

                #t_flip = (((prediction_correct == 0) * -1) + (prediction_correct == 1))
                #rolling_reward_agent_1 = rolling_reward_agent_1 * t_flip
                #rolling_reward_agent_2 = rolling_reward_agent_2 * t_flip
                #reward_at_end = reward_at_end * t_flip
                #print(rolling_reward_agent_1.shape,rolling_reward_agent_2.shape, reward_at_end.shape)

                final_reward = reward_at_end + reward_pred

                #if prediction correct, use is_not_done to also remove the updates in the agent for these
                #print(is_not_done*prediction_correct)
                #to_add_tmp = prediction_correct==0 * 0.5
                #is_not_done = is_not_done*(prediction_correct+to_add_tmp)

                #compute actual advantage:
                sampled_advantage_1 = final_reward - rolling_reward_agent_1
                sampled_advantage_2 = final_reward - rolling_reward_agent_2
                pred_loss, actor_loss, critic_loss, entropy_loss = self.sampler.consume_sample(
                    reduced_x,
                    time_length_reduced_input,
                    sampled_advantage_1,
                    reduced_action_1,
                    sampled_advantage_2,
                    reduced_action_2,
                    is_not_done,
                    this_y,
                    final_reward,
                    self.scale_pred,
                    self.scale_critic,
                    self.scale_actor,
                    q_a_words,
                    reduced_probs_1,
                    reduced_probs_2,
                    embedding_train=self.embed_partial)

                #general logging of behavior
                self.runned_batches += 1
                reading_percentage = (time_length_reduced_input) / seqs

                #start with advantage based directly on reward

                # signal that we are leaving the section that modifies the network
                with self.val_counter.get_lock():
                    self.val_counter.value -= 1

                return pred_loss, prediction_correct, np.ndarray.astype(reading_percentage,np.float32), actor_loss,\
                       critic_loss, entropy_loss, final_reward
Example No. 13
    batch_size = 50
    is_Q_A = False
    sync_lock = Lock()
    val_lock = Lock()
    val_counter = Value("i", 0)

    NUMBER_THREADS = 3

    tf.reset_default_graph()

    dg = DataGenerator(
        dataset_name, folder_data, folder_result, batch_size, is_Q_A, w2v=None
    )  # w2v=None: no embeddings, as they are slow to use when just debugging
    ac_consumer = ActorCritic(dg.batch_size,
                              dg.vocab_size,
                              dg.number_targets,
                              scope_name="consumer",
                              device="/gpu:0")
    samplers = [
        ActorCritic(dg.batch_size,
                    dg.vocab_size,
                    dg.number_targets,
                    consumer=ac_consumer,
                    scope_name="sampler_" + str(i),
                    device="/cpu:0") for i in range(NUMBER_THREADS)
    ]
    init_op = [
        tf.global_variables_initializer(),
        tf.local_variables_initializer()
    ]
    sess = tf.Session(config=tf.ConfigProto(