def optimize(self, params, optimizer, shared_model, gpu_id):
        if 'Unreal' in self.args.env:
            self.gate_ids = self.env.env.env.env.gate_ids
        else:
            self.gate_ids = self.env.gate_ids
            self.random_ids = self.env.random_ids

        R = torch.zeros(self.num_agents, 1).to(self.device)
        if not self.done:
            # predict value
            state = self.state
            value_multi, *others = self.model(
                (
                    (Variable(state, requires_grad=True), (self.img_hxs, self.img_cxs)),
                    (Variable(torch.Tensor(self.cam_info), requires_grad=True), (self.pose_hxs, self.pose_cxs)),
                    (Variable(torch.Tensor(self.pre_actions), requires_grad=True),
                     Variable(torch.Tensor(self.gate_ids)),
                     Variable(torch.Tensor(self.random_ids)))
                )
            )
            for i in range(self.num_agents):
                R[i][0] = value_multi[i].data

        self.values.append(Variable(R).to(self.device))
        policy_loss = torch.zeros(self.num_agents, 1).to(self.device)
        value_loss = torch.zeros(self.num_agents, 1).to(self.device)
        pred_loss = torch.zeros(1, 1).to(self.device)
        entropies = torch.zeros(self.num_agents, 1).to(self.device)
        w_entropies = torch.Tensor([[float(self.args.entropy)] for i in range(self.num_agents)]).to(self.device)

        R = Variable(R, requires_grad=True).to(self.device)
        gae = torch.zeros(1, 1).to(self.device)
        for i in reversed(range(len(self.rewards))):
            R = self.args.gamma * R + self.rewards[i]
            advantage = R - self.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = self.rewards[i] + self.args.gamma * self.values[i + 1].data - self.values[i].data
            gae = gae * self.args.gamma * self.args.tau + delta_t
            policy_loss = policy_loss - \
                      (self.log_probs[i] * Variable(gae)) - \
                      (w_entropies * self.entropies[i])
            entropies += self.entropies[i]

        policy_loss = policy_loss[self.env.random_ids]
        value_loss = value_loss[self.env.random_ids]

        loss = policy_loss.sum() + 0.5 * value_loss.sum()
        self.model.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(params, 50)
        ensure_shared_grads(self.model, shared_model, gpu=gpu_id >= 0)

        optimizer.step()

        values0 = self.values[0].data
        self.clear_actions()
        return policy_loss, value_loss, entropies, pred_loss, values0
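Every example in this listing funnels the locally computed gradients into a shared (global) model via an ensure_shared_grads helper before stepping the shared optimizer. The exact signature varies between projects (some pass a gpu flag, others pass source and target devices); the following is only a minimal sketch of the conventional A3C-style helper, not the implementation used by any particular example above or below.

def ensure_shared_grads(model, shared_model, gpu=False):
    # Copy each worker gradient into the corresponding shared parameter.
    # On CPU the gradient tensors can be shared by reference, so once the
    # shared model already holds gradients there is nothing left to do.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return
        if not gpu:
            shared_param._grad = param.grad
        else:
            shared_param._grad = param.grad.cpu()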
Example #2
def train(shared_model, optimizer, wholes, scaffolds, whole_conditions, scaffold_conditions, pid, retval_list, args):
    """\
    Target function for the multiprocessed training.

    In addition to updating model parameters, 
    loss values are collected by `retval_list` after each `forward`.

    Parameters
    ----------
    shared_model: torch.nn.Module
        A shared model to be trained.
    optimizer: torch.optim.Optimizer
        A shared optimizer.
    wholes: list[str]
        A list of whole-molecule SMILES strings.
    scaffolds: list[str]
        A list of scaffold SMILES strings.
    whole_conditions: list[ list[float] ]
        [
            [ value1, value2, ... ],  # condition values of whole 1
            [ value1, value2, ... ],  # condition values of whole 2
        ]
    scaffold_conditions: list[ list[float] ]
        Similar to `whole_conditions`, but with scaffold values.
    pid: int
        CPU index.
    retval_list: list[multiprocessing.managers.ListProxy]
        A list of lists to collect loss floats from CPUs.
        In each cycle, the final shape will be:
            (ncpus, minibatch_size, num_of_losses)
    args: argparse.Namespace
        Delivers parameters from command arguments to the model.
    """
    # each process makes its own local model
    model = ggm(args)
    for idx in range(len(wholes)):
        # copy the parameters of the shared (reference) model into the local model
        model.load_state_dict(shared_model.state_dict())
        model.zero_grad()
        optimizer.zero_grad()
        
        #forward
        retval = model(wholes[idx], scaffolds[idx], whole_conditions[idx], scaffold_conditions[idx], args.shuffle_order)
        
        # if retval is None, an error occurred, usually due to an invalid SMILES
        if retval is None:
            continue

        #train model
        g_gen, h_gen, loss1, loss2, loss3 = retval
        loss = loss1 + loss2*args.beta1 + loss3  # torch.autograd.Variable of shape (1,)
        retval_list[pid].append((loss.data.cpu().numpy()[0], loss1.data.cpu().numpy()[0], loss2.data.cpu().numpy()[0], loss3.data.cpu().numpy()[0]))
        loss.backward()

        #torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        utils.ensure_shared_grads(model, shared_model, True)
        optimizer.step()
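For context, a per-process target function like this is normally launched with torch.multiprocessing. The launcher below is only an illustrative sketch: launch_training, ncpus, and the round-robin data slicing are assumptions for this example and not part of the code above.

import torch.multiprocessing as mp

def launch_training(shared_model, optimizer, wholes, scaffolds,
                    whole_conditions, scaffold_conditions, args, ncpus=4):
    # One managed list per worker lets each process report its losses.
    manager = mp.Manager()
    retval_list = [manager.list() for _ in range(ncpus)]
    shared_model.share_memory()  # make parameters visible to all workers

    processes = []
    for pid in range(ncpus):
        p = mp.Process(target=train,
                       args=(shared_model, optimizer,
                             wholes[pid::ncpus], scaffolds[pid::ncpus],
                             whole_conditions[pid::ncpus],
                             scaffold_conditions[pid::ncpus],
                             pid, retval_list, args))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    return retval_list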
Example #3
    def work(self):
        """
        Worker training procedure
        """
        self.step = 0

        self.model_state = copy.deepcopy(
            self.local_model.init_state(self.device))

        while True:

            self.step += 1

            # update local variables with the weights
            # of the global net
            if self.cfg.USE_GPU:
                with torch.cuda.device(self.gpu_id):
                    self.local_model.load_state_dict(
                        self.global_model.state_dict())
            else:
                self.local_model.load_state_dict(
                    self.global_model.state_dict())

            # accumulate some experience
            # and build the loss
            loss = self.process_rollout()

            # backward pass and
            # update the global model weights
            self.local_model.zero_grad()
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self.local_model.parameters()), 40.0)
            ut.ensure_shared_grads(self.local_model,
                                   self.global_model,
                                   use_gpu=self.cfg.USE_GPU)
            self.optimizer.step()

            self.logger.log_value('loss',
                                  self.step,
                                  loss.item(),
                                  print_value=False,
                                  to_file=False)

            if (self.step % self.cfg.SAVE_STEP) == 0 and (
                    self.ident % 4 == 0):  #self.name == 'a3c_train_worker_0':
                torch.save(self.global_model.state_dict(), self.ckpt_path)
                print('Variables saved')

            if self.episode_count > self.cfg.MAX_EPISODES:
                # terminate the training
                if self.worker_name == 'a3c_train_worker_0':
                    torch.save(self.global_model.state_dict(), self.ckpt_path)
                    print('Variables saved')
                break
Example #4
    def training(self, next_observation, shared_model, shared_optimizer,
                 params):
        self.model.train()
        self.n_update += 1
        self.cx = Variable(self.cx.data)
        self.hx = Variable(self.hx.data)

        R = torch.zeros(1, 1)
        if not self.done:
            self.state = preprocess(next_observation)
            with torch.cuda.device(self.gpu_id):
                obs = Variable(torch.FloatTensor(self.state)).cuda()
            value, _, _, _, _ = self.model(obs, self.target, self.hx, self.cx,
                                           self.eps_len, self.external_memory,
                                           self.gpu_id)
            R = value.data

        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                R = R.cuda()
        R = Variable(R)
        self.values.append(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                gae = gae.cuda()

        for i in reversed(range(len(self.rewards))):
            R = params.gamma * R + self.rewards[i]
            advantage = R - self.values[i]
            value_loss = value_loss + advantage.pow(2)  # 0.5 *

            # Generalized Advantage Estimation
            delta_t = params.gamma * self.values[
                i + 1].data - self.values[i].data + self.rewards[i]

            gae = gae * params.gamma * params.tau + delta_t

            policy_loss = policy_loss - self.log_probs[i] * Variable(
                gae) - params.entropy_coef * self.entropies[i]

        self.model.zero_grad()
        (policy_loss +
         params.value_loss_coef * value_loss).backward()  #retain_graph=True
        clip_grad_norm_(self.model.parameters(), 1.0)
        ensure_shared_grads(self.model, shared_model, gpu=self.gpu_id >= 0)
        shared_optimizer.step()
        with torch.cuda.device(self.gpu_id):
            self.model.load_state_dict(
                shared_model.state_dict())  #model update
        self.clear_actions()
Example #5
    def train(self, global_t, summary_writer=None):
        t = self.local_t
        if not self.replay_buffer.is_full():
            self.fill_experience()
            return 0  # time_step = 0
        # sync
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.model.load_state_dict(self.shared_model.state_dict())
        else:
            self.model.load_state_dict(self.shared_model.state_dict())

        loss_a3c, episode_score = self.process_a3c()
        # get hx, cx
        h0, c0 = self.hx.detach(), self.cx.detach()
        loss_pc = self.process_pc(h0=h0, c0=c0)

        h0, c0 = self.hx.detach(), self.cx.detach()
        loss_vr = self.process_vr(h0, c0)

        loss_rp = self.process_rp()

        loss = loss_a3c + loss_pc + loss_vr + loss_rp

        self.model.zero_grad()
        loss.backward()

        clip_grad_norm_(self.model.parameters(), 40.0)
        ensure_shared_grads(self.model,
                            self.shared_model,
                            gpu=self.gpu_id >= 0)

        self.adjust_learning_rate(optimizer=self.optimizer,
                                  global_time_step=global_t)
        self.optimizer.step()
        if summary_writer is not None:
            with torch.no_grad():
                losses = list(
                    map(lambda x: float(x.detach().cpu().numpy()),
                        [loss_a3c, loss_pc, loss_vr, loss_rp, loss]))
                tags = dict(
                    zip(['a3c', 'pc', 'vr', 'rp', 'total_loss'], losses))
                summary_writer.add_scalars('losses',
                                           tags,
                                           global_step=global_t)
                # score
                if episode_score:
                    summary_writer.add_scalars('score',
                                               {'score': episode_score},
                                               global_step=global_t)
        self._print_log(global_t)
        return self.local_t - t  # offset
Example #6
    def training(self, next_obs, shared_model, shared_optimizer, params):
        #pdb.set_trace()
        # self.model.train()
        self.cx = Variable(self.cx.data)
        self.hx = Variable(self.hx.data)

        R = torch.zeros(1, 1)
        if not self.done:
            state = preprocessing(next_obs, self.obs_old, self.gpu_id)
            value, _, _, _ = self.model(state, self.hx, self.cx)
            R = value.data

        with torch.cuda.device(self.gpu_id):
            R = R.cuda()
        R = Variable(R)
        self.values.append(R)

        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)

        with torch.cuda.device(self.gpu_id):
            gae = gae.cuda()

        for i in reversed(range(len(self.rewards))):
            R = params.gamma * R + self.rewards[i]
            advantage = R - self.values[i]
            value_loss = value_loss + advantage.pow(2)  # 0.5 *

            # Generalized Advantage Estimation
            delta_t = params.gamma * self.values[
                i + 1].data - self.values[i].data + self.rewards[i]

            gae = gae * params.gamma * params.tau + delta_t

            policy_loss = policy_loss - self.log_probs[i] * Variable(
                gae) - params.entropy_coef * self.entropies[i]

        shared_optimizer.zero_grad()
        loss = policy_loss + params.value_loss_coef * value_loss
        loss.backward()

        clip_grad_norm_(self.model.parameters(), 50.0)
        ensure_shared_grads(self.model, shared_model, gpu=self.gpu_id >= 0)
        shared_optimizer.step()

        # self.synchronize(shared_model)
        with torch.cuda.device(self.gpu_id):
            self.model.load_state_dict(shared_model.state_dict())
        self.clear_all()
Example #7
    def optimize(self, params, optimizer, shared_model, training_mode,
                 device_share):
        R = torch.zeros(len(self.rewards[0]), 1).to(self.device)
        if not self.done:
            # predict value
            state = self.state
            value_multi, *others = self.model(
                Variable(state, requires_grad=True))
            for i in range(len(self.rewards[0])):  # num_agent
                R[i][0] = value_multi[i].data

        self.values.append(Variable(R).to(self.device))

        batch_size = len(self.entropies[0][0])
        policy_loss = torch.zeros(batch_size, 1).to(self.device)
        value_loss = torch.zeros(1, 1).to(self.device)
        entropies = torch.zeros(batch_size, self.dim_action).to(self.device)
        w_entropies = float(self.args.entropy)

        R = Variable(R, requires_grad=True).to(self.device)
        gae = torch.zeros(1, 1).to(self.device)

        for i in reversed(range(len(self.rewards))):
            R = self.args.gamma * R + self.rewards[i]
            advantage = R - self.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = self.rewards[i] + self.args.gamma * self.values[
                i + 1].data - self.values[i].data
            gae = gae * self.args.gamma * self.args.tau + delta_t
            policy_loss = policy_loss - \
                (self.log_probs[i] * Variable(gae)) - \
                (w_entropies * self.entropies[i])
            entropies += self.entropies[i].sum()

        self.model.zero_grad()
        loss = policy_loss.sum() + 0.5 * value_loss.sum()
        loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(params, 50)
        ensure_shared_grads(self.model, shared_model, self.device,
                            device_share)
        optimizer.step()

        self.clean_buffer(self.done)

        return policy_loss, value_loss, entropies
Example #8
def train (rank, args, shared_model, optimizer, env_conf, datasets=None):
    ptitle('Training Agent: {}'.format(rank))
    print ('Start training agent: ', rank)
    
    if rank == 0:
        logger = Logger (args.log_dir)
        train_step = 0

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf ["env_gpu"] = gpu_id
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    if "EM_env" in args.env:
        raw, lbl, prob, gt_lbl = datasets
        env = EM_env (raw, lbl, prob, env_conf, 'train', gt_lbl)
    else:
        env = Voronoi_env (env_conf)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop (shared_model.parameters (), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam (shared_model.parameters (), lr=args.lr, amsgrad=args.amsgrad)

        # env.seed (args.seed + rank)
    if not args.continuous:
        player = Agent (None, env, args, None)
    else:
        player = Agent_continuous (None, env, args, None)
    player.gpu_id = gpu_id
    if not args.continuous:
        player.model = A3Clstm (env.observation_space.shape, env_conf["num_action"], args.hidden_feat)
    else:
        player.model = A3Clstm_continuous (env.observation_space.shape, env_conf["num_action"], args.hidden_feat)

    player.state = player.env.reset ()
    player.state = torch.from_numpy (player.state).float ()
    old_score = player.env.old_score
    final_score = 0

    if gpu_id >= 0:
        with torch.cuda.device (gpu_id):
            player.state = player.state.cuda ()
            player.model = player.model.cuda ()
    player.model.train ()

    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0
        mean_log_prob = 0

    # print ("rank: ", rank)

    while True:
        if gpu_id >= 0:
            with torch.cuda.device (gpu_id):
                player.model.load_state_dict (shared_model.state_dict ())
        else:
            player.model.load_state_dict (shared_model.state_dict ())
        
        if player.done:
            player.eps_len = 0
            if rank == 0:
                if 0 <= (train_step % args.train_log_period) < args.max_episode_length:
                    print ("train: step", train_step, "\teps_reward", eps_reward, 
                        "\timprovement", final_score - old_score)
                old_score = player.env.old_score
                pinned_eps_reward = eps_reward
                eps_reward = 0
                mean_log_prob = 0
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, args.hidden_feat).cuda())
                    player.hx = Variable(torch.zeros(1, args.hidden_feat).cuda())
            else:
                player.cx = Variable(torch.zeros(1, args.hidden_feat))
                player.hx = Variable(torch.zeros(1, args.hidden_feat))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train ()
            if rank == 0:
                # if 0 <= (train_step % args.train_log_period) < args.max_episode_length:
                #     print ("train: step", train_step, "\taction = ", player.action)
                eps_reward += player.reward
                # print (eps_reward)
                mean_log_prob += player.log_probs [-1] / env_conf ["T"]
            if player.done:
                break

        if player.done:
            # if rank == 0:
            #     print ("----------------------------------------------")
            final_score = player.env.old_score
            state = player.env.reset ()
            player.state = torch.from_numpy (state).float ()
            if gpu_id >= 0:
                with torch.cuda.device (gpu_id):
                    player.state = player.state.cuda ()

        R = torch.zeros (1, 1)
        if not player.done:
            if not args.continuous:
                value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            else:
                value, _, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)

        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = player.values[i + 1].data * args.gamma + player.rewards[i] - \
                        player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t
            # print (player.rewards [i])
            if not args.continuous:
                policy_loss = policy_loss - \
                    player.log_probs[i] * \
                    Variable(gae) - 0.01 * player.entropies[i]
            else:
                policy_loss = policy_loss - \
                    player.log_probs[i].sum () * Variable(gae) - \
                    0.01 * player.entropies[i].sum ()

        player.model.zero_grad ()
        sum_loss = (policy_loss + value_loss)

        sum_loss.backward ()
        ensure_shared_grads (player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step ()
        player.clear_actions ()

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0:
                log_info = {
                    # 'train: sum_loss': sum_loss, 
                    'train: value_loss': value_loss, 
                    'train: policy_loss': policy_loss, 
                    'train: advantage': advantage,
                    # 'train: entropy': entropy,
                    'train: eps reward': pinned_eps_reward,
                    # 'train: mean log prob': mean_log_prob
                }

                for tag, value in log_info.items ():
                    logger.scalar_summary (tag, value, train_step)
Example #9
def trainhoc(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = OC_env(args.env)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = HOCAgent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = HOCModel(player.env.observation_space.shape[0],
                            player.env.action_space, args.options, args.width)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    threshold = 0
    EnvNumSteps = 0
    while True:
        if EnvNumSteps > threshold:
            threshold += 5000
            print("thread:", rank, "steps:", EnvNumSteps)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            ### add in option selection part
            probo1, logpo1, player.o1 = player.model.getPolicyO1(
                Variable(player.state))
            probo2, logpo2, player.o2 = player.model.getPolicyO2(
                Variable(player.state), player.o1)

        else:
            player.o1 = player.o1
            player.o2 = player.o2

        for step in range(args.num_steps):
            EnvNumSteps += 1
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            q = player.model(Variable(player.state))

            v = q.max(-1)[0]
            R = v.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = torch.zeros(1, 1)
        value_loss = torch.zeros(1, 1)
        phi_loss = torch.zeros(1, 1)
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        thesize = len(player.rewards)
        for i in reversed(range(len(player.rewards))):
            ### update discounted reward
            before = R
            R = args.gamma * R + player.rewards[i]

            ### update value function
            difference1 = R - player.qs1[i]
            value_loss = value_loss + 0.5 * difference1.pow(2)
            difference2 = R - player.qs2[i]
            value_loss = value_loss + 0.5 * difference2.pow(2)
            if i + 1 < thesize:
                difference3 = before - player.values[i + 1]
                difference4 = before - player.qs1[i + 1]

            ### update policy
            # adv1 = R - player.qs1[i]
            delta2 = R - player.qs2[i]


            policy_loss = policy_loss - \
                player.log_probsa[i] * \
                Variable(delta2) - 0.1 * player.entropiesA[i]

            if i + 1 < thesize:
                beta1 = player.termprobs1[i + 1].data
                beta2 = player.termprobs2[i + 1].data

                policy_loss = policy_loss - \
                    args.gamma * player.log_probso1[i+1] * \
                    Variable(beta1 * beta2 * difference3.data) - 0.1 * player.entropieso1[i+1]

                policy_loss = policy_loss - \
                    args.gamma * player.log_probso2[i+1] * \
                    Variable(beta2 * difference4.data) - 0.1 * player.entropieso2[i+1]

                advantage1 = player.qs1[i + 1].data - player.values[
                    i + 1].data + args.delib
                phi_loss = phi_loss + \
                    args.gamma * player.termprobs1[i+1] * \
                    Variable(advantage1 * beta2, requires_grad=False)

                advantage2 = player.qs2[
                    i + 1].data - (1 - beta1) * player.qs1[i + 1].data - (
                        beta1 * player.values[i + 1].data) + args.delib
                phi_loss = phi_loss + \
                    args.gamma * player.termprobs2[i+1] * \
                    Variable(advantage2, requires_grad=False)

        player.model.zero_grad()
        (phi_loss.sum() + policy_loss.sum() +
         0.5 * value_loss.sum()).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Example #10
def train(rank, args, shared_model, optimizer):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args.env, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == 'MLP':
        player.model = A3C_MLP(player.env.observation_space.shape[0],
                               player.env.action_space, args.stack_frames)
    if args.model == 'CONV':
        player.model = A3C_CONV(args.stack_frames, player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):

            player.action_train()

            if player.done:
                break

        if player.done:
            player.eps_len = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)
        if not player.done:
            state = player.state
            if args.model == 'CONV':
                state = state.unsqueeze(0)
            value, _, _, _ = player.model(
                (Variable(state), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            #          print(player.rewards[i])
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Example #11
    def act(self):
        self.model.load_state_dict(self.shared_model.state_dict())
        self.model.train()

        log_probs, entropies, rewards, values = [], [], [], []
        for _ in range(self.t_max):
            pout, vout = self.model.pi_and_v(self.state_var)
            reward = self.env.receive_action(pout.action_indices[0])
            if self.clip_reward:
                reward = np.clip(reward, -1, 1)

            log_probs.append(pout.sampled_actions_log_probs)
            entropies.append(pout.entropy)
            values.append(vout)
            rewards.append(reward)

            if self.env.is_terminal:
                break

            self.update_state()

        R = 0
        if not self.env.is_terminal:
            _, vout = self.model.pi_and_v(self.state_var, keep_same_state=True)
            R = float(vout.data.numpy())
        else:
            self.env.reset()
            self.model.reset_state()
            self.update_state()

        t = len(rewards)
        pi_loss, v_loss = 0, 0
        for i in reversed(range(t)):
            R = self.gamma*R + rewards[i]
            v = values[i]

            advantage = R - float(v.data.numpy()[0, 0])
            # Accumulate gradients of policy
            log_prob = log_probs[i]
            entropy = entropies[i]
            # Log probability is increased proportionally to advantage
            pi_loss -= log_prob * advantage
            # Entropy is maximized
            pi_loss -= self.beta * entropy
            # Accumulate gradients of value function
            v_loss += (v - R).pow(2).div_(2)

        if self.pi_loss_coef != 1.0:
            pi_loss *= self.pi_loss_coef

        if self.v_loss_coef != 1.0:
            v_loss *= self.v_loss_coef

        # Normalize the loss of sequences truncated by terminal states
        if self.keep_loss_scale_same and t < self.t_max:
            factor = self.t_max / t
            pi_loss *= factor
            v_loss *= factor

        total_loss = pi_loss + v_loss

        # Compute gradients using thread-specific model
        self.optimizer.zero_grad()

        total_loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 40)
        # Copy the gradients to the globally shared model
        ensure_shared_grads(self.model, self.shared_model)

        self.optimizer.step()

        self.model.unchain_backward()

        return t
Example #12
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)

    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    _ = env.reset()
    action = env.action_space.sample()
    _, _, _, info = env.step(action)
    start_lives = info['ale.lives']

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    env.seed(args.seed + rank)
    state = env.reset()
    state = torch.from_numpy(state).float()
    done = True
    episode_length = 0
    current_life = start_lives
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 512))
            hx = Variable(torch.zeros(1, 512))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):

            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, info = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            if args.count_lives:
                if current_life > info['ale.lives']:
                    done = True
                else:
                    current_life = info['ale.lives']
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                current_life = start_lives
                state = env.reset()

            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:

            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example #13
def train(shared_model, optimizer, rank, global_steps, args):
    setproctitle('{}:train[{}]'.format(args.name, rank))

    torch.manual_seed(args.seed + rank)
    torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args.game_type, args.env_name, 'train:{}'.format(rank),
                     args.remotes[rank])
    env._max_episode_steps = args.max_episode_length
    env.seed(args.seed + rank)

    model = copy.deepcopy(shared_model)
    gpu_id = args.gpu_ids[rank]
    with torch.cuda.device(gpu_id):
        model = model.cuda() if gpu_id >= 0 else model
    model.train()
    optimizer = optimizer or optim.Adam(shared_model.parameters(), lr=args.lr)

    done = True
    try:
        while True:
            # Sync with the shared model
            with torch.cuda.device(gpu_id):
                model.load_state_dict(shared_model.state_dict())
            if done:
                with torch.cuda.device(gpu_id):
                    state = torch.from_numpy(env.reset()).float()
                    state = state.cuda() if gpu_id >= 0 else state
                model.reset()

            values, log_probs, rewards, entropies = [], [], [], []
            for step in range(args.n_steps):
                with global_steps.get_lock():
                    global_steps.value += 1

                value, logit = model(Variable(state.unsqueeze(0)))

                prob = F.softmax(logit)
                log_prob = F.log_softmax(logit)
                entropy = -(log_prob * prob).sum(1)

                action = prob.multinomial().data
                log_prob = log_prob.gather(1, Variable(action))

                raw_state, reward, done, _ = env.step(action.cpu().numpy())
                reward = max(min(reward, args.max_reward), args.min_reward)

                values.append(value)
                log_probs.append(log_prob)
                rewards.append(reward)
                entropies.append(entropy)

                if done:
                    break

                state = state.copy_(torch.from_numpy(raw_state).float())

            R = state.new().resize_((1, 1)).zero_()
            if not done:
                value, _ = model(Variable(state.unsqueeze(0), volatile=True),
                                 keep_same_state=True)
                R = value.data

            values.append(Variable(R))
            policy_loss, value_loss = 0, 0
            R = Variable(R)
            gae = state.new().resize_((1, 1)).zero_()
            for i in reversed(range(len(rewards))):
                R = args.gamma * R + rewards[i]
                advantage = R - values[i]
                value_loss = value_loss + 0.5 * advantage.pow(2)

                # Generalized Advantage Estimation
                delta_t = rewards[i] + args.gamma * values[
                    i + 1].data - values[i].data
                gae = gae * args.gamma * args.tau + delta_t
                policy_loss = policy_loss - log_probs[i] * Variable(
                    gae) - 0.01 * entropies[i]

            model.zero_grad()
            (policy_loss + 0.5 * value_loss).backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), 40)
            ensure_shared_grads(model, shared_model, gpu=gpu_id >= 0)
            optimizer.step()

            model.detach()

            if global_steps.value >= args.max_global_steps:
                break
    except Exception as e:
        raise
    finally:
        print('Trainer [{}] finished !'.format(rank))
Example #14
def train_rep(args, shared_model, env_conf):
    batch_size = 16
    train_times = args.rep_train_time
    trace = []
    td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]
    loss_fn = nn.CrossEntropyLoss()
    optimizer_r = Adam(shared_model.r_net.parameters(), lr=args.rl_r)
    optimizer_c = Adam(shared_model.c_net.parameters(), lr=args.rl_r)
    ptitle('Train rep')
    gpu_id = args.gpu_ids[-1]

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
            # player.model.r_net = player.model.r_net.cuda()
            # player.model.c_net = player.model.c_net.cuda()
    flag = True
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.train()
            flag = False

        player.action_test()
        trace.append(player.state)
        if len(trace) > args.trace_length:
            # train several hundred times
            for _ in range(train_times):
                range_c = np.random.randint(0, len(td_class))
                TD = np.random.randint(td_class[range_c][0],
                                       td_class[range_c][1])
                begin = np.random.randint(0, len(trace) - TD - batch_size)
                former = torch.stack(trace[begin:begin + batch_size], dim=0)
                latter = torch.stack(trace[begin + TD:begin + TD + batch_size],
                                     dim=0)
                target = torch.zeros(batch_size, dtype=torch.long) + range_c
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        former = former.cuda()
                        latter = latter.cuda()
                        target = target.cuda()

                rep_f, rep_l = player.model.r_net(former), player.model.r_net(
                    latter)
                output = player.model.c_net(rep_f, rep_l, False)
                loss = loss_fn(output, target)
                optimizer_r.zero_grad()
                optimizer_c.zero_grad()
                loss.backward()
                ensure_shared_grads(player.model.r_net,
                                    shared_model.r_net,
                                    gpu=gpu_id >= 0)
                ensure_shared_grads(player.model.c_net,
                                    shared_model.c_net,
                                    gpu=gpu_id >= 0)
                optimizer_r.step()
                optimizer_c.step()
            trace = []
        if player.done and not player.info:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True

            state = player.env.reset()
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
Example #15
def train(rank, args, shared_model, optimizer, env_conf, iters,
          checkpoint_path):
    iters = dill.loads(iters)
    if args.enable_gavel_iterator and rank == 0:
        iters._init_logger()
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    elapsed_time = 0
    start_time = time.time()

    for i in iters:
        if i % 100 == 0:
            print('GPU %d finished step %d' % (rank, i), flush=True)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
        elapsed_time += time.time() - start_time
        start_time = time.time()

        if (args.throughput_estimation_interval is not None
                and i % args.throughput_estimation_interval == 0
                and rank == 0):
            print('[THROUGHPUT_ESTIMATION]\t%s\t%d' % (time.time(), i))

        if (args.max_duration is not None
                and elapsed_time >= args.max_duration):
            break
    if args.enable_gavel_iterator and rank == 0:
        state = shared_model.state_dict()
        iters.save_checkpoint(state, checkpoint_path)
        iters.complete()
Example #16
def trainocpg(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = OC_env(args.env)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = OCPGAgent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = OCPGModel(player.env.observation_space.shape[0],
                             player.env.action_space, args.options, args.width)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    threshold = 0
    EnvNumSteps = 0
    reward_mean = 0.
    while True:
        if EnvNumSteps > threshold:
            threshold += 5000
            print("thread:", rank, "steps:", EnvNumSteps)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            ### add in option selection part
            q, logito = player.model(Variable(player.state))
            probo = F.softmax(logito, dim=1)
            player.otensor = probo.multinomial(1).data
            player.o = player.otensor.numpy()[0][0]

        else:
            player.o = player.o

        for step in range(args.num_steps):
            EnvNumSteps += 1
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        # if not player.done:
        q, logito = player.model(Variable(player.state))
        v = q.max(-1)[0]
        R = v.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = torch.zeros(1, 1)
        value_loss = torch.zeros(1, 1)
        phi_loss = torch.zeros(1, 1)
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        thesize = len(player.rewards)
        reward_sum = sum(player.rewards)
        reward_mean = reward_mean + (reward_sum -
                                     thesize * reward_mean) / EnvNumSteps
        JPi = Variable(torch.tensor(reward_mean))
        for i in reversed(range(len(player.rewards))):
            before = R
            R = args.gamma * R + player.rewards[i] - JPi
            difference = R - player.qs[i]
            if i + 1 < thesize:
                difference2 = before - player.values[i + 1]

            else:
                NextQ, NextLogito = player.model(Variable(player.state))
                NextTerm = player.model.getTermination(Variable(player.state),
                                                       player.o)
                NextProbso = F.softmax(NextLogito, dim=1)
                ### select new option
                otensor = NextProbso.multinomial(1).data
                NextLog_probso = F.log_softmax(NextLogito, dim=1)

                NextValue = NextQ.max(-1)[0]
                NextQ = NextQ[0][otensor.numpy()[0][0]]
                NextEntropyso = -(NextLog_probso * NextProbso).sum(1)
                NextLog_probso = NextLog_probso.gather(1, Variable(otensor))
                difference2 = before - NextValue

            value_loss = value_loss + 0.5 * difference.pow(2)

            policy_loss = policy_loss - player.log_probs[i] * Variable(
                difference.data) - 0.1 * player.entropies[i]

            if i + 1 < thesize:
                beta = player.termprobs[i + 1].data

                policy_loss = policy_loss - args.gamma * beta * player.log_probso[
                    i + 1] * Variable(
                        difference2.data) - 0.1 * player.entropieso[i + 1]

                ###!!!!! termination update
                advantage = player.qs[i + 1].data - player.values[
                    i + 1].data + args.delib
                phi_loss = phi_loss + args.gamma * player.termprobs[
                    i + 1] * Variable(advantage, requires_grad=False)

            else:
                beta = NextTerm.data

                policy_loss = policy_loss - args.gamma * beta * NextLog_probso * Variable(
                    difference2.data) - 0.1 * NextEntropyso

                ###!!!!! termination update
                advantage = NextQ.data - NextValue.data + args.delib
                phi_loss = phi_loss + args.gamma * NextTerm * Variable(
                    advantage, requires_grad=False)

        player.model.zero_grad()
        (phi_loss.sum() + policy_loss.sum() +
         0.5 * value_loss.sum()).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
        if str(rank) == "1":
            fullname = args.save_model_dir + args.env + str(rank) + ".torch"
            tmpname = args.save_model_dir + args.env + str(rank) + ".tmp"
            torch.save(optimizer.state_dict(),
                       tmpname)  #optimizer.state_dict()
            os.rename(tmpname, fullname)
Example #17
def train(rank, reward_type, args, shared_model, optimizer, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    env.seed(args.seed + rank)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None, reward_type)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    for i in itertools.count():
        if i % 10 == 0:
            print("reward type {0}, iter {1}".format(reward_type, i))
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            reward_sum += player.reward
            if args.count_lives:
                player.check_state()
            if player.done:
                break

        if player.done:
            num_tests += 1
            player.current_life = 0
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
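
The backward pass above is the usual A3C objective with Generalized Advantage Estimation. For reference, here is the same recursion factored into a standalone function; the name and signature are illustrative only and do not come from this file:

import torch

def a3c_losses(rewards, values, log_probs, entropies,
               gamma=0.99, tau=1.00, entropy_beta=0.01):
    # `values` holds one extra entry: the bootstrap value V(s_T) appended
    # after the rollout (zero when the episode terminated).
    R = values[-1].detach()
    gae = torch.zeros(1, 1)
    policy_loss, value_loss = 0.0, 0.0
    for i in reversed(range(len(rewards))):
        R = gamma * R + rewards[i]                          # n-step return
        value_loss = value_loss + 0.5 * (R - values[i]).pow(2)
        delta_t = rewards[i] + gamma * values[i + 1].detach() - values[i].detach()
        gae = gae * gamma * tau + delta_t                   # GAE(gamma, tau)
        policy_loss = policy_loss - log_probs[i] * gae - entropy_beta * entropies[i]
    return policy_loss, value_loss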
Beispiel #18
def train(rank, args, shared_model, optimizer, env_conf, shared_counter,
          targ_shared):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')

    torch.manual_seed(args.seed + rank)
    torch.cuda.manual_seed(args.seed + rank)

    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None, gpu_id=gpu_id)

    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).to(torch.float32)
    player.state = player.state.to(device)
    player.model = player.model.to(device)
    #player.targ_model = copy.deepcopy(player.model)

    player.model.train()
    #player.targ_model.eval()
    player.eps_len += 2
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        #player.targ_model.load_state_dict(targ_shared.state_dict())
        if player.done:
            player.cx = torch.zeros(1, 512).to(device)
            player.hx = torch.zeros(1, 512).to(device)
            #player.targ_cx = copy.deepcopy(player.cx).detach()
            #player.targ_hx = copy.deepcopy(player.hx).detach()
        else:
            player.cx = player.cx.detach()
            player.hx = player.hx.detach()

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)

        #alpha = player.model.log_alpha.exp().detach()
        alpha = .01
        #alpha = 0
        x_R = torch.zeros(1, 1)
        if not player.done:
            with torch.no_grad():
                action, value, logit, q_value, _ = player.model(
                    (player.state.unsqueeze(0), (player.hx, player.cx)))
                x_R = q_value[1].detach() - alpha * F.log_softmax(
                    logit, -1).gather(-1, action)
        x_R = x_R.to(device)
        policy_loss = 0
        adv_gae_loss = 0
        for i in reversed(range(len(player.rewards))):
            x_R = args.gamma * x_R + player.rewards[i]
            adv_gae_loss = adv_gae_loss + (player.tra_adv_gae[i][1] -
                                           x_R.detach()).pow(2) * .5
            #policy_loss = policy_loss - player.log_probs[i] * player.tra_adv_gae[i][0].detach() + alpha * player.log_probs[i] * player.log_probs[i].detach()

            policy_loss = policy_loss - (F.softmax(
                player.values[i], -1) * player.tra_adv_gae[i][0].detach()).sum(
                    -1) - alpha * player.entropies[i].unsqueeze(-1)
            #policy_loss = policy_loss - player.log_probs[i] * (x_R - (F.softmax(player.values[i], -1) *
            #        player.tra_adv_gae[i][0]).sum(-1) - alpha * player.entropies[i]).detach() + alpha * player.log_probs[i] * player.log_probs[i].detach()
            #prob = F.softmax(player.values[i], -1)
            #ent_alpha = alpha * player.entropies[i].unsqueeze(-1)
            #advs = (player.tra_adv_gae[i][0] -
            #        ((player.tra_adv_gae[i][0] * prob).sum(-1, True) +
            #         ent_alpha)).detach()
            #policy_loss = policy_loss - (prob * advs).sum(-1) - ent_alpha
            x_R = x_R - alpha * player.log_probs[i].detach()
        player.model.zero_grad()
        (policy_loss + .5 * adv_gae_loss).backward(retain_graph=False)

        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        with shared_counter.get_lock():
            shared_counter.value += len(player.rewards)
            if shared_counter.value > args.interact_steps:
                break
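
This worker stops once a counter shared across all processes exceeds args.interact_steps. A minimal, self-contained sketch of that pattern with torch.multiprocessing (the worker body and step limits below are placeholders, not taken from this file):

import torch.multiprocessing as mp

def _worker(shared_counter, max_steps, rollout_len=20):
    while True:
        # ... collect a rollout of `rollout_len` steps and update the model ...
        with shared_counter.get_lock():
            shared_counter.value += rollout_len
            if shared_counter.value > max_steps:
                return

if __name__ == '__main__':
    counter = mp.Value('i', 0)                  # process-safe integer counter
    procs = [mp.Process(target=_worker, args=(counter, 100_000))
             for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print('environment steps collected:', counter.value)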
Beispiel #19
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = Environment()  # create the environment
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    # env.seed(args.seed + rank)
    player = Agent(None, env, args, None)  # create the agent
    player.gpu_id = gpu_id
    num_actions = env.get_num_actions()

    player.model = A3Clstm(
        Config.STACKED_FRAMES,  # A3C model
        num_actions)

    player.state, available = player.env.reset()  # reset the environment
    player.state = torch.from_numpy(player.state).float()
    player.available = torch.from_numpy(available).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
            player.available = player.available.cuda()
    player.model.train()  # set training mode
    player.eps_len += 1
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())  # sync with the shared network
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))  # episode finished: reinitialize the LSTM state
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):  # T-max = 20
            player.action_train()
            if player.done:
                break

        if player.done:
            state, available = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.available = torch.from_numpy(available).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
                    player.available = player.available.cuda()

        R = torch.zeros(1, 1)  # if done : R_t-max = 0
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data  # R_t-max = V(s)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
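
Unlike the other examples, this environment also returns an `available` action mask that the agent keeps on the same device as the state. How the mask is applied happens inside action_train, which is not shown in this collection; a common approach, sketched below purely as an assumption, is to push unavailable logits to -inf before the softmax so those actions can never be sampled:

import torch
import torch.nn.functional as F

def sample_masked_action(logit, available):
    # `available` is a 0/1 float tensor with the same shape as `logit`.
    masked_logit = logit.masked_fill(available == 0, float('-inf'))
    prob = F.softmax(masked_logit, dim=-1)
    action = prob.multinomial(num_samples=1)
    log_prob = F.log_softmax(masked_logit, dim=-1).gather(-1, action)
    return action, log_prob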
Beispiel #20
    def train_rollout(self, total_step):
        storage = Storage(self.episode_C['rollout_length'])
        state = self.env._copy_state(*self.state)
        step_times = []
        # Sync.
        self.gnn.load_state_dict(self.shared_gnn.state_dict())
        for rollout_step in range(self.episode_C['rollout_length']):
            start_step_time = time.time()
            prediction = self.env.propagate(self.gnn, [state])
            action = prediction['a'].cpu().numpy()[0]
            next_state, reward, done, achieved_goal = self.env.step(action, self.ep_step, state)

            self.ep_step += 1
            if done:
                # Sync local model with shared model at start of each ep
                self.gnn.load_state_dict(self.shared_gnn.state_dict())
                self.ep_step = 0

            storage.add(prediction)
            storage.add({'r': tensor(reward, self.device).unsqueeze(-1).unsqueeze(-1),
                         'm': tensor(1 - done, self.device).unsqueeze(-1).unsqueeze(-1),
                         's': state})

            state = self.env._copy_state(*next_state)

            total_step += 1

            end_step_time = time.time()
            step_times.append(end_step_time - start_step_time)

        self.state = self.env._copy_state(*state)

        prediction = self.env.propagate(self.gnn, [state])
        storage.add(prediction)
        storage.placeholder()

        advantages = tensor(np.zeros((1, 1)), self.device)
        returns = prediction['v'].detach()
        for i in reversed(range(self.episode_C['rollout_length'])):
            # Disc. Return
            returns = storage.r[i] + self.agent_C['discount'] * storage.m[i] * returns
            # GAE
            td_error = storage.r[i] + self.agent_C['discount'] * storage.m[i] * storage.v[i + 1] - storage.v[i]
            advantages = advantages * self.agent_C['gae_tau'] * self.agent_C['discount'] * storage.m[i] + td_error
            storage.adv[i] = advantages.detach()
            storage.ret[i] = returns.detach()

        # print(returns.shape, td_error.shape, advantages.shape, storage.adv[-1].shape, storage.ret[-1].shape)

        actions, log_probs_old, returns, advantages = storage.cat(['a', 'log_pi_a', 'ret', 'adv'])
        states = [storage.s[i] for i in range(storage.size)]

        actions = actions.detach()
        log_probs_old = log_probs_old.detach()
        advantages = (advantages - advantages.mean()) / advantages.std()

        # Train
        self.gnn.train()
        batch_times = []
        train_pred_times = []
        for _ in range(self.agent_C['optimization_epochs']):
            # Sync. at start of each epoch
            self.gnn.load_state_dict(self.shared_gnn.state_dict())
            sampler = random_sample(np.arange(len(states)), self.agent_C['minibatch_size'])
            for batch_indices in sampler:
                start_batch_time = time.time()

                batch_indices_tensor = tensor(batch_indices, self.device).long()

                # Important note: these are tensors but don't carry a grad
                sampled_states = [states[i] for i in batch_indices]
                sampled_actions = actions[batch_indices_tensor]
                sampled_log_probs_old = log_probs_old[batch_indices_tensor]
                sampled_returns = returns[batch_indices_tensor]
                sampled_advantages = advantages[batch_indices_tensor]

                start_pred_time = time.time()
                prediction = self.env.propagate(self.gnn, sampled_states, sampled_actions)
                end_pred_time = time.time()
                train_pred_times.append(end_pred_time - start_pred_time)

                # Calc. Loss
                ratio = (prediction['log_pi_a'] - sampled_log_probs_old).exp()

                obj = ratio * sampled_advantages
                obj_clipped = ratio.clamp(1.0 - self.agent_C['ppo_ratio_clip'],
                                          1.0 + self.agent_C['ppo_ratio_clip']) * sampled_advantages

                # policy loss and value loss are scalars
                policy_loss = -torch.min(obj, obj_clipped).mean() - self.agent_C['entropy_weight'] * prediction['ent'].mean()

                value_loss = self.agent_C['value_loss_coef'] * (sampled_returns - prediction['v']).pow(2).mean()

                self.opt.zero_grad()
                (policy_loss + value_loss).backward()
                if self.agent_C['clip_grads']:
                    nn.utils.clip_grad_norm_(self.gnn.parameters(), self.agent_C['gradient_clip'])
                ensure_shared_grads(self.gnn, self.shared_gnn)
                self.opt.step()
                end_batch_time = time.time()
                batch_times.append(end_batch_time - start_batch_time)
        self.gnn.eval()
        return total_step, np.array(step_times).mean(), np.array(batch_times).mean(), np.array(train_pred_times).mean()
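
The minibatch loop above relies on a random_sample generator that is not defined in this collection. A plausible minimal implementation, consistent with how it is called here (shuffled index batches of size minibatch_size, with a smaller remainder batch at the end), would be:

import numpy as np

def random_sample(indices, batch_size):
    # Yield shuffled index batches; the final, smaller remainder is kept.
    indices = np.asarray(np.random.permutation(indices))
    batches = indices[:len(indices) // batch_size * batch_size].reshape(-1, batch_size)
    for batch in batches:
        yield batch
    remainder = len(indices) % batch_size
    if remainder:
        yield indices[-remainder:]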
Beispiel #21
def trainac(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = OC_env(args.env)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = ACAgent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = ACModel(player.env.observation_space.shape[0],
                           player.env.action_space, args.options, args.width)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    threshold = 0
    EnvNumSteps = 0
    reward_mean = 0.
    while True:
        if EnvNumSteps > threshold:
            threshold += 5000
            print("thread:", rank, "steps:", EnvNumSteps)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        for step in range(args.num_steps):
            EnvNumSteps += 1
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            q, logit = player.model(Variable(player.state))
            v = q.max(-1)[0]
            R = v.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = torch.zeros(1, 1)
        value_loss = torch.zeros(1, 1)
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        thesize = len(player.rewards)
        reward_sum = sum(player.rewards)
        reward_mean = reward_mean + (reward_sum -
                                     thesize * reward_mean) / EnvNumSteps
        for i in reversed(range(len(player.rewards))):
            before = R
            R = args.gamma * R + player.rewards[i]
            difference = R - player.qs[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * difference.pow(2)

            policy_loss = policy_loss - player.log_probs[i] * Variable(
                advantage.data) - 0.1 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss.sum() + 0.5 * value_loss.sum()).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
        if str(rank) == "1":
            fullname = args.save_model_dir + args.env + str(rank) + ".torch"
            tmpname = args.save_model_dir + args.env + str(rank) + ".tmp"
            torch.save(optimizer.state_dict(),
                       tmpname)  #optimizer.state_dict()
            os.rename(tmpname, fullname)
Beispiel #22
def train(rank, args, shared_model, optimizer, env_conf, num_tau_samples=32, num_tau_prime_samples=32, kappa=1.0, num_quantiles=32):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        
        R = torch.zeros(1,num_tau_prime_samples)
        if not player.done:
            logit, _, _ = player.model((Variable(
                    player.state.unsqueeze(0)), (player.hx, player.cx)))
        
            q_vals = torch.mean(logit,0)
            _, action = torch.max(q_vals,0)
            logit, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                    (player.hx, player.cx)))
            
            R = logit[:,action]

        
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()       
        #R = R.detach()
        R = Variable(R)
        
        value_loss = 0
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]

            advantage = R.repeat(num_tau_samples,1) - player.logits_array[i].repeat(1, num_tau_prime_samples)
            #print("Ad: ",advantage)
            loss = (torch.abs(advantage) <= kappa).float() * 0.5 * advantage ** 2
            #print("loss: ",loss.sum(0).sum(0), loss)
            loss += (torch.abs(advantage) > kappa).float() * kappa * (torch.abs(advantage) - 0.5 * kappa)
            #print("loss: ",loss.sum(0).sum(0), loss)
            step_loss = torch.abs(player.quantiles_array[i].cuda() - (advantage.detach()<0).float()) * loss/kappa                 
            value_loss += step_loss.sum(0).mean(0)

        
        player.model.zero_grad()
        value_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
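
The value loss in this example is a quantile-regression Huber loss computed inline. The same computation, factored into a standalone function that mirrors the shapes used above (target return samples along one axis, predicted quantiles along the other), might look like this sketch:

import torch

def quantile_huber_loss(target, pred, taus, kappa=1.0):
    # target: (1, N') bootstrapped return samples, pred: (N, 1) predicted
    # quantiles, taus: (N, 1) quantile fractions in (0, 1).
    u = target - pred                                   # pairwise TD errors, shape (N, N')
    huber = torch.where(u.abs() <= kappa,
                        0.5 * u.pow(2),
                        kappa * (u.abs() - 0.5 * kappa))
    weight = (taus - (u.detach() < 0).float()).abs()    # asymmetric quantile weight
    return (weight * huber / kappa).sum(0).mean(0)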
Beispiel #23
def train(rank, args, shared_model, optimizer, optimizer_r, env_conf, lock,
          counter):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
                    player.hx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
                player.hx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
        else:
            player.cx = [
                Variable(player.cx[0].data),
                Variable(player.cx[1].data)
            ]
            player.hx = [
                Variable(player.hx[0].data),
                Variable(player.hx[1].data)
            ]

        # check whether r_net updates have propagated here
        # ps = list(player.model.r_net.named_parameters())
        # n, v = ps[6]
        # print(v.sum())
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx[0], player.cx[0]), (player.hx[1], player.cx[1])))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        with lock:
            counter.value += 1
        # rnet
        player.model.r_net.zero_grad()
        (args.actor_weight * policy_loss +
         (1 - args.actor_weight) * value_loss).backward(retain_graph=True)
        ensure_shared_grads(player.model.r_net,
                            shared_model.r_net,
                            gpu=gpu_id >= 0)
        optimizer_r.step()

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        player.model.r_net.zero_grad()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
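
This example calls backward() twice on losses that share the same graph, which is why the first call passes retain_graph=True; without it, the graph buffers are freed and the second backward raises an error. A tiny illustration of that rule (toy tensors, not from this file):

import torch

x = torch.randn(3, requires_grad=True)
y = (2 * x).sum()
y.backward(retain_graph=True)   # keep the graph alive for a second pass
y.backward()                    # reuses the same graph; gradients accumulate
print(x.grad)                   # each element is 2 + 2 = 4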
Beispiel #24
def train(rank, args, shared_model, optimizer, env_conf):

    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    while True:
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            if args.count_lives:
                player.check_state()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
Beispiel #25
def train_func(rank,
               args,
               shared_model,
               optimizer,
               env_conf,
               datasets=None,
               shared_dict=None):
    if args.deploy:
        return
    ptitle('Train {0}'.format(rank))
    print('Start training agent: ', rank)

    if rank == 0:
        logger = Logger(args.log_dir[:-1] + '_losses/')
        train_step = 0

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    raw_list, gt_lbl_list = datasets
    env = EM_env(raw_list,
                 env_conf,
                 type="train",
                 gt_lbl_list=gt_lbl_list,
                 seed=args.seed + rank)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = get_model(args,
                             args.model,
                             env.observation_space.shape,
                             args.features,
                             atrous_rates=args.atr_rate,
                             num_actions=2,
                             split=args.data_channel,
                             gpu_id=gpu_id,
                             multi=args.multi)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        if player.done:
            player.eps_len = 0

            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward",
                          eps_reward)
                if train_step > 0:
                    pinned_eps_reward = player.env.sum_reward.mean()
                    eps_reward = 0

            if args.lstm_feats:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        player.cx, player.hx = player.model.lstm.init_hidden(
                            batch_size=1, use_cuda=True)
                else:
                    player.cx, player.hx = player.model.lstm.init_hidden(
                        batch_size=1, use_cuda=False)
        elif args.lstm_feats:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):

            if rank < args.lbl_agents:
                player.action_train(use_lbl=True)
            else:
                player.action_train()

            if rank == 0:
                eps_reward = player.env.sum_reward.mean()
            if player.done:
                break

        if player.done:
            state = player.env.reset(player.model, gpu_id)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if "3D" in args.data:
            R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1],
                            env_conf["size"][2])
        else:
            R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1])

        if args.lowres:
            R = torch.zeros(1, 1, env_conf["size"][0] // 2,
                            env_conf["size"][1] // 2)

        if not player.done:
            if args.lstm_feats:
                value, _, _ = player.model(
                    (Variable(player.state.unsqueeze(0)), (player.hx,
                                                           player.cx)))
            else:
                value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0

        if "3D" in args.data:
            gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1],
                              env_conf["size"][2])
        else:
            gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1])

        if args.rew_drop:
            keep_map = torch.tensor(player.env.keep_map)
        if args.lowres:
            gae = torch.zeros(1, 1, env_conf["size"][0] // 2,
                              env_conf["size"][1] // 2)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
                if args.rew_drop:
                    keep_map = keep_map.cuda()
        R = Variable(R)

        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])

            R = args.gamma * R + reward_i
            if args.rew_drop:
                advantage = R - player.values[i]
                value_loss = value_loss + (0.5 * advantage * advantage *
                                           keep_map).mean()
                delta_t = player.values[
                    i + 1].data * args.gamma + reward_i - player.values[i].data
                gae = gae * args.gamma * args.tau + delta_t
            else:
                advantage = R - player.values[i]
                value_loss = value_loss + (0.5 * advantage * advantage).mean()
                delta_t = player.values[
                    i + 1].data * args.gamma + reward_i - player.values[i].data
                gae = gae * args.gamma * args.tau + delta_t
            if args.noisy:
                policy_loss = policy_loss - \
                    (player.log_probs[i] * Variable(gae)).mean ()
            else:
                if args.rew_drop:
                    policy_loss = policy_loss - \
                        (player.log_probs[i] * Variable(gae) * keep_map).mean () - \
                        (args.entropy_alpha * player.entropies[i] * keep_map).mean ()
                else:
                    policy_loss = policy_loss - \
                        (player.log_probs[i] * Variable(gae)).mean () - \
                        (args.entropy_alpha * player.entropies[i]).mean ()

        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)

        curtime = time.time()
        # print ("backward curtime:", curtime)
        sum_loss.backward()
        # print ("backward done", time.time () - curtime)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)

        curtime = time.time()
        # print ("optim curtime:", curtime)
        optimizer.step()
        # print ("optim done", time.time () - curtime)

        player.clear_actions()
        if args.wctrl == "s2m":
            player.env.config["spl_w"] = shared_dict["spl_w"]
            player.env.config["mer_w"] = shared_dict["mer_w"]

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0 and train_step > 0:
                log_info = {
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: eps reward': pinned_eps_reward,
                }

                if "EX" in args.model:
                    log_info["cell_prob_loss"] = cell_prob_loss

                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
Beispiel #26
def train(rank, args, shared_model, optimizer, env_conf):
    start_time = time.time()
    ptitle('Training Agent: {}'.format(rank))
    #log = {}

    #setup_logger('{}_train_log'.format(args.env), r'{0}{1}_train_log'.format(
    #    args.log_dir, args.env))
    #log['{}_train_log'.format(args.env)] = logging.getLogger(
    #        '{}_train_log'.format(args.env))

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    if 'micropolis' in args.env.lower():
        env = micropolis_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if 'micropolis' in args.env.lower():
        modelInit = getattr(model, args.design_head)
        player.model = modelInit(player.env.observation_space.shape[0],
                                 player.env.action_space,
                                 player.env.env.env.MAP_X)
        player.lstm_sizes = player.model.getMemorySizes()
    else:
        player.model = A3Clstm(player.env.observation_space.shape[0],
                               player.env.action_space)
    lstm_size = 512
    if 'micropolis' in args.env.lower():
        if 'arcade' not in args.env.lower():
            lstm_size = (1, 16, env.env.env.MAP_X, env.env.env.MAP_Y)
    player.lstm_size = lstm_size
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    log_counter = 0
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        num_lstm_layers = len(player.lstm_sizes)
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
                    player.hx = [
                        Variable(torch.zeros(player.lstm_sizes[i]).cuda())
                        for i in range(num_lstm_layers)
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
                player.hx = [
                    Variable(torch.zeros(player.lstm_sizes[i]))
                    for i in range(num_lstm_layers)
                ]
        else:
            player.cx = [
                Variable(player.cx[i].data) for i in range(num_lstm_layers)
            ]
            player.hx = [
                Variable(player.hx[i].data) for i in range(num_lstm_layers)
            ]

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if args.randomize_exploration:
                player.certainty = np.random.uniform(0.5, 1.5)
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            values, logit, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            if values.size()[1] == 1:
                value = values
            else:
                prob = torch.nn.functional.softmax(logit, dim=1)
                action = prob.multinomial(1).data
                value = values[0][action]

            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
                R = Variable(R).cuda()
        else:
            gae = torch.zeros(1, 1)
            R = Variable(R)
        player.values.append(R)
        policy_loss = 0
        value_loss = 0

        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.rewards[i] = torch.Tensor([player.rewards[i]
                                                      ]).cuda()
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    gae = Variable(gae.cuda())
            else:
                gae = Variable(gae)
            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]

        #if log_counter % 10 == 0:
        #    log['{}_train_log'.format(args.env)].info(
        #            "Time {0}, reward {1}, policy loss {2}, value loss {3}, entropy {4}".
        #            format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
        #                '{:9.2e}'.format(float(sum(player.rewards) / len(player.rewards))),
        #                '{:9.2e}'.format(float(policy_loss.data.item())),
        #                '{:9.2e}'.format(float(value_loss.data.item())),
        #                 '{:10.8e}'.format(float(sum(player.entropies)))))
        #log_counter += 1

        optimizer.zero_grad()
        a3c = args.lmbda * (policy_loss + 0.5 * value_loss)
        a3c.backward()

        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()

        player.clear_actions()
Beispiel #27
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    print("prank:", rank, "os.pid:", os.getpid())
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = AllowBacktracking(
        make_local_env(env_conf['game'],
                       env_conf['level'],
                       stack=False,
                       scale_rew=False))
    print("Got a local env; obs space:", env.observation_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    print("player.state.shape:", player.state.shape)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            # if player.info['ale.lives'] == 0 or player.max_length:
            #    player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Beispiel #28
def train(rank, args, shared_model, optimizer, env_conf, emb, bi_grams, instructions):
    # Changes the process name
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)

    # Define special vectors
    eos_vector = emb.get_vector("<eos>")
    oov_vector = emb.get_vector("<oov>")

    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)

    # Create agent
    player = Agent(None, env, args, None, emb)
    player.gpu_id = gpu_id

    # Create DNN model for the agent
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space, emb)

    # Set env and move to gpu
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()

    # Set model to "training" mode. It has no effect here, but it is good practice.
    player.model.train()

    # Start iteration
    player.eps_len += 2

    _counter = 0
    while True:

        # Loading param values from shared model
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        # Reset LSTM state when episode ends
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, args.lstm_size).cuda())
                    player.hx = Variable(torch.zeros(1, args.lstm_size).cuda())
            else:
                player.cx = Variable(torch.zeros(1, args.lstm_size))
                player.hx = Variable(torch.zeros(1, args.lstm_size))

        # If not ended, save current state value
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        # Make a step and record observations. Repeat until num_steps reached or game is over.
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        # If episode finished before args.num_steps is reached, reset environment
        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        # If episode not finished after args.num_steps:
        # Estimates value function of current state
        R = torch.zeros(1, 1)
        if not player.done:
            _, value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                        (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        # Append reward for the final time step
        player.values.append(Variable(R))

        # Initialise loss accumulator
        policy_loss = 0
        value_loss = 0
        language_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)

        # Accumulate the losses
        for i in reversed(range(len(player.rewards))):

            # Calculating language loss
            if args.use_language:

                # Calculating language loss
                # Get action of a time step
                a = np.argmax(player.action_logits[i].detach().cpu().numpy())

                # Get produced vectors of the time step
                produced_logits = player.produced_logits[i]
                # print(produced_vectors)
                # Get target vectors of the time step (an instruction corresponding to the least cost)
                action_instructions = instructions[a]

                # Sample a few from the set
                for _ in range(10):
                    idx = random.randrange(0, len(action_instructions))
                    instruction = action_instructions[idx]


                    target_words = instruction.split()

                    for pos, target_word in enumerate(target_words):
                        target_class = torch.tensor(emb.get_index(target_word)).cuda()
                        produced_logit = produced_logits[pos]

                        # F.cross_entropy combines log-softmax and NLL:
                        # produced_logit holds raw scores over the vocabulary, target_class is an integer index
                        language_loss += torch.nn.functional.cross_entropy(produced_logit, target_class.unsqueeze(0))
                        if target_word == '<eos>':
                            break


            # Calculate other losses
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]

            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]


        # Initialise grad accumulator
        player.model.zero_grad()

        # Calculate grad and update
        if args.use_language:
            (policy_loss + 0.5 * value_loss + 0.1 * 0.01 * language_loss).backward()
        else:
            (policy_loss + 0.5 * value_loss).backward()

        """
        # (policy_loss + 0.5 * value_loss).backward()
        print("****************")
        print(policy_loss)
        print(value_loss)
        # """
        if args.use_language and _counter % 10 == 0:
            print("****************")
            #print(policy_loss)
            #print(value_loss)
            print("language loss", language_loss)
        _counter += 1

        # Copying over the parameters to shared model
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()

        # Clean agent observations
        player.clear_actions()
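
The comments in the language loss above point out that F.cross_entropy fuses log-softmax and negative log-likelihood, taking raw logits and an integer class index. A tiny check of that contract (vocabulary size and target index chosen arbitrarily):

import torch
import torch.nn.functional as F

vocab_size = 5
produced_logit = torch.randn(1, vocab_size)     # raw scores over the vocabulary
target_class = torch.tensor([3])                # integer index of the target word

loss = F.cross_entropy(produced_logit, target_class)
same = -F.log_softmax(produced_logit, dim=-1)[0, 3]
assert torch.allclose(loss, same)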
Beispiel #29
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)

    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    state = env.reset()
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.train()
    epoch = 0
    while True:

        player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.cx = Variable(torch.zeros(1, 512))
            player.hx = Variable(torch.zeros(1, 512))
            if player.starter:
                player = player_start(player, train=True)
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):

            player = player_act(player, train=True)

            if player.done:
                break

            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False
            if args.count_lives:
                if player.flag:
                    player.done = True
                    break

            if player.starter and player.flag:
                player = player_start(player, train=True)
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.flag = False

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss += 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()

        (policy_loss + value_loss).backward()

        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.values = []
        player.log_probs = []
        player.rewards = []
        player.entropies = []
Beispiel #30
def train_worker(args, shared_model, total_steps, optimizer, lock):
    env = make_env(args)
    args = args.train

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.train()

    state = env.reset()
    state = torch.FloatTensor(state)

    while True:
        model.load_state_dict(shared_model.state_dict())
        model.detach_hidden()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.update_agent_frequency):
            value, logit = model(state.unsqueeze(0))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())

            with total_steps.get_lock():
                total_steps.value += 1

            if done:
                state = env.reset()
                model.reset_hidden()

            state = torch.FloatTensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(state.unsqueeze(0))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach(
            ) - args.entropy_weight * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_weight * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        with lock:
            ensure_shared_grads(model, shared_model)
            optimizer.step()
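
All of these workers assume a parent process that puts the model parameters into shared memory and spawns one train function per process (Hogwild-style updates). A self-contained toy sketch of that launch pattern follows; the model, loss and worker count are hypothetical stand-ins, not part of this collection:

import torch
import torch.nn as nn
import torch.multiprocessing as mp

class TinyPolicy(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 2)

    def forward(self, x):
        return self.fc(x)

def toy_worker(rank, shared_model, optimizer, lock, total_steps):
    local = TinyPolicy()
    for _ in range(100):
        local.load_state_dict(shared_model.state_dict())   # sync with shared weights
        loss = local(torch.randn(1, 4)).pow(2).mean()       # stand-in for the A3C loss
        local.zero_grad()
        loss.backward()
        with lock:
            for p, sp in zip(local.parameters(), shared_model.parameters()):
                sp._grad = p.grad                            # hand gradients to the shared model
            optimizer.step()
        with total_steps.get_lock():
            total_steps.value += 1

if __name__ == '__main__':
    shared_model = TinyPolicy()
    shared_model.share_memory()                              # parameters live in shared memory
    optimizer = torch.optim.SGD(shared_model.parameters(), lr=1e-2)
    lock, total_steps = mp.Lock(), mp.Value('i', 0)
    procs = [mp.Process(target=toy_worker,
                        args=(rank, shared_model, optimizer, lock, total_steps))
             for rank in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print('total updates:', total_steps.value)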