def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(
        args.log,
        "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
    env.reset()
    for num_eps in range(args.episode_num):
        terminal = False
        env.reset()
        loss = 0
        cnt = 0
        tot_reward = 0

        probe = None
        if args.env_name == "dst":
            probe = FloatTensor([0.8, 0.2])
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

        while not terminal:
            state = env.observe()
            action = agent.act(state)
            next_state, reward, terminal = env.step(action)
            if args.log:
                monitor.add_log(state, action, reward, terminal, agent.w_kept)
            agent.memorize(state, action, next_state, reward, terminal)
            loss += agent.learn()
            if cnt > 100:
                terminal = True
                agent.reset()
            tot_reward = tot_reward + (
                probe.cpu().numpy().dot(reward)) * np.power(args.gamma, cnt)
            cnt = cnt + 1

        _, q = agent.predict(probe)

        if args.env_name == "dst":
            act_1 = q[0, 3]
            act_2 = q[0, 1]
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            act_1 = q[0, 1]
            act_2 = q[0, 0]

        if args.method == "crl-naive":
            act_1 = act_1.data.cpu()
            act_2 = act_2.data.cpu()
        elif args.method == "crl-envelope":
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)
        elif args.method == "crl-energy":
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)

        print(
            "end of eps %d with total reward (1) %0.2f, the Q is %0.2f | %0.2f; loss: %0.4f" % (
                num_eps,
                tot_reward,
                act_1,
                act_2,
                # q__max,
                loss / cnt))

        monitor.update(
            num_eps,
            tot_reward,
            act_1,
            act_2,
            # q__max,
            loss / cnt)

        # parenthesized so the checkpoint actually fires every 500 episodes
        # (num_eps + 1 % 500 never equals 0 due to operator precedence)
        if (num_eps + 1) % 500 == 0:
            agent.save(
                args.save,
                "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
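# --- Hedged usage sketch (not part of the original training script) ---
# A minimal example of how this train() loop might be driven. The flag names
# simply mirror the `args` attributes the loop reads (method, log, model,
# env_name, name, episode_num, gamma, save); `env`, `agent`, `Monitor` and
# `FloatTensor` come from the surrounding project and are left as placeholders.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--method", default="crl-envelope",
                        choices=["crl-naive", "crl-envelope", "crl-energy"])
    parser.add_argument("--model", default="linear")
    parser.add_argument("--env-name", dest="env_name", default="dst")
    parser.add_argument("--name", default="run0")
    parser.add_argument("--episode-num", dest="episode_num", type=int, default=2000)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--log", default="logs/")
    parser.add_argument("--save", default="saved/")
    args = parser.parse_args()

    # env = ...    # multi-objective environment exposing reset()/observe()/step()
    # agent = ...  # MORL agent exposing act()/memorize()/learn()/predict()/save()
    # train(env, agent, args)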
class MORLPolicy(Policy.Policy):
    '''Derived from :class:`Policy`
    '''

    def __init__(self, in_policy_file, out_policy_file,
                 domainString='CamRestaurants', is_training=False,
                 action_names=None):
        super(MORLPolicy, self).__init__(domainString, is_training)

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []
        self.prev_state_check = None

        # parameter settings
        if 0:  # cfg.has_option('morlpolicy', 'n_in'):
            # ic304: this was giving me a weird error, disabled it until i can check it deeper
            self.n_in = cfg.getint('morlpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.n_rew = 1
        if cfg.has_option('morlpolicy', 'n_rew'):
            self.n_rew = cfg.getint('morlpolicy', 'n_rew')

        self.lr = 0.001
        if cfg.has_option('morlpolicy', 'learning_rate'):
            self.lr = cfg.getfloat('morlpolicy', 'learning_rate')

        self.epsilon = 0.5
        if cfg.has_option('morlpolicy', 'epsilon'):
            self.epsilon = cfg.getfloat('morlpolicy', 'epsilon')

        self.epsilon_decay = True
        if cfg.has_option('morlpolicy', 'epsilon_decay'):
            self.epsilon_decay = cfg.getboolean('morlpolicy', 'epsilon_decay')

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('morlpolicy', 'gamma'):
            self.gamma = cfg.getfloat('morlpolicy', 'gamma')

        self.weight_num = 32
        if cfg.has_option('morlpolicy', 'weight_num'):
            self.weight_num = cfg.getint('morlpolicy', 'weight_num')

        self.episode_num = 1000
        if cfg.has_option('morlpolicy', 'episode_num'):
            self.episode_num = cfg.getfloat('morlpolicy', 'episode_num')

        self.optimizer = "Adam"
        if cfg.has_option('morlpolicy', 'optimizer'):
            self.optimizer = cfg.get('morlpolicy', 'optimizer')

        self.save_step = 100
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        self.update_freq = 50
        if cfg.has_option('morlpolicy', 'update_freq'):
            self.update_freq = cfg.getint('morlpolicy', 'update_freq')

        self.policyfeatures = []
        if cfg.has_option('morlpolicy', 'features'):
            logger.info('Features: ' + str(cfg.get('morlpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('morlpolicy', 'features'))

        self.algorithm = 'naive'
        if cfg.has_option('morlpolicy', 'algorithm'):
            self.algorithm = cfg.get('morlpolicy', 'algorithm')
        logger.info('Learning algorithm: ' + self.algorithm)

        self.batch_size = 32
        if cfg.has_option('morlpolicy', 'batch_size'):
            self.batch_size = cfg.getint('morlpolicy', 'batch_size')

        self.mem_size = 1000
        if cfg.has_option('morlpolicy', 'mem_size'):
            self.mem_size = cfg.getint('morlpolicy', 'mem_size')

        self.training_freq = 1
        if cfg.has_option('morlpolicy', 'training_freq'):
            self.training_freq = cfg.getint('morlpolicy', 'training_freq')

        # set beta for envelope algorithm
        self.beta = 0.1
        if cfg.has_option('morlpolicy', 'beta'):
            self.beta = cfg.getfloat('morlpolicy', 'beta')
        self.beta_init = self.beta
        self.beta_uplim = 1.00
        self.tau = 1000.
        self.beta_expbase = float(
            np.power(self.tau * (self.beta_uplim - self.beta),
                     1. / (self.episode_num + 1)))
        self.beta_delta = self.beta_expbase / self.tau
        self.beta -= self.beta_delta

        # using homotopy method for optimization
        self.homotopy = False
        if cfg.has_option('morlpolicy', 'homotopy'):
            self.homotopy = cfg.getboolean('morlpolicy', 'homotopy')

        self.epsilon_delta = (self.epsilon - 0.05) / self.episode_num

        self.episodecount = 0

        # construct the models
        self.state_dim = self.n_in
        self.summaryaction = SummaryAction.SummaryAction(domainString)
        if action_names is None:
            self.action_names = self.summaryaction.action_names
        else:
            self.action_names = action_names
        self.action_dim = len(self.action_names)
        self.stats = [0 for _ in range(self.action_dim)]
        self.reward_dim = self.n_rew

        model = None
        if self.algorithm == 'naive':
            model = naive.NaiveLinearCQN(self.state_dim, self.action_dim,
                                         self.reward_dim)
        elif self.algorithm == 'envelope':
            model = envelope.EnvelopeLinearCQN(self.state_dim, self.action_dim,
                                               self.reward_dim)

        self.model_ = model
        self.model = copy.deepcopy(model)

        # initialize memory
        self.trans_mem = deque()
        self.trans = namedtuple('trans',
                                ['s', 'a', 's_', 'r', 'd', 'ms', 'ms_'])
        self.priority_mem = deque()

        self.mem_last_state = None
        self.mem_last_action = None
        self.mem_last_mask = None
        self.mem_cur_state = None
        self.mem_cur_action = None
        self.mem_cur_mask = None

        if self.optimizer == 'Adam':
            self.optimizer = optim.Adam(self.model_.parameters(), lr=self.lr)
        elif self.optimizer == 'RMSprop':
            self.optimizer = optim.RMSprop(self.model_.parameters(), lr=self.lr)

        try:
            self.loadPolicy(self.in_policy_file)
        except Exception:
            logger.info("No previous model found...")

        self.w_kept = None
        self.update_count = 0
        if self.is_training:
            self.model_.train()
        if use_cuda:
            self.model.cuda()
            self.model_.cuda()

        self.monitor = None

    def get_n_in(self, domain_string):
        if domain_string == 'CamRestaurants':
            return 268
        elif domain_string == 'CamHotels':
            return 111
        elif domain_string == 'SFRestaurants':
            return 636
        elif domain_string == 'SFHotels':
            return 438
        elif domain_string == 'Laptops6':
            return 268  # ic340: this is wrong
        elif domain_string == 'Laptops11':
            return 257
        elif domain_string == 'TV':
            return 188
        else:
            print('DOMAIN {} SIZE NOT SPECIFIED, PLEASE DEFINE n_in'.format(
                domain_string))

    def act_on(self, state, preference=None):
        if self.lastSystemAction is None and self.startwithhello:
            systemAct, nextaIdex = 'hello()', -1
        else:
            systemAct, nextaIdex = self.nextAction(state, preference)
        self.lastSystemAction = systemAct
        self.summaryAct = nextaIdex
        self.prevbelief = state
        systemAct = DiaAct.DiaAct(systemAct)
        return systemAct

    def record(self, reward, domainInControl=None, weight=None, state=None,
               action=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.actToBeRecorded is None:
            self.actToBeRecorded = self.summaryAct
        if state is None:
            state = self.prevbelief
        if action is None:
            action = self.actToBeRecorded

        cState, cAction = self.convertStateAction(state, action)
        execMask = self.summaryaction.getExecutableMask(state, cAction)
        execMask = torch.Tensor(execMask).type(FloatTensor)

        # # normalising total return to -1~1
        # reward /= 20.0

        self.mem_last_state = self.mem_cur_state
        self.mem_last_action = self.mem_cur_action
        self.mem_last_mask = self.mem_cur_mask
        self.mem_cur_state = np.vstack(
            [np.expand_dims(x, 0) for x in [cState]])
        # self.mem_cur_action = np.eye(self.action_dim, self.action_dim)[[cAction]]
        self.mem_cur_action = cAction
        self.mem_cur_mask = execMask

        state = self.mem_last_state
        action = self.mem_last_action
        next_state = self.mem_cur_state
        terminal = False

        if state is not None and action is not None:
            self.trans_mem.append(
                self.trans(
                    torch.from_numpy(state).type(FloatTensor),  # state
                    action,  # action
                    torch.from_numpy(next_state).type(FloatTensor),  # next state
                    torch.from_numpy(reward).type(FloatTensor),  # reward
                    terminal,  # terminal
                    self.mem_last_mask,  # action mask
                    self.mem_cur_mask))  # next action mask

            # randomly produce a preference for calculating priority
            # preference = self.w_kept
            preference = torch.randn(self.model_.reward_size)
            preference = (torch.abs(preference) /
                          torch.norm(preference, p=1)).type(FloatTensor)
            state = torch.from_numpy(state).type(FloatTensor)

            _, q = self.model_(
                Variable(state, requires_grad=False),
                Variable(preference.unsqueeze(0), requires_grad=False),
                execmask=Variable(self.mem_last_mask.unsqueeze(0),
                                  requires_grad=False))
            q = q[0, action].data

            if self.algorithm == 'naive':
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(
                        Variable(next_state, requires_grad=False),
                        Variable(preference.unsqueeze(0), requires_grad=False),
                        execmask=Variable(self.mem_cur_mask.unsqueeze(0),
                                          requires_grad=False))
                    hq = hq.data[0]
                    p = abs(wr + self.gamma * hq - q)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    p = abs(wr - q)
            elif self.algorithm == 'envelope':
                wq = preference.dot(q)
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(
                        Variable(next_state, requires_grad=False),
                        Variable(preference.unsqueeze(0), requires_grad=False),
                        execmask=Variable(self.mem_cur_mask.unsqueeze(0),
                                          requires_grad=False))
                    hq = hq.data[0]
                    whq = preference.dot(hq)
                    p = abs(wr + self.gamma * whq - wq)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    # if self.homotopy:
                    #     self.beta += self.beta_delta
                    #     self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta
                    p = abs(wr - wq)

            p += 1e-5
            self.priority_mem.append(p)
            if len(self.trans_mem) > self.mem_size:
                self.trans_mem.popleft()
                self.priority_mem.popleft()

        self.actToBeRecorded = None

    def finalizeRecord(self, reward, domainInControl=None):
        if domainInControl is None:
            domainInControl = self.domainString
        if self.episodes[domainInControl] is None:
            logger.warning(
                "record attempted to be finalized for domain where nothing has been recorded before"
            )
            return

        # # normalising total return to -1~1
        # reward /= 20.0

        terminal_state, terminal_action = self.convertStateAction(
            TerminalState(), TerminalAction())

        self.mem_last_state = self.mem_cur_state
        self.mem_last_action = self.mem_cur_action
        self.mem_last_mask = self.mem_cur_mask
        self.mem_cur_state = np.vstack(
            [np.expand_dims(x, 0) for x in [terminal_state]])
        self.mem_cur_action = None
        self.mem_cur_mask = torch.zeros(self.action_dim).type(FloatTensor)

        state = self.mem_last_state
        action = self.mem_last_action
        next_state = self.mem_cur_state
        terminal = True

        if state is not None:
            self.trans_mem.append(
                self.trans(
                    torch.from_numpy(state).type(FloatTensor),  # state
                    action,  # action
                    torch.from_numpy(next_state).type(FloatTensor),  # next state
                    torch.from_numpy(reward).type(FloatTensor),  # reward
                    terminal,  # terminal
                    self.mem_last_mask,  # action mask
                    self.mem_cur_mask))  # next action mask

            # randomly produce a preference for calculating priority
            # preference = self.w_kept
            preference = torch.randn(self.model_.reward_size)
            preference = (torch.abs(preference) /
                          torch.norm(preference, p=1)).type(FloatTensor)
            state = torch.from_numpy(state).type(FloatTensor)

            _, q = self.model_(
                Variable(state, requires_grad=False),
                Variable(preference.unsqueeze(0), requires_grad=False))
            q = q.data[0, action]

            if self.algorithm == 'naive':
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(
                        Variable(next_state, requires_grad=False),
                        Variable(preference.unsqueeze(0), requires_grad=False))
                    hq = hq.data[0]
                    p = abs(wr + self.gamma * hq - q)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    p = abs(wr - q)
            elif self.algorithm == 'envelope':
                wq = preference.dot(q)
                wr = preference.dot(torch.from_numpy(reward).type(FloatTensor))
                if not terminal:
                    next_state = torch.from_numpy(next_state).type(FloatTensor)
                    hq, _ = self.model_(
                        Variable(next_state, requires_grad=False),
                        Variable(preference.unsqueeze(0), requires_grad=False))
                    hq = hq.data[0]
                    whq = preference.dot(hq)
                    p = abs(wr + self.gamma * whq - wq)
                else:
                    self.w_kept = None
                    # if self.epsilon_decay:
                    #     self.epsilon -= self.epsilon_delta
                    # if self.homotopy:
                    #     self.beta += self.beta_delta
                    #     self.beta_delta = (self.beta - self.beta_init) * self.beta_expbase + self.beta_init - self.beta
                    p = abs(wr - wq)

            p += 1e-5
            self.priority_mem.append(p)
            if len(self.trans_mem) > self.mem_size:
                self.trans_mem.popleft()
                self.priority_mem.popleft()

    def convertStateAction(self, state, action):
        '''
        nnType = 'dnn'
        #nnType = 'rnn'
        # expand one dimension to match the batch size of 1 at axis 0
        if nnType == 'rnn':
            belief = np.expand_dims(belief, axis=0)
        '''
        if isinstance(state, TerminalState):
            if self.domainUtil.domainString == 'CamRestaurants':
                return [0] * 268, action
            elif self.domainUtil.domainString == 'CamHotels':
                return [0] * 111, action
            elif self.domainUtil.domainString == 'SFRestaurants':
                return [0] * 633, action
            elif self.domainUtil.domainString == 'SFHotels':
                return [0] * 438, action
            elif self.domainUtil.domainString == 'Laptops11':
                return [0] * 257, action
            elif self.domainUtil.domainString == 'TV':
                return [0] * 188, action
        else:
            flat_belief = flatten_belief(state, self.domainUtil)
            self.prev_state_check = flat_belief
            return flat_belief, action

    def convertDIPStateAction(self, state, action):
        '''
        '''
        if isinstance(state, TerminalState):
            return [0] * 89, action
        else:
            dip_state = DIP_state(state.domainStates[state.currentdomain],
                                  self.domainString)
            action_name = self.actions.action_names[action]
            act_slot = 'general'
            for slot in dip_state.slots:
                if slot in action_name:
                    act_slot = slot
            flat_belief = dip_state.get_beliefStateVec(act_slot)
            self.prev_state_check = flat_belief
            return flat_belief, action

    def nextAction(self, beliefstate, preference=None):
        '''
        select next action

        :param beliefstate:
        :param preference:
        :returns: (int) next summary action
        '''
        beliefVec = flatten_belief(beliefstate, self.domainUtil)
        execMask = self.summaryaction.getExecutableMask(
            beliefstate, self.lastSystemAction)
        execMask = torch.Tensor(execMask).type(FloatTensor)

        if preference is None:
            if self.w_kept is None:
                self.w_kept = torch.randn(self.model_.reward_size)
                self.w_kept = (torch.abs(self.w_kept) /
                               torch.norm(self.w_kept, p=1)).type(FloatTensor)
            preference = self.w_kept

        if self.is_training and (len(self.trans_mem) < self.batch_size * 10
                                 or torch.rand(1)[0] < self.epsilon):
            # epsilon-greedy exploration over the executable actions only
            admissible = [i for i, x in enumerate(execMask) if x == 0.0]
            random.shuffle(admissible)
            nextaIdex = admissible[0]
        else:
            state = np.reshape(beliefVec, (1, len(beliefVec)))
            state = torch.from_numpy(state).type(FloatTensor)
            if self.algorithm == 'naive':
                _, Q = self.model_(
                    Variable(state, requires_grad=False),
                    Variable(preference.unsqueeze(0), requires_grad=False),
                    Variable(execMask.unsqueeze(0), requires_grad=False))
                nextaIdex = np.argmax(Q.detach().cpu().numpy())
            elif self.algorithm == 'envelope':
                _, Q = self.model_(
                    Variable(state, requires_grad=False),
                    Variable(preference.unsqueeze(0), requires_grad=False),
                    execmask=Variable(execMask.unsqueeze(0),
                                      requires_grad=False))
                Q = Q.view(-1, self.model_.reward_size)
                Q = torch.mv(Q.data, preference)
                action = Q.max(0)[1].cpu().numpy()
                nextaIdex = int(action)

        self.stats[nextaIdex] += 1
        summaryAct = self.action_names[nextaIdex]
        beliefstate = beliefstate.getDomainState(self.domainUtil.domainString)
        masterAct = self.summaryaction.Convert(beliefstate, summaryAct,
                                               self.lastSystemAction)
        return masterAct, nextaIdex

    def sample(self, pop, pri, k):
        # priority-proportional sampling without replacement
        pri = np.array(pri).astype(float)
        inds = np.random.choice(range(len(pop)), k,
                                replace=False,
                                p=pri / pri.sum())
        return [pop[i] for i in inds]

    def actmsk(self, num_dim, index):
        mask = ByteTensor(num_dim).zero_()
        mask[index] = 1
        return mask.unsqueeze(0)

    def nontmlinds(self, terminal_batch):
        mask = ByteTensor(terminal_batch)
        inds = torch.arange(0, len(terminal_batch)).type(LongTensor)
        inds = inds[mask.eq(0)]
        return inds

    def train(self):
        '''
        call this function when the episode ends
        '''
        self.episodecount += 1
        if self.monitor is None:
            self.monitor = Monitor("-" + self.algorithm)

        if not self.is_training:
            logger.info("Not in training mode")
            return
        else:
            logger.info("Update naive morl policy parameters.")

        logger.info("Episode Num so far: %s" % (self.episodecount))

        if len(self.trans_mem) > self.batch_size * 10:
            self.update_count += 1

            minibatch = self.sample(self.trans_mem, self.priority_mem,
                                    self.batch_size)
            batchify = lambda x: list(x) * self.weight_num
            state_batch = batchify(map(lambda x: x.s, minibatch))
            action_batch = batchify(map(lambda x: LongTensor([x.a]), minibatch))
            reward_batch = batchify(map(lambda x: x.r.unsqueeze(0), minibatch))
            next_state_batch = batchify(map(lambda x: x.s_, minibatch))
            terminal_batch = batchify(map(lambda x: x.d, minibatch))
            mask_batch = batchify(map(lambda x: x.ms.unsqueeze(0), minibatch))
            next_mask_batch = batchify(
                map(lambda x: x.ms_.unsqueeze(0), minibatch))

            w_batch = np.random.randn(self.weight_num, self.model_.reward_size)
            w_batch = np.abs(w_batch) / np.linalg.norm(
                w_batch, ord=1, axis=1, keepdims=True)
            w_batch = torch.from_numpy(w_batch.repeat(
                self.batch_size, axis=0)).type(FloatTensor)

            if self.algorithm == 'naive':
                __, Q = self.model_(Variable(torch.cat(state_batch, dim=0)),
                                    Variable(w_batch),
                                    Variable(torch.cat(mask_batch, dim=0)))
                # detach since we don't want gradients to propagate
                # HQ, _ = self.model_(Variable(torch.cat(next_state_batch, dim=0), volatile=True),
                #                     Variable(w_batch, volatile=True))
                _, DQ = self.model(
                    Variable(torch.cat(next_state_batch, dim=0),
                             requires_grad=False),
                    Variable(w_batch, requires_grad=False),
                    Variable(torch.cat(next_mask_batch, dim=0),
                             requires_grad=False))
                _, act = self.model_(
                    Variable(torch.cat(next_state_batch, dim=0),
                             requires_grad=False),
                    Variable(w_batch, requires_grad=False),
                    Variable(torch.cat(next_mask_batch, dim=0),
                             requires_grad=False))[1].max(1)
                HQ = DQ.gather(1, act.unsqueeze(dim=1)).squeeze()

                w_reward_batch = torch.bmm(
                    w_batch.unsqueeze(1),
                    torch.cat(reward_batch, dim=0).unsqueeze(2)).squeeze()

                nontmlmask = self.nontmlinds(terminal_batch)
                with torch.no_grad():
                    Tau_Q = Variable(
                        torch.zeros(self.batch_size *
                                    self.weight_num).type(FloatTensor))
                    Tau_Q[nontmlmask] = self.gamma * HQ[nontmlmask]
                    Tau_Q += Variable(w_reward_batch)

                actions = Variable(torch.cat(action_batch, dim=0))

                # Compute Huber loss
                loss = F.smooth_l1_loss(Q.gather(1, actions.unsqueeze(dim=1)),
                                        Tau_Q.unsqueeze(dim=1))

            elif self.algorithm == 'envelope':
                action_size = self.model_.action_size
                reward_size = self.model_.reward_size

                __, Q = self.model_(Variable(torch.cat(state_batch, dim=0)),
                                    Variable(w_batch),
                                    w_num=self.weight_num,
                                    execmask=Variable(
                                        torch.cat(mask_batch, dim=0)))

                # detach since we don't want gradients to propagate
                # HQ, _ = self.model_(Variable(torch.cat(next_state_batch, dim=0), volatile=True),
                #                     Variable(w_batch, volatile=True), w_num=self.weight_num)
                _, DQ = self.model(
                    Variable(torch.cat(next_state_batch, dim=0),
                             requires_grad=False),
                    Variable(w_batch, requires_grad=False),
                    execmask=Variable(torch.cat(next_mask_batch, dim=0),
                                      requires_grad=False))
                w_ext = w_batch.unsqueeze(2).repeat(1, action_size, 1)
                w_ext = w_ext.view(-1, self.model.reward_size)
                _, tmpQ = self.model_(
                    Variable(torch.cat(next_state_batch, dim=0),
                             requires_grad=False),
                    Variable(w_batch, requires_grad=False),
                    execmask=Variable(torch.cat(next_mask_batch, dim=0),
                                      requires_grad=False))

                tmpQ = tmpQ.view(-1, reward_size)
                # print(torch.bmm(w_ext.unsqueeze(1),
                #                 tmpQ.data.unsqueeze(2)).view(-1, action_size))
                act = torch.bmm(
                    Variable(w_ext.unsqueeze(1), requires_grad=False),
                    tmpQ.unsqueeze(2)).view(-1, action_size).max(1)[1]

                HQ = DQ.gather(
                    1,
                    act.view(-1, 1, 1).expand(DQ.size(0), 1,
                                              DQ.size(2))).squeeze()

                nontmlmask = self.nontmlinds(terminal_batch)
                with torch.no_grad():
                    Tau_Q = Variable(
                        torch.zeros(self.batch_size * self.weight_num,
                                    reward_size).type(FloatTensor))
                    Tau_Q[nontmlmask] = self.gamma * HQ[nontmlmask]
                    # Tau_Q.volatile = False
                    Tau_Q += Variable(torch.cat(reward_batch, dim=0))

                actions = Variable(torch.cat(action_batch, dim=0))

                Q = Q.gather(
                    1,
                    actions.view(-1, 1, 1).expand(Q.size(0), 1,
                                                  Q.size(2))).view(-1, reward_size)
                Tau_Q = Tau_Q.view(-1, reward_size)

                wQ = torch.bmm(Variable(w_batch.unsqueeze(1)),
                               Q.unsqueeze(2)).squeeze()
                wTQ = torch.bmm(Variable(w_batch.unsqueeze(1)),
                                Tau_Q.unsqueeze(2)).squeeze()

                # loss = F.mse_loss(Q.view(-1), Tau_Q.view(-1))
                # print self.beta
                loss = self.beta * F.mse_loss(wQ.view(-1), wTQ.view(-1))
                loss += (1 - self.beta) * F.mse_loss(Q.view(-1), Tau_Q.view(-1))

            self.optimizer.zero_grad()
            loss.backward()
            for param in self.model_.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

            if self.update_count % self.update_freq == 0:
                self.model.load_state_dict(self.model_.state_dict())

            self.monitor.update(self.episodecount, loss=loss.data)

        self.savePolicyInc()  # self.out_policy_file)

    def savePolicy(self, FORCE_SAVE=False):
        """
        Not used here: the agent would call this after every episode, but we
        only want to save the policy periodically (see savePolicyInc).
        """
        pass

    def savePolicyInc(self, FORCE_SAVE=False):
        """
        save model and replay buffer
        """
        if self.episodecount % self.save_step == 0:
            torch.save(
                self.model,
                "{}.{}.pkl".format(self.out_policy_file, self.algorithm))

    def loadPolicy(self, filename):
        """
        load model and replay buffer
        """
        # load models
        self.model_ = torch.load("{}.{}.pkl".format(filename, self.algorithm))
        self.model = copy.deepcopy(self.model_)

    def restart(self):
        self.summaryAct = None
        self.lastSystemAction = None
        self.prevbelief = None
        self.actToBeRecorded = None
        self.w_kept = None
        if self.epsilon_decay:
            self.epsilon -= self.epsilon_delta
        if self.homotopy:
            self.beta += self.beta_delta
            self.beta_delta = ((self.beta - self.beta_init) * self.beta_expbase
                               + self.beta_init - self.beta)
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(
        args.log,
        "roi_m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
    env.reset()

    S = set()
    corWs = queue.Queue()

    # add two extreme points
    corWs.put(FloatTensor([1.0, 0.0]))
    corWs.put(FloatTensor([0.0, 1.0]))

    # outer_loop!
    for _ in range(args.ws):
        print(colored("size of corWs: {}".format(corWs.qsize()), "green"))

        if corWs.qsize() == 0:
            corWs.put(FloatTensor([1.0, 0.0]))
            corWs.put(FloatTensor([0.0, 1.0]))

        corner_w = corWs.get_nowait()
        while not is_corner(corner_w, S) and corWs.qsize() > 0:
            corner_w = corWs.get_nowait()
            print(colored("{} left....".format(corWs.qsize()), "green"))

        if not is_corner(corner_w, S):
            print(colored("no more corner w...", "green"))
            print(colored("Final S contains", "green"))
            for s in S:
                print(colored(s, "green"))
            break

        print(colored("solve for w: {}".format(corner_w), "green"))

        for num_eps in range(int(args.episode_num / args.ws)):
            terminal = False
            env.reset()
            loss = 0
            cnt = 0
            tot_reward = 0
            tot_reward_mo = 0

            probe = None
            if args.env_name == "dst":
                probe = corner_w
            elif args.env_name in ['ft', 'ft5', 'ft7']:
                probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

            while not terminal:
                state = env.observe()
                action = agent.act(state, corner_w)
                agent.w_kept = corner_w
                next_state, reward, terminal = env.step(action)
                if args.log:
                    monitor.add_log(state, action, reward, terminal,
                                    agent.w_kept)
                agent.memorize(state, action, next_state, reward, terminal,
                               roi=True)
                loss += agent.learn(corner_w)
                if cnt > 100:
                    terminal = True
                    agent.reset()
                tot_reward = tot_reward + (
                    probe.cpu().numpy().dot(reward)) * np.power(args.gamma, cnt)
                tot_reward_mo = tot_reward_mo + reward * np.power(args.gamma, cnt)
                cnt = cnt + 1

            _, q = agent.predict(probe)

            if args.env_name == "dst":
                act_1 = q[0, 3]
                act_2 = q[0, 1]
            elif args.env_name in ['ft', 'ft5', 'ft7']:
                act_1 = q[0, 1]
                act_2 = q[0, 0]

            if args.method == "crl-naive":
                act_1 = act_1.data.cpu()
                act_2 = act_2.data.cpu()
            elif args.method == "crl-envelope":
                act_1 = probe.dot(act_1.data)
                act_2 = probe.dot(act_2.data)
            elif args.method == "crl-energy":
                act_1 = probe.dot(act_1.data)
                act_2 = probe.dot(act_2.data)

            print("end of eps %d with total reward (1) %0.2f (%0.2f, %0.2f), the Q is %0.2f | %0.2f; loss: %0.4f" % (
                num_eps,
                tot_reward,
                tot_reward_mo[0],
                tot_reward_mo[1],
                act_1,
                act_2,
                # q__max,
                loss / cnt))

            monitor.update(num_eps,
                           tot_reward,
                           act_1,
                           act_2,
                           # q__max,
                           loss / cnt)

        # roll out the trained policy once for the current corner weight
        # agent.is_train = False
        terminal = False
        env.reset()
        cnt = 0
        tot_reward_mo = 0
        while not terminal:
            state = env.observe()
            action = agent.act(state, corner_w)
            agent.w_kept = corner_w
            next_state, reward, terminal = env.step(action)
            if cnt > 100:
                terminal = True
                agent.reset()
            tot_reward_mo = tot_reward_mo + reward * np.power(args.gamma, cnt)
            cnt = cnt + 1
        agent.is_train = True

        S, corWs = update_ccs(S, corWs, tot_reward_mo)
        print(colored("----------------\n", "red"))
        print(colored("Current S contains", "red"))
        for s in S:
            print(colored(s, "red"))
        print(colored("----------------\n", "red"))

        # if num_eps+1 % 100 == 0:
        #     agent.save(args.save, args.model+args.name+"_tmp_{}".format(number))

    agent.save(
        args.save,
        "roi_m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
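# --- Hedged illustration (not the project's is_corner/update_ccs) ---
# The outer loop above repeatedly pops a candidate corner weight, trains
# against it, and feeds the resulting multi-objective return back through
# update_ccs. As a rough two-objective illustration of where such corner
# weights come from, the generic sketch below intersects the scalarization
# lines induced by the returns found so far.
import numpy as np


def candidate_corner_weights_2d(S):
    """Each value vector V in S defines a line f_V(w1) = w1*V[0] + (1-w1)*V[1]
    over w1 in [0, 1]. Pairwise intersections of these lines (plus the two
    extremes) are candidate corner weights; a full CCS routine would keep only
    those lying on the upper envelope. Illustrative sketch only."""
    corners = {0.0, 1.0}
    vs = [np.asarray(v, dtype=float) for v in S]
    for i in range(len(vs)):
        for j in range(i + 1, len(vs)):
            a, b = vs[i], vs[j]
            denom = (a[0] - a[1]) - (b[0] - b[1])
            if abs(denom) < 1e-12:
                continue  # parallel scalarization lines never intersect
            w1 = (b[1] - a[1]) / denom
            if 0.0 < w1 < 1.0:
                corners.add(round(w1, 6))
    return sorted(corners)


# three returns found so far -> weights where the best policy may change
print(candidate_corner_weights_2d([(10.0, 0.0), (0.0, 10.0), (7.0, 7.0)]))
# [0.0, 0.3, 0.5, 0.7, 1.0]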
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(
        args.log,
        "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
    env.reset()
    initial_state = env.observe()
    for num_eps in range(args.episode_num):
        terminal = False
        env.reset()
        loss = 0
        cnt = 0
        act1 = 0
        act2 = 0
        tot_reward = 0
        tot_reward_nc = 0
        tot_reward_dist = 0
        mask = None
        next_mask = None

        probe = None
        if args.env_name == "dst":
            probe = FloatTensor([0.8, 0.2])
        elif args.env_name == "crp":
            probe = FloatTensor([0.5, 0.5])
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

        while not terminal:
            t_now = time.time()
            state = env.observe()
            t_obs = time.time() - t_now

            t_now = time.time()
            if args.env_name == "crp":
                mask = env.env.get_action_out_mask()
            action = agent.act(state, mask=mask)
            t_policy = time.time() - t_now

            t_now = time.time()
            next_state, reward, terminal = env.step(action, step=0.5)
            t_step = time.time() - t_now

            if args.env_name == "crp":
                next_mask = env.env.get_action_out_mask()
            if args.log:
                monitor.add_log(state, action, reward, terminal, agent.w_kept)

            t_now = time.time()
            agent.memorize(state, action, next_state, reward, terminal, mask,
                           next_mask)
            t_mem = time.time() - t_now

            t_now = time.time()
            loss += agent.learn()
            t_learn = time.time() - t_now

            if terminal:
                # terminal = True
                t_now = time.time()
                agent.reset()
                t_reset = time.time() - t_now

            tot_reward = tot_reward + (probe.cpu().numpy().dot(reward))
            act1 += reward[0]
            act2 += reward[1]
            tot_reward_nc = tot_reward_nc + 1 - reward[0]
            tot_reward_dist = tot_reward_dist + env.env.get_distortion(
                absolute=True, tollerance=0) / 10
            cnt = cnt + 1

        # act_1/act_2 default to the per-objective reward sums so they are
        # defined even when the q-based branches below stay commented out
        act_1 = act1
        act_2 = act2

        # _, q = agent.predict(probe, initial_state=initial_state)
        # if args.env_name == "dst":
        #     act_1 = q[0, 3]
        #     act_2 = q[0, 1]
        if args.env_name == "crp":
            act_1 = act1
            act_2 = act2
        # elif args.env_name in ['ft', 'ft5', 'ft7']:
        #     act_1 = q[0, 1]
        #     act_2 = q[0, 0]

        # if args.method == "crl-naive":
        #     act_1 = act_1.data.cpu()
        #     act_2 = act_2.data.cpu()
        # elif args.method == "crl-envelope":
        #     act_1 = probe.dot(act_1.data)
        #     act_2 = probe.dot(act_2.data)
        # elif args.method == "crl-energy":
        #     act_1 = probe.dot(act_1.data)
        #     act_2 = probe.dot(act_2.data)

        print(
            "end of eps %d with total reward (1) %0.2f, the Q is %0.2f | %0.2f; loss: %0.4f; total_nc: %0.2f; total_dist: %0.2f; beta: %0.2f; eps: %0.2f;" % (
                num_eps,
                tot_reward,
                act_1,
                act_2,
                # q__max,
                loss / cnt,
                tot_reward_nc,
                tot_reward_dist,
                agent.beta,
                agent.epsilon))

        # print("t_obs: %0.2f; t_policy: %0.2f; t_step: %0.2f; t_mem: %0.2f; t_learn: %0.2f; t_reset: %0.2f" % (
        #     t_obs,
        #     t_policy,
        #     t_step,
        #     t_mem,
        #     t_learn,
        #     t_reset))

        monitor.update(
            num_eps,
            tot_reward,
            act_1,
            act_2,
            # q__max,
            loss / cnt)

        if num_eps % 10 == 0:
            agent.save(
                args.save,
                "m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
            agent.save(
                args.save,
                "m.{}_e.{}_n.{}.ep{}".format(args.model, args.env_name,
                                             args.name, num_eps // 100))