def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): super(ACERPolicy, self).__init__(domainString, is_training) tf.reset_default_graph() self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.prev_state_check = None self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) #improvement================================== self.intrinsic_reward_method = None if cfg.has_option('scme', 'method'): self.intrinsic_reward_method = cfg.get('scme', 'method') #improvement================================== # parameter settings if 0: # cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('dqnpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.actor_lr = 0.0001 if cfg.has_option('dqnpolicy', 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr') self.critic_lr = 0.001 if cfg.has_option('dqnpolicy', 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr') self.delta = 1. if cfg.has_option('dqnpolicy', 'delta'): self.delta = cfg.getfloat('dqnpolicy', 'delta') self.alpha = 0.99 if cfg.has_option('dqnpolicy', 'beta'): self.alpha = cfg.getfloat('dqnpolicy', 'beta') self.c = 10. 
if cfg.has_option('dqnpolicy', 'is_threshold'): self.c = cfg.getfloat('dqnpolicy', 'is_threshold') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 0.99 if cfg.has_option('dqnpolicy', 'gamma'): self.gamma = cfg.getfloat('dqnpolicy', 'gamma') self.regularisation = 'l2' if cfg.has_option('dqnpolicy', 'regularisation'): self.regularisation = cfg.get('dqnpolicy', 'regularisation') self.learning_rate = 0.001 if cfg.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy', 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') self.curiosityreward = False if cfg.has_option('eval', 'curiosityreward'): self.curiosityreward = cfg.getboolean('eval', 'curiosityreward') self.epsilon = 1 if cfg.has_option('dqnpolicy', 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon') if not self.curiosityreward: # no eps-greedy exploration when curious expl. 
is used self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') else: self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') self.epsilon_end = 1 if cfg.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy', 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') self.capacity = 1000 if cfg.has_option('dqnpolicy', 'capacity'): self.capacity = cfg.getint('dqnpolicy', 'capacity') self.replay_type = 'vanilla' if cfg.has_option('dqnpolicy', 'replay_type'): self.replay_type = cfg.get('dqnpolicy', 'replay_type') self.architecture = 'vanilla' if cfg.has_option('dqnpolicy', 'architecture'): self.architecture = cfg.get('dqnpolicy', 'architecture') self.q_update = 'single' if cfg.has_option('dqnpolicy', 'q_update'): self.q_update = cfg.get('dqnpolicy', 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy', 'h1_size'): self.h1_size = cfg.getint('dqnpolicy', 'h1_size') 
self.h2_size = 50 if cfg.has_option('dqnpolicy', 'h2_size'): self.h2_size = cfg.getint('dqnpolicy', 'h2_size') self.save_step = 200 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') self.importance_sampling = 'soft' if cfg.has_option('dqnpolicy', 'importance_sampling'): self.importance_sampling = cfg.get('dqnpolicy', 'importance_sampling') self.train_iters_per_episode = 1 if cfg.has_option('dqnpolicy', 'train_iters_per_episode'): self.train_iters_per_episode = cfg.getint( 'dqnpolicy', 'train_iters_per_episode') self.training_frequency = 2 if cfg.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') # domain specific parameter settings (overrides general policy parameter settings) if cfg.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in') if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString, 'actor_lr') if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString, 'critic_lr') if cfg.has_option('dqnpolicy_' + domainString, 'delta'): self.delta = cfg.getfloat('dqnpolicy_' + domainString, 'delta') if cfg.has_option('dqnpolicy_' + domainString, 'beta'): self.alpha = cfg.getfloat('dqnpolicy_' + domainString, 'beta') if cfg.has_option('dqnpolicy_' + domainString, 'is_threshold'): self.c = cfg.getfloat('dqnpolicy_' + domainString, 'is_threshold') if cfg.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma') if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regulariser') if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate') if cfg.has_option('dqnpolicy_' + domainString, 
'exploration_type'): self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type') if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum') if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end') if cfg.has_option('dqnpolicy_' + domainString, 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads( cfg.get('dqnpolicy_' + domainString, 'features')) if cfg.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k') if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size') if cfg.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity') if cfg.has_option('dqnpolicy_' + domainString, 
'replay_type'): self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type') if cfg.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture') if cfg.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update') if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size') if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size') if cfg.has_option('policy_' + domainString, 'save_step'): self.save_step = cfg.getint('policy_' + domainString, 'save_step') if cfg.has_option('dqnpolicy_' + domainString, 'importance_sampling'): self.importance_sampling = cfg.get('dqnpolicy_' + domainString, 'importance_sampling') if cfg.has_option('dqnpolicy_' + domainString, 'train_iters_per_episode'): self.train_iters_per_episode = cfg.getint( 'dqnpolicy_' + domainString, 'train_iters_per_episode') if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency') self.episode_ct = 0 self.episode_ave_max_q = [] self.mu_prob = 0. 
# behavioral policy os.environ["CUDA_VISIBLE_DEVICES"] = "" # init session self.sess = tf.Session() with tf.device("/cpu:0"): np.random.seed(self.randomseed) tf.set_random_seed(self.randomseed) # initialise an replay buffer if self.replay_type == 'vanilla': self.episodes[self.domainString] = ReplayBufferEpisode( self.capacity, self.minibatch_size, self.randomseed) elif self.replay_type == 'prioritized': self.episodes[self.domainString] = ReplayPrioritisedEpisode( self.capacity, self.minibatch_size, self.randomseed) #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) #self.episodes = [] self.samplecount = 0 self.episodecount = 0 # construct the models self.state_dim = self.n_in self.summaryaction = SummaryAction.SummaryAction(domainString) self.action_dim = len(self.summaryaction.action_names) action_bound = len(self.summaryaction.action_names) self.stats = [0 for _ in range(self.action_dim)] self.global_mu = [0. for _ in range(self.action_dim)] self.acer = acer.ACERNetwork(self.sess, self.state_dim, self.action_dim, \ self.critic_lr, self.delta, self.c, self.alpha, self.h1_size, self.h2_size, self.is_training, self.randomseed) # when all models are defined, init all variables init_op = tf.global_variables_initializer() self.sess.run(init_op) self.loadPolicy(self.in_policy_file) print 'loaded replay size: ', self.episodes[ self.domainString].size() #improvement================================== #initial if self.intrinsic_reward_method == 'vime': self.vime_model = vime(self.state_dim, self.action_dim) self.vime_model.load_model('model/vime_model/' + self.in_policy_file) elif self.intrinsic_reward_method == 'cme': self.cme_model = cme(self.state_dim, self.action_dim) self.cme_model.load_model('model/cme_model/' + self.in_policy_file) elif self.intrinsic_reward_method == 'scme': self.scme_model = scme(self.state_dim, self.action_dim) self.scme_model.load_model('model/scme_model/' + self.in_policy_file)
def __init__(self, in_policy_file, out_policy_file, domainString='CamRestaurants', is_training=False): super(ENACPolicy, self).__init__(domainString, is_training) tf.reset_default_graph() self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.prev_state_check = None self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString) # parameter settings if 0:#cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('dqnpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.actor_lr = 0.0001 if cfg.has_option('dqnpolicy', 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy', 'actor_lr') self.critic_lr = 0.001 if cfg.has_option('dqnpolicy', 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy', 'critic_lr') self.tau = 0.001 if cfg.has_option('dqnpolicy', 'tau'): self.tau = cfg.getfloat('dqnpolicy', 'tau') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 1.0 if cfg.has_option('dqnpolicy', 'gamma'): self.gamma = cfg.getfloat('dqnpolicy', 'gamma') self.regularisation = 'l2' if cfg.has_option('dqnpolicy', 'regularisation'): self.regularisation = cfg.get('dqnpolicy', 'regulariser') self.learning_rate = 0.001 if cfg.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy', 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') self.epsilon = 1 if cfg.has_option('dqnpolicy', 'epsilon'): self.epsilon = 
cfg.getfloat('dqnpolicy', 'epsilon') self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') self.epsilon_end = 1 if cfg.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy', 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') self.capacity = 1000 # max(self.minibatch_size, 2000) if cfg.has_option('dqnpolicy', 'capacity'): self.capacity = cfg.getint('dqnpolicy', 'capacity') self.replay_type = 'vanilla' if cfg.has_option('dqnpolicy', 'replay_type'): self.replay_type = cfg.get('dqnpolicy', 'replay_type') self.architecture = 'vanilla' if cfg.has_option('dqnpolicy', 'architecture'): self.architecture = cfg.get('dqnpolicy', 'architecture') self.q_update = 'single' if cfg.has_option('dqnpolicy', 'q_update'): self.q_update = cfg.get('dqnpolicy', 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy', 'h1_size'): self.h1_size = cfg.getint('dqnpolicy', 'h1_size') self.h2_size = 50 if cfg.has_option('dqnpolicy', 'h2_size'): self.h2_size = 
cfg.getint('dqnpolicy', 'h2_size') self.save_step = 200 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') self.importance_sampling = 'soft' if cfg.has_option('dqnpolicy', 'importance_sampling'): self.importance_sampling = cfg.get('dqnpolicy', 'importance_sampling') self.training_frequency = 2 if cfg.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') # domain specific parameter settings (overrides general policy parameter settings) if cfg.has_option('dqnpolicy_' + domainString, 'n_in'): self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in') if cfg.has_option('dqnpolicy_' + domainString, 'actor_lr'): self.actor_lr = cfg.getfloat('dqnpolicy_' + domainString, 'actor_lr') if cfg.has_option('dqnpolicy_' + domainString, 'critic_lr'): self.critic_lr = cfg.getfloat('dqnpolicy_' + domainString, 'critic_lr') if cfg.has_option('dqnpolicy_' + domainString, 'tau'): self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau') if cfg.has_option('dqnpolicy_' + domainString, 'gamma'): self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma') if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'): self.regularisation = cfg.get('dqnpolicy_' + domainString, 'regulariser') if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString, 'learning_rate') if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy_' + domainString, 'exploration_type') if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString, 'episodeNum') if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon') if 
cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_start') if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon_end') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_start') if cfg.has_option('dqnpolicy_' + domainString, 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString, 'prior_sample_prob_end') if cfg.has_option('dqnpolicy_' + domainString, 'features'): logger.info('Features: ' + str(cfg.get('dqnpolicy_' + domainString, 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy_' + domainString, 'features')) if cfg.has_option('dqnpolicy_' + domainString, 'max_k'): self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy_' + domainString, 'learning_algorithm') logger.info('Learning algorithm: ' + self.learning_algorithm) if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy_' + domainString, 'minibatch_size') if cfg.has_option('dqnpolicy_' + domainString, 'capacity'): self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity') if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'): self.replay_type = cfg.get('dqnpolicy_' + domainString, 'replay_type') if cfg.has_option('dqnpolicy_' + domainString, 'architecture'): self.architecture = cfg.get('dqnpolicy_' + domainString, 'architecture') if cfg.has_option('dqnpolicy_' + domainString, 'q_update'): self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update') if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'): self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size') if 
cfg.has_option('dqnpolicy_' + domainString, 'h2_size'): self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size') if cfg.has_option('policy_' + domainString, 'save_step'): self.save_step = cfg.getint('policy_' + domainString, 'save_step') if cfg.has_option('dqnpolicy_' + domainString, 'importance_sampling'): self.importance_sampling = cfg.get('dqnpolicy_' + domainString, 'importance_sampling') if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy_' + domainString, 'training_frequency') self.natural_gradient_prev = 0. """ self.shuffle = False if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'): self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay') if not self.shuffle: # If we don't use experience replay, we don't need to maintain # sliding window of experiences with maximum capacity. # We only need to maintain the data of minibatch_size self.capacity = self.minibatch_size """ self.episode_ave_max_q = [] self.mu_prob = 0. 
# behavioral policy os.environ["CUDA_VISIBLE_DEVICES"]="" # init session self.sess = tf.Session() with tf.device("/cpu:0"): np.random.seed(self.randomseed) tf.set_random_seed(self.randomseed) # initialise an replay buffer if self.replay_type == 'vanilla': self.episodes[self.domainString] = ReplayBufferEpisode(self.capacity, self.minibatch_size, self.randomseed) elif self.replay_type == 'prioritized': self.episodes[self.domainString] = ReplayPrioritisedEpisode(self.capacity, self.minibatch_size, self.randomseed) #replay_buffer = ReplayBuffer(self.capacity, self.randomseed) #self.episodes = [] self.samplecount = 0 self.episodecount = 0 # construct the models self.state_dim = self.n_in self.summaryaction = SummaryAction.SummaryAction(domainString) self.action_dim = len(self.summaryaction.action_names) action_bound = len(self.summaryaction.action_names) self.stats = [0 for _ in range(self.action_dim)] self.enac = enac.ENACNetwork(self.sess, self.state_dim, self.action_dim, \ self.critic_lr, self.tau, action_bound, self.architecture, self.h1_size, self.h2_size, self.is_training) # when all models are defined, init all variables init_op = tf.global_variables_initializer() self.sess.run(init_op) self.loadPolicy(self.in_policy_file) print 'loaded replay size: ', self.episodes[self.domainString].size()
def __init__(self, in_policy_file, out_policy_file, ontology, cfg, logger, SetObj, domainString='CamRestaurants', is_training=False): super(STRACPolicy, self).__init__(domainString, ontology, cfg, logger, SetObj, is_training) self.domainString = domainString self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString, cfg, ontology.OntologyUtils, SetObj) self.in_policy_file = in_policy_file self.out_policy_file = out_policy_file self.is_training = is_training self.accum_belief = [] self.prev_state_check = None self.ontology = ontology self.logger = logger self.SetObj =SetObj # parameter settings if 0:#cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until i can check it deeper self.n_in = cfg.getint('dqnpolicy', 'n_in') else: self.n_in = self.get_n_in(domainString) self.actor_critic_combine = True self.learning_rate = 0.001 if cfg.has_option('dqnpolicy', 'learning_rate'): self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate') self.tau = 0.001 if cfg.has_option('dqnpolicy', 'tau'): self.tau = cfg.getfloat('dqnpolicy', 'tau') self.randomseed = 1234 if cfg.has_option('GENERAL', 'seed'): self.randomseed = cfg.getint('GENERAL', 'seed') self.gamma = 1.0 if cfg.has_option('dqnpolicy', 'gamma'): self.gamma = cfg.getfloat('dqnpolicy', 'gamma') self.gamma = 0.9 self.regularisation = 'l2' if cfg.has_option('dqnpolicy', 'regularisation'): self.regularisation = cfg.get('dqnpolicy', 'regulariser') self.exploration_type = 'e-greedy' # Boltzman if cfg.has_option('dqnpolicy', 'exploration_type'): self.exploration_type = cfg.get('dqnpolicy', 'exploration_type') self.episodeNum = 1000 if cfg.has_option('dqnpolicy', 'episodeNum'): self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum') self.maxiter = 5000 if cfg.has_option('dqnpolicy', 'maxiter'): self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter') self.maxiter += 2000 self.epsilon = 1 if cfg.has_option('dqnpolicy', 'epsilon'): self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon') 
self.epsilon_start = 1 if cfg.has_option('dqnpolicy', 'epsilon_start'): self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start') # self.epsilon_start = 0.9 self.epsilon_end = 1 if cfg.has_option('dqnpolicy', 'epsilon_end'): self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end') self.save_step = 100 if cfg.has_option('policy', 'save_step'): self.save_step = cfg.getint('policy', 'save_step') self.priorProbStart = 1.0 if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'): self.priorProbStart = cfg.getfloat('dqnpolicy', 'prior_sample_prob_start') self.priorProbEnd = 0.1 if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'): self.priorProbEnd = cfg.getfloat('dqnpolicy', 'prior_sample_prob_end') self.policyfeatures = [] if cfg.has_option('dqnpolicy', 'features'): self.logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features'))) self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features')) self.max_k = 5 if cfg.has_option('dqnpolicy', 'max_k'): self.max_k = cfg.getint('dqnpolicy', 'max_k') self.learning_algorithm = 'drl' if cfg.has_option('dqnpolicy', 'learning_algorithm'): self.learning_algorithm = cfg.get('dqnpolicy', 'learning_algorithm') self.logger.info('Learning algorithm: ' + self.learning_algorithm) self.minibatch_size = 32 if cfg.has_option('dqnpolicy', 'minibatch_size'): self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size') self.capacity = 1000 # max(self.minibatch_size, 2000) if cfg.has_option('dqnpolicy', 'capacity'): self.capacity = max(cfg.getint('dqnpolicy', 'capacity'), 2000) self.replay_type = 'vanilla' if cfg.has_option('dqnpolicy', 'replay_type'): self.replay_type = cfg.get('dqnpolicy', 'replay_type') self.architecture = 'vanilla' if cfg.has_option('dqnpolicy', 'architecture'): self.architecture = cfg.get('dqnpolicy', 'architecture') self.q_update = 'double' if cfg.has_option('dqnpolicy', 'q_update'): self.q_update = cfg.get('dqnpolicy', 'q_update') self.h1_size = 130 if cfg.has_option('dqnpolicy', 'h1_size'): 
self.h1_size = cfg.getint('dqnpolicy', 'h1_size') self.h1_drop = None if cfg.has_option('dqnpolicy', 'h1_drop'): self.h1_drop = cfg.getfloat('dqnpolicy', 'h1_drop') self.h2_size = 130 if cfg.has_option('dqnpolicy', 'h2_size'): self.h2_size = cfg.getint('dqnpolicy', 'h2_size') self.h2_drop = None if cfg.has_option('dqnpolicy', 'h2_drop'): self.h2_drop = cfg.getfloat('dqnpolicy', 'h2_drop') self.nature_mode = None if cfg.has_option('dqnpolicy', 'nature_mode'): self.nature_mode = cfg.getboolean('dqnpolicy', 'nature_mode') self.madqn_hidden_layers = None if cfg.has_option('dqnpolicy', 'madqn_hidden_layers'): self.madqn_hidden_layers = cfg.getint('dqnpolicy', 'madqn_hidden_layers') self.madqn_local_hidden_units = None if cfg.has_option('dqnpolicy', 'madqn_local_hidden_units'): self.madqn_local_hidden_units = cfg.get('dqnpolicy', 'madqn_local_hidden_units') self.madqn_local_hidden_units = eval(self.madqn_local_hidden_units) self.madqn_local_dropouts = None if cfg.has_option('dqnpolicy', 'madqn_local_dropouts'): self.madqn_local_dropouts = cfg.get('dqnpolicy', 'madqn_local_dropouts') self.madqn_local_dropouts = eval(self.madqn_local_dropouts) self.madqn_global_hidden_units = None if cfg.has_option('dqnpolicy', 'madqn_global_hidden_units'): self.madqn_global_hidden_units = cfg.get('dqnpolicy', 'madqn_global_hidden_units') self.madqn_global_hidden_units = eval(self.madqn_global_hidden_units) self.madqn_global_dropouts = None if cfg.has_option('dqnpolicy', 'madqn_global_dropouts'): self.madqn_global_dropouts = cfg.get('dqnpolicy', 'madqn_global_dropouts') self.madqn_global_dropouts = eval(self.madqn_global_dropouts) self.madqn_private_rate = None if cfg.has_option('dqnpolicy', 'madqn_private_rate'): self.madqn_private_rate = cfg.getfloat('dqnpolicy', 'madqn_private_rate') self.madqn_sort_input_vec = False if cfg.has_option('dqnpolicy', 'madqn_sort_input_vec'): self.madqn_sort_input_vec = cfg.getboolean('dqnpolicy', 'madqn_sort_input_vec') self.madqn_share_last_layer = False 
if cfg.has_option('dqnpolicy', 'madqn_share_last_layer'): self.madqn_share_last_layer = cfg.getboolean('dqnpolicy', 'madqn_share_last_layer') self.madqn_shared_last_layer_use_bias = True if cfg.has_option('dqnpolicy', 'madqn_shared_last_layer_use_bias'): self.madqn_shared_last_layer_use_bias = cfg.getboolean('dqnpolicy', 'madqn_shared_last_layer_use_bias') self.madqn_recurrent_mode = False if cfg.has_option('dqnpolicy', 'madqn_recurrent_mode'): self.madqn_recurrent_mode = cfg.getboolean('dqnpolicy', 'madqn_recurrent_mode') self.madqn_input_comm = True if cfg.has_option('dqnpolicy', 'madqn_input_comm'): self.madqn_input_comm = cfg.getboolean('dqnpolicy', 'madqn_input_comm') self.madqn_target_explore = False if cfg.has_option('dqnpolicy', 'madqn_target_explore'): self.madqn_target_explore = cfg.getboolean('dqnpolicy', 'madqn_target_explore') self.madqn_concrete_share_rate = False if cfg.has_option('dqnpolicy', 'madqn_concrete_share_rate'): self.madqn_concrete_share_rate = cfg.getboolean('dqnpolicy', 'madqn_concrete_share_rate') self.madqn_dropout_regularizer = 0. if cfg.has_option('dqnpolicy', 'madqn_dropout_regularizer'): self.madqn_dropout_regularizer = cfg.getfloat('dqnpolicy', 'madqn_dropout_regularizer') self.madqn_weight_regularizer = 0. 
if cfg.has_option('dqnpolicy', 'madqn_weight_regularizer'): self.madqn_weight_regularizer = cfg.getfloat('dqnpolicy', 'madqn_weight_regularizer') self.madqn_non_local_mode = False if cfg.has_option('dqnpolicy', 'madqn_non_local_mode'): self.madqn_non_local_mode = cfg.getboolean('dqnpolicy', 'madqn_non_local_mode') self.madqn_block_mode = False if cfg.has_option('dqnpolicy', 'madqn_block_mode'): self.madqn_block_mode = cfg.getboolean('dqnpolicy', 'madqn_block_mode') self.madqn_slots_comm = True if cfg.has_option('dqnpolicy', 'madqn_slots_comm'): self.madqn_slots_comm = cfg.getboolean('dqnpolicy', 'madqn_slots_comm') self.madqn_use_dueling = False if cfg.has_option('dqnpolicy', 'madqn_use_dueling'): self.madqn_use_dueling = cfg.getboolean('dqnpolicy', 'madqn_use_dueling') self.madqn_topo_learning_mode = False if cfg.has_option('dqnpolicy', 'madqn_topo_learning_mode'): self.madqn_topo_learning_mode = cfg.getboolean('dqnpolicy', 'madqn_topo_learning_mode') self.madqn_message_embedding = False if cfg.has_option('dqnpolicy', 'madqn_message_embedding'): self.madqn_message_embedding = cfg.getboolean('dqnpolicy', 'madqn_message_embedding') self.madqn_dueling_share_last = False if cfg.has_option('dqnpolicy', 'madqn_dueling_share_last'): self.madqn_dueling_share_last = cfg.getboolean('dqnpolicy', 'madqn_dueling_share_last') self.state_feature = 'vanilla' if cfg.has_option('dqnpolicy', 'state_feature'): self.state_feature = cfg.get('dqnpolicy', 'state_feature') self.init_policy = None if cfg.has_option('dqnpolicy', 'init_policy'): self.init_policy = cfg.get('dqnpolicy', 'init_policy') self.training_frequency = 2 if cfg.has_option('dqnpolicy', 'training_frequency'): self.training_frequency = cfg.getint('dqnpolicy', 'training_frequency') self.importance_sampling = True if cfg.has_option('dqnpolicy', 'importance_sampling'): self.importance_sampling = cfg.getint('dqnpolicy', 'importance_sampling') # domain specific parameter settings (overrides general policy parameter settings) 
# Per-domain overrides: options in section 'dqnpolicy_<domain>' take
# precedence over the generic 'dqnpolicy' defaults read above.
domain_sec = 'dqnpolicy_' + domainString
if cfg.has_option(domain_sec, 'n_in'):
    self.n_in = cfg.getint(domain_sec, 'n_in')
if cfg.has_option(domain_sec, 'learning_rate'):
    self.learning_rate = cfg.getfloat(domain_sec, 'learning_rate')
if cfg.has_option(domain_sec, 'tau'):
    self.tau = cfg.getfloat(domain_sec, 'tau')
if cfg.has_option(domain_sec, 'gamma'):
    self.gamma = cfg.getfloat(domain_sec, 'gamma')
if cfg.has_option(domain_sec, 'regularisation'):
    # BUG FIX: the original checked for option 'regularisation' but then
    # read 'regulariser', which raised NoOptionError whenever only
    # 'regularisation' was configured (the generic 'dqnpolicy' section
    # above uses 'regularisation' for both check and read).
    self.regularisation = cfg.get(domain_sec, 'regularisation')
if cfg.has_option(domain_sec, 'exploration_type'):
    self.exploration_type = cfg.get(domain_sec, 'exploration_type')
if cfg.has_option(domain_sec, 'episodeNum'):
    self.episodeNum = cfg.getfloat(domain_sec, 'episodeNum')
if cfg.has_option(domain_sec, 'maxiter'):
    self.maxiter = cfg.getfloat(domain_sec, 'maxiter')
if cfg.has_option(domain_sec, 'epsilon'):
    self.epsilon = cfg.getfloat(domain_sec, 'epsilon')
if cfg.has_option(domain_sec, 'epsilon_start'):
    self.epsilon_start = cfg.getfloat(domain_sec, 'epsilon_start')
if cfg.has_option(domain_sec, 'epsilon_end'):
    self.epsilon_end = cfg.getfloat(domain_sec, 'epsilon_end')
# NOTE: save_step deliberately comes from 'policy_<domain>', not
# 'dqnpolicy_<domain>'.
if cfg.has_option('policy_' + domainString, 'save_step'):
    self.save_step = cfg.getint('policy_' + domainString, 'save_step')
if cfg.has_option(domain_sec, 'prior_sample_prob_start'):
    self.priorProbStart = cfg.getfloat(domain_sec, 'prior_sample_prob_start')
if cfg.has_option(domain_sec, 'prior_sample_prob_end'):
    self.priorProbEnd = cfg.getfloat(domain_sec, 'prior_sample_prob_end')
if cfg.has_option(domain_sec, 'features'):
    self.logger.info('Features: ' + str(cfg.get(domain_sec, 'features')))
    # The option value is expected to be a JSON list of feature names.
    self.policyfeatures = json.loads(cfg.get(domain_sec, 'features'))
if cfg.has_option(domain_sec, 'max_k'):
    self.max_k = cfg.getint(domain_sec, 'max_k')
if cfg.has_option(domain_sec, 'learning_algorithm'):
    self.learning_algorithm = cfg.get(domain_sec, 'learning_algorithm')
    self.logger.info('Learning algorithm: ' + self.learning_algorithm)
if cfg.has_option(domain_sec, 'minibatch_size'):
    self.minibatch_size = cfg.getint(domain_sec, 'minibatch_size')
if cfg.has_option(domain_sec, 'capacity'):
    # Replay capacity is clamped to a minimum of 2000.
    self.capacity = max(cfg.getint(domain_sec, 'capacity'), 2000)
if cfg.has_option(domain_sec, 'replay_type'):
    self.replay_type = cfg.get(domain_sec, 'replay_type')
if cfg.has_option(domain_sec, 'architecture'):
    self.architecture = cfg.get(domain_sec, 'architecture')
if cfg.has_option(domain_sec, 'q_update'):
    self.q_update = cfg.get(domain_sec, 'q_update')
if cfg.has_option(domain_sec, 'h1_size'):
    self.h1_size = cfg.getint(domain_sec, 'h1_size')
if cfg.has_option(domain_sec, 'h1_drop'):
    self.h1_drop = cfg.getfloat(domain_sec, 'h1_drop')
if cfg.has_option(domain_sec, 'h2_size'):
    self.h2_size = cfg.getint(domain_sec, 'h2_size')
if cfg.has_option(domain_sec, 'h2_drop'):
    self.h2_drop = cfg.getfloat(domain_sec, 'h2_drop')
if cfg.has_option(domain_sec, 'training_frequency'):
    self.training_frequency = cfg.getint(domain_sec, 'training_frequency')

# Disabled legacy logic, kept for reference:
"""
self.shuffle = False
if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
    self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
if not self.shuffle:
    # If we don't use experience replay, we don't need to maintain
    # sliding window of experiences with maximum capacity.
    # We only need to maintain the data of minibatch_size
    self.capacity = self.minibatch_size
"""

self.episode_ave_max_q = []

# Hide all GPUs so TensorFlow runs on CPU only.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# init session
# self.sess = tf.Session()
# with tf.device("/cpu:0"):
np.random.seed(self.randomseed)
# tf.set_random_seed(self.randomseed)

# Initialise the per-domain episode replay buffer.
# NOTE(review): a replay_type other than 'vanilla'/'prioritized' leaves
# self.episodes[self.domainString] unset and fails later — confirm intended.
if self.replay_type == 'vanilla':
    self.episodes[self.domainString] = ReplayBufferEpisode(
        self.capacity, self.minibatch_size, self.randomseed)
elif self.replay_type == 'prioritized':
    self.episodes[self.domainString] = ReplayPrioritisedEpisode(
        self.capacity, self.minibatch_size, self.randomseed)
self.samplecount = 0
self.episodecount = 0

# construct the models
self.state_dim = self.n_in
self.summaryaction = SummaryAction.SummaryAction(domainString, self.ontology, self.SetObj)
self.action_dim = len(self.summaryaction.action_names)
action_bound = len(self.summaryaction.action_names)
# Per-action selection counters.
self.stats = [0 for _ in range(self.action_dim)]

import tube  # supplies the shared RNG seed passed to the network below
# NOTE(review): the madqn_* / state_feature / init_policy attributes are
# assumed to be set earlier in __init__ (not visible here) — confirm.
self.strac = strac.STRACNetwork(
    self.state_dim, self.action_dim, self.learning_rate, self.tau,
    action_bound, self.minibatch_size, self.architecture,
    self.h1_size, self.h1_drop, self.h2_size, self.h2_drop,
    self.domainString, self.madqn_hidden_layers,
    self.madqn_local_hidden_units, self.madqn_local_dropouts,
    self.madqn_global_hidden_units, self.madqn_global_dropouts,
    self.madqn_private_rate, self.madqn_sort_input_vec,
    self.madqn_share_last_layer, self.madqn_recurrent_mode,
    self.madqn_input_comm, self.madqn_target_explore,
    concrete_share_rate=self.madqn_concrete_share_rate,
    dropout_regularizer=self.madqn_dropout_regularizer,
    weight_regularizer=self.madqn_weight_regularizer,
    non_local_mode=self.madqn_non_local_mode,
    block_mode=self.madqn_block_mode,
    slots_comm=self.madqn_slots_comm,
    topo_learning_mode=self.madqn_topo_learning_mode,
    use_dueling=self.madqn_use_dueling,
    dueling_share_last=self.madqn_dueling_share_last,
    message_embedding=self.madqn_message_embedding,
    state_feature=self.state_feature,
    init_policy=self.init_policy,
    shared_last_layer_use_bias=self.madqn_shared_last_layer_use_bias,
    seed=tube.seed)

# Load (and immediately checkpoint) the policy under the global lock.
# BUG FIX: release the lock in a finally block so an exception in
# loadPolicy/savePolicyInc cannot leave the lock held forever.
lock.acquire()
try:
    self.loadPolicy(self.in_policy_file)
    self.savePolicyInc()
finally:
    lock.release()
print(self.domainString + ' loaded replay size: ' + str(self.episodes[self.domainString].size()))
# current_thread().name replaces the deprecated currentThread().getName()
# (same value, non-deprecated spelling).
Settings.load_policy(self.strac, threading.current_thread().name)