    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False,
                 action_names=None):
        super(DQNPolicy, self).__init__(domainString, is_training)

        tf.reset_default_graph()
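        # Note: clearing the TF1 default graph above lets this policy be re-instantiated
        # in the same process without name clashes with variables from a previous instance.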

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []

        self.prev_state_check = None

        # pw: Use turn info for predictions
        # action vector creation
        if action_names is None:
            # hardcoded default: slots for specific actions (request, confirm, select)
            action_names = [
                "request(food)", "request(area)", "request(pricerange)",
                "confirm(food)", "confirm(area)", "confirm(pricerange)",
                "select(food)", "select(area)", "select(pricerange)", "inform",
                "inform_byname", "inform_alternatives", "bye", "repeat", "reqmore",
                "restart"
            ]
        num_actions = len(action_names)
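        # food, area and pricerange correspond to the (system-requestable) slots of the
        # CamRestaurants domain; the remaining entries are domain-independent summary acts.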
        self.prev_state = None

        # parameter settings
        if 0:  #cfg.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until I can check it more deeply
            self.n_in = cfg.getint('dqnpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.learning_rate = 0.001
        if cfg.has_option('dqnpolicy', 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy', 'learning_rate')

        self.tau = 0.001
        if cfg.has_option('dqnpolicy', 'tau'):
            self.tau = cfg.getfloat('dqnpolicy', 'tau')
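        # tau is the soft target-network update rate; the target network is assumed to
        # track the online network as target = tau * online + (1 - tau) * target
        # (applied via self.dqn.update_target_network() below).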

        self.randomseed = 1234
        if cfg.has_option('GENERAL', 'seed'):
            self.randomseed = cfg.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if cfg.has_option('dqnpolicy', 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy', 'gamma')

        self.regularisation = 'l2'
        if cfg.has_option('dqnpolicy', 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy', 'regularisation')

        self.exploration_type = 'e-greedy'  # alternatively: Boltzmann
        if cfg.has_option('dqnpolicy', 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy', 'exploration_type')

        self.episodeNum = 1000
        if cfg.has_option('dqnpolicy', 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy', 'episodeNum')

        self.maxiter = 5000
        if cfg.has_option('dqnpolicy', 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy', 'maxiter')

        self.epsilon = 1
        if cfg.has_option('dqnpolicy', 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy', 'epsilon')

        self.epsilon_start = 1
        if cfg.has_option('dqnpolicy', 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy', 'epsilon_start')

        self.epsilon_end = 1
        if cfg.has_option('dqnpolicy', 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy', 'epsilon_end')
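        # epsilon, epsilon_start and epsilon_end parameterise e-greedy exploration;
        # the schedule that anneals epsilon between the start and end values is assumed
        # to be applied elsewhere in the policy (e.g. once per training episode).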

        self.save_step = 100
        if cfg.has_option('policy', 'save_step'):
            self.save_step = cfg.getint('policy', 'save_step')

        self.priorProbStart = 1.0
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy',
                                               'prior_sample_prob_start')

        self.priorProbEnd = 0.1
        if cfg.has_option('dqnpolicy', 'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy',
                                             'prior_sample_prob_end')

        self.policyfeatures = []
        if cfg.has_option('dqnpolicy', 'features'):
            logger.info('Features: ' + str(cfg.get('dqnpolicy', 'features')))
            self.policyfeatures = json.loads(cfg.get('dqnpolicy', 'features'))

        self.max_k = 5
        if cfg.has_option('dqnpolicy', 'max_k'):
            self.max_k = cfg.getint('dqnpolicy', 'max_k')

        self.learning_algorithm = 'drl'
        if cfg.has_option('dqnpolicy', 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy',
                                              'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if cfg.has_option('dqnpolicy', 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy', 'minibatch_size')

        self.capacity = 1000
        if cfg.has_option('dqnpolicy', 'capacity'):
            self.capacity = cfg.getint('dqnpolicy', 'capacity')

        self.replay_type = 'vanilla'
        if cfg.has_option('dqnpolicy', 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy', 'replay_type')
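        # 'vanilla' selects the uniformly sampled ReplayBuffer, 'prioritized' the
        # ReplayPrioritised buffer (see the buffer construction further below).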

        self.architecture = 'vanilla'
        if cfg.has_option('dqnpolicy', 'architecture'):
            self.architecture = cfg.get('dqnpolicy', 'architecture')
            if self.architecture == 'dip':
                self.architecture = 'dip2'

        self.q_update = 'single'
        if cfg.has_option('dqnpolicy', 'q_update'):
            self.q_update = cfg.get('dqnpolicy', 'q_update')

        self.h1_size = 130
        if cfg.has_option('dqnpolicy', 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy', 'h1_size')

        self.h2_size = 130
        if cfg.has_option('dqnpolicy', 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy', 'h2_size')

        self.training_frequency = 2
        if cfg.has_option('dqnpolicy', 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy',
                                                 'training_frequency')

        # domain specific parameter settings (overrides general policy parameter settings)
        if cfg.has_option('dqnpolicy_' + domainString, 'n_in'):
            self.n_in = cfg.getint('dqnpolicy_' + domainString, 'n_in')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_rate'):
            self.learning_rate = cfg.getfloat('dqnpolicy_' + domainString,
                                              'learning_rate')

        if cfg.has_option('dqnpolicy_' + domainString, 'tau'):
            self.tau = cfg.getfloat('dqnpolicy_' + domainString, 'tau')

        if cfg.has_option('dqnpolicy_' + domainString, 'gamma'):
            self.gamma = cfg.getfloat('dqnpolicy_' + domainString, 'gamma')

        if cfg.has_option('dqnpolicy_' + domainString, 'regularisation'):
            self.regularisation = cfg.get('dqnpolicy_' + domainString,
                                          'regularisation')

        if cfg.has_option('dqnpolicy_' + domainString, 'exploration_type'):
            self.exploration_type = cfg.get('dqnpolicy_' + domainString,
                                            'exploration_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'episodeNum'):
            self.episodeNum = cfg.getfloat('dqnpolicy_' + domainString,
                                           'episodeNum')

        if cfg.has_option('dqnpolicy_' + domainString, 'maxiter'):
            self.maxiter = cfg.getfloat('dqnpolicy_' + domainString, 'maxiter')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon'):
            self.epsilon = cfg.getfloat('dqnpolicy_' + domainString, 'epsilon')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_start'):
            self.epsilon_start = cfg.getfloat('dqnpolicy_' + domainString,
                                              'epsilon_start')

        if cfg.has_option('dqnpolicy_' + domainString, 'epsilon_end'):
            self.epsilon_end = cfg.getfloat('dqnpolicy_' + domainString,
                                            'epsilon_end')

        if cfg.has_option('policy_' + domainString, 'save_step'):
            self.save_step = cfg.getint('policy_' + domainString, 'save_step')

        if cfg.has_option('dqnpolicy_' + domainString,
                          'prior_sample_prob_start'):
            self.priorProbStart = cfg.getfloat('dqnpolicy_' + domainString,
                                               'prior_sample_prob_start')

        if cfg.has_option('dqnpolicy_' + domainString,
                          'prior_sample_prob_end'):
            self.priorProbEnd = cfg.getfloat('dqnpolicy_' + domainString,
                                             'prior_sample_prob_end')

        if cfg.has_option('dqnpolicy_' + domainString, 'features'):
            logger.info('Features: ' +
                        str(cfg.get('dqnpolicy_' + domainString, 'features')))
            self.policyfeatures = json.loads(
                cfg.get('dqnpolicy_' + domainString, 'features'))

        if cfg.has_option('dqnpolicy_' + domainString, 'max_k'):
            self.max_k = cfg.getint('dqnpolicy_' + domainString, 'max_k')

        if cfg.has_option('dqnpolicy_' + domainString, 'learning_algorithm'):
            self.learning_algorithm = cfg.get('dqnpolicy_' + domainString,
                                              'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        if cfg.has_option('dqnpolicy_' + domainString, 'minibatch_size'):
            self.minibatch_size = cfg.getint('dqnpolicy_' + domainString,
                                             'minibatch_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'capacity'):
            self.capacity = cfg.getint('dqnpolicy_' + domainString, 'capacity')

        if cfg.has_option('dqnpolicy_' + domainString, 'replay_type'):
            self.replay_type = cfg.get('dqnpolicy_' + domainString,
                                       'replay_type')

        if cfg.has_option('dqnpolicy_' + domainString, 'architecture'):
            self.architecture = cfg.get('dqnpolicy_' + domainString,
                                        'architecture')

        if cfg.has_option('dqnpolicy_' + domainString, 'q_update'):
            self.q_update = cfg.get('dqnpolicy_' + domainString, 'q_update')

        if cfg.has_option('dqnpolicy_' + domainString, 'h1_size'):
            self.h1_size = cfg.getint('dqnpolicy_' + domainString, 'h1_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'h2_size'):
            self.h2_size = cfg.getint('dqnpolicy_' + domainString, 'h2_size')

        if cfg.has_option('dqnpolicy_' + domainString, 'training_frequency'):
            self.training_frequency = cfg.getint('dqnpolicy_' + domainString,
                                                 'training_frequency')
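        # Hypothetical config excerpt covering the options read above (section names match
        # the code, but the values are illustrative only; the deployed config file is
        # authoritative):
        #   [dqnpolicy]
        #   learning_rate = 0.001
        #   tau = 0.001
        #   gamma = 0.99
        #   epsilon_start = 0.3
        #   epsilon_end = 0.0
        #   minibatch_size = 64
        #   capacity = 2000
        #   architecture = vanilla
        #   h1_size = 300
        #   h2_size = 100
        #
        #   [dqnpolicy_CamRestaurants]
        #   learning_rate = 0.0005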
        """
        self.shuffle = False
        if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
            self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
        if not self.shuffle:
            # If we don't use experience replay, we don't need to maintain
            # sliding window of experiences with maximum capacity.
            # We only need to maintain the data of minibatch_size
            self.capacity = self.minibatch_size
        """

        self.episode_ave_max_q = []

        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        policytype = 'dqn'
        self.dropout_rate = 0.
        if cfg.has_option('dqnpolicy', 'dropout_rate'):
            self.dropout_rate = cfg.getfloat('dqnpolicy', 'dropout_rate')
        if cfg.has_option('policy', 'policytype'):
            policytype = cfg.get('policy', 'policytype')
        if policytype != 'feudal':
            # init session
            self.sess = tf.Session()
            with tf.device("/cpu:0"):

                np.random.seed(self.randomseed)
                tf.set_random_seed(self.randomseed)

                # initialise a replay buffer
                if self.replay_type == 'vanilla':
                    self.episodes[self.domainString] = ReplayBuffer(
                        self.capacity, self.minibatch_size, self.randomseed)
                elif self.replay_type == 'prioritized':
                    self.episodes[self.domainString] = ReplayPrioritised(
                        self.capacity, self.minibatch_size, self.randomseed)
                self.samplecount = 0
                self.episodecount = 0

                # construct the models
                self.state_dim = self.n_in
                if self.architecture == 'dip2':
                    self.state_dim = 89
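                    # 89 is assumed to be the dimensionality of the DIP
                    # (domain-independent parameterisation) feature vector.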
                self.summaryaction = SummaryAction.SummaryAction(domainString)
                if action_names is None:
                    self.action_names = self.summaryaction.action_names
                else:
                    self.action_names = action_names
                self.action_dim = len(self.action_names)
                action_bound = len(self.action_names)
                self.stats = [0 for _ in range(self.action_dim)]
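                # The DeepQNetwork below is assumed to build the online and target
                # Q-networks (two hidden layers of h1_size/h2_size units) together
                # with their training ops; action_bound simply mirrors action_dim.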

                self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \
                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                            self.architecture, self.h1_size,
                                            self.h2_size, dropout_rate=self.dropout_rate)

                # when all models are defined, init all variables
                init_op = tf.global_variables_initializer()
                self.sess.run(init_op)

                self.loadPolicy(self.in_policy_file)
                print 'loaded replay size: ', self.episodes[
                    self.domainString].size()

                self.dqn.update_target_network()

    def __init__(self,
                 in_policy_file,
                 out_policy_file,
                 domainString='CamRestaurants',
                 is_training=False,
                 action_names=None):
        super(DQNPolicy, self).__init__(domainString, is_training)

        tf.reset_default_graph()

        self.domainString = domainString
        self.domainUtil = FlatOnt.FlatDomainOntology(self.domainString)
        self.in_policy_file = in_policy_file
        self.out_policy_file = out_policy_file
        self.is_training = is_training
        self.accum_belief = []
        self.prev_state_check = None

        #improvement==================================
        self.intrinsic_reward_method = None
        self.conf = ConfigParser.ConfigParser()
        if utils.Settings.config.has_option('scme', 'method'):
            self.intrinsic_reward_method = utils.Settings.config.get(
                'scme', 'method')
        #improvement==================================
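        # The 'scme'/'method' option selects an intrinsic-reward (curiosity) model:
        # 'vime', 'cme' or 'scme'; the corresponding model is loaded further below.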

        # parameter settings
        if 0:  #utils.Settings.config.has_option('dqnpolicy', 'n_in'): #ic304: this was giving me a weird error, disabled it until I can check it more deeply
            self.n_in = utils.Settings.config.getint('dqnpolicy', 'n_in')
        else:
            self.n_in = self.get_n_in(domainString)

        self.learning_rate = 0.001
        if utils.Settings.config.has_option('dqnpolicy', 'learning_rate'):
            self.learning_rate = utils.Settings.config.getfloat(
                'dqnpolicy', 'learning_rate')

        self.tau = 0.001
        if utils.Settings.config.has_option('dqnpolicy', 'tau'):
            self.tau = utils.Settings.config.getfloat('dqnpolicy', 'tau')

        # NOTE: the cfg import no longer works, therefore all cfg accesses below were
        # changed to utils.Settings.config; the original form is kept here as an example:
        # if cfg.has_option('GENERAL', 'seed'):
        #     self.randomseed = cfg.getint('GENERAL', 'seed')

        self.randomseed = 1234
        if utils.Settings.config.has_option('GENERAL', 'seed'):
            self.randomseed = utils.Settings.config.getint('GENERAL', 'seed')

        self.gamma = 1.0
        if utils.Settings.config.has_option('dqnpolicy', 'gamma'):
            self.gamma = utils.Settings.config.getfloat('dqnpolicy', 'gamma')

        self.regularisation = 'l2'
        if utils.Settings.config.has_option('dqnpolicy', 'regularisation'):
            self.regularisation = utils.Settings.config.get(
                'dqnpolicy', 'regularisation')

        self.exploration_type = 'e-greedy'  # alternatively: Boltzmann
        if utils.Settings.config.has_option('dqnpolicy', 'exploration_type'):
            self.exploration_type = utils.Settings.config.get(
                'dqnpolicy', 'exploration_type')

        self.episodeNum = 1000
        if utils.Settings.config.has_option('dqnpolicy', 'episodeNum'):
            self.episodeNum = utils.Settings.config.getfloat(
                'dqnpolicy', 'episodeNum')

        self.maxiter = 5000
        if utils.Settings.config.has_option('dqnpolicy', 'maxiter'):
            self.maxiter = utils.Settings.config.getfloat(
                'dqnpolicy', 'maxiter')

        self.epsilon = 1
        if utils.Settings.config.has_option('dqnpolicy', 'epsilon'):
            self.epsilon = utils.Settings.config.getfloat(
                'dqnpolicy', 'epsilon')

        self.epsilon_start = 1
        if utils.Settings.config.has_option('dqnpolicy', 'epsilon_start'):
            self.epsilon_start = utils.Settings.config.getfloat(
                'dqnpolicy', 'epsilon_start')

        self.epsilon_end = 1
        if utils.Settings.config.has_option('dqnpolicy', 'epsilon_end'):
            self.epsilon_end = utils.Settings.config.getfloat(
                'dqnpolicy', 'epsilon_end')

        self.save_step = 100
        if utils.Settings.config.has_option('policy', 'save_step'):
            self.save_step = utils.Settings.config.getint(
                'policy', 'save_step')

        self.priorProbStart = 1.0
        if utils.Settings.config.has_option('dqnpolicy',
                                            'prior_sample_prob_start'):
            self.priorProbStart = utils.Settings.config.getfloat(
                'dqnpolicy', 'prior_sample_prob_start')

        self.priorProbEnd = 0.1
        if utils.Settings.config.has_option('dqnpolicy',
                                            'prior_sample_prob_end'):
            self.priorProbEnd = utils.Settings.config.getfloat(
                'dqnpolicy', 'prior_sample_prob_end')

        self.policyfeatures = []
        if utils.Settings.config.has_option('dqnpolicy', 'features'):
            logger.info(
                'Features: ' +
                str(utils.Settings.config.get('dqnpolicy', 'features')))
            self.policyfeatures = json.loads(
                utils.Settings.config.get('dqnpolicy', 'features'))

        self.max_k = 5
        if utils.Settings.config.has_option('dqnpolicy', 'max_k'):
            self.max_k = utils.Settings.config.getint('dqnpolicy', 'max_k')

        self.learning_algorithm = 'drl'
        if utils.Settings.config.has_option('dqnpolicy', 'learning_algorithm'):
            self.learning_algorithm = utils.Settings.config.get(
                'dqnpolicy', 'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        self.minibatch_size = 32
        if utils.Settings.config.has_option('dqnpolicy', 'minibatch_size'):
            self.minibatch_size = utils.Settings.config.getint(
                'dqnpolicy', 'minibatch_size')

        self.capacity = 1000
        if utils.Settings.config.has_option('dqnpolicy', 'capacity'):
            self.capacity = utils.Settings.config.getint(
                'dqnpolicy', 'capacity')

        self.replay_type = 'vanilla'
        if utils.Settings.config.has_option('dqnpolicy', 'replay_type'):
            self.replay_type = utils.Settings.config.get(
                'dqnpolicy', 'replay_type')

        self.architecture = 'vanilla'
        if utils.Settings.config.has_option('dqnpolicy', 'architecture'):
            self.architecture = utils.Settings.config.get(
                'dqnpolicy', 'architecture')
            if self.architecture == 'dip':
                self.architecture = 'dip2'

        self.q_update = 'single'
        if utils.Settings.config.has_option('dqnpolicy', 'q_update'):
            self.q_update = utils.Settings.config.get('dqnpolicy', 'q_update')

        self.h1_size = 130
        if utils.Settings.config.has_option('dqnpolicy', 'h1_size'):
            self.h1_size = utils.Settings.config.getint('dqnpolicy', 'h1_size')

        self.h2_size = 130
        if utils.Settings.config.has_option('dqnpolicy', 'h2_size'):
            self.h2_size = utils.Settings.config.getint('dqnpolicy', 'h2_size')

        self.training_frequency = 2
        if utils.Settings.config.has_option('dqnpolicy', 'training_frequency'):
            self.training_frequency = utils.Settings.config.getint(
                'dqnpolicy', 'training_frequency')

        # domain specific parameter settings (overrides general policy parameter settings)
        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'n_in'):
            self.n_in = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'n_in')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'learning_rate'):
            self.learning_rate = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'learning_rate')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'tau'):
            self.tau = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'tau')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'gamma'):
            self.gamma = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'gamma')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'regularisation'):
            self.regularisation = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'regularisation')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'exploration_type'):
            self.exploration_type = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'exploration_type')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'episodeNum'):
            self.episodeNum = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'episodeNum')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'maxiter'):
            self.maxiter = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'maxiter')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'epsilon'):
            self.epsilon = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'epsilon')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'epsilon_start'):
            self.epsilon_start = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'epsilon_start')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'epsilon_end'):
            self.epsilon_end = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'epsilon_end')

        if utils.Settings.config.has_option('policy_' + domainString,
                                            'save_step'):
            self.save_step = utils.Settings.config.getint(
                'policy_' + domainString, 'save_step')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'prior_sample_prob_start'):
            self.priorProbStart = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'prior_sample_prob_start')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'prior_sample_prob_end'):
            self.priorProbEnd = utils.Settings.config.getfloat(
                'dqnpolicy_' + domainString, 'prior_sample_prob_end')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'features'):
            logger.info('Features: ' + str(
                utils.Settings.config.get('dqnpolicy_' +
                                          domainString, 'features')))
            self.policyfeatures = json.loads(
                utils.Settings.config.get('dqnpolicy_' + domainString,
                                          'features'))

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'max_k'):
            self.max_k = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'max_k')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'learning_algorithm'):
            self.learning_algorithm = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'learning_algorithm')
            logger.info('Learning algorithm: ' + self.learning_algorithm)

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'minibatch_size'):
            self.minibatch_size = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'minibatch_size')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'capacity'):
            self.capacity = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'capacity')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'replay_type'):
            self.replay_type = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'replay_type')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'architecture'):
            self.architecture = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'architecture')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'q_update'):
            self.q_update = utils.Settings.config.get(
                'dqnpolicy_' + domainString, 'q_update')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'h1_size'):
            self.h1_size = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'h1_size')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'h2_size'):
            self.h2_size = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'h2_size')

        if utils.Settings.config.has_option('dqnpolicy_' + domainString,
                                            'training_frequency'):
            self.training_frequency = utils.Settings.config.getint(
                'dqnpolicy_' + domainString, 'training_frequency')
        """
        self.shuffle = False
        if cfg.has_option('dqnpolicy_'+domainString, 'experience_replay'):
            self.shuffle = cfg.getboolean('dqnpolicy_'+domainString, 'experience_replay')
        if not self.shuffle:
            # If we don't use experience replay, we don't need to maintain
            # sliding window of experiences with maximum capacity.
            # We only need to maintain the data of minibatch_size
            self.capacity = self.minibatch_size
        """

        self.episode_ave_max_q = []
        self.curiositypred_loss = []
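        # curiositypred_loss presumably collects the curiosity/prediction-model losses
        # recorded while training the intrinsic-reward variants.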

        #os.environ["CUDA_VISIBLE_DEVICES"] = ""
        policytype = 'dqn'
        self.dropout_rate = 0.
        if utils.Settings.config.has_option('dqnpolicy', 'dropout_rate'):
            self.dropout_rate = utils.Settings.config.getfloat(
                'dqnpolicy', 'dropout_rate')
        if utils.Settings.config.has_option('policy', 'policytype'):
            policytype = utils.Settings.config.get('policy', 'policytype')
        if policytype != 'feudal':
            self.sess = tf.Session()

            with tf.device("/cpu:0"):

                np.random.seed(self.randomseed)
                tf.set_random_seed(self.randomseed)
                # initialise a replay buffer
                if self.replay_type == 'vanilla':
                    self.episodes[self.domainString] = ReplayBuffer(
                        self.capacity, self.minibatch_size, self.randomseed)
                elif self.replay_type == 'prioritized':
                    self.episodes[self.domainString] = ReplayPrioritised(
                        self.capacity, self.minibatch_size, self.randomseed)
                self.samplecount = 0
                self.episodecount = 0

                # construct the models
                self.state_dim = self.n_in
                if self.architecture == 'dip2':
                    self.state_dim = 89
                self.summaryaction = SummaryAction.SummaryAction(domainString)
                if action_names is None:
                    self.action_names = self.summaryaction.action_names
                else:
                    self.action_names = action_names
                self.action_dim = len(self.action_names)
                action_bound = len(self.action_names)
                self.stats = [0 for _ in range(self.action_dim)]

                self.dqn = dqn.DeepQNetwork(self.sess, self.state_dim, self.action_dim, \
                                            self.learning_rate, self.tau, action_bound, self.minibatch_size,
                                            self.architecture, self.h1_size,
                                            self.h2_size, dropout_rate=self.dropout_rate)

                #self.curiosityFunctions = scme(self.sess, self.state_dim, self.action_dim, self.randomseed)

                # when all models are defined, init all variables
                init_op = tf.global_variables_initializer()
                self.sess.run(init_op)

                self.loadPolicy(self.in_policy_file)
                print 'loaded replay size: ', self.episodes[
                    self.domainString].size()

                #improvement==================================
                # initialise the intrinsic-reward model selected above and restore its
                # parameters from model/<method>_model/<in_policy_file>
                if self.intrinsic_reward_method == 'vime':
                    self.vime_model = vime(self.state_dim, self.action_dim)
                    self.vime_model.load_model('model/vime_model/' +
                                               self.in_policy_file)

                elif self.intrinsic_reward_method == 'cme':
                    self.cme_model = cme(self.state_dim, self.action_dim)
                    self.cme_model.load_model('model/cme_model/' +
                                              self.in_policy_file)

                elif self.intrinsic_reward_method == 'scme':
                    self.scme_model = scme(self.state_dim, self.action_dim)
                    self.scme_model.load_model('model/scme_model/' +
                                               self.in_policy_file)
                #improvement==================================

                self.dqn.update_target_network()