Example #1
0
    def init_nets(self, global_nets=None):
        '''
        Build the policy network from the net spec, then attach a pretrained reward model.

        The network class is looked up on the `net` module by the name stored in `self.net_spec['type']`.
        Per the notes kept with this algorithm: networks for continuous action spaces have two heads (mean and std deviation of a Gaussian policy), while networks for discrete action spaces have a single head returning the logits of a categorical distribution over the actions.

        Once the policy net, its optimizer and its lr scheduler exist, a `RewardAgent_EncoderSide` is created with GPU use disabled, restored from a hard-coded checkpoint path, switched to eval mode and handed to `reward_utils.reward_validate` together with the 'val' data feed.
        '''
        state_dim = self.body.state_dim
        action_dim = net_util.get_out_dim(self.body)
        policy_net_cls = getattr(net, self.net_spec['type'])
        self.net = policy_net_cls(self.net_spec, state_dim, action_dim)
        self.net_names = ['net']

        # optimizer and lr scheduler for the policy net
        self.optim = net_util.get_optim(self.net, self.net.optim_spec)
        self.lr_scheduler = net_util.get_lr_scheduler(
            self.optim, self.net.lr_scheduler_spec)
        net_util.set_global_nets(self, global_nets)
        self.post_init_nets()

        # ---- reward model ----
        checkpoint_path = '../irl/NeuralDialog-LAED/logs/2019-08-16T12-04-13-mwoz_gan_vae.py'  # r4
        on_gpu = False
        # alternative that was tried: reward_agent.RewardAgent(on_gpu)
        self.reward_agent = reward_agent.RewardAgent_EncoderSide(on_gpu)
        validation_feed = reward_utils.WoZGanDataLoaders('val')
        reward_agent.load_reward_model(
            self.reward_agent, checkpoint_path, on_gpu)
        if on_gpu:
            self.reward_agent.cuda()
        self.reward_agent.eval()

        # counters start at zero
        self.reward_count = 0
        self.batch_count = 0
        reward_utils.reward_validate(self.reward_agent, validation_feed)
Example #2
0
    def __init__(self, agent, global_nets=None):
        '''
        Initialize through the parent class, then attach a pretrained reward model.

        A `RewardAgent_EncoderSide` is built with GPU use disabled and the 'vae' encoder type, restored from `checkpoint_path` through `reward_agent.load_reward_model`, switched to eval mode, and passed to `reward_utils.reward_validate` together with the 'val' data feed. `reward_count` and `batch_count` start at 0.
        '''
        super().__init__(agent, global_nets)

        run_on_gpu = False
        encoder_kind = 'vae'  # the other value that was tried: 'autoencoder'

        # Reward-model variants that were tried (class -> checkpoint under ./your/trained/model/path/):
        #   RewardAgent(use_gpu)
        #   RewardAgent_EncoderSide       -> 2019-08-16T12-04-13-mwoz_gan_vae.py  (r4, autoencoder based reward model)
        #   RewardAgent_StateVaeActionSeg -> 2019-09-18T20:06:28.509357-mwoz_gan_vae_StateActionEmbed.py  (state VAE + action seg, Hotel excluded)
        #   RewardAgent_StateVaeActionSeg -> logs/2019-09-19T22:06:56.826004-mwoz_gan_vae_StateActionEmbed.py  (state VAE + action seg, all domains)
        # Active choice: state VAE + one-hot action version with the newly trained VAE-based reward.
        self.reward_agent = reward_agent.RewardAgent_EncoderSide(
            run_on_gpu, encoder_kind)
        checkpoint_path = './your/trained/model/path/2019-09-06T12:04:49.278628-mwoz_gan_vae.py'

        validation_feed = reward_utils.WoZGanDataLoaders('val')
        reward_agent.load_reward_model(
            self.reward_agent, checkpoint_path, run_on_gpu)
        if run_on_gpu:
            self.reward_agent.cuda()
        self.reward_agent.eval()

        # counters start at zero
        self.reward_count = 0
        self.batch_count = 0

        reward_utils.reward_validate(self.reward_agent, validation_feed)
Example #3
0
    def __init__(self, agent, global_nets=None):
        '''
        Set up the parent algorithm and load the VAE-based reward model used for reward estimation.

        Steps, in order: build `RewardAgent_EncoderSide` (GPU disabled, encoder type 'vae'), create the 'val' data feed, restore the model weights from the 2019-09-06 checkpoint, put the model in eval mode, zero the `reward_count` / `batch_count` counters, and call `reward_utils.reward_validate` with the model and the feed.

        NOTE: an older comment here mentioned creating an extra replay memory for warm-up; nothing of that kind is created in this method.
        '''
        super().__init__(agent, global_nets)

        gpu_enabled = False
        vae_kind = 'vae'  # 'autoencoder' was the earlier setting

        # Checkpoints used in earlier experiments (kept for reference only):
        #   ./reward_model/2019-08-01T16-52-56-mwoz_gan_vae.py
        #   ./reward_model/2019-08-02T17-50-11-mwoz_gan_vae.py
        #   ../NeuralDialog-LAED/logs/2019-08-03T04-34-03-mwoz_gan_vae.py
        #   ../irl/NeuralDialog-LAED/logs/2019-08-19T13-56-13-mwoz_gan_vae.py  (r5)
        #   ../irl/NeuralDialog-LAED/logs/2019-08-16T12-04-13-mwoz_gan_vae.py  (r4, autoencoder based reward model)
        # Other agent classes that were tried:
        #   RewardAgent(use_gpu)
        #   RewardAgent_StateVaeActionSeg with ../irl/NeuralDialog-LAED/logs/2019-09-18T20:06:28.509357-mwoz_gan_vae_StateActionEmbed.py  (Hotel excluded)
        #   RewardAgent_StateVaeActionSeg with ../irl/NeuralDialog-LAED/logs/2019-09-19T22:06:56.826004-mwoz_gan_vae_StateActionEmbed.py  (all domains)

        # current choice: state VAE + one-hot action version, newly trained VAE-based reward
        self.reward_agent = reward_agent.RewardAgent_EncoderSide(
            gpu_enabled, vae_kind)
        model_path = '../irl/NeuralDialog-LAED/logs/2019-09-06T12:04:49.278628-mwoz_gan_vae.py'

        feed_for_validation = reward_utils.WoZGanDataLoaders('val')
        reward_agent.load_reward_model(
            self.reward_agent, model_path, gpu_enabled)
        if gpu_enabled:
            self.reward_agent.cuda()
        self.reward_agent.eval()
        self.reward_count = 0
        self.batch_count = 0

        reward_utils.reward_validate(self.reward_agent, feed_for_validation)
Example #4
0
    def init_nets(self, global_nets=None):
        '''
        Initialize the neural networks used to learn the actor and critic from the spec, then attach the pretrained reward model.

        Below we automatically select an appropriate net based on three different conditions
        1. If the action space is discrete or continuous action
            - Networks for continuous action spaces have two heads and return two values, the first is a tensor containing the mean of the action policy, the second is a tensor containing the std deviation of the action policy. The distribution is assumed to be a Gaussian (Normal) distribution.
            - Networks for discrete action spaces have a single head and return the logits for a categorical probability distribution over the discrete actions
        2. If the actor and critic are separate or share weights
            - If the networks share weights then the single network returns a list.
            - Continuous action spaces: The return list contains 3 elements: The first element contains the mean output for the actor (policy), the second element the std dev of the policy, and the third element is the state-value estimated by the network.
            - Discrete action spaces: The return list contains 2 element. The first element is a tensor containing the logits for a categorical probability distribution over the actions. The second element contains the state-value estimated by the network.
        3. If the network type is feedforward, convolutional, or recurrent
            - Feedforward and convolutional networks take a single state as input and require an OnPolicyReplay or OnPolicyBatchReplay memory
            - Recurrent networks take n states as input and require env spec "frame_op": "concat", "frame_op_len": seq_len

        After the networks and optimizers are built, this method also
            - creates `self.experience_buffer`, a deque holding at most 20 items
            - builds a `RewardAgent_EncoderSide` from the `params.json` stored in the reward model directory and restores it with `reward_agent.load_reward_model`
            - restores `self.net` from './reward_model/policy_pretrain.mdl' when `algorithm_spec['load_pretrain_policy']` is truthy

        Raises:
            ValueError: when loading the pretrained policy is requested but the checkpoint file does not exist.
        '''
        assert 'shared' in self.net_spec, 'Specify "shared" for ActorCritic network in net_spec'
        self.shared = self.net_spec['shared']

        # create actor/critic specific specs: keys carrying an 'actor_' / 'critic_' marker
        # are moved to the un-marked key in their own spec and dropped from the other one
        actor_net_spec = self.net_spec.copy()
        critic_net_spec = self.net_spec.copy()
        for k in self.net_spec:
            if 'actor_' in k:
                actor_net_spec[k.replace('actor_', '')] = actor_net_spec.pop(k)
                critic_net_spec.pop(k)
            if 'critic_' in k:
                critic_net_spec[k.replace('critic_',
                                          '')] = critic_net_spec.pop(k)
                actor_net_spec.pop(k)
        if critic_net_spec['use_same_optim']:
            critic_net_spec = actor_net_spec

        in_dim = self.body.state_dim
        out_dim = net_util.get_out_dim(self.body, add_critic=self.shared)
        # main actor network, may contain out_dim self.shared == True
        NetClass = getattr(net, actor_net_spec['type'])
        self.net = NetClass(actor_net_spec, in_dim, out_dim)
        self.net_names = ['net']
        if not self.shared:  # add separate network for critic
            critic_out_dim = 1
            CriticNetClass = getattr(net, critic_net_spec['type'])
            self.critic_net = CriticNetClass(critic_net_spec, in_dim,
                                             critic_out_dim)
            self.net_names.append('critic_net')
        # init net optimizer and its lr scheduler
        self.optim = net_util.get_optim(self.net, self.net.optim_spec)
        self.lr_scheduler = net_util.get_lr_scheduler(
            self.optim, self.net.lr_scheduler_spec)
        if not self.shared:
            self.critic_optim = net_util.get_optim(self.critic_net,
                                                   self.critic_net.optim_spec)
            self.critic_lr_scheduler = net_util.get_lr_scheduler(
                self.critic_optim, self.critic_net.lr_scheduler_spec)
        net_util.set_global_nets(self, global_nets)
        self.post_init_nets()

        use_gpu = False
        self.experience_buffer = deque(maxlen=20)

        # the repository root is five directory levels above this file
        root_dir = os.path.abspath(__file__)
        for _ in range(5):
            root_dir = os.path.dirname(root_dir)
        # other reward models that were tried under convlab_repo/saved_models:
        # cl_3_VAE_no_kl_finish, cl_2_VAE
        reward_path = os.path.join(
            root_dir, "convlab_repo/saved_models/cl_3_VAE_pre_training_mode")

        # the reward model's constructor arguments are stored beside its weights
        config_path = os.path.join(reward_path, "params.json")
        with open(config_path, 'r') as f:
            dic = json.load(f)
        config = argparse.Namespace(**dic)
        self.reward_agent = reward_agent.RewardAgent_EncoderSide(
            config, use_gpu, "mine")

        self.load_pretrain_policy = self.algorithm_spec['load_pretrain_policy']
        policy_mdl = './reward_model/policy_pretrain.mdl'

        if self.load_pretrain_policy:
            if not os.path.exists(policy_mdl):
                raise ValueError("No policy model")
            # Map the checkpoint to CPU first so a file saved on a GPU machine still
            # loads on a CPU-only host; load_state_dict then copies the values into
            # the network's existing parameters.
            self.net.load_state_dict(
                torch.load(policy_mdl, map_location='cpu'))
            print("successfully loaded the pretrained policy model")

        reward_agent.load_reward_model(self.reward_agent, reward_path, use_gpu)
        if use_gpu:
            self.reward_agent.cuda()
        self.reward_agent.eval()
        self.reward_count = 0
        self.batch_count = 0
Example #5
0
    def init_nets(self, global_nets=None):
        '''
        PPO uses old and new to calculate ratio for loss.

        On top of the parent's networks this method creates
            - `self.old_net`, a deep copy of `self.net`
            - `self.experience_buffer` (deque, maxlen 10) and `self.reward_buffer` (deque sized by `algorithm_spec['reward_buffer_size']`)
            - `self.reward_agent`, a `RewardAgent_EncoderSide` restored from `reward_path`, plus `self.optim_gandisc` taken from its discriminator
            - `self.discriminator`: `A2C_Discriminator` when `algorithm_spec['reward_type']` is 'DISC', otherwise `AIRL` (restored from './reward_model/airl_pretrain.mdl' when that file exists)

        When `algorithm_spec['load_pretrain_policy']` is truthy and './reward_model/policy_pretrain.mdl' exists, both `self.net` and `self.old_net` are restored from it and the pretrain/policy-training flags are set to True; otherwise they stay False.
        '''
        super().init_nets(global_nets)
        # create old net to calculate ratio
        self.old_net = deepcopy(self.net)
        assert id(self.old_net) != id(self.net)

        val_feed = reward_utils.WoZGanDataLoaders('val', 16)
        train_feed = reward_utils.WoZGanDataLoaders('train', 16)
        train_feed.epoch_init(shuffle=True)

        use_gpu = False
        vae_type = 'vae'
        update = True

        self.experience_buffer = deque(maxlen=10)
        # len was 200 at the beginning
        self.reward_buffer = deque(
            maxlen=self.algorithm_spec['reward_buffer_size'])

        # this is the State Vae and Action Onehot version
        self.reward_agent = reward_agent.RewardAgent_EncoderSide(
            use_gpu, vae_type, update=update, real_data_feed=train_feed)
        reward_path = '../irl/NeuralDialog-LAED/logs/2019-09-06T12:04:49.278628-mwoz_gan_vae.py'  # new trained vae-based reward
        self.optim_gandisc = None
        if update:
            self.optim_gandisc = self.reward_agent.discriminator.get_optimizer()

        self.reward_type = self.algorithm_spec['reward_type']
        self.disc_training_times = self.algorithm_spec['disc_training_times']
        self.disc_training_freq = self.algorithm_spec['disc_training_freq']
        # other reward types used in experiments: 'OFFGAN', 'Human'

        if self.reward_type == 'DISC':
            self.discriminator = reward_agent.A2C_Discriminator(
                use_gpu, train_feed, 16)
            # NOTE(review): './reward_model/disc_pretrain.mdl' was named here but never
            # loaded; only the AIRL branch restores a checkpoint - confirm this is intended
        else:
            self.discriminator = reward_agent.AIRL(use_gpu, train_feed, 16)
            disc_mdl = './reward_model/airl_pretrain.mdl'
            if os.path.exists(disc_mdl):
                # map to CPU so a GPU-saved checkpoint also loads on a CPU-only host
                self.discriminator.load_state_dict(
                    torch.load(disc_mdl, map_location='cpu'))
                print("successfully loaded the pretrained Disc model")
        self.optim_disc = self.discriminator.get_optimizer()
        self.disc_training_count = 0
        self.policy_training_flag = False

        reward_agent.load_reward_model(self.reward_agent, reward_path, use_gpu)
        if use_gpu:
            self.reward_agent.cuda()
        self.reward_agent.eval()
        self.reward_count = 0
        self.batch_count = 0
        self.pretrain_finished = False
        self.pretrain_disc_and_valud_finished = False

        reward_utils.reward_validate(self.reward_agent, val_feed)
        self.load_pretrain_policy = self.algorithm_spec['load_pretrain_policy']
        policy_mdl = './reward_model/policy_pretrain.mdl'

        # When the checkpoint is missing nothing is loaded and pretrain_finished
        # keeps the False value assigned above.
        if self.load_pretrain_policy and os.path.exists(policy_mdl):
            # Read the file once and reuse the state dict for both nets;
            # load_state_dict copies the values into each net's own parameters.
            policy_state = torch.load(policy_mdl, map_location='cpu')
            self.net.load_state_dict(policy_state)
            self.old_net.load_state_dict(policy_state)
            print("successfully loaded the pretrained policy model")
            self.pretrain_finished = True
            self.policy_training_flag = True  # the pretrained reward function will not be updated.
Example #6
0
    def __init__(self, agent, global_nets=None):
        '''
        Initialize through the parent class, load two reward models and plot them on the 'test' feed.

        `self.reward_agent` (model name "mine") is restored from the `cl_3_VAE_pre_training_mode` directory, whose `params.json` supplies the constructor config. A second, local agent (model name "ziming") is built with the same config and restored from the 2019-09-06 checkpoint. Both are passed to the `reward_utils` plotting helpers; the torch, `random` and numpy RNGs are seeded with 1 before the final comparison plot.

        NOTE(review): `root_dir` is not assigned anywhere in this method; presumably it is a module-level name - confirm.
        '''
        super().__init__(agent, global_nets)
        on_gpu = False
        vae_type = 'vae'  # alternative: 'autoencoder'; only the disabled variants below take it

        # Reward-model variants, numbered 1-3 in the original notes ("1, 2 = 3"):
        #   1  RewardAgent_EncoderSide built from params.json (active, see below)
        #   2  "the normal one": RewardAgent_StateVaeActionSeg(use_gpu, vae_type) with
        #      convlab_repo/saved_models/2019-09-18T20:06:28.509357-mwoz_gan_vae_StateActionEmbed.py  (Hotel excluded)
        #   3  RewardAgent_StateVaeActionSeg(use_gpu, vae_type) with
        #      convlab_repo/saved_models/2019-09-19T22:06:56.826004-mwoz_gan_vae_StateActionEmbed.py  (all domains)
        # Earlier: RewardAgent_EncoderSide(use_gpu, vae_type) with
        #   ./your/trained/model/path/2019-08-16T12-04-13-mwoz_gan_vae.py  (r4, autoencoder based reward model)
        # Other directories tried for variant 1 (all relative to root_dir):
        #   gan_v/logs/cl_1_AE, gan_v/logs/naive_model_1_vae_update, gan_v/logs/cl_1_AE_action_noise,
        #   convlab_repo/saved_models/naive_v_parallel_cl, convlab_repo/saved_models/cl_2_VAE,
        #   convlab_repo/saved_models/cl_3_VAE_no_kl_finish,
        #   convlab_repo/saved_models/cl_finish_no_noise  (ablation test model)
        model_dir = os.path.join(
            root_dir, "convlab_repo/saved_models/cl_3_VAE_pre_training_mode")

        # constructor arguments of the reward model are stored beside its weights
        params_file_path = os.path.join(model_dir, "params.json")
        with open(params_file_path, 'r') as params_file:
            params = json.load(params_file)
        config = argparse.Namespace(**params)

        # this is the State Vae and Action Onehot version
        self.reward_agent = reward_agent.RewardAgent_EncoderSide(
            config, on_gpu, model_name="mine")
        reward_agent.load_reward_model(self.reward_agent, model_dir, on_gpu)
        if on_gpu:
            self.reward_agent.cuda()
        self.reward_agent.eval()
        self.reward_count = 0
        self.batch_count = 0

        # validation on the 'val' feed (reward_utils.reward_validate) is disabled here

        # Draw the current strategy: a second agent is loaded for comparison.
        # NOTE(review): unlike self.reward_agent, this agent is never switched to
        # eval mode in this method - confirm that is intended.
        baseline_checkpoint = os.path.join(
            root_dir,
            "convlab_repo/saved_models/2019-09-06T12:04:49.278628-mwoz_gan_vae.py")
        baseline_agent = reward_agent.RewardAgent_EncoderSide(
            config, on_gpu, model_name="ziming")
        reward_agent.load_reward_model(
            baseline_agent, baseline_checkpoint, on_gpu)

        test_feed = reward_utils.WoZGanDataLoaders("test")
        # the "product" surgery plot is disabled
        for surgery_mode in ("das", "hard_update"):
            reward_utils.plot_graph(
                self.reward_agent, test_feed, surgery=surgery_mode)
        reward_utils.plot_graph(baseline_agent, test_feed, name="ziming")

        # fix every RNG before the thorough evaluation
        import random
        seed = 1
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

        # alternative list that was tried: ["d", "a", "s"]
        surgery_types = ["hard_update", "das", "product"]
        reward_utils.plot_graph_4_seperate(
            self.reward_agent, baseline_agent, test_feed, surgery_types)
Example #7
0
    def init_nets(self, global_nets=None):
        '''
        PPO uses old and new to calculate ratio for loss.

        On top of the parent's networks this method creates
            - `self.old_net`, a deep copy of `self.net`
            - `self.experience_buffer` (deque, maxlen 10) and `self.reward_buffer` (deque sized by `algorithm_spec['reward_buffer_size']`)
            - `self.reward_agent`, a `RewardAgent_EncoderSide` built from the `params.json` in the reward model directory and restored with `reward_agent.load_reward_model`
            - `self.discriminator`: `A2C_Discriminator` when `self.reward_type` is 'DISC', otherwise `AIRL`

        `self.reward_type` is hard-coded to 'OFFGAN', which marks discriminator pretraining as finished and enables policy training.
        When `algorithm_spec['load_pretrain_policy']` is truthy, both `self.net` and `self.old_net` are restored from './reward_model/policy_pretrain.mdl'.

        Raises:
            ValueError: when loading the pretrained policy is requested but the checkpoint file does not exist.
        '''
        super().init_nets(global_nets)
        # create old net to calculate ratio
        self.old_net = deepcopy(self.net)
        assert id(self.old_net) != id(self.net)

        # NOTE(review): val_feed is not used below (reward validation is disabled);
        # the loader is still constructed in case it has side effects - confirm and drop
        val_feed = reward_utils.WoZGanDataLoaders('val', 64)
        train_feed = reward_utils.WoZGanDataLoaders('train', 64)
        train_feed.epoch_init(shuffle=True)

        use_gpu = False
        vae_type = 'vae'
        update = False

        self.experience_buffer = deque(maxlen=10)
        # len was 200 at the beginning
        self.reward_buffer = deque(maxlen=self.algorithm_spec['reward_buffer_size'])

        # The reward model starts here; changing reward_path and the agent class is
        # enough to switch models.
        # Original author's note: potential bugs in actor critic, since that one is
        # the basic function for PPO.
        # Previous setup (ziming's code):
        #   RewardAgent_EncoderSide(use_gpu, vae_type, update=update, real_data_feed=train_feed)
        #   with 'convlab_repo/saved_models/2019-09-06T12:04:49.278628-mwoz_gan_vae.py'
        # the repository root is five directory levels above this file
        root_dir = os.path.abspath(__file__)
        for _ in range(5):
            root_dir = os.path.dirname(root_dir)
        # other reward models that were tried under convlab_repo/saved_models:
        # naive_v_parallel_cl, cl_2_VAE, cl_3_VAE_no_kl_finish
        reward_path = os.path.join(root_dir, "convlab_repo/saved_models/cl_3_VAE_pre_training_mode")

        # the reward model's constructor arguments are stored beside its weights
        config_path = os.path.join(reward_path, "params.json")
        with open(config_path, 'r') as f:
            dic = json.load(f)
        config = argparse.Namespace(**dic)
        self.reward_agent = reward_agent.RewardAgent_EncoderSide(config, use_gpu, "mine", vae_type)

        self.optim_gandisc = None
        # no update for the ppo.
        if update:
            self.optim_gandisc = self.reward_agent.discriminator.get_optimizer(config)

        self.disc_training_times = self.algorithm_spec['disc_training_times']
        self.disc_training_freq = self.algorithm_spec['disc_training_freq']
        # other values used in experiments: algorithm_spec['reward_type'], 'AIRL',
        # 'DISC', 'OFFGAN_update', 'Human', "my_parallel"
        self.reward_type = 'OFFGAN'

        # Pretrained discriminator checkpoints ('./reward_model/disc_pretrain.mdl',
        # './reward_model/airl_pretrain.mdl') are not loaded in this version.
        if self.reward_type == 'DISC':
            self.discriminator = reward_agent.A2C_Discriminator(config, use_gpu, train_feed, 64)
        else:
            self.discriminator = reward_agent.AIRL(config, use_gpu, train_feed, 64)
        self.optim_disc = self.discriminator.get_optimizer()
        self.disc_training_count = 0
        self.policy_training_flag = False

        # load model
        reward_agent.load_reward_model(self.reward_agent, reward_path, use_gpu)
        if use_gpu:
            self.reward_agent.cuda()
        self.reward_agent.eval()
        self.reward_count = 0
        self.batch_count = 0
        self.pretrain_finished = False
        self.pretrain_disc_and_valud_finished = False
        self.disc_pretrain_finished = False
        if self.reward_type == 'OFFGAN':
            # pretrain_finished keeps the False value assigned above
            self.disc_pretrain_finished = True
            self.policy_training_flag = True

        self.load_pretrain_policy = self.algorithm_spec['load_pretrain_policy']
        policy_mdl = './reward_model/policy_pretrain.mdl'

        if self.load_pretrain_policy:
            if not os.path.exists(policy_mdl):
                raise ValueError("No policy model")
            # Read the file once, mapped to CPU so a GPU-saved checkpoint also loads
            # on a CPU-only host; load_state_dict copies the values into each net.
            policy_state = torch.load(policy_mdl, map_location='cpu')
            self.net.load_state_dict(policy_state)
            self.old_net.load_state_dict(policy_state)
            print("successfully loaded the pretrained policy model")
Example #8
0
    def init_nets(self, global_nets=None):
        '''
        Build the actor and critic networks from the net spec, then attach the reward model and a discriminator.

        How the networks are chosen (notes kept with this algorithm):
        1. Discrete vs continuous action space
            - Continuous: two heads returning the mean and the std deviation of a Gaussian (Normal) action policy.
            - Discrete: one head returning the logits of a categorical distribution over the actions.
        2. Separate vs shared actor and critic
            - With shared weights the single network returns a list.
            - Continuous: 3 elements - policy mean, policy std dev, state-value estimate.
            - Discrete: 2 elements - action logits, state-value estimate.
        3. Feedforward, convolutional or recurrent network type
            - Feedforward and convolutional networks take a single state as input and require an OnPolicyReplay or OnPolicyBatchReplay memory.
            - Recurrent networks take n states as input and require env spec "frame_op": "concat", "frame_op_len": seq_len.

        Afterwards this method creates `self.experience_buffer` (deque, maxlen 20), a `RewardAgent_EncoderSide` restored from a hard-coded checkpoint, an `A2C_Discriminator` fed by the 'train' loader with its optimizer, and finally calls `reward_utils.reward_validate` with the reward model and the 'val' feed.
        '''
        assert 'shared' in self.net_spec, 'Specify "shared" for ActorCritic network in net_spec'
        self.shared = self.net_spec['shared']

        # Split the common spec: a key carrying an 'actor_' / 'critic_' marker is moved
        # to its un-marked name in its own spec and removed from the other spec.
        actor_spec = self.net_spec.copy()
        critic_spec = self.net_spec.copy()
        for key in self.net_spec:
            if 'actor_' in key:
                actor_value = actor_spec.pop(key)
                actor_spec[key.replace('actor_', '')] = actor_value
                critic_spec.pop(key)
            if 'critic_' in key:
                critic_value = critic_spec.pop(key)
                critic_spec[key.replace('critic_', '')] = critic_value
                actor_spec.pop(key)
        if critic_spec['use_same_optim']:
            critic_spec = actor_spec

        state_dim = self.body.state_dim
        actor_out_dim = net_util.get_out_dim(self.body, add_critic=self.shared)
        # main actor network; its output also covers the critic when weights are shared
        actor_cls = getattr(net, actor_spec['type'])
        self.net = actor_cls(actor_spec, state_dim, actor_out_dim)
        self.net_names = ['net']
        if not self.shared:
            # separate critic network with a single output
            critic_cls = getattr(net, critic_spec['type'])
            self.critic_net = critic_cls(critic_spec, state_dim, 1)
            self.net_names.append('critic_net')

        # optimizers and lr schedulers
        self.optim = net_util.get_optim(self.net, self.net.optim_spec)
        self.lr_scheduler = net_util.get_lr_scheduler(
            self.optim, self.net.lr_scheduler_spec)
        if not self.shared:
            self.critic_optim = net_util.get_optim(
                self.critic_net, self.critic_net.optim_spec)
            self.critic_lr_scheduler = net_util.get_lr_scheduler(
                self.critic_optim, self.critic_net.lr_scheduler_spec)
        net_util.set_global_nets(self, global_nets)
        self.post_init_nets()

        # ---- reward model and discriminator ----
        on_gpu = False
        encoder_kind = 'vae'
        self.experience_buffer = deque(maxlen=20)
        # state VAE + one-hot action version
        self.reward_agent = reward_agent.RewardAgent_EncoderSide(
            on_gpu, encoder_kind)
        checkpoint_path = '../irl/NeuralDialog-LAED/logs/2019-09-06T12:04:49.278628-mwoz_gan_vae.py'  # new trained vae-based reward

        validation_feed = reward_utils.WoZGanDataLoaders('val', 16)
        training_feed = reward_utils.WoZGanDataLoaders('train', 16)
        training_feed.epoch_init(shuffle=True)
        self.discriminator = reward_agent.A2C_Discriminator(
            on_gpu, training_feed, 16)
        self.optim_disc = self.discriminator.get_optimizer()

        reward_agent.load_reward_model(
            self.reward_agent, checkpoint_path, on_gpu)
        if on_gpu:
            self.reward_agent.cuda()
        self.reward_agent.eval()
        self.reward_count = 0
        self.batch_count = 0

        reward_utils.reward_validate(self.reward_agent, validation_feed)