Example #1
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], []
    for i in range(num_agent):
        policy = SoftmaxMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
        qf1 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim *
                                     (num_agent - 1)),
                         output_size=action_dim,
                         **variant['qf_kwargs'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim *
                                     (num_agent - 1)),
                         output_size=action_dim,
                         **variant['qf_kwargs'])
        target_qf2 = copy.deepcopy(qf2)
        eval_policy = ArgmaxDiscretePolicy(policy)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)
    trainer = MASACDiscreteTrainer(env=expl_env,
                                   qf1_n=qf1_n,
                                   target_qf1_n=target_qf1_n,
                                   qf2_n=qf2_n,
                                   target_qf2_n=target_qf2_n,
                                   policy_n=policy_n,
                                   **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
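Note: the experiment() functions in these examples read every hyperparameter from a variant dict supplied by a launcher script that is not shown. Below is a hypothetical sketch of such a dict for Example #1; only the key names are implied by the code above, and all values are illustrative placeholders rather than the original settings.

# Hypothetical configuration for experiment(); values are placeholders.
variant = dict(
    num_agent=2,
    replay_buffer_size=int(1e6),
    policy_kwargs=dict(hidden_sizes=[64, 64]),
    qf_kwargs=dict(hidden_sizes=[64, 64]),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        num_epochs=100,
        batch_size=256,
        max_path_length=200,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
    ),
)
# experiment(variant)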
Example #2
    def step(self, action):
        self.state = self.state + np.asarray(action)
        env = CartPoleEnv(self.state[0], self.state[1], self.state[2],
                          self.state[3], self.state[4])

        episode_count = len(self.action_record)
        model_diff = 0
        for i in range(episode_count):
            ob = env.reset()
            traj_state = []
            for j in range(len(self.action_record[i])):
                # Handling trajectories that terminate (done) early is the tricky part here
                action = self.action_record[i][j]
                ob, reward, done, _ = env.step(action)
                traj_state.append(ob)
                if done:
                    break
            if not done:
                model_diff = model_diff + 1  # penalty for not done
            model_diff = model_diff + self._traj_diff(np.asarray(traj_state),
                                                      self.state_record[i])
        reward = -model_diff - self.status
        self.status = -model_diff
        done = False
        return np.array(self.state), reward, done, {}
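The step() method above calls a _traj_diff helper that is not included in the snippet. A minimal sketch of one plausible implementation, assuming it measures the mean squared deviation between the replayed trajectory and the recorded one over their common prefix:

    def _traj_diff(self, traj_state, recorded_state):
        # Hedged sketch: compare only the overlapping prefix, since the
        # replayed rollout may terminate earlier than the recorded one.
        recorded_state = np.asarray(recorded_state)
        n = min(len(traj_state), len(recorded_state))
        if n == 0:
            return 0.0
        return float(np.mean((traj_state[:n] - recorded_state[:n]) ** 2))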
Example #3
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, eval_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], [], []
    for i in range(num_agent):
        policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                    action_dim=action_dim,
                                    **variant['policy_kwargs'])
        eval_policy = MakeDeterministic(policy)
        qf1 = FlattenMlp(input_size=(obs_dim * num_agent +
                                     action_dim * num_agent),
                         output_size=1,
                         **variant['qf_kwargs'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(input_size=(obs_dim * num_agent +
                                     action_dim * num_agent),
                         output_size=1,
                         **variant['qf_kwargs'])
        target_qf2 = copy.deepcopy(qf2)
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)
    trainer = MASACTrainer(env=expl_env,
                           qf1_n=qf1_n,
                           target_qf1_n=target_qf1_n,
                           qf2_n=qf2_n,
                           target_qf2_n=target_qf2_n,
                           policy_n=policy_n,
                           **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #4
def looping(qt=None, epsilon=config.epsilon, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    if (qt is None):
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        end = False
        epsilon = epsilon * 0.9999
        while not end:
            current_state = cart.state
            action = choose_action(current_state, qt, epsilon)
            new_state, reward, end, _ = cart.step(action)
            if end:
                reward = -10
            update_qt_new(qt, current_state, reward, action, new_state)
            turn += 1
            if (visu):
                cart.render()
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", epsilon)
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if ((episode + 1) % 100 == 0 and input("continue (y/n)" != "y")):
        #     break
    cart.close()
    return (data, qt)
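looping() depends on initialize_Qtable, choose_action, update_qt_new, and graph helpers defined elsewhere in the project. A hedged epsilon-greedy sketch of choose_action follows; it assumes the Q-table maps a hashable discretized state to an array of per-action values and that a discretize_state helper exists, neither of which is confirmed by the snippet.

def choose_action(state, qt, epsilon, n_actions=2):
    # Hypothetical epsilon-greedy lookup over a tabular Q-function.
    key = tuple(discretize_state(state))  # assumed helper
    if np.random.random() < epsilon:
        return np.random.randint(n_actions)
    return int(np.argmax(qt[key]))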
Example #5
 def __init__(self):
     self.plot_data = PlotData()
     self.env = CartPoleEnv()
     self.main_net = DQN()
     self.target_net = deepcopy(self.main_net)
     self.epsilon = config.epsilon
     self.eps_decay = 0.995
     self.visu = False
     self.visu_update = False  # 300
     self.visu_window = 5
     self.memory = Memory(memory_size = 30)
     self.batch_size = 5
Example #6
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    qf1 = Mlp(input_size=obs_dim,
              output_size=action_dim,
              **variant['qf_kwargs'])
    target_qf1 = copy.deepcopy(qf1)
    qf2 = Mlp(input_size=obs_dim,
              output_size=action_dim,
              **variant['qf_kwargs'])
    target_qf2 = copy.deepcopy(qf2)

    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = SACDiscreteTrainer(env=eval_env,
                                 policy=policy,
                                 qf1=qf1,
                                 qf2=qf2,
                                 target_qf1=target_qf1,
                                 target_qf2=target_qf2,
                                 qf_criterion=qf_criterion,
                                 **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #7
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    from rlkit.envs.ma_wrappers import MAProbDiscreteEnv
    expl_env = MAProbDiscreteEnv(CartPoleEnv(mode=4))
    eval_env = MAProbDiscreteEnv(CartPoleEnv(mode=4))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, exploration_policy_n = \
        [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(input_size=(obs_dim * num_agent +
                                    action_dim * num_agent),
                        output_size=1,
                        **variant['qf_kwargs'])
        policy = SoftmaxMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        exploration_policy = policy
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        exploration_policy_n.append(exploration_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, exploration_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #8
def experiment(variant):
    from cartpole import CartPoleEnv
    from rlkit.envs.wrappers import ProbDiscreteEnv
    expl_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    eval_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    # import gym
    # from rlkit.envs.wrappers import ProbDiscreteEnv
    # expl_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # eval_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # remove this since need action to be a prob
    # exploration_policy = PolicyWrappedWithExplorationStrategy(
    #     exploration_strategy=OUStrategy(action_space=expl_env.action_space),
    #     policy=policy,
    # )
    exploration_policy = policy
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #9
 def __init__(self):
     self.plot_data = PlotData()
     self.cart = CartPoleEnv()
     self.cart.reset()
     self.predi_net = DQN()
     self.updat_net = deepcopy(self.predi_net)
     self.turn = 0
     self.episode = 0
     self.epsilon = config.epsilon
     self.eps_decay = 0.99
     self.visu = False
     self.visu_update = False  #300
     self.visu_window = 5
     self.consecutive_wins = 0
     self.best_consecutive_wins = 0
     self.last_save = 0
     self.memory = []
Example #10
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # import gym
    # expl_env = gym.make('CartPole-v0')
    # eval_env = gym.make('CartPole-v0')
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.n

    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = VPGTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #11
def resample_task():
    task_list = [
        CartPoleEnv(np.random.uniform(L_MIN, L_MAX))
        for task in range(TASK_NUMS)
    ]
    task_lengths = [task.length for task in task_list]
    print(("task length:", task_lengths))
    [task.reset() for task in task_list]
    return task_list
Example #12
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(input_size=obs_dim,
             output_size=action_dim,
             **variant['qf_kwargs'])
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    replay_buffer = PrioritizedReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    qf_criterion = nn.MSELoss()
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         replay_buffer=replay_buffer,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #13
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #14
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=0)
    eval_env = CartPoleEnv(mode=0)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = MakeDeterministic(policy)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = PPOTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #15
def loop(qt=None, epsilon=1, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    config.epsilon = epsilon
    if (qt is None):
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        s = cart.state
        end = False
        epsilon_tmp = config.epsilon
        while not end:
            config.epsilon *= 0.97
            if (visu):
                cart.render()
            a = choose_action(s, qt)
            _, _, end, _ = cart.step(a)
            l_val = bellman_q(s, qt, dummy_cart(s), action=a)
            # print(l_val)
            update_qt(qt, s, a, l_val)
            s = cart.state
            turn += 1
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:",
              config.epsilon)
        config.epsilon = epsilon_tmp
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if ((episode + 1) % 100 == 0 and input("continue (y/n)" != "y")):
        #     break
    cart.close()
    return (data, qt)
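loop() bootstraps its learning targets through bellman_q, using the dummy_cart helper from Example #17 to simulate an action from the current state. The real implementation is not shown; below is a one-step-lookahead sketch under those assumptions (the discretize_state helper and Q-table layout are also assumptions).

def bellman_q(s, qt, sim_cart, action, gamma=0.99):
    # Hedged sketch: simulate `action` on a throwaway cart pinned to state s
    # and bootstrap from the best Q-value of the resulting state. The real
    # helper may additionally blend in the current estimate for (s, action).
    next_state, reward, done, _ = sim_cart.step(action)
    key = tuple(discretize_state(next_state))  # assumed helper
    return reward if done else reward + gamma * np.max(qt[key])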
Example #16
def main():
    # Define dimensions of the networks
    
    meta_value_input_dim =  STATE_DIM + TASK_CONFIG_DIM # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1 # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size = meta_value_input_dim,hidden_size = 80,output_size = 1)
    task_config_network = TaskConfigNetwork(input_size = task_config_input_dim,hidden_size = 30,num_layers = 1,output_size = 3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")

    meta_value_network_optim = torch.optim.Adam(meta_value_network.parameters(),lr=0.001)
    task_config_network_optim = torch.optim.Adam(task_config_network.parameters(),lr=0.001)

    # init a task generator for data fetching
    task_list = [CartPoleEnv(np.random.uniform(L_MIN,L_MAX)) for task in range(TASK_NUMS)]
    [task.reset() for task in task_list]

    task_lengths = [task.length for task in task_list]
    print("task length:",task_lengths)

    for episode in range(EPISODE):
        # ----------------- Training ------------------

        if (episode + 1) % 10 == 0:
            # renew the tasks
            task_list = [CartPoleEnv(np.random.uniform(L_MIN,L_MAX)) for task in range(TASK_NUMS)]
            task_lengths = [task.length for task in task_list]
            print("task length:",task_lengths)
            [task.reset() for task in task_list]

        # fetch pre data samples for task config network
        # [task_nums,sample_nums,x+y`]
        
        actor_network_list = [ActorNetwork(STATE_DIM,40,ACTION_DIM) for i in range(TASK_NUMS)]
        [actor_network.cuda() for actor_network in actor_network_list]
        actor_network_optim_list = [torch.optim.Adam(actor_network.parameters(),lr = 0.01) for actor_network in actor_network_list]

        # sample pre state,action,reward for task config
        pre_states = []
        pre_actions = []
        pre_rewards = []
        for i in range(TASK_NUMS):
            states,actions,rewards,_,_ = roll_out(actor_network_list[i],task_list[i],SAMPLE_NUMS)
            pre_states.append(states)
            pre_actions.append(actions)
            pre_rewards.append(rewards)


        for step in range(STEP):

            for i in range(TASK_NUMS):
                # init task config [1, sample_nums,task_config] task_config size=3
                pre_data_samples = torch.cat((pre_states[i][-9:],pre_actions[i][-9:],torch.Tensor(pre_rewards[i])[-9:]),1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda()) # [1,3]

                states,actions,rewards,is_done,final_state = roll_out(actor_network_list[i],task_list[i],SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(),task_config.detach()),1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]

                # train actor network
                actor_network_optim_list[i].zero_grad()
                states_var = Variable(states).cuda()
                
                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1,len(rewards)).view(-1,3)
                log_softmax_actions = actor_network_list[i](states_var)
                vs = meta_value_network(torch.cat((states_var,task_configs.detach()),1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards,0.99,final_r))).cuda()

                advantages = qs - vs
                actor_network_loss = - torch.mean(torch.sum(log_softmax_actions*actions_var,1)* advantages) #+ entropy #+ actor_criterion(actor_y_samples,target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network_list[i].parameters(),0.5)

                actor_network_optim_list[i].step()

                # train value network

                meta_value_network_optim.zero_grad()

                target_values = qs
                values = meta_value_network(torch.cat((states_var,task_configs),1))
                criterion = nn.MSELoss()
                meta_value_network_loss = criterion(values,target_values)
                meta_value_network_loss.backward()
                torch.nn.utils.clip_grad_norm(meta_value_network.parameters(),0.5)

                meta_value_network_optim.step()                
                
                # train actor network
                
                pre_states[i] = states
                pre_actions[i] = actions
                pre_rewards[i] = rewards

                if (step + 1) % 100 == 0:
                    result = 0
                    test_task = CartPoleEnv(length = task_list[i].length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network_list[i](Variable(torch.Tensor([state])).cuda()))
                            #print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state,reward,done,_ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    print("episode:",episode,"task:",i,"step:",step+1,"test result:",result/10.0)

        
        if (episode+1) % 10 == 0 :
            # Save meta value network
            torch.save(meta_value_network.state_dict(),"meta_value_network_cartpole.pkl")
            torch.save(task_config_network.state_dict(),"task_config_network_cartpole.pkl")
            print("save networks for episode:",episode)
Example #17
def dummy_cart(s, cart=None):
    if cart is None:
        cart = CartPoleEnv()
    cart.reset()
    cart.state = s
    return cart
Example #18
            self.epsilon *= self.decay
        if np.random.random() <= self.epsilon:
            return self.env.action_space.sample(), 10
        else:
            return np.argmax(self.model.predict(state)[0][0:2]), np.argmax(
                self.model.predict(state)[0][2:5])

    def save_model(self, filename):
        self.model.save_weights(filename)

    def load_model(self, filename):
        self.model.load_weights(filename)


if __name__ == '__main__':
    env = CartPoleEnv()
    agent = DQN(24, 24, env)
    # agent.load_model('bot.h5')

    episode_data = []
    score_data = []
    episode_data_ = []
    score_data_ = []

    # Learning
    for episode in range(5000):
        state = env.reset()
        state = np.reshape(state, [1, 4])  # reshape from [a, b, c, d] to [[a, b, c, d]]

        for t in range(1000):
            action, force = agent.act(state)
Example #19
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_obs = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    obs_gnn_1 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )

    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    if variant['concat_emb']:
        gnn_out_dim = int(obs_dim + variant['graph_kwargs']['node_dim'] *
                          variant['graph_kwargs']['num_conv_layers'])
    else:
        gnn_out_dim = variant['graph_kwargs']['node_dim']
    from rlkit.torch.networks.networks import FlattenMlp
    post_mlp1 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet2 import R2GQNet
    qf1 = R2GQNet(
        obs_gnn=obs_gnn_1,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp1,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf1 = copy.deepcopy(qf1)

    obs_gnn_2 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    post_mlp2 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] *
        (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    qf2 = R2GQNet(
        obs_gnn=obs_gnn_2,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp2,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.networks import FlattenMlp
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] *
            (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ), nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    graph_builder_policy = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] *
                (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space,
                                                   prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy

        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env,
                                             eval_policy_n,
                                             shared_encoder=obs_gnn_1)
    expl_path_collector = MAMdpPathCollector(expl_env,
                                             expl_policy_n,
                                             shared_encoder=obs_gnn_1)

    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'],
                                      expl_env,
                                      num_agent=num_agent)

    from rlkit.torch.r2g.r2g_gnn12 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor=cactor,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])

    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
Example #20
        #print('current_discrete', current_discrete)#, next_discrete, action)
        discrete_states[action, current_discrete, next_discrete] += 1
        if terminal:
            state = env.reset(initial_state)
            episodes += 1
            step = 0


if __name__ == '__main__':
    args = parser().parse_args()
    no_intervals = args.no_intervals
    episodes = args.episodes
    
    ENV_NAME = "CartPole-v1"
    #env = gym.make(ENV_NAME)
    env = CartPoleEnv()
    action_space = env.action_space.n
    #env.seed(np.random.randint(0, 10000))
    
    no_discrete_states = no_intervals * no_intervals * no_intervals * no_intervals
    discrete_states = np.zeros((action_space, no_discrete_states, no_discrete_states), dtype='uint32')
    
    step = 2.3 / (no_intervals - 1)
    for x_bound in np.arange(-1.1, 1.2 + step, step):
        x = random.uniform(x_bound - step, x_bound)
        for v_bound in np.arange(-1.1, 1.2 + step, step):
            v = random.uniform(v_bound - step, v_bound)
            for a_bound in np.arange(-1.1, 1.2 + step, step):
                a = random.uniform(a_bound - step, a_bound)
                for v_a_bound in np.arange(-1.1, 1.2 + step, step):
                    for action in range(action_space):
Example #21
from cartpole import CartPoleEnv
import numpy as np
cart = CartPoleEnv()
cart.reset()

for _ in range(1000):

    # Calculate the Gradients

    # Update Thetas

    # Sample u trajectory

    # Apply u[0] to the actual system
    cart.step(10)  # Apply Some force

    # Update the New State in the Learner

    # Shift the Thetas

    # Simulate
    cart.render()

cart.close()
Example #22
#!/usr/bin/env python
# coding: utf-8

# In[76]:

from cartpole import CartPoleEnv
import numpy as np
import random
import matplotlib.pyplot as plt

env = CartPoleEnv()
env.reset()


def discretize(val, bounds, n_states):
    discrete_val = 0
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(
            round((n_states - 1) * ((val - bounds[0]) /
                                    (bounds[1] - bounds[0]))))
    return discrete_val


def discretize_state(vals, s_bounds, n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=int)
Example #23
#!/usr/bin/env python
# coding: utf-8

# In[2]:

from cartpole import CartPoleEnv
import math
import numpy as np

env = CartPoleEnv()
env.reset()


def discretize(val, bounds, n_states):
    discrete_val = 0
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(
            round((n_states - 1) * ((val - bounds[0]) /
                                    (bounds[1] - bounds[0]))))
    return discrete_val


def discretize_state(vals, s_bounds, n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=int)
Example #24
from cartpole import CartPoleEnv

env = CartPoleEnv(length=1.0)

env.reset()

for step in range(1000):
    action = 0
    next_state, reward, done, _ = env.step(0)

    if done:
        print "done reward:", reward
        break
Example #25
from cartpole import CartPoleEnv
import gym
import numpy as np

def choose_action(state):
    action = 0
    if state[2] > 0:
        action = 0
    else:
        action = 1
    return action

if __name__ == "__main__":
    cart = CartPoleEnv()
    cart.reset()
    action = 0
    # while True:
    #     cart.render()
    #     state, reward, end, thing = cart.step(action)
    #     print(state)
    #     if end:
    #         cart.reset()
    #     else:
    #         action = choose_action(state)
    # cart.close()

Example #26
from cartpole import CartPoleEnv
import math
import numpy as np

env = CartPoleEnv()
env.reset()


def discretize(val, bounds, n_states):
    discrete_val = 0
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(
            round((n_states - 1) * ((val - bounds[0]) /
                                    (bounds[1] - bounds[0]))))
    return discrete_val


def discretize_state(vals, s_bounds, n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=int)


# position, velocity, angle, angular velocity
n_s = np.array([10, 10, 10, 10])
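The snippet ends after declaring the number of discrete bins per observation dimension. A short hedged usage sketch follows, with illustrative state bounds (the real bounds are project-specific assumptions here):

# Illustrative per-dimension bounds: cart position, cart velocity,
# pole angle, pole angular velocity.
s_bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.0, 3.0)]
obs = env.reset()
print(discretize_state(obs, s_bounds, n_s))  # indices in [0, n_s[i] - 1]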
Example #27
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    from rlkit.envs.ma_wrappers import MAProbDiscreteEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf_n, cactor_n, policy_n, target_qf_n, target_cactor_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim*num_agent+action_dim*num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        cactor = GumbelSoftmaxMlpPolicy(
            input_size=(obs_dim*num_agent+action_dim*(num_agent-1)),
            output_size=action_dim,
            **variant['cactor_kwargs']
        )
        policy = GumbelSoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        target_qf = copy.deepcopy(qf)
        target_cactor = copy.deepcopy(cactor)
        target_policy = copy.deepcopy(policy)
        eval_policy = ArgmaxDiscretePolicy(policy,use_preactivation=True)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        qf_n.append(qf)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_cactor_n.append(target_cactor)
        target_policy_n.append(target_policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = PRGTrainer(
        env=expl_env,
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        cactor_n=cactor_n,
        target_cactor_n=target_cactor_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #28
def experiment(variant):
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=1))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=1))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    vf1 = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    vf2 = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_vf1 = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_vf2 = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        return_raw_action=True,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        store_raw_action=True,
    )
    trainer = FlowQTrainer(env=eval_env,
                           policy=policy,
                           vf1=vf1,
                           vf2=vf2,
                           target_vf1=target_vf1,
                           target_vf2=target_vf2,
                           **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #29
# -*- coding: utf-8 -*-
"""Untitled0.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1lky0vjWP1y9GVXlg3VUukjP5nR9ry3SQ
"""

from cartpole import CartPoleEnv
import math
import numpy as np

env = CartPoleEnv()
env.reset()

def discretize(val,bounds,n_states):
    discrete_val = 0
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states-1
    else:
        discrete_val = int(round((n_states-1)*((val-bounds[0])/(bounds[1]-bounds[0]))))
    return discrete_val

def discretize_state(vals,s_bounds,n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i],s_bounds[i],n_s[i]))
    return np.array(discrete_vals,dtype=int)
Example #30
def main():
    # Define dimensions of the networks

    meta_value_input_dim =  STATE_DIM + TASK_CONFIG_DIM # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1 # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size = meta_value_input_dim,hidden_size = 80,output_size = 1)
    task_config_network = TaskConfigNetwork(input_size = task_config_input_dim,hidden_size = 30,num_layers = 1,output_size = 3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")


    task_lengths = np.linspace(L_MIN,L_MAX,TASK_NUMS)

    datas = []

    for task_length in task_lengths:

        data_i = {}
        data_i["task_length"] = task_length

        data_i_episode = {}
        for episode in range(EPISODE):
            task = CartPoleEnv(length = task_length)
            task.reset()

            data_i_episode["episode"] = episode

            # ----------------- Training ------------------

            # fetch pre data samples for task config network
            # [task_nums,sample_nums,x+y`]

            actor_network = ActorNetwork(STATE_DIM,40,ACTION_DIM)
            actor_network.cuda()
            actor_network_optim = torch.optim.Adam(actor_network.parameters(),lr = 0.01)
            '''
            if os.path.exists("actor_network.pkl"):
                actor_network.load_state_dict(torch.load("actor_network.pkl"))
                print("load actor_network success")
            '''
            # sample pre state,action,reward for task confi


            pre_states,pre_actions,pre_rewards,_,_ = roll_out(actor_network,task,SAMPLE_NUMS)


            test_results = []
            train_games = []
            for step in range(STEP):

                # init task config [1, sample_nums,task_config] task_config size=3
                pre_data_samples = torch.cat((pre_states[-9:],pre_actions[-9:],torch.Tensor(pre_rewards)[-9:]),1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda()) # [1,3]

                states,actions,rewards,is_done,final_state = roll_out(actor_network,task,SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(),task_config.detach()),1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]
                # train actor network
                actor_network_optim.zero_grad()
                states_var = Variable(states).cuda()

                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1,len(rewards)).view(-1,3)
                log_softmax_actions = actor_network(states_var)
                vs = meta_value_network(torch.cat((states_var,task_configs.detach()),1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards,0.99,final_r))).cuda()

                advantages = qs - vs
                actor_network_loss = - torch.mean(torch.sum(log_softmax_actions*actions_var,1)* advantages) #+ entropy #+ actor_criterion(actor_y_samples,target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network.parameters(),0.5)

                actor_network_optim.step()

                pre_states = states
                pre_actions = actions
                pre_rewards = rewards

                # testing
                if (step + 1) % 10 == 0:
                    # testing
                    result = 0
                    test_task = CartPoleEnv(length = task.length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network(Variable(torch.Tensor([state])).cuda()))
                            #print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state,reward,done,_ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    aver_result = result/10.0
                    test_results.append(aver_result)
                    train_games.append(task.episodes)
                    print("task length:",task_length,"episode:",episode,"step:",step+1,"result:",aver_result)

            data_i_episode["test_results"] = test_results
            data_i_episode["train_games"] = train_games
        data_i["results"] = data_i_episode
        datas.append(data_i)

    save_to_json('mvn_cartpole_test_100.json', datas)
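save_to_json is another project helper not shown in the snippet; a minimal sketch consistent with the call above, assuming the collected results list is JSON-serializable:

import json

def save_to_json(filename, data):
    # Hypothetical helper: dump the collected test results to disk as JSON.
    with open(filename, 'w') as f:
        json.dump(data, f)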