Example #1
def train(variant):
    Min_cost = 1000000  # best (lowest) evaluation cost seen so far

    traj = get_traj()  # get data
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params[
        'max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params[
        'max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for the Lyapunov critic

    s_dim = env.observation_space.shape[0]
    print("s_dim is ", s_dim)

    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = CAC(a_dim, s_dim, policy_params)
    # policy.restore("log/CMAPSS/CAC-new-reward-0.01/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            'rewards': [],
            'distance': [],
            'kl_divergence': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point

        start_point = np.random.randint(0, 500000)

        s = traj[start_point, :16]

        # current state, theta, next omega, desired state
        # (layout 16, 1, 4, 16); used for decision making
        s = np.concatenate([[s], [traj[start_point, 17:]]], axis=1)[0]

        env.state = s

        for j in range(start_point + 1, start_point + 1 + max_ep_steps):
            if Render:
                env.render()
            delta = np.zeros(36)
            # ###### NOISE ##############

            noise = np.random.normal(0, 0.01, 16)
            delta[20:] = noise
            # ######## If noisy environment ##########
            # s= s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############

            # noise = s[0:16]*0.01
            # delta[0:16] = noise

            a = policy.choose_action(s + delta)

            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2
            # action = traj[j-1,16]

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            X_, r, done, theta = env.step(action)
            # The new state s_ = current state, next omega, next state
            s_ = np.concatenate([X_, [traj[j, 17:]]], axis=1)[0]
            # s_ = np.concatenate([[s_], [theta]], axis=1)[0]
            # s_ = np.concatenate([X_,[[theta]], [traj[j, 9:]]], axis=1)[0]
            env.state = s_

            # theta_pre=theta
            if training_started:
                global_step += 1

            if j == max_ep_steps - 1 + start_point:
                done = True

            terminal = 1. if done else 0.

            # Store the transition once the previous state _s exists
            # (the pool keeps the state preceding s as well).
            if j > start_point + 2:
                pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_,
                           _s)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['distance'].append(distance)
                current_path['kl_divergence'].append(kl)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['beta'].append(beta)
                current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                # print(training_diagnotic)
                if training_diagnotic is not None:
                    eval_diagnotic = training_evaluation(variant, env, policy)
                    [
                        logger.logkv(key, eval_diagnotic[key])
                        for key in eval_diagnotic.keys()
                    ]
                    training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [
                        string_to_print.extend(
                            [key, ':', str(eval_diagnotic[key]), '|'])
                        for key in eval_diagnotic.keys()
                    ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()
                if training_diagnotic is not None:
                    # eval_diagnotic is only defined when an evaluation ran above
                    cost = eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length']
                    if cost <= Min_cost:
                        Min_cost = cost
                        print("New lowest cost:", Min_cost)
                        policy.save_result(log_path)
                if training_started and global_step % (
                        10 * evaluation_frequency) == 0 and global_step > 0:
                    policy.save_result(log_path)

            # Status Update
            _s = s
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for the Lyapunov critic
                break
    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
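
For orientation, the sketch below shows one way this train() function could be driven. It is a minimal, hypothetical example: only the dictionary keys are taken from the function body above (they match what train() reads from variant); the environment name, hyperparameter values, and log path are illustrative placeholders, and the repository's own helpers (get_traj, get_env_from_name, CAC, Pool, logger) must be importable for it to run.

# Hypothetical driver for the CAC train() above.
# Only the key names come from the function body; all values are placeholders.
variant = {
    'env_name': 'CMAPSS',                # placeholder environment name
    'env_params': {
        'max_episodes': 1000,
        'max_ep_steps': 250,
        'max_global_steps': 500000,
        'eval_render': False,
    },
    'alg_params': {
        'min_memory_size': 1000,
        'steps_per_cycle': 100,
        'train_per_cycle': 50,
        'batch_size': 256,
        'lr_a': 1e-4,
        'lr_c': 3e-4,
        'lr_l': 3e-4,
        'memory_capacity': 1000000,
        'history_horizon': 0,
        'finite_horizon': False,
        'tau': 5e-3,
        'alpha3': 0.1,
    },
    'store_last_n_paths': 10,
    'evaluation_frequency': 2048,
    'log_path': './log/example_run',     # placeholder
}

train(variant)
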
Example #2
def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    evaluation_env = get_env_from_name(env_name)
    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    batch_size = policy_params['batch_size']

    lr_c = policy_params['lr_c']
    cliprange = policy_params['cliprange']
    cliprangenow = cliprange
    lr_c_now = lr_c  # learning rate for critic

    gamma = policy_params['gamma']
    gae_lamda = policy_params['gae_lamda']

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=policy_params['output_format'])
    logger.logkv('safety_threshold', policy_params['safety_threshold'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', batch_size)
    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0]\
                + env.observation_space.spaces['achieved_goal'].shape[0]+ \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)

    # For analyse
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=policy.N)

    for j in range(max_global_steps):

        if global_step > max_global_steps:
            break

        # Minibatch buffers (mb_rewards is kept for symmetry but never populated below)
        mb_obs, mb_obs_, mb_rewards, mb_actions, mb_values, mb_terminals, mb_t = [], [], [], [], [], [], []

        for n in range(policy.N):
            current_path = {
                'rewards': [],
                'obs': [],
                'obs_': [],
                'done': [],
                'value': [],
                't': [],
                'action': [],
            }
            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])
            # Roll out one episode of at most max_ep_steps steps
            for t in range(max_ep_steps):

                # Given the current observation, get the action and value estimate

                [a], [value] = policy.choose_action(s)

                action = np.tanh(a)
                action = a_lowerbound + (action + 1.) * (a_upperbound -
                                                         a_lowerbound) / 2
                # Run in simulator
                s_, r, done, info = env.step(action)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                if t == max_ep_steps - 1:
                    done = True
                terminal = 1. if done else 0.

                if Render:
                    env.render()

                current_path['rewards'].append(r)
                current_path['action'].append(a)
                current_path['obs'].append(s)
                current_path['obs_'].append(s_)
                current_path['done'].append(terminal)
                current_path['value'].append(value)
                current_path['t'].append(t)
                if done:

                    global_step += t + 1
                    last_training_paths.appendleft(current_path)

                    break
                else:
                    s = s_
        # mb_obs = np.asarray(mb_obs, dtype=s.dtype)
        # mb_values = np.asarray(mb_values, dtype=s.dtype)
        # mb_l_values = np.asarray(mb_l_values, dtype=s.dtype)
        # mb_actions = np.asarray(mb_actions, dtype=action.dtype)
        # mb_obs_ = np.asarray(mb_obs_, dtype=s_.dtype)
        # mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        # mb_l_rewards = np.asarray(mb_l_rewards, dtype=np.float32)
        # mb_terminals = np.asarray(mb_terminals, dtype=np.float32)
        # last_value, last_l_value = policy.predict_values([s_])
        rescale = np.mean([len(path) for path in last_training_paths])

        initial_return = []
        mb_advs = []
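        # GAE(lambda): sweep each stored path backwards, bootstrapping the value of the last next state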
        for path in last_training_paths:
            lastgaelam = 0
            path_advs = np.zeros_like(path['rewards'])
            path_values = path['value']
            path_next_values = path['value'][1:]
            path_next_values.append(policy.predict_values(path['obs_'][-1]))
            for t in reversed(range(len(path_values))):

                delta = path['rewards'][t] + gamma * path_next_values[t] * (
                    1 - path['done'][t]) - path_values[t]
                path_advs[t] = lastgaelam = delta + gamma * gae_lamda * (
                    1 - path['done'][t]) * lastgaelam

            path_returns = path_advs + path_values
            initial_return.append(path_returns[0])
            mb_advs.extend(path_advs)
            mb_obs.extend(path['obs'])
            mb_obs_.extend(path['obs_'])
            mb_values.extend(path['value'])
            mb_terminals.extend(path['done'])
            mb_t.extend(path['t'])
            mb_actions.extend(path['action'])

        initial_return = np.asarray(initial_return, dtype=np.float32)
        mb_obs = np.asarray(mb_obs, dtype=s.dtype)
        mb_values = np.asarray(mb_values, dtype=s.dtype)
        mb_actions = np.asarray(mb_actions, dtype=action.dtype)
        mb_obs_ = np.asarray(mb_obs_, dtype=s_.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_terminals = np.asarray(mb_terminals, dtype=np.float32)
        mb_advs = np.asarray(mb_advs, dtype=np.float32)
        mb_t = np.asarray(mb_t, dtype=np.float32)
        mb_returns = mb_advs + mb_values
        mblossvals = []
        inds = np.arange(len(mb_advs), dtype=int)
        initial_return = np.mean(initial_return)
        # Randomize the indexes
        np.random.shuffle(inds)
        # 0 to batch_size with batch_train_size step
        # if sum(current_path['l_rewards'])>0:
        #     policy.ALPHA3 = min(policy.ALPHA3 * 1.5, policy_params['alpha3'])
        # else:
        #     policy.ALPHA3 = min(policy.ALPHA3 * 1.01, policy_params['alpha3'])

        slices = (arr[inds] for arr in (mb_obs, mb_obs_, mb_returns, mb_advs,
                                        mb_actions, mb_values, mb_t))

        # print(**slices)
        mblossvals.append(
            policy.update(*slices, initial_return, cliprangenow, lr_c_now,
                          rescale))

        mblossvals = np.mean(mblossvals, axis=0)
        frac = 1.0 - (global_step - 1.0) / max_global_steps
        cliprangenow = cliprange * frac
        lr_c_now = lr_c * frac  # learning rate for critic
        # lr_l_now = lr_l * frac  # learning rate for critic

        logger.logkv("total_timesteps", global_step)

        training_diagnotic = evaluate_training_rollouts(last_training_paths)

        if training_diagnotic is not None:
            # [training_diagnotics[key].append(training_diagnotic[key]) for key in training_diagnotic.keys()]\
            eval_diagnotic = training_evaluation(variant, evaluation_env,
                                                 policy)
            [
                logger.logkv(key, eval_diagnotic[key])
                for key in eval_diagnotic.keys()
            ]
            training_diagnotic.pop('return')
            [
                logger.logkv(key, training_diagnotic[key])
                for key in training_diagnotic.keys()
            ]
            logger.logkv('lr_c', lr_c_now)
            [
                logger.logkv(name, value)
                for name, value in zip(policy.diagnosis_names, mblossvals)
            ]
            string_to_print = ['time_step:', str(global_step), '|']
            [
                string_to_print.extend(
                    [key, ':', str(eval_diagnotic[key]), '|'])
                for key in eval_diagnotic.keys()
            ]
            [
                string_to_print.extend(
                    [key, ':',
                     str(round(training_diagnotic[key], 2)), '|'])
                for key in training_diagnotic.keys()
            ]
            print(''.join(string_to_print))
        logger.dumpkvs()
        # State update

        # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY

    print('Running time: ', time.time() - t1)
    return
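
The advantage computation inside the loop above is standard GAE(lambda). As a self-contained reference, here is the same recursion written as a stand-alone NumPy function; the name and signature are illustrative and not part of the repository.

import numpy as np

def gae_advantages(rewards, values, dones, bootstrap_value, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation over a single path.

    rewards, values and dones are 1-D arrays of equal length; bootstrap_value
    is the critic's estimate for the state that follows the last step.
    Returns (advantages, value_targets).
    """
    next_values = np.append(values[1:], bootstrap_value)
    advs = np.zeros_like(rewards, dtype=np.float64)
    lastgaelam = 0.0
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_values[t] * nonterminal - values[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advs[t] = lastgaelam
    return advs, advs + values

This mirrors the per-path loop above, where path_returns = path_advs + path_values and the resulting targets are fed to policy.update.
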
Example #3
def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['num_of_training_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']
    policy_params['network_structure'] = env_params['network_structure']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for the Lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0]\
                + env.observation_space.spaces['achieved_goal'].shape[0]+ \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = LAC(a_dim, s_dim, policy_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            'rewards': [],
            'a_loss': [],
            'alpha': [],
            'lambda': [],
            'lyapunov_error': [],
            'entropy': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s)
            # a = a*0
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2

            # Run in simulator
            disturbance_input = np.zeros([a_dim + s_dim])  # unused in this example

            s_, r, done, info = env.step(action)

            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1:
                done = True

            terminal = 1. if done else 0.
            pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnotic is not None:
                    if variant['num_of_evaluation_paths'] > 0:
                        eval_diagnotic = training_evaluation(
                            variant, env, policy)
                        [
                            logger.logkv(key, eval_diagnotic[key])
                            for key in eval_diagnotic.keys()
                        ]
                        training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    if variant['num_of_evaluation_paths'] > 0:
                        [
                            string_to_print.extend(
                                [key, ':',
                                 str(eval_diagnotic[key]), '|'])
                            for key in eval_diagnotic.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()
            # State update
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for the Lyapunov critic

                break
    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
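
All of these examples map the policy's squashed action from [-1, 1] onto the environment's action box with the same affine transform. A small sketch of that mapping (the helper name is illustrative):

import numpy as np

def rescale_action(a, low, high):
    """Map an action in [-1, 1] elementwise onto the box [low, high]."""
    return low + (a + 1.0) * (high - low) / 2.0

# For example, with low = [-2, 0] and high = [2, 1], an action of 0 lands at
# the midpoint of each dimension: rescale_action(np.zeros(2), low, high) -> [0.0, 0.5]
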
Example #4
def train(variant):
    env_name = variant["env_name"]
    env = get_env_from_name(env_name)

    env_params = variant["env_params"]

    max_episodes = env_params["max_episodes"]
    max_ep_steps = env_params["max_ep_steps"]
    max_global_steps = env_params["max_global_steps"]
    store_last_n_paths = variant["num_of_training_paths"]
    evaluation_frequency = variant["evaluation_frequency"]

    policy_params = variant["alg_params"]
    policy_params["network_structure"] = env_params["network_structure"]

    min_memory_size = policy_params["min_memory_size"]
    steps_per_cycle = policy_params["steps_per_cycle"]
    train_per_cycle = policy_params["train_per_cycle"]
    batch_size = policy_params["batch_size"]

    lr_a, lr_c, lr_l = (
        policy_params["lr_a"],
        policy_params["lr_c"],
        policy_params["lr_l"],
    )
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for the Lyapunov critic

    if "Fetch" in env_name or "Hand" in env_name:
        s_dim = (env.observation_space.spaces["observation"].shape[0] +
                 env.observation_space.spaces["achieved_goal"].shape[0] +
                 env.observation_space.spaces["desired_goal"].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = LAC(a_dim, s_dim, policy_params)

    pool_params = {
        "s_dim": s_dim,
        "a_dim": a_dim,
        "d_dim": 1,
        "store_last_n_paths": store_last_n_paths,
        "memory_capacity": policy_params["memory_capacity"],
        "min_memory_size": policy_params["min_memory_size"],
        "history_horizon": policy_params["history_horizon"],
        "finite_horizon": policy_params["finite_horizon"],
    }
    if "value_horizon" in policy_params.keys():
        pool_params.update({"value_horizon": policy_params["value_horizon"]})
    else:
        pool_params["value_horizon"] = None
    pool = Pool(pool_params)
    # For analysis
    Render = env_params["eval_render"]

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant["log_path"]
    logger.configure(dir=log_path, format_strs=["csv"])
    logger.logkv("tau", policy_params["tau"])

    logger.logkv("alpha3", policy_params["alpha3"])
    logger.logkv("batch_size", policy_params["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        if "Fetch" in env_name or "Hand" in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()

            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound -
                                                 a_lowerbound) / 2
            # action = a

            # Run in simulator
            disturbance_input = np.zeros([a_dim + s_dim])  # unused in this example

            s_, r, done, info = env.step(action)

            if "Fetch" in env_name or "Hand" in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info["done"] > 0:
                    done = True

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1:
                done = True

            terminal = 1.0 if done else 0.0
            pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if (pool.memory_pointer > min_memory_size
                    and global_step % steps_per_cycle == 0):
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a, batch)

            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            if (training_started and global_step % evaluation_frequency == 0
                    and global_step > 0):

                logger.logkv("total_timesteps", global_step)

                training_diagnostics = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnostics is not None:
                    if variant["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(
                            variant, env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                        training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_c", lr_c_now)
                    logger.logkv("lr_l", lr_l_now)

                    string_to_print = ["time_step:", str(global_step), "|"]
                    if variant["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":",
                                 str(eval_diagnostics[key]), "|"])
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ":",
                            str(round(training_diagnostics[key], 2)), "|"
                        ]) for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))

                logger.dumpkvs()
            # State update
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for the Lyapunov critic

                break
    policy.save_result(log_path)

    print("Running time: ", time.time() - t1)
    return
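
Finally, every example anneals its learning rates (and Example #2 its clip range) with the same linear schedule at the end of each episode. A minimal sketch with illustrative numbers:

def linear_decay(initial_value, global_step, max_global_steps):
    """Linear annealing as used above: full value near step 1,
    approaching zero as global_step approaches max_global_steps."""
    frac = 1.0 - (global_step - 1.0) / max_global_steps
    return initial_value * frac

# e.g. with lr_a = 1e-4 and max_global_steps = 500000:
#   linear_decay(1e-4, 1, 500000)      -> 1e-4
#   linear_decay(1e-4, 250001, 500000) -> 5e-05  (halfway through training)
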