Esempio n. 1
0
def train(variant):
    """Run the off-policy training loop described by ``variant``.

    Builds the environment and the policy named in ``variant``, then
    collects transitions episode by episode.  Once the policy's memory
    pointer exceeds ``min_memory_size`` the policy is trained
    ``train_per_cycle`` times whenever ``global_step`` is a multiple of
    ``steps_per_cycle``.  Every ``evaluation_frequency`` training steps the
    training (and optionally evaluation) diagnostics are logged through
    ``logger`` and printed.  All visited next-states are collected and
    written to ``data_all.mat`` once, when the loop exits.

    Args:
        variant: Experiment configuration dict.  Keys read here:
            ``env_name``, ``evaluate``, ``env_params`` (``max_episodes``,
            ``max_ep_steps``, ``max_global_steps``, ``eval_render``),
            ``store_last_n_paths``, ``evaluation_frequency``,
            ``num_of_paths``, ``algorithm_name``, ``alg_params``
            (``min_memory_size``, ``steps_per_cycle``, ``train_per_cycle``,
            ``lr_a``, ``lr_c``, ``lr_l``, ``tau``, ``alpha3``,
            ``batch_size``) and ``log_path``.

    Returns:
        None.  Results are persisted through ``policy.save_result``,
        ``logger`` and ``data_all.mat``.
    """
    # Next-states visited during training.  They are dumped to
    # 'data_all.mat' a single time in the ``finally`` clause below instead
    # of rewriting the whole (growing) list on every environment step.
    s_save = []
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']

    lr_a, lr_c, lr_l = (policy_params['lr_a'], policy_params['lr_c'],
                        policy_params['lr_l'])
    lr_a_now = lr_a  # current learning rate for the actor
    lr_c_now = lr_c  # current learning rate for the critic
    # Third rate handed to policy.learn (presumably the Lyapunov critic's
    # learning rate -- confirm against the policy implementation).
    lr_l_now = lr_l

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])

    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)

    # Bookkeeping kept for analysis: exponentially weighted moving averages
    # of the episode length and the episode reward.
    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False
    try:
        for i in range(max_episodes):
            if global_step > max_global_steps:
                break

            ep_reward = 0

            current_path = {
                'rewards': [],
                'l_rewards': [],
                'violation': [],
            }
            for key in policy.diag_names:
                current_path[key] = []

            s = env.reset()

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s)
                # Affine map sending a == -1 to the lower action bound and
                # a == 1 to the upper action bound.
                action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                    a_lowerbound) / 2

                # Run in simulator
                s_, r, done, info = env.step(action)
                # Only steps taken after learning has begun are counted.
                if training_started:
                    global_step += 1
                l_r = info['l_rewards']
                # Force termination on the last allowed step.
                if j == max_ep_steps - 1:
                    done = True
                terminal = 1. if done else 0.

                violation_of_constraint = info['violation_of_constraint']
                # Store s, a, s_next and the rewards for later learning.
                policy.store_transition(s, a, r, l_r, terminal, s_)
                s_save.append(s_)

                # If the safety check flags this transition, also store it
                # in the policy's edge memory.
                if policy.use_lyapunov is True and judge_safety_func(
                        s_, r, done, info):
                    policy.store_edge_transition(s, a, r, l_r, terminal, s_)

                # Learn
                if policy.pointer > min_memory_size and global_step % steps_per_cycle == 0:
                    training_started = True
                    for _ in range(train_per_cycle):
                        train_diagnotic = policy.learn(lr_a_now, lr_c_now,
                                                       lr_l_now)

                if training_started:
                    current_path['rewards'].append(r)
                    current_path['l_rewards'].append(l_r)
                    current_path['violation'].append(violation_of_constraint)
                    # The most recent learn() output is recorded on every
                    # step, including steps on which no learning happened.
                    for key, value in zip(policy.diag_names, train_diagnotic):
                        current_path[key].append(value)

                if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                    if evaluation_env is not None:
                        rollouts = get_evaluation_rollouts(policy,
                                                           evaluation_env,
                                                           num_of_paths,
                                                           max_ep_steps,
                                                           render=Render)
                        diagnotic = evaluate_rollouts(rollouts)
                        print(
                            'training_step:',
                            global_step,
                            'average eval reward:',
                            diagnotic['return-average'],
                            'average eval lreward:',
                            diagnotic['lreturn-average'],
                            'average eval violations:',
                            diagnotic['violation-avg'],
                            'average length:',
                            diagnotic['episode-length-avg'],
                        )
                        logger.logkv('eval_eprewmean',
                                     diagnotic['return-average'])
                        logger.logkv('eval_eplrewmean',
                                     diagnotic['lreturn-average'])
                        logger.logkv('eval_eplenmean',
                                     diagnotic['episode-length-avg'])
                        logger.logkv('eval_violation_times',
                                     diagnotic['violation-avg'])
                    logger.logkv("total_timesteps", global_step)

                    training_diagnotic = evaluate_training_rollouts(
                        last_training_paths)
                    if training_diagnotic is not None:
                        logger.logkv('eprewmean',
                                     training_diagnotic['rewards'])
                        logger.logkv('eplrewmean',
                                     training_diagnotic['l_rewards'])
                        logger.logkv('eplenmean', training_diagnotic['len'])
                        logger.logkv('end_cost',
                                     training_diagnotic['end_cost'])
                        for key in policy.diag_names:
                            logger.logkv(key, training_diagnotic[key])

                        logger.logkv('violation_times',
                                     training_diagnotic['violation'])
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)

                        print(
                            'training_step:',
                            global_step,
                            'average reward:',
                            round(training_diagnotic['rewards'], 2),
                            'average lreward:',
                            round(training_diagnotic['l_rewards'], 2),
                            'average violations:',
                            training_diagnotic['violation'],
                            'end cost:',
                            round(training_diagnotic['end_cost'], 2),
                            'average length:',
                            round(training_diagnotic['len'], 1),
                            'lyapunov error:',
                            round(training_diagnotic['lyapunov_error'], 6),
                            'critic1 error:',
                            round(training_diagnotic['critic1_error'], 6),
                            'critic2 error:',
                            round(training_diagnotic['critic2_error'], 6),
                            'policy_loss:',
                            round(training_diagnotic['policy_loss'], 6),
                            'alpha:',
                            round(training_diagnotic['alpha'], 6),
                            'lambda:',
                            round(training_diagnotic['labda'], 6),
                            'entropy:',
                            round(training_diagnotic['entropy'], 6),
                        )
                    logger.dumpkvs()
                # State update
                s = s_
                ep_reward += r

                # End of episode: store the path and decay learning rates.
                if done:
                    if training_started:
                        last_training_paths.appendleft(current_path)
                    ewma_step[0, i + 1] = (ewma_p * ewma_step[0, i] +
                                           (1 - ewma_p) * j)
                    ewma_reward[0, i + 1] = (ewma_p * ewma_reward[0, i] +
                                             (1 - ewma_p) * ep_reward)
                    # Linear decay of all learning rates in global_step.
                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac
                    lr_c_now = lr_c * frac
                    lr_l_now = lr_l * frac

                    break
    finally:
        # Written once, and also when training is interrupted, so the
        # collected states are not lost.
        if s_save:
            sio.savemat('data_all.mat', {
                's': s_save,
            })
    policy.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
def train(variant):
    """Run the off-policy training loop described by ``variant``.

    The environment and policy are built from ``variant``.  Transitions are
    gathered one episode at a time and stored in the policy; once the
    policy's memory pointer exceeds ``min_memory_size`` the policy learns
    ``train_per_cycle`` times whenever ``global_step`` is a multiple of
    ``steps_per_cycle``.  Every ``evaluation_frequency`` training steps the
    diagnostics of the stored training paths (and of evaluation rollouts,
    when ``variant['evaluate']`` is True) are logged and printed.

    For environments whose name contains ``'Fetch'`` or ``'Hand'`` the
    dict observation is concatenated into one flat array and
    ``info['done'] > 0`` also ends the episode.

    Args:
        variant: Experiment configuration dict (environment, algorithm and
            logging settings).

    Returns:
        None.  The trained policy is written with ``policy.save_result``.
    """

    def _flatten(obs):
        # Concatenate every entry of a dict observation, in key order.
        return np.concatenate([obs[key] for key in obs.keys()])

    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    evaluation_env = (get_env_from_name(env_name)
                      if variant['evaluate'] is True else None)
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    policy_build_fn = get_policy(variant['algorithm_name'])
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']

    # Base learning rates and their (decayed) current values.
    lr_a = policy_params['lr_a']
    lr_c = policy_params['lr_c']
    lr_l = policy_params['lr_l']
    lr_a_now, lr_c_now, lr_l_now = lr_a, lr_c, lr_l

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    for hyper_key in ('tau', 'alpha3', 'batch_size'):
        logger.logkv(hyper_key, policy_params[hyper_key])

    goal_env = 'Fetch' in env_name or 'Hand' in env_name
    if goal_env:
        obs_spaces = env.observation_space.spaces
        s_dim = (obs_spaces['observation'].shape[0]
                 + obs_spaces['achieved_goal'].shape[0]
                 + obs_spaces['desired_goal'].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    action_span = a_upperbound - a_lowerbound
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)

    # Moving-average bookkeeping kept for analysis.
    render_enabled = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Per-step quantities recorded in every training path, in this order.
    path_keys = ('rewards', 'l_rewards', 'l_error', 'critic1_error',
                 'critic2_error', 'alpha', 'lambda', 'entropy', 'a_loss',
                 'violation')
    # (logger key, evaluation diagnostic key)
    eval_log_map = (
        ('eval_eprewmean', 'return-average'),
        ('eval_eplrewmean', 'lreturn-average'),
        ('eval_eplenmean', 'episode-length-avg'),
        ('eval_violation_times', 'violation-avg'),
    )
    # (logger key, training diagnostic key)
    train_log_map = (
        ('eprewmean', 'train-return-average'),
        ('eplrewmean', 'train-lreturn-average'),
        ('eplenmean', 'train-episode-length-avg'),
        ('lyapunov_lambda', 'train-lambda-avg'),
        ('alpha', 'train-alpha-avg'),
        ('entropy', 'train-entropy-avg'),
        ('critic1 error', 'train-critic1-error-avg'),
        ('critic2 error', 'train-critic2-error-avg'),
        ('lyapunov error', 'train-lyapunov-error-avg'),
        ('policy_loss', 'train-a-loss-avg'),
    )
    # (printed label, training diagnostic key), all rounded to 6 digits.
    train_print_map = (
        ('lyapunov error:', 'train-lyapunov-error-avg'),
        ('critic1 error:', 'train-critic1-error-avg'),
        ('critic2 error:', 'train-critic2-error-avg'),
        ('policy_loss:', 'train-a-loss-avg'),
        ('alpha:', 'train-alpha-avg'),
        ('lambda:', 'train-lambda-avg'),
        ('entropy:', 'train-entropy-avg'),
    )

    start_time = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False
    for episode in range(max_episodes):
        if global_step > max_global_steps:
            break

        episode_return = 0
        current_path = {key: [] for key in path_keys}

        s = env.reset()
        if goal_env:
            s = _flatten(s)

        for step in range(max_ep_steps):
            if render_enabled:
                env.render()
            a = policy.choose_action(s, True)
            # Affine map sending a == -1 to the lower action bound and
            # a == 1 to the upper action bound.
            action = a_lowerbound + (a + 1.) * action_span / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
            if goal_env:
                s_ = _flatten(s_)
                if info['done'] > 0:
                    done = True

            # Only steps taken after learning has begun are counted.
            if training_started:
                global_step += 1
            l_r = info['l_rewards']
            # Force termination on the last allowed step.
            if step == max_ep_steps - 1:
                done = True

            if done:
                terminal = 1.
            else:
                terminal = 0.

            violation = info['violation_of_constraint']
            # Store s, a, s_next and the rewards for later learning.
            policy.store_transition(s, a, r, l_r, terminal, s_)

            # Learn
            memory_ready = policy.pointer > min_memory_size
            if memory_ready and global_step % steps_per_cycle == 0:
                training_started = True
                for _ in range(train_per_cycle):
                    (labda, alpha, c1_loss, c2_loss, l_loss, entropy,
                     a_loss) = policy.learn(lr_a_now, lr_c_now, lr_l_now)

            if training_started:
                # The most recent learn() outputs are recorded on every
                # step, including steps on which no learning happened.
                step_record = (r, l_r, l_loss, c1_loss, c2_loss, alpha,
                               labda, entropy, a_loss, violation)
                for key, value in zip(path_keys, step_record):
                    current_path[key].append(value)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(policy,
                                                       evaluation_env,
                                                       num_of_paths,
                                                       max_ep_steps,
                                                       render=render_enabled)
                    eval_stats = evaluate_rollouts(rollouts)
                    print(
                        'training_step:', global_step,
                        'average eval reward:', eval_stats['return-average'],
                        'average eval lreward:',
                        eval_stats['lreturn-average'],
                        'average eval violations:',
                        eval_stats['violation-avg'],
                        'average length:', eval_stats['episode-length-avg'],
                    )
                    for log_key, stat_key in eval_log_map:
                        logger.logkv(log_key, eval_stats[stat_key])
                logger.logkv("total_timesteps", global_step)

                train_stats = evaluate_training_rollouts(last_training_paths)
                if train_stats is not None:
                    for log_key, stat_key in train_log_map:
                        logger.logkv(log_key, train_stats[stat_key])
                    logger.logkv(
                        'average_cost',
                        train_stats['train-return-average'] /
                        train_stats['train-episode-length-avg'])
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    summary = [
                        'training_step:', global_step,
                        'average reward:',
                        round(train_stats['train-return-average'], 2),
                        'average lreward:',
                        round(train_stats['train-lreturn-average'], 2),
                        'average violations:',
                        train_stats['train-violation-avg'],
                        'average length:',
                        round(train_stats['train-episode-length-avg'], 1),
                    ]
                    for label, stat_key in train_print_map:
                        summary.append(label)
                        summary.append(round(train_stats[stat_key], 6))
                    print(*summary)
                logger.dumpkvs()

            # State update
            s = s_
            episode_return += r

            # End of episode: store the path and decay learning rates.
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                ewma_step[0, episode + 1] = (
                    ewma_p * ewma_step[0, episode] + (1 - ewma_p) * step)
                ewma_reward[0, episode + 1] = (
                    ewma_p * ewma_reward[0, episode] +
                    (1 - ewma_p) * episode_return)
                # Linear decay of all learning rates in global_step.
                decay = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * decay
                lr_c_now = lr_c * decay
                lr_l_now = lr_l * decay

                break
    policy.save_result(log_path)
    print('Running time: ', time.time() - start_time)
    return
def eval(variant):
    """Roll out a restored policy and log the reward of every step.

    For each of ``variant['num_of_trials']`` trials the policy is restored
    from ``<log_path>/0``, the logger is pointed at ``<log_path>/eval/0``
    and one episode of at most ``env_params['max_ep_steps']`` steps is run.
    For environments whose name contains ``'CartPole'`` an impulse of
    magnitude ``env_params['impulse_mag']``, signed like ``s[0]``, is passed
    to ``env.step`` at step 100.  Episodes that end before step 200 are
    counted in the printed ``death`` counter.

    Args:
        variant: Experiment configuration dict.  Keys read here:
            ``env_name``, ``env_params`` (``max_ep_steps``, ``eval_render``
            and, for CartPole environments, ``impulse_mag``),
            ``algorithm_name``, ``alg_params``, ``log_path`` and
            ``num_of_trials``.

    Returns:
        None.
    """
    # Step at which the impulse is applied and step before which a
    # terminated episode is counted as a death (both hard-coded).
    impulse_step = 100
    death_step = 200

    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    env_params = variant['env_params']
    max_ep_steps = env_params['max_ep_steps']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']

    # Environment families are recognised by substring of the env name.
    is_goal_env = 'Fetch' in env_name or 'Hand' in env_name
    is_cartpole = 'CartPole' in env_name

    if is_goal_env:
        # Dict observation: the state is the concatenation of its parts.
        s_dim = (env.observation_space.spaces['observation'].shape[0]
                 + env.observation_space.spaces['achieved_goal'].shape[0]
                 + env.observation_space.spaces['desired_goal'].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)

    # ``mag`` is always bound so that the end-of-episode report below does
    # not raise NameError for environments without an impulse.
    mag = env_params['impulse_mag'] if is_cartpole else None

    Render = env_params['eval_render']
    t1 = time.time()
    die_count = 0
    for i in range(variant['num_of_trials']):
        # NOTE(review): the sub-directory index is always 0, independent of
        # the trial number ``i`` -- confirm this is intended.
        log_path = variant['log_path'] + '/eval/' + str(0)
        policy.restore(variant['log_path'] + '/' + str(0))
        logger.configure(dir=log_path, format_strs=['csv'])
        s = env.reset()
        if is_goal_env:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s, True)
            # Affine map sending a == -1 to the lower action bound and
            # a == 1 to the upper action bound.
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2
            if j == impulse_step and is_cartpole:
                impulse = mag * np.sign(s[0])
                # Run in simulator, with the disturbance applied.
                s_, r, done, info = env.step(action, impulse=impulse)
            else:
                s_, r, done, info = env.step(action)
            if is_goal_env:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True
            logger.logkv('rewards', r)
            logger.logkv('timestep', j)
            logger.dumpkvs()
            # Force termination on the last allowed step.
            if j == max_ep_steps - 1:
                done = True
            s = s_
            if done:
                if j < death_step:
                    die_count += 1
                print('episode:', i, 'death:', die_count, 'mag:', mag)
                break
    print('Running time: ', time.time() - t1)
    return
def train(variant):
    """Run the on-policy training loop described by ``variant``.

    Each iteration collects ``policy.N`` episodes with the current policy,
    computes generalized-advantage estimates (``gamma``, ``gae_lamda``) for
    every stored path, and calls ``policy.update`` once on the shuffled
    batch.  The clip range and the critic learning rate are decayed
    linearly in ``global_step``.  Training and evaluation diagnostics are
    logged through ``logger`` and printed after every iteration.

    Args:
        variant: Experiment configuration dict.  Keys read here:
            ``env_name``, ``env_params`` (``max_ep_steps``,
            ``max_global_steps``, ``eval_render``), ``algorithm_name``,
            ``alg_params`` (``batch_size``, ``lr_c``, ``cliprange``,
            ``gamma``, ``gae_lamda``, ``output_format``,
            ``safety_threshold``, ``alpha3``) and ``log_path``.

    Returns:
        None.
    """
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    evaluation_env = get_env_from_name(env_name)
    env_params = variant['env_params']

    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    batch_size = policy_params['batch_size']

    lr_c = policy_params['lr_c']
    cliprange = policy_params['cliprange']
    cliprangenow = cliprange
    lr_c_now = lr_c  # current learning rate for the critic

    gamma = policy_params['gamma']
    gae_lamda = policy_params['gae_lamda']

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=policy_params['output_format'])
    logger.logkv('safety_threshold', policy_params['safety_threshold'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', batch_size)

    # Environment family is recognised by substring of the env name.
    is_goal_env = 'Fetch' in env_name or 'Hand' in env_name
    if is_goal_env:
        # Dict observation: the state is the concatenation of its parts.
        s_dim = (env.observation_space.spaces['observation'].shape[0]
                 + env.observation_space.spaces['achieved_goal'].shape[0]
                 + env.observation_space.spaces['desired_goal'].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)

    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    # Holds exactly the paths of the most recent iteration, because every
    # iteration appends policy.N paths to a deque of that maximum length.
    last_training_paths = deque(maxlen=policy.N)

    for _iteration in range(max_global_steps):

        if global_step > max_global_steps:
            break

        mb_obs, mb_obs_, mb_actions, mb_values, mb_t = [], [], [], [], []

        # ---- Collect policy.N episodes -------------------------------
        for _episode in range(policy.N):
            current_path = {
                'rewards': [],
                'obs': [],
                'obs_': [],
                'done': [],
                'value': [],
                't': [],
                'action': [],
            }
            s = env.reset()
            if is_goal_env:
                s = np.concatenate([s[key] for key in s.keys()])

            for t in range(max_ep_steps):
                [a], [value] = policy.choose_action(s)

                # Squash the raw action with tanh, then map -1 to the lower
                # action bound and 1 to the upper action bound.
                action = np.tanh(a)
                action = a_lowerbound + (action + 1.) * (a_upperbound -
                                                         a_lowerbound) / 2
                # Run in simulator
                s_, r, done, info = env.step(action)
                if is_goal_env:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                # Force termination on the last allowed step.
                if t == max_ep_steps - 1:
                    done = True
                terminal = 1. if done else 0.

                if Render:
                    env.render()

                current_path['rewards'].append(r)
                current_path['action'].append(a)
                current_path['obs'].append(s)
                current_path['obs_'].append(s_)
                current_path['done'].append(terminal)
                current_path['value'].append(value)
                current_path['t'].append(t)
                if done:
                    global_step += t + 1
                    last_training_paths.appendleft(current_path)
                    break
                else:
                    s = s_

        # Mean episode length of the stored paths.  ``len(path)`` on the
        # path dict itself would only count its keys, so the number of
        # recorded rewards is used instead.
        rescale = np.mean(
            [len(path['rewards']) for path in last_training_paths])

        # ---- Advantage estimation per path ---------------------------
        initial_return = []
        mb_advs = []
        for path in last_training_paths:
            lastgaelam = 0
            # Force a float buffer: with integer rewards ``zeros_like``
            # would otherwise create an integer array and truncate the
            # advantages written into it.
            path_advs = np.zeros_like(path['rewards'], dtype=np.float64)
            path_values = path['value']
            # Value of the successor state of every step; the last entry
            # is predicted from the final next-observation.
            path_next_values = path['value'][1:]
            path_next_values.append(policy.predict_values(path['obs_'][-1]))
            for t in reversed(range(len(path_values))):
                delta = path['rewards'][t] + gamma * path_next_values[t] * (
                    1 - path['done'][t]) - path_values[t]
                path_advs[t] = lastgaelam = delta + gamma * gae_lamda * (
                    1 - path['done'][t]) * lastgaelam

            path_returns = path_advs + path_values
            initial_return.append(path_returns[0])
            mb_advs.extend(path_advs)
            mb_obs.extend(path['obs'])
            mb_obs_.extend(path['obs_'])
            mb_values.extend(path['value'])
            mb_t.extend(path['t'])
            mb_actions.extend(path['action'])

        # ---- Assemble the batch and update the policy ----------------
        initial_return = np.asarray(initial_return, dtype=np.float32)
        # The dtypes are taken from the last state/action seen above.
        mb_obs = np.asarray(mb_obs, dtype=s.dtype)
        mb_values = np.asarray(mb_values, dtype=s.dtype)
        mb_actions = np.asarray(mb_actions, dtype=action.dtype)
        mb_obs_ = np.asarray(mb_obs_, dtype=s_.dtype)
        mb_advs = np.asarray(mb_advs, dtype=np.float32)
        mb_t = np.asarray(mb_t, dtype=np.float32)
        mb_returns = mb_advs + mb_values
        inds = np.arange(len(mb_advs), dtype=int)
        initial_return = np.mean(initial_return)
        # Randomize the indexes
        np.random.shuffle(inds)

        slices = (arr[inds] for arr in (mb_obs, mb_obs_, mb_returns, mb_advs,
                                        mb_actions, mb_values, mb_t))

        mblossvals = [
            policy.update(*slices, initial_return, cliprangenow, lr_c_now,
                          rescale)
        ]
        mblossvals = np.mean(mblossvals, axis=0)

        # Linear decay of the clip range and the critic learning rate.
        frac = 1.0 - (global_step - 1.0) / max_global_steps
        cliprangenow = cliprange * frac
        lr_c_now = lr_c * frac

        # ---- Logging -------------------------------------------------
        logger.logkv("total_timesteps", global_step)

        training_diagnotic = evaluate_training_rollouts(last_training_paths)

        if training_diagnotic is not None:
            eval_diagnotic = training_evaluation(variant, evaluation_env,
                                                 policy)
            for key in eval_diagnotic.keys():
                logger.logkv(key, eval_diagnotic[key])
            # The 'return' entry is dropped before logging and printing.
            training_diagnotic.pop('return')
            for key in training_diagnotic.keys():
                logger.logkv(key, training_diagnotic[key])
            logger.logkv('lr_c', lr_c_now)
            for name, value in zip(policy.diagnosis_names, mblossvals):
                logger.logkv(name, value)

            string_to_print = ['time_step:', str(global_step), '|']
            for key in eval_diagnotic.keys():
                string_to_print.extend(
                    [key, ':', str(eval_diagnotic[key]), '|'])
            for key in training_diagnotic.keys():
                string_to_print.extend(
                    [key, ':',
                     str(round(training_diagnotic[key], 2)), '|'])
            print(''.join(string_to_print))
        logger.dumpkvs()

    print('Running time: ', time.time() - t1)
    return