Example no. 1
# Standard-library and third-party imports used below; project-specific helpers
# (get_env_from_name, get_safety_constraint_func, get_policy, get_evaluation_rollouts,
# evaluate_rollouts, evaluate_training_rollouts, logger) are assumed to be provided
# by the surrounding repository.
import time
from collections import deque

import numpy as np
import scipy.io as sio
def train(variant):
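    """Train an actor-critic policy on the environment described by ``variant``.

    Builds the environment and policy from the ``variant`` configuration, runs
    episodic rollouts, stores transitions (plus "edge" transitions near the
    safety boundary when a Lyapunov critic is used), periodically evaluates the
    policy, logs diagnostics to CSV, and linearly decays the learning rates
    until ``max_global_steps`` is reached.
    """
    # Buffer of visited next-states, dumped to 'data_all.mat' during training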
    s_save = []
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']

    lr_a = policy_params['lr_a']
    lr_c = policy_params['lr_c']
    lr_l = policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])

    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)
    # For analysis

    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False
    for i in range(max_episodes):

        ep_reward = 0
        l_r = 0

        current_path = {
            'rewards': [],
            'l_rewards': [],
            'violation': [],
        }
        for key in policy.diag_names:
            current_path[key] = []
        if global_step > max_global_steps:
            break

        s = env.reset()

        for j in range(max_ep_steps):
            if Render:
                env.render()
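            # Query the policy for an action in [-1, 1] and rescale it to the environment's action bounds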
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
            if training_started:
                global_step += 1
            l_r = info['l_rewards']
            if j == max_ep_steps - 1:
                done = True
            terminal = 1. if done else 0.

            violation_of_constraint = info['violation_of_constraint']
            # Store s, a, s_next and reward for DDPG-style learning
            policy.store_transition(s, a, r, l_r, terminal, s_)
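            # Append the visited next-state and rewrite 'data_all.mat' with all states collected so far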
            s_save.append(s_)
            sio.savemat('data_all.mat', {
                's': s_save,
            })
            # If the state is close to the safety boundary, also store the transition in the edge memory
            # if policy.use_lyapunov is True and np.abs(s[0]) > env.cons_pos:  # or np.abs(s[2]) > env.theta_threshold_radians*0.8
            if policy.use_lyapunov is True and judge_safety_func(
                    s_, r, done,
                    info):  # or np.abs(s[2]) > env.theta_threshold_radians*0.8
                policy.store_edge_transition(s, a, r, l_r, terminal, s_)

            # Learn
            # The update schedule is the same with or without the Lyapunov critic
            if policy.pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                for _ in range(train_per_cycle):
                    train_diagnotic = policy.learn(lr_a_now, lr_c_now, lr_l_now)

            if training_started:
                current_path['rewards'].append(r)
                current_path['l_rewards'].append(l_r)
                current_path['violation'].append(violation_of_constraint)
                for key, value in zip(policy.diag_names, train_diagnotic):
                    current_path[key].append(value)

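            # Periodic evaluation: roll out the current policy and log training/evaluation diagnostics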
            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(policy,
                                                       evaluation_env,
                                                       num_of_paths,
                                                       max_ep_steps,
                                                       render=Render)
                    diagnotic = evaluate_rollouts(rollouts)
                    # [diagnotics[key].append(diagnotic[key]) for key in diagnotic.keys()]
                    print(
                        'training_step:',
                        global_step,
                        'average eval reward:',
                        diagnotic['return-average'],
                        'average eval lreward:',
                        diagnotic['lreturn-average'],
                        'average eval violations:',
                        diagnotic['violation-avg'],
                        'average length:',
                        diagnotic['episode-length-avg'],
                    )
                    logger.logkv('eval_eprewmean', diagnotic['return-average'])
                    logger.logkv('eval_eplrewmean',
                                 diagnotic['lreturn-average'])
                    logger.logkv('eval_eplenmean',
                                 diagnotic['episode-length-avg'])
                    logger.logkv('eval_violation_times',
                                 diagnotic['violation-avg'])
                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnotic is not None:
                    # [training_diagnotics[key].append(training_diagnotic[key]) for key in training_diagnotic.keys()]\
                    logger.logkv('eprewmean', training_diagnotic['rewards'])
                    logger.logkv('eplrewmean', training_diagnotic['l_rewards'])
                    logger.logkv('eplenmean', training_diagnotic['len'])
                    logger.logkv('end_cost', training_diagnotic['end_cost'])
                    for key in policy.diag_names:
                        logger.logkv(key, training_diagnotic[key])

                    logger.logkv('violation_times',
                                 training_diagnotic['violation'])
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    print(
                        'training_step:',
                        global_step,
                        'average reward:',
                        round(training_diagnotic['rewards'], 2),
                        'average lreward:',
                        round(training_diagnotic['l_rewards'], 2),
                        'average violations:',
                        training_diagnotic['violation'],
                        'end cost:',
                        round(training_diagnotic['end_cost'], 2),
                        'average length:',
                        round(training_diagnotic['len'], 1),
                        'lyapunov error:',
                        round(training_diagnotic['lyapunov_error'], 6),
                        'critic1 error:',
                        round(training_diagnotic['critic1_error'], 6),
                        'critic2 error:',
                        round(training_diagnotic['critic2_error'], 6),
                        'policy_loss:',
                        round(training_diagnotic['policy_loss'], 6),
                        'alpha:',
                        round(training_diagnotic['alpha'], 6),
                        'lambda:',
                        round(training_diagnotic['labda'], 6),
                        'entropy:',
                        round(training_diagnotic['entropy'], 6),
                    )
                    # 'max_grad:', round(training_diagnotic['max_grad'], 6)
                logger.dumpkvs()
            # State update
            s = s_
            ep_reward += r

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                ewma_step[0, i + 1] = ewma_p * ewma_step[0, i] + (1 - ewma_p) * j
                ewma_reward[0, i + 1] = ewma_p * ewma_reward[0, i] + (1 - ewma_p) * ep_reward
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic

                break
    policy.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
def train(variant):
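    """Variant of the training loop with support for goal-based (Fetch/Hand) environments.

    Structurally the same as the ``train`` above, but it flattens dict
    observations into a single state vector, unpacks the individual diagnostics
    returned by ``policy.learn``, and logs averaged training statistics under
    ``train-*`` keys.
    """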
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']

    lr_a = policy_params['lr_a']
    lr_c = policy_params['lr_c']
    lr_l = policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])

    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
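    # Goal-based (Fetch/Hand) environments expose a dict observation space; the state
    # dimension is the concatenation of observation, achieved_goal and desired_goal.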
    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = (env.observation_space.spaces['observation'].shape[0]
                 + env.observation_space.spaces['achieved_goal'].shape[0]
                 + env.observation_space.spaces['desired_goal'].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)
    # For analysis

    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False
    for i in range(max_episodes):

        ep_reward = 0
        l_r = 0

        current_path = {
            'rewards': [],
            'l_rewards': [],
            'l_error': [],
            'critic1_error': [],
            'critic2_error': [],
            'alpha': [],
            'lambda': [],
            'entropy': [],
            'a_loss': [],
            'violation': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s, True)
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
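            # Flatten the dict observation of goal-based envs and use the env-provided done flag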
            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True

            if training_started:
                global_step += 1
            l_r = info['l_rewards']
            if j == max_ep_steps - 1:
                done = True

            terminal = 1. if done else 0.

            violation_of_constraint = info['violation_of_constraint']
            # Store s, a, s_next and reward for DDPG-style learning
            policy.store_transition(s, a, r, l_r, terminal, s_)

            # Learn

            if policy.pointer > min_memory_size and global_step % steps_per_cycle == 0:
                # Enough experience collected: run the scheduled number of updates
                training_started = True
                for _ in range(train_per_cycle):
                    labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now)

            if training_started:
                current_path['rewards'].append(r)
                current_path['l_rewards'].append(l_r)
                current_path['l_error'].append(l_loss)
                current_path['critic1_error'].append(c1_loss)
                current_path['critic2_error'].append(c2_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['violation'].append(violation_of_constraint)

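            # Periodic evaluation: roll out the current policy and log training/evaluation diagnostics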
            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(policy,
                                                       evaluation_env,
                                                       num_of_paths,
                                                       max_ep_steps,
                                                       render=Render)
                    diagnotic = evaluate_rollouts(rollouts)
                    # [diagnotics[key].append(diagnotic[key]) for key in diagnotic.keys()]
                    print(
                        'training_step:',
                        global_step,
                        'average eval reward:',
                        diagnotic['return-average'],
                        'average eval lreward:',
                        diagnotic['lreturn-average'],
                        'average eval violations:',
                        diagnotic['violation-avg'],
                        'average length:',
                        diagnotic['episode-length-avg'],
                    )
                    logger.logkv('eval_eprewmean', diagnotic['return-average'])
                    logger.logkv('eval_eplrewmean',
                                 diagnotic['lreturn-average'])
                    logger.logkv('eval_eplenmean',
                                 diagnotic['episode-length-avg'])
                    logger.logkv('eval_violation_times',
                                 diagnotic['violation-avg'])
                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnotic is not None:
                    # [training_diagnotics[key].append(training_diagnotic[key]) for key in training_diagnotic.keys()]\
                    logger.logkv('eprewmean',
                                 training_diagnotic['train-return-average'])
                    logger.logkv('eplrewmean',
                                 training_diagnotic['train-lreturn-average'])
                    logger.logkv(
                        'eplenmean',
                        training_diagnotic['train-episode-length-avg'])
                    logger.logkv('lyapunov_lambda',
                                 training_diagnotic['train-lambda-avg'])
                    logger.logkv('alpha',
                                 training_diagnotic['train-alpha-avg'])
                    logger.logkv('entropy',
                                 training_diagnotic['train-entropy-avg'])
                    logger.logkv('critic1 error',
                                 training_diagnotic['train-critic1-error-avg'])
                    logger.logkv('critic2 error',
                                 training_diagnotic['train-critic2-error-avg'])
                    logger.logkv(
                        'lyapunov error',
                        training_diagnotic['train-lyapunov-error-avg'])
                    logger.logkv('policy_loss',
                                 training_diagnotic['train-a-loss-avg'])
                    logger.logkv(
                        'average_cost',
                        training_diagnotic['train-return-average'] /
                        training_diagnotic['train-episode-length-avg'])
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    print(
                        'training_step:',
                        global_step,
                        'average reward:',
                        round(training_diagnotic['train-return-average'], 2),
                        'average lreward:',
                        round(training_diagnotic['train-lreturn-average'], 2),
                        'average violations:',
                        training_diagnotic['train-violation-avg'],
                        'average length:',
                        round(training_diagnotic['train-episode-length-avg'],
                              1),
                        'lyapunov error:',
                        round(training_diagnotic['train-lyapunov-error-avg'],
                              6),
                        'critic1 error:',
                        round(training_diagnotic['train-critic1-error-avg'],
                              6),
                        'critic2 error:',
                        round(training_diagnotic['train-critic2-error-avg'],
                              6),
                        'policy_loss:',
                        round(training_diagnotic['train-a-loss-avg'], 6),
                        'alpha:',
                        round(training_diagnotic['train-alpha-avg'], 6),
                        'lambda:',
                        round(training_diagnotic['train-lambda-avg'], 6),
                        'entropy:',
                        round(training_diagnotic['train-entropy-avg'], 6),
                    )
                logger.dumpkvs()
            # State update
            s = s_
            ep_reward += r

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                ewma_step[0, i + 1] = ewma_p * ewma_step[0, i] + (1 - ewma_p) * j
                ewma_reward[0, i + 1] = ewma_p * ewma_reward[0, i] + (1 - ewma_p) * ep_reward
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic

                break
    policy.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
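

# A minimal, illustrative sketch of the configuration dictionary that train()
# expects. The key names are taken from the reads of `variant` above; all
# values (and the env/algorithm names) are hypothetical placeholders. The policy
# and get_safety_constraint_func(variant) may read additional keys beyond those
# shown here.
example_variant = {
    'env_name': 'CartPole-v0',           # resolved by get_env_from_name (hypothetical)
    'evaluate': True,                    # build a separate evaluation env
    'evaluation_frequency': 2048,        # global steps between evaluations
    'num_of_paths': 10,                  # evaluation rollouts per evaluation
    'store_last_n_paths': 10,            # recent training paths kept for statistics
    'log_path': './log/example_run',     # CSV logger output directory
    'algorithm_name': 'LAC',             # resolved by get_policy (hypothetical)
    'env_params': {
        'max_episodes': 1000,
        'max_ep_steps': 250,
        'max_global_steps': int(3e5),
        'eval_render': False,
    },
    'alg_params': {
        'min_memory_size': 1000,
        'steps_per_cycle': 100,
        'train_per_cycle': 50,
        'lr_a': 1e-4, 'lr_c': 3e-4, 'lr_l': 3e-4,
        'tau': 5e-3, 'alpha3': 0.2, 'batch_size': 256,
    },
}
# train(example_variant)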