Example #1
def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['num_of_training_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']
    policy_params['network_structure'] = env_params['network_structure']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0]\
                + env.observation_space.spaces['achieved_goal'].shape[0]+ \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = LAC(a_dim, s_dim, policy_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            'rewards': [],
            'a_loss': [],
            'alpha': [],
            'lambda': [],
            'lyapunov_error': [],
            'entropy': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s)
            # a = a*0
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2

            # Run in simulator
            disturbance_input = np.zeros([a_dim + s_dim])

            s_, r, done, info = env.step(action)

            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1:
                done = True

            terminal = 1. if done else 0.
            pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnotic is not None:
                    if variant['num_of_evaluation_paths'] > 0:
                        eval_diagnotic = training_evaluation(
                            variant, env, policy)
                        [
                            logger.logkv(key, eval_diagnotic[key])
                            for key in eval_diagnotic.keys()
                        ]
                        training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    if variant['num_of_evaluation_paths'] > 0:
                        [
                            string_to_print.extend(
                                [key, ':',
                                 str(eval_diagnotic[key]), '|'])
                            for key in eval_diagnotic.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()
            # State update
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic

                break
    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
Example #2
def train(variant):
    Min_cost = 1000000

    traj = get_traj()  # get data
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params[
        'max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params[
        'max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    s_dim = env.observation_space.shape[0]
    print("s_dim is ", s_dim)

    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = CAC(a_dim, s_dim, policy_params)
    # policy.restore("log/CMAPSS/CAC-new-reward-0.01/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            'rewards': [],
            'distance': [],
            'kl_divergence': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point

        start_point = np.random.randint(0, 500000)

        s = traj[start_point, :16]

        # current state, theta,next w, desired state
        # this is for decision making
        # 16,1,4,16
        s = np.concatenate([[s], [traj[start_point, 17:]]], axis=1)[0]

        env.state = s

        for j in range(start_point + 1, start_point + 1 + max_ep_steps):
            if Render:
                env.render()
            delta = np.zeros(36)
            # ###### NOISE ##############

            noise = np.random.normal(0, 0.01, 16)
            delta[20:] = noise
            # ########IF Noise env##########
            # s= s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############

            # noise = s[0:16]*0.01
            # delta[0:16] = noise

            a = policy.choose_action(s + delta)

            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2
            # action = traj[j-1,16]

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            X_, r, done, theta = env.step(action)
            # The new s = current state, next omega, next state
            s_ = np.concatenate([X_, [traj[j, 17:]]], axis=1)[0]
            # s_ = np.concatenate([[s_], [theta]], axis=1)[0]
            # s_ = np.concatenate([X_,[[theta]], [traj[j, 9:]]], axis=1)[0]
            env.state = s_

            # theta_pre=theta
            if training_started:
                global_step += 1

            if j == max_ep_steps - 1 + start_point:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_,
                           _s)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['distance'].append(distance)
                current_path['kl_divergence'].append(kl)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['beta'].append(beta)
                current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                # print(training_diagnotic)
                if training_diagnotic is not None:
                    eval_diagnotic = training_evaluation(variant, env, policy)
                    [
                        logger.logkv(key, eval_diagnotic[key])
                        for key in eval_diagnotic.keys()
                    ]
                    training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [
                        string_to_print.extend(
                            [key, ':', str(eval_diagnotic[key]), '|'])
                        for key in eval_diagnotic.keys()
                    ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()
                if eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length'] <= Min_cost:
                    Min_cost = eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length']
                    print("New lowest cost:", Min_cost)
                    policy.save_result(log_path)
                if training_started and global_step % (
                        10 * evaluation_frequency) == 0 and global_step > 0:
                    policy.save_result(log_path)

            # State update
            _s = s
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break
    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
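
All of these loops interact with the replay pool through the same three members: Pool(pool_params), pool.store(...), pool.sample(batch_size), and the pool.memory_pointer counter. The sketch below is not the project's Pool class, only an illustration, under stated assumptions, of the contract the loops rely on; in particular the batch layout returned by sample() is a guess, since the real format is whatever policy.learn expects.

import random
from collections import deque
import numpy as np

class SimplePool:
    """Illustrative replay pool matching the calls made in the training loops above."""

    def __init__(self, pool_params):
        self.capacity = pool_params['memory_capacity']
        self.buffer = deque(maxlen=self.capacity)
        self.memory_pointer = 0  # total number of transitions stored so far

    def store(self, s, a, d, raw_d, r, terminal, s_, _s=None):
        # The CAC variants also pass the previous state `_s`; it is optional here.
        self.buffer.append((s, a, d, raw_d, r, terminal, s_, _s))
        self.memory_pointer += 1

    def sample(self, batch_size):
        # Batch layout is an assumption; the real Pool returns whatever policy.learn expects.
        rows = random.sample(list(self.buffer), batch_size)
        s, a, d, raw_d, r, terminal, s_, _s = zip(*rows)
        return {'s': np.asarray(s), 'a': np.asarray(a), 'r': np.asarray(r),
                'terminal': np.asarray(terminal), 's_': np.asarray(s_)}
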
Example #3
def train(variant):
    Min_cost = 1000000

    data_trajectories = get_data()  # get data (X, W, X_, theta, state)
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    num_data_traj = variant['num_data_trajectories']
    reward_id = variant['reward_id']
    env_params = variant['env_params']
    max_episodes = env_params[
        'max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params[
        'max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    s_dim = env.observation_space.shape[
        0]  # dimension of state (3 for Battery)

    a_dim = env.action_space.shape[0]  # action space dimension (1 or 2)
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = CAC(a_dim, s_dim, policy_params)
    policy.restore(variant['log_path'] + "/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analysis
    Render = env_params['eval_render']
    ref_s = env.reference_state
    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):
        print("episode # ", i)
        print("global steps ", global_step)

        current_path = {
            'rewards': [],
            'distance': [],
            'kl_divergence': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point

        # traj_id = np.random.randint(0, len(data_trajectories))
        traj_id = np.random.randint(0, num_data_traj)
        # traj_id = 1
        traj = data_trajectories[traj_id]

        # print(len(traj))
        if variant['traj_start'] == "random":
            start_point = np.random.randint(0, len(traj) - 2)
        else:
            start_point = int(variant['traj_start'])
        # s = traj[start_point, 1]
        s = traj[start_point, -8:]
        # current state, theta,next w, desired state
        # this is for decision making
        # 16,1,4,16
        # s = np.array([s, traj[start_point, 2], traj[start_point, 4]])
        # print(i, s)
        s = np.array(
            list(s) + [traj[start_point, 2]] +
            list(traj[start_point + 1, -8:]))
        # print(s)
        env.state = s
        env.model.state = traj[start_point, -8:]
        # env.state = env.model.state
        # ep_steps = len(traj)
        ep_steps = min(start_point + 1 + max_ep_steps, len(traj))
        # print("selected traj = ", traj_id, " and length = ", len(traj), " starting = ", start_point, " ep_steps = ", ep_steps)
        for j in range(start_point + 1, ep_steps):
            if Render:
                env.render()
            s = env.state
            delta = np.zeros(s.shape)
            # ###### NOISE ##############

            # noise = np.random.normal(0, 0.01, 0.01)
            # delta[2:]= noise
            # ########IF Noise env##########
            # s= s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############

            # noise = s[0:16]*0.01
            # delta[0:16] = noise

            # store_s = s.copy()
            # store_s[2] = store_s[2]-store_s[0]
            # a = policy.choose_action(store_s + delta)
            # print(s, delta)
            a = policy.choose_action(s / ref_s + delta)
            # print("a: ", a)
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2
            # action = traj[j-1,16]
            # print("a normalize: " , action)

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            s_, r, done, X_ = env.step(action, traj[j, 2], traj[j, 1])
            # The new s = current state, next omega, next state
            s_ = np.array(list(s_) + [traj[j + 1, 2]] + list(traj[j + 1, -8:]))
            # s_ = np.array([X_[1][0], traj[j, 2], traj[j,4]])
            # s_ = np.array([traj[j, 1], traj[j, 2], traj[j,4]])
            r = modify_reward(r, s, s_, reward_id)
            # print(r)
            if global_step % 100 == 1:
                print("global step: ", global_step, " true action: ",
                      [traj[j, 5], traj[j, 6]], " predicted action: ", action,
                      " and reward : ", r)

            # print("new state is : ", s_)
            # s_ = np.concatenate([[s_], [theta]], axis=1)[0]
            # s_ = np.concatenate([X_,[[theta]], [traj[j, 9:]]], axis=1)[0]
            env.state = s_
            # store_s_ = s_.copy()
            # store_s_[2] = store_s_[2] - store_s_[0]
            # theta_pre=theta
            if training_started:
                global_step += 1

            if j == ep_steps - 2:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s / ref_s, a, np.zeros([1]), np.zeros([1]), r,
                           terminal, s_ / ref_s, _s / ref_s)
                # pool.store(store_s, a, np.zeros([1]), np.zeros([1]), r, terminal, store_s_, store__s)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                # print("learning policy")

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, beta, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch)
                    if global_step % 2000 == 1:
                        print("labda = ", labda, " | alpha = ", alpha,
                              " | beta = ", beta, " | l_loss = ", l_loss,
                              " | entropy = ", entropy, " | a_loss = ", a_loss,
                              " | action_distance = ", action_distance)
            if training_started:
                current_path['rewards'].append(r)
                current_path['distance'].append(distance)
                current_path['kl_divergence'].append(kl)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['beta'].append(beta)
                current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                # print(training_diagnotic)
                if training_diagnotic is not None:
                    print("doing training evaluation")
                    eval_diagnotic = training_evaluation(variant, env, policy)
                    [
                        logger.logkv(key, eval_diagnotic[key])
                        for key in eval_diagnotic.keys()
                    ]
                    training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [
                        string_to_print.extend(
                            [key, ':', str(eval_diagnotic[key]), '|'])
                        for key in eval_diagnotic.keys()
                    ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()
                if eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length'] <= Min_cost:
                    Min_cost = eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length']
                    print("New lowest cost:", Min_cost)
                    policy.save_result(log_path)
                else:
                    print("cost did not improve.")
                    print(
                        "avg cost was ", eval_diagnotic['test_return'] /
                        eval_diagnotic['test_average_length'])
                    print("prev best cost is:", Min_cost)
                    # policy.save_result(log_path)
                if training_started and global_step % (
                        10 * evaluation_frequency) == 0 and global_step > 0:
                    policy.save_result(log_path)

            # State Update
            _s = s
            s = s_
            store__s = _s.copy()
            store__s[2] = store__s[2] - store__s[0]
            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                # print("done at ", j)
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break
    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
Example #4
from pool.pool import Pool

if __name__ == '__main__':
    Pool().run()
Example #5
def train_v2(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_build_fun = get_policy(variant['algorithm_name'])
    policy_params = variant['alg_params']
    disturber_params = variant['disturber_params']
    iter_of_actor_train = policy_params['iter_of_actor_train_per_epoch']
    iter_of_disturber_train = policy_params[
        'iter_of_disturber_train_per_epoch']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0]\
                + env.observation_space.spaces['achieved_goal'].shape[0]+ \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']
    d_dim = np.nonzero(disturber_params['disturbance_magnitude'])[0].shape[0]
    disturbance_chanel_list = np.nonzero(
        disturber_params['disturbance_magnitude'])[0]
    disturber_params['disturbance_chanel_list'] = disturbance_chanel_list
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fun(a_dim, s_dim, d_dim, policy_params)
    disturber = Disturber(d_dim, s_dim, disturber_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': d_dim,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'finite_horizon': policy_params['finite_horizon'],
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0

    last_actor_training_paths = deque(maxlen=store_last_n_paths)
    last_disturber_training_paths = deque(maxlen=store_last_n_paths)
    actor_training_started = False
    disturber_training_started = False
    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('ita', policy_params['ita'])
    logger.logkv('energy_decay_rate', disturber_params['energy_decay_rate'])
    logger.logkv('magnitude', disturber_params['disturbance_magnitude'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)
    for epoch in range(max_episodes):

        for i in range(iter_of_actor_train):

            current_path = {
                'rewards': [],
                'disturbance_mag': [],
                'a_loss': [],
                'alpha': [],
                'lyapunov_error': [],
                'labda': [],
                'critic_error': [],
                'entropy': [],
            }

            if global_step > max_global_steps:
                break

            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s, True)
                action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                    a_lowerbound) / 2
                disturbance, raw_disturbance = disturber.choose_action(s, j)
                # Run in simulator
                # disturbance = np.array([0])
                disturbance_input = np.zeros([a_dim + s_dim])
                disturbance_input[disturbance_chanel_list] = disturbance
                s_, r, done, info = env.step(action,
                                             process_noise=disturbance_input)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                    if info['done'] > 0:
                        done = True

                if actor_training_started:
                    global_step += 1

                if j == max_ep_steps - 1:
                    done = True

                terminal = 1. if done else 0.
                pool.store(s, a, disturbance, raw_disturbance, r, terminal, s_)
                # policy.store_transition(s, a, disturbance, r,0, terminal, s_)
                # Learn

                if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                    actor_training_started = True

                    for _ in range(train_per_cycle):
                        batch = pool.sample(batch_size)
                        labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                            lr_a_now, lr_c_now, lr_l_now, batch)

                if actor_training_started:
                    current_path['rewards'].append(r)
                    current_path['labda'].append(labda)
                    current_path['critic_error'].append(min(c1_loss, c2_loss))
                    current_path['lyapunov_error'].append(l_loss)
                    current_path['alpha'].append(alpha)

                    current_path['entropy'].append(entropy)
                    current_path['a_loss'].append(a_loss)
                    current_path['disturbance_mag'].append(
                        np.linalg.norm(disturbance))

                if actor_training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                    logger.logkv("total_timesteps", global_step)

                    training_diagnotic = evaluate_training_rollouts(
                        last_actor_training_paths)
                    if training_diagnotic is not None:

                        [
                            logger.logkv(key, training_diagnotic[key])
                            for key in training_diagnotic.keys()
                        ]
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)
                        string_to_print = [
                            'Actor training!time_step:',
                            str(global_step), '|'
                        ]

                        [
                            string_to_print.extend([
                                key, ':',
                                str(round(training_diagnotic[key], 2)), '|'
                            ]) for key in training_diagnotic.keys()
                        ]

                        print(''.join(string_to_print))

                    logger.dumpkvs()
                # State update
                s = s_

                # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
                if done:
                    if actor_training_started:
                        last_actor_training_paths.appendleft(current_path)

                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac  # learning rate for actor
                    lr_c_now = lr_c * frac  # learning rate for critic
                    lr_l_now = lr_l * frac  # learning rate for lyapunov critic

                    break
        if global_step > max_global_steps:
            break
        for i in range(iter_of_disturber_train):

            current_path = {
                'rewards': [],
                'disturbance_mag': [],
                'd_loss': [],
                'alpha': [],
                'disturber_critic_error': [],
                'entropy': [],
            }

            if global_step > max_global_steps:
                break

            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s, True)
                action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                    a_lowerbound) / 2
                disturbance, raw_disturbance = disturber.choose_action(s, j)
                # Run in simulator
                # disturbance = np.array([0])
                s_, r, done, info = env.step(action, disturbance)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                    if info['done'] > 0:
                        done = True

                if disturber_training_started:
                    global_step += 1

                if j == max_ep_steps - 1:
                    done = True

                terminal = 1. if done else 0.
                pool.store(s, a, disturbance, raw_disturbance, r, terminal, s_)
                # policy.store_transition(s, a, disturbance, r,0, terminal, s_)
                # Learn

                if pool.memory_pointer > min_memory_size and global_step % disturber_params[
                        'steps_per_cycle'] == 0:
                    disturber_training_started = True

                    for _ in range(disturber_params['train_per_cycle']):
                        batch = pool.sample(disturber_params['batch_size'])
                        d_alpha, d_c1_loss, d_c2_loss, d_entropy, d_loss = disturber.learn(
                            lr_a_now, lr_c_now, batch)
                # d_c1_loss = 0
                # d_c2_loss = 0
                # d_loss=0
                if disturber_training_started:
                    current_path['rewards'].append(r)

                    current_path['disturber_critic_error'].append(
                        min(d_c1_loss, d_c2_loss))
                    current_path['d_loss'].append(d_loss)
                    current_path['alpha'].append(d_alpha)

                    current_path['entropy'].append(d_entropy)

                    current_path['disturbance_mag'].append(
                        np.linalg.norm(disturbance))

                if disturber_training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                    logger.logkv("total_timesteps", global_step)

                    training_diagnotic = evaluate_training_rollouts(
                        last_disturber_training_paths)
                    if training_diagnotic is not None:
                        [
                            logger.logkv(key, training_diagnotic[key])
                            for key in training_diagnotic.keys()
                        ]
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)
                        string_to_print = [
                            'Disturber training!time_step:',
                            str(global_step), '|'
                        ]

                        [
                            string_to_print.extend([
                                key, ':',
                                str(round(training_diagnotic[key], 2)), '|'
                            ]) for key in training_diagnotic.keys()
                        ]

                        print(''.join(string_to_print))

                    logger.dumpkvs()
                # State update
                s = s_

                # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
                if done:
                    if disturber_training_started:
                        last_disturber_training_paths.appendleft(current_path)

                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac  # learning rate for actor
                    lr_c_now = lr_c * frac  # learning rate for critic
                    lr_l_now = lr_l * frac  # learning rate for lyapunov critic

                    break
        if global_step > max_global_steps:
            break
    policy.save_result(log_path)
    disturber.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
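
The learning-rate decay applied at the end of each episode in all of these loops is a linear schedule in the global step count. Isolated below for clarity; this mirrors the in-line computation rather than any helper the project actually defines.

def decayed_lr(base_lr, global_step, max_global_steps):
    """Linear decay: full rate at step 1, approximately zero at max_global_steps."""
    frac = 1.0 - (global_step - 1.0) / max_global_steps
    return base_lr * frac

# lr_a_now = decayed_lr(lr_a, global_step, max_global_steps)
# lr_c_now = decayed_lr(lr_c, global_step, max_global_steps)
# lr_l_now = decayed_lr(lr_l, global_step, max_global_steps)
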
Example #6
def train(variant):
    env_name = variant["env_name"]
    env = get_env_from_name(env_name)

    env_params = variant["env_params"]

    max_episodes = env_params["max_episodes"]
    max_ep_steps = env_params["max_ep_steps"]
    max_global_steps = env_params["max_global_steps"]
    store_last_n_paths = variant["num_of_training_paths"]
    evaluation_frequency = variant["evaluation_frequency"]

    policy_params = variant["alg_params"]
    policy_params["network_structure"] = env_params["network_structure"]

    min_memory_size = policy_params["min_memory_size"]
    steps_per_cycle = policy_params["steps_per_cycle"]
    train_per_cycle = policy_params["train_per_cycle"]
    batch_size = policy_params["batch_size"]

    lr_a, lr_c, lr_l = (
        policy_params["lr_a"],
        policy_params["lr_c"],
        policy_params["lr_l"],
    )
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    if "Fetch" in env_name or "Hand" in env_name:
        s_dim = (env.observation_space.spaces["observation"].shape[0] +
                 env.observation_space.spaces["achieved_goal"].shape[0] +
                 env.observation_space.spaces["desired_goal"].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = LAC(a_dim, s_dim, policy_params)

    pool_params = {
        "s_dim": s_dim,
        "a_dim": a_dim,
        "d_dim": 1,
        "store_last_n_paths": store_last_n_paths,
        "memory_capacity": policy_params["memory_capacity"],
        "min_memory_size": policy_params["min_memory_size"],
        "history_horizon": policy_params["history_horizon"],
        "finite_horizon": policy_params["finite_horizon"],
    }
    if "value_horizon" in policy_params.keys():
        pool_params.update({"value_horizon": policy_params["value_horizon"]})
    else:
        pool_params["value_horizon"] = None
    pool = Pool(pool_params)
    # For analysis
    Render = env_params["eval_render"]

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant["log_path"]
    logger.configure(dir=log_path, format_strs=["csv"])
    logger.logkv("tau", policy_params["tau"])

    logger.logkv("alpha3", policy_params["alpha3"])
    logger.logkv("batch_size", policy_params["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        if "Fetch" in env_name or "Hand" in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()

            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound -
                                                 a_lowerbound) / 2
            # action = a

            # Run in simulator
            disturbance_input = np.zeros([a_dim + s_dim])

            s_, r, done, info = env.step(action)

            if "Fetch" in env_name or "Hand" in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info["done"] > 0:
                    done = True

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1:
                done = True

            terminal = 1.0 if done else 0.0
            pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if (pool.memory_pointer > min_memory_size
                    and global_step % steps_per_cycle == 0):
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a, batch)

            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            if (training_started and global_step % evaluation_frequency == 0
                    and global_step > 0):

                logger.logkv("total_timesteps", global_step)

                training_diagnostics = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnostics is not None:
                    if variant["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(
                            variant, env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                        training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_c", lr_c_now)
                    logger.logkv("lr_l", lr_l_now)

                    string_to_print = ["time_step:", str(global_step), "|"]
                    if variant["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":",
                                 str(eval_diagnostics[key]), "|"])
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ":",
                            str(round(training_diagnostics[key], 2)), "|"
                        ]) for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))

                logger.dumpkvs()
            # State update
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic

                break
    policy.save_result(log_path)

    print("Running time: ", time.time() - t1)
    return
Example #7
def train(variant):
    Min_cost = 1000000

    data_trajectories = get_data() # get data (X, W, X_, theta, state)
    env_name = variant['env_name'] # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes'] # maximum episodes for RL training
    max_ep_steps = env_params['max_ep_steps'] # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    s_dim = env.observation_space.shape[0] # dimension of state (3 for Battery)

    a_dim = env.action_space.shape[0] # action space dimension (1 or 2)
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    agent = CAC(a_dim, s_dim, policy_params, max_global_steps=max_global_steps)
    # policy.restore(variant['log_path'] + "/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', agent.target_entropy)

    for i in range(max_episodes):
        print("episode # ", i)
        print("global steps ", global_step)

        current_path = {'rewards': [],
                        'distance': [],
                        'a_loss': [],
                        'alpha': [],
                        'lyapunov_error': [],
                        'entropy': [],
                        'beta':[],
                        'action_distance': [],
                        }


        if global_step > max_global_steps:
            break


        s = env.reset()

        # Random start point

        # traj_id = np.random.randint(0, len(data_trajectories))
        traj_id = np.random.randint(0, variant['num_data_trajectories'])
        # traj_id = 0
        traj = data_trajectories[traj_id]
        # print(len(traj))
        start_point = np.random.randint(0, len(traj))
        # start_point = 0
        s = traj[start_point, 1]

        # current state, theta,next w, desired state
        # this is for decision making
        # 16,1,4,16
        s = np.array([s, traj[start_point, 2], traj[start_point, 4]])
        # print(i, s)

        env.state = s
        env.model.state = traj[start_point, -8:]
        
        ep_steps = min(start_point + 1 + max_ep_steps, len(traj))
        for j in range(start_point + 1, ep_steps):
            if Render:
                env.render()
            delta = np.zeros(3)
            # ###### NOISE ##############

            # noise = np.random.normal(0, 0.01, 0.01)
            # delta[2:]= noise
            # ########IF Noise env##########
            # s= s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############

            # noise = s[0:16]*0.01
            # delta[0:16] = noise


            a = agent.act(torch.tensor([s]).float())
            
            action = a_lowerbound + (a.detach().numpy() + 1.) * (a_upperbound - a_lowerbound) / 2
            # action = traj[j-1,16]

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            _, r, done, X_ = env.step(action)
            # The new s = current state, next omega, next state
            s_ = np.array([X_[1][0], traj[j, 2], traj[j,4]])
            
            r = modify_reward(r, s, s_, variant['reward_id'])

            if j % 100 == 0:
                print("current state: ", s, "true action: ", traj[j, 5], " predicted action: ", action, " and reward : ", r)

            env.state = s_

            # theta_pre=theta
            if training_started:
                global_step += 1
                agent.scheduler_step()

            if j == max_ep_steps - 1 + start_point:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s, a.detach().numpy().flatten(), np.zeros([1]), np.zeros([1]), r, terminal, s_, _s)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    alpha_loss, beta_loss, labda_loss, actor_loss, lyapunov_loss = agent.learn(batch)
                    if j % 200 == 0:
                        print("labda = ", agent.labda, " | alpha = ", agent.alpha, 
                            " | l_loss = ", lyapunov_loss , " | entropy = ", agent.log_pis,
                            " | a_loss = ", actor_loss, " | alpha_loss = ", alpha_loss,
                            " | labda_loss = ", labda_loss)
            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(lyapunov_loss.detach().numpy())
                current_path['alpha'].append(agent.alpha.detach().numpy())
                # current_path['entropy'].append(entropy)  # NOTE: `entropy` is never computed in this variant
                current_path['a_loss'].append(actor_loss.detach().numpy())
                current_path['beta'].append(agent.beta.detach().numpy())
                # current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                # print(training_diagnotic)
                if training_diagnotic is not None:
                    print("doing training evaluation")
                    eval_diagnotic = training_evaluation(variant, env, agent)
                    [logger.logkv(key, eval_diagnotic[key]) for key in eval_diagnotic.keys()]
                    training_diagnotic.pop('return')
                    [logger.logkv(key, training_diagnotic[key]) for key in training_diagnotic.keys()]

                    string_to_print = ['time_step:', str(global_step), '|']
                    [string_to_print.extend([key, ':', str(eval_diagnotic[key]), '|'])
                     for key in eval_diagnotic.keys()]
                    [string_to_print.extend([key, ':', str(round(training_diagnotic[key], 2)) , '|'])
                     for key in training_diagnotic.keys()]
                    print(''.join(string_to_print))

                logger.dumpkvs()
                if eval_diagnotic['test_return'] / eval_diagnotic['test_average_length'] <= Min_cost:
                    Min_cost = eval_diagnotic['test_return'] / eval_diagnotic['test_average_length']
                    print("New lowest cost:", Min_cost)
                    agent.save_result(log_path)
                else:
                    print("cost did not improve.")
                    print("The best cost is ", Min_cost)
                    print("avg cost was ", eval_diagnotic['test_return']/eval_diagnotic['test_average_length'])
                if training_started and global_step % (10*evaluation_frequency) == 0 and global_step > 0:
                    agent.save_result(log_path)

            # State Update
            _s = s
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                break
    agent.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
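
The CAC variants above (Examples #2, #3, and #7) all keep the checkpoint with the lowest average per-step evaluation cost, computed as test_return / test_average_length. A sketch of that rule in isolation, with eval_diagnostic and policy standing in for the objects used above:

def maybe_save_best(eval_diagnostic, min_cost, policy, log_path):
    """Save the policy when the average per-step evaluation cost improves."""
    avg_cost = eval_diagnostic['test_return'] / eval_diagnostic['test_average_length']
    if avg_cost <= min_cost:
        print("New lowest cost:", avg_cost)
        policy.save_result(log_path)
        return avg_cost
    print("cost did not improve; previous best:", min_cost)
    return min_cost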