Example #1
def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    print('start to eval ! \n')
    env = NormalizedActions(gym.make("Pendulum-v0"))
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    agent = DDPG(n_states,
                 n_actions,
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128)
    agent.load_model(saved_model_path + 'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.eval_eps + 1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.eval_steps + 1):
            action = agent.select_action(state)  # select an action for the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            state = next_state  # move to the next state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step, 'done: ', done)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the moving-average (smoothed) reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    '''Save rewards and related results'''
    if not os.path.exists(RESULT_PATH):  # check whether the folder exists
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_eval.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_eval.npy',
            moving_average_rewards)
    np.save(RESULT_PATH + 'steps_eval.npy', ep_steps)
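The cfg object above only needs eval_eps and eval_steps, while SAVED_MODEL_PATH, RESULT_PATH and SEQUENCE are module-level constants not shown in the snippet. A minimal, hypothetical way to invoke it (the values are assumptions):

import argparse

if __name__ == '__main__':
    # eval_eps / eval_steps values are illustrative assumptions
    cfg = argparse.Namespace(eval_eps=50, eval_steps=200)
    eval(cfg)  # relies on the default SAVED_MODEL_PATH defined elsewhere in the script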
Example #2
def main():

	params = {
		'actor_learning_rate':1e-4,
		'critic_learning_rate':1e-3,
		'gamma':0.99,
		'tau':0.001,
		'sigma':0.2,
		'num_epochs':275,
		'num_episodes':20,
		'replay_size':1000000,
		'num_train_steps':1,
		'replay_init_size':1000,
		'batch_size':64,
		'render_train':False,
		'restore':False,
		'env':'Hopper-v2_kirkiles_train1step_noise_norm_bufsize1Mi1k'
	}
	
	agent = DDPG(params)
	agent.train()
Example #3
def main():

    params = {
        'actor_learning_rate': 1e-4,
        'critic_learning_rate': 1e-3,
        'gamma': 0.99,
        'tau': 0.001,
        'sigma': 0.2,
        'num_epochs': 500,
        'num_episodes': 20,
        'replay_size': 1000000,
        'num_train_steps': 50,
        'replay_init_size': 1000,
        'batch_size': 64,
        'render_train': False,
        'restore': False,
        'env': 'HalfCheetah-v2'
    }

    agent = DDPG(params)
    #agent.train()
    agent.test()
Example #4
def main(args):
    env = gym.make('Walker2d-v1')

    reward_history = []

    agent = DDPG(env)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())

    for episode in range(args.ep):
        # env init
        state = env.reset()
        total_rewards = 0
        for step in range(env.spec.timestep_limit):
            env.render()
            action = agent.sample_action(state[np.newaxis, :], explore=False)
            # act
            next_state, reward, done, _ = env.step(action[0])

            total_rewards += reward
            agent.store_experience(state, action, reward, next_state, done)

            agent.update_model()
            # shift
            state = next_state
            if done:
                break
        reward_history.append(total_rewards)
        print('Ep%d  reward:%d' % (episode+1, total_rewards))

    print('Average rewards: ', np.mean(reward_history))
Example #5
    def __init__(self, num_agents, state_size, action_size, params, seed):
        """Initialize multiple DDPG agents
        
        Params
        ======
            num_agents (int): number of DDPG agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            params (Params): hyperparameters 
            seed (int): random seed
        """
        self.agents = [
            DDPG(state_size, action_size, params, seed)
            for _ in range(num_agents)
        ]

        # Replay Buffer forall agents
        self.memory = ReplayBuffer(params.buffer_size, params.batch_size, seed)
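A hypothetical companion method for this multi-agent wrapper, assuming each underlying DDPG agent exposes an act(state, add_noise) method (not shown in the source) and that numpy is imported as np:

    def act(self, states, add_noise=True):
        # one action per agent, each agent acting on its own observation
        return np.array([
            agent.act(state, add_noise)
            for agent, state in zip(self.agents, states)
        ])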
Example #6
def main():
    RENDER = False
    env = gym.make(ENV_NAME)
    env = env.unwrapped
    env.seed(1)
    # get environment parameters
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_high = env.action_space.high
    action_low = env.action_space.low
    ddpg = DDPG(state_dim,action_dim,action_high,MODEL)
    var = 3
    for episode in range(EPISODES):
        ep_r = 0
        state = env.reset()

        for step in range(STEPS):
            if RENDER:
                env.render()
            action = ddpg.action_choose(state)
            action = np.clip(np.random.normal(action,var),action_low,action_high)

            state_,reward,done,info = env.step(action)

            ddpg.store_transitions(state,action,reward/10,state_)

            if ddpg.pointer > MEMORY_CAPACITY:
                var *= 0.9995
                ddpg.learn()

            state = state_
            ep_r += reward

            if step == STEPS-1:
                print('Episode:', episode, 'Episode reward:', ep_r, 'explore:', var)
                if ep_r> -300:
                    RENDER = True
                break
    if MODEL == 'train':
        torch.save(ddpg.actor_eval, 'actor_eval.pkl')
        torch.save(ddpg.actor_target, 'actor_target.pkl')
        torch.save(ddpg.critic_eval, 'critic_eval.pkl')
        torch.save(ddpg.critic_target,'critic_target.pkl')
    # writer.add_graph(ddpg.actor_eval,state)
    # writer.close()
    env.close()
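The script above relies on module-level constants that are not shown. A plausible set of definitions (all values are assumptions):

ENV_NAME = 'Pendulum-v0'   # any continuous-control gym task
MODEL = 'train'            # 'train' saves the networks at the end
EPISODES = 200
STEPS = 200
MEMORY_CAPACITY = 10000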
Example #7
def main(args):
    env = gym.make('Walker2d-v1')
    env = wrappers.Monitor(env, './videos/', force=True)
    reward_history = []

    agent = DDPG(env, args)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_avg_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        raise ValueError('model_path required!')

    for ep in range(args.ep):
        # env init
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            env.render()
            action = agent.sample_action(state[np.newaxis, :], noise=False)
            # act
            next_state, reward, done, _ = env.step(action[0])

            ep_rewards += reward
            agent.store_experience(state, action, reward, next_state, done)

            # shift
            state = next_state
            if done:
                break
        reward_history.append(ep_rewards)
        print('Ep%d  reward:%d' % (ep + 1, ep_rewards))

    print('Average rewards: ', np.mean(reward_history))
Example #9
def arp_pred_net( AR, num_his_ar=4, funname='', net_scale=1 ):

    num_sbs, _ = AR.shape
    num_ts = 10000
    action_size = num_sbs
    ar_size     = num_sbs
    his_ar_size = ar_size * num_his_ar
    print( "Size of history ar: "  + str(his_ar_size) )

    arp_errors  = [1]            # average error in the predicted arrival rate

    # Randomly initialize critic, actor, target critic, target actor and load prediction network and replay buffer, in the agent
    agent = DDPG( his_ar_size, ar_size, action_size, TAU, is_batch_norm, write_sum, net_size_scale=net_scale )
 
    for i in range( num_his_ar, num_ts+num_his_ar ):
        his_ar  = np.reshape( AR[:,i-num_his_ar:i], (1, his_ar_size) , order='F' )

        real_ar = AR[:,i]
        agent.add_experience_arp( his_ar, real_ar )

        # Train ar prediction network, after many num_ts, one minibatch is enough for each step
        arp_error = 1
        arp_train_times = min(10, max(1, int(i/ARP_BATCH_SIZE)) ) #if i<1000 else 5
        lr = max(ARP_LR_MIN, agent.decay(i, ARP_LR_MIN, ARP_LR_MAX, num_his_ar, 8000, 2) )
        for j in range( 0, arp_train_times ):
            arp_errort = agent.train_arp( lr )     #/math.log(i+2)
            #print('arp_errort: ' + str(arp_errort))
            if arp_errort !=1:
                arp_error = arp_errort

        if arp_error !=1:
            arp_errors.append( math.sqrt( arp_error ) )

        if i%(100) == 0:
            print('    i: ' + str(i) + ', arp_error: ' + str(math.sqrt( arp_error )))
    return arp_errors
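A small, hypothetical usage sketch: AR is assumed to be a (num_sbs, num_ts) array of arrival rates loaded elsewhere, and matplotlib is used here only for inspection:

import matplotlib.pyplot as plt

errors = arp_pred_net(AR, num_his_ar=4, net_scale=1)  # AR loaded beforehand
plt.plot(errors)
plt.xlabel('training step')
plt.ylabel('arrival-rate prediction RMSE')
plt.show()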
Example #10
def main(args):
    set_random_seed(args.seed)
    env = gym.make('Walker2d-v1')
    agent = DDPG(env, args)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_avg_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_avg_rewards = None

    reward_history, step_history = [], []
    train_steps = 0

    for ep in range(args.max_ep):
        # env init
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            action = agent.sample_action(state[np.newaxis, :], noise=True)
            # act
            next_state, reward, done, _ = env.step(action[0])
            train_steps += 1
            ep_rewards += reward

            agent.store_experience(state, action, reward, next_state, done)
            agent.update_model()
            # shift
            state = next_state
            if done:
                print('Ep %d global_steps: %d Reward: %.2f' %
                      (ep + 1, agent.global_steps, ep_rewards))
                # reset ou noise
                agent.ou.reset()
                break
        step_history.append(train_steps)
        if not reward_history:
            reward_history.append(ep_rewards)
        else:
            reward_history.append(reward_history[-1] * 0.99 + ep_rewards * 0.01)

        # Evaluate during training
        if ep % args.log_every == 0 and ep > 0:
            ep_rewards = 0
            for ep_eval in range(args.test_ep):
                state = env.reset()
                for step_eval in range(env.spec.timestep_limit):
                    action = agent.sample_action(
                        state[np.newaxis, :], noise=False)
                    next_state, reward, done, _ = env.step(action[0])
                    ep_rewards += reward
                    state = next_state
                    if done:
                        break

            curr_avg_rewards = ep_rewards / args.test_ep

            # logging
            print('\n')
            print('Episode: %d' % (ep + 1))
            print('Global steps: %d' % agent.global_steps)
            print('Mean reward: %.2f' % curr_avg_rewards)
            print('\n')
            if not best_avg_rewards or (curr_avg_rewards >= best_avg_rewards):
                best_avg_rewards = curr_avg_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_avg_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved to %s' % save_name)

    plt.plot(step_history, reward_history)
    plt.xlabel('steps')
    plt.ylabel('running reward')
    plt.show()
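A hypothetical argument parser matching the flags main() reads above (seed, gpu, model_path, max_ep, log_every, test_ep, save_path); the defaults are assumptions, not taken from the source:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--model_path', type=str, default=None)
    parser.add_argument('--max_ep', type=int, default=2000)
    parser.add_argument('--log_every', type=int, default=50)
    parser.add_argument('--test_ep', type=int, default=10)
    parser.add_argument('--save_path', type=str, default='./models/')
    main(parser.parse_args())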
Example #11
def train(cfg):
    print('Start to train ! \n')
    env = NormalizedActions(gym.make("Pendulum-v0"))

    # add exploration noise to the actions
    ou_noise = OUNoise(env.action_space)

    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states,
                 n_actions,
                 device=device,
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action,
                                         i_step)  # the random process from the DDPG paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    '''Save the model'''
    if not os.path.exists(SAVED_MODEL_PATH):  # check whether the folder exists
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH + 'checkpoint.pth')
    '''Save rewards and related results'''
    if not os.path.exists(RESULT_PATH):  # check whether the folder exists
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_train.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_train.npy',
            moving_average_rewards)
    np.save(RESULT_PATH + 'steps_train.npy', ep_steps)
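The NormalizedActions wrapper used in Examples #1 and #11 is not shown; a common implementation (sketched here as an assumption, not the author's code) rescales actions from [-1, 1] to the environment's [low, high] range:

import gym
import numpy as np

class NormalizedActions(gym.ActionWrapper):
    def action(self, action):
        # map [-1, 1] -> [low, high]
        low, high = self.action_space.low, self.action_space.high
        action = low + (action + 1.0) * 0.5 * (high - low)
        return np.clip(action, low, high)

    def reverse_action(self, action):
        # map [low, high] -> [-1, 1]
        low, high = self.action_space.low, self.action_space.high
        action = 2.0 * (action - low) / (high - low) - 1.0
        return np.clip(action, low, high)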
Example #12
from torch.utils.tensorboard import SummaryWriter
from agent import DDPG
from exploration import OUActionNoise
from utils import get_screen

epoch = 5000
env = gym.make('Pendulum-v0')

# seed
np.random.seed(42)
env.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

writer = SummaryWriter(log_dir='logs/')
agent = DDPG(env, writer)

all_timesteps = 0

for e in range(epoch):
    noise = OUActionNoise(env.action_space.shape[0])
    env.reset()
    pixel = env.render(mode='rgb_array')
    state = deque([get_screen(pixel) for _ in range(3)], maxlen=3)
    cumulative_reward = 0
    for timestep in range(200):
        action = agent.get_action(np.array(state)[np.newaxis], noise, timestep)
        _, reward, done, _ = env.step(action * env.action_space.high[0])
        pixel = env.render(mode='rgb_array')
        state_ = state.copy()
        state_.append(get_screen(pixel))
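The snippet above breaks off inside the timestep loop, and the imported OUActionNoise class is not shown. As a stand-in, here is a standard Ornstein-Uhlenbeck process sketch; the theta/sigma values and the __call__ interface are assumptions, since the snippet only shows the noise object being handed to agent.get_action:

import numpy as np

class OUActionNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def __call__(self):
        # mean-reverting random walk around mu
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state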
Example #13
    # Initialize policy
    # if args.policy == "TD3":
    #     # Target policy smoothing is scaled wrt the action scale
    #     kwargs["policy_noise"] = args.policy_noise * max_action
    #     kwargs["noise_clip"] = args.noise_clip * max_action
    #     kwargs["policy_freq"] = args.policy_freq
    #     policy = TD3.TD3(**kwargs)
    if args.policy == "A2C":
        envs = ParaEnv(args.env, args.n_processes, args.seed)
        policy = A2C.A2C(env.observation_space, env.action_space,
                         args.discount, args.tau, max_episode_timesteps)
        x, y = policy.run(envs, file_name, args)
        write_result(args.env + "_A2C.json", x, y)

    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
        x, y = policy.run(env, file_name, args)
        write_result(args.env + "_DDPG.json", x, y)

    elif args.policy == "REINFORCE":
        args.n_steps = 5
        args.n_processes = 16
        envs = ParaEnv(args.env, args.n_processes, args.seed)
        policy = REINFORCE.REINFORCE(env.observation_space, env.action_space,
                                     args.discount, args.tau, args.n_steps,
                                     args.n_processes, max_episode_timesteps)
        x, y = policy.run(envs, file_name, args)
        write_result(args.env + "_REINFORCE.json", x, y)

    else:
        x, y = None, None
Example #14
def test(agent, trial_dir, test_episode, visual_flag, submit_flag):
    pid = os.getpid()
    logger, _ = prepare_for_logging("pid_{}".format(pid), False)

    logger.info("trial_dir={}".format(trial_dir))
    if not os.path.exists(trial_dir):
        logger.info("trial_dir does not exist")
        return

    # create environment
    env = NIPS(visualize=visual_flag)

    # load config
    with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
        config = pickle.load(f)

    if agent == 'DDPG':
        config["scale_action"] = scale_action

        # observation processor
        if "ob_processor" not in config or config["ob_processor"] == "dummy":
            ob_processor = ObservationProcessor()
        elif config["ob_processor"] == "2ndorder":
            ob_processor = SecondOrderAugmentor()
        else:
            ob_processor = BodySpeedAugmentor()
        config["ob_aug_dim"] = ob_processor.get_aug_dim()
        util.print_settings(logger, config, env)

        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # load weights
        paths = {}
        if test_episode > 0:
            paths["actor"] = "actor_{}.h5".format(test_episode)
            paths["critic"] = "critic_{}.h5".format(test_episode)
            paths["target"] = "target_{}.h5".format(test_episode)
        else:
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
        paths = {k: os.path.join(trial_dir, v) for k, v in paths.iteritems()}
        logger.info("Paths to models: {}".format(paths))
        agent.load_models(paths)

    elif agent == 'TRPO':

        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        config = {
            "agent": 'TRPO',
            "batch_size": 5000,
            "n_envs": 16,
            "n_iters": 5000,
            "ob_processor": "bodyspeed",
            # "hidden_nonlinearity": "relu",
            # "action_nonlinearity": "tanh",
            # "policy_hiddens": [128, 128, 64, 64],
            # "baseline_hiddens": [128, 128, 64, 64],
            "policy_hiddens": [256, 128, 64],
            "baseline_hiddens": [256, 128, 64],
            "hidden_nonlinearity": "tanh",
            "action_nonlinearity": None,
        }

        agent = TRPO(
            env,
            env_maker=None,
            logger=logger,
            log_dir=None,
            ob_processor_maker=ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            hidden_nonlinearity=config['hidden_nonlinearity'],
            action_nonlinearity=config['action_nonlinearity'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )
        agent.load_models(trial_dir)
    else:
        raise ValueError('invalid agent type')

    if submit_flag:
        submit(agent, logger)
    else:
        rewards = []
        for i in xrange(10):
            steps, reward = agent.test(max_steps=1000)
            logger.info("episode={}, steps={}, reward={}".format(
                i, steps, reward))
            rewards.append(reward)
        logger.info("avg_reward={}".format(np.mean(rewards)))
Example #15
def main(args):
    env = gym.make('Walker2d-v1')

    agent = DDPG(env)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
    else:
        # build a new model
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0

    MAX_EPISODES = 100000
    TEST = 10

    for episode in range(MAX_EPISODES):
        # env init
        state = env.reset()
        total_rewards = 0
        for step in range(env.spec.timestep_limit):
            action = agent.sample_action(state[np.newaxis, :], explore=True)
            # act
            next_state, reward, done, _ = env.step(action[0])
            total_rewards += reward

            agent.store_experience(state, action, reward, next_state, done)

            agent.update_model()
            # shift
            state = next_state
            if done:
                print('Ep %d global_steps: %d Reward: %.2f' %
                      (episode + 1, agent.global_steps, total_rewards))
                # reset ou noise
                agent.ou.reset()
                break

        # Evaluation per 100 ep
        if episode % 100 == 0 and episode > 100:
            total_rewards = 0
            for ep_eval in range(TEST):
                state = env.reset()
                for step_eval in range(env.spec.timestep_limit):
                    action = agent.sample_action(state[np.newaxis, :],
                                                 explore=False)
                    next_state, reward, done, _ = env.step(action[0])
                    total_rewards += reward
                    state = next_state
                    if done:
                        break

            mean_rewards = total_rewards / TEST

            # logging
            print('\n')
            print('Episode: %d' % (episode + 1))
            print('Global steps: %d' % agent.global_steps)
            print('Mean reward: %.2f' % mean_rewards)
            print('\n')
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path \
                + str(episode) + '_' + str(round(mean_rewards, 2))
            saver.save(agent.sess, save_name)
Example #16
def train_agent(args, param):
    """

    Args:
    """

    # create CNN convert the [1,3,84,84] to [1, 200]

    use_gym = False
    # for seed experiments: use the passed-in param as the random seed
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    #args.repeat_opt = repeat_opt
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.locexp) + "/" + str(args.env_name) + '-agent-' + str(
        args.policy)
    pathname += "_batch_size_" + str(args.batch_size)
    pathname += '_update_freq: ' + str(
        args.target_update_freq) + "num_q_target_" + str(
            args.num_q_target) + "_seed_" + str(args.seed)
    pathname += "_actor_300_200"
    text = "Star_training target_update_freq: {}  num_q_target: {}  use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )

    state_dim = 200
    print("State dim, ", state_dim)
    action_dim = env.dof
    print("action_dim ", action_dim)
    max_action = 1
    args.max_episode_steps = 200

    policy = DDPG(state_dim, action_dim, max_action, args)
    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (3, 84, 84)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    replay_buffer = ReplayBuffer(obs_shape, action_shape,
                                 int(args.buffer_size), args.image_pad,
                                 args.device)
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            #env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))

                print(text)
                write_into_file(pathname, text)
                #policy.train(replay_buffer, writer, episode_timesteps)
            # We evaluate the episode and we save the policy
            if total_timesteps > args.start_timesteps:
                policy.train(replay_buffer, writer, 200)
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the training step is done, we reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before 10000 timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:  # After 10000 timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise,
                        size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) +
                          np.random.normal(
                              0, max_action * args.expl_noise,
                              size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        done = float(done)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args,
                                                    state_buffer, policy)
        # We check if the episode is done
        #done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(
            done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer next_obs ", obs.shape)
            print("add to bufferobs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 0)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(
        evaluate_policy(policy, writer, total_timesteps, args, episode_num))
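Helpers such as write_into_file, stacked_frames, create_next_obs, time_format and evaluate_policy are assumed to exist elsewhere in the project. As an illustration only, write_into_file could be as simple as the sketch below (the '.log' suffix is an assumption):

def write_into_file(pathname, text):
    # append one line of text to a per-run log file
    with open(pathname + '.log', 'a') as f:
        f.write(text + '\n')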
Example #17
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
        print('Episode:{}/{}, Reward:{}'.format(i_episode + 1, cfg.train_eps,
                                                ep_reward))
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == '__main__':
    cfg = DDPGConfig()
    env = NormalizedActions(gym.make('Pendulum-v0'))
    env.seed(1)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = DDPG(state_dim, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    agent.save(path=SAVED_MODEL_PATH)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="train",
                 algo=cfg.algo,
                 path=RESULT_PATH)
Example #18
def train(config, trial_dir=None, visualize=False):
    pid = os.getpid()
    logger, log_dir = prepare_for_logging("pid_{}".format(pid))

    # create environment
    env = NIPS(visualize)
    logger.info("pid={}, env={}".format(pid, id(env)))
    if trial_dir is not None and os.path.exists(
            trial_dir) and config['agent'] == 'DDPG':
        logger.info("Loading config from {} ...".format(trial_dir))
        with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
            config = pickle.load(f)
    # config["scale_action"] = scale_action
    config["title_prefix"] = "RunEnv"

    # observation processor
    if "ob_processor" not in config or config["ob_processor"] == "dummy":
        ob_processor = ObservationProcessor()
    elif config["ob_processor"] == "2ndorder":
        ob_processor = SecondOrderAugmentor()
    else:
        ob_processor = BodySpeedAugmentor()
    config["ob_aug_dim"] = ob_processor.get_aug_dim()

    # snapshot info
    if "save_snapshot_every" not in config:
        config["save_snapshot_every"] = 500
    save_snapshot_every = config["save_snapshot_every"]

    # save config
    with open(os.path.join(log_dir, "config.pk"), "wb") as f:
        pickle.dump(config, f)
    util.print_settings(logger, config, env)

    # DDPG
    if config['agent'] == 'DDPG':
        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # print networks
        agent.actor.summary()
        agent.target_actor.summary()
        agent.critic.summary()

        # add callbacks
        def p_info(episode_info):
            util.print_episode_info(logger, episode_info, pid)

        def save_nets(episode_info):
            paths = {}
            paths["actor"] = os.path.join(log_dir, "actor.h5")
            paths["critic"] = os.path.join(log_dir, "critic.h5")
            paths["target"] = os.path.join(log_dir, "target.h5")
            agent = episode_info["agent"]
            agent.save_models(paths)

        def save_snapshots(episode_info):
            agent = episode_info["agent"]
            episode = episode_info["episode"]
            if episode % save_snapshot_every == 0:
                paths = {}
                paths["actor"] = os.path.join(log_dir,
                                              "actor_{}.h5".format(episode))
                paths["critic"] = os.path.join(log_dir,
                                               "critic_{}.h5".format(episode))
                paths["target"] = os.path.join(log_dir,
                                               "target_{}.h5".format(episode))
                agent.save_models(paths)
                memory_path = os.path.join(log_dir, "replaybuffer.npz")
                agent.save_memory(memory_path)
                logger.info("Snapshots saved. (pid={})".format(pid))

        agent.on_episode_end.append(p_info)
        agent.on_episode_end.append(save_nets)
        agent.on_episode_end.append(save_snapshots)

        # load existing model
        if trial_dir is not None and os.path.exists(trial_dir):
            logger.info("Loading networks from {} ...".format(trial_dir))
            paths = {}
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
            paths = {
                k: os.path.join(trial_dir, v)
                for k, v in paths.iteritems()
            }
            logger.info("Paths to models: {}".format(paths))
            agent.load_models(paths)
            memory_path = os.path.join(trial_dir, "replaybuffer.npz")
            if os.path.exists(memory_path):
                agent.load_memory(memory_path)
                logger.info("Replay buffer loaded.")

        # learn
        util.print_sec_header(logger, "Training")
        reward_hist, steps_hist = agent.learn(
            total_episodes=config["total_episodes"],
            max_steps=config["max_steps"])
        env.close()

        # send result
        img_file = os.path.join(log_dir, "train_stats.png")
        util.plot_stats(reward_hist, steps_hist, img_file)
        log_file = os.path.join(log_dir, "train.log")
        title = log_dir + "_" + config["title_prefix"]
        util.send_email(title, [img_file], [log_file], SMTP_SERVER)

    # TRPO
    elif config['agent'] == 'TRPO':

        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        def env_maker(visualize=False):
            env = NIPS(visualize=visualize)
            monitor_dir = os.path.join(log_dir, "gym_monitor")
            env = gym.wrappers.Monitor(env,
                                       directory=monitor_dir,
                                       video_callable=False,
                                       force=False,
                                       resume=True,
                                       write_upon_reset=True)
            return env

        del env
        env = env_maker()

        agent = TRPO(
            env,
            env_maker,
            logger,
            log_dir,
            ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )

        if trial_dir is not None and os.path.exists(trial_dir):
            agent.load_models(trial_dir)
        agent.learn()

    logger.info("Finished (pid={}).".format(pid))
Example #19
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)
# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)
# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])


# Train Agent ##################################################################

from agent import DDPG
agent = DDPG(state_size=state_size, action_size=action_size, random_seed=2)

def train(n_episodes=100, max_t=1000):
    """Deep Deterministic Policy Gradiant.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
    """
    scores = []      # initialize the score
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        agent.reset()
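The snippet stops mid-function. A plausible continuation of the per-episode loop, assuming the usual unityagents step API and that the DDPG agent exposes act() and step() methods (both assumptions, not taken from the source):

        score = 0
        for t in range(max_t):
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)
        scores.append(score)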
Example #20
        n_data_worker=args.n_worker,
        batch_size=args.data_bsize,
        args=args,
        export_model=args.job == 'export',
        use_new_input=args.use_new_input)

    if args.job == 'train':
        # build folder and logs
        base_folder_name = '{}_{}_r{}_search'.format(args.model, args.dataset,
                                                     args.preserve_ratio)
        if args.suffix is not None:
            base_folder_name = base_folder_name + '_' + args.suffix
        args.output = get_output_folder(args.output, base_folder_name)
        print('=> Saving logs to {}'.format(args.output))
        tfwriter = SummaryWriter(logdir=args.output)
        text_writer = open(os.path.join(args.output, 'log.txt'), 'w')
        print('=> Output path: {}...'.format(args.output))

        nb_states = env.layer_embedding.shape[1]
        nb_actions = 1  # just 1 action here

        args.rmsize = args.rmsize * len(env.prunable_idx)  # for each layer
        print('** Actual replay buffer size: {}'.format(args.rmsize))

        agent = DDPG(nb_states, nb_actions, args)
        train(args.train_episode, agent, env, args.output, args)
    elif args.job == 'export':
        export_model(env, args)
    else:
        raise RuntimeError('Undefined job {}'.format(args.job))
Example #21
def train():

    runtime = 5.  # time limit of the episode
    init_pose = np.array([0., 0., 4.0, 0., 0., 0.0])  # initial pose
    init_velocities = np.array([0., 0., 0.0])  # initial velocities
    init_angle_velocities = np.array([0., 0., 0.])  # initial angle velocities
    file_output = 'rewards.txt'  # file name for saved results

    num_episodes = 10
    target_pos = np.array([0., 0., 40.])
    task = Task(init_pose=init_pose,
                init_velocities=init_velocities,
                init_angle_velocities=init_angle_velocities,
                target_pos=target_pos)
    agent = DDPG(task)

    labels = ['episode', 'avg_reward', 'total_reward']
    results = {x: [] for x in labels}

    with open(file_output, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)
        best_total_reward = -1000
        for i_episode in range(1, num_episodes + 1):
            state = agent.reset_episode()  # start a new episode
            total_reward = 0
            rewards = []

            while True:

                # select action according to the learned policy and the exploration noise
                action = agent.act(state)
                # execute the action and observe the reward and the next state
                next_state, reward, done = task.step(action)

                # sample mini batch and learn
                agent.step(action, reward, next_state, done)

                # data tracking
                total_reward += reward
                rewards.append(reward)

                if total_reward > best_total_reward:
                    best_total_reward = total_reward

                state = next_state

                if done:
                    avg_reward = np.mean(np.array(rewards))
                    print(task.sim.pose)
                    #to_write = [task.sim.time] + list(task.sim.pose) + list(task.sim.v) + list(task.sim.angular_v) + list(rotor_speeds)
                    #for ii in range(len(labels)):
                    #    results[labels[ii]].append(to_write[ii])
                    #writer.writerow(to_write)

                    to_write = [i_episode] + [avg_reward] + [total_reward]
                    for ii in range(len(labels)):
                        results[labels[ii]].append(to_write[ii])
                    print(
                        "\rEpisode = {:4d}, total_reward = {:7.3f}, avg_reward={:7.3} (best = {:7.3f})"
                        .format(i_episode, total_reward, avg_reward,
                                best_total_reward),
                        end="")  # [debug]
                    break
            sys.stdout.flush()

    return agent
Example #22
def runs(pattern_type = 1, noise_method = 2):
    patterns   = ['sp','svp','fvp']
    noise_para = ['0','1_0.05','2_0.05_0.03','3_0.05']
    data_file = 'AR_'+patterns[pattern_type-1]+'_n_10_d_' + str(days) + '_nm_' + noise_para[noise_method] # 'Simulated_arrival_rates_no_noise_' + str(days)   #
    samples   = sio.loadmat( data_file )
    arrival_rates = samples['AR']

    num_sbs, steps = arrival_rates.shape
    steps = 20000                                 # The number of steps (can freely modify)

    if soft_update == 1 :
        tui  = 1                     # Target network update interval
        TAUt = TAU
    else:
        tui  = 100
        TAUt = 1
    ar_size       = num_sbs
    his_ar_size   = ar_size * num_his_ar
    load_size     = ar_size + 1
    action_size   = num_sbs
    state_size    = num_sbs * 2
    print( "Size of history ar: "  + str(his_ar_size) )
    print( "Size of action: "      + str(action_size) )
    print( "Number of timeslots: " + str(steps) )
    
    rewards     = np.zeros( (steps) )            # reward of each timeslot
    mean_reward = np.zeros( ( int(steps/48) + 1 ) )
    actions     = np.zeros( (steps, num_sbs) )   # refined action
    actions_o   = np.zeros( (steps, num_sbs) )   # original/raw output action of the actor network
    prev_action = np.ones(  (num_sbs) )    
    pred_ars    = np.zeros( (steps, num_sbs) )   # predicted arrival rates of the next timeslot
    real_loads  = np.zeros( (steps, num_sbs+1) )
    pred_loads  = np.zeros( (steps, num_sbs+1) )

    arp_errors  = [1]            # average error in the predicted arrival rate
    lm_errors   = [1]            # average error in the mapped load
    c_errors    = [1]            # average error in the Q values (critic network output)
    a_errors    = [1]            # average error in the action output

    
    # Randomly initialize critic, actor, target critic, target actor and load prediction network and replay buffer, in the agent
    agent = DDPG( his_ar_size, ar_size, action_size, TAUt, is_batch_norm, write_sum )
    exploration_noise = OUNoise( num_sbs )

    for i in range( num_his_ar, steps ):
        #print("i: "+str(i))
        his_ar  = np.reshape( arrival_rates[:,i-num_his_ar:i], (1, his_ar_size) , order='F' )
        pred_ar = agent.ar_pred_net.evaluate_ar_pred( his_ar )
        #real_ar = np.array( arrival_rates[:,i] )
        #print("his_ar: "+str(his_ar))
        # Generate a state_ac of the AC network
        state_ac  = agent.construct_state( pred_ar, prev_action )    #

        if eps_greedy:
            # epsilon-greedy based exploration
            if random.uniform(0, 1) < epsilon/i:#math.log(i+2):     #
                sigmai = 0.3#/math.log(i+1)
                action = exploration_noise.noisei( 0.0, sigmai )
            else:
                action = agent.evaluate_actor( state_ac )
                action = action[0]
        else:
            # noise-based exploration
            action = agent.evaluate_actor( state_ac )[0]
            sigmai = agent.decay(i, 0.01, 0.5, num_his_ar, steps/2, 2)
            #action = [ 1-a if random.uniform(0, 1)<sigmai else a for a in action ]
            noise  = exploration_noise.noisei( 0, sigmai )      #0.5/math.log(i+2)
            action = action + noise
        actions_o[i] = action

        # Refine the action, including rounding to 0 or 1, and greedy exploration
        if i<3000:
            action = agent.refine_action( state_ac, action, ac_ref1 )       # refine the action
        else:
            action = agent.refine_action( state_ac, action, ac_ref2 )

        # after taking the action and the env reacts
        #print("action_o: "+str(actions_o[i])+", action"+str(action))
        #print("pred_ar: "+str(pred_ar))
        pred_load = agent.load_map_net.evaluate_load_map( pred_ar, np.reshape( action, [1, action_size] ) )
        real_ar   = arrival_rates[:,i]
        real_load = agent.env.measure_load( real_ar, action )
        #print("pred_load: "+str(pred_load))
        #print("real_load: "+str(real_load))
        reward    = agent.env.find_reward( real_load, action, prev_action )   #
        #print("real reward: "+str(reward))
        next_his_ar   = np.reshape( arrival_rates[:, i-num_his_ar+1:i+1], (1, his_ar_size) , order='F' )
        next_pred_ar  = agent.ar_pred_net.evaluate_ar_pred( next_his_ar )
        next_state_ac = agent.construct_state( next_pred_ar, action )    #

        # Add s_t, s_t+1, action, reward to experience memory
        #print("real_ar: "+str(real_ar) + "action: "+str(action))
        ar_action = np.concatenate([real_ar, action])
        agent.add_experience_ac(  state_ac,  next_state_ac, action, reward )
        agent.add_experience_arp( his_ar,    real_ar )
        agent.add_experience_lm(  ar_action, real_load )

        # Train critic and actor network, maybe multiple minibatches per step
        a_lr = max(A_LR_MIN, agent.decay(i, A_LR_MIN, A_LR_MAX, num_his_ar, 8000, 2) ) #max( AC_LR_MIN[0], AC_LR_MAX[0]/math.log2(i+1) )
        c_lr = max(C_LR_MIN, agent.decay(i, C_LR_MIN, C_LR_MAX, num_his_ar, 8000, 2) ) #max( AC_LR_MIN[1], AC_LR_MAX[1]/math.log2(i+1) )
        learning_rate = [ a_lr, c_lr ]

        cerror = 1
        aerror = 1
        ac_train_times = min(16, max(1, int(i/500)) )
        for j in range( 0, ac_train_times ):    #                  #between 1 and 5
            cerrort, aerrort = agent.train_ac( learning_rate, soft_update )
            if cerrort !=1:
                cerror = cerrort
                aerror = aerrort

        if ( (i%tui == 0) and (soft_update==0) ):
            agent.update_target_net()
        
        # Train ar prediction network, after many steps, one minibatch is enough for each step
        arp_error = 1
        arp_train_times = min(10, max(1, int(i/ARP_BATCH_SIZE)) ) #if i<1000 else 5
        lr = max(ARP_LR_MIN, agent.decay(i, ARP_LR_MIN, ARP_LR_MAX, num_his_ar, 8000, 2) )
        for j in range( 0, arp_train_times ):
            arp_errort = agent.train_arp( lr )     #/math.log(i+2)
            if arp_errort !=1:
                arp_error = arp_errort
        
        # Train load mapping network, after many steps, one minibatch is enough for each step
        lm_error = 1
        lm_train_times = min(10, max(1, int(i/LM_BATCH_SIZE)) ) #if i<1000 else 20
        lr = max(LM_LR_MIN, agent.decay(i, LM_LR_MIN, LM_LR_MAX, num_his_ar, 8000, 2) )
        for j in range( 0, lm_train_times ):
            lm_errort = agent.train_lm( lr )   #
            if lm_errort !=1:
                lm_error = lm_errort

        if arp_error !=1:
            arp_errors.append( math.sqrt( arp_error ) )
        if lm_error !=1:
            lm_errors.append( math.sqrt( lm_error ) )
        if cerror !=1:
            c_errors.append( math.sqrt( cerror ) )
        if aerror !=1:
            a_errors.append( aerror*num_sbs )         # hamming distance error

        prev_action = action
        pred_ars[i] = pred_ar
        real_loads[i] = real_load
        pred_loads[i] = pred_load
        actions[i]  = action
        rewards[i]  = reward
        if i%(48) == 0:
            mean_reward[int(i/48)] = mean( rewards[i-48:i] )
            print("==== i: %5d, arp error: %1.5f, lm error: %1.5f, a error: %1.5f, c error: %1.5f, mean reward: %1.5f \n" % ( i, arp_errors[-1], lm_errors[-1], a_errors[-1], c_errors[-1], mean_reward[int(i/48)] ) )

    agent.close_all()      # this will write network parameters into .txt files

    """
    writetext( actions,     'actions',    1 )
    writetext( actions_o,   'actions_o',  1 )
    writetext( pred_ars,    'pred_ar',    1 )
    writetext( rewards,     'rewards'       )
    writetext( mean_reward, 'mean_rewards'  )
    writetext( arp_errors,  'arp_errors', 1 )
    writetext( lm_errors,   'lm_errors',  1 )
    writetext( c_errors,    'c_errors'      )
    writetext( a_errors,    'a_errors'      )
    writetext( real_loads,  'real_loads', 1 )
    writetext( pred_loads,  'pred_loads', 1 )

    pre = '_bn_'+str(is_batch_norm)+'_gi_'+str(is_grad_inverter)+'_ar_'+str(ac_ref1)+'_'+str(steps)
    writetext( mean_reward, 'mean_rewards'+pre  )

    plt.plot(rewards)
    plt.show()

    """

    writetext( rewards, 'ACDQN_rewards_' + data_file  )

    return 1
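The writetext helper used above is not defined in the snippet; a hypothetical equivalent simply dumps an array to a .txt file (the optional third argument is assumed to flag 2-D data and is not actually needed by np.savetxt):

import numpy as np

def writetext(data, name, is_2d=0):
    # np.savetxt handles both 1-D and 2-D arrays, so is_2d is kept
    # only to match the call signature used above
    np.savetxt(name + '.txt', np.asarray(data), fmt='%.6f')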
Example #23
def dqn_bsa(AR, ac_ref=4, write_sum=0, net_scale=1, funname='', beta0=beta):

    num_sbs, num_ts = AR.shape

    num_mbs = 1
    ts_per_day = 48

    ar_size = num_sbs
    his_ar_size = ar_size * num_his_ar
    load_size = num_sbs + num_mbs
    action_size = num_sbs
    state_size = ar_size + action_size
    print("Size of history ar: " + str(his_ar_size))
    print("Size of action: " + str(action_size))
    print("Number of timeslots: " + str(num_ts))

    rewards = np.zeros((num_ts))  # reward of each timeslot
    sum_powers = np.zeros((num_ts))
    switch_powers = np.zeros((num_ts))
    qos_costs = np.zeros((num_ts))
    throughputs = np.zeros((num_ts))
    prev_action = np.ones((num_sbs))

    arp_errors = [1]  # average error in the predicted arrival rate
    lm_errors = [1]  # average error in the mapped load
    c_errors = [1]  # average error in the Q values (critic network output)
    a_errors = [1]  # average error in the action output

    # Randomly initialize critic, actor, target critic, target actor and load prediction network and replay buffer, in the agent
    agent = DDPG(his_ar_size,
                 ar_size,
                 action_size,
                 TAU,
                 is_batch_norm,
                 write_sum,
                 net_size_scale=net_scale,
                 beta0=beta0)
    exploration_noise = OUNoise(num_sbs)

    for i in range(num_his_ar, num_ts):
        his_ar = np.reshape(AR[:, i - num_his_ar:i], (1, his_ar_size),
                            order='F')
        pred_ar = agent.ar_pred_net.evaluate_ar_pred(his_ar)

        # Generate a state_ac of the AC network
        state_ac = agent.construct_state(pred_ar, prev_action)  #

        if eps_greedy:
            # epsilon-greedy based exploration
            if random.uniform(0, 1) < epsilon / i:  #math.log(i+2):     #
                sigmai = 0.3  #/math.log(i+1)
                action = exploration_noise.noisei(0.0, sigmai)
            else:
                action = agent.evaluate_actor(state_ac)
                action = action[0]
        else:
            # noise-based exploration
            action = agent.evaluate_actor(state_ac)[0]
            sigmai = agent.decay(i, 0.01, 0.5, num_his_ar, num_ts / 2, 2)
            noise = exploration_noise.noisei(0, sigmai)  #0.5/math.log(i+2)
            action = action + noise

        # Refine the action, including rounding to 0 or 1, and greedy exploration
        if ac_ref <= 3:
            action = agent.refine_action(state_ac, action, ac_ref)
        else:  # hybrid method
            if random.uniform(0, 1) < agent.decay(i, 0, 3, num_his_ar,
                                                  num_ts * 0.75, 2):
                action = agent.refine_action(state_ac, action,
                                             3)  # refine the action
            else:
                action = agent.refine_action(state_ac, action, 2)

        # Apply the action: the environment returns the realized load and the reward components
        #pred_load = agent.load_map_net.evaluate_load_map( pred_ar, np.reshape( action, [1, action_size] ) )
        real_ar = AR[:, i]
        real_load = agent.env.measure_load(real_ar, action)
        reward, sum_power, switch_power, qos_cost, throughput = agent.env.find_reward(
            real_load, action, prev_action)

        next_his_ar = np.reshape(AR[:, i - num_his_ar + 1:i + 1],
                                 (1, his_ar_size),
                                 order='F')
        next_pred_ar = agent.ar_pred_net.evaluate_ar_pred(next_his_ar)
        next_state_ac = agent.construct_state(next_pred_ar, action)

        # Add s_t, s_t+1, action, reward to experience memory
        ar_action = np.concatenate([real_ar, action])
        agent.add_experience_ac(state_ac, next_state_ac, action, reward)
        agent.add_experience_arp(his_ar, real_ar)
        agent.add_experience_lm(ar_action, real_load)

        # Train the critic and actor networks, possibly on several minibatches per timeslot
        a_lr = max(A_LR_MIN,
                   agent.decay(i, A_LR_MIN, A_LR_MAX, num_his_ar, 8000, 2))
        # alternative: a_lr = max(AC_LR_MIN[0], AC_LR_MAX[0] / math.log2(i + 1))
        c_lr = max(C_LR_MIN,
                   agent.decay(i, C_LR_MIN, C_LR_MAX, num_his_ar, 8000, 2))
        # alternative: c_lr = max(AC_LR_MIN[1], AC_LR_MAX[1] / math.log2(i + 1))
        learning_rate = [a_lr, c_lr]

        cerror = 1
        aerror = 1
        ac_train_times = min(16, max(1, int(i / 500)))  # between 1 and 16 minibatches
        for j in range(0, ac_train_times):
            cerrort, aerrort = agent.train_ac(learning_rate, 1)
            if cerrort != 1:
                cerror = cerrort
                aerror = aerrort

        # Train the arrival-rate prediction network; after enough timeslots, one minibatch per step suffices
        arp_error = 1
        arp_train_times = min(10, max(1, int(i / ARP_BATCH_SIZE)))
        lr = max(ARP_LR_MIN,
                 agent.decay(i, ARP_LR_MIN, ARP_LR_MAX, num_his_ar, 8000, 2))
        for j in range(0, arp_train_times):
            arp_errort = agent.train_arp(lr)
            if arp_errort != 1:
                arp_error = arp_errort

        # Train the load-mapping network; after enough timeslots, one minibatch per step suffices
        lm_error = 1
        lm_train_times = min(10, max(1, int(i / LM_BATCH_SIZE)))
        lr = max(LM_LR_MIN,
                 agent.decay(i, LM_LR_MIN, LM_LR_MAX, num_his_ar, 8000, 2))
        for j in range(0, lm_train_times):
            lm_errort = agent.train_lm(lr)
            if lm_errort != 1:
                lm_error = lm_errort

        if arp_error != 1:
            arp_errors.append(math.sqrt(arp_error))
        if lm_error != 1:
            lm_errors.append(math.sqrt(lm_error))
        if cerror != 1:
            c_errors.append(math.sqrt(cerror))
        if aerror != 1:
            a_errors.append(aerror * num_sbs)  # hamming distance error

        prev_action = action
        rewards[i] = reward
        sum_powers[i] = sum_power
        throughputs[i] = throughput
        switch_powers[i] = switch_power
        qos_costs[i] = qos_cost

        if i % (ts_per_day) == 0:
            mrt = np.mean(rewards[i - ts_per_day:i])
            if write_sum > 0:
                print(
                    funname +
                    " ------- i: %5d, arp-e: %1.5f, lm-e: %1.5f, a-e: %1.5f, c-e: %1.5f, d-reward: %1.5f \n"
                    % (i, arp_errors[-1], lm_errors[-1], a_errors[-1],
                       c_errors[-1], mrt))
            else:
                print(funname + " ------- i: %5d, mean reward: %1.5f \n" %
                      (i, mrt))

    return rewards, sum_powers, switch_powers, qos_costs, throughputs
Ejemplo n.º 24
0
import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from agent import DDPG
from exploration import OUActionNoise

epoch = 2000
env = gym.make('Pendulum-v0')

# seed
np.random.seed(42)
env.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

writer = SummaryWriter(log_dir='logs/')
agent = DDPG(env, writer)

all_timesteps = 0

for e in range(epoch):
    noise = OUActionNoise(env.action_space.shape[0])
    state = env.reset()
    cumulative_reward = 0
    for timestep in range(200):
        action = agent.get_action(state, noise, timestep)
        state_, reward, done, _ = env.step(action * env.action_space.high[0])
        # env.render()
        agent.store_transition(state, action, state_, reward, done)

        state = state_
        cumulative_reward += reward
Ejemplo n.º 25
0
def main():
    """
    UnboundLocalError: local variable 'RENDER' referenced before assignment

    If a global variable is assigned inside a function without first being
    declared with the "global" keyword, Python treats that name as a local
    variable everywhere in the function.

    For example, if "RENDER" were not declared with the "global" prefix
    below, reading "RENDER" inside the loop would raise UnboundLocalError,
    because the name is assigned a value later in this function.
    """

    global RENDER
    env = gym.make(ENV_NAME)
    env = env.unwrapped
    env.seed(1)

    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high[0]
    # print(f"s_dim: {s_dim}, a_dim: {a_dim}, a_bound: {a_bound}")
    # s_dim: 3, a_dim: 1, a_bound: 2.0

    ddpg = DDPG(s_dim, a_dim, a_bound)

    # var: standard deviation of the Gaussian exploration noise added to the action
    var = 3
    for i in range(MAX_EPISODES):
        s = env.reset()
        # s: np.ndarray with shape (3,)
        ep_reward = 0
        for j in range(MAX_EP_STEPS):
            if RENDER:
                env.render()

            a = ddpg.choose_action(s)
            a = np.clip(np.random.normal(a, var), -a_bound, a_bound)
            s_, r, done, info = env.step(a)

            # s, s_: np.ndarray observations; a: np.ndarray action; r: float reward
            ddpg.store_transition(s, a, r / 10, s_)  # scale the reward down for more stable learning

            if ddpg.m_pointer > ddpg.capacity:
                # once the replay buffer is full, decay exploration and update the networks
                var *= 0.9995
                ddpg.learn()

            s = s_
            ep_reward += r

            if done or (j+1) == MAX_EP_STEPS:
                print(f"Episode: {i:03d}")
                print(f"\tReward: {ep_reward:.3f}, Explore: {var:.2f}")
                if ep_reward > -150:
                    RENDER = True
                break
    env.close()
Ejemplo n.º 26
0
import gym
from agent import DDPG

env = gym.make('Pendulum-v0')

agent = DDPG(env)
agent.load_model()

state = env.reset()

cumulative_reward = 0
for i in range(200):
    action = agent.get_action(state)
    env.render()
    state, reward, _, _ = env.step(action * 2)  # scale the action to Pendulum's torque range [-2, 2]
    cumulative_reward += reward
print('Cumulative Reward: {}'.format(cumulative_reward))
Ejemplo n.º 27
0
from collections import deque
import gym
import numpy as np
from agent import DDPG
from utils import get_screen

env = gym.make('Pendulum-v0')

agent = DDPG(env, memory=False)
agent.load_model()

env.reset()
pixel = env.render(mode='rgb_array')
state = deque([get_screen(pixel) for _ in range(3)], maxlen=3)  # stack of the 3 most recent frames
cumulative_reward = 0
for timestep in range(200):
    action = agent.get_action(np.array(state)[np.newaxis])
    _, reward, _, _ = env.step(action * 2)  # scale the action to Pendulum's torque range [-2, 2]
    pixel = env.render(mode='rgb_array')
    state_ = state.copy()
    state_.append(get_screen(pixel))  # push the newest frame; the oldest is dropped (maxlen=3)
    state = state_
    cumulative_reward += reward
print('Cumulative Reward: {}'.format(cumulative_reward))
# While developing the agent, you also need to keep an eye on its performance. Using the code below as a reference, set up a mechanism that stores the total reward of each episode; if the episode rewards keep rising, your agent is learning (see the plotting sketch after the training loop).

# In[25]:

## TODO: Train your agent here.
import keras
import sys
import numpy as np
import pandas as pd
from agent import DDPG
from task import Task

num_episodes = 2000
target_pos = np.array([0., 0., 100.])
init_pose = np.array([0., 0., 0., 0., 0., 0.])
task = Task(target_pos=target_pos, init_pose=init_pose)
agent = DDPG(task)

reward_labels = ['episode', 'reward']
reward_results = {x: [] for x in reward_labels}

# In[18]:

best_score = -np.inf
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    score = 0
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        score += reward
        best_score = max(best_score, score)
        if done:
            reward_results['episode'].append(i_episode)
            reward_results['reward'].append(score)
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f})".format(
                i_episode, score, best_score), end="")  # [debug]
            break
from agent import DDPG
from task import Task

num_episodes = 1000
init_pose = np.array([0., 0., 0., 0., 0., 0.])
target_pos = np.array([0., 0., 10.])
init_velocities = np.array([0., 0., 0.])  # initial velocities
init_angle_velocities = np.array([0., 0., 0.])

task = Task(init_pose=init_pose,
            target_pos=target_pos,
            init_angle_velocities=init_angle_velocities,
            init_velocities=init_velocities)
best_score = -np.inf

agent = DDPG(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    score = 0
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        score += reward
        best_score = max(best_score, score)
        if done:
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f})".format(
                i_episode, score, best_score),
                  end="")  # [debug]