def train():
    evaluate_env_list_path = 'env_list_set1'
    print(evaluate_reject_when_full(evaluate_env_list_path))
    print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 24
    rpm = ReplayMemory(MEMORY_SIZE)

    actor = PDActor(obs_dim=obs_dim, action_dim=action_dim)
    critic = PDCritirc(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(actor=actor,
                  critic=critic,
                  obs_dim=obs_dim,
                  action_dim=action_dim)

    # pre-fill the replay memory before training starts
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        # render=True shows the animation
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{}  Test reward:{}'.format(episode, eval_reward))
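
The warm-up loop above relies on a ReplayMemory class plus the module-level MEMORY_SIZE and MEMORY_WARMUP_SIZE constants, none of which are shown. A minimal FIFO buffer sketch (the append/sample method names are assumptions; only len(rpm) is used directly above):

import random
from collections import deque

class ReplayMemory:
    """Minimal FIFO experience replay buffer (illustrative sketch)."""
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def append(self, exp):
        # exp: one transition tuple, e.g. (obs, action, reward, next_obs, done)
        self.buffer.append(exp)

    def sample(self, batch_size):
        # uniform random mini-batch of stored transitions
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)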
Example #2
def train(lr=0.001, num_iter=1000, num_episode=10, num_epoch=10, batch_size=32,\
    evaluate_env_list_path='env_list_set1', \
    train_total_time=600, show_baseline=False, \
    continue_train=False, model_path = 'best_actor'):
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=train_total_time)
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2)
    actor = Actor(encoder, obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=actor, obs_dim=obs_dim, action_dim=action_dim)
    if continue_train:
        agent.load(model_path)
    for iter in range(num_iter):
        #2.1  Using theta k to interact with the env
        # to collect {s_t, a_t} and compute advantage
        # advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})

        all_obs = []
        all_action = []
        for episode in range(num_episode):
            obs_list, action_list, _ = run_episode_baseline(env)
            all_obs.extend(obs_list)
            all_action.extend(action_list)

        # optimize theta
        for epoch in range(num_epoch):
            # for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
            # agent.learn(batch_obs, batch_action, batch_adv)
            num_examples = len(all_obs)
            indices = list(range(num_examples))
            random.shuffle(indices)

            for i in range(0, num_examples, batch_size):

                if i + batch_size < len(all_obs):
                    # print(indice[i:i+batch_size])
                    batch_obs = [all_obs[x] for x in indices[i:i + batch_size]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:i + batch_size]])
                else:
                    batch_obs = [all_obs[x] for x in indices[i:num_examples]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:num_examples]])

                agent.learn(batch_obs, batch_action)
        if iter % 10 == 0:
            eval_reward = evaluate(evaluate_env_list_path, agent,
                                   render=False)  # render=True shows the animation
            print('iter:{}  Test reward:{}'.format(iter, eval_reward))
        agent.save(model_path)  # checkpoint after every iteration
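
Since Python slices clamp at the end of a list, the if/else split above is not strictly needed; a single slice handles the final short batch. A compact, equivalent helper (hypothetical, for illustration):

import random
import torch

def iterate_minibatches(all_obs, all_action, batch_size):
    # yield shuffled (obs, action) mini-batches; the last batch may be smaller
    indices = list(range(len(all_obs)))
    random.shuffle(indices)
    for i in range(0, len(indices), batch_size):
        batch_idx = indices[i:i + batch_size]  # slicing clamps at the list end
        yield ([all_obs[x] for x in batch_idx],
               torch.tensor([all_action[x] for x in batch_idx]))

With this helper, the inner epoch loop reduces to: for batch_obs, batch_action in iterate_minibatches(all_obs, all_action, batch_size): agent.learn(batch_obs, batch_action).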
Example #3
def train(gamma = 0.9, base_line=0.5, lr=0.0001, total_time=20, \
    num_iter = 1000, num_episode=10, num_epoch=10, \
    evaluate_env_list_path = 'env_list_set1', show_base_line=False):
    # num_iter: total number of experiments (training iterations)
    # num_episode: number of trajectories simulated per iteration
    # num_epoch: number of tuning epochs per iteration
    if show_base_line:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=total_time)
    action_dim = 4
    obs_dim = 45
    PPOactor = Actor(obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=PPOactor,
                  obs_dim=obs_dim,
                  action_dim=action_dim,
                  lr=lr)

    for iter in range(num_iter):
        #2.1  Using theta k to interact with the env
        # to collect {s_t, a_t} and compute advantage
        # advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})

        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(num_episode):
            obs_list, action_list, reward_list = run_episode(env, agent)
            advantage_list = calc_advantage(reward_list,
                                            gamma=gamma,
                                            base_line=base_line)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        dataset = PPODataset(obs_list=all_obs,
                             action_list=all_action,
                             advantage_list=all_advantage)
        dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

        # optimize theta

        for epoch in range(num_epoch):
            for i, (batch_obs, batch_action,
                    batch_adv) in enumerate(dataloader):
                agent.learn(batch_obs, batch_action, batch_adv)

        if iter % 10 == 0:
            eval_reward = evaluate(evaluate_env_list_path, agent,
                                   render=False)  # render=True shows the animation
            print('iter:{}  Test reward:{}'.format(iter, eval_reward))
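
calc_advantage is defined elsewhere; given the comment above (the advantage of (s_t, a_t) is the sum of rewards from t to the end of the trajectory) and the gamma / base_line arguments, a plausible sketch is the discounted reward-to-go minus a constant baseline (an assumption, not the project's verified implementation):

def calc_advantage(reward_list, gamma=0.9, base_line=0.5):
    # discounted reward-to-go computed backwards: G_t = r_t + gamma * G_{t+1}
    advantage_list = []
    running_return = 0.0
    for r in reversed(reward_list):
        running_return = r + gamma * running_return
        advantage_list.append(running_return - base_line)  # subtract constant baseline
    advantage_list.reverse()
    return advantage_list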
Example #4
def train(show_baseline=False, continue_train=False, \
    model_save_path='best_model', learn_freq= 5, memory_size = 20000, \
    memory_warmup_size = 2000, batch_size = 32, learning_rate = 0.001, \
    gamma = 0.9, alpha = 0.9, max_episode=1000, ):

    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2, \
        use_rnn=False, use_gru=True, use_lstm=False)
    rpm = ReplayMemory(memory_size)  # DQN experience replay buffer
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim, encoder=encoder)
    agent = Agent(critic=critic,
                  obs_dim=obs_dim,
                  action_dim=action_dim,
                  lr=learning_rate,
                  gamma=gamma,
                  alpha=alpha)

    if continue_train:
        agent.load(model_save_path)

    # pre-fill the replay buffer so the earliest updates see enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq,
                    batch_size)

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test runs are not counted
        # train part
        for i in range(0, 100):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # for parameter in critic.parameters():
        #     print(parameter)
        #     break
        # test part
        # print(critic.parameters())
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{}  Test reward:{}'.format(episode, eval_reward))
    agent.save(model_save_path)
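
This variant widens the observation with an encoder: a 45-dim base state plus 7 request slots of 17 raw features each, every slot embedded into obs_dim_2 = 10 values (obs_dim = 45 + 10 * 7). The Encoder itself is not shown; a minimal GRU-based sketch consistent with the constructor call above (an assumption about the project's module, with only the use_gru path implemented):

import torch.nn as nn

class Encoder(nn.Module):
    """Embeds the pending-request features into per-request vectors (sketch)."""
    def __init__(self, input_size, output_size,
                 use_rnn=False, use_gru=True, use_lstm=False):
        super().__init__()
        # only the GRU branch is sketched here, matching use_gru=True above
        self.gru = nn.GRU(input_size=input_size, hidden_size=output_size,
                          batch_first=True)

    def forward(self, requests):
        # requests: (batch, num_requests=7, input_size=17)
        out, _ = self.gru(requests)
        # per-request outputs flattened to obs_dim_2 * 7 = 70 features,
        # to be concatenated with the 45-dim base observation by the critic
        return out.reshape(out.size(0), -1)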
Example #5
def train(show_baseline=False, continue_train=False, \
    model_save_path='best_model', learn_freq= 5, memory_size = 20000, \
    memory_warmup_size = 2000, batch_size = 32, learning_rate = 0.001, \
    gamma = 0.9, alpha = 0.9, max_episode=1000, ):

    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 45
    rpm = ReplayMemory(memory_size)  # DQN experience replay buffer

    critic = Critic(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(critic=critic,
                  obs_dim=obs_dim,
                  action_dim=action_dim,
                  lr=learning_rate,
                  gamma=gamma,
                  alpha=alpha)

    if continue_train:
        agent.load(model_save_path)

    # pre-fill the replay buffer so the earliest updates see enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq,
                    batch_size)

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test runs are not counted
        # train part
        for i in range(0, 10):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # for name, param in critic.state_dict().items():
        #     # name: str
        #     # param: Tensor
        #     print(param)
        # test part
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{}  Test reward:{}'.format(episode, eval_reward))
    agent.save(model_save_path)
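
run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size) is defined elsewhere in the project. A plausible sketch of the standard DQN loop its signature implies (agent.sample and the agent.learn argument order are assumptions):

def run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size):
    # roll out one episode, storing transitions and learning every learn_freq steps
    total_reward = 0
    obs = env.reset()
    step = 0
    done = False
    while not done:
        step += 1
        action = agent.sample(obs)  # epsilon-greedy action (assumed API)
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))

        # learn only after the warm-up phase, then once every learn_freq steps
        if len(rpm) > memory_warmup_size and step % learn_freq == 0:
            (batch_obs, batch_action, batch_reward,
             batch_next_obs, batch_done) = zip(*rpm.sample(batch_size))
            agent.learn(batch_obs, batch_action, batch_reward,
                        batch_next_obs, batch_done)

        total_reward += reward
        obs = next_obs
    return total_reward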
Example #6
def train(gamma = 0.9, base_line=0.5, lr=0.001, epsilon=0.1, \
    num_iter=1000, num_episode=10, num_epoch=10, batch_size=32,\
    evaluate_env_list_path='env_list_set1',\
    train_total_time=600, show_baseline=False):
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=train_total_time)
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2)
    actor = Actor(encoder, obs_size=obs_dim, action_size=action_dim)
    agent = Agent(
        actor=actor,
        obs_dim=obs_dim,
        action_dim=action_dim,
        lr=lr,
        epsilon=epsilon,
        update_target_steps=200)

    for iter in range(num_iter):
        #2.1  Using theta k to interact with the env
        # to collect {s_t, a_t} and compute advantage
        # advantage(s_t, a_t) = sum_{t^prime=t}^{T_n}(r_{t^prime}^{n})
        
        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(num_episode):
            obs_list, action_list, reward_list = run_episode(env, agent)
            advantage_list = calc_advantage(reward_list, gamma, base_line)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        # dataset = PPODataset(obs_list=all_obs, action_list=all_action, advantage_list=all_advantage)
        # dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

        # optimize theta
        for epoch in range(num_epoch):
            # for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
                # agent.learn(batch_obs, batch_action, batch_adv)
            num_examples = len(all_obs)
            indices = list(range(num_examples))
            random.shuffle(indices)

            for i in range(0, num_examples, batch_size):

                if i + batch_size < len(all_obs):
                    # print(indices[i:i+batch_size])
                    batch_obs = [all_obs[x] for x in indices[i:i + batch_size]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:i + batch_size]])
                    batch_adv = torch.tensor(
                        [all_advantage[x] for x in indices[i:i + batch_size]])
                else:
                    batch_obs = [all_obs[x] for x in indices[i:num_examples]]
                    batch_action = torch.tensor(
                        [all_action[x] for x in indices[i:num_examples]])
                    batch_adv = torch.tensor(
                        [all_advantage[x] for x in indices[i:num_examples]])

                agent.learn(batch_obs, batch_action, batch_adv)
        if iter % 10 == 0:
            eval_reward = evaluate(evaluate_env_list_path, agent,
                                   render=False)  # render=True shows the animation
            print('iter:{}  Test reward:{}'.format(iter, eval_reward))
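
This variant passes epsilon and update_target_steps to the Agent, which suggests a PPO-style clipped surrogate objective inside agent.learn. A minimal sketch of that loss in PyTorch (a textbook formulation, not the project's verified code; log_prob_old would come from the policy that collected the trajectories):

import torch

def ppo_clip_loss(log_prob_new, log_prob_old, advantage, epsilon=0.1):
    # probability ratio r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t)
    ratio = torch.exp(log_prob_new - log_prob_old)
    # clipped surrogate: take the pessimistic minimum of the two terms
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
    return -torch.min(unclipped, clipped).mean()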
Example #7
from Env_generator import produce_env
from Util import evaluate_stable
from stable_baselines3.common.env_checker import check_env
env = produce_env()
check_env(env)

from stable_baselines3 import A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
# env = make_vec_env("CartPole-v1", n_envs=4)

# model = A2C("MlpPolicy", env, verbose=1)
# model.learn(total_timesteps=500)
# print(evaluate_stable('env_list_set1', model))

model3 = DQN("MlpPolicy", env, verbose=1)
for _ in range(30):
    model3.learn(total_timesteps=250)
    print(evaluate_stable('env_list_set1', model3))
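
evaluate_stable comes from the project's Util module and is not shown here. A rough sketch of what such a helper could look like with the stable_baselines3 predict API, assuming the saved environment list has already been loaded into gym-style env objects (the loading step and the exact aggregation are assumptions):

def evaluate_stable_sketch(env_list, model):
    # average episode reward of a trained SB3 model over a list of evaluation envs
    rewards = []
    for env in env_list:
        obs = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        rewards.append(episode_reward)
    return sum(rewards) / len(rewards)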