def train():
    evaluate_env_list_path = 'env_list_set1'
    # baseline scores for comparison
    print(evaluate_reject_when_full(evaluate_env_list_path))
    print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 24
    rpm = ReplayMemory(MEMORY_SIZE)
    actor = PDActor(obs_dim=obs_dim, action_dim=action_dim)
    critic = PDCritirc(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(actor=actor, critic=critic, obs_dim=obs_dim, action_dim=action_dim)
    # warm up the replay memory with some data before training starts
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)
    max_episode = 2000
    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1
        # test part (render=True shows the animation)
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{} Test reward:{}'.format(episode, eval_reward))
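# The trainers in this file only rely on ReplayMemory supporting len() and being
# filled by run_episode. Below is a minimal sketch of such a buffer, assuming
# (hypothetically) that transitions are stored via append() and sampled via
# sample(); those method names do not appear in this file and are illustrative only.
import collections
import random

import numpy as np


class SketchReplayMemory:
    def __init__(self, max_size):
        # drop the oldest transitions once the buffer is full
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        # exp is one transition, e.g. (obs, action, reward, next_obs, done)
        self.buffer.append(exp)

    def sample(self, batch_size):
        # uniform random minibatch, returned as stacked arrays
        batch = random.sample(self.buffer, batch_size)
        obs, action, reward, next_obs, done = zip(*batch)
        return (np.array(obs), np.array(action), np.array(reward),
                np.array(next_obs), np.array(done))

    def __len__(self):
        return len(self.buffer)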
def train(lr=0.001, num_iter=1000, num_episode=10, num_epoch=10, batch_size=32,
          evaluate_env_list_path='env_list_set1',
          train_total_time=600, show_baseline=False,
          continue_train=False, model_path='best_actor'):
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=train_total_time)
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2)
    actor = Actor(encoder, obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=actor, obs_dim=obs_dim, action_dim=action_dim)
    if continue_train:
        agent.load(model_path)
    for iter in range(num_iter):
        # 2.1 Use theta_k to interact with the env to collect {s_t, a_t};
        #     advantage(s_t, a_t) = sum_{t'=t}^{T_n} r_{t'}^{n}
        all_obs = []
        all_action = []
        for episode in range(num_episode):
            obs_list, action_list, _ = run_episode_baseline(env)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
        # optimize theta on shuffled minibatches
        for epoch in range(num_epoch):
            num_examples = len(all_obs)
            indices = list(range(num_examples))
            random.shuffle(indices)
            for i in range(0, num_examples, batch_size):
                batch_indices = indices[i:i + batch_size]
                batch_obs = [all_obs[x] for x in batch_indices]
                batch_action = torch.tensor([all_action[x] for x in batch_indices])
                agent.learn(batch_obs, batch_action)
        if iter % 10 == 0:
            # render=True shows the animation
            eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
            print('iter:{} Test reward:{}'.format(iter, eval_reward))
            agent.save(model_path)
    agent.save(model_path)
def train(gamma=0.9, base_line=0.5, lr=0.0001, total_time=20,
          num_iter=1000, num_episode=10, num_epoch=10,
          evaluate_env_list_path='env_list_set1', show_base_line=False):
    # num_iter: total number of training iterations
    # num_episode: trajectories simulated per iteration
    # num_epoch: optimization epochs per iteration
    if show_base_line:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=total_time)
    action_dim = 4
    obs_dim = 45
    PPOactor = Actor(obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=PPOactor, obs_dim=obs_dim, action_dim=action_dim, lr=lr)
    for iter in range(num_iter):
        # 2.1 Use theta_k to interact with the env to collect {s_t, a_t};
        #     advantage(s_t, a_t) = sum_{t'=t}^{T_n} r_{t'}^{n}
        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(num_episode):
            obs_list, action_list, reward_list = run_episode(env, agent)
            advantage_list = calc_advantage(reward_list, gamma=gamma, base_line=base_line)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        dataset = PPODataset(obs_list=all_obs, action_list=all_action,
                             advantage_list=all_advantage)
        dataloader = DataLoader(dataset, batch_size=128, shuffle=True)
        # optimize theta
        for epoch in range(num_epoch):
            for i, (batch_obs, batch_action, batch_adv) in enumerate(dataloader):
                agent.learn(batch_obs, batch_action, batch_adv)
        if iter % 10 == 0:
            # render=True shows the animation
            eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
            print('iter:{} Test reward:{}'.format(iter, eval_reward))
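# calc_advantage is not defined in this file. Based on the comment above
# (advantage(s_t, a_t) = sum over t' >= t of r_{t'}) and the gamma/base_line
# arguments, a plausible minimal sketch is a discounted reward-to-go with a
# constant baseline subtracted; the real implementation may differ.
def sketch_calc_advantage(reward_list, gamma=0.9, base_line=0.5):
    advantage_list = []
    running_return = 0.0
    # walk the trajectory backwards, accumulating the discounted return-to-go
    for reward in reversed(reward_list):
        running_return = reward + gamma * running_return
        advantage_list.append(running_return - base_line)
    advantage_list.reverse()
    return advantage_list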
def train(show_baseline=False, continue_train=False,
          model_save_path='best_model', learn_freq=5, memory_size=20000,
          memory_warmup_size=2000, batch_size=32, learning_rate=0.001,
          gamma=0.9, alpha=0.9, max_episode=1000):
    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2,
                      use_rnn=False, use_gru=True, use_lstm=False)
    rpm = ReplayMemory(memory_size)  # experience replay buffer for DQN
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim, encoder=encoder)
    agent = Agent(critic=critic, obs_dim=obs_dim, action_dim=action_dim,
                  lr=learning_rate, gamma=gamma, alpha=alpha)
    if continue_train:
        agent.load(model_save_path)
    # pre-fill the replay buffer so early training has enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size)
    # start training: run max_episode training episodes; the test part
    # is not counted toward the episode total
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 100):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # test part
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{} Test reward:{}'.format(episode, eval_reward))
        agent.save(model_save_path)
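# run_episode is called with (env, agent, rpm, memory_warmup_size, learn_freq,
# batch_size) but is not defined in this file. Below is a minimal sketch of a
# DQN-style episode loop under gym-like env.reset()/env.step() conventions;
# agent.sample(), agent.learn() and rpm.append()/rpm.sample() are assumed
# interfaces, not confirmed by this file.
def sketch_run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size):
    total_reward = 0
    obs = env.reset()
    step = 0
    done = False
    while not done:
        step += 1
        action = agent.sample(obs)  # e.g. epsilon-greedy action selection
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))
        # learn every learn_freq steps once the buffer is warmed up
        if len(rpm) > memory_warmup_size and step % learn_freq == 0:
            batch = rpm.sample(batch_size)
            agent.learn(*batch)
        total_reward += reward
        obs = next_obs
    return total_reward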
def train(show_baseline=False, continue_train=False,
          model_save_path='best_model', learn_freq=5, memory_size=20000,
          memory_warmup_size=2000, batch_size=32, learning_rate=0.001,
          gamma=0.9, alpha=0.9, max_episode=1000):
    evaluate_env_list_path = 'env_list_set1'
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env()
    action_dim = 4
    obs_dim = 45
    rpm = ReplayMemory(memory_size)  # experience replay buffer for DQN
    critic = Critic(obs_dim=obs_dim, action_dim=action_dim)
    agent = Agent(critic=critic, obs_dim=obs_dim, action_dim=action_dim,
                  lr=learning_rate, gamma=gamma, alpha=alpha)
    if continue_train:
        agent.load(model_save_path)
    # pre-fill the replay buffer so early training has enough sample diversity
    while len(rpm) < memory_warmup_size:
        run_episode(env, agent, rpm, memory_warmup_size, learn_freq, batch_size)
    # start training: run max_episode training episodes; the test part
    # is not counted toward the episode total
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 10):
            total_reward = run_episode(env, agent, rpm, memory_warmup_size,
                                       learn_freq, batch_size)
            episode += 1
        # test part
        eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
        print('episode:{} Test reward:{}'.format(episode, eval_reward))
        agent.save(model_save_path)
def train(gamma=0.9, base_line=0.5, lr=0.001, epsilon=0.1,
          num_iter=1000, num_episode=10, num_epoch=10, batch_size=32,
          evaluate_env_list_path='env_list_set1',
          train_total_time=600, show_baseline=False):
    if show_baseline:
        print(evaluate_reject_when_full(evaluate_env_list_path))
        print(evaluate_totally_random(evaluate_env_list_path))
    env = produce_env(total_time=train_total_time)
    action_dim = 4
    obs_dim_1 = 45
    request_dim = 17
    obs_dim_2 = 10
    obs_dim = obs_dim_1 + obs_dim_2 * 7
    encoder = Encoder(input_size=request_dim, output_size=obs_dim_2)
    actor = Actor(encoder, obs_size=obs_dim, action_size=action_dim)
    agent = Agent(actor=actor, obs_dim=obs_dim, action_dim=action_dim,
                  lr=lr, epsilon=epsilon, update_target_steps=200)
    for iter in range(num_iter):
        # 2.1 Use theta_k to interact with the env to collect {s_t, a_t};
        #     advantage(s_t, a_t) = sum_{t'=t}^{T_n} r_{t'}^{n}
        all_obs = []
        all_action = []
        all_advantage = []
        for episode in range(num_episode):
            obs_list, action_list, reward_list = run_episode(env, agent)
            advantage_list = calc_advantage(reward_list, gamma, base_line)
            all_obs.extend(obs_list)
            all_action.extend(action_list)
            all_advantage.extend(advantage_list)
        # optimize theta on shuffled minibatches
        for epoch in range(num_epoch):
            num_examples = len(all_obs)
            indices = list(range(num_examples))
            random.shuffle(indices)
            for i in range(0, num_examples, batch_size):
                batch_indices = indices[i:i + batch_size]
                batch_obs = [all_obs[x] for x in batch_indices]
                batch_action = torch.tensor([all_action[x] for x in batch_indices])
                batch_adv = torch.tensor([all_advantage[x] for x in batch_indices])
                agent.learn(batch_obs, batch_action, batch_adv)
        if iter % 10 == 0:
            # render=True shows the animation
            eval_reward = evaluate(evaluate_env_list_path, agent, render=False)
            print('iter:{} Test reward:{}'.format(iter, eval_reward))
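# Agent.learn is not shown in this file. Given the clip parameter `epsilon` and
# `update_target_steps`, the update is presumably PPO's clipped surrogate
# objective, with an older frozen copy of the actor providing pi_old. The
# sketch below illustrates only that objective; `actor` and `old_actor` are
# assumed to map a batch of observations to action probabilities, which this
# file does not confirm.
import torch


def sketch_ppo_clip_loss(actor, old_actor, batch_obs, batch_action, batch_adv, epsilon=0.1):
    # log pi(a|s) under the current and the old (frozen) policy
    log_prob = torch.log(actor(batch_obs).gather(1, batch_action.view(-1, 1)).squeeze(1))
    with torch.no_grad():
        old_log_prob = torch.log(old_actor(batch_obs).gather(1, batch_action.view(-1, 1)).squeeze(1))
    ratio = torch.exp(log_prob - old_log_prob)
    # clipped surrogate: maximize min(r * A, clip(r, 1 - eps, 1 + eps) * A)
    surrogate = torch.min(ratio * batch_adv,
                          torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * batch_adv)
    return -surrogate.mean()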
from Env_generator import produce_env
from Util import evaluate_stable
from stable_baselines3 import A2C, DQN
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env

env = produce_env()
check_env(env)

# Parallel environments (A2C baseline, kept for reference)
# env = make_vec_env("CartPole-v1", n_envs=4)
# model = A2C("MlpPolicy", env, verbose=1)
# model.learn(total_timesteps=500)
# print(evaluate_stable('env_list_set1', model))

model3 = DQN("MlpPolicy", env, verbose=1)
for _ in range(30):
    model3.learn(total_timesteps=250)
    print(evaluate_stable('env_list_set1', model3))
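# The project-specific trainers above persist their agents explicitly; the
# stable-baselines3 model can be saved and reloaded the same way via its
# save()/load() API. The file name used here is illustrative only.
model3.save("dqn_env_model")
loaded_model = DQN.load("dqn_env_model", env=env)
print(evaluate_stable('env_list_set1', loaded_model))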