def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    actor = ActorModel(act_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(actor,
                                    critic,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim)

    # Restore a previously trained checkpoint if it exists
    if os.path.exists(
            'model_dir/steps_1481164_reward_-1.6494146736737971.ckpt'):
        agent.restore(
            'model_dir/steps_1481164_reward_-1.6494146736737971.ckpt')
        print("restore succeeded")

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    best_reward = -float('inf')
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        # Evaluate roughly every test_every_steps environment steps
        # (see the standalone illustration after this function)
        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
            if evaluate_reward >= best_reward:
                best_reward = evaluate_reward
                # Save the best model so far
                ckpt = 'model_dir_phase2/steps_{}_reward_{}.ckpt'.format(
                    total_steps, best_reward)
                agent.save(ckpt)
            # Also save the latest evaluated model
            ckpt = 'model_dir_phase2/steps_{}_reward_{}.ckpt'.format(
                total_steps, evaluate_reward)
            agent.save(ckpt)
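# Illustrative only (not part of the original script): how the test_flag
# bookkeeping in main() triggers an evaluation roughly once every
# test_every_steps environment steps, even when a single episode crosses
# several boundaries at once. The episode lengths below are made up.
test_every_steps = 1000
test_flag, total_steps = 0, 0
for episode_steps in [400, 700, 2500, 300]:
    total_steps += episode_steps
    if total_steps // test_every_steps >= test_flag:
        while total_steps // test_every_steps >= test_flag:
            test_flag += 1
        print('evaluate at step', total_steps)
# Prints "evaluate at step" for 400, 1100 and 3600; the boundaries crossed
# inside the single 2500-step episode collapse into one evaluation.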
def main():
    # Create the game environment; skill_frame is the number of times each
    # action is repeated, resize_shape is the size of the preprocessed image
    env = retro_util.RetroEnv(game='SuperMarioBros-Nes',
                              skill_frame=SKILL_FRAME,
                              resize_shape=RESIZE_SHAPE)
    env.seed(1)
    # Shape of the game image
    # obs_dim = env.observation_space.shape
    obs_dim = RESIZE_SHAPE
    # Action dimension
    action_dim = env.action_space.n
    # Maximum absolute value of an action
    max_action = 1

    # Build the models
    actor = ActorModel(action_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(actor=actor,
                                    critic=critic,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, action_dim)
    # Load the trained model
    agent.restore(SAVE_MODEL_PATH)

    # Start the game
    obs = env.reset()[None, -1, :, :]
    total_reward = 0
    isOver = False
    # Keep playing until the episode is over
    while not isOver:
        env.render()
        # Get an action from the agent and binarize it for the NES controller
        action = agent.predict(obs)
        action = [0 if a < 0 else 1 for a in action]
        print('Executing action:', action)
        obs, reward, isOver, info = env.step_sac(action)
        total_reward += reward
        if info['lives'] != 2:
            isOver = True
    env.render(close=True)
    env.close()
    print("Final score: {:.2f}".format(total_reward))
def __init__(self):
    self.optimizer = Adam
    self.action_space = np.array([0, 1, 2])
    self.state_size = (
        self.lookback_window_size, 5 + self.depth
    )  # 5 standard OHLC/volume columns + market and indicator features

    # Create shared Actor-Critic network model
    self.Actor = ActorModel(input_shape=self.state_size,
                            action_space=self.action_space.shape[0],
                            lr=self.lr,
                            optimizer=self.optimizer,
                            model=self.model)
    self.Critic = CriticModel(input_shape=self.state_size,
                              action_space=self.action_space.shape[0],
                              lr=self.lr,
                              optimizer=self.optimizer,
                              model=self.model)
class agent:
    def __init__(self):
        self.critic_loss = None
        self.factors_agent = None
        self.factors_critic = None
        self.history_len = 0
        self.is_train = None
        self.loss_agent = None
        self.loss_critic = None
        self.model_agent = None
        self.model_critic = None
        self.optimizer_agent = None
        self.optimizer_critic = None
        self.pred = None
        self.reward = None

    def add_model(self):
        """This function calls the appropriate model builder"""
        self.model_agent = AgentModel(12, 20, 6)
        self.model_critic = CriticModel(11, 21, 10, 0)
        self.set_model_weights(self.model_agent)
        self.set_model_weights(self.model_critic)
        self.optimizer_agent = torch.optim.Adam(self.model_agent.parameters(),
                                                lr=0.001)
        self.optimizer_critic = torch.optim.Adam(self.model_critic.parameters(),
                                                 lr=0.001)
        self.loss_agent = torch.nn.MSELoss()
        self.loss_critic = torch.nn.MSELoss()

    def add_prediction(self, prediction):
        """This function concatenates the prediction with the critic input"""
        i = 0
        j = self.history_len
        self.factors_critic[i, j, 0] = prediction['score']
        self.factors_critic[i, j, 1] = prediction['r0']
        self.factors_critic[i, j, 2] = prediction['r1']
        self.factors_critic[i, j, 3] = prediction['r2']
        self.factors_critic[i, j, 4] = prediction['r3']
        self.factors_critic[i, j, 5] = prediction['r4']
        self.factors_critic[i, j, 6] = prediction['r5']
        self.factors_critic[i, j, 7] = prediction['sd']
        self.factors_critic[i, j, 8] = prediction['avg']
        self.factors_critic[i, j, 9] = prediction['m']
        self.factors_critic[i, j, 10] = prediction['k']

    def custom_loss_critic(self, target, selection, selection_averages,
                           target_averages):
        """This returns the normalized cross correlation between target and
        selection"""
        # These lines compute the cross-correlation between target and
        # selection (a standalone numpy illustration follows after this class)
        top = np.multiply((selection - selection_averages),
                          (target - target_averages))
        top_sum = np.sum(top, axis=0)
        bottom_selection = np.power((selection - selection_averages), 2)
        bottom_targets = np.power((target - target_averages), 2)
        bottom_selection_sum = np.sum(bottom_selection, axis=0)
        bottom_targets_sum = np.sum(bottom_targets, axis=0)
        bottom = np.sqrt(np.multiply(bottom_selection_sum, bottom_targets_sum))
        divided = np.divide(top_sum, bottom)
        divided = divided[~np.isnan(divided)]
        return np.sum(divided)

    def factorize(self, user_history):
        """This function factorizes a given user history, or batch of user
        histories, into factors for an LSTM model"""
        # Reset the holding arrays
        self.factors_agent = np.zeros((1, 20, 12))
        self.factors_critic = np.zeros((1, 21, 11))
        # This i here is to conform with tensorflow input expectations
        i = 0
        j = 0
        for index, row in user_history.iterrows():
            # The last entry in a history is the one we attempt to predict
            if j == (user_history.shape[0]):
                break
            # Truncating maximum history to ~1 day of continuous listening
            if j == 20:
                break
            # In an act of data reduction and factor selection, I drop
            # all Spotify embeddings and deploy my own
            self.factors_agent[i, j, 0] = row['score']
            self.factors_critic[i, j, 0] = row['score']
            self.factors_agent[i, j, 1] = row['r0']
            self.factors_critic[i, j, 1] = row['r0']
            self.factors_agent[i, j, 2] = row['r1']
            self.factors_critic[i, j, 2] = row['r1']
            self.factors_agent[i, j, 3] = row['r2']
            self.factors_critic[i, j, 3] = row['r2']
            self.factors_agent[i, j, 4] = row['r3']
            self.factors_critic[i, j, 4] = row['r3']
            self.factors_agent[i, j, 5] = row['r4']
            self.factors_critic[i, j, 5] = row['r4']
            self.factors_agent[i, j, 6] = row['r5']
            self.factors_critic[i, j, 6] = row['r5']
            self.factors_agent[i, j, 7] = row['m']
            self.factors_critic[i, j, 7] = row['m']
            self.factors_agent[i, j, 8] = row['k']
            self.factors_critic[i, j, 8] = row['k']
            self.factors_agent[i, j, 9] = row['day_w']
            self.factors_critic[i, j, 9] = row['sd']
            self.factors_agent[i, j, 10] = row['day_m']
            self.factors_critic[i, j, 10] = row['avg']
            self.factors_agent[i, j, 11] = row['hour_d']
            j += 1
        i += 1
        self.history_len = j

    def get_agent_reward(self, repeat):
        """This function gets the agent reward"""
        # If the track is something the user has heard before, take the reward
        # to the (1/2) power
        if repeat > 0:
            reward = math.pow(self.reward, 0.5)
        else:
            reward = self.reward
        # Due to the square root above, the reward magnitude is capped at
        # 1 - 1E-7 because of machine precision concerns - verified through
        # testing
        if reward > 0.9999999:
            reward = 0.9999999
        reward = torch.tensor([reward], requires_grad=True)
        self.reward = reward

    def get_critic_loss(self, current_user_history, data):
        """This function gets the critic loss"""
        user = data[data.user_id == current_user_history.user_id.values[0]]
        user = user[['r0', 'r1', 'r2', 'r3', 'r4', 'r5']]
        user_array = user.to_numpy()
        # In order to use handy numpy list comprehensions, we need to make an
        # overly bulky array of the averages, both for target and for
        # selection (as passed to self.custom_loss_critic)
        selection_averages = []
        selection_averages.append(np.average(current_user_history.r0.values))
        selection_averages.append(np.average(current_user_history.r1.values))
        selection_averages.append(np.average(current_user_history.r2.values))
        selection_averages.append(np.average(current_user_history.r3.values))
        selection_averages.append(np.average(current_user_history.r4.values))
        selection_averages.append(np.average(current_user_history.r5.values))
        selection_averages = np.array(selection_averages)
        # This line gives selection_averages a 2nd dimension to match time,
        # while the repeat command copies these average values along the time
        # axis
        selection_averages = np.repeat(selection_averages[None, :],
                                       current_user_history.shape[0], axis=0)
        selection_averages = selection_averages[-10:]
        selection_array = current_user_history[['r0', 'r1', 'r2', 'r3',
                                                'r4', 'r5']]
        selection_array = selection_array[-10:]
        selection_array = selection_array.to_numpy()
        # Here we repeat this process for the whole user history as reflected
        # by user
        target_averages = []
        target_averages.append(np.average(user.r0.values))
        target_averages.append(np.average(user.r1.values))
        target_averages.append(np.average(user.r2.values))
        target_averages.append(np.average(user.r3.values))
        target_averages.append(np.average(user.r4.values))
        target_averages.append(np.average(user.r5.values))
        target_averages = np.array(target_averages)
        target_averages = np.repeat(target_averages[None, :],
                                    selection_array.shape[0], axis=0)
        # Slide the selection window over the full user history and average
        # the normalized cross-correlation scores
        critic_loss = []
        end = selection_array.shape[0]
        start = 0
        while end < user_array.shape[0]:
            critic_loss.append(self.custom_loss_critic(user_array[start:end, :],
                                                       selection_array,
                                                       selection_averages,
                                                       target_averages))
            start += 1
            end += 1
        if len(critic_loss) > 0:
            critic_loss = np.average(critic_loss)
        else:
            critic_loss = 0.0
        critic_loss = torch.tensor([critic_loss], requires_grad=True)
        self.critic_loss = critic_loss

    def predict(self, user_history):
        """This function factorizes the provided user history and runs the
        actor model on it"""
        self.factorize(user_history)
        self.pred = self.model_agent(torch.Tensor(self.factors_agent))

    def propagate(self, current_user_history, data, prediction, repeat):
        """This function propagates the loss through the actor and critic"""
        self.add_prediction(prediction)
        # Clear out the gradients from the last prediction
        self.model_agent.zero_grad()
        self.model_critic.zero_grad()
        # Get the critic reward
        self.reward = self.model_critic(torch.Tensor(self.factors_critic))
        self.get_agent_reward(repeat)
        # Get the agent loss and apply it
        agent_loss = self.loss_agent(self.reward, torch.tensor([1.0]))
        agent_loss.backward()
        self.optimizer_agent.step()
        # Get the critic loss and apply it
        self.get_critic_loss(current_user_history, data)
        evaluated_critic_loss = self.loss_critic(self.critic_loss,
                                                 torch.tensor([6.0]))
        evaluated_critic_loss.backward()
        self.optimizer_critic.step()

    def ready_agent(self, agent_model_path, critic_model_path, train):
        """This function sets up a working agent - one complete with a loss
        function and a model - from saved model files"""
        self.is_train = train
        self.model_agent = torch.load(agent_model_path)
        if self.model_agent is not None:
            print("Actor Model {} successfully loaded.\n".format(
                agent_model_path))
        self.model_critic = torch.load(critic_model_path)
        if self.model_critic is not None:
            print("Critic Model {} successfully loaded.\n".format(
                critic_model_path))

    def set_model_weights(self, model):
        """This function initializes the weights in a pytorch model"""
        classname = model.__class__.__name__
        if classname.find('Linear') != -1:
            n = model.in_features
            y = 1.0 / np.sqrt(n)
            model.weight.data.uniform_(-y, y)
            model.bias.data.fill_(0)

    def wake_agent(self, train):
        """This function sets up a working agent - one complete with a loss
        function and a model - built from scratch"""
        self.is_train = train
        self.add_model()
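# Illustrative only (not part of the original class): a self-contained numpy
# sketch of the quantity custom_loss_critic() computes - the normalized
# cross-correlation between a "target" window and a "selection" window, one
# coefficient per embedding column, summed over columns. With six r0..r5
# columns the ceiling is 6.0 (all columns perfectly correlated), which appears
# to be why propagate() compares the critic loss against torch.tensor([6.0]).
import numpy as np

rng = np.random.default_rng(0)
target = rng.normal(size=(10, 6))        # 10 time steps, 6 embedding columns
selection = rng.normal(size=(10, 6))
target_avg = np.repeat(target.mean(axis=0)[None, :], 10, axis=0)
selection_avg = np.repeat(selection.mean(axis=0)[None, :], 10, axis=0)

top = np.sum((selection - selection_avg) * (target - target_avg), axis=0)
bottom = np.sqrt(np.sum((selection - selection_avg) ** 2, axis=0) *
                 np.sum((target - target_avg) ** 2, axis=0))
print(np.nansum(top / bottom))           # sum of per-column correlations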
def main():
    # Create the game environment; skill_frame is the number of times each
    # action is repeated, resize_shape is the size of the preprocessed image,
    # render_preprocess controls whether the preprocessed frames are shown
    env = retro_util.RetroEnv(game=args.env,
                              resize_shape=RESIZE_SHAPE,
                              skill_frame=SKILL_FRAME,
                              render_preprocess=args.show_play,
                              is_train=True)
    env.seed(1)
    # Shape of the game image
    # obs_dim = env.observation_space.shape
    obs_dim = RESIZE_SHAPE
    # Action dimension
    action_dim = env.action_space.n
    # Maximum absolute value of an action
    max_action = 1

    # Build the models
    actor = ActorModel(action_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(actor=actor,
                                    critic=critic,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, action_dim)

    # Load a pre-trained model if one exists
    if os.path.exists(args.model_path):
        logger.info("Loading pre-trained model...")
        agent.restore(args.model_path)

    # Create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)

    total_steps = 0
    step_train = 0
    print("Start training the model...")
    while total_steps < args.train_total_steps:
        # Train
        train_reward, steps = run_train_episode(env, agent, rpm,
                                                render=args.show_play)
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)
        total_steps += steps

        # Evaluate
        if step_train % 100 == 0:
            evaluate_reward = run_evaluate_episode(env, agent,
                                                   render=args.show_play)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
        step_train += 1

        # Save the model
        if not os.path.exists(args.model_path):
            os.makedirs(args.model_path)
        agent.save(args.model_path)
counter = Counter()
env = gym.make(args.env_name)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n  # for a Discrete action space
# a_dim = env.action_space.shape[0]  # for a Box action space
print(s_dim, a_dim)

# state = env.reset()
# state = Variable(torch.Tensor(state).unsqueeze(0))
# print(state.dim())
# linear = torch.nn.Linear(3, 1)
# out = linear(state)
# print(out)

from model import ActorModel, CriticModel

actor = ActorModel(s_dim, a_dim)
critic = CriticModel(s_dim)
actor.share_memory()
critic.share_memory()
# print(actor, critic)

import torch.multiprocessing as mp

update_event, rolling_event = mp.Event(), mp.Event()
update_event.clear()  # do not update yet
rolling_event.set()   # start rolling out
queue = mp.Queue()    # workers put data in this queue
counter = Counter()
queue_size = Counter()
worker(args, actor, critic, update_event, rolling_event, queue, counter,
       queue_size)

# t = Variable(torch.randn(5))
# print(t)
CHECKPOINT_GOALIE_ACTOR = './checkpoint_goalie_actor.pth'
CHECKPOINT_GOALIE_CRITIC = './checkpoint_goalie_critic.pth'
CHECKPOINT_STRIKER_ACTOR = './checkpoint_striker_actor.pth'
CHECKPOINT_STRIKER_CRITIC = './checkpoint_striker_critic.pth'

# Actors and Critics
GOALIE_0_KEY = 0
STRIKER_0_KEY = 0
GOALIE_1_KEY = 1
STRIKER_1_KEY = 1

# NEURAL MODEL
goalie_actor_model = ActorModel(goalie_state_size,
                                goalie_action_size).to(DEVICE)
goalie_critic_model = CriticModel(goalie_state_size + striker_state_size +
                                  goalie_state_size +
                                  striker_state_size).to(DEVICE)
goalie_optim = optim.Adam(list(goalie_actor_model.parameters()) +
                          list(goalie_critic_model.parameters()),
                          lr=GOALIE_LR)
# self.optim = optim.RMSprop( list( self.actor_model.parameters() ) + list( self.critic_model.parameters() ), lr=lr, alpha=0.99, eps=1e-5 )

striker_actor_model = ActorModel(striker_state_size,
                                 striker_action_size).to(DEVICE)
striker_critic_model = CriticModel(striker_state_size + goalie_state_size +
                                   striker_state_size +
                                   goalie_state_size).to(DEVICE)
striker_optim = optim.Adam(list(striker_actor_model.parameters()) +
                           list(striker_critic_model.parameters()),
                           lr=STRIKER_LR)
# self.optim = optim.RMSprop( list( self.actor_model.parameters() ) + list( self.critic_model.parameters() ), lr=lr, alpha=0.99, eps=1e-5 )