Code example #1
File: agent.py Project: ddwooten/insight_2020_ai
    def add_model(self):
        """This function calls the appropriate model builder"""
        
        self.model_agent = AgentModel(12, 20, 6)

        self.model_critic = CriticModel(11, 21, 10, 0)

        self.set_model_weights(self.model_agent)

        self.set_model_weights(self.model_critic)

        self.optimizer_agent = torch.optim.Adam(self.model_agent.parameters(),
                                               lr = 0.001)

        self.optimizer_critic = torch.optim.Adam(self.model_critic.parameters(),
                                               lr = 0.001)

        self.loss_agent = torch.nn.MSELoss()

        self.loss_critic = torch.nn.MSELoss()
Code example #2
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    actor = ActorModel(act_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(actor,
                                    critic,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim)
    if os.path.exists(
            'model_dir/steps_1481164_reward_-1.6494146736737971.ckpt'):
        agent.restore(
            'model_dir/steps_1481164_reward_-1.6494146736737971.ckpt')
        print("restore succeed")

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    best_reward = -float('inf')
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
            if evaluate_reward >= best_reward:
                best_reward = evaluate_reward
                # Save the model
                ckpt = 'model_dir_phase2/steps_{}_reward_{}.ckpt'.format(
                    total_steps, best_reward)
                agent.save(ckpt)
    ckpt = 'model_dir_phase2/steps_{}_reward_{}.ckpt'.format(
        total_steps, evaluate_reward)
    agent.save(ckpt)
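
The helper functions run_train_episode and run_evaluate_episode are not shown in this snippet. Below is a minimal sketch of what run_train_episode could look like, assuming the agent exposes sample/learn methods and the replay memory exposes append/sample_batch/size as in PARL's SAC examples; WARMUP_STEPS and BATCH_SIZE are assumed constants, not part of the snippet above:

def run_train_episode(env, agent, rpm):
    obs = env.reset()
    done = False
    episode_reward, episode_steps = 0, 0
    while not done:
        episode_steps += 1
        # Sample an exploratory action from the current policy
        action = agent.sample(obs)
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)
        obs = next_obs
        episode_reward += reward
        # Start learning once enough transitions have been collected
        if rpm.size() >= WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_done = \
                rpm.sample_batch(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)
    return episode_reward, episode_steps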
Code example #3
def main():
    # Create the game environment; skill_frame is the number of times each action is repeated, resize_shape is the image-preprocessing size
    env = retro_util.RetroEnv(game='SuperMarioBros-Nes',
                              skill_frame=SKILL_FRAME,
                              resize_shape=RESIZE_SHAPE)
    env.seed(1)

    # Shape of the game image
    # obs_dim = env.observation_space.shape
    obs_dim = RESIZE_SHAPE
    # Action dimension
    action_dim = env.action_space.n
    # Maximum absolute value of an action (positive or negative)
    max_action = 1

    # Create the models
    actor = ActorModel(action_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(actor=actor,
                                    critic=critic,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, action_dim)

    # Load the model
    agent.restore(SAVE_MODEL_PATH)

    # Start the game
    obs = env.reset()[None, -1, :, :]
    total_reward = 0
    isOver = False
    # Keep playing until the game is over
    while not isOver:
        env.render()
        # Get an action
        action = agent.predict(obs)
        action = [0 if a < 0 else 1 for a in action]
        print('Executing action:', action)
        obs, reward, isOver, info = env.step_sac(action)
        total_reward += reward
        if info['lives'] != 2:
            isOver = True
    env.render(close=True)
    env.close()
    print("最终得分为:{:.2f}".format(total_reward))
Code example #4
File: agent.py Project: litethink/coinrot-rl
 def __init__(self):
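     # self.lookback_window_size, self.depth, self.lr and self.model are
     # assumed to be set elsewhere in the full constructor; only the
     # network-construction portion of __init__ is shown here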
     self.optimizer = Adam
     self.action_space = np.array([0, 1, 2])
     self.state_size = (
         self.lookback_window_size, 5 + self.depth
     )  # 5 standard OHLC information + market and indicators
     # Create shared Actor-Critic network model
     self.Actor = ActorModel(input_shape=self.state_size,
                             action_space=self.action_space.shape[0],
                             lr=self.lr,
                             optimizer=self.optimizer,
                             model=self.model)
     self.Critic = CriticModel(input_shape=self.state_size,
                               action_space=self.action_space.shape[0],
                               lr=self.lr,
                               optimizer=self.optimizer,
                               model=self.model)
Code example #5
File: agent.py Project: ddwooten/insight_2020_ai
class agent:

    def __init__(self):

        self.critic_loss = None
        
        self.factors_agent = None

        self.factors_critic = None

        self.history_len = 0

        self.is_train = None

        self.loss_agent = None

        self.loss_critic = None

        self.model_agent = None

        self.model_critic = None

        self.optimizer_agent = None

        self.optimizer_critic = None

        self.pred = None

        self.reward = None

    def add_model(self):
        """This function calls the appropriate model builder"""
        
        self.model_agent = AgentModel(12, 20, 6)

        self.model_critic = CriticModel(11, 21, 10, 0)

        self.set_model_weights(self.model_agent)

        self.set_model_weights(self.model_critic)

        self.optimizer_agent = torch.optim.Adam(self.model_agent.parameters(),
                                               lr = 0.001)

        self.optimizer_critic = torch.optim.Adam(self.model_critic.parameters(),
                                               lr = 0.001)

        self.loss_agent = torch.nn.MSELoss()

        self.loss_critic = torch.nn.MSELoss()

    def add_prediction(self, prediction):
        """This function concatenates the prediciton with the critic input"""
        
        i = 0

        j = self.history_len 

        self.factors_critic[i, j, 0] = prediction['score']

        self.factors_critic[i, j, 1] = prediction['r0']

        self.factors_critic[i, j, 2] = prediction['r1']

        self.factors_critic[i, j, 3] = prediction['r2']

        self.factors_critic[i, j, 4] = prediction['r3']

        self.factors_critic[i, j, 5] = prediction['r4']

        self.factors_critic[i, j, 6] = prediction['r5']

        self.factors_critic[i, j, 7] = prediction['sd']

        self.factors_critic[i, j, 8] = prediction['avg']

        self.factors_critic[i, j, 9] = prediction['m']

        self.factors_critic[i, j, 10] = prediction['k']

    def custom_loss_critic(self, target, selection, selection_averages,
                           target_averages):
        """This returns the normalized cross correlation between target and
        selection"""

        # These lines here compute the cross-correlation between target and
        # selection
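
        # Concretely, for each factor column f this is the sample normalized
        # cross-correlation
        #
        #     NCC_f = sum_t (s_tf - mean(s_f)) * (x_tf - mean(x_f))
        #             / sqrt( sum_t (s_tf - mean(s_f))^2 * sum_t (x_tf - mean(x_f))^2 )
        #
        # where s is the selection window and x the target window; NaN columns
        # are dropped and the surviving NCC_f values are summed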

        top = np.multiply((selection - selection_averages), 
                          (target - target_averages))

        top_sum = np.sum(top, axis = 0)

        bottom_selection = np.power((selection - selection_averages),2)

        bottom_targets = np.power((target - target_averages), 2)

        bottom_selection_sum = np.sum(bottom_selection, axis = 0)

        bottom_targets_sum = np.sum(bottom_targets, axis = 0)

        bottom = np.sqrt(np.multiply(bottom_selection_sum,
                                     bottom_targets_sum))

        divided = np.divide(top_sum, bottom)

        divided = divided[~np.isnan(divided)]

        return(np.sum(divided))
            
    def factorize(self, user_history):
        """This function factorizes a given user history, or batch of user
        histories, into factors for an LSTM model"""

        # Reset the holding arrays

        self.factors_agent = np.zeros((1, 20, 12))

        self.factors_critic = np.zeros((1, 21, 11))

        # This i here is to conform with tensorflow input expectations

        i = 0

        j = 0

        for index, row in user_history.iterrows():

            # The last entry in a history is the one we attempt to predict,
            # so it is excluded from the factors

            if j == (user_history.shape[0] - 1):

                break
            
            # Truncating maximum history to ~1 day of continuous listening

            if j == 20:

                break
            # In an act of data reduction and factor selection, I drop
            # all spotify embeddings and deploy my own
            
            self.factors_agent[i, j, 0] = row['score']

            self.factors_critic[i, j, 0] = row['score']

            self.factors_agent[i, j, 1] = row['r0']

            self.factors_critic[i, j, 1] = row['r0']

            self.factors_agent[i, j, 2] = row['r1']

            self.factors_critic[i, j, 2] = row['r1']

            self.factors_agent[i, j, 3] = row['r2']

            self.factors_critic[i, j, 3] = row['r2']

            self.factors_agent[i, j, 4] = row['r3']

            self.factors_critic[i, j, 4] = row['r3']

            self.factors_agent[i, j, 5] = row['r4']

            self.factors_critic[i, j, 5] = row['r4']

            self.factors_agent[i, j, 6] = row['r5']

            self.factors_critic[i, j, 6] = row['r5']

            self.factors_agent[i, j, 7] = row['m']

            self.factors_critic[i, j, 7] = row['m']

            self.factors_agent[i, j, 8] = row['k']

            self.factors_critic[i, j, 8] = row['k']

            self.factors_agent[i, j, 9] = row['day_w']

            self.factors_critic[i, j, 9] = row['sd']

            self.factors_agent[i, j, 10] = row['day_m']

            self.factors_critic[i, j, 10] = row['avg']

            self.factors_agent[i, j, 11] = row['hour_d']

            j += 1

        i += 1

        self.history_len = j

    def get_agent_reward(self, repeat):
        """This function gets the agent reward""" 

        # If the track is something the user has heard before, take the reward
        # to the power (1/2)

        if repeat > 0:

            reward =  math.pow(self.reward,0.5)

        else:

            reward = self.reward

        # Due to the square in the operation, the magnitude of the reward is
        # capped at 1 - 1E-7 due to machine precision concerns - verified
        # through testing

        if reward > 0.9999999:

            reward = 0.9999999 

        reward = torch.tensor([reward], requires_grad = True)

        self.reward = reward

    def get_critic_loss(self, current_user_history, data):
        """This function get the critic loss"""

        user = data[data.user_id == current_user_history.user_id.values[0]]

        user = user[['r0','r1','r2','r3', 'r4', 'r5']]

        user_array = user.to_numpy()

        # In order to use handy dandy numpy list comprehensions, we need to
        # make an overly bulky array for the averages, both for target and for
        # selection (as passed to self.custom_loss_critic)

        selection_averages = []

        selection_averages.append(np.average(current_user_history.r0.values))

        selection_averages.append(np.average(current_user_history.r1.values))

        selection_averages.append(np.average(current_user_history.r2.values))

        selection_averages.append(np.average(current_user_history.r3.values))

        selection_averages.append(np.average(current_user_history.r4.values))

        selection_averages.append(np.average(current_user_history.r5.values))

        selection_averages = np.array(selection_averages)

        # This line here gives selection_averages a 2nd dimension to match time,
        # while the repeat command copies these average values through the time
        # axis

        selection_averages = np.repeat(selection_averages[None,:], 
                                       current_user_history.shape[0],
                                       axis = 0)

        selection_averages = selection_averages[-10:]

        selection_array=current_user_history[['r0','r1','r2','r3', 'r4', 'r5']]

        selection_array = selection_array[-10:]

        selection_array = selection_array.to_numpy()

        # Here we repeat this process for the whole user history as reflected
        # by user

        target_averages = []

        target_averages.append(np.average(user.r0.values))

        target_averages.append(np.average(user.r1.values))
       
        target_averages.append(np.average(user.r2.values))
       
        target_averages.append(np.average(user.r3.values))
       
        target_averages.append(np.average(user.r4.values))
       
        target_averages.append(np.average(user.r5.values))
        
        target_averages = np.array(target_averages)

        target_averages = np.repeat(target_averages[None, :],
                                    selection_array.shape[0],
                                    axis = 0)
        
        critic_loss = []

        end  = selection_array.shape[0]

        start = 0

        while end < user_array.shape[0]:
            
            critic_loss.append(self.custom_loss_critic(user_array[start:end,],
                                                selection_array,
                                                selection_averages,
                                                target_averages))

            start += 1

            end += 1

        if len(critic_loss) > 0:

            critic_loss = np.average(critic_loss)

        else:

            critic_loss = 0.0

        critic_loss = torch.tensor([critic_loss], requires_grad = True)

        self.critic_loss = critic_loss

    def predict(self, user_history):
        """This function manages the training of the model based on the provided
        data"""

        self.factorize(user_history)

        self.pred = self.model_agent(torch.Tensor(self.factors_agent))

    def propagate(self, current_user_history, data, prediction, repeat):
        """This function propagates the loss through the actor and critic"""

        self.add_prediction(prediction)

        # Clear out the gradients from the last prediction

        self.model_agent.zero_grad()

        self.model_critic.zero_grad()

        # Get the critic reward

        self.reward = self.model_critic(torch.Tensor(self.factors_critic))

        self.get_agent_reward(repeat)

        # Get the agent loss and apply it
        
        agent_loss = self.loss_agent(self.reward, torch.tensor([1.0]))

        agent_loss.backward()

        self.optimizer_agent.step()

        # Get the critic loss and apply it

        self.get_critic_loss(current_user_history, data)

        evaluated_critic_loss = self.loss_critic(self.critic_loss,
                                                 torch.tensor([6.0]))

        evaluated_critic_loss.backward()

        self.optimizer_critic.step()

    def ready_agent(self, agent_model_path, critic_model_path, train):
        """This function sets up a working agent - one complete with a loss
        function and a model"""

        self.is_train = train 

        self.model_agent = torch.load(agent_model_path)

        if self.model_agent is not None:

            print("Actor Model {} successfully loaded.\n".format(agent_model_path))

        self.model_critic = torch.load(critic_model_path)

        if self.model_critic is not None:

            print("Critic Model {} successfully loaded.\n".format(critic_model_path))

    def set_model_weights(self, model):
        """This function initializes the weights of the linear layers in a
        PyTorch model"""

        for layer in model.modules():

            if isinstance(layer, torch.nn.Linear):

                n = layer.in_features

                y = 1.0 / np.sqrt(n)

                layer.weight.data.uniform_(-y, y)

                layer.bias.data.fill_(0)

    def wake_agent(self, train):
        """This function sets up a working agent - one complete with a loss
        function and a model"""

        self.is_train = train 

        self.add_model()
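
A minimal driver sketch for this class, assuming user_history and data are pandas DataFrames with the columns referenced above and that prediction is a dict with the keys consumed by add_prediction (the concrete values below are placeholders, not part of the project):

a = agent()
a.wake_agent(train=True)    # builds the models, optimizers and loss functions
a.predict(user_history)     # factorizes the history and runs the agent model
prediction = {'score': 0.5, 'r0': 0.1, 'r1': 0.2, 'r2': 0.3, 'r3': 0.1,
              'r4': 0.2, 'r5': 0.1, 'sd': 0.05, 'avg': 0.4, 'm': 1, 'k': 3}
a.propagate(user_history, data, prediction, repeat=0)    # one update step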
Code example #6
def main():
    # Create the game environment; skill_frame is the number of times each action is repeated, resize_shape is the image-preprocessing size, render_preprocess controls whether the preprocessed image is shown
    env = retro_util.RetroEnv(game=args.env,
                              resize_shape=RESIZE_SHAPE,
                              skill_frame=SKILL_FRAME,
                              render_preprocess=args.show_play,
                              is_train=True)
    env.seed(1)

    # Shape of the game image
    # obs_dim = env.observation_space.shape
    obs_dim = RESIZE_SHAPE
    # Action dimension
    action_dim = env.action_space.n
    # Maximum absolute value of an action (positive or negative)
    max_action = 1

    # Create the models
    actor = ActorModel(action_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(actor=actor,
                                    critic=critic,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, action_dim)

    # Load the pretrained model
    if os.path.exists(args.model_path):
        logger.info("Loading the pretrained model...")
        agent.restore(args.model_path)

    # Create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)

    total_steps = 0
    step_train = 0
    print("开始训练模型。。。")
    while total_steps < args.train_total_steps:
        # Train
        train_reward, steps = run_train_episode(env,
                                                agent,
                                                rpm,
                                                render=args.show_play)
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)
        total_steps += steps

        # Evaluate
        if step_train % 100 == 0:
            evaluate_reward = run_evaluate_episode(env,
                                                   agent,
                                                   render=args.show_play)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
        step_train += 1

        # Save the model
        if not os.path.exists(args.model_path):
            os.makedirs(args.model_path)
        agent.save(args.model_path)
Code example #7
    counter = Counter()
    env = gym.make(args.env_name)
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n  # for Discrete object
    # a_dim = env.action_space.shape[0]  # for Box object
    print(s_dim, a_dim)
    # state = env.reset()
    # state = Variable(torch.Tensor(state).unsqueeze(0))
    # print(state.dim())
    # linear = torch.nn.Linear(3, 1)
    # out = linear(state)
    # print(out)

    from model import ActorModel, CriticModel
    actor = ActorModel(s_dim, a_dim)
    critic = CriticModel(s_dim)
    actor.share_memory()
    critic.share_memory()
    # print(actor, critic)

    import torch.multiprocessing as mp
    update_event, rolling_event = mp.Event(), mp.Event()
    update_event.clear()  # not update now
    rolling_event.set()  # start to roll out
    queue = mp.Queue()  # workers put data in this queue
    counter = Counter()
    queue_size = Counter()

    worker(args, actor, critic, update_event, rolling_event, queue, counter, queue_size)
    # t = Variable(torch.randn(5))
    # print(t)
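
The worker function called above is not shown in this snippet. Below is a rough sketch of the kind of rollout loop it might implement, based only on the comments in the snippet (workers put data in the queue, rolling_event gates rollouts, update_event wakes the updater); the actor-output convention, the args.num_episodes flag, and the transition format are illustrative assumptions:

import gym
import torch

def worker(args, actor, critic, update_event, rolling_event, queue, counter, queue_size):
    env = gym.make(args.env_name)
    for _ in range(args.num_episodes):           # assumed flag
        rolling_event.wait()                     # pause while an update is in progress
        state, done = env.reset(), False
        while not done:
            with torch.no_grad():
                # Assumes the actor returns action probabilities for a Discrete space
                probs = actor(torch.Tensor(state).unsqueeze(0))
            action = torch.multinomial(probs, 1).item()
            next_state, reward, done, _ = env.step(action)
            queue.put((state, action, reward, next_state, done))  # workers put data in this queue
            state = next_state
        # Hand control to the updater after each rollout
        rolling_event.clear()
        update_event.set()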
Code example #8
CHECKPOINT_GOALIE_ACTOR = './checkpoint_goalie_actor.pth'
CHECKPOINT_GOALIE_CRITIC = './checkpoint_goalie_critic.pth'
CHECKPOINT_STRIKER_ACTOR = './checkpoint_striker_actor.pth'
CHECKPOINT_STRIKER_CRITIC = './checkpoint_striker_critic.pth'

# Actors and Critics
GOALIE_0_KEY = 0
STRIKER_0_KEY = 0
GOALIE_1_KEY = 1
STRIKER_1_KEY = 1

# NEURAL MODEL
goalie_actor_model = ActorModel(goalie_state_size,
                                goalie_action_size).to(DEVICE)
goalie_critic_model = CriticModel(goalie_state_size + striker_state_size +
                                  goalie_state_size +
                                  striker_state_size).to(DEVICE)
goalie_optim = optim.Adam(list(goalie_actor_model.parameters()) +
                          list(goalie_critic_model.parameters()),
                          lr=GOALIE_LR)
# self.optim = optim.RMSprop( list( self.actor_model.parameters() ) + list( self.critic_model.parameters() ), lr=lr, alpha=0.99, eps=1e-5 )

striker_actor_model = ActorModel(striker_state_size,
                                 striker_action_size).to(DEVICE)
striker_critic_model = CriticModel(striker_state_size + goalie_state_size +
                                   striker_state_size +
                                   goalie_state_size).to(DEVICE)
striker_optim = optim.Adam(list(striker_actor_model.parameters()) +
                           list(striker_critic_model.parameters()),
                           lr=STRIKER_LR)
# self.optim = optim.RMSprop( list( self.actor_model.parameters() ) + list( self.critic_model.parameters() ), lr=lr, alpha=0.99, eps=1e-5 )
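
A brief sketch of how the centralized critic above might be evaluated, assuming CriticModel.forward takes a single tensor whose last dimension matches the concatenated size passed to its constructor (the observation tensors below are placeholders, not part of the snippet):

import torch

goalie_0_obs = torch.zeros(1, goalie_state_size)
striker_0_obs = torch.zeros(1, striker_state_size)
goalie_1_obs = torch.zeros(1, goalie_state_size)
striker_1_obs = torch.zeros(1, striker_state_size)

# The goalie critic scores the joint observation of all four agents
joint_obs = torch.cat(
    [goalie_0_obs, striker_0_obs, goalie_1_obs, striker_1_obs], dim=-1).to(DEVICE)
value = goalie_critic_model(joint_obs)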