import os

import boto3
from boto3.dynamodb.conditions import Key, Attr
from botocore.exceptions import ClientError

from utility.logger import logger

# DynamoDB resource and table handle
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table(os.environ['DYNAMODB_TABLE'])

# Logger configuration
logger = logger()


class Dynamodb:
    # createUser/createStore
    def newItem(self, item):
        try:
            # Fail the write if an item with the same composite key already exists
            response = table.put_item(
                Item=item,
                ConditionExpression='attribute_not_exists(pk) AND attribute_not_exists(sk)')
            return response
        except (ClientError, KeyError) as e:
            logger.error(e)
            raise
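# A minimal usage sketch of the class above (the pk/sk values are hypothetical;
# assumes DYNAMODB_TABLE is set in the environment and the table uses a pk/sk
# composite key). Because put_item is guarded by the condition expression,
# creating the same item twice surfaces as a ClientError whose error code is
# 'ConditionalCheckFailedException':
if __name__ == '__main__':
    db = Dynamodb()
    try:
        db.newItem({'pk': 'USER#42', 'sk': 'PROFILE', 'name': 'Alice'})
    except ClientError as e:
        if e.response['Error']['Code'] == 'ConditionalCheckFailedException':
            logger.error('Item already exists')
        else:
            raise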
import os

import gym
import numpy as np
import tensorflow as tf

# Project-local modules (import paths omitted in this excerpt): gen provides
# Policy_net and PPO, dis provides Discriminator, and log provides the logger.


def main():
    # Mountain Car environment setup
    env = gym.make('MountainCar-v0')
    ob_space = env.observation_space
    action_space = env.action_space
    print(ob_space, action_space)

    # For Reinforcement Learning
    Policy = gen.Policy_net('policy', env)
    Old_Policy = gen.Policy_net('old_policy', env)
    PPO = gen.PPO(Policy, Old_Policy, gamma=0.95)

    # For Inverse Reinforcement Learning
    D = dis.Discriminator(env)

    # Load expert trajectories
    expert_observations = np.genfromtxt('exp_traj/observations.csv')
    next_expert_observations = np.genfromtxt('exp_traj/next_observations.csv')
    expert_actions = np.genfromtxt('exp_traj/actions.csv', dtype=np.int32)
    # Expert returns are only used to report the mean score, not for training
    expert_returns = np.genfromtxt('exp_traj/returns.csv')
    mean_expert_return = np.mean(expert_returns)

    max_episode = 10000
    # The maximum number of steps per episode, to keep the mountain car task a
    # finite Markov decision process (MDP).
    max_steps = 200
    saveReturnEvery = 100
    num_expert_tra = 20

    # Only used to record the training process
    train_logger = log.logger(logger_name='AIRL_MCarV0_Training_Log',
                              logger_path='./trainingLog/',
                              col_names=['Episode', 'Actor(D)', 'Expert Mean(D)',
                                         'Actor', 'Expert Mean'])

    # Model saver
    model_save_path = './model/'
    model_name = 'airl'
    saver = tf.train.Saver(max_to_keep=int(max_episode / 100))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for episode in range(max_episode):
            if episode % 100 == 0:
                print('Episode ', episode)

            observations = []
            actions = []
            rewards = []
            v_preds = []
            obs = env.reset()

            # Interact with the environment until reaching the terminal state
            # or the maximum step count.
            for step in range(max_steps):
                # if episode % 100 == 0:
                #     env.render()
                obs = np.stack([obs]).astype(dtype=np.float32)
                # act, v_pred = Policy.get_action(obs=obs, stochastic=True)
                act, v_pred = Old_Policy.get_action(obs=obs, stochastic=True)
                next_obs, reward, done, _ = env.step(act)

                observations.append(obs)
                actions.append(act)
                # Do NOT use the original rewards to update the policy
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    # The next state of a terminal state has 0 state value
                    v_preds_next = v_preds[1:] + [0]
                    break
                else:
                    obs = next_obs

            # Data preparation.
            # Data for the generator: convert lists to numpy arrays for
            # feeding tf.placeholder
            next_observations = observations[1:]
            observations = observations[:-1]
            actions = actions[:-1]
            next_observations = np.reshape(next_observations,
                                           newshape=[-1] + list(ob_space.shape))
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # G's probabilities
            probabilities = get_probabilities(policy=Policy,
                                              observations=observations,
                                              actions=actions)
            # Experts' probabilities
            expert_probabilities = get_probabilities(policy=Policy,
                                                     observations=expert_observations,
                                                     actions=expert_actions)
            log_probabilities = np.log(probabilities)
            log_expert_probabilities = np.log(expert_probabilities)

            # Prepare data for the discriminator
            if D.only_position:
                observations_for_d = (observations[:, 0]).reshape(-1, 1)
                next_observations_for_d = (next_observations[:, 0]).reshape(-1, 1)
                expert_observations_for_d = (expert_observations[:, 0]).reshape(-1, 1)
                next_expert_observations_for_d = (next_expert_observations[:, 0]).reshape(-1, 1)
                log_probabilities_for_d = log_probabilities.reshape(-1, 1)
                log_expert_probabilities_for_d = log_expert_probabilities.reshape(-1, 1)

            obs, obs_next, acts, path_probs = \
                observations_for_d, next_observations_for_d, \
                actions.reshape(-1, 1), log_probabilities.reshape(-1, 1)
            expert_obs, expert_obs_next, expert_acts, expert_probs = \
                expert_observations_for_d, next_expert_observations_for_d, \
                expert_actions.reshape(-1, 1), log_expert_probabilities.reshape(-1, 1)

            # The generator and expert pools are unequal in size here; this
            # could probably be optimized.
            # Train discriminator
            batch_size = 32
            for i in range(1):
                # Sample generator
                nobs_batch, obs_batch, act_batch, lprobs_batch = \
                    sample_batch(obs_next, obs, acts, path_probs,
                                 batch_size=batch_size)
                # Sample expert
                nexpert_obs_batch, expert_obs_batch, expert_act_batch, expert_lprobs_batch = \
                    sample_batch(expert_obs_next, expert_obs, expert_acts,
                                 expert_probs, batch_size=batch_size)

                # Label generator samples as 0, so the discriminator always
                # treats the generator's behavior as bad; label expert samples
                # as 1, so it always treats the expert's behavior as excellent.
                labels = np.zeros((batch_size * 2, 1))
                labels[batch_size:] = 1.0

                obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
                nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0)
                lprobs_batch = np.concatenate([lprobs_batch, expert_lprobs_batch], axis=0)
                D.train(obs_t=obs_batch,
                        nobs_t=nobs_batch,
                        lprobs=lprobs_batch,
                        labels=labels)

            if episode % 50 == 0:
                drawRewards(D=D, episode=episode, path='./trainingLog/')

            # The output of this discriminator is the reward
            if not D.score_discrim:
                d_rewards = D.get_scores(obs_t=observations_for_d)
            else:
                d_rewards = D.get_l_scores(obs_t=observations_for_d,
                                           nobs_t=next_observations_for_d,
                                           lprobs=log_probabilities_for_d)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)

            # Sum rewards to get the return: only for tracking returns over time.
            d_actor_return = np.sum(d_rewards)

            if not D.score_discrim:
                expert_d_rewards = D.get_scores(obs_t=expert_observations_for_d)
            else:
                expert_d_rewards = D.get_l_scores(obs_t=expert_observations_for_d,
                                                  nobs_t=next_expert_observations_for_d,
                                                  lprobs=log_expert_probabilities_for_d)
            expert_d_rewards = np.reshape(expert_d_rewards,
                                          newshape=[-1]).astype(dtype=np.float32)
            d_expert_return = np.sum(expert_d_rewards) / num_expert_tra

            # ** Start logging **: only used to track information
            train_logger.add_row_data([episode, d_actor_return, d_expert_return,
                                       sum(rewards), mean_expert_return],
                                      saveFlag=True)
            if episode % saveReturnEvery == 0:
                train_logger.plotToFile(title='Return')
            # ** End logging **

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds,
                                v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            inp = [observations, actions, gaes, d_rewards, v_preds_next]

            if episode % 4 == 0:
                PPO.assign_policy_parameters()
            # PPO.assign_policy_parameters()

            for epoch in range(10):
                # Sample training data; indices are drawn from [low, high)
                sample_indices = np.random.randint(low=0,
                                                   high=observations.shape[0],
                                                   size=32)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0)
                               for a in inp]
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            # Save model
            if episode > 0 and episode % 100 == 0:
                saver.save(sess, os.path.join(model_save_path, model_name),
                           global_step=episode)
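# The training loop above relies on two helpers that are not defined in this
# excerpt. The sketches below are assumptions inferred from the call sites, not
# the repository's actual implementations: get_probabilities looks up
# pi(a|s) for each taken action (assuming the policy exposes a full action
# distribution, here via a hypothetical policy.get_action_prob), and
# sample_batch draws one aligned random mini-batch from several arrays.

def get_probabilities(policy, observations, actions):
    # Assumed helper: policy.get_action_prob returns one probability row per
    # observation; pick out the probability of the action actually taken.
    action_probabilities = policy.get_action_prob(obs=observations)
    return action_probabilities[np.arange(len(actions)), actions]


def sample_batch(*args, batch_size=32):
    # Draw the same random rows from every array so the
    # (s', s, a, log pi) tuples stay aligned.
    indices = np.random.randint(low=0, high=args[0].shape[0], size=batch_size)
    return [np.take(a=a, indices=indices, axis=0) for a in args]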
def main():
    # Environment
    env = gym.make('MountainCar-v0')
    ob_space = env.observation_space

    # RL part
    Policy = gen.Policy_net('policy', env)
    Old_Policy = gen.Policy_net('old_policy', env)
    PPO = gen.PPO(Policy, Old_Policy, gamma=0.95)

    # IRL part
    D = dis.Discriminator(env)

    # Load expert demonstrations
    # NumPy can read text files directly into numeric arrays
    expert_observations = np.genfromtxt('exp_traj/observations.csv')
    next_expert_observations = np.genfromtxt('exp_traj/next_observations.csv')
    expert_actions = np.genfromtxt('exp_traj/actions.csv', dtype=np.int32)
    expert_returns = np.genfromtxt('exp_traj/returns.csv')
    mean_expert_return = np.mean(expert_returns)

    # Fewer episodes are enough for testing
    max_episode = 500
    max_steps = 200
    saveReturnEvery = 100
    num_expert_tra = 20

    # Saver to save all the variables
    model_save_path = './model/'
    model_name = 'airl'
    saver = tf.train.Saver()

    ckpt = tf.train.get_checkpoint_state(model_save_path)
    if ckpt and ckpt.model_checkpoint_path:
        print('Found Saved Model.')
        # -1 selects the latest checkpoint
        ckpt_to_restore = ckpt.all_model_checkpoint_paths[-1]
    else:
        print('No Saved Model. Exiting')
        exit()

    # Logger for recording the testing process
    test_logger = log.logger(logger_name='MCarV0_Training_Log',
                             logger_path='./testLog_' + ckpt_to_restore.split('-')[-1] + '/',
                             col_names=['Episode', 'Actor(D)', 'Expert Mean(D)',
                                        'Actor', 'Expert Mean'])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Restore model
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt_to_restore)
            print('Model Restored.')

        obs = env.reset()
        # Do NOT use rewards to update the policy

        # Play many episodes
        for episode in range(max_episode):
            if episode % 100 == 0:
                print('Episode ', episode)

            # Before each episode, prepare buffers to collect what is
            # encountered along the way
            observations = []
            actions = []
            rewards = []
            v_preds = []

            # Step through this episode
            obs = env.reset()
            for step in range(max_steps):
                if episode % 100 == 0:
                    env.render()
                obs = np.stack([obs]).astype(dtype=np.float32)
                # When testing, stochastic=False gives better performance
                # act, v_pred = Policy.get_action(obs=obs, stochastic=True)
                act, v_pred = Policy.get_action(obs=obs, stochastic=False)
                # act = act.item()
                # v_pred = v_pred.item()

                # Interact with the environment
                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                # The reward here is not used to update the network; it only
                # records the true performance.
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    # The next state of a terminal state has 0 state value
                    v_preds_next = v_preds[1:] + [0]
                    break
                else:
                    obs = next_obs

            print('Rewards: ', sum(rewards))

            ##################
            # Logging
            ##################
            next_observations = observations[1:]
            observations = observations[:-1]
            actions = actions[:-1]
            next_observations = np.reshape(next_observations,
                                           newshape=[-1] + list(ob_space.shape))
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # Get the G's probabilities
            probabilities = get_probabilities(policy=Policy,
                                              observations=observations,
                                              actions=actions)
            # Get the experts' probabilities
            expert_probabilities = get_probabilities(policy=Policy,
                                                     observations=expert_observations,
                                                     actions=expert_actions)

            # np.log is the natural logarithm (base e)
            log_probabilities = np.log(probabilities)
            log_expert_probabilities = np.log(expert_probabilities)

            if D.only_position:
                observations_for_d = (observations[:, 0]).reshape(-1, 1)
                next_observations_for_d = (next_observations[:, 0]).reshape(-1, 1)
                expert_observations_for_d = (expert_observations[:, 0]).reshape(-1, 1)
                next_expert_observations_for_d = (next_expert_observations[:, 0]).reshape(-1, 1)
                log_probabilities_for_d = log_probabilities.reshape(-1, 1)
                log_expert_probabilities_for_d = log_expert_probabilities.reshape(-1, 1)

            # The output of this discriminator is the reward
            if not D.score_discrim:
                d_rewards = D.get_scores(obs_t=observations_for_d)
            else:
                d_rewards = D.get_l_scores(obs_t=observations_for_d,
                                           nobs_t=next_observations_for_d,
                                           lprobs=log_probabilities_for_d)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)
            d_actor_return = np.sum(d_rewards)

            # d_expert_return: just for tracking
            if not D.score_discrim:
                expert_d_rewards = D.get_scores(obs_t=expert_observations_for_d)
            else:
                expert_d_rewards = D.get_l_scores(obs_t=expert_observations_for_d,
                                                  nobs_t=next_expert_observations_for_d,
                                                  lprobs=log_expert_probabilities_for_d)
            expert_d_rewards = np.reshape(expert_d_rewards,
                                          newshape=[-1]).astype(dtype=np.float32)
            d_expert_return = np.sum(expert_d_rewards) / num_expert_tra

            test_logger.add_row_data([episode, d_actor_return, d_expert_return,
                                      sum(rewards), mean_expert_return],
                                     saveFlag=True)
            if episode % saveReturnEvery == 0:
                test_logger.plotToFile(title='Return By Stochastic Policy')
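# For reference, the AIRL discriminator (Fu et al., 2018) has the form
# D(s, a, s') = exp(f(s, s')) / (exp(f(s, s')) + pi(a|s)), whose log-odds
# recover f(s, s') - log pi(a|s). The sketch below is an assumption about how
# the two scoring modes used above relate (f_values stands in for the reward
# approximator's outputs; the repository's real methods run inside the TF graph):
def airl_scores(f_values, log_probs=None):
    if log_probs is None:
        # get_scores: reward is the raw reward-approximator output f
        return f_values
    # get_l_scores: reward is the discriminator's log-odds,
    # log D - log(1 - D) = f(s, s') - log pi(a|s)
    return f_values - log_probs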
def main():
    # Env
    env = gym.make('MountainCar-v0')
    ob_space = env.observation_space

    # For Reinforcement Learning
    Policy = gen.Policy_net('policy', env)
    Old_Policy = gen.Policy_net('old_policy', env)
    PPO = gen.PPO(Policy, Old_Policy, gamma=0.95)

    # For Inverse Reinforcement Learning
    D = dis.Discriminator(env)

    # Load expert demonstrations
    expert_observations = np.genfromtxt('exp_traj/observations.csv')
    next_expert_observations = np.genfromtxt('exp_traj/next_observations.csv')
    expert_actions = np.genfromtxt('exp_traj/actions.csv', dtype=np.int32)
    expert_returns = np.genfromtxt('exp_traj/returns.csv')
    mean_expert_return = np.mean(expert_returns)

    max_episode = 24000
    max_steps = 200
    saveReturnEvery = 100
    num_expert_tra = 20

    # Saver restoring only the discriminator's variables
    model_save_path = './model/'
    model_name = 'airl'
    saver = tf.train.Saver(var_list=tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='discriminator'))

    ckpt = tf.train.get_checkpoint_state(model_save_path)
    if ckpt and ckpt.model_checkpoint_path:
        print('Found Saved Model.')
        # -1 selects the latest checkpoint
        ckpt_to_restore = ckpt.all_model_checkpoint_paths[-1]
    else:
        print('No Saved Model. Exiting')
        exit()

    # Logger for recording the training process
    train_logger = log.logger(logger_name='AIRL_MCarV0_Training_Log',
                              logger_path='./trainingLog/',
                              col_names=['Episode', 'Actor(D)', 'Expert Mean(D)',
                                         'Actor', 'Expert Mean'])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Restore model
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt_to_restore)
            print('Model Restored.')

        obs = env.reset()
        # Do NOT use the original rewards to update the policy
        for episode in range(max_episode):
            if episode % 100 == 0:
                print('Episode ', episode)

            observations = []
            actions = []
            rewards = []
            v_preds = []

            # Step through this episode
            obs = env.reset()
            for step in range(max_steps):
                # if episode % 100 == 0:
                #     env.render()
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.get_action(obs=obs, stochastic=True)
                # np.asscalar is deprecated; .item() is the modern equivalent
                act = act.item()
                v_pred = v_pred.item()

                # Interact with the environment
                next_obs, reward, done, info = env.step(act)

                observations.append(obs)
                actions.append(act)
                # The reward here is not used to update the network; it only
                # records the true performance.
                rewards.append(reward)
                v_preds.append(v_pred)

                if done:
                    # The next state of a terminal state has 0 state value
                    v_preds_next = v_preds[1:] + [0]
                    break
                else:
                    obs = next_obs

            # After the rollout, use the collected data to train the networks.
            # The expert data is already prepared; the generator's data must be
            # converted from lists to numpy arrays for feeding tf.placeholder
            next_observations = observations[1:]
            observations = observations[:-1]
            actions = actions[:-1]
            next_observations = np.reshape(next_observations,
                                           newshape=[-1] + list(ob_space.shape))
            observations = np.reshape(observations,
                                      newshape=[-1] + list(ob_space.shape))
            actions = np.array(actions).astype(dtype=np.int32)

            # Get the G's probabilities
            probabilities = get_probabilities(policy=Policy,
                                              observations=observations,
                                              actions=actions)
            # Get the experts' probabilities
            expert_probabilities = get_probabilities(policy=Policy,
                                                     observations=expert_observations,
                                                     actions=expert_actions)

            # np.log is the natural logarithm (base e)
            log_probabilities = np.log(probabilities)
            log_expert_probabilities = np.log(expert_probabilities)

            if D.only_position:
                observations_for_d = (observations[:, 0]).reshape(-1, 1)
                next_observations_for_d = (next_observations[:, 0]).reshape(-1, 1)
                expert_observations_for_d = (expert_observations[:, 0]).reshape(-1, 1)
                next_expert_observations_for_d = (next_expert_observations[:, 0]).reshape(-1, 1)

            # Line the data up
            obs, obs_next, acts, path_probs = \
                observations_for_d, next_observations_for_d, actions, log_probabilities
            expert_obs, expert_obs_next, expert_acts, expert_probs = \
                expert_observations_for_d, next_expert_observations_for_d, \
                expert_actions, log_expert_probabilities
            acts = acts.reshape(-1, 1)
            expert_acts = expert_acts.reshape(-1, 1)
            path_probs = path_probs.reshape(-1, 1)
            expert_probs = expert_probs.reshape(-1, 1)

            # Train the discriminator to obtain the reward function.
            # print('Train D')
            # The generator and expert pools are unequal in size here; this
            # could probably be optimized.
            batch_size = 32
            for i in range(2):
                # Sample a generator batch
                nobs_batch, obs_batch, act_batch, lprobs_batch = \
                    sample_batch(obs_next, obs, acts, path_probs,
                                 batch_size=batch_size)
                # Sample an expert batch
                nexpert_obs_batch, expert_obs_batch, expert_act_batch, expert_lprobs_batch = \
                    sample_batch(expert_obs_next, expert_obs, expert_acts,
                                 expert_probs, batch_size=batch_size)

                # First half: negative samples (0); second half: positive samples (1)
                labels = np.zeros((batch_size * 2, 1))
                labels[batch_size:] = 1.0

                # Concatenate and feed into the network for training
                obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
                nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0)
                # If the discriminator depends only on the state, this is unused
                act_batch = np.concatenate([act_batch, expert_act_batch], axis=0)
                lprobs_batch = np.concatenate([lprobs_batch, expert_lprobs_batch], axis=0)
                # The discriminator is restored from the checkpoint and kept
                # frozen here; uncomment below to keep fine-tuning it early on.
                # if episode <= 9000:
                #     D.train(obs_t=obs_batch,
                #             nobs_t=nobs_batch,
                #             lprobs=lprobs_batch,
                #             labels=labels)
                # else:
                #     pass

            if episode % 50 == 0:
                drawRewards(D=D, episode=episode, path='./trainingLog/')

            # The output of this discriminator is the reward
            d_rewards = D.get_scores(obs_t=observations_for_d)
            # d_rewards = D.get_scores(obs_t=observations, nobs_t=next_observations, lprobs=log_probabilities)
            d_rewards = np.reshape(d_rewards, newshape=[-1]).astype(dtype=np.float32)
            d_actor_return = np.sum(d_rewards)
            # print(d_actor_return)

            # d_expert_return: just for tracking
            expert_d_rewards = D.get_scores(obs_t=expert_observations_for_d)
            # expert_d_rewards = D.get_scores(obs_t=expert_observations, nobs_t=next_expert_observations, lprobs=log_expert_probabilities)
            expert_d_rewards = np.reshape(expert_d_rewards,
                                          newshape=[-1]).astype(dtype=np.float32)
            d_expert_return = np.sum(expert_d_rewards) / num_expert_tra
            # print(d_expert_return)

            ######################
            #   Start Logging    #
            ######################
            train_logger.add_row_data([episode, d_actor_return, d_expert_return,
                                       sum(rewards), mean_expert_return],
                                      saveFlag=True)
            if episode % saveReturnEvery == 0:
                train_logger.plotToFile(title='Return')
            ######################
            #    End logging     #
            ######################

            gaes = PPO.get_gaes(rewards=d_rewards, v_preds=v_preds,
                                v_preds_next=v_preds_next)
            gaes = np.array(gaes).astype(dtype=np.float32)
            # gaes = (gaes - gaes.mean()) / gaes.std()
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # Train the policy to get a better policy
            inp = [observations, actions, gaes, d_rewards, v_preds_next]
            # if episode % 4 == 0:
            #     PPO.assign_policy_parameters()
            PPO.assign_policy_parameters()
            for epoch in range(160):
                # Sample training data; indices are drawn from [low, high)
                sample_indices = np.random.randint(low=0,
                                                   high=observations.shape[0],
                                                   size=32)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0)
                               for a in inp]
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          gaes=sampled_inp[2],
                          rewards=sampled_inp[3],
                          v_preds_next=sampled_inp[4])

            # Save the whole model
            if episode > 0 and episode % 100 == 0:
                saver.save(sess, os.path.join(model_save_path, model_name),
                           global_step=episode)
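# PPO.get_gaes above computes generalized advantage estimates from the
# discriminator rewards and the critic's value predictions. A minimal
# stand-alone sketch of the standard GAE recursion (the repository's own
# implementation is not shown here and may, for example, fix lam = 1):
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lam * A_{t+1}
def get_gaes_sketch(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    # One-step TD residuals
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    # Accumulate discounted residuals backwards through the trajectory
    gaes = list(deltas)
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = deltas[t] + gamma * lam * gaes[t + 1]
    return gaes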