def main():
    """Build the CIFAR-10 data loaders and run the policy-gradient search.

    Command-line arguments are read through the module-level ``parser``.
    """
    cli_args = parser.parse_args()
    # NOTE(review): the parsed flag is read and then unconditionally
    # overridden, so CUDA is always requested -- confirm this is intended.
    use_cuda = cli_args.use_cuda
    use_cuda = True

    channel_stats = (0.5, 0.5, 0.5)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(channel_stats, channel_stats),
    ])

    # Train split first, then test split; only the train loader shuffles.
    loaders = {}
    for is_train in (True, False):
        dataset = torchvision.datasets.CIFAR10(
            root='./data', train=is_train, download=True, transform=transform)
        loaders[is_train] = torch.utils.data.DataLoader(
            dataset, batch_size=4, shuffle=is_train, num_workers=2)

    policy_gradient = PolicyGradient(
        config=Params,
        train_set=loaders[True],
        test_set=loaders[False],
        use_cuda=use_cuda,
    )
    policy_gradient.solve_environment()
def __init__(self, n_x, n_y, learning_rate=0.02, reward_decay=0.99,
             load_path=None, save_path=None):
    """Create the wrapped ``PolicyGradient`` and keep it on ``self.PG``.

    All arguments are forwarded unchanged: ``n_x`` and ``n_y`` positionally,
    the remaining ones as keyword arguments.
    """
    forwarded = {
        'learning_rate': learning_rate,
        'reward_decay': reward_decay,
        'load_path': load_path,
        'save_path': save_path,
    }
    self.PG = PolicyGradient(n_x, n_y, **forwarded)
class AgentPolicyGradient:
    """Thin facade over a ``PolicyGradient`` instance stored on ``self.PG``."""

    def __init__(self, n_x, n_y, learning_rate=0.02, reward_decay=0.99,
                 load_path=None, save_path=None):
        """Build the underlying ``PolicyGradient`` from the given settings."""
        self.PG = PolicyGradient(
            n_x,
            n_y,
            learning_rate=learning_rate,
            reward_decay=reward_decay,
            load_path=load_path,
            save_path=save_path,
        )

    def choose_action(self, observation):
        """Delegate action selection to the wrapped policy."""
        policy = self.PG
        return policy.choose_action(observation)

    def store_transition(self, s, a, r):
        """Forward one (state, action, reward) triple to the wrapped policy."""
        policy = self.PG
        return policy.store_transition(s, a, r)

    def learn(self):
        """Run the wrapped policy's learning step and return its result."""
        policy = self.PG
        return policy.learn()

    def plot_cost(self):
        """Plot the wrapped policy's ``cost_history`` using the TkAgg backend."""
        import matplotlib
        matplotlib.use('TkAgg')
        import matplotlib.pyplot as plt

        steps = np.arange(len(self.PG.cost_history))
        plt.plot(steps, self.PG.cost_history)
        plt.ylabel('Cost Ex')
        plt.xlabel('Training Steps Ex')
        plt.show()

    def crashed(self):
        """Return True when the summed episode rewards fall below -250."""
        total = sum(self.PG.episode_rewards)
        return total < -250

    def episode_reward(self):
        """Return the sum of the rewards stored for the current episode."""
        return sum(self.PG.episode_rewards)

    def costs(self):
        """Return whatever the wrapped policy's ``costs()`` returns."""
        policy = self.PG
        return policy.costs()
def winRate(load_path, episodes, player_num):
    """Play ``episodes`` Ludo games with a loaded policy and tally the winners.

    Player 1 picks its moves through the ``PolicyGradient`` built from
    ``load_path``; every other player uses ``act.getAction`` with only the
    movable pieces.

    Returns:
        ``(winnerCount, preds)`` -- wins per player index, and one
        ``[wrongPred, totalMoves]`` pair per finished game.
    """
    tf.reset_default_graph()

    number_of_players, number_of_pieces = 2, 4
    reward = -1000
    seats = list(reversed(range(0, 4)))
    ghost_players = seats[:-number_of_players]

    act = util.Action(number_of_players, number_of_pieces, reward)
    winnerCount = defaultdict(int)
    print(load_path, "---")
    PG = PolicyGradient(
        n_x=(number_of_players * number_of_pieces) + 5,  # input layer size
        n_y=5,  # output layer size
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        save_path=None,
        player_num=player_num,
    )

    preds = []
    for _episode in range(episodes):
        g = ludopy.Game(ghost_players=ghost_players,
                        number_of_pieces=number_of_pieces)
        totalMoves = 0
        wrongPred = 0
        there_is_a_winner = False
        # Keep playing rounds until some move ends the game.
        while not there_is_a_winner:
            for _turn in range(number_of_players):
                observation, player_i = g.get_observation()
                (dice, move_pieces, player_pieces, enemy_pieces,
                 player_is_a_winner, there_is_a_winner) = observation

                if player_i != 1:
                    action = act.getAction(move_pieces=move_pieces)
                else:
                    action, was_random = act.getAction(
                        PG, enemy_pieces, player_pieces, move_pieces, dice)
                    totalMoves += 1
                    if was_random:
                        wrongPred += 1

                _, _, _, _, _, there_is_a_winner = g.answer_observation(action)
                if there_is_a_winner:
                    winnerCount[player_i] += 1
                    break
        preds.append([wrongPred, totalMoves])

    return winnerCount, preds
def main():
    """Set up logging, seed the RNGs and run the selected search algorithm.

    Reads the module-level ``args``. A timestamped experiment directory is
    created and log records go both to stdout and to ``log.txt`` inside it.
    ``args.algorithm`` selects ``PPO``, ``PG`` or, for any other value,
    ``RandomSearch``.
    """
    exp_dir = 'search_{}_{}'.format(args.algorithm, time.strftime("%Y%m%d-%H%M%S"))
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(exp_dir, exist_ok=True)

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(exp_dir, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)
    logging.info('args = %s', args)

    # Only the learning-based searches need seeding and a torch device.
    if args.algorithm in ('PPO', 'PG'):
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        if torch.cuda.is_available():
            device = torch.device('cuda:{}'.format(str(args.gpu)))
            cudnn.benchmark = True
            # The flag is spelled ``enabled``; assigning ``cudnn.enable``
            # only creates an unused attribute.
            cudnn.enabled = True
            logging.info('using gpu : {}'.format(args.gpu))
            torch.cuda.manual_seed(args.seed)
        else:
            device = torch.device('cpu')
            logging.info('using cpu')

    if args.algorithm == 'PPO':
        ppo = PPO(args, device)
        ppo.multi_solve_environment()
    elif args.algorithm == 'PG':
        pg = PolicyGradient(args, device)
        pg.multi_solve_environment()
    else:
        rs = RandomSearch(args)
        rs.multi_solve_environment()
def main():
    """Train a CartPole policy-gradient agent inside a dygraph guard.

    Runs 1000 training episodes, logs the reward sum every 10th episode and
    logs a test-episode reward after every 100th episode.
    """
    env = gym.make('CartPole-v0')
    model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
    agent = CartpoleAgent(PolicyGradient(model, LEARNING_RATE), OBS_DIM, ACT_DIM)

    total_episodes = 1000
    with fluid.dygraph.guard():
        for episode in range(total_episodes):
            obs_list, action_list, reward_list = run_episode(env, agent)
            if episode % 10 == 0:
                logger.info("Episode {}, Reward Sum {}.".format(
                    episode, sum(reward_list)))

            agent.learn(
                np.array(obs_list),
                np.array(action_list),
                calc_reward_to_go(reward_list),
            )

            if (episode + 1) % 100 != 0:
                continue
            # Periodic evaluation: only the rewards of the test run are used.
            _, _, test_rewards = run_episode(env, agent, train_or_test='test')
            logger.info('Test reward: {}'.format(np.sum(test_rewards)))
# env1 = gym.make('sateDCA_ENV-v0') # env1 = env1.unwrapped nOfenb = 2 nOfchannel = 12 nOfue = 2 if __name__ == "__main__": # Load checkpoint load_path = None save_path = None PG = PolicyGradient(n_x=4 + nOfenb * nOfchannel, n_y=nOfenb * nOfchannel, learning_rate=0.005, reward_decay=1, load_path=load_path, save_path=save_path, ep=0.99) env = ns3env.Ns3Env(port=port, startSim=startSim, simSeed=seed, simArgs=simArgs, debug=debug) env.reset() ob_space = env.observation_space ac_space = env.action_space
tf.flags.DEFINE_float('display_threshold', 10, 'the reward threshold to display render') tf.flags.DEFINE_boolean('render', False, 'render waste time') tf.flags.DEFINE_boolean('output_graph', False, 'whether to save graph') tf.flags.DEFINE_string('env_name', 'CartPole-v0', 'env name') tf.flags.DEFINE_integer('episode', 1000, 'train episode') RENDER = FLAGS.render env = gym.make(FLAGS.env_name) env.seed(1) env = env.unwrapped PG = PolicyGradient(n_actions=env.action_space.n, n_features=env.observation_space.shape[0], lr=0.02, gamma=0.99, output_graph=FLAGS.output_graph) for i in range(FLAGS.episode): s = env.reset() while True: if RENDER: env.render() action = PG.choose_action(s) s_, r, done, info = env.step(action) PG.store_transition(s_, action, r) if done: episode_rs_sum = sum(PG.ep_rs) if 'running_reward' not in globals():
def test_cartpole():
    """Create a CartPole-v0 environment and train the agent for 5000 episodes."""
    cartpole_env = gym.make('CartPole-v0')
    agent_pg = PolicyGradient(cartpole_env)
    agent_pg.learning(episodes=5000)
def simulation():
    """Run the top-K policy-gradient toy simulation and plot the results.

    Observations are random 3-element integer vectors and each action has a
    fixed reward from ``action_rewards``. An episode ends once more than 100
    observations are stored in ``PG`` or after 120 seconds; ``PG.learn()`` is
    then called. When all episodes are done the cost history and the action
    distribution at ``PG.s0`` are plotted.
    """
    action_rewards = [10, 9, 1, 1, 1, 1, 1, 1, 1, 1]
    actions = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    observations = [[random.randint(0, i * 10) for i in range(1, 4)]
                    for j in range(1, 101)]
    # nums of items to recommend
    K = 2
    load_version = 1
    save_version = load_version + 1
    save_path = "output/weights/topk{}.ckpt".format(save_version)
    EPISODES = 5000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[random.randint(0, len(observations) - 1)],
                        learning_rate=0.005,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        tic = time.perf_counter()
        done = False
        while True:
            # TODO: initialize the env
            if RENDER_ENV:
                observation = observations[random.randint(
                    0, len(observations) - 1)]

            # 1. Choose an action based on observation
            # action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            observation_ = observations[random.randint(0, len(observations) - 1)]
            reward = action_rewards[action]

            # 4. Store transition for training
            PG.store_transition(observation, action, reward)

            elapsed_sec = time.perf_counter() - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)
                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)
                #print(PG.outputs_softmax)
                print("distribution at {} is :{}".format(
                    PG.s0, PG.get_distribution(PG.s0)))

                # 5. Train neural network
                PG.learn()
                break

            # Save new observation
            observation = observation_

    PG.plot_cost()
    plt.bar(actions, PG.get_distribution(PG.s0))
    plt.xlabel("action")  # x-axis label
    plt.ylabel("probability")  # y-axis label
    plt.title("top-k correction policy")  # figure title
    plt.show()
# Load checkpoint load_path = "outputs/weights/2048-v0.ckpt" save_path = "outputs/weights/2048-v0.ckpt" # In[ ]: if __name__ == "__main__": PG = PolicyGradient( n_x = env.observation_space.shape[0], n_y = env.action_space.n, learning_rate=0.025, reward_decay=0.5, epochs=2, load_path=load_path, save_path=save_path ) PG.quiet = QUIET for episode in range(EPISODES): observation = env.reset() episode_reward = 0 max_tile_value_so_far = 0 while True: if RENDER_ENV: env.render()
import gym
from policy_gradient import PolicyGradient
from config import get_config
import random
import pdb

# Command-line interface: the environment name is mandatory; the baseline is
# enabled by default and can be switched with --baseline / --no-baseline.
parser = argparse.ArgumentParser()
parser.add_argument('--env-name', required=True, type=str, choices=['cartpole', 'pendulum', 'cheetah'])
parser.add_argument('--baseline', dest='use_baseline', action='store_true')
parser.add_argument('--no-baseline', dest='use_baseline', action='store_false')
parser.add_argument('--seed', type=int, default=1)
parser.set_defaults(use_baseline=True)

if __name__ == '__main__':
    args = parser.parse_args()

    # Seed torch, NumPy and the stdlib RNG with the same value.
    torch.random.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    # The config supplies the gym environment id via ``config.env_name``.
    config = get_config(args.env_name, args.use_baseline, args.seed)
    env = gym.make(config.env_name)
    # train model
    model = PolicyGradient(env, config, args.seed)
    model.run()
state_dict['min_each_ingredient_per_slice'], state_dict['max_ingredients_per_slice']], )) return state.astype(np.float).ravel() if __name__ == "__main__": # Load checkpoint load_path = "./output/weights/pizza-temp.ckpt" save_path = "output/weights/pizza-temp.ckpt" PG = PolicyGradient( n_x = X_DIM, n_y = 5, learning_rate=0.01, reward_decay=0.95, load_path=load_path, save_path=save_path ) for batch in range(BATCHES): for p_game in range(P_GAMES): env = game.Game({'max_steps': 100}) episode_reward = 0 h = 5 l = 1 pizza_lines = ["TMMMTTT","MMMMTMM", "TTMTTMT", "TMMTMMM", "TTTTTTM", "TTTTTTM"] pizza_config = { 'pizza_lines': pizza_lines, 'r': R, 'c': C, 'l': l, 'h': h } state = env.init(pizza_config)[0] print("\nPIZZA CONFIG: ", pizza_config) print("\nSTATE: ", state)
def __init__(self):
    """Start the ROS node, wire up its interfaces and run the training loop.

    The call blocks until ROS shuts down. While ``self.stLearning`` is set,
    each pass of the outer loop runs one episode: the operator is prompted to
    place the object, the gripper is closed, and actions chosen by
    ``self.RL`` are executed until ``transition_reward`` reports ``done``;
    ``self.RL.learn()`` is then called and the running reward is appended to
    ``self.R``.
    """
    rospy.init_node('runPG', anonymous=True)

    if self.mode == 5:
        self.n_inputs = 4
    if self.mode == 8:
        self.n_inputs = 8

    self.RL = PolicyGradient(
        n_actions=self.n_outputs,
        n_features=self.n_inputs,
        learning_rate=0.02,
        reward_decay=0.99,
        load_saved_net=False,
        # output_graph=True,
    )

    rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
    rospy.Service('/RL/net', net_eval, self.EvalNet)
    rospy.Service('/RL/start_learning', Empty, self.start_learning)
    obs_srv = rospy.ServiceProxy('/RL/observation', observation)
    drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
    move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
    open_srv = rospy.ServiceProxy('/RL/OpenGripper', Empty)
    close_srv = rospy.ServiceProxy('/RL/CloseGripper', Empty)
    rospy.sleep(3)
    open_srv()

    episode_count = 0
    # Exponential moving average of the episode reward. It must be a local
    # initialised here: the previous ``'running_reward' not in globals()``
    # test is always true inside a method, so the average was never updated.
    running_reward = None
    rate = rospy.Rate(15)  # 15hz
    while not rospy.is_shutdown():
        if self.stLearning:
            ## Start episode ##
            episode_count += 1

            # Close gripper
            raw_input(
                "Place object between fingers and press Enter to close gripper..."
            )
            close_srv()
            while not self.gripper_closed:
                rate.sleep()
            raw_input("Remove table and press Enter to start episode...")

            # Get observation
            obs = np.array(obs_srv().state)
            self.VT = []
            while True:
                # Choose action
                action = self.RL.choose_action(obs)

                # Act
                suc = move_srv(self.A[action]).success
                rospy.sleep(0.05)
                rate.sleep()

                if suc:
                    # Get observation
                    obs_ = np.array(obs_srv().state)
                    fail = drop_srv().dropped  # Check if dropped - end of episode
                else:
                    # End episode if overload or angle limits reached
                    rospy.logerr(
                        '[RL] Failed to move gripper. Episode declared failed.'
                    )
                    fail = True
                    # No new observation was taken; reuse the current one so
                    # obs_ is defined even if the very first move fails.
                    obs_ = obs

                reward, done = self.transition_reward(obs_, fail)
                self.RL.store_transition(obs, action, reward)
                obs = obs_

                if done:
                    ep_rs_sum = sum(self.RL.ep_rs)
                    if running_reward is None:
                        running_reward = ep_rs_sum
                    else:
                        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                    print("*** episode: " + str(episode_count) +
                          ", episode reward: " + str(ep_rs_sum) +
                          ", running reward: " + str(int(running_reward)) +
                          " ***")
                    self.RL.learn()
                    self.R.append(running_reward)
                    self.possible_plot = True
                    break

                rate.sleep()
        elif self.possible_plot:
            self.plot_sav()
            self.possible_plot = False

        # Open gripper
        if self.gripper_closed:
            open_srv()
            rospy.sleep(0.2)

        # self.stLearning = False
        # print(obs_srv().state)
        # rospy.spin()
        rate.sleep()
DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold RENDER = True # rendering wastes time env = gym.make('CartPole-v0') env.unwrapped env.seed(1) print(env.action_space) print(env.observation_space) print(env.observation_space.low) print(env.observation_space.high) RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.99, # output_graph=True, ) for i_episode in range(1500): observation = env.reset() while True: if RENDER and i_episode>1000: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action)
class runPG():
    """ROS node that trains a policy-gradient agent to move a gripper.

    Constructing the class starts the node and runs the training loop; see
    ``__init__``.
    """

    n_inputs = 4
    n_outputs = 4  # right and left for each finger
    # n_outputs = 8 # right, left and stop for each finger

    net = 0
    X = 0
    # Indexed by the action chosen by the policy; the selected row is sent to
    # the MoveGripper service.
    A = np.array([[-1, -1], [-1, 1], [1, -1], [1, 1], [0, -1], [0, 1],
                  [-1, 0], [1, 0]])

    mode = 5
    reward_mode = 2

    # NOTE(review): class-level list, so it is shared by every instance.
    R = []

    gripper_closed = False
    stLearning = True
    possible_plot = False

    def __init__(self):
        """Start the ROS node, wire up its interfaces and run the training loop.

        The call blocks until ROS shuts down. While ``self.stLearning`` is
        set, each pass of the outer loop runs one episode and then calls
        ``self.RL.learn()``; the running reward is appended to ``self.R``.
        """
        rospy.init_node('runPG', anonymous=True)

        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.02,
            reward_decay=0.99,
            load_saved_net=False,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String,
                         self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        open_srv = rospy.ServiceProxy('/RL/OpenGripper', Empty)
        close_srv = rospy.ServiceProxy('/RL/CloseGripper', Empty)
        rospy.sleep(3)
        open_srv()

        episode_count = 0
        # Exponential moving average of the episode reward. It must be a
        # local initialised here: the previous
        # ``'running_reward' not in globals()`` test is always true inside a
        # method, so the average was never updated.
        running_reward = None
        rate = rospy.Rate(15)  # 15hz
        while not rospy.is_shutdown():
            if self.stLearning:
                ## Start episode ##
                episode_count += 1

                # Close gripper
                raw_input(
                    "Place object between fingers and press Enter to close gripper..."
                )
                close_srv()
                while not self.gripper_closed:
                    rate.sleep()
                raw_input("Remove table and press Enter to start episode...")

                # Get observation
                obs = np.array(obs_srv().state)
                self.VT = []
                while True:
                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr(
                            '[RL] Failed to move gripper. Episode declared failed.'
                        )
                        fail = True
                        # No new observation was taken; reuse the current one
                        # so obs_ is defined even if the first move fails.
                        obs_ = obs

                    reward, done = self.transition_reward(obs_, fail)
                    self.RL.store_transition(obs, action, reward)
                    obs = obs_

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)
                        if running_reward is None:
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                        print("*** episode: " + str(episode_count) +
                              ", episode reward: " + str(ep_rs_sum) +
                              ", running reward: " +
                              str(int(running_reward)) + " ***")
                        self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True
                        break

                    rate.sleep()
            elif self.possible_plot:
                self.plot_sav()
                self.possible_plot = False

            # Open gripper
            if self.gripper_closed:
                open_srv()
                rospy.sleep(0.2)

            # self.stLearning = False
            # print(obs_srv().state)
            # rospy.spin()
            rate.sleep()

    def plot_sav(self):
        """Plot the values collected in ``self.R`` against their index."""
        plt.plot(range(len(self.R)), self.R)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        """Service handler for '/RL/net'; always answers with action 0."""
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        """Record whether the gripper status message reads "closed"."""
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        """Service handler that toggles ``self.stLearning``."""
        self.stLearning = not self.stLearning
        return EmptyResponse()

    def transition_reward(self, obs, fail):
        """Return ``(reward, done)`` for one step under ``self.reward_mode``.

        Mode 1 rewards every non-failing step with 1 and a failure with 0.
        Mode 2 gives -1 per step and -3 on failure, and ends the episode with
        reward 5 once ``obs[0]`` exceeds 135 (after an operator prompt).
        ``done`` equals ``fail`` unless that goal is reached.
        """
        # Keep moving as much as possible
        if self.reward_mode == 1:
            if fail:
                reward = 0.
            else:
                reward = 1.
            done = fail

        # Get to a certain coordinate
        if self.reward_mode == 2:
            if fail:
                reward = -3.
            else:
                reward = -1.
            done = fail
            if obs[0] > 135.:
                raw_input('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True

        return reward, done
from policy_gradient import PolicyGradient import matplotlib.pyplot as plt import time DISPLAY_REWARD_THRESHOLD = 100 RENDER = False env = gym.make('CartPole-v0') env.seed(1) env = env.unwrapped n_actions = env.action_space.n n_features = env.observation_space.shape[0] RL = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=0.02, reward_decay=0.99) for i_episode in range(3000): observation = env.reset() # 车的位置,杆子的角度,车速,角度变化率 while True: if RENDER: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) RL.store_transition(observation, action, reward) if done:
# train(episode, rewardType=None): resets the TensorFlow default graph, builds
# a util.Action helper and a PolicyGradient for a 2-player / 4-piece Ludo game
# (load_path is None, so no checkpoint path is handed over; save_path is
# derived from rewardType), plays `episode` games with ludopy.Game, calls
# PG.learn(episode, 0, winner) and returns (winnerCount, save_path).
# When player 0 wins, every stored reward equal to -1000 is raised by 2000
# before learning.
# NOTE(review): the original indentation of this block is lost, so the exact
# nesting of the loops cannot be recovered here. As flattened, no `break`
# visibly leaves the `while True:` loop and the non-zero player never calls
# g.answer_observation() -- confirm against the original file.
def train(episode, rewardType=None): tf.reset_default_graph() number_of_players = 2 number_of_pieces = 4 # Load checkpoint load_version = 11 save_version = load_version + 1 #load_path = "output/weights/ludo/{}/ludo-v2.ckpt".format(load_version) load_path = None save_path = "/content/drive/My Drive/cse8673_project/output/weights/ludo/{}/ludo-v2.ckpt".format( rewardType) PG_dict = {} reward = -1000 act = util.Action(number_of_players, number_of_pieces, reward) PG = PolicyGradient( n_x=(number_of_players * number_of_pieces) + 5, #input layer size n_y=5, #ouput layer size learning_rate=0.02, reward_decay=0.99, load_path=load_path, save_path=save_path, player_num=0, rewardType=rewardType) EPISODES = episode ghost_players = list(reversed(range(0, 4)))[:-number_of_players] players = list(reversed(range(0, 4)))[-number_of_players:] winner = None winnerCount = defaultdict(int) for episode in range(EPISODES): if episode % 500 == 0: print("episode : ", episode) g = ludopy.Game(ghost_players=ghost_players,\ number_of_pieces=number_of_pieces) episode_reward = 0 there_is_a_winner = False winner = None count = 0 while True: count += 1 for i in range(number_of_players): if i == 0: (dice, move_pieces, player_pieces, enemy_pieces, player_is_a_winner, there_is_a_winner), player_i = g.get_observation() action, random = act.getAction(PG, enemy_pieces, player_pieces, move_pieces, dice) _, _, _, _, _, there_is_a_winner = g.answer_observation( action) else: action = act.getAction(move_pieces=move_pieces) if there_is_a_winner: winner = player_i winnerCount[player_i] += 1 break #this is where the agents are leanring if there_is_a_winner: if winner == 0: PG.episode_rewards = [ i + 2000 if i == -1000 else i for i in PG.episode_rewards ] discounted_episode_rewards_norm = PG.learn(episode, 0, winner) return winnerCount, save_path
RENDER_ENV = False EPISODES = 50000 rewards = [] RENDER_REWARD_MIN = 50 MAX_FRAMES = 1800 N_avg = 100 if __name__ == "__main__": # Load checkpoint load_path = None #"output/weights/CartPole-v0.ckpt" save_path = None #"output/weights/CartPole-v0-temp.ckpt" PG = PolicyGradient(n_x=env.observation_space.shape[0], n_y=9, learning_rate=0.01, reward_decay=0.995, load_path=load_path, save_path=save_path) past_n_rews = [] for episode in range(EPISODES + 1): observation = env.reset() episode_reward = 0 frame_counter = 0 while True: if RENDER_ENV: print("rendering while training") PG.run_simulation(MAX_FRAMES, env, True) # 1. Choose an action based on observation
env = gym.make('CartPole-v0') env = env.unwrapped # 取消限制 env.seed(1) # 普通的 Policy Gradient 方法, 回合的方差比较大, 所以选一个好点的随机种子 print(env.action_space) # 查看这个环境中可用的 action 有多少个 print(env.observation_space) # 查看这个环境中 state/observation 有多少个特征值 print(env.observation_space.high) # 查看 observation 最高取值 print(env.observation_space.low) # 查看 observation 最低取值 update_frequency = 5 # 更新频率,多少回合更新一次 total_episodes = 3000 # 总回合数 # 创建 PolicyGradient 对象 agent = PolicyGradient( lr=0.01, a_size=env.action_space.n, # 对 CartPole-v0 是 2, 两个 action,向左/向右 s_size=env.observation_space.shape[0], # 对 CartPole-v0 是 4 h_size=8) with tf.Session() as sess: # 初始化所有全局变量 sess.run(tf.global_variables_initializer()) # 总的奖励 total_reward = [] gradient_buffer = sess.run(tf.trainable_variables()) for index, grad in enumerate(gradient_buffer): gradient_buffer[index] = grad * 0 i = 0 # 第几回合
env = env.unwrapped print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) # s_dim, # a_dim, # learning_rate = 0.01, # reward_decay = 0.95, # output_graph = False RL = PolicyGradient( s_dim = env.observation_space.shape[0], a_dim = env.action_space.n, learning_rate = 0.02, reward_decay = 0.99, #output_graph = True ) for i_epsiode in range(3000): s = env.reset() while True: if RENDER: env.render() a = RL.choose_action(s) s_,r,done,info = env.step(a) RL.store_transition(s,a,r)
def simulation():
    """Run the two-state top-K policy-gradient simulation and plot the results.

    Rewards and next states are looked up in tables keyed by
    ``str(sum(observation)) + str(action_id)``. An episode ends once more
    than 100 observations are stored in ``PG`` or after 120 seconds;
    ``PG.learn()`` is then called. When all episodes are done the cost
    history and the action distribution at each of the two states are
    plotted.
    """
    # Earlier 3-state / 4-action tables, kept for reference:
    # action_rewards = {'11':4,'12':1,'13':1,'14':1,'21':1,'22':2,'23':3,'24':16,'31':1,'32':2,'33':3,'34':4}
    # observation_action_transfer = {'11':[2],'12':[2],'13':[2],'14':[2],'21':[3],'22':[3],'23':[3],'24':[3],
    #                                '31':[1],'32':[1],'33':[3],'34':[3]}
    # actions = [1,2,3,4]
    # observations = [[1],[2],[3]]
    action_rewards = {'11': 5, '12': 0, '13': 0, '14': 0, '15': 0, '16': 13,
                      '21': 10, '22': 0, '23': 0, '24': 0, '25': 0, '26': 8}
    observation_action_transfer = {
        '11': [1, 1], '12': [1, 1], '13': [1, 1], '14': [1, 1], '15': [1, 1], '16': [1, 1],
        '21': [1, 1], '22': [1, 1], '23': [1, 1], '24': [1, 1], '25': [1, 1], '26': [0, 1]}
    actions = [1, 2, 3, 4, 5, 6]
    observations = [[0, 1], [1, 1]]
    # nums of items to recommend
    K = 2
    load_version = 4
    save_version = load_version + 1
    save_path = "output/weights/topk{}.ckpt".format(save_version)
    EPISODES = 3000
    RENDER_ENV = True
    rewards = []

    PG = PolicyGradient(n_x=len(observations[0]),
                        n_y=len(actions),
                        s0=observations[-1],
                        learning_rate=0.001,
                        reward_decay=1,
                        load_path=None,
                        save_path=save_path,
                        weight_capping_c=2**3,
                        k=K,
                        b_distribution='uniform')

    for episode in range(EPISODES):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        tic = time.perf_counter()
        done = False
        while True:
            # TODO: initialize the env
            if RENDER_ENV:
                observation = PG.episode_observations[-1]
                #print(observation)

            # 1. Choose an action based on observation
            #action = PG.uniform_choose_action(observation)
            action = PG.choose_action(observation)

            # 2. Take action in the environment
            table_key = str(sum(observation)) + str(actions[action])
            observation_ = observation_action_transfer[table_key]
            reward = action_rewards[table_key]

            # 4. Store transition for training
            PG.store_transition(observation_, action, reward)
            #print(PG.episode_observations)
            #print(PG.episode_actions)
            #print(PG.episode_rewards)

            elapsed_sec = time.perf_counter() - tic
            if elapsed_sec > 120:
                done = True
            if len(PG.episode_observations) > 100:
                done = True

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)
                PG.cost_history.append(episode_rewards_sum)
                print("==========================================")
                print("Episode: ", episode)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)
                #print(PG.outputs_softmax)
                #print(PG.episode_rewards)

                # 5. Train neural network
                print("distribution at {} is :{}".format(
                    observations[0], PG.get_distribution(observations[0])))
                print("distribution at {} is :{}".format(
                    observations[1], PG.get_distribution(observations[1])))
                PG.learn()
                break

            # Save new observation
            observation = observation_

    PG.plot_cost()

    plt.bar(actions, PG.get_distribution(observations[0]))
    plt.xlabel("action at state[0,1]")  # x-axis label
    plt.ylabel("probability")  # y-axis label
    plt.title("policy distribution at state[0,1]")  # figure title
    plt.show()

    plt.bar(actions, PG.get_distribution(observations[1]))
    plt.xlabel("action at state[1,1]")  # x-axis label
    plt.ylabel("probability")  # y-axis label
    plt.title("policy distribution at state[1,1]")  # figure title
    plt.show()
def play(self, policyPlayers, randomPlayers, load_path, save_path, episodes,
         episodeStart, training, ghost_players, model2keep, n_x=125, n_y=5,
         learning_rate=0.02, reward_decay=0.99, player_num=0,
         number_of_players=2, number_of_pieces=4, reward=-1000,
         rewardType="monte", inputBoardType="fullBoard"):
    """Play Ludo games between policy-driven and random players.

    Episodes run from ``episodeStart + 1`` up to (not including)
    ``episodeStart + episodes``. Players listed in ``policyPlayers`` choose
    moves through a shared ``PolicyGradient``; players in ``randomPlayers``
    use ``act.action`` without it. When ``training`` is true the private
    ``__train`` step runs after every finished game.

    Returns:
        The win counter. It is reset (and a video saved) on every episode
        number divisible by 1000, so it only covers games since that reset.
    """
    playerPool = policyPlayers + randomPlayers
    data = {player: StoreTrainingData(n_y) for player in policyPlayers}
    act = Action(reward)
    PG = PolicyGradient(
        n_x=n_x,  # input layer size
        n_y=n_y,  # output layer size
        learning_rate=learning_rate,
        reward_decay=reward_decay,
        load_path=load_path,
        save_path=save_path,
        player_num=player_num,
        rewardType=rewardType,
        toKeep=model2keep)

    timeInterval = 50
    winCount = defaultdict(int)
    startTime = time.time()
    for episode in range(episodeStart + 1, episodeStart + episodes):
        g = ludopy.Game(ghost_players=ghost_players,
                        number_of_pieces=number_of_pieces)
        while True:
            obs, currPlayer = g.get_observation()
            state = State(obs, currPlayer)

            action = None
            if currPlayer in policyPlayers and len(state.actions()) > 0:
                action = act.action(self, state, n_y, playerPool, currPlayer,
                                    data[currPlayer], PG, training)
            elif currPlayer in randomPlayers:
                action = act.action(self, state, n_y)

            _, _, _, _, _, there_is_a_winner = g.answer_observation(action)

            # Progress message roughly every 50 seconds of wall time.
            if int(time.time() - startTime) > timeInterval:
                print("episode: {} running for {}".format(
                    episode, time.time() - startTime))
                timeInterval += 50

            if not there_is_a_winner:
                continue

            winCount[currPlayer] += 1
            if episode % 1000 == 0:
                print("wincount: {}".format(winCount))
                print("time take for this epoch is {}".format(
                    time.time() - startTime))
                startTime = time.time()
                timeInterval = 50
                winCount = defaultdict(int)
                g.save_hist_video("videos/gameabc{}.avi".format(episode))

            if training:
                try:
                    self.__train(PG, data, episode, currPlayer)
                except Exception:
                    # Training stays best-effort, but the cause is no longer
                    # hidden and KeyboardInterrupt/SystemExit still propagate.
                    import traceback
                    g.save_hist_video("error.avi")
                    print("-----------------error------------------------")
                    traceback.print_exc()
            break
    return winCount
def __init__(self):
    """Start the ROS node, wire up its interfaces and run the training loop.

    The call blocks until ROS shuts down or more than ``self.max_episodes``
    episodes have been started. While ``self.stLearning`` is set, each pass
    of the outer loop resets the gripper and runs one episode of at most
    ``self.max_steps`` steps (plus the step that exceeds the limit); then
    ``self.RL.learn()`` is called and the running reward is appended to
    ``self.R``.
    """
    rospy.init_node('runPG', anonymous=True)

    if self.mode == 5:
        self.n_inputs = 4
    if self.mode == 8:
        self.n_inputs = 8

    self.RL = PolicyGradient(
        n_actions=self.n_outputs,
        n_features=self.n_inputs,
        learning_rate=0.001,
        reward_decay=0.98,
        load_saved_net=True,
        # output_graph=True,
    )

    rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
    rospy.Service('/RL/net', net_eval, self.EvalNet)
    rospy.Service('/RL/start_learning', Empty, self.start_learning)
    obs_srv = rospy.ServiceProxy('/RL/observation', observation)
    drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
    move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
    reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
    pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10)

    gg = Float32MultiArray()
    gg.data = self.g

    episode_count = 0
    # Exponential moving average of the episode reward. It must be a local
    # initialised here: the previous ``'running_reward' not in globals()``
    # test is always true inside a method, so the average was never updated.
    running_reward = None
    rate = rospy.Rate(100)  # 100hz
    while not rospy.is_shutdown():
        if self.stLearning:
            ## Start episode ##
            episode_count += 1
            self.prev_dis2goal = 1e9

            # Set gripper
            reset_srv()
            while not self.gripper_closed:
                rate.sleep()

            # Get observation
            obs = np.array(obs_srv().state)
            self.VT = []
            step = 0
            while True:
                step += 1
                print('[RL] Step %d in episode %d, distance to goal: %f.' %
                      (step, episode_count, self.prev_dis2goal))
                pub_goal.publish(gg)

                # Choose action
                action = self.RL.choose_action(obs)

                # Act
                suc = move_srv(self.A[action]).success
                rospy.sleep(0.05)
                rate.sleep()

                if suc:
                    # Get observation
                    obs_ = np.array(obs_srv().state)
                    fail = drop_srv().dropped  # Check if dropped - end of episode
                else:
                    # End episode if overload or angle limits reached
                    rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                    fail = True
                    # No new observation was taken; reuse the current one so
                    # obs_ is defined even if the very first move fails.
                    obs_ = obs

                reward, done = self.transition_reward(obs_, fail)
                self.RL.store_transition(obs, action, reward)
                obs = obs_

                if step > self.max_steps:
                    done = True

                if done:
                    ep_rs_sum = sum(self.RL.ep_rs)
                    if running_reward is None:
                        running_reward = ep_rs_sum
                    else:
                        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                    print("*** episode: " + str(episode_count) +
                          ", episode reward: " + str(ep_rs_sum) +
                          ", running reward: " + str(int(running_reward)) +
                          " ***")
                    self.RL.learn()
                    self.R.append(running_reward)
                    self.possible_plot = True
                    break

                rate.sleep()
        elif self.possible_plot:
            self.plot_sav()
            self.possible_plot = False

        if self.max_episodes < episode_count:
            self.plot_sav()
            break

        rate.sleep()
train_start, train_end) print("history file name : %s" % history_filename) codeMap = {} f = codecs.open(codeListFilename, "r", "utf-8") for line in f: if line.strip() != "": tokens = line.strip().split( ",") if not "\t" in line else line.strip().split("\t") codeMap[tokens[0]] = tokens[1] f.close() env = MarketEnv(dir_path="./data/", codes=list(codeMap.keys()), target_date_start=train_start, target_date_end=train_end, sudden_death=-1.0) env_test = MarketEnv(dir_path="./data/", codes=list(codeMap.keys()), target_date_start=test_start, target_date_end=test_end, sudden_death=-1.0) pg = PolicyGradient(env, env_test, discount=0.9, model_filename=model_filename, history_filename=history_filename) pg.train(verbose=0, max_episode=max_episode)
def train(self, max_episode=10, max_path_length=200, verbose=0):
    """Run ``max_episode`` episodes on ``self.env`` with a PolicyGradient agent.

    For every episode the environment is reset, a new ``PolicyGradient``
    object is created, actions are sampled until the environment reports
    ``game_over``, and ``RL.learn()`` is called once at the end. Progress is
    printed, and appended to ``self.history_filename`` when that is set.

    Args:
        max_episode: number of episodes to run.
        max_path_length: accepted but not used in this method body.
        verbose: values > 0 enable per-step and per-sample printing;
            values > 1 additionally print each one-hot action vector.

    Returns:
        None (there is no return statement).

    NOTE(review): the nesting below was reconstructed from source text that
    had lost its indentation - confirm it against the intended control flow.
    """
    env = self.env
    avg_reward_sum = 0.
    #f_eps = open("episode.csv","w")
    #write_eps = csv.write(f_eps)
    for e in range(max_episode):
        # NOTE(review): the environment is reset twice; only the second
        # result is used as the initial observation.
        env._reset()
        observation = env._reset()
        game_over = False
        reward_sum = 0

        # Per-episode buffers: observations, one-hot actions, action
        # probabilities and (cumulative) rewards.
        inputs = []
        outputs = []
        predicteds = []
        rewards = []
        #f_iter = open("episode_{0}.csv".format(e),"w")
        #write_iter = csv.writer(f_iter)
        f_episode = "episode_{0}.csv".format(e)
        # Remove any per-episode log left over from an earlier run.
        os.system("rm -rf {0}".format(f_episode))
        print(observation[0].shape, observation[1].shape)

        # A fresh learner is built for every episode.
        # NOTE(review): whether learned weights persist across these
        # instances depends on PolicyGradient - verify.
        RL = PolicyGradient(
            n_actions=self.env.action_space.n,
            # n_features=observation.shape[0],
            learning_rate=0.02,
            reward_decay=0.995,
            # output_graph=True,
        )

        while not game_over:
            action, aprob = RL.choose_action(observation)
            inputs.append(observation)
            predicteds.append(aprob)

            # One-hot encoding of the chosen action.
            y = np.zeros([self.env.action_space.n])
            y[action] = 1.
            outputs.append(y)

            observation, reward, actual_reward, game_over, info = self.env._step(
                action)
            reward_sum += float(actual_reward)
            #rewards.append(float(reward))
            # Stores the running total of actual_reward, not the step reward.
            rewards.append(float(reward_sum))
            # NOTE(review): this passes the post-step observation and the
            # whole `rewards` list - confirm this matches what
            # PolicyGradient.store_transition expects.
            RL.store_transition(observation, action, rewards)

            # check memory for RNN model
            # Drop the oldest sample once more than self.max_memory are held.
            if len(inputs) > self.max_memory:
                del inputs[0]
                del outputs[0]
                del predicteds[0]
                del rewards[0]

            if verbose > 0:
                if env.actions[action] == "LONG" or env.actions[
                        action] == "SHORT":
                    #if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
                    color = bcolors.FAIL if env.actions[
                        action] == "LONG" else bcolors.OKBLUE
                    print("%s:\t%s\t%.2f\t%.2f\t" %
                          (info["dt"], color + env.actions[action] +
                           bcolors.ENDC, reward_sum, info["cum"]) + ("\t".join([
                               "%s:%.2f" % (l, i)
                               for l, i in zip(env.actions, aprob.tolist())
                           ])))
                    #write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
                    # Append the same line (without colour codes) to the
                    # per-episode file through the shell.
                    # NOTE(review): the nesting level of this call is a
                    # guess; the text is also passed to the shell unquoted.
                    os.system("echo %s >> %s" %
                              ("%s:\t%s\t%.2f\t%.2f\t" %
                               (info["dt"], env.actions[action], reward_sum,
                                info["cum"]) +
                               ("\t".join([
                                   "%s:%.2f" % (l, i)
                                   for l, i in zip(env.actions, aprob.tolist())
                               ])), f_episode))

        # Smoothed average of the episode reward sums (weights 0.99 / 0.01).
        avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
        toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
            e, info["code"],
            (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
            ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"], avg_reward_sum)
        print(toPrint)
        if self.history_filename != None:
            os.system("echo %s >> %s" % (toPrint, self.history_filename))

        discounted_rewards_ = RL.learn()

        # train
        # Regroup the stored observations block by block: inputs_[i] collects
        # block[0] of the i-th block of every stored observation.
        dim = len(inputs[0])
        inputs_ = [[] for i in range(dim)]
        for obs in inputs:
            for i, block in enumerate(obs):
                inputs_[i].append(block[0])
        inputs_ = [np.array(inputs_[i]) for i in range(dim)]

        outputs_ = np.vstack(outputs)
        predicteds_ = np.vstack(predicteds)
        rewards_ = np.vstack(rewards)
        print("shape: ", np.shape(rewards), np.shape(discounted_rewards_))

        #outputs_ *= discounted_rewards_
        # The loop below only prints; it does not modify outputs_.
        for i, r in enumerate(zip(rewards, discounted_rewards_)):
            reward, discounted_reward = r
            if verbose > 1:
                # print (outputs_[i],)
                print(outputs_[i], )
            if verbose > 0:
                print(predicteds_[i], outputs_[i], reward, discounted_reward)

        print("fit model input.shape %s, output.shape %s" %
              ([inputs_[i].shape
                for i in range(len(inputs_))], outputs_.shape))
        np.set_printoptions(linewidth=200, suppress=True)
        print("currentTargetIndex:", env.currentTargetIndex)
# Script fragment: prints the observation-space bounds, defines the run
# constants, builds the PolicyGradient agent and starts the episode loop.
# The rest of the loop body is not part of this excerpt.
print("env.observation_space.high", env.observation_space.high)
print("env.observation_space.low", env.observation_space.low)

RENDER_ENV = False  # game window
EPISODES = 500  # maximum number of iterations
rewards = []  # reward
RENDER_REWARD_MIN = 50  # minimum reward for rendering the game

if __name__ == "__main__":
    # Checkpoint loading and saving are both disabled (paths are None).
    load_path = None  #"output/weights/CartPole-v0.ckpt"
    save_path = None  #"output/weights/CartPole-v0-temp.ckpt"

    # Input and output sizes are taken from the environment's spaces.
    PG = PolicyGradient(n_x=env.observation_space.shape[0],
                        n_y=env.action_space.n,
                        learning_rate=0.01,
                        reward_decay=0.95,
                        load_path=load_path,
                        save_path=save_path)

    for episode in range(EPISODES):  # start of learning
        observation = env.reset()
        episode_reward = 0
        while True:
            if RENDER_ENV:
                env.render()
            # 1. Choose an action based on observation
            action = PG.choose_action(observation)
            # 2. Take action in the environment
# Clear the data saved by the previous run. Mode "w" empties an existing
# file and creates it when it is missing, so a first run (no loss.txt yet)
# no longer fails the way opening with "r+" did.
with open('loss.txt', "w"):
    pass

if __name__ == "__main__":
    # Load checkpoint
    load_path = None
    save_path = None

    # Network sizes are derived from the per-queue size and the numbers of
    # eNBs and channels defined earlier in the script.
    PG = PolicyGradient(n_x=sizeperq * nOfenb * nOfchannel + nOfenb * nOfchannel,
                        n_y=nOfchannel * nOfenb,
                        learning_rate=0.005,
                        reward_decay=0.9,
                        load_path=load_path,
                        save_path=save_path,
                        ep=0.99,
                        nOfChannel=nOfchannel)

    env = ns3env.Ns3Env(port=port,
                        startSim=startSim,
                        simSeed=seed,
                        simArgs=simArgs,
                        debug=debug)
    env.reset()

    ob_space = env.observation_space
    ac_space = env.action_space
class runPG():
    """ROS node that trains a PolicyGradient agent to drive the gripper.

    Constructing the object runs the complete training loop (see
    ``__init__``); the remaining methods are ROS callbacks and helpers.
    """

    n_inputs = 4
    # n_outputs = 4 # right and left for each finger
    n_outputs = 8  # right, left and stop for each finger
    max_episodes = 1200
    max_steps = 2500

    net = 0
    X = 0
    # One row per discrete action; the row is sent to /RL/MoveGripper.
    A = np.array([[-1, -1], [1, -1], [-1, 1], [1, 1], [0, -1], [0, 1], [-1, 0], [1, 0]])

    mode = 5
    reward_mode = 3

    R = []  # running reward per finished episode (rebound per instance in __init__)
    g = np.array([-35.0, 104.0], dtype='f')  # Goal

    gripper_closed = False
    stLearning = True  # Enable learning
    possible_plot = False

    # For reward mode 3
    prev_dis2goal = 1e9

    def __init__(self):
        """Start the ROS node and run the training loop to completion.

        Creates the PolicyGradient learner, registers the subscriber and
        services, opens the service proxies and then loops until ROS is shut
        down or more than ``max_episodes`` episodes have been started. The
        constructor does not return while training is in progress.
        """
        rospy.init_node('runPG', anonymous=True)

        # Give each instance its own history instead of appending to the
        # list shared through the class attribute.
        self.R = []

        # The number of network inputs follows the selected mode.
        if self.mode == 5:
            self.n_inputs = 4
        if self.mode == 8:
            self.n_inputs = 8

        self.RL = PolicyGradient(
            n_actions=self.n_outputs,
            n_features=self.n_inputs,
            learning_rate=0.001,
            reward_decay=0.98,
            load_saved_net=True,
            # output_graph=True,
        )

        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        rospy.Service('/RL/net', net_eval, self.EvalNet)
        rospy.Service('/RL/start_learning', Empty, self.start_learning)
        obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        pub_goal = rospy.Publisher('/RL/Goal', Float32MultiArray, queue_size=10)

        gg = Float32MultiArray()
        gg.data = self.g

        episode_count = 0
        # Smoothed episode return. It is kept in a local that outlives the
        # episodes: the previous `'running_reward' not in globals()` test
        # was always true inside a method, so the value was reset every
        # episode and the 0.99/0.01 smoothing never took effect.
        running_reward = None

        rate = rospy.Rate(100)  # 100hz
        while not rospy.is_shutdown():

            if self.stLearning:
                ## Start episode ##
                episode_count += 1
                self.prev_dis2goal = 1e9

                # Set gripper
                reset_srv()
                while not self.gripper_closed:
                    rate.sleep()

                # Get observation
                obs = np.array(obs_srv().state)
                self.VT = []

                step = 0
                while True:
                    step += 1
                    print('[RL] Step %d in episode %d, distance to goal: %f.' % (step, episode_count, self.prev_dis2goal))
                    pub_goal.publish(gg)

                    # Choose action
                    action = self.RL.choose_action(obs)

                    # Act
                    suc = move_srv(self.A[action]).success
                    rospy.sleep(0.05)
                    rate.sleep()

                    if suc:
                        # Get observation
                        obs_ = np.array(obs_srv().state)
                        fail = drop_srv().dropped  # Check if dropped - end of episode
                    else:
                        # End episode if overload or angle limits reached
                        rospy.logerr('[RL] Failed to move gripper. Episode declared failed.')
                        fail = True
                        # No new observation was taken; reuse the current
                        # one so `obs_` is defined even when the very first
                        # move of an episode fails.
                        obs_ = obs

                    reward, done = self.transition_reward(obs_, fail)
                    self.RL.store_transition(obs, action, reward)
                    obs = obs_

                    if step > self.max_steps:
                        done = True

                    if done:
                        ep_rs_sum = sum(self.RL.ep_rs)
                        if running_reward is None:
                            running_reward = ep_rs_sum
                        else:
                            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
                        print("*** episode: " + str(episode_count) + ", episode reward: " + str(ep_rs_sum) + ", running reward: " + str(int(running_reward)) + " ***")

                        self.RL.learn()
                        self.R.append(running_reward)
                        self.possible_plot = True
                        break

                    rate.sleep()

            elif self.possible_plot:
                # Learning is paused and new data exists: plot it once.
                self.plot_sav()
                self.possible_plot = False

            if self.max_episodes < episode_count:
                self.plot_sav()
                break

            rate.sleep()

    def plot_sav(self):
        """Plot the values stored in ``self.R`` against their index."""
        plt.plot(range(len(self.R)), self.R)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()

    def EvalNet(self, msg):
        """Handler for the /RL/net service; always answers with action 0."""
        a = 0
        return {'action': a}

    def callbackGripperStatus(self, msg):
        """Record whether the gripper status message reads "closed"."""
        self.gripper_closed = msg.data == "closed"

    def start_learning(self, msg):
        """Handler for /RL/start_learning: toggle the learning flag."""
        self.stLearning = not self.stLearning
        return EmptyResponse()

    def transition_reward(self, obs, fail):
        """Return ``(reward, done)`` for one transition under ``reward_mode``.

        Args:
            obs: observation after the action; ``obs[0]`` is read in mode 2
                and ``obs[:2]`` in mode 3.
            fail: True when the step failed (object dropped or move refused).

        Returns:
            Tuple ``(reward, done)``.

        Raises:
            ValueError: if ``self.reward_mode`` is not 1, 2 or 3 (this
                previously surfaced as an UnboundLocalError).

        Side effect: mode 3 stores the new goal distance in
        ``self.prev_dis2goal``.
        """
        if self.reward_mode == 1:
            # Keep moving as much as possible
            reward = 0. if fail else 1.
            done = fail
        elif self.reward_mode == 2:
            # Cross a line
            reward = -3. if fail else -1.
            done = fail
            if obs[0] > 40.:
                print('Reached goal, x = %f.' % obs[0])
                reward = 5.
                done = True
        elif self.reward_mode == 3:
            # Get to a certain coordinate
            d = np.linalg.norm(self.g - obs[:2])
            # Reward only steps that did not move away from the goal.
            if fail or d > self.prev_dis2goal:
                reward = 0.
            else:
                reward = 1.
            done = fail
            if d < 5:
                print('Reached goal, (x,y) = (%f,%f).' % (obs[0], obs[1]))
                reward = 50.
                done = True
            self.prev_dis2goal = d
        else:
            raise ValueError('Unknown reward_mode: %r' % (self.reward_mode,))

        return reward, done
env.reset() env = env.unwrapped # Policy gradient has high variance, seed for reproducibility env.seed(1) print("env.action_space", env.action_space.n) print("env.observation_space", env.observation_space.shape[0]) print("env.observation_space.high", env.observation_space.high) print("env.observation_space.low", env.observation_space.low) RENDER_FLAG = False EPISODES = 500 # 收集500条序列 MAX_STEP = 1500 # 每条序列最多1500步 rewards = [] # 记录每条序列回报的list if __name__ == "__main__": PG = PolicyGradient(n_input=env.observation_space.shape[0], n_output=env.action_space.n) for episode in range(EPISODES): s = env.reset() for i in range(MAX_STEP): if RENDER_FLAG: env.render() # 与环境交互 action = PG.choose_action(s) s_, reward, done, _ = env.step(action) PG.store_transition(s, action, reward) # 如果杆倒了或超出屏幕 if done: ep_rewards_sum = np.sum(PG.ep_rewards) if ep_rewards_sum > 1000: RENDER_FLAG = True else: