def main(argv):
    """Create two PolicyGradient agents from one shared set of hyperparameters.

    Args:
        argv: Accepted for the entry-point signature; the statements in this
            body do not read it.

    The two agents are bound to the local names ``RL`` and ``RLL``.
    """
    # Both agents are built with exactly the same constructor arguments, so
    # the keyword set is spelled out once and unpacked twice.
    agent_kwargs = dict(
        n_actions=9,
        n_features=8,
        learning_rate=0.1,
        reward_decay=0.5,
        # output_graph=True,
    )
    RL = PolicyGradient(**agent_kwargs)
    RLL = PolicyGradient(**agent_kwargs)
import gym import numpy as np from RL_brain import PolicyGradient import matplotlib.pyplot as plt env = gym.make('SpaceX-v0') env.seed(1) # reproducible, general Policy gradient has high variance env = env.unwrapped RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.001, reward_decay=0.99, load_path=".\\network.nt", # output_graph=True, ) ep_rs_hist = [] for i_episode in range(100): observation = env.reset() reward_hist = [] while True: #env.render() action = RL.test_action(observation) observation, reward, done, info = env.step(action)
DISPLAY_REWARD_THRESHOLD = 20 # renders environment if total episode reward is greater then this threshold RENDER = False # rendering wastes time import gym_tictactoe env = gym.make('TicTacToe-v2', symbols=[-1, 1], board_size=3, win_size=3) env.seed(1) # reproducible, general Policy gradient has high variance print(env.action_space) print(env.state_space) print(env.state_space.high) print(env.state_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.state_space.shape[0], # learning_rate=0.002, reward_decay=0.9, # output_graph=True, ) print("n_features=", RL.n_features) i_episode = 0 # for i_episode in range(60000): while True: i_episode += 1 state = env.reset() done = False user = 0 reward1 = reward2 = 0
#has_attack = False while True: # RL choose action based on observation action = RL.choose_action(observation=observation) # RL take action and get next observation and reward valid, state, actions, has_weapon, in_gas = next( state, action) # 状态转移 observation_ = np.hstack((state.flatten(), actions)) done = 0 if action != 8 and valid else 1 # 本轮是否终止 reward = get_reward(state, valid, action, has_weapon, in_gas) RL.store_transition(observation_, action, reward) player_pos = np.argwhere(state[0, :, :, 1] == 1)[0] moves.append(player_pos) if done: RL.learn() step += 1 #if step % 1000 == 0: # print('step:' + str(step)) break # swap observation observation = observation_ #eml.update_player_pos(player_pos) eml.next(list(map(lambda x: (x[0], x[1]), moves))) if __name__ == "__main__": RL = PolicyGradient(n_actions=9, n_features=144 * 3 * 4 + 8, learning_rate=1e-8) run_game()
from RL_brain import PolicyGradient import matplotlib.pyplot as plt import numpy as np actions=['fold','call','raise'] RL = PolicyGradient( n_actions=len(actions) n_features=len(actions) learning_rate=0.02, reward_decay=0.995, # output_graph=True, ) for i_episode in range(1000): while True: action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) # reward = -1 in all cases RL.store_transition(observation, action, reward) if done: # calculate running reward ep_rs_sum = sum(RL.ep_rs) if 'running_reward' not in globals(): running_reward = ep_rs_sum else:
DISPLAY_REWARD_THRESHOLD = 2.5 # renders environment if total episode reward is greater than this threshold RENDER = False # rendering wastes time env = gym.make('SpaceX-v0') env.seed(1) # reproducible, general Policy gradient has high variance env = env.unwrapped print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.001, reward_decay=0.99, save_path=".\\network.nt", #output_graph=True, ) for i_episode in range(2000): observation = env.reset() #observation=[x,x_dot] target = env.x_board, env.x_board_dot while True: # if i_episode >450 : RENDER = True if RENDER: env.render() if i_episode < I_TEACH: action = 1 if (observation[1] - target[1]) * (
# Hyperparameters for the graph built below.
batch_size = 100
time_step = 11
rnn_size = 200
learning_rate = 0.001
epoch = 200
n_actions = 10


def _load_pickled_array(path):
    """Unpickle the file at *path* and return its content as a NumPy array.

    The file is opened in binary mode inside a ``with`` block so the handle
    is closed before this function returns.
    """
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at trusted, locally produced files.
    with open(path, 'rb') as fh:
        return np.array(pickle.load(fh))


train_set = _load_pickled_array('num_train10_1.pkl')
test_set = _load_pickled_array('num_test10_1.pkl')

# Integer inputs of shape [batch_size, time_step], one-hot encoded with
# depth 10, and one integer label per batch row.
inputs = tf.placeholder(dtype=tf.int32, shape=[batch_size, time_step])
inputs_ = tf.one_hot(inputs, 10)
labels = tf.placeholder(dtype=tf.int32, shape=[batch_size])

RL = PolicyGradient(
    n_actions=n_actions,
    n_features=10,
)

# input transfer to lstm form: unstack along axis 1 so static_rnn receives a
# Python list of tensors.
inputs_ = tf.unstack(inputs_, axis=1)
lstm_cell = LSTMCell(rnn_size, RL)
outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell, inputs=inputs_, dtype=tf.float32)

# (time-5, batch, 1) => (batch, time-5)
actions = tf.transpose(tf.squeeze(RL.actions))
# (time-5, batch, n_actions) => (batch, time-5, n_actions)
all_act_prob = tf.transpose(RL.all_act_prob, perm=(1, 0, 2))

output = outputs[-1]  # shape (batch_size, rnn_size)
The mountain car example """ import gym from RL_brain import PolicyGradient env = gym.make('MountainCar-v0') print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=len(env.observation_space.high), learning_rate=0.01, reward_decay=0.99, output_graph=False, ) total_steps = 0 for i_episode in range(10): observation = env.reset() while True: env.render() action = RL.choose_action(observation)
env.seed(1) env=env.unwrapped # 显示可用的action print(env.action_space) #显示可用state的observation print(env.observation_space) # 显示state的最高值 print(env.observation_space.high) # 显示state的最低值 print(env.observation_space.low) RL=PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.05, reward_decay=0.995 #ouput_graph=True #输出tensorboard文件 ) # 在计算机跑完一整个回合后才更新一次 for i_episode in range(3000): observation=env.reset() while True: if render: env.render() #观察值就是神经网络的输入 #observation是[-0.43852191 0. ] #action=1 action=RL.choose_action(observation) #observation_:[-0.43915308 -0.00063117]
state[9] = (state[9] - 0 ) / 180 #player theta #state[10] = (state[10] - b ) * m #elevator effect #state[11] = (state[11] - b ) * m #rudder effect #state[12] = (state[12] - b ) * m #roll effect state[13] = (state[13] - 0 ) / 100 #enemy x state[14] = (state[14]-25) / 38 #enemy y state[15] = (state[15] - 0 ) / 100 #enemy z state[16] = (state[16] - 0.333 ) * 3 #enemy speed state[17] = (state[17] - 180 ) / 180 #enemy phi state[18] = (state[18] - 180 ) / 180 #enemy gamma state[19] = (state[19] - 0 ) / 180 #enemy theta RL = PolicyGradient( n_actions=108, n_features=10, learning_rate=0.001, reward_decay=0.99, output_graph=True, ) #actionList = ["11111","11110","11101","11100","11011","11010","11001","11000","11211","11210","11201","11200","10111","10110","10101","10100","10011","10010","10001","10000","10211","10210","10201","10200","12111","12110","12101","12100","12011","12010","12001","12000","12211","12210","12201","12200","01111","01110","01101","01100","01011","01010","01001","01000","01211","01210","01201","01200","00111","00110","00101","00100","00011","00010","00001","00000","00211","00210","00201","00200","02111","02110","02101","02100","02011","02010","02001","02000","02211","02210","02201","02200","21111","21110","21101","21100","21011","21010","21001","21000","21211","21210","21201","21200","20111","20110","20101","20100","20011","20010","20001","20000","20211","20210","20201","20200","22111","22110","22101","22100","22011","22010","22001","22000","22211","22210","22201","22200"] context = zmq.Context() socket = context.socket(zmq.REP) socket.bind("tcp://*:5555") waitasec = 0 for i_episode in range(10000): message = socket.recv() #print("Received request: %s" % message)
MAX_EPISODES = 3000 MAX_EP_STEPS = 160 height = 8 ag_num = 5 env = env1.Lift(ag_num, height) #gym.make(ENV_NAME) agents = [] agents_pg = [] #for i in range(ag_num): # agents.append(RL_agent(height,i)) for i in range(ag_num): RL = PolicyGradient( n_actions=3, #env.action_space.n, n_features=4 * height + 1, #env.observation_space.shape[0], learning_rate=0.004, reward_decay=0.9995, id=i, # output_graph=True, ) agents_pg.append(RL) def run_ddpg(): t1 = time.time() for i in range(MAX_EPISODES): s = env.reset() #s = np.array(s[0]) ep_reward = np.array([0] * ag_num) for j in range(MAX_EP_STEPS): acts = []
RENDER = True # rendering wastes time current_max = 100 env = gym.make('LunarLander-v2') env.seed(1) # reproducible, general Policy gradient has high variance env = env.unwrapped print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.005, reward_decay=0.99 # output_graph=True, ) RL.restore_model() for i_episode in range(5000): observation = env.reset() t = 0 episode_reward = 0 while True: if RENDER: env.render()
The cart pole example """ import gym from RL_brain import PolicyGradient env = gym.make('CartPole-v0') print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=len(env.observation_space.high), learning_rate=0.01, reward_decay=0.99, # output_graph=True, ) for i_episode in range(10000): observation = env.reset() while True: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action)
def inference_graph(word_vocab_size=10000,  # configuration of medium
                    batch_size=20,
                    num_rnn_layers=2,
                    rnn_size=650,
                    num_unroll_steps=35,
                    n_actions=5,
                    dropout=0.0,
                    lamda=0.5
                    ):
    """Build the inference graph: embedding, two RNN stages, word projection.

    Args:
        word_vocab_size: Row count of the embedding table and the size passed
            to ``linear`` for every projected output.
        batch_size: First dimension of the input placeholder and of the
            RNN zero states.
        num_rnn_layers: Number of dropout-wrapped ``BasicLSTMCell`` layers
            stacked in the first RNN stage.
        rnn_size: Embedding width and unit count of every LSTM cell.
        num_unroll_steps: Second dimension of the input placeholder; the
            embedded input is split into this many per-step tensors.
        n_actions: Forwarded to ``PolicyGradient``.
        dropout: When non-zero, dropout with keep probability
            ``1 - dropout`` is applied to the embedded input. The same keep
            probability is always used as ``output_keep_prob`` of the
            dropout wrappers.
        lamda: Forwarded to the project ``LSTMCell`` of the second stage
            (its meaning is not visible from this function).

    Returns:
        An ``adict`` with the keys ``input``, ``input_embedded``,
        ``initial_rnn_state1``, ``initial_rnn_state2``, ``final_rnn_state1``,
        ``final_rnn_state2``, ``rnn_outputs``, ``logits``, ``all_act_prob``
        and ``actions``.
    """
    # NOTE(review): the variable-scope nesting below was reconstructed from a
    # whitespace-collapsed source; confirm it against the original layout,
    # since it determines TensorFlow variable names.
    input_word = tf.placeholder(
        tf.int32,
        shape=[batch_size, num_unroll_steps],
        name="input")

    # First, embed characters
    with tf.variable_scope('Embedding'):
        embedding = tf.get_variable(
            "word_embedding",
            [word_vocab_size, rnn_size],
            dtype=tf.float32)
        input_embedded = tf.nn.embedding_lookup(embedding, input_word)
        if dropout != 0:
            input_embedded = tf.nn.dropout(input_embedded, 1. - dropout)

        # The (disabled) op below clears the embedding vector of the first
        # symbol (position 0, by convention the padding symbol). It can be
        # used to mimic the Torch7 embedding operator, which keeps padding
        # mapped to a zero embedding vector and ignores gradient updates.
        # To do that in TF:
        #   1. after parameter initialization, apply the op to zero out the
        #      padding embedding vector
        #   2. after each gradient update, apply the op to keep padding at
        #      zero
        # clear_word_embedding_padding = tf.scatter_update(char_embedding, [0], tf.constant(0.0, shape=[1, char_embed_size]))

    # Finally, do LSTM
    with tf.variable_scope('LSTM'):
        # NOTE(review): n_features is hard-coded to 200 here, independent of
        # rnn_size -- confirm this is intended.
        RL = PolicyGradient(n_actions=n_actions, n_features=200)

        def _plain_cell():
            return tf.contrib.rnn.BasicLSTMCell(rnn_size)

        def _dropout_cell():
            return tf.contrib.rnn.DropoutWrapper(
                _plain_cell(), output_keep_prob=1. - dropout)

        # Stage 1: a stack of dropout-wrapped basic LSTM cells.
        stacked_cells = [_dropout_cell() for _ in range(num_rnn_layers)]
        cell1 = tf.contrib.rnn.MultiRNNCell(stacked_cells)
        initial_rnn_state1 = cell1.zero_state(batch_size, dtype=tf.float32)

        # static_rnn takes a list of per-step tensors, so split on axis 1
        # and drop that axis from every piece.
        inputs = tf.reshape(
            input_embedded, [batch_size, num_unroll_steps, rnn_size])
        step_slices = tf.split(inputs, num_unroll_steps, 1)
        inputs_list = [tf.squeeze(piece, [1]) for piece in step_slices]

        layer1_outputs, final_rnn_state1 = tf.contrib.rnn.static_rnn(
            cell1,
            inputs_list,
            initial_state=initial_rnn_state1,
            dtype=tf.float32)

        # Stage 2: the project LSTMCell that is handed the PolicyGradient
        # object, wrapped with the same output dropout.
        cell2 = LSTMCell(rnn_size, RL, lamda)
        cell2 = tf.contrib.rnn.DropoutWrapper(
            cell2, output_keep_prob=1. - dropout)
        initial_rnn_state2 = cell2.zero_state(batch_size, dtype=tf.float32)

        layer2_outputs, final_rnn_state2 = tf.contrib.rnn.static_rnn(
            cell2,
            layer1_outputs,
            initial_state=initial_rnn_state2,
            dtype=tf.float32)

        # (time, batch, 1) => (batch, time)
        actions = tf.transpose(tf.squeeze(RL.actions))
        # (time, batch, n_actions) => (batch, time, n_actions)
        all_act_prob = tf.transpose(RL.all_act_prob, perm=(1, 0, 2))

        # linear projection onto output (word) vocab
        logits = []
        with tf.variable_scope('WordProjection') as scope:
            for idx, output in enumerate(layer2_outputs):
                # Every step after the first reuses the projection variables.
                if idx > 0:
                    scope.reuse_variables()
                logits.append(linear(output, word_vocab_size))

    return adict(
        input=input_word,
        # clear_char_embedding_padding = clear_char_embedding_padding,
        input_embedded=input_embedded,
        initial_rnn_state1=initial_rnn_state1,
        initial_rnn_state2=initial_rnn_state2,
        final_rnn_state1=final_rnn_state1,
        final_rnn_state2=final_rnn_state2,
        rnn_outputs=layer2_outputs,
        logits=logits,
        all_act_prob=all_act_prob,
        actions=actions
    )
def postprocessreward(reward, th): if reward > th: return (reward) else: return (-40.0) th = 0 print("input/output dims of the DQN: " + str((input_shape, output_shape))) RL = PolicyGradient( n_actions=25, n_features=4, learning_rate=0.0005, reward_decay=0.995, # output_graph=True, ) random_action = True for i_episode in range(600): if i_episode in range(0, 300): env = relay_net_slow #env,dummy= create_example_env() state, reward = env.update_state() state = np.array(state) if i_episode in range(300, 600):
DISPLAY_REWARD_THRESHOLD = 400 # 当回合总 reward 大于 400 时显示模拟窗口 env = gym.make('CartPole-v0') # CartPole 这个模拟 env = env.unwrapped # 取消限制 env.seed(1) # 普通的 Policy gradient 方法, 使得回合的 variance 比较大, 所以我们选了一个好点的随机种子 print(env.action_space) # 显示可用 action print(env.observation_space) # 显示可用 state 的 observation print(env.observation_space.high) # 显示 observation 最高值 print(env.observation_space.low) # 显示 observation 最低值 # 定义 RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.99, # gamma # output_graph=True, # 输出 tensorboard 文件 ) for i_episode in range(3000): observation = env.reset() print('observation', observation) while True: if RENDER: env.render() action = RL.choose_action(observation) print('action:', action)
import sys env = gym.make('CartPole-v0') env.seed(1) # reproducible, general Policy gradient has high variance env = env.unwrapped #print(env.action_space) #print(env.observation_space) #print(env.observation_space.high) #print(env.observation_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=1e-4, reward_decay=0.99, # output_graph=True, save_interval=10, resume=True, work_dir="CartPoleModel", ) i_episode = 0 while True: i_episode += 1 observation = env.reset() score = 0 while True: score += 1 action = RL.random_choose_action(observation)
if __name__ == '__main__': #implementation details env = gym.make('Pong-v0') env.seed(1) env = env.unwrapped state_size = 6400 action_size = env.action_space.n RL = PolicyGradient( n_actions=env.action_space.n, #n_features=env.observation_space.shape[0], n_features=state_size, learning_rate=1e-4, reward_decay=0.99, # output_graph=True, # save_interval=10, resume=True, work_dir="PingPongModel", ) i_episode = 0 while True: i_episode += 1 observation = env.reset() video = VideoRecorder(env) observation_mod = prepro(observation) episode_reward = 0 while True:
RENDER = False # 边训练边显示会拖慢训练速度,我们等程序先学习一段时间 env = gym.make('CartPole-v0') # 创建 CardPole这个模拟 env.seed(1) # 创建随机种子 env = env.unwrapped # 取消限制 print(env.action_space) #输出可用的动作 print(env.observation_space) # 显示可用 state 的 observation print(env.observation_space.high) # 显示 observation 最高值 print(env.observation_space.low) # 显示 observation 最低值 # 定义使用 Policy_gradient 的算法 RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.99, # output_graph=True, ) for i_episode in range(3000): observation = env.reset() # 获取回合 i_episode 第一个 observation while True: if RENDER: env.render() # 刷新环境 action = RL.choose_action(observation) # 选行为 observation_, reward, done, info = env.step(action) # 获取下一个state RL.store_transition(observation, action, reward) # 存储这一回合的transition if done: # 一个回合结束,开始更新参数 ep_rs_sum = sum(RL.ep_rs) # 统计每回合的reward
import os os.environ['CUDA_VISIBLE_DEVICES'] = '' DISPLAY_REWARD_THRESHOLD = 400 RENDER = False env = gym.make('CartPole-v0') env.seed(1) env = env.unwrapped n_actions = env.action_space.n n_features = env.observation_space.shape[0] RL = PolicyGradient(n_actions=n_actions, n_features=n_features, learning_rate=0.02, reward_decay=0.99) for i_episode in range(3000): observation = env.reset() while True: if RENDER: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) RL.store_transition(observation, action, reward) if done: ep_rs_sum = sum(RL.ep_rs)
x_new_=None, y_new_=None, select_list_=None, select_list_new_=None, x_text_all_=None, non_layer_=None, ) createVar['g_' + str(item)] =tf.Graph() createVar['all_new_x_' + str(item)] =None createVar['all_new_y_' + str(item)] = None with globals()['g_' + str(item)].as_default(): createVar['RL_' + str(item)] = PolicyGradient( n_actions=2, # np.ones((x_neg.shape[0],), dtype=int), n_features=FLAGS.num_non_layer_features, learning_rate=0.02, reward_decay=0.99, # output_graph=True, ) if(FLAGS.dataset_name=='rt-polaritydata'): Data_select_0.file_path_=FLAGS.negative_data_file_train Data_select_1.file_path_=FLAGS.positive_data_file_train for item in range(FLAGS.num_classes): Data_select=globals()['Data_select_' + str(item)] Data_select.x_text_,Data_select.y_= data_utils.load_data_and_labels_modify_v2(Data_select.file_path_, FLAGS.bag_size, FLAGS.num_classes, item) Data_select.x_=np.array(list(vocab_processor.fit_transform(Data_select.x_text_))) #print(Data_select.x_.shape)
RENDER = False # rendering wastes time env = gym.make('MountainCar-v0') env.seed(1) # reproducible, general Policy gradient has high variance env = env.unwrapped print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.995, # output_graph=True, ) for i_episode in range(1000): observation = env.reset() while True: if RENDER: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) # reward = -1 in all cases
DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold RENDER = False # rendering wastes time env = gym.make('CartPole-v0') env.seed(1) # reproducible, general Policy gradient has high variance env = env.unwrapped print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.99, # output_graph=True, ) for i_episode in range(3000): observation = env.reset() while True: if RENDER: env.render() action = RL.choose_action( observation ) #agent根据策略\pi进行探索,直到探索结束. 一轮探索的所有结果<observation, action, reward>存储在记忆库中,用于训练 observation_, reward, done, info = env.step(
RENDER = False # rendering wastes time env = gym.make('MountainCar-v0') env.seed(1) # reproducible, general Policy gradient has high variance env = env.unwrapped print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) RL = PolicyGradient( n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.995, # output_graph=True, ) for eposide_i in range(1000): observation = env.reset() while True: if RENDER: env.render() action = RL.choose_action(observation) observation_, reward, done, info = env.step(action) RL.store_transition(observation, action, reward)
import gym from RL_brain import PolicyGradient import torch import numpy as np import matplotlib.pyplot as plt env = gym.make('CartPole-v0') print(env.action_space) print(env.observation_space) print(env.observation_space.high) print(env.observation_space.low) RL = PolicyGradient(hidden_size=10, num_inputs=env.observation_space.shape[0], action_space=env.action_space) total_steps = 0 # Set up lists to hold results total_rewards = [] batch_rewards = [] batch_actions = [] batch_states = [] batch_counter = 1 batch_size = 10 for i_episode in range(2000): s_0 = env.reset() states = [] rewards = [] actions = []