def main(argv):
    RL = PolicyGradient(
        n_actions=9,
        n_features=8,
        learning_rate=0.1,
        reward_decay=0.5,
        # output_graph=True,
    )
    RLL = PolicyGradient(
        n_actions=9,
        n_features=8,
        learning_rate=0.1,
        reward_decay=0.5,
        # output_graph=True,
    )
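# --- Hedged sketch, not the real RL_brain implementation: the minimal interface
# --- that the snippets in this collection assume PolicyGradient exposes
# --- (choose_action / store_transition / learn / ep_rs). Signatures are
# --- inferred from the call sites; the actual class trains a softmax policy
# --- with REINFORCE and may differ in detail.
import numpy as np

class PolicyGradientSketch:
    def __init__(self, n_actions, n_features, learning_rate=0.01, reward_decay=0.95):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # one episode's trajectory

    def choose_action(self, observation):
        # A real implementation samples from the policy network's softmax
        # output; here we sample uniformly as a placeholder.
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def learn(self):
        # Compute discounted, normalized returns, then (in the real class)
        # take one gradient step on -log pi(a|s) * return.
        discounted = np.zeros(len(self.ep_rs))
        running = 0.0
        for t in reversed(range(len(self.ep_rs))):
            running = running * self.gamma + self.ep_rs[t]
            discounted[t] = running
        discounted -= discounted.mean()
        discounted /= discounted.std() + 1e-8
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
        return discounted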
batch_size = 100
time_step = 11
rnn_size = 200
learning_rate = 0.001
epoch = 200
n_actions = 10

train_set = np.array(pickle.load(open('num_train10_1.pkl', 'rb')))
test_set = np.array(pickle.load(open('num_test10_1.pkl', 'rb')))

inputs = tf.placeholder(dtype=tf.int32, shape=[batch_size, time_step])
inputs_ = tf.one_hot(inputs, 10)
labels = tf.placeholder(dtype=tf.int32, shape=[batch_size])

RL = PolicyGradient(
    n_actions=n_actions,
    n_features=10,
)

inputs_ = tf.unstack(inputs_, axis=1)  # convert inputs to the per-step list that static_rnn expects
lstm_cell = LSTMCell(rnn_size, RL)
outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell, inputs=inputs_, dtype=tf.float32)

# (time-5, batch, 1) => (batch, time-5)
actions = tf.transpose(tf.squeeze(RL.actions))
# (time-5, batch, n_actions) => (batch, time-5, n_actions)
all_act_prob = tf.transpose(RL.all_act_prob, perm=(1, 0, 2))

output = outputs[-1]  # shape (batch_size, rnn_size)
if __name__ == '__main__':
    # implementation details
    env = gym.make('Pong-v0')
    env.seed(1)
    env = env.unwrapped

    state_size = 6400
    action_size = env.action_space.n

    RL = PolicyGradient(
        n_actions=env.action_space.n,
        # n_features=env.observation_space.shape[0],
        n_features=state_size,
        learning_rate=1e-4,
        reward_decay=0.99,
        # output_graph=True,
        # save_interval=10,
        resume=True,
        work_dir="PingPongModel",
    )

    i_episode = 0
    while True:
        i_episode += 1
        observation = env.reset()
        video = VideoRecorder(env)
        observation_mod = prepro(observation)
        episode_reward = 0
        while True:
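# --- Hedged sketch of the `prepro` helper assumed above. state_size = 6400
# --- suggests the common 80x80 Pong frame preprocessing (crop, downsample,
# --- binarize); the original file's exact version may differ.
import numpy as np

def prepro(I):
    """Flatten a 210x160x3 Pong frame into a 6400-dim binary vector."""
    I = I[35:195]       # crop out the scoreboard
    I = I[::2, ::2, 0]  # downsample by a factor of 2, keep one channel
    I[I == 144] = 0     # erase background type 1
    I[I == 109] = 0     # erase background type 2
    I[I != 0] = 1       # paddles and ball become 1
    return I.astype(np.float32).ravel()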
"""
The mountain car example
"""
import gym
from RL_brain import PolicyGradient

env = gym.make('MountainCar-v0')
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=len(env.observation_space.high),
    learning_rate=0.01,
    reward_decay=0.99,
    output_graph=False,
)

total_steps = 0

for i_episode in range(10):
    observation = env.reset()
    while True:
        env.render()
        action = RL.choose_action(observation)
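        # --- Hedged sketch of how this episode loop usually continues in the
        # --- MountainCar tutorials (the original file's reward shaping, if
        # --- any, is not shown here):
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)
        if done:
            RL.learn()  # one REINFORCE update per finished episode
            break
        observation = observation_
        total_steps += 1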
def postprocessreward(reward, th):
    if reward > th:
        return reward
    else:
        return -40.0


th = 0

print("input/output dims of the DQN: " + str((input_shape, output_shape)))

RL = PolicyGradient(
    n_actions=25,
    n_features=4,
    learning_rate=0.0005,
    reward_decay=0.995,
    # output_graph=True,
)

random_action = True

for i_episode in range(600):
    if i_episode < 300:
        env = relay_net_slow
        # env, dummy = create_example_env()
        state, reward = env.update_state()
        state = np.array(state)
    if 300 <= i_episode < 600:
DISPLAY_REWARD_THRESHOLD = 20  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time

import gym_tictactoe

env = gym.make('TicTacToe-v2', symbols=[-1, 1], board_size=3, win_size=3)
env.seed(1)  # reproducible; plain policy gradient has high variance

print(env.action_space)
print(env.state_space)
print(env.state_space.high)
print(env.state_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.state_space.shape[0],
    # learning_rate=0.002,
    reward_decay=0.9,
    # output_graph=True,
)
print("n_features=", RL.n_features)

i_episode = 0
# for i_episode in range(60000):
while True:
    i_episode += 1
    state = env.reset()
    done = False
    user = 0
    reward1 = reward2 = 0
    # has_attack = False
    while True:
        # RL chooses an action based on the observation
        action = RL.choose_action(observation=observation)

        # RL takes the action and gets the next observation and reward
        valid, state, actions, has_weapon, in_gas = next(
            state, action)  # state transition
        observation_ = np.hstack((state.flatten(), actions))
        done = 0 if action != 8 and valid else 1  # whether this episode ends
        reward = get_reward(state, valid, action, has_weapon, in_gas)
        RL.store_transition(observation_, action, reward)

        player_pos = np.argwhere(state[0, :, :, 1] == 1)[0]
        moves.append(player_pos)

        if done:
            RL.learn()
            step += 1
            # if step % 1000 == 0:
            #     print('step:' + str(step))
            break

        # swap observation
        observation = observation_
        # eml.update_player_pos(player_pos)

    eml.next(list(map(lambda x: (x[0], x[1]), moves)))


if __name__ == "__main__":
    RL = PolicyGradient(n_actions=9,
                        n_features=144 * 3 * 4 + 8,
                        learning_rate=1e-8)
    run_game()
from RL_brain import PolicyGradient
import matplotlib.pyplot as plt
import numpy as np

actions = ['fold', 'call', 'raise']

RL = PolicyGradient(
    n_actions=len(actions),
    n_features=len(actions),
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for i_episode in range(1000):
    while True:
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)  # reward = -1 in all cases
        RL.store_transition(observation, action, reward)

        if done:
            # calculate running reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
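                # --- Hedged sketch: the usual continuation of this pattern in
                # --- these tutorials smooths the episode return exponentially
                # --- (the 0.99/0.01 factors are an assumption):
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01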
        x_new_=None,
        y_new_=None,
        select_list_=None,
        select_list_new_=None,
        x_text_all_=None,
        non_layer_=None,
    )
    createVar['g_' + str(item)] = tf.Graph()
    createVar['all_new_x_' + str(item)] = None
    createVar['all_new_y_' + str(item)] = None
    with globals()['g_' + str(item)].as_default():
        createVar['RL_' + str(item)] = PolicyGradient(
            n_actions=2,  # np.ones((x_neg.shape[0],), dtype=int),
            n_features=FLAGS.num_non_layer_features,
            learning_rate=0.02,
            reward_decay=0.99,
            # output_graph=True,
        )

if FLAGS.dataset_name == 'rt-polaritydata':
    Data_select_0.file_path_ = FLAGS.negative_data_file_train
    Data_select_1.file_path_ = FLAGS.positive_data_file_train

for item in range(FLAGS.num_classes):
    Data_select = globals()['Data_select_' + str(item)]
    Data_select.x_text_, Data_select.y_ = data_utils.load_data_and_labels_modify_v2(
        Data_select.file_path_, FLAGS.bag_size, FLAGS.num_classes, item)
    Data_select.x_ = np.array(list(vocab_processor.fit_transform(Data_select.x_text_)))
    # print(Data_select.x_.shape)
import sys

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible; plain policy gradient has high variance
env = env.unwrapped

# print(env.action_space)
# print(env.observation_space)
# print(env.observation_space.high)
# print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=1e-4,
    reward_decay=0.99,
    # output_graph=True,
    save_interval=10,
    resume=True,
    work_dir="CartPoleModel",
)

i_episode = 0
while True:
    i_episode += 1
    observation = env.reset()
    score = 0
    while True:
        score += 1
        action = RL.random_choose_action(observation)
env.seed(1)
env = env.unwrapped

# show the available actions
print(env.action_space)
# show the observation space of the available states
print(env.observation_space)
# show the highest state values
print(env.observation_space.high)
# show the lowest state values
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.05,
    reward_decay=0.995,
    # output_graph=True  # write a TensorBoard file
)

# update only after the agent finishes a whole episode
for i_episode in range(3000):
    observation = env.reset()
    while True:
        if render:
            env.render()
        # the observation is the input to the neural network,
        # e.g. observation = [-0.43852191  0.        ], action = 1
        action = RL.choose_action(observation)
        # observation_: [-0.43915308 -0.00063117]
state[9] = (state[9] - 0) / 180        # player theta
# state[10] = (state[10] - b) * m      # elevator effect
# state[11] = (state[11] - b) * m      # rudder effect
# state[12] = (state[12] - b) * m      # roll effect
state[13] = (state[13] - 0) / 100      # enemy x
state[14] = (state[14] - 25) / 38      # enemy y
state[15] = (state[15] - 0) / 100      # enemy z
state[16] = (state[16] - 0.333) * 3    # enemy speed
state[17] = (state[17] - 180) / 180    # enemy phi
state[18] = (state[18] - 180) / 180    # enemy gamma
state[19] = (state[19] - 0) / 180      # enemy theta

RL = PolicyGradient(
    n_actions=108,
    n_features=10,
    learning_rate=0.001,
    reward_decay=0.99,
    output_graph=True,
)

# actionList = ["11111","11110","11101","11100","11011","11010","11001","11000","11211","11210","11201","11200","10111","10110","10101","10100","10011","10010","10001","10000","10211","10210","10201","10200","12111","12110","12101","12100","12011","12010","12001","12000","12211","12210","12201","12200","01111","01110","01101","01100","01011","01010","01001","01000","01211","01210","01201","01200","00111","00110","00101","00100","00011","00010","00001","00000","00211","00210","00201","00200","02111","02110","02101","02100","02011","02010","02001","02000","02211","02210","02201","02200","21111","21110","21101","21100","21011","21010","21001","21000","21211","21210","21201","21200","20111","20110","20101","20100","20011","20010","20001","20000","20211","20210","20201","20200","22111","22110","22101","22100","22011","22010","22001","22000","22211","22210","22201","22200"]

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://*:5555")

waitasec = 0
for i_episode in range(10000):
    message = socket.recv()
    # print("Received request: %s" % message)
MAX_EPISODES = 3000
MAX_EP_STEPS = 160
height = 8
ag_num = 5

env = env1.Lift(ag_num, height)  # gym.make(ENV_NAME)
agents = []
agents_pg = []
# for i in range(ag_num):
#     agents.append(RL_agent(height, i))
for i in range(ag_num):
    RL = PolicyGradient(
        n_actions=3,                 # env.action_space.n
        n_features=4 * height + 1,   # env.observation_space.shape[0]
        learning_rate=0.004,
        reward_decay=0.9995,
        id=i,
        # output_graph=True,
    )
    agents_pg.append(RL)


def run_ddpg():
    t1 = time.time()
    for i in range(MAX_EPISODES):
        s = env.reset()
        # s = np.array(s[0])
        ep_reward = np.array([0] * ag_num)
        for j in range(MAX_EP_STEPS):
            acts = []
def inference_graph(word_vocab_size=10000,  # configuration of the medium model
                    batch_size=20,
                    num_rnn_layers=2,
                    rnn_size=650,
                    num_unroll_steps=35,
                    n_actions=5,
                    dropout=0.0,
                    lamda=0.5):

    input_word = tf.placeholder(
        tf.int32, shape=[batch_size, num_unroll_steps], name="input")

    ''' First, embed words '''
    with tf.variable_scope('Embedding'):
        embedding = tf.get_variable(
            "word_embedding", [word_vocab_size, rnn_size], dtype=tf.float32)
        input_embedded = tf.nn.embedding_lookup(embedding, input_word)
        if dropout != 0:
            input_embedded = tf.nn.dropout(input_embedded, 1. - dropout)

    ''' This op clears the embedding vector of the first symbol (the symbol at
    position 0, which is by convention the position of the padding symbol). It
    can be used to mimic the Torch7 embedding operator that keeps padding mapped
    to a zero embedding vector and ignores gradient updates. To do that in TF:
    1. after parameter initialization, apply this op to zero out the padding
       embedding vector
    2. after each gradient update, apply this op to keep padding at zero '''
    # clear_word_embedding_padding = tf.scatter_update(char_embedding, [0], tf.constant(0.0, shape=[1, char_embed_size]))

    ''' Finally, do LSTM '''
    with tf.variable_scope('LSTM'):
        RL = PolicyGradient(n_actions=n_actions, n_features=200)

        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(rnn_size)

        def attn_cell():
            return tf.contrib.rnn.DropoutWrapper(lstm_cell(), output_keep_prob=1. - dropout)

        cell1 = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(num_rnn_layers)])
        initial_rnn_state1 = cell1.zero_state(batch_size, dtype=tf.float32)

        inputs = tf.reshape(
            input_embedded, [batch_size, num_unroll_steps, rnn_size])
        inputs_list = [tf.squeeze(x, [1])
                       for x in tf.split(inputs, num_unroll_steps, 1)]

        layer1_outputs, final_rnn_state1 = tf.contrib.rnn.static_rnn(
            cell1, inputs_list, initial_state=initial_rnn_state1, dtype=tf.float32)

        cell2 = LSTMCell(rnn_size, RL, lamda)
        cell2 = tf.contrib.rnn.DropoutWrapper(
            cell2, output_keep_prob=1. - dropout)
        initial_rnn_state2 = cell2.zero_state(batch_size, dtype=tf.float32)
        layer2_outputs, final_rnn_state2 = tf.contrib.rnn.static_rnn(
            cell2, layer1_outputs, initial_state=initial_rnn_state2, dtype=tf.float32)

        # (time, batch, 1) => (batch, time)
        actions = tf.transpose(tf.squeeze(RL.actions))
        # (time, batch, n_actions) => (batch, time, n_actions)
        all_act_prob = tf.transpose(RL.all_act_prob, perm=(1, 0, 2))

    # linear projection onto the output (word) vocabulary
    logits = []
    with tf.variable_scope('WordProjection') as scope:
        for idx, output in enumerate(layer2_outputs):
            if idx > 0:
                scope.reuse_variables()
            logits.append(linear(output, word_vocab_size))

    return adict(
        input=input_word,
        # clear_char_embedding_padding=clear_char_embedding_padding,
        input_embedded=input_embedded,
        initial_rnn_state1=initial_rnn_state1,
        initial_rnn_state2=initial_rnn_state2,
        final_rnn_state1=final_rnn_state1,
        final_rnn_state2=final_rnn_state2,
        rnn_outputs=layer2_outputs,
        logits=logits,
        all_act_prob=all_act_prob,
        actions=actions,
    )
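# --- Hedged usage sketch: how the graph above would typically be driven.
# --- Assumes TensorFlow 1.x session execution and the helper definitions
# --- (LSTMCell, linear, adict, PolicyGradient) imported by the original file;
# --- fake_batch is a hypothetical stand-in for real word-id data.
import numpy as np

model = inference_graph(word_vocab_size=10000, batch_size=20, num_unroll_steps=35)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    fake_batch = np.random.randint(0, 10000, size=(20, 35))  # stand-in word ids
    step_logits = sess.run(model.logits, feed_dict={model.input: fake_batch})
    print(len(step_logits), step_logits[0].shape)  # 35 steps, each (20, 10000)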
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

DISPLAY_REWARD_THRESHOLD = 400
RENDER = False

env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped

n_actions = env.action_space.n
n_features = env.observation_space.shape[0]

RL = PolicyGradient(n_actions=n_actions,
                    n_features=n_features,
                    learning_rate=0.02,
                    reward_decay=0.99)

for i_episode in range(3000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)
        if done:
            ep_rs_sum = sum(RL.ep_rs)
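            # --- Hedged sketch: past this point these tutorials typically turn
            # --- rendering on above the threshold and run one policy-gradient
            # --- update; learn() is assumed to return the discounted,
            # --- normalized episode returns.
            if ep_rs_sum > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            vt = RL.learn()
            break
        observation = observation_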
DISPLAY_REWARD_THRESHOLD = 2.5  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time

env = gym.make('SpaceX-v0')
env.seed(1)  # reproducible; plain policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.001,
    reward_decay=0.99,
    save_path=".\\network.nt",
    # output_graph=True,
)

for i_episode in range(2000):
    observation = env.reset()  # observation = [x, x_dot]
    target = env.x_board, env.x_board_dot
    while True:
        # if i_episode > 450: RENDER = True
        if RENDER:
            env.render()
        if i_episode < I_TEACH:
            action = 1 if (observation[1] - target[1]) * (
RENDER = False

env = gym.make('CartPole-v0')  # the CartPole simulation
# plain policy gradient has high episode-to-episode variance,
# so we pick a good random seed
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped  # remove the wrapper's limits

print(env.action_space)            # show the available actions
print(env.observation_space)       # show the observation space of the available states
print(env.observation_space.high)  # show the highest observation values
print(env.observation_space.low)   # show the lowest observation values

RL = PolicyGradient(  # define the agent
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,  # gamma
    # output_graph=True,  # write a TensorBoard file
)

for i_episode in range(3000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
RENDER = False  # rendering wastes time

env = gym.make('MountainCar-v0')
env.seed(1)  # reproducible; plain policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for episode_i in range(1000):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)
import gym
from RL_brain import PolicyGradient
import torch
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('CartPole-v0')
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(hidden_size=10,
                    num_inputs=env.observation_space.shape[0],
                    action_space=env.action_space)

total_steps = 0

# set up lists to hold results
total_rewards = []
batch_rewards = []
batch_actions = []
batch_states = []
batch_counter = 1
batch_size = 10

for i_episode in range(2000):
    s_0 = env.reset()
    states = []
    rewards = []
    actions = []
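# --- Hedged sketch (hypothetical helper, not from this snippet's source): the
# --- batch lists above are typically consumed by discounting each episode's
# --- rewards before a single policy-gradient step over the whole batch.
def discount_rewards(rewards, gamma=0.99):
    """Return the discounted returns G_t for one episode's reward list."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = running * gamma + rewards[t]
        returns[t] = running
    return returns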