def main(argv):

    RL = PolicyGradient(
        n_actions=9,
        n_features=8,
        learning_rate=0.1,
        reward_decay=0.5,
        # output_graph=True,
    )

    RLL = PolicyGradient(
        n_actions=9,
        n_features=8,
        learning_rate=0.1,
        reward_decay=0.5,
        # output_graph=True,
    )
Example #2
import gym
import numpy as np
from RL_brain import PolicyGradient
import matplotlib.pyplot as plt

env = gym.make('SpaceX-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.001,
    reward_decay=0.99,
    load_path=".\\network.nt",
    # output_graph=True,
)

ep_rs_hist = []

for i_episode in range(100):

    observation = env.reset()
    reward_hist = []

    while True:
        #env.render()

        action = RL.test_action(observation)

        observation, reward, done, info = env.step(action)
DISPLAY_REWARD_THRESHOLD = 20  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time

import gym_tictactoe
env = gym.make('TicTacToe-v2', symbols=[-1, 1], board_size=3, win_size=3)
env.seed(1)     # reproducible, general Policy gradient has high variance

print(env.action_space)
print(env.state_space)
print(env.state_space.high)
print(env.state_space.low)

RL = PolicyGradient(
	n_actions=env.action_space.n,
	n_features=env.state_space.shape[0],
	# learning_rate=0.002,
	reward_decay=0.9,
	# output_graph=True,
)

print("n_features=", RL.n_features)

i_episode = 0
# for i_episode in range(60000):
while True:
	i_episode += 1
	state = env.reset()

	done = False
	user = 0
	reward1 = reward2 = 0
Example #4
            #has_attack = False
            while True:
                # RL choose action based on observation
                action = RL.choose_action(observation=observation)
                # RL take action and get next observation and reward
                valid, state, actions, has_weapon, in_gas = next(
                    state, action)  # state transition
                observation_ = np.hstack((state.flatten(), actions))
                done = 0 if action != 8 and valid else 1  # whether this round is over
                reward = get_reward(state, valid, action, has_weapon, in_gas)
                RL.store_transition(observation_, action, reward)
                player_pos = np.argwhere(state[0, :, :, 1] == 1)[0]
                moves.append(player_pos)
                if done:
                    RL.learn()
                    step += 1
                    #if step % 1000 == 0:
                    #    print('step:' + str(step))
                    break
                # swap observation
                observation = observation_
            #eml.update_player_pos(player_pos)
            eml.next(list(map(lambda x: (x[0], x[1]), moves)))


if __name__ == "__main__":
    RL = PolicyGradient(n_actions=9,
                        n_features=144 * 3 * 4 + 8,
                        learning_rate=1e-8)
    run_game()
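
All of these snippets hand the actual REINFORCE update to RL_brain.PolicyGradient: transitions are stored step by step and learn() is called once per episode, with reward_decay acting as the discount factor. A minimal sketch of the per-episode return computation such a learn() typically relies on (this standalone helper, including its name, is an assumption rather than the actual RL_brain code):

import numpy as np

def discount_and_norm_rewards(ep_rs, gamma):
    """Sketch: turn an episode's stored rewards into discounted, normalized returns."""
    discounted = np.zeros(len(ep_rs), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(ep_rs))):
        running_add = running_add * gamma + ep_rs[t]
        discounted[t] = running_add
    # normalizing reduces the variance of the policy-gradient estimate
    discounted -= discounted.mean()
    discounted /= discounted.std() + 1e-8
    return discounted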
Example #5
from RL_brain import PolicyGradient
import matplotlib.pyplot as plt
import numpy as np

actions = ['fold', 'call', 'raise']
RL = PolicyGradient(
    n_actions=len(actions),
    n_features=len(actions),
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for i_episode in range(1000):

    

    while True:

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)     # reward = -1 in all cases

        RL.store_transition(observation, action, reward)

        if done:
            # calculate running reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
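                # --- sketch of the usual continuation (not part of the original snippet);
                # --- the 0.99 / 0.01 smoothing factors are an assumption
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            print("episode:", i_episode, "  reward:", int(running_reward))
            vt = RL.learn()            # one REINFORCE update on the finished episode
            break

        observation = observation_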
Example #6
DISPLAY_REWARD_THRESHOLD = 2.5  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time

env = gym.make('SpaceX-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.001,
    reward_decay=0.99,
    save_path=".\\network.nt",
    #output_graph=True,
)

for i_episode in range(2000):

    observation = env.reset()  #observation=[x,x_dot]
    target = env.x_board, env.x_board_dot

    while True:
        # if i_episode >450 : RENDER = True
        if RENDER: env.render()

        if i_episode < I_TEACH:
            action = 1 if (observation[1] - target[1]) * (
Example #7
batch_size = 100
time_step = 11
rnn_size = 200
learning_rate = 0.001
epoch = 200
n_actions = 10

train_set = np.array(pickle.load(open('num_train10_1.pkl', 'rb')))
test_set = np.array(pickle.load(open('num_test10_1.pkl', 'rb')))

inputs = tf.placeholder(dtype=tf.int32, shape=[batch_size, time_step])
inputs_ = tf.one_hot(inputs, 10)
labels = tf.placeholder(dtype=tf.int32, shape=[batch_size])

RL = PolicyGradient(
    n_actions=n_actions,
    n_features=10,
)

inputs_ = tf.unstack(inputs_, axis=1)  # unstack along the time axis into the list form static_rnn expects
lstm_cell = LSTMCell(rnn_size, RL)

outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell,
                                       inputs=inputs_,
                                       dtype=tf.float32)
# (time-5, batch, 1) => (batch, time-5)
actions = tf.transpose(tf.squeeze(RL.actions))
# (time-5, batch, n_actions) => (batch, time-5, n_actions)
all_act_prob = tf.transpose(RL.all_act_prob, perm=(1, 0, 2))

output = outputs[-1]  # shape (batch_size, rnn_size)
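
# --- sketch only (not in the original snippet): the labels placeholder above suggests
# --- a classification head on the last LSTM output; the dense layer, loss and Adam
# --- optimizer below are assumptions.
logits = tf.layers.dense(output, n_actions)                     # (batch_size, 10) class scores
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
accuracy = tf.reduce_mean(
    tf.cast(tf.equal(tf.argmax(logits, axis=1, output_type=tf.int32), labels), tf.float32))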
Example #8
"""
The mountain car example
"""

import gym
from RL_brain import PolicyGradient

env = gym.make('MountainCar-v0')
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=len(env.observation_space.high),
    learning_rate=0.01,
    reward_decay=0.99,
    output_graph=False,
)

total_steps = 0

for i_episode in range(10):

    observation = env.reset()

    while True:
        env.render()

        action = RL.choose_action(observation)
env.seed(1)
env=env.unwrapped

# show the available actions
print(env.action_space)
# show the observation space of the state
print(env.observation_space)
# show the highest state values
print(env.observation_space.high)
# show the lowest state values
print(env.observation_space.low)

RL=PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.05,
    reward_decay=0.995
    # output_graph=True  # write a TensorBoard file
)
# update the policy only after a full episode has finished
for i_episode in range(3000):
    observation=env.reset()

    while True:
        if render:
            env.render()
        # the observation is the input to the neural network
        # observation looks like [-0.43852191  0.        ]
        #action=1
        action=RL.choose_action(observation)
        #observation_:[-0.43915308 -0.00063117]
    state[9] = (state[9] - 0 ) / 180 #player theta
    #state[10] = (state[10] - b ) * m  #elevator effect
    #state[11] = (state[11] - b ) * m  #rudder effect
    #state[12] = (state[12] - b ) * m  #roll effect
    state[13] = (state[13] - 0 ) / 100  #enemy x
    state[14] = (state[14]-25) / 38   #enemy y
    state[15] = (state[15] - 0 ) / 100  #enemy z
    state[16] = (state[16] - 0.333 ) * 3  #enemy speed
    state[17] = (state[17] - 180 ) / 180  #enemy phi
    state[18] = (state[18] - 180 ) / 180 #enemy gamma
    state[19] = (state[19] - 0 ) / 180 #enemy theta
    
RL = PolicyGradient(
    n_actions=108,
    n_features=10,
    learning_rate=0.001,
    reward_decay=0.99,
    output_graph=True,
)

#actionList = ["11111","11110","11101","11100","11011","11010","11001","11000","11211","11210","11201","11200","10111","10110","10101","10100","10011","10010","10001","10000","10211","10210","10201","10200","12111","12110","12101","12100","12011","12010","12001","12000","12211","12210","12201","12200","01111","01110","01101","01100","01011","01010","01001","01000","01211","01210","01201","01200","00111","00110","00101","00100","00011","00010","00001","00000","00211","00210","00201","00200","02111","02110","02101","02100","02011","02010","02001","02000","02211","02210","02201","02200","21111","21110","21101","21100","21011","21010","21001","21000","21211","21210","21201","21200","20111","20110","20101","20100","20011","20010","20001","20000","20211","20210","20201","20200","22111","22110","22101","22100","22011","22010","22001","22000","22211","22210","22201","22200"]

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://*:5555")

waitasec = 0

for i_episode in range(10000):
    message = socket.recv()
    #print("Received request: %s" % message)
Example #11
MAX_EPISODES = 3000
MAX_EP_STEPS = 160
height = 8
ag_num = 5
env = env1.Lift(ag_num, height)  #gym.make(ENV_NAME)
agents = []
agents_pg = []
#for i in range(ag_num):
#    agents.append(RL_agent(height,i))

for i in range(ag_num):
    RL = PolicyGradient(
        n_actions=3,  #env.action_space.n,
        n_features=4 * height + 1,  #env.observation_space.shape[0],
        learning_rate=0.004,
        reward_decay=0.9995,
        id=i,
        # output_graph=True,
    )
    agents_pg.append(RL)


def run_ddpg():
    t1 = time.time()
    for i in range(MAX_EPISODES):
        s = env.reset()
        #s = np.array(s[0])
        ep_reward = np.array([0] * ag_num)

        for j in range(MAX_EP_STEPS):
            acts = []
Example #12
RENDER = True  # rendering wastes time
current_max = 100

env = gym.make('LunarLander-v2')
env.seed(1)     # reproducible, general Policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.005,
    reward_decay=0.99
    # output_graph=True,
)

RL.restore_model()

for i_episode in range(5000):

    observation = env.reset()

    t = 0
    episode_reward = 0

    while True:
        if RENDER: env.render()
Example #13
"""
The cart pole example
"""

import gym
from RL_brain import PolicyGradient

env = gym.make('CartPole-v0')
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=len(env.observation_space.high),
    learning_rate=0.01,
    reward_decay=0.99,
    # output_graph=True,
)

for i_episode in range(10000):

    observation = env.reset()

    while True:
        env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)
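
        # --- sketch of the usual continuation (not in the original snippet), following
        # --- the same store_transition / learn pattern as the other CartPole examples
        # --- in this collection:
        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)
            print("episode:", i_episode, "  reward:", int(ep_rs_sum))
            vt = RL.learn()            # one REINFORCE update per finished episode
            break

        observation = observation_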
Example #14
def inference_graph(word_vocab_size=10000,  # configuration of medium
                    batch_size=20,
                    num_rnn_layers=2,
                    rnn_size=650,
                    num_unroll_steps=35,
                    n_actions=5,
                    dropout=0.0,
                    lamda=0.5
                    ):

    input_word = tf.placeholder(
        tf.int32, shape=[batch_size, num_unroll_steps], name="input")

    ''' First, embed words '''
    with tf.variable_scope('Embedding'):
        embedding = tf.get_variable(
            "word_embedding", [word_vocab_size, rnn_size], dtype=tf.float32)
        input_embedded = tf.nn.embedding_lookup(embedding, input_word)
        if dropout != 0:
            input_embedded = tf.nn.dropout(input_embedded, 1. - dropout)

        ''' this op clears embedding vector of first symbol (symbol at position 0, which is by convention the position
        of the padding symbol). It can be used to mimic Torch7 embedding operator that keeps padding mapped to
        zero embedding vector and ignores gradient updates. For that do the following in TF:
        1. after parameter initialization, apply this op to zero out padding embedding vector
        2. after each gradient update, apply this op to keep padding at zero'''
        # clear_word_embedding_padding = tf.scatter_update(char_embedding, [0], tf.constant(0.0, shape=[1, char_embed_size]))

    ''' Finally, do LSTM '''
    with tf.variable_scope('LSTM'):
        RL = PolicyGradient(n_actions=n_actions, n_features=200)

        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(rnn_size)

        def attn_cell():
            return tf.contrib.rnn.DropoutWrapper(lstm_cell(), output_keep_prob=1. - dropout)
        cell1 = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(num_rnn_layers)])

        initial_rnn_state1 = cell1.zero_state(batch_size, dtype=tf.float32)

        inputs = tf.reshape(
            input_embedded, [batch_size, num_unroll_steps, rnn_size])
        inputs_list = [tf.squeeze(x, [1])
                       for x in tf.split(inputs, num_unroll_steps, 1)]

        layer1_outputs, final_rnn_state1 = tf.contrib.rnn.static_rnn(cell1, inputs_list,
                                                                     initial_state=initial_rnn_state1, dtype=tf.float32)

        cell2 = LSTMCell(rnn_size, RL, lamda)
        cell2 = tf.contrib.rnn.DropoutWrapper(
            cell2, output_keep_prob=1. - dropout)
        initial_rnn_state2 = cell2.zero_state(batch_size, dtype=tf.float32)
        layer2_outputs, final_rnn_state2 = tf.contrib.rnn.static_rnn(cell2, layer1_outputs,
                                                                     initial_state=initial_rnn_state2, dtype=tf.float32)
        # (time, batch, 1) => (batch, time)
        actions = tf.transpose(tf.squeeze(RL.actions))
        # (time, batch, n_actions) => (batch, time, n_actions)
        all_act_prob = tf.transpose(RL.all_act_prob, perm=(1, 0, 2))

        # linear projection onto output (word) vocab
        logits = []
        with tf.variable_scope('WordProjection') as scope:
            for idx, output in enumerate(layer2_outputs):
                if idx > 0:
                    scope.reuse_variables()
                logits.append(linear(output, word_vocab_size))

    return adict(
        input=input_word,
        # clear_char_embedding_padding = clear_char_embedding_padding,
        input_embedded=input_embedded,
        initial_rnn_state1=initial_rnn_state1,
        initial_rnn_state2=initial_rnn_state2,
        final_rnn_state1=final_rnn_state1,
        final_rnn_state2=final_rnn_state2,
        rnn_outputs=layer2_outputs,
        logits=logits,
        all_act_prob=all_act_prob,
        actions=actions
    )
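
A minimal usage sketch for inference_graph() above, assuming TensorFlow 1.x graph-mode execution and that the returned adict exposes its entries as attributes (the dummy batch of zeros is purely illustrative):

import numpy as np
import tensorflow as tf

m = inference_graph(word_vocab_size=10000, batch_size=20, num_unroll_steps=35)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    dummy_batch = np.zeros([20, 35], dtype=np.int32)   # a batch of padded word ids
    logits, actions = sess.run([m.logits, m.actions],
                               feed_dict={m.input: dummy_batch})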
Example #15
def postprocessreward(reward, th):
    if reward > th:
        return reward
    else:
        return -40.0


th = 0

print("input/output dims of the DQN: " + str((input_shape, output_shape)))

RL = PolicyGradient(
    n_actions=25,
    n_features=4,
    learning_rate=0.0005,
    reward_decay=0.995,
    # output_graph=True,
)

random_action = True

for i_episode in range(600):

    if i_episode in range(0, 300):
        env = relay_net_slow
        #env,dummy= create_example_env()
        state, reward = env.update_state()
        state = np.array(state)

    if i_episode in range(300, 600):
DISPLAY_REWARD_THRESHOLD = 400  # show the simulation window once the total episode reward exceeds 400

env = gym.make('CartPole-v0')  # the CartPole simulation
env = env.unwrapped  # remove the built-in limits
env.seed(1)  # plain policy gradient has high episode variance, so pick a decent random seed

print(env.action_space)  # show the available actions
print(env.observation_space)  # show the observation space of the state
print(env.observation_space.high)  # show the highest observation values
print(env.observation_space.low)  # show the lowest observation values

# define the agent
RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,  # gamma
    # output_graph=True,    # write a TensorBoard file
)

for i_episode in range(3000):

    observation = env.reset()
    print('observation', observation)

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)
        print('action:', action)
import sys

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

#print(env.action_space)
#print(env.observation_space)
#print(env.observation_space.high)
#print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=1e-4,
    reward_decay=0.99,
    # output_graph=True,
    save_interval=10,
    resume=True,
    work_dir="CartPoleModel",
)

i_episode = 0
while True:
    i_episode += 1
    observation = env.reset()

    score = 0
    while True:
        score += 1

        action = RL.random_choose_action(observation)
Example #18

if __name__ == '__main__':
    #implementation details
    env = gym.make('Pong-v0')
    env.seed(1)
    env = env.unwrapped
    state_size = 6400
    action_size = env.action_space.n

    RL = PolicyGradient(
        n_actions=env.action_space.n,
        #n_features=env.observation_space.shape[0],
        n_features=state_size,
        learning_rate=1e-4,
        reward_decay=0.99,
        # output_graph=True,
        # save_interval=10,
        resume=True,
        work_dir="PingPongModel",
    )

    i_episode = 0
    while True:
        i_episode += 1
        observation = env.reset()
        video = VideoRecorder(env)
        observation_mod = prepro(observation)

        episode_reward = 0
        while True:
RENDER = False  # rendering while training slows learning down, so let the program train for a while first

env = gym.make('CartPole-v0')   # create the CartPole simulation
env.seed(1)     # set the random seed
env = env.unwrapped # remove the built-in limits

print(env.action_space) # print the available actions
print(env.observation_space)    # show the observation space of the state
print(env.observation_space.high)  # show the highest observation values
print(env.observation_space.low)   # show the lowest observation values

# define the agent that uses the Policy_gradient algorithm
RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
    # output_graph=True,
)

for i_episode in range(3000):
    observation = env.reset()   # get the first observation of episode i_episode

    while True:
        if RENDER: env.render() # refresh the environment
        action = RL.choose_action(observation)  # choose an action
        observation_, reward, done, info = env.step(action) # get the next state
        RL.store_transition(observation, action, reward)    # store this step's transition

        if done:    # one episode is over, start updating the parameters
            ep_rs_sum = sum(RL.ep_rs)   # sum up this episode's rewards
Example #20
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

DISPLAY_REWARD_THRESHOLD = 400
RENDER = False

env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped

n_actions = env.action_space.n
n_features = env.observation_space.shape[0]

RL = PolicyGradient(n_actions=n_actions,
                    n_features=n_features,
                    learning_rate=0.02,
                    reward_decay=0.99)

for i_episode in range(3000):
    observation = env.reset()

    while True:
        if RENDER: env.render()
        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)
Example #21
        x_new_=None,
        y_new_=None,
        select_list_=None,
        select_list_new_=None,
        x_text_all_=None,
        non_layer_=None,
        )

        createVar['g_' + str(item)] =tf.Graph()
        createVar['all_new_x_' + str(item)] =None
        createVar['all_new_y_' + str(item)] = None
        with globals()['g_' + str(item)].as_default():
            createVar['RL_' + str(item)] = PolicyGradient(
                n_actions=2,  # np.ones((x_neg.shape[0],), dtype=int),
                n_features=FLAGS.num_non_layer_features,
                learning_rate=0.02,
                reward_decay=0.99,
                # output_graph=True,
            )


if(FLAGS.dataset_name=='rt-polaritydata'):
    Data_select_0.file_path_=FLAGS.negative_data_file_train
    Data_select_1.file_path_=FLAGS.positive_data_file_train

for item in range(FLAGS.num_classes):
    Data_select=globals()['Data_select_' + str(item)]
    Data_select.x_text_,Data_select.y_= data_utils.load_data_and_labels_modify_v2(Data_select.file_path_, FLAGS.bag_size, FLAGS.num_classes, item)
    Data_select.x_=np.array(list(vocab_processor.fit_transform(Data_select.x_text_)))
    #print(Data_select.x_.shape)
RENDER = False  # rendering wastes time

env = gym.make('MountainCar-v0')
env.seed(1)     # reproducible, general Policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for i_episode in range(1000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)     # reward = -1 in all cases
Example #23
DISPLAY_REWARD_THRESHOLD = 400  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
    # output_graph=True,
)

for i_episode in range(3000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)
        # the agent explores according to policy \pi until the episode ends; every
        # <observation, action, reward> of the rollout is stored in memory for training
        observation_, reward, done, info = env.step(
Example #24
RENDER = False  # rendering wastes time

env = gym.make('MountainCar-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for i_episode in range(1000):
    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)
Example #25
import gym
from RL_brain import PolicyGradient
import torch
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('CartPole-v0')
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(hidden_size=10,
                    num_inputs=env.observation_space.shape[0],
                    action_space=env.action_space)

total_steps = 0
# Set up lists to hold results
total_rewards = []
batch_rewards = []
batch_actions = []
batch_states = []
batch_counter = 1
batch_size = 10

for i_episode in range(2000):

    s_0 = env.reset()
    states = []
    rewards = []
    actions = []
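
    # --- sketch only (not in the original): after the rollout loop that fills
    # --- states / rewards / actions, batch-REINFORCE setups like this one usually
    # --- fold the episode into the batch_* buffers and update the policy every
    # --- batch_size episodes. discount_rewards() stands for a per-episode
    # --- discounting helper like the one sketched under Example #4; the update
    # --- call itself is left abstract.
    batch_rewards.extend(discount_rewards(rewards, gamma=0.99))
    batch_states.extend(states)
    batch_actions.extend(actions)
    total_rewards.append(sum(rewards))
    batch_counter += 1

    if batch_counter == batch_size:
        # run one batched policy-gradient update on (batch_states, batch_actions,
        # batch_rewards) here, then reset the buffers
        batch_rewards, batch_actions, batch_states = [], [], []
        batch_counter = 1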