def main(argv):

    RL = PolicyGradient(
        n_actions=9,
        n_features=8,
        learning_rate=0.1,
        reward_decay=0.5,
        # output_graph=True,
    )

    RLL = PolicyGradient(
        n_actions=9,
        n_features=8,
        learning_rate=0.1,
        reward_decay=0.5,
        # output_graph=True,
    )
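# --- Illustrative skeleton (an assumption, not the actual RL_brain code) of the
# PolicyGradient interface that the snippets on this page rely on:
# choose_action(), store_transition(), learn(), and a per-episode reward list ep_rs. ---
import numpy as np

class PolicyGradientSketch:
    def __init__(self, n_actions, n_features, learning_rate=0.01, reward_decay=0.95):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []   # one episode's trajectory

    def choose_action(self, observation):
        # the real agent samples from a softmax policy network;
        # a uniform random choice stands in for it in this sketch
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def learn(self):
        # discount and normalize the episode's rewards (the REINFORCE return);
        # the real agent would then take one gradient step on its policy network
        discounted = np.zeros(len(self.ep_rs))
        running = 0.0
        for t in reversed(range(len(self.ep_rs))):
            running = running * self.gamma + self.ep_rs[t]
            discounted[t] = running
        discounted -= discounted.mean()
        if discounted.std() > 0:
            discounted /= discounted.std()
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []   # clear the episode buffers
        return discounted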
Example #2
batch_size = 100
time_step = 11
rnn_size = 200
learning_rate = 0.001
epoch = 200
n_actions = 10

train_set = np.array(pickle.load(open('num_train10_1.pkl', 'rb')))
test_set = np.array(pickle.load(open('num_test10_1.pkl', 'rb')))

inputs = tf.placeholder(dtype=tf.int32, shape=[batch_size, time_step])
inputs_ = tf.one_hot(inputs, 10)
labels = tf.placeholder(dtype=tf.int32, shape=[batch_size])

RL = PolicyGradient(
    n_actions=n_actions,
    n_features=10,
)

inputs_ = tf.unstack(inputs_, axis=1)  # unstack along time so static_rnn receives a list of per-step inputs
lstm_cell = LSTMCell(rnn_size, RL)

outputs, _ = tf.contrib.rnn.static_rnn(lstm_cell,
                                       inputs=inputs_,
                                       dtype=tf.float32)
# (time-5, batch, 1) => (batch, time-5)
actions = tf.transpose(tf.squeeze(RL.actions))
# (time-5, batch, n_actions) => (batch, time-5, n_actions)
all_act_prob = tf.transpose(RL.all_act_prob, perm=(1, 0, 2))

output = outputs[-1]  # shape (batch_size, rnn_size)
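
# --- Hedged continuation sketch: the listing cuts off here. A typical next step
# for this setup is a softmax classification head over the 10 digit classes,
# trained with cross-entropy against `labels`. The names logits / loss / train_op
# / accuracy are illustrative, not taken from the source. ---
logits = tf.layers.dense(output, n_actions)        # (batch_size, n_actions)
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
correct = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))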
Example #3

if __name__ == '__main__':
    #implementation details
    env = gym.make('Pong-v0')
    env.seed(1)
    env = env.unwrapped
    state_size = 6400
    action_size = env.action_space.n

    RL = PolicyGradient(
        n_actions=env.action_space.n,
        #n_features=env.observation_space.shape[0],
        n_features=state_size,
        learning_rate=1e-4,
        reward_decay=0.99,
        # output_graph=True,
        # save_interval=10,
        resume=True,
        work_dir="PingPongModel",
    )

    i_episode = 0
    while True:
        i_episode += 1
        observation = env.reset()
        video = VideoRecorder(env)
        observation_mod = prepro(observation)

        episode_reward = 0
        while True:
Example #4
The mountain car example
"""

import gym
from RL_brain import PolicyGradient

env = gym.make('MountainCar-v0')
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=len(env.observation_space.high),
    learning_rate=0.01,
    reward_decay=0.99,
    output_graph=False,
)

total_steps = 0

for i_episode in range(10):

    observation = env.reset()

    while True:
        env.render()

        action = RL.choose_action(observation)
Example #5
def postprocessreward(reward, th):
    if reward > th:
        return (reward)
    else:
        return (-40.0)
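
# Quick illustration of the thresholding above (with th = 0 as set below):
#   postprocessreward(5.0, 0)   -> 5.0      rewards above the threshold pass through
#   postprocessreward(-1.0, 0)  -> -40.0    everything else becomes a fixed penalty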


th = 0

print("input/output dims of the DQN: " + str((input_shape, output_shape)))

RL = PolicyGradient(
    n_actions=25,
    n_features=4,
    learning_rate=0.0005,
    reward_decay=0.995,
    # output_graph=True,
)

random_action = True

for i_episode in range(600):

    if 0 <= i_episode < 300:
        env = relay_net_slow
        #env,dummy= create_example_env()
        state, reward = env.update_state()
        state = np.array(state)

    if 300 <= i_episode < 600:
Example #6
DISPLAY_REWARD_THRESHOLD = 20  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time

import gym_tictactoe
env = gym.make('TicTacToe-v2', symbols=[-1, 1], board_size=3, win_size=3)
env.seed(1)     # reproducible, general Policy gradient has high variance

print(env.action_space)
print(env.state_space)
print(env.state_space.high)
print(env.state_space.low)

RL = PolicyGradient(
	n_actions=env.action_space.n,
	n_features=env.state_space.shape[0],
	# learning_rate=0.002,
	reward_decay=0.9,
	# output_graph=True,
)

print("n_features=", RL.n_features)

i_episode = 0
# for i_episode in range(60000):
while True:
	i_episode += 1
	state = env.reset()

	done = False
	user = 0
	reward1 = reward2 = 0
Example #7
File: test.py  Project: SperkJJ/Battle
            #has_attack = False
            while True:
                # RL choose action based on observation
                action = RL.choose_action(observation=observation)
                # RL take action and get next observation and reward
                valid, state, actions, has_weapon, in_gas = next(
                    state, action)  # state transition
                observation_ = np.hstack((state.flatten(), actions))
                done = 0 if action != 8 and valid else 1  # does this round end?
                reward = get_reward(state, valid, action, has_weapon, in_gas)
                RL.store_transition(observation_, action, reward)
                player_pos = np.argwhere(state[0, :, :, 1] == 1)[0]
                moves.append(player_pos)
                if done:
                    RL.learn()
                    step += 1
                    #if step % 1000 == 0:
                    #    print('step:' + str(step))
                    break
                # swap observation
                observation = observation_
            #eml.update_player_pos(player_pos)
            eml.next(list(map(lambda x: (x[0], x[1]), moves)))


if __name__ == "__main__":
    RL = PolicyGradient(n_actions=9,
                        n_features=144 * 3 * 4 + 8,
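                        # n_features presumably covers the flattened 144*3*4 state
                        # grid plus the 8-dim `actions` vector hstacked above
                        # (an inference from observation_ = np.hstack(...), not
                        # documented in the source)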
                        learning_rate=1e-8)
    run_game()
Example #8
from RL_brain import PolicyGradient
import matplotlib.pyplot as plt
import numpy as np

actions = ['fold', 'call', 'raise']
RL = PolicyGradient(
    n_actions=len(actions),
    n_features=len(actions),
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for i_episode in range(1000):

    observation = env.reset()   # assumed reset: the listing omits it, but `observation` is used below

    while True:

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)     # reward = -1 in all cases

        RL.store_transition(observation, action, reward)

        if done:
            # calculate running reward
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
Example #9
        x_new_=None,
        y_new_=None,
        select_list_=None,
        select_list_new_=None,
        x_text_all_=None,
        non_layer_=None,
        )

        createVar['g_' + str(item)] = tf.Graph()
        createVar['all_new_x_' + str(item)] = None
        createVar['all_new_y_' + str(item)] = None
        with globals()['g_' + str(item)].as_default():
            createVar['RL_' + str(item)] = PolicyGradient(
                n_actions=2,  # np.ones((x_neg.shape[0],), dtype=int),
                n_features=FLAGS.num_non_layer_features,
                learning_rate=0.02,
                reward_decay=0.99,
                # output_graph=True,
            )


if FLAGS.dataset_name == 'rt-polaritydata':
    Data_select_0.file_path_ = FLAGS.negative_data_file_train
    Data_select_1.file_path_ = FLAGS.positive_data_file_train

for item in range(FLAGS.num_classes):
    Data_select = globals()['Data_select_' + str(item)]
    Data_select.x_text_, Data_select.y_ = data_utils.load_data_and_labels_modify_v2(
        Data_select.file_path_, FLAGS.bag_size, FLAGS.num_classes, item)
    Data_select.x_ = np.array(list(vocab_processor.fit_transform(Data_select.x_text_)))
    #print(Data_select.x_.shape)
Example #10
import sys

env = gym.make('CartPole-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

#print(env.action_space)
#print(env.observation_space)
#print(env.observation_space.high)
#print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=1e-4,
    reward_decay=0.99,
    # output_graph=True,
    save_interval=10,
    resume=True,
    work_dir="CartPoleModel",
)

i_episode = 0
while True:
    i_episode += 1
    observation = env.reset()

    score = 0
    while True:
        score += 1

        action = RL.random_choose_action(observation)
Example #11
env.seed(1)
env = env.unwrapped

# show the available actions
print(env.action_space)
# show the observation space
print(env.observation_space)
# show the highest state values
print(env.observation_space.high)
# show the lowest state values
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.05,
    reward_decay=0.995,
    # output_graph=True,  # write a TensorBoard file
)
# update the policy only once per episode, after the episode has finished
for i_episode in range(3000):
    observation = env.reset()

    while True:
        if render:
            env.render()
        # the observation is the input to the policy network
        # observation looks like [-0.43852191  0.        ]
        # action = 1
        action = RL.choose_action(observation)
        # observation_: [-0.43915308 -0.00063117]
Example #12
    state[9] = (state[9] - 0 ) / 180 #player theta
    #state[10] = (state[10] - b ) * m  #elevator effect
    #state[11] = (state[11] - b ) * m  #rudder effect
    #state[12] = (state[12] - b ) * m  #roll effect
    state[13] = (state[13] - 0 ) / 100  #enemy x
    state[14] = (state[14]-25) / 38   #enemy y
    state[15] = (state[15] - 0 ) / 100  #enemy z
    state[16] = (state[16] - 0.333 ) * 3  #enemy speed
    state[17] = (state[17] - 180 ) / 180  #enemy phi
    state[18] = (state[18] - 180 ) / 180 #enemy gamma
    state[19] = (state[19] - 0 ) / 180 #enemy theta
    
RL = PolicyGradient(
    n_actions=108,
    n_features=10,
    learning_rate=0.001,
    reward_decay=0.99,
    output_graph=True,
)

#actionList = ["11111","11110","11101","11100","11011","11010","11001","11000","11211","11210","11201","11200","10111","10110","10101","10100","10011","10010","10001","10000","10211","10210","10201","10200","12111","12110","12101","12100","12011","12010","12001","12000","12211","12210","12201","12200","01111","01110","01101","01100","01011","01010","01001","01000","01211","01210","01201","01200","00111","00110","00101","00100","00011","00010","00001","00000","00211","00210","00201","00200","02111","02110","02101","02100","02011","02010","02001","02000","02211","02210","02201","02200","21111","21110","21101","21100","21011","21010","21001","21000","21211","21210","21201","21200","20111","20110","20101","20100","20011","20010","20001","20000","20211","20210","20201","20200","22111","22110","22101","22100","22011","22010","22001","22000","22211","22210","22201","22200"]
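# note: n_actions=108 above matches the commented-out actionList: 5-digit codes with
# three choices in each of the first three positions and two in each of the last two
# (3 * 3 * 3 * 2 * 2 = 108)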

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://*:5555")

waitasec = 0

for i_episode in range(10000):
    message = socket.recv()
    #print("Received request: %s" % message)
Example #13
MAX_EPISODES = 3000
MAX_EP_STEPS = 160
height = 8
ag_num = 5
env = env1.Lift(ag_num, height)  #gym.make(ENV_NAME)
agents = []
agents_pg = []
#for i in range(ag_num):
#    agents.append(RL_agent(height,i))

for i in range(ag_num):
    RL = PolicyGradient(
        n_actions=3,  #env.action_space.n,
        n_features=4 * height + 1,  #env.observation_space.shape[0],
        learning_rate=0.004,
        reward_decay=0.9995,
        id=i,
        # output_graph=True,
    )
    agents_pg.append(RL)


def run_ddpg():
    t1 = time.time()
    for i in range(MAX_EPISODES):
        s = env.reset()
        #s = np.array(s[0])
        ep_reward = np.array([0] * ag_num)

        for j in range(MAX_EP_STEPS):
            acts = []
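            # Hedged continuation sketch (the listing cuts off here): a natural
            # next step is one action per agent followed by a joint environment
            # step. The per-agent indexing of `s` and the env.step() signature
            # are assumptions, not shown in the source:
            # for k in range(ag_num):
            #     acts.append(agents_pg[k].choose_action(np.array(s[k])))
            # s_, rewards, done = env.step(acts)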
Example #14
def inference_graph(word_vocab_size=10000,  # configuration of medium
                    batch_size=20,
                    num_rnn_layers=2,
                    rnn_size=650,
                    num_unroll_steps=35,
                    n_actions=5,
                    dropout=0.0,
                    lamda=0.5
                    ):

    input_word = tf.placeholder(
        tf.int32, shape=[batch_size, num_unroll_steps], name="input")

    ''' First, embed the input words '''
    with tf.variable_scope('Embedding'):
        embedding = tf.get_variable(
            "word_embedding", [word_vocab_size, rnn_size], dtype=tf.float32)
        input_embedded = tf.nn.embedding_lookup(embedding, input_word)
        if dropout != 0:
            input_embedded = tf.nn.dropout(input_embedded, 1. - dropout)

        ''' This op clears the embedding vector of the first symbol (position 0, which by convention
        is the padding symbol). It can be used to mimic the Torch7 embedding operator, which keeps the
        padding row mapped to a zero embedding vector and ignores its gradient updates. To do that in TF:
        1. after parameter initialization, apply this op to zero out the padding embedding vector
        2. after each gradient update, apply this op again to keep the padding row at zero '''
        # clear_word_embedding_padding = tf.scatter_update(char_embedding, [0], tf.constant(0.0, shape=[1, char_embed_size]))

    ''' Finally, do LSTM '''
    with tf.variable_scope('LSTM'):
        RL = PolicyGradient(n_actions=n_actions, n_features=200)

        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(rnn_size)

        def attn_cell():
            return tf.contrib.rnn.DropoutWrapper(lstm_cell(), output_keep_prob=1. - dropout)
        cell1 = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(num_rnn_layers)])

        initial_rnn_state1 = cell1.zero_state(batch_size, dtype=tf.float32)

        inputs = tf.reshape(
            input_embedded, [batch_size, num_unroll_steps, rnn_size])
        inputs_list = [tf.squeeze(x, [1])
                       for x in tf.split(inputs, num_unroll_steps, 1)]

        layer1_outputs, final_rnn_state1 = tf.contrib.rnn.static_rnn(cell1, inputs_list,
                                                                     initial_state=initial_rnn_state1, dtype=tf.float32)

        cell2 = LSTMCell(rnn_size, RL, lamda)
        cell2 = tf.contrib.rnn.DropoutWrapper(
            cell2, output_keep_prob=1. - dropout)
        initial_rnn_state2 = cell2.zero_state(batch_size, dtype=tf.float32)
        layer2_outputs, final_rnn_state2 = tf.contrib.rnn.static_rnn(cell2, layer1_outputs,
                                                                     initial_state=initial_rnn_state2, dtype=tf.float32)
        # (time, batch, 1) => (batch, time)
        actions = tf.transpose(tf.squeeze(RL.actions))
        # (time, batch, n_actions) => (batch, time, n_actions)
        all_act_prob = tf.transpose(RL.all_act_prob, perm=(1, 0, 2))

        # linear projection onto output (word) vocab
        logits = []
        with tf.variable_scope('WordProjection') as scope:
            for idx, output in enumerate(layer2_outputs):
                if idx > 0:
                    scope.reuse_variables()
                logits.append(linear(output, word_vocab_size))

    return adict(
        input=input_word,
        # clear_char_embedding_padding = clear_char_embedding_padding,
        input_embedded=input_embedded,
        initial_rnn_state1=initial_rnn_state1,
        initial_rnn_state2=initial_rnn_state2,
        final_rnn_state1=final_rnn_state1,
        final_rnn_state2=final_rnn_state2,
        rnn_outputs=layer2_outputs,
        logits=logits,
        all_act_prob=all_act_prob,
        actions=actions
    )
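
# Hedged usage sketch for inference_graph(): how the returned fields might be
# driven in a TF1 session. `feed_batch` is an illustrative placeholder, and the
# surrounding imports (LSTMCell, linear, adict, PolicyGradient) are assumed to
# come from the truncated top of this file:
# g = inference_graph(word_vocab_size=10000, batch_size=20, num_unroll_steps=35)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     feed_batch = np.zeros([20, 35], dtype=np.int32)   # dummy batch of word ids
#     step_logits, step_actions = sess.run([g.logits, g.actions],
#                                          feed_dict={g.input: feed_batch})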
Example #15
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

DISPLAY_REWARD_THRESHOLD = 400
RENDER = False

env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped

n_actions = env.action_space.n
n_features = env.observation_space.shape[0]

RL = PolicyGradient(n_actions=n_actions,
                    n_features=n_features,
                    learning_rate=0.02,
                    reward_decay=0.99)

for i_episode in range(3000):
    observation = env.reset()

    while True:
        if RENDER: env.render()
        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)

        if done:
            ep_rs_sum = sum(RL.ep_rs)
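            # --- Hedged continuation sketch: the listing cuts off here. The usual
            # pattern in these examples is a moving-average reward, an optional
            # render switch, then one policy-gradient update per episode. ---
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True                 # start rendering once the agent does well
            print("episode:", i_episode, "  reward:", int(running_reward))
            RL.learn()                        # one update on the stored episode
            break

        observation = observation_            # continue from the new state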
Example #16
DISPLAY_REWARD_THRESHOLD = 2.5  # renders environment if total episode reward is greater than this threshold
RENDER = False  # rendering wastes time

env = gym.make('SpaceX-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.001,
    reward_decay=0.99,
    save_path=".\\network.nt",
    #output_graph=True,
)

for i_episode in range(2000):

    observation = env.reset()  #observation=[x,x_dot]
    target = env.x_board, env.x_board_dot

    while True:
        # if i_episode >450 : RENDER = True
        if RENDER: env.render()

        if i_episode < I_TEACH:
            action = 1 if (observation[1] - target[1]) * (
Example #17
RENDER = False

env = gym.make('CartPole-v0')  # the CartPole simulation
# vanilla policy gradient has high episode-to-episode variance, so we pick a good random seed
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped  # remove the env's built-in limits

print(env.action_space)  # show the available actions
print(env.observation_space)  # show the observation space
print(env.observation_space.high)  # show the highest observation values
print(env.observation_space.low)  # show the lowest observation values

RL = PolicyGradient(  # define the agent
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,  # gamma
    # output_graph=True,  # write a TensorBoard file
)

for i_episode in range(3000):

    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)
Example #18
RENDER = False  # rendering wastes time

env = gym.make('MountainCar-v0')
env.seed(1)  # reproducible, general Policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.995,
    # output_graph=True,
)

for episode_i in range(1000):
    observation = env.reset()

    while True:
        if RENDER: env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        RL.store_transition(observation, action, reward)
Example #19
import gym
from RL_brain import PolicyGradient
import torch
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('CartPole-v0')
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(hidden_size=10,
                    num_inputs=env.observation_space.shape[0],
                    action_space=env.action_space)

total_steps = 0
# Set up lists to hold results
total_rewards = []
batch_rewards = []
batch_actions = []
batch_states = []
batch_counter = 1
batch_size = 10

for i_episode in range(2000):

    s_0 = env.reset()
    states = []
    rewards = []
    actions = []