Example #1
    total_packet = 1000,
    # Training hyper-parameters
    gamma = 0.99,
    eps = 0.3,
    seed = 0,
    decay_rate = 0.99,
    learning_rate = 1e-4,
    batch_size = 10,
    save_freq = 2000,
    log_freq = 10,
    # Evaluation
    test_mode = "pg",
)


env = Env(default_config)
# torch.manual_seed(default_config["seed"])


# Create the agent



# check & load pretrain model
if default_config["test_mode"] == "pg":
    agent = pg.Agent(default_config)
    if os.path.isfile('./data/pg_send_packet.pkl'):
        print('Load Policy Network parameters ...')
        agent.s_policy.load_state_dict(torch.load('./data/pg_send_packet.pkl'))
    if os.path.isfile('./data/pg_switch_channel.pkl'):
        print('Load Policy Network parameters ...')
Example #2
print("device", device)

if not os.path.exists(file_path):
    os.makedirs(file_path)

write_lr(lr)  # create a text file recording lr

now = datetime.datetime.now()
print('{0:%Y%m%d}'.format(now))

# tensorboardX
writer_x = SummaryWriter('tfbx2/' + '_' + '{0:%Y%m%d%H%M%S_}'.format(now) +
                         model_filename + MEMO + '/')

ban = Env(BANHEN, WINREN)
memory = ReplayMemory(CAPACITY, ban)
brain = Brain_dqn(NeuralNet_cnn, device, ban.size, ban, memory, GAMMA,
                  BATCH_SIZE, lr, T, BANHEN, BANSIZE)

match_is_continue = True  # whether the match is still in progress
train_is_continue = True  # whether to continue training
reward = 0  # reward
step = 0  # current move number
step_sum = 0
gen_num = 0  # initial model generation
episode_sum = 0  # cumulative number of episodes
search_depth = 3
ep_random_data = 0

log_print("lr is read from the text file")
Example #3
K_o_from_json = json_data['parameters']['K_o']
max_range_from_json = json_data['parameters']['Max_range']
N_mins_from_json = json_data['parameters']['N_mins']
d_none_from_json = json_data['parameters']['d_none']
d_perf_from_json = json_data['parameters']['d_perf']
delta_expl_angle_from_json = json_data['parameters']['delta_expl_angle']
xi_max_from_json = json_data['parameters']['xi_max']

scs_from_json = SCS(json_data['beacons'][0]['ID'],
                    max_range_from_json,
                    xi_max=xi_max_from_json,
                    d_perf=d_perf_from_json,
                    d_none=d_none_from_json)

env_from_json = Env(entrance_point_from_json,
                    obstacle_corners=obstacle_corners_from_json)

start_animation_from_min_ID = 0
stop_min_ID = 20  #N_mins_from_json#1#

scs_from_json.insert_into_environment(env_from_json)

mins2 = [
    Min(
        json_data['beacons'][i + 1]['ID'],  #i+1 because [0] is the SCS
        max_range_from_json,
        None,
        xi_max=xi_max_from_json,
        d_perf=d_perf_from_json,
        d_none=d_none_from_json,
        delta_expl_angle=delta_expl_angle_from_json)
Example #4
parser.add_argument('--render', action='store_true')
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--num_layers', type=int, default=2)
parser.add_argument('--num_neurons', type=int, default=15)
parser.add_argument('--repeat_num', default=0, type=int)
parser.add_argument('--update_every', type=int, default=1)
parser.add_argument('--env', default='binary', choices=['binary', 'cart', 'mount'])
parser.add_argument('--update_type', default='async', choices=['sync', 'async'])
parser.add_argument('--train_last', action='store_true', help='Only train the last neuron')
parser.add_argument('--reward_type', default='task', choices=['all','task','bio','bio_then_all'])
args = parser.parse_args()

# CREATE ENVIRONMENT
if args.env == 'cart': env = gym.make('CartPole-v1')
elif args.env == 'mount': env = gym.make('MountainCar-v0')
else: env = Env()  # custom 'binary' environment

# BUILD NETWORK AND SET ENV THRESHOLD
network        = Network(args, input_space=env.observation_space.shape, num_outputs=env.action_space.n)
stop_threshold = 75 if args.env == 'binary' else 300

for e in range(1,20000):
    done      = False
    state     = env.reset()
    ep_reward = 0
    

    while not done:
        # take action
        action = network.forward(state)
        # if args.env == 'mount' and action == 1: action += 1
Example #5
def main():
    env = Env(HEIGHT, WIDTH, Human, Zombie, Bat)
    generate(env)
Example #6
        ]),
    ]

    open_w_sq_obs = [
        np.array([
            [-1, -1],
            [-1, 12],
            [12, 12],
            [12, -1],
        ]),
        np.array([[2, 2], [2, 9], [9, 9], [9, 2]])
    ]

    env = Env(
        np.array([0, 0]),
        obstacle_corners=open_small  # alternatives: open_large, open_w_sq_obs, obs_zig_zag, []
    )
    data['environment'].append(env.toJson())

    # %%Parameter initialization
    max_range = 3
    _xi_max = 1
    _d_perf = 0.1
    _d_none = 2.5
    _delta_expl_angle = 0  #np.pi/4 #np.pi/6
    _K_o = 0.9

    N_mins = 6
    file_path = r'json_files\ds_test_123.json'
    dt = 0.01
Example #7
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)
    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' +
          str(action_angular_max) + ' rad/s')

    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        var = 1.

        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(np.random.normal(a[0], var), 0., 1.)
                a[1] = np.clip(np.random.normal(a[1], var), -0.5, 0.5)

                state_, r, done, arrive = env.step(a, past_action)
                time_step = agent.perceive(state, a, r, state_, done)

                if arrive:
                    result = 'Success'
                else:
                    result = 'Fail'

                if time_step > 0:
                    total_reward += r

                if time_step % 10000 == 0 and time_step > 0:
                    print(
                        '---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward = ', avg_reward)
                    avg_reward_his.append(round(avg_reward, 2))
                    print('Average Reward:', avg_reward_his)
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    var *= 0.9999

                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Var: %.2f' % var,
                          '| Time step: %i' % time_step, '|', result)
                    one_round_step = 0

                if done or one_round_step >= 500:
                    print('Step: %3i' % one_round_step, '| Var: %.2f' % var,
                          '| Time step: %i' % time_step, '|', result)
                    break

    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, r, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0

                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
Example #8
    def play_episode(self, n_tot):

        self.beta_net.eval()
        self.beta_target.eval()

        self.pi_net.eval()
        self.pi_target.eval()

        self.vb_net.eval()
        self.vb_target.eval()

        self.q_net.eval()
        self.q_target.eval()

        self.qb_net.eval()
        self.qb_target.eval()

        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=np.int)
            mask = Variable(torch.FloatTensor(
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]),
                            requires_grad=False).cuda()
            j = 0
            temp = 1

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                beta, phi = self.beta_net(s)
                pi, _ = self.pi_net(s)
                q, _ = self.q_net(s)
                vb, _ = self.vb_net(s)

                pi = beta.squeeze(0)
                self.greedy = False

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    # eps = np.random.rand()
                    eps = 1
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.01:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                q = q[0, a]
                q = q.squeeze(0)

                env.step(a)

                yield {
                    'o': env.s.cpu().numpy(),
                    'v': vb.squeeze(0).data.cpu().numpy(),
                    'vb': vb.squeeze(0).data.cpu().numpy(),
                    'qb': q.squeeze(0).data.cpu().numpy(),
                    # 's': x[0, :512].data.cpu().numpy(),
                    'score': env.score,
                    'beta': pi.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'q': q.squeeze(0).data.cpu().numpy()
                }

                j += 1

        return  # end of the generator (explicitly raising StopIteration is an error under PEP 479)
Example #9
from environment import Environment as Env
from helper_funcs import print_query, print_info

ENVIRONMENT_SETTINGS_FILE = "environment_settings_test.txt"

# print_query("Please enter K vehicle loss penalty value:")
# inp = raw_input()
inp = ''
if str(inp) == '':
    env = Env(ENVIRONMENT_SETTINGS_FILE)
else:
    env = Env(ENVIRONMENT_SETTINGS_FILE, int(inp))

env.simulation()
Example #10
    def play_episode(self, n_tot):

        self.model.eval()
        self.model_b.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=np.int)
            mask = Variable(torch.FloatTensor([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                             requires_grad=False).cuda()
            j = 0
            temp = 1

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                beta, vb, qb, _, _ = self.model_b(s, self.actions_matrix)
                pi, v, q, adv, x = self.model(s, self.actions_matrix, beta.detach())

                pi = pi.squeeze(0)
                self.greedy = False

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi/temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                q = q[0, a, 0]
                q = q.squeeze(0)

                qb = qb[0, a, 0]
                qb = qb.squeeze(0)

                env.step(a)

                yield {'o': env.s.cpu().numpy(),
                       'v': v.squeeze(0).data.cpu().numpy(),
                       'vb': vb.squeeze(0).data.cpu().numpy(),
                       'qb': qb.squeeze(0).data.cpu().numpy(),
                       's': x[0, :512].data.cpu().numpy(),
                       'score': env.score,
                       'beta': pi.data.cpu().numpy(),
                       'phi': x[0, :512].data.cpu().numpy(),
                       'q': q.squeeze(0).data.cpu().numpy()}

                j += 1

        return
Example #11
    def play(self, n_tot, action_offset, player):

        self.beta_net.eval()
        self.beta_target.eval()

        self.pi_net.eval()
        self.pi_target.eval()

        self.vb_net.eval()
        self.vb_target.eval()

        self.q_net.eval()
        self.q_target.eval()

        self.qb_net.eval()
        self.qb_target.eval()

        env = Env(action_offset)

        n_human = 90

        episodes = list(self.data.keys())
        random.shuffle(episodes)
        humans_trajectories = iter(episodes)

        for i in range(n_tot):

            env.reset()
            trajectory = self.data[next(humans_trajectories)]
            choices = np.arange(self.global_action_space, dtype=np.int)
            random_choices = self.mask_q.data.cpu().numpy()
            random_choices = random_choices / random_choices.sum()

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)

                if player == 'beta':
                    pi, _ = self.beta_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = False

                elif player == 'q_b':
                    pi, _ = self.qb_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = True

                elif player == 'pi':
                    pi, _ = self.pi_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = False

                elif player == 'q_pi':
                    pi, _ = self.q_net(s)
                    pi = pi.squeeze(0)
                    self.greedy = True

                else:
                    raise NotImplementedError

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # eps = 1
                    # a = np.random.choice(choices)
                    if self.greedy:
                        if eps > 0.01:
                            a = (pi * self.mask_q).data.cpu().numpy()
                            a = np.argmax(a)
                        else:
                            a = np.random.choice(choices, p=random_choices)
                    else:
                        a = F.softmax(pi + self.mask_beta,
                                      dim=0).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                j += 1

            yield {'score': env.score, 'frames': j}

        return
Example #12
            if not args.test:
                self.replay_memory.append(
                    (history, action, reward, next_history, end))
                self.priority.append(priority)
            history = next_history
            action = next_action
            if frame > 2000:
                raise ValueError('Loop bug')
        if maxv.size()[0] > 0:
            self.writer.add_scalar('maxv', maxv.mean(), round_num)
        if actions.size()[0] > 0:
            self.writer.add_scalar('action', actions.mean(), round_num)
        self.writer.add_scalar('epsilon', self.epsilon, round_num)
        self.writer.add_scalar('frame', frame, round_num)
        gc.collect()
        if env.win:
            print("Round {} Win: reward:{}, frame:{}".format(
                round_num, reward, frame))
            self.win = True
        else:
            print("Round {} Lose: reward:{}, frame:{}".format(
                round_num, reward, frame))
            self.win = False
        return reward


if __name__ == "__main__":
    env = Env(args.height, args.width, args.frame_time)
    actor = Actor()
    actor.main()
Example #13
File: builtin.py Project: diiq/oyoy
    def __init__(self, function, args):
        self.function = function
        self.lambda_list = args
        self.bindings = Env(None, None)
Example #14
    theta = 1.
    mu = np.array([0, 0])
    sigma = np.array([0, 1])

    # initialize the RL-agent:
    agent = AgentReinforce(dim_state=dim_state,
                           dim_actions=dim_actions,
                           hidden_dims=hidden_dims,
                           optimizer=optimizer,
                           gamma=gamma)

    # initialize the environment:
    env = Env(start=start,
              tcost=tcost,
              horizon=horizon,
              w=w,
              theta=theta,
              mu=mu,
              sigma=sigma)

    # TRAINING #

    print("\n===TRAINING===\n")

    trained_agent, train_loss, train_states, \
        train_actions, train_rewards = train_reinforce(agent=agent,
                                                       environment=env,
                                                       episodes=train_episodes,
                                                       policy_update=pi_update)

    # SIMULATION #
Example #15
"""
All information on README.md
"""

import tensorflow as tf
from environment import Env
import numpy as np
import time
import model

steps = 1000
env = Env(vision=True)
ob = env.reset(relaunch=True)
print(ob)

###=================== Play the game with the trained model
# while True:
#     env = Env(vision=True)
#     ob = env.reset(relaunch=True)
#     loss = 0.0
#     for i in range(steps):
#         image = scipy.misc.imresize(ob, [66, 200]) / 255.0
#         degrees = model.y.eval(feed_dict={model.x: [image], model.keep_prob: 1.0})[0][0]
#         ob, reward, done, _ = env.step(act)
#         if done is True:
#             break
#         else:
#             ob_list.append(ob)
#
#     print("PLAY WITH THE TRAINED MODEL")
#     print(reward_sum)
Example #16
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)

    # import ipdb
    # ipdb.set_trace()

    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' +
          str(action_angular_max) + ' rad/s')
    print('Action Min: ' + str(action_linear_min) + ' m/s and ' +
          str(action_angular_min) + ' rad/s')

    #########################################################################################
    #                                 Training
    #########################################################################################
    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        action_var = 0.2
        success_rate = 0

        # Log path setting
        now = datetime.datetime.now()
        logdir = now.strftime('%Y-%m-%d') + '_' + now.strftime('%H-%M')
        logdir = os.path.join(log_dir, logdir)
        # tb_writer = SummaryWriter(logdir)

        # Start training
        start_time = time.time()
        for itr in range(10000):
            state = env.reset()

            # episode_reward = 0.0
            # For each episode
            for cur_step in range(max_episode_length):
                action = agent.action(state)
                action[0] = np.clip(np.random.normal(action[0], action_var),
                                    action_linear_min, action_linear_max)
                action[1] = np.clip(np.random.normal(action[1], action_var),
                                    action_angular_min, action_angular_max)

                state_, reward, done, arrive = env.step(action, past_action)
                time_step = agent.perceive(state, action, reward, state_, done)

                ########################################################################################
                #                                   debugging environment
                ########################################################################################
                if is_debugging:
                    print('cur_step: {}'.format(cur_step))
                    print('action: {}'.format(action))
                    print('goal position: x:{}, y:{}'.format(
                        env.goal_position.position.x,
                        env.goal_position.position.y))
                    print('r: {}, done: {}, arrive: {}'.format(
                        reward, done, arrive))
                ########################################################################################

                result = 'Success' if arrive else 'Fail'

                if time_step > 0:
                    total_reward += reward

                if time_step % 10000 == 0 and time_step > 0:
                    print(
                        '---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward: {}'.format(avg_reward))
                    avg_reward_his.append(round(avg_reward, 2))
                    # writer.add_scalar('avg_reward', avg_reward, time_step)
                    print('Overall average Reward: {}'.format(avg_reward_his))
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    action_var *= 0.9999

                past_action = action
                state = state_

                if arrive or done or cur_step >= max_episode_length:
                    if result == 'Success':
                        success_rate += 1
                    sec = time.time() - start_time
                    elapsed_time = str(
                        datetime.timedelta(seconds=sec)).split('.')[0]
                    print(
                        'Num_episode: {}, Full steps: {}, Result: {}, Elapsed time: {}'
                        .format(itr, cur_step, result, elapsed_time))

                    if itr % 20 == 0 and itr > 0:
                        print('Total: {}/20, Success rate: {}'.format(
                            success_rate, round(success_rate / 20, 2)))
                        success_rate = 0

                    break


#########################################################################################
#                                 Testing
#########################################################################################
    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)
                state_, reward, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0

                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
Example #17
            observation = observation_
            if done:
                if episode >= ep_max - 11:
                    fid_10 = max(fid_10, fid)
                break

            step += 1

    return fid_10


if __name__ == "__main__":

    dt_ = np.pi / 20
    env = Env(
        action_space=list(range(2)),  #allow two actions
        dt=dt_)

    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.99,
        replace_target_iter=200,
        memory_size=2000,
        e_greedy_increment=0.001,
    )
    fidelity = run_maze()
    print("Final_fidelity=", fidelity)
Example #18
def evl(ast, env):
    global stack
    #print(ast)
    while True:
        #print(ast)
        if isinstance(ast, tuple):
            if len(ast) == 0: return ast

            ast = macroexpand(ast, env)
            if not isinstance(ast, tuple): return eval_ast(ast, env)

            if isinstance(ast[0], Keyword):
                hm = evl(ast[1], env)
                return hm[ast[0].name]

            if isinstance(ast[0], Name):
                if ast[0].name == "def!":
                    value = evl(ast[2], env)
                    return env.set(ast[1].name, value)

                if ast[0].name == "defmacro!":
                    value = evl(ast[2], env)
                    value.is_macro = True
                    return env.set(ast[1].name, value)

                if ast[0].name == "let*":
                    new_env = Env(env, [], [])
                    binding_list = ast[1]

                    for i in zip(binding_list[::2], binding_list[1::2]):
                        data = evl(i[1], new_env)
                        new_env.set(i[0], data)

                    ast, env = ast[2], new_env
                    continue

                if ast[0].name == "try*":
                    if len(ast) < 3:
                        return evl(ast[1], env)

                    try:
                        return evl(ast[1], env)
                    except BaslException as e:
                        new_env = Env(env, [ast[2][1].name], [e])
                        return evl(ast[2][2], new_env)
                    except Exception as e:
                        new_env = Env(env, [ast[2][1].name], [str(e)])
                        return evl(ast[2][2], new_env)

                if ast[0].name == "raise":
                    s = "{}:{}:{}".format(
                        env.get("*file*"), ast[0].name,
                        ast[0].line) if isinstance(
                            ast[0], Name) else "LAMBDA<" + ast[0] + ">"
                    raise BaslException(evl(ast[1], env), [*env.stack, s])

                if ast[0].name == "quote":
                    return ast[1]

                if ast[0].name == "macroexpand":
                    return macroexpand(ast[1], env)

                if ast[0].name == "quasiquoteexpand":
                    return quasiquote(ast[1])

                if ast[0].name == "quasiquote":
                    ast = quasiquote(ast[1])
                    continue

                if ast[0].name == "do":
                    res = None
                    for x in ast[1:-1]:
                        res = evl(x, env)
                    ast = ast[-1]
                    continue

                if ast[0].name == "if":
                    if len(ast) < 3:
                        ast, env = None, env
                        continue
                    res_cond = evl(ast[1], env)

                    if type(res_cond) == bool and res_cond == True:
                        ast = ast[2]
                        continue
                    if type(res_cond) == int:
                        ast = ast[2]
                        continue
                    if type(res_cond) == float:
                        ast = ast[2]
                        continue
                    if type(res_cond) == list:
                        ast = ast[2]
                        continue
                    if type(res_cond) == tuple:
                        ast = ast[2]
                        continue
                    if type(res_cond) == str:
                        ast = ast[2]
                        continue
                    if type(res_cond) == Fn:
                        ast = ast[2]
                        continue
                    if type(res_cond) == Keyword:
                        ast = ast[2]
                        continue
                    if type(res_cond) == Name:
                        ast = ast[2]
                        continue
                    if type(res_cond) == types.LambdaType:
                        ast = ast[2]
                        continue
                    if type(res_cond) == Atom:
                        ast = ast[2]
                        continue

                    ast = ast[3] if len(ast) >= 4 else None
                    continue

                if ast[0].name == "fn*":
                    body = ast[2]
                    params = ast[1]

                    func = lambda *e: evl(body, Env(env, params, e))
                    return Fn(body, params, env, func)

            [f, *args] = eval_ast(ast, env)

            if isinstance(f, Fn):
                s = "{}:{}:{}".format(
                    env.get("*file*"), ast[0].name, ast[0].line) if isinstance(
                        ast[0],
                        Name) else "LAMBDA<" + display(ast[0], True) + ">"
                ast, env = f.ast, Env(f.env, f.params, args, s)
                continue

            if isinstance(f, types.LambdaType):
                return f(*args)

        return eval_ast(ast, env)
Example #19
                                        length=R)
    assert raw_data.shape == (R, D) and end_data.shape == (R, )
    actions = np.array([-1, 0, 1])

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Make separate DQNs (mainDQN = behavior DQN, targetDQN)
        mainDQN = DQN(sess, D, S, C, LR, FLAGS.model_name, net_name="main")
        targetDQN = DQN(sess, D, S, C, LR, FLAGS.model_name, net_name="target")

        if S_MODE == "train":
            env_train = Env(num_data=(0, T),
                            raw_data=raw_data,
                            end_data=end_data,
                            actions=actions,
                            input_size=D,
                            seq_size=S,
                            name="train",
                            transaction_cost=TC)
            saver = train(sess, env_train, mainDQN, targetDQN)
            saver = tf.train.Saver()
            test(sess, env_train, mainDQN, saver, df, make_csv=True)
        elif S_MODE == "test":
            env_test = Env(num_data=(T, FLAGS.raw_data_length),
                           raw_data=raw_data,
                           end_data=end_data,
                           actions=actions,
                           input_size=D,
                           seq_size=S,
                           name="train",
                           transaction_cost=TC)
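# The two DQN instances above follow the usual double-network setup: "main" is
# the behavior network trained every step, while "target" is refreshed only
# occasionally to stabilize the bootstrap targets. Hedged sketch of such a
# periodic copy in TF1 style; it assumes the DQN class builds its variables
# under scopes named after its net_name, and the project's actual sync code is
# not shown above.
def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=dest_scope_name)
    # overwrite every trainable "target" variable with its "main" counterpart
    return [dest.assign(src.value()) for src, dest in zip(src_vars, dest_vars)]

# e.g. sess.run(get_copy_var_ops()) every few hundred learning steps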
Example #20
    120, 121, 122, 124, 125, 126, 127, 129, 130, 136, 142, 143, 144, 145, 146,
    147, 148, 150, 151, 152, 153, 154, 155, 156, 157, 1687, 181, 182, 183, 184,
    185, 186, 199, 400, 401, 402, 403, 405, 406, 407, 408, 409, 410, 411, 412,
    413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 424, 425, 426, 427, 432
]  # hardcoding

# What we want to watch:
# - sum of incentives (mints)
# - sum of rent fees
# - and their difference
if __name__ == "__main__":
    args = arguments.parser()
    print("> Setting:", args)

    # Env
    env = Env(num_stations, num_bikes_per_station)

    # Agent
    agents = [Agent(args.defaultBalance) for _ in range(args.users)]

    # logs
    bankrupts = list()
    incentives = list()  # mints
    fees = list()

    # logs for visualization
    log_dict = dict()
    log_dict["log_bankrupts"] = list()
    log_dict["log_total_incentive"] = list()
    log_dict["log_total_fee"] = list()
    log_dict["log_balance"] = list()
Example #21
    def data(self, eval_w_start, eval_w_end, eval_w_points):
        """
        Generates the state space over which the Q-value approximating network
        is to be evaluated and computes the true Q-values. Only works for
        environments with 2 periods per episode.

        Arguments
        ---------
        :param eval_w_start : float
               Lowest value of wealth component of the state in the evaluation
               state space.
        :param eval_w_end : float
               Highest value of wealth component of the state in the
               evaluation state space.
        :param eval_w_points : int
               Number of evenly spaced wealth components of the state
               between the lowest and highest value.

        Returns
        -------
        :returns x_train : ndarray
                 All states in evaluation state space.
        :returns y_train : ndarray
                 True Q-values for all states in the evaluation state space.
        """

        # initialize the RL-agent:
        agent = AgentDQN(dim_state=self.dim_state,
                         dim_actions=self.dim_actions,
                         hidden_dims=self.hidden_dims,
                         optimizer=Adam(),
                         gamma=self.gamma,
                         eps=self.eps,
                         eps_decay=self.eps_decay,
                         frozen=self.frozen,
                         pretrained=self.pretrained)

        # initialize the environment:
        env = Env(start=self.start,
                  tcost=self.tcost,
                  horizon=self.horizon,
                  w=self.w,
                  theta=self.theta,
                  regimes=self.regimes)

        assert env.horizon == 2

        x1 = np.arange(0, env.horizon) / env.horizon
        x2 = np.linspace(eval_w_start, eval_w_end, eval_w_points)
        x_train = np.array(np.meshgrid(x1, x2)).T.reshape(-1, 2)

        # which regimes operate in t=0 and t=1:
        idx0 = [0 in v["periods"] for v in env.regimes.values()].index(True)
        idx1 = [1 in v["periods"] for v in env.regimes.values()].index(True)
        r0 = list(env.regimes.keys())[idx0]  # regime in t=0
        r1 = list(env.regimes.keys())[idx1]  # regime in t=1

        mu0 = env.regimes[r0]["mu"]  # log-returns at t=0
        mu1 = env.regimes[r1]["mu"]  # log-returns at t=1
        sigma0 = env.regimes[r0]["sigma"]
        sigma1 = env.regimes[r1]["sigma"]

        assert np.array_equal(mu0, mu1) is True
        assert np.array_equal(sigma0, sigma1) is True

        y_train = []
        for s in x_train:
            if s[0] == 0:
                opt_w = compute_opt_weight(env, 1)
                tq = 2 * mu0[0] +\
                    (agent.action_space + opt_w) * \
                    (mu0[1] - mu0[0] + sigma0[1] ** 2 / 2) - \
                    0.5 * (agent.action_space ** 2 + opt_w ** 2) * \
                    sigma0[1] ** 2 + np.log(s[1])
            else:
                tq = mu0[0] + agent.action_space * (mu0[1] - mu0[0]) \
                     + \
                     0.5 * agent.action_space * (1 - agent.action_space) * \
                     sigma0[1] ** 2 + np.log(s[1])
            y_train.append(tq)

        self.x = x_train
        self.y = np.array(y_train)

        return self.x, self.y
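# Hedged usage sketch for the data() method documented above, assuming
# `evaluator` is an already-constructed instance of this class wrapping a
# 2-period environment (the instance name is illustrative only):
x_eval, y_true = evaluator.data(eval_w_start=0.5, eval_w_end=1.5, eval_w_points=50)
print(x_eval.shape)  # (2 * 50, 2): (normalized period, wealth) pairs
print(len(y_true))   # one row of true Q-values (one entry per action) per state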
Example #22
    def play_episode(self, n_tot):

        self.model.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax()

        # mask = torch.FloatTensor(consts.actions_mask[args.game])
        # mask = Variable(mask.cuda(), requires_grad=False)

        vsx = torch.FloatTensor(consts.short_bins[args.game])
        vlx = torch.FloatTensor(consts.long_bins[args.game])

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=np.int)

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                    s, self.actions_matrix)
                beta = beta.squeeze(0)
                pi_l = pi_l.squeeze(0)
                pi_s = pi_s.squeeze(0)
                pi_l_tau = pi_l_tau.squeeze(0)
                pi_s_tau = pi_s_tau.squeeze(0)

                temp = 1

                # consider only 3 most frequent actions
                beta_np = beta.data.cpu().numpy()
                indices = np.argsort(beta_np)

                maskb = Variable(torch.FloatTensor(
                    [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                                 requires_grad=False).cuda()
                # maskb = Variable(torch.FloatTensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                #                  requires_grad=False).cuda()

                # pi = maskb * (beta / beta.max())

                pi = beta
                self.greedy = False

                beta_prob = pi

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                vs = softmax(vs)
                vl = softmax(vl)
                vs = torch.sum(vsx * vs.data.cpu())
                vl = torch.sum(vlx * vl.data.cpu())

                yield {
                    'o': env.s.cpu().numpy(),
                    'vs': np.array([vs]),
                    'vl': np.array([vl]),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta_prob.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'qs': qs.squeeze(0).data.cpu().numpy(),
                    'ql': ql.squeeze(0).data.cpu().numpy(),
                }

                j += 1

        return
Example #23
    def train(self,
              env,
              checkpoint_interval,
              checkpoint_dir,
              saver,
              gamma=0.99):
        global T
        self.saver = saver

        # initialize environment
        time.sleep(3 * self.thread_id)
        env = Env(env, 84, 84, 4)

        print('Starting thread ' + str(self.thread_id))

        terminal = False
        # Get initial game observation

        state = env.get_initial_state()

        # episode's reward and cost
        episode_reward = 0
        total_cost = 0
        counter = 0

        while T < self.TMAX:

            # lists for feeding placeholders
            states = []
            actions = []
            prev_reward = []
            state_values = []

            t = 0
            t_start = t
            self.sess.run(self.sync_op)
            while not (terminal or ((t - t_start) == self.tmax)):

                # forward pass of network. Get probability of all actions
                probs, v = self.sess.run((self.policy, self.state_value),
                                         feed_dict={self.input_state: [state]})

                probs = probs[0]
                v = v[0][0]
                # print the outputs of the neural network for a sanity check
                if T % 2000 == 0:
                    print(probs)
                    print(v)

                # define list of actions. All values are zeros except , the
                # value of action that is executed
                action_list = np.zeros([self.output_size])

                # choose action based on policy
                action_index = sample_policy_action(probs)

                action_list[action_index] = 1

                # add state and action to list
                actions.append(action_list)
                states.append(state)

                state_values.append(v)

                # Gym executes action in game environment on behalf of actor-learner
                new_state, reward, terminal = env.step(action_index)

                # clip reward to -1, 1
                clipped_reward = np.clip(reward, -1, 1)
                prev_reward.append(clipped_reward)

                # Update the state and global counters
                state = new_state
                T += 1
                t += 1
                counter += 1

                # update episode's counter
                episode_reward += reward

                # Save model progress
                if T % checkpoint_interval < 200:
                    T += 200
                    self.saver.save(self.sess,
                                    checkpoint_dir + "/breakout.ckpt",
                                    global_step=T)

            if terminal:
                R_t = 0
            else:
                R_t = self.sess.run(self.state_value,
                                    feed_dict={self.input_state: [state]})
                R_t = R_t[0][0]

            state_values.append(R_t)
            targets = np.zeros((t - t_start))

            for i in range(t - t_start - 1, -1, -1):
                R_t = prev_reward[i] + gamma * R_t
                targets[i] = R_t

            # compute the advantage based on GAE
            # code from https://github.com/openai/universe-starter-agent
            delta = np.array(prev_reward) + gamma * np.array(
                state_values[1:]) - np.array(state_values[:-1])
            advantage = scipy.signal.lfilter([1], [1, -gamma],
                                             delta[::-1],
                                             axis=0)[::-1]

            # update the global network
            cost, _ = self.sess.run(
                (self.loss, self.opt),
                feed_dict={
                    self.input_state: states,
                    self.actions: actions,
                    self.targets: targets,
                    self.advantage: advantage
                })
            total_cost += cost

            if terminal:

                terminal = False
                print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \
                    episode_reward, "/ COST", total_cost/counter
                episode_reward = 0
                total_cost = 0
                counter = 0

                # Get initial game observation
                state = env.get_initial_state()
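# The advantage computation above uses a standard trick: filtering the reversed
# delta sequence with lfilter([1], [1, -gamma]) and reversing again gives the
# discounted cumulative sum, advantage[t] = sum_k gamma**k * delta[t + k].
# Small standalone check of that identity:
import numpy as np
import scipy.signal

gamma = 0.99
delta = np.array([1.0, 2.0, 3.0])
adv = scipy.signal.lfilter([1], [1, -gamma], delta[::-1], axis=0)[::-1]
expected = np.array([delta[0] + gamma * delta[1] + gamma ** 2 * delta[2],
                     delta[1] + gamma * delta[2],
                     delta[2]])
assert np.allclose(adv, expected)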
Example #24
agent = 0
"""Lower Manhattan"""
# G = ox.load_graphml('lowermanhattan.graphml')
# G = ox.project_graph(G)
# fig, ax = ox.plot_graph(G, node_size=0, edge_linewidth=0.5)
"""San Francisco"""
# G = ox.load_graphml('sanfrancisco.graphml')
# G = ox.project_graph(G)
# fig, ax = ox.plot_graph(G, node_size=0, edge_linewidth=0.5)
"""Piedmont, California"""
G = ox.load_graphml('piedmont.graphml')
G = ox.project_graph(G)
fig, ax = ox.plot_graph(G, node_size=0, edge_linewidth=0.5)

# initialize the environment for the learning agent
env = Env(n=N, fig=fig, ax=ax, agent=agent, dt=dt, animate=False)

# initialize the Keras training model
model = Sequential()
model.add(layers.InputLayer(batch_input_shape=(1, 10)))
model.add(layers.Dense(10, activation='sigmoid'))
model.add(layers.Dense(2, activation='linear'))
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# now execute Q learning
y = 0.95
eps = 0.5
decay_factor = 0.999
num_episodes = 10

r_avg_list = []
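# "now execute Q learning" above introduces a loop the snippet cuts off before
# showing. Hedged sketch of how such an epsilon-greedy Keras Q-learning loop
# typically continues; it assumes numpy is imported as np, env.reset() returns
# a length-10 state vector, and env.step(a) returns (new_state, reward, done),
# all of which are assumptions about this project's Env.
for i in range(num_episodes):
    s = np.asarray(env.reset()).reshape(1, 10)
    eps *= decay_factor                       # decay exploration each episode
    done = False
    r_sum = 0
    while not done:
        # epsilon-greedy choice between the two Q outputs
        if np.random.random() < eps:
            a = np.random.randint(0, 2)
        else:
            a = int(np.argmax(model.predict(s)))
        new_s, r, done = env.step(a)
        new_s = np.asarray(new_s).reshape(1, 10)
        # one-step Q-learning target for the action that was taken
        target_vec = model.predict(s)[0]
        target_vec[a] = r + y * np.max(model.predict(new_s))
        model.fit(s, target_vec.reshape(1, 2), epochs=1, verbose=0)
        s = new_s
        r_sum += r
    r_avg_list.append(r_sum)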
Example #25
        if state == [2, 2]:
            return 0.0
        return self.policy_table[state[0]][state[1]]

    def get_value(self, state):
        return round(self.value_table[state[0]][state[1]], 2)


def check_if_have_none_or_more_then_two_argument():
    return len(sys.argv) < 2 or len(sys.argv) > 2


def check_if_argument_value_invalid():
    return sys.argv[1] != 'i' and sys.argv[1] != 'ii' and sys.argv[1] != 'iii'


def exit_and_print_error():
    sys.exit('You should specify one argument: i, ii, or iii')


if __name__ == "__main__":
    if check_if_have_none_or_more_then_two_argument(
    ) or check_if_argument_value_invalid():
        exit_and_print_error()
    else:
        scenario = sys.argv[1]
        env = Env(scenario)
        policy_iteration = PolicyIteration(env, scenario)
        grid_world = GraphicDisplay(policy_iteration, scenario)
        grid_world.mainloop()
Example #26
def evaluate(ast, env):
    """ Evaluates an abstract syntax tree in a given environment """
    while True:  # Infinite loop used for tail call optimization
        # First check if the AST is a macro, and if so expand it.
        ast = macroexpand(ast, env)

        # If the ast is not a list, call the mutually recursive eval_ast() function on it.
        if not isinstance(ast, List):
            return eval_ast(ast, env)

        # Return the AST as it is, if it's just an empty sequence, as there's nothing more to be done.
        if len(ast) == 0:
            return ast
        elif not isinstance(ast[0], List) and ast[0] in special_forms:

            # This following section deals with applying the logic of each special form.
            form, args = ast[0], ast[1:]

            # def! assigns a value to a key in the current environment.
            if form == Symbol("def!"):
                value = evaluate(args[1], env)
                env.define(args[0], value)
                return value

            # let* evaluates a form in a temporary environment.
            elif form == Symbol("let*"):
                var_list = args[0]
                if isinstance(var_list,
                              (List, Vector)) and len(var_list) % 2 == 0:
                    new_env = Env(outer=env)
                    for i in range(0, len(var_list), 2):
                        new_env.define(var_list[i],
                                       evaluate(var_list[i + 1], new_env))
                    env = new_env
                    ast = args[1]
                    continue  # Tail call optimization
                else:
                    raise SyntaxError("Invalid argument list supplied.")

            # do evaluates all the elements of the list and returns the final evaluated one. This construct provides a way to execute things sequentially.
            elif form == Symbol("do"):
                for expr in args[:-1]:
                    last = evaluate(expr, env)
                ast = args[-1]
                continue  # Tail call optimization

            # if works as you'd expect. Note that it only evaluates the needed branch (the first argument if the condition is true, the second otherwise), and it is also tail call optimized.
            elif form == Symbol("if"):
                cond, true = args[0], args[1]
                false = args[2] if len(args) > 2 else None
                ev = evaluate(cond, env)
                if ev is False or ev == Nil():
                    if false is not None:
                        ast = false
                        continue
                    else:
                        return Nil()
                else:
                    ast = true
                    continue

            # fn* defines a lambda function, with the first argument being the parameter list, and the second being the function's body.
            elif form == Symbol("fn*"):
                params, body = args[0], args[1]

                def fn(*arguments):
                    return evaluate(body, Env(env, params, arguments))

                return Procedure(body, params, env, fn)

            # quote defers evaluation, just returning its argument as it is.
            elif form == Symbol('quote'):
                return args[0]

            # quasiquote enables a quoted list to have certain elements evaluated by way of unquote and splice-unquote.
            elif form == Symbol('quasiquote'):
                ast = quasiquote(args[0])
                continue  # Tail call optimized

            # defmacro! defines a new macro in the current environment.
            elif form == Symbol("defmacro!"):
                value = evaluate(args[1], env)
                value.is_macro = True
                env.define(args[0], value)
                return value

            # macroexpand allows explicitly calling the macroexpand function. This can aid in debugging macros.
            elif form == Symbol("macroexpand"):
                return macroexpand(args[0], env)

            # End of special forms logic

        else:
            # First evaluate the list that holds the AST
            evaluated = eval_ast(ast, env)

            # Procedures primarily represent user defined functions
            if isinstance(evaluated[0], Procedure):
                proc = evaluated[0]
                ast = proc.ast
                env = proc.make_env(Env, evaluated[1:])
                continue  # Tail call optimization

            # Callables represent the built-in functions or fully evaluated procedures
            elif callable(evaluated[0]):
                return evaluated[0](*evaluated[1:])

            # During evaluation, a Lisp list is expected to hold a function reference as its first element
            else:
                raise SyntaxError("First element of list is not a function.")
Example #27
            actions = np.array(self.actions)
            action_prob = tf.reduce_sum(actions * policies, axis=1)
            cross_entropy = -tf.math.log(action_prob + 1e-5)
            loss = tf.reduce_sum(cross_entropy * discounted_rewards)
            entropy = -policies * tf.math.log(policies)

        # update the model in the direction that reduces the loss
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        self.states, self.actions, self.rewards = [], [], []
        return np.mean(entropy)
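# The loss above is the REINFORCE objective: the negative log-probability of
# each taken action weighted by its discounted return, so a gradient step
# raises the probability of actions that led to high returns. Standalone toy
# check of that behaviour in TF2 style (matching the GradientTape usage above;
# not part of the project):
import tensorflow as tf

logits = tf.Variable([0.0, 0.0, 0.0])
action_onehot = tf.constant([0.0, 1.0, 0.0])  # pretend action 1 was taken
G = 1.0                                       # positive discounted return

with tf.GradientTape() as tape:
    policy = tf.nn.softmax(logits)
    action_prob = tf.reduce_sum(action_onehot * policy)
    loss = -tf.math.log(action_prob + 1e-5) * G

grads = tape.gradient(loss, [logits])
tf.keras.optimizers.SGD(learning_rate=0.1).apply_gradients(zip(grads, [logits]))
print(tf.nn.softmax(logits).numpy())          # probability of action 1 goes up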


if __name__ == "__main__":
    # create the environment and the agent
    env = Env(render_speed=0.01)
    state_size = 15
    action_space = [0, 1, 2, 3, 4]
    action_size = len(action_space)
    agent = REINFORCEAgent(state_size, action_size)

    scores, episodes = [], []

    EPISODES = 200
    for e in range(EPISODES):
        done = False
        score = 0
        # initialize the environment
        state = env.reset()
        state = np.reshape(state, [1, state_size])
Example #28
    def fn(*arguments):
        return evaluate(body, Env(env, params, arguments))
Example #29
        if state == [2, 2]:
            return []

        # calculate the Q value for every action and append the actions
        # with the maximum Q value to the action list
        for action in self.env.possible_actions:

            next_state = self.env.state_after_action(state, action)
            reward = self.env.get_reward(state, action)
            next_value = self.get_value(next_state)
            value = (reward + DISCOUNT_FACTOR * next_value)

            if value > max_value:
                action_list.clear()
                action_list.append(action)
                max_value = value
            elif value == max_value:
                action_list.append(action)

        return action_list

    def get_value(self, state):
        return round(self.value_table[state[0]][state[1]], 2)

if __name__ == "__main__":
    env = Env()
    value_iteration = ValueIteration(env)
    grid_world = GraphicDisplay(value_iteration)
    grid_world.mainloop()
Example #30
def get_num_actions():
    env = gym.make(FLAGS.game)
    env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)
    num_actions = len(env.gym_actions)
    return num_actions