Example #1
    n_games = 1000

    gammas = [float(args.gamma)]
    mem_sizes = [int(args.memory)]
    epsilon_decs = [float(args.eps_decay)]

    for gamma in gammas:
        for mem_size in mem_sizes:
            for epsilon_dec in epsilon_decs:
                print('Gamma {} - Mem Size {} - Epsilon Decay {}'.format(
                    gamma, mem_size, epsilon_dec))
                agent = Agent(gamma=gamma,
                              epsilon=1.0,
                              lr=lr,
                              input_dims=env.observation_space.shape,
                              n_actions=env.action_space.n,
                              mem_size=mem_size,
                              epsilon_dec=epsilon_dec,
                              batch_size=64,
                              epsilon_end=0.01,
                              saveModel='models/best_model.h5')
                scores = []
                avg_scores = []
                eps_history = []
                iter_time = []
                avg_times = []
                mem_full = []

                for i in range(n_games):
                    start = time.time()
                    done = False
                    score = 0
Example #2
File: main.py  Project: azm17/trade
    plt.title('NTN (6472)')
    plt.gcf().autofmt_xdate()
    plt.grid()
    plt.plot(x, y)


def print_result(obj, bpw, hold):
    print(obj.t.strftime("%Y-%m-%d"), '{:<5}'.format(str(obj.stock_price_t)),
          f'Agent: (BuyPower, Stock)=({bpw:.0f}, {hold})')


if __name__ == "__main__":
    s_time = '2020-01-11'  # trading start date
    e_time = '2020-11-11'  # trading end date
    sim = Simulator(s_time, e_time)  # create the simulator
    agent = Agent()  # create the agent

    buy_pw_agent = 100000  # buying power
    stockholdings_agent = 0  # number of shares held

    while (sim.e_time > sim.t):
        sim.next_day()
        date = sim.t.strftime("%Y-%m-%d")
        price = sim.stock_price_t
        if price != 'None':  # skip weekends and holidays
            # the agent makes its decision
            agent.make_decision(price, buy_pw_agent, stockholdings_agent, date)
            # execute the trade
            if agent.action == 'BUY':
                tmp_bp = buy_pw_agent - agent.volume * price
                tmp_sh = stockholdings_agent + agent.volume
Example #3
 def setUp(self):
     unittest.TestCase.setUp(self)
     
     agent = Agent()
     
     self.agents = [agent,]
Example #4
File: main.py  Project: tbourg/Tp-deep-rl
    parser.add_argument('env_id',
                        nargs='?',
                        default='CartPole-v1',
                        help='Select the environment to run')
    args = parser.parse_args()

    # You can set the level to logger.DEBUG or logger.WARN if you
    # want to change the amount of output.
    logger.set_level(logger.INFO)

    env = gym.make(args.env_id)
    rewards = []

    #env = wrappers.Monitor(env, force=True)
    #env.seed(0)
    agent = Agent(env)

    episode_count = 200
    reward = 0
    done = False

    for i in range(episode_count):
        ob = env.reset()
        prev_ob = ob
        episode_reward = 0
        while True:
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            agent.learn(prev_ob, action, ob, reward, done)
            prev_ob = ob
            episode_reward += reward
Example #5
 def getAgent(self):
     """
     Return the agent implemented by this algorithm.
     """
     return Agent(name=self.__str__(),
                  strategy=(lambda i, s: self.getAction(i, s)))
Example #6
import tornado.web
from tornado import gen
import os
import base64
import re
import json
import numpy as np
import time
import hyperparameters as hp
from agent import Agent
from action import Action
from PIL import Image
from io import BytesIO

static_path = os.path.join(os.getcwd(), "static")
agent = Agent()


class MainHandler(tornado.web.RequestHandler):
    def get(self):
        self.redirect("/static/v2.curves.html")
        # self.redirect("/static/v4.final.html")


class FrameHandler(tornado.web.RequestHandler):
    def post(self):
        data = json.loads(self.get_arguments("telemetry")[0])
        ar = np.frombuffer(base64.decodebytes(self.request.body),
                           dtype=np.uint8)
        image = ar.reshape(hp.INPUT_SIZE, hp.INPUT_SIZE, hp.NUM_CHANNELS)
        left, right, faster, slower = data["action"]
Example #7
def design_agent_and_env(FLAGS):
    """
    1. DESIGN AGENT

    The key hyperparameters for agent construction are

        a. Number of levels in agent hierarchy
        b. Max sequence length in which each policy will specialize
        c. Max number of atomic actions allowed in an episode
        d. Environment timesteps per atomic action

    See Section 3 of this file for other agent hyperparameters that can be configured.
    """

    FLAGS.layers = 3  # Enter number of levels in agent hierarchy

    FLAGS.time_scale = 10  # Enter max sequence length in which each policy will specialize

    # Enter max number of atomic actions.  This will typically be FLAGS.time_scale**(FLAGS.layers).  However, in the UR5 Reacher task, we use a shorter episode length.
    max_actions = 500
    # max_actions = 15

    timesteps_per_action = 15  # Provide the number of time steps per atomic action.
    """
    2. DESIGN ENVIRONMENT

        a. Designer must provide the original UMDP (S,A,T,G,R).
            - The S,A,T components can be fulfilled by providing the Mujoco model.
            - The user must separately specify the initial state space.
            - G can be provided by specifying the end goal space.
            - R, which by default uses a shortest path {-1,0} reward function, can be implemented by specifying two components: (i) a function that maps the state space to the end goal space and (ii) the end goal achievement thresholds for each dimension of the end goal.

        b.  In order to convert the original UMDP into a hierarchy of k UMDPs, the designer must also provide
            - The subgoal action space, A_i, for all higher-level UMDPs i > 0
            - R_i for levels 0 <= i < k-1 (i.e., all levels that try to achieve goals in the subgoal space).  As in the original UMDP, R_i can be implemented by providing two components:(i) a function that maps the state space to the subgoal space and (ii) the subgoal achievement thresholds.

        c.  Designer should also provide subgoal and end goal visualization functions in order to show video of training.  These can be updated in "display_subgoal" and "display_end_goal" methods in the "environment.py" file.

    """

    # Provide the file name of the Mujoco model (e.g., "pendulum.xml").  Make sure the file is stored in the "mujoco_files" folder.
    model_name = "ant_reacher.xml"

    # Provide initial state space consisting of the ranges for all joint angles and velocities.  In the Ant Reacher task, we use a random initial torso position and use fixed values for the remainder.

    initial_joint_pos = np.array([
        0, 0, 0.55, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -1.0, 0.0, -1.0, 0.0,
        1.0
    ])
    initial_joint_pos = np.reshape(initial_joint_pos,
                                   (len(initial_joint_pos), 1))
    initial_joint_ranges = np.concatenate(
        (initial_joint_pos, initial_joint_pos), 1)
    initial_joint_ranges[0] = np.array([-9.5, 9.5])
    initial_joint_ranges[1] = np.array([-9.5, 9.5])

    # Concatenate velocity ranges
    initial_state_space = np.concatenate(
        (initial_joint_ranges, np.zeros(
            (len(initial_joint_ranges) - 1, 2))), 0)

    # Provide end goal space.  The code supports two types of end goal spaces if user would like to train on a larger end goal space.  If user needs to make additional customizations to the end goals, the "get_next_goal" method in "environment.py" can be updated.

    # In the UR5 reacher environment, the end goal will be the desired joint positions for the 3 main joints.
    max_range = 9.5
    goal_space_train = [[-max_range, max_range], [-max_range, max_range],
                        [0.45, 0.55]]
    goal_space_test = [[-max_range, max_range], [-max_range, max_range],
                       [0.45, 0.55]]

    # Provide a function that maps from the state space to the end goal space.  This is used to (i) determine whether the agent should be given the sparse reward and (ii) for Hindsight Experience Replay to determine which end goal was achieved after a sequence of actions.
    project_state_to_end_goal = lambda sim, state: state[:3]

    # Set end goal achievement thresholds.  If the agent is within the threshold for each dimension, the end goal has been achieved and the reward of 0 is granted.

    # For the Ant Reacher task, the end goal will be the desired (x,y) position of the torso
    len_threshold = 0.5
    height_threshold = 0.2
    end_goal_thresholds = np.array(
        [len_threshold, len_threshold, height_threshold])
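    # Illustrative sketch (not part of the original file): using the projection and
    # thresholds above, the sparse {-1, 0} reward described in Section 2 would be
    # computed roughly as follows, where `sim`, `state` and `goal` are placeholders:
    #   achieved = np.all(np.absolute(project_state_to_end_goal(sim, state) - goal)
    #                     < end_goal_thresholds)
    #   reward = 0 if achieved else -1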

    # Provide range for each dimension of subgoal space in order to configure subgoal actor networks.  Subgoal space can be the same as the state space or some other projection out of the state space.

    # The subgoal space in the Ant Reacher task is the desired (x,y,z) position and (x,y,z) translational velocity of the torso
    cage_max_dim = 11.75
    max_height = 1
    max_velo = 3
    subgoal_bounds = np.array([[-cage_max_dim, cage_max_dim],
                               [-cage_max_dim, cage_max_dim], [0, max_height],
                               [-max_velo, max_velo], [-max_velo, max_velo]])

    # Provide state to subgoal projection function.
    # a = np.concatenate((sim.data.qpos[:2], np.array([4 if sim.data.qvel[i] > 4 else -4 if sim.data.qvel[i] < -4 else sim.data.qvel[i] for i in range(3)])))
    project_state_to_subgoal = lambda sim, state: np.concatenate(
        (sim.data.qpos[:2],
         np.array([1 if sim.data.qpos[2] > 1 else sim.data.qpos[2]]),
         np.array([
             3 if sim.data.qvel[i] > 3 else -3
             if sim.data.qvel[i] < -3 else sim.data.qvel[i] for i in range(2)
         ])))

    # Set subgoal achievement thresholds
    velo_threshold = 0.5
    quat_threshold = 0.5
    # subgoal_thresholds = np.array([len_threshold, len_threshold, height_threshold, quat_threshold, quat_threshold, quat_threshold, quat_threshold, velo_threshold, velo_threshold, velo_threshold])
    subgoal_thresholds = np.array([
        len_threshold, len_threshold, height_threshold, velo_threshold,
        velo_threshold
    ])

    # To properly visualize goals, update "display_end_goal" and "display_subgoals" methods in "environment.py"
    """
    3. SET MISCELLANEOUS HYPERPARAMETERS

    Below are some other agent hyperparameters that can affect results, including
        a. Subgoal testing percentage
        b. Subgoal penalty
        c. Exploration noise
        d. Replay buffer size
    """

    agent_params = {}

    # Define the percentage of actions for which a subgoal level (i.e., level i > 0) will test its subgoal action
    agent_params["subgoal_test_perc"] = 0.3

    # Define the subgoal penalty for missing a subgoal.  Please note that by default the Q-value target for missed subgoals does not include the Q-value of the next state (i.e., discount rate = 0).  As a result, the Q-value target for a missed subgoal just equals the penalty.  For instance, in this 3-level UR5 implementation, if a level proposes a subgoal and misses it, the Q target value for this action would be -10.  To incorporate the next state in the penalty, go to the "penalize_subgoal" method in the "layer.py" file.
    agent_params["subgoal_penalty"] = -FLAGS.time_scale

    # Define exploration noise that is added to both subgoal actions and atomic actions.  Noise added is Gaussian N(0, noise_percentage * action_dim_range)
    agent_params["atomic_noise"] = [0.2 for i in range(8)]
    agent_params["subgoal_noise"] = [
        0.2 for i in range(len(subgoal_thresholds))
    ]
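    # Illustrative only (not in the original snippet): these percentages are typically
    # scaled by each action dimension's range and added as zero-mean Gaussian noise during
    # exploration, e.g. for an atomic action `a` with per-dimension range `dim_range`:
    #   a_noisy = a + np.random.normal(0, np.array(agent_params["atomic_noise"]) * dim_range)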

    # Define number of episodes of transitions to be stored by each level of the hierarchy
    agent_params["episodes_to_store"] = 500

    # Provide training schedule for agent.  Training by default will alternate between exploration and testing.  Hyperparameter below indicates number of exploration episodes.  Testing occurs for 100 episodes.  To change number of testing episodes, go to "ran_HAC.py".
    agent_params["num_exploration_episodes"] = 100

    # For other relevant agent hyperparameters, please refer to the "agent.py" and "layer.py" files

    # Ensure environment customizations have been properly entered
    check_validity(model_name, goal_space_train, goal_space_test,
                   end_goal_thresholds, initial_state_space, subgoal_bounds,
                   subgoal_thresholds, max_actions, timesteps_per_action)

    # Instantiate and return agent and environment
    env = Environment(model_name, goal_space_train, goal_space_test,
                      project_state_to_end_goal, end_goal_thresholds,
                      initial_state_space, subgoal_bounds,
                      project_state_to_subgoal, subgoal_thresholds,
                      max_actions, timesteps_per_action, FLAGS.show)

    agent = Agent(FLAGS, env, agent_params)

    return agent, env
Example #8
    # But only the last row provides new information to each state, so we could simply get those values
    state = states[0].reshape(3, 8)[-1]
    table = BT()
    table.column_headers = state_vector_names
    table.append_row(state.tolist())
    print(table)

    env.close()

    # Test Agent
    # ----------

    state_size, action_size = brain.vector_observation_space_size, brain.vector_action_space_size
    agent = Agent(num_agents=num_agents,
                  state_size=state_size,
                  action_size=action_size)

    print('Capacity of the Actor (# of parameters): ',
          count_parameters(agent.actor_local))
    print('Capacity of the Critic (# of parameters): ',
          count_parameters(agent.critic_local))

# Training
# --------


#@timeit
def train(env):
    ''' Trains on an environment '''
Example #9
def train(env):
    ''' Trains on an environment '''

    global EPISODES
    global MAX_ITERS
    global PRINT_EVERY
    global LEARN_PERIOD
    global NUM_SAMPLES

    print('Loading environment...\n')
    env = UnityEnvironment(file_name=ENV)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    print('Loading agent...\n')
    num_agents = len(env_info.agents)
    state_size, action_size = brain.vector_observation_space_size, brain.vector_action_space_size
    agent = Agent(num_agents=num_agents,
                  state_size=state_size,
                  action_size=action_size)
    print('Capacity of the Actor (# of parameters): ',
          count_parameters(agent.actor_local))
    print('Capacity of the Critic (# of parameters): ',
          count_parameters(agent.critic_local))

    last_100_mean = -np.inf  # best average over the last 100 episodes seen so far
    scores_global = []
    scores_concur = deque(maxlen=PRINT_EVERY)

    try:

        print('Initializing training...\n')
        for e in range(1, EPISODES + 1):

            # Initialize Episode
            scores = np.zeros(num_agents)
            env_info = env.reset(train_mode=True)[brain_name]
            states = env_info.vector_observations  # get the current state (for each agent)

            agent.reset()
            t0 = time.time()

            # Run episode maximum until MAX_ITERS
            for i in range(MAX_ITERS):

                # Select an action for each Agent
                actions = agent.act(states)
                env_info = env.step(actions)[brain_name]

                # Observe result of the action
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done

                # Store score result
                scores += env_info.rewards

                # Make a step on the environment
                for state, action, reward, next_state, done in zip(
                        states, actions, rewards, next_states, dones):
                    agent.step(state, action, reward, next_state, done)

                if i % PRINT_EVERY == 0:
                    print('\rStep {}\tAverage Score: {:.2f}'.format(
                        i, np.mean(scores)),
                          end="")

                # Learn from experiences on the Replay Buffer
                if i % LEARN_PERIOD == 0:
                    for sample in range(NUM_SAMPLES):
                        agent.sampleandlearn()

                # End of the episode if any of the agents is done
                if np.any(dones):
                    break

                # Roll over states to next time step
                states = next_states

            # agent.sampleandlearn()

            deltatime = time.time() - t0

            score = np.mean(scores)
            scores_concur.append(score)
            scores_global.append(score)
            print('\rEpisode {}, Average last 100 scores: {:.2f}, Episode Duration: {:.2f}, \n'\
                  .format(e, np.mean(scores_concur), deltatime))

            # If last 100 episodes average score is the best 100 average seen - Save Models
            if np.mean(scores_concur) > last_100_mean:
                torch.save(agent.actor_local.state_dict(),
                           'checkpoint_actor_{}.pth'.format(e))
                torch.save(agent.critic_local.state_dict(),
                           'checkpoint_critic_{}.pth'.format(e))

            # Update current 100 mean
            last_100_mean = np.mean(scores_concur)

        print('Closing environment...\n')
        env.close()
        return agent, scores_global

    # If errors, close environment
    except Exception:
        env.close()
        print('There was an error while training')
        return None, None
Example #10
  env = gym.make('LunarLander-v2')
  env.seed(0)
  print('State shape: ', env.observation_space.shape)
  print('Number of actions: ', env.action_space.n)

  PRIORITIZED_REPLAY = True
  DUELING = True
  DDQN = True
  for ddqn in [True]:
    for dueling in [True]:
      for prioritized_replay in [True]:
        print('Using:')
        print(' * DDQN: ', ddqn)
        print(' * DUELING: ', dueling)
        print(' * PRIORITIZED_REPLAY: ', prioritized_replay)
        agent = Agent(state_size=8, action_size=4, seed=0, prioritized_replay=prioritized_replay, dueling=dueling, ddqn=ddqn)
        scores = dqn(n_episodes=2000)
  
        # plot the scores
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.plot(np.arange(len(scores)), scores)
        plt.ylabel('Score')
        plt.xlabel('Episode #')
        plt.show()
        
        for i in range(5):
            state = env.reset()
            for j in range(2000):
                action = agent.act(state)
                env.render()
Example #11
    current_NN.model.set_weights(m_tmp.get_weights())
    best_NN.model.set_weights(m_tmp.get_weights())
#otherwise just ensure the weights on the two players are the same
else:
    best_player_version = 0
    best_NN.model.set_weights(current_NN.model.get_weights())

#copy the config file to the run folder
copyfile('./config.py', run_folder + 'config.py')
plot_model(current_NN.model, to_file=run_folder + 'models/model.png', show_shapes = True)

print('\n')

######## CREATE THE PLAYERS ########

current_player = Agent('current_player', env.state_size, env.action_size, config.MCTS_SIMS, config.CPUCT, current_NN)
best_player = Agent('best_player', env.state_size, env.action_size, config.MCTS_SIMS, config.CPUCT, best_NN)
#user_player = User('player1', env.state_size, env.action_size)
iteration = 0

start = time.time()
for i in config.LOOP:
    while 1:

        iteration += 1
        reload(lg)
        reload(config)

        print('ITERATION NUMBER ' + str(iteration))

        lg.logger_main.info('BEST PLAYER VERSION: %d', best_player_version)
if __name__=="__main__":
    # currently runs 100 games of the agent against a random player. Takes about a minte

    random.seed(2)

    win=0
    loss=0
    tie=0
    turn=0

    n=10

    for i in range(n):
        game=Board()
        A=Agent(game)
        print("Game {}".format(i))
        while not game.end:
            
            A.make_move()
            game.random_move()
        
        if game.winner=='A': 
            win+=1
            turn+=game.turn

        elif game.winner=='B':
            loss+=1
            
        elif game.winner is None: tie+=1
Example #13
gamma = 0.999
eps_start = 1
eps_end = 0.01
eps_decay = 0.001
target_update = 10  # update the target network every 10 episodes
memory_size = 100000
lr = 0.001  #learning rate
num_episodes = 1000

# set the device: use GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# environment manager
em = CartPoleEnvManager(device)
#create the strategy
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)

#create agent
agent = Agent(strategy, em.num_actions_available(), device)
#create replay memory
memory = ReplayMemory(memory_size)

#create policy network and target network
#pass height and width to create appropriate input shape

policy_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)

target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
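# Hedged sketch (not part of the original snippet): with target_update = 10 above, the
# training loop would typically sync the target network every 10 episodes, e.g.
#
#   for episode in range(num_episodes):
#       ...  # interact with em, store transitions in memory, optimize policy_net
#       if episode % target_update == 0:
#           target_net.load_state_dict(policy_net.state_dict())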
Example #14
    """
    Get the player move as an index in the board
    """
    valid_moves = ["a1", "a2", "a3", "b1", "b2", "b3", "c1", "c2", "c3"]
    while True:
        move = input("Where would you like to go? (a2, b1, c3, ...): ")
        if move in valid_moves:
            return valid_moves.index(move)
        else:
            print("Invalid move: " + move)


comp = X
human = O

agent = Agent(plays=comp, episodes=10_000)

if __name__ == "__main__":
    playing = True
    while playing:
        env = TicTacToe()  # initialise game
        env.render()

        # if user does not go first then the agent makes a prediction
        if input("Would you like to go first? (yes/no): ").lower() != "yes":
            action = agent.predict(env)
            env.step(action, player=comp)
            env.render()

        game_over = False
        while not game_over:
Example #15
def main():
    
    HYPERPARAMS = {
        "breakout": {
            #"env_name": "BreakoutNoFrameskip-v4",
            "env_name": "PongNoFrameskip-v4",
            "gamma": 0.99, 
            "learning_rate": 0.003,
            "entropy_beta": 0.03,
            "batch_size": 128,
            "accumulation_steps": 10,
            "n_envs": 5, 
            "reward_steps": 4,
            "stop_reward": 500,
            "adam_eps": 1e-3,
        }
    }

    params = HYPERPARAMS["breakout"]

    device = T.device("cuda" if T.cuda.is_available() else "cpu")
    
    writer = SummaryWriter("run")
    env = GymEnvVec(params["env_name"], params["n_envs"])

    net = A2C(env.envs[0].observation_space.shape, env.envs[0].action_space.n)
    optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"], 
                           eps=params["adam_eps"])

    agent = Agent(net, params["batch_size"], params["entropy_beta"])
    exp_source = ExperienceSourceFirstLast(env, agent, params["gamma"], params["reward_steps"])
    
    batch = []

    with RewardTracker(writer, stop_reward=params["stop_reward"]) as tracker:
        for step, exp in enumerate(exp_source):
            batch.append(exp)

            #This part is only used to track the total reward.
            #If new_reward=True, it means the episode is done
            new_reward = exp_source.pop_total_reward()
            if new_reward:
                if tracker.reward(new_reward[0], step):
                    break
            
            if len(batch) < params["batch_size"]:
                continue 

            # Output the tuple (batch_states, batch_actions, batch_qvals)
            batch_args = unpack_batch(batch, net, params["gamma"], params["reward_steps"], device=device)
            batch.clear()

            optimizer.zero_grad()
            # update the network and collect metrics for logging
            kwargs = agent.learn(step, *batch_args, optimizer)
            
            writer.add_scalar("advantage",       kwargs["adv"].mean(), step)
            writer.add_scalar("values",          kwargs["critic_values"].mean(), step)
            writer.add_scalar("batch_rewards",   kwargs["batch_qvals"].mean(), step)
            writer.add_scalar("loss_entropy",    kwargs["entropy_loss"], step)
            writer.add_scalar("loss_policy",     kwargs["actor_loss"], step)
            writer.add_scalar("loss_value",      kwargs["actor_loss"], step)
            writer.add_scalar("loss_total",      kwargs["loss"], step)
            writer.add_scalar("grad_l2",         np.sqrt(np.mean(np.square(kwargs["grads"]))), step)
            writer.add_scalar("grad_max",        np.max(np.abs(kwargs["grads"])), step)
            writer.add_scalar("grad_var",        np.var(kwargs["grads"]), step)
Example #16
 def __init__(self, state_size, action_size, random_seed):
     self.agents = [Agent(state_size, action_size, 1, random_seed) for _ in range(2)]
Example #17
            communicationAPI.do_action(playerID, 1,
                                       self.initial_move(is_first=False))
        else:
            communicationAPI.do_action(playerID, 1,
                                       self.initial_move(is_first=True))
            self.update_resources()
            self.initial_move_opponent(
                communicationAPI.do_action(playerID, gameID,
                                           self.initial_move(is_first=False)))

        state = self.configure_state()
        while not self.is_terminated():
            self.update_resources()
            action = agent.get_action(state)
            new_state, new_position = self.update_state_after_my_action(action)

            send_action = action
            if action == 'move' or action == 'buildroad' or action == 'upgradetown':
                if new_position is not None:
                    send_action = send_action + ' ' + str(new_position)

            self.update_resources()
            opp_action = communicationAPI.do_action(playerID, 1, send_action)
            new_state = self.update_state_after_opp_action(opp_action)
            state = new_state


if __name__ == '__main__':
    game = Game(Map(), Agent(epsilon=0.1, gamma=0.9, alpha=1))
    game.train()
Example #18
l_gamma = [1]

run = 0
with open('restuls_6.txt', 'w+') as inf:
    for epsilon in l_epsilon:
        for epsilon_decay in l_epsilon_decay:
            for epsilon_min in l_epsilon_min:
                for alpha in l_alpha:
                    for gamma in l_gamma:
                        run += 1
                        inf.write(
                            f'\n\nrun : {run} ================================================'
                        )
                        agent = Agent(epsilon=epsilon,
                                      epsilon_decay=epsilon_decay,
                                      epsilon_min=epsilon_min,
                                      alpha=alpha,
                                      gamma=gamma)

                        inf.write(f'\nepsilon: {agent.epsilon}'
                                  f'\nepsilon_decay: {agent.epsilon_decay}'
                                  f'\nepsilon_min: {agent.epsilon_min}'
                                  f'\nalpha: {agent.alpha}'
                                  f'\ngamma: {agent.gamma}')

                        avg_rewards, best_avg_reward = interact(env, agent)

                        inf.write(f'\nBest avg reward: {best_avg_reward}')

        #                 break
        #             break
Example #19
  args.device = torch.device('cpu')


# Simple ISO 8601 timestamped logger
def log(s):
  print('[' + str(datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + '] ' + s)


# Environment
env = Env(args)
env.train()
action_space = env.action_space()


# Agent
dqn = Agent(args, env)
mem = ReplayMemory(args, args.memory_capacity)
priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)


# Construct validation memory
val_mem = ReplayMemory(args, args.evaluation_size)
T, done = 0, True
while T < args.evaluation_size:
  if done:
    state, done = env.reset(), False

  next_state, _, done = env.step(np.random.randint(0, action_space))
  val_mem.append(state, None, None, done)
  state = next_state
  T += 1
Example #20
    win = window.Window(width=500, height=500, vsync=True, resizable=True)
    glEnable(GL_BLEND)
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
    # needed so that egi knows where to draw
    egi.InitWithPyglet(win)
    # prep the fps display
    fps_display = clock.ClockDisplay()
    # register key and mouse event handlers
    win.push_handlers(on_key_press)
    win.push_handlers(on_mouse_press)
    win.push_handlers(on_resize)

    # create a world for agents
    world = World(500, 500)
    # add one agent
    world.agents.append(Agent(world))
    world.obstacles.append(Obstacle(world))
    # unpause the world ready for movement
    print("Controls: A to add an agent, O to add an object to the map, C to reset objects on the map, P to pause, and I to show direction info.")
    world.paused = False

    while not win.has_exit:
        win.dispatch_events()
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        # show nice FPS bottom right (default)
        delta = clock.tick()
        world.update(delta)
        world.render()
        fps_display.draw()
        # swap the double buffer
        win.flip()
Example #21
if __name__ == '__main__':

    # create environment object
    env = Environment()

    memory_fp = '/Users/ryan.osgar/Documents/repos/data_science/trex_memory/memory.pkl'
    save_path = '/Users/ryan.osgar/Documents/repos/data_science/noisy_model/model-weights'

    mem_length = 80000

    agent = Agent(env,
                  tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='mse',
                  memory_length=mem_length,
                  dueling=True,
                  noisy_net=True,
                  egreedy=True,
                  save_memory=memory_fp,
                  save_weights=save_path,
                  verbose_action=True)

    agent.load_weights(save_path)
    agent.load_memory(memory_fp)
    agent.set_beta_schedule(beta_start=0.9,
                            beta_max=1,
                            annealed_samplings=2000)
    # agent.set_epsilon_decay_schedule(0.000001, 0.0000001, 100)

    agent.pretraining_steps = 0
    print(f'pretraining for {agent.pretraining_steps} steps...')
Example #22
def _create_agents(config_list):
    """
    Create agents with different hyper-parameters.

    Parameters
    ----------
    config_list : list of dict
        List of parameters dict. Each dict has configurations
        such as model name, learning rate, etc..

    Returns
    -------
        Created agents list and core agent object.

    """
    try:
        agents = []
        for config in config_list:
            hyper_parameters = utils.Hyperparameter(
                batch_size=config["batch_size"],
                gamma=config["gamma"],
                eps_start=config["eps_start"],
                eps_end=config["eps_end"],
                eps_decay=config["eps_decay"],
                target_update=config["target_update"],
                default_durability=config["default_durability"],
                learning_rate=config["learning_rate"],
                initial_memory=config["initial_memory"],
                n_episode=config["n_episode"],
                n_actions=config["n_action"],
                default_durability_decreased_level=config[
                    "default_durability_decreased_level"],
                default_durability_increased_level=config[
                    "default_durability_increased_level"],
                default_check_frequency=config["default_check_frequency"],
                default_healing_frequency=config["default_healing_frequency"],
                env_name=config["env_name"],
                exp_name=config["exp_name"],
                render=config["render"],
                run_name=config["run_name"],
                output_directory_path=config["output_directory_path"],
                hyper_dash=config["hyper_dash"],
                model_saving_frequency=config["model_saving_frequency"],
                parameters_name=config["name"],
                roulette_mode=config["roulette_mode"],
                max_reward=config["max_reward"],
                min_reward=config["min_reward"])
            print(config["name"])
            if config["name"] != "core":
                if config["model"] == "DQN":
                    policy_net = models.DQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.DQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                elif config["model"] == "DDQN":
                    policy_net = models.DDQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.DDQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                elif config["model"] == "DQNbn":
                    policy_net = models.DQNbn(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.DQNbn(n_actions=4).to(
                        hyper_parameters.DEVICE)
                elif config["model"] == "NonBatchNormalizedDQN":
                    policy_net = models.NonBatchNormalizedDQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.NonBatchNormalizedDQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                # elif args["model"] == "RamDQN":
                #     policy_net = models.RamDQN(n_actions=4).to(hyper_parameters.DEVICE)
                #     target_net = models.RamDQN(n_actions=4).to(hyper_parameters.DEVICE)
                else:
                    policy_net = models.DQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                    target_net = models.DQN(n_actions=4).to(
                        hyper_parameters.DEVICE)
                optimizer = optim.Adam(policy_net.parameters(),
                                       lr=hyper_parameters.LEARNING_RATE)
                agents.append(
                    Agent(policy_net, target_net,
                          hyper_parameters.DEFAULT_DURABILITY, optimizer,
                          config["name"], hyper_parameters))
            else:
                # For core agent
                policy_net = models.NonBatchNormalizedDQN(n_actions=4).to(
                    hyper_parameters.DEVICE)
                target_net = models.NonBatchNormalizedDQN(n_actions=4).to(
                    hyper_parameters.DEVICE)
                optimizer = optim.Adam(policy_net.parameters(),
                                       lr=hyper_parameters.LEARNING_RATE)
                core_agent = Agent(policy_net, target_net,
                                   hyper_parameters.DEFAULT_DURABILITY,
                                   optimizer, config["name"], hyper_parameters)
            print("Agent:{} has been done".format(config["name"]))
        try:
            core_agent
        except Exception as e:
            print("P_RuntimeError:0x1000 Core agent has not been defined.")
            tb = sys.exc_info()[2]
            print(e.with_traceback(tb))
            sys.exit(1)
        return agents, core_agent
    except Exception as e:
        print("P_RuntimeError:0x1001 Some arguments is missing.")
        tb = sys.exc_info()[2]
        print(e.with_traceback(tb))
        sys.exit(1)
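# Hedged usage sketch (assumption, not from the original file): _create_agents expects a
# list of configuration dicts containing the keys read above (batch_size, gamma, eps_start,
# ..., model, name), including one entry whose "name" is "core", e.g.
#
#   config_list = json.load(open("configs.json"))  # hypothetical config file
#   agents, core_agent = _create_agents(config_list)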
Example #23
    # draw the board
    draw_board(canvas, board)

    # draw the agent
    draw_agent(canvas, agent)

    # write the output image
    canvas = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)  # convert BGR to RGB
    cv2.imwrite(f"./@share/out-istep1-{seq}.png", canvas)
    seq += 1

    return seq


BOARD1 = read_screen_csv(FILE_PATH)
AGENT1 = Agent()
AGENT1.location = BOARD1.start_location[:]  # Copy
AGENT1.prev_location = BOARD1.start_location[:]  # Copy

# for (row, columns) in enumerate(BOARD1.rows):
#    for (column, cell) in enumerate(columns):
#        print(f"[{column},{row}]={cell}")

print("Start...")

SEQ1 = 0
SEQ1 = search(SEQ1, BOARD1, AGENT1, AGENT1.location, screenshot_func)
# screenshot of the backward search
screenshot_func(SEQ1, BOARD1, AGENT1)

print("Finished.")
Example #24
    'window_size': 50,
    'batch_size': 32,
    'episode_count': '',
    'selected_model': 'baseline'
}

exclude_variables = ['next_close', 'next_returns', 'done']

args['stock_name'], \
    args['window_size'], \
    args['episode_count'] = \
    sys.argv[1], \
    int(sys.argv[2]), \
    int(sys.argv[3])

agent = Agent(args['window_size'])
data = get_train_data(args['stock_name'])

# data = data.head(1000)

data['next_close'] = data['close'].shift(-1)
data['next_close_diff'] = (data['next_close'] - data['close']) / data['close']

data['prev_close'] = data['close'].shift(1)
data['returns_eur'] = (
    (data['close'] - data['prev_close']) / data['prev_close']) + 1
data['returns_btc'] = ((1 / data['close'] - 1 / data['prev_close']) /
                       (1 / data['prev_close'])) + 1
data['close_diff'] = (data['close'] - data['prev_close']) / data['prev_close']
data['close_diff'].fillna(0, inplace=True)
data['diff'] = data['close'] - data['prev_close']
Example #25
    parser.add_argument(
        "--wallet",
        nargs=2,
        metavar=('walletname', 'walletpass'),
        help="The name and passphrase of the wallet to connect to.")
    parser.add_argument("--ephemeralwallet",
                        action="store_true",
                        help="Use ephemeral wallets")
    args = parser.parse_args()

    # Configure webapp
    LOOP = asyncio.get_event_loop()
    WEBAPP = web.Application()
    aiohttp_jinja2.setup(WEBAPP, loader=jinja2.FileSystemLoader('view'))

    AGENT = Agent()
    POST_MESSAGE_HANDLER = PostMessageHandler(AGENT.message_queue)
    WEBSOCKET_MESSAGE_HANDLER = WebSocketMessageHandler(
        AGENT.message_queue, AGENT.outbound_admin_message_queue)
    PROVISIONAL_CONNECTION_PROTOCOL_MESSAGE_HANLDER = \
        ProvisionalConnectionProtocolMessageHandler(AGENT.message_queue)

    ROUTES = [
        web.get('/', root),
        web.get('/ws', WEBSOCKET_MESSAGE_HANDLER.ws_handler),
        web.static('/res', 'view/res'),
        web.post('/indy', POST_MESSAGE_HANDLER.handle_message),
        web.post(
            '/offer',
            PROVISIONAL_CONNECTION_PROTOCOL_MESSAGE_HANLDER.handle_message)
    ]
Example #26
# initialize model
best_NN = residual_CNN(config.REG_CONST, config.LEARNING_RATE, (2,) + game.grid_shape, game.move_size,
                       config.HIDDEN_CNN_LAYERS)

# load model
best_version = config.INITIAL_MODEL_VERSION
print('Loading model ' + str(best_version) + '...')
model_temp = best_NN.read(best_version)
best_NN.model.set_weights(model_temp.get_weights())


print('\n')

# create players
best_player = Agent('best_player', game.state_size, game.move_size, config.MCTS_SIMS, config.CPUCT, best_NN)
user_player = User('player1', game.state_size, game.move_size)
iteration = 0
play_again = 'yes'

while play_again != 'no':

    print('\n')
    scores, _, points, sp_scores = play_matches_between_networks(game, -1, best_version, 1, turns_to_tau0=0, goes_first=0)
    print('\nScores: ')
    print(scores)
    print('\nFirst PLAYER / Second PLAYER SCORES')
    print(sp_scores)
    print('Play again?')
    play_again = input()
Example #27
from domain import Domain
from agent import Agent

if __name__ == "__main__":
    setting = int(
        input("Press 0 for deterministic setting or 1 for stochastic setting"))
    protocol = int(input("Choose the protocol you want to display: 1, 2 or 3"))
    domain = Domain()
    domain.setting = setting
    domain.update()

    if protocol == 1:
        print("Agent1 first protocol")
        print("-----------------------")
        agent1 = Agent(domain)
        agent1.train(100)
    elif protocol == 2:
        print("Agent2 second protocol")
        print("-----------------------")
        agent2 = Agent(domain)
        agent2.train2(100)
    elif protocol == 3:
        print("Agent3 third protocol")
        print("-----------------------")
        agent3 = Agent(domain)
        agent3.train3(100)
    else:
        print("You didn't choose a valid protocol")
        print("Protocols are between 1 and 3")
Example #28
from agent import Agent
from environment import ALE

tf.set_random_seed(123)
random.seed(123)

init_seed = int(sys.argv[1])
init_rand = int(sys.argv[2])

with tf.Session() as sess:

    # Init env
    env = ALE(init_seed, init_rand)

    # Init agent
    agent = Agent(sess, env.ale.getMinimalActionSet())
    action_repeat, screen_type = agent.getSetting()

    # Set env setting
    env.setSetting(action_repeat, screen_type)

    # Get a new game
    screen = env.new_game()

    # Start playing
    current_reward = 0
    for _ in range(5000):
        action = agent.play(screen)
        reward, screen, terminal = env.act(action)
        current_reward += reward
        if terminal:
Example #29
translation_field = robot_node.getField("translation")
rotation_field = robot_node.getField("rotation")

# initialize the left and right wheel motors
left_motor = super_visor.getMotor("left wheel motor")
right_motor = super_visor.getMotor("right wheel motor")
left_motor.setPosition(float("inf"))
right_motor.setPosition(float("inf"))
left_motor.setVelocity(0.0)
right_motor.setVelocity(0.0)

# set up the state, action and agent
action_space = 2
state_space = 4
max_reward = torch.tensor([2.0])
agent = Agent(state_space, action_space)

# the robot starts running
robot_name = robot_node.getDef()
print("Robot {} starts!".format(robot_name))

position = torch.tensor(translation_field.getSFVec3f()).unsqueeze(0)
orientation = torch.tensor([rotation_field.getSFRotation()[3]]).unsqueeze(0)

state = torch.cat((position, orientation), dim=1)

# step counter
step_count = 0

while super_visor.step(timestep) != -1:
Example #30
    "type": "LSTM",
    "units": 16,
    "return_sequences": True
}, {
    "type": "LSTM",
    "units": 16,
    "return_sequences": False
}, {
    "type": "Dense",
    "units": 16,
    "activation": "relu"
}, {
    "type": "Dense",
    "units": 16,
    "activation": "relu"
}]

q_model = Q_Model("GRU",
                  state_dim=env.get_state().shape,
                  no_of_actions=env.no_of_actions,
                  layers=dense_model,
                  hyperparameters={"lr": 0.0001})
agent = Agent(q_model, batch_size=8, discount_factor=0.8, epsilon=1)

no_of_episodes_train = 100
no_of_episodes_test = 100

sim = Simulator(env, agent)
sim.train(no_of_episodes_train, epsilon_decay=0.997)
agent.model.save()
sim.test(no_of_episodes_test)