Example #1
def pl_grad():
    environment = gym.make("CartPole-v1")
    net = nn.Sequential(nn.Linear(4, 40, bias=False), nn.ReLU(),
                        nn.Linear(40, 2, bias=False), nn.Softmax(dim=1))

    class distributionNet(nn.Module):
        def __init__(self):
            super(distributionNet, self).__init__()
            self.net = net

        def forward(self, x):
            return Categorical(self.net(x))

    a_model = distributionNet()
    optimizer = torch.optim.Adam(a_model.parameters(), lr=0.01)
    learner = PolicyGradient(environment,
                             a_model,
                             optimizer,
                             discount_factor=0.99)
    opt_policy, history = learner.learn_policy(epochs=500,
                                               episodes_per_update=1)

    plt.plot(history)
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.savefig("score.png")

    agent = Agent(environment=environment, policy=opt_policy)
    input("press Enter to watch the trained agent")
    agent.perform_episode(render=True)
Example #2
def main():
    """
    This function will be called for the training phase.
    """
    # Sample code for illustration; add your own code below to run in the test phase.
    # Load the trained model from the train/ directory.
    env = gym.make(MINERL_GYM_ENV)
    if FRAME_SKIP > 0:
        env = FrameSkip(env, enable_rendering=True)
    env = ObsWrapper(env)
    env = MoveAxisWrapper(env, -1, 0)
    env = CombineActionWrapper(env)

    agent = Agent(env.observation_space, env.action_space)
    agent.load_model()

    for _ in range(MINERL_MAX_EVALUATION_EPISODES):
        obs = env.reset()
        done = False
        netr = 0
        while not done:
            action = agent.act(obs)
            obs, reward, done, info = env.step(action)
            netr += reward
            env.render()

    env.close()
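The wrapper classes used above (FrameSkip, ObsWrapper, MoveAxisWrapper, CombineActionWrapper) are defined elsewhere in the competition code and are not shown here. As a rough illustration of the pattern only, here is a minimal sketch, an assumption rather than the actual MoveAxisWrapper, of how an observation wrapper that moves the channel axis (e.g. HWC to CHW) is typically written against the classic gym API:

import gym
import numpy as np


class MoveAxisWrapper(gym.ObservationWrapper):
    """Minimal sketch (assumed implementation): move one axis of image
    observations, e.g. HWC -> CHW, to match a conv network's expected layout."""

    def __init__(self, env, source, destination):
        super().__init__(env)
        self.source = source
        self.destination = destination
        # Assumes a Box observation space; rebuild its bounds with the moved axis.
        low = np.moveaxis(self.observation_space.low, source, destination)
        high = np.moveaxis(self.observation_space.high, source, destination)
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=low.dtype)

    def observation(self, obs):
        return np.moveaxis(obs, self.source, self.destination)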
Example #3
def plot_grid_2_mc():
    test_grids = TEST_GRIDS
    all_test_list = [(key, grid) for key, grid in test_grids.items()]
    all_test_list.sort(key=lambda x: x[0])  # sorted() returns a new list; sort in place instead
    agent = Agent()
    iters = ITERS
    total_normal_grid_score, total_grid1_score, total_grid2_score, total_grid3_score, total_grid4_score = [],[],[],[],[]
    repeats = REPEATS
    # for n in iters:
    #   print("Running iteration {n}".format(n=n))
    grid2_score, grid4_score = [], []
    for ind, grid_init in all_test_list:
        normalized_score = 0
        for j in range(repeats):
            grid_num = int(ind)  #ind initially is a string.
            if (grid_num < 200) or (grid_num > 300):
                continue

            best_reward = grid_init['best_reward']
            testgrid = Grid(5, random=False, init_pos=grid_init)
            if grid_num in {204, 208}:
                Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                         iters=500)
                _, _, mc_reward = agent.run_final_policy(testgrid.copy(),
                                                         Q,
                                                         display=True)
            else:
                continue
            normalized_score += mc_reward - best_reward
            if normalized_score != 0:
                print(
                    "Grid num {0} did not achieve best score".format(grid_num))
Example #4
def main():
    print("note: run 'ulimit -Sn 1024' if you hit Errno 24 (too many open files)")
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=417)
    parser.add_argument('--n-timesteps', type=int, default=int(1e5))
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--max-kl', type=float, default=1e-2)
    parser.add_argument('--log-interval', type=int, default=int(1e4))
    parser.add_argument('--save-path', default=None)
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--cuda', action='store_true')  # type=bool would treat any non-empty string as True
    parser.add_argument('--update-rule', default='A2C')
    args = parser.parse_args()

    if args.cuda:
        assert torch.cuda.is_available(), 'No available cuda devices'

    envs = [gym.make(args.env) for _ in range(args.batch_size)]
    set_seeds(envs, args.seed, args.cuda)

    agent = Agent(envs[0].observation_space, envs[0].action_space)
    if args.cuda:
        agent.cuda()

    rets = learn(agent, envs, args.update_rule, cuda=args.cuda,
                 n_timesteps=args.n_timesteps, gamma=args.gamma,
                 log_interval=args.log_interval, max_kl=args.max_kl)

    torch.save(rets, "./out/{}_{}".format(args.env, args.update_rule))

    if args.save_path is not None:
        torch.save(agent.state_dict(), args.save_path)
Example #5
    def __init__(self,
                 config,
                 agent=None,
                 save_path=None,
                 restore_dir=None,
                 device="cpu"):
        self.config = config
        self.train_config = config["training"]
        self.input_config = config["input"]
        self.best_metric = 0
        self.save_path = save_path

        self._matrix = DataMatrices.create_from_config(config)
        self.time_index = self._matrix._DataMatrices__global_data.time_index.values
        self.coins = self._matrix._DataMatrices__global_data.coins.values
        self.test_set = self._matrix.get_test_set()
        self.training_set = self._matrix.get_training_set()

        tf.random.set_seed(self.config["random_seed"])
        self.device = device
        self._agent = Agent(config,
                            time_index=self.time_index,
                            coins=self.coins,
                            restore_dir=restore_dir)

        self.keras_test = self._matrix.keras_batch(data="test")
Example #6
def run(load_path, model_path, index, load_model):

	with tf.device('/gpu:0'):
		trainer = tf.train.AdamOptimizer(learning_rate=1e-4)

		global_env = Environment(load_path=load_path, starting_index=index, final_index=index+1)
		global_net = A3C_Network()
		agent = Agent(0, global_net.n_inputs_policy, global_net.n_inputs_matching,
						global_net.n_actions_policy, trainer, load_path, model_path)

		saver = tf.train.Saver()

	with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
		
		coord = tf.train.Coordinator()

		if load_model:
			ckpt = tf.train.get_checkpoint_state(model_path)
			saver.restore(sess, ckpt.model_checkpoint_path)
			sess.run(global_env.index.assign(index))
			sess.run(global_env.final_index.assign(index+1))
		else:
			sess.run(tf.global_variables_initializer())

		agent_test = lambda: agent.test(sess, coord)
		t = threading.Thread(target=(agent_test))
		t.start()
		t.join()
Example #7
def main():
    parser = argparse.ArgumentParser(description='LUBAN runner')
    register_model_args(parser)
    params, unparsed = parser.parse_known_args(sys.argv[1:])  # skip the program name in argv[0]
    sess = tf.Session()
    agent = Agent(sess, params)
    agent.train(checkpoint_dir="./checkpoint",
                data_dir='./data/dataset-50-3-2.hdf5')
Example #8
def corrmaze(c_len):
    grid = np.zeros((3, c_len + 2))
    for x in range(1, c_len + 1):
        grid[0][x] = 1
        grid[2][x] = 1
    agents = [Agent("[255,0]", (0, 0), (0, 0), (c_len + 1, 0)),
              Agent("[0,255]", (c_len + 1, 2), (c_len + 1, 2), (0, 2))]
    m = Maze(grid, agents)
    return m
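For reference, a minimal sketch of the wall layout corrmaze builds: rows 0 and 2 are walled along the corridor while row 1 stays open, shown here for c_len=3.

import numpy as np

c_len = 3
grid = np.zeros((3, c_len + 2))
grid[0, 1:c_len + 1] = 1  # top wall of the corridor
grid[2, 1:c_len + 1] = 1  # bottom wall of the corridor
print(grid)
# [[0. 1. 1. 1. 0.]
#  [0. 0. 0. 0. 0.]
#  [0. 1. 1. 1. 0.]]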
Example #9
def test_exercise(capsys):
    bond = Agent("James", "Bond")
    print(bond)

    ionic = Agent("Ionic", "Bond")
    print(ionic)

    out, err = capsys.readouterr()
    assert out == "My name is Bond, James Bond\nMy name is Bond, Ionic Bond\n"
Example #10
def dqn(
    agent: Agent,
    env,
    brain_name,
    n_episodes: int = 10,
    eps_start: float = 1.0,
    eps_end: float = 0.01,
    eps_decay: float = 0.995,
):
    """Deep Q-Learning.

    Params
    ======
        agent (Agent): agent to train
        env: Unity environment used for training
        brain_name (str): name of the Unity brain to control
        n_episodes (int): maximum number of training episodes
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print(
            f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}",
            end="",
        )
        if i_episode % 100 == 0:
            print(
                f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}"
            )
        if np.mean(scores_window) >= 13.0:
            print(
                f"\nEnvironment solved in {i_episode-100:d} episodes!\tAverage Score:"
                f" {np.mean(scores_window):.2f}")
            torch.save(agent.qnetwork_local.state_dict(), "checkpoint.pth")
            break
    return scores
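As a quick sanity check on the defaults above (not part of the original example): with eps = max(eps_end, eps_decay * eps) applied once per episode, epsilon only reaches its floor after roughly log(eps_end / eps_start) / log(eps_decay) episodes.

import math

eps_start, eps_end, eps_decay = 1.0, 0.01, 0.995
episodes_to_floor = math.ceil(math.log(eps_end / eps_start) / math.log(eps_decay))
print(episodes_to_floor)            # 919 -> epsilon is clamped at eps_end only after ~919 episodes
print(eps_start * eps_decay ** 10)  # ~0.951 -> with the default n_episodes=10 it barely decays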
Example #11
def main(_):
  config = get_config(FLAGS) or FLAGS
  config.cnn_format = 'NHWC'

  ps_hosts = FLAGS.ps_hosts.split(",")
  worker_hosts = FLAGS.worker_hosts.split(",")

  cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
  server = tf.train.Server(cluster,
                           job_name=FLAGS.job_name,
                           task_index=FLAGS.task_index)

  if FLAGS.job_name == "ps":
    server.join()
  elif FLAGS.job_name == "worker":
    env = GymEnvironment(config)

    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % FLAGS.task_index,
        cluster=cluster)):
      lr_op = tf.placeholder('float', None, name='learning_rate')
      optimizer = tf.train.RMSPropOptimizer(
          lr_op, decay=0.99, momentum=0, epsilon=0.1)
      agent = Agent(config, env, optimizer, lr_op)

      agent.ep_end = random.sample([0.1, 0.01, 0.5], 1)[0]

    print(agent.model_dir)

    # Create a "supervisor", which oversees the training process.
    is_chief = (FLAGS.task_index == 0)
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir="./logs/" + agent.model_dir,
                             init_op=agent.init_op,
                             summary_op=None,
                             saver=agent.saver,
                             global_step=agent.step_op,
                             save_model_secs=600)

    if FLAGS.is_train:
      if is_chief:
        train_or_play = agent.train_with_summary
      else:
        train_or_play = agent.train
    else:
      train_or_play = agent.play

    with sv.managed_session(server.target) as sess:
      agent.sess = sess
      agent.update_target_q_network()

      train_or_play(sv, is_chief)

  # Ask for all the services to stop.
  sv.stop()
Example #12
def generate_data(mode, num_simulations=30):
    """
    Generate the dual model data from the TEST_GRID_LIST specified above.

    Args:
        - mode (str): "delay" or "pressure"; whether the data generated uses more
            or fewer Monte Carlo iterations to solve the test grids
        - num_simulations (int): how many data points to generate from the model
    Returns:
        - model_results (list of dict): one entry per grid per simulation, e.g.
            {'model': 'pressure', 'grid_num': 23, 'reward': 3, 'best_reward': 3, 'id': 10}
    """
    agent = Agent()
    start = time.time()
    print("Starting {mode} data generation".format(mode=mode))
    model_results = []  # item e.g. {'model': 'pressure', 'grid_num': 23, 'reward': 3, 'best_reward': 3, 'id': 10}
    # Generate dual model data for the "time constrained" scenario
    for i in range(num_simulations):
        if mode == "pressure":
            # choose a random number of MC iterations between 0 and 49
            n_iters = random.randrange(0, 50)
        elif mode == "delay":
            # these ranges were chosen by looking at the dual model performance graph
            # in dual_model_data_generation.ipynb
            n_iters = random.randrange(120, 530)
        else:
            raise ValueError("mode must be 'delay' or 'pressure'")

        for ind, grid_init in TEST_GRID_LIST:
            testgrid = grid.Grid(5, random=False, init_pos=grid_init)
            Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                     iters=n_iters,
                                                     nn_init=True,
                                                     cutoff=0.4)
            _, _, model_reward = agent.run_final_policy(testgrid.copy(),
                                                        Q,
                                                        nn_init=True,
                                                        display=False)
            individual_info = {}  # information for this particular model instantiation
            individual_info['id'] = i
            individual_info['model'] = mode
            individual_info['grid_num'] = ind
            individual_info['reward'] = model_reward
            individual_info['best_reward'] = grid_init['best_reward']
            model_results.append(individual_info)
        print("Simulation {num} took {time} seconds".format(
            num=i, time=time.time() - start))
        start = time.time()

    return model_results
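A minimal usage sketch (assumed, not part of the original module): run a few simulations and dump the per-grid results to a JSON file; the output filename here is hypothetical.

import json

if __name__ == "__main__":
    results = generate_data("pressure", num_simulations=5)
    with open("dual_model_pressure_results.json", "w") as f:  # hypothetical output path
        json.dump(results, f, indent=2)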
Example #13
def ddpg(agent: Agent, env, brain_name, n_agents, n_episodes: int = 10):
    scores_window = deque(maxlen=100)
    scores_mean_agent = []
    scores_mean = []

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset()[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(n_agents)
        while True:

            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        score = np.mean(scores)
        scores_window.append(score)
        scores_mean_agent.append(score)
        scores_mean.append(np.mean(scores_window))

        print(
            f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}",
            end="",
        )
        if i_episode % 100 == 0:
            print(
                f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}"
            )
        if np.mean(scores_window) >= 30.0:
            print(
                f"\nEnvironment solved in {i_episode-100:d} episodes!\tAverage Score:"
                f" {np.mean(scores_window):.2f}")
            torch.save(agent.policy_network_local.state_dict(),
                       "checkpoint_policy.pth")
            torch.save(agent.qnetwork_local.state_dict(),
                       "checkpoint_qnetwork.pth")
            print("saved networks")
            break
    return scores_mean_agent, scores_mean
Example #14
    def initialize_agents_from_files(self, agent_directory):

        from src.agent import Agent
        agent_files = os.listdir(agent_directory)

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_parameters_from_file(agent_filename, self)
                self.agents.append(agent)
                agent.initialize_total_assets()
Example #15
def graph_dual_model_performance():
    test_grids = TEST_GRIDS
    all_test_list = [(key, grid) for key, grid in test_grids.items()]
    all_test_list.sort(key=lambda x: x[0])  # sorted() returns a new list; sort in place instead
    agent = Agent()
    iters = ITERS
    total_normal_grid_score, total_grid1_score, total_grid2_score, total_grid3_score, total_grid4_score = [],[],[],[],[]
    repeats = REPEATS
    for n in iters:
        print("Running iteration {n}".format(n=n))
        normal_grid_score, grid1_score, grid2_score, grid3_score, grid4_score = [],[],[],[],[]
        for ind, grid_init in all_test_list:
            normalized_score = 0
            for j in range(repeats):
                grid_num = int(ind)  #ind initially is a string.
                best_reward = grid_init['best_reward']
                testgrid = Grid(5, random=False, init_pos=grid_init)
                Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                         iters=n,
                                                         nn_init=True)
                _, _, dual_model_reward = agent.run_final_policy(
                    testgrid.copy(), Q, nn_init=True, display=False)
                normalized_score += dual_model_reward - best_reward
            if grid_num < 100:
                normal_grid_score.append(normalized_score / repeats)
            elif grid_num < 200:  #grid type 1
                grid1_score.append(normalized_score / repeats)
            elif grid_num < 300:  #grid type 2
                grid2_score.append(normalized_score / repeats)
            elif grid_num < 400:  #grid type 3
                grid3_score.append(normalized_score / repeats)
            else:  #grid type 4
                grid4_score.append(normalized_score / repeats)
        total_normal_grid_score.append(np.mean(normal_grid_score))
        total_grid1_score.append(np.mean(grid1_score))
        total_grid2_score.append(np.mean(grid2_score))
        total_grid3_score.append(np.mean(grid3_score))
        total_grid4_score.append(np.mean(grid4_score))
    # plt.plot(iters, total_normal_grid_score, label="normal grids", color="red")
    plt.plot(iters, total_grid1_score, label='push dilemma', color="blue")
    plt.plot(iters, total_grid2_score, label='switch dilemma', color="green")
    plt.plot(iters, total_grid3_score, label='switch save', color="orange")
    plt.plot(iters, total_grid4_score, label='push get', color="brown")
    plt.legend()
    plt.xlabel("Number of MC Iterations")
    plt.ylabel("Normalized Score")
    plt.title("Dual model performance on all test grids")
    plt.show()
Example #16
    def from_image(file):
        img = Image.open(file)
        arr = np.array(img)
        height, width, _ = arr.shape
        grid = np.zeros((height, width), dtype=np.uint8)
        for y in range(height):
            for x in range(width):
                if list(arr[y][x]) == [0, 0, 0, 255]:
                    grid[y][x] = 1

        agents = AgentPool()
        for y in range(height):
            for x in range(width):
                if arr[y][x][2] == 16:
                    name = Maze.name_from_pixel(arr[y][x])
                    agents.add(Agent(name))
                    agents.get(name).set_start((x, y))

        for y in range(height):
            for x in range(width):
                if arr[y][x][2] == 80:
                    name = Maze.name_from_pixel(arr[y][x])
                    agents.get(name).add_waypoint((x, y))
                if arr[y][x][2] == 128:
                    name = Maze.name_from_pixel(arr[y][x])
                    agents.get(name).goal = (x, y)

        return Maze(grid, agents)
Example #17
def main(test):
    # init the environment
    env = UnityEnvironment(file_name=REACHER_APP)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    # number of actions
    action_size = brain.vector_action_space_size
    # dimension of the state space
    state_size = env_info.vector_observations.shape[1]
    # number of agents
    n_agents = len(env_info.agents)

    # create a DDPG agent
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  n_agents=n_agents,
                  random_seed=1)
    if not test:
        # train the agent
        scores = run_agent(env, agent, n_episodes=300)
        _ = plot_scores(agent, scores)
    else:
        # test the trained agent
        # load the weights from file
        agent.actor_local.load_state_dict(
            torch.load(f'weights/{str(agent)}_checkpoint_actor.pth'))
        agent.critic_local.load_state_dict(
            torch.load(f'weights/{str(agent)}_checkpoint_critic.pth'))
        test_agent(env, agent, n_agents)

    env.close()
Example #18
    def train(self, log_file_dir="./tensorboard", index="0"):

        self.__print_upperbound()
        self.__init_tensorboard(log_file_dir)
        
        starttime = time.time()
        total_data_time = 0
        total_training_time = 0
        for i in range(int(self.train_config['steps'])):
            step_start = time.time()
            X, w, y, setw = self.next_batch()
            finish_data = time.time()
            total_data_time += (finish_data - step_start)
            self._agent.train_step(X, w, y, setw=setw)
            total_training_time += time.time() - finish_data 
            if i % 1000 == 0 and log_file_dir:
                logging.info("average time for data accessing is %s"%(total_data_time/1000))
                logging.info("average time for training is %s"%(total_training_time/1000))
                total_training_time = 0
                total_data_time = 0
                self.log_between_steps(i)
            
        if self.save_path:
            best_agent = Agent(self.config, restore_dir=self.save_path)
            self._agent = best_agent

        pv_vector, loss, output = self._evaluate("test")
        pv = self._agent.portfolio_value
        log_mean = self._agent.log_mean
        logging.warning('the portfolio value train No.%s is %s log_mean is %s,'
                        ' the training time is %d seconds' % (index, pv, log_mean, time.time() - starttime))
Example #19
def test_agent():
    state_space_dim = 3
    action_space_dim = 4
    train = Train()
    agent = Agent(state_space_dim=state_space_dim,
                  action_space_dim=action_space_dim,
                  low_action=-1,
                  high_action=1,
                  load=False)
    state = np.random.rand(state_space_dim)[None]
    next_state = np.random.rand(state_space_dim)[None]
    action = agent.get_action(state)
    reward = np.array([1])
    done = np.array([0])
    Q_loss, policy_loss = train(agent, state, next_state, action, reward, done)
    assert Q_loss is not None and policy_loss is not None
Example #20
    def initialize_agents_from_files(self, agent_directory, network_config):

        from src.agent import Agent
        agent_files = os.listdir(agent_directory)

        self.network = nx.read_gexf(network_config)

        # print(self.network.edges(data=True))

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_parameters_from_file(agent_filename, self)
                self.agents.append(agent)
Example #21
    def initialize_agents_from_files(self, agent_directory, network_config):

        from src.agent import Agent
        agent_files = os.listdir(agent_directory)

        self.network = nx.read_gexf(network_config)

        # print(self.network.edges(data=True))

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_parameters_from_file(agent_filename, self)
                self.agents.append(agent)
Example #22
    def learn_policy(self,
                     episodes=200,
                     experience_replay_samples=32,
                     gaussian_noise_variance=1,
                     exponential_average_factor=0.01,
                     noise_bound=None,
                     buffer_size=math.inf
                     ):

        pbar = tqdm(total=episodes)

        gaussian_noise = self.gaussian_distribution(gaussian_noise_variance)

        policy = DeterministicPolicy(
            self.a_model,
            additive_noise_distribution=gaussian_noise
        )
        buffer = ReplayBuffer(buffer_size)

        reward_observer = RewardObserver()
        agent = Agent(self.environment, policy)
        agent.attach_observer(reward_observer)

        current_episode = 0

        while current_episode < episodes:
            # collect transition
            state, action, reward, state_next, done = agent.step()
            # add to buffer
            buffer.add_transition(state, action, reward, state_next, done)

            # if enough transitions collected perform experience replay algorithm
            if buffer.size() >= experience_replay_samples:
                self.experience_replay(
                    buffer.sample_transitions(experience_replay_samples),
                    exponential_average_factor,
                    noise_bound=noise_bound,
                    noise_distribution=gaussian_noise
                )

            # if episode ended, update progress
            if done:
                current_episode += 1
                pbar.update(1)

        pbar.close()
        return DeterministicPolicy(self.a_model), reward_observer.get_rewards()
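For context, a minimal sketch of how this learner might be driven, modeled on the PolicyGradient and PPO examples elsewhere in this collection; the TD3 class name and its constructor arguments are assumptions, not the library's confirmed API.

# Hypothetical usage; TD3 and its constructor signature are assumed,
# mirroring the PolicyGradient/PPO examples in this collection.
import gym
import torch
import torch.nn as nn

environment = gym.make("Pendulum-v0")
a_model = nn.Sequential(nn.Linear(3, 24), nn.ReLU(),
                        nn.Linear(24, 1, bias=False), nn.Tanh())
optimizer = torch.optim.Adam(a_model.parameters(), lr=1e-3)

learner = TD3(environment, a_model, optimizer, discount_factor=0.99)  # assumed constructor
opt_policy, rewards = learner.learn_policy(episodes=200,
                                           experience_replay_samples=64,
                                           gaussian_noise_variance=0.1,
                                           noise_bound=0.5)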
Example #23
def test_train():
    train_num_episodes = APPROX_EPISODES_PER_SECOND * DESIRED_TRAIN_NUM_SECONDS

    # run a quick session of training and make plots.
    env = Environment()
    agent = Agent(env)
    agent.train(env,
                num_episodes=train_num_episodes,
                plot_training_rewards=False)

    # assert the trained agent has different Q values to a freshly instantiated one.
    fresh_env = Environment()
    fresh_agent = Agent(fresh_env)
    assert not np.array_equal(
        fresh_agent._Q,
        agent._Q
    )
Example #24
	def test_act_tau_0(self):
		config = {
			'ALPHA': 0.8,
			'CPUCT': 1,
			'EPSILON': 0.2,
			'ACTION_SIZE': 32 * 4 * 7,
			'MCTS_SIMULATIONS': 3
		}
		action_encoder = ActionEncoder(DirectionResolver())
		agent = Agent(model=None, action_encoder=action_encoder, state_encoder=StateEncoder(), name='player1', config=config)
		game_root = Game()
		root_node = Node(game_root)

		child1 = Node(game_root.move(game_root.get_possible_moves()[0]))
		edge1 = Edge(root_node, child1, 0.33, 8)
		edge1.stats['N'] = 10
		edge1.stats['Q'] = 0.2

		root_node.edges.append(edge1)

		child2 = Node(game_root.move(game_root.get_possible_moves()[1]))
		edge2 = Edge(root_node, child2, 0.5, 104)
		edge2.stats['N'] = 20
		edge2.stats['Q'] = 0.5
		root_node.edges.append(edge2)

		child3 = Node(game_root.move(game_root.get_possible_moves()[2]))
		edge3 = Edge(root_node, child3, 0.17, 9)
		edge3.stats['N'] = 15
		edge3.stats['Q'] = 0.3
		root_node.edges.append(edge3)

		agent.prepare_mcts_for_next_action = MagicMock()
		mcts = MagicMock()
		mcts.root = root_node
		mcts.evaluate_leaf.return_value = 0.7
		agent.mcts = mcts
		mcts.move_to_leaf.return_value = (root_node, 0.5, False, [])

		action, pi, value = agent.act(game_root, tau=0)

		self.assertEqual(action, [9, 14])
		self.assertEqual(value, 0.5)
		self.assertEqual(pi[8], 10/(10 + 20 + 15))
		self.assertEqual(pi[9], 15/(10 + 20 + 15))
		self.assertEqual(pi[8 + 3*32], 20/(10 + 20 + 15))
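The assertions above follow from the visit-count policy used at tau=0: each root edge's probability is its visit count N divided by the total visits at the root. A minimal sketch of that computation, using the counts and action ids from the test (ACTION_SIZE = 32 * 4 * 7):

import numpy as np

visits = {8: 10, 104: 20, 9: 15}  # action id -> edge visit count N, from the test above

pi = np.zeros(32 * 4 * 7)
total = sum(visits.values())  # 45
for action_id, n in visits.items():
    pi[action_id] = n / total

assert pi[8] == 10 / 45 and pi[9] == 15 / 45 and pi[8 + 3 * 32] == 20 / 45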
Example #25
def init_agent(state_size, action_size, num_agents):
    global agent

    print("\nInitializing agent....")
    agent = Agent(state_size=state_size,
                  action_size=action_size,
                  num_agents=num_agents,
                  random_seed=RANDOM_SEED)
Example #26
    def learn_policy(self,
                     episodes=200,
                     experience_replay_samples=32,
                     exponential_average_factor=0.01,
                     entropy_coefficient=0,
                     buffer_size=math.inf,
                     updates_per_replay=1):

        pbar = tqdm(total=episodes)

        policy = StochasticPolicy(self.a_distribution_model)
        buffer = ReplayBuffer(buffer_size)

        reward_observer = RewardObserver()
        agent = Agent(self.environment, policy)
        agent.attach_observer(reward_observer)

        current_episode = 0

        while current_episode < episodes:
            # collect transition
            state, action, reward, state_next, done = agent.step()
            # add to buffer
            buffer.add_transition(state, action, reward, state_next, done)

            # if enough transitions collected perform experience replay algorithm
            if buffer.size() >= experience_replay_samples:
                for _ in range(updates_per_replay):
                    self.experience_replay(
                        buffer.sample_transitions(experience_replay_samples),
                        exponential_average_factor, entropy_coefficient)

            # if episode ended, update progress
            if done:
                current_episode += 1

                if current_episode % 20 == 0:
                    reward_observer.plot()
                    reward_observer.plot_moving_average(5)

                pbar.update(1)

        pbar.close()
        return MeanOfStochasticModel(
            self.a_distribution_model), reward_observer.get_rewards()
Example #27
    def get_parameters_from_file(self, args):

        import os
        from src.environment import Environment
        from src.agent import Agent

        #
        # Initialize Environment and give arguments
        #

        # we need to give environment config
        environment_directory = str(args[0])
        identifier = args[1]

        # calling the Environment Class
        environment = Environment(environment_directory, identifier)

        # # get the agent_directory from the environment
        agent_directory = environment.agent_directory
        # # and loop over all agents in the directorys
        listings = os.listdir(agent_directory)

        for file in listings:

            if 'agent5.xml' in file:
                # #
                # # TESTING
                # #

                # # test whether the parameters are read properly
                text = "Initiating agent object..\n"
                self.print_info(text)

                agent = Agent()
                environment.agents.append(agent)
                print(agent)

                text = "Reading in parameters by calling method..\n"
                self.print_info(text)

                agent_filename = agent_directory + file
                agent.get_parameters_from_file(agent_filename, environment)
                print(agent)
Example #28
    def __init__(self):
        self.world = World(*SimulationConfig.word_size)
        self.graphic = Graphic(self.world, *SimulationConfig.pane_size)

        if SimulationConfig.fixed_sick_cases:
            for i in range(SimulationConfig.population_size):
                if i < SimulationConfig.fixed_cases_count:
                    self.world.add_agent_on_free(Agent(self.world, True))
                else:
                    self.world.add_agent_on_free(Agent(self.world, False))
        else:
            for i in range(SimulationConfig.population_size):
                self.world.add_agent_on_free(
                    Agent(
                        self.world,
                        get_it_with_probability(
                            SimulationConfig.create_sick_agent_probability,
                            True, False)))
        self.statistic = Statistic(self.world)
Example #29
def read_agent(version):
    nn = Residual_CNN(config['REG_CONST'], config['LEARNING_RATE'], (2, 4, 8),
                      config['ACTION_SIZE'], config['HIDDEN_CNN_LAYERS'],
                      config['MOMENTUM'])
    m_tmp = nn.read(version)
    nn.model.set_weights(m_tmp.get_weights())
    player = Agent(nn,
                   ActionEncoder(DirectionResolver()),
                   StateEncoder(),
                   name='player' + str(version),
                   config=config)
    return player
Example #30
def play(ctx, steps, noise):
    env, state_space_dim, action_space_dim, state_norm_array, min_action, \
        max_action = setup_env()

    # noise_process = OUNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA,
    #     theta=THETA,
    #     dt=1e-2)

    # noise_process = NormalNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA)

    # noise_process = LinearSegmentNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA)

    noise_process = SmoothNoiseND(steps=steps,
                                  dim=action_space_dim,
                                  sigma=SIGMA)

    agent = Agent(state_space_dim,
                  action_space_dim,
                  layer_dims=LAYERS_DIMS,
                  low_action=min_action,
                  high_action=max_action,
                  noise_process=noise_process,
                  load=True)

    state = env.reset()

    agent.actor.summary()
    agent.critic.summary()
    for i in range(steps):
        action = agent.get_action(state[None], with_exploration=noise)[0]
        state, reward, done, _ = env.step(action)
        env.render()
Example #31
def main():

    window_size = 5
    episode_count = 10
    stock_name = "^GSPC_2011"

    agent = Agent(window_size)
    market = Market(window_size=window_size, stock_name=stock_name)

    batch_size = 32

    start_time = time.time()
    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        agent.reset()
        state, price_data = market.reset()  # ToDo: get the initial state

        for t in range(market.last_data_index):
            # get the agent's current action by calling its act() method
            # with the current state and price data
            action, bought_price = agent.act(state, price_data)

            # get the agent's next state and reward according to the market
            next_state, next_price_data, reward, done =\
                market.get_next_state_reward(action, bought_price)

            # add the transaction to the agent's memory
            agent.memory.append((state, action, reward, next_state, done))
            # learn from past experience only once enough memory has been collected
            if len(agent.memory) > batch_size:
                agent.experience_replay(batch_size)

            state = next_state
            price_data = next_price_data

            if done:
                print("--------------------------------")
                print("Total profit: {0}".format(
                    agent.get_total_profit()))
                print("--------------------------------")

        if e % 10 == 0:
            if not os.path.exists("models"):
                os.mkdir("models")
            agent.model.save("models/model_rl" + str(e))

    end_time = time.time()
    training_time = round(end_time - start_time)
    print("Training took {0} seconds.".format(training_time))
Example #32
def main(_):
    config = get_config(FLAGS) or FLAGS
    config.cnn_format = 'NHWC'

    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        env = GymEnvironment(config)

        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % FLAGS.task_index,
                    cluster=cluster)):
            lr_op = tf.placeholder('float', None, name='learning_rate')
            optimizer = tf.train.RMSPropOptimizer(lr_op,
                                                  decay=0.99,
                                                  momentum=0,
                                                  epsilon=0.1)
            agent = Agent(config, env, optimizer, lr_op)

            agent.ep_end = random.sample([0.1, 0.01, 0.5], 1)[0]

        print(agent.model_dir)

        # Create a "supervisor", which oversees the training process.
        is_chief = (FLAGS.task_index == 0)
        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir="./logs/" + agent.model_dir,
                                 init_op=agent.init_op,
                                 summary_op=None,
                                 saver=agent.saver,
                                 global_step=agent.step_op,
                                 save_model_secs=600)

        if FLAGS.is_train:
            if is_chief:
                train_or_play = agent.train_with_summary
            else:
                train_or_play = agent.train
        else:
            train_or_play = agent.play

        with sv.managed_session(server.target) as sess:
            agent.sess = sess
            agent.update_target_q_network()

            train_or_play(sv, is_chief)

    # Ask for all the services to stop.
    sv.stop()
Example #33
def walker():
    environment = gym.make("Pendulum-v0")
    print(environment.action_space)
    print(environment.observation_space)

    class action_distribution_model(nn.Module):
        def __init__(self):
            super(action_distribution_model, self).__init__()
            self.sequential = nn.Sequential(nn.Linear(3, 24), nn.ReLU(),
                                            nn.Linear(24, 1, bias=False),
                                            nn.Tanh())

        def forward(self, x):
            mean = self.sequential(x)
            mean = mean * 4

            return MultivariateNormal(mean, torch.eye(1) * 0.25)

    distribution = action_distribution_model()
    optimizer = torch.optim.Adam(distribution.parameters(), lr=0.01)

    v_model = nn.Sequential(nn.Linear(3, 24), nn.ReLU(), nn.Linear(24, 1))
    v_optimizer = torch.optim.Adam(v_model.parameters(), lr=0.01)

    learner = PPO(environment, distribution, optimizer, discount_factor=0.99)

    opt_policy, history = learner.learn_policy(epochs=250,
                                               actor_iterations=10,
                                               episodes_per_update=2)
    plt.plot(history)
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.savefig("score.png")

    a = Agent(environment, opt_policy)
    while input("enter 'c' to continue: ") == "c":
        a.perform_episode(render=True)
Example #34
    def initialize_agents_from_files(self, agent_directory):

        from src.agent import Agent
        agent_files = os.listdir(agent_directory)

        self.network = nx.Graph()

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_nodes_for_graph(agent_filename, self)

        for each_agent_file in agent_files:

            if '.xml' in each_agent_file:
                agent = Agent()
                agent_filename = agent_directory + each_agent_file
                agent.get_parameters_from_file(agent_filename, self)
                self.agents.append(agent)