def pl_grad():
    print("hi")
    environment = gym.make("CartPole-v1")
    net = nn.Sequential(nn.Linear(4, 40, bias=False), nn.ReLU(),
                        nn.Linear(40, 2, bias=False), nn.Softmax(dim=1))

    class distributionNet(nn.Module):
        def __init__(self):
            super(distributionNet, self).__init__()
            self.net = net

        def forward(self, x):
            return Categorical(self.net(x))

    a_model = distributionNet()
    optimizer = torch.optim.Adam(a_model.parameters(), lr=0.01)

    learner = PolicyGradient(environment, a_model, optimizer, discount_factor=0.99)
    opt_policy, history = learner.learn_policy(epochs=500, episodes_per_update=1)

    plt.plot(history)
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.savefig("score.png")

    agent = Agent(environment=environment, policy=opt_policy)
    input("add anything to continue")
    agent.perform_episode(render=True)
def main():
    """
    This function will be called for the evaluation phase.
    """
    # Sample code for illustration, add your code below to run in the test phase.
    # Load the trained model from the train/ directory.
    env = gym.make(MINERL_GYM_ENV)
    if FRAME_SKIP > 0:
        env = FrameSkip(env, enable_rendering=True)
    env = ObsWrapper(env)
    env = MoveAxisWrapper(env, -1, 0)
    env = CombineActionWrapper(env)

    agent = Agent(env.observation_space, env.action_space)
    agent.load_model()

    for _ in range(MINERL_MAX_EVALUATION_EPISODES):
        obs = env.reset()
        done = False
        netr = 0
        while not done:
            action = agent.act(obs)
            obs, reward, done, info = env.step(action)
            netr += reward
            env.render()

    env.close()
def plot_grid_2_mc():
    test_grids = TEST_GRIDS
    all_test_list = [(key, grid) for key, grid in test_grids.items()]
    all_test_list.sort(key=lambda x: x[0])  # sort in place; a bare sorted() call discards its result
    agent = Agent()
    iters = ITERS
    total_normal_grid_score, total_grid1_score, total_grid2_score, total_grid3_score, total_grid4_score = [], [], [], [], []
    repeats = REPEATS
    # for n in iters:
    #     print("Running iteration {n}".format(n=n))
    grid2_score, grid4_score = [], []
    for ind, grid_init in all_test_list:
        normalized_score = 0
        for j in range(repeats):
            grid_num = int(ind)  # ind initially is a string
            if (grid_num < 200) or (grid_num > 300):
                continue
            best_reward = grid_init['best_reward']
            testgrid = Grid(5, random=False, init_pos=grid_init)
            if grid_num in {204, 208}:
                Q, policy = agent.mc_first_visit_control(testgrid.copy(), iters=500)
                _, _, mc_reward = agent.run_final_policy(testgrid.copy(), Q, display=True)
            else:
                continue
            normalized_score += mc_reward - best_reward
        if normalized_score != 0:
            print("Grid num {0} did not achieve best score".format(grid_num))
def main():
    print("note: 'ulimit -Sn 1024' if Errno 24")
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='CartPole-v1')
    parser.add_argument('--seed', type=int, default=417)
    parser.add_argument('--n-timesteps', type=int, default=int(1e5))
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--max-kl', type=float, default=1e-2)
    parser.add_argument('--log-interval', type=int, default=int(1e4))
    parser.add_argument('--save-path', default=None)
    parser.add_argument('--batch-size', type=int, default=1)
    # argparse's type=bool treats any non-empty string as True, so expose a flag instead
    parser.add_argument('--cuda', action='store_true', default=False)
    parser.add_argument('--update-rule', default='A2C')
    args = parser.parse_args()

    if args.cuda:
        assert torch.cuda.is_available(), 'No available cuda devices'

    envs = [gym.make(args.env) for _ in range(args.batch_size)]
    set_seeds(envs, args.seed, args.cuda)

    agent = Agent(envs[0].observation_space, envs[0].action_space)
    if args.cuda:
        agent.cuda()

    rets = learn(agent, envs, args.update_rule,
                 cuda=args.cuda,
                 n_timesteps=args.n_timesteps,
                 gamma=args.gamma,
                 log_interval=args.log_interval,
                 max_kl=args.max_kl)

    torch.save(rets, "./out/{}_{}".format(args.env, args.update_rule))
    if args.save_path is not None:
        torch.save(agent.state_dict(), args.save_path)
def __init__(self, config, agent=None, save_path=None, restore_dir=None, device="cpu"):
    self.config = config
    self.train_config = config["training"]
    self.input_config = config["input"]
    self.best_metric = 0
    self.save_path = save_path

    self._matrix = DataMatrices.create_from_config(config)
    # access the name-mangled __global_data attribute of DataMatrices
    self.time_index = self._matrix._DataMatrices__global_data.time_index.values
    self.coins = self._matrix._DataMatrices__global_data.coins.values

    self.test_set = self._matrix.get_test_set()
    self.training_set = self._matrix.get_training_set()

    tf.random.set_seed(self.config["random_seed"])
    self.device = device
    self._agent = Agent(config, time_index=self.time_index, coins=self.coins,
                        restore_dir=restore_dir)
    self.keras_test = self._matrix.keras_batch(data="test")
def run(load_path, model_path, index, load_model):
    with tf.device('/gpu:0'):
        trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
        global_env = Environment(load_path=load_path, starting_index=index,
                                 final_index=index + 1)
        global_net = A3C_Network()
        agent = Agent(0, global_net.n_inputs_policy, global_net.n_inputs_matching,
                      global_net.n_actions_policy, trainer, load_path, model_path)
        saver = tf.train.Saver()

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        coord = tf.train.Coordinator()
        if load_model:
            ckpt = tf.train.get_checkpoint_state(model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            sess.run(global_env.index.assign(index))
            sess.run(global_env.final_index.assign(index + 1))
        else:
            sess.run(tf.global_variables_initializer())

        agent_test = lambda: agent.test(sess, coord)
        t = threading.Thread(target=agent_test)
        t.start()
        t.join()
def main():
    parser = argparse.ArgumentParser(description='LUBAN runner')
    register_model_args(parser)
    params, unparsed = parser.parse_known_args(sys.argv)

    sess = tf.Session()
    agent = Agent(sess, params)
    agent.train(checkpoint_dir="./checkpoint", data_dir='./data/dataset-50-3-2.hdf5')
def corrmaze(c_len):
    grid = np.zeros((3, c_len + 2))
    for x in range(1, c_len + 1):
        grid[0][x] = 1
        grid[2][x] = 1
    agents = [Agent("[255,0]", (0, 0), (0, 0), (c_len + 1, 0)),
              Agent("[0,255]", (c_len + 1, 2), (c_len + 1, 2), (0, 2))]
    m = Maze(grid, agents)
    return m
def test_exercise(capsys):
    bond = Agent("James", "Bond")
    print(bond)
    ionic = Agent("Ionic", "Bond")
    print(ionic)
    out, err = capsys.readouterr()
    assert out == "My name is Bond, James Bond\nMy name is Bond, Ionic Bond\n"
def dqn(
    agent: Agent,
    env,
    brain_name,
    n_episodes: int = 10,
    eps_start: float = 1.0,
    eps_end: float = 0.01,
    eps_decay: float = 0.995,
):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print(
            f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}",
            end="",
        )
        if i_episode % 100 == 0:
            print(
                f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}"
            )
        if np.mean(scores_window) >= 13.0:
            print(
                f"\nEnvironment solved in {i_episode-100:d} episodes!\tAverage Score:"
                f" {np.mean(scores_window):.2f}")
            torch.save(agent.qnetwork_local.state_dict(), "checkpoint.pth")
            break
    return scores
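# Hedged usage sketch (not part of the original source): one way the dqn() loop
# above might be wired to a Unity ML-Agents environment. The "Banana.app" file
# name, the state/action sizes, and the Agent constructor signature are assumptions.
def run_dqn_example():
    from unityagents import UnityEnvironment

    env = UnityEnvironment(file_name="Banana.app")       # assumed environment binary
    brain_name = env.brain_names[0]
    agent = Agent(state_size=37, action_size=4, seed=0)  # assumed constructor signature
    scores = dqn(agent, env, brain_name, n_episodes=2000)
    env.close()
    return scores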
def main(_):
    config = get_config(FLAGS) or FLAGS
    config.cnn_format = 'NHWC'

    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        env = GymEnvironment(config)
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index,
                cluster=cluster)):
            lr_op = tf.placeholder('float', None, name='learning_rate')
            optimizer = tf.train.RMSPropOptimizer(lr_op, decay=0.99, momentum=0,
                                                  epsilon=0.1)
            agent = Agent(config, env, optimizer, lr_op)
            agent.ep_end = random.sample([0.1, 0.01, 0.5], 1)[0]
            print(agent.model_dir)

        # Create a "supervisor", which oversees the training process.
        is_chief = (FLAGS.task_index == 0)
        sv = tf.train.Supervisor(is_chief=is_chief,
                                 logdir="./logs/" + agent.model_dir,
                                 init_op=agent.init_op,
                                 summary_op=None,
                                 saver=agent.saver,
                                 global_step=agent.step_op,
                                 save_model_secs=600)

        if FLAGS.is_train:
            if is_chief:
                train_or_play = agent.train_with_summary
            else:
                train_or_play = agent.train
        else:
            train_or_play = agent.play

        with sv.managed_session(server.target) as sess:
            agent.sess = sess
            agent.update_target_q_network()
            train_or_play(sv, is_chief)

        # Ask for all the services to stop.
        sv.stop()
def generate_data(mode, num_simulations=30):
    """
    Generate the dual model data from the TEST_GRID_LIST specified above.
    Args:
    - mode (Str): "delay" or "pressure"; whether the data generated has more or
      fewer monte carlo iterations to solve the test grids
    - num_simulations (int): how many data points to generate from the model
    Returns:
    - model_results (list): one result dict per grid per simulation
    """
    agent = Agent()
    start = time.time()
    print("Starting {mode} data generation".format(mode=mode))
    model_results = []  # item e.g. {'model':'constrained','grid_num':23,'reward':3,'best_reward':3,'id':10}

    # Generate dual model "time constrained scenario"
    for i in range(num_simulations):
        if mode == "pressure":
            # choose a random number of MC iterations in [0, 50)
            n_iters = random.randrange(0, 50)
        elif mode == "delay":
            # these ranges were chosen by looking at the dual model performance graph
            # in the dual_model_data_generation.ipynb
            n_iters = random.randrange(120, 530)
        for ind, grid_init in TEST_GRID_LIST:
            testgrid = grid.Grid(5, random=False, init_pos=grid_init)
            Q, policy = agent.mc_first_visit_control(testgrid.copy(),
                                                     iters=n_iters,
                                                     nn_init=True,
                                                     cutoff=0.4)
            _, _, model_reward = agent.run_final_policy(testgrid.copy(), Q,
                                                        nn_init=True,
                                                        display=False)
            individual_info = {}  # information for this particular model instantiation
            individual_info['id'] = i
            individual_info['model'] = mode
            individual_info['grid_num'] = ind
            individual_info['reward'] = model_reward
            individual_info['best_reward'] = grid_init['best_reward']
            model_results.append(individual_info)
        print("Simulation {num} took {time} seconds".format(num=i,
                                                            time=time.time() - start))
        start = time.time()
    return model_results
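# Hypothetical usage sketch (not part of the original source): run a few
# simulations in each mode of generate_data() above and persist the results.
# The output file name is an assumption.
def generate_both_modes_example(num_simulations=5):
    import json

    results = generate_data("pressure", num_simulations=num_simulations)
    results += generate_data("delay", num_simulations=num_simulations)
    with open("dual_model_results.json", "w") as f:  # assumed output path
        json.dump(results, f, indent=2)
    return results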
def ddpg(agent: Agent, env, brain_name, n_agents, n_episodes: int = 10):
    scores_window = deque(maxlen=100)
    scores_mean_agent = []
    scores_mean = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset()[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(n_agents)
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get the next state
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        score = np.mean(scores)
        scores_window.append(score)
        scores_mean_agent.append(score)
        scores_mean.append(np.mean(scores_window))
        print(
            f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}",
            end="",
        )
        if i_episode % 100 == 0:
            print(
                f"\rEpisode {i_episode}\tAverage Score: {np.mean(scores_window):.2f}"
            )
        if np.mean(scores_window) >= 30.0:
            print(
                "\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}"
                .format(i_episode, np.mean(scores_window)))
            torch.save(agent.policy_network_local.state_dict(),
                       "checkpoint_policy.pth")
            torch.save(agent.qnetwork_local.state_dict(),
                       "checkpoint_qnetwork.pth")
            print("saved networks")
            break
    return scores_mean_agent, scores_mean
def initialize_agents_from_files(self, agent_directory):
    from src.agent import Agent
    agent_files = os.listdir(agent_directory)
    for each_agent_file in agent_files:
        if '.xml' in each_agent_file:
            agent = Agent()
            agent_filename = agent_directory + each_agent_file
            agent.get_parameters_from_file(agent_filename, self)
            self.agents.append(agent)
            agent.initialize_total_assets()
def graph_dual_model_performance():
    test_grids = TEST_GRIDS
    all_test_list = [(key, grid) for key, grid in test_grids.items()]
    all_test_list.sort(key=lambda x: x[0])  # sort in place; a bare sorted() call discards its result
    agent = Agent()
    iters = ITERS
    total_normal_grid_score, total_grid1_score, total_grid2_score, total_grid3_score, total_grid4_score = [], [], [], [], []
    repeats = REPEATS
    for n in iters:
        print("Running iteration {n}".format(n=n))
        normal_grid_score, grid1_score, grid2_score, grid3_score, grid4_score = [], [], [], [], []
        for ind, grid_init in all_test_list:
            normalized_score = 0
            for j in range(repeats):
                grid_num = int(ind)  # ind initially is a string
                best_reward = grid_init['best_reward']
                testgrid = Grid(5, random=False, init_pos=grid_init)
                Q, policy = agent.mc_first_visit_control(testgrid.copy(), iters=n,
                                                         nn_init=True)
                _, _, dual_model_reward = agent.run_final_policy(
                    testgrid.copy(), Q, nn_init=True, display=False)
                normalized_score += dual_model_reward - best_reward
            if grid_num < 100:
                normal_grid_score.append(normalized_score / repeats)
            elif grid_num < 200:  # grid type 1
                grid1_score.append(normalized_score / repeats)
            elif grid_num < 300:  # grid type 2
                grid2_score.append(normalized_score / repeats)
            elif grid_num < 400:  # grid type 3
                grid3_score.append(normalized_score / repeats)
            else:  # grid type 4
                grid4_score.append(normalized_score / repeats)
        total_normal_grid_score.append(np.mean(normal_grid_score))
        total_grid1_score.append(np.mean(grid1_score))
        total_grid2_score.append(np.mean(grid2_score))
        total_grid3_score.append(np.mean(grid3_score))
        total_grid4_score.append(np.mean(grid4_score))

    # plt.plot(iters, total_normal_grid_score, label="normal grids", color="red")
    plt.plot(iters, total_grid1_score, label='push dilemma', color="blue")
    plt.plot(iters, total_grid2_score, label='switch dilemma', color="green")
    plt.plot(iters, total_grid3_score, label='switch save', color="orange")
    plt.plot(iters, total_grid4_score, label='push get', color="brown")
    plt.legend()
    plt.xlabel("Number of MC Iterations")
    plt.ylabel("Normalized Score")
    plt.title("Dual model performance on all test grids")
    plt.show()
def from_image(file):
    img = Image.open(file)
    arr = np.array(img)
    height, width, _ = arr.shape
    grid = np.zeros((height, width), dtype=np.uint8)
    for y in range(height):
        for x in range(width):
            if list(arr[y][x]) == [0, 0, 0, 255]:
                grid[y][x] = 1
    agents = AgentPool()
    for y in range(height):
        for x in range(width):
            if arr[y][x][2] == 16:
                name = Maze.name_from_pixel(arr[y][x])
                agents.add(Agent(name))
                agents.get(name).set_start((x, y))
    for y in range(height):
        for x in range(width):
            if arr[y][x][2] == 80:
                name = Maze.name_from_pixel(arr[y][x])
                agents.get(name).add_waypoint((x, y))
            if arr[y][x][2] == 128:
                name = Maze.name_from_pixel(arr[y][x])
                agents.get(name).goal = (x, y)
    return Maze(grid, agents)
def main(test):
    # init the environment
    env = UnityEnvironment(file_name=REACHER_APP)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # number of actions
    action_size = brain.vector_action_space_size
    # dimension of the state space
    state_size = env_info.vector_observations.shape[1]
    # number of agents
    n_agents = len(env_info.agents)

    # create a DDPG agent
    agent = Agent(state_size=state_size, action_size=action_size,
                  n_agents=n_agents, random_seed=1)

    if not test:
        # train the agent
        scores = run_agent(env, agent, n_episodes=300)
        _ = plot_scores(agent, scores)
    else:
        # test the trained agent: load the weights from file
        agent.actor_local.load_state_dict(
            torch.load(f'weights/{str(agent)}_checkpoint_actor.pth'))
        agent.critic_local.load_state_dict(
            torch.load(f'weights/{str(agent)}_checkpoint_critic.pth'))
        test_agent(env, agent, n_agents)

    env.close()
def train(self, log_file_dir="./tensorboard", index="0"):
    self.__print_upperbound()
    self.__init_tensorboard(log_file_dir)

    starttime = time.time()
    total_data_time = 0
    total_training_time = 0
    for i in range(int(self.train_config['steps'])):
        step_start = time.time()
        X, w, y, setw = self.next_batch()
        finish_data = time.time()
        total_data_time += (finish_data - step_start)
        self._agent.train_step(X, w, y, setw=setw)
        total_training_time += time.time() - finish_data
        if i % 1000 == 0 and log_file_dir:
            logging.info("average time for data accessing is %s" % (total_data_time / 1000))
            logging.info("average time for training is %s" % (total_training_time / 1000))
            total_training_time = 0
            total_data_time = 0
            self.log_between_steps(i)

    if self.save_path:
        best_agent = Agent(self.config, restore_dir=self.save_path)
        self._agent = best_agent

    pv_vector, loss, output = self._evaluate("test")
    pv = self._agent.portfolio_value
    log_mean = self._agent.log_mean
    logging.warning('the portfolio value train No.%s is %s log_mean is %s,'
                    ' the training time is %d seconds' %
                    (index, pv, log_mean, time.time() - starttime))
def test_agent():
    state_space_dim = 3
    action_space_dim = 4
    train = Train()
    agent = Agent(state_space_dim=state_space_dim,
                  action_space_dim=action_space_dim,
                  low_action=-1,
                  high_action=1,
                  load=False)
    state = np.random.rand(state_space_dim)[None]
    next_state = np.random.rand(state_space_dim)[None]
    action = agent.get_action(state)
    reward = np.array([1])
    done = np.array([0])
    Q_loss, policy_loss = train(agent, state, next_state, action, reward, done)
    # the test only checks that a training step runs without raising
    assert True
def initialize_agents_from_files(self, agent_directory, network_config):
    from src.agent import Agent
    agent_files = os.listdir(agent_directory)
    self.network = nx.read_gexf(network_config)
    # print(self.network.edges(data=True))
    for each_agent_file in agent_files:
        if '.xml' in each_agent_file:
            agent = Agent()
            agent_filename = agent_directory + each_agent_file
            agent.get_parameters_from_file(agent_filename, self)
            self.agents.append(agent)
def learn_policy(self,
                 episodes=200,
                 experience_replay_samples=32,
                 gaussian_noise_variance=1,
                 exponential_average_factor=0.01,
                 noise_bound=None,
                 buffer_size=math.inf):
    pbar = tqdm(total=episodes)

    gaussian_noise = self.gaussian_distribution(gaussian_noise_variance)
    policy = DeterministicPolicy(
        self.a_model,
        additive_noise_distribution=gaussian_noise
    )
    buffer = ReplayBuffer(buffer_size)

    reward_observer = RewardObserver()
    agent = Agent(self.environment, policy)
    agent.attach_observer(reward_observer)

    current_episode = 0
    while current_episode < episodes:
        # collect transition
        state, action, reward, state_next, done = agent.step()
        # add to buffer
        buffer.add_transition(state, action, reward, state_next, done)

        # if enough transitions collected, perform experience replay algorithm
        if buffer.size() >= experience_replay_samples:
            self.experience_replay(
                buffer.sample_transitions(experience_replay_samples),
                exponential_average_factor,
                noise_bound=noise_bound,
                noise_distribution=gaussian_noise
            )

        # if episode ended, update progress
        if done:
            current_episode += 1
            pbar.update(1)

    pbar.close()
    return DeterministicPolicy(self.a_model), reward_observer.get_rewards()
def test_train():
    train_num_episodes = APPROX_EPISODES_PER_SECOND * DESIRED_TRAIN_NUM_SECONDS

    # run a quick session of training and make plots.
    env = Environment()
    agent = Agent(env)
    agent.train(env, num_episodes=train_num_episodes, plot_training_rewards=False)

    # assert the trained agent has different Q values to a freshly instantiated one.
    fresh_env = Environment()
    fresh_agent = Agent(fresh_env)
    assert not np.array_equal(fresh_agent._Q, agent._Q)
def test_act_tau_0(self):
    config = {
        'ALPHA': 0.8,
        'CPUCT': 1,
        'EPSILON': 0.2,
        'ACTION_SIZE': 32 * 4 * 7,
        'MCTS_SIMULATIONS': 3
    }
    action_encoder = ActionEncoder(DirectionResolver())
    agent = Agent(model=None, action_encoder=action_encoder,
                  state_encoder=StateEncoder(), name='player1', config=config)

    game_root = Game()
    root_node = Node(game_root)

    child1 = Node(game_root.move(game_root.get_possible_moves()[0]))
    edge1 = Edge(root_node, child1, 0.33, 8)
    edge1.stats['N'] = 10
    edge1.stats['Q'] = 0.2
    root_node.edges.append(edge1)

    child2 = Node(game_root.move(game_root.get_possible_moves()[1]))
    edge2 = Edge(root_node, child2, 0.5, 104)
    edge2.stats['N'] = 20
    edge2.stats['Q'] = 0.5
    root_node.edges.append(edge2)

    child3 = Node(game_root.move(game_root.get_possible_moves()[2]))
    edge3 = Edge(root_node, child3, 0.17, 9)
    edge3.stats['N'] = 15
    edge3.stats['Q'] = 0.3
    root_node.edges.append(edge3)

    agent.prepare_mcts_for_next_action = MagicMock()
    mcts = MagicMock()
    mcts.root = root_node
    mcts.evaluate_leaf.return_value = 0.7
    agent.mcts = mcts
    mcts.move_to_leaf.return_value = (root_node, 0.5, False, [])

    action, pi, value = agent.act(game_root, tau=0)

    self.assertEqual(action, [9, 14])
    self.assertEqual(value, 0.5)
    self.assertEqual(pi[8], 10 / (10 + 20 + 15))
    self.assertEqual(pi[9], 15 / (10 + 20 + 15))
    self.assertEqual(pi[8 + 3 * 32], 20 / (10 + 20 + 15))
def init_agent(state_size, action_size, num_agents):
    global agent
    print("\nInitializing agent....")
    agent = Agent(state_size=state_size, action_size=action_size,
                  num_agents=num_agents, random_seed=RANDOM_SEED)
def learn_policy(self,
                 episodes=200,
                 experience_replay_samples=32,
                 exponential_average_factor=0.01,
                 entropy_coefficient=0,
                 buffer_size=math.inf,
                 updates_per_replay=1):
    pbar = tqdm(total=episodes)

    policy = StochasticPolicy(self.a_distribution_model)
    buffer = ReplayBuffer(buffer_size)

    reward_observer = RewardObserver()
    agent = Agent(self.environment, policy)
    agent.attach_observer(reward_observer)

    current_episode = 0
    while current_episode < episodes:
        # collect transition
        state, action, reward, state_next, done = agent.step()
        # add to buffer
        buffer.add_transition(state, action, reward, state_next, done)

        # if enough transitions collected, perform experience replay algorithm
        if buffer.size() >= experience_replay_samples:
            for _ in range(updates_per_replay):
                self.experience_replay(
                    buffer.sample_transitions(experience_replay_samples),
                    exponential_average_factor,
                    entropy_coefficient)

        # if episode ended, update progress
        if done:
            current_episode += 1
            if current_episode % 20 == 0:
                reward_observer.plot()
                reward_observer.plot_moving_average(5)
            pbar.update(1)

    pbar.close()
    return MeanOfStochasticModel(self.a_distribution_model), reward_observer.get_rewards()
def get_parameters_from_file(self, args):
    import os
    from src.environment import Environment
    from src.agent import Agent

    #
    # Initialize Environment and give arguments
    #
    # we need to give the environment config
    environment_directory = str(args[0])
    identifier = args[1]
    # calling the Environment class
    environment = Environment(environment_directory, identifier)
    #
    # get the agent_directory from the environment
    agent_directory = environment.agent_directory
    #
    # and loop over all agents in the directory's listings
    listings = os.listdir(agent_directory)
    for file in listings:
        if 'agent5.xml' in file:
            #
            # TESTING
            #
            # test whether the parameters are read properly
            text = "Initiating agent object..\n"
            self.print_info(text)
            agent = Agent()
            environment.agents.append(agent)
            print(agent)
            text = "Reading in parameters by calling method..\n"
            self.print_info(text)
            agent_filename = agent_directory + file
            agent.get_parameters_from_file(agent_filename, environment)
            print(agent)
def __init__(self):
    self.world = World(*SimulationConfig.word_size)
    self.graphic = Graphic(self.world, *SimulationConfig.pane_size)
    if SimulationConfig.fixed_sick_cases:
        for i in range(SimulationConfig.population_size):
            if i < SimulationConfig.fixed_cases_count:
                self.world.add_agent_on_free(Agent(self.world, True))
            else:
                self.world.add_agent_on_free(Agent(self.world, False))
    else:
        for i in range(SimulationConfig.population_size):
            self.world.add_agent_on_free(
                Agent(
                    self.world,
                    get_it_with_probability(
                        SimulationConfig.create_sick_agent_probability,
                        True, False)))
    self.statistic = Statistic(self.world)
def read_agent(version):
    nn = Residual_CNN(config['REG_CONST'], config['LEARNING_RATE'], (2, 4, 8),
                      config['ACTION_SIZE'], config['HIDDEN_CNN_LAYERS'],
                      config['MOMENTUM'])
    m_tmp = nn.read(version)
    nn.model.set_weights(m_tmp.get_weights())
    player = Agent(nn, ActionEncoder(DirectionResolver()), StateEncoder(),
                   name='player' + str(version), config=config)
    return player
def play(ctx, steps, noise):
    env, state_space_dim, action_space_dim, state_norm_array, min_action, \
        max_action = setup_env()

    # noise_process = OUNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA,
    #     theta=THETA,
    #     dt=1e-2)
    # noise_process = NormalNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA)
    # noise_process = LinearSegmentNoise(
    #     dim=action_space_dim,
    #     sigma=SIGMA)
    noise_process = SmoothNoiseND(steps=steps,
                                  dim=action_space_dim,
                                  sigma=SIGMA)

    agent = Agent(state_space_dim,
                  action_space_dim,
                  layer_dims=LAYERS_DIMS,
                  low_action=min_action,
                  high_action=max_action,
                  noise_process=noise_process,
                  load=True)

    state = env.reset()
    agent.actor.summary()
    agent.critic.summary()
    for i in range(steps):
        action = agent.get_action(state[None], with_exploration=noise)[0]
        state, reward, done, _ = env.step(action)
        env.render()
def main():
    window_size = 5
    episode_count = 10
    stock_name = "^GSPC_2011"

    agent = Agent(window_size)
    market = Market(window_size=window_size, stock_name=stock_name)

    batch_size = 32
    start_time = time.time()
    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        agent.reset()
        state, price_data = market.reset()  # ToDo: get the initial state

        for t in range(market.last_data_index):
            # get the agent's current action:
            # call the agent's act() method with the current state
            action, bought_price = agent.act(state, price_data)

            # get the agent's next state according to the market
            next_state, next_price_data, reward, done = \
                market.get_next_state_reward(action, bought_price)

            # add the transaction to memory
            agent.memory.append((state, action, reward, next_state, done))

            # learn from history only if there is enough memory
            if len(agent.memory) > batch_size:
                agent.experience_replay(batch_size)

            state = next_state
            price_data = next_price_data

            if done:
                print("--------------------------------")
                print("Total profit: {0}".format(agent.get_total_profit()))
                print("--------------------------------")

        if e % 10 == 0:
            if not os.path.exists("models"):
                os.mkdir("models")
            agent.model.save("models/model_rl" + str(e))

    end_time = time.time()
    training_time = round(end_time - start_time)
    print("Training took {0} seconds.".format(training_time))
def walker():
    environment = gym.make("Pendulum-v0")
    print(environment.action_space)
    print(environment.observation_space)

    class action_distribution_model(nn.Module):
        def __init__(self):
            super(action_distribution_model, self).__init__()
            self.sequential = nn.Sequential(nn.Linear(3, 24), nn.ReLU(),
                                            nn.Linear(24, 1, bias=False),
                                            nn.Tanh())

        def forward(self, x):
            mean = self.sequential(x)
            mean = mean * 4
            return MultivariateNormal(mean, torch.eye(1) * 0.25)

    distribution = action_distribution_model()
    optimizer = torch.optim.Adam(distribution.parameters(), lr=0.01)

    v_model = nn.Sequential(nn.Linear(3, 24), nn.ReLU(), nn.Linear(24, 1))
    v_optimizer = torch.optim.Adam(v_model.parameters(), lr=0.01)

    learner = PPO(environment, distribution, optimizer, discount_factor=0.99)
    opt_policy, history = learner.learn_policy(epochs=250,
                                               actor_iterations=10,
                                               episodes_per_update=2)

    plt.plot(history)
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.savefig("score.png")

    a = Agent(environment, opt_policy)
    while input("continue ") == "c":
        a.perform_episode(render=True)
def initialize_agents_from_files(self, agent_directory):
    from src.agent import Agent
    agent_files = os.listdir(agent_directory)
    self.network = nx.Graph()
    for each_agent_file in agent_files:
        if '.xml' in each_agent_file:
            agent = Agent()
            agent_filename = agent_directory + each_agent_file
            agent.get_nodes_for_graph(agent_filename, self)
    for each_agent_file in agent_files:
        if '.xml' in each_agent_file:
            agent = Agent()
            agent_filename = agent_directory + each_agent_file
            agent.get_parameters_from_file(agent_filename, self)
            self.agents.append(agent)