class DQNAgent():

    def __init__(self, env):
        self.state_dim = env.observation_space.shape
        self.action_size = env.action_space.n
        self.q_network = QNetwork(self.state_dim, self.action_size)
        self.gamma = 0.97
        self.ep = 1.0
        self.replay_buffer = ReplayBuffer(length=10000)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, state):
        q_state = self.q_network.get_q_state(self.sess, [state])
        if random.random() < self.ep:
            action = np.random.randint(self.action_size)
        else:
            action = np.argmax(q_state)
        return action

    def train(self, state, action, next_state, reward, done):
        self.replay_buffer.add((state, action, next_state, reward, done))
        states, actions, next_states, rewards, dones = self.replay_buffer.sample(50)
        q_next_states = self.q_network.get_q_state(self.sess, next_states)
        q_next_states[dones] = np.zeros([self.action_size])  # zero out Q-values of terminal next states
        q_targets = rewards + self.gamma * np.max(q_next_states, axis=1)
        self.q_network.update_model(self.sess, states, actions, q_targets)
        if done:
            self.ep = max(0.1, 0.99 * self.ep)

    def __del__(self):
        self.sess.close()
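# A minimal usage sketch for the DQNAgent above (hedged: assumes a Gym-style
# environment such as CartPole-v1 plus the QNetwork/ReplayBuffer helpers the
# snippet relies on; the episode count is arbitrary):
import gym

env = gym.make("CartPole-v1")
agent = DQNAgent(env)
for episode in range(200):
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.train(state, action, next_state, reward, done)
        total_reward += reward
        state = next_state
    print("Episode {}, reward {}".format(episode, total_reward))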
def __init__(self, state_size, action_size, seed):
    '''Initialize the Agent.

    Parameters
    ----------
    state_size : int
        The dimension of each state
    action_size : int
        The dimension of each action
    seed : int
        The random seed used to generate random numbers.
    '''
    self.state_size = state_size
    self.action_size = action_size
    random.seed(seed)

    # Q-Network
    self.local_qnetwork = QNetwork(state_size, action_size, seed).to(device)
    self.target_qnetwork = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr=LEARNING_RATE)

    # Replay Memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    self.t_step = 0
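# The example above shows only the constructor. A hedged sketch of the
# epsilon-greedy act() method that typically accompanies this pattern
# (an illustrative assumption, not code from the original source; `device`
# comes from the snippet's module):
def act(self, state, eps=0.0):
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    self.local_qnetwork.eval()
    with torch.no_grad():
        action_values = self.local_qnetwork(state)
    self.local_qnetwork.train()
    # epsilon-greedy selection over the action values
    if random.random() > eps:
        return int(np.argmax(action_values.cpu().data.numpy()))
    return random.choice(np.arange(self.action_size))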
def __init__(self, env, action_size, state_size, use_dueling=False,
             use_double=False, network_file=None):
    self.device = torch.device('cpu')
    self.action_size = action_size
    self.env = env
    self.state_size = state_size
    self.seed = 1234
    self.target_network = QNetwork(state_size=state_size,
                                   action_size=action_size,
                                   seed=self.seed,
                                   use_dueling=use_dueling).to(self.device)
    self.local_network = QNetwork(state_size=state_size,
                                  action_size=action_size,
                                  seed=self.seed,
                                  use_dueling=use_dueling).to(self.device)
    self.optimizer = torch.optim.Adam(self.local_network.parameters(), lr=5e-4)

    if network_file is not None:
        if os.path.exists(network_file):
            checkpoints = torch.load(network_file)
            self.local_network.load_state_dict(checkpoints['local'])
            self.target_network.load_state_dict(checkpoints['target'])
            self.optimizer.load_state_dict(checkpoints['optimizer'])

    self.memory = ReplayBuffer(self.seed, batch_size=BATCH_SIZE, device=self.device)
    self.use_double = use_double
def __init__(self, sess):
    print("Initializing the agent...")

    self.sess = sess
    self.env = Environment()
    self.state_size = self.env.get_state_size()
    self.action_size = self.env.get_action_size()

    print("Creation of the main QNetwork...")
    self.mainQNetwork = QNetwork(self.state_size, self.action_size, 'main')
    print("Main QNetwork created !\n")

    print("Creation of the target QNetwork...")
    self.targetQNetwork = QNetwork(self.state_size, self.action_size, 'target')
    print("Target QNetwork created !\n")

    self.buffer = PrioritizedReplayBuffer(parameters.BUFFER_SIZE, parameters.ALPHA)

    self.epsilon = parameters.EPSILON_START
    self.beta = parameters.BETA_START

    self.initial_learning_rate = parameters.LEARNING_RATE

    trainables = tf.trainable_variables()
    self.update_target_ops = updateTargetGraph(trainables)

    self.nb_ep = 1
    self.best_run = -1e10
def __init__(self, sess, gui, displayer, saver):
    """
    Build a new instance of Environment, QNetwork and ExperienceBuffer.

    Args:
        sess     : the tensorflow session in which to build the network
        gui      : a GUI instance to manage the control of the agent
        displayer: a Displayer instance to keep track of the episode rewards
        saver    : a Saver instance to save periodically the network
    """
    print("Initializing the agent...")

    self.sess = sess
    self.gui = gui
    self.displayer = displayer
    self.saver = saver

    self.env = Environment()
    self.QNetwork = QNetwork(self.sess)
    self.buffer = ExperienceBuffer()

    self.epsilon = Settings.EPSILON_START

    self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1)
    self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

    self.create_summaries()

    self.best_run = -1e10
    self.n_gif = 0

    print("Agent initialized !\n")
def __init__(self, root):
    self.root = root
    self.gridFrame = Frame(self.root)
    self.gridFrame.pack(side=LEFT)  # align grid on the left
    self.width = 680
    self.height = 680
    self.size = 3
    # create canvas inside the frame
    self.canvas = Canvas(self.gridFrame, bg="white", width=self.width,
                         height=self.height)
    self.canvas.pack()
    # binding it with the rest
    self.logic = gameLogic.GameLogic(self.size, QNetwork("o", self.size),
                                     QNetwork("x", self.size))
def main():
    state_dim = 2
    nb_actions = 3
    ep_size = 8
    net = Net1(state_dim, nb_actions)
    agent = QNetwork(net, state_dim, ep_size)

    for _ in range(100):
        # Memorize dummy data
        states = torch.rand(ep_size, state_dim)
        actions = agent.decide(states)
        next_states = torch.rand(ep_size, state_dim)
        rewards = reward(states, actions)
        agent.memorize(states, actions, next_states, rewards)
        agent.update()
        # agent.clear_memory()

    # agent.show_training()

    # Display the agent's decisions
    states_interv = torch.linspace(0, 1, 100)
    states_grid = torch.cartesian_prod(states_interv, states_interv)
    actions = agent.decide(states_grid)

    fig = plt.figure()
    ax = plt.axes(projection="3d")
    ax.plot3D(states_grid[:, 0], states_grid[:, 1], actions)
    plt.show()
    return 0
def create_model(env):
    """
    Create a model depending on the type of environment :env.
    Note that although the Box-to-Box (continuous-to-continuous) version
    technically runs, it doesn't really work.
    """
    if type(env.action_space) == gym.spaces.Box and \
            type(env.observation_space) == gym.spaces.Box:
        return QNetwork(env.observation_space.shape[0], num_hidden,
                        env.action_space.low.shape[0])
    elif type(env.action_space) == gym.spaces.Discrete and \
            type(env.observation_space) == gym.spaces.Box:
        return QNetwork(env.observation_space.low.shape[0], num_hidden,
                        env.action_space.n)
    elif type(env.action_space) == gym.spaces.Box and \
            type(env.observation_space) == gym.spaces.Discrete:
        return QNetwork(env.observation_space.n, num_hidden,
                        env.action_space.low.shape[0])
    elif type(env.action_space) == gym.spaces.Discrete and \
            type(env.observation_space) == gym.spaces.Discrete:
        return QNetwork(env.observation_space.n, num_hidden, env.action_space.n)
    else:
        raise NotImplementedError()
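# Example call, assuming a Gym environment with a Box observation space and
# a Discrete action space (illustrative only):
import gym

env = gym.make("CartPole-v1")
model = create_model(env)  # dispatches to QNetwork(obs_dim, num_hidden, n_actions)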
def __init__(self, state_size, action_size, buffer_size, batch_size, gamma,
             tau, lr, epsilon_init, epsilon_final, epsilon_decay, a, b,
             b_step, update_every, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.lr = lr
    self.epsilon = epsilon_init
    self.epsilon_final = epsilon_final
    self.epsilon_decay = epsilon_decay
    self.a = a
    self.b = b
    self.b_step = b_step
    random.seed(seed)
    self.update_every = update_every
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Q-Network
    arch_params = OrderedDict({
        'state_and_action_sizes': (state_size, action_size),
        'Linear_2': 64,
        'ReLU_2': None,
        'Linear_3': 128,
        'ReLU_3': None,
        'Linear_4': 64,
        'ReLU_4': None,
        'Linear_5': action_size
    })
    self.qnetwork_local = QNetwork(seed, arch_params).to(device)   # decision maker
    self.qnetwork_target = QNetwork(seed, arch_params).to(device)  # fixed target
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

    self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)
    self.t_step = 0
    self.average_TD_error = 1.0
def __init__(self, env, buffer, load_models=False, epsilon=0.5,
             Q_hidden_nodes=Q_HIDDEN_NODES, batch_size=BATCH_SIZE,
             rew_thre=REW_THRE, window=WINDOW,
             path_to_the_models=MODELS_DIR):
    self.path_to_the_models = path_to_the_models
    self.env = env
    self.action_size = ACTION_SIZE
    if load_models:
        self.load_models()
    else:
        self.encoder = Encoder(CODE_SIZE)
        self.decoder = Decoder(CODE_SIZE)
        self.trans_delta = TransitionDelta(3, self.action_size)
        self.transition = Transition(self.encoder, self.decoder, self.trans_delta)
        self.network = QNetwork(env=env, encoder=self.encoder,
                                n_hidden_nodes=Q_hidden_nodes)
    self.target_network = deepcopy(self.network)
    # self.f = open("res/planner_enc_DDQN.txt", "a+")
    self.buffer = buffer
    self.epsilon = epsilon
    self.batch_size = batch_size
    self.window = window
    self.reward_threshold = rew_thre
    self.initialize()
    self.action = 0
    self.step_count = 0
    self.cum_rew = 0
    self.timestamp = 0
    self.episode = 0
    self.difference = 0
    self.different_codes = 0
    self.A = [to_categorical(i, self.action_size)
              for i in range(self.action_size)]
def __init__(self, state_size, action_size, seed):
    '''Args:
        state_size: Int, number of dims in the state space
        action_size: Int, number of dims in the action space
        seed: Int, to set random seed'''
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # create the local and target Q-networks and the optimizer,
    # set to optimize the local network
    self.qn_local = QNetwork(state_size, action_size, seed=seed).to(device)
    self.qn_target = QNetwork(state_size, action_size, seed=seed).to(device)
    self.optimizer = optim.Adam(params=self.qn_local.parameters(), lr=LR)

    # create the memory buffer
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # counter for steps to learn
    self.t_step = 0
def __init__(self):
    self.training_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)
    self.prediction_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)

    self.env = Environment()
    self.action_dim = self.env.get_num_actions()
    self.observation_dim = self.env.get_observation_dim()

    self.model = QNetwork(model_name="Name",
                          num_actions=self.action_dim,
                          observation_dim=self.observation_dim,
                          gamma=Config.DISCOUNT,
                          seed=Config.SEED,
                          log_dir=Config.LOG_DIR)

    self.training_step = 0
    self.action_step = 0

    self.agents = []
    self.predictors = []
    self.trainers = []
    self.dynamic_adjustment = ThreadDynamicAdjustment(self)
def __init__(self, state_dim: int, nb_actions: int, net: torch.nn.Module,
             next_state_func, final_states_func, rewards_func, device=None):
    """
    :param state_dim: Dimensions needed to describe a state. For example, a
        position on a plane will need state_dim = 2.
    :param nb_actions: Number of different possible actions at most.
    :param net: Torch neural network used by the agent. Should have its
        output dimension equal to nb_actions and its input dimension equal
        to state_dim.
    :param next_state_func: Function of signature
        (2D torch tensor, a: int, time: int, torch device) -> 2D torch tensor
        which, for a tensor S where S[i, :] is a state, returns a tensor NS
        where NS[i, :] is the state obtained by performing action a in state
        S[i, :]. Time indicates how many transitions have already taken place
        during the exploration.
    :param final_states_func: Function of signature
        (2D torch tensor, time: int, torch device) -> 1D torch tensor
        which, for a tensor S where S[i, :] is a state, returns a tensor F
        where F[i] == 1 iff S[i, :] is final and F[i] == 0 otherwise.
    :param rewards_func: Function of signature
        (2D torch tensor, a: int, time: int, torch device) -> 1D torch tensor
        which, for a tensor S where S[i, :] is a state, returns a tensor R
        where R[i] is the reward for taking action a in the state S[i, :].
        Time indicates how many transitions have already taken place during
        the exploration.
    :param device: Torch device to be used for computations and training.
    """
    self.state_dim = state_dim
    self.nb_actions = nb_actions
    if device is None:
        self.device = torch.device("cpu")
    else:
        self.device = device
    self.agent = QNetwork(net, state_dim, 32, device=self.device)
    self.next_state_func = next_state_func
    self.final_states_func = final_states_func
    self.rewards_func = rewards_func
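# A hedged sketch of the three callbacks this constructor expects, for a toy
# 1-D random walk (these implementations are illustrative assumptions; only
# the signatures are taken from the docstring above):
import torch

def next_state_func(states, a, time, device):
    # action 0 moves left by 0.1, any other action moves right by 0.1
    delta = -0.1 if a == 0 else 0.1
    return states + torch.full_like(states, delta)

def final_states_func(states, time, device):
    # a state is final once its coordinate leaves [0, 1]
    return ((states[:, 0] < 0.0) | (states[:, 0] > 1.0)).long()

def rewards_func(states, a, time, device):
    # reward 1 for moving right, 0 otherwise
    return torch.full((states.shape[0],), float(a != 0), device=device)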
with tf.Session() as sess:
    saver = Saver.Saver(sess)
    displayer = Displayer.Displayer()
    buffer = ExperienceBuffer()
    gui = GUI.Interface(['ep_reward', 'plot', 'render', 'gif', 'save'])

    main_agent = Agent(sess, 0, gui, displayer, buffer)

    threads = []
    for i in range(1, Settings.NB_ACTORS):
        agent = Agent(sess, i, gui, displayer, buffer)
        threads.append(threading.Thread(target=agent.run))

    # with tf.device('/device:GPU:0'):
    learner = QNetwork(sess, gui, saver, buffer)
    threads.append(threading.Thread(target=learner.run))

    if not saver.load():
        sess.run(tf.global_variables_initializer())

    gui_thread = threading.Thread(target=lambda: gui.run(main_agent))
    gui_thread.start()
    for t in threads:
        t.start()

    print("Running...")
    main_agent.run()

    for t in threads:
        t.join()
if __name__ == "__main__":
    import loop_environments

    env = loop_environments.create_env("SimpleWindyGridWorld")

    # Let's run it!
    num_episodes = 200
    batch_size = 10
    discount_factor = 0.8
    learn_rate = 1e-3
    memory = ReplayMemory(10000)
    num_hidden = 128
    seed = 42  # This is not randomly chosen
    # env = gym.envs.make("Acrobot-v1")
    # print(f"Action space: {env.action_space} - State space: {env.observation_space}")

    # We will seed the algorithm (before initializing QNetwork!) for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    env.seed(seed)

    print(env.observation_space.shape)
    print(env.action_space.shape)
    model = QNetwork(env.observation_space.n, num_hidden, env.action_space.n)

    episode_durations, episode_rewards = run_episodes(
        train, model, memory, env, num_episodes, batch_size,
        discount_factor, learn_rate)
    plt.plot(episode_durations)
    plt.savefig("test.png")
                'logger: %s\n', args.simulator, args.networkPath, args.lr,
                args.batchSize, args.itr, args.eps, args.gamma, args.memory,
                args.frequency, args.testSize, args.device, args.threads,
                args.checkpoints, args.logger)


if __name__ == '__main__':
    if args.checkpoints and not os.path.exists('checkpoints'):
        os.makedirs('checkpoints')
    simulator = SimulatorFactory.getInstance(args.simulator, args)
    trainer = DQN(QNetwork(simulator.dState(), simulator.nActions()))
    try:
        logger.info('Starting training.')
        trainer.train(args)
    except KeyboardInterrupt:
        logger.info('KeyboardInterrupt received. Trying to stop threads.')
    finally:
        trainer.stop()
        simulator.destroy()
class Plan_RL_agent:

    def __init__(self, env, buffer, load_models=False, epsilon=0.05,
                 Q_hidden_nodes=Q_HIDDEN_NODES, batch_size=BATCH_SIZE,
                 rew_thre=REW_THRE, min_rew=MINIMUM_REWARD, window=WINDOW,
                 path_to_the_models=MODELS_DIR):
        print("MARGIN: ", MARGIN)
        print("1/MARGIN: ", 1 / MARGIN)
        self.margin_discrete = 0
        self.lq = 0
        self.lts = 0
        self.ltx = 0
        self.ld = 0
        self.l_spars = 0
        self.path_to_the_models = path_to_the_models
        self.env = env
        self.action_size = ACTION_SIZE
        self.state_size = STATE_SIZE
        self.code_size = CODE_SIZE
        if load_models:
            self.load_models()
        else:
            self.encoder = Encoder(self.code_size)
            self.decoder = Decoder(self.code_size)
            self.trans_delta = TransitionDelta(self.code_size, self.action_size)
            self.network = QNetwork(env=env, n_hidden_nodes=Q_hidden_nodes,
                                    encoder=self.encoder)
            self.transition = Transition(self.encoder, self.decoder, self.trans_delta)
            params = [self.encoder.parameters(), self.decoder.parameters(),
                      self.trans_delta.parameters(),
                      self.network.symbolic_net.parameters()]
            params = itertools.chain(*params)
            self.optimizer = torch.optim.Adam(params, lr=0.001)
        # self.f = open("res/planner_enc_DDQN.txt", "a+")
        self.target_network = deepcopy(self.network)
        self.buffer = buffer
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.window = window
        self.reward_threshold = rew_thre
        self.min_reward = min_rew
        self.maximum_horizon = 1
        self.horizon = 1
        self.initialize()
        self.action = 0
        self.temp_s1 = 0
        self.step_count = 0
        self.cum_rew = 0
        self.timestamp = 0
        self.episode = 0
        self.difference = 0
        self.A = [to_categorical(i, self.action_size)
                  for i in range(self.action_size)]

    def monitor_replanning(self, horizon, show=True, plot=True):
        done = False
        self.rewards = 0
        if plot:
            self.plans = []
        while not done:
            if show:
                self.env.render()
            done = self.take_step(horizon=horizon, plot=plot)
        if show:
            print("Episode reward: ", self.rewards)
        if plot:
            self.plot_plans()
        return self.rewards

    def save_models(self):
        torch.save(self.encoder, self.path_to_the_models + "encoder")
        torch.save(self.decoder, self.path_to_the_models + "decoder")
        torch.save(self.trans_delta, self.path_to_the_models + "trans_delta")
        torch.save(self.network, self.path_to_the_models + "Q_net")

    def load_models(self):
        self.encoder = torch.load(self.path_to_the_models + "encoder")
        self.encoder.eval()
        self.decoder = torch.load(self.path_to_the_models + "decoder")
        self.decoder.eval()
        self.trans_delta = torch.load(self.path_to_the_models + "trans_delta")
        self.trans_delta.eval()
        self.network = torch.load(self.path_to_the_models + "Q_net")
        self.network.eval()

    def plot_training_rewards(self):
        plt.plot(self.mean_training_rewards)
        plt.title('Mean training rewards')
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        # plt.show()
        plt.savefig(self.path_to_the_models + 'mean_training_rewards.png')
        plt.clf()

    def plot_plans(self):
        fig = plt.gcf()
        fig.set_size_inches(28, 4)
        d = len(self.plans[0])
        executed_actions = [p[0] for p in self.plans]
        for i in range(len(self.plans)):
            plt.plot(range(i, i + d), self.plans[i], color='blue')
        plt.plot(executed_actions, c='red')
        plt.title('Monitor replanning plans')
        plt.ylabel('Actions')
        plt.xlabel('Steps')
        # plt.show()
        fig.savefig(self.path_to_the_models + 'monitor_replanning_{}.png'.format(d))
        plt.clf()

    def expandFunc(self, x, a):
        _, x_prime, x_prime_d = self.trans_delta(
            x, torch.from_numpy(a).type(torch.FloatTensor).to(device), True)
        lmse = nn.MSELoss()
        self.disc_error = lmse(x_prime, x_prime_d).item()
        if PREDICT_CERTAINTY:
            c = 1 - self.disc_error
        else:
            c = 1
        return x_prime, x_prime_d, c

    def vFunc(self, x):
        v0 = self.network.get_enc_value(x)
        return torch.max(v0).to('cpu').detach().numpy()

    def certainty(self, x):
        if PREDICT_CERTAINTY:
            x_p = self.encoder(self.decoder(x))
            distance = torch.nn.L1Loss()
            c = 1 - distance(x, x_p).item()
        else:
            c = 1
        return c

    def findPlan(self, node):
        # base case
        if node.sons == []:
            return [node.a], node.v * node.c
        somme_values = []
        plans = []
        for n in node.sons:
            p, s = self.findPlan(n)
            plans.append(p)
            somme_values.append(s)

        # evaluate plans: if several plans share the maximum value,
        # pick one of them at random
        smax = max(somme_values)
        indices_max = [i for i, j in enumerate(somme_values) if j == smax]
        k = random.choice(indices_max)
        bestp = plans[k]
        return [node.a] + bestp, node.v * node.c + smax

    def limited_expansion(self, node, depth):
        if depth == 0:
            return
        for a in self.A:
            x_prime, x_prime_d, c = self.expandFunc(node.x, a)
            if self.margin_discrete >= 0.499999:
                node.expand(x_prime_d, self.vFunc(x_prime_d), a, node.c * c)
            else:
                node.expand(x_prime_d,
                            self.vFunc(x_prime + (x_prime_d - x_prime) *
                                       (2 * self.margin_discrete)),
                            a, node.c * c)
        for i in range(len(node.sons)):
            self.limited_expansion(node.sons[i], depth - 1)

    def planner_action(self, depth=1, verbose=False, plot=False):
        if np.random.random() < 0.05:
            return np.random.choice(self.action_size)
        origin_code = self.encoder(
            torch.from_numpy(self.s_0).type(torch.FloatTensor), True)
        origin_value = self.vFunc(origin_code)
        root = Node(origin_code, origin_value,
                    to_categorical(0, self.action_size),
                    self.certainty(origin_code))
        self.limited_expansion(root, depth)
        if verbose:
            root.print_parentetic()
        plan, sum_value = self.findPlan(root)
        if verbose:
            print("plan: {}, sum_value: {}".format(plan[1:], sum_value))
        if plot:
            plan_read = [np.where(plan[i] == 1)[0][0]
                         for i in range(1, len(plan))]
            self.plans.append(plan_read)
        return np.where(plan[1] == 1)[0][0]

    def planner_action_old(self, depth=1):
        # if np.random.random() < 0.05:
        #     return np.random.choice(self.action_size)
        origin_code = self.encoder(
            torch.from_numpy(self.s_0).type(torch.FloatTensor))
        origin_value = self.network.get_enc_value(origin_code)
        origin_node = plan_node(origin_code, origin_value)
        origin_node.action_vec = [0]
        action = torch.argmax(origin_value).to('cpu').detach().numpy()
        a0 = to_categorical(0, self.action_size)
        a1 = to_categorical(1, self.action_size)
        a2 = to_categorical(2, self.action_size)
        _, ns0 = self.trans_delta(
            origin_code, torch.from_numpy(a0).type(torch.FloatTensor).to('cuda'))
        _, ns1 = self.trans_delta(
            origin_code, torch.from_numpy(a1).type(torch.FloatTensor).to('cuda'))
        _, ns2 = self.trans_delta(
            origin_code, torch.from_numpy(a2).type(torch.FloatTensor).to('cuda'))
        v0 = self.network.get_enc_value(ns0)
        v1 = self.network.get_enc_value(ns1)
        v2 = self.network.get_enc_value(ns2)
        max0 = torch.max(v0).to('cpu').detach().numpy()
        arg_max0 = torch.argmax(v0).to('cpu').detach().numpy()
        max1 = torch.max(v1).to('cpu').detach().numpy()
        arg_max1 = torch.argmax(v1).to('cpu').detach().numpy()
        max2 = torch.max(v2).to('cpu').detach().numpy()
        arg_max2 = torch.argmax(v2).to('cpu').detach().numpy()
        l_max = [max0, max1, max2]
        l_amax = [0, 1, 2]
        return l_amax[np.argmax(l_max)]

    def is_diff(self, s1, s0):
        for i in range(len(s0)):
            if s0[i] != s1[i]:
                return True
        return False

    def take_step(self, mode='train', horizon=0, plot=False):
        s_1, r, done, _ = self.env.step(self.action)
        enc_s1 = self.encoder(
            torch.from_numpy(np.asarray(s_1)).type(torch.FloatTensor))
        enc_s0 = self.encoder(
            torch.from_numpy(np.asarray(self.s_0)).type(torch.FloatTensor).to('cuda'))
        if self.is_diff(enc_s0, enc_s1):
            self.timestamp = self.step_count
            self.buffer.append(self.s_0, self.action, r, done, s_1)
            self.cum_rew = 0
        if mode == 'explore':
            self.action = self.env.action_space.sample()
        else:
            if horizon == 0:
                # ADAPTIVE HORIZON
                if len(self.mean_training_rewards) == 0:
                    self.horizon = 1
                else:
                    step = (self.reward_threshold - self.min_reward) / self.maximum_horizon
                    for i in range(self.maximum_horizon):
                        if self.mean_training_rewards[-1] < self.min_reward + (i + 1) * step:
                            self.horizon = i + 1
                            break
            else:
                self.horizon = horizon
            self.action = self.planner_action(depth=self.horizon, plot=plot)
        self.s_0 = s_1.copy()
        self.rewards += r
        self.step_count += 1
        if done:
            self.s_0 = self.env.reset()
        return done

    # Implement DQN training algorithm
    def train(self, gamma=0.99, max_episodes=1000,
              network_update_frequency=4, network_sync_frequency=200):
        self.gamma = gamma

        # Populate replay buffer
        while self.buffer.burn_in_capacity() < 1:
            self.take_step(mode='explore')

        ep = 0
        training = True
        while training:
            self.s_0 = self.env.reset()
            self.rewards = 0
            done = False
            while not done:
                if (ep % 20) == 0:
                    self.env.render()
                p = np.random.random()
                if p < self.epsilon:
                    done = self.take_step(mode='explore')
                else:
                    done = self.take_step(mode='train')

                # Update network
                if self.step_count % network_update_frequency == 0:
                    self.update()
                # Sync networks
                if self.step_count % network_sync_frequency == 0:
                    self.target_network.load_state_dict(self.network.state_dict())
                    self.sync_eps.append(ep)

                if done:
                    ep += 1
                    self.margin_discrete = min([0.5 - pow(0.5, 0.15 * ep + 1), 0.499999])
                    if self.margin_discrete >= 0.499999:
                        DISCRETE_CODES = True
                    if self.epsilon >= 0.05:
                        self.epsilon = self.epsilon * 0.7
                    self.episode = ep
                    self.training_rewards.append(self.rewards)
                    self.training_loss.append(np.mean(self.update_loss))
                    self.update_loss = []
                    mean_rewards = np.mean(self.training_rewards[-self.window:])
                    self.mean_training_rewards.append(mean_rewards)
                    print("\rEpisode {:d} Mean Rewards {:.2f} Episode reward = {:.2f} "
                          "lq = {:.3f} horizon ={} ltx ={:3f} ld ={:3f} l_spars={:3f} "
                          "margin={:3f} disc_err={:3f}\t\t".format(
                              ep, mean_rewards, self.rewards, self.lq,
                              self.horizon, self.ltx, self.ld, self.l_spars,
                              self.margin_discrete, self.disc_error), end="")
                    # self.f.write(str(mean_rewards) + "\n")
                    if ep >= max_episodes:
                        training = False
                        print('\nEpisode limit reached.')
                        break
                    if mean_rewards >= self.reward_threshold:
                        training = False
                        print('\nEnvironment solved in {} episodes!'.format(ep))
                        break

        # save models
        self.save_models()
        # plot
        self.plot_training_rewards()

    def calculate_loss(self, batch):
        states, actions, rewards, dones, next_states = [i for i in batch]
        rewards_t = torch.FloatTensor(rewards).to(device=self.network.device).reshape(-1, 1)
        actions_t = torch.LongTensor(np.array(actions)).reshape(-1, 1).to(device=self.network.device)
        dones_t = torch.ByteTensor(dones).to(device=self.network.device)

        ###############
        # DDQN Update #
        ###############
        qvals = self.network.get_qvals(states)
        qvals = torch.gather(qvals.to('cpu'), 1, actions_t)

        next_vals = self.network.get_qvals(next_states)
        next_actions = torch.max(next_vals.to('cpu'), dim=-1)[1]
        next_actions_t = torch.LongTensor(next_actions).reshape(-1, 1).to(device=self.network.device)
        target_qvals = self.target_network.get_qvals(next_states)
        qvals_next = torch.gather(target_qvals.to('cpu'), 1, next_actions_t).detach()
        ###############

        qvals_next[dones_t] = 0  # Zero out terminal states
        expected_qvals = self.gamma * qvals_next + rewards_t
        self.lq = nn.MSELoss()(qvals, expected_qvals)
        return self.lq

    def pred_update(self, batch):
        loss_function = nn.MSELoss()
        states, actions, rewards, dones, next_states = [i for i in batch]

        # reshape actions into one-hot vectors
        cat_actions = []
        for act in actions:
            cat_actions.append(np.asarray(to_categorical(act, self.action_size)))
        cat_actions = np.asarray(cat_actions)
        a_t = torch.FloatTensor(cat_actions).to('cuda')

        # reshape states
        if type(states) is tuple:
            states = np.array([np.ravel(s) for s in states])
        states = torch.FloatTensor(states).to('cuda')

        # reshape next_states
        if type(next_states) is tuple:
            next_states = np.array([np.ravel(s) for s in next_states])
        next_states = torch.FloatTensor(next_states).to('cuda')

        self.ltx, self.lts = self.transition.one_step_loss(states, a_t, next_states)
        # to make it comparable to lq:
        # self.ltx *= 50
        self.ld = self.transition.distant_codes_loss(states, next_states)
        self.l_spars = self.transition.distant_from_relu_loss(
            self.encoder(states), 0.5, self.margin_discrete)
        self.l_spars += self.transition.distant_from_relu_loss(
            self.encoder(next_states), 0.5, self.margin_discrete)
        deltas, _ = self.transition.forward_one_step(states, a_t)
        self.l_spars += self.transition.distant_from_relu_loss(deltas, 0.5, self.margin_discrete)
        self.l_spars += self.transition.distant_from_relu_loss(deltas, -0.5, self.margin_discrete)

        L = self.lts + self.ltx + self.ld + self.l_spars
        return L

    def pred_update_two_steps(self, batch):
        loss_function = nn.MSELoss()
        (states, actions, rewards, dones, next_states,
         actions_2, rewards_2, dones_2, next_states_2) = [i for i in batch]

        cat_actions = []
        cat_actions_2 = []

        # reshape actions into one-hot vectors
        for act in actions:
            cat_actions.append(np.asarray(to_categorical(act, self.action_size)))
        cat_actions = np.asarray(cat_actions)
        a_t = torch.FloatTensor(cat_actions).to(device)

        # reshape actions_2 into one-hot vectors
        for act in actions_2:
            cat_actions_2.append(np.asarray(to_categorical(act, self.action_size)))
        cat_actions_2 = np.asarray(cat_actions_2)
        a_t_2 = torch.FloatTensor(cat_actions_2).to(device)

        # reshape states
        if type(states) is tuple:
            states = np.array([np.ravel(s) for s in states])
        states = torch.FloatTensor(states).to(device)

        # reshape next_states
        if type(next_states) is tuple:
            next_states = np.array([np.ravel(s) for s in next_states])
        next_states = torch.FloatTensor(next_states).to(device)

        # reshape next_states_2
        if type(next_states_2) is tuple:
            next_states_2 = np.array([np.ravel(s) for s in next_states_2])
        next_states_2 = torch.FloatTensor(next_states_2).to(device)

        L = self.transition.two_step_loss(states, a_t, next_states, a_t_2, next_states_2)
        # to also include the triplet loss:
        # L + self.transition.triplet_loss_encoder(states, next_states, next_states_2, MARGIN)
        L.backward()
        self.transition.optimizer.step()
        return

    def update(self):
        self.optimizer.zero_grad()
        batch = self.buffer.sample_batch(batch_size=self.batch_size)
        loss_q = self.calculate_loss(batch)
        batch2 = self.buffer.sample_batch(batch_size=self.batch_size)
        loss_t = self.pred_update(batch2)
        # TODO: compute the loss on a single batch
        loss = loss_t + loss_q
        loss.backward()
        self.optimizer.step()

    def initialize(self):
        self.training_rewards = []
        self.training_loss = []
        self.update_loss = []
        self.mean_training_rewards = []
        self.sync_eps = []
        self.rewards = 0
        self.step_count = 0
        self.s_0 = self.env.reset()
def test(agent: QNetwork, movements=100, nb_episodes=1000, step=0.01,
         show_plots=True):
    """
    Tests the ability of the QNetwork to learn to reach the position
    (0.5, 0.5) while spawning at random coordinates in [0, 1]^2.

    :param agent: QNetwork to be tested. Needs to have state_dim == 2 and
        5 possible actions.
    :param movements: Number of moves the agent is allowed to make
    :param step: Distance travelled at each move
    :param nb_episodes: Number of episodes on which the agent trains
    :param show_plots: if True, the agent will plot the results of the training
    :return: The agent's loss memory
    """
    # A state is defined as its x and y coordinates
    state_dim = 2
    # Calculation device
    device = torch.device("cpu")

    for ep in range(nb_episodes):
        # Play a single episode.
        # Create arrays to store the successive states and taken actions
        states = torch.empty((movements + 1, state_dim),
                             device=device)  # + 1 to make space for the last state
        actions = torch.empty(movements, dtype=torch.int32, device=device)
        # Start with a random position
        states[0] = torch.rand(2)
        for move in range(movements):
            # Take action
            actions[move] = agent.decide(states[move].view(1, -1)).item()
            # Get next state
            states[move + 1] = next_state(states[move], actions[move], step, device)
        # Get rewards
        rewards = get_rewards(states[:-1], actions, step, device)
        # Memorize the episode
        agent.memorize_exploration(states, actions, rewards,
                                   last_state_is_final=False)
        # Train after the episode
        agent.update()
        printProgressBar(ep + 1, nb_episodes, "Episodes completed: ", length=90)

    if show_plots:
        plt.figure("Training summary")
        plt.subplot(111)
        plt.title("Agent Trajectories")
        agent.plot_trajectory(torch.rand((50, 2)),
                              lambda s, a: next_state(s, a, step, device))
        plt.show()
    return agent.loss_mem
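# next_state and get_rewards are not shown in this snippet. A hedged sketch
# consistent with how they are called above (the 5-action layout
# stay/up/down/left/right is an assumption; only the call signatures come
# from the code):
MOVES = torch.tensor([[0., 0.], [0., 1.], [0., -1.], [-1., 0.], [1., 0.]])

def next_state(state, action, step, device):
    # move the 2-D position by `step` in the chosen direction
    return state + step * MOVES[action.long()].to(device)

def get_rewards(states, actions, step, device):
    # reward = negative distance of the next position to the target (0.5, 0.5)
    target = torch.tensor([0.5, 0.5], device=device)
    next_positions = states + step * MOVES[actions.long()].to(device)
    return -torch.norm(next_positions - target, dim=1)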
def her_experiment():
    batch_size = 256
    discount_factor = 0.8
    learn_rate = 1e-3
    num_hidden = 128
    num_episodes = 2
    epochs = 200
    training_steps = 10
    memory_size = 100000
    # her = False
    seeds = [42, 30, 2, 19, 99]  # This is not randomly chosen
    shape = [30, 30]
    targets = lambda x, y: [0, x * y - 1, x - 1, (y - 1) * x]
    env = GridworldEnv(shape=shape, targets=targets(*shape))

    # functions for grid world
    def sample_goal():
        return np.random.choice(env.targets, 1)

    extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)

    def calc_reward(state, action, goal):
        if state == goal:
            return 0.0
        else:
            return -1.0

    # # maze
    # def sample_goal():
    #     return env.maze.end_pos
    # extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)
    # def calc_reward(state, action, goal):
    #     if state == goal:
    #         return 0.0
    #     else:
    #         return -1.0

    means = []
    x_epochs = []
    l_stds = []
    h_stds = []
    for her in [True, False]:
        episode_durations_all = []
        for seed in seeds:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            env.seed(seed)
            print(env.reset())
            memory = ReplayMemory(memory_size)
            if her:
                # model = QNetwork(env.observation_space.shape[0] + 2, num_hidden, env.action_space.n)
                model = QNetwork(2 * env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train, model, memory, env, num_episodes, training_steps,
                    epochs, batch_size, discount_factor, learn_rate,
                    sample_goal, extract_goal, calc_reward, use_her=True)
            else:
                model = QNetwork(env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train, model, memory, env, num_episodes, training_steps,
                    epochs, batch_size, discount_factor, learn_rate,
                    sample_goal, extract_goal, calc_reward, use_her=False)
            episode_durations_all.append(
                loop_environments.smooth(episode_durations, 10))
        mean = np.mean(episode_durations_all, axis=0)
        means.append(mean)
        std = np.std(episode_durations_all, ddof=1, axis=0)
        l_stds.append(mean - std)
        h_stds.append(mean + std)
        x_epochs.append(list(range(len(mean))))

    line_plot_var(x_epochs, means, l_stds, h_stds, "Epoch", "Duration",
                  ["HindsightReplay", "RandomReplay"],
                  "Episode duration per epoch", ["orange", "blue"])
    name = "her_" + str(shape)
    file_name = os.path.join("./results", name)
    with open(file_name + ".pkl", "wb") as f:
        pickle.dump((x_epochs, means, l_stds, h_stds), f)
class ReinforcementLearningRunner():

    def __init__(self, mode):
        self.mode = mode
        cu.mem('Reinforcement Learning Started')
        self.environment = RegionFilteringEnvironment(config.get(mode + 'Database'), mode)
        self.controller = QNetwork()
        cu.mem('QNetwork controller created')
        self.learner = None
        self.agent = RegionFilteringAgent(self.controller, self.learner)
        self.task = RegionFilteringTask(self.environment, config.get(mode + 'GroundTruth'))
        self.experiment = Experiment(self.task, self.agent)

    def runEpoch(self, interactions, maxImgs):
        img = 0
        s = cu.tic()
        while img < maxImgs:
            self.experiment.doInteractions(interactions)
            self.agent.learn()
            self.agent.reset()
            self.environment.loadNextEpisode()
            img += 1
        s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

    def run(self):
        if self.mode == 'train':
            self.agent.persistMemory = True
            self.agent.startReplayMemory(len(self.environment.db.images),
                                         config.geti('trainInteractions'),
                                         config.geti('stateFeatures'))
            self.train()
        elif self.mode == 'test':
            self.agent.persistMemory = False
            self.test()

    def train(self):
        interactions = config.geti('trainInteractions')
        minEpsilon = config.getf('minTrainingEpsilon')
        epochSize = len(self.environment.db.images) / 2
        epsilon = 1.0
        self.controller.setEpsilonGreedy(epsilon)
        print 'Epoch 0: Exploration'
        self.runEpoch(interactions, len(self.environment.db.images))
        self.learner = QLearning()
        self.agent.learner = self.learner
        epoch = 1
        egEpochs = config.geti('epsilonGreedyEpochs')
        while epoch <= egEpochs:
            epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
            if epsilon < minEpsilon:
                epsilon = minEpsilon
            self.controller.setEpsilonGreedy(epsilon)
            print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            epoch += 1
        epoch = 1
        maxEpochs = config.geti('exploitLearningEpochs')
        while epoch <= maxEpochs:
            print 'Epoch', epoch + egEpochs, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            epoch += 1

    def test(self):
        interactions = config.geti('testInteractions')
        self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
        self.runEpoch(interactions, len(self.environment.db.images))
class Agent:
    """
    This class builds an agent with its own QNetwork, memory buffer and
    environment to learn a policy.
    """

    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment and QNetwork.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to save periodically the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.QNetwork = QNetwork(sess)
        self.buffer = ExperienceBuffer(prioritized=Settings.PRIORITIZED_ER)

        self.epsilon = Settings.EPSILON_START
        self.beta = Settings.BETA_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")

    def create_summaries(self):
        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.epsilon_ph = tf.placeholder(tf.float32)
        epsilon_summary = tf.summary.scalar("Settings/Epsilon", self.epsilon_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, epsilon_summary, steps_summary])

        self.lr_ph = tf.placeholder(tf.float32)
        self.lr_summary = tf.summary.scalar("Settings/Learning rate", self.lr_ph)

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):
            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:
                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS // 5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences
        and learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.QNetwork.init_target()
        self.gui_thread.start()

        self.nb_ep = 1
        learning_steps = 0

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:
            s = self.env.reset()
            episode_reward = 0
            done = False
            memory = deque()
            episode_step = 1

            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))
            plot_distrib = self.gui.plot_distrib.get(self.nb_ep)

            while episode_step <= max_step and not done:

                # Exploration by NoisyNets or epsilon-greedy policy
                if not Settings.NOISY and random.random() < self.epsilon:
                    a = self.env.act_random()
                else:
                    if Settings.DISTRIBUTIONAL:
                        Qdistrib = self.QNetwork.act(s)
                        Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    else:
                        Qvalue = self.QNetwork.act(s)
                    a = np.argmax(Qvalue, axis=0)

                    if plot_distrib:
                        self.displayer.disp_distrib(self.z, self.delta_z,
                                                    Qdistrib, Qvalue)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, done))

                # Keep the experience in memory until 'N_STEP_RETURN' steps have
                # passed to get the delayed return r_1 + ... + gamma^n r_n
                while len(memory) >= Settings.N_STEP_RETURN or (memory and memory[-1][4]):
                    s_mem, a_mem, discount_R, si_, done_ = memory.popleft()
                    if not done_ and memory:
                        for i in range(Settings.N_STEP_RETURN - 1):
                            si, ai, ri, si_, done_ = memory[i]
                            discount_R += ri * Settings.DISCOUNT**(i + 1)
                            if done_:
                                break
                    self.buffer.add((s_mem, a_mem, discount_R, si_,
                                     1 if not done_ else 0))

                if episode_step % Settings.TRAINING_FREQ == 0:
                    if Settings.PRIORITIZED_ER:
                        batch, idx, weights = self.buffer.sample(self.beta)
                    else:
                        batch = self.buffer.sample(self.beta)
                        idx = weights = None
                    loss = self.QNetwork.train(np.asarray(batch), weights)
                    self.buffer.update(idx, loss)
                    self.QNetwork.update_target()

                    feed_dict = {self.lr_ph: self.QNetwork.learning_rate}
                    summary = self.sess.run(self.lr_summary, feed_dict=feed_dict)
                    self.writer.add_summary(summary, learning_steps)
                    learning_steps += 1

                s = s_
                episode_step += 1

            # Decay epsilon
            if self.epsilon > Settings.EPSILON_STOP:
                self.epsilon -= Settings.EPSILON_DECAY

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f'
                      ', Max steps: %i, Learning rate: %fe-4' %
                      (self.nb_ep, episode_reward, episode_step, self.epsilon,
                       max_step, self.QNetwork.learning_rate * 1e4))

            # Write the summary
            feed_dict = {self.ep_reward_ph: episode_reward,
                         self.epsilon_ph: self.epsilon,
                         self.steps_ph: episode_step}
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        print("Training completed !")
        self.env.close()
        self.display()
        self.gui.end_training()
        self.gui_thread.join()

    def play(self, number_run=1, gif=False, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            gif       : whether to save a gif or not
            name      : the name of the gif that will be saved
        """
        self.env.set_render(Settings.DISPLAY)

        for i in range(number_run):
            s = self.env.reset()
            episode_reward = 0
            done = False
            self.env.set_gif(gif, name)

            while not done:
                if Settings.DISTRIBUTIONAL:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                else:
                    Qvalue = self.QNetwork.act(s)
                a = np.argmax(Qvalue, axis=0)

                s, r, done, info = self.env.act(a)
                episode_reward += r

            if gif:
                self.env.save_gif()
            print("Episode reward :", episode_reward)

    def display(self):
        self.displayer.disp()

    def stop(self):
        self.env.close()

    def interrupt(self, sig, frame):
        self.gui.stop_run()
class Agent():

    def __init__(self, params):
        self.params = params
        self.__state_dim = params['state_dim']
        self.__action_dim = params['action_dim']
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__lr = params['lr']
        self.__update_every = params['update_every']
        eps = params['eps']
        eps_decay = params['eps_decay']
        min_eps = params['min_eps']
        seed = params['seed']
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        critic_params = dict()
        critic_params['seed'] = seed
        critic_params['arch_params'] = params['arch_params_critic']
        self.critic_local = QNetwork(critic_params).to(device)
        self.critic_target = QNetwork(critic_params).to(device)
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(), lr=self.__lr)

        # Policy
        actor_params = dict()
        actor_params['seed'] = seed
        actor_params['arch_params'] = params['arch_params_actor']
        actor_params['noise_type'] = params['noise_type']
        actor_params['eps'] = eps
        actor_params['eps_decay'] = eps_decay
        actor_params['min_eps'] = min_eps
        self.actor_local = Policy(actor_params).to(device)
        self.actor_target = Policy(actor_params).to(device)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(), lr=self.__lr)

        self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
        self.__t_step = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action, reward, next_state, done)
        self.__t_step = (self.__t_step + 1)

    def choose_action(self, state):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        state = torch.from_numpy(state.astype(dtype=np.float)).to(device)
        action, action_perturbed = self.actor_local(state)
        return action, action_perturbed

    def learn_from_past_experiences(self):
        if self.__t_step % self.__update_every == 0:
            if len(self.__memory) > self.__batch_size:
                experiences = self.__memory.sample()
                self.update_Qnet_and_policy(experiences)

    def update_Qnet_and_policy(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        # if done == True, the second term is equal to 0
        Q_targets = rewards + (self.__gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)

        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        # new predicted actions, not the ones stored in the buffer
        predicted_actions, predicted_actions_perturbed = self.actor_local(states)

        if self.params['noise_type'] == 'parameter':
            # if the distance between predicted_actions and
            # predicted_actions_perturbed is too big (>= 0.3), shrink the
            # noise, otherwise grow it
            if (predicted_actions - predicted_actions_perturbed).pow(2).mean() >= 0.3:
                self.actor_local.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor_local.eps *= 1.01
                self.actor_target.eps *= 1.01

        loss_actor = -self.critic_local(states, predicted_actions).mean()
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()

        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def update_eps(self):
        self.actor_local.eps = max(
            self.actor_local.eps * self.actor_local.eps_decay,
            self.actor_local.min_eps)
        self.actor_target.eps = max(
            self.actor_target.eps * self.actor_target.eps_decay,
            self.actor_target.min_eps)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data +
                                    (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params = {'actor_params': self.actor_local.policy_params,
                        'state_dict': self.actor_local.state_dict()}
        critic_params = {'critic_params': self.critic_local.qnet_params,
                         'state_dict': self.critic_local.state_dict()}
        file = dict()
        file['critic_params'] = critic_params
        file['actor_params'] = actor_params
        torch.save(file, open(save_to, 'wb'))

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        qnet_params = checkpoint['critic_params']
        policy_params = checkpoint['actor_params']
        self.actor_local = Policy(policy_params['actor_params'])
        self.actor_local.load_state_dict(checkpoint['actor_params']['state_dict'])
        self.critic_local = QNetwork(qnet_params['critic_params'])
        self.critic_local.load_state_dict(checkpoint['critic_params']['state_dict'])
        return self
class BoxSearchRunner():

    def __init__(self, mode):
        self.mode = mode
        cu.mem('Reinforcement Learning Started')
        self.environment = BoxSearchEnvironment(config.get(mode + 'Database'),
                                                mode, config.get(mode + 'GroundTruth'))
        self.controller = QNetwork()
        cu.mem('QNetwork controller created')
        self.learner = None
        self.agent = BoxSearchAgent(self.controller, self.learner)
        self.task = BoxSearchTask(self.environment, config.get(mode + 'GroundTruth'))
        self.experiment = Experiment(self.task, self.agent)

    def runEpoch(self, interactions, maxImgs):
        img = 0
        s = cu.tic()
        while img < maxImgs:
            k = 0
            while not self.environment.episodeDone and k < interactions:
                self.experiment._oneInteraction()
                k += 1
            self.agent.learn()
            self.agent.reset()
            self.environment.loadNextEpisode()
            img += 1
        s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

    def run(self):
        if self.mode == 'train':
            self.agent.persistMemory = True
            self.agent.startReplayMemory(len(self.environment.imageList),
                                         config.geti('trainInteractions'))
            self.train()
        elif self.mode == 'test':
            self.agent.persistMemory = False
            self.test()

    def train(self):
        networkFile = config.get('networkDir') + config.get('snapshotPrefix') + \
            '_iter_' + config.get('trainingIterationsPerBatch') + '.caffemodel'
        interactions = config.geti('trainInteractions')
        minEpsilon = config.getf('minTrainingEpsilon')
        epochSize = len(self.environment.imageList) / 1
        epsilon = 1.0
        self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
        epoch = 1
        exEpochs = config.geti('explorationEpochs')
        while epoch <= exEpochs:
            s = cu.tic()
            print 'Epoch', epoch, ': Exploration (epsilon=1.0)'
            self.runEpoch(interactions, len(self.environment.imageList))
            self.task.flushStats()
            self.doValidation(epoch)
            s = cu.toc('Epoch done in ', s)
            epoch += 1
        self.learner = QLearning()
        self.agent.learner = self.learner
        egEpochs = config.geti('epsilonGreedyEpochs')
        while epoch <= egEpochs + exEpochs:
            s = cu.tic()
            epsilon = epsilon - (1.0 - minEpsilon) / float(egEpochs)
            if epsilon < minEpsilon:
                epsilon = minEpsilon
            self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
            print 'Epoch', epoch, '(epsilon-greedy:{:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            self.task.flushStats()
            self.doValidation(epoch)
            s = cu.toc('Epoch done in ', s)
            epoch += 1
        maxEpochs = config.geti('exploitLearningEpochs') + exEpochs + egEpochs
        while epoch <= maxEpochs:
            s = cu.tic()
            print 'Epoch', epoch, '(exploitation mode: epsilon={:5.3f})'.format(epsilon)
            self.runEpoch(interactions, epochSize)
            self.task.flushStats()
            self.doValidation(epoch)
            s = cu.toc('Epoch done in ', s)
            shutil.copy(networkFile, networkFile + '.' + str(epoch))
            epoch += 1

    def test(self):
        interactions = config.geti('testInteractions')
        self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
        self.runEpoch(interactions, len(self.environment.imageList))

    def doValidation(self, epoch):
        if epoch % config.geti('validationEpochs') != 0:
            return
        auxRL = BoxSearchRunner('test')
        auxRL.run()
        indexType = config.get('evaluationIndexType')
        category = config.get('category')
        if indexType == 'pascal':
            categories, catIndex = bse.get20Categories()
        elif indexType == 'relations':
            categories, catIndex = bse.getCategories()
        elif indexType == 'finetunedRelations':
            categories, catIndex = bse.getRelationCategories()
        if category in categories:
            catI = categories.index(category)
        else:
            catI = -1
        scoredDetections = bse.loadScores(config.get('testMemory'), catI)
        groundTruthFile = config.get('testGroundTruth')
        # ps, rs = bse.evaluateCategory(scoredDetections, 'scores', groundTruthFile)
        pl, rl = bse.evaluateCategory(scoredDetections, 'landmarks', groundTruthFile)
        line = lambda x, y, z: x + '\t{:5.3f}\t{:5.3f}\n'.format(y, z)
        # print line('Validation Scores:', ps, rs)
        print line('Validation Landmarks:', pl, rl)
class Agent():

    def __init__(self, capacity, state_size, action_size,
                 pretrained_model_path=None, tau=1e-3, gamma=0.99,
                 batch_size=32, lr=1e-4, learn_every_n_steps=4):
        # Environment variables
        self.state_size = state_size
        self.action_size = action_size

        # Create Q-networks
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        if pretrained_model_path is not None:
            self.qnetwork_local.load_state_dict(torch.load(pretrained_model_path))

        # Initialize memory buffer
        self.memory = ReplayBuffer(capacity, batch_size)

        # Initialize time step for updating the networks every n steps
        self.learn_every_n_steps = learn_every_n_steps
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Learn from the action and the environment's response."""
        self.memory.add(state, action, reward, next_state, done)

        # Maybe learn, if learn_every_n_steps has passed
        self.t_step = (self.t_step + 1) % self.learn_every_n_steps
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=1.):
        """
        Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        else:
            return random.choice(np.arange(self.action_size)).astype(int)

    def learn(self, experiences):
        """Update network parameters."""
        states, actions, rewards, next_states, dones = experiences

        # Get the best score according to the target network and evaluate it
        # against the local network
        next_action_values = self.qnetwork_target(next_states).detach().max(dim=1)[0].unsqueeze(1)
        y = rewards + (self.gamma * next_action_values * (1 - dones))
        yhat = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(yhat, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update()

    def soft_update(self):
        """Perform a soft update of the frozen target network."""
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
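# A short usage sketch for this agent with an epsilon schedule (hedged: the
# environment name and hyperparameters below are illustrative assumptions):
import gym

env = gym.make("LunarLander-v2")
agent = Agent(capacity=int(1e5),
              state_size=env.observation_space.shape[0],
              action_size=env.action_space.n)
eps = 1.0
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, 0.995 * eps)  # decay exploration over time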
def create_q(path):
    print(path)
    net = torch.load(path)
    net = net.train()
    return QNetwork(net, path, lr=1e-4)