Example #1
class DQNAgent():
    def __init__(self, env):
        self.state_dim = env.observation_space.shape
        self.action_size = env.action_space.n
        self.q_network = QNetwork(self.state_dim, self.action_size)
        self.gamma = 0.97
        self.ep = 1.0
        self.replay_buffer = ReplayBuffer(length=10000)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, state):
        q_state = self.q_network.get_q_state(self.sess, [state])
        if random.random() < self.ep:
            action = np.random.randint(self.action_size)
        else:
            action = np.argmax(q_state)
        return action

    def train(self, state, action, next_state, reward, done):
        self.replay_buffer.add((state, action, next_state, reward, done))
        states, actions, next_states, rewards, dones = self.replay_buffer.sample(
            50)
        q_next_states = self.q_network.get_q_state(self.sess, next_states)
        q_next_states[dones] = np.zeros([self.action_size])  # sets q_next_state to 0 if done
        q_targets = rewards + self.gamma * np.max(q_next_states, axis=1)

        self.q_network.update_model(self.sess, states, actions, q_targets)

        if done: self.ep = max(0.1, 0.99 * self.ep)

    def __del__(self):
        self.sess.close()
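The snippet above only shows the agent; the `QNetwork` it drives is defined elsewhere in that repository. As a minimal sketch of a compatible TF1 network (the hidden-layer size and internals are assumptions; only the `get_q_state`/`update_model` interface is taken from the calls above):

import tensorflow as tf


class QNetwork:
    """Minimal TF1-style Q-network matching the interface used above (sketch)."""

    def __init__(self, state_dim, action_size, hidden_size=64, lr=1e-3):
        self.state_in = tf.placeholder(tf.float32, shape=(None, *state_dim))
        self.action_in = tf.placeholder(tf.int32, shape=(None,))
        self.q_target_in = tf.placeholder(tf.float32, shape=(None,))

        hidden = tf.layers.dense(self.state_in, hidden_size, activation=tf.nn.relu)
        self.q_state = tf.layers.dense(hidden, action_size)  # Q(s, a) for every action

        # Q-value of the action that was actually taken
        action_one_hot = tf.one_hot(self.action_in, depth=action_size)
        q_action = tf.reduce_sum(self.q_state * action_one_hot, axis=1)

        self.loss = tf.reduce_mean(tf.square(self.q_target_in - q_action))
        self.optimizer = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def get_q_state(self, sess, states):
        return sess.run(self.q_state, feed_dict={self.state_in: states})

    def update_model(self, sess, states, actions, q_targets):
        feed = {self.state_in: states, self.action_in: actions, self.q_target_in: q_targets}
        sess.run(self.optimizer, feed_dict=feed)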
Example #2
    def __init__(self, state_size, action_size, seed):
        '''Initialize the Agent.
		
		Parameters
		----------
		state_size : int
			The dimension of each state
		
		action_size : int
			The dimension of each action
		
		seed : int
			The random seed used to generate random numbers.
		'''
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)

        #Q-Network
        self.local_qnetwork = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.target_qnetwork = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.local_qnetwork.parameters(),
                                    lr=LEARNING_RATE)

        #Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0
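The `QNetwork(state_size, action_size, seed)` constructor used here comes from the surrounding repository; a minimal sketch of a compatible PyTorch module (the two 64-unit hidden layers are an assumption) would be:

import torch
import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    """Fully connected Q-value approximator (assumed architecture)."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        # Map a batch of states to one Q-value per action.
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)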
Example #3
    def __init__(self,
                 env,
                 action_size,
                 state_size,
                 use_dueling=False,
                 use_double=False,
                 network_file=None):
        self.device = torch.device('cpu')
        self.action_size = action_size
        self.env = env
        self.state_size = state_size
        self.seed = 1234
        self.target_network = QNetwork(state_size=state_size,
                                       action_size=action_size,
                                       seed=self.seed,
                                       use_dueling=use_dueling).to(self.device)
        self.local_network = QNetwork(state_size=state_size,
                                      action_size=action_size,
                                      seed=self.seed,
                                      use_dueling=use_dueling).to(self.device)
        self.optimizer = torch.optim.Adam(self.local_network.parameters(),
                                          lr=5e-4)

        if network_file is not None:
            if os.path.exists(network_file):
                checkpoints = torch.load(network_file)
                self.local_network.load_state_dict(checkpoints['local'])
                self.target_network.load_state_dict(checkpoints['target'])
                self.optimizer.load_state_dict(checkpoints['optimizer'])

        self.memory = ReplayBuffer(self.seed,
                                   batch_size=BATCH_SIZE,
                                   device=self.device)
        self.use_double = use_double
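The checkpoint loading above expects a dictionary with 'local', 'target' and 'optimizer' entries; a matching save helper (a sketch of the counterpart, not part of the original example) could be:

import torch


def save_checkpoint(agent, network_file):
    # Persist both networks and the optimizer state so training can resume where it stopped.
    torch.save(
        {
            'local': agent.local_network.state_dict(),
            'target': agent.target_network.state_dict(),
            'optimizer': agent.optimizer.state_dict(),
        },
        network_file,
    )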
Example #4
    def __init__(self, sess):
        print("Initializing the agent...")

        self.sess = sess
        self.env = Environment()
        self.state_size = self.env.get_state_size()
        self.action_size = self.env.get_action_size()

        print("Creation of the main QNetwork...")
        self.mainQNetwork = QNetwork(self.state_size, self.action_size, 'main')
        print("Main QNetwork created !\n")

        print("Creation of the target QNetwork...")
        self.targetQNetwork = QNetwork(self.state_size, self.action_size,
                                       'target')
        print("Target QNetwork created !\n")

        self.buffer = PrioritizedReplayBuffer(parameters.BUFFER_SIZE,
                                              parameters.ALPHA)

        self.epsilon = parameters.EPSILON_START
        self.beta = parameters.BETA_START

        self.initial_learning_rate = parameters.LEARNING_RATE

        trainables = tf.trainable_variables()
        self.update_target_ops = updateTargetGraph(trainables)

        self.nb_ep = 1
        self.best_run = -1e10
Example #5
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment, QNetwork and ExperienceBuffer.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to save periodically the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.displayer = displayer
        self.saver = saver

        self.env = Environment()
        self.QNetwork = QNetwork(self.sess)
        self.buffer = ExperienceBuffer()
        self.epsilon = Settings.EPSILON_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS - 1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")
Example #6
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment and QNetwork.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to save periodically the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.QNetwork = QNetwork(sess)
        self.buffer = ExperienceBuffer(prioritized=Settings.PRIORITIZED_ER)
        self.epsilon = Settings.EPSILON_START
        self.beta = Settings.BETA_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS -
                                                            1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")
    def __init__(self,
                 capacity,
                 state_size,
                 action_size,
                 pretrained_model_path=None,
                 tau=1e-3,
                 gamma=0.99,
                 batch_size=32,
                 lr=1e-4,
                 learn_every_n_steps=4):
        # Environment variables
        self.state_size = state_size
        self.action_size = action_size

        # Create Qnetworks
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(),
                                          lr=lr)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        if pretrained_model_path is not None:
            self.qnetwork_local.load_state_dict(
                torch.load(pretrained_model_path))

        # Initialize memory buffer
        self.memory = ReplayBuffer(capacity, batch_size)

        # Initialize time step for updating target network every q steps
        self.learn_every_n_steps = learn_every_n_steps
        self.t_step = 0
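`tau` here is the coefficient of a soft (Polyak) update of the target network toward the local one; a sketch of the update step these parameters imply (the helper name is ours) is:

def soft_update(local_network, target_network, tau):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_network.parameters(),
                                         local_network.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)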
Example #8
    def __init__(self, env):
        self.state_dim = env.observation_space.shape
        self.action_size = env.action_space.n
        self.q_network = QNetwork(self.state_dim, self.action_size)
        self.gamma = 0.97
        self.ep = 1.0
        self.replay_buffer = ReplayBuffer(length=10000)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
Example #9
    def __init__(self, root):

        self.root = root
        self.gridFrame = Frame(self.root)
        self.gridFrame.pack(side = LEFT) # align grid on the left
        self.width = 680
        self.height = 680
        self.size = 3
        self.canvas = Canvas(self.gridFrame, bg="white", width=self.width, height=self.height) #create canvas inside the frame
        self.canvas.pack() # binding it with the rest
        self.logic = gameLogic.GameLogic(self.size, QNetwork("o", self.size), QNetwork("x", self.size))
Example #10
 def __init__(self, mode):
     self.mode = mode
     cu.mem('Reinforcement Learning Started')
     self.environment = RegionFilteringEnvironment(
         config.get(mode + 'Database'), mode)
     self.controller = QNetwork()
     cu.mem('QNetwork controller created')
     self.learner = None
     self.agent = RegionFilteringAgent(self.controller, self.learner)
     self.task = RegionFilteringTask(self.environment,
                                     config.get(mode + 'GroundTruth'))
     self.experiment = Experiment(self.task, self.agent)
Example #11
    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        qnet_params = checkpoint['critic_params']
        policy_params = checkpoint['actor_params']

        self.actor_local = Policy(policy_params['actor_params'])
        self.actor_local.load_state_dict(
            checkpoint['actor_params']['state_dict'])

        self.critic_local = QNetwork(qnet_params['critic_params'])
        self.critic_local.load_state_dict(
            checkpoint['critic_params']['state_dict'])
        return self
Example #12
def main():
    state_dim = 2
    nb_actions = 3
    ep_size = 8
    net = Net1(state_dim, nb_actions)
    agent = QNetwork(net, state_dim, ep_size)

    for _ in range(100):
        # Memorize dummy data
        states = torch.rand(ep_size, state_dim)
        actions = agent.decide(states)
        next_states = torch.rand(ep_size, state_dim)
        rewards = reward(states, actions)
        agent.memorize(states, actions, next_states, rewards)
        agent.update()
        # agent.clear_memory()

    # agent.show_training()

    # Displaying the agent's decisions
    states_interv = torch.linspace(0, 1, 100)
    states_grid = torch.cartesian_prod(states_interv, states_interv)
    actions = agent.decide(states_grid)
    fig = plt.figure()
    ax = plt.axes(projection="3d")
    ax.plot3D(states_grid[:, 0], states_grid[:, 1], actions)
    plt.show()

    return 0
Example #13
    def __init__(self, env, buffer, load_models = False, epsilon=0.05, Q_hidden_nodes = Q_HIDDEN_NODES, batch_size= BATCH_SIZE, rew_thre = REW_THRE, min_rew = MINIMUM_REWARD, window = WINDOW, path_to_the_models = MODELS_DIR):

        print("MARGIN: ", MARGIN)
        print(("1/MARGIN: ", 1/MARGIN))
        self.margin_discrete = 0
        self.lq = 0
        self.lts = 0
        self.ltx = 0
        self.ld = 0
        self.l_spars = 0
        self.path_to_the_models = path_to_the_models
        self.env = env

        self.action_size = ACTION_SIZE
        self.state_size = STATE_SIZE
        self.code_size = CODE_SIZE

        if load_models:
            self.load_models()
        else:
            self.encoder = Encoder(self.code_size)
            self.decoder = Decoder(self.code_size)
            self.trans_delta = TransitionDelta(self.code_size, self.action_size)
            self.network = QNetwork(env=env, n_hidden_nodes=Q_hidden_nodes, encoder=self.encoder)

        self.transition = Transition(self.encoder, self.decoder, self.trans_delta)
        params = [self.encoder.parameters(),self.decoder.parameters(), self.trans_delta.parameters(), self.network.symbolic_net.parameters()]
        params = itertools.chain(*params)
        self.optimizer = torch.optim.Adam(params,
                                         lr=0.001)
        #self.f = open("res/planner_enc_DDQN.txt", "a+")
        self.target_network = deepcopy(self.network)
        self.buffer = buffer
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.window = window
        self.reward_threshold = rew_thre
        self.min_reward = min_rew
        self.maximum_horizon = 1
        self.horizon = 1
        self.initialize()
        self.action = 0
        self.temp_s1 = 0
        self.step_count = 0
        self.cum_rew = 0
        self.timestamp = 0
        self.episode = 0
        self.difference = 0
        self.A = [to_categorical(i, self.action_size) for i in range(self.action_size)]
Example #14
def create_model(env):
    """
    Create a model suited to the action and observation spaces of :env. Note that although
    the Box-to-Box (continuous-to-continuous) version technically runs, it doesn't really work.
    """
    if type(env.action_space) == gym.spaces.Box and type(env.observation_space) == gym.spaces.Box:
        return QNetwork(env.observation_space.shape[0], num_hidden, env.action_space.low.shape[0])
    elif type(env.action_space) == gym.spaces.Discrete and type(env.observation_space) == gym.spaces.Box:
        return QNetwork(env.observation_space.low.shape[0], num_hidden, env.action_space.n)
    elif type(env.action_space) == gym.spaces.Box and type(env.observation_space) == gym.spaces.Discrete:
        return QNetwork(env.observation_space.n, num_hidden, env.action_space.low.shape[0])
    elif type(env.action_space) == gym.spaces.Discrete and type(env.observation_space) == gym.spaces.Discrete:
        return QNetwork(env.observation_space.n, num_hidden, env.action_space.n)
    else:
        raise NotImplementedError()
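A short usage sketch (assuming `gym` is installed and that `num_hidden` is defined in the surrounding module; the value below is a placeholder):

import gym

num_hidden = 128  # placeholder; the original module defines its own value

env = gym.make("CartPole-v1")  # Box observations, Discrete actions
model = create_model(env)      # -> QNetwork(4, num_hidden, 2)
print(model)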
Example #15
    def __init__(self, state_size, action_size, buffer_size, batch_size, gamma,
                 tau, lr, epsilon_init, epsilon_final, epsilon_decay, a, b,
                 b_step, update_every, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.epsilon = epsilon_init
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay
        self.a = a
        self.b = b
        self.b_step = b_step
        random.seed(seed)
        self.update_every = update_every
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        arch_params = OrderedDict({
            'state_and_action_sizes': (state_size, action_size),
            'Linear_2': 64,
            'ReLU_2': None,
            'Linear_3': 128,
            'ReLU_3': None,
            'Linear_4': 64,
            'ReLU_4': None,
            'Linear_5': action_size
        })
        self.qnetwork_local = QNetwork(seed, arch_params).to(
            device)  # decision_maker
        self.qnetwork_target = QNetwork(seed, arch_params).to(device)  # fixed
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed)
        self.t_step = 0
        self.average_TD_error = 1.0
Example #16
    def __init__(self,
                 env,
                 buffer,
                 load_models=False,
                 epsilon=0.5,
                 Q_hidden_nodes=Q_HIDDEN_NODES,
                 batch_size=BATCH_SIZE,
                 rew_thre=REW_THRE,
                 window=WINDOW,
                 path_to_the_models=MODELS_DIR):

        self.path_to_the_models = path_to_the_models
        self.env = env
        self.action_size = ACTION_SIZE

        if load_models:
            self.load_models()
        else:
            self.encoder = Encoder(CODE_SIZE)
            self.decoder = Decoder(CODE_SIZE)
            self.trans_delta = TransitionDelta(3, self.action_size)
            self.transition = Transition(self.encoder, self.decoder,
                                         self.trans_delta)
            self.network = QNetwork(env=env,
                                    encoder=self.encoder,
                                    n_hidden_nodes=Q_hidden_nodes)
        self.target_network = deepcopy(self.network)
        #self.f = open("res/planner_enc_DDQN.txt", "a+")
        self.buffer = buffer
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.window = window
        self.reward_threshold = rew_thre
        self.initialize()
        self.action = 0
        self.step_count = 0
        self.cum_rew = 0
        self.timestamp = 0
        self.episode = 0
        self.difference = 0
        self.different_codes = 0
        self.A = [
            to_categorical(i, self.action_size)
            for i in range(self.action_size)
        ]
Example #17
 def __init__(self, mode):
   self.mode = mode
   cu.mem('Reinforcement Learning Started')
   self.environment = BoxSearchEnvironment(config.get(mode+'Database'), mode, config.get(mode+'GroundTruth'))
   self.controller = QNetwork()
   cu.mem('QNetwork controller created')
   self.learner = None
   self.agent = BoxSearchAgent(self.controller, self.learner)
   self.task = BoxSearchTask(self.environment, config.get(mode+'GroundTruth'))
   self.experiment = Experiment(self.task, self.agent)
Example #18
    def __init__(self, state_size, action_size, seed):
        '''Args:
            state_size: Int, number of dims in the state space
            action_size: Int, number of dims in the action space
            seed: Int, to set random seed'''
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # create the local and target Qnetworks and Optimizer set to optimize the local network
        self.qn_local = QNetwork(state_size, action_size, seed=seed).to(device)
        self.qn_target = QNetwork(state_size, action_size,
                                  seed=seed).to(device)
        self.optimizer = optim.Adam(params=self.qn_local.parameters(), lr=LR)

        # create the memory buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # counter for steps to learn
        self.t_step = 0
Example #19
    def __init__(self, params):
        self.params = params
        self.__state_dim = params['state_dim']
        self.__action_dim = params['action_dim']
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__lr = params['lr']
        self.__update_every = params['update_every']
        eps = params['eps']
        eps_decay = params['eps_decay']
        min_eps = params['min_eps']
        seed = params['seed']
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        critic_params = dict()
        critic_params['seed'] = seed
        critic_params['arch_params'] = params['arch_params_critic']
        self.critic_local = QNetwork(critic_params).to(device)
        self.critic_target = QNetwork(critic_params).to(device)
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=self.__lr)

        #Policy
        actor_params = dict()
        actor_params['seed'] = seed
        actor_params['arch_params'] = params['arch_params_actor']
        actor_params['noise_type'] = params['noise_type']
        actor_params['eps'] = eps
        actor_params['eps_decay'] = eps_decay
        actor_params['min_eps'] = min_eps
        actor_params['arch_params'] = params['arch_params_actor']
        self.actor_local = Policy(actor_params).to(device)
        self.actor_target = Policy(actor_params).to(device)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=self.__lr)

        self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
        self.__t_step = 0
Example #20
File: Server.py Project: HuanjunWang/GA3C
    def __init__(self):
        self.training_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)
        self.prediction_q = Queue(maxsize=Config.MAX_QUEUE_SIZE)
        self.env = Environment()
        self.action_dim = self.env.get_num_actions()
        self.observation_dim = self.env.get_observation_dim()

        self.model = QNetwork(model_name="Name",
                              num_actions=self.action_dim,
                              observation_dim=self.observation_dim,
                              gamma=Config.DISCOUNT,
                              seed=Config.SEED,
                              log_dir=Config.LOG_DIR)

        self.training_step = 0
        self.action_step = 0

        self.agents = []
        self.predictors = []
        self.trainers = []
        self.dynamic_adjustment = ThreadDynamicAdjustment(self)
Example #21
    def __init__(self, state_dim: int, nb_actions: int,
                 net: torch.nn.Module,
                 next_state_func,
                 final_states_func,
                 rewards_func,
                 device=None):
        """
        :param state_dim: Dimensions needed to describe a state. For example a position
            on a plan will need state_dim = 2.
        :param nb_actions: Number of different possible actions at most.
        :param net: Torch neural network used by the agent. Should have
            its output dimension equal to nb_actions and its input dimension equal to state_dim.
        :param next_state_func: Function of signature (2D torch tensor, a: int, time: int, torch device)
                                --> 2D torch tensor
            which for a tensor S where S[i, :] is a state, returns a tensor NS where NS[i, :] is the state
            obtained by performing action a in state S[i, :]. Time indicates how many transitions have already
            taken place during the exploration.
        :param final_states_func: Function of signature (2D torch tensor, time: int, torch device)
                                --> 1D torch tensor
            which for a tensor S where S[i, :] is a state, returns a tensor F where F[i] == 1 iff S[i, :] is final
            and F[i] == 0 otherwise.
        :param rewards_func: Function of signature (2D torch tensor, a: int, time: int, torch device)
                                --> 1D torch tensor
            which for a tensor S where S[i, :] is a state, returns a tensor R where R[i] is the reward for taking
            action a in the state S[i, :]. Time indicates how many transitions have already taken place during
            the exploration.
        :param device: Torch device to be used for computations and training
        """
        self.state_dim = state_dim
        self.nb_actions = nb_actions

        if device is None:
            self.device = torch.device("cpu")
        else:
            self.device = device

        self.agent = QNetwork(net, state_dim, 32, device=self.device)
        self.next_state_func = next_state_func
        self.final_states_func = final_states_func
        self.rewards_func = rewards_func
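To make the documented signatures concrete, here is a toy set of callbacks for a 1-D walk on [0, 1] (purely illustrative; none of these functions appear in the original code):

import torch

STEP = 0.05  # illustrative move size


def next_state_func(states, a, time, device):
    # Action 0 moves left, action 1 moves right; positions stay within [0, 1].
    direction = -1.0 if a == 0 else 1.0
    return torch.clamp(states + direction * STEP, 0.0, 1.0).to(device)


def final_states_func(states, time, device):
    # A state is final once it reaches the right edge.
    return (states[:, 0] >= 1.0).to(dtype=torch.float32, device=device)


def rewards_func(states, a, time, device):
    # Reward moving right, penalize moving left.
    reward = 1.0 if a == 1 else -1.0
    return torch.full((states.shape[0],), reward, device=device)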
Example #22
    with tf.Session() as sess:

        saver = Saver.Saver(sess)
        displayer = Displayer.Displayer()
        buffer = ExperienceBuffer()

        gui = GUI.Interface(['ep_reward', 'plot', 'render', 'gif', 'save'])

        main_agent = Agent(sess, 0, gui, displayer, buffer)
        threads = []
        for i in range(1, Settings.NB_ACTORS):
            agent = Agent(sess, i, gui, displayer, buffer)
            threads.append(threading.Thread(target=agent.run))

        # with tf.device('/device:GPU:0'):
        learner = QNetwork(sess, gui, saver, buffer)
        threads.append(threading.Thread(target=learner.run))

        if not saver.load():
            sess.run(tf.global_variables_initializer())

        gui_thread = threading.Thread(target=lambda: gui.run(main_agent))
        gui_thread.start()
        for t in threads:
            t.start()

        print("Running...")
        main_agent.run()

        for t in threads:
            t.join()
Example #23
if __name__ == "__main__":
    import loop_environments
    env = loop_environments.create_env("SimpleWindyGridWorld")
    # Let's run it!
    num_episodes = 200
    batch_size = 10
    discount_factor = 0.8
    learn_rate = 1e-3
    memory = ReplayMemory(10000)
    num_hidden = 128
    seed = 42  # This is not randomly chosen
    # env = gym.envs.make("Acrobot-v1")
    # print(f"Action space: {env.action_space} - State space: {env.observation_space}")
    # We will seed the algorithm (before initializing QNetwork!) for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    env.seed(seed)

    print(env.observation_space.shape)
    print(env.action_space.shape)
    model = QNetwork(env.observation_space.n, num_hidden, env.action_space.n)

    episode_durations, episode_rewards = run_episodes(train, model, memory,
                                                      env, num_episodes,
                                                      batch_size,
                                                      discount_factor,
                                                      learn_rate)
    plt.plot(episode_durations)
    plt.savefig("test.png")
Example #24
            'logger: %s\n',
            args.simulator,
            args.networkPath,
            args.lr,
            args.batchSize,
            args.itr,
            args.eps,
            args.gamma,
            args.memory,
            args.frequency,
            args.testSize,
            args.device,
            args.threads,
            args.checkpoints,
            args.logger)

if __name__ == '__main__':
    if args.checkpoints and not os.path.exists('checkpoints'):
        os.makedirs('checkpoints')

    simulator = SimulatorFactory.getInstance(args.simulator, args)
    trainer = DQN(QNetwork(simulator.dState(), simulator.nActions()))
    try:
        logger.info('Starting training.')
        trainer.train(args)
    except KeyboardInterrupt:
        logger.info('KeyboardInterrupt received. Trying to stop threads.')
    finally:
        trainer.stop()
        simulator.destroy()
Example #25
class Plan_RL_agent:
    def __init__(self, env, buffer, load_models = False, epsilon=0.05, Q_hidden_nodes = Q_HIDDEN_NODES, batch_size= BATCH_SIZE, rew_thre = REW_THRE, min_rew = MINIMUM_REWARD, window = WINDOW, path_to_the_models = MODELS_DIR):

        print("MARGIN: ", MARGIN)
        print(("1/MARGIN: ", 1/MARGIN))
        self.margin_discrete = 0
        self.lq = 0
        self.lts = 0
        self.ltx = 0
        self.ld = 0
        self.l_spars = 0
        self.path_to_the_models = path_to_the_models
        self.env = env

        self.action_size = ACTION_SIZE
        self.state_size = STATE_SIZE
        self.code_size = CODE_SIZE

        if load_models:
            self.load_models()
        else:
            self.encoder = Encoder(self.code_size)
            self.decoder = Decoder(self.code_size)
            self.trans_delta = TransitionDelta(self.code_size, self.action_size)
            self.network = QNetwork(env=env, n_hidden_nodes=Q_hidden_nodes, encoder=self.encoder)

        self.transition = Transition(self.encoder, self.decoder, self.trans_delta)
        params = [self.encoder.parameters(),self.decoder.parameters(), self.trans_delta.parameters(), self.network.symbolic_net.parameters()]
        params = itertools.chain(*params)
        self.optimizer = torch.optim.Adam(params,
                                         lr=0.001)
        #self.f = open("res/planner_enc_DDQN.txt", "a+")
        self.target_network = deepcopy(self.network)
        self.buffer = buffer
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.window = window
        self.reward_threshold = rew_thre
        self.min_reward = min_rew
        self.maximum_horizon = 1
        self.horizon = 1
        self.initialize()
        self.action = 0
        self.temp_s1 = 0
        self.step_count = 0
        self.cum_rew = 0
        self.timestamp = 0
        self.episode = 0
        self.difference = 0
        self.A = [to_categorical(i, self.action_size) for i in range(self.action_size)]


    def monitor_replanning(self, horizon, show = True, plot = True):
        done = False
        self.rewards = 0
        if plot:
            self.plans = []
        while not done:
            if show:
                self.env.render()
            done = self.take_step(horizon = horizon, plot = plot)
        if show:
            print("Episode reward: ", self.rewards)
        if plot:
            self.plot_plans()
        return self.rewards

    def save_models(self):
        torch.save(self.encoder, self.path_to_the_models + "encoder")
        torch.save(self.decoder, self.path_to_the_models + "decoder")
        torch.save(self.trans_delta, self.path_to_the_models + "trans_delta")
        torch.save(self.network, self.path_to_the_models + "Q_net")

    def load_models(self):
        self.encoder = torch.load(self.path_to_the_models+"encoder")
        self.encoder.eval()
        self.decoder = torch.load(self.path_to_the_models+"decoder")
        self.decoder.eval()
        self.trans_delta = torch.load(self.path_to_the_models+"trans_delta")
        self.trans_delta.eval()
        self.network = torch.load(self.path_to_the_models+"Q_net")
        self.network.eval()

    def plot_training_rewards(self):
        plt.plot(self.mean_training_rewards)
        plt.title('Mean training rewards')
        plt.ylabel('Reward')
        plt.xlabel('Episods')
        #plt.show()
        plt.savefig(self.path_to_the_models+'mean_training_rewards.png')
        plt.clf()

    def plot_plans(self):
        fig = plt.gcf()
        fig.set_size_inches(28, 4)
        d = len(self.plans[0])
        executed_actions = [p[0] for p in self.plans]

        for i in range(len(self.plans)):
            plt.plot(range(i,i+d), self.plans[i], color='blue')
        plt.plot( executed_actions, c='red')

        plt.title('Monitor replanning plans')
        plt.ylabel('Actions')
        plt.xlabel('Steps')
        #plt.show()
        fig.savefig(self.path_to_the_models+'monitor_replanning_{}.png'.format(d))
        plt.clf()

    def expandFunc(self, x, a):
        _, x_prime, x_prime_d = self.trans_delta(x, torch.from_numpy(a).type(torch.FloatTensor).to(device), True)
        lmse = nn.MSELoss()
        #print(x_prime)
        #print(x_prime_d)
        self.disc_error = lmse(x_prime, x_prime_d).item()

        if PREDICT_CERTAINTY:
            c = 1 - self.disc_error
        #print(l)
        #print(c)
        else:
            c = 1

        return x_prime, x_prime_d, c

    def vFunc(self, x):
        v0 = self.network.get_enc_value(x)
        return torch.max(v0).to('cpu').detach().numpy()

    def certainty(self, x):
        if PREDICT_CERTAINTY:
            x_p = self.encoder(self.decoder(x))
            distance = torch.nn.L1Loss()
            c = 1 - distance(x, x_p).item()
        else:
            c = 1
        return c


    def findPlan(self, node):
        # base case
        if node.sons == []:
            return [node.a], node.v*node.c

        somme_values = []
        plans = []
        for n in node.sons:
            p, s = self.findPlan(n)
            plans.append(p)
            somme_values.append(s)
            # print("plan p", p)
            # print("plan p", s)

        ###### evaluate plans
        # if several plans share the maximum value, pick one of them at random
        smax = max(somme_values)
        indices_max = [i for i, j in enumerate(somme_values) if j == smax]
        k = random.choice(indices_max)

        bestp = plans[k]

        return [node.a] + bestp, node.v * node.c + smax

    def limited_expansion(self, node, depth):
        if depth == 0:
            return

        for a in self.A:
            x_prime, x_prime_d, c = self.expandFunc(node.x, a)
            if self.margin_discrete >= 0.499999:
                node.expand(x_prime_d, self.vFunc(x_prime_d), a, node.c * c)
            else:
                node.expand(x_prime_d, self.vFunc(x_prime + (x_prime_d - x_prime)*(2* self.margin_discrete)), a, node.c * c)
            #node.expand(x_prime_d, self.vFunc(x_prime + (x_prime_d - x_prime)*(1.5*self.margin_discrete)), a, node.c * c)
            #node.expand(x_prime_d, self.vFunc(x_prime_d), a, node.c * c)

        for i in range(len(node.sons)):
            self.limited_expansion(node.sons[i], depth - 1)

    def planner_action(self, depth=1, verbose = False, plot = False):
        if np.random.random() < 0.05:
            return np.random.choice(self.action_size)

        origin_code = self.encoder(torch.from_numpy(self.s_0).type(torch.FloatTensor), True)
        #print("Origin code: ", origin_code)
        origin_value = self.vFunc(origin_code)
        root = Node(origin_code, origin_value, to_categorical(0, self.action_size), self.certainty(origin_code))

        self.limited_expansion(root, depth)

        if verbose:
            root.print_parentetic()

        plan, sum_value = self.findPlan(root)

        if verbose:
            #root.print_parentetic()
            print("plan: {}, sum_value: {}".format(plan[1:], sum_value))

        if plot:
            plan_read = [ np.where(plan[i] == 1)[0][0] for i in range(1, len(plan)) ]
            #print("plan_read : ", plan_read)
            self.plans.append(plan_read)

        return np.where(plan[1] == 1)[0][0]

    def planner_action_old(self, depth=1):
        #if np.random.random() < 0.05:
        #    return np.random.choice(self.action_size)

        origin_code = self.encoder(torch.from_numpy(self.s_0).type(torch.FloatTensor))
        origin_value = self.network.get_enc_value(origin_code)
        origin_node = plan_node(origin_code, origin_value)
        origin_node.action_vec = [0]
        action = torch.argmax(origin_value).to('cpu').detach().numpy()

        a0 = to_categorical(0,self.action_size)
        a1 = to_categorical(1,self.action_size)
        a2 = to_categorical(2,self.action_size)
        #a3 = to_categorical(3,self.action_size)
        #a4 = to_categorical(3, self.action_size)
        #a5 = to_categorical(3, self.action_size)


        _, ns0 = self.trans_delta(origin_code, torch.from_numpy(a0).type(torch.FloatTensor).to('cuda'))
        _, ns1 = self.trans_delta(origin_code, torch.from_numpy(a1).type(torch.FloatTensor).to('cuda'))
        _, ns2 = self.trans_delta(origin_code, torch.from_numpy(a2).type(torch.FloatTensor).to('cuda'))
        #_, ns3 = self.trans_delta(origin_code, torch.from_numpy(a3).type(torch.FloatTensor).to('cuda'))
        #_, ns4 = self.trans_delta(origin_code, torch.from_numpy(a4).type(torch.FloatTensor).to('cuda'))
        #_, ns5 = self.trans_delta(origin_code, torch.from_numpy(a5).type(torch.FloatTensor).to('cuda'))

        v0 = self.network.get_enc_value(ns0)
        v1 = self.network.get_enc_value(ns1)
        v2 = self.network.get_enc_value(ns2)
        #v3 = self.network.get_enc_value(ns3)
        #v4 = self.network.get_enc_value(ns4)
        #v5 = self.network.get_enc_value(ns5)

        max0 = torch.max(v0).to('cpu').detach().numpy()
        arg_max0 = torch.argmax(v0).to('cpu').detach().numpy()

        max1 = torch.max(v1).to('cpu').detach().numpy()
        arg_max1 = torch.argmax(v1).to('cpu').detach().numpy()


        max2 = torch.max(v2).to('cpu').detach().numpy()
        arg_max2 = torch.argmax(v2).to('cpu').detach().numpy()

        '''
        max3 = torch.max(v3).to('cpu').detach().numpy()
        arg_max3 = torch.argmax(v3).to('cpu').detach().numpy()

        
        max4 = torch.max(v4).to('cpu').detach().numpy()
        arg_max4 = torch.argmax(v4).to('cpu').detach().numpy()

        max5 = torch.max(v5).to('cpu').detach().numpy()
        arg_max5 = torch.argmax(v5).to('cpu').detach().numpy()
        '''

        l_max = [max0, max1, max2]

        #smax = max(l_max)
        #indices_max = [i for i, j in enumerate(l_max) if j == smax]
        #k = random.choice(indices_max)

        #l_amax = [arg_max0, arg_max1, arg_max2]
        l_amax = [0, 1, 2]

        #if(action != l_amax[np.argmax(l_max)]):
            #print("DIVERSO!")

        #return k
        return l_amax[np.argmax(l_max)]

    def is_diff(self, s1, s0):
        for i in range(len(s0)):
            if(s0[i] != s1[i]):
                return True
        return False

    def take_step(self, mode='train', horizon=0, plot= False):

        s_1, r, done, _ = self.env.step(self.action)
        #print(self.env.action_space)
        enc_s1 = self.encoder(torch.from_numpy(np.asarray(s_1)).type(torch.FloatTensor))
        enc_s0 = self.encoder(torch.from_numpy(np.asarray(self.s_0)).type(torch.FloatTensor).to('cuda'))
        #print("Reward = ", r)
        if(self.is_diff(enc_s0,enc_s1)):
        #if(True):
            #print("step passati = ", self.step_count - self.timestamp)
            self.timestamp = self.step_count

            self.buffer.append(self.s_0, self.action, r, done, s_1)
            self.cum_rew = 0

            if mode == 'explore':
                self.action = self.env.action_space.sample()

            else:
                #self.action = self.network.get_action(self.s_0)
                #self.action = self.planner_action()
                if horizon == 0:
                    # ADAPTIVE HORIZON
                    if len(self.mean_training_rewards) == 0:
                        self.horizon = 1
                    else:
                        step = (self.reward_threshold - self.min_reward) / self.maximum_horizon
                        for i in range(self.maximum_horizon):
                            if self.mean_training_rewards[-1] < self.min_reward + (i+1)*step :
                                self.horizon = i+1
                                break
                else:
                    self.horizon = horizon
                #print(horizon)
                self.action = self.planner_action(depth=self.horizon, plot = plot)

            self.s_0 = s_1.copy()

        self.rewards += r
        self.step_count += 1
        if done:

            self.s_0 = self.env.reset()
        return done

    # Implement DQN training algorithm
    def train(self, gamma=0.99, max_episodes=1000,
              network_update_frequency=4,
              network_sync_frequency=200):
        self.gamma = gamma
        # Populate replay buffer
        while self.buffer.burn_in_capacity() < 1:
            self.take_step(mode='explore')

        ep = 0
        training = True
        while training:
            self.s_0 = self.env.reset()
            self.rewards = 0
            done = False
            while done == False:
                if((ep % 20) == 0 ):
                    self.env.render()

                p = np.random.random()
                if p < self.epsilon:
                    done = self.take_step(mode='explore')
                    # print("explore")
                else:
                    done = self.take_step(mode='train')
                    # print("train")
                #done = self.take_step(mode='train')
                # Update network
                if self.step_count % network_update_frequency == 0:
                    self.update()
                # Sync networks
                if self.step_count % network_sync_frequency == 0:
                    self.target_network.load_state_dict(
                        self.network.state_dict())
                    self.sync_eps.append(ep)

                if done:
                    ep += 1
                    self.margin_discrete = min([0.5 - pow(0.5, 0.15*ep+1), 0.499999])
                    if self.margin_discrete >= 0.499999:
                        DISCRETE_CODES = True
                    #self.margin_discrete = 0
                    if self.epsilon >= 0.05:
                        self.epsilon = self.epsilon * 0.7
                    self.episode = ep
                    self.training_rewards.append(self.rewards)
                    self.training_loss.append(np.mean(self.update_loss))
                    self.update_loss = []
                    mean_rewards = np.mean(
                        self.training_rewards[-self.window:])
                    self.mean_training_rewards.append(mean_rewards)
                    print("\rEpisode {:d} Mean Rewards {:.2f}  Episode reward = {:.2f}  lq = {:.3f}  horizon ={}  ltx ={:3f}  ld ={:3f}  l_spars={:3f}  margin={:3f}  disc_err={:3f}\t\t".format(
                        ep, mean_rewards, self.rewards, self.lq, self.horizon, self.ltx, self.ld, self.l_spars, self.margin_discrete, self.disc_error), end="")
                    #self.f.write(str(mean_rewards)+ "\n")


                    if ep >= max_episodes:
                        training = False
                        print('\nEpisode limit reached.')
                        break
                    if mean_rewards >= self.reward_threshold:
                        training = False
                        print('\nEnvironment solved in {} episodes!'.format(
                            ep))
                        break
        # save models
        self.save_models()
        # plot
        self.plot_training_rewards()
    def calculate_loss(self, batch):

        states, actions, rewards, dones, next_states = [i for i in batch]
        rewards_t = torch.FloatTensor(rewards).to(device=self.network.device).reshape(-1, 1)
        actions_t = torch.LongTensor(np.array(actions)).reshape(-1, 1).to(
            device=self.network.device)
        dones_t = torch.ByteTensor(dones).to(device=self.network.device)

        ###############
        # DDQN Update #
        ###############
        qvals = self.network.get_qvals(states)
        qvals = torch.gather(qvals.to('cpu'), 1, actions_t)

        next_vals= self.network.get_qvals(next_states)
        next_actions = torch.max(next_vals.to('cpu'), dim=-1)[1]
        next_actions_t = torch.LongTensor(next_actions).reshape(-1, 1).to(
            device=self.network.device)
        target_qvals = self.target_network.get_qvals(next_states)
        qvals_next = torch.gather(target_qvals.to('cpu'), 1, next_actions_t).detach()
        ###############
        qvals_next[dones_t] = 0  # Zero-out terminal states
        expected_qvals = self.gamma * qvals_next + rewards_t

        self.lq = (nn.MSELoss()(qvals, expected_qvals))

        #print("loss = ", loss)
        #loss.backward()
        #self.network.optimizer.step()

        return self.lq

    def pred_update(self, batch):
        loss_function = nn.MSELoss()
        states, actions, rewards, dones, next_states = [i for i in batch]
        cat_actions = []

        # restructure actions
        for act in actions:
            cat_actions.append(np.asarray(to_categorical(act,self.action_size)))
        cat_actions = np.asarray(cat_actions)
        a_t = torch.FloatTensor(cat_actions).to('cuda')

        # restructure states
        if type(states) is tuple:
            states = np.array([np.ravel(s) for s in states])
        states = torch.FloatTensor(states).to('cuda')

        # restructure next_states
        if type(next_states) is tuple:
            next_states = np.array([np.ravel(s) for s in next_states])
        next_states = torch.FloatTensor(next_states).to('cuda')

        self.ltx, self.lts = self.transition.one_step_loss(states, a_t, next_states)
        # to make it comparable to lq
        #self.ltx *= 50
        self.ld = self.transition.distant_codes_loss(states, next_states)
        self.l_spars = self.transition.distant_from_relu_loss(self.encoder(states), 0.5, self.margin_discrete)
        self.l_spars += self.transition.distant_from_relu_loss(self.encoder(next_states), 0.5, self.margin_discrete)
        deltas, _ = self.transition.forward_one_step(states, a_t)
        self.l_spars += self.transition.distant_from_relu_loss(deltas, 0.5, self.margin_discrete)
        self.l_spars += self.transition.distant_from_relu_loss(deltas,-0.5, self.margin_discrete)
        #self.l_kl = self.transition.experiment_loss((states))
        L = self.lts + self.ltx + self.ld + self.l_spars
        #L.backward()
        #print("pred_loss = ", L)
        #self.transition.optimizer.step()

        return L

    def pred_update_two_steps(self, batch):
        loss_function = nn.MSELoss()
        states, actions, rewards, dones, next_states, actions_2, rewards_2, dones_2, next_states_2 = [i for i in batch]
        cat_actions = []
        cat_actions_2 = []

        # restructure actions
        for act in actions:
            cat_actions.append(np.asarray(to_categorical(act, self.action_size)))
        cat_actions = np.asarray(cat_actions)
        a_t = torch.FloatTensor(cat_actions).to(device)

        # restructure actions_2
        for act in actions_2:
            cat_actions_2.append(np.asarray(to_categorical(act, self.action_size)))
        cat_actions_2 = np.asarray(cat_actions_2)
        a_t_2 = torch.FloatTensor(cat_actions_2).to(device)

        # restructure states
        if type(states) is tuple:
            states = np.array([np.ravel(s) for s in states])
        states = torch.FloatTensor(states).to(device)

        # restructure next_states
        if type(next_states) is tuple:
            next_states = np.array([np.ravel(s) for s in next_states])
        next_states = torch.FloatTensor(next_states).to(device)

        # restructure next_states_2
        if type(next_states_2) is tuple:
            next_states_2 = np.array([np.ravel(s) for s in next_states_2])
        next_states_2 = torch.FloatTensor(next_states_2).to(device)

        ####### NEW
        L = self.transition.two_step_loss(states, a_t, next_states, a_t_2, next_states_2)
        # if we also want to include the triplet loss
        #L + self.transition.triplet_loss_encoder(states, next_states, next_states_2, MARGIN)



        L.backward()
        #self.transition_losses.append(L)

        self.transition.optimizer.step()
        return

    def update(self):
        #self.network.optimizer.zero_grad()
        self.optimizer.zero_grad()
        batch = self.buffer.sample_batch(batch_size=self.batch_size)

        loss_q = self.calculate_loss(batch)
        #print("q loss = ", loss)

        #self.transition.optimizer.zero_grad()
        batch2 = self.buffer.sample_batch(batch_size=self.batch_size)
        loss_t = self.pred_update(batch2)
        # TODO: compute the loss on a single batch only
        #batch_cons = self.buffer.consecutive_sample(batch_size=64)
        #print(batch_cons)

        loss = loss_t + loss_q
        loss.backward()
        self.optimizer.step()
        '''
        if self.network.device == 'cuda':
            self.update_loss.append(loss.detach().cpu().numpy())
        else:
            self.update_loss.append(loss.detach().numpy())
        '''
    def initialize(self):
        self.training_rewards = []
        self.training_loss = []
        self.update_loss = []
        self.mean_training_rewards = []
        self.sync_eps = []
        self.rewards = 0
        self.step_count = 0
        self.s_0 = self.env.reset()
Example #26
def test(agent: QNetwork,
         movements=100,
         nb_episodes=1000,
         step=0.01,
         show_plots=True):
    """
    Tests the ability of the QNetwork to learn to reach the position (0.5, 0.5)
    while spawning at random coordinates in [0, 1]^2.
    :param agent: QNetwork to be tested. Needs to have state_dim == 2 and 5 possible actions.
    :param movements: Number of moves the agent is allowed to have
    :param step: Distance travelled at each move
    :param nb_episodes: Number of episodes on which the agent trains
    :param show_plots: if True, the agent will plot the results of the training
    :return: The agent's loss memory
    """
    # A state is defined as its x and y coordinates
    state_dim = 2

    # Calculation device
    device = torch.device("cpu")

    # net = Net1(state_dim, nb_actions)
    # QNetwork(net, state_dim, movements, lr=0.1, device=torch.device("cpu"))

    for ep in range(nb_episodes):
        # Play a single episode

        # Create arrays to store the successive states and taken actions
        states = torch.empty(
            (movements + 1, state_dim),
            device=device)  # + 1 to make space for the last state
        actions = torch.empty(movements, dtype=torch.int32, device=device)

        # Start with a random position
        states[0] = torch.rand(2)

        for move in range(movements):
            # Take action
            actions[move] = agent.decide(states[move].view(1, -1)).item()

            # Get next state
            states[move + 1] = next_state(states[move], actions[move], step,
                                          device)

        # Get rewards
        rewards = get_rewards(states[:-1], actions, step, device)

        # Memorize the episode
        agent.memorize_exploration(states,
                                   actions,
                                   rewards,
                                   last_state_is_final=False)

        # Train after the episode
        agent.update()

        printProgressBar(ep + 1,
                         nb_episodes,
                         "Episodes completed: ",
                         length=90)
        # print("Final position: ", states[-1], " | Initial: ", states[0])

    if show_plots:
        plt.figure("Training summary")
        plt.subplot(111)
        plt.title("Agent Trajectories")
        agent.plot_trajectory(torch.rand((50, 2)),
                              lambda s, a: next_state(s, a, step, device))
        # plt.subplot(212)
        # plt.title("MSE Loss")
        # agent.show_training()
        plt.show()
    return agent.loss_mem
Example #27
def her_experiment():
    batch_size = 256
    discount_factor = 0.8
    learn_rate = 1e-3
    num_hidden = 128
    num_episodes = 2
    epochs = 200
    training_steps = 10
    memory_size = 100000
    # her = False
    # seeds = [42, 30, 2,19,99]  # This is not randomly chosen
    seeds = [42, 30, 2, 19, 99]
    shape = [30, 30]
    targets = lambda x, y: [0, x * y - 1, x - 1, (y - 1) * x]
    env = GridworldEnv(shape=shape, targets=targets(*shape))

    # functions for grid world
    def sample_goal():
        return np.random.choice(env.targets, 1)

    extract_goal = lambda state: np.reshape(np.array(np.argmax(state)), -1)

    def calc_reward(state, action, goal):
        if state == goal:
            return 0.0
        else:
            return -1.0
        # # maze
        #     def sample_goal():
        #         return env.maze.end_pos
        #     extract_goal = lambda state: np.reshape(np.array(np.argmax(state)),-1)
        #     def calc_reward(state, action, goal):
        #         if state == goal:
        #             return 0.0
        #         else:
        #             return -1.0

    means = []
    x_epochs = []
    l_stds = []
    h_stds = []
    for her in [True, False]:
        episode_durations_all = []
        for seed in seeds:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            env.seed(seed)
            print(env.reset())
            memory = ReplayMemory(memory_size)
            if her:
                # model = QNetwork(env.observation_space.shape[0]+2, num_hidden, env.action_space.n)
                model = QNetwork(2 * env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train,
                    model,
                    memory,
                    env,
                    num_episodes,
                    training_steps,
                    epochs,
                    batch_size,
                    discount_factor,
                    learn_rate,
                    sample_goal,
                    extract_goal,
                    calc_reward,
                    use_her=True)
            else:
                model = QNetwork(env.observation_space.n, num_hidden,
                                 env.action_space.n)
                episode_durations, episode_rewards = run_her_episodes(
                    train,
                    model,
                    memory,
                    env,
                    num_episodes,
                    training_steps,
                    epochs,
                    batch_size,
                    discount_factor,
                    learn_rate,
                    sample_goal,
                    extract_goal,
                    calc_reward,
                    use_her=False)

            episode_durations_all.append(
                loop_environments.smooth(episode_durations, 10))
        mean = np.mean(episode_durations_all, axis=0)
        means.append(mean)
        std = np.std(episode_durations_all, ddof=1, axis=0)
        l_stds.append(mean - std)
        h_stds.append(mean + std)
        x_epochs.append(list(range(len(mean))))
        # print(len(mean),mean,std)
    line_plot_var(x_epochs, means, l_stds, h_stds, "Epoch", "Duration",
                  ["HindsightReplay", "RandomReplay"],
                  "Episode duration per epoch", ["orange", "blue"])
    name = "her_" + str(shape)
    file_name = os.path.join("./results", name)

    with open(file_name + ".pkl", "wb") as f:
        pickle.dump((x_epochs, means, l_stds, h_stds), f)
Example #28
class ReinforcementLearningRunner():

  def __init__(self, mode):
    self.mode = mode
    cu.mem('Reinforcement Learning Started')
    self.environment = RegionFilteringEnvironment(config.get(mode+'Database'), mode)
    self.controller = QNetwork()
    cu.mem('QNetwork controller created')
    self.learner = None
    self.agent = RegionFilteringAgent(self.controller, self.learner)
    self.task = RegionFilteringTask(self.environment, config.get(mode+'GroundTruth'))
    self.experiment = Experiment(self.task, self.agent)

  def runEpoch(self, interactions, maxImgs):
    img = 0
    s = cu.tic()
    while img < maxImgs:
      self.experiment.doInteractions(interactions)
      self.agent.learn()
      self.agent.reset()
      self.environment.loadNextEpisode()
      img += 1
    s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

  def run(self):
    if self.mode == 'train':
      self.agent.persistMemory = True
      self.agent.startReplayMemory(len(self.environment.db.images), config.geti('trainInteractions'), config.geti('stateFeatures'))
      self.train()
    elif self.mode == 'test':
      self.agent.persistMemory = False
      self.test()

  def train(self):
    interactions = config.geti('trainInteractions')
    minEpsilon = config.getf('minTrainingEpsilon')
    epochSize = len(self.environment.db.images)/2
    epsilon = 1.0
    self.controller.setEpsilonGreedy(epsilon)
    print 'Epoch 0: Exploration'
    self.runEpoch(interactions, len(self.environment.db.images))
    self.learner = QLearning()
    self.agent.learner = self.learner
    epoch = 1
    egEpochs = config.geti('epsilonGreedyEpochs')
    while epoch <= egEpochs:
      epsilon = epsilon - (1.0-minEpsilon)/float(egEpochs) 
      if epsilon < minEpsilon: epsilon = minEpsilon
      self.controller.setEpsilonGreedy(epsilon)
      print 'Epoch',epoch ,'(epsilon-greedy:{:5.3f})'.format(epsilon)
      self.runEpoch(interactions, epochSize)
      epoch += 1
    epoch = 1
    maxEpochs = config.geti('exploitLearningEpochs')
    while epoch <= maxEpochs:
      print 'Epoch',epoch+egEpochs,'(exploitation mode: epsilon={:5.3f})'.format(epsilon)
      self.runEpoch(interactions, epochSize)
      epoch += 1

  def test(self):
    interactions = config.geti('testInteractions')
    self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
    self.runEpoch(interactions, len(self.environment.db.images))
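Before the next example, a minimal sketch of how this runner might be launched, assuming the project's config module has already been loaded with the experiment settings (the entry point below is illustrative, not part of the original code):

if __name__ == '__main__':
    # 'train' runs the exploration + epsilon-greedy schedule defined above;
    # 'test' runs a single evaluation epoch with the configured test epsilon.
    runner = ReinforcementLearningRunner('train')
    runner.run()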
Example #29
0
class Agent:
    """
    This class builds an agent with its own QNetwork, memory buffer and
    environment to learn a policy.
    """
    def __init__(self, sess, gui, displayer, saver):
        """
        Build a new instance of Environment and QNetwork.

        Args:
            sess     : the tensorflow session in which to build the network
            gui      : a GUI instance to manage the control of the agent
            displayer: a Displayer instance to keep track of the episode rewards
            saver    : a Saver instance to save periodically the network
        """
        print("Initializing the agent...")

        self.sess = sess
        self.gui = gui
        self.gui_thread = threading.Thread(target=lambda: self.gui.run(self))
        self.displayer = displayer
        self.saver = saver
        signal.signal(signal.SIGINT, self.interrupt)

        self.env = Environment()
        self.QNetwork = QNetwork(sess)
        self.buffer = ExperienceBuffer(prioritized=Settings.PRIORITIZED_ER)
        self.epsilon = Settings.EPSILON_START
        self.beta = Settings.BETA_START

        self.delta_z = (Settings.MAX_Q - Settings.MIN_Q) / (Settings.NB_ATOMS -
                                                            1)
        self.z = np.linspace(Settings.MIN_Q, Settings.MAX_Q, Settings.NB_ATOMS)

        self.create_summaries()

        self.best_run = -1e10
        self.n_gif = 0

        print("Agent initialized !\n")

    def create_summaries(self):

        self.ep_reward_ph = tf.placeholder(tf.float32)
        ep_reward_summary = tf.summary.scalar("Episode/Episode reward",
                                              self.ep_reward_ph)

        self.steps_ph = tf.placeholder(tf.float32)
        steps_summary = tf.summary.scalar("Episode/Nb steps", self.steps_ph)

        self.epsilon_ph = tf.placeholder(tf.float32)
        epsilon_summary = tf.summary.scalar("Settings/Epsilon",
                                            self.epsilon_ph)

        self.ep_summary = tf.summary.merge(
            [ep_reward_summary, epsilon_summary, steps_summary])

        self.lr_ph = tf.placeholder(tf.float32)
        self.lr_summary = tf.summary.scalar("Settings/Learning rate",
                                            self.lr_ph)

        self.writer = tf.summary.FileWriter("./logs", self.sess.graph)

    def pre_train(self):
        """
        Method to run a random agent in the environment to fill the memory
        buffer.
        """
        print("Beginning of the pre-training...")

        for i in range(Settings.PRE_TRAIN_EPS):

            s = self.env.reset()
            done = False
            episode_reward = 0
            episode_step = 0

            while episode_step < Settings.MAX_EPISODE_STEPS and not done:

                a = self.env.act_random()
                s_, r, done, info = self.env.act(a)
                self.buffer.add((s, a, r, s_, 1 if not done else 0))

                s = s_
                episode_reward += r
                episode_step += 1

            if Settings.PRE_TRAIN_EPS > 5 and i % (Settings.PRE_TRAIN_EPS //
                                                   5) == 0:
                print("Pre-train step n", i)

            # Set the best score to at least the max score the random agent got
            self.best_run = max(self.best_run, episode_reward)

        print("End of the pre training !")

    def save_best(self, episode_reward):
        self.best_run = episode_reward
        print("Save best", episode_reward)
        self.saver.save('best')
        # self.play(1, 'best')

    def run(self):
        """
        Method to run the agent in the environment to collect experiences and
        learn on these experiences by gradient descent.
        """
        print("Beginning of the run...")

        self.pre_train()
        self.QNetwork.init_target()
        self.gui_thread.start()

        self.nb_ep = 1
        learning_steps = 0

        while self.nb_ep < Settings.TRAINING_EPS and not self.gui.STOP:

            s = self.env.reset()
            episode_reward = 0
            done = False
            memory = deque()

            episode_step = 1
            # The more episodes the agent performs, the longer they are
            max_step = Settings.MAX_EPISODE_STEPS
            if Settings.EP_ELONGATION > 0:
                max_step += self.nb_ep // Settings.EP_ELONGATION

            # Render settings
            self.env.set_render(self.gui.render.get(self.nb_ep))
            self.env.set_gif(self.gui.gif.get(self.nb_ep))
            plot_distrib = self.gui.plot_distrib.get(self.nb_ep)

            while episode_step <= max_step and not done:

                # Exploration by NoisyNets or epsilon-greedy policy
                if not Settings.NOISY and random.random() < self.epsilon:
                    a = self.env.act_random()
                else:
                    if Settings.DISTRIBUTIONAL:
                        Qdistrib = self.QNetwork.act(s)
                        Qvalue = np.sum(self.z * Qdistrib, axis=1)
                    else:
                        Qvalue = self.QNetwork.act(s)

                    a = np.argmax(Qvalue, axis=0)

                    if plot_distrib:
                        self.displayer.disp_distrib(self.z, self.delta_z,
                                                    Qdistrib, Qvalue)

                s_, r, done, info = self.env.act(a)
                episode_reward += r

                memory.append((s, a, r, s_, done))

                # Keep the experience in memory until 'N_STEP_RETURN' steps have
                # passed to get the delayed return r_1 + ... + gamma^n r_n
                while len(memory) >= Settings.N_STEP_RETURN or (memory and
                                                                memory[-1][4]):
                    s_mem, a_mem, discount_R, si_, done_ = memory.popleft()
                    if not done_ and memory:
                        for i in range(Settings.N_STEP_RETURN - 1):
                            si, ai, ri, si_, done_ = memory[i]
                            discount_R += ri * Settings.DISCOUNT**(i + 1)
                            if done_:
                                break
                    self.buffer.add(
                        (s_mem, a_mem, discount_R, si_, 1 if not done_ else 0))

                if episode_step % Settings.TRAINING_FREQ == 0:
                    if Settings.PRIORITIZED_ER:
                        batch, idx, weights = self.buffer.sample(self.beta)
                    else:
                        batch = self.buffer.sample(self.beta)
                        idx = weights = None
                    loss = self.QNetwork.train(np.asarray(batch), weights)
                    self.buffer.update(idx, loss)
                    self.QNetwork.update_target()

                    feed_dict = {self.lr_ph: self.QNetwork.learning_rate}
                    summary = self.sess.run(self.lr_summary,
                                            feed_dict=feed_dict)
                    self.writer.add_summary(summary, learning_steps)
                    learning_steps += 1

                s = s_
                episode_step += 1

            # Decay epsilon
            if self.epsilon > Settings.EPSILON_STOP:
                self.epsilon -= Settings.EPSILON_DECAY

            self.displayer.add_reward(episode_reward,
                                      plot=self.gui.plot.get(self.nb_ep))
            # if episode_reward > self.best_run:
            #     self.save_best(episode_reward)

            # Episode display
            if self.gui.ep_reward.get(self.nb_ep):
                print('Episode %2i, Reward: %7.3f, Steps: %i, Epsilon: %f'
                      ', Max steps: %i, Learning rate: %fe-4' %
                      (self.nb_ep, episode_reward, episode_step, self.epsilon,
                       max_step, self.QNetwork.learning_rate * 1e4))

            # Write the summary
            feed_dict = {
                self.ep_reward_ph: episode_reward,
                self.epsilon_ph: self.epsilon,
                self.steps_ph: episode_step
            }
            summary = self.sess.run(self.ep_summary, feed_dict=feed_dict)
            self.writer.add_summary(summary, self.nb_ep)

            # Save the model
            if self.gui.save.get(self.nb_ep):
                self.saver.save(self.nb_ep)

            self.nb_ep += 1

        print("Training completed !")
        self.env.close()
        self.display()
        self.gui.end_training()
        self.gui_thread.join()

    def play(self, number_run=1, gif=False, name=None):
        """
        Method to evaluate the policy without exploration.

        Args:
            number_run: the number of episodes to perform
            gif       : whether to save a gif or not
            name      : the name of the gif that will be saved
        """
        self.env.set_render(Settings.DISPLAY)

        for i in range(number_run):

            s = self.env.reset()
            episode_reward = 0
            done = False
            self.env.set_gif(gif, name)

            while not done:
                if Settings.DISTRIBUTIONAL:
                    Qdistrib = self.QNetwork.act(s)
                    Qvalue = np.sum(self.z * Qdistrib, axis=1)
                else:
                    Qvalue = self.QNetwork.act(s)
                a = np.argmax(Qvalue, axis=0)
                s, r, done, info = self.env.act(a)
                episode_reward += r

            if gif: self.env.save_gif()
            print("Episode reward :", episode_reward)

    def display(self):
        self.displayer.disp()

    def stop(self):
        self.env.close()

    def interrupt(self, sig, frame):
        self.gui.stop_run()
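The action selection in run() and play() converts the categorical return distribution into an expected value before taking the argmax. A self-contained sketch of that step, with assumed values for the Settings constants and a fake network output:

import numpy as np

MIN_Q, MAX_Q, NB_ATOMS = -10.0, 10.0, 51      # assumed Settings values
z = np.linspace(MIN_Q, MAX_Q, NB_ATOMS)       # fixed support of the distribution

# Fake distributional output: one probability mass over the atoms per action.
Qdistrib = np.random.dirichlet(np.ones(NB_ATOMS), size=4)
Qvalue = np.sum(z * Qdistrib, axis=1)         # E[Z] = sum_i z_i * p_i, per action
a = np.argmax(Qvalue)                         # greedy action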
Example #30
0
class Agent():
    def __init__(self, params):
        self.params = params
        self.__state_dim = params['state_dim']
        self.__action_dim = params['action_dim']
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__lr = params['lr']
        self.__update_every = params['update_every']
        eps = params['eps']
        eps_decay = params['eps_decay']
        min_eps = params['min_eps']
        seed = params['seed']
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        critic_params = dict()
        critic_params['seed'] = seed
        critic_params['arch_params'] = params['arch_params_critic']
        self.critic_local = QNetwork(critic_params).to(device)
        self.critic_target = QNetwork(critic_params).to(device)
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=self.__lr)

        #Policy
        actor_params = dict()
        actor_params['seed'] = seed
        actor_params['arch_params'] = params['arch_params_actor']
        actor_params['noise_type'] = params['noise_type']
        actor_params['eps'] = eps
        actor_params['eps_decay'] = eps_decay
        actor_params['min_eps'] = min_eps
        actor_params['arch_params'] = params['arch_params_actor']
        self.actor_local = Policy(actor_params).to(device)
        self.actor_target = Policy(actor_params).to(device)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=self.__lr)

        self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
        self.__t_step = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action, reward, next_state, done)
        self.__t_step = (self.__t_step + 1)

    def choose_action(self, state):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        state = torch.from_numpy(state.astype(np.float32)).to(device)
        action, action_perturbed = self.actor_local(state)
        return action, action_perturbed

    def learn_from_past_experiences(self):
        if self.__t_step % self.__update_every == 0:
            if len(self.__memory) > self.__batch_size:
                experiences = self.__memory.sample()
                self.update_Qnet_and_policy(experiences)

    def update_Qnet_and_policy(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards + (self.__gamma * Q_targets_next * (1 - dones)
                               )  # if done == True: second term is equal to 0
        Q_expected = self.critic_local(states, actions)
        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())

        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        predicted_actions, predicted_actions_perturbed = self.actor_local(
            states)  # new predicted actions, not the ones stored in buffer

        if self.params['noise_type'] == 'parameter':
            # if the mean squared distance between predicted_actions and
            # predicted_actions_perturbed is too large (>= 0.3), shrink the
            # noise scale, otherwise grow it
            if (predicted_actions -
                    predicted_actions_perturbed).pow(2).mean() >= 0.3:
                self.actor_local.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor_local.eps *= 1.01
                self.actor_target.eps *= 1.01

        loss_actor = -self.critic_local(states, predicted_actions).mean()

        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()

        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def update_eps(self):
        self.actor_local.eps = max(
            self.actor_local.eps * self.actor_local.eps_decay,
            self.actor_local.min_eps)
        self.actor_target.eps = max(
            self.actor_target.eps * self.actor_target.eps_decay,
            self.actor_target.min_eps)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data +
                                    (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params = {
            'actor_params': self.actor_local.policy_params,
            'state_dict': self.actor_local.state_dict()
        }
        critic_params = {
            'critic_params': self.critic_local.qnet_params,
            'state_dict': self.critic_local.state_dict()
        }

        file = dict()
        file['critic_params'] = critic_params
        file['actor_params'] = actor_params
        torch.save(file, open(save_to, 'wb'))

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        qnet_params = checkpoint['critic_params']
        policy_params = checkpoint['actor_params']

        self.actor_local = Policy(policy_params['actor_params'])
        self.actor_local.load_state_dict(
            checkpoint['actor_params']['state_dict'])

        self.critic_local = QNetwork(qnet_params['critic_params'])
        self.critic_local.load_state_dict(
            checkpoint['critic_params']['state_dict'])
        return self
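The soft_update() method above applies Polyak averaging rather than a hard copy of the weights. A tiny stand-alone illustration of the update rule, with arbitrary values:

import torch

tau = 1e-3
local_param = torch.tensor([1.0, 2.0, 3.0])
target_param = torch.tensor([0.0, 0.0, 0.0])

# theta_target <- tau * theta_local + (1 - tau) * theta_target
target_param = tau * local_param + (1.0 - tau) * target_param
# target_param is now [0.001, 0.002, 0.003]: the target network drifts slowly
# toward the local network instead of jumping to it.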
Example #31
0
class BoxSearchRunner():

  def __init__(self, mode):
    self.mode = mode
    cu.mem('Reinforcement Learning Started')
    self.environment = BoxSearchEnvironment(config.get(mode+'Database'), mode, config.get(mode+'GroundTruth'))
    self.controller = QNetwork()
    cu.mem('QNetwork controller created')
    self.learner = None
    self.agent = BoxSearchAgent(self.controller, self.learner)
    self.task = BoxSearchTask(self.environment, config.get(mode+'GroundTruth'))
    self.experiment = Experiment(self.task, self.agent)

  def runEpoch(self, interactions, maxImgs):
    img = 0
    s = cu.tic()
    while img < maxImgs:
      k = 0
      while not self.environment.episodeDone and k < interactions:
        self.experiment._oneInteraction()
        k += 1
      self.agent.learn()
      self.agent.reset()
      self.environment.loadNextEpisode()
      img += 1
    s = cu.toc('Run epoch with ' + str(maxImgs) + ' episodes', s)

  def run(self):
    if self.mode == 'train':
      self.agent.persistMemory = True
      self.agent.startReplayMemory(len(self.environment.imageList), config.geti('trainInteractions'))
      self.train()
    elif self.mode == 'test':
      self.agent.persistMemory = False
      self.test()

  def train(self):
    networkFile = config.get('networkDir') + config.get('snapshotPrefix') + '_iter_' + config.get('trainingIterationsPerBatch') + '.caffemodel'
    interactions = config.geti('trainInteractions')
    minEpsilon = config.getf('minTrainingEpsilon')
    epochSize = len(self.environment.imageList)
    epsilon = 1.0
    self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
    epoch = 1
    exEpochs = config.geti('explorationEpochs')
    while epoch <= exEpochs:
      s = cu.tic()
      print('Epoch {}: Exploration (epsilon=1.0)'.format(epoch))
      self.runEpoch(interactions, len(self.environment.imageList))
      self.task.flushStats()
      self.doValidation(epoch)
      s = cu.toc('Epoch done in ',s)
      epoch += 1
    self.learner = QLearning()
    self.agent.learner = self.learner
    egEpochs = config.geti('epsilonGreedyEpochs')
    while epoch <= egEpochs + exEpochs:
      s = cu.tic()
      epsilon = epsilon - (1.0-minEpsilon)/float(egEpochs)
      if epsilon < minEpsilon: epsilon = minEpsilon
      self.controller.setEpsilonGreedy(epsilon, self.environment.sampleAction)
      print('Epoch {} (epsilon-greedy: {:5.3f})'.format(epoch, epsilon))
      self.runEpoch(interactions, epochSize)
      self.task.flushStats()
      self.doValidation(epoch)
      s = cu.toc('Epoch done in ',s)
      epoch += 1
    maxEpochs = config.geti('exploitLearningEpochs') + exEpochs + egEpochs
    while epoch <= maxEpochs:
      s = cu.tic()
      print('Epoch {} (exploitation mode: epsilon={:5.3f})'.format(epoch, epsilon))
      self.runEpoch(interactions, epochSize)
      self.task.flushStats()
      self.doValidation(epoch)
      s = cu.toc('Epoch done in ',s)
      shutil.copy(networkFile, networkFile + '.' + str(epoch))
      epoch += 1

  def test(self):
    interactions = config.geti('testInteractions')
    self.controller.setEpsilonGreedy(config.getf('testEpsilon'))
    self.runEpoch(interactions, len(self.environment.imageList))

  def doValidation(self, epoch):
    if epoch % config.geti('validationEpochs') != 0:
      return
    auxRL = BoxSearchRunner('test')
    auxRL.run()
    indexType = config.get('evaluationIndexType')
    category = config.get('category')
    if indexType == 'pascal':
      categories, catIndex = bse.get20Categories()
    elif indexType == 'relations':
      categories, catIndex = bse.getCategories()
    elif indexType == 'finetunedRelations':
      categories, catIndex = bse.getRelationCategories()
    if category in categories:
        catI = categories.index(category)
    else:
        catI = -1
    scoredDetections = bse.loadScores(config.get('testMemory'), catI)
    groundTruthFile = config.get('testGroundTruth')
    #ps,rs = bse.evaluateCategory(scoredDetections, 'scores', groundTruthFile)
    pl,rl = bse.evaluateCategory(scoredDetections, 'landmarks', groundTruthFile)
    line = lambda x,y,z: x + '\t{:5.3f}\t{:5.3f}\n'.format(y,z)
    # print(line('Validation Scores:', ps, rs))
    print(line('Validation Landmarks:', pl, rl))
Example #32
0
class Agent():
    def __init__(self,
                 capacity,
                 state_size,
                 action_size,
                 pretrained_model_path=None,
                 tau=1e-3,
                 gamma=0.99,
                 batch_size=32,
                 lr=1e-4,
                 learn_every_n_steps=4):
        # Environment variables
        self.state_size = state_size
        self.action_size = action_size

        # Create Qnetworks
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(),
                                          lr=lr)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        if pretrained_model_path is not None:
            self.qnetwork_local.load_state_dict(
                torch.load(pretrained_model_path))

        # Initialize memory buffer
        self.memory = ReplayBuffer(capacity, batch_size)

        # Initialize time step for updating target network every q steps
        self.learn_every_n_steps = learn_every_n_steps
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Learn from the action and environments reponse."""
        self.memory.add(state, action, reward, next_state, done)

        # Maybe learn if learn_every_n_steps has passed
        self.t_step = (self.t_step + 1) % self.learn_every_n_steps
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=1.):
        """
      Returns actions for given state as per current policy.
        
      Params
      ======
         state (array_like): current state
         eps (float): epsilon, for epsilon-greedy action selection
      """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        else:
            return random.choice(np.arange(self.action_size)).astype(int)

    def learn(self, experiences):
        """Update network parameters"""
        states, actions, rewards, next_states, dones = experiences

        # Get best score according to the target network and evaluate it against the local network
        next_action_values = self.qnetwork_target(next_states).detach().max(
            dim=1)[0].unsqueeze(1)
        y = rewards + (self.gamma * next_action_values * (1 - dones))
        yhat = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(yhat, y)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update()

    def soft_update(self):
        """Performs soft update of frozen target network as per double-DQN"""
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
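A minimal training-loop sketch for this agent, assuming a Gym-style environment with the classic 4-tuple step return; the environment name and hyperparameters below are illustrative only:

import gym

env = gym.make('CartPole-v1')
agent = Agent(capacity=100000,
              state_size=env.observation_space.shape[0],
              action_size=env.action_space.n)

eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, 0.995 * eps)    # simple epsilon decay between episodes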
Example #33
0
def create_q(path):
    print(path)
    net = torch.load(path)
    net = net.train()
    return QNetwork(net, path, lr=1e-4)
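A hypothetical call site, assuming the file at path holds a complete torch module saved with torch.save(net, path):

q_network = create_q('pretrained_net.pt')   # reload the module and wrap it in QNetwork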