Example #1
    def __init__(self, env, hidden_units=None, network_LR=0.001, batch_size=64,
                    update_every=4, gamma=1.0, summary=True):
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        memory_capacity = int(1e5)   # replay buffer capacity
        
        self.nA = env.ACTION_SPACE              # number of actions the agent can perform
        self.UPDATE_EVERY = update_every

        #let's give it some brains
        self.qnetwork_local = QNetwork(self.env.STATE_SPACE, hidden_units, self.nA).to(device)
        
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=network_LR)
        if summary:
            print(self.qnetwork_local)
        
        # I think of the target network as the PC,
        # where our agent stores all the concrete and important stuff
        self.qnetwork_target = QNetwork(self.env.STATE_SPACE, hidden_units, self.nA).to(device)

        #and the memory of course
        self.memory = ReplayMemory(memory_capacity, self.BATCH_SIZE) 

        # step counter used to schedule target-network updates
        self.t = 0
Example #2
	def __init__(self, state_size, config, is_eval=False):
		"""
		Constructs new Agent object
		:param state_size: size of the state being used
		:param config: config file
		:param is_eval: flag for whether we are training (i.e. updating parameters) or evaluating
		"""
		# Represents the size of each state
		self.state_size = state_size
		# Represents size of the action space
		self.action_size = 3 # 3 options of actions: hold, buy, sell
		self.memory = ReplayMemory(10000) # Constructs a new memory object
		# self.inventory = []
		self.is_eval = is_eval

		# RL parameters: kept the same as in the original project
		self.gamma = config['gamma'] # discount factor from the Bellman equation
		# Epsilon will keep being changed at each iteration
		self.epsilon = config['epsilon']
		self.epsilon_min = config['epsilon_min']  # minimum value epsilon can take
		self.epsilon_decay = config['epsilon_decay']  # amount epsilon decays each iteration
		self.batch_size = config['batch_size']

		# Loads previous models, if they exist
		if os.path.exists(config['target_model']):
			self.policy_net = torch.load(config['policy_model'], map_location=device)
			self.target_net = torch.load(config['target_model'], map_location=device)
		else:
			self.policy_net = DQN(state_size, self.action_size).to(device)
			self.target_net = DQN(state_size, self.action_size).to(device)
		# Optimization function
		self.optimizer = optim.RMSprop(self.policy_net.parameters(),
									   lr=config['learning_rate'], momentum=config['momentum'])
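
The constructor above only reads a handful of keys from config, so a minimal sketch of how it might be instantiated (using the Agent class name from Example #6) could look like the following; the concrete values, the device setup, and the state_size are illustrative assumptions, and the DQN and ReplayMemory definitions the snippet relies on are expected to be in scope.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = {
    'gamma': 0.95,                           # discount factor in the Bellman backup
    'epsilon': 1.0,                          # initial exploration rate
    'epsilon_min': 0.01,                     # floor for epsilon
    'epsilon_decay': 0.005,                  # amount subtracted from epsilon per decay step
    'batch_size': 32,
    'learning_rate': 0.005,
    'momentum': 0.9,
    'policy_model': 'models/policy_model',   # paths checked before falling back to fresh DQNs
    'target_model': 'models/target_model',
}

agent = Agent(state_size=10, config=config, is_eval=False)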
Example #3
	def __init__(self, state_size, is_eval=False):
		self.state_size = state_size # normalized previous days
		self.action_size = 3 # sit, buy, sell
		self.memory = ReplayMemory(10000)
		self.inventory = []
		self.is_eval = is_eval

		self.gamma = 0.95
		self.epsilon = 1.0
		self.epsilon_min = 0.01
		self.epsilon_decay = 0.995
		self.batch_size = 32
		if os.path.exists('models/target_model'):
			self.policy_net = torch.load('models/policy_model', map_location=device)
			self.target_net = torch.load('models/target_model', map_location=device)
		else:
			self.policy_net = DQN(state_size, self.action_size).to(device)
			self.target_net = DQN(state_size, self.action_size).to(device)
		self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.005, momentum=0.9)
Example #4
class DeepQ_agent:

    def __init__(self, env, hidden_units=None, network_LR=0.001, batch_size=64,
                    update_every=4, gamma=1.0, summary=True):
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        memory_capacity = int(1e5)   # replay buffer capacity
        
        self.nA = env.ACTION_SPACE              # number of actions the agent can perform
        self.UPDATE_EVERY = update_every

        #let's give it some brains
        self.qnetwork_local = QNetwork(self.env.STATE_SPACE, hidden_units, self.nA).to(device)
        
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=network_LR)
        if summary:
            print(self.qnetwork_local)
        
        # I think of the target network as the PC,
        # where our agent stores all the concrete and important stuff
        self.qnetwork_target = QNetwork(self.env.STATE_SPACE, hidden_units, self.nA).to(device)

        #and the memory of course
        self.memory = ReplayMemory(memory_capacity, self.BATCH_SIZE) 

        # step counter used to schedule target-network updates
        self.t = 0

#----------------------Learn from experience-----------------------------------#

    def learn(self, writer=None, episode=-1):
        '''
            Sample a batch of experiences from replay memory, update the local
            Q-network, and periodically sync the target network with it.
        '''

        if len(self.memory) > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE, device)

            # gather, refer here https://stackoverflow.com/a/54706716/10666315
            Q_expected = self.qnetwork_local(states).gather(1, actions)
            
            # Get max predicted action-values for next states, using target model
            # detach, detaches the tensor from graph
            # max(1) return two tensors, one containing max values along dim=1, 
            # other containing indices of max values along dim=1
            # unsqueeze(1) inserts a dimension of size one at specified position
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
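            # Illustrative sketch with made-up numbers: if BATCH_SIZE were 2 and nA were 3,
            # a local-network output of [[0.1, 0.5, 0.2], [0.3, 0.0, 0.9]] gathered with
            # actions [[1], [2]] yields Q_expected = [[0.5], [0.9]]; applied to that same
            # matrix, .max(1)[0] gives the per-row maxima [0.5, 0.9], and unsqueeze(1)
            # makes it [[0.5], [0.9]] so it lines up element-wise with rewards and dones below.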

            Q_targets =  rewards + (self.GAMMA * Q_targets_next * (1 - dones))
            
            loss = F.mse_loss(Q_expected, Q_targets)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if writer is not None:
                writer.add_scalar('Training Loss', loss, episode)  # write to tensorboard summary
                

            if self.t == self.UPDATE_EVERY:
                self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())  # update target network
                self.t = 0
            else:
                self.t += 1


#-----------------------Time to act-----------------------------------------------#

    def act(self, state, epsilon=0):                 # set to NO exploration by default
        state = torch.from_numpy(state).to(device, dtype=torch.float32)
        action_values = self.qnetwork_local(state)    #returns a vector of size = self.nA
        if random.random() > epsilon:
            action = torch.argmax(action_values).item()      #choose best action - Exploitation
        else:
            action = random.randint(0, self.nA-1)  #choose random action - Exploration
        
        return action

#-----------------------------Add experience to agent's memory------------------------#

    def add_experience(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

#---------------------helpful save function-------------------------------------#
    
    def save(self, dir, episode, info):
        torch.save(self.qnetwork_local.state_dict(), f'{dir}/model_{episode}_{info}.pth.tar')

#----------------------Load a saved model----------------------------------------#

    def load_model(self, model_path):
        self.qnetwork_local.load_state_dict(torch.load(model_path))
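
Putting the pieces of DeepQ_agent together, a rough training loop might look like the sketch below; the env interface (reset() returning a NumPy state and step(action) returning next_state, reward, done), the hidden_units value, and the epsilon schedule are assumptions for illustration, not part of the source.

# Hypothetical driver loop for DeepQ_agent (env API and epsilon schedule are assumed).
agent = DeepQ_agent(env, hidden_units=[64, 64], network_LR=0.001, batch_size=64)
epsilon = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, epsilon)                      # epsilon-greedy action
        next_state, reward, done = env.step(action)
        agent.add_experience(state, action, reward, next_state, done)
        agent.learn()                                           # trains once memory exceeds BATCH_SIZE
        state = next_state
    epsilon = max(0.01, epsilon * 0.995)                        # simple decay schedule (assumed)
agent.save('checkpoints', episode, 'final')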
Example #5
class Agent:
    def __init__(self, state_size, is_eval=False):
        self.state_size = state_size  # normalized previous days
        self.action_size = 3  # sit, buy, sell
        self.memory = ReplayMemory(10000)
        self.inventory = []
        self.is_eval = is_eval

        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.batch_size = 32
        if os.path.exists('models/target_model'):
            self.policy_net = torch.load('models/policy_model',
                                         map_location=device)
            self.target_net = torch.load('models/target_model',
                                         map_location=device)
        else:
            self.policy_net = DQN(state_size, self.action_size).to(device)
            self.target_net = DQN(state_size, self.action_size).to(device)
        self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                       lr=0.005,
                                       momentum=0.9)

    def act(self, state):
        if not self.is_eval and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        tensor = torch.FloatTensor(state).to(device)
        options = self.target_net(tensor)
        return np.argmax(options[0].detach().cpu().numpy())

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))
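        # Illustrative sketch: if transitions held two named tuples, zip(*transitions)
        # regroups their fields column-wise, so batch.state gathers both states,
        # batch.action both actions, batch.reward both rewards, and so on.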

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        next_state = torch.FloatTensor(batch.next_state).to(device)
        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, next_state)))
        non_final_next_states = torch.cat(
            [s for s in next_state if s is not None])
        state_batch = torch.FloatTensor(batch.state).to(device)
        action_batch = torch.LongTensor(batch.action).to(device)
        reward_batch = torch.FloatTensor(batch.reward).to(device)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).reshape(
            (self.batch_size,
             3)).gather(1, action_batch.reshape((self.batch_size, 1)))

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size, device=device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
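        # In equation form this is the standard DQN target: y_i = r_i + gamma * max_a Q_target(s'_i, a)
        # for a non-final s'_i, and y_i = r_i when s'_i is final (its next_state_values entry stays 0).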

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Example #6
class Agent:
	"""
	Represents the Agent that plays the reinforcement learning game
	"""
	def __init__(self, state_size, config, is_eval=False):
		"""
		Constructs new Agent object
		:param state_size: size of the state being used
		:param config: config file
		:param is_eval: flag for whether we are training (i.e. updating parameters) or evaluating
		"""
		# Represents the size of each state
		self.state_size = state_size
		# Represents size of the action space
		self.action_size = 3 # 3 options of actions: hold, buy, sell
		self.memory = ReplayMemory(10000) # Constructs a new memory object
		# self.inventory = []
		self.is_eval = is_eval

		# RL parameters: kept the same as in the original project
		self.gamma = config['gamma'] # discount factor from the Bellman equation
		# Epsilon will keep being changed at each iteration
		self.epsilon = config['epsilon']
		self.epsilon_min = config['epsilon_min']  # minimum value epsilon can take
		self.epsilon_decay = config['epsilon_decay']  # amount epsilon decays each iteration
		self.batch_size = config['batch_size']

		# Loads previous models, if they exist
		if os.path.exists(config['target_model']):
			self.policy_net = torch.load(config['policy_model'], map_location=device)
			self.target_net = torch.load(config['target_model'], map_location=device)
		else:
			self.policy_net = DQN(state_size, self.action_size).to(device)
			self.target_net = DQN(state_size, self.action_size).to(device)
		# Optimization function
		self.optimizer = optim.RMSprop(self.policy_net.parameters(),
									   lr=config['learning_rate'], momentum=config['momentum'])

	def act(self, state):
		"""
		Acts on the current state
		"""
		if not self.is_eval and np.random.rand() <= self.epsilon:
			# We are not evaluating and we are in the exploration phase, so try a random action
			return random.randrange(self.action_size)
		# Otherwise, we convert the state to a tensor and run it through our target network
		# to pick the action with the highest predicted Q-value, which we return.
		tensor = torch.FloatTensor(state).to(device)
		options = self.target_net(tensor)
		return np.argmax(options[0].detach().cpu().numpy())

	def decay_epsilon(self):
		"""
		Decays epsilon value according to parameters
		"""
		if self.epsilon > self.epsilon_min:
			self.epsilon -= self.epsilon_decay

	def optimize(self):
		"""
		Optimizes the policy and target nets
		"""
		if len(self.memory) < self.batch_size:
			return
		transitions = self.memory.sample(self.batch_size)
		# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
		# detailed explanation). This converts batch-array of Transitions
		# to Transition of batch-arrays.
		batch = Transition(*zip(*transitions))

		# Compute a mask of non-final states and concatenate the batch elements
		# (a final state would've been the one after which simulation ended)
		next_state = torch.FloatTensor(batch.next_state).to(device)
		# The mask marks which entries of the batch have a non-final next state
		non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state)))
		non_final_next_states = torch.cat([s for s in next_state if s is not None])
		state_batch = torch.FloatTensor(batch.state).to(device)
		# Actions from all elements of the batch - each one 0,1,2
		action_batch = torch.LongTensor(batch.action).to(device)
		# Rewards from the actions corresponding to all elements of the batch
		reward_batch = torch.FloatTensor(batch.reward).to(device)

		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
		# columns of actions taken. These are the actions which would've been taken
		# for each batch state according to policy_net
		q_pred = self.policy_net(state_batch).reshape((self.batch_size, 3)).gather(1, action_batch.reshape((self.batch_size, 1)))

		# Compute V(s_{t+1}) for all next states.
		# Expected values of actions for non_final_next_states are computed based
		# on the "older" target_net; selecting their best reward with max(1)[0].
		# This is merged based on the mask, such that we'll have either the expected
		# state value or 0 in case the state was final.
		v_actual = torch.zeros(self.batch_size, device=device)
		# Fills in the predicted state values for each timestamp
		v_actual[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
		# Compute what should have been the Q values
		q_target = (v_actual * self.gamma) + reward_batch

		# Compute Huber loss
		loss = F.smooth_l1_loss(q_pred, q_target.unsqueeze(1))

		# Decay Epsilon
		self.decay_epsilon()

		# Optimize the model - standard pytorch procedure
		self.optimizer.zero_grad()
		loss.backward()
		for param in self.policy_net.parameters():
			param.grad.data.clamp_(-1, 1)  # Clamp gradients to [-1, 1] to keep updates stable
		self.optimizer.step()
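
Because decay_epsilon() subtracts a fixed amount per optimize() call, the exploration schedule is linear; a quick check with illustrative values (not taken from any config in the source):

# Linear epsilon schedule sanity check (values are illustrative assumptions).
epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.005
steps = 0
while epsilon > epsilon_min:
    epsilon -= epsilon_decay        # mirrors Agent.decay_epsilon()
    steps += 1
print(steps)  # roughly (1.0 - 0.01) / 0.005 = 198 optimize() calls until epsilon bottoms out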