def fit(self, neps, tri, perm):
    """Train the sentence-generation agent with Q-learning.

    Runs ``tri`` independent trials; each trial creates a fresh agent and
    runs ``neps`` episodes against a new Environment per episode, updating
    the agent's weights after every step.

    Parameters
    ----------
    neps : int
        Number of episodes per trial.
    tri : int
        Number of independent trials.
    perm : tuple
        Hyperparameter setting ``(epsilon, alpha, gamma)``.

    Returns
    -------
    tuple
        ``(returns, elapsed)`` where ``returns`` is an ``np.ndarray`` of
        shape ``(neps, tri)`` holding the discounted return collected in
        each episode of each trial, and ``elapsed`` is a
        ``datetime.timedelta`` of wall-clock training time.
    """
    start = datetime.now()
    self.epsilon, self.alpha, self.gamma = perm
    returns = np.zeros((neps, tri))
    for t in range(tri):
        # Fresh agent per trial so trials are statistically independent.
        self.agent = SGA.sentGenAgent(self.epsilon, self.alpha, self.gamma)
        print('Running trial: ', t)
        for n in range(neps):
            env = ENV.Environment(self.rewards, self.reward_func)
            state = env.prev_state_id
            run = 'run'
            step = 0
            while run == 'run':
                action = self.agent.getAction(state)
                run, reward = env.getNextState(action)
                # Accumulate the discounted return for this episode.
                returns[n, t] += (self.gamma ** step) * reward
                step += 1
                qsa, phisa = self.agent.qValue(state, action)
                next_state = env.next_state_id
                if run == 'terminate':
                    # Terminal state has no future value — no need to
                    # evaluate Q(s', a') for every action.
                    qsa_prime = 0.0
                else:
                    # Bootstrap target: max over actions of Q(s', a').
                    qsa_prime = max(
                        self.agent.qValue(next_state, act)[0]
                        for act in self.agent.actions.keys()
                    )
                self.agent.updateWeights(reward, qsa_prime, qsa, phisa)
                state = next_state
    end = datetime.now()
    return returns, end - start
def predict(self, n_sentences=1000):
    """Generate sentences with the trained agent and score the batch.

    Each sentence starts from a fresh Environment's initial state and
    follows the agent's policy until the environment signals termination.

    Parameters
    ----------
    n_sentences : int, optional
        Number of sentences to generate (default 1000, matching the
        previous hard-coded behavior).

    Returns
    -------
    The result of ``self.score(sentences)`` on the generated batch.
    """
    sentences = []
    for _ in range(n_sentences):
        env = ENV.Environment(self.rewards, self.reward_func)
        state = env.prev_state_id
        # Start each sentence with the environment's initial token.
        sent = [env.prev_state]
        run = 'run'
        while run == 'run':
            action = self.agent.getAction(state)
            run, reward = env.getNextState(action)  # reward unused here
            sent.append(env.next_state)
            state = env.next_state_id
        sentences.append(sent)
    return self.score(sentences)