def perform(self, node, inputs, output_storage):
    """Pick, for every query, the passage that best overlaps the query words.

    Parameters
    ----------
    node : unused here.
    inputs : sequence of four items, in this order:
        ``q``        - per-query rows of word ids (presumably vocabulary ids;
                       they are intersected with ``utils.BOW(...)[0]``).
        ``q_m``      - mask over ``q``; only positions with a value > 0 count.
        ``pages_id`` - flat list of candidate page ids, ``div`` consecutive
                       entries per query; entries equal to -1 are skipped.
        ``div``      - number of candidate pages per query.
    output_storage : receives three arrays:
        ``[0][0]`` float32, best match rate per query,
        ``[1][0]`` int32, id of the page containing the best passage
                   (-1 when the query had no usable candidate page),
        ``[2][0]`` int32 matrix with the best passage as word indices, one
                   row per query, right-padded with -2 (-2 means stop word).

    A passage is a window of ``self.n_consec`` consecutive sentences, and its
    match rate is ``2 * |common ids| / (|query ids| + |passage ids|)``.
    """
    q, q_m, pages_id, div = inputs[0], inputs[1], inputs[2], inputs[3]

    # Floor division keeps shapes and indices integral even when true
    # division is active (``from __future__ import division`` / Python 3).
    n_queries = len(pages_id) // div

    R = np.zeros((n_queries,), np.float32)
    best_pages_id = np.zeros((n_queries,), np.int32)
    best_answers = []
    max_words = 0

    for i in range(0, len(pages_id), div):
        row = i // div
        q_bow = list(q[row][q_m[row] > 0.])

        # Gather every window of consecutive sentences from the candidate
        # pages of this query, remembering which page each one came from.
        sents = []
        ref_id = []
        for j in range(div):
            page_id = pages_id[i + j]
            if int(page_id) == -1:
                continue
            text = self.wiki.get_article_text(page_id)
            # NOTE(review): .decode() implies get_article_text returns a
            # byte string -- confirm against the wiki wrapper.
            sents_pre = self.tokenizer.tokenize(text.decode('ascii', 'ignore'))
            n_consec = min(len(sents_pre), self.n_consec)
            for sk in range(len(sents_pre) - n_consec + 1):
                sents.append(' '.join(sents_pre[sk:sk + n_consec]).strip())
                ref_id.append(page_id)

        if not sents:
            # No usable page for this query: np.argmax would raise on an
            # empty array. Keep the reward at 0, flag the page as missing
            # and emit an all-padding answer row.
            best_pages_id[row] = -1
            best_answers.append([])
            continue

        s = np.zeros((len(sents),), np.float32)  # passage sizes
        c = np.zeros((len(sents),), np.float32)  # ids shared with the query
        sents_idx = []
        for j, sent in enumerate(sents):
            words = wordpunct_tokenize(sent.lower())
            sent_bow = utils.BOW(words, self.vocab)
            sents_idx.append(words)
            c[j] = len(set(sent_bow[0]).intersection(q_bow))
            s[j] = len(sent_bow[0])

        # Clamp the denominator so an empty query paired with an empty
        # passage scores 0 instead of NaN (which argmax would select).
        match_rate = 2 * c / np.maximum(1., len(q_bow) + s)
        idx = np.argmax(match_rate)

        R[row] = match_rate[idx]
        best_pages_id[row] = ref_id[idx]

        sent_idx = utils.text2idx(sents_idx[idx], self.vocab)
        best_answers.append(sent_idx)
        max_words = max(max_words, len(sent_idx))

    # Initialize with -2. -2 means stop word.
    best_answers_ = -2 * np.ones((len(best_answers), max_words), np.int32)
    for i, best_answer in enumerate(best_answers):
        best_answers_[i, :len(best_answer)] = best_answer

    output_storage[0][0] = R
    output_storage[1][0] = best_pages_id
    output_storage[2][0] = best_answers_
def perform(self, node, inputs, output_storage):
    """Compute a per-query reward and the best-matching passage.

    Parameters
    ----------
    node : unused here.
    inputs : sequence of four items, in this order:
        ``q``        - per-query rows of word ids (compared against
                       ``self.vocab`` values).
        ``q_m``      - mask over ``q``; only positions with a value > 0 count.
        ``pages_id`` - flat list of candidate page ids, ``div`` consecutive
                       entries per query; entries equal to -1 are skipped.
        ``div``      - number of candidate pages per query.
    output_storage : receives two arrays:
        ``[0][0]`` float32 reward per query,
        ``[1][0]`` int32 matrix with the best passage as word indices, one
                   row per query, right-padded with -2 (-2 means stop word).

    Behaviour depends on ``prm.reward_type``:
      * ``None``         - rewards stay 0 and the answers are all -2
                           (speed-up for supervised mode).
      * ``'continuous'`` - reward is the best match rate, in [0, 1].
      * ``'discrete'``   - reward is 1.0 only when the best match rate is
                           exactly 1, else 0.0.

    A passage is a window of ``self.n_consec`` consecutive sentences, and its
    match rate is ``2 * |common ids| / max(1, |query ids| + |passage ids|)``.

    Raises
    ------
    ValueError
        If ``prm.reward_type`` is none of the values above and at least one
        query has a usable candidate page.
    """
    q, q_m, pages_id, div = inputs[0], inputs[1], inputs[2], inputs[3]

    # Floor division keeps shapes and indices integral even when true
    # division is active (``from __future__ import division`` / Python 3).
    n_queries = len(pages_id) // div
    R = np.zeros((n_queries,), np.float32)

    if prm.reward_type is None:
        # Speed up by not computing rewards and best answer in supervised
        # mode. Initialize with -2. -2 means stop word.
        best_answers_ = -2 * np.ones(
            (n_queries, prm.n_consec * prm.max_words_query), np.int32)
    else:
        reward_mode = str(prm.reward_type).lower()  # loop-invariant
        best_answers = []
        max_words = 0

        for i in range(0, len(pages_id), div):
            row = i // div
            # Distinct query ids at unmasked positions.
            set_q_bow = set(
                ax for j, ax in enumerate(q[row]) if q_m[row][j] > 0.)

            # Gather every window of consecutive sentences from the
            # candidate pages of this query.
            sents = []
            for j in range(div):
                page_id = pages_id[i + j]
                if int(page_id) == -1:
                    continue
                text = self.wiki.get_article_text(page_id)
                # NOTE(review): .decode() implies get_article_text returns
                # a byte string -- confirm against the wiki wrapper.
                sents_pre = self.tokenizer.tokenize(
                    text.decode('ascii', 'ignore'))
                n_consec = min(len(sents_pre), self.n_consec)
                for sk in range(len(sents_pre) - n_consec + 1):
                    sents.append(
                        ' '.join(sents_pre[sk:sk + n_consec]).strip())

            if len(sents) > 0:
                s = np.zeros((len(sents),), np.float32)  # passage sizes
                c = np.zeros((len(sents),), np.float32)  # shared ids
                sents_idx = []
                for j, sent in enumerate(sents):
                    words = wordpunct_tokenize(sent.lower())
                    # Distinct in-vocabulary ids of the passage.
                    sent_bow = set(
                        self.vocab[word] for word in words
                        if word in self.vocab)
                    sents_idx.append(words)
                    c[j] = len(sent_bow & set_q_bow)
                    s[j] = len(sent_bow)

                # Denominator is clamped to 1 to avoid 0/0.
                match_rate = 2 * c / np.maximum(1., (len(set_q_bow) + s))
                idx = np.argmax(match_rate)

                if reward_mode == 'discrete':
                    # make reward \in {0,1}
                    R[row] = float(match_rate[idx] == 1.)
                elif reward_mode == 'continuous':
                    # make reward \in [0,1]
                    R[row] = match_rate[idx]
                else:
                    raise ValueError(
                        'Not a valid value for reward_type parameter. '
                        'Valid options are "continuous", "discrete", or None.')

                sent_idx = utils.text2idx(sents_idx[idx], self.vocab)
                best_answers.append(sent_idx)
                max_words = max(max_words, len(sent_idx))
            else:
                # No candidate passage: the answer is a single stop word.
                best_answers.append([-2])

        # Initialize with -2. -2 means stop word.
        best_answers_ = -2 * np.ones(
            (len(best_answers), max_words), np.int32)
        for i, best_answer in enumerate(best_answers):
            best_answers_[i, :len(best_answer)] = best_answer

    output_storage[0][0] = R
    output_storage[1][0] = best_answers_
def perform(self, node, inputs, output_storage):
    """Compute a per-query reward and the best-matching passage.

    Parameters
    ----------
    node : unused here.
    inputs : sequence of four items, in this order:
        ``q``        - per-query rows of word ids (compared against
                       ``self.vocab`` values).
        ``q_m``      - mask over ``q``; only positions with a value > 0 count.
        ``pages_id`` - flat list of candidate page ids, ``div`` consecutive
                       entries per query; entries equal to -1 are skipped.
        ``div``      - number of candidate pages per query.
    output_storage : receives two arrays:
        ``[0][0]`` float32 reward per query,
        ``[1][0]`` int32 matrix with the best passage as word indices, one
                   row per query, right-padded with -2 (-2 means stop word).

    Behaviour depends on ``prm.reward_type``:
      * ``None``         - rewards stay 0 and the answers are all -2
                           (speed-up for supervised mode).
      * ``'continuous'`` - reward is the best match rate, in [0, 1].
      * ``'discrete'``   - reward is 1.0 only when the best match rate is
                           exactly 1, else 0.0.

    A passage is a window of ``self.n_consec`` consecutive sentences, and its
    match rate is ``2 * |common ids| / max(1, |query ids| + |passage ids|)``.

    Raises
    ------
    ValueError
        If ``prm.reward_type`` is none of the values above and at least one
        query has a usable candidate page.
    """
    q, q_m, pages_id, div = inputs[0], inputs[1], inputs[2], inputs[3]

    # ``//`` instead of ``/``: the quotient is used as an array shape and as
    # an index, so it must stay an integer under true division as well.
    n_queries = len(pages_id) // div
    R = np.zeros((n_queries,), np.float32)

    if prm.reward_type is None:
        # Speed up by not computing rewards and best answer in supervised
        # mode. Initialize with -2. -2 means stop word.
        best_answers_ = -2 * np.ones(
            (n_queries, prm.n_consec * prm.max_words_query), np.int32)
    else:
        reward_mode = str(prm.reward_type).lower()  # loop-invariant
        best_answers = []
        max_words = 0

        for i in range(0, len(pages_id), div):
            row = i // div
            # Distinct query ids at unmasked positions.
            set_q_bow = set(
                ax for j, ax in enumerate(q[row]) if q_m[row][j] > 0.)

            # Build the candidate passages: sliding windows of consecutive
            # sentences over every non-missing page of this query.
            sents = []
            for j in range(div):
                page_id = pages_id[i + j]
                if int(page_id) == -1:
                    continue
                text = self.wiki.get_article_text(page_id)
                # NOTE(review): .decode() implies get_article_text returns
                # a byte string -- confirm against the wiki wrapper.
                sents_pre = self.tokenizer.tokenize(
                    text.decode('ascii', 'ignore'))
                n_consec = min(len(sents_pre), self.n_consec)
                for sk in range(len(sents_pre) - n_consec + 1):
                    sents.append(
                        ' '.join(sents_pre[sk:sk + n_consec]).strip())

            if len(sents) > 0:
                s = np.zeros((len(sents),), np.float32)  # passage sizes
                c = np.zeros((len(sents),), np.float32)  # shared ids
                sents_idx = []
                for j, sent in enumerate(sents):
                    words = wordpunct_tokenize(sent.lower())
                    # Distinct in-vocabulary ids of the passage.
                    sent_bow = set(
                        self.vocab[word] for word in words
                        if word in self.vocab)
                    sents_idx.append(words)
                    # Count how many elements they have in common.
                    c[j] = len(sent_bow & set_q_bow)
                    s[j] = len(sent_bow)

                # Denominator is clamped to 1 to avoid 0/0.
                match_rate = 2 * c / np.maximum(1., (len(set_q_bow) + s))
                idx = np.argmax(match_rate)

                if reward_mode == 'discrete':
                    # make reward \in {0,1}
                    R[row] = float(match_rate[idx] == 1.)
                elif reward_mode == 'continuous':
                    # make reward \in [0,1]
                    R[row] = match_rate[idx]
                else:
                    raise ValueError(
                        'Not a valid value for reward_type parameter. '
                        'Valid options are "continuous", "discrete", or None.')

                sent_idx = utils.text2idx(sents_idx[idx], self.vocab)
                best_answers.append(sent_idx)
                max_words = max(max_words, len(sent_idx))
            else:
                # No candidate passage: the answer is a single stop word.
                best_answers.append([-2])

        # Initialize with -2. -2 means stop word.
        best_answers_ = -2 * np.ones(
            (len(best_answers), max_words), np.int32)
        for i, best_answer in enumerate(best_answers):
            best_answers_[i, :len(best_answer)] = best_answer

    output_storage[0][0] = R
    output_storage[1][0] = best_answers_