def perform(self, q_i, D_truth, is_train, current_queries):
    """Run one retrieval iteration: build textual queries from the masked
    word slots, fetch candidate documents from the search engine, harvest
    feedback-document terms for the next reformulation, and compute
    retrieval metrics.

    :param q_i: int array (n_queries x max_words); entries > -2 mark word
        slots that are actually used (the rest is padding).
    :param D_truth: per-query ground-truth document ids; entries <= -1
        are padding and are skipped.
    :param is_train: when truthy, reformulation terms come from only
        ``max_feedback_docs_train`` randomly sampled feedback docs.
    :param current_queries: list (one entry per query) of candidate term
        lists; extended IN PLACE with the selected feedback-doc terms.
    :return: tuple ``(metrics, D_i, D_id, D_gt_m)`` where
        ``metrics`` is (n_queries x n_metrics) float32,
        ``D_i`` holds word indices of the selected feedback docs
        (-2 = padding),
        ``D_id`` holds ids of all returned docs, and
        ``D_gt_m[i, j]`` is 1 if the j-th returned doc is ground truth
        for query i.
    """
    # Binary mask over word slots: > -2 means the slot holds a real word.
    q_m = (q_i > -2).astype('float32')
    # Iteration number == number of reformulations recorded so far
    # (assumes self.reformulated_queries is a dict keyed by iteration --
    # TODO confirm).
    n_iter = len(self.reformulated_queries)

    # outputs
    metrics = np.zeros((len(q_m), len(self.metrics_map)), np.float32)

    # During training, fewer feedback docs feed the reformulation.
    if is_train:
        max_feedback_docs = self.max_feedback_docs_train
    else:
        max_feedback_docs = self.max_feedback_docs

    # D_i: word indices for all selected documents returned for each query.
    D_i = -2 * np.ones(
        (len(q_m), max_feedback_docs, self.max_words_input), np.int32)
    # D_gt_m: a cell (i, j) is 1 if the j-th document returned is ground
    # truth for query i.
    D_gt_m = np.zeros((len(q_m), self.max_candidates), np.float32)
    # D_id: document ids for all documents returned for each query.
    D_id = np.zeros((len(q_m), self.max_candidates), np.int32)

    # No need to retrieve extra terms in the last iteration.
    extra_terms = (n_iter != self.n_iterations - 1)

    # Allow the search engine to cache queries only in the first iteration.
    save_cache = self.use_cache if n_iter == 0 else False

    max_cand = self.max_candidates

    # Assemble the textual query for each example from its unmasked,
    # non-empty words.
    qs = []
    for i, q_lst in enumerate(current_queries):
        q = [str(word) for j, word in enumerate(q_lst)
             if q_m[i, j] == 1 and len(word) > 0]
        q = ' '.join(q)
        if len(q) == 0:
            # Engine presumably cannot handle an empty query string.
            q = 'dummy'
        qs.append(q)

    # Only used to print the reformulated queries.
    self.reformulated_queries[n_iter] = qs

    # Always return one more candidate because one of them might be the
    # input doc.
    candss = self.engine.get_candidates(qs, max_cand, self.max_feedback_docs,
                                        save_cache, extra_terms)

    # For every query, the engine returns a mapping
    # {doc_id: (word_indices, words)}.
    for i, cands in enumerate(candss):
        # Ground-truth doc ids for this query; padding (<= -1) skipped.
        D_truth_dic = {}
        for d_truth in D_truth[i]:
            if d_truth > -1:
                D_truth_dic[d_truth] = 0

        # list() keeps this valid on Python 3, where dict.keys() is a
        # view, and matches the sibling implementation in this file.
        cand_keys = list(cands.keys())
        D_id[i, :len(cand_keys)] = cand_keys

        j = 0  # rank of the candidate in the returned list
        m = 0  # number of feedback docs used for term extraction so far
        cand_ids = []
        selected_docs = np.arange(self.max_feedback_docs)
        if is_train:
            # Metrics are calculated over ALL returned documents, but
            # candidate words for query reformulation come only from
            # max_feedback_docs_train randomly sampled documents.
            selected_docs = np.random.choice(
                selected_docs, size=self.max_feedback_docs_train,
                replace=False)

        for k, (cand_id, (words_idx, words)) in enumerate(cands.items()):
            cand_ids.append(cand_id)
            # No need to add candidate words in the last iteration.
            if n_iter < self.n_iterations - 1:
                # Only add docs selected by sampling (if training).
                if k in selected_docs:
                    words = words[:self.max_terms_per_doc]
                    words_idx = words_idx[:self.max_terms_per_doc]
                    D_i[i, m, :len(words_idx)] = words_idx
                    # Append empty strings, so the list size becomes <dim>.
                    words = words + max(
                        0, self.max_words_input - len(words)) * ['']
                    # Append new words to the list of current queries.
                    current_queries[i] += words
                    m += 1

            if cand_id in D_truth_dic:
                D_gt_m[i, j] = 1.
            j += 1

        cands_set = set(cand_keys)
        if qs[i].lower() in self.engine.title_id_map:
            input_doc_id = self.engine.title_id_map[qs[i].lower()]
            # Remove the input doc from the returned docs; discard() does
            # not raise if the element is not there.
            cands_set.discard(input_doc_id)

        intersec = len(set(D_truth_dic.keys()) & cands_set)
        recall = intersec / max(1., float(len(D_truth_dic)))
        precision = intersec / max(1., float(self.max_candidates))
        metrics[i, self.metrics_map['RECALL']] = recall
        metrics[i, self.metrics_map['PRECISION']] = precision
        metrics[i, self.metrics_map['F1']] = 2 * recall * precision / max(
            0.01, recall + precision)
        avg_precision = average_precision.compute(list(D_truth_dic.keys()),
                                                  cand_ids)
        metrics[i, self.metrics_map['MAP']] = avg_precision
        metrics[i, self.metrics_map['LOG-GMAP']] = np.log(avg_precision + 1e-5)

    return metrics, D_i, D_id, D_gt_m
def perform(self, node, inputs, output_storage):
    """Search step: rebuild textual queries from the masked word slots,
    retrieve candidate documents, collect feedback-document terms for the
    next reformulation, and compute retrieval metrics.

    NOTE(review): the (node, inputs, output_storage) signature matches
    Theano's Op.perform convention -- confirm against the enclosing class.

    inputs[0]: q_m      -- word-slot mask (n_queries x max_words); 1 = used slot.
    inputs[1]: D_truth  -- ground-truth doc ids per query; entries <= -1 are padding.
    inputs[2]: n_iter   -- current reformulation iteration.
    inputs[3]: is_train -- when 1, feedback docs are randomly subsampled.

    output_storage receives: [0] metrics, [1] D_i (term indices of the
    selected feedback docs, -2 = padding), [2] D_id (ids of returned
    docs), [3] D_gt_m (ground-truth indicator per returned doc).

    Side effect: extends self.options['current_queries'][i] in place with
    the terms of the selected feedback documents.
    """
    q_m = inputs[0]
    D_truth = inputs[1]
    n_iter = int(inputs[2])
    is_train = int(inputs[3])

    #outputs
    metrics = np.zeros((len(q_m), len(prm.metrics_map)), np.float32)

    # During training, fewer feedback docs feed the reformulation.
    if is_train:
        max_feedback_docs = prm.max_feedback_docs_train
    else:
        max_feedback_docs = prm.max_feedback_docs

    # D_i: word indices of the selected feedback docs per query.
    D_i = -2 * np.ones(
        (len(q_m), max_feedback_docs, prm.max_words_input), np.int32)
    # D_gt_m: cell (i, j) is 1 if the j-th returned doc is ground truth
    # for query i.
    D_gt_m = np.zeros((len(q_m), prm.max_candidates), np.float32)
    # D_id: ids of all documents returned for each query.
    D_id = np.zeros((len(q_m), prm.max_candidates), np.int32)

    # no need to retrieve extra terms in the last iteration
    if n_iter == prm.n_iterations - 1:
        extra_terms = False
    else:
        extra_terms = True

    # allow the search engine to cache queries only in the first iteration.
    if n_iter == 0:
        save_cache = prm.use_cache
    else:
        save_cache = False

    max_cand = prm.max_candidates

    # Assemble the textual query for each example from its unmasked words.
    qs = []
    for i, q_lst in enumerate(self.options['current_queries']):
        q = []
        for j, word in enumerate(q_lst):
            if q_m[i, j] == 1:
                q.append(str(word))
        q = ' '.join(q)
        if len(q) == 0:
            # Engine presumably cannot handle an empty query string.
            q = 'dummy'
        qs.append(q)

    # only used to print the reformulated queries.
    self.options['reformulated_queries'][n_iter] = qs

    # always return one more candidate because one of them might be the input doc.
    candss = self.options['engine'].get_candidates(qs, max_cand,
                                                   prm.max_feedback_docs,
                                                   save_cache, extra_terms)

    # For every query, the engine returns {doc_id: (word_indices, words)}
    # -- TODO confirm against the engine implementation.
    for i, cands in enumerate(candss):
        # Ground-truth doc ids for this query; padding (<= -1) skipped.
        D_truth_dic = {}
        for d_truth in D_truth[i]:
            if d_truth > -1:
                D_truth_dic[d_truth] = 0

        D_id[i, :len(list(cands.keys()))] = list(cands.keys())

        j = 0  # rank of the candidate in the returned list
        m = 0  # number of feedback docs used for term extraction so far
        cand_ids = []
        selected_docs = np.arange(prm.max_feedback_docs)
        if is_train:
            # Metrics are computed over ALL returned docs, but
            # reformulation terms come only from this random sample.
            selected_docs = np.random.choice(
                selected_docs, size=prm.max_feedback_docs_train,
                replace=False)

        for k, (cand_id, (words_idx, words)) in enumerate(cands.items()):
            cand_ids.append(cand_id)
            # no need to add candidate words in the last iteration.
            if n_iter < prm.n_iterations - 1:
                # only add docs selected by sampling (if training).
                if k in selected_docs:
                    words = words[:prm.max_terms_per_doc]
                    words_idx = words_idx[:prm.max_terms_per_doc]
                    D_i[i, m, :len(words_idx)] = words_idx
                    # append empty strings, so the list size becomes <dim>.
                    words = words + max(
                        0, prm.max_words_input - len(words)) * ['']
                    # append new words to the list of current queries.
                    self.options['current_queries'][i] += words
                    m += 1

            if cand_id in D_truth_dic:
                D_gt_m[i, j] = 1.
            j += 1

        cands_set = set(cands.keys())
        if qs[i].lower() in self.options['engine'].title_id_map:
            input_doc_id = self.options['engine'].title_id_map[
                qs[i].lower()]
            # Remove input doc from returned docs.
            # This operation does not raise an error if the element is not there.
            cands_set.discard(input_doc_id)

        intersec = len(set(D_truth_dic.keys()) & cands_set)
        recall = intersec / max(1., float(len(D_truth_dic)))
        precision = intersec / max(1., float(prm.max_candidates))
        metrics[i, prm.metrics_map['RECALL']] = recall
        metrics[i, prm.metrics_map['PRECISION']] = precision
        metrics[i, prm.metrics_map['F1']] = 2 * recall * precision / max(
            0.01, recall + precision)
        avg_precision = average_precision.compute(list(D_truth_dic.keys()),
                                                  cand_ids)
        metrics[i, prm.metrics_map['MAP']] = avg_precision
        metrics[i, prm.metrics_map['LOG-GMAP']] = np.log(avg_precision + 1e-5)

    output_storage[0][0] = metrics
    output_storage[1][0] = D_i
    output_storage[2][0] = D_id
    output_storage[3][0] = D_gt_m
def perform(self, node, inputs, output_storage):
    """Search step with configurable candidate-term sources ('doc',
    'syn'/'synemb', or 'all'), optional IDF masking of candidate terms,
    and an optional supervised oracle for ground-truth term selection.

    NOTE(review): the (node, inputs, output_storage) signature matches
    Theano's Op.perform convention -- confirm against the enclosing class.

    inputs[0]: q_m         -- word-slot mask (n_queries x max_words); 1 = used slot.
    inputs[1]: D_truth     -- ground-truth doc ids per query; entries <= -1 are padding.
    inputs[2]: n_iter      -- current reformulation iteration.
    inputs[3]: oracle_mode -- when 1 (and prm.supervised, first iteration),
                              compute/fetch supervised term labels.
    inputs[4]: is_train    -- when 1 (and not supervised), feedback docs
                              are randomly subsampled.

    output_storage receives: [0] metrics, [1] D_i (candidate-term
    indices, -2 = padding), [2] D_idf_m (0/1 IDF mask over those terms),
    [3] D_id (returned doc ids), [4] D_gt_m (ground-truth indicator per
    returned doc), and [5] D_i_gt (supervised labels) when prm.supervised.

    Side effect: extends self.options['current_queries'][i] in place with
    feedback-doc terms and/or synonym expansions.
    """
    q_m = inputs[0]
    D_truth = inputs[1]
    n_iter = int(inputs[2])
    oracle_mode = int(inputs[3])
    is_train = int(inputs[4])

    #outputs
    metrics = np.zeros((len(q_m), len(prm.metrics_map)), np.float32)

    # During training, fewer feedback docs feed the reformulation.
    if is_train:
        max_feedback_docs = prm.max_feedback_docs_train
    else:
        max_feedback_docs = prm.max_feedback_docs

    # Number of candidate-term rows depends on where terms come from.
    # NOTE(review): cand_size is undefined for any other
    # prm.cand_terms_source value -- a NameError would follow; confirm
    # the config is validated upstream.
    if prm.cand_terms_source.lower() in ['syn', 'synemb']:
        cand_size = prm.syns_per_word
    elif prm.cand_terms_source.lower() == 'doc':
        cand_size = max_feedback_docs
    elif prm.cand_terms_source.lower() == 'all':
        cand_size = max_feedback_docs + prm.syns_per_word

    # D_i: candidate-term word indices (-2 = padding).
    D_i = -2 * np.ones(
        (len(q_m), cand_size, prm.max_words_input), np.int32)
    # D_idf_m: 1 keeps a term, 0 masks low-IDF terms (see idf_threshold below).
    D_idf_m = np.ones((len(q_m), cand_size, prm.max_words_input),
                      np.float32)
    # D_gt_m: cell (i, j) is 1 if the j-th returned doc is ground truth
    # for query i.
    D_gt_m = np.zeros((len(q_m), prm.max_candidates), np.float32)
    # D_id: ids of all documents returned for each query.
    D_id = np.zeros((len(q_m), prm.max_candidates), np.int32)

    if prm.supervised:
        # Supervised term-selection labels, filled only in oracle mode.
        D_i_gt = np.zeros(
            (len(q_m), max_feedback_docs, prm.max_words_input), np.float32)

    # no need to retrieve extra terms in the last iteration
    if n_iter == prm.n_iterations - 1:
        extra_terms = False
    else:
        extra_terms = True

    # allow the search engine to cache queries only in the first iteration.
    if n_iter == 0:
        save_cache = prm.use_cache
    else:
        save_cache = False

    max_cand = prm.max_candidates

    # Assemble the textual query for each example from its unmasked words.
    qs = []
    for i, q_lst in enumerate(self.options['current_queries']):
        q = []
        for j, word in enumerate(q_lst):
            if q_m[i, j] == 1:
                q.append(str(word))
        q = ' '.join(q)
        if len(q) == 0:
            # Engine presumably cannot handle an empty query string.
            q = 'dummy'
        qs.append(q)

    # only used to print the reformulated queries.
    self.options['reformulated_queries'][n_iter] = qs

    # always return one more candidate because one of them might be the input doc.
    candss = self.options['engine'].get_candidates(qs, max_cand,
                                                   prm.max_feedback_docs,
                                                   save_cache, extra_terms)

    # For every query, the engine returns {doc_id: (word_indices, words)}
    # -- TODO confirm against the engine implementation.
    for i, cands in enumerate(candss):
        # Ground-truth doc ids for this query; padding (<= -1) skipped.
        D_truth_dic = {}
        for d_truth in D_truth[i]:
            if d_truth > -1:
                D_truth_dic[d_truth] = 0

        D_id[i, :len(cands.keys())] = cands.keys()

        j = 0  # rank of the candidate in the returned list
        m = 0  # number of feedback docs used for term extraction so far
        cand_ids = []
        if prm.supervised:
            # Raw (untruncated-padded) word lists fed to the oracle below.
            wordss = []
        selected_docs = np.arange(prm.max_feedback_docs)
        if is_train and not prm.supervised:
            # Metrics are computed over ALL returned docs, but
            # reformulation terms come only from this random sample.
            selected_docs = np.random.choice(
                selected_docs, size=prm.max_feedback_docs_train,
                replace=False)

        for k, (cand_id, (words_idx, words)) in enumerate(cands.items()):
            cand_ids.append(cand_id)
            if prm.cand_terms_source.lower() in ['doc', 'all']:
                # no need to add candidate words in the last iteration.
                if n_iter < prm.n_iterations - 1:
                    # only add docs selected by sampling (if training).
                    if k in selected_docs:
                        words = words[:prm.max_terms_per_doc]
                        words_idx = words_idx[:prm.max_terms_per_doc]
                        D_i[i, m, :len(words_idx)] = words_idx
                        if prm.idf_threshold > 0.0:
                            # Zero the mask for terms whose IDF is at or
                            # below the threshold (too common to be useful).
                            for p, word in enumerate(words):
                                if word.lower(
                                ) in self.options['engine'].idf:
                                    if self.options['engine'].idf[
                                            word.lower(
                                            )] <= prm.idf_threshold:
                                        D_idf_m[i, m, p] = 0
                        if prm.supervised:
                            wordss.append(words)
                        # append empty strings, so the list size becomes <dim>.
                        words = words + max(
                            0, prm.max_words_input - len(words)) * ['']
                        # append new words to the list of current queries.
                        self.options['current_queries'][i] += words
                        m += 1

            if cand_id in D_truth_dic:
                D_gt_m[i, j] = 1.
            j += 1

        if prm.cand_terms_source.lower() in ['syn', 'synemb', 'all']:
            # no need to add candidate words in the last iteration.
            if n_iter < prm.n_iterations - 1:
                # Expand the (possibly just-extended) current query terms
                # with up to syns_per_word synonyms each.
                words = []
                words_idx = []
                for curr_word in self.options['current_queries'][i]:
                    # NOTE(review): .decode implies query terms are byte
                    # strings here (Python 2 era) -- this raises on
                    # Python 3 str; confirm before porting.
                    curr_word = curr_word.decode('ascii', 'ignore')
                    if curr_word in self.options['syns']:
                        syns, syns_idx = self.options['syns'][curr_word]
                        words.extend(syns[:prm.syns_per_word])
                        words_idx.extend(syns_idx[:prm.syns_per_word])
                words = words[:prm.syns_per_word * prm.max_words_input]
                words_idx = words_idx[:prm.syns_per_word *
                                      prm.max_words_input]
                # Pad with -2 and reshape so synonyms occupy the last
                # syns_per_word rows of D_i.
                temp = -2 * np.ones(
                    (prm.syns_per_word * prm.max_words_input))
                temp[:len(words_idx)] = words_idx
                temp = temp.reshape(
                    (prm.syns_per_word, prm.max_words_input))
                D_i[i, -prm.syns_per_word:, :] = temp
                # append empty strings, so the list size becomes <dim>.
                words += max(0, prm.max_words_input - len(words)) * ['']
                # append new words to the list of current queries.
                self.options['current_queries'][i] += words

        cands_set = set(cands.keys())
        if qs[i].lower() in self.options['engine'].title_id_map:
            input_doc_id = self.options['engine'].title_id_map[
                qs[i].lower()]
            # Remove input doc from returned docs.
            # This operation does not raise an error if the element is not there.
            cands_set.discard(input_doc_id)

        intersec = len(set(D_truth_dic.keys()) & cands_set)
        recall = intersec / max(1., float(len(D_truth_dic)))
        precision = intersec / max(1., float(prm.max_candidates))
        metrics[i, prm.metrics_map['RECALL']] = recall
        metrics[i, prm.metrics_map['PRECISION']] = precision
        metrics[i, prm.metrics_map['F1']] = 2 * recall * precision / max(
            0.01, recall + precision)
        avg_precision = average_precision.compute(D_truth_dic.keys(),
                                                  cand_ids)
        metrics[i, prm.metrics_map['MAP']] = avg_precision
        metrics[i, prm.metrics_map['LOG-GMAP']] = np.log(avg_precision + 1e-5)

        if prm.supervised and n_iter == 0 and oracle_mode:
            # Per-query cache of oracle labels; a leading -2 marks
            # "not computed yet".
            d_i_gt = self.options['supervised'][
                self.options['current_queries_ids'][i]]
            if d_i_gt[0, 0] == -2:
                # only works with RECALL by now...
                d_i_gt = supervised.run(
                    qs[i], D_truth[i][0], wordss,
                    metrics[i, prm.metrics_map[prm.reward.upper()]],
                    self.options, D_gt=D_truth_dic.keys())
                self.options['supervised'][
                    self.options['current_queries_ids'][i]] = d_i_gt
            D_i_gt[i, :, :] = d_i_gt

    output_storage[0][0] = metrics
    output_storage[1][0] = D_i
    output_storage[2][0] = D_idf_m
    output_storage[3][0] = D_id
    output_storage[4][0] = D_gt_m
    if prm.supervised:
        output_storage[5][0] = D_i_gt