def calc_q_link_accuracy(q_info_in, q_manual_info_in):
    """Compute a per-query F1 of automatic entity links against manual ones.

    For each query loaded from ``q_info_in`` the first element of every
    annotation under ``['tagme']['query']`` is collected (falling back to
    ``['cmns']['query']`` when the query has no ``'tagme'`` key). These are
    compared, as sets, with the first element of every annotation under
    ``['manual']['query']`` for the same qid in ``q_manual_info_in``.

    Args:
        q_info_in: handed to ``load_query_info`` to get the automatic
            annotations, keyed by qid.
        q_manual_info_in: handed to ``load_query_info`` to get the manual
            annotations, keyed by qid.

    Returns:
        dict mapping qid to its F1. The value is 1 when both the predicted
        and the manual sets are empty, and 0 when precision or recall is 0.
        The same dict is also printed to stdout as indented JSON.
    """
    h_qid_info = load_query_info(q_info_in)
    h_qid_manual_info = load_query_info(q_manual_info_in)

    h_q_f1 = {}
    for qid, h_info in h_qid_info.items():
        if 'tagme' in h_info:
            l_ana = h_info['tagme']['query']
        elif 'cmns' in h_info:
            l_ana = h_info['cmns']['query']
        else:
            l_ana = []
        s_e = set(ana[0] for ana in l_ana)

        s_true = set()
        if qid in h_qid_manual_info:
            s_true = set(
                ana[0] for ana in h_qid_manual_info[qid]['manual']['query']
            )

        # Only a query with nothing predicted AND nothing labelled is a
        # trivially perfect match. The previous test used `&`, which binds
        # tighter than `==`, so it reduced to "nothing predicted" and gave
        # F1 = 1 to queries whose manual labels were all missed.
        if not s_e and not s_true:
            h_q_f1[qid] = 1
            continue

        overlap = float(len(s_e & s_true))
        prec = overlap / len(s_e) if s_e else 0
        recall = overlap / len(s_true) if s_true else 0

        if prec == 0 or recall == 0:
            f1 = 0
        else:
            f1 = 2.0 * prec * recall / (prec + recall)
        h_q_f1[qid] = f1

    # Parenthesised single argument: prints the same under the Python 2
    # print statement this file uses.
    print(json.dumps(h_q_f1, indent=1))
    return h_q_f1
def process(q_info_in, out_name):
    """Write the two average lengths returned by ``avg_len`` to a file.

    Args:
        q_info_in: handed to ``load_query_info``; the result is passed on to
            ``avg_len``.
        out_name: path of the text file to (over)write.

    The file receives two lines, ``bow_avg_len: <float>`` and
    ``boe_avg_len: <float>``, each formatted with ``%f``.
    """
    h_q_info = load_query_info(q_info_in)
    bow_len, boe_len = avg_len(h_q_info)

    # `with` guarantees the handle is closed even if formatting or writing
    # raises; the trailing newline reproduces what `print >> out` emitted.
    with open(out_name, 'w') as out:
        out.write(
            'bow_avg_len: %f\nboe_avg_len: %f' % (bow_len, boe_len) + '\n'
        )
def __init__(self, **kwargs):
    """Load query info and evaluation results, and prepare the output dir.

    After the parent initializer runs, this:
      * loads ``self.h_q_info`` from ``self.q_info``;
      * extends ``self.out_dir`` with a sub-directory named after the
        target evaluation file (its basename up to the first '.') and
        creates that directory if it does not exist;
      * stores, as dicts, the first element returned by
        ``load_gdeval_res`` for ``self.target_eva`` and ``self.base_eva``.
    """
    super(FusionAnalysis, self).__init__(**kwargs)
    self.h_q_info = load_query_info(self.q_info)

    # Take the basename BEFORE cutting at the first '.'. Splitting the whole
    # path first breaks on inputs such as './runs/x.eval' or
    # 'exp.v2/x.eval', where the text before the first dot is not the file
    # name (and may even be empty).
    run_name = ntpath.basename(self.target_eva).split('.')[0]
    self.out_dir = os.path.join(self.out_dir, run_name)
    if not os.path.exists(self.out_dir):
        os.makedirs(self.out_dir)

    self.h_q_eva = dict(load_gdeval_res(self.target_eva)[0])
    self.h_base_q_eva = dict(load_gdeval_res(self.base_eva)[0])
def get_target_surfaceforms(q_info_in):
    """Collect the lower-cased query substrings covered by tagme annotations.

    Args:
        q_info_in: handed to ``load_query_info``.

    Returns:
        dict whose keys are the lower-cased slices
        ``query[ana[1]:ana[2]]`` for every annotation ``ana`` under
        ``['tagme']['query']``; every value is a fresh empty list.
        The number of keys is also printed to stdout.
    """
    surface_forms = {}
    for info in load_query_info(q_info_in).values():
        text = info['query']
        for annotation in info['tagme']['query']:
            # Elements 1 and 2 of an annotation are used as slice bounds
            # into the query string.
            begin, end = annotation[1], annotation[2]
            surface_forms[text[begin:end].lower()] = []

    print("total [%d] sf" % len(surface_forms))
    return surface_forms
def __init__(self, **kwargs):
    """Build the external resources and load the ranking inputs.

    After the parent initializer runs, a ``LeToRFeatureExternalInfo`` is
    created from the same keyword arguments and several of its attributes
    are mirrored on this object. Query info, the ranked documents and the
    relevance labels are then loaded, and ``self.out_dir`` is created when
    it does not exist yet.
    """
    super(RankComponentAna, self).__init__(**kwargs)

    self.external_info = LeToRFeatureExternalInfo(**kwargs)
    resources = self.external_info
    # Only the first entry of l_embedding is used.
    self.embedding = resources.l_embedding[0]
    self.h_entity_texts = resources.h_entity_texts
    self.h_field_h_df = resources.h_field_h_df
    self.h_corpus_stat = resources.h_corpus_stat

    self.h_q_info = load_query_info(self.q_info_in)
    self.ll_qid_ranked_doc = load_trec_ranking_with_info(
        self.trec_with_info_in
    )
    self.h_qrel = load_trec_labels_dict(self.qrel_in)

    if os.path.exists(self.out_dir):
        return
    os.makedirs(self.out_dir)
def pipe_extract(self, q_info_in=None, out_name=None):
    """Extract one feature dict per query and dump them all to a file.

    Args:
        q_info_in: handed to ``load_query_info``; falls back to
            ``self.q_info_in`` when falsy.
        out_name: destination passed to ``self._dump_feature_svm``; falls
            back to ``self.out_name`` when falsy.

    Queries are processed in ascending order of ``int(qid)``. A query whose
    qid is not in ``self.h_label`` gets the label -1.
    """
    q_info_in = q_info_in or self.q_info_in
    out_name = out_name or self.out_name

    ordered_queries = sorted(
        load_query_info(q_info_in).items(),
        key=lambda pair: int(pair[0]),
    )

    features, labels, qids = [], [], []
    for qid, info in ordered_queries:
        features.append(self._extract(qid, info))
        labels.append(self.h_label[qid] if qid in self.h_label else -1)
        qids.append(qid)

    self._dump_feature_svm(labels, features, qids, out_name)
    logging.info('q att feature extracted to [%s]', out_name)
def _load_data(self):
    """Load the reference query info when a source for it is configured.

    Sets ``self.h_ref_q_info`` from ``self.ref_q_info_in``; does nothing
    when ``self.ref_q_info_in`` is falsy.
    """
    ref_source = self.ref_q_info_in
    if not ref_source:
        return
    self.h_ref_q_info = load_query_info(ref_source)
def __init__(self, **kwargs):
    """Load query info and the evaluation results to compare.

    After the parent initializer runs, this stores:
      * ``h_q_info``    - result of ``load_query_info(self.q_info_in)``;
      * ``h_rel_ndcg``  - result of ``get_rel_ndcg`` on the target and
        baseline evaluation inputs;
      * ``h_base_eva`` / ``h_eva`` - dicts built from ``load_gdeval_res``
        (called with ``False`` as second argument) for the baseline and
        the target evaluation inputs respectively.
    """
    super(QLenPerformanceAna, self).__init__(**kwargs)
    self.h_q_info = load_query_info(self.q_info_in)

    target_eva, baseline_eva = self.eva_in, self.base_eva_in
    self.h_rel_ndcg = get_rel_ndcg(target_eva, baseline_eva)

    baseline_rows = load_gdeval_res(baseline_eva, False)
    self.h_base_eva = dict(baseline_rows)
    target_rows = load_gdeval_res(target_eva, False)
    self.h_eva = dict(target_rows)