def extract_per_entity(self, q_info, ana, doc_info):
    """Assemble the per-entity features for one query entity across all target fields.

    :param q_info: query info dict
    :param ana: query entity annotation; ``ana['id']`` is the entity id
    :param doc_info: document info dict holding text fields and entity grids
    :return: feature-name -> value dict, names prefixed with the field
    """
    qe = ana['id']
    h_feature = dict()
    for field in self.l_target_fields:
        l_grid = doc_info.get(E_GRID_FIELD, {}).get(field, [])
        l_qe_grid = self._filter_e_grid(qe, l_grid)
        doc_lm = text2lm(doc_info.get(field, ""))
        # grid-level entity scores must be attached before the score
        # features below read them
        if 'grid' in self.l_feature:
            l_qe_grid = self._calc_grid_scores(l_qe_grid, doc_lm)
        prefix = field + '_'
        if 'passage' in self.l_feature:
            h_feature.update(add_feature_prefix(
                self._entity_passage_features(q_info, l_qe_grid, field), prefix))
        if 'desp' in self.l_feature:
            h_feature.update(add_feature_prefix(
                self._desp_passage_features(qe, l_qe_grid, field), prefix))
        if 'grid' in self.l_feature:
            h_feature.update(add_feature_prefix(
                self._grid_score_features(qe, l_qe_grid), prefix))
        # coherence is only computed on the body field
        if 'coherence' in self.l_feature and field == body_field:
            h_feature.update(add_feature_prefix(
                self._qe_grid_coherence(qe, l_grid), prefix))
        if 'esr' in self.l_feature:
            h_feature.update(add_feature_prefix(
                self._local_esr(qe, l_qe_grid), prefix))
    return h_feature
def _extract_per_entity_per_nlss_per_field(self, ana, doc_info, l_qe_nlss,
                                           l_e_grid, l_nlss_bow, l_nlss_emb):
    """Match one field's entity-grid sentences against the entity's NLSS.

    Keeps only grid sentences that mention the entity, converts them to
    bag-of-words and embedding form, computes translation (similarity)
    matrices against the pre-computed NLSS representations, and pools each
    matrix into features prefixed ``BOW_`` / ``Emb_``.

    :param ana: entity annotation (only ``ana['id']`` is used)
    :param doc_info: document info dict
    :param l_qe_nlss: NLSS of the query entity
    :param l_e_grid: entity grid of this field
    :param l_nlss_bow: pre-computed BOW of the NLSS
    :param l_nlss_emb: pre-computed embeddings of the NLSS
    :return: pooled feature dict
    """
    grids_with_e = self._filter_e_grid(ana['id'], l_e_grid)
    grid_bows = self._form_grid_bow(grids_with_e)
    grid_embs = self._form_grid_emb(grids_with_e)

    bow_trans = self._calc_bow_trans(grid_bows, l_nlss_bow)
    emb_trans = self._calc_emb_trans(grid_embs, l_nlss_emb)

    h_feature = dict()
    h_feature.update(add_feature_prefix(
        self._pool_grid_nlss_sim(bow_trans), 'BOW_'))
    h_feature.update(add_feature_prefix(
        self._pool_grid_nlss_sim(emb_trans), 'Emb_'))
    return h_feature
def extract_per_entity(self, q_info, ana, doc_info):
    """Extract every enabled feature group for one (query, entity, doc) triple.

    Rebuilds the per-query NLSS cache when a new query id arrives, then runs
    each feature group listed in ``self.l_features`` on every target field,
    prefixing feature names with the field.

    :param q_info: query info dict (needs 'qid')
    :param ana: query entity annotation; ``ana['id']`` is the entity id
    :param doc_info: document info dict (needs 'docno')
    :return: feature-name -> value dict
    """
    qe = ana['id']
    qid = q_info['qid']
    logging.info('start extracting [%s]-[%s]-[%s]', qid, qe, doc_info['docno'])
    if qid != self.current_qid:
        # new query: refresh cached NLSS info for its entities
        self.current_qid = qid
        self._construct_e_nlss_cash_info(q_info, self.resource.l_h_nlss[0])

    h_feature = dict()
    for field in self.l_target_fields:
        l_field_ana = form_boe_per_field(doc_info, field)
        h_field_lm = text2lm(doc_info.get(field, ""), clean=True)
        # (group name, lazy extractor) pairs, evaluated in the original order
        l_group = [
            ('emb_vote',
             lambda: self._connected_emb_vote(qe, l_field_ana)),
            ('edge_cnt',
             lambda: self._edge_cnt(qe, l_field_ana)),
            ('edge_retrieval',
             lambda: self._edge_retrieval(qe, l_field_ana, h_field_lm, field)),
            ('local_grid',
             lambda: self._local_grid(q_info, qe, l_field_ana, doc_info, field)),
            ('qe_grid',
             lambda: self._qe_grid(q_info, qe, doc_info, field)),
            ('nlss_grid',
             lambda: self._nlss_grid(q_info, qe, l_field_ana, doc_info, field)),
            ('ltr_base',
             lambda: self._ltr_baseline(q_info, h_field_lm, field)),
            ('local_vote',
             lambda: self._local_vote(q_info, qe, l_field_ana, doc_info, field)),
            ('grid_retrieval',
             lambda: self._grid_retrieval(qe, h_field_lm, doc_info, field)),
            ('edge_grid',
             lambda: self._edge_grid(qe, doc_info, field)),
        ]
        for name, extract in l_group:
            if name in self.l_features:
                h_feature.update(add_feature_prefix(extract(), field + '_'))
    return h_feature
def _grid_score_features(self, qe, l_grid): """ :param qe: :param l_grid: :return: h_feature """ h_feature = dict() ll_grid_e_score = [[e_score for e_score in grid['e_score'] if e_score['id'] == qe] + [e_score for e_score in grid['e_score'] if e_score['id'] != qe] for grid in l_grid] for name in self.l_grid_scores: ll_this_grid_score = [[h_score.get(name, 0) for h_score in l_grid_e_score] for l_grid_e_score in ll_grid_e_score ] h_this_score = dict() l_qe_score = [l_score[0] for l_score in ll_this_grid_score] if not l_qe_score: l_qe_score.append(0) h_this_score['Sum'] = sum(l_qe_score) h_this_score['Max'] = max(l_qe_score) # h_this_score['FullCombine'] = sum( # [sum(l_score) / max(float(len(l_score)), 1.0) for l_score in ll_this_grid_score] # ) h_this_score['NormSum'] = sum([l_score[0] / float(max(sum(l_score), 1.0)) for l_score in ll_this_grid_score]) h_this_score = add_feature_prefix(h_this_score, name) h_feature.update(h_this_score) return h_feature
def _entity_passage_features(self, q_info, l_grid, field):
    """Retrieval scores of the query against the pooled qe grid sentences.

    All grid sentences are merged into one pseudo-passage and scored against
    the raw query text with the standard retrieval models, using this
    field's corpus statistics. Features are prefixed 'EntityPassage'.

    :param q_info: query info dict (needs 'query')
    :param l_grid: grids mentioning the query entity
    :param field: field name, used to look up corpus statistics
    :return: feature dict
    """
    passage = ' '.join(grid['sent'] for grid in l_grid)
    q_lm = text2lm(q_info['query'])
    passage_lm = text2lm(passage)

    corpus_stat = self.resource.corpus_stat
    r_model = RetrievalModel()
    r_model.set_from_raw(
        q_lm, passage_lm,
        corpus_stat.h_field_df.get(field, None),
        corpus_stat.h_field_total_df.get(field, None),
        corpus_stat.h_field_avg_len.get(field, None)
    )
    return add_feature_prefix(dict(r_model.scores()), 'EntityPassage')
def extract_per_entity(self, q_info, ana, doc_info):
    """Extract features for one query entity against one document.

    Iterates over every NLSS resource configured on ``self.resource``,
    selects the relevant sentences for this (query, entity, doc) triple,
    runs the NLSS feature pipeline, and prefixes the resulting features
    with the selection mode and resource name.

    :param q_info: query info
    :param ana: one query entity annotation; ``ana['id']`` is the entity id
    :param doc_info: document info
    :return: feature-name -> value dict
    """
    h_feature = dict()
    e_id = ana['id']
    ll_qe_nlss = [h_nlss.get(e_id, []) for h_nlss in self.resource.l_h_nlss]
    # enumerate replaces the Py2-only xrange(len(...)) index loop
    for p, data in enumerate(ll_qe_nlss):
        # some resources store the sentence list JSON-serialized;
        # isinstance is the robust form of the old `type(data) is str` check
        if isinstance(data, str):
            data = json.loads(data)
        nlss_name = self.resource.l_nlss_name[p]
        nlss_select = self.l_nlss_selection[p]
        l_this_nlss = self._select_nlss(
            q_info, ana, doc_info, nlss_select, data)
        h_this_nlss_feature = self._extract_per_entity_via_nlss(
            q_info, ana, doc_info, l_this_nlss)
        h_feature.update(add_feature_prefix(
            h_this_nlss_feature, nlss_select + nlss_name + '_'))
    return h_feature
def extract(self, qid, docno, h_q_info, h_doc_info):
    """Run per-field extraction and merge the results under prefixed names.

    :param qid: query id (unused here; kept for the extractor interface)
    :param docno: document number (unused here; kept for the extractor interface)
    :param h_q_info: query info dict
    :param h_doc_info: document info dict
    :return: combined feature dict, names prefixed feature_name_pre + '_' + field
    """
    h_feature = {}
    for field in self.l_fields:
        prefix = self.feature_name_pre + '_' + field
        h_this_feature = self.extract_per_field(h_q_info, h_doc_info, field)
        h_feature.update(add_feature_prefix(h_this_feature, prefix))
    return h_feature
def _local_grid(self, q_info, qe, l_field_ana, doc_info, field):
    """Retrieval features over grid sentences anchored on qe.

    Builds two pseudo-passages from the field's entity grid:
      * QEGrid   - sentences that mention qe itself
      * NlssGrid - sentences that mention a tail entity reachable from qe
                   through its NLSS
    and scores the query against each.

    :param q_info: query info
    :param qe: query entity id
    :param l_field_ana: entity annotations of this doc field
    :param doc_info: document info
    :param field: field name
    :return: feature dict with 'QEGrid_' / 'NlssGrid_' prefixes
    """
    h_e_nlss_idx = self.l_h_e_nlss_idx[self.h_qe_idx[qe]]
    l_tail_e = [ana['id'] for ana in l_field_ana
                if ana['id'] in h_e_nlss_idx]

    l_qe_sent = []
    l_nlss_sent = []
    for grid in doc_info.get(E_GRID_FIELD, {}).get(field, []):
        s_grid_e = set(ana['id'] for ana in grid['spot'])
        if qe in s_grid_e:
            l_qe_sent.append(grid['sent'])
        # a sentence qualifies once any tail entity appears in it
        if any(tail_e in s_grid_e for tail_e in l_tail_e):
            l_nlss_sent.append(grid['sent'])

    logging.info('q [%s] e [%s] doc [%s] has [%d] qe grid, [%d] nlss grid',
                 q_info['qid'], qe, doc_info['docno'],
                 len(l_qe_sent), len(l_nlss_sent))

    qe_grid_lm = text2lm(' '.join(l_qe_sent), clean=True)
    nlss_e_grid_lm = text2lm(' '.join(l_nlss_sent), clean=True)
    q_lm = text2lm(q_info[QUERY_FIELD])

    h_feature = {}
    h_feature.update(add_feature_prefix(
        dict(self._extract_retrieval_scores(q_lm, qe_grid_lm, field)),
        'QEGrid_'))
    h_feature.update(add_feature_prefix(
        dict(self._extract_retrieval_scores(q_lm, nlss_e_grid_lm, field)),
        'NlssGrid_'))
    return h_feature
def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
    """Extract entity-document features from the entity's top NLSS.

    Picks the top-k NLSS for the query, scores each sentence (plus their
    concatenation, appended last) against every target field's language
    model with the retrieval models, normalizing by sentence length.
    Per-sentence scores are max-pooled; the concatenated sentence's scores
    are emitted under the 'Conca' prefix.

    :param q_info: query info
    :param ana: query entity annotation
    :param doc_info: document info
    :param l_qe_nlss: NLSS of the query entity
    :return: h_feature dict
    """
    l_top_nlss = self._find_top_k_nlss_for_q(q_info, ana, l_qe_nlss)
    l_top_sent = [nlss[0] for nlss in l_top_nlss]
    # The concatenation is always appended, so even with no NLSS the list
    # holds one (empty) sentence. The original's separate empty-list
    # placeholder check ran AFTER this append and was therefore unreachable;
    # it is removed here.
    l_top_sent.append(' '.join(l_top_sent))

    l_field_doc_lm = [
        text2lm(doc_info.get(field, ""), clean=True)
        for field in self.l_target_fields
    ]
    corpus_stat = self.resource.corpus_stat
    l_h_per_sent_feature = []
    for sent in l_top_sent:
        h_per_sent_feature = {}
        h_sent_lm = text2lm(sent, clean=True)
        # normalization constant is per-sentence; hoisted out of the
        # field loop (it was recomputed per field in the original)
        q_len = float(max(sum([item[1] for item in h_sent_lm.items()]), 1))
        for field, lm in zip(self.l_target_fields, l_field_doc_lm):
            r_model = RetrievalModel()
            r_model.set_from_raw(
                h_sent_lm, lm,
                corpus_stat.h_field_df.get(field, None),
                corpus_stat.h_field_total_df.get(field, None),
                corpus_stat.h_field_avg_len.get(field, None))
            l_retrieval_score = r_model.scores()
            h_per_sent_feature.update(
                dict([(field + name, score / q_len)
                      for name, score in l_retrieval_score]))
        l_h_per_sent_feature.append(h_per_sent_feature)

    # last entry is the concatenated sentence -> 'Conca' features;
    # the individual sentences are max-pooled
    h_feature = max_pool_feature(l_h_per_sent_feature[:-1])
    h_feature.update(add_feature_prefix(l_h_per_sent_feature[-1], 'Conca'))
    return h_feature
def _desp_passage_features(self, e_id, l_grid, field):
    """Score the entity description against the pooled qe grid sentences.

    The entity's description text plays the role of the query and the
    concatenated grid sentences form the passage. The 'lm_twoway' score is
    deliberately excluded. Features are prefixed 'DespPassage'.

    :param e_id: entity id, used to look up the description
    :param l_grid: grids mentioning the entity
    :param field: field name, used to look up corpus statistics
    :return: feature dict
    """
    passage = ' '.join(grid['sent'] for grid in l_grid)
    desp_lm = text2lm(self.resource.h_e_desp.get(e_id, ""))
    passage_lm = text2lm(passage)

    corpus_stat = self.resource.corpus_stat
    r_model = RetrievalModel()
    r_model.set_from_raw(
        desp_lm, passage_lm,
        corpus_stat.h_field_df.get(field, None),
        corpus_stat.h_field_total_df.get(field, None),
        corpus_stat.h_field_avg_len.get(field, None)
    )
    h_score = dict(r_model.scores())
    del h_score['lm_twoway']
    return add_feature_prefix(h_score, 'DespPassage')
def _grid_retrieval(self, qe, h_field_lm, doc_info, field):
    """Averaged retrieval scores of qe-bearing grid sentences vs the field LM.

    Each grid sentence containing qe is scored against the field language
    model; every score is divided by the total number of grids in the field
    and the results are sum-pooled, i.e. averaged over all grids with
    non-qe grids counting as zero. Features are prefixed 'grid_retrieval'.

    :param qe: query entity id
    :param h_field_lm: language model of the whole field
    :param doc_info: document info holding the entity grid
    :param field: field name
    :return: feature dict
    """
    l_grid = doc_info.get(E_GRID_FIELD, {}).get(field, [])
    z = float(len(l_grid))
    l_h_scores = []
    for grid in l_grid:
        s_grid_e = set(ana['id'] for ana in grid['spot'])
        if qe not in s_grid_e:
            continue
        sent_lm = text2lm(grid['sent'], clean=True)
        h_scores = dict(
            (k, v / z)
            for k, v in self._extract_retrieval_scores(sent_lm, h_field_lm, field)
        )
        l_h_scores.append(h_scores)
    return add_feature_prefix(sum_pool_feature(l_h_scores), 'grid_retrieval')
def _qe_grid(self, q_info, qe, doc_info, field):
    """Retrieval features of the query vs qe's grid sentences in one field.

    Collects the field's grid sentences that mention qe, joins them into a
    pseudo-passage, and scores the query against it ('QEGrid_' prefix).

    :param q_info: query info (uses qid and the query text field)
    :param qe: query entity id
    :param doc_info: document info holding the entity grid
    :param field: field name
    :return: feature dict
    """
    # NOTE: the original also indexed self.h_qe_idx / self.l_h_e_nlss_idx
    # here but never used the result; the dead lookup is removed (it could
    # even raise a spurious KeyError for an unseen qe).
    l_qe_grid = []
    for grid in doc_info.get(E_GRID_FIELD, {}).get(field, []):
        s_grid_e = set(ana['id'] for ana in grid['spot'])
        if qe in s_grid_e:
            l_qe_grid.append(grid['sent'])
    logging.info('q [%s] e [%s] doc [%s] has [%d] qe grid',
                 q_info['qid'], qe, doc_info['docno'], len(l_qe_grid))
    qe_grid_lm = text2lm(' '.join(l_qe_grid), clean=True)
    q_lm = text2lm(q_info[QUERY_FIELD])
    h_feature = {}
    h_qe_grid_scores = dict(
        self._extract_retrieval_scores(q_lm, qe_grid_lm, field))
    h_feature.update(add_feature_prefix(h_qe_grid_scores, 'QEGrid_'))
    return h_feature
def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
    """Run the per-field NLSS grid matching for one entity.

    Pre-computes BOW and embedding forms of the NLSS once, then matches
    them against the entity grid of every target field present in the
    document, prefixing each field's features with the field name.

    :param q_info: query info (unused here beyond signature parity)
    :param ana: query entity annotation
    :param doc_info: document info holding the entity grids
    :param l_qe_nlss: NLSS of the query entity
    :return: feature dict
    """
    h_e_grid = doc_info.get(E_GRID_FIELD, {})
    l_nlss_bow = self._form_nlss_bow(l_qe_nlss)
    l_nlss_emb = self._form_nlss_emb(l_qe_nlss)

    h_this_feature = {}
    for field in self.l_target_fields:
        if field not in h_e_grid:
            continue
        h_field_feature = self._extract_per_entity_per_nlss_per_field(
            ana, doc_info, l_qe_nlss, h_e_grid.get(field, []),
            l_nlss_bow, l_nlss_emb)
        h_this_feature.update(
            add_feature_prefix(h_field_feature, field + '_'))
    return h_this_feature
def _calc_grid_scores(self, l_grid, doc_lm): """ sent -> e scores include: frequency: emb_sim: desp_emb: desp_bow: gloss_emb: gloss_bow: :param l_grid: :return: for grid->'entity'->['id': e id, 'name':score], grid_score = {name:score} """ logging.info('start calculating grid scores') for grid in l_grid: l_e = [ana['id'] for ana in grid.get(SPOT_FIELD)] h_e_tf = term2lm(l_e) grid_sent = grid['sent'] grid_lm = text2lm(grid_sent) grid_emb = avg_embedding(self.resource.embedding, grid_sent) l_e_score = [] for e, tf in h_e_tf.items(): h_e_score = {'id': e, 'freq': tf} h_e_score['uw_emb'] = self._e_grid_emb(e, grid_emb) # h_e_score['gloss_emb'] = self._e_gloss_emb(e, grid_emb) # h_e_score['gloss_bow'] = self._e_gloss_bow(e, grid_lm) h_e_score['desp_emb'] = self._e_desp_emb(e, grid_emb) h_e_score['desp_bow'] = self._e_desp_bow(e, grid_lm) h_e_score['ESA'] = self._e_desp_bow(e, doc_lm) l_score = self._e_desp_retrieval(e, grid_lm) h_e_score.update(add_feature_prefix(dict(l_score), 'desp_')) l_e_score.append(h_e_score) grid['e_score'] = l_e_score return l_grid
def extract_pair(self, q_info, doc_info):
    """Pair-level features: base extractor output plus global grid coherence.

    Delegates to the parent class, then, when 'coherence' is enabled, adds
    document-level grid coherence features under the extractor name prefix.

    :param q_info: query info
    :param doc_info: document info
    :return: feature dict
    """
    h_feature = super(EntityAnchorFeature, self).extract_pair(q_info, doc_info)
    if 'coherence' in self.l_feature:
        h_feature.update(add_feature_prefix(
            self._global_grid_coherence(doc_info), self.feature_name_pre))
    return h_feature
def _qe_grid_coherence(self, qe, l_grid):
    """Coherence features of qe over the entity grid, prefixed 'Qe'.

    Combines the single-entity coherence signals with the entity-pair
    coherence signals.

    :param qe: query entity id
    :param l_grid: entity grid of the field
    :return: feature dict
    """
    h_raw = dict(self._single_e_coherence(qe, l_grid))
    h_raw.update(self._pair_e_coherence(l_grid, qe))
    return add_feature_prefix(h_raw, 'Qe')