Ejemplo n.º 1
0
    def extract_per_entity(self, q_info, ana, doc_info):
        """
        extract the features of one query entity against one document

        for every field in self.l_target_fields the grids of that field are
        filtered with self._filter_e_grid, and each feature group enabled in
        self.l_feature is computed and merged with its name prefixed by
        field + '_'
        :param q_info: query info, passed through to the passage features
        :param ana: the query entity annotation, only ana['id'] is read here
        :param doc_info: doc info, grids are read from doc_info[E_GRID_FIELD][field]
        :return: h_feature, feature name -> value
        """
        entity_id = ana['id']
        h_feature = {}

        def _absorb(h_group, field_name):
            # merge one feature group, names prefixed with "<field>_"
            h_feature.update(add_feature_prefix(h_group, field_name + '_'))

        for field in self.l_target_fields:
            l_field_grid = doc_info.get(E_GRID_FIELD, {}).get(field, [])
            l_entity_grid = self._filter_e_grid(entity_id, l_field_grid)
            field_lm = text2lm(doc_info.get(field, ""))

            # grid scores are computed first; the 'grid' group below reads them
            if 'grid' in self.l_feature:
                l_entity_grid = self._calc_grid_scores(l_entity_grid, field_lm)

            if 'passage' in self.l_feature:
                _absorb(self._entity_passage_features(q_info, l_entity_grid, field), field)
            if 'desp' in self.l_feature:
                _absorb(self._desp_passage_features(entity_id, l_entity_grid, field), field)
            if 'grid' in self.l_feature:
                _absorb(self._grid_score_features(entity_id, l_entity_grid), field)
            # coherence works on the unfiltered grids, and only for the body field
            if 'coherence' in self.l_feature and field == body_field:
                _absorb(self._qe_grid_coherence(entity_id, l_field_grid), field)
            if 'esr' in self.l_feature:
                _absorb(self._local_esr(entity_id, l_entity_grid), field)
        return h_feature
Ejemplo n.º 2
0
    def _extract_per_entity_per_nlss_per_field(self, ana, doc_info, l_qe_nlss,
                                               l_e_grid, l_nlss_bow,
                                               l_nlss_emb):
        """
        similarity features between the grids of one entity and its nlss

        steps:
            keep the grids selected by self._filter_e_grid for ana['id']
            form the bow and embedding representation of those grids
            compute the grid x nlss similarities (bow and embedding)
            pool each similarity result with self._pool_grid_nlss_sim
        :param ana: the entity annotation, only ana['id'] is read
        :param doc_info: not used by the active code (only by the disabled
            intermediate logging)
        :param l_qe_nlss: nlss of qe, not used by the active code either
        :param l_e_grid: grid of this field
        :param l_nlss_bow: pre calc bow of nlss
        :param l_nlss_emb: pre calc emb of nlss
        :return: h_feature, pooled features prefixed with 'BOW_' and 'Emb_'
        """
        l_entity_grid = self._filter_e_grid(ana['id'], l_e_grid)
        l_grid_bow = self._form_grid_bow(l_entity_grid)
        l_grid_emb = self._form_grid_emb(l_entity_grid)

        m_sim_bow = self._calc_bow_trans(l_grid_bow, l_nlss_bow)
        m_sim_emb = self._calc_emb_trans(l_grid_emb, l_nlss_emb)

        # intermediate result logging is disabled:
        # if self.intermediate_data_out_name:
        #     self._log_intermediate_res(ana, doc_info, l_this_e_grid, l_qe_nlss, m_bow_sim, m_emb_sim)

        h_pooled_bow = self._pool_grid_nlss_sim(m_sim_bow)
        h_pooled_emb = self._pool_grid_nlss_sim(m_sim_emb)

        h_feature = dict(add_feature_prefix(h_pooled_bow, 'BOW_'))
        h_feature.update(add_feature_prefix(h_pooled_emb, 'Emb_'))
        return h_feature
Ejemplo n.º 3
0
    def extract_per_entity(self, q_info, ana, doc_info):
        """
        extract the features of one query entity against one document

        when the query id changes, the per query nlss info is rebuilt from
        the first nlss resource. then, for every target field, each feature
        group listed in self.l_features is computed and merged with its
        names prefixed by field + '_'
        :param q_info: query info, q_info['qid'] is read here
        :param ana: the query entity annotation, only ana['id'] is read here
        :param doc_info: doc info, doc_info['docno'] is read for logging
        :return: h_feature, feature name -> value
        """
        qe = ana['id']
        qid = q_info['qid']
        logging.info('start extracting [%s]-[%s]-[%s]',
                     qid, qe, doc_info['docno'])

        # the nlss info is kept per query, rebuild it when the query changes
        if qid != self.current_qid:
            self.current_qid = qid
            self._construct_e_nlss_cash_info(q_info, self.resource.l_h_nlss[0])

        h_feature = dict()
        for field in self.l_target_fields:
            l_field_ana = form_boe_per_field(doc_info, field)
            h_field_lm = text2lm(doc_info.get(field, ""), clean=True)

            # (feature group, lazy extractor); the order fixes the merge order.
            # each extractor is called within this same iteration, so binding
            # the loop variables in the closures is safe
            l_group_extractor = [
                ('emb_vote',
                 lambda: self._connected_emb_vote(qe, l_field_ana)),
                ('edge_cnt',
                 lambda: self._edge_cnt(qe, l_field_ana)),
                ('edge_retrieval',
                 lambda: self._edge_retrieval(qe, l_field_ana, h_field_lm, field)),
                ('local_grid',
                 lambda: self._local_grid(q_info, qe, l_field_ana, doc_info, field)),
                ('qe_grid',
                 lambda: self._qe_grid(q_info, qe, doc_info, field)),
                ('nlss_grid',
                 lambda: self._nlss_grid(q_info, qe, l_field_ana, doc_info, field)),
                ('ltr_base',
                 lambda: self._ltr_baseline(q_info, h_field_lm, field)),
                ('local_vote',
                 lambda: self._local_vote(q_info, qe, l_field_ana, doc_info, field)),
                ('grid_retrieval',
                 lambda: self._grid_retrieval(qe, h_field_lm, doc_info, field)),
                ('edge_grid',
                 lambda: self._edge_grid(qe, doc_info, field)),
            ]
            for group, extractor in l_group_extractor:
                if group in self.l_features:
                    h_feature.update(
                        add_feature_prefix(extractor(), field + '_'))

        return h_feature
Ejemplo n.º 4
0
    def _grid_score_features(self, qe, l_grid):
        """
        pool the per grid entity scores of qe into features

        for every score name in self.l_grid_scores three features are made:
            Sum: sum over grids of the leading entity's score
            Max: max over grids of the leading entity's score
            NormSum: sum over grids of leading score / max(sum of all scores, 1)
        the leading entity of a grid is qe when qe is in grid['e_score']
        :param qe: query entity id
        :param l_grid: grids, each one with an 'e_score' list of score dicts
        :return: h_feature
        """
        # stable sort on "is not qe" moves the qe entries to the front and
        # keeps the original relative order of everything else
        ll_ordered_e_score = [
            sorted(grid['e_score'], key=lambda h_score: h_score['id'] != qe)
            for grid in l_grid
        ]

        h_feature = dict()
        for name in self.l_grid_scores:
            # a score missing from an entity counts as 0
            ll_score = [
                [h_score.get(name, 0) for h_score in l_e_score]
                for l_e_score in ll_ordered_e_score
            ]
            # keep one 0 so that max() is defined when there is no grid
            l_lead_score = [l_score[0] for l_score in ll_score] or [0]
            l_norm_score = [
                l_score[0] / float(max(sum(l_score), 1.0))
                for l_score in ll_score
            ]
            h_pooled = {
                'Sum': sum(l_lead_score),
                'Max': max(l_lead_score),
                'NormSum': sum(l_norm_score),
            }
            h_feature.update(add_feature_prefix(h_pooled, name))
        return h_feature
Ejemplo n.º 5
0
    def _entity_passage_features(self, q_info, l_grid, field):
        """
        retrieval scores of the query against the sentences of the grids

        the 'sent' texts of all grids are joined into one passage, which is
        scored against q_info['query'] by a RetrievalModel set up with the
        corpus statistics of the field
        :param q_info: query info, q_info['query'] is read
        :param l_grid: grids, each one with a 'sent' text
        :param field: field used to look up the corpus statistics
        :return: h_feature, score names prefixed with 'EntityPassage'
        """
        l_sent = [grid['sent'] for grid in l_grid]
        query_lm = text2lm(q_info['query'])
        passage_lm = text2lm(' '.join(l_sent))

        r_model = RetrievalModel()
        corpus_stat = self.resource.corpus_stat
        r_model.set_from_raw(
            query_lm,
            passage_lm,
            corpus_stat.h_field_df.get(field, None),
            corpus_stat.h_field_total_df.get(field, None),
            corpus_stat.h_field_avg_len.get(field, None),
        )

        # NOTE: a per sentence variant (score every grid sentence on its own,
        # then max pool) used to live here and is disabled; only the joined
        # passage is scored
        return add_feature_prefix(dict(r_model.scores()), 'EntityPassage')
Ejemplo n.º 6
0
    def extract_per_entity(self, q_info, ana, doc_info):
        """
        extract the features of one query entity, once per nlss resource

        for the p-th resource in self.resource.l_h_nlss, the nlss of the
        entity are selected with the p-th selection method, features are
        extracted from the selected nlss, and their names are prefixed with
        selection + resource name + '_'
        :param q_info: query info
        :param ana: one q ana, ana['id'] is the entity id
        :param doc_info:
        :return: h_feature, feature name -> value
        """

        h_feature = dict()
        e_id = ana['id']
        ll_qe_nlss = [
            h_nlss.get(e_id, []) for h_nlss in self.resource.l_h_nlss
        ]

        # enumerate gives the same indices as xrange(len(...)) and also runs
        # on Python 3, where xrange does not exist
        for p, l_qe_nlss in enumerate(ll_qe_nlss):
            if isinstance(l_qe_nlss, str):
                # this resource keeps the nlss as a JSON string, decode it
                l_qe_nlss = json.loads(l_qe_nlss)
            # the name and selection lists are parallel to l_h_nlss
            nlss_name = self.resource.l_nlss_name[p]
            nlss_select = self.l_nlss_selection[p]

            l_this_nlss = self._select_nlss(q_info, ana, doc_info, nlss_select,
                                            l_qe_nlss)
            h_this_nlss_feature = self._extract_per_entity_via_nlss(
                q_info, ana, doc_info, l_this_nlss)
            h_feature.update(
                add_feature_prefix(h_this_nlss_feature,
                                   nlss_select + nlss_name + '_'))
        return h_feature
Ejemplo n.º 7
0
 def extract(self, qid, docno, h_q_info, h_doc_info):
     """
     extract the features of one query-document pair, field by field

     :param qid: not used here
     :param docno: not used here
     :param h_q_info: query info, passed to self.extract_per_field
     :param h_doc_info: doc info, passed to self.extract_per_field
     :return: h_feature, names prefixed with self.feature_name_pre + '_' + field
     """
     h_feature = {}
     for field in self.l_fields:
         h_field_feature = self.extract_per_field(h_q_info, h_doc_info, field)
         field_prefix = self.feature_name_pre + '_' + field
         h_feature.update(add_feature_prefix(h_field_feature, field_prefix))
     return h_feature
Ejemplo n.º 8
0
    def _local_grid(self, q_info, qe, l_field_ana, doc_info, field):
        """
        retrieval scores of the query against two groups of grid sentences
            1) grids that include qe
            2) grids that include a tail e, i.e. an entity of the field
               that is in the nlss index of qe
        :param q_info: query info
        :param qe: query entity id
        :param l_field_ana: annotations of the field, their 'id' is read
        :param doc_info: doc info, grids are read from doc_info[E_GRID_FIELD][field]
        :param field:
        :return: h_feature, scores prefixed with 'QEGrid_' and 'NlssGrid_'
        """
        qe_pos = self.h_qe_idx[qe]
        h_e_nlss_idx = self.l_h_e_nlss_idx[qe_pos]
        l_tail_e = [ana['id'] for ana in l_field_ana if ana['id'] in h_e_nlss_idx]

        l_qe_sent = []
        l_tail_sent = []
        for grid in doc_info.get(E_GRID_FIELD, {}).get(field, []):
            s_grid_e = set(ana['id'] for ana in grid['spot'])
            if qe in s_grid_e:
                l_qe_sent.append(grid['sent'])
            # a grid is kept at most once, however many tail e it includes
            if any(tail_e in s_grid_e for tail_e in l_tail_e):
                l_tail_sent.append(grid['sent'])
        logging.info('q [%s] e [%s] doc [%s] has [%d] qe grid, [%d] nlss grid',
                     q_info['qid'], qe, doc_info['docno'],
                     len(l_qe_sent), len(l_tail_sent))

        qe_sent_lm = text2lm(' '.join(l_qe_sent), clean=True)
        tail_sent_lm = text2lm(' '.join(l_tail_sent), clean=True)
        q_lm = text2lm(q_info[QUERY_FIELD])

        h_qe_score = dict(self._extract_retrieval_scores(q_lm, qe_sent_lm, field))
        h_tail_score = dict(self._extract_retrieval_scores(q_lm, tail_sent_lm, field))

        h_feature = dict(add_feature_prefix(h_qe_score, 'QEGrid_'))
        h_feature.update(add_feature_prefix(h_tail_score, 'NlssGrid_'))
        return h_feature
Ejemplo n.º 9
0
    def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
        """
        extract e-d features

        do:
            get top k nlss
            form doc lm
            retrieval, as a whole or individually
            sum up to features
        every retrieval score is divided by the length of the sentence used
        as the query (sum of its lm counts, at least 1)
        :param q_info: query info
        :param ana:
        :param doc_info:
        :param l_qe_nlss:
        :return: h_feature: entity features for this nlss set; the max pooled
            per sentence scores, plus the scores of the concatenation of all
            sentences prefixed with 'Conca'
        """

        l_top_nlss = self._find_top_k_nlss_for_q(q_info, ana, l_qe_nlss)

        l_top_sent = [nlss[0] for nlss in l_top_nlss]
        # the place holder has to be added BEFORE the concatenation is
        # appended: afterwards the list is never empty and the guard is dead,
        # which left the max pooling below with nothing to pool
        if not l_top_sent:
            l_top_sent.append('')  # place holder for empty nlss e
        # last element: all sentences as a whole
        l_top_sent.append(' '.join(l_top_sent))

        l_h_per_sent_feature = []
        l_field_doc_lm = [
            text2lm(doc_info.get(field, ""), clean=True)
            for field in self.l_target_fields
        ]
        for sent in l_top_sent:
            h_per_sent_feature = {}
            h_sent_lm = text2lm(sent, clean=True)
            # the sentence length does not depend on the field, compute it once
            q_len = float(max(sum(h_sent_lm.values()), 1))
            for field, lm in zip(self.l_target_fields, l_field_doc_lm):
                r_model = RetrievalModel()
                r_model.set_from_raw(
                    h_sent_lm, lm,
                    self.resource.corpus_stat.h_field_df.get(field, None),
                    self.resource.corpus_stat.h_field_total_df.get(
                        field, None),
                    self.resource.corpus_stat.h_field_avg_len.get(field, None))
                l_retrieval_score = r_model.scores()

                h_per_sent_feature.update(
                    dict([(field + name, score / q_len)
                          for name, score in l_retrieval_score]))
            l_h_per_sent_feature.append(h_per_sent_feature)

        # individual sentences are max pooled, the concatenation is kept as is
        h_max_feature = max_pool_feature(l_h_per_sent_feature[:-1])
        h_conca_feature = add_feature_prefix(l_h_per_sent_feature[-1], 'Conca')

        h_feature = h_max_feature
        h_feature.update(h_conca_feature)
        return h_feature
Ejemplo n.º 10
0
 def _desp_passage_features(self, e_id, l_grid, field):
     """
     retrieval scores of the entity description against the grid sentences

     the description is read from self.resource.h_e_desp ("" when the
     entity has none) and used as the query; the 'sent' texts of all grids
     are joined into the passage. the 'lm_twoway' score is dropped
     :param e_id: entity id
     :param l_grid: grids, each one with a 'sent' text
     :param field: field used to look up the corpus statistics
     :return: h_feature, score names prefixed with 'DespPassage'
     """
     l_sent = [grid['sent'] for grid in l_grid]
     desp_lm = text2lm(self.resource.h_e_desp.get(e_id, ""))
     passage_lm = text2lm(' '.join(l_sent))

     r_model = RetrievalModel()
     corpus_stat = self.resource.corpus_stat
     r_model.set_from_raw(
         desp_lm,
         passage_lm,
         corpus_stat.h_field_df.get(field, None),
         corpus_stat.h_field_total_df.get(field, None),
         corpus_stat.h_field_avg_len.get(field, None),
     )

     h_score = dict(r_model.scores())
     # raises KeyError when the model has no 'lm_twoway' score
     h_score.pop('lm_twoway')
     return add_feature_prefix(h_score, 'DespPassage')
Ejemplo n.º 11
0
 def _grid_retrieval(self, qe, h_field_lm, doc_info, field):
     """
     use every grid sentence that includes qe as a query against the field

     the scores of each such sentence are divided by the number of ALL
     grids of the field (not only those with qe) and then sum pooled
     :param qe: query entity id
     :param h_field_lm: lm of the field text
     :param doc_info: doc info, grids are read from doc_info[E_GRID_FIELD][field]
     :param field:
     :return: h_feature, names prefixed with 'grid_retrieval'
     """
     l_field_grid = doc_info.get(E_GRID_FIELD, {}).get(field, [])
     # only used inside the loop, so an empty grid list never divides by 0
     n_grid = float(len(l_field_grid))

     l_h_sent_score = []
     for grid in l_field_grid:
         if qe in set(ana['id'] for ana in grid['spot']):
             sent_lm = text2lm(grid['sent'], clean=True)
             l_score = self._extract_retrieval_scores(sent_lm, h_field_lm, field)
             l_h_sent_score.append(
                 dict((name, score / n_grid) for name, score in l_score))

     return add_feature_prefix(sum_pool_feature(l_h_sent_score), 'grid_retrieval')
Ejemplo n.º 12
0
    def _qe_grid(self, q_info, qe, doc_info, field):
        """
        retrieval scores of the query against the grid sentences with qe

        :param q_info: query info
        :param qe: query entity id
        :param doc_info: doc info, grids are read from doc_info[E_GRID_FIELD][field]
        :param field:
        :return: h_feature, scores prefixed with 'QEGrid_'
        """
        # the looked up index is not used below; the lookups are kept because
        # they raise when qe is unknown
        qe_pos = self.h_qe_idx[qe]
        h_e_nlss_idx = self.l_h_e_nlss_idx[qe_pos]

        l_qe_sent = []
        for grid in doc_info.get(E_GRID_FIELD, {}).get(field, []):
            if qe in set(ana['id'] for ana in grid['spot']):
                l_qe_sent.append(grid['sent'])
        logging.info('q [%s] e [%s] doc [%s] has [%d] qe grid',
                     q_info['qid'], qe, doc_info['docno'], len(l_qe_sent))

        qe_sent_lm = text2lm(' '.join(l_qe_sent), clean=True)
        q_lm = text2lm(q_info[QUERY_FIELD])
        h_score = dict(self._extract_retrieval_scores(q_lm, qe_sent_lm, field))
        return dict(add_feature_prefix(h_score, 'QEGrid_'))
Ejemplo n.º 13
0
    def _extract_per_entity_via_nlss(self, q_info, ana, doc_info, l_qe_nlss):
        """
        grid x nlss features of one entity, for every target field with grids

        the bow and embedding forms of the nlss are built once and shared by
        all fields; fields without an entry in doc_info[E_GRID_FIELD] are
        skipped
        :param q_info: not used here
        :param ana:
        :param doc_info:
        :param l_qe_nlss:
        :return: h_feature, names prefixed with field + '_'
        """

        h_field_grid = doc_info.get(E_GRID_FIELD, {})
        l_bow_of_nlss = self._form_nlss_bow(l_qe_nlss)
        l_emb_of_nlss = self._form_nlss_emb(l_qe_nlss)

        h_feature = dict()
        for field in self.l_target_fields:
            if field in h_field_grid:
                h_one_field = self._extract_per_entity_per_nlss_per_field(
                    ana, doc_info, l_qe_nlss, h_field_grid[field],
                    l_bow_of_nlss, l_emb_of_nlss)
                h_feature.update(add_feature_prefix(h_one_field, field + '_'))
        return h_feature
Ejemplo n.º 14
0
    def _calc_grid_scores(self, l_grid, doc_lm):
        """
        sent -> e scores, attached to every grid in place
        scores of each entity spotted in the grid:
            freq: count of the entity in the grid
            uw_emb: self._e_grid_emb with the sentence embedding
            desp_emb: self._e_desp_emb with the sentence embedding
            desp_bow: self._e_desp_bow with the sentence lm
            ESA: self._e_desp_bow with the doc lm
            desp_*: the scores of self._e_desp_retrieval with the sentence lm
        (gloss_emb and gloss_bow are disabled)
        :param l_grid: grids, each one with a 'sent' text and a SPOT_FIELD list
        :param doc_lm: lm of the whole field text, used by ESA
        :return: the same l_grid, with grid['e_score'] = [{'id': e id, name: score}]
        """
        logging.info('start calculating grid scores')
        for grid in l_grid:
            h_e_tf = term2lm([ana['id'] for ana in grid.get(SPOT_FIELD)])
            sent = grid['sent']
            sent_lm = text2lm(sent)
            sent_emb = avg_embedding(self.resource.embedding, sent)

            l_e_score = []
            for e_id, e_tf in h_e_tf.items():
                h_e_score = {'id': e_id, 'freq': e_tf}
                # (score name, scorer, what the entity is compared with)
                for name, scorer, target in (
                        ('uw_emb', self._e_grid_emb, sent_emb),
                        ('desp_emb', self._e_desp_emb, sent_emb),
                        ('desp_bow', self._e_desp_bow, sent_lm),
                        ('ESA', self._e_desp_bow, doc_lm),
                ):
                    h_e_score[name] = scorer(e_id, target)
                h_desp_retrieval = dict(self._e_desp_retrieval(e_id, sent_lm))
                h_e_score.update(add_feature_prefix(h_desp_retrieval, 'desp_'))
                l_e_score.append(h_e_score)
            grid['e_score'] = l_e_score

        return l_grid
Ejemplo n.º 15
0
 def extract_pair(self, q_info, doc_info):
     """
     features of the parent class, plus the global grid coherence features

     :param q_info: query info
     :param doc_info: doc info
     :return: h_feature; the coherence features are only added when
         'coherence' is in self.l_feature, prefixed with self.feature_name_pre
     """
     h_feature = super(EntityAnchorFeature, self).extract_pair(q_info, doc_info)
     if 'coherence' not in self.l_feature:
         return h_feature
     h_coherence = self._global_grid_coherence(doc_info)
     h_feature.update(add_feature_prefix(h_coherence, self.feature_name_pre))
     return h_feature
Ejemplo n.º 16
0
 def _qe_grid_coherence(self, qe, l_grid):
     """
     coherence features of one query entity over the grids

     :param qe: query entity id
     :param l_grid: grids
     :return: h_feature, the single e and pair e coherence features merged
         (pair features win on a name clash), prefixed with 'Qe'
     """
     h_coherence = dict(self._single_e_coherence(qe, l_grid))
     h_coherence.update(self._pair_e_coherence(l_grid, qe))
     return add_feature_prefix(h_coherence, 'Qe')