Example #1
0
def fetch_doc_text(trec_rank_in, doc_text_in, out_name):
    """
    Pull the text of every ranked document out of a tab-separated text file.

    Each line of doc_text_in is stripped and split on tabs; the first column
    is used as the docno and the last column as the text. Lines with fewer
    than two columns are logged and counted as errors. For every docno that
    appears in the ranking, one line ``docno<TAB>json`` is written, where the
    json object has the keys 'docno', 'bodyText' and 'title' (the first ten
    whitespace-separated tokens of the text).

    :param trec_rank_in: ranking file, read with load_trec_ranking_with_score
    :param doc_text_in: tab-separated document text file
    :param out_name: path of the output file
    :return: None
    """
    l_q_ranking = load_trec_ranking_with_score(trec_rank_in)
    # flatten straight into a set; sum(list_of_lists, []) is quadratic
    s_target_docno = {
        docno for __, rank in l_q_ranking for docno, __ in rank
    }
    logging.info('[%d] target docno', len(s_target_docno))
    err_cnt = 0
    cnt = 0
    # context managers close both files even if a line fails to process
    with open(out_name, 'w') as out, open(doc_text_in) as doc_in:
        for line in doc_in:
            cols = line.strip().split('\t')
            if len(cols) < 2:
                logging.warning('text format error %s', json.dumps(cols))
                err_cnt += 1
                continue
            docno, text = cols[0], cols[-1]
            if docno not in s_target_docno:
                continue
            logging.info('find [%s]', docno)
            h = {
                'docno': docno,
                'bodyText': text,
                'title': ' '.join(text.split()[:10]),
            }
            out.write(docno + '\t' + json.dumps(h) + '\n')
            cnt += 1
    logging.info('finished [%s], found [%d], err [%d]', doc_text_in, cnt, err_cnt)
Example #2
0
    def pipe_extract(self, trec_rank_in, out_name):
        """
        the main pipe run

        For every (query, doc) pair in the ranking, each extractor in
        self.l_feature_extractor is run and its result is folded into one
        feature dict with self._mul_update. The ranking score is stored
        under 'base_score' and the dict is written as one json line.

        :param trec_rank_in: trec rank format input candidate q-document pairs to extract features
        :param out_name: the extracted matching features, one json per line
        :return:
        """

        l_q_ranking = load_trec_ranking_with_score(trec_rank_in)
        # the context manager guarantees the output is flushed and closed,
        # also when an extractor raises half way through
        with open(out_name, 'w') as out:
            for q, ranking in l_q_ranking:
                logging.info('start extracting for [%s]', q)
                q_info = self.h_q_info[q]
                for docno, score in ranking:
                    logging.info('with doc [%s-%s]', q, docno)
                    # docs without loaded info fall back to a docno-only dict
                    d_info = self.h_d_info.get(docno, {'docno': docno})
                    h_matched_feature = dict()
                    for this_extractor in self.l_feature_extractor:
                        h_this_matched_feature = this_extractor.extract(
                            q_info, d_info, self.resource)
                        h_matched_feature = self._mul_update(
                            h_matched_feature, h_this_matched_feature)

                    h_matched_feature['base_score'] = score
                    out.write(json.dumps(h_matched_feature) + '\n')
                    logging.info('[%s-%s] match feature extracted', q, docno)

                logging.info('q [%s] match features extracted', q)
        logging.info('ranking pairs [%s] matching features extracted to [%s]',
                     trec_rank_in, out_name)
        return
 def __init__(self, **kwargs):
     """
     Initialise the parent class, reset the statistics dicts and load the
     reference rankings.

     One dict is built per path in self.l_ref_rank, from the pairs returned
     by load_trec_ranking_with_score, in the same order as the paths.
     """
     super(LeToRQDocERefRankFeatureExtractorC, self).__init__(**kwargs)
     self.h_corpus_stat = dict()
     self.h_field_df = dict()
     l_h_ref = []
     for ref_rank_path in self.l_ref_rank:
         l_q_rank = load_trec_ranking_with_score(ref_rank_path)
         l_h_ref.append(dict(l_q_rank))
     self.l_h_q_ref_ranking = l_h_ref
Example #4
0
 def test_data_reader(self, in_name, s_target_qid=None):
     """
     Read testing data in the format selected by self.io_format.

     'raw' treats in_name as a ranking file and goes through
     pointwise_reader; any other format loads from the 'pointwise'
     entry under in_name with load_data.

     :param in_name: ranking file ('raw') or data directory (otherwise)
     :param s_target_qid: passed through unchanged to the reader
     :return: x, y
     """
     if self.io_format != 'raw':
         pointwise_in = os.path.join(in_name, 'pointwise')
         x, y = load_data(pointwise_in, self.k_nrm.s_target_inputs,
                          s_target_qid)
         return x, y

     l_q_rank = load_trec_ranking_with_score(in_name)
     x, y = pointwise_reader(l_q_rank, self.h_qrel, self.h_q_info,
                             self.doc_info_in, s_target_qid)
     return x, y
Example #5
0
def facc1_prf(trec_in, facc1_in, out_name):
    """
    Run prf over the ranking of every query and dump the results.

    :param trec_in: ranking file, read with load_trec_ranking_with_score
    :param facc1_in: input of load_facc1_dict
    :param out_name: target of dump_trec_ranking_with_score
    :return: None
    """
    l_q_rank = load_trec_ranking_with_score(trec_in)
    h_doc_olm = load_facc1_dict(facc1_in)

    # one [q, prf result] pair per query, in ranking order
    l_q_e_rank = [[q, prf(rank, h_doc_olm)] for q, rank in l_q_rank]

    dump_trec_ranking_with_score(l_q_e_rank, out_name)
 def _load_candidate_doc(self):
     """
     Load the candidate ranking from self.trec_rank and index it.

     Keeps the first self.doc_per_q entries per query, then fills
     self.h_q_rank (q -> truncated ranking), self.h_q_meta (q -> doc count
     and a zeroed 'avg_doc_len') and self.h_d_l_q (doc -> list of queries).
     """
     l_q_rank = []
     for q, full_rank in load_trec_ranking_with_score(self.trec_rank):
         l_q_rank.append([q, full_rank[:self.doc_per_q]])
     self.h_q_rank = dict(l_q_rank)

     for q, rank in l_q_rank:
         self.h_q_meta[q] = {'nb_d': len(rank), 'avg_doc_len': 0}

     for q, rank in l_q_rank:
         for d, __ in rank:
             # start an empty query list the first time a doc is seen
             self.h_d_l_q.setdefault(d, []).append(q)
     logging.info('load candidate doc done')
Example #7
0
    def _load_data(self):
        """
        Pre-load everything feature extraction needs from the configured paths.

        Fills _h_qrel, _h_qid_q_info and _h_q_doc_score (top rank_top_k
        candidates per query). When ext_base_rank is set, its scores are also
        stored in h_ext_base under the key "q<TAB>doc".
        :return:
        """
        self._h_qrel = load_trec_labels_dict(self.qrel_in)
        self._h_qid_q_info = load_json_info(self.q_info_in, key_field='qid')
        l_candidate = load_trec_ranking_with_score(self.q_doc_candidate_in)

        if self.ext_base_rank:
            for q, l_rank in load_trec_ranking_with_score(self.ext_base_rank):
                for doc, score in l_rank:
                    self.h_ext_base[q + '\t' + doc] = score
            logging.info('external base ranking scores loaded [%s]',
                         self.ext_base_rank)

        for qid, ranking_score in l_candidate:
            h_doc_score = dict(ranking_score[:self.rank_top_k])
            self._h_q_doc_score[qid] = h_doc_score
            logging.debug('q [%s] [%d] candidate docs', qid,
                          len(h_doc_score))
        logging.info('feature extraction data pre loaded')
        return
Example #8
0
def mul_load_candidate_doc(in_dir):
    """
    Load every ranking file under a directory and collect the ids in each.

    :param in_dir: directory walked recursively with os.walk; every file
        found is read with load_trec_ranking_with_score
    :return: (l_name, l_s_id): the file base names and, in matching order,
        one set per file holding its doc ids together with its query ids
    """
    l_name, l_s_id = [], []
    logging.info('load doc partitions')
    for dir_name, _sub_dirs, f_names in os.walk(in_dir):
        for f_name in f_names:
            l_q_rank = load_trec_ranking_with_score(
                os.path.join(dir_name, f_name))
            # build the id set in one linear pass; sum(list_of_lists, [])
            # re-copies the accumulated list for every query
            s_id = {doc for _q, rank in l_q_rank for doc, _score in rank}
            s_id.update(q for q, _rank in l_q_rank)
            l_name.append(f_name)
            l_s_id.append(s_id)
            logging.info('[%s][%d] doc', f_name, len(s_id))
    return l_name, l_s_id
Example #9
0
def _is_path(arg):
    """Tell whether arg is a string, i.e. something to load rather than loaded data."""
    try:
        text_types = (str, unicode)  # Python 2: byte and unicode strings
    except NameError:
        text_types = (str,)  # Python 3 has no separate unicode type
    return isinstance(arg, text_types)


def dynamic_load(trec, qrel, q_info, doc_info):
    """
    Accept each resource either as a string to load from or as loaded data.

    String arguments are handed to the matching loader; any other value is
    returned untouched.

    :param trec: string for load_trec_ranking_with_score, or loaded data
    :param qrel: string for load_trec_labels_dict, or loaded data
    :param q_info: string for load_json_info (keyed on 'qid'), or loaded data
    :param doc_info: string for load_json_info (keyed on 'docno'), or loaded data
    :return: l_q_rank, h_qrel, h_q_info, h_doc_info
    """
    l_q_rank = load_trec_ranking_with_score(trec) if _is_path(trec) else trec
    h_qrel = load_trec_labels_dict(qrel) if _is_path(qrel) else qrel
    h_q_info = load_json_info(q_info, 'qid') if _is_path(q_info) else q_info
    if _is_path(doc_info):
        h_doc_info = load_json_info(doc_info, 'docno')
    else:
        h_doc_info = doc_info
    return l_q_rank, h_qrel, h_q_info, h_doc_info
Example #10
0
    def _load_data(self):
        """
        Pre-load everything feature extraction needs from the configured paths.

        Fills _h_qrel, _h_qid_q_info and _h_q_doc_score, the latter with the
        top rank_top_k candidate docs (doc -> score) of every query.
        :return:
        """
        self._h_qrel = load_trec_labels_dict(self.qrel_in)
        self._h_qid_q_info = load_json_info(self.q_info_in, 'qid')

        l_candidate = load_trec_ranking_with_score(self.q_doc_candidate_in)
        for qid, ranking_score in l_candidate:
            h_doc_score = dict(ranking_score[:self.rank_top_k])
            self._h_q_doc_score[qid] = h_doc_score
            logging.debug('q [%s] [%d] candidate docs', qid,
                          len(h_doc_score))
        logging.info('feature extraction data pre loaded')
        return
Example #11
0
    def extract(self):
        """
        Extract features for every (query, doc) pair of self.trec_rank_in
        and dump them.

        Per pair, the feature dict starts with the ranking score under 'base'
        and is updated with the result of extract_pair from each extractor in
        self.l_extractor. The label comes from self.h_qrel and defaults to 0.
        Everything is dumped with dump_svm_from_raw to self.out_name, and
        that call's return value is written as json to
        self.out_name + '_name.json'. Finishes with self._close_extractor().
        :return:
        """
        l_q_rank = load_trec_ranking_with_score(self.trec_rank_in)
        l_qid = []
        l_docno = []
        l_h_feature = []
        l_label = []
        for q, ranking in l_q_rank:
            q_info = self.h_q_info[q]
            logging.info('start extracting q [%s]', q)
            for docno, base_score in ranking:
                # docs without loaded info fall back to a docno-only dict
                doc_info = self.h_doc_info.get(docno, {'docno': docno})
                if isinstance(doc_info, str):
                    # doc info still serialized as a json string; parse it
                    doc_info = json.loads(doc_info)
                label = self.h_qrel.get(q, {}).get(docno, 0)
                h_feature = dict()
                h_feature['base'] = base_score

                for extractor in self.l_extractor:
                    h_feature.update(extractor.extract_pair(q_info, doc_info))

                l_qid.append(q)
                l_docno.append(docno)
                l_h_feature.append(h_feature)
                l_label.append(label)
                logging.debug('[%s][%s] feature %s', q, docno,
                              json.dumps(h_feature))

        logging.info('extraction finished, dumping...')

        h_name = dump_svm_from_raw(self.out_name, l_qid, l_docno, l_label,
                                   l_h_feature)
        logging.info('ranking features dumped to [%s]', self.out_name)
        # close the name file explicitly so it is flushed before we report it
        with open(self.out_name + '_name.json', 'w') as name_out:
            json.dump(h_name, name_out, indent=1)
        logging.info('ranking name dumped to [%s_name.json]', self.out_name)
        self._close_extractor()
        return
Example #12
0
 def __init__(self, **kwargs):
     """
     Initialise the parent class, then load the ranking from
     self.trec_rank_in and the doc info from self.doc_info_in.
     """
     super(BoeRm3, self).__init__(**kwargs)
     l_q_rank = load_trec_ranking_with_score(self.trec_rank_in)
     h_doc_info = load_doc_info_json(self.doc_info_in)
     self.l_q_rank, self.h_doc_info = l_q_rank, h_doc_info
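input:
    trec ranking file, output name prefix, number of queries per file
output:
    one ranking file per chunk of queries, named <output prefix>.xx
    (xx is the zero-padded chunk index, starting at 1)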

"""

from knowledge4ir.utils import load_trec_ranking_with_score, dump_trec_ranking_with_score
import sys
import math

# Split a trec ranking into consecutive chunks of q_per_file queries and dump
# each chunk to "<out pre>.<zero-padded chunk index>".
if 4 != len(sys.argv):
    sys.stdout.write("3 para: trec + out pre + q per file\n")
    sys.exit(-1)

ll_qid_rank = load_trec_ranking_with_score(sys.argv[1])
q_per_file = int(sys.argv[3])
if q_per_file <= 0:
    # a zero or negative chunk size would otherwise fail further down with an
    # unrelated ZeroDivisionError / IndexError
    sys.stdout.write("q per file must be a positive integer\n")
    sys.exit(-1)

total_cnt = int(math.ceil(float(len(ll_qid_rank)) / q_per_file))
out_pre = sys.argv[2]

# pad every index to the width of the largest one so the names sort in order;
# computed from total_cnt so that an empty ranking (no chunks) does not crash
max_len = len('%d' % total_cnt)
l_name = [
    out_pre + '.' + ('%d' % i).zfill(max_len)
    for i in range(1, total_cnt + 1)
]

st = 0
for name in l_name:
    dump_trec_ranking_with_score(ll_qid_rank[st:st + q_per_file], name)
    st += q_per_file
Example #14
0
 def _load_data(self):
     """
     Load the relevance labels, the per-query ranking (as a dict) and the
     doc info from the configured input paths.
     """
     logging.info('start loading data')
     self.h_qrel = load_trec_labels_dict(self.qrel_in)
     l_q_rank = load_trec_ranking_with_score(self.q_rank_in)
     self.h_q_rank = dict(l_q_rank)
     self.h_doc_info = load_doc_info_json(self.doc_info_in)
     logging.info('data loaded')
Example #15
0
 def _load_boe_rm3(self):
     """
     Load the ranking at self.boe_rm3_path into self.h_q_boe_rm3 as a dict.

     Does nothing when no path is configured.
     """
     if self.boe_rm3_path:
         self.h_q_boe_rm3 = dict(
             load_trec_ranking_with_score(self.boe_rm3_path))
     return