Example #1
def rank_e2e():
    """

    :param pool_func: avg, max, or None (for integrated query).
    :return:
    """
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)
    test_cid_query_dicts = general_tools.build_test_cid_query_dicts(
        tokenize_narr=False,
        concat_title_narr=ir_config.CONCAT_TITLE_NARR,
        query_type=ir_config.QUERY_TYPE)

    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid_query_dict in tqdm(test_cid_query_dicts):
        params = {
            **cid_query_dict,
        }
        rank_records = _rank(**params)
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp, params['cid']),
                                    with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
Example #2
def rank_e2e():
    rank_dp = tools.get_rank_dp(model_name=MODEL_NAME)
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid in tqdm(cc_ids):
        rank_records = _lexrank(cid)
        rank_sent.dump_rank_records(rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
Example #3
def rank_e2e():
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)
    assert not exists(rank_dp), f'rank_dp exists: {rank_dp}'
    os.mkdir(rank_dp)

    for cid_query_dict in tqdm(test_cid_query_dicts):
        rank_records = _rank(**cid_query_dict)
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp,
                                                cid_query_dict['cid']),
                                    with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
Example #4
def dump_retrieval(fp, retrieved_items):
    retrieve_records = ['\t'.join(items) for items in retrieved_items]
    n_sents = rank_sent.dump_rank_records(rank_records=retrieve_records,
                                          out_fp=fp,
                                          with_rank_idx=False)

    logger.info('successfully dumped {0} retrieved items to {1}'.format(
        n_sents, fp))
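Every example on this page hands the actual file writing to rank_sent.dump_rank_records, whose body is not shown here. The following is only a minimal sketch inferred from the call sites above (records are written to out_fp one per line, the rank index is optionally prepended, and the number of written records is returned); treat the record handling and file mode as assumptions rather than the project's real implementation.

def dump_rank_records(rank_records, out_fp, with_rank_idx=True):
    """Sketch of the helper used above: write one record per line to out_fp.

    Assumes each record is already a tab-separated string, as in Example #4;
    the project's real helper may accept richer record objects.
    """
    lines = []
    for rank_idx, record in enumerate(rank_records):
        if with_rank_idx:
            # optionally prepend the 0-based rank position
            record = '\t'.join((str(rank_idx), record))
        lines.append(record)

    with open(out_fp, mode='w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    # callers such as dump_retrieval() log this count
    return len(lines)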
Example #5
def rank_e2e():
    """

    :param pool_func: avg, max, or None (for integrated query).
    :return:
     """
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)

    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid_query_dict in tqdm(test_cid_query_dicts):
        rank_records = _rank(**cid_query_dict)
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp,
                                                cid_query_dict['cid']),
                                    with_rank_idx=False)
    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
Example #6
def rank():
    rank_dp = join(path_parser.summary_rank, ensemble_config.MODEL_NAME)
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid in tqdm(cids):
        rank_records = _rank(cid)
        n_sents = rank_sent.dump_rank_records(rank_records=rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)
        logger.info('Dumped {} ranking records'.format(n_sents))
Example #7
def _rank_core(cq_dict):
    cid = cq_dict['cid']
    query = cq_dict['query']
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)
    original_sents, processed_sents = get_sentences(cid)
    rel_scores = tfidf_tools._compute_rel_scores_tf_dot(processed_sents, query)

    # get sid2score
    sid2score = dict()
    abs_idx = 0
    for doc_idx, doc in enumerate(processed_sents):
        for sent_idx, sent in enumerate(doc):
            sid = config.SEP.join((str(doc_idx), str(sent_idx)))
            score = rel_scores[abs_idx]
            sid2score[sid] = score
            abs_idx += 1

    sid_score_list = rank_sent.sort_sid2score(sid2score)
    rank_records = rank_sent.get_rank_records(sid_score_list,
                                              sents=original_sents)
    rank_sent.dump_rank_records(rank_records,
                                out_fp=join(rank_dp, cid),
                                with_rank_idx=False)
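In Example #7, rel_scores is a flat vector with one score per sentence across all documents, while processed_sents is nested per document; the abs_idx counter keeps the two aligned. A tiny hypothetical illustration of that alignment (the sentence strings, scores, and the '_' separator are made-up stand-ins for config.SEP and the real data):

SEP = '_'  # stand-in for config.SEP
processed_sents = [['sent 0-0', 'sent 0-1'], ['sent 1-0']]  # 2 docs, 3 sentences
rel_scores = [0.9, 0.1, 0.5]  # flat: one score per sentence, in document order

sid2score = dict()
abs_idx = 0
for doc_idx, doc in enumerate(processed_sents):
    for sent_idx, _ in enumerate(doc):
        sid2score[SEP.join((str(doc_idx), str(sent_idx)))] = rel_scores[abs_idx]
        abs_idx += 1

# sid2score == {'0_0': 0.9, '0_1': 0.1, '1_0': 0.5}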
Example #8
def rel_scores2rank():
    if exists(rank_dp):
        raise ValueError(f'rank_dp exists: {rank_dp}')
    os.mkdir(rank_dp)

    for cid in tqdm(cids):
        sid2rel_scores, sid2original_sent = _load_sent_info(
            cid=cid, rel_scores_dp=rel_scores_dp)

        sid_score_list = sorted(sid2rel_scores.items(),
                                key=lambda item: item[1],
                                reverse=True)
        original_sents = [sid2original_sent[sid] for sid, _ in sid_score_list]

        rank_records = rank_sent.get_rank_records(sid_score_list,
                                                  sents=original_sents,
                                                  flat_sents=True)

        n_sents = rank_sent.dump_rank_records(rank_records=rank_records,
                                              out_fp=join(rank_dp, cid),
                                              with_rank_idx=False)
        logger.info('Dumped {} ranking records'.format(n_sents))
Example #9
def rel_scores2rank():
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid in tqdm(cids):
        rel_scores = load_rel_scores(cid=cid, rel_scores_dp=rel_scores_dp)
        sent_ids = np.argsort(rel_scores)[::-1].tolist()

        sid_score_list = []
        for sid in sent_ids:
            sid_score = ('0_{}'.format(sid), rel_scores[sid])
            sid_score_list.append(sid_score)

        original_sents, _ = load_retrieved_sentences(retrieved_dp=ir_rec_dp,
                                                     cid=cid)
        rank_records = rank_sent.get_rank_records(sid_score_list,
                                                  sents=original_sents)

        n_sents = rank_sent.dump_rank_records(rank_records=rank_records,
                                              out_fp=join(rank_dp, cid),
                                              with_rank_idx=False)
        logger.info('Dumped {} ranking records'.format(n_sents))
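Example #9 turns a flat relevance vector into a descending (sid, score) list; since the sentences come from a single retrieved list rather than per-document lists, every sid is presumably prefixed with document index 0. With made-up numbers:

import numpy as np

rel_scores = np.array([0.2, 0.9, 0.5])
sent_ids = np.argsort(rel_scores)[::-1].tolist()  # descending: [1, 2, 0]
sid_score_list = [('0_{}'.format(sid), rel_scores[sid]) for sid in sent_ids]
# [('0_1', 0.9), ('0_2', 0.5), ('0_0', 0.2)]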
Example #10
def rank_end2end(model_name,
                 diversity_param_tuple,
                 component_name=None,
                 n_iter=None,
                 rank_dp=None,
                 retrieved_dp=None,
                 rm_dialog=True,
                 cc_ids=None):
    """

    :param model_name:
    :param diversity_param_tuple:
    :param component_name:
    :param n_iter:
    :param rank_dp:
    :param retrieved_dp:
    :param rm_dialog: only useful when retrieved_dp=None
    :return:
    """
    dp_mode = 'r'
    dp_params = {
        'n_iter': n_iter,
        'mode': dp_mode,
    }

    diversity_weight, diversity_algorithm = diversity_param_tuple

    # TODO: double-check this condition; it was added later to avoid a bug with centrality-tfidf.
    # One model has only one suite of summary components but can have different ranking systems.
    if component_name:
        dp_params['model_name'] = component_name
    else:
        dp_params['model_name'] = model_name

    summ_comp_root = graph_io.get_summ_comp_root(**dp_params)
    sim_mat_dp = graph_io.get_sim_mat_dp(summ_comp_root, mode=dp_mode)
    rel_vec_dp = graph_io.get_rel_vec_dp(summ_comp_root, mode=dp_mode)
    sid2abs_dp = graph_io.get_sid2abs_dp(summ_comp_root, mode=dp_mode)
    sid2score_dp = graph_io.get_sid2score_dp(summ_comp_root, mode=dp_mode)

    if not rank_dp:
        rank_dp_params = {
            'model_name': model_name,
            'n_iter': n_iter,
            'diversity_param_tuple': diversity_param_tuple,
        }

        rank_dp = tools.get_rank_dp(**rank_dp_params)

    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    dps = {
        'sim_mat_dp': sim_mat_dp,
        'rel_vec_dp': rel_vec_dp,
        'sid2abs_dp': sid2abs_dp,
    }

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()
    
    for cid in tqdm(cc_ids):
        # logger.info('cid: {}'.format(cid))
        comp_params = {
            **dps,
            'cid': cid,
        }
        components = graph_io.load_components(**comp_params)
        # logger.info('[GRAPH RANK 1/2] successfully loaded components')
        sid2score = graph_io.load_sid2score(sid2score_dp, cid)

        if retrieved_dp:
            original_sents, _ = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
        else:
            if 'tdqfs' in config.test_year:
                original_sents, _ = dataset_parser.cid2sents_tdqfs(cid)
            else:
                original_sents, _ = dataset_parser.cid2sents(cid, rm_dialog=rm_dialog)  # 2d lists, docs => sents

        diversity_params = {
            'sid2score': sid2score,
            'sid2abs': components['sid2abs'],
            'sim_mat': components['sim_mat'],
            'original_sents': original_sents,
        }

        if diversity_algorithm == 'wan':
            diversity_params['omega'] = diversity_weight
            rank_records = _rank_with_diversity_penalty_wan(**diversity_params)
        else:
            raise ValueError('Invalid diversity_algorithm: {}'.format(diversity_algorithm))

        logger.info('cid: {}, #rank_records: {}'.format(cid, len(rank_records)))
        rank_sent.dump_rank_records(rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)

    logger.info('[GRAPH RANK] Finished. Rankings were dumped to: {}'.format(rank_dp))
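Example #10 only dispatches to _rank_with_diversity_penalty_wan, whose body is not shown. For orientation, here is a hedged sketch of the classic greedy diversity-penalty re-ranking usually associated with the 'wan' setting: repeatedly select the highest-scoring sentence, then discount the remaining sentences by omega times their similarity to it. The function name, argument handling, and the exact penalty formula below are assumptions, not the project's code.

import numpy as np

def diversity_penalty_rank(scores, sim_mat, omega):
    """Greedy diversity-penalty re-ranking (illustrative sketch only).

    scores: 1-D centrality/relevance scores; sim_mat: pairwise sentence
    similarities; omega: diversity weight. Returns sentence indices in
    selection order (highest adjusted score first).
    """
    scores = np.asarray(scores, dtype=float).copy()
    remaining = set(range(len(scores)))
    order = []
    while remaining:
        best = max(remaining, key=lambda i: scores[i])
        order.append(best)
        remaining.remove(best)
        for j in remaining:
            # penalize sentences similar to the one just selected
            scores[j] -= omega * sim_mat[best][j] * scores[best]
    return order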