def rank_e2e():
    """Rank sentences for every test cluster with the TF-IDF IR model and dump results.

    Builds one (cid, query) dict per test cluster, scores the cluster's
    sentences via ``_rank``, and writes one ranking file per cluster id
    under the model's rank directory.

    :raises ValueError: if the output rank directory already exists.
    """
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)
    test_cid_query_dicts = general_tools.build_test_cid_query_dicts(
        tokenize_narr=False,
        concat_title_narr=ir_config.CONCAT_TITLE_NARR,
        query_type=ir_config.QUERY_TYPE)

    # Refuse to overwrite an existing run.
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid_query_dict in tqdm(test_cid_query_dicts):
        # _rank takes the cid/query pair directly; no intermediate copy needed.
        rank_records = _rank(**cid_query_dict)
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp, cid_query_dict['cid']),
                                    with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
def rank_e2e():
    """Rank sentences for every cluster with LexRank and dump the rankings.

    Writes one ranking file per cluster id under the model's rank directory.

    :raises ValueError: if the output rank directory already exists.
    """
    rank_dp = tools.get_rank_dp(model_name=MODEL_NAME)

    # Refuse to overwrite an existing run.
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid in tqdm(cc_ids):
        rank_records = _lexrank(cid)
        # PEP 8: no spaces around '=' in keyword arguments (was `with_rank_idx = False`).
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp, cid),
                                    with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
def rank_e2e():
    """Rank sentences for every test cluster with the TF-IDF IR model and dump results.

    :raises ValueError: if the output rank directory already exists.
    """
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)

    # Raise explicitly instead of `assert`: asserts are stripped under `python -O`,
    # and sibling rank functions in this codebase raise ValueError for this case.
    if exists(rank_dp):
        raise ValueError(f'rank_dp exists: {rank_dp}')
    os.mkdir(rank_dp)

    for cid_query_dict in tqdm(test_cid_query_dicts):
        rank_records = _rank(**cid_query_dict)
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp, cid_query_dict['cid']),
                                    with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
def dump_retrieval(fp, retrieved_items):
    """Serialize retrieved items as tab-separated records and write them to *fp*.

    :param fp: output file path handed to ``rank_sent.dump_rank_records``.
    :param retrieved_items: iterable of string tuples/lists; each becomes one line.
    """
    # Join each item's fields with tabs before dumping.
    retrieve_records = list(map('\t'.join, retrieved_items))
    n_sents = rank_sent.dump_rank_records(rank_records=retrieve_records,
                                          out_fp=fp,
                                          with_rank_idx=False)
    logger.info('successfully dumped {0} retrieved items to {1}'.format(n_sents, fp))
def rank_e2e():
    """Rank sentences for every test cluster with the TF-IDF IR model and dump results.

    Iterates over the module-level ``test_cid_query_dicts``, scores each
    cluster via ``_rank``, and writes one ranking file per cluster id.
    (Previous docstring documented a nonexistent ``pool_func`` parameter.)

    :raises ValueError: if the output rank directory already exists.
    """
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)

    # Refuse to overwrite an existing run.
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid_query_dict in tqdm(test_cid_query_dicts):
        rank_records = _rank(**cid_query_dict)
        rank_sent.dump_rank_records(rank_records,
                                    out_fp=join(rank_dp, cid_query_dict['cid']),
                                    with_rank_idx=False)

    logger.info('Successfully dumped rankings to: {}'.format(rank_dp))
def rank():
    """Rank sentences for every cluster with the ensemble model and dump the records.

    Raises ValueError if the output rank directory already exists.
    """
    out_dir = join(path_parser.summary_rank, ensemble_config.MODEL_NAME)
    if exists(out_dir):
        raise ValueError('rank_dp exists: {}'.format(out_dir))
    os.mkdir(out_dir)

    for cid in tqdm(cids):
        records = _rank(cid)
        # NOTE(review): per-cluster log reconstructed inside the loop — confirm against original layout.
        n_dumped = rank_sent.dump_rank_records(rank_records=records,
                                               out_fp=join(out_dir, cid),
                                               with_rank_idx=False)
        logger.info('Dump {} ranking records'.format(n_dumped))
def _rank_core(cq_dict):
    """Score one cluster's sentences against its query with TF-IDF and dump the ranking.

    :param cq_dict: dict carrying 'cid' (cluster id) and 'query' (query string).
    """
    cid, query = cq_dict['cid'], cq_dict['query']
    rank_dp = join(path_parser.summary_rank, ir_config.IR_MODEL_NAME_TF)

    original_sents, processed_sents = get_sentences(cid)
    rel_scores = tfidf_tools._compute_rel_scores_tf_dot(processed_sents, query)

    # Map the flat score vector back to 'docIdx<SEP>sentIdx' sentence ids.
    sid2score = {}
    flat_idx = 0
    for d_idx, doc in enumerate(processed_sents):
        for s_idx in range(len(doc)):
            sid2score[config.SEP.join((str(d_idx), str(s_idx)))] = rel_scores[flat_idx]
            flat_idx += 1

    sid_score_list = rank_sent.sort_sid2score(sid2score)
    records = rank_sent.get_rank_records(sid_score_list, sents=original_sents)
    rank_sent.dump_rank_records(records, out_fp=join(rank_dp, cid), with_rank_idx=False)
def rel_scores2rank():
    """Turn per-sentence relevance scores into one dumped ranking file per cluster.

    Reads module-level ``rank_dp``, ``cids`` and ``rel_scores_dp``.
    Raises ValueError if the output rank directory already exists.
    """
    if exists(rank_dp):
        raise ValueError(f'rank_dp exists: {rank_dp}')
    os.mkdir(rank_dp)

    for cid in tqdm(cids):
        score_lookup, sent_lookup = _load_sent_info(cid=cid, rel_scores_dp=rel_scores_dp)

        # Order sentence ids by score, highest first.
        ranked_pairs = sorted(score_lookup.items(), key=lambda kv: kv[1], reverse=True)
        ranked_sents = [sent_lookup[sid] for sid, _ in ranked_pairs]

        records = rank_sent.get_rank_records(ranked_pairs,
                                             sents=ranked_sents,
                                             flat_sents=True)
        # NOTE(review): per-cluster log reconstructed inside the loop — confirm against original layout.
        n_sents = rank_sent.dump_rank_records(rank_records=records,
                                              out_fp=join(rank_dp, cid),
                                              with_rank_idx=False)
        logger.info('Dump {} ranking records'.format(n_sents))
def rel_scores2rank():
    """Convert saved relevance-score arrays into ranking files, one per cluster.

    Reads module-level ``rank_dp``, ``cids``, ``rel_scores_dp`` and ``ir_rec_dp``.
    Raises ValueError if the output rank directory already exists.
    """
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid in tqdm(cids):
        rel_scores = load_rel_scores(cid=cid, rel_scores_dp=rel_scores_dp)

        # Descending-score order; every sid lives in pseudo-document 0.
        ordered_ids = np.argsort(rel_scores)[::-1].tolist()
        sid_score_list = [('0_{}'.format(idx), rel_scores[idx]) for idx in ordered_ids]

        original_sents, _ = load_retrieved_sentences(retrieved_dp=ir_rec_dp, cid=cid)
        records = rank_sent.get_rank_records(sid_score_list, sents=original_sents)
        # NOTE(review): per-cluster log reconstructed inside the loop — confirm against original layout.
        n_sents = rank_sent.dump_rank_records(rank_records=records,
                                              out_fp=join(rank_dp, cid),
                                              with_rank_idx=False)
        logger.info('Dump {} ranking records'.format(n_sents))
def rank_end2end(model_name, diversity_param_tuple, component_name=None, n_iter=None, rank_dp=None, retrieved_dp=None, rm_dialog=True, cc_ids=None):
    """Load precomputed graph components, apply a diversity penalty, and dump rankings.

    For each test cluster: loads the similarity matrix, relevance vector and
    sid mappings produced by an earlier graph stage, re-ranks sentences with a
    diversity penalty, and writes one ranking file per cluster id.

    :param model_name: name of the ranking model; used for the output rank dir.
    :param diversity_param_tuple: (diversity_weight, diversity_algorithm);
        only algorithm 'wan' is supported — any other value raises ValueError.
    :param component_name: if given, summary components are loaded under this
        name instead of model_name (one component set can serve several rankers).
    :param n_iter: iteration index forwarded to the graph-io path helpers.
    :param rank_dp: output directory; derived via tools.get_rank_dp when None.
    :param retrieved_dp: if given, sentences come from retrieval output;
        otherwise they are re-parsed from the dataset.
    :param rm_dialog: only used when retrieved_dp is None and the test set is
        not tdqfs; forwarded to dataset_parser.cid2sents.
    :param cc_ids: cluster ids to process; defaults to tools.get_test_cc_ids().
    :raises ValueError: if rank_dp already exists, or on an unknown algorithm.
    :return: None; rankings are written to disk as a side effect.
    """
    # Graph components were dumped in 'r' (rank) mode; read them back the same way.
    dp_mode = 'r'
    dp_params = {
        'n_iter': n_iter,
        'mode': dp_mode,
    }
    diversity_weight, diversity_algorithm = diversity_param_tuple

    # todo: double check this condition; added later for avoiding bug for centrality-tfidf.
    # one model has only one suit of summary components but different ranking sys
    if component_name:
        dp_params['model_name'] = component_name
    else:
        dp_params['model_name'] = model_name

    # Resolve the directories holding each precomputed component.
    summ_comp_root = graph_io.get_summ_comp_root(**dp_params)
    sim_mat_dp = graph_io.get_sim_mat_dp(summ_comp_root, mode=dp_mode)
    rel_vec_dp = graph_io.get_rel_vec_dp(summ_comp_root, mode=dp_mode)
    sid2abs_dp = graph_io.get_sid2abs_dp(summ_comp_root, mode=dp_mode)
    sid2score_dp = graph_io.get_sid2score_dp(summ_comp_root, mode=dp_mode)

    # Derive the output dir from the (model, n_iter, diversity) triple when not given.
    if not rank_dp:
        rank_dp_params = {
            'model_name': model_name,
            'n_iter': n_iter,
            'diversity_param_tuple': diversity_param_tuple,
        }
        rank_dp = tools.get_rank_dp(**rank_dp_params)
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    dps = {
        'sim_mat_dp': sim_mat_dp,
        'rel_vec_dp': rel_vec_dp,
        'sid2abs_dp': sid2abs_dp,
    }

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()

    for cid in tqdm(cc_ids):
        # logger.info('cid: {}'.format(cid))
        comp_params = {
            **dps,
            'cid': cid,
        }
        components = graph_io.load_components(**comp_params)
        # logger.info('[GRAPH RANK 1/2] successfully loaded components')
        sid2score = graph_io.load_sid2score(sid2score_dp, cid)

        # Source the original sentences: retrieval output when available,
        # otherwise re-parse the dataset (tdqfs has its own parser).
        if retrieved_dp:
            original_sents, _ = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
        else:
            if 'tdqfs' in config.test_year:
                original_sents, _ = dataset_parser.cid2sents_tdqfs(cid)
            else:
                original_sents, _ = dataset_parser.cid2sents(cid,
                    rm_dialog=rm_dialog)  # 2d lists, docs => sents

        diversity_params = {
            'sid2score': sid2score,
            'sid2abs': components['sid2abs'],
            'sim_mat': components['sim_mat'],
            'original_sents': original_sents,
        }
        if diversity_algorithm == 'wan':
            # 'wan' additionally takes the penalty weight omega.
            diversity_params['omega'] = diversity_weight
            rank_records = _rank_with_diversity_penalty_wan(**diversity_params)
        else:
            raise ValueError('Invalid diversity_algorithm: {}'.format(diversity_algorithm))

        logger.info('cid: {}, #rank_records: {}'.format(cid, len(rank_records)))
        rank_sent.dump_rank_records(rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)

    logger.info('[GRAPH RANK] Finished. Rankings were dumped to: {}'.format(rank_dp))