def build_components_e2e():
    """Build the graph components of every test cluster and dump them to disk.

    The output directories are resolved (in write mode) under the summary
    component root of ``MODEL_NAME``. For each test cluster/query dict, the
    similarity matrix, relevance vector and sid2abs mapping returned by
    ``_build_components`` are dumped under the cluster id.
    """
    write_mode = 'w'
    comp_root = graph_io.get_summ_comp_root(model_name=MODEL_NAME, n_iter=None, mode=write_mode)

    sim_mat_dir = graph_io.get_sim_mat_dp(comp_root, mode='w')
    rel_vec_dir = graph_io.get_rel_vec_dp(comp_root, mode='w')
    sid2abs_dir = graph_io.get_sid2abs_dp(comp_root, mode='w')

    # Report where each component type is going to be written.
    logger.info('sim_mat_dp: {}'.format(sim_mat_dir))
    logger.info('rel_vec_dp: {}'.format(rel_vec_dir))
    logger.info('sid2abs_dp: {}'.format(sid2abs_dir))

    query_dicts = general_tools.build_test_cid_query_dicts(
        tokenize_narr=False,
        concat_title_narr=CONCAT_TITLE_NARR,
        query_type=QUERY_TYPE,
    )

    for query_dict in tqdm(query_dicts):
        cluster_id = query_dict['cid']
        logger.info('cid: {}'.format(cluster_id))

        # The whole dict is forwarded as keyword arguments to the builder.
        built = _build_components(**query_dict)

        graph_io.dump_sim_mat(sim_mat=built['sim_mat'], sim_mat_dp=sim_mat_dir, cid=cluster_id)
        graph_io.dump_rel_vec(rel_vec=built['rel_vec'], rel_vec_dp=rel_vec_dir, cid=cluster_id)
        graph_io.dump_sid2abs(sid2abs=built['sid2abs'], sid2abs_dp=sid2abs_dir, cid=cluster_id)
def build_components_e2e():
    """Build and dump graph components for the basic centrality ensemble model.

    Output directories are resolved (in write mode) under the summary component
    root of ``centrality_ensemble_config.CENTRALITY_MODEL_NAME_BASIC``. The
    cluster/query dicts are read from ``test_cid_query_dicts``, which is not
    defined inside this function.
    """
    write_mode = 'w'
    comp_root = graph_io.get_summ_comp_root(
        model_name=centrality_ensemble_config.CENTRALITY_MODEL_NAME_BASIC,
        n_iter=None,
        mode=write_mode,
    )

    sim_mat_dir = graph_io.get_sim_mat_dp(comp_root, mode='w')
    rel_vec_dir = graph_io.get_rel_vec_dp(comp_root, mode='w')
    sid2abs_dir = graph_io.get_sid2abs_dp(comp_root, mode='w')

    # Report where each component type is going to be written.
    logger.info('sim_mat_dp: {}'.format(sim_mat_dir))
    logger.info('rel_vec_dp: {}'.format(rel_vec_dir))
    logger.info('sid2abs_dp: {}'.format(sid2abs_dir))

    # NOTE(review): test_cid_query_dicts is presumably a module-level name - confirm.
    for query_dict in tqdm(test_cid_query_dicts):
        cluster_id = query_dict['cid']
        logger.info('cid: {}'.format(cluster_id))

        # The whole dict is forwarded as keyword arguments to the builder.
        built = _build_components(**query_dict)

        graph_io.dump_sim_mat(sim_mat=built['sim_mat'], sim_mat_dp=sim_mat_dir, cid=cluster_id)
        graph_io.dump_rel_vec(rel_vec=built['rel_vec'], rel_vec_dp=rel_vec_dir, cid=cluster_id)
        graph_io.dump_sid2abs(sid2abs=built['sid2abs'], sid2abs_dp=sid2abs_dir, cid=cluster_id)
def score_end2end(model_name, n_iter=None, damp=0.85, use_rel_vec=True, cc_ids=None):
    """Score the sentence graph of every cluster and dump the resulting scores.

    :param model_name: model whose summary components are loaded.
    :param n_iter: forwarded to ``graph_io.get_summ_comp_root``.
    :param damp: forwarded to ``_score_graph_initially``.
    :param use_rel_vec: when true, the transposed relevance vector is passed to
        the scorer; otherwise ``None`` is passed in its place.
    :param cc_ids: cluster ids to process; when empty or ``None``, the ids from
        ``tools.get_test_cc_ids()`` are used.
    :return: None. Scores are written via ``graph_io.dump_sid2score``.
    """
    read_mode = 'r'

    # A model owns a single set of summary components, shared by all of its
    # ranking systems.
    comp_root = graph_io.get_summ_comp_root(model_name=model_name, n_iter=n_iter, mode=read_mode)

    component_dirs = {
        'sim_mat_dp': graph_io.get_sim_mat_dp(comp_root, mode=read_mode),
        'rel_vec_dp': graph_io.get_rel_vec_dp(comp_root, mode=read_mode),
        'sid2abs_dp': graph_io.get_sid2abs_dp(comp_root, mode=read_mode),
    }
    # Scores are the only artefact written here, hence the write mode.
    score_dir = graph_io.get_sid2score_dp(comp_root, mode='w')

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()

    for cid in tqdm(cc_ids):
        loaded = graph_io.load_components(cid=cid, **component_dirs)

        # Invert sid -> abs into abs -> sid for the scorer.
        abs2sid = {abs_key: sid for sid, abs_key in loaded['sid2abs'].items()}

        sid2score = _score_graph_initially(
            sim_mat=loaded['sim_mat'],
            rel_vec=loaded['rel_vec'].transpose() if use_rel_vec else None,
            cid=cid,
            damp=damp,
            abs2sid=abs2sid,
        )
        graph_io.dump_sid2score(sid2score=sid2score, sid2score_dp=score_dir, cid=cid)

    logger.info('[GRAPH RANK] Finished. Scores were dumped to: {}'.format(score_dir))
def build_components_e2e():
    """Build the graph components of every test cluster and dump them to disk.

    Output directories are resolved (in write mode) under the summary component
    root of ``MODEL_NAME``. Query dicts are built without narrative
    tokenisation and without title/narrative concatenation, using
    ``centrality_config.QUERY_TYPE``.
    """
    write_mode = 'w'
    comp_root = graph_io.get_summ_comp_root(model_name=MODEL_NAME, n_iter=None, mode=write_mode)

    sim_mat_dir = graph_io.get_sim_mat_dp(comp_root, mode='w')
    rel_vec_dir = graph_io.get_rel_vec_dp(comp_root, mode='w')
    sid2abs_dir = graph_io.get_sid2abs_dp(comp_root, mode='w')

    logger.info('sim_mat_dp: {}'.format(sim_mat_dir))
    logger.info('rel_vec_dp: {}'.format(rel_vec_dir))
    logger.info('sid2abs_dp: {}'.format(sid2abs_dir))

    query_dicts = general_tools.build_test_cid_query_dicts(
        tokenize_narr=False,
        concat_title_narr=False,
        query_type=centrality_config.QUERY_TYPE,
    )

    for query_dict in tqdm(query_dicts):
        cluster_id = query_dict['cid']

        # The whole dict is forwarded as keyword arguments to the builder.
        built = _build_components(**query_dict)

        graph_io.dump_sim_mat(sim_mat=built['sim_mat'], sim_mat_dp=sim_mat_dir, cid=cluster_id)
        graph_io.dump_rel_vec(rel_vec=built['rel_vec'], rel_vec_dp=rel_vec_dir, cid=cluster_id)
        graph_io.dump_sid2abs(sid2abs=built['sid2abs'], sid2abs_dp=sid2abs_dir, cid=cluster_id)

    # NOTE(review): the original indentation was lost; these summary messages
    # are assumed to follow the loop rather than repeat per cluster - confirm.
    logger.info(
        '[BUILD GRAPH COMPONENT] dumping sim mat file to: {0}'.format(sim_mat_dir))
    logger.info(
        '[BUILD GRAPH COMPONENT] dumping rel vec file to: {0}'.format(rel_vec_dir))
    logger.info(
        '[BUILD GRAPH COMPONENT] dumping sid2abs file to: {0}'.format(sid2abs_dir))
def build_components_e2e():
    """Build and dump graph components for every entry of ``test_cid_query_dicts``.

    Both ``model_name`` and ``test_cid_query_dicts`` are read from outside this
    function; neither is defined in its body.
    """
    write_mode = 'w'
    # NOTE(review): model_name is presumably a module-level or enclosing-scope
    # name - confirm.
    comp_root = graph_io.get_summ_comp_root(model_name=model_name, n_iter=None, mode=write_mode)

    sim_mat_dir = graph_io.get_sim_mat_dp(comp_root, mode='w')
    rel_vec_dir = graph_io.get_rel_vec_dp(comp_root, mode='w')
    sid2abs_dir = graph_io.get_sid2abs_dp(comp_root, mode='w')

    for query_dict in tqdm(test_cid_query_dicts):
        cluster_id = query_dict['cid']

        # The whole dict is forwarded as keyword arguments to the builder.
        built = _build_components(**query_dict)

        graph_io.dump_sim_mat(sim_mat=built['sim_mat'], sim_mat_dp=sim_mat_dir, cid=cluster_id)
        graph_io.dump_rel_vec(rel_vec=built['rel_vec'], rel_vec_dp=rel_vec_dir, cid=cluster_id)
        graph_io.dump_sid2abs(sid2abs=built['sid2abs'], sid2abs_dp=sid2abs_dir, cid=cluster_id)
def rank_end2end(model_name,
                 diversity_param_tuple,
                 component_name=None,
                 n_iter=None,
                 rank_dp=None,
                 retrieved_dp=None,
                 rm_dialog=True,
                 cc_ids=None):
    """Rank the sentences of every cluster with a diversity penalty and dump the rankings.

    :param model_name: model name; used to derive the default ``rank_dp`` and,
        unless ``component_name`` is given, to locate the summary components.
    :param diversity_param_tuple: ``(diversity_weight, diversity_algorithm)``;
        the only supported algorithm is ``'wan'``, for which the weight is
        passed on as ``omega``.
    :param component_name: when truthy, used instead of ``model_name`` to
        locate the summary components.
    :param n_iter: forwarded to ``graph_io.get_summ_comp_root`` and
        ``tools.get_rank_dp``.
    :param rank_dp: output directory; derived with ``tools.get_rank_dp`` when
        not given. It must not exist yet and is created here.
    :param retrieved_dp: when given, sentences are loaded from it with
        ``load_retrieved_sentences`` instead of the dataset parser.
    :param rm_dialog: only useful when retrieved_dp=None (forwarded to
        ``dataset_parser.cid2sents``; unused on the tdqfs path).
    :param cc_ids: cluster ids to process; when empty or ``None``, the ids from
        ``tools.get_test_cc_ids()`` are used.
    :raises ValueError: if ``diversity_algorithm`` is not supported, or if
        ``rank_dp`` already exists.
    :return: None. Rankings are written with ``rank_sent.dump_rank_records``.
    """
    dp_mode = 'r'
    diversity_weight, diversity_algorithm = diversity_param_tuple

    # Fail fast, before any directory is created: rejecting the algorithm only
    # inside the loop would leave an empty rank_dp behind, and the existence
    # check below would then abort every later run.
    if diversity_algorithm != 'wan':
        raise ValueError('Invalid diversity_algorithm: {}'.format(diversity_algorithm))

    # todo: double check this condition; added later for avoiding bug for centrality-tfidf.
    # One model has only one set of summary components but may have several
    # ranking systems, so components can be looked up under a separate name.
    dp_params = {
        'n_iter': n_iter,
        'mode': dp_mode,
        'model_name': component_name if component_name else model_name,
    }

    summ_comp_root = graph_io.get_summ_comp_root(**dp_params)
    dps = {
        'sim_mat_dp': graph_io.get_sim_mat_dp(summ_comp_root, mode=dp_mode),
        'rel_vec_dp': graph_io.get_rel_vec_dp(summ_comp_root, mode=dp_mode),
        'sid2abs_dp': graph_io.get_sid2abs_dp(summ_comp_root, mode=dp_mode),
    }
    sid2score_dp = graph_io.get_sid2score_dp(summ_comp_root, mode=dp_mode)

    if not rank_dp:
        rank_dp = tools.get_rank_dp(
            model_name=model_name,
            n_iter=n_iter,
            diversity_param_tuple=diversity_param_tuple,
        )

    # Refuse to overwrite the rankings of an earlier run.
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()

    for cid in tqdm(cc_ids):
        components = graph_io.load_components(cid=cid, **dps)
        sid2score = graph_io.load_sid2score(sid2score_dp, cid)

        if retrieved_dp:
            original_sents, _ = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
        elif 'tdqfs' in config.test_year:
            original_sents, _ = dataset_parser.cid2sents_tdqfs(cid)
        else:
            # 2d lists, docs => sents
            original_sents, _ = dataset_parser.cid2sents(cid, rm_dialog=rm_dialog)

        rank_records = _rank_with_diversity_penalty_wan(
            sid2score=sid2score,
            sid2abs=components['sid2abs'],
            sim_mat=components['sim_mat'],
            original_sents=original_sents,
            omega=diversity_weight,
        )

        logger.info('cid: {}, #rank_records: {}'.format(cid, len(rank_records)))
        rank_sent.dump_rank_records(rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)

    logger.info('[GRAPH RANK] Finished. Rankings were dumped to: {}'.format(rank_dp))