def build_sim_items_e2e(cid, query, mask_intra, max_ns_doc=None,
                        retrieved_dp=None, sentence_rep='tfidf', rm_dialog=True):
    if retrieved_dp:
        original_sents, processed_sents = load_retrieved_sentences(
            retrieved_dp=retrieved_dp, cid=cid)
    else:
        original_sents, processed_sents = dataset_parser.cid2sents(
            cid, rm_dialog=rm_dialog, max_ns_doc=max_ns_doc)  # 2d lists, docs => sents

    assert sentence_rep == 'tfidf'
    res = _compute_sim_mat_tfidf(processed_sents=processed_sents,
                                 query=query, mask_intra=mask_intra)

    sim_items = {
        'doc_sim_mat': res['doc_sim_mat'],
        'rel_scores': res['rel_scores'],
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }
    return sim_items
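# A minimal, self-contained sketch of the TF-IDF similarity computation that
# `_compute_sim_mat_tfidf` (defined elsewhere in this repo) is assumed to
# perform: vectorize all sentences plus the query, take cosine similarities,
# and optionally zero out intra-document pairs. Assumes each processed
# sentence is a single string; names below are illustrative only, not the
# repo's actual implementation.
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def _compute_sim_mat_tfidf_sketch(processed_sents, query, mask_intra):
    flat_sents = list(itertools.chain(*processed_sents))  # 1d sent list
    tfidf = TfidfVectorizer().fit_transform(flat_sents + [query])
    sim = cosine_similarity(tfidf)
    doc_sim_mat = sim[:-1, :-1]  # sentence-sentence block
    rel_scores = sim[-1, :-1]  # query-sentence similarities
    if mask_intra:  # zero out pairs of sentences from the same document
        start = 0
        for doc in processed_sents:
            end = start + len(doc)
            doc_sim_mat[start:end, start:end] = 0.0
            start = end
    return {'doc_sim_mat': doc_sim_mat, 'rel_scores': rel_scores}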
def __init__(self, cid, query, retrieve_dp, transform=None):
    super(ClusterDataset, self).__init__()
    original_sents, _ = load_retrieved_sentences(retrieved_dp=retrieve_dp, cid=cid)
    self.sentences = original_sents[0]
    self.query = query
    self.yy = 0.0  # dummy target value
    self.transform = transform
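# The accessors of ClusterDataset are defined elsewhere; this hypothetical
# subclass sketches a __getitem__/__len__ pair that would be consistent with
# the fields set in __init__ above (sentences, query, yy, transform). It is
# illustrative only, not part of the pipeline.
class _ClusterDatasetSketch(ClusterDataset):
    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        sample = {
            'sentence': self.sentences[index],
            'query': self.query,
            'yy': self.yy,
        }
        if self.transform:
            sample = self.transform(sample)
        return sample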
def build_sim_items_e2e_tfidf_with_lexrank(cid, query, max_ns_doc=None,
                                           retrieved_dp=None, rm_dialog=True):
    """
    Initialize LexRank with document-wise organized sentences to get true IDF.

    :param cid: cluster id
    :param query: query string
    :param max_ns_doc: passed to dataset_parser.cid2sents to cap sentences per doc
    :param retrieved_dp: directory of retrieved sentences; if given, load from it
    :param rm_dialog: remove dialog sentences; only used when retrieved_dp is None
    :return: dict with doc_sim_mat, rel_scores, processed_sents, original_sents
    """
    if retrieved_dp:
        original_sents, processed_sents = load_retrieved_sentences(
            retrieved_dp=retrieved_dp, cid=cid)
    else:
        if 'tdqfs' in config.test_year:
            original_sents, processed_sents = dataset_parser.cid2sents_tdqfs(cid)
        else:
            original_sents, processed_sents = dataset_parser.cid2sents(
                cid, rm_dialog=rm_dialog, max_ns_doc=max_ns_doc)  # 2d lists, docs => sents

    lxr = LexRank(processed_sents, stopwords=STOPWORDS['en'])

    doc_sents = list(itertools.chain(*processed_sents))  # 1d sent list
    doc_sents = copy.deepcopy(doc_sents)  # avoid affecting the original doc_sents list
    doc_sents.append(query)

    sim_mat = lxr.get_tfidf_similarity_matrix(sentences=doc_sents)
    doc_sim_mat = sim_mat[:-1, :-1]  # sentence-sentence similarities
    rel_scores = sim_mat[-1, :-1]  # query-sentence similarities
    # logger.info('doc_sim_mat: {}, rel_scores: {}'.format(doc_sim_mat.shape, rel_scores.shape))

    sim_items = {
        'doc_sim_mat': doc_sim_mat,
        'rel_scores': rel_scores,
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }
    return sim_items
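# Why the slicing above works: the query is appended as the last row/column
# of the (n+1) x (n+1) similarity matrix, so sim_mat[:-1, :-1] is the n x n
# sentence-sentence block and sim_mat[-1, :-1] is the query-to-sentence
# relevance vector (the query's self-similarity is dropped). A tiny worked
# example, illustrative only:
import numpy as np

_sim_mat = np.array([
    [1.0, 0.2, 0.5],  # sent 0
    [0.2, 1.0, 0.1],  # sent 1
    [0.5, 0.1, 1.0],  # query, appended last
])
_doc_sim_mat = _sim_mat[:-1, :-1]  # [[1.0, 0.2], [0.2, 1.0]]
_rel_scores = _sim_mat[-1, :-1]   # [0.5, 0.1]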
def build_rel_scores_tf(cid, query, max_ns_doc=None, retrieved_dp=None):
    if retrieved_dp:
        original_sents, processed_sents = load_retrieved_sentences(
            retrieved_dp=retrieved_dp, cid=cid)
    else:
        original_sents, processed_sents = dataset_parser.cid2sents(
            cid, max_ns_doc=max_ns_doc)  # 2d lists, docs => sents

    rel_scores = _compute_rel_scores_tf(processed_sents, query)
    res = {
        'rel_scores': rel_scores,
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }
    return res
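# A plausible, self-contained sketch of `_compute_rel_scores_tf` (the real
# implementation lives elsewhere in this repo): score each sentence by its
# term-frequency overlap with the query. Assumes whitespace-tokenizable
# sentence strings; illustrative only.
import itertools
from collections import Counter

import numpy as np


def _compute_rel_scores_tf_sketch(processed_sents, query):
    query_tf = Counter(query.split())
    scores = [
        sum(min(tf, query_tf[w]) for w, tf in Counter(sent.split()).items())
        for sent in itertools.chain(*processed_sents)
    ]
    return np.array(scores, dtype=np.float32)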
def rel_scores2rank():
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    for cid in tqdm(cids):
        rel_scores = load_rel_scores(cid=cid, rel_scores_dp=rel_scores_dp)
        sent_ids = np.argsort(rel_scores)[::-1].tolist()  # sentence ids, most relevant first

        sid_score_list = []
        for sid in sent_ids:
            sid_score = ('0_{}'.format(sid), rel_scores[sid])
            sid_score_list.append(sid_score)

        original_sents, _ = load_retrieved_sentences(retrieved_dp=ir_rec_dp, cid=cid)
        rank_records = rank_sent.get_rank_records(sid_score_list, sents=original_sents)
        n_sents = rank_sent.dump_rank_records(rank_records=rank_records,
                                              out_fp=join(rank_dp, cid),
                                              with_rank_idx=False)
        logger.info('Dumped {} ranking records'.format(n_sents))
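# How the ranking above is derived: np.argsort(rel_scores)[::-1] yields
# sentence ids sorted by relevance, highest first; each id is then prefixed
# with '0_' since the retrieved sentences are kept in a single flat list
# (presumably treated as one pseudo-document). A tiny worked example:
import numpy as np

_rel_scores = np.array([0.1, 0.7, 0.4])
_sent_ids = np.argsort(_rel_scores)[::-1].tolist()  # [1, 2, 0]
_sid_score_list = [('0_{}'.format(sid), _rel_scores[sid]) for sid in _sent_ids]
# [('0_1', 0.7), ('0_2', 0.4), ('0_0', 0.1)]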
def rank_end2end(model_name, diversity_param_tuple, component_name=None,
                 n_iter=None, rank_dp=None, retrieved_dp=None, rm_dialog=True,
                 cc_ids=None):
    """
    :param model_name: name of the model whose rankings are produced
    :param diversity_param_tuple: (diversity_weight, diversity_algorithm)
    :param component_name: if given, load summary components saved under this name instead of model_name
    :param n_iter: iteration index used when locating the summary components
    :param rank_dp: output directory for rankings; derived from the other params when None
    :param retrieved_dp: directory of retrieved sentences; if given, load sentences from it
    :param rm_dialog: only useful when retrieved_dp=None
    :param cc_ids: cluster ids to rank; defaults to the test set cluster ids
    :return:
    """
    dp_mode = 'r'
    dp_params = {
        'n_iter': n_iter,
        'mode': dp_mode,
    }
    diversity_weight, diversity_algorithm = diversity_param_tuple

    # TODO: double-check this condition; added later to avoid a bug for centrality-tfidf.
    # One model has only one suite of summary components but different ranking systems.
    if component_name:
        dp_params['model_name'] = component_name
    else:
        dp_params['model_name'] = model_name

    summ_comp_root = graph_io.get_summ_comp_root(**dp_params)
    sim_mat_dp = graph_io.get_sim_mat_dp(summ_comp_root, mode=dp_mode)
    rel_vec_dp = graph_io.get_rel_vec_dp(summ_comp_root, mode=dp_mode)
    sid2abs_dp = graph_io.get_sid2abs_dp(summ_comp_root, mode=dp_mode)
    sid2score_dp = graph_io.get_sid2score_dp(summ_comp_root, mode=dp_mode)

    if not rank_dp:
        rank_dp_params = {
            'model_name': model_name,
            'n_iter': n_iter,
            'diversity_param_tuple': diversity_param_tuple,
        }
        rank_dp = tools.get_rank_dp(**rank_dp_params)

    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    dps = {
        'sim_mat_dp': sim_mat_dp,
        'rel_vec_dp': rel_vec_dp,
        'sid2abs_dp': sid2abs_dp,
    }

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()

    for cid in tqdm(cc_ids):
        # logger.info('cid: {}'.format(cid))
        comp_params = {
            **dps,
            'cid': cid,
        }
        components = graph_io.load_components(**comp_params)
        # logger.info('[GRAPH RANK 1/2] successfully loaded components')
        sid2score = graph_io.load_sid2score(sid2score_dp, cid)

        if retrieved_dp:
            original_sents, _ = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
        else:
            if 'tdqfs' in config.test_year:
                original_sents, _ = dataset_parser.cid2sents_tdqfs(cid)
            else:
                original_sents, _ = dataset_parser.cid2sents(
                    cid, rm_dialog=rm_dialog)  # 2d lists, docs => sents

        diversity_params = {
            'sid2score': sid2score,
            'sid2abs': components['sid2abs'],
            'sim_mat': components['sim_mat'],
            'original_sents': original_sents,
        }

        if diversity_algorithm == 'wan':
            diversity_params['omega'] = diversity_weight
            rank_records = _rank_with_diversity_penalty_wan(**diversity_params)
        else:
            raise ValueError('Invalid diversity_algorithm: {}'.format(diversity_algorithm))

        logger.info('cid: {}, #rank_records: {}'.format(cid, len(rank_records)))
        rank_sent.dump_rank_records(rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)

    logger.info('[GRAPH RANK] Finished. Rankings were dumped to: {}'.format(rank_dp))
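# A hedged sketch of the diversity-penalty ranking that
# `_rank_with_diversity_penalty_wan` (defined elsewhere) is assumed to
# implement, in the spirit of Wan et al.'s penalty imposition: greedily pick
# the highest-scored sentence, then discount each remaining sentence's score
# in proportion to its similarity to the picked one, weighted by omega.
# Illustrative only; the real function also handles sid mapping and record
# construction.
import numpy as np


def _rank_with_diversity_penalty_sketch(scores, sim_mat, omega):
    scores = np.array(scores, dtype=np.float64)
    remaining = set(range(len(scores)))
    ranked = []
    while remaining:
        best = max(remaining, key=lambda i: scores[i])
        remaining.remove(best)
        ranked.append(best)
        for j in remaining:  # penalize sentences similar to the one just picked
            scores[j] -= omega * sim_mat[j, best] * scores[best]
    return ranked  # sentence indices in final rank order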
def __init__(self, cid, rank_fp, text_dp, cos_threshold, max_n_summary_words,
             rel_sents_dp=None, retrieved_dp=None, rm_dialog=True):
    """
    Before generating summaries, rank the sentences in a cluster and save
    the rankings (see model_exec.py).

    :param rm_dialog: only useful when retrieved_dp=None
    """
    self.cid = cid
    self.cos_threshold = cos_threshold
    self.word_tokenize = nltk.tokenize.word_tokenize

    # fps for rank and text
    self.rank_fp = rank_fp
    if not exists(self.rank_fp):
        raise ValueError('rank_fp does not exist: {}'.format(self.rank_fp))
    self.text_fp = join(text_dp, cid)  # for dumping summaries

    # 2|3-d list organized by: docs => paragraphs => sents
    if rel_sents_dp and retrieved_dp:
        raise ValueError('Specify only one of rel_sents_dp and retrieved_dp!')

    if rel_sents_dp:
        self.use_filter_sents = True
        rel_sents_fp = join(rel_sents_dp, cid)
        self.original_sents, self.processed_sents = dataset_parser.parse_rel_sents_file(
            rel_sents_fp)  # 1d sentence lists
    elif retrieved_dp:
        self.use_filter_sents = False
        self.original_sents, self.processed_sents = load_retrieved_sentences(
            retrieved_dp=retrieved_dp, cid=cid)
    else:
        self.use_filter_sents = False
        if 'tdqfs' in config.test_year:
            self.original_sents, self.processed_sents = dataset_parser.cid2sents_tdqfs(cid)
        else:
            self.original_sents, self.processed_sents = dataset_parser.cid2sents(
                cid, rm_dialog=rm_dialog)

    if max_n_summary_words:
        self.max_n_summary_words = max_n_summary_words
        logger.info('[Selector.__init__] max_nw for {}: {}'.format(
            cid, self.max_n_summary_words))

    self.summary_sent_words = []  # 2-d list organized by: sents => words
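# A hedged sketch of how selection presumably proceeds from this state (the
# actual logic lives in the rest of this class): walk the ranked sentences,
# skip any whose cosine similarity to an already-selected sentence exceeds
# cos_threshold, and stop once max_n_summary_words words are collected. The
# helper below is hypothetical; cos_sim stands in for whatever similarity
# function the class uses.
def _select_sketch(ranked_sents, cos_sim, cos_threshold, max_n_words):
    selected, n_words = [], 0
    for sent in ranked_sents:
        if any(cos_sim(sent, prev) > cos_threshold for prev in selected):
            continue  # too redundant with an already-selected sentence
        selected.append(sent)
        n_words += len(sent.split())
        if n_words >= max_n_words:
            break
    return selected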