Ejemplo n.º 1
0
def _passage_core(cid, query, narr, passage_size, stride):
    original_sents, processed_sents = dataset_parser.cid2sents(cid, max_ns_doc=None)  # 2d lists, docs => sents
    logger.info('#doc: {}'.format(len(original_sents)))

    # build sent_objs
    sent_objs = []  # organized by doc
    sent_idx = 0
    for doc_idx in range(len(original_sents)):
        sent_objs_doc = []
        for original_s, proc_s in zip(original_sents[doc_idx], processed_sents[doc_idx]):
            sid = config.SEP.join([cid, str(sent_idx)])
            so = SentObj(sid=sid, original_sent=original_s, proc_sent=proc_s)
            sent_objs_doc.append(so)
            sent_idx += 1

        sent_objs.append(sent_objs_doc)

    # build passage objs
    passage_objs = []
    for sent_objs_doc in sent_objs:
        start = 0
        # make sure the last sentence whose length < stride will be discarded
        while start + stride < len(sent_objs_doc):
            pid = config.SEP.join([cid, str(len(passage_objs))])

            target_sent_objs = sent_objs_doc[start:start+passage_size]
            po = PassageObj(pid=pid, query=query, narr=narr, sent_objs=target_sent_objs)
            passage_objs.append(po)

            start += stride

    return passage_objs
Ejemplo n.º 2
0
def _lexrank(cid):
    """
        Score every sentence of a cluster with LexRank and rank them.

        The LexRank model is initialised with the document-wise sentence
        lists, then asked to rank the flattened sentence list. Each score is
        keyed by "<doc index><SEP><sentence index within doc>".

    :param cid: cluster id handed to ``dataset_parser.cid2sents``
    :return: rank_records as produced by ``rank_sent.get_rank_records``
    """
    _, docs = dataset_parser.cid2sents(cid)  # 2d lists, docs => sents
    all_sents = list(itertools.chain.from_iterable(docs))  # 1d sent list

    ranker = LexRank(docs, stopwords=STOPWORDS['en'])
    scores = ranker.rank_sentences(all_sents,
                                   threshold=None,
                                   fast_power_method=True)

    # Sentence ids in the same order as the flattened sentence list, so the
    # position of an id is also the position of its score.
    sids = [
        config.SEP.join((str(doc_idx), str(sent_idx)))
        for doc_idx, doc in enumerate(docs)
        for sent_idx, _ in enumerate(doc)
    ]
    sid2score = {sid: scores[pos] for pos, sid in enumerate(sids)}

    ranked = rank_sent.sort_sid2score(sid2score)
    return rank_sent.get_rank_records(ranked,
                                      sents=docs,
                                      flat_sents=False)
Ejemplo n.º 3
0
def build_sim_items_e2e(cid,
                        query,
                        mask_intra,
                        max_ns_doc=None,
                        retrieved_dp=None,
                        sentence_rep='tfidf',
                        rm_dialog=True):
    """
        Load the sentences of a cluster and compute their TF-IDF similarity items.

    :param cid: cluster id
    :param query: query forwarded to ``_compute_sim_mat_tfidf``
    :param mask_intra: forwarded unchanged to ``_compute_sim_mat_tfidf``
    :param max_ns_doc: forwarded to ``dataset_parser.cid2sents``; unused when
        ``retrieved_dp`` is given
    :param retrieved_dp: if truthy, sentences are loaded with
        ``load_retrieved_sentences`` instead of the dataset parser
    :param sentence_rep: sentence representation; only 'tfidf' is supported
    :param rm_dialog: forwarded to ``dataset_parser.cid2sents``; unused when
        ``retrieved_dp`` is given
    :return: dict with keys 'doc_sim_mat', 'rel_scores', 'processed_sents'
        and 'original_sents'
    :raises ValueError: if ``sentence_rep`` is not 'tfidf'
    """
    # Validate before any loading, and with a real exception rather than an
    # assert so the check is not removed when Python runs with -O.
    if sentence_rep != 'tfidf':
        raise ValueError('Invalid sentence_rep: {}'.format(sentence_rep))

    if retrieved_dp:
        original_sents, processed_sents = load_retrieved_sentences(
            retrieved_dp=retrieved_dp, cid=cid)
    else:
        original_sents, processed_sents = dataset_parser.cid2sents(
            cid, rm_dialog=rm_dialog,
            max_ns_doc=max_ns_doc)  # 2d lists, docs => sents

    res = _compute_sim_mat_tfidf(processed_sents=processed_sents,
                                 query=query,
                                 mask_intra=mask_intra)

    sim_items = {
        'doc_sim_mat': res['doc_sim_mat'],
        'rel_scores': res['rel_scores'],
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }

    return sim_items
Ejemplo n.º 4
0
def build_sim_items_e2e_tfidf_with_lexrank(cid,
                                           query,
                                           max_ns_doc=None,
                                           retrieved_dp=None,
                                           rm_dialog=True):
    """
        Build similarity items with LexRank's TF-IDF similarity matrix.

        LexRank is initialised with the document-wise organized sentences to
        get true IDF. The query is appended as the last "sentence", so the
        last row of the matrix holds the query-to-sentence scores.

    :param cid: cluster id
    :param query: query appended after all cluster sentences
    :param max_ns_doc: forwarded to ``dataset_parser.cid2sents`` only
    :param retrieved_dp: if truthy, load sentences with
        ``load_retrieved_sentences`` instead of the dataset parser
    :param rm_dialog: forwarded to ``dataset_parser.cid2sents`` only
    :return: dict with keys 'doc_sim_mat', 'rel_scores', 'processed_sents'
        and 'original_sents'
    """
    if retrieved_dp:
        loaded = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
    elif 'tdqfs' in config.test_year:
        loaded = dataset_parser.cid2sents_tdqfs(cid)
    else:
        loaded = dataset_parser.cid2sents(cid,
                                          rm_dialog=rm_dialog,
                                          max_ns_doc=max_ns_doc)
    original_sents, processed_sents = loaded  # 2d lists, docs => sents

    lxr = LexRank(processed_sents, stopwords=STOPWORDS['en'])

    # Flatten to a 1d sentence list; work on a deep copy so that nothing done
    # with this list can touch the sentences returned to the caller.
    sents_and_query = copy.deepcopy(list(itertools.chain(*processed_sents)))
    sents_and_query.append(query)

    sim_mat = lxr.get_tfidf_similarity_matrix(sentences=sents_and_query)

    return {
        'doc_sim_mat': sim_mat[:-1, :-1],  # sentence-to-sentence block
        'rel_scores': sim_mat[-1, :-1],  # query row, without the query itself
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }
Ejemplo n.º 5
0
def build_rel_scores_tf(cid, query, max_ns_doc=None, retrieved_dp=None):
    """
        Load the sentences of a cluster and score them against the query.

    :param cid: cluster id
    :param query: query forwarded to ``_compute_rel_scores_tf``
    :param max_ns_doc: forwarded to ``dataset_parser.cid2sents``; unused when
        ``retrieved_dp`` is given
    :param retrieved_dp: if truthy, load sentences with
        ``load_retrieved_sentences`` instead of the dataset parser
    :return: dict with keys 'rel_scores', 'processed_sents', 'original_sents'
    """
    if not retrieved_dp:
        loaded = dataset_parser.cid2sents(cid, max_ns_doc=max_ns_doc)
    else:
        loaded = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
    original_sents, processed_sents = loaded  # 2d lists, docs => sents

    return {
        'rel_scores': _compute_rel_scores_tf(processed_sents, query),
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }
Ejemplo n.º 6
0
def rank_end2end(model_name,
                 diversity_param_tuple,
                 component_name=None,
                 n_iter=None,
                 rank_dp=None,
                 retrieved_dp=None,
                 rm_dialog=True,
                 cc_ids=None):
    """
        Rank the sentences of each cluster with a diversity penalty and dump
        one ranking file per cluster into ``rank_dp``.

    :param model_name: names the ranking system; also used to locate the
        summary components when ``component_name`` is not given
    :param diversity_param_tuple: ``(diversity_weight, diversity_algorithm)``;
        'wan' is the only supported algorithm
    :param component_name: if truthy, used instead of ``model_name`` to locate
        the summary components
    :param n_iter: forwarded to the path helpers
    :param rank_dp: output directory; derived with ``tools.get_rank_dp`` when
        falsy. It must not exist yet.
    :param retrieved_dp: if truthy, original sentences are loaded with
        ``load_retrieved_sentences``
    :param rm_dialog: only useful when retrieved_dp=None
    :param cc_ids: cluster ids to rank; defaults to ``tools.get_test_cc_ids()``
        when falsy
    :return: None
    :raises ValueError: if the diversity algorithm is not supported, or if
        ``rank_dp`` already exists
    """
    diversity_weight, diversity_algorithm = diversity_param_tuple

    # Fail fast: rejecting the algorithm only inside the per-cluster loop would
    # leave an empty rank_dp behind, and the next run would then be refused
    # because rank_dp exists.
    if diversity_algorithm != 'wan':
        raise ValueError('Invalid diversity_algorithm: {}'.format(diversity_algorithm))

    dp_mode = 'r'
    dp_params = {
        'n_iter': n_iter,
        'mode': dp_mode,
    }

    # todo: double check this condition; added later for avoiding bug for centrality-tfidf.
    # # one model has only one suit of summary components but different ranking sys
    dp_params['model_name'] = component_name if component_name else model_name

    summ_comp_root = graph_io.get_summ_comp_root(**dp_params)
    sim_mat_dp = graph_io.get_sim_mat_dp(summ_comp_root, mode=dp_mode)
    rel_vec_dp = graph_io.get_rel_vec_dp(summ_comp_root, mode=dp_mode)
    sid2abs_dp = graph_io.get_sid2abs_dp(summ_comp_root, mode=dp_mode)
    sid2score_dp = graph_io.get_sid2score_dp(summ_comp_root, mode=dp_mode)

    if not rank_dp:
        rank_dp_params = {
            'model_name': model_name,
            'n_iter': n_iter,
            'diversity_param_tuple': diversity_param_tuple,
        }

        rank_dp = tools.get_rank_dp(**rank_dp_params)

    # Never overwrite the rankings of an earlier run.
    if exists(rank_dp):
        raise ValueError('rank_dp exists: {}'.format(rank_dp))
    os.mkdir(rank_dp)

    dps = {
        'sim_mat_dp': sim_mat_dp,
        'rel_vec_dp': rel_vec_dp,
        'sid2abs_dp': sid2abs_dp,
    }

    if not cc_ids:
        cc_ids = tools.get_test_cc_ids()

    for cid in tqdm(cc_ids):
        comp_params = {
            **dps,
            'cid': cid,
        }
        components = graph_io.load_components(**comp_params)
        sid2score = graph_io.load_sid2score(sid2score_dp, cid)

        if retrieved_dp:
            original_sents, _ = load_retrieved_sentences(retrieved_dp=retrieved_dp, cid=cid)
        elif 'tdqfs' in config.test_year:
            original_sents, _ = dataset_parser.cid2sents_tdqfs(cid)
        else:
            original_sents, _ = dataset_parser.cid2sents(cid, rm_dialog=rm_dialog)  # 2d lists, docs => sents

        diversity_params = {
            'sid2score': sid2score,
            'sid2abs': components['sid2abs'],
            'sim_mat': components['sim_mat'],
            'original_sents': original_sents,
            'omega': diversity_weight,
        }

        # 'wan' is guaranteed here by the check at the top of the function.
        rank_records = _rank_with_diversity_penalty_wan(**diversity_params)

        logger.info('cid: {}, #rank_records: {}'.format(cid, len(rank_records)))
        rank_sent.dump_rank_records(rank_records, out_fp=join(rank_dp, cid), with_rank_idx=False)

    logger.info('[GRAPH RANK] Finished. Rankings were dumped to: {}'.format(rank_dp))
Ejemplo n.º 7
0
    def __init__(self,
                 cid,
                 rank_fp,
                 text_dp,
                 cos_threshold,
                 max_n_summary_words,
                 rel_sents_dp=None,
                 retrieved_dp=None,
                 rm_dialog=True):
        """
            Set up sentence selection for one cluster.

            Before generating summaries, the sentences of the cluster must
            already have been ranked and the rankings saved
            (see model_exec.py); ``rank_fp`` points at that saved ranking.

        :param cid: cluster id; also the file name used under ``text_dp`` and
            ``rel_sents_dp``
        :param rank_fp: path of the saved ranking; must exist
        :param text_dp: directory under which the summary of this cluster is
            dumped (as ``text_dp/cid``)
        :param cos_threshold: stored on the instance as ``self.cos_threshold``
        :param max_n_summary_words: stored on the instance only when truthy
        :param rel_sents_dp: if truthy, sentences are read from
            ``rel_sents_dp/cid``; mutually exclusive with ``retrieved_dp``
        :param retrieved_dp: if truthy, sentences are loaded with
            ``load_retrieved_sentences``; mutually exclusive with
            ``rel_sents_dp``
        :param rm_dialog: only used when neither ``rel_sents_dp`` nor
            ``retrieved_dp`` is given and the test year is not a 'tdqfs' one
        :raises ValueError: if ``rank_fp`` does not exist, or if both
            ``rel_sents_dp`` and ``retrieved_dp`` are given
        """
        self.cid = cid
        self.cos_threshold = cos_threshold

        self.word_tokenize = nltk.tokenize.word_tokenize

        # fps for rank and text
        self.rank_fp = rank_fp
        if not exists(self.rank_fp):
            raise ValueError('rank_fp does not exist: {}'.format(self.rank_fp))

        self.text_fp = join(text_dp, cid)  # for dumping summaries

        # 2|3-d list organized by: docs => paragraphs => sents
        # The two alternative sentence sources cannot be combined.
        if rel_sents_dp and retrieved_dp:
            raise ValueError(
                'Specify only one of rel_sents_dp and retrieved_dp!')

        if rel_sents_dp:
            # Source 1: a relevant-sentences file for this cluster.
            self.use_filter_sents = True
            rel_sents_fp = join(rel_sents_dp, cid)
            self.original_sents, self.processed_sents = dataset_parser.parse_rel_sents_file(
                rel_sents_fp)  # 1d sentence lists

        elif retrieved_dp:
            # Source 2: previously retrieved sentences.
            self.use_filter_sents = False
            self.original_sents, self.processed_sents = load_retrieved_sentences(
                retrieved_dp=retrieved_dp, cid=cid)

        else:
            # Source 3: the dataset parser; the parser depends on the test year.
            self.use_filter_sents = False

            if 'tdqfs' in config.test_year:
                self.original_sents, self.processed_sents = dataset_parser.cid2sents_tdqfs(
                    cid)
            else:
                self.original_sents, self.processed_sents = dataset_parser.cid2sents(
                    cid, rm_dialog=rm_dialog)

        # NOTE(review): when max_n_summary_words is falsy the attribute is not
        # assigned here, yet it is read by the log call below. Presumably the
        # class defines a default elsewhere -- confirm, otherwise this raises
        # AttributeError.
        if max_n_summary_words:
            self.max_n_summary_words = max_n_summary_words

        logger.info('[Selector.__init__] max_nw for {}: {}'.format(
            cid, self.max_n_summary_words))

        self.summary_sent_words = []  # 2-d list organized by: sents => words