Ejemplo n.º 1
0
def cal_rouge_gain(src_txt, tgt_txt, c_idxs, rouge2_ratio=0.5, do_norm=True):
    '''
    Score how much the blended ROUGE F value changes when each not-yet-selected
    source sentence is added to the current selection.

    src_txt: iterable of source sentences (strings, split on whitespace)
    tgt_txt: reference summary string; "<q>" markers are replaced by spaces
    c_idxs: indices of the sentences already selected; entries <= -1 are
        ignored when building the context. None or empty means "no context".
    rouge2_ratio: weight of ROUGE-2 F; ROUGE-1 F is weighted (1 - rouge2_ratio)
    do_norm: if True and the gains are not all equal, min-max rescale them

    Returns a numpy array with one gain per source sentence. Sentences whose
    index is contained in c_idxs are not scored and stay at -inf.
    '''
    tokenized = [sent.split() for sent in src_txt]
    reference = tgt_txt.replace("<q>", " ").split()
    sent_unigrams = [_get_word_ngrams(1, [toks], do_count=True) for toks in tokenized]
    ref_unigrams = _get_word_ngrams(1, [reference], do_count=True)
    sent_bigrams = [_get_word_ngrams(2, [toks], do_count=True) for toks in tokenized]
    ref_bigrams = _get_word_ngrams(2, [reference], do_count=True)

    def _blended_f(unigrams, bigrams):
        # Weighted mix of ROUGE-1 F and ROUGE-2 F against the reference.
        f_1 = cal_rouge(unigrams, ref_unigrams)['f']
        f_2 = cal_rouge(bigrams, ref_bigrams)['f']
        return f_1 * (1-rouge2_ratio) + f_2 * rouge2_ratio

    if c_idxs is not None and len(c_idxs) != 0:
        # -1 entries are skipped (padding ids, judging by the filter).
        context_1 = union_dic([sent_unigrams[idx] for idx in c_idxs if idx > -1])
        context_2 = union_dic([sent_bigrams[idx] for idx in c_idxs if idx > -1])
        base_score = _blended_f(context_1, context_2)
    else:
        # Nothing selected yet: empty n-gram counts and a zero baseline.
        base_score = 0.0
        context_1 = coll.defaultdict(int)
        context_2 = coll.defaultdict(int)

    lowest = float('inf')
    highest = float('-inf')
    gains = [float('-inf')] * len(tokenized)
    for pos, (uni, bi) in enumerate(zip(sent_unigrams, sent_bigrams)):
        if c_idxs is not None and pos in c_idxs:
            continue
        merged_1 = union_dic([context_1, uni])
        merged_2 = union_dic([context_2, bi])
        gain = _blended_f(merged_1, merged_2) - base_score
        gains[pos] = gain
        highest = max(highest, gain)
        lowest = min(lowest, gain)

    spread = highest - lowest
    gains = np.asarray(gains)
    # Rescale to [0, 1] only when there is a positive spread to divide by.
    if do_norm and spread > 0:
        gains = (gains - lowest) / spread

    return gains
Ejemplo n.º 2
0
def combination_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Exhaustively search sentence combinations for the best ROUGE-1 F + ROUGE-2 F.

    Combinations of size 1 up to summary_size + 1 are scored against the
    concatenated abstract. A sentence whose single-sentence score is exactly 0
    is left out of all larger combinations.

    Args:
        doc_sent_list: source sentences, each a list of tokens.
        abstract_sent_list: reference sentences, each a list of tokens.
        summary_size: the largest combination tried has summary_size + 1 members.

    Returns:
        Sorted list of the indices of the best-scoring combination. If no
        combination scores above 0.0 the initial value (0, 0) is returned
        as [0, 0].
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    flat_abstract = sum(abstract_sent_list, [])
    reference = _rouge_clean(' '.join(flat_abstract)).split()
    sents = [_rouge_clean(' '.join(tokens)).split() for tokens in doc_sent_list]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference])

    best_score = 0.0
    best_combo = (0, 0)
    zero_score_sents = []
    for size in range(1, summary_size + 2):
        # The pool is fixed before this size is explored, so sentences ruled
        # out while scoring singletons only affect the larger sizes.
        pool = [i for i in range(len(sents)) if i not in zero_score_sents]
        for combo in itertools.combinations(pool, size):
            unigrams = set.union(*(set(evaluated_1grams[i]) for i in combo))
            bigrams = set.union(*(set(evaluated_2grams[i]) for i in combo))
            f_1 = cal_rouge(unigrams, reference_1grams)['f']
            f_2 = cal_rouge(bigrams, reference_2grams)['f']

            score = f_1 + f_2
            if size == 1 and score == 0:
                zero_score_sents.append(combo[0])
            if score > best_score:
                best_combo = combo
                best_score = score
    return sorted(best_combo)
Ejemplo n.º 3
0
def cal_rouge_doc(src_txt, tgt_txt, c_idxs, c_masks=None):
    """Return ROUGE-1 F + ROUGE-2 F of the selected sentences against the target.

    src_txt: indexable collection of source sentence strings
    tgt_txt: reference summary string; "<q>" markers are replaced by spaces
    c_idxs: indices of the selected sentences
    c_masks: optional per-position flags for c_idxs; when given, position i of
        c_idxs is used iff c_masks[i] is truthy. When omitted, every index
        greater than -1 is used (-1 is the padding id).
    """
    reference = tgt_txt.replace("<q>", " ").split()
    ref_unigrams = _get_word_ngrams(1, [reference], do_count=True)
    ref_bigrams = _get_word_ngrams(2, [reference], do_count=True)

    # Two ways to drop padded selections: by value or by explicit mask.
    if c_masks is not None:
        picked = [src_txt[idx] for pos, idx in enumerate(c_idxs) if c_masks[pos]]
    else:
        picked = [src_txt[idx] for idx in c_idxs if idx > -1]
    tokenized = [text.split() for text in picked]

    unigram_counts = [_get_word_ngrams(1, [toks], do_count=True) for toks in tokenized]
    bigram_counts = [_get_word_ngrams(2, [toks], do_count=True) for toks in tokenized]
    merged_unigrams = union_dic(unigram_counts)
    merged_bigrams = union_dic(bigram_counts)

    scores_1 = cal_rouge(merged_unigrams, ref_unigrams)
    scores_2 = cal_rouge(merged_bigrams, ref_bigrams)
    return scores_1['f'] + scores_2['f']
Ejemplo n.º 4
0
def presumm_reward3(doc_sent_list, abstract_sent_list, mode='f'):
    """Return one reward per source sentence based on consecutive ROUGE deltas.

    For sentence i the reward is
        (ROUGE-1 + ROUGE-2 of sentence i) - (same sum for sentence i - 1) + 1
    with the "previous" sum taken as 0 for the first sentence.

    Args:
        doc_sent_list: source sentences, each a list of tokens.
        abstract_sent_list: reference sentences, each a list of tokens.
        mode: key read from the dict returned by cal_rouge (default 'f').

    Returns:
        List of rewards, one per source sentence.
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    flat_abstract = sum(abstract_sent_list, [])
    reference = _rouge_clean(' '.join(flat_abstract)).split()
    sents = [_rouge_clean(' '.join(tokens)).split() for tokens in doc_sent_list]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference])

    # NOTE(review): no running selection is accumulated here, so every step
    # scores sentence i on its own rather than a growing summary - confirm
    # that this is the intended reward shape.
    rewards = []
    previous_total = 0
    for unigrams, bigrams in zip(evaluated_1grams, evaluated_2grams):
        score_1 = cal_rouge(set(unigrams), reference_1grams)[mode]
        score_2 = cal_rouge(set(bigrams), reference_2grams)[mode]
        delta = score_1 + score_2 - previous_total
        rewards.append(delta + 1)
        previous_total = score_1 + score_2

    return rewards
Ejemplo n.º 5
0
def get_label_orders(batch_src_sents, batch_tgt_str,
                     batch_labels):  #, temperature=20.0):
    '''Order the labelled sentences of every example by greedy ROUGE gain.

       batch_src_sents: batch of lists of source sentence strings
       batch_tgt_str: batch of ground truth summary strings ("<q>" separated)
       batch_labels: batch of labels such as [0,0,0,1,0]; entries equal to 1
           mark the ground truth sentences
       return: batch of source indices, e.g. [[2,1,0],[6,9]] (no padding),
           where each next index is the labelled sentence that maximises
           ROUGE-1 F + ROUGE-2 F together with the ones already ordered
    '''
    batch_orders = []
    for b, src_txt in enumerate(batch_src_sents):
        labels = batch_labels[b]
        tgt_txt = batch_tgt_str[b]
        labelled = [pos for pos in range(len(labels)) if labels[pos] == 1]

        tokenized = [src_txt[pos].split() for pos in labelled]
        reference = tgt_txt.replace("<q>", " ").split()
        sent_unigrams = [
            _get_word_ngrams(1, [toks], do_count=True) for toks in tokenized
        ]
        ref_unigrams = _get_word_ngrams(1, [reference], do_count=True)
        sent_bigrams = [
            _get_word_ngrams(2, [toks], do_count=True) for toks in tokenized
        ]
        ref_bigrams = _get_word_ngrams(2, [reference], do_count=True)

        # Positions below index into `labelled`, not into the source document.
        remaining = list(range(len(labelled)))
        ordered = []
        while remaining:
            # Start from -inf so that some sentence is always picked, even
            # when every candidate scores 0.
            round_best = float('-inf')
            round_pick = -1
            for cand in remaining:
                trial = ordered + [cand]
                merged_1 = union_dic([sent_unigrams[j] for j in trial])
                merged_2 = union_dic([sent_bigrams[j] for j in trial])
                f_1 = cal_rouge(merged_1, ref_unigrams)['f']
                f_2 = cal_rouge(merged_2, ref_bigrams)['f']
                total = f_1 + f_2
                if total > round_best:
                    round_best = total
                    round_pick = cand
            assert round_pick > -1
            remaining.remove(round_pick)
            ordered.append(round_pick)

        batch_orders.append([labelled[j] for j in ordered])
    return batch_orders
Ejemplo n.º 6
0
def greedy_selection_given_context(src_txt, tgt_txt, c_idxs, summary_size=3):
    """Greedily extend an existing selection up to summary_size sentences.

    src_txt: iterable of source sentence strings
    tgt_txt: reference summary string; "<q>" markers are replaced by spaces
    c_idxs: already selected indices (entries <= -1 are ignored), or None
    summary_size: total size of context plus newly added sentences

    Each round adds the sentence whose ROUGE-1 F + ROUGE-2 F, computed together
    with everything selected so far, is strictly higher than the best score of
    the previous round (0.0 for the first round). The search stops early when
    no sentence achieves that.

    Returns only the newly added indices, in the order they were picked.
    """
    tokenized = [sent.split() for sent in src_txt]
    reference = tgt_txt.replace("<q>", " ").split()
    sent_unigrams = [
        _get_word_ngrams(1, [toks], do_count=True) for toks in tokenized
    ]
    ref_unigrams = _get_word_ngrams(1, [reference], do_count=True)
    sent_bigrams = [
        _get_word_ngrams(2, [toks], do_count=True) for toks in tokenized
    ]
    ref_bigrams = _get_word_ngrams(2, [reference], do_count=True)

    chosen = []
    if c_idxs is not None:
        chosen = [cid for cid in c_idxs if cid > -1]
    n_context = len(chosen)

    best_so_far = 0.0
    for _ in range(summary_size - n_context):
        round_best = best_so_far
        round_pick = -1
        for cand in range(len(tokenized)):
            if cand in chosen:
                continue
            trial = chosen + [cand]
            merged_1 = union_dic([sent_unigrams[j] for j in trial])
            merged_2 = union_dic([sent_bigrams[j] for j in trial])
            f_1 = cal_rouge(merged_1, ref_unigrams)['f']
            f_2 = cal_rouge(merged_2, ref_bigrams)['f']
            total = f_1 + f_2
            if total > round_best:
                round_best = total
                round_pick = cand
        if round_pick == -1:
            # No remaining sentence improves the score.
            break
        chosen.append(round_pick)
        best_so_far = round_best

    return chosen[n_context:]
Ejemplo n.º 7
0
def cal_rouge_score(pred_list, truth_list):
    """Compute ROUGE-1 and ROUGE-2 results for paired predictions and targets.

    pred_list: predicted summary strings ("<q>" markers become spaces)
    truth_list: reference summary strings, paired positionally with pred_list

    Returns (rouge1_results, rouge2_results): two lists holding, per pair, the
    value returned by cal_rouge for unigrams and bigrams respectively.
    """
    def _count_uni_bi(text):
        # Whitespace tokens of the text -> (unigram counts, bigram counts).
        tokens = text.replace("<q>", " ").split()
        return (_get_word_ngrams(1, [tokens], do_count=True),
                _get_word_ngrams(2, [tokens], do_count=True))

    rouge1_results = []
    rouge2_results = []
    for pred_txt, tgt_txt in zip(pred_list, truth_list):
        ref_1, ref_2 = _count_uni_bi(tgt_txt)
        cand_1, cand_2 = _count_uni_bi(pred_txt)
        rouge1_results.append(cal_rouge(cand_1, ref_1))
        rouge2_results.append(cal_rouge(cand_2, ref_2))
    return rouge1_results, rouge2_results
Ejemplo n.º 8
0
def each_selection(doc_sent_list, abstract_sent_list):
    """Select, for every reference unit, the source sentence that matches it best.

    Each reference unit is compared with every source sentence using
    ROUGE-1 F + ROUGE-2 F; the first source sentence with the highest score is
    selected for that unit. A source sentence selected for several units is
    reported once.

    Args:
        doc_sent_list: source sentences, each a list of tokens.
        abstract_sent_list: nested list that is flattened by one level; every
            element of the flattened list is treated as one reference unit.

    Returns:
        Sorted list of the selected source sentence indices (empty when there
        are no source sentences or no reference units).
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    # NOTE(review): if abstract_sent_list is a list of token lists (as in the
    # sibling selection helpers), flattening yields single tokens and the join
    # below splits them into characters - confirm the expected nesting depth.
    abstract = sum(abstract_sent_list, [])
    abstracts = [_rouge_clean(' '.join(s)).split() for s in abstract]
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]

    # The candidate sets do not depend on the reference unit: build them once.
    evaluated_1grams = [set(_get_word_ngrams(1, [sent])) for sent in sents]
    reference_1grams = [_get_word_ngrams(1, [sent]) for sent in abstracts]
    evaluated_2grams = [set(_get_word_ngrams(2, [sent])) for sent in sents]
    reference_2grams = [_get_word_ngrams(2, [sent]) for sent in abstracts]

    selected = []
    for ref_1grams, ref_2grams in zip(reference_1grams, reference_2grams):
        reference_1 = set(ref_1grams)
        reference_2 = set(ref_2grams)

        # scores[k] must stay aligned with sentence k. The previous
        # `if _sent in rouge_score: continue` compared a sentence index with
        # float scores (1 == 1.0 is True), which could skip a sentence and
        # shift every later position.
        scores = []
        for candidates_1, candidates_2 in zip(evaluated_1grams, evaluated_2grams):
            rouge_1 = cal_rouge(candidates_1, reference_1)['f']
            rouge_2 = cal_rouge(candidates_2, reference_2)['f']
            scores.append(rouge_1 + rouge_2)

        if scores:
            best_index = scores.index(max(scores))
            if best_index not in selected:
                selected.append(best_index)
    return sorted(selected)
Ejemplo n.º 9
0
def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """
    Greedily build an extractive "gold" selection for a hand-written abstract.

    In each of at most summary_size rounds, every unused document sentence is
    tried together with the sentences chosen so far, and the one giving the
    highest ROUGE-1 F + ROUGE-2 F against the abstract is kept, provided it
    strictly improves on the previous round's score.

    Input: document sentences (lists of tokens), abstract sentences (lists of
    tokens), maximum number of sentences to select.
    Output: list of selected sentence indices. It is sorted when all
    summary_size rounds complete; when the search stops early because no
    sentence improves the score, the indices are returned in pick order.
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    flat_abstract = sum(abstract_sent_list, [])
    reference = _rouge_clean(' '.join(flat_abstract)).split()
    sents = [_rouge_clean(' '.join(tokens)).split() for tokens in doc_sent_list]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference])

    chosen = []
    best_so_far = 0.0
    for _ in range(summary_size):
        round_best, round_pick = best_so_far, -1
        for idx in range(len(sents)):
            if idx in chosen:
                continue  # a sentence is never selected twice
            trial = chosen + [idx]
            unigrams = set.union(*(set(evaluated_1grams[j]) for j in trial))
            bigrams = set.union(*(set(evaluated_2grams[j]) for j in trial))
            f_1 = cal_rouge(unigrams, reference_1grams)['f']
            f_2 = cal_rouge(bigrams, reference_2grams)['f']
            total = f_1 + f_2
            if total > round_best:
                round_best, round_pick = total, idx
        if round_pick == -1:
            return chosen
        chosen.append(round_pick)
        best_so_far = round_best

    return sorted(chosen)
Ejemplo n.º 10
0
def greedy_selection2(doc_sent_list, abstract_sent_list, summary_size):
    """Greedy ROUGE-based sentence selection for Japanese text.

    Args:
        doc_sent_list: the source document as one string; it is split into
            sentences on the full stop character.
        abstract_sent_list: the reference summary as one string.
        summary_size: maximum number of sentences to select.

    Returns:
        List of selected sentence indices: sorted when all summary_size rounds
        complete, in pick order when the search stops early because no
        sentence improves ROUGE-1 F + ROUGE-2 F.
    """
    max_rouge = 0.0
    # Japanese text case: split on the full stop and put it back on each piece.
    doc_sent_list = doc_sent_list.split('。')
    doc_sent_list = [a + '。' for a in doc_sent_list]
    # The piece after the last full stop is dropped. NOTE(review): when the
    # document does not end with a full stop this discards real text - confirm
    # that inputs are always terminated.
    sents = doc_sent_list[:-1]
    # `tokenizer` comes from the enclosing module; presumably it returns a list
    # of tokens - verify against its definition.
    abstract = tokenizer.tokenize(abstract_sent_list)
    sents = [tokenizer.tokenize(a) for a in sents]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]

    reference_1grams = _get_word_ngrams(1, [abstract])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]

    reference_2grams = _get_word_ngrams(2, [abstract])

    selected = []
    for _ in range(summary_size):
        cur_max_rouge = max_rouge
        cur_id = -1
        for i in range(len(sents)):
            if i in selected:
                continue
            # Score the candidate together with everything selected so far.
            c = selected + [i]
            candidates_1 = [evaluated_1grams[idx] for idx in c]
            candidates_1 = set.union(*map(set, candidates_1))
            candidates_2 = [evaluated_2grams[idx] for idx in c]
            candidates_2 = set.union(*map(set, candidates_2))
            rouge_1 = cal_rouge(candidates_1, reference_1grams)['f']
            rouge_2 = cal_rouge(candidates_2, reference_2grams)['f']
            rouge_score = rouge_1 + rouge_2
            if rouge_score > cur_max_rouge:
                cur_max_rouge = rouge_score
                cur_id = i
        if cur_id == -1:
            # No remaining sentence improves the score.
            return selected
        selected.append(cur_id)
        max_rouge = cur_max_rouge
        # The per-round debug print of type(sorted(selected)) was removed: it
        # carried no information and wrote to stdout on every round.
    return sorted(selected)
Ejemplo n.º 11
0
def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Greedily pick sentences maximising ROUGE-1 F + ROUGE-2 F with the abstract.

    Args:
        doc_sent_list: source sentences, each a list of tokens.
        abstract_sent_list: reference sentences, each a list of tokens.
        summary_size: maximum number of sentences to pick.

    Returns:
        The picked sentence indices in the order they were picked (not
        sorted). Fewer than summary_size indices are returned when no
        remaining sentence strictly improves the score.
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    reference = sum(abstract_sent_list, [])
    reference = _rouge_clean(' '.join(reference)).split()
    sents = [_rouge_clean(' '.join(tokens)).split() for tokens in doc_sent_list]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference])

    picked = []
    score_to_beat = 0.0
    # Every pass either adds exactly one sentence or ends the search.
    while len(picked) < summary_size:
        winner = -1
        winner_score = score_to_beat
        unused = [i for i in range(len(sents)) if i not in picked]
        for cand in unused:
            group = picked + [cand]
            unigram_sets = [set(evaluated_1grams[j]) for j in group]
            unigrams = set.union(*unigram_sets)
            bigram_sets = [set(evaluated_2grams[j]) for j in group]
            bigrams = set.union(*bigram_sets)
            f_1 = cal_rouge(unigrams, reference_1grams)['f']
            f_2 = cal_rouge(bigrams, reference_2grams)['f']
            if f_1 + f_2 > winner_score:
                winner_score = f_1 + f_2
                winner = cand
        if winner == -1:
            break
        picked.append(winner)
        score_to_beat = winner_score

    return picked
Ejemplo n.º 12
0
def sentence_rouge(doc_sent_list, abstract_sent_list):
    """Score every source sentence on its own against the whole abstract.

    Args:
        doc_sent_list: source sentences, each a list of tokens.
        abstract_sent_list: reference sentences, each a list of tokens.

    Returns:
        One dict per source sentence, {'rouge_1': ..., 'rouge_2': ...}, holding
        the values returned by cal_rouge for the sentence's unigrams and
        bigrams.
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    flat_abstract = sum(abstract_sent_list, [])
    reference = _rouge_clean(' '.join(flat_abstract)).split()
    sents = [_rouge_clean(' '.join(tokens)).split() for tokens in doc_sent_list]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference])

    results = []
    for unigrams, bigrams in zip(evaluated_1grams, evaluated_2grams):
        scores_1 = cal_rouge(set(unigrams), reference_1grams)
        scores_2 = cal_rouge(set(bigrams), reference_2grams)
        results.append({'rouge_1': scores_1, 'rouge_2': scores_2})
    return results
Ejemplo n.º 13
0
def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Greedy oracle selection by ROUGE-1 F + ROUGE-2 F.

    Args:
        doc_sent_list: source sentences, each a list of tokens.
        abstract_sent_list: reference sentences, each a list of tokens.
        summary_size: maximum number of sentences to select.

    Returns:
        Selected sentence indices. The list is sorted when summary_size rounds
        complete; if a round finds no sentence that strictly improves the
        score, the indices gathered so far are returned in pick order.
    """
    def _rouge_clean(s):
        return re.sub(r"[^a-zA-Z0-9 ]", "", s)

    reference = _rouge_clean(" ".join(sum(abstract_sent_list, []))).split()
    sents = [_rouge_clean(" ".join(tokens)).split() for tokens in doc_sent_list]
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [reference])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [reference])

    def _combined_f(indices):
        # ROUGE-1 F + ROUGE-2 F of the union of the given sentences.
        unigrams = set.union(*map(set, [evaluated_1grams[j] for j in indices]))
        bigrams = set.union(*map(set, [evaluated_2grams[j] for j in indices]))
        f_1 = cal_rouge(unigrams, reference_1grams)["f"]
        f_2 = cal_rouge(bigrams, reference_2grams)["f"]
        return f_1 + f_2

    selected = []
    threshold = 0.0
    for _ in range(summary_size):
        best_id = -1
        best_score = threshold
        for cand in range(len(sents)):
            if cand in selected:
                continue
            cand_score = _combined_f(selected + [cand])
            if cand_score > best_score:
                best_score = cand_score
                best_id = cand
        if best_id == -1:
            return selected
        selected.append(best_id)
        threshold = best_score

    return sorted(selected)
Ejemplo n.º 14
0
def presumm_reward4(doc_sent_list, abstract_sent_list, n_sents=10):
    """Return a per-sentence reward equal to its ROUGE-2 F against the abstract.

    Only the first n_sents source sentences are scored; every later sentence
    gets a reward of 0.

    Args:
        doc_sent_list: source sentences, each a list of tokens.
        abstract_sent_list: reference sentences, each a list of tokens.
        n_sents: number of leading sentences that receive a ROUGE-2 reward.

    Returns:
        List with one reward per source sentence.
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    abstract = sum(abstract_sent_list, [])
    abstract = _rouge_clean(' '.join(abstract)).split()
    reference_2grams = _get_word_ngrams(2, [abstract])

    # Unigram sets and bigrams of sentences past n_sents were computed before
    # but never read, so that work is skipped here (assumes
    # `_get_word_ngrams` has no side effects - TODO confirm).
    rwds = []
    for i, tokens in enumerate(doc_sent_list):
        if i < n_sents:
            sent = _rouge_clean(' '.join(tokens)).split()
            bigrams = _get_word_ngrams(2, [sent])
            # The ROUGE-1 term is deliberately fixed at 0.
            rwds.append(0 + cal_rouge(bigrams, reference_2grams)['f'])
        else:
            rwds.append(0)

    return rwds
Ejemplo n.º 15
0
def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """ Create pseudo extraction labels.

    Sentences are added one at a time: in every round the sentence that gives
    the highest ROUGE-1 F + ROUGE-2 F together with the already selected ones
    is kept, as long as it strictly improves on the previous round's score.

    Args:
        doc_sent_list (list[list[str]]):
            source text to be processed.

        abstract_sent_list (list[list[str]]):
            target text  to be processed.

        summary_size(int) :
            maximum number of extracted sentences.

    Returns:
        A list of extracted sentence indices in ascending order. It holds
        fewer than summary_size indices when the search stops early.
    """
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    # Clean and concat all target sentences
    abstract = sum(abstract_sent_list, [])
    abstract = _rouge_clean(' '.join(abstract)).split()

    # Clean all source sentences
    sents = [_rouge_clean(' '.join(s)).split() for s in doc_sent_list]

    # Get 1 grams and 2 grams from source and target
    evaluated_1grams = [_get_word_ngrams(1, [sent]) for sent in sents]
    reference_1grams = _get_word_ngrams(1, [abstract])
    evaluated_2grams = [_get_word_ngrams(2, [sent]) for sent in sents]
    reference_2grams = _get_word_ngrams(2, [abstract])

    max_rouge = 0.0
    selected = []
    for _ in range(summary_size):
        cur_max_rouge = max_rouge
        cur_id = -1

        # Iterates through all sentences
        for i in range(len(sents)):
            if i in selected:
                continue

            # Consider selected and candidate sentences together
            c = selected + [i]

            # Calculate ROUGE-1-F + ROUGE-2-F with target
            candidates_1 = set.union(*(set(evaluated_1grams[idx]) for idx in c))
            candidates_2 = set.union(*(set(evaluated_2grams[idx]) for idx in c))
            rouge_1 = cal_rouge(candidates_1, reference_1grams)['f']
            rouge_2 = cal_rouge(candidates_2, reference_2grams)['f']
            rouge_score = rouge_1 + rouge_2
            if rouge_score > cur_max_rouge:
                cur_max_rouge = rouge_score
                cur_id = i

        # If no sentence exceeds current max score then stop
        if cur_id == -1:
            break

        # Record currently chosen sentence and score
        selected.append(cur_id)
        max_rouge = cur_max_rouge

    # Sort on every exit path: the early stop used to return the indices in
    # pick order, contradicting the documented ascending order.
    return sorted(selected)