def new_source(SOURCE1_NAME, SOURCE2_NAME, source1, source2, method):
    """Reset result state and record metadata for a new pair of sources.

    Stores the source names, the summarization method name, and the sizes
    (sentence and word counts) of both inputs into the module-level
    ``json_results`` structure.
    """
    reset(method)
    meta = json_results['meta']
    meta['source'] = (SOURCE1_NAME, SOURCE2_NAME)
    meta['method'] = method
    meta['size'] = {
        'source': {
            'sentences': [len(source1), len(source2)],
            'words': [word_count(source1), word_count(source2)],
        }
    }
# --- Esempio n. 2 (scraped example separator; vote count: 0) ---
def MakeContrastiveSummary(source1, source2, stats_source_1, stats_source_2,
                           LIM_WORDS_1, LIM_WORDS_2):
    """Build a pair of contrastive summaries by greedily consuming a ranking.

    Sentence indices from each source are taken in the order produced by the
    ranking strategy selected through the module-level METHOD flag ('C', 'R'
    or 'D').  A candidate index is kept only while the summary stays within
    its word budget (LIM_WORDS_1 / LIM_WORDS_2), and each summary is capped
    at LIM_SENTENCES sentences.

    Returns:
        (idx_summ_1, idx_summ_2): selected sentence indices for each source.

    Raises:
        ValueError: if METHOD is not 'C', 'R' or 'D'.  (The original code
        left rank_1/rank_2 unbound in that case and crashed later with a
        NameError.)
    """
    # NOTE(review): these shuffled candidate lists are never read below; they
    # are kept because random.shuffle advances the global RNG state, which the
    # rest of the pipeline may rely on for run-to-run reproducibility.
    candidate_options_1 = sorted(list(source1.keys()))
    candidate_options_2 = sorted(list(source2.keys()))

    random.shuffle(candidate_options_1)
    random.shuffle(candidate_options_2)

    # elif chain: exactly one strategy runs, and an invalid METHOD fails fast.
    if METHOD == 'C':
        rank_1, rank_2 = method.rank_C(stats_source_1, stats_source_2)
    elif METHOD == 'R':
        rank_1, rank_2 = method.rank_R(stats_source_1, stats_source_2)
    elif METHOD == 'D':
        rank_1, rank_2 = method.rank_D(stats_source_1, stats_source_2)
    else:
        raise ValueError(
            "Unknown METHOD: %r (expected 'C', 'R' or 'D')" % (METHOD,))

    # Both sides use the same greedy fill; the duplicated loops are factored
    # into one helper.
    idx_summ_1 = _greedy_fill_rank(rank_1, source1, LIM_WORDS_1)
    idx_summ_2 = _greedy_fill_rank(rank_2, source2, LIM_WORDS_2)

    return idx_summ_1, idx_summ_2


def _greedy_fill_rank(rank, source, lim_words):
    """Extend a summary with ranked indices while the word count stays
    within lim_words, stopping once LIM_SENTENCES indices are selected."""
    idx_summ = []
    for i in rank:
        if i in idx_summ:
            continue  # Already chosen.
        idx_cand = list(idx_summ) + [i]
        cand = struct.idx_to_summ(source, idx_cand)
        # Keep the candidate only if it still fits the word budget.
        if struct.word_count(cand) <= lim_words:
            idx_summ = list(idx_cand)
        if len(idx_summ) >= LIM_SENTENCES:
            break
    return idx_summ
def new_summary(summ1, summ2, evals, time_elapsed):
    """Append one summary record (rounded scores, sentences, word counts,
    and elapsed time) to the module-level json_results output list."""
    record = {
        'evaluation': {
            metric: round(evals[metric], ROUND_DIGS)
            for metric in ('R', 'C', 'D', 'H')
        },
        'summ': [list(summ1), list(summ2)],
        'size': {'word count': [word_count(summ1), word_count(summ2)]},
        'time': time_elapsed,
    }
    json_results['output'].append(record)
# --- Esempio n. 4 (scraped example separator; vote count: 0) ---
def MakeContrastiveSummary_greedy(source1,
                                  source2,
                                  stats_source_1,
                                  stats_source_2,
                                  LIM_WORDS_1=LIM_WORDS,
                                  LIM_WORDS_2=LIM_WORDS):
    """Beam-style greedy search for a pair of contrastive summaries.

    Grows candidate index pairs (one index list per source) one sentence at a
    time, keeping at most GREEDY_CANDS_SELECTED partial solutions ranked by
    method.SAM_contrastive.  Candidates exceeding either per-source word
    budget (LIM_WORDS_1 / LIM_WORDS_2) or the combined budget are discarded.

    Returns:
        (best_summ1, best_summ2): index lists of the top-scoring pair.
    """
    best_score = -INFINITY

    # total_candidates = len(source1) * len(source2)  # number of new candidates at each iteration

    # Beam of ((idx_list_1, idx_list_2), score), kept sorted by score desc.
    top_candidates = [(([], []), -INFINITY)]

    # Shuffle candidate visiting order so ties are broken randomly per run.
    candidate_options_1 = sorted(list(source1.keys()))
    candidate_options_2 = sorted(list(source2.keys()))

    random.shuffle(candidate_options_1)
    random.shuffle(candidate_options_2)

    # Grow summaries up to LIM_SENTENCES sentences per source.
    for s in range(1, LIM_SENTENCES + 1):

        # Stop when the previous pass failed to grow the best pair.
        # NOTE(review): `and` means the search continues while at least one
        # side can still grow — confirm this is the intended condition.
        if len(top_candidates[0][0][0]) < s - 1 and len(
                top_candidates[0][0][1]) < s - 1:
            break

        search_paths = len(top_candidates)
        c_searched_paths = 0

        # Expand every partial solution currently in the beam.
        best_for_size_prev = [i[0] for i in top_candidates]

        for best_prev in best_for_size_prev:
            c_searched_paths += 1

            idx_best_for_size1 = best_prev[0]
            idx_best_for_size2 = best_prev[1]

            c = 0

            for i in candidate_options_1:

                c += 1

                if i in idx_best_for_size1:
                    continue  # Opinion already chosen.

                idx_cand_1 = idx_best_for_size1 + [i]

                summ_cand_1 = struct.idx_to_summ(source1, idx_cand_1)

                size_cand_1 = struct.word_count(summ_cand_1)

                if size_cand_1 > LIM_WORDS_1:
                    continue  # Candidate not considered because it is too long.

                stats_cand_1 = struct.aspects_stats(summ_cand_1)

                # Pair the side-1 extension with every side-2 extension.
                for j in candidate_options_2:

                    if j in idx_best_for_size2:
                        continue

                    idx_cand_2 = idx_best_for_size2 + [j]
                    summ_cand_2 = struct.idx_to_summ(source2, idx_cand_2)

                    size_cand_2 = struct.word_count(summ_cand_2)

                    if size_cand_2 > LIM_WORDS_2:
                        continue

                    stats_cand_2 = struct.aspects_stats(summ_cand_2)

                    # Contrastive score of the candidate pair.
                    score = method.SAM_contrastive(stats_source_1,
                                                   stats_source_2,
                                                   stats_cand_1, stats_cand_2)

                    # Combined length budget on top of the per-side limits.
                    if size_cand_1 + size_cand_2 > (LIM_WORDS_1 + LIM_WORDS_2):
                        continue

                    if len(top_candidates) < GREEDY_CANDS_SELECTED:

                        # Beam not full yet: accept unconditionally, re-sort.
                        top_candidates.append(
                            ((idx_cand_1, idx_cand_2), score))
                        top_candidates = sorted(top_candidates,
                                                key=lambda x: x[1],
                                                reverse=True)

                        out.printdebug("   ADDING TO FILL ", idx_cand_1,
                                       idx_cand_2)
                        out.printdebug("   score: ", score)
                        out.printdebug("   best: ", best_score)
                        out.printdebug("   sizes: ", size_cand_1, size_cand_2)
                        out.printdebug()

                    # NOTE(review): top_candidates[0][-1] is the score of the
                    # *best* beam entry (the list is sorted descending), so only
                    # candidates at least as good as the current best displace
                    # the worst — confirm this aggressive pruning is intended.
                    elif score >= top_candidates[0][-1]:
                        # Walk back to the insertion point that keeps the beam
                        # sorted, insert, and drop the worst entry.
                        x = len(top_candidates) - 1
                        while x > 0 and top_candidates[x][1] < score:
                            x -= 1

                        top_candidates.insert(
                            x, ((idx_cand_1, idx_cand_2), score))

                        del top_candidates[-1]

                        out.printdebug("   best candidates:  ", idx_cand_1,
                                       idx_cand_2)
                        out.printdebug("   score: ", score)
                        out.printdebug("   sizes: ", size_cand_1, size_cand_2)
                        out.printdebug()

                    best_score = top_candidates[0][1]

    # The winning pair is the head of the (sorted) beam.
    best_summ1 = top_candidates[0][0][0]
    best_summ2 = top_candidates[0][0][1]

    return best_summ1, best_summ2
# --- Esempio n. 5 (scraped example separator; vote count: 0) ---
def makeSummary_greedy(source, stats_source, LIM_WORDS=LIM_WORDS):
    """Beam-style greedy search for a single-source summary.

    Grows candidate index lists one sentence at a time, keeping at most
    GREEDY_CANDS_SELECTED partial solutions ranked by method.SAM.
    Candidates exceeding LIM_WORDS words are discarded; the summary is
    capped at LIM_SENTENCES sentences.

    Returns:
        best_idx: index list of the top-scoring summary.
    """
    best_score = -INFINITY

    # total_candidates = len(source)

    # Beam of (idx_list, score), kept sorted by score desc.
    top_candidates = [([], -INFINITY)]

    idx_best_for_size = {}
    idx_best_for_size[0] = []

    # Shuffle candidate visiting order so ties are broken randomly per run.
    candidate_options = sorted(list(source.keys()))

    random.shuffle(candidate_options)

    for s in range(1, LIM_SENTENCES + 1):

        if len(top_candidates[0][0]) < s - 1:
            break  # Won't find any larger summary

        # Expand every partial solution currently in the beam.
        best_for_size_prev = [i[0] for i in top_candidates]

        search_paths = len(top_candidates)
        c_searched_paths = 0

        for best_prev in best_for_size_prev:
            c_searched_paths += 1

            # Rebinds the name from the dict above to the current index list.
            idx_best_for_size = best_prev

            c = 0

            for i in candidate_options:
                c += 1
                # NOTE(review): `pr` is a progress fraction that is computed
                # but unused since the printProgress call below is commented
                # out — kept as-is for when that call is restored.
                pr = (s - 1) / LIM_SENTENCES + (
                    c_searched_paths -
                    1) / search_paths / LIM_SENTENCES + c / len(
                        source) / search_paths / LIM_SENTENCES
                # out.printProgress(" %6.2lf%%   ( path %3d/%d  of  size  %2d/%d )  %16.2lf" % (
                #     100 * pr, c_searched_paths, search_paths, s, LIM_SENTENCES, best_score), end="\r")

                if i in idx_best_for_size:  # Candidate opinion already in the summary
                    continue

                idx_cand = idx_best_for_size + [i]

                summ_cand = struct.idx_to_summ(source, idx_cand)

                size_cand = struct.word_count(summ_cand)

                if size_cand > LIM_WORDS:
                    continue  # Over the word budget.

                stats_cand = struct.aspects_stats(summ_cand)
                score = method.SAM(stats_source, stats_cand)

                if len(
                        top_candidates
                ) < GREEDY_CANDS_SELECTED:  # There's space for more candidates

                    # Beam not full yet: accept unconditionally, re-sort.
                    top_candidates.append((idx_cand, score))
                    top_candidates = sorted(top_candidates,
                                            key=lambda x: x[1],
                                            reverse=True)

                    out.printdebug()
                    out.printdebug("   ADDING TO FILL ", idx_cand)
                    out.printdebug("   score: ", score)
                    out.printdebug("   best: ", best_score)
                    out.printdebug("   size: ", size_cand)
                    out.printdebug()

                # NOTE(review): top_candidates[0][-1] is the score of the
                # *best* beam entry (list sorted descending), so only
                # candidates at least as good as the current best displace the
                # worst — confirm this aggressive pruning is intended.
                elif score >= top_candidates[0][-1]:

                    # Walk back to the insertion point that keeps the beam
                    # sorted, insert, and drop the worst entry.
                    x = len(top_candidates) - 1
                    while x > 0 and top_candidates[x][1] < score:
                        x -= 1

                    top_candidates.insert(x, (idx_cand, score))

                    del top_candidates[-1]

                    out.printdebug()
                    out.printdebug("   best candidate:  ", idx_cand)
                    out.printdebug("   score: ", score)
                    out.printdebug("   size: ", size_cand)
                    out.printdebug()

                best_score = top_candidates[0][1]

    # The winning summary is the head of the (sorted) beam.
    best_idx = top_candidates[0][0]

    return best_idx
# --- Esempio n. 6 (scraped example separator; vote count: 0) ---
    source2 = read_input(filepath(SOURCE2))
    return source1, source2


EXECUTION_ID = str(int(time()) % 100000000)  # Execution code (will be in the results file name)

print('\n\nWill perform %d tests and discard %d(x2) best and worst\n\n' % (REPEAT_TESTS, DISCARD_TESTS))

# Run the experiment once for every configured pair of input datasets.
for SOURCE1, SOURCE2 in INPUT_DATASETS:

    print(f'\n\n\n\n  =========datasets=======>  {SOURCE1} {SOURCE2}\n\n')

    out.print_verbose('Loading input')
    source1, source2 = load_input()
    out.print_verbose('Sizes of data sets: ', len(source1), len(source2))
    out.print_verbose('Words: ', struct.word_count(source1), struct.word_count(source2))

    '''
    /source.../ are structures that contain
    {
    0: { 'opinions': [('CÂMERA', 80.0)],
         'verbatim': 'Câmera boa.'}
    }
    '''

    evaluate.reset()  # To start evaluating summaries of the current sources.
    output_files.new_source(SOURCE1, SOURCE2, source1, source2, METHOD_NAME)  # Prepare output files for the current sources.

    map_scores_summary = {}

    distinct_summaries = set()
    out.print_verbose(" \nLENDO ALVO 2")
    source2 = read_input(filepath(SOURCE2))

    # NOTE(review): stray fragment — this `return` sits at module level inside
    # a for loop and is a SyntaxError; it looks like a mis-pasted tail of a
    # load_input()-style helper. Needs manual repair against the original file.
    return source1, source2


# NOTE(review): this block duplicates the dataset-loading script above almost
# verbatim — it appears to be a second scraped copy of the same driver script,
# diverging only at the tail (dict initialization instead of evaluate/reset).
print('\n\nWill perform %d tests and discard %d(x2) best and worst\n\n' % (REPEAT_TESTS, DISCARD_TESTS))

# Run the experiment once for every configured pair of input datasets.
for SOURCE1, SOURCE2 in INPUT_DATASETS:

    print(f'\n\n\n\n  =========datasets=======>  {SOURCE1} {SOURCE2}\n\n')

    out.print_verbose('Loading input')
    source1, source2 = load_input()
    out.print_verbose('Sizes of data sets: ', len(source1), len(source2))
    out.print_verbose('Words: ', struct.word_count(source1), struct.word_count(source2))

    '''
    /source.../ are structures that contain
    {
    0: { 'opinions': [('CÂMERA', 80.0)],
         'verbatim': 'Câmera boa.'}
    }
    '''

    # Prepare source for summarization
    # Aspect buckets: entity 1 / entity 2, positive / negative opinions.
    e1_pos = {}
    e1_neg = {}
    e2_pos = {}
    e2_neg = {}