Example #1
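`extract_colors` turns per-word gradient scores into parallel lists of foreground and background RGB strings for highlighting. The gradients are rescaled to [0, 1] and run through a white-to-blue colormap, either over the whole text at once or, with `per_sentence=True`, separately for each sentence in the order given by `calc_sentence_edit_schedule`. (A sketch of the color helpers it uses follows the code.)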
import matplotlib.colors
import numpy as np


def extract_colors(r, per_sentence):
    text = r.stats.original_sentence
    # White-to-blue colormap: low-gradient words stay white, high ones blue.
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
        "", ["white", "blue"])
    text_s = Sentence(text)
    if not per_sentence:
        word_gradients = text_s.calc_gradients(r.query.wanted_cls)
        wgn = np.interp(word_gradients,
                        (np.min(word_gradients), np.max(word_gradients)),
                        (0., 1.))
        fg, bg = [], []
        for value in wgn:
            # Background color from the colormap plus a readable text color;
            # both are stored as "r, g, b" strings (str(tuple) minus parens).
            ctpl = cmap(value)[:3]
            tc = twofivefive(text_color(ctpl))
            ctpl = twofivefive(ctpl)
            fg.append(str(tc)[1:-1])
            bg.append(str(ctpl)[1:-1])
        return fg, bg
    else:
        sw_map = get_sentence_word_mapping(text)
        edit_sentence_order = calc_sentence_edit_schedule(
            r.query, sw_map, text_s)
        fg, bg = [], []
        for si in edit_sentence_order:
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text_s.words[start:stop + 1]))
            subtext = Sentence(sub)
            word_gradients = np.array(
                subtext.calc_gradients(r.query.wanted_cls))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))
            for value in wgn:
                ctpl = cmap(value)[:3]
                tc = twofivefive(text_color(ctpl))
                ctpl = twofivefive(ctpl)
                fg.append(str(tc)[1:-1])
                bg.append(str(ctpl)[1:-1])
        return fg, bg
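The snippet leans on two color helpers that are not shown. A minimal sketch of plausible implementations, assuming `twofivefive` scales a matplotlib RGB tuple from [0, 1] to integer 0-255 channels and `text_color` picks black or white for contrast; only the names come from the code above, the bodies are assumptions:

def twofivefive(rgb):
    # Assumed: scale a matplotlib RGB tuple in [0, 1] to integers in [0, 255].
    return tuple(int(round(c * 255)) for c in rgb)


def text_color(rgb):
    # Assumed: choose black or white text depending on the perceived
    # luminance of the background color (ITU-R BT.601 weights).
    luminance = 0.299 * rgb[0] + 0.587 * rgb[1] + 0.114 * rgb[2]
    return (0., 0., 0.) if luminance > 0.5 else (1., 1., 1.)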
Example #2
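`one_mask` measures how the model's predictions for a masked word change when one additional [MASK] token is placed at increasing distances from it. For each sentence, the ten highest-gradient words are masked in turn, the extra mask is moved 1 to `max_words - 1` positions away, and the averaged differences are plotted with a standard-deviation band every 50 sentences.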
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np


def one_mask():
    max_words = 20
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)

    for enm in range(len(dataset)):

        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)

        # One-hot label and its flipped counterpart, the target class whose
        # gradients rank the words.
        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]

        s = Sentence(x)

        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]

        for observed_idx in sorted_highest[:10]:
            print(f"{observed_idx},", end="")
            # Step toward the longer side of the sentence so the extra
            # [MASK] token stays within bounds for as long as possible.
            sdir = 1 if len(s.words) - observed_idx > observed_idx else -1

            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]

            if len(original_answer) != 0:

                for mask_distance in range(1, max_words):
                    offset_idx = observed_idx + mask_distance * sdir
                    if offset_idx < 0 or offset_idx >= len(alt_s.words):
                        continue

                    new_sen = Sentence(
                        alt_s.get_with_masked([offset_idx, observed_idx]))
                    alt_sen_pred = new_sen.calc_mask_predictions()[observed_idx]

                    avg_distance, avg_word_diff, dist_diff_only = find_differences(
                        original_answer, alt_sen_pred)

                    # print(f"Mask offset {mask_distance}: dist={avg_distance:.3f}  word_dist={avg_word_diff:.3f}")
                    result[mask_distance].append(avg_distance)
                    result_wdiff[mask_distance].append(avg_word_diff)
                    result_dist_diff_only[mask_distance].append(dist_diff_only)

        if enm % 50 == 0 or enm == len(dataset) - 1:
            fig = plt.figure(figsize=(11, 8))
            plt.title(
                "Word scores in relation to the proximity of the nearest "
                "[MASK] token")
            plt.xlabel("Distance to the additional [MASK] token")
            plt.xlim(0, max_words)
            plt.ylim(0., 0.65)
            plt.ylabel("Change in score")

            idx, mean, std = list(
                zip(*[(md, np.mean(lst), np.std(lst))
                      for (md, lst) in result_wdiff.items()]))
            mean = np.array(mean)
            std = np.array(std)
            plt.plot(idx, mean, color='r', label="Word differences")
            plt.fill_between(idx, mean - std, mean + std, color='r', alpha=.2)

            idx, mean, std = list(
                zip(*[(md, np.mean(lst), np.std(lst))
                      for (md, lst) in result_dist_diff_only.items()]))
            mean = np.array(mean)
            std = np.array(std)
            plt.plot(idx, mean, color='green', label="Distance differences")
            plt.fill_between(idx,
                             mean - std,
                             mean + std,
                             color='green',
                             alpha=.2)

            plt.xticks(idx)
            plt.legend()
            plt.savefig(f'{root}saved_plots/all/_besser_{enm}.png')
            # plt.show()
            plt.close(fig)
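`find_differences` is defined elsewhere in the project. A hypothetical sketch of its contract, assuming each prediction is a mapping from candidate tokens to probabilities; the three return values match the unpacking above, but the metric definitions are assumptions:

def find_differences(original, altered):
    # Hypothetical: compare two [MASK] prediction dicts {token: probability}.
    shared = set(original) & set(altered)
    # Mean absolute probability shift over tokens predicted in both runs.
    dist_diff_only = (sum(abs(original[t] - altered[t]) for t in shared) /
                      len(shared)) if shared else 1.0
    # Fraction of predicted tokens that changed between the two runs.
    avg_word_diff = 1.0 - len(shared) / max(len(original), len(altered), 1)
    # Combined score reported as the generic distance.
    avg_distance = 0.5 * (dist_diff_only + avg_word_diff)
    return avg_distance, avg_word_diff, dist_diff_only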
Example #3
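`two_mask` extends the experiment to two additional [MASK] tokens, sweeping both offsets over `[-max_words, max_words]` and collecting the averaged differences in a matrix that is rendered as a seaborn heatmap. The raw results are also pickled for later inspection; a reload sketch follows the code.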
import pickle
from collections import defaultdict

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt


def two_mask():
    max_words = 15
    result = defaultdict(list)
    result_dist_diff_only = defaultdict(list)
    result_wdiff = defaultdict(list)

    for enm in range(len(dataset)):

        print(f"\nSentence {enm}: ", end="")
        data = dataset[enm]
        x, y = data["sentence"], data["label"]
        x = model_config.tokenizer.clean_up_tokenization(x)

        y = [0, 1] if y == 1 else [1, 0]
        y_prime = [1 - y[0], 1 - y[1]]

        s = Sentence(x)

        word_gradients = s.calc_gradients(y_prime)
        sorted_highest = np.argsort(word_gradients)[::-1]

        for observed_idx in sorted_highest[:10]:
            print(f"{observed_idx},", end="")

            alt_s = Sentence(s.get_with_masked([observed_idx]))
            original_answer = alt_s.calc_mask_predictions()[observed_idx]

            if len(original_answer) != 0:
                for mask_distance1 in range(-max_words, max_words + 1):
                    for mask_distance2 in range(-max_words, max_words + 1):

                        idx1 = observed_idx + mask_distance1
                        idx2 = observed_idx + mask_distance2
                        if not (0 <= idx1 < len(alt_s.words)):
                            continue
                        if not (0 <= idx2 < len(alt_s.words)):
                            continue

                        # Mask both offset positions on top of the already
                        # masked observed word, then re-predict it.
                        new_sen = Sentence(
                            alt_s.get_with_masked([idx1, observed_idx]))
                        new_sen = Sentence(
                            new_sen.get_with_masked([idx2, observed_idx]))
                        alt_sen_pred = new_sen.calc_mask_predictions()[
                            observed_idx]

                        avg_distance, avg_word_diff, dist_diff_only = find_differences(
                            original_answer, alt_sen_pred)

                        key = (mask_distance1, mask_distance2)
                        result[key].append(avg_distance)
                        result_wdiff[key].append(avg_word_diff)
                        result_dist_diff_only[key].append(dist_diff_only)

        if enm % 2 == 0 or enm == len(dataset) - 1:

            all_variants = [(result, "result"), (result_wdiff, "wdiff"),
                            (result_dist_diff_only, "ddiff")]

            with open('used_data.pickle', 'wb') as handle:
                pickle.dump(all_variants, handle)

            for res, name in all_variants:
                data = [(k, np.mean(v)) for k, v in res.items()]
                matrix = np.zeros(shape=(2 * max_words + 1, 2 * max_words + 1))
                for (i, j), m in data:
                    matrix[i + max_words, j + max_words] = m

                plt.figure(figsize=(15, 12))
                ax = sns.heatmap(
                    np.flip(matrix, axis=0),
                    linewidth=0.0,
                    xticklabels=list(range(-max_words, max_words + 1)),
                    yticklabels=list(reversed(range(-max_words,
                                                    max_words + 1))))

                ax.set_title(
                    "Average word change with two [MASK] tokens")
                plt.savefig(f'{root}saved_plots/2d/{name}_{enm}.pdf')
                plt.close()
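Since `all_variants` is pickled after every other sentence, intermediate results can be inspected offline; reloading simply mirrors the dump:

import pickle

with open('used_data.pickle', 'rb') as handle:
    all_variants = pickle.load(handle)

for res, name in all_variants:
    print(name, "->", len(res), "offset pairs collected")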
Example #4
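`generate_gradient_highlights` renders the gradient highlighting as LaTeX: each word is wrapped in a `\mycb` color-box macro (defined in the string below) with a background taken from the colormap and a contrast-chosen text color, first for the whole text and then sentence by sentence.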
def generate_gradient_highlights():
    # `ex` (the input text) and `y_prime` (the target class) are expected to
    # be defined by the surrounding script.
    text = Sentence(ex)
    word_gradients = np.array(text.calc_gradients(y_prime))
    word_gradients /= np.linalg.norm(word_gradients)
    wgn = np.interp(word_gradients,
                    (np.min(word_gradients), np.max(word_gradients)), (0., 1.))
    """
    \\newcommand{\\reducedstrut}{\\vrule width 0pt height .9\\ht\\strutbox depth .9\\dp\\strutbox\\relax}
    \\newcommand{\\mycb}[3]{%
      \\begingroup
      \\setlength{\\fboxsep}{0pt}%  
      \\colorbox[rgb]{#1}{ \\strut \\textcolor[rgb]{#2}{#3} }%
      \\endgroup
    }
    """
    result = ""  # new command overwritten error

    for cmap in [
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["blue", "white", "red"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "forestgreen"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "orangered"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "crimson"]),
            matplotlib.colors.LinearSegmentedColormap.from_list(
                "", ["white", "blue"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "red"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "black"]),
    ]:
        result += f""
        for ind, w in enumerate(text.words):
            ctpl = cmap(wgn[ind])[:3]
            tc = str(text_color(ctpl))[1:-1]
            ctpl = [round(v, 3) for v in ctpl]
            rgba = str(ctpl)[1:-1]
            result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
        result += f"\n\\\\ Top 10: {', '.join(np.array(text.words)[np.argsort(-wgn)][:10])}\\ \n\n\n"

        # Sentence-wise calc gradients
        sw_map = get_sentence_word_mapping(text.text)
        edit_sentence_order = calc_sentence_edit_schedule(
            Query(y_prime), sw_map, text)

        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text.words[start:stop + 1]))

            subtext = Sentence(sub)
            word_gradients = np.array(subtext.calc_gradients(y_prime))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))

            result += f"{enm_si + 1} Satz (vorher {si + 1}. Satz): "
            for ind, w in enumerate(subtext.words):
                ctpl = cmap(wgn[ind])[:3]
                tc = str(text_color(ctpl))[1:-1]
                ctpl = [round(v, 3) for v in ctpl]
                rgba = str(ctpl)[1:-1]
                result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
            result += "\\\\ \n\n"

    return result
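A sketch of how the returned fragment could be compiled, repeating the `\mycb` preamble from the string above in a standalone document (the output file name is illustrative):

preamble = r"""\documentclass{article}
\usepackage{xcolor}
\newcommand{\reducedstrut}{\vrule width 0pt height .9\ht\strutbox depth .9\dp\strutbox\relax}
\newcommand{\mycb}[3]{%
  \begingroup
  \setlength{\fboxsep}{0pt}%
  \colorbox[rgb]{#1}{ \strut \textcolor[rgb]{#2}{#3} }%
  \endgroup
}
"""

with open("highlights.tex", "w") as fh:
    fh.write(preamble + "\\begin{document}\n" +
             generate_gradient_highlights() + "\n\\end{document}\n")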