Example no. 1
def generate_example_table():
    """Render one LaTeX table per search variant for the global example."""
    result = ""
    text = Sentence(ex)  # `ex` and `y_prime` are globals defined elsewhere
    # The four queries differ only in mini_beam_search / allow_splitting.
    for mbs, splitting in [(False, False), (True, False), (False, True),
                           (True, True)]:
        query = Query(wanted_cls=y_prime,
                      c=0.2,
                      num_needed=5,
                      mini_beam_search=mbs,
                      allow_splitting=splitting,
                      consider_top_k=10)
        q_result = generate_counterfactuals(ex, query)
        print(q_result)
        ex_df = []
        for change_group in q_result.examples:
            for e in change_group:
                se = Sentence(e.sentence)
                d = e.changed_word_distances()
                cwi = e.changed_word_indices()
                entry = {
                    "Original": ", ".join(text.words[wi] for wi in cwi),
                    "Counterfactual": ", ".join(se.words[wi] for wi in cwi),
                    "Classification": f"{e.cls[1]:.2f}",
                    "Distance": f"{sum(d_i**2 for d_i in d) + COST_PER_ADDITIONAL_WORD * len(d):.2f}",
                }
                ex_df.append(entry)
        ex_df = pd.DataFrame(ex_df)
        # result += f"\n\n\nOriginale Klassifikation: {text.calc_sentiment()[1]:.2f} \\\\ \n"
        # result += f"\nMBS={query.mini_beam_search}, ST={query.allow_splitting}, MAX\\_WORDS={query.consider_max_words} \\\\ \n"
        result += "\n\n"
        result += ex_df.to_latex(
            index=False,
            caption=f"{query.alg()} (original classification: "
            f"{text.calc_sentiment()[1]:.2f})")

    return result
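A minimal driver sketch, assuming a model has been loaded via model_config.load; the values for the ex / y_prime globals the function reads are placeholders:

# Hypothetical inputs; the function reads these module-level globals.
ex = "an extremely unpleasant film ."
y_prime = [0., 1.]  # target: positive classification
print(generate_example_table())  # LaTeX source for all four variants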
Example no. 2
from typing import List, Optional, Union

from search_utils.Query import Query
from search_utils.Sentence import Sentence
# Example and calc_sentiment_batch come from the project's other modules.


def expand_sentence(text_p: Union[str, Example],
                    word_indices: List[int],
                    query: Optional[Query] = None,
                    additional_mask_indices: Optional[List[int]] = None,
                    schedule_idx: int = -1) -> List[Example]:
    """Mask the given word positions, query the MLM for replacement tokens,
    and return one Example per (position, token) that changes the sentence."""
    if additional_mask_indices is None:
        additional_mask_indices = []

    if isinstance(text_p, Example):
        text = text_p.sentence
    else:
        text = text_p
    text = Sentence(text)
    # Drop positions whose "word" is only punctuation.
    word_indices = [
        wi for wi in word_indices
        if not all(s in ":,;.*" for s in text.words[wi])
    ]
    if len(word_indices) == 0:
        return []

    original_words = {i: text.words[i] for i in word_indices}
    max_words = (query.consider_max_words
                 if query is not None else Query(None).consider_max_words)
    masked_sentence = Sentence(
        text.get_with_masked(word_indices + additional_mask_indices))
    predictions = masked_sentence.calc_mask_predictions(max_words)

    result = []
    for word_idx in word_indices:

        if not predictions[word_idx]:
            continue

        sentences = []
        for predicted_token, score in predictions[word_idx]:
            new_sen = text.replace_word(word_idx, predicted_token)
            sentences.append(new_sen)

        classification = calc_sentiment_batch(sentences)
        for i, (predicted_token, score) in enumerate(predictions[word_idx]):
            if original_words[word_idx] == predicted_token:
                continue  # unchanged word, not a counterfactual candidate
            if isinstance(text_p, str):
                e = Example(sentences[i],
                            classification[i],
                            [(word_idx, score)],
                            pred_ind=[i],
                            sched_ind=[schedule_idx],
                            sent_ind=[0])
            else:
                e = Example(sentences[i],
                            classification[i],
                            text_p.changes + [(word_idx, score)],
                            pred_ind=text_p.prediction_indices + [i],
                            sched_ind=text_p.schedule_indices + [schedule_idx],
                            sent_ind=text_p.sentence_indices + [0])
            result.append(e)

    return result
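A minimal usage sketch, assuming a model has been loaded via model_config.load; the sentence and word index are placeholders:

# Hypothetical call: propose replacements for word 3 ("great").
candidates = expand_sentence("the film is great .", word_indices=[3])
for cand in candidates[:3]:
    print(cand.sentence, cand.cls)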
Example no. 3
    def calc_mask_predictions(self, max_num=None):
        if max_num is None:
            max_num = Query(None).consider_max_words

        # Word positions of the mask tokens; the `- 1` accounts for the
        # leading special token in input_ids.
        indices = [
            i - 1 for i, x in enumerate(self.input_ids)
            if x == model_config.tokenizer.mask_token_id
        ]
        assert indices, ("can't use calc_mask_predictions on a sentence "
                         "without a mask token; use calc_word_predictions")
        return self.calc_word_predictions(
            indices if len(indices) > 1 else indices[0], max_num)
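A call-pattern sketch inferred from expand_sentence above; get_with_masked and the shape of the returned mapping are assumptions based on how they are used there:

s = Sentence("the movie was great .")
masked = Sentence(s.get_with_masked([1, 3]))  # mask words 1 and 3
preds = masked.calc_mask_predictions(max_num=10)
for token, score in preds[3][:3]:  # top (token, score) pairs for word 3
    print(token, score)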
Example no. 4
# `dataset`, `new_start` and `results` are initialised earlier in the script.
for enm in tqdm.tqdm(range(new_start, len(dataset))):

    data = dataset[enm]
    x, y = data["text"], data["label"]

    x = model_config.tokenizer.clean_up_tokenization(x)
    if y == -1:
        info("y is -1, skipping")
        continue  # test data for SST-2 has label -1 (placeholder?)
    # Target: the flipped gold label, as a class distribution.
    y_prime = 1 - y
    y_prime = [1 - y_prime, y_prime]

    for mbs in [True, False]:
        for allow_splitting in [True, False]:
            query = Query(wanted_cls=y_prime, max_delta=0.4, c=0.2, num_needed=1,
                          mask_additional_words=False,
                          mini_beam_search=mbs, allow_splitting=allow_splitting,
                          consider_top_k=20, consider_max_words=500, consider_max_sentences=8)
            r = generate_counterfactuals(x, query)
            results.append((enm, r))

    fname = f"{new_start}_to_{enm}_on_{gpu_name()}_imdb_{date.today()}.pickle"
    if enm % 5 == 0 and enm != 0:
        try:
            path = f"/content/drive/My Drive/{fname}"
            with open(path, "wb") as file:
                pickle.dump({"imdb": results}, file)
            info("saved")
        except Exception as e:
            info(f"probably not running on colab: {e}")
            # with open(fname, "wb") as file:
            #     pickle.dump({DATASET: results}, file)
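A matching read-back sketch for the checkpoints written above; the filename is hypothetical, while the dict layout and the (enm, result) tuples follow the dump:

import pickle

with open("0_to_40_on_gpu_imdb_2024-01-01.pickle", "rb") as file:  # hypothetical name
    results = pickle.load(file)["imdb"]
for enm, r in results[:3]:
    print(enm, len(r.examples))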
Example no. 5
# (sentence, wanted counterfactual class)
data = [
    ("it 's a charming and often affecting journey .", NEGATIVE),
    ("although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women .",
     NEGATIVE),
    ("... the film suffers from a lack of humor ( something needed to balance out the violence ) ...",
     POSITIVE),
    ("in its best moments , resembles a bad high school production of grease , without benefit of song .",
     NEGATIVE)
]

all_result_str = ""
for idx, (sen, y_prime) in enumerate(data):
    results = []
    for ds in ["imdb", "sst-2"]:
        model_config.load(ds)
        r = generate_counterfactuals(sen, Query(y_prime, c=0.2))
        results.append({
            "Dataset": ds,
            "Text": (r.examples[0][0].sentence
                     if len(r.examples) > 0 else "NO CF FOUND"),
        })
    results.append({"Dataset": "Original", "Text": sen})
    with pd.option_context("max_colwidth", 100000):
        all_result_str += pd.DataFrame(results).to_latex(index=False)

# to_latex emits a plain tabular environment; rewrite it to tabularx at
# \textwidth so long sentences wrap instead of overflowing the page.
all_result_str = all_result_str \
    .replace("tabular", "tabularx") \
    .replace("\\begin{tabularx}", "\\begin{tabularx}{\\textwidth}")
print(all_result_str)
Example no. 6
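    # Body of the same "for enm in ..." dataset loop as in Example no. 4.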
    data = dataset[enm]
    x, y = data["text"], data["label"]

    x = model_config.tokenizer.clean_up_tokenization(x)
    if y == -1:
        info("y is -1, skipping")
        continue  # test data for SST-2 has label -1 (placeholder?)
    # Target: the flipped gold label, as a class distribution.
    y_prime = 1 - y
    y_prime = [1 - y_prime, y_prime]

    for mbs in [True, False]:
        for allow_splitting in [True, False]:
            query = Query(wanted_cls=y_prime,
                          max_delta=0.4,
                          c=0.2,
                          num_needed=1,
                          mask_additional_words=False,
                          mini_beam_search=mbs,
                          allow_splitting=allow_splitting)
            r = generate_counterfactuals(x, query)
            results.append((enm, r))

    fname = f"{new_start}_to_{enm}_on_{gpu_name()}_H2H2_imdb_{date.today()}.pickle"
    if enm % 5 == 0 and enm != 0:
        try:
            path = f"/content/drive/My Drive/{fname}"
            with open(path, "wb") as file:
                pickle.dump({"imdb": results}, file)
            info("saved")
        except Exception as e:
            info(f"probably not running on colab: {e}")
Example no. 7
# text = "his healthy sense of satire is light and fun ..."
# text = "Ultimately feels empty and unsatisfying, like swallowing a Communion wafer without the wine."
# text = "the action sequences are fun and reminiscent of combat scenes from the star wars series ."
# text = "with jump cuts , fast editing and lots of pyrotechnics , yu clearly hopes to camouflage how bad his movie is ."
# text = "why make a documentary about these marginal historical figures ?"
# text = "the character of zigzag is not sufficiently developed to support a film constructed around him ."
# text = "watchable up until the point where the situations and the dialogue spin hopelessly out of control"

text = model_config.tokenizer.clean_up_tokenization(text)

s = Sentence(text)
result = _gen_cf_ex(
    text,
    Query(wanted_cls=[0., 1.],
          max_delta=0.4,
          num_needed=5,
          consider_max_words=500,
          consider_top_k=15))
print(result.info())
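# Keep only the first example of each change group.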
result = [lst[0] for lst in result.examples]
data = {i: get_scatter_data(i) for i in range(len(s.words))}

colors = ["red", "green", "orange", "magenta", "lawngreen"]
# cmap_scale = cm.ScalarMappable(norm=mpl.colors.Normalize(vmin=0, vmax=len(result)), cmap=cm.gist_rainbow)

fig = plt.figure(figsize=(10, 14))
gs = grid_spec.GridSpec(nrows=len(s.words),  # one row per word
                        ncols=2,
                        wspace=0,
                        hspace=0.0001,
                        width_ratios=[0.1, 1])
Example no. 8
from generate_counterfactuals import generate_counterfactuals
from search_utils.Query import Query
from search_utils.Sentence import Sentence

model_config.load("imdb", evalution_model="gpt2")

num = 5
result = []
for wanted_positivity in range(num + 1):
    wanted_positivity = wanted_positivity / num
    wanted_cls = [(1 - wanted_positivity), wanted_positivity]
    max_delta = 50. / num / 100.
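    # Sweep targets 0.0, 0.2, ..., 1.0, each with a +-0.1 acceptance window.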
    print(f"{wanted_cls[1]}+-{max_delta}")
    # relatively high consider_max_words because max_delta is small.
    # sent = "A decent story with some thrilling action scenes."
    # sent = "the year's best and most unpredictable comedy."
    sent = "an extremely unpleasant film."
    r = generate_counterfactuals(
        sent, Query(wanted_cls=wanted_cls, max_delta=max_delta))
    if not r.examples:
        print("----")
        continue
    print(r.examples[0][0])
    result.append({
        "y'": f"{wanted_cls[1]:.1f} pm {max_delta:.1f}",
        "y": f"{r.examples[0][0].cls[1]:.2f}",
        "Counterfactual Example x'": r.examples[0][0].sentence
    })

print("######")
print(f"Original cls {Sentence(sent).calc_sentiment()[1]}")
with pd.option_context("max_colwidth", 1000):
    print(pd.DataFrame(result).to_latex(index=False))
Example no. 9
def generate_gradient_highlights():
    text = Sentence(ex)  # `ex` and `y_prime` are globals defined elsewhere
    word_gradients = np.array(text.calc_gradients(y_prime))
    word_gradients /= np.linalg.norm(word_gradients)
    # Linearly rescale to [0, 1] for colormap lookup.
    wgn = np.interp(word_gradients,
                    (np.min(word_gradients), np.max(word_gradients)), (0., 1.))
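    # LaTeX preamble required by the generated output (defines the \mycb
    # colour-box macro used below):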
    """
    \\newcommand{\\reducedstrut}{\\vrule width 0pt height .9\\ht\\strutbox depth .9\\dp\\strutbox\\relax}
    \\newcommand{\\mycb}[3]{%
      \\begingroup
      \\setlength{\\fboxsep}{0pt}%  
      \\colorbox[rgb]{#1}{ \\strut \\textcolor[rgb]{#2}{#3} }%
      \\endgroup
    }
    """
    result = ""  # new command overwritten error

    for cmap in [
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["blue", "white", "red"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "forestgreen"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "orangered"]),
            # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "crimson"]),
            matplotlib.colors.LinearSegmentedColormap.from_list(
                "", ["white", "blue"]),
            # # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "red"]),
            # # matplotlib.colors.LinearSegmentedColormap.from_list("", ["white", "black"]),
    ]:
        result += f""
        for ind, w in enumerate(text.words):
            ctpl = cmap(wgn[ind])[:3]
            tc = str(text_color(ctpl))[1:-1]
            ctpl = [round(v, 3) for v in ctpl]
            rgba = str(ctpl)[1:-1]
            result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
        result += f"\n\\\\ Top 10: {', '.join(np.array(text.words)[np.argsort(-wgn)][:10])}\\ \n\n\n"

        # Sentence-wise gradients: recompute the highlights per sentence.
        sw_map = get_sentence_word_mapping(text.text)
        edit_sentence_order = calc_sentence_edit_schedule(
            Query(y_prime), sw_map, text)

        for enm_si, si in enumerate(edit_sentence_order):
            start, stop = sw_map[si]
            sub = model_config.tokenizer.clean_up_tokenization(" ".join(
                text.words[start:stop + 1]))

            subtext = Sentence(sub)
            word_gradients = np.array(subtext.calc_gradients(y_prime))
            word_gradients /= np.linalg.norm(word_gradients)
            wgn = np.interp(word_gradients,
                            (np.min(word_gradients), np.max(word_gradients)),
                            (0., 1.))

            result += f"{enm_si + 1} Satz (vorher {si + 1}. Satz): "
            for ind, w in enumerate(subtext.words):
                ctpl = cmap(wgn[ind])[:3]
                tc = str(text_color(ctpl))[1:-1]
                ctpl = [round(v, 3) for v in ctpl]
                rgba = str(ctpl)[1:-1]
                result += f"\\mycb{{{rgba}}}{{{tc}}}{{{latex_escape(w)}}}\\allowbreak"
            result += "\\\\ \n\n"

    return result
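For reference, a sketch of the kind of helper text_color appears to be; the name comes from the code above, but this luminance rule is an assumption:

def text_color(rgb):
    # Hypothetical stand-in: choose black or white text for readability,
    # based on the background's relative luminance.
    r, g, b = rgb
    luminance = 0.299 * r + 0.587 * g + 0.114 * b
    return (0., 0., 0.) if luminance > 0.5 else (1., 1., 1.)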