Code Example #1
def get_word_imp(origSents, orig_label_sents, sent2imp,
                 sent2cluster):  # TODO: revise name of sent2sent
    '''Computes word importance scores.

    Args:
      origSents (list): List of sentences in the original text.
      orig_label_sents (list): List of sentences whose predicted label matches the original label of the text.
      sent2imp (dict): Dictionary mapping sentences to their importance ranking.
      sent2cluster (dict): Dictionary mapping each sentence in origSents to the sentence cluster it belongs to
        (sentences in origSents are merged into clusters when the text has more than 12 sentences, to keep origSents short).
    '''

    ind_count = 0
    import_scores = []
    nlp = spacy.load('en')

    for sent in origSents:

        if " " not in str(sent):
            text_sent = [str(sent)]
        else:
            text_tokens = nlp(sent)
            text_sent = [str(word) for word in text_tokens]

        if sent not in orig_label_sents:
            import_scores.extend([300] * len(text_sent))
        else:
            pos_tags = criteria.get_pos(text_sent)

            if sent in sent2cluster:
                sent_imp = sent2imp[sent2cluster[sent]]
            else:
                sent_imp = sent2imp[sent]

            for i1 in range(len(text_sent)):
                if pos_tags[i1] == 'ADV':
                    import_scores.append(sent_imp + 15)
                elif pos_tags[i1] == 'VERB':
                    import_scores.append(sent_imp + 15)
                elif pos_tags[i1] == 'ADJ':
                    import_scores.append(sent_imp)
                else:
                    import_scores.append(sent_imp + 50)

    import_scores = np.array(import_scores)

    return import_scores
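
The scores returned here behave like ranks rather than probabilities: lower means more important. Tokens from sentences that do not share the original label get a flat 300, adjectives keep the raw sentence rank, adverbs and verbs get +15, and all other parts of speech get +50. Code Example #4 then orders words ascending with np.argsort. A minimal, self-contained sketch of that consumption step, with made-up scores:

import numpy as np

# Hypothetical scores for a 6-token text (lower score = perturb earlier).
import_scores = np.array([300, 52, 17, 2, 17, 300])
tokens = ['the', 'plot', 'was', 'dull', 'and', 'slow']

# Same ordering step as the attack loop in Code Example #4.
perturb_order = np.argsort(import_scores).tolist()
print([tokens[i] for i in perturb_order])  # lowest-score positions come first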
Code Example #2
def random_attack(text_ls, true_label, predictor, perturb_ratio, stop_words_set, word2idx, idx2word, cos_sim,
                  sim_predictor=None, import_score_threshold=-1., sim_score_threshold=0.5, sim_score_window=15,
                  synonym_num=50, batch_size=32):
    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)

        # randomly get perturbed words
        perturb_idxes = random.sample(range(len_text), int(len_text * perturb_ratio))
        words_perturb = [(idx, text_ls[idx]) for idx in perturb_idxes]

        # find synonyms
        words_perturb_idx = [word2idx[word] for idx, word in words_perturb if word in word2idx]
        synonym_words, _ = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, synonym_num, 0.5)
        synonyms_all = []
        for idx, word in words_perturb:
            if word in word2idx:
                synonyms = synonym_words.pop(0)
                if synonyms:
                    synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        text_prime = text_ls[:]
        text_cache = text_prime[:]
        num_changed = 0
        for idx, synonyms in synonyms_all:
            new_texts = [text_prime[:idx] + [synonym] + text_prime[min(idx + 1, len_text):] for synonym in synonyms]
            new_probs = predictor(new_texts, batch_size=batch_size)

            # compute semantic similarity
            if idx >= half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = idx - half_sim_score_window
                text_range_max = idx + half_sim_score_window + 1
            elif idx < half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = 0
                text_range_max = sim_score_window
            elif idx >= half_sim_score_window and len_text - idx - 1 < half_sim_score_window:
                text_range_min = len_text - sim_score_window
                text_range_max = len_text
            else:
                text_range_min = 0
                text_range_max = len_text
            semantic_sims = \
            sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                       list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0]

            num_queries += len(new_texts)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            new_probs_mask = (orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos
            synonyms_pos_ls = [criteria.get_pos(new_text[max(idx - 4, 0):idx + 5])[min(4, idx)]
                               if len(new_text) > 10 else criteria.get_pos(new_text)[idx] for new_text in new_texts]
            pos_mask = np.array(criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask

            if np.sum(new_probs_mask) > 0:
                text_prime[idx] = synonyms[(new_probs_mask * semantic_sims).argmax()]
                num_changed += 1
                break
            else:
                new_label_probs = new_probs[:, orig_label] + torch.from_numpy(
                        (semantic_sims < sim_score_threshold) + (1 - pos_mask).astype(float)).float().cuda()
                new_label_prob_min, new_label_prob_argmin = torch.min(new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[new_label_prob_argmin]
                    num_changed += 1
            text_cache = text_prime[:]
        return ' '.join(text_prime), num_changed, orig_label, torch.argmax(predictor([text_prime])), num_queries
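
The four-branch window selection above (clamping a sim_score_window-sized span around idx) reappears in every attack variant; Code Example #4 delegates it to a helper called get_semantic_sim_window. A plausible implementation of that helper, inferred from the branches here (the exact signature and behaviour of the repository's version are assumptions):

def get_semantic_sim_window(idx, len_text, sim_score_window):
    # Centre a window of sim_score_window tokens on idx, clamped to the text boundaries.
    half_sim_score_window = (sim_score_window - 1) // 2
    if idx >= half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
        return idx - half_sim_score_window, idx + half_sim_score_window + 1
    elif idx < half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
        return 0, sim_score_window
    elif idx >= half_sim_score_window and len_text - idx - 1 < half_sim_score_window:
        return len_text - sim_score_window, len_text
    else:
        return 0, len_text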
Code Example #3
def contextual_attack(text_ls, true_label, predictor, maskedLM_predictor, stop_words_set, word2idx, idx2word, cos_sim, sim_predictor=None,
           import_score_threshold=-1., sim_score_threshold=0.5, sim_score_window=15, synonym_num=50,
           batch_size=32):
    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)

        # get importance score
        leave_1_texts = [text_ls[:ii] + ['<oov>'] + text_ls[min(ii + 1, len_text):] for ii in range(len_text)]
        leave_1_probs = predictor(leave_1_texts, batch_size=batch_size)
        num_queries += len(leave_1_texts)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)
        import_scores = (orig_prob - leave_1_probs[:, orig_label] + (leave_1_probs_argmax != orig_label).float() * (
                    leave_1_probs.max(dim=-1)[0] - torch.index_select(orig_probs, 0,
                                                                      leave_1_probs_argmax))).data.cpu().numpy()
        
        # get words to perturb, ranked by importance score
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores), key=lambda x: x[1], reverse=True):
            try:
                if score > import_score_threshold and text_ls[idx] not in stop_words_set:
                    words_perturb.append((idx, text_ls[idx]))
            except:
                print(idx, len(text_ls), import_scores.shape, text_ls, len(leave_1_texts))
        #print("Generated words_perturb")
        # find synonyms
        new_texts=[]
        synonyms_all = []
        #print(' '.join(text_ls))
        for idx, word in words_perturb:
            synonyms = []
            if idx >= 127:
                # positions beyond the masked LM's 128-token window cannot be masked
                continue
            masked_text = text_ls[:idx] + ['[MASK]'] + text_ls[min(idx + 1, len_text):]
            new_texts.append(masked_text)
            # predict only the current masked text so the scores keep the (1, 128, vocab) shape assumed below
            masked_lm_probs = maskedLM_predictor.text_pred([masked_text], batch_size=batch_size)
            values, indices = torch.topk(masked_lm_probs, 25, dim=-1)
            tokens = maskedLM_predictor.convert_ids_to_tokens(indices.view(-1).cpu().numpy())
            tokens = np.reshape(tokens, (1, 128, -1))
            for i in range(25):
                candidate = tokens[0][idx][i]
                # keep only candidates that exist in the counter-fitted vocabulary
                if candidate in word2idx:
                    synonyms.append(candidate)
            if synonyms:
                synonyms_all.append((idx, synonyms))
        # words_perturb_idx = [word2idx[word] for idx, word in words_perturb if word in word2idx]
        # synonym_words, _ = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, synonym_num, 0.5)
        
        # for idx, word in words_perturb:
        #     if word in word2idx:
        #         synonyms = synonym_words.pop(0)
        #         if synonyms:
        #             synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        text_prime = text_ls[:]
        text_cache = text_prime[:]
        num_changed = 0
        #print("Generated Synonyms")
        for idx, synonyms in synonyms_all:
            new_texts = [text_prime[:idx] + [synonym] + text_prime[min(idx + 1, len_text):] for synonym in synonyms]
            #print(new_texts)
            if new_texts:
                new_probs = predictor(new_texts, batch_size=batch_size)
            else:
                continue
            # compute semantic similarity
            if idx >= half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = idx - half_sim_score_window
                text_range_max = idx + half_sim_score_window + 1
            elif idx < half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = 0
                text_range_max = sim_score_window
            elif idx >= half_sim_score_window and len_text - idx - 1 < half_sim_score_window:
                text_range_min = len_text - sim_score_window
                text_range_max = len_text
            else:
                text_range_min = 0
                text_range_max = len_text
            semantic_sims = \
            sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                       list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0]

            num_queries += len(new_texts)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            new_probs_mask = (orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos
            synonyms_pos_ls = [criteria.get_pos(new_text[max(idx - 4, 0):idx + 5])[min(4, idx)]
                               if len(new_text) > 10 else criteria.get_pos(new_text)[idx] for new_text in new_texts]
            pos_mask = np.array(criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask

            if np.sum(new_probs_mask) > 0:
                text_prime[idx] = synonyms[(new_probs_mask * semantic_sims).argmax()]
                num_changed += 1
                break
            else:
                new_label_probs = new_probs[:, orig_label] + torch.from_numpy(
                        (semantic_sims < sim_score_threshold) + (1 - pos_mask).astype(float)).float().cuda()
                new_label_prob_min, new_label_prob_argmin = torch.min(new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[new_label_prob_argmin]
                    num_changed += 1
            text_cache = text_prime[:]
        return ' '.join(text_prime), num_changed, orig_label, torch.argmax(predictor([text_prime])), num_queries
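
maskedLM_predictor is an external wrapper that is not shown here. Below is a rough sketch of how a masked-language-model candidate generator with the same role could be written with Hugging Face transformers; the model name, 128-token limit and top-k value mirror the code above, but the wrapper's internals are not shown, so this is only a sketch of the idea (assuming a recent transformers version), not the repository's implementation:

import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
mlm = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

def mask_candidates(words, idx, top_k=25):
    # Replace the word at position idx with [MASK] and return the top_k MLM predictions for it.
    masked = words[:idx] + [tokenizer.mask_token] + words[idx + 1:]
    enc = tokenizer(' '.join(masked), return_tensors='pt', truncation=True, max_length=128)
    with torch.no_grad():
        logits = mlm(**enc).logits  # shape: (1, seq_len, vocab_size)
    mask_pos = (enc['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0][0]
    top_ids = logits[0, mask_pos].topk(top_k).indices.tolist()
    return tokenizer.convert_ids_to_tokens(top_ids)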
Code Example #4
def attack(cmodel,
           gcp_nlp_json_link,
           text_ls,
           true_label,
           stop_words_set,
           word2idx_rev,
           idx2word_rev,
           idx2word_vocab,
           cos_sim,
           pos_filter,
           sim_score_threshold=0.5,
           sim_score_window=15,
           synonym_num=80,
           syn_sim=0.65):
    '''Attack function.

    Implementation of the attack algorithm.

    Args:
      cmodel (Model obj): Model to be attacked.
      gcp_nlp_json_link (str): Path to the Google Cloud Platform NLP API JSON key file (used when attacking the GCP model).
      text_ls (str): Text to be attacked.
      true_label (int): True class of text_ls.
      stop_words_set (set): Stop words that are never perturbed.
      word2idx_rev (dict): Mapping from words to indices in the precomputed cosine-similarity square matrix.
      idx2word_rev (dict): Reverse mapping of word2idx_rev.
      idx2word_vocab (dict): Mapping from indices of the precomputed cosine-similarity square matrix back to words.
      cos_sim (numpy array): Precomputed cosine-similarity square matrix.
      pos_filter (str): 'fine' to filter synonyms with fine-grained NLTK POS tags, otherwise universal POS tags are used.
      sim_score_threshold (float): Semantic-similarity threshold for accepting or rejecting synonyms, default: 0.5.
      sim_score_window (int): Window size around the perturbed word for computing semantic similarity between the original and perturbed text.
      synonym_num (int): Maximum number of candidate synonyms to be analysed.
      syn_sim (float): Cosine-similarity threshold between candidate synonyms and the original word, default: 0.65.
    '''

    if gcp_nlp_json_link:
        tmodel = model(gcp=True, gcp_nlp_json_link=gcp_nlp_json_link)
    else:
        tmodel = model(cmodel)

    text_temp = text_ls[:]
    orig_label = tmodel.getPredictions([text_ls])[0]

    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        nlp = spacy.load('en')
        doc = nlp(str(text_ls))
        text_ls = [str(j) for j in doc]
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        num_queries = 1

        # get the pos info
        if pos_filter == 'fine':
            pos_ls1 = nltk.pos_tag(text_ls)
            pos_ls = [pos_ls1[i][1] for i in range(len(text_ls))]
        else:
            pos_ls = criteria.get_pos(text_ls)

        #sentence segmentation
        sents_sentiment_dic = defaultdict(list)
        text_sentences = nlp(text_temp)
        sents1 = text_sentences.sents
        sents = [str(sent) for sent in sents1]

        #print(sents) #look up spacy to get a logical way to get phrases or chunks
        if len(sents) == 1:
            sent = sents[0]
            tokens = nlp(sent)
            a = len(tokens) // 2
            if len(tokens) > 4:
                sents = [
                    str(tokens[i:i + 4]) for i in range(0, len(tokens), 4)
                ]

        #segregate positive and negative sentence
        preds = tmodel.getPredictions(list(sents))
        num_queries += len(sents)
        for i in range(len(preds)):
            sents_sentiment_dic[preds[i]].append(sents[i])

        #print(sents_sentiment_dic)
        orig_sents_ln = len(sents)
        origSents = sents[:]
        orig_label_sents = sents_sentiment_dic[orig_label][:]

        #curtail orig label sentences
        sent2cluster = {}
        if len(orig_label_sents) > 12:
            ln = len(orig_label_sents)
            mult = int(np.ceil(ln / 12))
            new_list = []
            for q in range(0, ln, mult):
                if q + mult < ln:
                    new_sent_list = sents_sentiment_dic[orig_label][q:q + mult]
                    new_sent_str = ' '.join(new_sent_list)
                    new_list.append(new_sent_str)
                else:
                    new_sent_list = sents_sentiment_dic[orig_label][q:]
                    new_sent_str = ' '.join(new_sent_list)
                    new_list.append(new_sent_str)

                for snt in new_sent_list:
                    sent2cluster[snt] = new_sent_str
                    sents.remove(snt)

                sents.append(new_sent_str)
            sents_sentiment_dic[orig_label] = new_list

        #Get sentence importance ranking
        top_sent_imp, word_agg_dic, sent2imp, num_queries = get_sentence_imp_ranking(
            sents_sentiment_dic, num_queries, orig_label, tmodel)

        #Get word importance scores
        import_scores = get_word_imp(origSents, orig_label_sents, sent2imp,
                                     sent2cluster)

        # get words to perturb, ranked by importance score
        words_perturb = []
        text_prime = text_ls[:]
        imp_indxs = np.argsort(import_scores).tolist()

        for idx in imp_indxs:
            if not text_prime[idx] in stop_words_set:
                words_perturb.append((idx, text_prime[idx]))

        # find synonyms
        words_perturb_idx = [
            word2idx_rev[word] for idx, word in words_perturb
            if word in word2idx_rev
        ]
        synonym_words, _ = pick_most_similar_words_batch(
            words_perturb_idx, cos_sim, idx2word_vocab, synonym_num, syn_sim)
        synonyms_all = []
        for idx, word in words_perturb:
            if word in word2idx_rev:
                synonyms = synonym_words.pop(0)
                if synonyms:
                    synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        num_changed = 0
        idx_flag = 0
        backtrack_dic = {}
        flg = 0
        misclassified = False
        visited = {}

        #map the words to indices in text_prime
        word_idx_dic = {}
        for idx in range(len(text_prime)):
            word = text_prime[idx]
            if word in word_idx_dic:
                word_idx_dic[word].append(idx)
            else:
                word_idx_dic[word] = [idx]
            visited[word] = False

        origtext_prime = text_prime.copy()
        len_text = len(text_prime)

        for idx, synonyms in synonyms_all:

            orig_pos = criteria.get_pos(text_prime)[idx]

            if len(origtext_prime[idx]) <= 1 or visited[origtext_prime[idx]]:
                continue

            if misclassified:

                #backtrack to check for unnecessary perturbations

                for (wrd, index) in backtrack_dic:
                    txt_temp = text_prime[:]
                    txt_temp[index] = backtrack_dic[(wrd, index)]
                    txt_temp = vowel_correction(txt_temp, index)
                    pred = tmodel.getPredictions([' '.join(txt_temp)])[0]
                    num_queries += 1

                    if pred != orig_label:
                        text_prime = txt_temp[:]
                        num_changed -= 1
                break

            if num_queries >= 5000:
                break
            text_range_min, text_range_max = get_semantic_sim_window(
                idx, len_text, sim_score_window)

            # Step#1: Find all aggregates(with orig_label) to which the target wrd belongs

            target_word = text_prime[idx]
            visited[target_word] = True

            agg_list = []
            if target_word in word_agg_dic:
                agg_list = list(set(word_agg_dic[target_word]))
                word_agg_dic[target_word] = []

            orig_sentiment_sent = []
            for sent1 in sents_sentiment_dic[orig_label]:
                if target_word in sent1 and not sent1 in agg_list:
                    orig_sentiment_sent.append(sent1)

            # Check if any synonym can make the entire review/text misclassify
            if pos_filter == 'fine':
                new_pos = np.array([
                    nltk.pos_tag(text_prime[:idx] + [syn] +
                                 text_prime[idx + 1:])[idx][1]
                    for syn in synonyms
                ])
            else:
                new_pos = np.array([
                    criteria.get_pos(text_prime[:idx] + [syn] +
                                     text_prime[idx + 1:])[idx]
                    for syn in synonyms
                ])
            pos_mask = (new_pos == (pos_ls[idx])).astype(int)

            rev_with_syns1 = [
                text_prime[:idx] + [syn] + text_prime[idx + 1:]
                for syn in synonyms
            ]
            sem_sims1 = np.array([
                semantic_sim(
                    [' '.join(rev_with_syn[text_range_min:text_range_max])],
                    [' '.join(text_prime[text_range_min:text_range_max])])
                for rev_with_syn in rev_with_syns1
            ])
            sem_sim_mask = (sem_sims1 >= sim_score_threshold).astype(int)

            #apply pos and semantic similarity masks to synonyms
            synonyms_masked = [
                synonyms[i] for i in range(len(synonyms))
                if pos_mask[i] == 1 and sem_sim_mask[i] == 1
            ]

            rev_with_syns = [
                text_prime[:idx] + [syn] + text_prime[idx + 1:]
                for syn in synonyms_masked
            ]
            sem_sims = np.array([
                semantic_sim(
                    [' '.join(rev_with_syn[text_range_min:text_range_max])],
                    [' '.join(text_prime[text_range_min:text_range_max])])
                for rev_with_syn in rev_with_syns
            ])

            #sort synonyms as per semantic similarity scores
            sort_order = dict(zip(synonyms_masked, sem_sims))
            synonyms_sorted = sorted(synonyms_masked, key=sort_order.get)

            rev_str = ' '.join(text_prime)
            vowels = {'a', 'e', 'i', 'o', 'u'}

            revs_with_synonyms1 = [
                re.sub(r'\b{}\s+{}\b'.format('a', target_word), 'an ' +
                       syn, rev_str) if syn[0] in vowels else re.sub(
                           r'\b{}\s+{}\b'.format('an', target_word), 'a ' +
                           syn, rev_str) for syn in synonyms_sorted
            ]

            revs_with_synonyms = [
                re.sub(r'\b{}\b'.format(target_word), synonyms_sorted[i],
                       revs_with_synonyms1[i])
                for i in range(len(synonyms_sorted))
            ]

            changed = False

            for i in range(len(revs_with_synonyms)):
                num_queries += 1
                pred = tmodel.getPredictions([revs_with_synonyms[i]])[0]
                if pred != orig_label:
                    changed = True
                    sel_sym = synonyms_sorted[i]
                    print(sel_sym)
                    misclassified = True
                    break

            # Check if any synonym can make any sentence that originally had the same
            # label as the review (orig_label) misclassify

            if not changed and len(orig_sentiment_sent) > 0:
                #print("len sents: ",len(orig_sentiment_sent))

                sents_with_syns1 = [[
                    re.sub(r'\b{}\s+{}\b'.format('a', target_word), 'an ' +
                           syn, sent) if syn[0] in vowels else re.sub(
                               r'\b{}\s+{}\b'.format('an', target_word), 'a ' +
                               syn, sent) for sent in orig_sentiment_sent
                ] for syn in synonyms_sorted]

                sents_with_syns = [[
                    re.sub(r'\b{}\b'.format(target_word), synonyms_sorted[i],
                           sent) for sent in sents_with_syns1[i]
                ] for i in range(len(synonyms_sorted))]

                for i in range(len(sents_with_syns)):
                    num_queries += len(sents_with_syns[i])
                    if tmodel.getPredictions(
                            sents_with_syns[i]).count(orig_label) < len(
                                sents_with_syns[i]):
                        changed = True
                        sel_sym = synonyms_sorted[i]
                        break

            if not changed and len(agg_list) > 0:

                # Check if any synonym can make any aggregate that originally had the same
                # label as the review (orig_label) misclassify

                aggs_with_syns1 = [[
                    re.sub(r'\b{}\s+{}\b'.format('a', target_word), 'an ' +
                           syn, agg) if syn[0] in vowels else re.sub(
                               r'\b{}\s+{}\b'.format('an', target_word), 'a ' +
                               syn, agg) for agg in agg_list
                ] for syn in synonyms_sorted]

                aggs_with_syns = [[
                    re.sub(r'\b{}\b'.format(target_word), synonyms_sorted[i],
                           agg) for agg in aggs_with_syns1[i]
                ] for i in range(len(synonyms_sorted))]

                for i in range(len(synonyms_sorted)):
                    num_queries += len(agg_list)
                    if tmodel.getPredictions(
                            aggs_with_syns[i]).count(orig_label) < len(
                                aggs_with_syns[i]):
                        changed = True
                        sel_sym = synonyms_sorted[i]
                        break

            if changed:

                for indx in word_idx_dic[str(target_word)]:
                    #print("changed")
                    text_prime[indx] = sel_sym
                    text_prime = vowel_correction(text_prime[:], indx)
                    backtrack_dic[(sel_sym, indx)] = target_word
                    num_changed += 1
    #print(num_changed)
    text_prime = ' '.join(text_prime)
    probs = tmodel.getPredictions([text_prime])
    return text_prime, num_changed, orig_label, probs[0], num_queries
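
vowel_correction is referenced above but not defined in this snippet; from its call sites it presumably fixes the indefinite article in front of the substituted word, mirroring the a/an regex handling earlier in the function. A minimal sketch under that assumption:

def vowel_correction(tokens, index):
    # Assumed behaviour: switch 'a' <-> 'an' before tokens[index] based on its first letter.
    vowels = {'a', 'e', 'i', 'o', 'u'}
    if index > 0 and tokens[index]:
        article = tokens[index - 1].lower()
        starts_with_vowel = tokens[index][0].lower() in vowels
        if article == 'a' and starts_with_vowel:
            tokens[index - 1] = 'an'
        elif article == 'an' and not starts_with_vowel:
            tokens[index - 1] = 'a'
    return tokens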
Code Example #5
def attack(
    text_ls,
    true_label,
    predictor,
    stop_words_set,
    word2idx,
    idx2word,
    cos_sim,
    sim_predictor=None,
    import_score_threshold=-1.0,
    sim_score_threshold=0.5,
    sim_score_window=15,
    synonym_num=50,
    batch_size=32,
):
    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    # orig_label = (
    #     torch.tensor(
    #         list(map(lambda x: 1.0 if x[0] > 0.5 else 0.0, orig_probs)),
    #         dtype=torch.long,
    #     )
    #     .cuda()
    #     .unsqueeze(-1)
    # )
    # orig_label = torch.tensor(
    #     1 if orig_probs.data >= 0.5 else 0, dtype=torch.long
    # ).cuda()
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return "", 0, orig_label, orig_label, 0
    else:
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)

        # get importance score
        leave_1_texts = [
            text_ls[:ii] + ["<oov>"] + text_ls[min(ii + 1, len_text):]
            for ii in range(len_text)
        ]
        leave_1_probs = predictor(leave_1_texts, batch_size=batch_size)
        num_queries += len(leave_1_texts)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)
        # leave_1_probs_argmax = torch.tensor(
        #     1 if leave_1_probs.data >= 0.5 else 0, dtype=torch.long
        # ).cuda()
        # leave_1_probs_argmax = (
        #     torch.tensor(
        #         list(map(lambda x: 1 if x[0] > 0.5 else 0, leave_1_probs)),
        #         dtype=torch.long,
        #     )
        #     .cuda()
        #     .unsqueeze(-1)
        # )
        import_scores = (
            (orig_prob - leave_1_probs[:, orig_label] +
             (leave_1_probs_argmax != orig_label).float() *
             (leave_1_probs.max(dim=-1)[0] - torch.index_select(
                 orig_probs, 0, leave_1_probs_argmax))).data.cpu().numpy())

        # get words to perturb, ranked by importance score
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores),
                                 key=lambda x: x[1],
                                 reverse=True):
            try:
                if (score > import_score_threshold
                        and text_ls[idx] not in stop_words_set):
                    words_perturb.append((idx, text_ls[idx]))
            except:
                print(idx, len(text_ls), import_scores.shape, text_ls,
                      len(leave_1_texts))

        # find synonyms
        words_perturb_idx = [
            word2idx[word] for idx, word in words_perturb if word in word2idx
        ]
        synonym_words, _ = pick_most_similar_words_batch(
            words_perturb_idx, cos_sim, idx2word, synonym_num, 0.5)
        synonyms_all = []
        for idx, word in words_perturb:
            if word in word2idx:
                synonyms = synonym_words.pop(0)
                if synonyms:
                    synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        text_prime = text_ls[:]
        text_cache = text_prime[:]
        num_changed = 0
        for idx, synonyms in synonyms_all:
            new_texts = [
                text_prime[:idx] + [synonym] +
                text_prime[min(idx + 1, len_text):] for synonym in synonyms
            ]
            new_probs = predictor(new_texts, batch_size=batch_size)

            # compute semantic similarity
            if (idx >= half_sim_score_window
                    and len_text - idx - 1 >= half_sim_score_window):
                text_range_min = idx - half_sim_score_window
                text_range_max = idx + half_sim_score_window + 1
            elif (idx < half_sim_score_window
                  and len_text - idx - 1 >= half_sim_score_window):
                text_range_min = 0
                text_range_max = sim_score_window
            elif (idx >= half_sim_score_window
                  and len_text - idx - 1 < half_sim_score_window):
                text_range_min = len_text - sim_score_window
                text_range_max = len_text
            else:
                text_range_min = 0
                text_range_max = len_text
            semantic_sims = sim_predictor.semantic_sim(
                [" ".join(text_cache[text_range_min:text_range_max])] *
                len(new_texts),
                list(
                    map(lambda x: " ".join(x[text_range_min:text_range_max]),
                        new_texts)),
            )[0]

            num_queries += len(new_texts)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            new_probs_mask = ((orig_label != torch.argmax(
                new_probs, dim=-1)).data.cpu().numpy())
            # prevent bad synonyms
            new_probs_mask *= semantic_sims >= sim_score_threshold
            # prevent incompatible pos
            synonyms_pos_ls = [
                criteria.get_pos(new_text[max(idx - 4, 0):idx +
                                          5])[min(4, idx)]
                if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                for new_text in new_texts
            ]
            pos_mask = np.array(
                criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask

            if np.sum(new_probs_mask) > 0:
                text_prime[idx] = synonyms[(new_probs_mask *
                                            semantic_sims).argmax()]
                num_changed += 1
                break
            else:
                new_label_probs = (new_probs[:, orig_label] + torch.from_numpy(
                    (semantic_sims < sim_score_threshold) +
                    (1 - pos_mask).astype(float)).float().cuda())
                new_label_prob_min, new_label_prob_argmin = torch.min(
                    new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[new_label_prob_argmin]
                    num_changed += 1
            text_cache = text_prime[:]
        return (
            " ".join(text_prime),
            num_changed,
            orig_label,
            torch.argmax(predictor([text_prime])),
            num_queries,
        )
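
pick_most_similar_words_batch appears to come from the TextFooler codebase. Judging from its call sites, it takes the indices of the words to perturb, the precomputed cosine-similarity matrix, an index-to-word map, a candidate count, and a similarity threshold, and returns one synonym list per word together with the similarity values. A rough numpy sketch of that contract (the real implementation may differ):

import numpy as np

def pick_most_similar_words_batch(word_idxs, cos_sim, idx2word, k=50, threshold=0.5):
    synonym_words, synonym_values = [], []
    for src in word_idxs:
        sims = cos_sim[src]
        order = np.argsort(-sims)[1:k + 1]        # top-k neighbours, skipping the word itself
        keep = order[sims[order] >= threshold]    # drop weakly similar candidates
        synonym_words.append([idx2word[i] for i in keep])
        synonym_values.append(sims[keep])
    return synonym_words, synonym_values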
Code Example #6
def attack(fuzz_val,
           top_k_words,
           qrs,
           wts,
           sample_index,
           text_ls,
           true_label,
           predictor,
           stop_words_set,
           word2idx,
           idx2word,
           cos_sim,
           word_embedding,
           sim_predictor=None,
           import_score_threshold=-1.,
           sim_score_threshold=0.5,
           sim_score_window=15,
           synonym_num=50,
           batch_size=32):
    rows = []
    nlp = spacy.load('en_core_web_sm')
    masked_lang_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    changed_with = []
    doc = nlp(' '.join(text_ls))
    text = []
    for sent in doc.sents:
        for token in sent:
            text.append(token.text)
    tok_text = []
    for item in text:
        ap = item.find("'")
        if ap >= 0:
            tok_text.append(item[0:ap])
            tok_text.append("'")
            tok_text.append(item[ap + 1:len(item)])
        else:
            tok_text.append(item)
    text = []
    for item in tok_text:
        if len(item) > 0:
            text.append(item)

    text_ls = text[:]

    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0, [], []
    else:

        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)
        # get importance score
        leave_1_texts = [
            text_ls[:ii] + ['<oov>'] + text_ls[min(ii + 1, len_text):]
            for ii in range(len_text)
        ]
        leave_1_probs = predictor(leave_1_texts, batch_size=batch_size)
        num_queries += len(leave_1_texts)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)
        import_scores = (
            orig_prob - leave_1_probs[:, orig_label] +
            (leave_1_probs_argmax != orig_label).float() *
            (leave_1_probs.max(dim=-1)[0] - torch.index_select(
                orig_probs, 0, leave_1_probs_argmax))).data.cpu().numpy()

        # get words to perturb, ranked by importance score
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores),
                                 key=lambda x: x[1],
                                 reverse=True):
            try:
                if score > import_score_threshold and text_ls[
                        idx] not in stop_words_set and len(text_ls[idx]) > 2:
                    words_perturb.append((idx, score))
            except:
                print(idx, len(text_ls), import_scores.shape, text_ls,
                      len(leave_1_texts))
        #return '', 0, orig_label, orig_label, 0, [], words_perturb
        # find synonyms
        # note: words_perturb holds (idx, score) pairs here, so this lookup stays empty and is
        # unused; replacement candidates are generated by the masked language model below
        words_perturb_idx = [
            word2idx[word] for idx, word in words_perturb if word in word2idx
        ]
        #synonym_words, synonym_values, synonyms_dict = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, synonym_num, -1.0)
        # start replacing and attacking
        text_prime = text_ls[:]
        sims = []
        text_cache = text_prime[:]
        num_changed = 0
        for idx, score in words_perturb:
            #print(text_ls[idx])
            text_range_min, text_range_max = calc_window(idx, 3, 10, len_text)

            sliced_text = text_prime[text_range_min:text_range_max]
            #print(sliced_text)
            new_index = idx - text_range_min
            #print(sliced_text[new_index])
            masked_idx = new_index

            tokens, words, position = gen.convert_sentence_to_token(
                ' '.join(sliced_text), 1000, tokenizer)
            assert len(words) == len(position)

            len_tokens = len(tokens)

            mask_position = position[masked_idx]

            if isinstance(mask_position, list):
                feature = gen.convert_whole_word_to_feature(
                    tokens, mask_position, 1000, tokenizer)
            else:
                feature = gen.convert_token_to_feature(tokens, mask_position,
                                                       1000, tokenizer)

            tokens_tensor = torch.tensor([feature.input_ids])
            token_type_ids = torch.tensor([feature.input_type_ids])
            attention_mask = torch.tensor([feature.input_mask])
            tokens_tensor = tokens_tensor.to('cuda')
            token_type_ids = token_type_ids.to('cuda')
            attention_mask = attention_mask.to('cuda')
            #new_probs = predictor(new_texts, batch_size=batch_size)
            masked_lang_model.to('cuda')
            masked_lang_model.eval()
            ps = PorterStemmer()

            with torch.no_grad():
                prediction_scores = masked_lang_model(tokens_tensor,
                                                      token_type_ids,
                                                      attention_mask)

            if isinstance(mask_position, list):
                predicted_top = prediction_scores[0, mask_position[0]].topk(50)
            else:
                predicted_top = prediction_scores[0, mask_position].topk(50)

            pre_tokens = tokenizer.convert_ids_to_tokens(
                predicted_top[1].cpu().numpy())
            synonyms_initial = gen.substitution_generation(
                words[masked_idx], pre_tokens, predicted_top[0].cpu().numpy(),
                ps, 50)
            new_texts = []
            avg = []
            synonyms = []
            assert words[masked_idx] == text_ls[idx]
            #print(synonyms)
            for candidate_word in synonyms_initial:
                if candidate_word in word_embedding and words[
                        masked_idx] in word_embedding:
                    candidate_similarity = calc_similarity(
                        word_embedding[words[masked_idx]],
                        word_embedding[candidate_word])
                    avg.append(candidate_similarity)
                    #print(words[masked_idx], candidate_similarity, candidate_word)
                    if candidate_similarity >= 0.2:
                        new_texts.append(text_prime[:idx] + [candidate_word] +
                                         text_prime[min(idx + 1, len_text):])
                        synonyms.append(candidate_word)
                else:
                    new_texts.append(text_prime[:idx] + [candidate_word] +
                                     text_prime[min(idx + 1, len_text):])
                    synonyms.append(candidate_word)
            #print(len(new_texts))
            if len(new_texts) == 0:
                continue

            text_range_min, text_range_max = calc_window(
                idx, half_sim_score_window, sim_score_window, len_text)
            semantic_sims = \
            sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                       list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0]
            sims.append(np.sum(semantic_sims) / len(semantic_sims))

            new_probs_mask = np.ones(
                len(new_texts)
            )  #(orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos
            synonyms_pos_ls = [
                criteria.get_pos(new_text[max(idx - 4, 0):idx +
                                          5])[min(4, idx)]
                if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                for new_text in new_texts
            ]
            pos_mask = np.array(
                criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask
            new_vals = semantic_sims * new_probs_mask
            index = []
            mini = 2
            for i in range(len(new_vals)):
                if new_vals[i] > 0:
                    index.append((new_vals[i], i))
            if len(index) == 0:
                continue
            new_texts1 = [new_texts[ind] for val, ind in index]
            #print(len(new_texts1))
            num_queries += len(new_texts1)
            if num_queries > qrs:
                return '', 0, orig_label, orig_label, 0, [], []
            new_probs = predictor(new_texts1, batch_size=batch_size)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            pr = (orig_label != torch.argmax(new_probs,
                                             dim=-1)).data.cpu().numpy()
            if np.sum(pr) > 0:
                text_prime[idx] = synonyms[index[pr.argmax(
                )][1]]  #synonyms[(new_probs_mask * semantic_sims).argmax()]
                num_changed += 1
                break
            else:
                new_label_probs = new_probs[:, orig_label]
                new_label_prob_min, new_label_prob_argmin = torch.min(
                    new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[index[new_label_prob_argmin][1]]
                    num_changed += 1
            text_cache = text_prime[:]

            if fuzz.token_set_ratio(' '.join(text_ls),
                                    ' '.join(text_cache)) < fuzz_val:
                return ' '.join(
                    text_prime), num_changed, orig_label, torch.argmax(
                        predictor([text_prime
                                   ])), num_queries, words_perturb, sims
        return ' '.join(text_prime), num_changed, orig_label, torch.argmax(
            predictor([text_prime])), num_queries, words_perturb, sims
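
calc_similarity above compares two word-embedding vectors and its result is thresholded at 0.2; a plain cosine similarity is the most likely definition, but since the helper is not shown this is an assumption:

import numpy as np

def calc_similarity(vec_a, vec_b):
    # Cosine similarity between two embedding vectors (assumed behaviour of the missing helper).
    vec_a, vec_b = np.asarray(vec_a, dtype=float), np.asarray(vec_b, dtype=float)
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / denom) if denom else 0.0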
Code Example #7
File: find_rules.py  Project: SebOchs/sears
def text_fooler(text_ls,
                true_label,
                model,
                stop_words_set,
                word2idx,
                idx2word,
                cos_sim,
                sim_predictor=None,
                import_score_threshold=-1.,
                sim_score_threshold=0.7,
                sim_score_window=15,
                synonym_num=50,
                batch_size=32):
    adversaries = []
    # first check the prediction of the original text#
    ref_ans, stud_ans = text_ls
    stud_ans = list_to_string(stud_ans).split(" ")
    orig_logits = predict(model, ref_ans, stud_ans, true_label)
    orig_probs = F.softmax(orig_logits, dim=0)
    orig_label = torch.argmax(orig_probs).item()
    orig_prob = orig_probs.max().item()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        len_text = len(stud_ans)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(stud_ans)

        # get importance score
        leave_1_texts = [
            stud_ans[:ii] + ['[UNK]'] + stud_ans[min(ii + 1, len_text):]
            for ii in range(len_text)
        ]
        leave_1_probs = []
        num_queries += len(leave_1_texts)

        for new_ans in leave_1_texts:
            new_logits = predict(model, ref_ans, new_ans, true_label)
            new_probs = F.softmax(new_logits, dim=0)
            leave_1_probs.append(new_probs)
        leave_1_probs = torch.stack(leave_1_probs)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)

        import_scores = (
            orig_prob - leave_1_probs[:, orig_label] +
            (leave_1_probs_argmax != orig_label).float() *
            (leave_1_probs.max(dim=-1)[0] - torch.index_select(
                orig_probs, 0, leave_1_probs_argmax))).data.cpu().numpy()

        # get words to perturb, ranked by importance score
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores),
                                 key=lambda x: x[1],
                                 reverse=True):
            try:
                if score > import_score_threshold and stud_ans[
                        idx] not in stop_words_set:
                    words_perturb.append((idx, stud_ans[idx]))
            except:
                print(idx, len(stud_ans), import_scores.shape, stud_ans,
                      len(leave_1_texts))

        # find synonyms
        words_perturb_idx = [
            word2idx[word] for idx, word in words_perturb if word in word2idx
        ]
        synonym_words, _ = pick_most_similar_words_batch(
            words_perturb_idx, cos_sim, idx2word, synonym_num, 0.5)

        synonyms_all = []
        for idx, word in words_perturb:
            if word in word2idx:
                synonyms = synonym_words.pop(0)
                if synonyms:
                    synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        text_prime = stud_ans[:]
        text_cache = text_prime[:]
        num_changed = 0
        for idx, synonyms in synonyms_all:
            new_texts = [
                text_prime[:idx] + [synonym] +
                text_prime[min(idx + 1, len_text):] for synonym in synonyms
            ]
            new_probs = []
            new_labels = []
            for syn_text in new_texts:
                syn_logits = predict(model, ref_ans, syn_text, true_label)
                new_probs.append(F.softmax(syn_logits, dim=0))

            new_probs = torch.stack(new_probs)

            # compute semantic similarity
            if idx >= half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = idx - half_sim_score_window
                text_range_max = idx + half_sim_score_window + 1
            elif idx < half_sim_score_window <= len_text - idx - 1:
                text_range_min = 0
                text_range_max = sim_score_window
            elif idx >= half_sim_score_window > len_text - idx - 1:
                text_range_min = len_text - sim_score_window
                text_range_max = len_text
            else:
                text_range_min = 0
                text_range_max = len_text
            semantic_sims = \
                sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                           list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[
                    0]

            num_queries += len(new_texts)

            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            new_probs_mask = (2 == torch.argmax(new_probs,
                                                dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos (maybe not)

            synonyms_pos_ls = [
                criteria.get_pos(new_text[max(idx - 4, 0):idx +
                                          5])[min(4, idx)]
                if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                for new_text in new_texts
            ]

            pos_mask = np.array(
                criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            # Uncomment to inverse mask and only allow candidates where POS is not the same
            # pos_mask = np.invert(pos_mask)
            new_probs_mask *= pos_mask

            if np.sum(new_probs_mask) > 0:
                text_prime[idx] = synonyms[(new_probs_mask *
                                            semantic_sims).argmax()]
                num_changed += 1
                adversaries.append(tuple(text_prime))
                break
            """
            else:
                new_label_probs = new_probs[:, orig_label] + torch.from_numpy(
                    (semantic_sims < sim_score_threshold) + (1 - pos_mask).astype(float)).float()
                new_label_prob_min, new_label_prob_argmin = torch.min(new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[new_label_prob_argmin]
                    num_changed += 1
            text_cache = text_prime[:]
            adversaries.append(text_cache)
            #new_labels.append()
            """
        # Combine adversaries with new labels
        result = set(adv for adv in adversaries if list(adv) != stud_ans)
        return num_changed, num_queries, result
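
All of the TextFooler-style variants above rank words with the same leave-one-out importance score: the drop in the original class probability when a word is replaced by an out-of-vocabulary token, plus, when the prediction flips, the margin the new top class gains over its original probability. Collected into one helper purely for readability (this restates the expression used in the examples; it is not a function from the repositories):

import torch

def leave_one_out_importance(orig_probs, leave_1_probs, orig_label):
    # orig_probs: (num_classes,) probabilities of the original text
    # leave_1_probs: (len_text, num_classes) probabilities with word i replaced by '<oov>'
    orig_prob = orig_probs.max()
    leave_1_argmax = torch.argmax(leave_1_probs, dim=-1)
    scores = (orig_prob - leave_1_probs[:, orig_label]
              + (leave_1_argmax != orig_label).float()
              * (leave_1_probs.max(dim=-1)[0]
                 - torch.index_select(orig_probs, 0, leave_1_argmax)))
    return scores.detach().cpu().numpy()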