Example #1
def example():
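    # Greedy flip attack: for each toxic validation sentence, repeatedly ask the
    # FlipDetector for the most promising character position, replace that character
    # via smart_replace(), and stop once the toxicity classifier drops below 0.5 or
    # 15 flips have been made.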
    # restore the trained models and data

    dataset = data.Dataset.init_from_dump()

    _, char_idx, _ = data.Dataset.init_embedding_from_dump()
    token_index = create_token_dict(char_idx)
    sess = tf.Session()
    config = FlipDetectorConfig(eval_only=True)
    model = FlipDetector(sess, config=config)

    index_of_toxic_sent = np.where(dataset.val_lbl[:, 0] == 1)[0]
    tox_model = ToxicityClassifier(session=sess)
    num_of_sentence_to_attack = 100
    for j in range(num_of_sentence_to_attack):
        seq = dataset.val_seq[index_of_toxic_sent[j]]
        sent = data.seq_2_sent(seq, char_idx)

        print(sent)
        flipped_seq = seq.copy()
        curr_class = get_callsifier(tox_model, flipped_seq)

        print('toxic class before: ', curr_class)

        # token_to_flip = char_idx['^']
        # for i in range(3):
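        # mask of positions that are still allowed to flip; an index is zeroed
        # after it has been flipped so each position is changed at most once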
        mask_char_allow_to_flip = np.ones([500])
        num_of_flips_done = 0
        while curr_class > 0.5 and num_of_flips_done < 15:
            _, probs = model.attack(flipped_seq, target_confidence=0.)
            mask_probs = probs * mask_char_allow_to_flip
            flip_idx = np.argmax(mask_probs, 1)[0]
            mask_char_allow_to_flip[flip_idx] = 0
            # curr_sentence = data.seq_2_sent(flipped_seq, char_idx)
            token_to_flip = flipped_seq[flip_idx]
            char_to_flip = token_index[token_to_flip]
            char_to_flip_to = smart_replace(char_to_flip)
            token_of_flip = char_idx[char_to_flip_to]
            flipped_seq[flip_idx] = token_of_flip
            curr_class = get_callsifier(tox_model, flipped_seq)

            print(data.seq_2_sent(flipped_seq, char_idx))
            print('char index that was flipped', flip_idx)
            print('toxic class after: ', curr_class)
            num_of_flips_done += 1

        print('done attacking sentence')

    sess.close()
Example #2
    def attack(self, data_seq, labels):
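        # Run the beam-search HotFlip attack on every toxic sentence in data_seq and
        # collect one attack record per sentence (sentences that are already classified
        # below the threshold are skipped).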

        hot_flip = HotFlip(
            model=self.model,
            debug=self.debug,
            beam_search_size=self.beam_size,
            attack_mode=self.attack_mode,
            stop_after_num_of_flips=self.stop_after_num_of_flips,
            use_tox_as_score=True,
            calc_tox_for_beam=True)

        # init list
        list_of_hot_flip_attack = []

        #choosing only the toxic sentences
        index_of_toxic_sent = np.where(labels[:, 0] == 1)[0]

        num_of_seq_to_attack = len(index_of_toxic_sent) if self.num_of_seq_to_attack is None \
            else min(self.num_of_seq_to_attack, len(index_of_toxic_sent))

        #attack first num_of_seq_to_attack sentences
        index_of_toxic_sent = index_of_toxic_sent[:num_of_seq_to_attack]

        t = time.time()

        for counter, i in enumerate(index_of_toxic_sent):
            seq = np.expand_dims(data_seq[i, :], 0)
            #true_classes = dataset.train_lbl[i, :]

            #do hot flip attack
            best_hot_flip_status, char_to_token_dic = hot_flip.attack(seq=seq)

            # build the attack record for this sentence
            curr_hot_flip_attack = self.create_data(best_hot_flip_status, i)

            # add the flip status; len == 0 means the original sentence was already
            # classified below the threshold
            if len(curr_hot_flip_attack) > 0:
                list_of_hot_flip_attack.append(curr_hot_flip_attack)

            # print the sentence after the flips
            if self.debug:
                print("sentence num: ", counter, "flipped sentence: ")
                print(data.seq_2_sent(best_hot_flip_status.fliped_sent, char_to_token_dic))

                dur = time.time() - t
                print("dur is: ", dur)

        return list_of_hot_flip_attack
Example #3
def example():
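    # Run the HotFlip attack on a few toxic training sentences and print the
    # toxicity score before and after the flips.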

    # restore the trained toxicity model
    sess = tf.Session()
    tox_model = ToxicityClassifier(session=sess)

    hot_flip = HotFlip(model=tox_model)

    list_of_attack = [hot_flip]

    # get data
    dataset = data.Dataset.init_from_dump()

    index_of_toxic_sent = np.where(dataset.train_lbl[:, 0] == 1)[0]

    #check_length(dataset)
    num_of_sentence_to_attack = 5
    for i in range(num_of_sentence_to_attack):

        index_to_attack = index_of_toxic_sent[i]

        # take the i-th toxic sentence
        seq = np.expand_dims(dataset.train_seq[index_to_attack, :], 0)

        for attack in list_of_attack:
            print("attack mode: ", attack.attack_mode)
            #do hot flip attack
            best_flip_status, char_to_token_dic = attack.attack(seq=seq)

            # print the sentence after the flips
            print("flipped sentence: ")
            print(data.seq_2_sent(best_flip_status.fliped_sent, char_to_token_dic))

            # classes before the change
            print("tox class before the flip: ")
            classes = tox_model.classify(seq)[0][0]
            print(classes)

            # classes after the change
            print("tox class after the flip: ")
            classes = tox_model.classify(np.expand_dims(best_flip_status.fliped_sent, 0))[0][0]
            print(classes)
Example #4
def example():
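    # Inspect the detector/selector dataset: print a training sentence together with
    # its flip-position label (detector) and its replacement-character label (selector).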

    # get hot flip data
    dataset = HotFlipDataProcessor.get_detector_selector_datasets()
    # get embedding and token dict
    _, char_to_token_dic, _ = data.Dataset.init_embedding_from_dump()

    print("input stentence 0: ")
    print(data.seq_2_sent(dataset.train_seq[0], char_to_token_dic))

    print("prediction detector: ")
    print(dataset.train_lbl[0])

    print("prediction char selector: ")
    print(dataset.train_replace_lbl[0])

    print("index of the char the should be fliped in the sentence 5")
    print(np.where(dataset.train_lbl == 1)[1][5])

    print("index of the char to flip to in the sentence 5")
    print(np.where(dataset.train_replace_lbl == 1)[1][5])
Example #5
def example():
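    # Train the FlipDetector on the HotFlip dataset, then mark the position it
    # predicts to flip in the first training sentence with '[*]'.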
    dataset = HotFlipDataProcessor.get_detector_selector_datasets()
    _, char_idx, _ = data.Dataset.init_embedding_from_dump()
    sess = tf.Session()
    config = FlipDetectorConfig(
        restore=False,
        restore_path=path.join(
            RES_OUT_DIR, 'detector_flip_beam10/detector_model.ckpt-84056'))
    model = FlipDetector(sess, config=config)
    # model._validate(dataset)
    model.train(dataset)

    seq = dataset.train_seq[0]
    flip_idx, _ = model.attack(seq, target_confidence=0.)[0]

    sent = data.seq_2_sent(seq, char_idx)
    flipped_sent = sent[:flip_idx] + '[*]' + sent[min(flip_idx + 1, len(sent)):]
    print(sent)
    print(flipped_sent)
    sess.close()
Example #6
    def attack(self, seq, mask=None):
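        # Beam-search flip attack: keep a beam of candidate flipped sentences and, at each
        # step, flip the character whose gradient promises the largest drop in the toxicity
        # score, until one of the stopping criteria below is met.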

        tox_model = self.tox_model

        #get embedding and token dict
        embedding_matrix, char_to_token_dic, _ = data.Dataset.init_embedding_from_dump()

        token_index = create_token_dict(char_to_token_dic)

        # squeeze the seq to vector
        squeeze_seq = seq.squeeze(0)

        if mask is None:
            mask = np.ones_like(squeeze_seq)

        if self.replace_only_letters_to_letters:
            mask = self.unmask_none_latters(seq, token_index, mask)

        # print sentence before the flip
        if self.debug:
            print(data.seq_2_sent(squeeze_seq, char_to_token_dic))

        # copy the sentence to the output sentence
        curr_squeeze_seq = squeeze_seq.copy()

        # create the initial beam search database
        beam_best_flip = self.create_initial_beam_search_database(
            curr_squeeze_seq, mask)

        setence_length_10_per = int(np.sum(curr_squeeze_seq != 0) / 10)
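        # (10% of the sentence's non-padding length, used by the stopping criteria below)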

        # loop over the number of chars to flip
        for num_of_flips in range(self.num_of_char_to_flip):

            # get best flip from beam
            best_hot_flip_status = self.get_best_hot_flip(beam_best_flip)
            curr_class = self.tox_model.classify(
                np.expand_dims(best_hot_flip_status.fliped_sent, 0))[0][0]
            if self.debug:
                print("curr class: ", curr_class)
            if curr_class < 0.5 and num_of_flips >= setence_length_10_per:
                break

            if curr_class < self.attack_threshold:
                break

            if self.break_on_half and curr_class < 0.5:
                break

            if self.stop_after_num_of_flips and num_of_flips > self.num_max_flips:
                break

            if self.calc_tox_for_beam:
                self.zero_all_score(beam_best_flip)

            # copy the current beam so we do not iterate over a database that changes inside the loop
            copy_beam_best_flip = beam_best_flip.copy()

            for curr_flip in copy_beam_best_flip:

                curr_squeeze_seq = curr_flip.fliped_sent.copy()

                # get grad
                char_grad_tox = self.get_char_grad_from_seq(
                    tox_model, embedding_matrix, curr_squeeze_seq)

                index_of_char_allowed_to_flip = np.argwhere(
                    curr_flip.mask == 1).squeeze(1)

                if self.attack_mode == 'flip':
                    #flip attack
                    max_flip_grad_per_char, flip_grad_matrix = self.flip_grad_calc(
                        index_of_char_allowed_to_flip, curr_squeeze_seq,
                        char_grad_tox, tox_model, char_to_token_dic,
                        token_index)

                    beam_best_flip = self.find_best_flip(
                        tox_model, flip_grad_matrix, beam_best_flip, curr_flip,
                        curr_squeeze_seq, max_flip_grad_per_char)
                else:
                    raise NotImplementedError()

        #get best flip from beam
        best_hot_flip_status = self.get_best_hot_flip(beam_best_flip)

        return best_hot_flip_status, char_to_token_dic
Example #7
 def attack(self,
            model='random',
            seq=None,
            mask=[],
            sequence_idx=0,
            attack_number=0):
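     # Attack seq[sequence_idx] with the chosen attack model ('random', 'hotflip',
     # 'detector' or 'atten'), then record the toxicity before/after, the flipped
     # index and the attack duration in self.attack_list.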
     assert model in ['random', 'hotflip', 'detector', 'atten']
     assert seq is not None
     seq = seq.copy()
     curr_seq = seq[sequence_idx]
     if len(mask) == 0:
         mask = np.ones_like(curr_seq)
     sent = data.seq_2_sent(curr_seq, self.char_index)
     tox_before = self._tox_model.classify(np.expand_dims(curr_seq, 0))[0][0]
     if self.config.debug:
         print("Attacking with model: ", model)
         print("Toxicity before attack: ", tox_before)
         print(sent)
     time_before = time.time()
     if model == 'random':
         _, _, flip_idx, res = self._random_flip.attack(
             curr_seq, mask, self.token_index, self.char_index)
         time_for_attack = time.time() - time_before
     elif model == 'hotflip':
         res = self._hotflip.attack(np.expand_dims(curr_seq, 0), mask)
         time_for_attack = time.time() - time_before
         flip_idx = res[0].char_to_flip_to
         res = res[0].fliped_sent
     elif model == 'atten':
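         # attention attack: zero out space positions in the mask, take the position
         # with the highest attention weight and replace its token with a random
         # different token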
         # atten_probs = np.zeros_like(curr_seq)
         spaces_indices = np.where(curr_seq == SPACE_EMBEDDING)
         mask[spaces_indices] = 0
         atten_probs = self.atten_fn([np.expand_dims(curr_seq, 0)])[0]
         time_for_attack = time.time() - time_before
         atten_probs = np.squeeze(atten_probs)
         atten_probs_masked = atten_probs * mask
         flip_idx = np.argsort(atten_probs_masked)[-1]
         token_to_flip = curr_seq[flip_idx]
         # char_to_flip = self.token_index[token_to_flip]
         if not atten_probs_masked.any():
             res = curr_seq
         else:
             token_to_flip_to = token_to_flip
             while token_to_flip_to == token_to_flip:
                 token_to_flip_to = np.random.randint(1, SPACE_EMBEDDING)
             curr_seq[flip_idx] = token_to_flip_to
             res = curr_seq
     else:
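         # detector attack: the FlipDetector proposes the position to flip and
         # selector_attack proposes the replacement token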
         _, probs = self._flip_detector.attack(curr_seq,
                                               target_confidence=0.)
         time_for_attack = time.time() - time_before
         spaces_indices = np.where(curr_seq == SPACE_EMBEDDING)
         mask[spaces_indices] = 0
         mask_probs = probs * mask
         flip_idx = np.argmax(mask_probs, 1)[0]
         token_to_flip = curr_seq[flip_idx]
         if not mask.any():
             return 0, 0, 0, curr_seq, time_for_attack
         token_of_flip, _ = self._flip_detector.selector_attack(
             curr_seq, flip_idx)
         # token_of_flip = token_to_flip
         # while token_of_flip == token_to_flip:
         #     token_of_flip = np.random.randint(1, SPACE_EMBEDDING)
         curr_seq[flip_idx] = token_of_flip
         res = curr_seq
     flipped_sent = data.seq_2_sent(res, self.char_index)
     tox_after = self._tox_model.classify(np.expand_dims(res, 0))[0][0]
     if self.config.debug:
         print("Toxicity after attack: ", tox_after)
         print(flipped_sent)
     single_attack = dict()
     single_attack['seq_idx'] = sequence_idx
     single_attack['seq_length'] = np.count_nonzero(curr_seq)
     single_attack['attack_model'] = model
     single_attack['time_for_attack'] = time_for_attack
     single_attack['tox_before'] = tox_before
     single_attack['tox_after'] = tox_after
     single_attack['attack_number'] = attack_number
     single_attack['flip_once_in_a_word'] = self.config.flip_once_in_a_word
     single_attack['flip_middle_letters_only'] = self.config.flip_middle_letters_only
     self.attack_list.append(single_attack.copy())
     return tox_before, tox_after, flip_idx, res, time_for_attack