Esempio n. 1
0
def find_top_k_words_with_tag(k, tag):
    """Return the k most frequent lowercased Brown-corpus words carrying
    POS tag *tag* that are immediately followed by a compatibly tagged word.

    Args:
        k: number of top words to return.
        tag: universal POS tag to look for; only 'ADJ' and 'ADV' are
            supported — any other value yields an empty list.

    Returns:
        List of at most k lowercased word strings, most frequent first,
        with stop words filtered out.
    """
    stop_words = criteria.get_stopwords()
    bigrams = nltk.bigrams(
        (x[0].lower(), x[1])
        for x in nltk.corpus.brown.tagged_words(tagset='universal'))
    # Map each supported tag to the tags allowed on the *following* word.
    # NOTE: tuples, not bare strings — the original used next_tags = 'VERB',
    # which made `x[1][1] in next_tags` a substring test instead of a
    # membership test.
    follow_tags = {
        'ADJ': ('PROPN', 'NOUN', 'PRON'),
        'ADV': ('VERB',),
    }
    next_tags = follow_tags.get(tag, ())
    # Keep the first word of every bigram matching (tag, allowed follower).
    tagged = [
        x[0] for x in bigrams if x[0][1] == tag and x[1][1] in next_tags
    ]
    freq = nltk.FreqDist(x for x in tagged if x[0] not in stop_words) \
        .most_common(k)
    top_list = [x[0][0] for x in freq]

    return top_list
def main():
    """Parse CLI arguments, build the target classifier and the contextual
    masked-LM BERT, attack every sampled text, and log the results.

    Side effects: creates ``args.output_dir`` if needed, appends a summary
    line to ``<output_dir>/results_log`` and writes the adversarial
    sentences to ``<output_dir>/adversaries.txt``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--dataset_path",
                        type=str,
                        required=True,
                        help="Which dataset to attack.")
    parser.add_argument("--nclasses",
                        type=int,
                        default=2,
                        help="How many classes for classification.")
    parser.add_argument("--target_model",
                        type=str,
                        required=True,
                        choices=['wordLSTM', 'bert', 'wordCNN'],
                        help="Target models for text classification: fasttext, charcnn, word level lstm "
                             "For NLI: InferSent, ESIM, bert-base-uncased")
    parser.add_argument("--target_model_path",
                        type=str,
                        required=True,
                        help="pre-trained target model path")
    parser.add_argument("--word_embeddings_path",
                        type=str,
                        default='',
                        help="path to the word embeddings for the target model")
    parser.add_argument("--counter_fitting_embeddings_path",
                        type=str,
                        required=True,
                        help="path to the counter-fitting embeddings we used to find synonyms")
    parser.add_argument("--counter_fitting_cos_sim_path",
                        type=str,
                        default='',
                        help="pre-compute the cosine similarity scores based on the counter-fitting embeddings")
    parser.add_argument("--USE_cache_path",
                        type=str,
                        required=True,
                        help="Path to the USE encoder cache.")
    parser.add_argument("--output_dir",
                        type=str,
                        default='adv_results',
                        help="The output directory where the attack results will be written.")

    ## Model hyperparameters
    parser.add_argument("--sim_score_window",
                        default=15,
                        type=int,
                        help="Text length or token number to compute the semantic similarity score")
    parser.add_argument("--import_score_threshold",
                        default=-1.,
                        type=float,
                        help="Required mininum importance score.")
    parser.add_argument("--sim_score_threshold",
                        default=0.85,
                        type=float,
                        help="Required minimum semantic similarity score.")
    parser.add_argument("--synonym_num",
                        default=50,
                        type=int,
                        help="Number of synonyms to extract")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size to get prediction")
    parser.add_argument("--data_size",
                        default=100,
                        type=int,
                        help="Data size to create adversaries")
    parser.add_argument("--perturb_ratio",
                        default=0.,
                        type=float,
                        help="Whether use random perturbation for ablation study")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="max sequence length for BERT target model")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    # get data to attack
    texts, labels = dataloader.read_corpus(args.dataset_path)
    data = list(zip(texts, labels))
    data = data[:args.data_size] # choose how many samples for adversary
    print("Data import finished!")

    # construct the model
    print("Building Model...")
    if args.target_model == 'wordLSTM':
        model = Model(args.word_embeddings_path, nclasses=args.nclasses).cuda()
        checkpoint = torch.load(args.target_model_path, map_location='cuda:0')
        model.load_state_dict(checkpoint)
    elif args.target_model == 'wordCNN':
        model = Model(args.word_embeddings_path, nclasses=args.nclasses, hidden_size=100, cnn=True).cuda()
        checkpoint = torch.load(args.target_model_path, map_location='cuda:0')
        model.load_state_dict(checkpoint)
    elif args.target_model == 'bert':
        model = NLI_infer_BERT(args.target_model_path, nclasses=args.nclasses, max_seq_length=args.max_seq_length)
    predictor = model.text_pred
    print("Model built!")
    # Masked LM used to propose context-aware synonym candidates.
    maskedLM = Contextual_synonyms_BERT(args.target_model_path, max_seq_length=args.max_seq_length)
    maskedLM_predictor=maskedLM #.text_pred
    print("Masked LM BERT built!")
    # prepare synonym extractor
    # build dictionary via the embedding file
    idx2word = {}
    word2idx = {}

    print("Building vocab...")
    with open(args.counter_fitting_embeddings_path, 'r') as ifile:
        for line in ifile:
            word = line.split()[0]
            if word not in idx2word:
                idx2word[len(idx2word)] = word
                word2idx[word] = len(idx2word) - 1

    print("Building cos sim matrix...")
    if args.counter_fitting_cos_sim_path:
        # load pre-computed cosine similarity matrix if provided
        print('Load pre-computed cosine similarity matrix from {}'.format(args.counter_fitting_cos_sim_path))
        cos_sim = np.load(args.counter_fitting_cos_sim_path)
    else:
        # calculate the cosine similarity matrix
        print('Start computing the cosine similarity matrix!')
        embeddings = []
        with open(args.counter_fitting_embeddings_path, 'r') as ifile:
            for line in ifile:
                embedding = [float(num) for num in line.strip().split()[1:]]
                embeddings.append(embedding)
        embeddings = np.array(embeddings)
        product = np.dot(embeddings, embeddings.T)
        norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
        cos_sim = product / np.dot(norm, norm.T)
    print("Cos sim import finished!")

    # build the semantic similarity module
    use = USE(args.USE_cache_path)

    # start attacking
    orig_failures = 0.
    adv_failures = 0.
    changed_rates = []
    nums_queries = []
    orig_texts = []
    adv_texts = []
    true_labels = []
    new_labels = []
    log_file = open(os.path.join(args.output_dir, 'results_log'), 'a')

    stop_words_set = criteria.get_stopwords()
    print('Start attacking!')
    for idx, (text, true_label) in enumerate(data):
        if idx % 2 == 0:
            print('{} samples out of {} have been finished!'.format(idx, args.data_size))
        if args.perturb_ratio > 0.:
            # Ablation: random word perturbation instead of the full attack.
            new_text, num_changed, orig_label, \
            new_label, num_queries = random_attack(text, true_label, predictor, args.perturb_ratio, stop_words_set,
                                                    word2idx, idx2word, cos_sim, sim_predictor=use,
                                                    sim_score_threshold=args.sim_score_threshold,
                                                    import_score_threshold=args.import_score_threshold,
                                                    sim_score_window=args.sim_score_window,
                                                    synonym_num=args.synonym_num,
                                                    batch_size=args.batch_size)
        else:
            new_text, num_changed, orig_label, \
            new_label, num_queries = contextual_attack(text, true_label, predictor, maskedLM_predictor, stop_words_set,
                                            word2idx, idx2word, cos_sim, sim_predictor=use,
                                            sim_score_threshold=args.sim_score_threshold,
                                            import_score_threshold=args.import_score_threshold,
                                            sim_score_window=args.sim_score_window,
                                            synonym_num=args.synonym_num,
                                            batch_size=args.batch_size)
            '''
            new_text, num_changed, orig_label, \
            new_label, num_queries = attack(text, true_label, predictor, stop_words_set,
                                            word2idx, idx2word, cos_sim, sim_predictor=use,
                                            sim_score_threshold=args.sim_score_threshold,
                                            import_score_threshold=args.import_score_threshold,
                                            sim_score_window=args.sim_score_window,
                                            synonym_num=args.synonym_num,
                                            batch_size=args.batch_size)
            '''
        if true_label != orig_label:
            orig_failures += 1
        else:
            nums_queries.append(num_queries)
        if true_label != new_label:
            adv_failures += 1
        print("OLD TEXT: "+ ' '.join(text))
        print("NEW TEXT: "+ new_text)
        print("NUM CHANGED: "+str(num_changed))
        sys.stdout.flush()
        changed_rate = 1.0 * num_changed / len(text)

        if true_label == orig_label and true_label != new_label:
            changed_rates.append(changed_rate)
            orig_texts.append(' '.join(text))
            adv_texts.append(new_text)
            true_labels.append(true_label)
            new_labels.append(new_label)

    # FIX: divide by the actual number of attacked samples instead of a
    # hard-coded 1000 — len(data) is capped by args.data_size (default 100),
    # so /1000 produced meaningless accuracy numbers.
    message = 'For target model {}: original accuracy: {:.3f}%, adv accuracy: {:.3f}%, ' \
              'avg changed rate: {:.3f}%, num of queries: {:.1f}\n'.format(args.target_model,
                                                                     (1-orig_failures/len(data))*100,
                                                                     (1-adv_failures/len(data))*100,
                                                                     np.mean(changed_rates)*100,
                                                                     np.mean(nums_queries))
    print(message)
    log_file.write(message)
    log_file.close()  # flush the appended summary line

    with open(os.path.join(args.output_dir, 'adversaries.txt'), 'w') as ofile:
        for orig_text, adv_text, true_label, new_label in zip(orig_texts, adv_texts, true_labels, new_labels):
            ofile.write('orig sent ({}):\t{}\nadv sent ({}):\t{}\n\n'.format(true_label, orig_text, new_label, adv_text))
def main():
    """Attack-loop variant that delegates synonym generation to the target
    model itself (see the ``attack(text, true_label, predictor, model,
    batch_size=...)`` call below) and dumps results with joblib.

    NOTE(review): ``args`` is never defined in this function and no parser
    is built here — presumably a module-level namespace exists; confirm
    before running. Likewise ``args.save`` is not declared by any parser
    visible in this file.
    """
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    # get data to attack
    texts, labels = dataloader.read_corpus(args.dataset_path)
    data = list(zip(texts, labels))
    data = data[:args.data_size]  # choose how many samples for adversary
    print("Data import finished!")

    # construct the model
    print("Building Model...")
    if args.target_model == 'wordLSTM':
        model = Model(args.word_embeddings_path, nclasses=args.nclasses).cuda()
        checkpoint = torch.load(args.target_model_path, map_location='cuda:0')
        model.load_state_dict(checkpoint)
    elif args.target_model == 'wordCNN':
        model = Model(args.word_embeddings_path,
                      nclasses=args.nclasses,
                      hidden_size=100,
                      cnn=True).cuda()
        checkpoint = torch.load(args.target_model_path, map_location='cuda:0')
        model.load_state_dict(checkpoint)
    elif args.target_model == 'bert':
        model = NLI_infer_BERT(args.target_model_path,
                               nclasses=args.nclasses,
                               max_seq_length=args.max_seq_length)
    predictor = model.text_pred
    print("Model built!")

    # prepare synonym extractor
    # build dictionary via the embedding file
    # NOTE(review): the vocab construction below is commented out, so both
    # dicts stay empty; only the (dead, see below) random_attack branch
    # reads them.
    idx2word = {}
    word2idx = {}

    # print("Building vocab...")
    # with open(args.counter_fitting_embeddings_path, 'r') as ifile:
    #     for line in ifile:
    #         word = line.split()[0]
    #         if word not in idx2word:
    #             idx2word[len(idx2word)] = word
    #             word2idx[word] = len(idx2word) - 1

    # print("Building cos sim matrix...")
    # if args.counter_fitting_cos_sim_path:
    #     # load pre-computed cosine similarity matrix if provided
    #     print('Load pre-computed cosine similarity matrix from {}'.format(args.counter_fitting_cos_sim_path))
    #     cos_sim = np.load(args.counter_fitting_cos_sim_path)
    # else:
    #     # calculate the cosine similarity matrix
    #     print('Start computing the cosine similarity matrix!')
    #     embeddings = []
    #     with open(args.counter_fitting_embeddings_path, 'r') as ifile:
    #         for line in ifile:
    #             embedding = [float(num) for num in line.strip().split()[1:]]
    #             embeddings.append(embedding)
    #     embeddings = np.array(embeddings)
    #     product = np.dot(embeddings, embeddings.T)
    #     norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
    #     cos_sim = product / np.dot(norm, norm.T)
    # print("Cos sim import finished!")

    # build the semantic similarity module
    # use = USE(args.USE_cache_path)
    use = None

    # start attacking
    orig_failures = 0.
    adv_failures = 0.
    changed_rates = []
    nums_queries = []
    orig_texts = []
    adv_texts = []
    true_labels = []
    new_labels = []
    log_file = open(os.path.join(args.output_dir, 'results_log'), 'a')

    stop_words_set = criteria.get_stopwords()
    print('Start attacking!')
    for idx, (text, true_label) in enumerate(data):
        if idx % 20 == 0:
            print('{} samples out of {} have been finished!'.format(
                idx, args.data_size))
        if args.perturb_ratio > 0.:
            # NOTE(review): cos_sim is never assigned in this version (its
            # computation is commented out above), so taking this branch
            # raises NameError — assumes perturb_ratio is always 0; confirm.
            new_text, num_changed, orig_label, \
            new_label, num_queries = random_attack(text, true_label, predictor, args.perturb_ratio, stop_words_set,
                                                    word2idx, idx2word, cos_sim, sim_predictor=use,
                                                    sim_score_threshold=args.sim_score_threshold,
                                                    import_score_threshold=args.import_score_threshold,
                                                    sim_score_window=args.sim_score_window,
                                                    synonym_num=args.synonym_num,
                                                    batch_size=args.batch_size)
        else:
            new_text, num_changed, orig_label, \
            new_label, num_queries = attack(text, true_label, predictor, model, batch_size=args.batch_size)

        print(true_label, orig_label, new_label)
        print("orig texts:", text)
        if true_label != orig_label:
            # Model already misclassifies the clean sample.
            orig_failures += 1
            print("orig failure")
        else:
            nums_queries.append(num_queries)
        if orig_label != new_label:
            # Attack flipped the model's prediction.
            adv_failures += 1
            print(
                f"attack successful: {adv_failures}/{idx + 1}={adv_failures / (idx + 1)}"
            )

        changed_rate = 1.0 * num_changed / len(text)

        if orig_label != new_label:
            changed_rates.append(changed_rate)
            orig_texts.append(text)
            adv_texts.append(new_text)
            true_labels.append(true_label)
            new_labels.append(new_label)

    message = 'For target model {}: original accuracy: {:.3f}%, adv accuracy: {:.3f}%, ' \
              'avg changed rate: {:.3f}%, num of queries: {:.1f}\n'.format(args.target_model,
                                                                     (1-orig_failures/args.data_size)*100,
                                                                     (1-adv_failures/args.data_size)*100,
                                                                     np.mean(changed_rates)*100,
                                                                     np.mean(nums_queries))
    print(message)
    log_file.write(message)

    # with open(os.path.join(args.output_dir, 'adversaries.txt'), 'w') as ofile:
    #     for orig_text, adv_text, true_label, new_label in zip(orig_texts, adv_texts, true_labels, new_labels):
    #         ofile.write('orig sent ({}):\t{}\nadv sent ({}):\t{}\n\n'.format(true_label, orig_text, new_label, adv_text))
    adv_data = {
        'adv_text': adv_texts,
        'orig_text': orig_texts,
        'true_labels': true_labels,
        'new_labels': new_labels
    }
    # Local import: joblib is only needed for this final dump.
    import joblib
    joblib.dump(adv_data, os.path.join(args.output_dir, args.save))
Esempio n. 4
0
def main():
    """Attack a text classifier (wordCNN / wordLSTM / BERT or the GCP NLP
    API) with counter-fitted-embedding synonym substitutions.

    Loads the dataset (first 100 samples), builds the cosine-similarity
    matrix over the texts' vocabulary, restores the requested target model,
    attacks each sample, then appends aggregate statistics to
    ``<output_dir>/results_log`` and writes the adversarial sentences to a
    (hard-coded) adversaries file in the output directory.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--dataset_path",
                        type=str,
                        required=True,
                        help="Which dataset to attack.")

    parser.add_argument(
        "--target_model",
        type=str,
        required=True,
        choices=['wordLSTM', 'bert', 'wordCNN', 'gcp'],
        help=
        "Target models for text classification: wordcnn, word Lstm or GCP NLP API "
    )

    parser.add_argument("--gcp_nlp_json_link",
                        type=str,
                        required=False,
                        default='',
                        help="Link to GCP NLP API json key file")

    parser.add_argument("--target_model_path",
                        type=str,
                        required=False,
                        default='',
                        help="pre-trained target model path")

    parser.add_argument("--nclasses",
                        type=int,
                        default=2,
                        help="How many classes for classification.")

    parser.add_argument(
        "--word_embeddings_path",
        type=str,
        default='word_embeddings_path/glove.6B.200d.txt',
        help="path to the word embeddings for the target model")
    parser.add_argument(
        "--counter_fitting_embeddings_path",
        type=str,
        required=False,
        default="counter_fitting_embedding/counter-fitted-vectors.txt",
        help="path to the counter-fitting embeddings used to find synonyms")

    parser.add_argument(
        "--output_dir",
        type=str,
        default='adv_results',
        help="The output directory where the attack results will be written.")

    parser.add_argument("--pos_filter",
                        type=str,
                        default='coarse',
                        help="pos filter mask: either 'fine' or 'coarse")

    args = parser.parse_args()

    # download data to be attacked; only the first 100 samples are used
    data = download_attack_data(args.dataset_path)
    data = data[:100]

    # find word2idx and idx2word dicts
    embeddings, word2idx_vocab, idx2word_vocab = generate_embedding_mat(
        args.counter_fitting_embeddings_path)

    # compute cosine similarity matrix of words in text
    cos_sim, word2idx_rev, idx2word_rev = generateCosSimMat.csim_matrix(
        data, embeddings, word2idx_vocab)

    # Build the target model; non-bert/gcp models load a checkpoint below.
    if args.target_model == "wordCNN":
        default_model_path = "saved_models/wordCNN/"
        if 'imdb' in args.dataset_path:
            default_model_path += 'imdb'
        elif 'mr' in args.dataset_path:
            default_model_path += 'mr'

        cmodel = Model(args.word_embeddings_path,
                       nclasses=args.nclasses,
                       hidden_size=100,
                       cnn=True).cuda()

    elif args.target_model == "wordLSTM":
        default_model_path = "saved_models/wordLSTM/"
        if 'imdb' in args.dataset_path:
            default_model_path += 'imdb'
        elif 'mr' in args.dataset_path:
            default_model_path += 'mr'
        elif 'ag' in args.dataset_path:
            default_model_path += 'ag'
        cmodel = Model(args.word_embeddings_path,
                       nclasses=args.nclasses,
                       cnn=False).cuda()

    elif args.target_model == "bert":
        default_model_path = "saved_models/bert/"
        if 'imdb' in args.dataset_path:
            default_model_path += 'imdb'
        elif 'mr' in args.dataset_path:
            default_model_path += 'mr'
        if args.target_model_path:
            cmodel = LoadPretrainedBert.loadPretrainedModel(
                args.target_model_path, nclasses=args.nclasses)
        else:
            cmodel = LoadPretrainedBert.loadPretrainedModel(
                default_model_path, nclasses=args.nclasses)

    elif args.target_model == "gcp":
        # GCP NLP API is queried remotely; no local model to load.
        cmodel = None

    if args.target_model != 'bert' and args.target_model != 'gcp':
        # load checkpoints for the word-level models
        if args.target_model_path:
            print("target model path")
            checkpoint = torch.load(args.target_model_path,
                                    map_location=torch.device('cuda:0'))
        else:
            checkpoint = torch.load(default_model_path,
                                    map_location=torch.device('cuda:0'))

        cmodel.load_state_dict(checkpoint)

    # Accumulators for attack statistics.
    orig_failures = 0.
    adv_failures = 0.
    changed_rates = []
    nums_queries = []
    orig_texts = []
    adv_texts = []
    true_labels = []
    new_labels = []
    # FIX: make sure the output directory exists before appending the log —
    # the original opened the file without ever creating the directory.
    os.makedirs(args.output_dir, exist_ok=True)
    log_file = open(os.path.join(args.output_dir, 'results_log'), 'a')
    stop_words_set = criteria.get_stopwords()
    predictor = 1  # placeholder; only interpolated into the summary message
    sim_score_threshold = 0.5
    size = len(data)
    print('Start attacking!')
    #tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    pos_filter = args.pos_filter
    for idx, (text, true_label) in enumerate(data):
        print(idx)
        if idx % 20 == 0:
            print('{} samples out of {} have been finished!'.format(idx, size))

        new_text, num_changed, orig_label, \
        new_label, num_queries = attack(cmodel, args.gcp_nlp_json_link, text, true_label, stop_words_set,
                                                  word2idx_rev, idx2word_rev, idx2word_vocab, cos_sim, pos_filter,
                                                  synonym_num=80, sim_score_threshold=sim_score_threshold ,
                                                  syn_sim=0.65)

        #print(text)
        if true_label != orig_label:
            # Model already misclassifies the clean sample.
            orig_failures += 1
        else:
            nums_queries.append(num_queries)

        if true_label != new_label:
            adv_failures += 1

        # changed_rate is relative to the word-token count of the raw text.
        tokenizer = RegexpTokenizer(r'\w+')
        text_tokens = tokenizer.tokenize(text)
        changed_rate = 1.0 * num_changed / len(text_tokens)

        if true_label == orig_label and true_label != new_label:
            changed_rates.append(changed_rate)
            orig_texts.append(text)
            adv_texts.append(new_text)
            true_labels.append(true_label)
            new_labels.append(new_label)

    message = 'For target model {}: original accuracy: {:.3f}%, adv accuracy: {:.3f}%, ' \
                  'avg changed rate: {:.3f}%, Avg num of queries: {:.1f}\n, Median num of queries:{:.1f} \n'.format(predictor,
                                                                          (1-orig_failures/size)*100,
                                                                                (1-adv_failures/size)*100,
                                                                          np.mean(changed_rates)*100,
                                                                          np.mean(nums_queries),

                                                                          np.median(nums_queries))
    print(message)
    log_file.write(message)
    log_file.close()  # flush the appended summary line
    # NOTE(review): the adversaries filename is hard-coded to one particular
    # run configuration — consider deriving it from the CLI arguments.
    with open(
            os.path.join(args.output_dir,
                         'adversaries_AG_1-1000_lstm_coarse_POS.txt'),
            'w') as ofile:
        for orig_text, adv_text, true_label, new_label in zip(
                orig_texts, adv_texts, true_labels, new_labels):
            ofile.write('orig sent ({}):\t{}\nadv sent ({}):\t{}\n\n'.format(
                true_label, orig_text, new_label, adv_text))
Esempio n. 5
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--dataset_path",
                        type=str,
                        required=True,
                        help="Which dataset to attack.")
    parser.add_argument("--nclasses",
                        type=int,
                        default=2,
                        help="How many classes for classification.")
    parser.add_argument(
        "--target_model",
        type=str,
        required=True,
        choices=['wordLSTM', 'bert', 'wordCNN'],
        help=
        "Target models for text classification: fasttext, charcnn, word level lstm "
        "For NLI: InferSent, ESIM, bert-base-uncased")
    parser.add_argument("--target_model_path",
                        type=str,
                        required=True,
                        help="pre-trained target model path")
    parser.add_argument(
        "--word_embeddings_path",
        type=str,
        default='',
        help="path to the word embeddings for the target model")
    parser.add_argument(
        "--counter_fitting_embeddings_path",
        type=str,
        required=True,
        help="path to the counter-fitting embeddings we used to find synonyms")
    parser.add_argument(
        "--counter_fitting_cos_sim_path",
        type=str,
        default='',
        help=
        "pre-compute the cosine similarity scores based on the counter-fitting embeddings"
    )
    parser.add_argument("--USE_cache_path",
                        type=str,
                        required=True,
                        help="Path to the USE encoder cache.")
    parser.add_argument(
        "--output_dir",
        type=str,
        default='adv_results',
        help="The output directory where the attack results will be written.")

    ## Model hyperparameters
    parser.add_argument(
        "--sim_score_window",
        default=15,
        type=int,
        help=
        "Text length or token number to compute the semantic similarity score")
    parser.add_argument("--import_score_threshold",
                        default=-1.,
                        type=float,
                        help="Required mininum importance score.")
    parser.add_argument("--sim_score_threshold",
                        default=0.7,
                        type=float,
                        help="Required minimum semantic similarity score.")
    parser.add_argument("--synonym_num",
                        default=5000,
                        type=int,
                        help="Number of synonyms to extract")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size to get prediction")
    parser.add_argument("--data_size",
                        default=1000,
                        type=int,
                        help="Data size to create adversaries")
    parser.add_argument(
        "--perturb_ratio",
        default=0.,
        type=float,
        help="Whether use random perturbation for ablation study")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="max sequence length for BERT target model")
    parser.add_argument("--target_dataset",
                        default="imdb",
                        type=str,
                        help="Dataset Name")
    parser.add_argument("--fuzz",
                        default=0,
                        type=int,
                        help="Word Pruning Value")
    parser.add_argument("--top_k_words",
                        default=1000000,
                        type=int,
                        help="Top K Words")
    parser.add_argument("--allowed_qrs",
                        default=1000000,
                        type=int,
                        help="Allowerd qrs")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(
            args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    # get data to attack
    texts, labels = dataloader.read_corpus(args.dataset_path, csvf=False)
    data = list(zip(texts, labels))
    data = data[:args.data_size]  # choose how many samples for adversary
    print("Data import finished!")

    # construct the model
    print("Building Model...")
    if args.target_model == 'wordLSTM':
        model = Model(args.word_embeddings_path, nclasses=args.nclasses).cuda()
        checkpoint = torch.load(args.target_model_path, map_location='cuda:0')
        model.load_state_dict(checkpoint)
    elif args.target_model == 'wordCNN':
        model = Model(args.word_embeddings_path,
                      nclasses=args.nclasses,
                      hidden_size=150,
                      cnn=True).cuda()
        checkpoint = torch.load(args.target_model_path, map_location='cuda:0')
        model.load_state_dict(checkpoint)
    elif args.target_model == 'bert':
        model = NLI_infer_BERT(args.target_model_path,
                               nclasses=args.nclasses,
                               max_seq_length=args.max_seq_length)
    predictor = model.text_pred
    print("Model built!")

    # prepare synonym extractor
    # build dictionary via the embedding file
    idx2word = {}
    word2idx = {}
    sim_lis = []
    word_embedding = defaultdict(list)

    print("Building vocab...")
    with open(args.counter_fitting_embeddings_path, 'r') as ifile:
        for line in ifile:
            word = line.split()[0]
            embedding = [float(num) for num in line.strip().split()[1:]]
            if word not in idx2word:
                idx2word[len(idx2word)] = word
                word2idx[word] = len(idx2word) - 1
                word_embedding[word] = embedding

    print("Building cos sim matrix...")
    if args.counter_fitting_cos_sim_path:
        print('Load pre-computed cosine similarity matrix from {}'.format(
            args.counter_fitting_cos_sim_path))
        with open(args.counter_fitting_cos_sim_path, "rb") as fp:
            sim_lis = pickle.load(fp)
        # load pre-computed cosine similarity matrix if provided
        #print('Load pre-computed cosine similarity matrix from {}'.format(args.counter_fitting_cos_sim_path))
        #cos_sim = np.load(args.counter_fitting_cos_sim_path)
    else:
        print('Start computing the cosine similarity matrix!')
        embeddings = []
        with open(args.counter_fitting_embeddings_path, 'r') as ifile:
            for line in ifile:
                embedding = [float(num) for num in line.strip().split()[1:]]
                embeddings.append(embedding)
        embeddings = np.array(embeddings)
        print(embeddings.T.shape)
        norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
        embeddings = np.asarray(embeddings / norm, "float64")
        cos_sim = np.dot(embeddings, embeddings.T)

    print("Cos sim import finished!")

    # build the semantic similarity module
    use = USE(args.USE_cache_path)

    # start attacking
    orig_failures = 0.
    adv_failures = 0.
    avg = 0.
    changed_rates = []
    nums_queries = []
    orig_texts = []
    adv_texts = []
    true_labels = []
    new_labels = []
    wrds = []
    s_queries = []
    f_queries = []
    sims_final = []
    success = []
    results = []
    fails = []
    scrs = []

    log_file = "results_context/" + args.target_model + "/" + args.target_dataset + "/log.txt"
    result_file = "results_context/" + args.target_model + "/" + args.target_dataset + "/results.csv"
    wts_file = "tfidf_weights/" + "tfidf-" + args.target_dataset + ".csv"
    fail_file = "fails_tfidf/" + args.target_model + "/" + args.target_dataset + "/fails.csv"

    stop_words_set = criteria.get_stopwords()
    print('Start attacking!')

    for idx, (text, true_label) in enumerate(data):
        #print(text)
        #print(true_label)
        if idx % 20 == 0:
            print(str(idx) + " Samples Done")
            print(len(success))
            print(np.mean(changed_rates))
        if args.perturb_ratio > 0.:
            new_text, num_changed, orig_label, \
            new_label, num_queries = random_attack(text, true_label, predictor,
                                                    args.perturb_ratio, stop_words_set,
                                                    word2idx, idx2word, cos_sim, sim_predictor=use,
                                                    sim_score_threshold=args.sim_score_threshold,
                                                    import_score_threshold=args.import_score_threshold,
                                                    sim_score_window=args.sim_score_window,
                                                    synonym_num=args.synonym_num,
                                                    batch_size=args.batch_size)
        else:
            new_text, num_changed, orig_label, \
            new_label, num_queries,pwrds,perts = attack(args.fuzz,args.top_k_words,args.allowed_qrs,
                                            wts_file,idx,text, true_label, predictor, stop_words_set,
                                            word2idx, idx2word, sim_lis, word_embedding , sim_predictor=use,
                                            sim_score_threshold=args.sim_score_threshold,
                                            import_score_threshold=args.import_score_threshold,
                                            sim_score_window=args.sim_score_window,
                                            synonym_num=args.synonym_num,
                                            batch_size=args.batch_size)
        scrs.append(perts)
        if true_label != orig_label:
            orig_failures += 1
        else:
            nums_queries.append(num_queries)

        if true_label != new_label:
            adv_failures += 1
            #f_queries.append(num_queries)

        changed_rate = 1.0 * num_changed / len(text)
        if true_label == orig_label and true_label != new_label:
            temp = []
            s_queries.append(num_queries)
            success.append(idx)
            changed_rates.append(changed_rate)
            orig_texts.append(' '.join(text))
            adv_texts.append(new_text)
            true_labels.append(true_label)
            new_labels.append(new_label)
            wrds.append(pwrds)
            temp.append(idx)
            temp.append(' '.join(text))
            temp.append(new_text)
            temp.append(num_queries)
            temp.append(changed_rate * 100)
            results.append(temp)
            print("Attacked: " + str(idx))
        if true_label == orig_label and true_label == new_label:
            f_queries.append(num_queries)
            temp1 = []
            temp1.append(idx)
            temp1.append(' '.join(text))
            temp1.append(new_text)
            temp1.append(num_queries)
            fails.append(temp1)

    message = 'For target model {} on dataset window size {} with WP val {} top words {} qrs {} : ' \
              'original accuracy: {:.3f}%, adv accuracy: {:.3f}%, ' \
              'avg changed rate: {:.3f}%, num of queries: {:.1f}\n'.format(args.target_model,
                                                                      args.sim_score_window,
                                                                      args.fuzz,
                                                                      args.top_k_words,args.allowed_qrs,
                                                                     (1-orig_failures/1000)*100,
                                                                     (1-adv_failures/1000)*100,
                                                                     np.mean(changed_rates)*100,
                                                                     np.mean(nums_queries))
    print(message)

    log = open(log_file, 'a')
    log.write(message)
    with open(result_file, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerows(results)
    #with open('scores-mr.csv','w') as csvfile:
    #    csvwriter = csv.writer(csvfile)
    #    csvwriter.writerows(scrs)


#    with open(fail_file,'w') as csvfile:
#        csvwriter = csv.writer(csvfile)
#        csvwriter.writerows(fails)
# writing the data rows
    print(avg)
    print(len(f_queries))
    print(f_queries)

    with open(os.path.join(args.output_dir, 'adversaries.txt'), 'w') as ofile:
        for orig_text, adv_text, true_label, new_label in zip(
                orig_texts, adv_texts, true_labels, new_labels):
            ofile.write('orig sent ({}):\t{}\nadv sent ({}):\t{}\n\n'.format(
                true_label, orig_text, new_label, adv_text))
Example no. 6
0
def main():
    """Run a TextFooler-style synonym-substitution attack against a BERT classifier.

    Command-line driven: loads the dataset, wraps the fine-tuned BERT target
    model, builds the synonym vocabulary from counter-fitted embeddings plus a
    pre-computed cosine-similarity matrix, then runs ``attack`` on each sample
    and prints total attack time, average word-change rate and success rate.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--dataset_path",
                        type=str,
                        required=True,
                        help="Which dataset to attack.")
    parser.add_argument("--nclasses",
                        type=int,
                        default=2,
                        help="How many classes for classification.")
    parser.add_argument(
        "--target_model",
        type=str,
        required=True,
        choices=['wordLSTM', 'bert', 'wordCNN'],
        help=
        "Target models for text classification: fasttext, charcnn, word level lstm "
        "For NLI: InferSent, ESIM, bert-base-uncased")
    parser.add_argument("--target_model_path",
                        type=str,
                        required=True,
                        help="pre-trained target model path")
    parser.add_argument(
        "--word_embeddings_path",
        type=str,
        default='',
        help="path to the word embeddings for the target model")
    parser.add_argument(
        "--counter_fitting_embeddings_path",
        type=str,
        required=True,
        help="path to the counter-fitting embeddings we used to find synonyms")
    parser.add_argument(
        "--counter_fitting_cos_sim_path",
        type=str,
        default='',
        help=
        "pre-compute the cosine similarity scores based on the counter-fitting embeddings"
    )
    parser.add_argument("--USE_cache_path",
                        type=str,
                        required=True,
                        help="Path to the USE encoder cache.")
    parser.add_argument(
        "--output_dir",
        type=str,
        default='adv_results',
        help="The output directory where the attack results will be written.")

    ## Model hyperparameters
    parser.add_argument(
        "--sim_score_window",
        default=15,
        type=int,
        help=
        "Text length or token number to compute the semantic similarity score")
    parser.add_argument("--import_score_threshold",
                        default=-1.,
                        type=float,
                        help="Required minimum importance score.")
    parser.add_argument("--sim_score_threshold",
                        default=0.7,
                        type=float,
                        help="Required minimum semantic similarity score.")
    parser.add_argument("--synonym_num",
                        default=50,
                        type=int,
                        help="Number of synonyms to extract")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size to get prediction")
    parser.add_argument("--data_size",
                        default=1000,
                        type=int,
                        help="Data size to create adversaries")
    parser.add_argument(
        "--perturb_ratio",
        default=0.,
        type=float,
        help="Whether use random perturbation for ablation study")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="max sequence length for BERT target model")

    args = parser.parse_args()

    # get data to attack
    texts, labels = dataloader.read_corpus(args.dataset_path)
    data = list(zip(texts, labels))
    data = data[:args.data_size]  # choose how many samples for adversary
    print("Data import finished!")

    # construct the model
    print("Building Model...")
    model = BERTInference(args.target_model_path,
                          nclasses=args.nclasses,
                          max_seq_length=args.max_seq_length)
    predictor = model.text_pred
    print("Model built!")

    # prepare synonym extractor
    # build dictionary via the embedding file
    idx2word = {}
    word2idx = {}

    print("Building vocab...")
    # Counter-fitted vectors are distributed as UTF-8 text; pass the encoding
    # explicitly so the read does not depend on the platform's default locale.
    with open(args.counter_fitting_embeddings_path, 'r',
              encoding='utf8') as ifile:
        for line in ifile:
            # first whitespace-separated token of each line is the word;
            # ids are assigned in file order, skipping duplicates
            word = line.split()[0]
            if word not in idx2word:
                idx2word[len(idx2word)] = word
                word2idx[word] = len(idx2word) - 1

    print("Building cos sim matrix...")
    cos_sim = np.load(args.counter_fitting_cos_sim_path)
    print("Cos sim import finished!")

    # build the semantic similarity module
    # use = UniversalSentenceEncoder(args.USE_cache_path)
    use = UniversalSentenceEncoder()

    stop_words_set = criteria.get_stopwords()
    print('Start attacking!')
    changed_rates = []
    total_time, success, total = 0, 0, 0
    for idx, (text, true_label) in enumerate(data):
        tick = time.time()
        new_text, num_changed, orig_label, \
        new_label, num_queries = attack(text, true_label, predictor, stop_words_set,
                                        word2idx, idx2word, cos_sim, sim_predictor=use,
                                        sim_score_threshold=args.sim_score_threshold,
                                        import_score_threshold=args.import_score_threshold,
                                        sim_score_window=args.sim_score_window,
                                        synonym_num=args.synonym_num,
                                        batch_size=args.batch_size)

        old_text = ' '.join(text)
        print(f"Original: {old_text}")
        print()
        print(f"New:      {new_text}")
        print("--------------------------------------------------------------")

        # fraction of tokens the attack had to replace in this sample
        changed_rate = num_changed / len(text)
        # count as a success only when the model was right on the original
        # text and the perturbed text flips its prediction
        if true_label == orig_label and true_label != new_label:
            changed_rates.append(changed_rate)
            tock = time.time()
            total_time += tock - tick
            success += 1
        total += 1
    # guard against an empty dataset slice so the summary never divides by zero
    success_rate = (success / total) * 100 if total else 0.0
    print(
        f"Time: {total_time}\tAvg. Change Rate: {np.mean(changed_rates)*100}\tSuccess Rate: {success_rate}"
    )
Example no. 7
0
def main():
    """Find adversarial flips for a BERT answer-grading model, then mine
    frequent replacement rules that reproduce those flips.

    Pipeline: load correctly-classified SciEntsBank examples, attack each
    student answer with both ``text_fooler`` and ``text_bugger``, collect the
    successful adversaries per answer, generalize them into candidate
    token/POS replacement rules, re-apply every frequent rule across the
    corpus with the model in the loop, and save the rules that flip more than
    1% of the data.

    NOTE(review): relies on hard-coded relative paths for the model, the
    validation data and the TextFooler resources — confirm the expected
    working directory before running.
    """
    # Own data
    val_data = np.load(
        '../bachelor-thesis/models/bert_scientsBank/correct_sciEntsBank_val.npy',
        allow_pickle=True)
    # Own model: 3-way sequence classifier fine-tuned from bert-base-uncased
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels=3)
    model.load_state_dict(
        torch.load('../bachelor-thesis/models/bert_sciEntsBank/model.pt'))
    model.cuda()
    model.eval()
    # data derived from correct model predictions, list of tuples of reference answer, student's answer and prediction
    # All cases are incorrect
    data = [separate_answers(x[0]) for x in val_data if x[1] == 0]

    # TextFooler part
    # prepare synonym extractor
    # build dictionary via the embedding file
    idx2word = {}
    word2idx = {}
    stop_words_set = criteria.get_stopwords()
    print("Building vocab...")
    with open("../TextFooler/data/counter-fitted-vectors.txt",
              'r',
              encoding="utf8") as ifile:
        for line in ifile:
            # first token of each line is the word; ids assigned in file
            # order, duplicates skipped
            word = line.split()[0]
            if word not in idx2word:
                idx2word[len(idx2word)] = word
                word2idx[word] = len(idx2word) - 1

    print("Building cos sim matrix...")
    cos_sim = np.load("../TextFooler/data/cos_sim_counter_fitting.npy",
                      allow_pickle=True)
    print("Cos sim import finished!")
    use = USE("use")
    print('Start attacking!')
    orig_scores = {}
    # maps a student's answer (joined to a string) -> list of adversarial
    # variants that flipped the model on it
    flips = collections.defaultdict(lambda: [])
    # Find flips in data
    # per-attack-type tallies (key 'tf' for TextFooler, plus whatever keys
    # text_bugger reports)
    adversary_successes = {}
    adversary_count = {}
    # Was used to track top adjectives/adverbs
    # main_tracker_adv = {}
    # main_tracker_adj = {}
    for i, inst in enumerate(data):
        print("Data instances finished: ", i)
        adversaries = []
        num_tf_changed, num_tf_queries, tf_adversaries = text_fooler(
            inst,
            0,
            model,
            stop_words_set,
            word2idx,
            idx2word,
            cos_sim,
            sim_predictor=use,
            sim_score_threshold=0.7,
            import_score_threshold=-1.,
            sim_score_window=4,
            synonym_num=50,
            batch_size=16)
        # Uncomment for textfooler only
        query_num, success_num, bug_adversaries = text_bugger(inst, 0, model)
        # Was used to track top adjectives and adversaries
        """, tracker_adj, tracker_adv"""

        # All adversaries
        adversaries.extend(tf_adversaries)
        adversaries.extend(bug_adversaries)

        # Was used to track top adjectives and adversaries
        """
        for key in tracker_adj:
            main_tracker_adj[key] = main_tracker_adj.get(key, 0) + tracker_adj[key]
        for key in tracker_adv:
            main_tracker_adv[key] = main_tracker_adv.get(key, 0) + tracker_adv[key]
        """

        if len(adversaries) > 0:
            # key the flips by the student's answer (inst[1]) as a string
            flips[list_to_string(inst[1])].extend(adversaries)
            adversary_successes['tf'] = adversary_successes.get(
                'tf', 0) + num_tf_changed
            adversary_count['tf'] = adversary_count.get('tf',
                                                        0) + num_tf_queries
            # fold the per-type counters returned by text_bugger into the
            # running totals
            for key in query_num:
                adversary_successes[key] = adversary_successes.get(
                    key, 0) + success_num.get(key, 0)
                adversary_count[key] = adversary_count.get(
                    key, 0) + query_num.get(key, 0)

    # Was used to track top adjectives and adversaries
    # np.save("adv_result.npy", main_tracker_adv)
    # np.save("adj_result.npy", main_tracker_adj)
    np.save("adversary_successes_tf.npy", adversary_successes)
    np.save("adversary_count_tf.npy", adversary_count)
    # Generalize individual flips into candidate replacement rules over
    # 2-grams, with minimum support/flip frequency of 0.5% each.
    tr2 = replace_rules.TextToReplaceRules(
        nlp, [list_to_string(x[1]) for x in data], [],
        min_freq=0.005,
        min_flip=0.005,
        ngram_size=2)

    # Finding frequent rules
    frequent_rules = []
    rule_idx = {}    # rule hash -> dense index into frequent_rules
    rule_flips = {}  # rule index -> indices of flips the rule explains
    for z, f in enumerate(flips):
        # f is the student's answer
        # flips[f] flips for given student's answer
        rules = tr2.compute_rules(f, [list_to_string(x) for x in flips[f]],
                                  use_pos=True,
                                  use_tags=False)
        for rs in rules:
            for r in rs:
                if r.hash() not in rule_idx:
                    i = len(rule_idx)
                    rule_idx[r.hash()] = i
                    rule_flips[i] = []
                    frequent_rules.append(r)
                i = rule_idx[r.hash()]
                rule_flips[i].append(z)
        if z % 1000 == 0:
            print("Done with flip nr. ", z)

    # Tokenize the student's answers
    tokenized_stud_ans = tokenizer.tokenize(
        [list_to_string(x[1]) for x in data])
    model_preds = {}  # cache: perturbed text -> model label, avoids re-predicting
    print("Number of frequent rules: ", len(frequent_rules))

    a = time.time()
    # NOTE(review): the discovery-pass rule_flips built above is deliberately
    # discarded here and rebuilt with model-verified flips.
    rule_flips = {}
    rule_other_texts = {}
    rule_other_flips = {}
    rule_applies = {}
    for i, r in enumerate(frequent_rules):
        if i % 100 == 0:
            print("Nr. of rules applied: ", i)
        # Get indices, where rule can be applied
        idxs = list(tr2.get_rule_idxs(r))
        to_apply = [tokenized_stud_ans[x] for x in idxs]
        applies, nt = r.apply_to_texts(to_apply, fix_apostrophe=False)
        # Find indices, where rule has been applied
        applies = [idxs[x] for x in applies]
        # only query the model for perturbed texts not seen before
        to_compute = [x for x in zip(applies, nt) if x[1] not in model_preds]
        if to_compute:
            # New predicts
            new_labels = []
            for compute in to_compute:
                j, new_stud = compute
                # Get reference answer for sequence classification
                orig_instance = data[j]
                logits = predict(model, orig_instance[0], new_stud, 0)
                new_label = int(np.argmax(logits))
                new_labels.append(new_label)
            for x, y in zip(to_compute, new_labels):
                model_preds[x[1]] = y

        new_labels = np.array([model_preds[x] for x in nt])
        # NOTE(review): label 2 appears to be the target (flipped) class —
        # confirm against the label encoding used in training.
        where_flipped = np.where(new_labels == 2)[0]
        # rebinding `flips` here shadows the defaultdict above, which is no
        # longer used past this point
        flips = sorted([applies[x] for x in where_flipped])
        rule_flips[i] = flips
        rule_other_texts[i] = nt
        rule_other_flips[i] = where_flipped
        rule_applies[i] = applies

    print("Time used for applying rules: ", time.time() - a)

    # keep only rules that flip more than 1% of the dataset
    threshold = int(0.01 * len(data))
    really_frequent_rules_idx = [
        i for i in range(len(rule_flips)) if len(rule_flips[i]) > threshold
    ]

    # test = [frequent_rules[i] for i in really_frequent_rules_idx if frequent_rules[i].hash().split()[1] == '->']
    # test_2 = [i.hash() for i in test if i.hash()[:4] == 'text']
    print("Amount of really frequent rules: ", len(really_frequent_rules_idx))

    print("Done!")
    high_number_rules = [
        frequent_rules[idx] for idx in really_frequent_rules_idx
    ]
    np.save("frequent_rules.npy", high_number_rules)