def _create_dataset_from_args(cls, args):
    dataset_args = args.dataset.split(ARGS_SPLIT_TOKEN)

    if args.dataset_train_split:
        train_dataset = HuggingFaceDataset(
            *dataset_args, split=args.dataset_train_split
        )
    else:
        try:
            train_dataset = HuggingFaceDataset(*dataset_args, split="train")
            args.dataset_train_split = "train"
        except KeyError:
            raise KeyError(f"Error: no `train` split found in `{args.dataset}` dataset")

    if args.dataset_eval_split:
        eval_dataset = HuggingFaceDataset(
            *dataset_args, split=args.dataset_eval_split
        )
    else:
        # Try common dev split names, in order.
        try:
            eval_dataset = HuggingFaceDataset(*dataset_args, split="dev")
            args.dataset_eval_split = "dev"
        except KeyError:
            try:
                eval_dataset = HuggingFaceDataset(*dataset_args, split="eval")
                args.dataset_eval_split = "eval"
            except KeyError:
                try:
                    eval_dataset = HuggingFaceDataset(*dataset_args, split="validation")
                    args.dataset_eval_split = "validation"
                except KeyError:
                    try:
                        eval_dataset = HuggingFaceDataset(*dataset_args, split="test")
                        args.dataset_eval_split = "test"
                    except KeyError:
                        raise KeyError(
                            f"Could not find `dev`, `eval`, `validation`, or `test` split in dataset {args.dataset}."
                        )

    if args.filter_train_by_labels:
        train_dataset.filter_by_labels_(args.filter_train_by_labels)
    if args.filter_eval_by_labels:
        eval_dataset.filter_by_labels_(args.filter_eval_by_labels)
    return train_dataset, eval_dataset
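# The eval-split fallback above can be collapsed into a single loop over
# candidate split names instead of four nested try/excepts. A minimal sketch,
# assuming (as the code above does) that `HuggingFaceDataset` raises KeyError
# for a missing split; `_find_eval_split` is a hypothetical helper name:
def _find_eval_split(dataset_args, candidates=("dev", "eval", "validation", "test")):
    for split in candidates:
        try:
            # Return the first split that exists, plus its name so the caller
            # can record it on `args.dataset_eval_split`.
            return HuggingFaceDataset(*dataset_args, split=split), split
        except KeyError:
            continue
    raise KeyError(f"Could not find any of {candidates} split in the dataset.")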
    default=False,
    action="store_true",
    help="log metrics to Weights & Biases",
)
args = parser.parse_args()

date_now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")
current_dir = os.path.dirname(os.path.realpath(__file__))
outputs_dir = os.path.join(
    current_dir, os.pardir, os.pardir, os.pardir, "outputs", "training"
)
outputs_dir = os.path.normpath(outputs_dir)
args.output_dir = os.path.join(outputs_dir, f"{args.model}-{args.dataset}-{date_now}/")

train_dataset = load_ocnliDataset()
train_hugdataset = HuggingFaceDataset(train_dataset)
val_dataset = load_ocnliDataset(split="dev")
val_hugdataset = HuggingFaceDataset(val_dataset)
train_text, train_labels, eval_text, eval_labels = dataset_for_training(
    train_hugdataset, val_hugdataset
)

config = BertConfig.from_pretrained(args.tokenizer)  # e.g. "hfl/chinese-macbert-base"
config.output_attentions = False
config.output_token_type_ids = False
# config.max_length = 30
tokenizer = BertTokenizerFast.from_pretrained(args.tokenizer, config=config, max_length=35)
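# `dataset_for_training` is called above but not defined in this file. A
# minimal sketch of what it plausibly does, assuming (as in TextAttack) that
# iterating a HuggingFaceDataset yields (input_columns_dict, label) pairs;
# joining the input columns into one string is an assumption:
def dataset_for_training(train_ds, eval_ds):
    def unpack(ds):
        texts, labels = [], []
        for inputs, label in ds:
            # Join the input columns (e.g. premise + hypothesis) into one text.
            texts.append(" ".join(str(v) for v in inputs.values()))
            labels.append(label)
        return texts, labels

    train_text, train_labels = unpack(train_ds)
    eval_text, eval_labels = unpack(eval_ds)
    return train_text, train_labels, eval_text, eval_labels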
                outputs.append([1 - score, score])
            else:
                outputs.append([score, 1 - score])
        return np.array(outputs)


# Create the model: a French sentiment analysis model.
# See https://github.com/TheophileBlard/french-sentiment-analysis-with-bert
model = TFAutoModelForSequenceClassification.from_pretrained("tblard/tf-allocine")
tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine")
# Name the instance `sentiment_pipeline` so it does not shadow `transformers.pipeline`.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

model_wrapper = HuggingFaceSentimentAnalysisPipelineWrapper(sentiment_pipeline)

# Create the recipe: PWWS uses a WordNet transformation.
recipe = PWWSRen2019.build(model_wrapper)
# WordNet defaults to English. Set the default language to French ("fra").
#
# See "Building a free French wordnet from multilingual resources",
# E. L. R. A. (ELRA) (ed.),
# Proceedings of the Sixth International Language Resources and Evaluation (LREC'08).
recipe.transformation.language = "fra"

dataset = HuggingFaceDataset("allocine", split="test")

for idx, result in enumerate(recipe.attack_dataset(dataset)):
    print(("-" * 20), f"Result {idx + 1}", ("-" * 20))
    print(result.__str__(color_method="ansi"))
    print()
#############################################################
out = {}
out['run_num'] = run_num
out['num_train_per_class'] = num_train_per_class
out['task'] = task
out['transform'] = t
out['run'] = checkpoint
out['model_name'] = MODEL_NAME

if loaded_checkpoint:
    mw = CustomModelWrapper(model, tokenizer)
    dataset = HuggingFaceDataset(test_dataset, shuffle=True)
    attack_args = textattack.AttackArgs(num_examples=num_advs, disable_stdout=True)

    for recipe in recipes:
        attack = recipe.build(mw)
        attacker = Attacker(attack, dataset, attack_args)
        attack_results = attacker.attack_dataset()

        num_results = 0
        num_failures = 0
        num_successes = 0
        for result in attack_results:
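# `CustomModelWrapper` is used above but not defined in this excerpt. A minimal
# sketch of what such a wrapper might look like, assuming a PyTorch
# sequence-classification model and its tokenizer (this is an illustrative
# assumption, not the project's actual implementation):
import torch
from textattack.models.wrappers import ModelWrapper


class CustomModelWrapper(ModelWrapper):
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, text_input_list):
        # Tokenize a batch of strings and return the model's logits;
        # TextAttack's classification goal functions consume these directly.
        inputs = self.tokenizer(
            text_input_list, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.logits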
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_examples", default=1, type=int)  # 50485
    parser.add_argument("--model", default="hfl/chinese-roberta-wwm-ext", type=str)
    parser.add_argument("--num_labels", default=3, type=int)
    parser.add_argument("--cuda", default=0, type=int)
    parser.add_argument("--tokenizer", default="hfl/chinese-roberta-wwm-ext", type=str)
    parser.add_argument(
        "--transformation",
        type=str,
        required=False,
        default="word-swap-embedding",
        help='The transformation to apply. Usage: "--transformation {transformation}:{arg_1}={value_1},{arg_3}={value_3}". Choices: ',
    )
    # add_model_args(parser)
    # add_dataset_args(parser)
    parser.add_argument(
        "--constraints",
        type=str,
        required=False,
        nargs="*",
        default=["repeat", "stopword"],
        help='Constraints to add to the attack. Usage: "--constraints {constraint}:{arg_1}={value_1},{arg_3}={value_3}". Choices: ',
    )
    parser.add_argument(
        "--log-to-txt",
        "-l",
        nargs="?",
        default=None,
        const="",
        type=str,
        help="Save attack logs to <install-dir>/outputs/~ by default; include '/' at the end of the argument to save "
        "output to the specified directory with the default naming convention; otherwise the argument specifies "
        "the file name",
    )
    parser.add_argument(
        "--log-to-csv",
        nargs="?",
        default="/home/guest/r09944010/2020MLSECURITY/final/ml-security-proj/attack/OCNLI/roberta/",
        const="",
        type=str,
        help="Save attack logs to <install-dir>/outputs/~ by default; include '/' at the end of the argument to save "
        "output to the specified directory with the default naming convention; otherwise the argument specifies "
        "the file name",
    )
    parser.add_argument(
        "--csv-style",
        default=None,
        const="fancy",
        nargs="?",
        type=str,
        help="Use --csv-style plain to remove [[]] around words",
    )
    parser.add_argument(
        "--enable-visdom", action="store_true", help="Enable logging to Visdom."
    )
    parser.add_argument(
        "--enable-wandb",
        action="store_true",
        help="Enable logging to Weights & Biases.",
    )
    parser.add_argument(
        "--disable-stdout", action="store_true", help="Disable logging to stdout"
    )
    parser.add_argument(
        "--interactive",
        action="store_true",
        default=False,
        help="Whether to run attacks interactively.",
    )
    parser.add_argument(
        "--attack-n",
        action="store_true",
        default=False,
        help="Whether to run the attack until `n` examples have been attacked (not skipped).",
    )
    parser.add_argument(
        "--parallel",
        action="store_true",
        default=False,
        help="Run attack using multiple GPUs.",
    )
    # goal_function_choices = ", ".join(GOAL_FUNCTION_CLASS_NAMES.keys())
    parser.add_argument(
        "--goal-function",
        "-g",
        default="untargeted-classification",
        # help=f"The goal function to use. choices: {goal_function_choices}",
    )

    def str_to_int(s):
        return sum(ord(c) for c in s)

    parser.add_argument("--random-seed", default=str_to_int("TEXTATTACK"), type=int)
    parser.add_argument(
        "--checkpoint-dir",
        required=False,
        type=str,
        default=None,
        help="The directory to save checkpoint files.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        required=False,
        type=int,
        help="If set, a checkpoint will be saved after attacking every N examples. "
        "If not set, no checkpoints will be saved.",
    )
    parser.add_argument(
        "--query-budget",
        "-q",
        type=int,
        default=float("inf"),
        help="The maximum number of model queries allowed per example attacked.",
    )
    parser.add_argument(
        "--model-batch-size",
        type=int,
        default=28,
        help="The batch size for making calls to the model.",
    )
    parser.add_argument(
        "--model-cache-size",
        type=int,
        default=2**18,
        help="The maximum number of items to keep in the model results cache at once.",
    )
    parser.add_argument(
        "--constraint-cache-size",
        type=int,
        default=2**18,
        help="The maximum number of items to keep in the constraints cache at once.",
    )
    attack_group = parser.add_mutually_exclusive_group(required=False)
    attack_group.add_argument(
        "--search",
        "--search-method",
        "-s",
        type=str,
        required=False,
        default="greedy-word-wir",
        # help=f"The search method to use. choices: {search_choices}",
    )
    attack_group.add_argument(
        "--recipe",
        "--attack-recipe",
        "-r",
        type=str,
        required=False,
        default=None,
        # help="full attack recipe (overrides provided goal function, transformation & constraints)",
        # choices=ATTACK_RECIPE_NAMES.keys(),
    )
    attack_group.add_argument(
        "--attack-from-file",
        type=str,
        required=False,
        default=None,
        help="attack to load from file (overrides provided goal function, transformation & constraints)",
    )
    args = parser.parse_args()

    # dataset = load_dataset()
    dataset = load_ocnliDataset(split="dev")
    dataset = HuggingFaceDataset(dataset)

    num_remaining_attacks = args.num_examples
    worklist = deque(range(0, args.num_examples))
    worklist_tail = worklist[-1]

    # Multiprocessing setup.
    pytorch_multiprocessing_workaround()
    args = torch.multiprocessing.Manager().Namespace(**vars(args))

    # We reserve the first GPU for coordinating workers.
    num_gpus = torch.cuda.device_count()
    textattack.shared.logger.info(f"Running on {num_gpus} GPUs")
    start_time = time.time()

    in_queue = torch.multiprocessing.Queue()
    out_queue = torch.multiprocessing.Queue()
    missing_datapoints = set()
    for i in worklist:
        try:
            text, output = dataset[i]
            in_queue.put((i, text, output))
        except IndexError:
            missing_datapoints.add(i)

    # If our dataset is shorter than the number of samples chosen, remove the
    # out-of-bounds indices from the worklist.
    for i in missing_datapoints:
        worklist.remove(i)

    # Start workers.
    torch.multiprocessing.Pool(5, attack_from_queue, (args, in_queue, out_queue))

    # attack = Attack(goal_function, constraints, transformation, search_method)
    # print(attack)
    attack_log_manager = parse_logger_from_args(args)

    pbar = tqdm.tqdm(total=num_remaining_attacks, smoothing=0)
    num_results = 0
    num_failures = 0
    num_successes = 0
    while worklist:
        result = out_queue.get(block=True)
        if isinstance(result, Exception):
            raise result
        idx, result = result
        attack_log_manager.log_result(result)
        worklist.remove(idx)
        if (not args.attack_n) or (
            not isinstance(result, textattack.attack_results.SkippedAttackResult)
        ):
            pbar.update()
            num_results += 1
            if (
                type(result) == textattack.attack_results.SuccessfulAttackResult
                or type(result) == textattack.attack_results.MaximizedAttackResult
            ):
                num_successes += 1
            if type(result) == textattack.attack_results.FailedAttackResult:
                num_failures += 1
            pbar.set_description(
                "[Succeeded / Failed / Total] {} / {} / {}".format(
                    num_successes, num_failures, num_results
                )
            )
        else:
            # worklist_tail keeps track of the highest idx that has been part
            # of the worklist. Used to get the next dataset element when
            # attacking with `attack_n` = True.
            worklist_tail += 1
            try:
                text, output = dataset[worklist_tail]
                worklist.append(worklist_tail)
                in_queue.put((worklist_tail, text, output))
            except IndexError:
                raise IndexError(
                    "Tried adding to worklist, but ran out of datapoints. "
                    "Size of data is {} but tried to access index {}".format(
                        len(dataset), worklist_tail
                    )
                )

        if (
            args.checkpoint_interval
            and len(attack_log_manager.results) % args.checkpoint_interval == 0
        ):
            new_checkpoint = textattack.shared.Checkpoint(
                args, attack_log_manager, worklist, worklist_tail
            )
            new_checkpoint.save()
            attack_log_manager.flush()

    pbar.close()
    print()

    # Enable summary stdout.
    if args.disable_stdout:
        attack_log_manager.enable_stdout()
    attack_log_manager.log_summary()
    attack_log_manager.flush()
    print()

    finish_time = time.time()
    textattack.shared.logger.info(f"Attack time: {finish_time - start_time}s")
    return attack_log_manager.results
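# `attack_from_queue` (the Pool worker above) is not defined in this file. A
# minimal sketch of the queue protocol the coordinator loop expects: workers
# consume (idx, text, output) tuples from `in_queue` and emit (idx, result)
# tuples, or an Exception, on `out_queue`. `parse_attack_from_args` is an
# assumed helper that rebuilds the attack inside each worker process:
def attack_from_queue(args, in_queue, out_queue):
    textattack.shared.utils.set_seed(args.random_seed)
    attack = parse_attack_from_args(args)  # assumed helper, not defined here
    while not in_queue.empty():
        try:
            i, text, output = in_queue.get()
            # Old-style TextAttack API: attack a one-element dataset and take
            # the single result it yields.
            result = next(attack.attack_dataset([(text, output)]))
            out_queue.put((i, result))
        except Exception as e:
            # Ship the exception to the coordinator, which re-raises it.
            out_queue.put(e)
            break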
def main():
    parser = argparse.ArgumentParser(
        "TextAttack CLI",
        usage="[python -m] textattack <command> [<args>]",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        default="hfl/chinese-roberta-wwm-ext",
    )
    parser.add_argument(
        "--num_examples",
        type=int,
        default=3000,
    )
    parser.add_argument(
        "--model",
        type=str,
        required=False,
        default="hfl/chinese-roberta-wwm-ext",
    )
    parser.add_argument("--random-seed", default=21, type=int)
    # parser = main_parser.add_parser(
    #     "attack",
    #     help="run an attack on an NLP model",
    #     formatter_class=ArgumentDefaultsHelpFormatter,
    # )
    transformation_names = set(BLACK_BOX_TRANSFORMATION_CLASS_NAMES.keys()) | set(
        WHITE_BOX_TRANSFORMATION_CLASS_NAMES.keys()
    )
    parser.add_argument(
        "--transformation",
        type=str,
        required=False,
        default="word-swap-embedding",
        help='The transformation to apply. Usage: "--transformation {transformation}:{arg_1}={value_1},{arg_3}={value_3}". Choices: '
        + str(transformation_names),
    )
    # add_model_args(parser)
    # add_dataset_args(parser)
    parser.add_argument(
        "--constraints",
        type=str,
        required=False,
        nargs="*",
        default=["repeat", "stopword"],
        help='Constraints to add to the attack. Usage: "--constraints {constraint}:{arg_1}={value_1},{arg_3}={value_3}". Choices: '
        + str(CONSTRAINT_CLASS_NAMES.keys()),
    )
    parser.add_argument(
        "--log-to-txt",
        "-l",
        nargs="?",
        default=None,
        const="",
        type=str,
        help="Save attack logs to <install-dir>/outputs/~ by default; include '/' at the end of the argument to save "
        "output to the specified directory with the default naming convention; otherwise the argument specifies "
        "the file name",
    )
    parser.add_argument(
        "--log-to-csv",
        nargs="?",
        default="",
        const="",
        type=str,
        help="Save attack logs to <install-dir>/outputs/~ by default; include '/' at the end of the argument to save "
        "output to the specified directory with the default naming convention; otherwise the argument specifies "
        "the file name",
    )
    parser.add_argument(
        "--csv-style",
        default=None,
        const="fancy",
        nargs="?",
        type=str,
        help="Use --csv-style plain to remove [[]] around words",
    )
    parser.add_argument(
        "--enable-visdom", action="store_true", help="Enable logging to Visdom."
    )
    parser.add_argument(
        "--enable-wandb",
        action="store_true",
        help="Enable logging to Weights & Biases.",
    )
    parser.add_argument(
        "--disable-stdout", action="store_true", help="Disable logging to stdout"
    )
    parser.add_argument(
        "--interactive",
        action="store_true",
        default=False,
        help="Whether to run attacks interactively.",
    )
    parser.add_argument(
        "--attack-n",
        action="store_true",
        default=False,
        help="Whether to run the attack until `n` examples have been attacked (not skipped).",
    )
    parser.add_argument(
        "--parallel",
        action="store_true",
        default=False,
        help="Run attack using multiple GPUs.",
    )
    goal_function_choices = ", ".join(GOAL_FUNCTION_CLASS_NAMES.keys())
    parser.add_argument(
        "--goal-function",
        "-g",
        default="untargeted-classification",
        help=f"The goal function to use. choices: {goal_function_choices}",
    )

    def str_to_int(s):
        return sum(ord(c) for c in s)

    parser.add_argument(
        "--checkpoint-dir",
        required=False,
        type=str,
        default=default_checkpoint_dir(),
        help="The directory to save checkpoint files.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        required=False,
        type=int,
        help="If set, a checkpoint will be saved after attacking every N examples. "
        "If not set, no checkpoints will be saved.",
    )
    parser.add_argument(
        "--query-budget",
        "-q",
        type=int,
        default=float("inf"),
        help="The maximum number of model queries allowed per example attacked.",
    )
    parser.add_argument(
        "--model-batch-size",
        type=int,
        default=32,
        help="The batch size for making calls to the model.",
    )
    parser.add_argument(
        "--model-cache-size",
        type=int,
        default=2**18,
        help="The maximum number of items to keep in the model results cache at once.",
    )
    parser.add_argument(
        "--constraint-cache-size",
        type=int,
        default=2**18,
        help="The maximum number of items to keep in the constraints cache at once.",
    )
    attack_group = parser.add_mutually_exclusive_group(required=False)
    search_choices = ", ".join(SEARCH_METHOD_CLASS_NAMES.keys())
    attack_group.add_argument(
        "--search",
        "--search-method",
        "-s",
        type=str,
        required=False,
        default="greedy-word-wir",
        help=f"The search method to use. choices: {search_choices}",
    )
    attack_group.add_argument(
        "--recipe",
        "--attack-recipe",
        "-r",
        type=str,
        required=False,
        default="alzantot",
        help="full attack recipe (overrides provided goal function, transformation & constraints)",
        choices=ATTACK_RECIPE_NAMES.keys(),
    )
    attack_group.add_argument(
        "--attack-from-file",
        type=str,
        required=False,
        default=None,
        help="attack to load from file (overrides provided goal function, transformation & constraints)",
    )
    # subparsers = parser.add_subparsers(help="textattack command helpers")

    val_dataset = load_ocnliDataset(split="dev")
    val_hugdataset = HuggingFaceDataset(val_dataset)

    # AttackCommand.register_subcommand(parser)
    attackCommand = AttackCommand()
    args = parser.parse_args()
    attackCommand.run(args, val_hugdataset)
    print("ok")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_examples", default=3000, type=int)  # 50485
    parser.add_argument("--model", default="hfl/chinese-roberta-wwm-ext", type=str)
    parser.add_argument("--num_labels", default=3, type=int)
    parser.add_argument("--cuda", default=0, type=int)
    parser.add_argument("--tokenizer", default="hfl/chinese-roberta-wwm-ext", type=str)
    parser.add_argument(
        "--transformation",
        type=str,
        required=False,
        default="word-swap-embedding",
        help='The transformation to apply. Usage: "--transformation {transformation}:{arg_1}={value_1},{arg_3}={value_3}". Choices: ',
    )
    # add_model_args(parser)
    # add_dataset_args(parser)
    parser.add_argument(
        "--constraints",
        type=str,
        required=False,
        nargs="*",
        default=["repeat", "stopword"],
        help='Constraints to add to the attack. Usage: "--constraints {constraint}:{arg_1}={value_1},{arg_3}={value_3}". Choices: ',
    )
    parser.add_argument(
        "--log-to-txt",
        "-l",
        nargs="?",
        default=None,
        const="",
        type=str,
        help="Save attack logs to <install-dir>/outputs/~ by default; include '/' at the end of the argument to save "
        "output to the specified directory with the default naming convention; otherwise the argument specifies "
        "the file name",
    )
    parser.add_argument(
        "--log-to-csv",
        nargs="?",
        default="/home/guest/r09944010/2020MLSECURITY/final/ml-security-proj/attack/OCNLI/roberta/",
        const="",
        type=str,
        help="Save attack logs to <install-dir>/outputs/~ by default; include '/' at the end of the argument to save "
        "output to the specified directory with the default naming convention; otherwise the argument specifies "
        "the file name",
    )
    parser.add_argument(
        "--csv-style",
        default=None,
        const="fancy",
        nargs="?",
        type=str,
        help="Use --csv-style plain to remove [[]] around words",
    )
    parser.add_argument(
        "--enable-visdom", action="store_true", help="Enable logging to Visdom."
    )
    parser.add_argument(
        "--enable-wandb",
        action="store_true",
        help="Enable logging to Weights & Biases.",
    )
    parser.add_argument(
        "--disable-stdout", action="store_true", help="Disable logging to stdout"
    )
    parser.add_argument(
        "--interactive",
        action="store_true",
        default=False,
        help="Whether to run attacks interactively.",
    )
    parser.add_argument(
        "--attack-n",
        action="store_true",
        default=False,
        help="Whether to run the attack until `n` examples have been attacked (not skipped).",
    )
    parser.add_argument(
        "--parallel",
        action="store_true",
        default=False,
        help="Run attack using multiple GPUs.",
    )
    # goal_function_choices = ", ".join(GOAL_FUNCTION_CLASS_NAMES.keys())
    parser.add_argument(
        "--goal-function",
        "-g",
        default="untargeted-classification",
        # help=f"The goal function to use. choices: {goal_function_choices}",
    )

    def str_to_int(s):
        return sum(ord(c) for c in s)

    parser.add_argument("--random-seed", default=str_to_int("TEXTATTACK"), type=int)
    parser.add_argument(
        "--checkpoint-dir",
        required=False,
        type=str,
        default=None,
        help="The directory to save checkpoint files.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        required=False,
        type=int,
        help="If set, a checkpoint will be saved after attacking every N examples. "
        "If not set, no checkpoints will be saved.",
    )
    parser.add_argument(
        "--query-budget",
        "-q",
        type=int,
        default=float("inf"),
        help="The maximum number of model queries allowed per example attacked.",
    )
    parser.add_argument(
        "--model-batch-size",
        type=int,
        default=26,
        help="The batch size for making calls to the model.",
    )
    parser.add_argument(
        "--model-cache-size",
        type=int,
        default=2**18,
        help="The maximum number of items to keep in the model results cache at once.",
    )
    parser.add_argument(
        "--constraint-cache-size",
        type=int,
        default=2**18,
        help="The maximum number of items to keep in the constraints cache at once.",
    )
    attack_group = parser.add_mutually_exclusive_group(required=False)
    attack_group.add_argument(
        "--search",
        "--search-method",
        "-s",
        type=str,
        required=False,
        default="greedy-word-wir",
        # help=f"The search method to use. choices: {search_choices}",
    )
    attack_group.add_argument(
        "--recipe",
        "--attack-recipe",
        "-r",
        type=str,
        required=False,
        default=None,
        # help="full attack recipe (overrides provided goal function, transformation & constraints)",
        # choices=ATTACK_RECIPE_NAMES.keys(),
    )
    attack_group.add_argument(
        "--attack-from-file",
        type=str,
        required=False,
        default=None,
        help="attack to load from file (overrides provided goal function, transformation & constraints)",
    )
    args = parser.parse_args()

    # dataset = load_dataset()
    dataset = load_ocnliDataset(split="dev")
    dataset = HuggingFaceDataset(dataset)

    num_remaining_attacks = args.num_examples
    worklist = deque(range(0, args.num_examples))
    worklist_tail = worklist[-1]

    config = BertConfig.from_pretrained("hfl/chinese-macbert-base")
    config.output_attentions = False
    config.output_token_type_ids = False
    # config.max_length = 30
    tokenizer = BertTokenizer.from_pretrained("hfl/chinese-macbert-base", config=config)

    config = AutoConfig.from_pretrained(
        './models/roberta/chinese-roberta-wwm-ext-OCNLI-2021-01-05-23-46-02-975289',
        num_labels=3,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        './models/roberta/chinese-roberta-wwm-ext-OCNLI-2021-01-05-23-46-02-975289',
        config=config,
    )
    model_wrapper = HuggingFaceModelWrapper(model, tokenizer, batch_size=28)

    # Goal function.
    goal_function = UntargetedClassification(model_wrapper)

    # Constraints.
    # stopwords = set(
    #     ["个", "关于", "之上", "across", "之后", "afterwards", "再次", "against", "ain",
    #      "全部", "几乎", "单独", "along", "早已", "也", "虽然", "是", "among", "amongst",
    #      "一个", "和", "其他", "任何", "anyhow", "任何人", "anything", "anyway", "anywhere",
    #      "are", "aren", "没有", "around", "as", "at", "后", "been", "之前", "beforehand",
    #      "behind", "being", "below", "beside", "besides", "之間", "beyond", "皆是", "但",
    #      "by", "可以", "不可以", "是", "不是", "couldn't", "d", "didn", "didn't", "doesn",
    #      "doesn't", "don", "don't", "down", "due", "之間", "either", "之外", "elsewhere",
    #      "空", "足夠", "甚至", "ever", "任何人", "everything", "everywhere", "except",
    #      "first", "for", "former", "formerly", "from", "hadn", "hadn't", "hasn",
    #      "hasn't", "haven", "haven't", "he", "hence", "her", "here", "hereafter",
    #      "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    #      "how", "however", "hundred", "i", "if", "in", "indeed", "into", "is", "isn",
    #      "isn't", "it", "it's", "its", "itself", "just", "latter", "latterly", "least",
    #      "ll", "may", "me", "meanwhile", "mightn", "mightn't", "mine", "more",
    #      "moreover", "most", "mostly", "must", "mustn", "mustn't", "my", "myself",
    #      "namely", "needn", "needn't", "neither", "never", "nevertheless", "next",
    #      "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "o",
"of", "off", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "per", "please", "s", "same", "shan", "shan't", "she", "she's", "should've", "shouldn", "shouldn't", "somehow", "something", "sometime", "somewhere", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "this", "those", "through", "throughout", "thru", "thus", "to", "too", "toward", "towards", "under", "unless", "until", "up", "upon", "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "with", "within", "without", "won", "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"] # ) constraints = [RepeatModification(), StopwordModification()] # constraints = [RepeatModification(), StopwordModification(stopwords=stopwords)] input_column_modification = InputColumnModification( ["premise", "hypothesis"], {"premise"}) constraints.append(input_column_modification) constraints.append(MaxWordsPerturbed(max_percent=0.2)) constraints.append( WordEmbeddingDistance(max_mse_dist=0.5, compare_against_original=False)) # constraints.append( # Google1BillionWordsLanguageModel( # top_n_per_index=4, compare_against_original=False # ) # ) # use_constraint = UniversalSentenceEncoder( # threshold=0.840845057, # metric="angular", # compare_against_original=False, # window_size=15, # skip_text_shorter_than_window=True, # ) # constraints.append(use_constraint) transformation = WordSwapEmbedding(max_candidates=8) # transformation = WordDeletion() # search methods # search_method = GreedyWordSwapWIR(wir_method="delete") search_method = AlzantotGeneticAlgorithm(pop_size=60, max_iters=20, post_crossover_check=False) start_time = time.time() textattack.shared.utils.set_seed(args.random_seed) # attack attack = Attack(goal_function, constraints, transformation, search_method) print(attack) attack_log_manager = parse_logger_from_args(args) pbar = tqdm.tqdm(total=num_remaining_attacks, smoothing=0) num_results = 0 num_failures = 0 num_successes = 0 for result in attack.attack_dataset(dataset, indices=worklist): attack_log_manager.log_result(result) if not args.disable_stdout: print("\n") if (not args.attack_n) or (not isinstance( result, textattack.attack_results.SkippedAttackResult)): pbar.update(1) else: # worklist_tail keeps track of highest idx that has been part of worklist # Used to get the next dataset element when attacking with `attack_n` = True. 
            worklist_tail += 1
            worklist.append(worklist_tail)

        num_results += 1
        if (
            type(result) == textattack.attack_results.SuccessfulAttackResult
            or type(result) == textattack.attack_results.MaximizedAttackResult
        ):
            num_successes += 1
        if type(result) == textattack.attack_results.FailedAttackResult:
            num_failures += 1
        pbar.set_description(
            "[Succeeded / Failed / Total] {} / {} / {}".format(
                num_successes, num_failures, num_results
            )
        )

        if (
            args.checkpoint_interval
            and len(attack_log_manager.results) % args.checkpoint_interval == 0
        ):
            new_checkpoint = textattack.shared.Checkpoint(
                args, attack_log_manager, worklist, worklist_tail
            )
            new_checkpoint.save()
            attack_log_manager.flush()

    pbar.close()
    print()

    # Enable summary stdout.
    if args.disable_stdout:
        attack_log_manager.enable_stdout()
    attack_log_manager.log_summary()
    attack_log_manager.flush()
    print()

    finish_time = time.time()
    textattack.shared.logger.info(f"Attack time: {finish_time - start_time}s")
    return attack_log_manager.results
y_train = np.array(y_train[:index])
y_test = np.array(y_test[index:])
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

vocabulary = tf.keras.datasets.imdb.get_word_index(path="imdb_word_index.json")

results = model.fit(
    x_train, y_train, epochs=1, batch_size=512, validation_data=(x_test, y_test)
)

if __name__ == "__main__":
    torch.multiprocessing.freeze_support()

    model_wrapper = CustomKerasModelWrapper(model)
    dataset = HuggingFaceDataset("rotten_tomatoes", None, "test", shuffle=True)

    attack = PWWSRen2019.build(model_wrapper)
    attack_args = AttackArgs(
        num_examples=10,
        checkpoint_dir="checkpoints",
        parallel=True,
        num_workers_per_device=2,
    )
    attacker = Attacker(attack, dataset, attack_args)
    attacker.attack_dataset()
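# `CustomKerasModelWrapper` is used above but not defined in this excerpt. A
# minimal sketch, assuming the Keras model takes a bag-of-words vector over the
# IMDB `vocabulary` built above and returns per-class probabilities; NUM_WORDS
# (the vocabulary cutoff used at training time) is an assumption:
from textattack.models.wrappers import ModelWrapper

NUM_WORDS = 1000  # assumed vocabulary cutoff


class CustomKerasModelWrapper(ModelWrapper):
    def __init__(self, model):
        self.model = model

    def __call__(self, text_input_list):
        # Encode each input string as a bag-of-words count vector.
        x = np.zeros((len(text_input_list), NUM_WORDS))
        for i, text in enumerate(text_input_list):
            for word in text.lower().split():
                index = vocabulary.get(word)
                if index is not None and index < NUM_WORDS:
                    x[i, index] += 1
        # Keras returns an (n_examples, n_classes) array of probabilities.
        return self.model.predict(x)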