async def anonymize(piis: List[Pii], config: AnonymizerConfig):
    """Anonymize the given piis with an anonymizer built from ``config``.

    Every pii has to be flagged as ``modified`` by the anonymizer. If at
    least one is not, the request is rejected as an invalid config.

    Args:
        piis: The piis to anonymize.
        config: The configuration used to construct the ``Anonymizer``.

    Returns:
        An ``AnonymizedPiisResponse`` holding the text and id of every
        anonymized pii.

    Raises:
        HTTPException: With status code 400 if the anonymizer raises a
            ``ParserError`` or if one or more piis were not modified.
    """
    anonymizer = Anonymizer(config)

    try:
        anonymized_piis = [
            AnonymizedPii(text=pii.text, id=pii.id)
            for pii in anonymizer.anonymize(piis)
            if pii.modified
        ]
    except ParserError as err:
        # Chain explicitly so the parser failure stays visible as the cause.
        raise HTTPException(status_code=400, detail="Error parsing a pii") from err

    if len(anonymized_piis) != len(piis):
        # One or more piis were not flagged as `modified`.
        # NOTE(review): this writes the raw pii values to the log -- confirm
        # that this is acceptable for the deployment.
        logger.error(
            "Invalid config (anonymized_piis=%s; piis=%s)", anonymized_piis, piis
        )
        raise HTTPException(status_code=400, detail="Invalid Config")

    return AnonymizedPiisResponse(anonymized_piis=anonymized_piis)
def redact(input_dir, output_dir, anonymizer_config, recognizer_config):
    """Redact the documents in a directory.

    This script tries to redact all documents in the given directory and its subdirectories.

    Note: The redaction is done in an unsupervised manner. You have to ensure, that the chosen recognizers and
    configuration provide results of a sufficient quality on the given data. Do not use for anything critical.
    """
    # Parameters:
    #   input_dir:         directory that is walked recursively for files to redact
    #   output_dir:        directory the redacted files are written to, using the
    #                      same relative paths as below input_dir
    #   anonymizer_config: path to a JSON file whose content is passed as keyword
    #                      arguments to AnonymizerConfig
    #   recognizer_config: path to a JSON file whose content is passed as keyword
    #                      arguments to nerwhal.Config
    # Raises UsageError if input_dir or output_dir is None.
    if input_dir is None or output_dir is None:
        raise UsageError("Please provide an input_dir and output_dir.")

    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    anonymizer_config = Path(anonymizer_config)
    recognizer_config = Path(recognizer_config)

    # Read the JSON configs as UTF-8 instead of relying on the platform's
    # default encoding.
    with open(anonymizer_config, "r", encoding="utf-8") as f:
        config = AnonymizerConfig(**json.load(f))
    anonymizer = Anonymizer(config)

    with open(recognizer_config, "r", encoding="utf-8") as f:
        recognizer_config = nerwhal.Config(**json.load(f))

    click.echo(f'Start redacting files in "{input_dir}" ...')

    # Collect every (directory, file name) pair first; the complete list is
    # then handed to the progress bar.
    items_to_redact = [
        (root, file)
        for root, _dirs, files in os.walk(input_dir)
        for file in files
    ]

    with progressbar(items_to_redact) as items:
        for root, file in items:
            relative_path = Path(os.path.relpath(root, start=input_dir)) / Path(file)
            in_path = input_dir / relative_path

            # Best effort: a file that cannot be loaded is reported and
            # skipped, it does not abort the whole run.
            try:
                wrapper = FileWrapper(in_path)
            except UnsupportedFormat:
                click.echo(
                    f"Warning: Unsupported format for file {relative_path}! This file was skipped!"
                )
                continue
            except Exception:
                click.echo(
                    f"Error while processing file {relative_path}! This file was skipped!",
                    err=True,
                )
                continue

            result = nerwhal.recognize(
                wrapper.text,
                config=recognizer_config,
                combination_strategy="smart-fusion",
                context_words=True,
                return_tokens=False,
            )

            # The string ids link each anonymized pii back to the recognized
            # entity, which carries the character offsets needed below.
            id_to_piis = {str(idx): pii for idx, pii in enumerate(result["ents"])}
            piis_for_anonymizer = [
                Pii(tag=pii.tag, text=pii.text, id=idx)
                for idx, pii in id_to_piis.items()
            ]

            # Only piis the anonymizer actually modified lead to an alteration.
            anonymized_piis = [
                anonymized_pii
                for anonymized_pii in anonymizer.anonymize(piis_for_anonymizer)
                if anonymized_pii.modified
            ]
            for anonymized_pii in anonymized_piis:
                unanonymized_pii = id_to_piis[anonymized_pii.id]
                wrapper.add_alter(
                    unanonymized_pii.start_char,
                    unanonymized_pii.end_char,
                    anonymized_pii.text,
                )
            wrapper.apply_alters()

            # Mirror the input directory layout below output_dir.
            out_path = output_dir / relative_path
            out_path.parent.mkdir(parents=True, exist_ok=True)
            wrapper.save(out_path)

    click.echo(f'The redacted files have been written to "{output_dir}".')