async def anonymize(piis: List[Pii], config: AnonymizerConfig):
    """Anonymize the given PIIs with an anonymizer built from ``config``.

    Args:
        piis: The PIIs to anonymize.
        config: Configuration handed to the ``Anonymizer`` constructor.

    Returns:
        An ``AnonymizedPiisResponse`` whose ``anonymized_piis`` holds the text
        and id of every anonymized PII; it has as many entries as ``piis``.

    Raises:
        HTTPException: With status 400 when a ``ParserError`` is raised while
            anonymizing, or when at least one result was not flagged as
            ``modified`` (treated as an invalid config).
    """
    anonymizer = Anonymizer(config)
    try:
        # The comprehension stays inside the ``try`` so a ParserError raised
        # while the results are being iterated is handled as well.
        anonymized_piis = [
            AnonymizedPii(text=result.text, id=result.id)
            for result in anonymizer.anonymize(piis)
            if result.modified
        ]
    except ParserError as err:
        # Chain the original error so its traceback is kept as the cause.
        raise HTTPException(
            status_code=400, detail="Error parsing a pii") from err

    if len(anonymized_piis) != len(piis):
        # one or more piis were not flagged as `modified`
        # NOTE(review): this writes the raw PII values to the error log -
        # confirm that is acceptable for this service.
        logger.error(
            "Invalid config (anonymized_piis=%s; piis=%s)",
            anonymized_piis, piis)
        raise HTTPException(status_code=400, detail="Invalid Config")

    return AnonymizedPiisResponse(anonymized_piis=anonymized_piis)
Example #2
0
def redact(input_dir, output_dir, anonymizer_config, recognizer_config):
    """Redact the documents in a directory.

    This script tries to redact all documents in the given directory and its subdirectories.

    Note: The redaction is done in an unsupervised manner. You have to ensure that the chosen recognizers and
    configuration provide results of a sufficient quality on the given data. Do not use for anything critical."""
    # NOTE(review): the docstring above reads like user-facing help text, so
    # the parameter notes are kept in comments - confirm.
    #
    # input_dir:         directory that is walked recursively for documents.
    # output_dir:        directory the redacted copies are written to, using
    #                    the same relative paths as below input_dir.
    # anonymizer_config: path of a JSON file whose top-level keys are passed
    #                    as keyword arguments to AnonymizerConfig.
    # recognizer_config: path of a JSON file whose top-level keys are passed
    #                    as keyword arguments to nerwhal.Config.
    #
    # Raises UsageError when any of the four arguments is None.

    if input_dir is None or output_dir is None:
        raise UsageError("Please provide an input_dir and output_dir.")
    if anonymizer_config is None or recognizer_config is None:
        # Path(None) below would otherwise fail with a bare TypeError.
        raise UsageError(
            "Please provide an anonymizer_config and recognizer_config.")

    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    anonymizer = Anonymizer(
        AnonymizerConfig(**_load_json_config(Path(anonymizer_config))))
    recognizer_config = nerwhal.Config(
        **_load_json_config(Path(recognizer_config)))

    click.echo(f'Start redacting files in "{input_dir}" ...')

    # The complete file list is materialized before any output is written.
    items_to_redact = [
        (root, file)
        for root, _dirs, files in os.walk(input_dir)
        for file in files
    ]

    with progressbar(items_to_redact) as items:
        for root, file in items:
            relative_path = Path(os.path.relpath(root,
                                                 start=input_dir)) / Path(file)
            in_path = input_dir / relative_path

            try:
                wrapper = FileWrapper(in_path)
            except UnsupportedFormat:
                click.echo(
                    f"Warning: Unsupported format for file {relative_path}! This file was skipped!"
                )
                continue
            except Exception:
                # Deliberately best-effort: one unreadable file must not stop
                # the whole run, so report it on stderr and move on.
                click.echo(
                    f"Error while processing file {relative_path}! This file was skipped!",
                    err=True)
                continue

            _redact_document(wrapper, anonymizer, recognizer_config)

            # Mirror the input layout below output_dir.
            out_path = output_dir / relative_path
            out_path.parent.mkdir(parents=True, exist_ok=True)
            wrapper.save(out_path)

    click.echo(f'The redacted files have been written to "{output_dir}".')


def _load_json_config(path):
    """Return the parsed content of the JSON file at ``path``.

    Args:
        path: Location of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _redact_document(wrapper, anonymizer, recognizer_config):
    """Recognize the PIIs in ``wrapper.text`` and apply their replacements.

    Args:
        wrapper: The wrapped document; its ``text`` is analysed and it
            receives the alters via ``add_alter`` / ``apply_alters``.
        anonymizer: Object whose ``anonymize`` method produces the
            replacement for each recognized PII.
        recognizer_config: Configuration passed to ``nerwhal.recognize``.
    """
    result = nerwhal.recognize(
        wrapper.text,
        config=recognizer_config,
        combination_strategy="smart-fusion",
        context_words=True,
        return_tokens=False,
    )

    # Each entity gets its index (as a string) as id, so an anonymizer result
    # can be mapped back to the entity and its character offsets.
    id_to_piis = {
        str(idx): pii
        for idx, pii in enumerate(result["ents"])
    }
    piis_for_anonymizer = [
        Pii(tag=pii.tag, text=pii.text, id=idx)
        for idx, pii in id_to_piis.items()
    ]

    # Results not flagged as `modified` get no alter.
    anonymized_piis = [
        anonymized_pii
        for anonymized_pii in anonymizer.anonymize(piis_for_anonymizer)
        if anonymized_pii.modified
    ]

    for anonymized_pii in anonymized_piis:
        unanonymized_pii = id_to_piis[anonymized_pii.id]
        wrapper.add_alter(unanonymized_pii.start_char,
                          unanonymized_pii.end_char,
                          anonymized_pii.text)
    wrapper.apply_alters()