async def anonymize(piis: List[Pii], config: AnonymizerConfig):
    """Anonymize the given piis and wrap the result in a response object.

    Args:
        piis: The piis to anonymize.
        config: Configuration used to construct the `Anonymizer`.

    Returns:
        An `AnonymizedPiisResponse` holding one `AnonymizedPii` (text and id)
        for every anonymized pii.

    Raises:
        HTTPException: With status 400 if a `ParserError` is raised while
            anonymizing, or if at least one pii was not flagged as
            `modified` (reported as an invalid config).
    """
    anonymizer = Anonymizer(config)
    try:
        anonymized_piis = [
            AnonymizedPii(text=pii.text, id=pii.id)
            for pii in anonymizer.anonymize(piis)
            if pii.modified
        ]
    except ParserError as err:
        # Chain the cause so the parser failure stays visible in tracebacks.
        raise HTTPException(status_code=400,
                            detail="Error parsing a pii") from err

    if len(anonymized_piis) != len(piis):
        # one or more piis were not flagged as `modified`
        # NOTE(review): this writes the raw piis to the log -- confirm that is
        # acceptable for this service.
        logger.error("Invalid config (anonymized_piis=%s; piis=%s)",
                     anonymized_piis, piis)
        raise HTTPException(status_code=400, detail="Invalid Config")

    return AnonymizedPiisResponse(anonymized_piis=anonymized_piis)
Beispiel #2
0
def generate_data():
    """Generate a CSV file of fake personal data and render the index page.

    Reads the requested row count from the `num_rows` form field, fills a
    table with fake names, cities, countries, street addresses and URLs
    obtained from `Anonymizer`, and writes it `|`-separated to
    `static/generated_data.csv`.

    Returns:
        The rendered `index.html`. On success the template receives the path
        of the generated file; on any failure it is rendered without it.
    """
    # Local import: the file's import block is not visible from this block.
    import logging

    output_path = 'static/generated_data.csv'
    try:
        table = pd.DataFrame()
        # A missing `num_rows` field makes int(None) raise; handled below.
        rows = int(request.form.get('num_rows'))
        fake_source = Anonymizer()
        fake_names, fake_first_names, fake_second_names = (
            fake_source.fake_name_generator(rows))
        table['name'] = fake_names
        table['first_name'] = fake_first_names
        table['last_name'] = fake_second_names
        table['city'] = fake_source.get_fake_cities(rows)
        table['country'] = fake_source.get_fake_countries(rows)
        table['street_address'] = fake_source.get_fake_addresses(rows)
        table['url'] = fake_source.get_fake_uris(rows)
        table.to_csv(output_path, sep='|', index=False, header=True)
        flash('File successfully generated')
        return render_template('index.html', path_generated_data=output_path)
    except Exception:
        # Best-effort endpoint: still fall back to the plain page, but leave a
        # trace of the failure and let KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).exception('Could not generate data')
        return render_template('index.html')
Beispiel #3
0
def anonymize():
    """Overwrite the selected columns of the loaded CSV with fake data.

    Reads the `|`-separated file at the module-level `filepath`, looks up the
    column chosen for each data category in the submitted form, fills every
    chosen column with as many fake values as the file has rows, and writes
    the result to `static/export_data.csv`.

    Returns:
        The rendered `index.html`. On success the template receives the path
        of the exported file; on any failure it is rendered without it.
    """
    # Local import: the file's import block is not visible from this block.
    import logging

    export_path = 'static/export_data.csv'
    try:
        data = pd.read_csv(filepath, sep='|')
        rows = len(data)
        fake_source = Anonymizer()

        # (form field, generator) pairs in the order the columns are replaced.
        # fake_name_generator returns (names, first names, last names); only
        # the full names are used here.
        replacements = (
            ('options_name',
             lambda count: fake_source.fake_name_generator(count)[0]),
            ('options_city', fake_source.get_fake_cities),
            ('options_country', fake_source.get_fake_countries),
            ('options_address', fake_source.get_fake_addresses),
            ('options_uri', fake_source.get_fake_uris),
        )
        for field, make_fake in replacements:
            column = request.form.get(field)
            # The string 'None' means "no column selected" (presumably the
            # template's default option -- confirm). A missing field yields
            # the None object, which must be skipped too; otherwise a column
            # literally named "None" would be added.
            if column is None or column == 'None':
                continue
            data[str(column)] = make_fake(rows)

        data.to_csv(export_path, sep='|', index=False, header=True)
        flash('File successfully anonymized')
        return render_template('index.html', path_anonymize=export_path)
    except Exception:
        # Best-effort endpoint: still fall back to the plain page, but leave a
        # trace of the failure and let KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).exception('Could not anonymize data')
        return render_template('index.html')
Beispiel #4
0
def main(input_path, image_output_path, weights_path, image_extensions,
         face_threshold, plate_threshold, write_json, obfuscation_parameters):
    """Download the detector weights and anonymize the images in a directory.

    Args:
        input_path: Location of the images, passed to `anonymize_images`.
        image_output_path: Where `anonymize_images` writes its output.
        weights_path: Directory the weights are downloaded to and loaded from.
        image_extensions: Comma-separated file types to process.
        face_threshold: Detection threshold for the 'face' detector.
        plate_threshold: Detection threshold for the 'plate' detector.
        write_json: Forwarded unchanged to `anonymize_images`.
        obfuscation_parameters: Comma-separated string of exactly three
            values: kernel size (int), sigma (float), box kernel size (int).
    """
    download_weights(download_directory=weights_path)

    raw_params = obfuscation_parameters.split(',')
    kernel_size, sigma, box_kernel_size = raw_params
    obfuscator = Obfuscator(
        kernel_size=int(kernel_size),
        sigma=float(sigma),
        box_kernel_size=int(box_kernel_size),
    )

    # One detector per kind, each loading the weights file for that kind.
    detectors = {
        kind: Detector(kind=kind,
                       weights_path=get_weights_path(weights_path, kind=kind))
        for kind in ('face', 'plate')
    }
    thresholds = dict(face=face_threshold, plate=plate_threshold)

    pipeline = Anonymizer(obfuscator=obfuscator, detectors=detectors)
    pipeline.anonymize_images(
        input_path=input_path,
        output_path=image_output_path,
        detection_thresholds=thresholds,
        file_types=image_extensions.split(','),
        write_json=write_json,
    )
Beispiel #5
0
def get_structured_data():
    """Anonymize the text posted in the `inputText` form field.

    The received text is printed before it is processed.

    Returns:
        A JSON response with `status` set to "success" and `response` set to
        the anonymized text.
    """
    raw_text = request.form['inputText']
    print(raw_text)
    payload = {
        "status": "success",
        "response": Anonymizer().get_anonymize_text(raw_text),
    }
    return jsonify(payload)

def anonymize(consumer):
    """Yield anonymized output for the messages read from `consumer`.

    Every message is printed and buffered. Once the buffer holds more than
    `CACHE_SIZE` messages, a copy of the `value` mapping of each buffered
    message is passed to `anonymizer.process` every time a message arrives.
    A list result is yielded element by element; anything else is yielded
    as a single item.

    NOTE(review): the buffer is never trimmed or cleared, so it keeps growing
    and earlier messages are processed again for every new message -- confirm
    whether a bounded window or a flush after processing was intended.
    """
    buffered = deque()
    for message in consumer:
        print(message)
        buffered.append(message)
        if len(buffered) <= CACHE_SIZE:
            continue

        batch = [{**item.value} for item in buffered]
        result = anonymizer.process(batch)
        if isinstance(result, list):
            yield from result
        else:
            yield result


# Number of buffered messages that must be exceeded before `anonymize`
# starts processing.
CACHE_SIZE = 5

# Anonymizer configuration: one key list for the "drop" operation and one
# for the "mean" operation.
anonymizer = Anonymizer({
    "drop": {"keys": ["something-unimportant"]},
    "mean": {"keys": ["some-number"]},
})

# Forward every anonymized item under "anon" (presumably a topic name --
# confirm). Items exposing `.value` are sent as that value; anything else is
# sent as its encoded string form.
for anon_msg in anonymize(consumer):
    if hasattr(anon_msg, "value"):
        producer.send("anon", anon_msg.value)
    else:
        producer.send("anon", str(anon_msg).encode())
  
  # Group the lines of the chat export into messages: a line that starts with a
  # Date Time pattern opens a new message, every other line continues the current one.
  sawDateTimeLine = False # Becomes True once date, time and author have been extracted at least once
  for line in file: # Iterate lazily instead of loading every line up front with readlines()
    line = line.strip() # Guarding against erroneous leading and trailing whitespaces
    if startsWithDateTime(line): # If a line starts with a Date Time pattern, then this indicates the beginning of a new message
      if len(messageBuffer) > 0: # Check if the message buffer contains characters from previous iterations
        parsedData.append([date, time, author, ' '.join(messageBuffer)]) # Save the tokens from the previous message in parsedData
      messageBuffer.clear() # Clear the message buffer so that it can be used for the next message
      date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
      messageBuffer.append(message) # Append message to buffer
      sawDateTimeLine = True
    else:
      messageBuffer.append(line) # If a line doesn't start with a Date Time pattern, then it is part of a multi-line message. So, just append to buffer
  # A message is only saved when the next one starts, so the final message is
  # still sitting in the buffer here; save it too instead of silently dropping it.
  if sawDateTimeLine and len(messageBuffer) > 0:
    parsedData.append([date, time, author, ' '.join(messageBuffer)])

!pip install data-anonymizer-mapper
from anonymizer import Anonymizer
anonymizer = Anonymizer()
print(parsedData)

# One table row per parsed message.
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])
# Swap each author for its anonymized name; entries without an author (None)
# are left untouched.
df['Author'] = df['Author'].apply(
    lambda name: name if name is None else anonymizer.get_anonymized_name(name))
df.head()

df.describe()

# Number of messages per author, drawn as a horizontal bar chart
author_messages = df['Author'].value_counts()
author_messages.plot.barh()

# Messages whose text contains 'scrum' (case-sensitive); rows without a
# usable message text count as non-matching.
messages_Scrum = df[df['Message'].str.contains('scrum', na=False)]
print(messages_Scrum)
Beispiel #8
0
def redact(input_dir, output_dir, anonymizer_config, recognizer_config):
    """Redact the documents in a directory.

    This script tries to redact all documents in the given directory and its subdirectories.

    Note: The redaction is done in an unsupervised manner. You have to ensure, that the chosen recognizers and
    configuration provide results of a sufficient quality on the given data. Do not use for anything critical."""
    # NOTE(review): presumably registered as a click command, in which case the
    # docstring above doubles as the --help text; parameter notes are therefore
    # kept in comments.
    #   input_dir / output_dir: directory to read from / mirror the results into
    #   anonymizer_config: path to a JSON file with the AnonymizerConfig fields
    #   recognizer_config: path to a JSON file with the nerwhal.Config fields
    # Raises UsageError when any of the four arguments is None.

    if input_dir is None or output_dir is None:
        raise UsageError("Please provide an input_dir and output_dir.")
    if anonymizer_config is None or recognizer_config is None:
        # Path(None) below would otherwise fail with an unhelpful TypeError.
        raise UsageError(
            "Please provide an anonymizer_config and recognizer_config.")

    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    anonymizer_config = Path(anonymizer_config)
    recognizer_config = Path(recognizer_config)

    with open(anonymizer_config, "r") as f:
        anonymizer = Anonymizer(AnonymizerConfig(**json.load(f)))

    with open(recognizer_config, "r") as f:
        recognizer_config = nerwhal.Config(**json.load(f))

    click.echo(f'Start redacting files in "{input_dir}" ...')

    # Collect every file below input_dir before handing the list to the
    # progress bar.
    items_to_redact = [
        (root, file)
        for root, _dirs, files in os.walk(input_dir)
        for file in files
    ]

    with progressbar(items_to_redact) as items:
        for root, file in items:
            # Path relative to input_dir; reused to mirror the directory
            # layout below output_dir.
            relative_path = Path(os.path.relpath(root,
                                                 start=input_dir)) / Path(file)
            in_path = input_dir / relative_path

            try:
                wrapper = FileWrapper(in_path)
            except UnsupportedFormat:
                click.echo(
                    f"Warning: Unsupported format for file {relative_path}! This file was skipped!"
                )
                continue
            except Exception:
                click.echo(
                    f"Error while processing file {relative_path}! This file was skipped!",
                    err=True)
                continue

            result = nerwhal.recognize(
                wrapper.text,
                config=recognizer_config,
                combination_strategy="smart-fusion",
                context_words=True,
                return_tokens=False,
            )
            # Key each recognized entity by a string id so the anonymizer
            # output can be matched back to its character offsets.
            id_to_piis = {
                str(idx): pii
                for idx, pii in enumerate(result["ents"])
            }
            piis_for_anonymizer = [
                Pii(tag=pii.tag, text=pii.text, id=idx)
                for idx, pii in id_to_piis.items()
            ]

            # Only piis flagged as modified lead to an alteration.
            anonymized_piis = [
                anonymized_pii
                for anonymized_pii in anonymizer.anonymize(piis_for_anonymizer)
                if anonymized_pii.modified
            ]

            for anonymized_pii in anonymized_piis:
                unanonymized_pii = id_to_piis[anonymized_pii.id]
                wrapper.add_alter(unanonymized_pii.start_char,
                                  unanonymized_pii.end_char,
                                  anonymized_pii.text)
            wrapper.apply_alters()

            out_path = output_dir / relative_path
            out_path.parent.mkdir(parents=True, exist_ok=True)
            wrapper.save(out_path)

    click.echo(f'The redacted files have been written to "{output_dir}".')
Beispiel #9
0
import json
from anonymizer import Anonymizer
from faker import Faker
from faker.providers import internet
import re

# Module-level Anonymizer instance; `replace` uses it to obtain anonymized names.
anonymizer = Anonymizer()


def anonymize_url(url):
    """Anonymize `url` if it contains one of the known identity keywords.

    The keywords are checked in a fixed order and matching is case-sensitive.
    For the first keyword found, the result of `replace(url, keyword)` is
    returned; a URL containing none of them is returned unchanged.
    """
    keywords = ('username', 'users', 'FirstName', 'lastName', 'LastName',
                'password', 'Password')
    matched = next((keyword for keyword in keywords if keyword in url), None)
    if matched is None:
        return url
    return replace(url, matched)


def anonymize_ip():
    """Return a fake private IPv4 address produced by a fresh Faker instance."""
    generator = Faker()
    generator.add_provider(internet)
    return generator.ipv4_private()


def replace(url, string):
    position = url.rfind(string) + len(string) + 1
    substring = url[position:]
    anonymized_string = anonymizer.get_anonymized_name(substring)
    an_url = url.replace(substring, anonymized_string)
    #print(anonymizer.get_original_name(anonymized_string))