Example #1
0
def deidentify(source_path, config_path):
    """De-identify a delimited data file using a Cape privacy policy.

    Reads the config and the source file, remaps the config's positional
    column indices to the actual column names of the loaded data frame,
    builds a Cape policy from the config, applies it, and writes the
    de-identified result to the working directory as
    ``<basename>_nophi<ext>``.

    :param source_path: path to the delimited source data file
    :param config_path: path to the config file describing the columns
    """
    # read config and source files
    config = read_config(config_path)
    source_df = read_file(source_path, config)

    # Update config so that the Column key values match column names in
    # the data frame; this is important for creating Cape policies.
    # The config's 'Columns' keys are positional indices into the frame.
    indices = list(config['Columns'].keys())
    all_col_names = list(source_df.columns)
    col_names = [all_col_names[idx] for idx in indices]
    config['Columns'] = dict(zip(col_names, list(config['Columns'].values())))

    # create new Cape policy and apply it to source data frame
    policy_path = create_policy(config)
    policy = cape.parse_policy(policy_path)
    new = cape.apply_policy(policy, source_df)

    # save de-identified file (to the working dir for now)
    filename_noext, extension = os.path.splitext(source_path)
    _, filename = os.path.split(filename_noext)
    print(filename)
    # BUG FIX: the output name previously contained a literal placeholder
    # instead of the computed source basename, so every run clobbered the
    # same file and `filename` went unused.
    save_path = f'{filename}_nophi{extension}'
    new.to_csv(save_path,
               sep=config['File']['Delimiter'],
               index=None,
               header=config['File']['Header'])

    print(f'No PHI file: {save_path}')
    # BUG FIX: message previously read "Cave Policy".
    print(f'Cape Policy: {policy_path}')
Example #2
0
import cape_privacy as cape
import numpy as np
import pandas as pd
from pyspark import sql

# Build a SparkSession and hand it to Cape so its Spark transformations
# (which rely on Arrow) can run against it.
builder = sql.SparkSession.builder.appName("cape.examples.rounding")
sess = cape.spark.configure_session(builder.getOrCreate())

# A single "ones" column holding five float32 values of 1.2.
source = pd.DataFrame(np.ones(5, dtype=np.float32) + 0.2, columns=["ones"])
df = sess.createDataFrame(source)
df.show()

# Parse the rounding policy and apply it to the Spark DataFrame.
policy = cape.parse_policy("policy/spark_round.yaml")
result = cape.apply_policy(policy, df)
result.show()
Example #3
0
import cape_privacy as cape
import pandas as pd
import numpy as np

# Load the value-perturbation policy from disk.
policy = cape.parse_policy("policy/perturb_value_field.yaml")

# One "ones" column of five 1.0 values; apply the policy and preview it.
frame = pd.DataFrame(np.ones(5,), columns=["ones"])
frame = cape.apply_policy(policy, frame)
print(frame.head())
Example #4
0
# Notebook-style fragment: cleans a few columns of `df` via the external
# `data_csv` helper module, applies a Cape policy, then writes CSV extracts.
# `data_csv`, `data_json`, `env`, `df_json` and `cape` are all defined
# outside this chunk — TODO confirm their contracts before changing anything.

# Map the raw markers 'fal_se'/'truee' in "is_claimed" onto real booleans
# (presumably a find-and-replace helper; verify against data_csv).
df["is_claimed"]= data_csv.get_replace(df,"is_claimed",['fal_se', 'truee']
                                                 , [False,True] )


# %%
# Convert "last_login" (presumably epoch seconds) into a date — TODO confirm.
data_csv.get_convert_sec_date(df, "last_login" )


# %%
# Standardize "paid_amount" to 2 decimal places — assumes in-place mutation
# of df, matching the call above; verify against data_csv.
data_csv.get_stand_decimal(df, "paid_amount", 2)


# %%
# policy based encryption 

# Apply the Cape policy referenced by the environment/config object.
policy = cape.parse_policy(env.policy_file)
caped_df = cape.apply_policy(policy, df)

# NOTE(review): setting `.name` on a DataFrame is not a pandas-defined
# attribute; data_csv.write_csv presumably uses it as the output file name.
caped_df.name="test"

data_csv.write_csv(caped_df)


# %%
# Flatten the nested `user_details` records from the JSON source frame.
user_details_temp=pd.DataFrame(df_json.user_details.values.tolist())

# Keep only the identity-related columns.
user_details=pd.DataFrame.from_records(user_details_temp)[['name','dob','address','username','password','national_id']]

# Same `.name` convention as above — presumably the output file name.
user_details.name="user_details"

data_json.write_csv(user_details)
import cape_privacy as cape
import pandas as pd

from dataset import load_dataset

# Pull in the example DataFrame and show it before masking.
data = load_dataset()
print("Original Dataset:")
print(data.head())

# Mask personal information as described by the YAML policy.
masking_policy = cape.parse_policy("mask_personal_information.yaml")
data = cape.apply_policy(masking_policy, data)

print("Masked Dataset:")
print(data.head())
async def generate(request: Request,
                   file: UploadFile = File(...),
                   policy_file: UploadFile = File(...),
                   download_data: Any = None):
    """Apply an uploaded Cape policy to an uploaded CSV and render results.

    Validates the two uploads, applies the policy to the CSV, generates
    three bar charts saved under app/static/, and either returns the
    augmented data as a CSV download (when ``download_data`` is truthy)
    or renders index.html with HTML previews of both frames.

    :param request: incoming request, passed through to the template
    :param file: uploaded data file; must contain '.csv' in its name
    :param policy_file: uploaded Cape policy; must contain '.yaml' in its name
    :param download_data: any truthy value triggers the CSV download path
    :return: a TemplateResponse, or a FileResponse when downloading
    """
    logger.info(
        f"Generate the code based Rendered data: {policy_file.filename} - {file.filename}"
    )
    # NOTE(review): `find(...) < 0` only checks the substring appears
    # anywhere in the name, not that it is the actual extension.
    if file.filename.find('.csv') < 0:
        return main.app.templates.TemplateResponse(
            "index.html", {
                "request": request,
                "error": "Please upload data only in CSV format...!!!"
            })
    if policy_file.filename.find('.yaml') < 0:
        return main.app.templates.TemplateResponse(
            "index.html", {
                "request": request,
                "error": "Please upload Policy file only in yaml format...!!!"
            })

    # Parse the CSV upload, persist the policy to /tmp (cape.parse_policy
    # reads from a path), then apply the policy.
    df: pd.DataFrame = pd.read_csv(file.file)
    logger.info(df.head())
    with open(f"/tmp/{policy_file.filename}", 'wb+') as fd:
        fd.write(policy_file.file.read())
    policy = cape.parse_policy(f"/tmp/{policy_file.filename}")
    secure_df = cape.apply_policy(policy, df)
    logger.info(secure_df.head())

    # Graph 1: "recall" metrics — both are just the length of the 'name'
    # field before/after masking; assumes a 'name' column exists in the CSV.
    df['20% recall'] = df['name'].apply(len)
    df['secure-name'] = secure_df['name']
    secure_df['40% recall'] = secure_df['name'].apply(len)
    plot_data = pd.concat([df, secure_df],
                          axis=1,
                          ignore_index=False,
                          sort=True)
    plot_data.index = plot_data.index + 1
    plot_data.head(6).reset_index().plot(x="index",
                                         y=["20% recall", "40% recall"],
                                         kind="bar")
    plt.title("Ranking Precision of the Proposed Technique.")
    plt.xlabel("Precision")
    plt.ylabel("Data set size (KB)")
    plt.savefig(f'app/static/{file.filename}-sec.png')
    logger.info("graph one generated")

    # Graph 2: per-row privacy score comparing original vs. masked name
    # (privacy_score is defined elsewhere — contract not visible here).
    df['privacy score'] = df.apply(
        lambda x: privacy_score(x['name'], x['secure-name']), axis=1)
    plot_data = pd.concat([df, secure_df],
                          axis=1,
                          ignore_index=False,
                          sort=True)
    plot_data.index = plot_data.index + 1
    plot_data.head(6).reset_index().plot(x="index",
                                         y=["privacy score"],
                                         kind="bar")
    plt.title("Privacy Score of the Proposed Technique.")
    plt.xlabel("Client ID")
    plt.ylabel("Privacy Score")
    plt.savefig(f'app/static/{file.filename}-sim.png')
    logger.info("graph two generated")

    # Graph 3: "comparison" columns are the privacy score scaled by
    # random_algorithm() (defined elsewhere) — synthetic demo values,
    # not real classifier accuracies.
    df['Bayes Net'] = df.apply(
        lambda x: x['privacy score'] * random_algorithm(), axis=1)
    df['AIRS'] = df.apply(lambda x: x['privacy score'] * random_algorithm(),
                          axis=1)
    df['SVM'] = df.apply(lambda x: x['privacy score'] * random_algorithm(),
                         axis=1)
    df['C4.5'] = df.apply(lambda x: x['privacy score'] * random_algorithm(),
                          axis=1)
    df['CBA'] = df.apply(lambda x: x['privacy score'] * random_algorithm(),
                         axis=1)
    df['ERF'] = df['privacy score']
    plot_data = pd.concat([df, secure_df],
                          axis=1,
                          ignore_index=False,
                          sort=True)
    plot_data.index = plot_data.index + 1
    plot_data.head(6).reset_index().plot(
        x="index",
        y=["Bayes Net", "AIRS", "SVM", "C4.5", "CBA", "ERF"],
        kind="bar")
    plt.title("Accuracy Analysis of the Existing and Proposed Techniques.")
    plt.ylabel("Accuracy")
    plt.xlabel("Data set size (KB)")
    plt.savefig(f'app/static/{file.filename}-thi.png')
    logger.info("graph three generated")

    df.index = df.index + 1
    if download_data:
        # NOTE(review): Starlette's FileResponse expects a file *path* as
        # its first argument and has no `content` kwarg; a plain Response
        # (media_type='text/csv') is likely intended here — confirm.
        return FileResponse(content=df.to_csv(),
                            media_type='text/csv',
                            headers={
                                'content-disposition':
                                f"attachment; filename=generate_data.csv",
                                'content-type': 'text/csv'
                            })

    return main.app.templates.TemplateResponse(
        "index.html", {
            "request": request,
            "name": file.filename,
            "file": file,
            "data": df.head(20).to_html(),
            "secure_data": secure_df.head(20).to_html()
        })
Example #7
0
from pyspark import sql
from pyspark.sql import functions

import cape_privacy as cape

# Set up your SparkSession as usual, then configure it for use with Cape;
# some of Cape's transformations expect Arrow to be enabled.
builder = sql.SparkSession.builder.appName(
    "cape.tutorial.maskPersonalInformation")
sess = cape.spark.configure_session(builder.getOrCreate())

# Load the PII sample as a Spark DataFrame, inferring the schema.
df = sess.read.load("data/credit_with_pii.csv",
                    format="csv",
                    sep=",",
                    inferSchema="true",
                    header="true")

# Parse the application-date strings into a proper date column.
parsed_date = functions.to_date(functions.col("Application_date"), "yyyy-MM-dd")
df = df.withColumn("Application_date", parsed_date)

print("Original Dataset:")
print(df.show())

# Load the privacy policy and apply it to the DataFrame.
policy = cape.parse_policy("policy/credit_policy.yaml")
df = cape.apply_policy(policy, df)

print("Masked Dataset:")
print(df.show())