Beispiel #1
0
def deidentify(source_path, config_path):
    """De-identify a delimited file by generating and applying a Cape policy.

    Reads the configuration and the source data, re-keys the ``Columns``
    section of the configuration so it is addressed by data-frame column
    name, creates a policy from that configuration, applies it, and writes
    the result to ``<name>_nophi<ext>`` in the current working directory.

    :param source_path: path of the file to de-identify
    :param config_path: path of the configuration file
    """
    settings = read_config(config_path)
    frame = read_file(source_path, settings)

    # The configuration keys are used to index the list of column names;
    # replace each key with the column name it selects, which is what the
    # policy creation step expects.
    header = list(frame.columns)
    settings['Columns'] = {
        header[position]: rule
        for position, rule in settings['Columns'].items()
    }

    # Build the Cape policy file, parse it and run it over the source data.
    policy_path = create_policy(settings)
    deidentified = cape.apply_policy(cape.parse_policy(policy_path), frame)

    # The output goes to the working directory for now: same base name and
    # extension as the source, with a "_nophi" marker in between.
    stem, extension = os.path.splitext(source_path)
    filename = os.path.basename(stem)
    print(filename)
    save_path = f'{filename}_nophi{extension}'
    deidentified.to_csv(
        save_path,
        sep=settings['File']['Delimiter'],
        index=None,
        header=settings['File']['Header'],
    )

    print(f'No PHI file: {save_path}')
    print(f'Cave Policy: {policy_path}')
Beispiel #2
0
import cape_privacy as cape
import numpy as np
import pandas as pd
from pyspark import sql

# Name the application on the session builder, then obtain a session from it
# and hand that session to Cape for configuration.
sess_builder = sql.SparkSession.builder.appName("cape.examples.rounding")
sess = cape.spark.configure_session(sess_builder.getOrCreate())

# A single float32 column "ones" holding five values of 1 + 0.2, created in
# pandas and then converted to a Spark data frame.
pdf = pd.DataFrame(
    data=np.ones(5, dtype=np.float32) + 0.2,
    columns=["ones"],
)
df = sess.createDataFrame(pdf)
df.show()

# Parse the policy file and show the data frame after the policy is applied.
policy = cape.parse_policy("policy/spark_round.yaml")
result = cape.apply_policy(policy, df)
result.show()
Beispiel #3
0
import cape_privacy as cape
import pandas as pd
import numpy as np

# Parse the policy file before any data is created.
policy = cape.parse_policy("policy/perturb_value_field.yaml")

# Apply the policy to a one-column frame of five 1.0 values and print the
# first rows of the result.
df = cape.apply_policy(
    policy,
    pd.DataFrame(data=np.ones(5), columns=["ones"]),
)
print(df.head())
def _save_bar_chart(df, secure_df, columns, title, xlabel, ylabel, path):
    """Save a bar chart of ``columns`` for the first six rows to ``path``.

    The original and secured frames are concatenated column-wise, the index
    is shifted to start at 1, and the resulting figure is always closed so
    that figures do not accumulate across requests.
    """
    plot_data = pd.concat([df, secure_df],
                          axis=1,
                          ignore_index=False,
                          sort=True)
    plot_data.index = plot_data.index + 1
    try:
        plot_data.head(6).reset_index().plot(x="index", y=columns, kind="bar")
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.savefig(path)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close()


async def generate(request: Request,
                   file: UploadFile = File(...),
                   policy_file: UploadFile = File(...),
                   download_data: Any = None):
    """Apply an uploaded Cape policy to an uploaded CSV and render the result.

    The CSV must contain a ``name`` column. Three bar charts are written to
    ``app/static`` and derived columns are added to the data frame.

    :param request: incoming request, passed through to the template
    :param file: uploaded data file; its name must end in ``.csv``
    :param policy_file: uploaded policy file; its name must end in ``.yaml``
    :param download_data: when truthy, return the augmented data as a CSV
        attachment instead of rendering the page
    :return: the ``index.html`` template response (with an ``error`` entry
        when an upload has the wrong extension) or a CSV download response
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    from fastapi import Response

    logger.info(
        f"Generate the code based Rendered data: {policy_file.filename} - {file.filename}"
    )

    # File names come from the client: drop any directory component before
    # using them in a path, and check the real suffix, not a substring.
    data_name = os.path.basename(file.filename or "")
    policy_name = os.path.basename(policy_file.filename or "")
    if not data_name.lower().endswith('.csv'):
        return main.app.templates.TemplateResponse(
            "index.html", {
                "request": request,
                "error": "Please upload data only in CSV format...!!!"
            })
    if not policy_name.lower().endswith('.yaml'):
        return main.app.templates.TemplateResponse(
            "index.html", {
                "request": request,
                "error": "Please upload Policy file only in yaml format...!!!"
            })

    df: pd.DataFrame = pd.read_csv(file.file)
    logger.info(df.head())

    # Write the policy to a unique temporary file so concurrent requests do
    # not overwrite each other, and remove it once it has been used.
    handle, policy_path = tempfile.mkstemp(suffix='.yaml')
    try:
        with os.fdopen(handle, 'wb') as fd:
            fd.write(policy_file.file.read())
        policy = cape.parse_policy(policy_path)
        secure_df = cape.apply_policy(policy, df)
    finally:
        os.remove(policy_path)
    logger.info(secure_df.head())

    df['20% recall'] = df['name'].apply(len)
    df['secure-name'] = secure_df['name']
    secure_df['40% recall'] = secure_df['name'].apply(len)
    _save_bar_chart(df, secure_df,
                    ["20% recall", "40% recall"],
                    "Ranking Precision of the Proposed Technique.",
                    "Precision",
                    "Data set size (KB)",
                    f'app/static/{data_name}-sec.png')
    logger.info("graph one generated")

    df['privacy score'] = df.apply(
        lambda x: privacy_score(x['name'], x['secure-name']), axis=1)
    _save_bar_chart(df, secure_df,
                    ["privacy score"],
                    "Privacy Score of the Proposed Technique.",
                    "Client ID",
                    "Privacy Score",
                    f'app/static/{data_name}-sim.png')
    logger.info("graph two generated")

    # Each comparison column is the privacy score scaled by a fresh
    # random_algorithm() value per row; "ERF" is the unscaled score.
    for algorithm in ("Bayes Net", "AIRS", "SVM", "C4.5", "CBA"):
        df[algorithm] = df.apply(
            lambda x: x['privacy score'] * random_algorithm(), axis=1)
    df['ERF'] = df['privacy score']
    _save_bar_chart(df, secure_df,
                    ["Bayes Net", "AIRS", "SVM", "C4.5", "CBA", "ERF"],
                    "Accuracy Analysis of the Existing and Proposed Techniques.",
                    "Data set size (KB)",
                    "Accuracy",
                    f'app/static/{data_name}-thi.png')
    logger.info("graph three generated")

    df.index = df.index + 1
    if download_data:
        # The CSV is built in memory, so it is sent with a plain Response;
        # FileResponse serves a file from a path and has no content argument.
        return Response(content=df.to_csv(),
                        media_type='text/csv',
                        headers={
                            'content-disposition':
                            "attachment; filename=generate_data.csv",
                            'content-type': 'text/csv'
                        })

    return main.app.templates.TemplateResponse(
        "index.html", {
            "request": request,
            "name": data_name,
            "file": file,
            "data": df.head(20).to_html(),
            "secure_data": secure_df.head(20).to_html()
        })