def deidentify(source_path, config_path):
    """De-identify the file at *source_path* according to *config_path*.

    Reads the config and the source data, rewrites the config's column
    indices into actual column names (required for building Cape policies),
    creates and applies the Cape policy, and writes the de-identified data
    to the working directory as ``<basename>_nophi<ext>``.

    :param source_path: path to the source data file containing PHI
    :param config_path: path to the de-identification config file
    """
    # read config and source files
    config = read_config(config_path)
    source_df = read_file(source_path, config)

    # Update config so that the Column key values match column names in the
    # data frame; this is important for creating Cape policies.
    indices = list(config['Columns'].keys())
    all_col_names = list(source_df.columns)
    col_names = [all_col_names[idx] for idx in indices]
    config['Columns'] = dict(zip(col_names, list(config['Columns'].values())))

    # create new Cape policy and apply it to source data frame
    policy_path = create_policy(config)
    policy = cape.parse_policy(policy_path)
    new = cape.apply_policy(policy, source_df)

    # save de-identified file (to working dir for now)
    filename_noext, extension = os.path.splitext(source_path)
    _, filename = os.path.split(filename_noext)
    # BUG FIX: the save path previously used a garbled literal instead of the
    # source file's base name (which was computed but never used).
    save_path = f'{filename}_nophi{extension}'
    new.to_csv(save_path,
               sep=config['File']['Delimiter'],
               index=None,
               header=config['File']['Header'])
    print(f'No PHI file: {save_path}')
    print(f'Cape Policy: {policy_path}')
import cape_privacy as cape
import numpy as np
import pandas as pd
from pyspark import sql

# Demo frame first: five float32 ones bumped to 1.2 in a single "ones" column.
pdf = pd.DataFrame(np.ones(5, dtype=np.float32) + 0.2, columns=["ones"])

# Build the Spark session, then hand it to Cape so it is configured for the
# transformations Cape relies on (e.g. Arrow).
sess = (
    sql.SparkSession.builder
    .appName("cape.examples.rounding")
    .getOrCreate()
)
sess = cape.spark.configure_session(sess)

# Move the pandas frame into Spark and display the unmodified values.
df = sess.createDataFrame(pdf)
df.show()

# Parse the rounding policy and apply it; display the rounded result.
policy = cape.parse_policy("policy/spark_round.yaml")
result = cape.apply_policy(policy, df)
result.show()
import cape_privacy as cape
import pandas as pd
import numpy as np

# A single-column frame of five 1.0 values to demonstrate value perturbation.
df = pd.DataFrame(np.ones(5,), columns=["ones"])

# Load the perturbation policy and run the frame through it.
policy = cape.parse_policy("policy/perturb_value_field.yaml")
df = cape.apply_policy(policy, df)

print(df.head())
df["is_claimed"]= data_csv.get_replace(df,"is_claimed",['fal_se', 'truee'] , [False,True] ) # %% data_csv.get_convert_sec_date(df, "last_login" ) # %% data_csv.get_stand_decimal(df, "paid_amount", 2) # %% # policy based encryption policy = cape.parse_policy(env.policy_file) caped_df = cape.apply_policy(policy, df) caped_df.name="test" data_csv.write_csv(caped_df) # %% user_details_temp=pd.DataFrame(df_json.user_details.values.tolist()) user_details=pd.DataFrame.from_records(user_details_temp)[['name','dob','address','username','password','national_id']] user_details.name="user_details" data_json.write_csv(user_details)
import cape_privacy as cape
import pandas as pd
from dataset import load_dataset

# Fetch the demo pandas DataFrame and display it before any masking.
df = load_dataset()
print("Original Dataset:")
print(df.head())

# Parse the masking policy, transform the frame with it, and display the
# masked result.
masking_policy = cape.parse_policy("mask_personal_information.yaml")
df = cape.apply_policy(masking_policy, df)
print("Masked Dataset:")
print(df.head())
async def generate(request: Request,
                   file: UploadFile = File(...),
                   policy_file: UploadFile = File(...),
                   download_data: Any = None):
    """Apply an uploaded Cape policy to an uploaded CSV and render result graphs.

    :param request: incoming request, passed through to the template responses
    :param file: uploaded data file; the name must contain ``.csv``
    :param policy_file: uploaded Cape policy; the name must contain ``.yaml``
    :param download_data: when truthy, return the processed frame as a CSV
        download instead of rendering the HTML page
    :return: a TemplateResponse (or a FileResponse when download_data is set)
    """
    logger.info(
        f"Generate the code based Rendered data: {policy_file.filename} - {file.filename}"
    )
    # Validate uploads by substring search on the filenames; bail out with an
    # error message rendered into the index page.
    if file.filename.find('.csv') < 0:
        return main.app.templates.TemplateResponse(
            "index.html", {
                "request": request,
                "error": "Please upload data only in CSV format...!!!"
            })
    if policy_file.filename.find('.yaml') < 0:
        return main.app.templates.TemplateResponse(
            "index.html", {
                "request": request,
                "error": "Please upload Policy file only in yaml format...!!!"
            })
    df: pd.DataFrame = pd.read_csv(file.file)
    logger.info(df.head())
    # Persist the uploaded policy to /tmp so cape can parse it from a path.
    with open(f"/tmp/{policy_file.filename}", 'wb+') as fd:
        fd.write(policy_file.file.read())
    policy = cape.parse_policy(f"/tmp/{policy_file.filename}")
    secure_df = cape.apply_policy(policy, df)
    logger.info(secure_df.head())
    # Graph 1: name lengths before/after masking, plotted as "recall" bars
    # for the first 6 rows.
    df['20% recall'] = df['name'].apply(len)
    df['secure-name'] = secure_df['name']
    secure_df['40% recall'] = secure_df['name'].apply(len)
    plot_data = pd.concat([df, secure_df], axis=1, ignore_index=False, sort=True)
    plot_data.index = plot_data.index + 1
    plot_data.head(6).reset_index().plot(x="index",
                                         y=["20% recall", "40% recall"],
                                         kind="bar")
    plt.title("Ranking Precision of the Proposed Technique.")
    plt.xlabel("Precision")
    plt.ylabel("Data set size (KB)")
    plt.savefig(f'app/static/{file.filename}-sec.png')
    logger.info("graph one generated")
    # Graph 2: per-row privacy score comparing original and masked names.
    df['privacy score'] = df.apply(
        lambda x: privacy_score(x['name'], x['secure-name']), axis=1)
    plot_data = pd.concat([df, secure_df], axis=1, ignore_index=False, sort=True)
    plot_data.index = plot_data.index + 1
    plot_data.head(6).reset_index().plot(x="index",
                                         y=["privacy score"],
                                         kind="bar")
    plt.title("Privacy Score of the Proposed Technique.")
    plt.xlabel("Client ID")
    plt.ylabel("Privacy Score")
    plt.savefig(f'app/static/{file.filename}-sim.png')
    logger.info("graph two generated")
    # Graph 3: scores for comparison algorithms, produced by scaling the
    # privacy score with random_algorithm(); ERF reuses the raw score.
    df['Bayes Net'] = df.apply(
        lambda x: x['privacy score'] * random_algorithm(), axis=1)
    df['AIRS'] = df.apply(lambda x: x['privacy score'] * random_algorithm(), axis=1)
    df['SVM'] = df.apply(lambda x: x['privacy score'] * random_algorithm(), axis=1)
    df['C4.5'] = df.apply(lambda x: x['privacy score'] * random_algorithm(), axis=1)
    df['CBA'] = df.apply(lambda x: x['privacy score'] * random_algorithm(), axis=1)
    df['ERF'] = df['privacy score']
    plot_data = pd.concat([df, secure_df], axis=1, ignore_index=False, sort=True)
    plot_data.index = plot_data.index + 1
    plot_data.head(6).reset_index().plot(
        x="index",
        y=["Bayes Net", "AIRS", "SVM", "C4.5", "CBA", "ERF"],
        kind="bar")
    plt.title("Accuracy Analysis of the Existing and Proposed Techniques.")
    plt.ylabel("Accuracy")
    plt.xlabel("Data set size (KB)")
    plt.savefig(f'app/static/{file.filename}-thi.png')
    logger.info("graph three generated")
    df.index = df.index + 1
    if download_data:
        # Return the full processed frame as a CSV attachment.
        return FileResponse(content=df.to_csv(),
                            media_type='text/csv',
                            headers={
                                'content-disposition':
                                f"attachment; filename=generate_data.csv",
                                'content-type': 'text/csv'
                            })
    # Default: render both frames (first 20 rows each) into the index page.
    return main.app.templates.TemplateResponse(
        "index.html", {
            "request": request,
            "name": file.filename,
            "file": file,
            "data": df.head(20).to_html(),
            "secure_data": secure_df.head(20).to_html()
        })
from pyspark import sql
from pyspark.sql import functions
import cape_privacy as cape

# Set up your SparkSession as usual, but configure it for use with Cape.
# We do this because some transformations expect Arrow to be enabled.
sess = sql.SparkSession.builder.appName(
    "cape.tutorial.maskPersonalInformation").getOrCreate()
sess = cape.spark.configure_session(sess)

# Load a Spark DataFrame
df = sess.read.load("data/credit_with_pii.csv",
                    format="csv",
                    sep=",",
                    inferSchema="true",
                    header="true")
# Parse the application date column into a proper date type.
df = df.withColumn(
    "Application_date",
    functions.to_date(functions.col("Application_date"), "yyyy-MM-dd"),
)
print("Original Dataset:")
# BUG FIX: DataFrame.show() prints the frame itself and returns None, so the
# previous print(df.show()) emitted a stray "None" line after each table.
df.show()

# Load the privacy policy and apply it to the DataFrame
policy = cape.parse_policy("policy/credit_policy.yaml")
df = cape.apply_policy(policy, df)
print("Masked Dataset:")
df.show()