Example #1
0
def main(opt):
    """Run the row-wise concatenation demo with a provenance tracker.

    Two small in-memory frames are built; the first one is wrapped in a
    ``ProvenanceTracker`` and the concatenation of both frames is then
    assigned to the tracker and printed.

    :param opt: falsy selects ``pr.Provenance`` writing to
        ``prov_results/Testing``; truthy selects ``pr_lib.Provenance``
        writing to the ``FP`` sub-folder of that directory.
    """
    def _log(message):
        # Timestamped progress line, same layout as the other examples.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    # Folder that receives the generated provenance files.
    savepath = os.path.join('prov_results', 'Testing')

    first_data = {
        'B': ['B2', 'B3', 'B6', 'B7'],
        'D': ['D2', 'D3', 'D6', 'D7'],
        'F': ['F2', 'F3', 'F6', 'F7'],
    }
    df = pd.DataFrame(first_data, index=[2, 3, 6, 7])
    _log('Initialization')

    # Choose the provenance backend for this run.
    if opt:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)
    else:
        p = pr.Provenance(df, savepath)
    tracker = ProvenanceTracker.ProvenanceTracker(df, p)

    second_data = {
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
        'C': ['C0', np.nan, 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
    df4 = pd.DataFrame(second_data, index=[0, 1, 2, 3])

    # Register the second operand and describe the join before the
    # combined frame is handed to the tracker.
    tracker.add_second_df(df4)
    tracker.set_join_op(axis=0, on=None)
    stacked = pd.concat([df, df4], axis=0, sort=False)
    tracker.df = stacked
    print(tracker.df)
Example #2
0
def main(input_path, opt):
    """Add a null-commission flag column and record its provenance.

    Reads the trade CSV, creates the provenance document, adds the
    ``T_BOOL`` column (0 where ``T_COMM`` is missing, 1 otherwise) and
    asks the provenance object to record the space transformation.

    :param input_path: path of the CSV file to load; its base name is also
        used as the results sub-folder under ``prov_results``.
    :param opt: truthy selects ``pr.Provenance``, falsy selects
        ``pr_lib.Provenance``.
    """
    def _log(message):
        # Timestamped progress line.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'ST_prov')

    df = pd.read_csv(input_path)
    # Trade columns
    #['T_ID', 'T_DTS', 'T_ST_ID', 'T_TT_ID', 'T_IS_CASH', 'T_S_SYMB', 'T_QTY', 'T_BIDPRICE', 'C_ID', 'T_EXEX_NAME', 'T_TRADE_PRICE', 'T_CHRG', 'T_COMM', 'T_TAX', 'ActionType', 'ActionTS', 'C_TAX_ID', 'C_L_NAME', 'C_F_NAME', 'C_M_NAME', 'C_GNDR', 'C_TIER', 'C_DOB', 'C_ADLINE1', 'C_ADLINE2', 'C_ZIPCODE', 'C_CITY', 'C_STATE_PROV', 'C_CTRY', 'C_CTRY_1', 'C_AREA_1', 'C_LOCAL_1', 'C_EXT_1', 'C_CTRY_2', 'C_AREA_2', 'C_LOCAL_2', 'C_EXT_2', 'C_CTRY_3', 'C_AREA_3', 'C_LOCAL_3', 'C_EXT_3', 'C_EMAIL_1', 'C_EMAIL_2', 'C_LCL_TX_ID', 'C_NAT_TX_ID']

    _log('Initialization')
    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    _log('Input prov entities created and saved')

    # SPACE TRANSFORMATION: add column on trade T_BOOL
    # Check Null Commission.  read_csv represents missing cells as NaN, and
    # ``NaN == None`` is False, so the flag has to be derived with notna()
    # rather than a comparison against None.
    df['T_BOOL'] = df['T_COMM'].notna().astype(int)

    _log('Space Transformation done: Check Null Commission, T_BOOL column added')

    # GET PROVENANCE
    p.get_prov_space_transformation(df, ['T_COMM'])
    _log('Prov Space Transformation saved')
Example #3
0
def main(input_path, opt):
    """Append one generated record and record its provenance.

    Reads the CSV, creates the provenance document, appends a single row
    whose ``T_COMM`` is the column maximum (all other columns are left
    missing) and asks the provenance object to record the instance
    generation.

    :param input_path: path of the CSV file to load; its base name is also
        used as the results sub-folder under ``prov_results``.
    :param opt: truthy selects ``pr.Provenance``, falsy selects
        ``pr_lib.Provenance``.
    """
    def _log(message):
        # Timestamped progress line.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'IG_prov')

    df = pd.read_csv(input_path)

    _log('Initialization')
    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    _log('Input prov entities created and saved')

    # INSTANCE GENERATION: add one record to dataframe.
    # DataFrame.append was removed in pandas 2.0; pd.concat with a one-row
    # frame is the supported equivalent.  sort=False keeps the original
    # column order.
    valueMax_comm = df['T_COMM'].max()
    new_row = pd.DataFrame([{'T_COMM': valueMax_comm}])
    df = pd.concat([df, new_row], ignore_index=True, sort=False)
    _log('Instance Generation done: added one record to dataframe')

    # GET PROVENANCE
    p.get_prov_instance_generation(df, ['T_COMM'])
    _log('Prov Instance Generation saved')
Example #4
0
def main(input_path, opt):
    """Normalise the gender column and record its provenance.

    Reads the CSV, creates the provenance document, upper-cases
    ``C_GNDR``, replaces every value other than ``'M'`` or ``'F'``
    (including missing values) with ``'U'`` and asks the provenance object
    to record the feature transformation.

    :param input_path: path of the CSV file to load; its base name is also
        used as the results sub-folder under ``prov_results``.
    :param opt: truthy selects ``pr.Provenance``, falsy selects
        ``pr_lib.Provenance``.
    """
    def _log(message):
        # Timestamped progress line.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'FT_prov')

    df = pd.read_csv(input_path)
    # Trade columns
    #['T_ID', 'T_DTS', 'T_ST_ID', 'T_TT_ID', 'T_IS_CASH', 'T_S_SYMB', 'T_QTY', 'T_BIDPRICE', 'C_ID', 'T_EXEX_NAME', 'T_TRADE_PRICE', 'T_CHRG', 'T_COMM', 'T_TAX', 'ActionType', 'ActionTS', 'C_TAX_ID', 'C_L_NAME', 'C_F_NAME', 'C_M_NAME', 'C_GNDR', 'C_TIER', 'C_DOB', 'C_ADLINE1', 'C_ADLINE2', 'C_ZIPCODE', 'C_CITY', 'C_STATE_PROV', 'C_CTRY', 'C_CTRY_1', 'C_AREA_1', 'C_LOCAL_1', 'C_EXT_1', 'C_CTRY_2', 'C_AREA_2', 'C_LOCAL_2', 'C_EXT_2', 'C_CTRY_3', 'C_AREA_3', 'C_LOCAL_3', 'C_EXT_3', 'C_EMAIL_1', 'C_EMAIL_2', 'C_LCL_TX_ID', 'C_NAT_TX_ID']

    _log('Initialization')
    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    _log('Input prov entities created and saved')

    # FEATURE TRANSFORMATION: correct invalid gender.
    # Gender is uppercased. Values other than 'M' or 'F' are replaced with 'U'.
    # The previous test (`g is not 'F' or g is not 'M'`) was always true, so
    # every row became 'U'; membership is what the rule actually calls for.
    gender = df['C_GNDR'].str.upper()
    df['C_GNDR'] = gender.where(gender.isin(['F', 'M']), 'U')

    _log('Feature Transformation done: correct invalid gender entities')

    # GET PROVENANCE
    p.get_prov_feature_transformation(df, ['C_GNDR'])
    _log('Prov Feature Transformation saved')
Example #5
0
def main(input_path, opt):
    """Drop one randomly chosen column and record its provenance.

    Reads the CSV, creates the provenance document, removes a single
    column picked uniformly at random and asks the provenance object to
    record the dimensionality reduction.

    :param input_path: path of the CSV file to load; its base name is also
        used as the results sub-folder under ``prov_results``.
    :param opt: truthy selects ``pr.Provenance``, falsy selects
        ``pr_lib.Provenance``.
    """
    def _log(message):
        # Timestamped progress line.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'DM_prov')

    df = pd.read_csv(input_path)

    _log('Initialization')
    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    _log('Input prov entities created and saved')

    # DIMENSIONALITY REDUCTION: randomly removes one column from df.
    # randrange(n) already excludes n, so the upper bound is the column
    # count itself; subtracting one made the last column unselectable and
    # raised ValueError on a single-column frame.
    columns = df.columns
    random_col = randrange(len(columns))
    to_delete = columns[random_col]
    df = df.drop([to_delete], axis=1)
    _log('Dimensionality Reduction done: ' + to_delete + ' column deleted')

    # GET PROVENANCE
    p.get_prov_dim_reduction(df)
    _log('Prov Dimensionality Reduction saved')
Example #6
0
def main(opt):
    """Run the two-key right-join demo with a provenance tracker.

    Builds a left and a right in-memory frame, wraps the left one in a
    ``ProvenanceTracker``, registers the right one as second operand and
    assigns the right join on ``key1``/``key2`` back to the tracker before
    printing it.

    :param opt: falsy selects ``pr.Provenance`` writing to
        ``prov_results/Join``; truthy selects ``pr_lib.Provenance`` writing
        to the ``FP`` sub-folder of that directory.
    """
    def _log(message):
        # Timestamped progress line.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    # Folder that receives the generated provenance files.
    savepath = os.path.join('prov_results', 'Join')

    left_data = {
        'key1': ['K0', 'K0', 'K1', 'K2', 'K0'],
        'key2': ['K0', 'K1', 'K0', 'K1', 'K0'],
        'A': ['A0', 'A1', 'A2', 'A3', 'A4'],
        'B': ['B0', 'B1', 'B2', 'B3', 'B4'],
    }
    df = pd.DataFrame(left_data)
    _log('Initialization')

    # Choose the provenance backend for this run.
    if opt:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)
    else:
        p = pr.Provenance(df, savepath)
    tracker = ProvenanceTracker.ProvenanceTracker(df, p)

    right_data = {
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K0', 'K0', 'K0'],
        'A': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
        'C': ['B0', 'B1', 'B2', 'B3'],
    }
    right = pd.DataFrame(right_data)

    # Announce the join to the tracker, then perform it on the frames the
    # tracker currently holds (left operand is read first, then the right).
    tracker.add_second_df(right)
    tracker.set_join_op(axis=None, on=['key1', 'key2'])
    left_frame = tracker.df
    right_frame = tracker.second_df
    tracker.df = pd.merge(left_frame, right_frame,
                          on=['key1', 'key2'], how='right')
    print(tracker.df)
Example #7
0
def main(input_path, opt):
    """Impute missing commissions with the column mean and record provenance.

    Reads the CSV, creates the provenance document, replaces the NaN
    entries of ``T_COMM`` with the mean of that column and asks the
    provenance object to record the imputation.

    :param input_path: path of the CSV file to load; its base name is also
        used as the results sub-folder under ``prov_results``.
    :param opt: truthy selects ``pr.Provenance``, falsy selects
        ``pr_lib.Provenance``.
    """
    def _log(message):
        # Timestamped progress line.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    # Results go to prov_results/<input file name>/I_prov.
    savepath = os.path.join('prov_results', os.path.basename(input_path),
                            'I_prov')

    df = pd.read_csv(input_path)

    _log('Initialization')
    # Build the provenance document with the selected backend.
    provenance_cls = pr.Provenance if opt else pr_lib.Provenance
    p = provenance_cls(df, savepath)

    _log('Input prov entities created and saved')

    # IMPUTATION: NaN cells of T_COMM receive the column average.
    commission = df['T_COMM']
    df['T_COMM'] = commission.fillna(commission.mean())
    _log('Imputation done: fill NaN elements of T_COMM column')

    # Record the provenance of the imputed column.
    p.get_prov_imputation(df, ['T_COMM'])
    _log('Prov Imputation saved')
Example #8
0
def main(input_path, opt):
    """Blank out implausible dates of birth and record provenance.

    Reads the CSV, creates the provenance document, replaces every
    ``C_DOB`` outside the inclusive range ``1917-07-07`` .. ``2017-07-07``
    with NaN and asks the provenance object to record the value
    transformation.  Dates are compared as strings, which orders them
    correctly as long as they are in ISO ``YYYY-MM-DD`` form.

    :param input_path: path of the CSV file to load; its base name is also
        used as the results sub-folder under ``prov_results``.
    :param opt: truthy selects ``pr.Provenance``, falsy selects
        ``pr_lib.Provenance``.
    """
    def _log(message):
        # Timestamped progress line.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'VT_prov')

    df = pd.read_csv(input_path)

    _log('Initialization')
    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    _log('Input prov entities created and saved')

    # VALUE TRANSFORMATION: remove invalid date of birth [C_DOB].
    # DOB < Batch_Date - 100 or DOB > Batch_Date
    batch_Date_from = '1917-07-07'
    batch_Date_to = '2017-07-07'
    # A missing DOB arrives as float NaN; comparing it with a str would
    # raise TypeError, so missing values are passed through as NaN before
    # the range test is attempted.
    df['C_DOB'] = [
        np.nan
        if pd.isna(g) or not (batch_Date_from <= g <= batch_Date_to)
        else g
        for g in df.C_DOB
    ]

    _log('Value Transformation done: removed invalid DOB')

    # GET PROVENANCE
    p.get_prov_value_transformation(df, ['C_DOB'])
    _log('Prov Value Transformation saved')
Example #9
0
def main(opt):
    """Run the Compas preprocessing steps through a ProvenanceTracker.

    Loads ``../real_world_pipeline/Datasets/compas.csv``, wraps the frame in
    a ``ProvenanceTracker`` and applies the preprocessing operations below
    by reading from and assigning to ``tracker.df``.

    NOTE(review): provenance is presumably captured by the tracker whenever
    ``tracker.df`` is assigned (the explicit ``p.get_prov_*`` calls are all
    commented out) -- confirm against ProvenanceTracker.

    :param opt: truthy selects ``pr.Provenance`` writing to
        ``prov_results/Compas``; falsy selects ``pr_lib.Provenance`` writing
        to the ``FP`` sub-folder of that directory.
    """
    input_path = '../real_world_pipeline/Datasets/compas.csv'
    output_path = 'prov_results'

    # Specify where to save the processed files as savepath
    savepath = os.path.join(output_path, 'Compas')

    df = pd.read_csv(input_path, header=0)

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')
    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)
    tracker = ProvenanceTracker.ProvenanceTracker(df, p)

    # OPERATION 0
    # select relevant columns
    tracker.df = tracker.df[[
        'age', 'c_charge_degree', 'race', 'sex', 'priors_count',
        'days_b_screening_arrest', 'two_year_recid', 'c_jail_in', 'c_jail_out'
    ]]

    #d = p.get_prov_dim_reduction(df)

    # OPERATION 1
    # Remove missing values (currently disabled)
    #tracker.df = tracker.df.dropna()

    #d = p.get_prov_dim_reduction(df)

    # OPERATION 2
    # Make race binary: 1 for 'Caucasian', 0 for every other value
    tracker.df.race = [0 if r != 'Caucasian' else 1 for r in tracker.df.race]
    # imputation test: fill missing days_b_screening_arrest with the column mean
    AVG_comm = tracker.df['days_b_screening_arrest'].mean()
    tracker.df['days_b_screening_arrest'] = tracker.df[
        'days_b_screening_arrest'].fillna(AVG_comm)

    #d = p.get_prov_feature_transformation(df, ['race'])

    # OPERATION 3
    # Make two_year_recid the label
    tracker.df = tracker.df.rename({'two_year_recid': 'label'}, axis=1)

    # reverse label for consistency with function defs: 1 means no recid (good), 0 means recid (bad)
    tracker.df.label = [0 if l == 1 else 1 for l in tracker.df.label]

    #d = p.get_prov_feature_transformation(df, ['label'])

    # OPERATION 4
    # convert jailtime to days (whole days between c_jail_in and c_jail_out)
    tracker.df['jailtime'] = (pd.to_datetime(tracker.df.c_jail_out) -
                              pd.to_datetime(tracker.df.c_jail_in)).dt.days
    # NOTE(review): presumably tells the tracker that the new column was
    # derived from these two source columns -- confirm in ProvenanceTracker.
    tracker.stop_space_prov(['c_jail_in', 'c_jail_out'])
    # Get provenance of space transformation
    #d = p.get_prov_space_transformation(df, ['c_jail_out', 'c_jail_in'])

    # OPERATION 5
    # drop jail in and out dates
    tracker.df = tracker.df.drop(['c_jail_in', 'c_jail_out'], axis=1)

    #d = p.get_prov_dim_reduction(df)

    # OPERATION 6
    # Encode charge degree: 'M' becomes 0, every other value becomes 1.
    # Original note: "M: misconduct, F: felony".
    # NOTE(review): 'M' probably stands for misdemeanor -- confirm against the dataset docs.
    tracker.df.c_charge_degree = [
        0 if s == 'M' else 1 for s in tracker.df.c_charge_degree
    ]

    #d = p.get_prov_feature_transformation(df, ['c_charge_degree'])

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Prov saved')
Example #10
0
def main(opt):
    """Run the census preprocessing steps through a ProvenanceTracker.

    Loads ``../real_world_pipeline/Datasets/census.csv``, assigns the fixed
    list of 15 column names, wraps the frame in a ``ProvenanceTracker`` and
    applies the preprocessing operations below by reading from and
    assigning to ``tracker.df``.

    NOTE(review): provenance is presumably captured by the tracker whenever
    ``tracker.df`` is assigned (the explicit ``p.get_prov_*`` calls are all
    commented out) -- confirm against ProvenanceTracker.

    :param opt: truthy selects ``pr.Provenance`` writing to
        ``prov_results/census``; falsy selects ``pr_lib.Provenance`` writing
        to the ``FP`` sub-folder of that directory.
    """
    input_path = '../real_world_pipeline/Datasets/census.csv'
    filename_ext = os.path.basename(input_path)
    filename, ext = os.path.splitext(filename_ext)
    output_path = 'prov_results'

    # Specify where to save the processed files as savepath
    savepath = os.path.join(output_path, filename)

    # NOTE(review): read_csv is called with its default header handling and
    # the column names are overwritten right after; if census.csv has no
    # header row, its first record is consumed as the header -- confirm.
    df = pd.read_csv(input_path)

    # Assign names to columns
    names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
             'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'label']

    df.columns = names

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')
    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)
    tracker=ProvenanceTracker.ProvenanceTracker(df, p)

    # OPERATION 0
    # Strip surrounding whitespace from the values of the text columns
    col = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country',
           'label']

    # NOTE(review): str.strip is applied to every cell, so a non-string
    # cell (e.g. NaN) in these columns would raise TypeError -- confirm the
    # input never contains one.
    for c in col:
        tracker.df[c] = tracker.df[c].map(str.strip)

    # PROVENANCE 0
    #d = p.get_prov_feature_transformation(df, col, 'Cleanup names from spaces')

    # OPERATION 1
    # Replace ? character for NaN value
    tracker.df = tracker.df.replace('?', np.nan)

    # PROVENANCE 1
    #d = p.get_prov_value_transformation(df, df.columns)

    # OPERATION 2-3
    # One-hot encode categorical variables
    col = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']

    # Each pass appends the indicator columns of one variable and then
    # drops the original column, so the tracker is assigned twice per pass.
    for c in col:
        dummies = []
        dummies.append(pd.get_dummies(tracker.df[c]))
        df_dummies = pd.concat(dummies, axis=1)
        tracker.df = pd.concat((tracker.df, df_dummies), axis=1)
        tracker.df = tracker.df.drop([c], axis=1)
        # PROVENANCE 2-3
        #d = p.get_prov_space_transformation(df, [c])

    # OPERATION 4
    # Assign sex and label binary values 0 and 1
    tracker.df.sex = tracker.df.sex.replace('Male', 1)
    tracker.df.sex = tracker.df.sex.replace('Female', 0)
    tracker.df.label = tracker.df.label.replace('<=50K', 0)
    tracker.df.label = tracker.df.label.replace('>50K', 1)

    # PROVENANCE 4
    #col = ['sex', 'label']
    #d = p.get_prov_feature_transformation(df, col, 'Assign sex and label binary values 0 and 1')

    # OPERATION 5
    # Drop fnlwgt variable
    tracker.df = tracker.df.drop(['fnlwgt'], axis=1)
    # PROVENANCE 5
    #d = p.get_prov_dim_reduction(df)

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Prov saved')
def main(opt):
    """Run the German credit preprocessing steps with explicit provenance calls.

    Loads ``real_world_pipeline/Datasets/german.csv`` and, after each
    preprocessing operation, asks the provenance object to record it:
    decoding of the coded categorical values, derivation of ``status`` and
    ``gender`` from ``personal_status``, removal of ``personal_status`` and
    one-hot encoding of the categorical columns (one provenance call per
    encoded column).

    :param opt: truthy selects ``pr.Provenance`` writing to
        ``prov_results/German``; falsy selects ``pr_lib.Provenance`` writing
        to the ``FP`` sub-folder of that directory.
    """
    def _log(message):
        # Timestamped progress line.
        print('[' + time.strftime("%d/%m-%H:%M:%S") + '] ' + message)

    input_path = 'real_world_pipeline/Datasets/german.csv'

    # Folder that receives the generated provenance files.
    savepath = os.path.join('prov_results', 'German')

    df = pd.read_csv(input_path, header=0)

    _log('Initialization')
    # Build the provenance document with the selected backend.
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)

    # OPERATION 0
    # Per-column translation tables from the coded values to readable ones.
    code_maps = {
        'checking': {
            'A11': 'check_low', 'A12': 'check_mid', 'A13': 'check_high',
            'A14': 'check_none',
        },
        'credit_history': {
            'A30': 'debt_none', 'A31': 'debt_noneBank',
            'A32': 'debt_onSchedule', 'A33': 'debt_delay',
            'A34': 'debt_critical',
        },
        'purpose': {
            'A40': 'pur_newCar', 'A41': 'pur_usedCar',
            'A42': 'pur_furniture', 'A43': 'pur_tv',
            'A44': 'pur_appliance', 'A45': 'pur_repairs',
            'A46': 'pur_education', 'A47': 'pur_vacation',
            'A48': 'pur_retraining', 'A49': 'pur_business',
            'A410': 'pur_other',
        },
        'savings': {
            'A61': 'sav_small', 'A62': 'sav_medium', 'A63': 'sav_large',
            'A64': 'sav_xlarge', 'A65': 'sav_none',
        },
        'employment': {
            'A71': 'emp_unemployed', 'A72': 'emp_lessOne',
            'A73': 'emp_lessFour', 'A74': 'emp_lessSeven',
            'A75': 'emp_moreSeven',
        },
        'other_debtors': {
            'A101': 'debtor_none', 'A102': 'debtor_coApp',
            'A103': 'debtor_guarantor',
        },
        'property': {
            'A121': 'prop_realEstate', 'A122': 'prop_agreement',
            'A123': 'prop_car', 'A124': 'prop_none',
        },
        'other_inst': {
            'A141': 'oi_bank', 'A142': 'oi_stores', 'A143': 'oi_none',
        },
        'housing': {
            'A151': 'hous_rent', 'A152': 'hous_own', 'A153': 'hous_free',
        },
        'job': {
            'A171': 'job_unskilledNR', 'A172': 'job_unskilledR',
            'A173': 'job_skilled', 'A174': 'job_highSkill',
        },
        'phone': {'A191': 0, 'A192': 1},
        'foreigner': {'A201': 1, 'A202': 0},
        'label': {2: 0},
    }
    df = df.replace(code_maps)
    decoded_columns = ['checking', 'credit_history', 'purpose', 'savings',
                       'employment', 'other_debtors', 'property',
                       'other_inst', 'housing', 'job', 'phone', 'foreigner',
                       'label']

    p.get_prov_feature_transformation(df, decoded_columns)

    # OPERATION 1
    # Marital status: A91/A92 -> divorced, A93/A95 -> single, rest -> married.
    personal = df.personal_status
    df['status'] = np.where(
        personal.isin(['A91', 'A92']), 'divorced',
        np.where(personal.isin(['A93', 'A95']), 'single', 'married'))

    # Gender flag: A92 and A95 -> 0, every other code -> 1.
    df['gender'] = np.where(df.personal_status.isin(['A92', 'A95']), 0, 1)

    p.get_prov_space_transformation(df, ['personal_status'])

    # OPERATION 2
    # The source column is no longer needed once both features exist.
    df = df.drop(['personal_status'], axis=1)

    p.get_prov_dim_reduction(df)

    # OPERATION 3-13
    # One-hot encode the categorical columns, one provenance record each.
    categorical = ['checking', 'credit_history', 'purpose', 'savings',
                   'employment', 'other_debtors', 'property',
                   'other_inst', 'housing', 'job', 'status']
    for name in categorical:
        indicators = pd.get_dummies(df[name])
        df = pd.concat((df, indicators), axis=1).drop([name], axis=1)
        p.get_prov_space_transformation(df, [name])

    _log('Prov saved')