dct_params=dct_params, path_folder=path_folder, file_template=file_template, plot_fig=True) # ============================================================================= # Obtain General Model for the rest of the experiments # ============================================================================= # Does not matter 'clustering_algorithm' or 'method' CLUSTER_ALGORITHM = "kmeans" METHOD = "discard" clf, sc, _, df_anomalies = ocsvm_rule_extractor( dataset_mat=df_mat, numerical_cols=numerical_cols, categorical_cols=categorical_cols, clustering_algorithm=CLUSTER_ALGORITHM, method=METHOD, use_inverse=False, path_save_model=path_folder, dct_params=dct_params) # ============================================================================= # Surrogate Decision Tree # ============================================================================= #### Obtain Rules df_rules_inliers, df_rules_outliers, df_no_pruned, df_yes_pruned = surrogate_dt_rules( df_anomalies, clf, numerical_cols, categorical_cols, path=path_folder,
def _fit_ocsvm_and_save(df_mat, numerical_cols, categorical_cols,
                        cluster_algorithm, method, use_inverse, dct_params,
                        path_folder, file_name, store_intermediate, label):
    """Fit the OCSVM rule extractor and persist its rules/anomalies as CSV.

    Returns ``(clf, sc, df_all, df_anomalies)`` where ``df_all`` holds the
    extracted rules and ``df_anomalies`` the scored datapoints.  ``label``
    only affects the progress message ("inliers"/"outliers").
    """
    clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(
        dataset_mat=df_mat,
        numerical_cols=numerical_cols,
        categorical_cols=categorical_cols,
        clustering_algorithm=cluster_algorithm,
        method=method,
        use_inverse=use_inverse,
        dct_params=dct_params,
        store_intermediate=store_intermediate,
        path_save_model=path_folder)
    df_all = df_result
    df_no = df_anomalies[df_anomalies['predictions'] == 1].drop_duplicates()
    print("Max different values ({0}) : {1} | Rules extracted {2}".format(
        label, len(df_no), len(df_all)))
    print("Saving rules...")
    df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv',
                  index=False)
    df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name + '.csv',
                        index=False)
    return clf, sc, df_all, df_anomalies


def _load_or_fit_ocsvm(df_mat, numerical_cols, categorical_cols,
                       cluster_algorithm, method, use_inverse, dct_params,
                       path_folder, file_name, store_intermediate, label):
    """Reload previously stored rules/model when allowed, else refit.

    Returns ``(clf, sc, df_all, df_anomalies)``.
    """
    if not store_intermediate:
        print("Fitting OCSVM model...")
        return _fit_ocsvm_and_save(
            df_mat, numerical_cols, categorical_cols, cluster_algorithm,
            method, use_inverse, dct_params, path_folder, file_name,
            store_intermediate, label)
    try:
        df_all = pd.read_csv(path_folder + '/df_rules_' + file_name + '.csv')
        df_anomalies = pd.read_csv(path_folder + '/df_anomalies_' + file_name
                                   + '.csv')
        # Context managers close the pickle handles (the original
        # `pickle.load(open(...))` leaked them).
        with open("{0}/backup.p".format(path_folder), "rb") as f_in:
            clf = pickle.load(f_in)
        with open("{0}/sc.p".format(path_folder), "rb") as f_in:
            sc = pickle.load(f_in)
        return clf, sc, df_all, df_anomalies
    except FileNotFoundError:
        # Narrowed from a bare `except:` -- only a missing artifact should
        # trigger a silent refit.
        print("File not found! Fitting OCSVM model...")
        return _fit_ocsvm_and_save(
            df_mat, numerical_cols, categorical_cols, cluster_algorithm,
            method, use_inverse, dct_params, path_folder, file_name,
            store_intermediate, label)


def ocsvm_rules_experiments_pipeline(df_mat, numerical_cols, categorical_cols,
                                     cluster_algorithm, method, rules_used,
                                     dct_params, path_folder, file_template,
                                     store_intermediate=False,
                                     plot_fig=False):
    """Run the OCSVM rule-extraction experiment pipeline.

    For inliers and/or outliers this: (1) fits the OCSVM-based rule
    extractor (or reloads stored artifacts when ``store_intermediate`` is
    True), (2) completes the extracted hypercube rules against the scored
    datapoints, (3) keeps only "pure" rules (precision 1), (4) computes
    stability metrics, and (5) stores every intermediate result as CSV in
    ``path_folder``, optionally plotting the rules in 2D.

    Parameters
    ----------
    df_mat : pandas.DataFrame
        Input dataset.
    numerical_cols : list of str
        Numerical feature column names.
    categorical_cols : list of str
        Categorical feature column names.
    cluster_algorithm : str
        Clustering algorithm used by the rule extractor
        (e.g. "kmeans", "kprototypes").
    method : str
        Rule-extraction method forwarded to ``ocsvm_rule_extractor``.
    rules_used : str
        One of "all", "inliers" or "outliers".
    dct_params : dict
        Hyperparameters for the OCSVM model.
    path_folder : str
        Folder where models, rules and plots are written.
    file_template : str
        Template used by ``file_naming_ocsvm`` to build output file names.
    store_intermediate : bool, optional
        Reuse stored rules/models when available instead of refitting.
        Default False.
    plot_fig : bool, optional
        Plot the obtained rules. Default False.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``rules_used`` is not one of the accepted values.
    """
    # Fail fast on a bad mode instead of silently doing nothing.
    if rules_used not in ("all", "inliers", "outliers"):
        raise ValueError(
            "Argument {0} not found -- use ['all', 'outliers' or 'inliers']"
            " instead".format(rules_used))

    print("Beginning process...")

    if rules_used in ("all", "inliers"):
        print("\n\n")
        print("*" * 100)
        print("Obtaining Rules for Inliers...")
        print("*" * 100)
        use_inverse = False
        file_name = file_naming_ocsvm(file_template=file_template,
                                      cluster_algorithm=cluster_algorithm,
                                      method=method,
                                      use_inverse=use_inverse)

        #### Obtain Rules [Inliers]
        clf, sc, df_all, df_anomalies = _load_or_fit_ocsvm(
            df_mat, numerical_cols, categorical_cols, cluster_algorithm,
            method, use_inverse, dct_params, path_folder, file_name,
            store_intermediate, label="inliers")

        # With kprototypes the categorical columns are already part of the
        # hypercube bounds, so treat them as plain features downstream.
        if cluster_algorithm == "kprototypes":
            feature_cols = list(set(numerical_cols + categorical_cols))
            cat_additional = []
        else:
            feature_cols = numerical_cols
            cat_additional = categorical_cols

        # Complete rules, reusing the pruned file when it already exists.
        print("Checking inliers inside hypercubes...")
        try:
            df_rules = pd.read_csv(
                "{path}/{file_name}_rules_{type_r}_pruned_ocsvm.csv".format(
                    path=path_folder, file_name=file_name, type_r="inliers"))
        except FileNotFoundError:
            df_rules = ocsvm_rules_completion(
                df_anomalies, df_all, feature_cols, cat_additional,
                inliers_used=True, clustering_algorithm=cluster_algorithm,
                path=path_folder, file_name=file_name)

        # Keep only pure rules (no outliers inside the hypercube).
        df_rules = df_rules[df_rules["n_outliers_included"] == 0]
        print("Obtaining metrics...")
        df_rules = check_stability(df_anomalies, df_rules, clf, feature_cols,
                                   cat_additional, using_inliers=True)

        print("Saving rules...")
        df_rules.to_csv(path_folder + '/df_rules_complete_' + file_name
                        + '.csv', index=False)

        if plot_fig:
            #### Plot Rules [Inliers]
            print("Plotting rules for inliers...")
            df_rules = df_rules.drop_duplicates().reset_index(drop=True)
            plot_2D(df_rules, df_anomalies, folder=path_folder,
                    path_name=file_name)

    if rules_used in ("all", "outliers"):
        print("\n\n")
        print("*" * 100)
        print("Obtaining Rules for Outliers...")
        print("*" * 100)
        #### Obtain Rules [Outliers]
        use_inverse = True
        file_name = file_naming_ocsvm(file_template=file_template,
                                      cluster_algorithm=cluster_algorithm,
                                      method=method,
                                      use_inverse=use_inverse)

        clf, sc, df_all, df_anomalies = _load_or_fit_ocsvm(
            df_mat, numerical_cols, categorical_cols, cluster_algorithm,
            method, use_inverse, dct_params, path_folder, file_name,
            store_intermediate, label="outliers")

        if cluster_algorithm == "kprototypes":
            feature_cols = list(set(numerical_cols + categorical_cols))
            cat_additional = []
        else:
            feature_cols = numerical_cols
            cat_additional = categorical_cols

        print("Checking outliers inside hypercubes...")
        # NOTE(review): flipping presumably undoes the `use_inverse` fit so
        # outliers become the positive class -- confirm against
        # ocsvm_rule_extractor.
        df_anomalies['predictions'] = df_anomalies['predictions'] * -1
        df_anomalies['distances'] = df_anomalies['distances'] * -1

        try:
            df_rules = pd.read_csv(
                "{path}/{file_name}_rules_{type_r}_pruned_ocsvm.csv".format(
                    path=path_folder, file_name=file_name, type_r="outliers"))
        except FileNotFoundError:
            df_rules = ocsvm_rules_completion(
                df_anomalies, df_all, feature_cols, cat_additional,
                inliers_used=False, clustering_algorithm=cluster_algorithm,
                path=path_folder, file_name=file_name)

        # Keep only pure rules (no inliers inside the hypercube).
        df_rules = df_rules[df_rules["n_inliers_included"] == 0]
        print("Obtaining metrics...")
        df_rules = check_stability(df_anomalies, df_rules, clf, feature_cols,
                                   cat_additional, using_inliers=False)

        print("Saving rules...")
        df_rules.to_csv(path_folder + '/df_rules_complete_' + file_name
                        + '.csv', index=False)

        if plot_fig:
            #### Plot Rules [Outliers]
            print("Plotting rules for outliers...")
            df_rules = df_rules.drop_duplicates().reset_index(drop=True)
            plot_2D(df_rules, df_anomalies, folder=path_folder,
                    path_name=file_name)
def _ocsvm_model_and_rules(df_mat, numerical_cols, categorical_cols,
                           cluster_algorithm, method, use_inverse, dct_params,
                           path_folder, file_name, store_intermediate, label):
    """Fit the OCSVM rule extractor, or reload stored artifacts when allowed.

    Returns ``(clf, sc, df_all, df_anomalies)``; ``label`` only affects the
    progress message ("inliers"/"outliers").
    """
    def _fit():
        clf, sc, df_result, df_anomalies = ocsvm_rule_extractor(
            dataset_mat=df_mat,
            numerical_cols=numerical_cols,
            categorical_cols=categorical_cols,
            clustering_algorithm=cluster_algorithm,
            method=method,
            use_inverse=use_inverse,
            dct_params=dct_params,
            store_intermediate=store_intermediate,
            path_save_model=path_folder)
        df_all = df_result
        df_no = df_anomalies[
            df_anomalies['predictions'] == 1].drop_duplicates()
        print("Max different values ({0}) : {1} | Rules extracted {2}".format(
            label, len(df_no), len(df_all)))
        print("Saving rules...")
        df_all.to_csv(path_folder + '/df_rules_' + file_name + '.csv',
                      index=False)
        df_anomalies.to_csv(path_folder + '/df_anomalies_' + file_name
                            + '.csv', index=False)
        return clf, sc, df_all, df_anomalies

    if not store_intermediate:
        print("Fitting OCSVM model...")
        return _fit()
    try:
        df_all = pd.read_csv(path_folder + '/df_rules_' + file_name + '.csv')
        df_anomalies = pd.read_csv(path_folder + '/df_anomalies_' + file_name
                                   + '.csv')
        # Context managers close the pickle handles (the original
        # `pickle.load(open(...))` leaked them).
        with open("{0}/backup.p".format(path_folder), "rb") as f_in:
            clf = pickle.load(f_in)
        with open("{0}/sc.p".format(path_folder), "rb") as f_in:
            sc = pickle.load(f_in)
        return clf, sc, df_all, df_anomalies
    except FileNotFoundError:
        # Narrowed from a bare `except:` -- only a missing artifact should
        # trigger a silent refit.
        print("File not found! Fitting OCSVM model...")
        return _fit()


def _count_points_inside_rules(df_anomalies, df_rules, feature_cols,
                               cat_additional):
    """Fill ``n_inliers_included``/``n_outliers_included`` on each rule.

    Each counter is the number of datapoints of that class found inside the
    rule's hypercube (via ``check_datapoint_inside_only``).
    """
    df_rules['n_inliers_included'] = 0
    df_rules['n_outliers_included'] = 0
    for target, col, label in ((1, 'n_inliers_included', 'inliers'),
                               (-1, 'n_outliers_included', 'outliers')):
        print("Checking {0} inside rules...".format(label))
        checks = Parallel(n_jobs=N_JOBS)(
            delayed(check_datapoint_inside_only)(
                data_point, df_rules, feature_cols, cat_additional)
            for _, data_point in
            df_anomalies[df_anomalies['predictions'] == target].iterrows())
        df_check = pd.concat([x[x['check'] > 0] for x in checks])
        df_check = pd.DataFrame(
            df_check.groupby(df_check.index).sum()).reset_index()
        # Outer-merge on the 'index' column against df_rules' own index so
        # rules containing no point keep a count of 0.  (Only 'index' is a
        # shared merge key; the counter column is just along for the ride.)
        df_temp = df_rules[[col]].reset_index()
        df_check = df_temp.merge(df_check, how="outer")[['check']].fillna(0)
        df_rules[col] = df_check
    return df_rules


def _precision_coverage_counts(df_anomalies, df_rules, feature_cols,
                               cat_additional, inliers_used):
    """Count datapoints covered by any rule (p0) and by a precision-1 rule (p1).

    Returns a pandas Series with keys ``n_inliers_p0/p1`` and
    ``n_outliers_p0/p1``; only the pair matching ``inliers_used`` can be
    non-zero.
    """
    def _check_one(data_point):
        df_rules['check'] = check_datapoint_inside(
            data_point, df_rules, feature_cols, cat_additional)['check']
        counts = {'n_inliers_p0': 0, 'n_inliers_p1': 0,
                  'n_outliers_p0': 0, 'n_outliers_p1': 0}
        if inliers_used:
            if data_point['predictions'] == 1:
                # Covered by at least one rule of any precision.
                if len(df_rules[df_rules['check'] == 1]) > 0:
                    counts['n_inliers_p0'] = 1
                # Covered by at least one rule with Precision = 1.
                pure = df_rules[(df_rules['n_outliers_included'] == 0)
                                & (df_rules['check'] == 1)]
                if len(pure) > 0:
                    counts['n_inliers_p1'] = 1
        elif data_point['predictions'] == -1:
            if len(df_rules[df_rules['check'] == 1]) > 0:
                counts['n_outliers_p0'] = 1
            pure = df_rules[(df_rules['n_inliers_included'] == 0)
                            & (df_rules['check'] == 1)]
            if len(pure) > 0:
                counts['n_outliers_p1'] = 1
        return counts

    dct_out = Parallel(n_jobs=N_JOBS)(
        delayed(_check_one)(data_point)
        for _, data_point in df_anomalies.iterrows())
    return pd.DataFrame(dct_out).sum()


def _complete_and_save_rules(df_anomalies, df_rules, feature_cols,
                             cat_additional, inliers_used, path_folder,
                             file_name):
    """Annotate rules with coverage/precision counters and store them.

    Writes ``{file_name}_rules_{inliers|outliers}_pruned_ocsvm.csv`` under
    ``path_folder`` and returns the annotated ``df_rules``.
    """
    df_rules = _count_points_inside_rules(df_anomalies, df_rules,
                                          feature_cols, cat_additional)

    print("Checking inliers/outliers inside hypercubes with Precision=1...")
    n_inliers = len(df_anomalies[df_anomalies['predictions'] == 1])
    n_outliers = len(df_anomalies[df_anomalies['predictions'] == -1])
    df_out = _precision_coverage_counts(df_anomalies, df_rules, feature_cols,
                                        cat_additional, inliers_used)

    if inliers_used:
        df_rules['n_inliers'] = n_inliers
        df_rules['n_inliers_p0'] = df_out['n_inliers_p0']
        df_rules['n_inliers_p1'] = df_out['n_inliers_p1']
        path_aux = "inliers"
    else:
        df_rules['n_outliers_p1'] = df_out['n_outliers_p1']
        df_rules['n_outliers_p0'] = df_out['n_outliers_p0']
        df_rules['n_outliers'] = n_outliers
        path_aux = "outliers"

    # 'check' is only present when a shared-memory Parallel backend mutated
    # df_rules in place.
    try:
        del df_rules['check']
    except KeyError:
        pass

    # Save to CSV
    df_rules.to_csv(
        "{path}/{file_name}_rules_{type_r}_pruned_ocsvm.csv".format(
            path=path_folder, file_name=file_name, type_r=path_aux),
        index=False)
    return df_rules


def ocsvm_rules_experiments_pipeline(df_mat, numerical_cols, categorical_cols,
                                     cluster_algorithm, method, rules_used,
                                     dct_params, path_folder, file_template,
                                     store_intermediate=False,
                                     plot_fig=False):
    """Run the OCSVM rule-extraction experiment pipeline (inlined completion).

    For inliers and/or outliers this: (1) fits the OCSVM-based rule
    extractor (or reloads stored artifacts when ``store_intermediate`` is
    True), (2) counts, per rule, the inliers/outliers inside its hypercube
    and the datapoints covered at precision 1, (3) keeps only "pure" rules,
    (4) computes overlap and stability metrics, and (5) stores every
    intermediate result as CSV in ``path_folder``, optionally plotting the
    rules in 2D.

    Parameters
    ----------
    df_mat : pandas.DataFrame
        Input dataset.
    numerical_cols : list of str
        Numerical feature column names.
    categorical_cols : list of str
        Categorical feature column names.
    cluster_algorithm : str
        Clustering algorithm used by the rule extractor
        (e.g. "kmeans", "kprototypes").
    method : str
        Rule-extraction method forwarded to ``ocsvm_rule_extractor``.
    rules_used : str
        One of "all", "inliers" or "outliers".
    dct_params : dict
        Hyperparameters for the OCSVM model.
    path_folder : str
        Folder where models, rules and plots are written.
    file_template : str
        Template used by ``file_naming_ocsvm`` to build output file names.
    store_intermediate : bool, optional
        Reuse stored rules/models when available instead of refitting.
        Default False.
    plot_fig : bool, optional
        Plot the obtained rules. Default False.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``rules_used`` is not one of the accepted values.

    Notes
    -----
    The original attached its ``else: raise ValueError`` to the *outliers*
    branch, so ``rules_used == "inliers"`` processed the inliers and then
    raised anyway; validation now happens up front.  Two dead sequential
    loops that recomputed the parallel precision counts into never-read
    accumulators were removed.
    """
    if rules_used not in ("all", "inliers", "outliers"):
        raise ValueError(
            "Argument {0} not found -- use ['all', 'outliers' or 'inliers']"
            " instead".format(rules_used))

    print("Beginning process...")

    if rules_used in ("all", "inliers"):
        print("\n\n")
        print("*" * 100)
        print("Obtaining Rules for Inliers...")
        print("*" * 100)
        use_inverse = False
        file_name = file_naming_ocsvm(file_template=file_template,
                                      cluster_algorithm=cluster_algorithm,
                                      method=method,
                                      use_inverse=use_inverse)

        #### Obtain Rules [Inliers]
        clf, sc, df_all, df_anomalies = _ocsvm_model_and_rules(
            df_mat, numerical_cols, categorical_cols, cluster_algorithm,
            method, use_inverse, dct_params, path_folder, file_name,
            store_intermediate, label="inliers")

        # With kprototypes the categorical columns are already part of the
        # hypercube bounds, so treat them as plain features downstream.
        if cluster_algorithm == "kprototypes":
            feature_cols = list(set(numerical_cols + categorical_cols))
            cat_additional = []
        else:
            feature_cols = numerical_cols
            cat_additional = categorical_cols

        df_rules = _complete_and_save_rules(
            df_anomalies, df_all, feature_cols, cat_additional,
            inliers_used=True, path_folder=path_folder, file_name=file_name)

        # Keep only pure rules (no outliers inside the hypercube).
        df_rules = df_rules[df_rules["n_outliers_included"] == 0]
        print("Obtaining metrics...")
        df_rules = rule_overlapping_score(df_rules, df_anomalies,
                                          feature_cols, cat_additional)
        df_rules = check_stability(df_anomalies, df_rules, clf, feature_cols,
                                   cat_additional, using_inliers=True)

        print("Saving rules...")
        df_rules.to_csv(path_folder + '/df_rules_complete_' + file_name
                        + '.csv', index=False)

        if plot_fig:
            #### Plot Rules [Inliers]
            print("Plotting rules for inliers...")
            df_rules = df_rules.drop_duplicates().reset_index(drop=True)
            plot_2D(df_rules, df_anomalies, folder=path_folder,
                    path_name=file_name)

    if rules_used in ("all", "outliers"):
        print("\n\n")
        print("*" * 100)
        print("Obtaining Rules for Outliers...")
        print("*" * 100)
        #### Obtain Rules [Outliers]
        use_inverse = True
        file_name = file_naming_ocsvm(file_template=file_template,
                                      cluster_algorithm=cluster_algorithm,
                                      method=method,
                                      use_inverse=use_inverse)

        clf, sc, df_all, df_anomalies = _ocsvm_model_and_rules(
            df_mat, numerical_cols, categorical_cols, cluster_algorithm,
            method, use_inverse, dct_params, path_folder, file_name,
            store_intermediate, label="outliers")

        if cluster_algorithm == "kprototypes":
            feature_cols = list(set(numerical_cols + categorical_cols))
            cat_additional = []
        else:
            feature_cols = numerical_cols
            cat_additional = categorical_cols

        print("Checking outliers inside hypercubes...")
        # NOTE(review): flipping presumably undoes the `use_inverse` fit so
        # outliers become the positive class -- confirm against
        # ocsvm_rule_extractor.
        df_anomalies['predictions'] = df_anomalies['predictions'] * -1
        df_anomalies['distances'] = df_anomalies['distances'] * -1

        df_rules = _complete_and_save_rules(
            df_anomalies, df_all, feature_cols, cat_additional,
            inliers_used=False, path_folder=path_folder, file_name=file_name)

        # Keep only pure rules (no inliers inside the hypercube).
        df_rules = df_rules[df_rules["n_inliers_included"] == 0]
        print("Obtaining metrics...")
        df_rules = rule_overlapping_score(df_rules, df_anomalies,
                                          feature_cols, cat_additional)
        df_rules = check_stability(df_anomalies, df_rules, clf, feature_cols,
                                   cat_additional, using_inliers=False)

        print("Saving rules...")
        df_rules.to_csv(path_folder + '/df_rules_complete_' + file_name
                        + '.csv', index=False)

        if plot_fig:
            #### Plot Rules [Outliers]
            print("Plotting rules for outliers...")
            df_rules = df_rules.drop_duplicates().reset_index(drop=True)
            plot_2D(df_rules, df_anomalies, folder=path_folder,
                    path_name=file_name)