def agglomerative(normal, anormal, connectivity, n_clusters=2, linkage='ward'):
    """Cluster the combined claims with agglomerative clustering and score fraud.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Claim feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    connectivity : array-like or callable
        Connectivity constraint forwarded to AgglomerativeClustering.
    n_clusters : int, default 2
        Number of clusters to form.
    linkage : str, default 'ward'
        Linkage criterion.

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative) where `comparative` holds
        'id_siniestro', 'FRAUDE', 'Clusters' and the derived binary
        flag 'FRAUDE_Clusters'.
    """
    X = pd.concat([normal, anormal], axis=0)
    # Set identifier and label aside: they must not be clustering features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']
    db = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage,
                                 connectivity=connectivity).fit(X)
    labels = db.labels_  # fix: original assigned db.labels_ twice
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    n_clusters_ = len(np.unique(labels))
    print(n_clusters_)
    comparative = pd.concat([X_fraude, labels_df], axis=1)
    # fs.fraud_score returns two partial scores, the combined score and the
    # list of cluster ids flagged as fraudulent.
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    # A claim is flagged when it is a known fraud OR falls in a fraud cluster.
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters), 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
def isolation_forest(normal, anormal, contamination=0.1, n_estimators=50):
    """Fit an IsolationForest on the combined claims and score fraud.

    The -1/1 outlier predictions are treated as cluster labels and fed to
    fs.fraud_score; returns (f1, f2, fscore, comparative) where
    `comparative` carries the binary 'FRAUDE_Clusters' flag.
    """
    data = pd.concat([normal, anormal], axis=0)
    reference = data[['id_siniestro', 'FRAUDE']]
    del data['FRAUDE']
    del data['id_siniestro']

    forest = IsolationForest(n_estimators=n_estimators,
                             max_samples=data.shape[0],
                             bootstrap=True,
                             verbose=1,
                             random_state=42,
                             contamination=contamination)
    forest.fit(data)
    predictions = forest.predict(data)
    cluster_col = pd.DataFrame(predictions, index=data.index, columns=['Clusters'])
    n_clusters_ = len(np.unique(predictions))

    comparative = pd.concat([reference, cluster_col], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')

    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    known_fraud = comparative['FRAUDE'] == 1
    in_fraud_cluster = comparative['Clusters'].isin(df_clusters)
    comparative.loc[known_fraud, 'FRAUDE_Clusters'] = 1
    comparative.loc[in_fraud_cluster, 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
def super_vector(normal, anormal, nu=0.1, gamma=0.1):
    """Fit a one-class SVM on the combined claims and score fraud.

    The -1/1 predictions act as cluster labels for fs.fraud_score.
    NOTE(review): `gamma` has no effect with a linear kernel — presumably a
    leftover from an RBF experiment; confirm before relying on it.
    """
    data = pd.concat([normal, anormal], axis=0)
    reference = data[['id_siniestro', 'FRAUDE']]
    del data['FRAUDE']
    del data['id_siniestro']

    classifier = svm.OneClassSVM(nu=nu, kernel="linear", gamma=gamma)
    classifier.fit(data)
    predictions = classifier.predict(data)
    cluster_col = pd.DataFrame(predictions, index=data.index, columns=['Clusters'])
    n_clusters_ = len(np.unique(predictions))

    comparative = pd.concat([reference, cluster_col], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')

    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    known_fraud = comparative['FRAUDE'] == 1
    in_fraud_cluster = comparative['Clusters'].isin(df_clusters)
    comparative.loc[known_fraud, 'FRAUDE_Clusters'] = 1
    comparative.loc[in_fraud_cluster, 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
def dbscan(normal, anormal, eps=0.3, min_samples=175, leaf_size=30):
    """Cluster the combined claims with DBSCAN and score fraud.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Claim feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    eps : float, default 0.3
        Neighborhood radius.
    min_samples : int, default 175
        Minimum samples for a core point.
    leaf_size : int, default 30
        Tree leaf size passed to the neighbor search.

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative) where `comparative` holds
        'id_siniestro', 'FRAUDE', 'Clusters' and the derived
        'FRAUDE_Clusters' flag.
    """
    X = pd.concat([normal, anormal], axis=0)
    # Set identifier and label aside: they must not be clustering features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']
    db = DBSCAN(eps=eps, min_samples=min_samples, leaf_size=leaf_size,
                n_jobs=-1).fit(X)
    labels = db.labels_  # fix: original assigned db.labels_ twice
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    # Includes the noise label (-1) if present.
    n_clusters_ = len(np.unique(labels))
    print(n_clusters_)
    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    # A claim is flagged when it is a known fraud OR falls in a fraud cluster.
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters), 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
def hdbscan(normal, anormal, min_cluster_size=15, min_samples=5, csm='eom'):
    """Cluster the combined claims with HDBSCAN and score fraud.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Claim feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    min_cluster_size : int, default 15
        Smallest grouping HDBSCAN will consider a cluster.
    min_samples : int, default 5
        Conservativeness of the clustering.
    csm : str, default 'eom'
        Cluster selection method ('eom' or 'leaf').

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative) where `comparative` holds
        'id_siniestro', 'FRAUDE', 'Clusters' and the derived
        'FRAUDE_Clusters' flag.
    """
    X = pd.concat([normal, anormal], axis=0)
    # BUG FIX: the original kept only 'FRAUDE' aside, leaving 'id_siniestro'
    # in X (the claim id leaked into the clustering features) and making the
    # later drop(['id_siniestro']) raise KeyError. Match the sibling
    # functions: set both columns aside and cluster on features only.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']
    db = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples,
                 cluster_selection_method=csm, allow_single_cluster=True,
                 core_dist_n_jobs=-1).fit(X)
    labels = db.labels_  # fix: original assigned db.labels_ twice
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    # Includes the noise label (-1) if present.
    n_clusters_ = len(np.unique(labels))
    print(n_clusters_)
    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    # A claim is flagged when it is a known fraud OR falls in a fraud cluster.
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters), 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
def mean_shift(normal, anormal, quantile=0.5):
    """Cluster the combined claims with MeanShift and score fraud.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Claim feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    quantile : float, default 0.5
        Quantile used by estimate_bandwidth to pick the kernel bandwidth.

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative) where `comparative` holds
        'id_siniestro', 'FRAUDE', 'Clusters' and the derived
        'FRAUDE_Clusters' flag.
    """
    X = pd.concat([normal, anormal], axis=0)
    # Set identifier and label aside: they must not be clustering features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']
    bandwidth = estimate_bandwidth(X.values, quantile=quantile, random_state=42)
    # cluster_all=False leaves orphan points labelled -1.
    db = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False,
                   min_bin_freq=50, n_jobs=-1).fit(X)
    labels = db.labels_  # fix: original assigned db.labels_ twice
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    # (original also bound db.cluster_centers_ to an unused local — removed)
    n_clusters_ = len(np.unique(labels))
    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    # A claim is flagged when it is a known fraud OR falls in a fraud cluster.
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters), 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
def mini_batch_kmeans(normal, anormal, n_clusters=2, max_iter=100, batch_size=100):
    """Cluster the combined claims with MiniBatchKMeans and score fraud.

    Returns (f1, f2, fscore, comparative) where `comparative` carries the
    binary 'FRAUDE_Clusters' flag derived from known frauds and the
    clusters flagged by fs.fraud_score.
    """
    data = pd.concat([normal, anormal], axis=0)
    reference = data[['id_siniestro', 'FRAUDE']]
    del data['FRAUDE']
    del data['id_siniestro']

    model = MiniBatchKMeans(n_clusters=n_clusters,
                            max_iter=max_iter,
                            batch_size=batch_size,
                            random_state=541)
    model.fit(data)
    assignments = model.predict(data)
    cluster_col = pd.DataFrame(assignments, index=data.index, columns=['Clusters'])
    n_clusters_ = len(np.unique(assignments))

    comparative = pd.concat([reference, cluster_col], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')

    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    known_fraud = comparative['FRAUDE'] == 1
    in_fraud_cluster = comparative['Clusters'].isin(df_clusters)
    comparative.loc[known_fraud, 'FRAUDE_Clusters'] = 1
    comparative.loc[in_fraud_cluster, 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
def gaussian_mixture(normal, anormal, n_components, cov, tol, max_iter, reg_covar,
                     type='Gaussian'):
    """Cluster the combined claims with a (Bayesian) Gaussian mixture and score fraud.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Claim feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    n_components : int
        Number of mixture components.
    cov : str
        Covariance type ('full', 'tied', 'diag', 'spherical').
    tol : float
        EM convergence threshold.
    max_iter : int
        Maximum EM iterations.
    reg_covar : float
        Covariance regularization (Gaussian variant only).
    type : str, default 'Gaussian'
        'Gaussian' or 'Bayesian'. (Name shadows the builtin but is kept for
        backward compatibility with keyword callers.)

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative).

    Raises
    ------
    ValueError
        If `type` is neither 'Gaussian' nor 'Bayesian' (the original left
        `db` unbound and crashed with an opaque NameError).
    """
    X = pd.concat([normal, anormal], axis=0)
    # Set identifier and label aside: they must not be clustering features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']
    if type == 'Gaussian':
        db = mixture.GaussianMixture(n_components=n_components,
                                     covariance_type=cov, tol=tol,
                                     max_iter=max_iter, init_params='kmeans',
                                     random_state=541,
                                     reg_covar=reg_covar).fit(X)
    elif type == 'Bayesian':
        db = mixture.BayesianGaussianMixture(n_components=n_components,
                                             covariance_type=cov, tol=tol,
                                             max_iter=max_iter,
                                             init_params='kmeans',
                                             random_state=541).fit(X)
    else:
        raise ValueError("type must be 'Gaussian' or 'Bayesian', got %r" % (type,))
    labels = db.predict(X)
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    n_clusters_ = len(np.unique(labels))
    print(n_clusters_)
    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    # A claim is flagged when it is a known fraud OR falls in a fraud cluster.
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters), 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative