def agglomerative(normal, anormal, connectivity, n_clusters=2, linkage='ward'):
    """Cluster the combined claim data with agglomerative clustering and
    score how well the clusters separate fraudulent claims.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    connectivity : array-like or callable
        Connectivity matrix forwarded to AgglomerativeClustering.
    n_clusters : int, default 2
        Number of clusters to form.
    linkage : str, default 'ward'
        Linkage criterion.

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative) where the scores come from
        fs.fraud_score and `comparative` holds 'id_siniestro', 'FRAUDE',
        'Clusters' and the derived binary 'FRAUDE_Clusters' flag.
    """
    X = pd.concat([normal, anormal], axis=0)

    # Keep identifier and ground-truth label aside; they are not features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']

    db = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage,
                                 connectivity=connectivity).fit(X)

    # Fix: the original assigned `labels = db.labels_` twice; once is enough.
    labels = db.labels_
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    print(len(np.unique(labels)))  # number of distinct clusters found

    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    # Flag a claim when it is labelled fraudulent or falls into a cluster
    # that fraud_score identified as fraud-dominated.
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters), 'FRAUDE_Clusters'] = 1

    return f1, f2, fscore, comparative
def isolation_forest(normal, anormal, contamination=0.1, n_estimators=50):
    """Detect anomalous claims with an IsolationForest and score how well
    the inlier/outlier split separates fraudulent claims.

    Both `normal` and `anormal` must contain 'id_siniestro' and 'FRAUDE'
    columns; these are removed before fitting. Returns
    (f1, f2, fscore, comparative) where `comparative` carries the predicted
    'Clusters' label and the binary 'FRAUDE_Clusters' flag.
    """
    X = pd.concat([normal, anormal], axis=0)

    # Identifier and ground truth stay out of the feature matrix.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    for column in ('FRAUDE', 'id_siniestro'):
        del X[column]

    forest = IsolationForest(n_estimators=n_estimators,
                             max_samples=X.shape[0],
                             bootstrap=True,
                             verbose=1,
                             random_state=42,
                             contamination=contamination)
    forest.fit(X)

    predicted = forest.predict(X)
    labels_df = pd.DataFrame(predicted, index=X.index, columns=['Clusters'])

    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    # Flagged when labelled fraudulent or placed in a fraud-dominated group.
    is_fraud = comparative['FRAUDE'] == 1
    in_fraud_cluster = comparative['Clusters'].isin(df_clusters)
    comparative.loc[is_fraud, 'FRAUDE_Clusters'] = 1
    comparative.loc[in_fraud_cluster, 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
def super_vector(normal, anormal, nu=0.1, gamma=0.1):
    """Run one-class SVM novelty detection over the combined claim data and
    score how well the inlier/outlier split separates fraudulent claims.

    Both inputs must contain 'id_siniestro' and 'FRAUDE' columns, which are
    stripped from the feature matrix before fitting. Returns
    (f1, f2, fscore, comparative) as produced by fs.fraud_score plus the
    per-claim comparison frame with the binary 'FRAUDE_Clusters' flag.
    """
    X = pd.concat([normal, anormal], axis=0)

    # Identifier and ground truth are bookkeeping, not features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']

    # NOTE(review): sklearn ignores `gamma` for a linear kernel — confirm
    # whether an RBF kernel was intended here.
    classifier = svm.OneClassSVM(nu=nu, kernel="linear", gamma=gamma)
    classifier.fit(X)

    predictions = classifier.predict(X)
    labels_df = pd.DataFrame(predictions, index=X.index, columns=['Clusters'])

    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    # Flagged when labelled fraudulent or placed in a fraud-dominated group.
    flagged = (comparative['FRAUDE'] == 1) | comparative['Clusters'].isin(df_clusters)
    comparative.loc[flagged, 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
# Beispiel #4
def dbscan(normal, anormal, eps=0.3, min_samples=175, leaf_size=30):
    """Cluster the combined claim data with DBSCAN and score how well the
    clusters separate fraudulent claims.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    eps : float, default 0.3
        Neighbourhood radius.
    min_samples : int, default 175
        Minimum neighbours for a core point.
    leaf_size : int, default 30
        Tree leaf size passed through to the neighbour search.

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative), with `comparative` holding the
        'Clusters' label and the binary 'FRAUDE_Clusters' flag per claim.
    """
    X = pd.concat([normal, anormal], axis=0)

    # Keep identifier and ground-truth label aside; they are not features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']

    db = DBSCAN(eps=eps,
                min_samples=min_samples,
                leaf_size=leaf_size,
                n_jobs=-1).fit(X)

    # Fix: the original assigned `labels = db.labels_` twice; once is enough.
    labels = db.labels_
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    print(len(np.unique(labels)))  # distinct labels incl. noise cluster (-1)

    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    # Flag a claim when it is labelled fraudulent or falls into a cluster
    # that fraud_score identified as fraud-dominated.
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters),
                    'FRAUDE_Clusters'] = 1

    return f1, f2, fscore, comparative
# Beispiel #5
def hdbscan(normal, anormal, min_cluster_size=15, min_samples=5, csm='eom'):
    """Cluster the combined claim data with HDBSCAN and score how well the
    clusters separate fraudulent claims.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    min_cluster_size : int, default 15
        Minimum size of a cluster.
    min_samples : int, default 5
        Conservativeness of the clustering (core-distance neighbours).
    csm : str, default 'eom'
        Cluster selection method.

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative), consistent with the sibling
        clustering helpers in this module.
    """
    X = pd.concat([normal, anormal], axis=0)

    # Fix: the original kept only 'FRAUDE' in X_fraude and never removed
    # 'id_siniestro' from X, so the identifier leaked into the features and
    # the later drop(['id_siniestro']) on `comparative` raised KeyError.
    # Now matches every other clustering helper in this module.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']

    db = HDBSCAN(min_cluster_size=min_cluster_size,
                 min_samples=min_samples,
                 cluster_selection_method=csm,
                 allow_single_cluster=True,
                 core_dist_n_jobs=-1).fit(X)

    labels = db.labels_
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    print(len(np.unique(labels)))  # distinct labels incl. noise cluster (-1)

    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    # Flag a claim when it is labelled fraudulent or falls into a cluster
    # that fraud_score identified as fraud-dominated.
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters),
                    'FRAUDE_Clusters'] = 1

    return f1, f2, fscore, comparative
# Beispiel #6
def mean_shift(normal, anormal, quantile=0.5):
    """Cluster the combined claim data with MeanShift (bandwidth estimated
    from the data) and score how well the clusters separate fraud.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    quantile : float, default 0.5
        Quantile used by estimate_bandwidth.

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative), consistent with the sibling
        clustering helpers in this module.
    """
    X = pd.concat([normal, anormal], axis=0)

    # Keep identifier and ground-truth label aside; they are not features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']

    bandwidth = estimate_bandwidth(X.values, quantile=quantile, random_state=42)

    db = MeanShift(bandwidth=bandwidth,
                   bin_seeding=True,
                   cluster_all=False,
                   min_bin_freq=50,
                   n_jobs=-1).fit(X)

    # Fix: the original assigned `labels = db.labels_` twice and computed
    # unused `n_clusters_` / `cluster_centers` locals; dead code removed.
    labels = db.labels_
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])

    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    # Flag a claim when it is labelled fraudulent or falls into a cluster
    # that fraud_score identified as fraud-dominated.
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters),
                    'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
# Beispiel #7
def mini_batch_kmeans(normal,
                      anormal,
                      n_clusters=2,
                      max_iter=100,
                      batch_size=100):
    """Cluster the combined claim data with MiniBatchKMeans and score how
    well the clusters separate fraudulent claims.

    Both inputs must contain 'id_siniestro' and 'FRAUDE' columns, which are
    stripped from the feature matrix before fitting. Returns
    (f1, f2, fscore, comparative) where `comparative` carries the assigned
    'Clusters' label and the binary 'FRAUDE_Clusters' flag per claim.
    """
    X = pd.concat([normal, anormal], axis=0)
    # Identifier and ground truth are bookkeeping, not features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    for column in ('FRAUDE', 'id_siniestro'):
        del X[column]

    kmeans = MiniBatchKMeans(n_clusters=n_clusters,
                             max_iter=max_iter,
                             batch_size=batch_size,
                             random_state=541)
    kmeans.fit(X)
    assignments = kmeans.predict(X)
    labels_df = pd.DataFrame(assignments, index=X.index, columns=['Clusters'])

    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    # Flagged when labelled fraudulent or placed in a fraud-dominated group.
    is_fraud = comparative['FRAUDE'] == 1
    in_fraud_cluster = comparative['Clusters'].isin(df_clusters)
    comparative.loc[is_fraud, 'FRAUDE_Clusters'] = 1
    comparative.loc[in_fraud_cluster, 'FRAUDE_Clusters'] = 1
    return f1, f2, fscore, comparative
# Beispiel #8
def gaussian_mixture(normal,
                     anormal,
                     n_components,
                     cov,
                     tol,
                     max_iter,
                     reg_covar,
                     type='Gaussian'):
    """Cluster the combined claim data with a (Bayesian) Gaussian mixture
    and score how well the components separate fraudulent claims.

    Parameters
    ----------
    normal, anormal : pd.DataFrame
        Feature frames; both must contain 'id_siniestro' and 'FRAUDE'.
    n_components : int
        Number of mixture components.
    cov : str
        Covariance type ('full', 'tied', 'diag', 'spherical').
    tol, max_iter, reg_covar
        EM convergence tolerance, iteration cap, and covariance
        regularisation (reg_covar applies to the Gaussian variant only).
    type : str, default 'Gaussian'
        'Gaussian' or 'Bayesian'. (Name kept for caller compatibility even
        though it shadows the builtin.)

    Returns
    -------
    tuple
        (f1, f2, fscore, comparative), consistent with the sibling
        clustering helpers in this module.

    Raises
    ------
    ValueError
        If `type` is neither 'Gaussian' nor 'Bayesian'.
    """
    X = pd.concat([normal, anormal], axis=0)

    # Keep identifier and ground-truth label aside; they are not features.
    X_fraude = X[['id_siniestro', 'FRAUDE']]
    del X['FRAUDE']
    del X['id_siniestro']

    if type == 'Gaussian':
        db = mixture.GaussianMixture(n_components=n_components,
                                     covariance_type=cov,
                                     tol=tol,
                                     max_iter=max_iter,
                                     init_params='kmeans',
                                     random_state=541,
                                     reg_covar=reg_covar).fit(X)
    elif type == 'Bayesian':
        db = mixture.BayesianGaussianMixture(n_components=n_components,
                                             covariance_type=cov,
                                             tol=tol,
                                             max_iter=max_iter,
                                             init_params='kmeans',
                                             random_state=541).fit(X)
    else:
        # Fix: the original fell through with `db` undefined and raised a
        # confusing NameError for any other value; fail fast instead.
        raise ValueError("type must be 'Gaussian' or 'Bayesian', got %r" % (type,))

    labels = db.predict(X)
    labels_df = pd.DataFrame(labels, index=X.index, columns=['Clusters'])
    print(len(np.unique(labels)))  # number of non-empty components

    comparative = pd.concat([X_fraude, labels_df], axis=1)
    f1, f2, fscore, df_clusters = fs.fraud_score(
        comparative.drop(['id_siniestro'], axis=1), 'FRAUDE', 'Clusters')
    # Flag a claim when it is labelled fraudulent or falls into a cluster
    # that fraud_score identified as fraud-dominated.
    comparative['FRAUDE_Clusters'] = pd.Series(0, index=comparative.index)
    comparative['FRAUDE'] = comparative['FRAUDE'].map(int)
    comparative.loc[comparative['FRAUDE'] == 1, 'FRAUDE_Clusters'] = 1
    comparative.loc[comparative['Clusters'].isin(df_clusters),
                    'FRAUDE_Clusters'] = 1

    return f1, f2, fscore, comparative