Example 1
def elbow(X, ax, k_min=1, k_max=10, random_state=42):
    """
    エルボー法の結果を可視化する。
    
    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training instances to cluster.
    ax : matplotlib.axes.Axes
        プロットを行なう Axes オブジェクト。これを更新する。
    k_min : int, default 1
        エルボー法を行なう、最小のクラスタ数。
    k_max : int, default 10
        エルボー法を行なう、最大のクラスタ数。
    random_state : int, RandomState instance, default 42
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.
    """

    # Swap the Euclidean distance used inside scikit-learn's k-means module
    # for a custom implementation (new_euclidean_distances is defined
    # elsewhere in this project); note the patch is not restored here
    _kmeans.euclidean_distances = new_euclidean_distances

    sse = []
    for k in range(k_min, k_max + 1):
        km = _kmeans.KMeans(n_clusters=k, random_state=random_state)
        km.fit(X)
        sse.append(km.inertia_)
    ax.plot(range(k_min, k_max + 1), sse, marker='o')
    ax.set_xlabel('number of clusters')
    ax.set_ylabel('SSE')
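
The function above assumes that _kmeans (scikit-learn's private sklearn.cluster._kmeans module) and new_euclidean_distances (a project-specific replacement distance; in Examples 2 and 3 it lives in the mk module) are already in scope. A minimal, self-contained sketch of how it could be wired up and called; the weighted distance below is purely hypothetical and the toy data is made up for illustration:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import _kmeans
from sklearn.metrics.pairwise import euclidean_distances


def new_euclidean_distances(X, Y=None, **kwargs):
    # Hypothetical custom distance: up-weight the last feature before
    # delegating to the stock pairwise Euclidean distance
    w = np.array([1.0, 1.0, 2.0])
    return euclidean_distances(X * w, None if Y is None else Y * w, **kwargs)


X = np.random.default_rng(0).normal(size=(200, 3))  # toy data
fig, ax = plt.subplots()
elbow(X, ax, k_min=1, k_max=10, random_state=42)
ax.set_title('Elbow method')
fig.tight_layout()
fig.savefig('elbow-method.pdf')

Patching the module-level name only affects the code paths in sklearn.cluster._kmeans that look up euclidean_distances by that name (in scikit-learn versions of this code's era, notably the k-means++ initialization). Also note that elbow never restores the original function, whereas Examples 2 and 3 restore it explicitly after fitting.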
Example 2
def main() -> None:
    SEED_VALUE = 42
    FILEPATH_INPUT = '../data/raw/sample_10k_ver2.csv'
    FILEPATH_OUTPUT = '../results/2020-06-24/ocsvm.csv'
    DIRPATH_SAVEFIG = '../results/2020-06-24/figures/'
    
    N_CLUSTERS = 5
    WIDTH_DETECT = '180D'  # Time width for anomaly detection
    WIDTH_STEP = '30D'  # Step width for anomaly detection
    PARAMS_OCSVM = {
        'kernel': 'rbf',
        'gamma': 0.1,
        'nu': 0.5,
        'contamination': 0.01,  # not a OneClassSVM parameter; presumably consumed by the detection wrapper
    }
    
    # Fix the random seed to ensure reproducibility
    gd.fix_random_seed(SEED_VALUE)
    
    df_orig = gd.load_input_csv(FILEPATH_INPUT,
                                usecols=['objectid', 'mjd', 'm_ap30'])
    # The raw objectid values are memory-hungry, so replace them with
    # sequential integers starting from 0 (mapped back before output)
    list_id = gd.get_unique_list(df_orig, 'objectid')
    df_orig['objectid'].replace(list_id, np.arange(len(list_id)), inplace=True)
    df_ididx_mjdcols = gd.get_ididx_mjdcols_dataframe(df_orig, df_orig)
    
    df_coord = gd.load_input_csv(FILEPATH_INPUT,
                                 usecols=['objectid', 'coord_ra', 'coord_dec'])
    df_coord.sort_values('objectid', inplace=True)
    df_coord.drop_duplicates(inplace=True)
    df_coord.reset_index(inplace=True, drop=True)
    
    # Build the k-means input: (coord_ra, coord_dec, per-object mean of the measurements)
    scaler = StandardScaler()
    X_kmeans = np.array([df_coord['coord_ra'].values,
                         df_coord['coord_dec'].values,
                         df_ididx_mjdcols.mean(axis=1).round(4).values]).T
    X_kmeans_std = scaler.fit_transform(X_kmeans)
    
    # Uncomment the following block to pick N_CLUSTERS via the elbow method
    # vis.set_rcParams()
    # fig = plt.figure()
    # ax = fig.add_subplot(1, 1, 1)
    # mk.elbow(X=X_kmeans_std, ax=ax, random_state=SEED_VALUE)
    # ax.set_title('Elbow method')
    # fig.tight_layout()
    # fig.savefig(DIRPATH_SAVEFIG + 'elbow-method.pdf')
    # return
    
    # Monkey-patch the k-means distance function with the project's custom one
    _kmeans.euclidean_distances = mk.new_euclidean_distances
    km = _kmeans.KMeans(n_clusters=N_CLUSTERS, random_state=SEED_VALUE)
    y_kmeans = km.fit_predict(X_kmeans_std)
    # Restore the original distance function
    _kmeans.euclidean_distances = euclidean_distances
    
    # Select the detection algorithm (here One-Class SVM) via monkey patching
    da.get_y_pred = execute_ocsvm
    
    # Detect anomalies cluster by cluster
    df_outlier = pd.DataFrame(columns=['objectid', 'mjd_st', 'mjd_en'])
    for cl in np.unique(y_kmeans):
        df_cl = df_ididx_mjdcols.iloc[y_kmeans == cl]
        df_cl_drop = df_cl.dropna(axis=1, how='all')
        imp.impute_by_nbr_and_spatial_mean(df_cl_drop, df_coord)
        df_cl_center = tfr.transform_dataframe_to_centering(df_cl_drop)
        df_cl_center = df_cl_center.T
        tfr.convert_to_DatetimeIndex(df_cl_center)
        df_outlier_cl = da.detect_anomaly_per_period(df_cl_center,
                                                     width_detect=WIDTH_DETECT,
                                                     width_step=WIDTH_STEP,
                                                     **PARAMS_OCSVM)
        df_outlier = pd.concat([df_outlier, df_outlier_cl])
    
    # Map the sequential ids back to the original objectid values
    df_outlier['objectid'].replace(np.arange(len(list_id)), list_id,
                                   inplace=True)
    df_outlier.sort_values(['objectid', 'mjd_st', 'mjd_en'], inplace=True)
    df_outlier.to_csv(FILEPATH_OUTPUT, index=False)
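
Example 2 monkey-patches da.get_y_pred with execute_ocsvm, which is not shown. A minimal sketch of what it might look like, assuming da.get_y_pred takes a feature matrix plus the keyword arguments forwarded from PARAMS_OCSVM (the signature is a guess, not the project's actual code); since contamination is not a OneClassSVM parameter, it is popped before constructing the model:

from sklearn.svm import OneClassSVM

def execute_ocsvm(X, **params):
    """Hypothetical drop-in for da.get_y_pred using a One-Class SVM."""
    params = dict(params)
    params.pop('contamination', None)  # presumably consumed by the wrapper, not the model
    model = OneClassSVM(**params)
    # fit_predict returns +1 for inliers and -1 for outliers
    return model.fit_predict(X)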
Example 3
def main() -> None:
    SEED_VALUE = 42
    FILEPATH_INPUT = '../data/raw/sample_10k_ver2.csv'
    FILEPATH_OUTPUT = '../results/2020-06-24/iforest.csv'
    DIRPATH_SAVEFIG = '../results/2020-06-24/figures/'

    N_CLUSTERS = 5
    WIDTH_DETECT = '180D'  # Time width for anomaly detection
    WIDTH_STEP = '30D'  # Step width for anomaly detection
    PARAMS_IFOREST = {
        'n_estimators': 100,
        'max_samples': 'auto',
        'contamination': 0.01,
        'random_state': SEED_VALUE,
    }

    # Fix the random seed to ensure reproducibility
    gd.fix_random_seed(SEED_VALUE)

    #
    # Reshape the data loaded from the input csv file
    #
    df_orig = gd.load_input_csv(FILEPATH_INPUT,
                                usecols=['objectid', 'mjd', 'm_ap30'])
    # Using objectid as-is consumes a lot of memory, so replace it with
    # sequential integers starting from 0
    list_id = gd.get_unique_list(df_orig, 'objectid')
    df_orig['objectid'].replace(list_id, np.arange(len(list_id)), inplace=True)
    df_ididx_mjdcols = gd.get_ididx_mjdcols_dataframe(df_orig, df_orig)

    df_coord = gd.load_input_csv(FILEPATH_INPUT,
                                 usecols=['objectid', 'coord_ra', 'coord_dec'])
    df_coord.sort_values('objectid', inplace=True)
    df_coord.drop_duplicates(inplace=True)
    df_coord.reset_index(inplace=True, drop=True)

    #
    # Build the input for k-means
    #
    scaler = StandardScaler()
    X_kmeans = np.array([
        df_coord['coord_ra'].values, df_coord['coord_dec'].values,
        df_ididx_mjdcols.mean(axis=1).round(4).values
    ]).T
    X_kmeans_std = scaler.fit_transform(X_kmeans)

    #
    # Determine the number of clusters with the elbow method
    #
    # vis.set_rcParams()
    # fig = plt.figure()
    # ax = fig.add_subplot(1, 1, 1)
    # mk.elbow(X=X_kmeans_std, ax=ax, random_state=SEED_VALUE)
    # ax.set_title('Elbow method')
    # fig.tight_layout()
    # fig.savefig(DIRPATH_SAVEFIG + 'elbow-method.pdf')
    # return

    #
    # Assign cluster labels with k-means
    #
    # Monkey-patch the k-means distance function with our custom implementation
    _kmeans.euclidean_distances = mk.new_euclidean_distances
    km = _kmeans.KMeans(n_clusters=N_CLUSTERS, random_state=SEED_VALUE)
    y_kmeans = km.fit_predict(X_kmeans_std)
    # Undo the monkey patch
    _kmeans.euclidean_distances = euclidean_distances

    # Select the algorithm to apply via monkey patching
    da.get_y_pred = execute_iforest

    #
    # Anomaly detection per cluster
    #
    df_outlier = pd.DataFrame(columns=['objectid', 'mjd_st', 'mjd_en'])
    for cl in np.unique(y_kmeans):
        df_cl = df_ididx_mjdcols.iloc[y_kmeans == cl]
        # Drop time columns that contain no measurements at all
        df_cl_drop = df_cl.dropna(axis=1, how='all')
        # Impute missing values with spatial-neighbor means plus the spatial mean
        imp.impute_by_nbr_and_spatial_mean(df_cl_drop, df_coord)
        # Center the data per object (normalize to zero mean)
        df_cl_center = tfr.transform_dataframe_to_centering(df_cl_drop)
        # Time series are easier to handle in pandas with the time axis as the
        # index, so transpose the DataFrame (it is transposed back when
        # detecting anomalies)
        df_cl_center = df_cl_center.T
        tfr.convert_to_DatetimeIndex(df_cl_center)
        # Detect anomalies
        df_outlier_cl = da.detect_anomaly_per_period(df_cl_center,
                                                     width_detect=WIDTH_DETECT,
                                                     width_step=WIDTH_STEP,
                                                     **PARAMS_IFOREST)
        df_outlier = pd.concat([df_outlier, df_outlier_cl])

    df_outlier['objectid'].replace(np.arange(len(list_id)),
                                   list_id,
                                   inplace=True)
    df_outlier.sort_values(['objectid', 'mjd_st', 'mjd_en'], inplace=True)
    df_outlier.to_csv(FILEPATH_OUTPUT, index=False)
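
Example 3 is the same pipeline with Isolation Forest swapped in via da.get_y_pred = execute_iforest, which again is not shown. A minimal sketch under the same assumed signature as in Example 2 (a guess, not the project's actual code), with PARAMS_IFOREST mapping directly onto scikit-learn's IsolationForest:

from sklearn.ensemble import IsolationForest

def execute_iforest(X, **params):
    """Hypothetical drop-in for da.get_y_pred using Isolation Forest."""
    model = IsolationForest(**params)
    # fit_predict returns +1 for inliers and -1 for outliers
    return model.fit_predict(X)

Unlike OneClassSVM, IsolationForest accepts contamination and random_state directly, so no keys need to be stripped from the parameter dict.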