Example #1
0
def dbscan(sim_matrix, radius, min_samples, dist_measure, max_radius,
           dist_csv_path="/home/jialingxiang/NewDTWFrame/SplitKPI/all_dist.csv"):
    """Cluster KPIs with DBSCAN, pick a medoid per cluster and re-assign noise.

    Steps:
      1. Fit ``DBSCAN(metric='precomputed')`` on ``sim_matrix``.
      2. For every cluster found, collect its KPI ids from ``TRAIN_KPI`` and
         compute its medoid with ``get_the_medoids``.
      3. For every point DBSCAN labelled ``-1``, call ``assignment`` against
         the medoids and move the point to the returned class; the label
         array is updated in place.
      4. Persist the noise assignments (CSV) and the final cluster / medoid
         tables (HDF under ``EXP_ROOT``).

    Args:
        sim_matrix: Pairwise matrix handed to DBSCAN as a precomputed metric
            and to ``get_the_medoids``.
        radius: Passed to DBSCAN as ``eps``.
        min_samples: Passed to DBSCAN as ``min_samples``.
        dist_measure: Forwarded to ``assignment`` as ``dist_category``.
        max_radius: Only used to build the output HDF file names.
        dist_csv_path: Destination of the noise-assignment CSV. Defaults to
            the path that used to be hard-coded here.

    Returns:
        tuple: ``(medoids, labels_cal)`` where ``medoids`` is the list of
        per-cluster medoids (index == cluster id) and ``labels_cal`` is the
        DBSCAN label array after the noise re-assignment.
    """
    logger = getLogger(__name__)

    start = datetime.now()
    result = DBSCAN(eps=radius,
                    min_samples=min_samples,
                    metric='precomputed',
                    n_jobs=-1).fit(sim_matrix)
    logger.info('dbscan running time:%s', datetime.now() - start)

    # Same array object as result.labels_; it is modified in place below.
    labels_cal = result.labels_

    # Number of clusters found by DBSCAN, ignoring the noise label if present.
    num_clusters = len(set(labels_cal)) - (1 if -1 in labels_cal else 0)
    logger.info('number of clusters: %d', num_clusters)

    cluster = {}
    medoids = []

    for cla in range(num_clusters):
        # Plain list of Python ints, so helpers get the same type as before.
        members = np.flatnonzero(labels_cal == cla).tolist()
        logger.info('class %d: %d', cla, len(members))
        cluster[cla] = [TRAIN_KPI[i] for i in members]

        medoid, min_dist = get_the_medoids(sim_matrix, members)
        medoids.append(medoid)
        logger.info(medoid)
        logger.info(min_dist)

    # Re-assign the points DBSCAN marked as noise according to their
    # similarity to each cluster medoid. The -1 bucket is kept for points
    # that stay noise (presumably assignment() returns -1 when a curve is
    # farther than the threshold from every medoid -- verify in assignment).
    noise_indices = np.flatnonzero(labels_cal == -1).tolist()
    cluster[-1] = []
    for idx in noise_indices:
        kpi = TRAIN_KPI[idx]
        cla, it_dist = assignment(medoids,
                                  data_dict[kpi],
                                  dist_category=dist_measure)
        cluster[cla].append(kpi)
        labels_cal[idx] = cla
        print('KPI %s belongs to class %d' % (kpi, cla))
        logger.info('KPI %s belongs to class %d', kpi, cla)
        # NOTE(review): all_kpi / all_cla / all_dist are not defined in this
        # function; presumably module-level lists, in which case they keep
        # growing across calls -- confirm this is intended.
        all_kpi.append(kpi)
        all_cla.append(cla)
        all_dist.append(it_dist)

    dataframe = pd.DataFrame({
        'uuid': all_kpi,
        'cluster': all_cla,
        'dist': all_dist
    })
    dataframe.to_csv(dist_csv_path, index=False, sep=',')

    # Alternative strategy (disabled): assign each noise curve to the cluster
    # of its nearest already-clustered curve instead of the nearest medoid.

    # Flatten {cluster id: [kpi, ...]} into {kpi: cluster id}.
    kpi_to_cluster = {}
    for key, members in cluster.items():
        logger.info('class %d: %d', key, len(members))
        for kpi in members:
            kpi_to_cluster[kpi] = key
    result_df = pd.DataFrame(list(kpi_to_cluster.items()),
                             columns=['uuid', 'cluster'])
    # key= is passed by keyword: positional use is deprecated in newer pandas.
    result_df.to_hdf(EXP_ROOT + 'cluster_result_r%f.hdf' % max_radius,
                     key='/cluster_result',
                     mode='w',
                     format='table')

    medoids_df = pd.DataFrame(list(enumerate(medoids)),
                              columns=['cluster', 'medoid'])
    medoids_df.to_hdf(EXP_ROOT + 'medoids_r%f.hdf' % max_radius,
                      key='/medoids',
                      mode='w',
                      format='table')

    return medoids, labels_cal