def dbscan(sim_matrix, radius, min_samples, dist_measure, max_radius,
           all_dist_csv="/home/jialingxiang/NewDTWFrame/SplitKPI/all_dist.csv"):
    """Cluster the training KPIs with DBSCAN and persist the outcome.

    The function runs DBSCAN on ``sim_matrix``, picks one medoid per cluster
    via ``get_the_medoids``, and then re-assigns every point DBSCAN labelled
    as noise (-1) using ``assignment`` against those medoids.

    Args:
        sim_matrix: Matrix handed to ``DBSCAN.fit`` with
            ``metric='precomputed'`` and to ``get_the_medoids``.
        radius: Passed to DBSCAN as ``eps``.
        min_samples: Passed to DBSCAN as ``min_samples``.
        dist_measure: Forwarded to ``assignment`` as ``dist_category``.
        max_radius: Only used to build the names of the two HDF output files.
        all_dist_csv: Destination of the per-noise-point assignment CSV.
            Defaults to the path that used to be hard-coded here.

    Returns:
        A ``(medoids, labels_cal)`` tuple: the list of per-cluster medoids (in
        cluster-label order) and the DBSCAN label array after the noise points
        have been overwritten with their re-assigned label.

    Side effects:
        * Appends to the module-level lists ``all_kpi``, ``all_cla`` and
          ``all_dist`` and dumps them to ``all_dist_csv``.
        * Writes ``cluster_result_r<max_radius>.hdf`` and
          ``medoids_r<max_radius>.hdf`` under ``EXP_ROOT``.
        * Prints one line per re-assigned noise point and logs progress.
        * Mutates the fitted estimator's ``labels_`` array in place.
    """
    logger = getLogger(__name__)

    start = datetime.now()
    fitted = DBSCAN(eps=radius, min_samples=min_samples,
                    metric='precomputed', n_jobs=-1).fit(sim_matrix)
    end = datetime.now()
    logger.info('dbscan running time:%s', end - start)

    labels_cal = fitted.labels_

    # Number of clusters found by DBSCAN, ignoring the noise label if present.
    num_clusters = len(set(labels_cal)) - (1 if -1 in labels_cal else 0)
    logger.info('number of clusters: %d', num_clusters)

    cluster = {}
    medoids = []
    for cla in range(num_clusters):
        # Plain Python ints, so the helper receives the same kind of index
        # list it always did.
        member_idx = np.flatnonzero(labels_cal == cla).tolist()
        logger.info('class %d: %d', cla, len(member_idx))
        cluster[cla] = [TRAIN_KPI[i] for i in member_idx]

        medoid, min_dist = get_the_medoids(sim_matrix, member_idx)
        medoids.append(medoid)
        logger.info(medoid)
        logger.info(min_dist)

    # Re-assign the curves DBSCAN marked as noise according to their
    # similarity to each cluster medoid. The -1 bucket is kept for curves
    # that stay noise (presumably `assignment` returns -1 when a curve is too
    # far from every medoid -- verify against `assignment`).
    noise_idx = np.flatnonzero(labels_cal == -1).tolist()
    cluster[-1] = []
    for idx in noise_idx:
        kpi = TRAIN_KPI[idx]
        cla, it_dist = assignment(medoids, data_dict[kpi],
                                  dist_category=dist_measure)
        cluster[cla].append(kpi)
        labels_cal[idx] = cla
        print('KPI %s belongs to class %d' % (kpi, cla))
        logger.info('KPI %s belongs to class %d', kpi, cla)
        all_kpi.append(kpi)
        all_cla.append(cla)
        all_dist.append(it_dist)

    # NOTE(review): the original line structure was lost; this dump is assumed
    # to run once after the loop rather than on every iteration -- confirm.
    dataframe = pd.DataFrame({
        'uuid': all_kpi,
        'cluster': all_cla,
        'dist': all_dist,
    })
    dataframe.to_csv(all_dist_csv, index=False, sep=',')

    # An alternative strategy -- assigning each noise curve to the cluster of
    # its nearest already-clustered curve via `assign_to_nearest` -- used to
    # be sketched here as disabled code.

    # Flatten {cluster label: [kpi, ...]} into {kpi: cluster label}.
    kpi_to_cluster = {}
    for key, members in cluster.items():
        logger.info('class %d: %d', key, len(members))
        for kpi in members:
            kpi_to_cluster[kpi] = key

    result_df = pd.DataFrame(list(kpi_to_cluster.items()),
                             columns=['uuid', 'cluster'])
    result_df.to_hdf(EXP_ROOT + 'cluster_result_r%f.hdf' % max_radius,
                     '/cluster_result', mode='w', format='table')

    medoids_df = pd.DataFrame(list(enumerate(medoids)),
                              columns=['cluster', 'medoid'])
    medoids_df.to_hdf(EXP_ROOT + 'medoids_r%f.hdf' % max_radius,
                      '/medoids', mode='w', format='table')

    return medoids, labels_cal