def run(input_file, output_file, max_common_frames, n_clusters):
    #Frame_id, track id, ., ., ., ., ., ., ., ., appearance features
    input = np.load(input_file)
    #print(np.linalg.norm(input[0, 10:]))
    t_ = time.time()
    print("Input shape:", input.shape)
    ids = list(np.unique(input[:, 1]))
    print(time.time() - t_)
    t_ = time.time()
    print("Total number of ids:", len(ids))

    ids_by_frames = {}
    for row in input:
        if row[0] not in ids_by_frames:
            ids_by_frames[row[0]] = []
        ids_by_frames[row[0]].append(row[1])
    n_ids_by_frames = {k: len(v) for k, v in ids_by_frames.items()}
    plt.bar(n_ids_by_frames.keys(), n_ids_by_frames.values(), color='g')
    #plt.show()
    print("Maximum number of ids on the same frame:", max(n_ids_by_frames.values()))

    # Drop frames with more detections than clusters
    ff = []
    for f, nid in n_ids_by_frames.items():
        if nid > n_clusters:
            ff.append(f)
    print("Delete frames with n_detections > n_clusters:", len(ff), ff)
    input = np.array([x for x in input if x[0] not in ff])

    # Drop tracklets that are too short
    min_len_tracklet = 10
    print("Delete tracklets with n_detections < ", min_len_tracklet)
    lens = []
    to_remove = []
    for i in ids:
        t = input[input[:, 1] == i][:, 0]
        if t.shape[0] < min_len_tracklet:
            to_remove.append(i)
        else:
            lens.append(t.shape[0])
    for i in to_remove:
        ids.remove(i)
        input = input[~(input[:, 1] == i)]
    print(input.shape, "detections x features")
    print("Mean len of tracklets (in frames):", np.mean(lens))

    # One feature vector per tracklet: [first_frame, track_id, mean appearance * n_frames]
    random_data = []
    data = []
    for i in ids:
        group = input[:, 1] == i
        n_frames = input[group].shape[0]
        d = np.zeros(input[0, 10:].shape[0] + 2)
        d[0] = input[group][:, 0].min(axis=0)
        d[1] = i
        d[2:] = input[group][:, 10:].mean(axis=0)
        d[2:] = d[2:] / np.linalg.norm(d[2:]) * n_frames
        data.append(d)
        x = np.random.random(128)
        x = x / np.linalg.norm(x)
        random_data.append(list(d[:2]) + list(x))
    data = np.array(data)
    data = data[data[:, 0].argsort()]
    ids = list(data[:, 1])
    random_data = np.array(random_data)

    common_frames = np.zeros((len(ids), len(ids)))
    if run_common_frames:
        print("Computing common frames matrix...")
        for i in range(len(ids)):
            for j in range(i, len(ids)):
                n_common_frames = len(
                    set(list(input[input[:, 1] == ids[i]][:, 0])).intersection(
                        list(input[input[:, 1] == ids[j]][:, 0])))
                common_frames[i, j] = n_common_frames
        print("Saved common frames matrix")
        np.save("common_frames.npy", common_frames)
    else:
        print("Loaded common frames matrix")
        common_frames = np.load("common_frames.npy")
    #print(common_frames, common_frames.shape)

    print("Computing 'cannot link' constraints with max_common_frames = ", max_common_frames)
    must_link = []
    cannot_link = [(i, j) for i in range(len(ids)) for j in range(i + 1, len(ids))
                   if common_frames[i, j] > max_common_frames]
    #print(cannot_link)
    print("Number of constraints", len(cannot_link))

    # Centroid initialization: use the frame with the most simultaneous detections
    ids_by_frames = {}
    for row in input:
        if row[0] not in ids_by_frames:
            ids_by_frames[row[0]] = []
        ids_by_frames[row[0]].append(row[1])
    n_ids_by_frames = {k: len(v) for k, v in ids_by_frames.items()}
    ref_frame = max(n_ids_by_frames, key=lambda key: n_ids_by_frames[key])
    centers_ids_init = ids_by_frames[ref_frame]
    centers_init = [list(data[:, 1]).index(i) for i in centers_ids_init]

    ids = list(np.unique(data[:, 1]))
    if len(ids) < n_clusters:
        n_clusters = len(ids)
    clusters, centers = cop_kmeans(dataset=data[:, 2:], initialization=centers_init,
                                   k=n_clusters, ml=must_link, cl=cannot_link,
                                   spherical=True)
    # We then compute a clustering with random unit features and only the constraints (to compare)
    #random_clusters, random_centers = cop_kmeans(dataset=random_data, k=n_clusters,
    #                                             ml=must_link, cl=cannot_link, spherical=True)
    #print("Adjusted Rand Index between constrained k-means clustering and a constrained but random clustering",
    #      adjusted_rand_score(clusters, random_clusters))

    out_clusters = clusters
    output = input[:, :10]
    output[:, 1] = np.array(
        [out_clusters[ids.index(int(x))] for x in input[:, 1]])
    #print("Output:", output)
    print("Output saved to:", output_file)
    np.save(output_file, output)
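# Minimal standalone sketch (illustration only, not part of the pipeline above) of the
# cop_kmeans call pattern used in run(): unit-norm feature rows, cannot-link pairs, an
# explicit list of seed indices, and spherical=True. It assumes the same (apparently
# extended) cop_kmeans variant used here, i.e. one accepting the `initialization` and
# `spherical` keyword arguments; the toy data and constraint pairs are made up.
def _cop_kmeans_toy_example():
    toy = np.random.random((6, 4))
    toy = toy / np.linalg.norm(toy, axis=1, keepdims=True)  # unit-norm rows
    cl_pairs = [(0, 1), (2, 3)]  # rows that must not end up in the same cluster
    toy_clusters, toy_centers = cop_kmeans(dataset=toy, initialization=[0, 2], k=2,
                                           ml=[], cl=cl_pairs, spherical=True)
    if toy_clusters is None:  # the library signals an infeasible constraint set with None
        print("No feasible assignment under the constraints")
    else:
        print(toy_clusters, toy_centers)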
def fit(self, x=None, y=None, shuffle_=None, pretrain_epochs=100,
        batch_size_au=256, maxiter_DC=7000, update_interval=140, n_clusters=2,
        seed_value=42, verbose=None, file_out=None, N_no_mod=None):
    '''Fit the model'''
    if x is None:
        # No data: just build and return the (untrained) clustering model
        autoencoder, encoder = self.autoencoderConv1D(self.signal_shape)
        autoencoder.compile(optimizer='adadelta', loss='mse')
        clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
        model = Model(inputs=encoder.input,
                      outputs=[clustering_layer, autoencoder.output])
        model.compile(loss=['kld', 'mse'], loss_weights=[0.3, 1], optimizer='adam')
        return model

    # 2. Set `python` built-in pseudo-random generator at a fixed value
    random.seed(seed_value)
    # 3. Set `numpy` pseudo-random generator at a fixed value
    np.random.seed(seed_value)
    # 4. Set `tensorflow` pseudo-random generator at a fixed value
    tf.set_random_seed(seed_value)
    # 5. For layers that introduce randomness like dropout, make sure to set seed values
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    nmi_f = normalized_mutual_info_score
    ari_f = adjusted_rand_score
    if shuffle_:
        x, y = shuffle(x, y, random_state=seed_value)

    # Baseline 1: raw data
    #kmeans = KMeans(n_clusters=n_clusters, n_init=20, n_jobs=10, random_state=seed_value)
    #y_pred_kmeans = kmeans.fit_predict(x.reshape((x.shape[0], x.shape[1])))
    # Baseline: raw data with cop-kmeans
    must_link = list(itertools.combinations(np.arange(0, 900), 2))
    y_pred_kmeans, centers = cop_kmeans(dataset=x, k=2, ml=must_link)
    if file_out:
        file_out.write('Acc. k-means : ' +
                       str(self.accuracy(y, np.array(y_pred_kmeans))) + '\n')

    # Pretrain the autoencoder
    batch_size = batch_size_au
    autoencoder, encoder = self.autoencoderConv1D(self.signal_shape)
    autoencoder.summary()
    autoencoder.compile(optimizer='adadelta', loss='mse')
    autoencoder.fit(x, x, batch_size=batch_size, epochs=pretrain_epochs)

    # Baseline 2: encoded data with cop-kmeans
    #kmeans = KMeans(n_clusters=n_clusters, n_init=20, n_jobs=10, random_state=42)
    #y_pred_kmeans = kmeans.fit_predict(encoder.predict(x))
    y_pred_kmeans, centers = cop_kmeans(dataset=encoder.predict(x), k=2, ml=must_link)
    if file_out:
        file_out.write('Acc. Autoencoder : ' +
                       str(self.accuracy(y, np.array(y_pred_kmeans))) + '\n')

    # Build the clustering layer
    clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
    model = Model(inputs=encoder.input, outputs=[clustering_layer, autoencoder.output])
    model.compile(loss=['kld', 'mse'], loss_weights=[0.1, 1], optimizer='adam')
    model.summary()

    # Initialize cluster centers with the cop-kmeans centers
    centers = np.array(centers).reshape((2, 5))
    model.get_layer(name='clustering').set_weights([centers])

    loss = 0
    index = 0
    maxiter = maxiter_DC
    update_interval = update_interval
    index_array = np.arange(x.shape[0])
    # change the batch size to the number of samples
    #batch_size = x.shape[0]

    # computing an auxiliary target distribution
    def target_distribution(q):
        weight = q**2 / q.sum(0)
        return (weight.T / weight.sum(1)).T

    for ite in range(int(maxiter)):
        if ite % update_interval == 0:
            q, _ = model.predict(x, verbose=0)
            p = target_distribution(q)  # update the auxiliary target distribution p
            p[:N_no_mod] = p[:N_no_mod]
            # evaluate the clustering performance
            y_pred = q.argmax(1)
            if y is not None:
                acc = self.accuracy(y, y_pred)
                ari = ari_f(y, y_pred)
                nmi = nmi_f(y, y_pred)
                loss = loss
                if verbose:
                    print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f'
                          % (ite, acc, nmi, ari), ' ; loss=', loss)
        idx = index_array[index * batch_size:min((index + 1) * batch_size, x.shape[0])]
        loss = model.train_on_batch(x=x[idx], y=[p[idx], x[idx]])
        index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0
    return model
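# Small self-contained illustration (not from the original code) of the auxiliary target
# distribution used inside fit(): p squares the soft assignments q and normalizes by the
# per-cluster mass, so each row of p still sums to 1 while confident assignments are
# emphasized and large clusters are down-weighted.
def _target_distribution_example():
    q = np.array([[0.6, 0.4],
                  [0.9, 0.1]])
    weight = q ** 2 / q.sum(0)   # same formula as target_distribution() above
    p = (weight.T / weight.sum(1)).T
    print(p, p.sum(1))           # each row of p sums to 1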
        calculate_geo_distance(r1.lat, r1.lng, r2.lat, r2.lng)
        for _, r2 in lat_lng_df.iterrows()
    ])
D_lat_lng_scale = scaler.fit_transform(D_lat_lng)
D_lat_lng_scale = pd.DataFrame(D_lat_lng_scale).fillna(np.nanmean(D_lat_lng_scale)).values

# calculate topic distance between statements
persons_1 = list(map(preprocess, list(df['Statement'])))
persons_2 = list(map(preprocess, list(df['Statement'])))
D_statement = -affinity_computation(persons_1, persons_2,
                                    n_components=30, min_df=2, max_df=0.8,
                                    weighting='tfidf', projection='svd')
std_topic = D_statement.std()

# clustering
D_final = (D_statement) + (10 * std_topic * D_tz) + (std_topic * D_lat_lng_scale)  # final distance
X_mds = MDS(n_components=30).fit_transform(D_final)
clusters_kmean, centers_kmean = cop_kmeans(dataset=X_mds, k=200, cl=cannot_link)
output_df = df[selected_cols]
output_df['pod_number'] = clusters_kmean

# rearrange
df_rearrange = []
pod_num = 1
for _, df_tz in output_df.groupby('timezone'):
    for _, df_pod_num in df_tz.groupby('pod_number'):
        df_pod_num['pod_number'] = pod_num
        df_rearrange.append(df_pod_num)
        pod_num += 1
df_rearrange = pd.concat(df_rearrange)[selected_cols]
df_rearrange.to_csv('pod_matching_rearrange_mds.csv', index=False)
for i in range(precip_sample + 1, 153):
    if label_list_train[i] == 1:
        must_link.append((precip_sample, i))

# This is the same idea
for i in range(bicontin_sample + 1, 153):
    if label_list_train[i] == -1:
        must_link.append((bicontin_sample, i))

# We do not want anything with different labels to be linked
cannot_link = [(precip_sample, bicontin_sample)]

random.seed(18)
# This command runs the method; clusters are the label assignments
clusters, centers = cop_kmeans(dataset=processed_no_test, k=2,
                               ml=must_link, cl=cannot_link)

# remember this is an unsupervised process with no real label information going into it;
# the algorithm will give two clusters, but we have to look into it to see which cluster
# labels correspond to our initial labels
precip_cluster_label = clusters[precip_sample]
bicontin_cluster_label = clusters[bicontin_sample]
for i in range(0, 205):
    if clusters[i] == precip_cluster_label:
        clusters[i] = 1
    else:
        clusters[i] = -1
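# Toy end-to-end illustration (synthetic data, not the study's data) of the pattern above:
# seed two samples of known class via must-link/cannot-link constraints, run cop_kmeans,
# then rename the returned cluster ids so they match the seed labels. It assumes numpy is
# available as np in this script; the point coordinates and constraint pairs are made up.
def _seeded_copkmeans_example():
    pts = np.vstack([np.random.normal(0, 1, (10, 2)),
                     np.random.normal(5, 1, (10, 2))])
    ml = [(0, 1), (10, 11)]   # pairs assumed to share a label
    cl = [(0, 10)]            # the two seed samples carry different labels
    labels, _ = cop_kmeans(dataset=pts, k=2, ml=ml, cl=cl)
    pos_cluster = labels[0]   # whichever cluster holds the "+1" seed
    return [1 if c == pos_cluster else -1 for c in labels]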
def cop_kmean():
    # input_matrix = numpy.random.rand(100, 500)
    documents = read_txt()
    input_matrix = []
    for i in documents:
        input_matrix = build_matrix(i, input_matrix)

    # must_link = [(1,2),(8,9),(0,1),(3,7),(0,3),(0,4),(11,12),(24,26),(25,27),(50,51),(53,54)]
    # cannot_link = [(7,13),(0,5),(0,8),(0,24),(0,27),(0,50),(0,53)]
    must_link = [(0, 1), (0, 2), (0, 3), (0, 7), (5, 8), (14, 18), (37, 38)]
    cannot_link = [(0, 4), (0, 5), (0, 6), (0, 8), (7, 71), (7, 16), (0, 31),
                   (0, 37), (7, 123), (0, 123), (7, 18)]
    clusters, centers = cop_kmeans(dataset=input_matrix, k=4,
                                   ml=must_link, cl=cannot_link)
    print(clusters)
    print(input_matrix)
    print(centers)
    print(len(clusters), '--', len(input_matrix))

    # test model
    # tests = ['h', 'a', 'qq', 'cx', 'hvh']
    tests = [
        'a', 'b', 'abbca', 'ccbba', 'dabdda', 'm', 'ammp', 'non', 'mmdn', 'oaa',
        'o', 'wwp', 'xp', 'xnp', 'ppopn', 'w', 'xzzmz', 'ywzzyz', 'wywyzyy', 'zww'
    ]
    result = []
    for test in tests:
        vect = word2vector(test)
        temp = input_matrix.index(vect)
        # print(vect)
        group_num = clusters[temp]
        # print(test_result)
        line = [
            test,
            str(get_score(centers, vect, group_num)),
            str(group_num),
            str(temp),
            str(vect)
        ]
        result.append(line)
    write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    write_excel_xls_append(book_name_xls, result)
    # score_a_file(input_matrix, clusters, centers)

    fig = plt.figure()
    ax = Axes3D(fig)
    cluster_set = [[], [], [], []]
    for ind in range(len(input_matrix)):
        cluster_set[clusters[ind]].append(input_matrix[ind])
    cluster_arr = tuple(cluster_set)
    ax.scatter([i[0] for i in cluster_arr[0]], [i[1] for i in cluster_arr[0]],
               [i[2] for i in cluster_arr[0]], c='r', label='first cluster')
    ax.scatter([i[0] for i in cluster_arr[1]], [i[1] for i in cluster_arr[1]],
               [i[2] for i in cluster_arr[1]], c='b', label='second cluster')
    ax.scatter([i[0] for i in cluster_arr[2]], [i[1] for i in cluster_arr[2]],
               [i[2] for i in cluster_arr[2]], c='g', label='third cluster')
    ax.scatter([i[0] for i in cluster_arr[3]], [i[1] for i in cluster_arr[3]],
               [i[2] for i in cluster_arr[3]], c='y', label='fourth cluster')
    print('the items of each cluster is: ', len(cluster_set[3]),
          len(cluster_set[2]), len(cluster_set[1]), len(cluster_set[0]))
    # ax.scatter(centers[0][2], centers[0][3], centers[0][5], marker='*', c='r')
    # ax.scatter(centers[1][2], centers[1][3], centers[1][5], marker='1', c='b')
    # ax.scatter(centers[2][2], centers[2][3], centers[2][5], marker='P', c='g')
    # ax.scatter(centers[3][2], centers[3][3], centers[3][5], marker='x', c='y')
    # print(cluster_arr[0])
    ax.legend(loc='best')
    ax.set_zlabel('high risk', fontdict={'size': 13, 'color': 'black'})
    ax.set_ylabel('medium risk', fontdict={'size': 13, 'color': 'black'})
    ax.set_xlabel('normal event', fontdict={'size': 13, 'color': 'black'})
    plt.savefig('fig.png', bbox_inches='tight')
    plt.show()
def run(input_file, output_file, max_common_frames, n_clusters, version):
    #Frame_id, track id, ., ., ., ., ., ., ., ., appearance features
    input = np.load(input_file)
    t_ = time.time()
    print("Input shape:", input.shape)
    ids = list(np.unique(input[:, 1]))

    ######## CLEANUP
    print("Total number of ids:", len(ids))
    ids_by_frames = {}
    for row in input:
        if row[0] not in ids_by_frames:
            ids_by_frames[row[0]] = []
        ids_by_frames[row[0]].append(row[1])
    n_ids_by_frames = {k: len(v) for k, v in ids_by_frames.items()}
    #plt.bar(n_ids_by_frames.keys(), n_ids_by_frames.values(), color='g')
    #plt.show()
    print("Maximum number of ids on the same frame:", max(n_ids_by_frames.values()))

    # Drop frames with more detections than clusters
    ff = []
    for f, nid in n_ids_by_frames.items():
        if nid > n_clusters:
            ff.append(f)
    print("Delete frames with n_detections > n_clusters:", len(ff), ff)
    input = np.array([x for x in input if x[0] not in ff])

    # Drop tracklets that are too short
    min_len_tracklet = 0
    lens = []
    to_remove = []
    for i in ids:
        t = input[input[:, 1] == i][:, 0]
        if t.shape[0] < min_len_tracklet:
            to_remove.append(i)
        else:
            lens.append(t.shape[0])
    for i in to_remove:
        ids.remove(i)
        input = input[~(input[:, 1] == i)]
    print("Delete tracklets with n_detections < ", min_len_tracklet, " : ", len(to_remove))
    print(input.shape, "detections x features")
    print("Mean len of tracklets (in frames):", np.mean(lens))
    ######## END CLEANUP

    # One feature vector per tracklet: [first_frame, track_id, mean appearance * n_frames]
    random_data = []
    data = []
    nn_frames = []
    for i in ids:
        group = input[:, 1] == i
        n_frames = input[group].shape[0]
        nn_frames.append(n_frames)
        d = np.zeros(input[0, 10:].shape[0] + 2)
        d[0] = input[group][:, 0].min(axis=0)
        d[1] = i
        d[2:] = input[group][:, 10:].mean(axis=0)
        d[2:] = d[2:] / np.linalg.norm(d[2:]) * n_frames
        data.append(d)
        x = np.random.random(128)
        x = x / np.linalg.norm(x)
        random_data.append(list(d[:2]) + list(x))
    data = np.array(data)
    data = data[data[:, 0].argsort()]  # Sort by ascending frame_idx
    ids = list(data[:, 1])
    plt.hist(nn_frames, bins=100)
    #plt.show()

    common_frames = np.zeros((len(ids), len(ids)))
    if run_common_frames:
        print("Computing common frames matrix...")
        for i in range(len(ids)):
            for j in range(i, len(ids)):
                n_common_frames = len(
                    set(list(input[input[:, 1] == ids[i]][:, 0])).intersection(
                        list(input[input[:, 1] == ids[j]][:, 0])))
                common_frames[i, j] = n_common_frames
        print("Saved common frames matrix")
        np.save("common_frames.npy", common_frames)
    else:
        print("Loaded common frames matrix")
        common_frames = np.load("common_frames.npy")
    #print(common_frames, common_frames.shape)

    print("Computing 'cannot link' constraints with max_common_frames = ", max_common_frames)
    must_link = []
    cannot_link = [(i, j) for i in range(len(ids)) for j in range(i + 1, len(ids))
                   if common_frames[i, j] > max_common_frames]
    #print(cannot_link)
    print("Number of constraints", len(cannot_link))

    # CENTROID INITIALIZATION
    init_mode = 2
    ids_by_frames = {}
    for row in input:
        if row[0] not in ids_by_frames:
            ids_by_frames[row[0]] = []
        ids_by_frames[row[0]].append(row[1])
    if init_mode == 1 or True:
        # Find the frame with the most players detected simultaneously and
        # initialize the clusters with the tracklets detected on that frame
        n_ids_by_frames = {k: len(v) for k, v in ids_by_frames.items()}
        ref_frame = max(n_ids_by_frames, key=lambda key: n_ids_by_frames[key])
        print(ref_frame, ids_by_frames[ref_frame],
              [list(data[:, 1]).index(i) for i in ids_by_frames[ref_frame]])
    if init_mode == 2:
        # Find the frame for which the summed length of the tracklets detected on it is
        # maximal, and initialize the clusters with the tracklets detected on that frame
        len_by_frames = {}
        for row in input:
            if row[0] not in len_by_frames:
                len_by_frames[row[0]] = []
            len_by_frames[row[0]].append(
                np.linalg.norm(data[data[:, 1] == row[1]].reshape(-1)[2:]))
        sum_len_by_frames = {k: sum(v) for k, v in len_by_frames.items()}
        ref_frame = max(sum_len_by_frames, key=lambda key: sum_len_by_frames[key])
    centers_ids_init = ids_by_frames[ref_frame]
    centers_init = [list(data[:, 1]).index(i) for i in centers_ids_init]
    print(ref_frame, centers_ids_init, centers_init)

    ids = list(np.unique(data[:, 1]))
    if len(ids) < n_clusters:
        n_clusters = len(ids)
    clusters, centers = cop_kmeans(dataset=data[:, 2:], initialization=centers_init,
                                   k=n_clusters, ml=must_link, cl=cannot_link,
                                   spherical=True)
    if clusters is None:
        print("Error: impossible clustering")
        exit()
    print([clusters[ids.index(int(x))] for x in centers_ids_init])
    #clusters = [np.random.randint(0, n_clusters) for i in range(len(ids))]  # RANDOM CLUSTERING

    # Hard-coded track_id -> cluster assignment kept for reference;
    # only the commented-out lines below would use it
    clusters_ = {
        2.0: [1.0], 584.0: [1.0], 644.0: [1.0], 910.0: [1.0], 1060.0: [1.0],
        1435.0: [1.0], 1593.0: [1.0], 1732.0: [1.0], 2021.0: [1.0], 2149.0: [1.0],
        2273.0: [1.0], 2455.0: [1.0], 2550.0: [1.0], 2671.0: [1.0], 2680.0: [1.0, 6.0],
        21.0: [2.0], 67.0: [2.0], 103.0: [2.0], 270.0: [2.0], 346.0: [2.0],
        399.0: [2.0], 666.0: [2.0], 1129.0: [2.0], 1382.0: [2.0], 1658.0: [2.0],
        1714.0: [2.0], 2029.0: [2.0], 2087.0: [2.0], 2348.0: [2.0], 2785.0: [2.0],
        224.0: [3.0], 283.0: [3.0], 316.0: [3.0], 386.0: [3.0], 936.0: [3.0],
        1299.0: [3.0], 1374.0: [3.0], 1462.0: [3.0], 1567.0: [3.0], 1636.0: [3.0],
        1700.0: [3.0], 1860.0: [3.0], 2432.0: [3.0], 2594.0: [3.0], 2643.0: [3.0],
        2711.0: [3.0],
        4.0: [4.0], 575.0: [4.0], 676.0: [4.0], 756.0: [4.0], 888.0: [4.0],
        950.0: [4.0], 1156.0: [4.0, 6.0], 1918.0: [4.0, 8.0], 2086.0: [4.0],
        2163.0: [4.0], 2358.0: [4.0], 2553.0: [4.0],
        3.0: [5.0], 180.0: [5.0], 235.0: [5.0], 304.0: [5.0], 401.0: [5.0],
        893.0: [5.0], 1428.0: [5.0], 1558.0: [5.0], 2032.0: [5.0], 2743.0: [5.0],
        1.0: [6.0], 42.0: [6.0], 140.0: [6.0], 181.0: [6.0], 510.0: [6.0],
        560.0: [6.0], 686.0: [6.0], 819.0: [6.0], 961.0: [6.0], 1058.0: [6.0],
        1403.0: [6.0], 1990.0: [6.0], 2059.0: [6.0], 2253.0: [6.0], 2416.0: [6.0],
        2516.0: [6.0], 2748.0: [6.0],
        5.0: [7.0], 108.0: [7.0], 262.0: [7.0], 344.0: [7.0], 438.0: [7.0],
        504.0: [7.0], 578.0: [7.0], 832.0: [7.0], 941.0: [7.0], 1038.0: [7.0],
        1407.0: [7.0], 1547.0: [7.0], 1783.0: [7.0], 1866.0: [7.0], 1910.0: [7.0],
        2222.0: [7.0], 2458.0: [7.0], 2570.0: [7.0], 2645.0: [7.0],
        6.0: [8.0], 54.0: [8.0], 89.0: [8.0], 261.0: [8.0], 334.0: [8.0],
        409.0: [8.0], 440.0: [8.0], 637.0: [8.0], 932.0: [8.0], 1102.0: [8.0],
        1173.0: [8.0], 1406.0: [8.0], 1508.0: [8.0], 1644.0: [8.0], 2007.0: [8.0],
        2166.0: [8.0], 2480.0: [8.0], 2525.0: [8.0],
        46.0: [9.0], 661.0: [9.0], 894.0: [9.0], 1029.0: [9.0], 1325.0: [9.0],
        1831.0: [9.0], 1973.0: [9.0], 2268.0: [9.0],
        2139.0: [10.0], 1355.0: [11.0]
    }
    #output = np.array([x for x in input[:, :10] if x[1] in clusters_.keys()])
    #output[:, 1] = np.array([clusters_[x][0] for x in output[:, 1]])

    output = input[:, :10]
    output[:, 1] = np.array([clusters[ids.index(int(x))] for x in output[:, 1]])
    print("Output saved to:", output_file)
    np.save(output_file, output)
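# Tiny illustration (made-up frame sets, not real tracking output) of the cannot-link rule
# used in run(): two tracklets that co-occur on more than max_common_frames frames cannot
# belong to the same player, so every such pair becomes a constraint for cop_kmeans.
def _cannot_link_from_overlap_example():
    frames_by_id = {0: {1, 2, 3}, 1: {3, 4, 5}, 2: {10, 11}}
    max_common_frames = 0
    ids = sorted(frames_by_id)
    return [(i, j) for i in range(len(ids)) for j in range(i + 1, len(ids))
            if len(frames_by_id[ids[i]] & frames_by_id[ids[j]]) > max_common_frames]
    # returns [(0, 1)]: tracklets 0 and 1 share frame 3, tracklet 2 overlaps neither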
def run(input_file, output_file, max_common_frames, n_clusters, version):
    #Frame_id, track id, ., ., ., ., ., ., ., ., appearance features
    input = np.load(input_file)
    t_ = time.time()
    print("Input shape:", input.shape)
    ids = list(np.unique(input[:, 1]))

    ######## CLEANUP
    print("Total number of ids:", len(ids))
    ids_by_frames = {}
    for row in input:
        if row[0] not in ids_by_frames:
            ids_by_frames[row[0]] = []
        ids_by_frames[row[0]].append(row[1])
    n_ids_by_frames = {k: len(v) for k, v in ids_by_frames.items()}
    #plt.bar(n_ids_by_frames.keys(), n_ids_by_frames.values(), color='g')
    #plt.show()
    print("Maximum number of ids on the same frame:", max(n_ids_by_frames.values()))

    # Drop frames with more detections than clusters
    ff = []
    for f, nid in n_ids_by_frames.items():
        if nid > n_clusters:
            ff.append(f)
    print("Delete frames with n_detections > n_clusters:", len(ff), ff)
    input = np.array([x for x in input if x[0] not in ff])

    # Drop tracklets that are too short
    min_len_tracklet = 10
    lens = []
    to_remove = []
    for i in ids:
        t = input[input[:, 1] == i][:, 0]
        if t.shape[0] < min_len_tracklet:
            to_remove.append(i)
        else:
            lens.append(t.shape[0])
    for i in to_remove:
        ids.remove(i)
        input = input[~(input[:, 1] == i)]
    print("Delete tracklets with n_detections < ", min_len_tracklet, " : ", len(to_remove))
    print(input.shape, "detections x features")
    print("Mean len of tracklets (in frames):", np.mean(lens))
    ######## END CLEANUP

    # One feature vector per tracklet: [first_frame, track_id, mean appearance * n_frames]
    random_data = []
    data = []
    nn_frames = []
    for i in ids:
        group = input[:, 1] == i
        n_frames = input[group].shape[0]
        nn_frames.append(n_frames)
        d = np.zeros(input[0, 10:].shape[0] + 2)
        d[0] = input[group][:, 0].min(axis=0)
        d[1] = i
        d[2:] = input[group][:, 10:].mean(axis=0)
        d[2:] = d[2:] / np.linalg.norm(d[2:]) * n_frames
        data.append(d)
        x = np.random.random(128)
        x = x / np.linalg.norm(x)
        random_data.append(list(d[:2]) + list(x))
    data = np.array(data)
    data = data[data[:, 0].argsort()]  # Sort by ascending frame_idx
    ids = list(data[:, 1])
    plt.hist(nn_frames, bins=100)
    #plt.show()

    common_frames = np.zeros((len(ids), len(ids)))
    if run_common_frames:
        print("Computing common frames matrix...")
        for i in range(len(ids)):
            for j in range(i, len(ids)):
                n_common_frames = len(
                    set(list(input[input[:, 1] == ids[i]][:, 0])).intersection(
                        list(input[input[:, 1] == ids[j]][:, 0])))
                common_frames[i, j] = n_common_frames
        print("Saved common frames matrix")
        np.save("common_frames.npy", common_frames)
    else:
        print("Loaded common frames matrix")
        common_frames = np.load("common_frames.npy")

    print("Computing 'cannot link' constraints with max_common_frames = ", max_common_frames)
    must_link = []
    cannot_link = [(i, j) for i in range(len(ids)) for j in range(i + 1, len(ids))
                   if common_frames[i, j] > max_common_frames]
    print("Number of constraints", len(cannot_link))

    if version == 1:  # SPECTRAL CLUSTERING
        if run_similarity_matrix:
            print("Computing similarity matrix...")
            similarity_matrix = np.zeros((len(ids), len(ids)))
            for i in range(len(ids)):
                for j in range(i + 1, len(ids)):
                    similarity_matrix[i, j] = (1.0 + np.max(
                        cosine_similarity(input[input[:, 1] == ids[i]][:, 10:],
                                          input[input[:, 1] == ids[j]][:, 10:]).reshape(-1))) / 2.0
                    """if (i,j) in cannot_link:
                        similarity_matrix[i, j] = 0.0"""
                    similarity_matrix[j, i] = similarity_matrix[i, j]
            print("Saved similarity matrix")
            np.save("similarity_matrix.npy", similarity_matrix)
        else:
            print("Loaded similarity matrix")
            similarity_matrix = np.load("similarity_matrix.npy")
        clusters = octave.TCCRP([0, 1, 2, 3], [1, 2, 3, 5], [1, 1, 1, [1, 2, 3]])
        print(clusters)

    if version == 0:  # SPHERICAL K-MEANS
        # CENTROID INITIALIZATION
        ids_by_frames = {}
        for row in input:
            if row[0] not in ids_by_frames:
                ids_by_frames[row[0]] = []
            ids_by_frames[row[0]].append(row[1])
        n_ids_by_frames = {k: len(v) for k, v in ids_by_frames.items()}
        ref_frame = max(n_ids_by_frames, key=lambda key: n_ids_by_frames[key])
        centers_ids_init = ids_by_frames[ref_frame]
        centers_init = [list(data[:, 1]).index(i) for i in centers_ids_init]
        print(data[:, 1])
        print(ref_frame, centers_ids_init, centers_init)

        ids = list(np.unique(data[:, 1]))
        if len(ids) < n_clusters:
            n_clusters = len(ids)
        clusters, centers = cop_kmeans(dataset=data[:, 2:],
                                       initialization=centers_init + [13],
                                       k=n_clusters, ml=must_link, cl=cannot_link,
                                       spherical=True)
        if clusters is None:
            print("Error: impossible clustering")
            exit()

    output = input[:, :10]
    output[:, 1] = np.array([clusters[ids.index(int(x))] for x in input[:, 1]])
    print("Output saved to:", output_file)
    np.save(output_file, output)
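# Hypothetical invocation of the version-aware run() above; the file names and parameter
# values are placeholders (assumptions), not taken from the original project.
#run("tracklet_features.npy", "tracklet_clusters.npy",
#    max_common_frames=0, n_clusters=12, version=0)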