def group_region(fname, traj, u_dim):
    poi = np.load('./data/' + fname + '_POI.npy')  # location id starts from 1
    gr = './data/' + fname + '_group.txt'
    file = open(gr, 'a')  # note: this handle is never used; the pickle is written through g below
    g = open(gr, 'wb')
    group = []  # (user, *num, ?, [id, lat, lon])
    for i in range(u_dim):
        user = []
        for item in traj:
            if item[0, 0].item() == i + 1:  # the user id this trajectory belongs to is i + 1
                user += item[:, 1].numpy().tolist()  # visited points in this trajectory
        if not user:
            continue
        locs = poi[:, 0]
        pos = []
        for item in user:
            pos.append(poi[np.where(locs == item)][0])
        pos = np.array(pos)  # POI rows of all the user's visited points (repeated)
        DB = DBSCAN(eps=0.002, min_samples=10).fit(pos[:, 1:]).labels_  # eps 0.01 -- dis 0.84174
        base = []  # (num, ?, [id, lat, lon])
        region, tmp = [], []
        num_group = DB.max() + 1  # cluster labels run from 0 to max
        for num in range(num_group):
            base.append(pos[np.where(DB == num)].tolist())
            for point in poi.tolist():
                for each in base[num]:
                    # collect all POIs within the region: closer than the threshold to at least one base point
                    # compare lat/lon only (point[1:]); including the id column would dominate the distance
                    if euclidean(point[1:], each[1:]) < 0.001:  # half it here
                        tmp.append(point[0])
                        break
            region.append(tmp)  # region (num, ?)
            print(len(tmp))
            tmp = []
        print('...')
        group.append(region)
    pickle.dump(group, g)
    g.close()
    return group  # (user, *num, ?)
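group_region runs DBSCAN directly on raw (lat, lon) values, so eps=0.002 is expressed in degrees rather than meters (hence the "eps 0.01 -- dis 0.84174" note). The rough converter below is not part of the original code; it uses the ~111.32 km-per-degree approximation only to make the implied spatial radius explicit, and the function name is illustrative.

import math

def degrees_eps_to_km(eps_degrees, latitude_deg=0.0):
    """Rough conversion of a DBSCAN eps given in lat/lon degrees to kilometers.

    One degree of latitude is about 111.32 km; one degree of longitude shrinks
    with cos(latitude). This is an approximation, valid only for small eps.
    """
    lat_km = eps_degrees * 111.32
    lon_km = eps_degrees * 111.32 * math.cos(math.radians(latitude_deg))
    return lat_km, lon_km

# e.g. eps=0.002 corresponds to roughly 0.22 km north-south at the equator
print(degrees_eps_to_km(0.002))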
def merge_close_lines(self, lines, threshold):
    lines = np.asarray(lines)
    distances = np.reshape(np.abs(lines[:, 0]), (len(lines), 1))
    clusters = DBSCAN(eps=threshold, min_samples=1).fit_predict(distances)
    merged_lines = []
    for cluster_id in range(0, clusters.max() + 1):
        lines_to_merge = np.asarray(lines[clusters == cluster_id])
        idxs = np.array(range(0, lines_to_merge.shape[0]))
        res = np.argsort(np.abs(lines_to_merge[:, 0]))
        sorted_lines = lines_to_merge[res]
        sorted_idxs = idxs[res]
        merged_lines.append(lines_to_merge[sorted_idxs[int(lines_to_merge.shape[0] / 2)], :])
    return merged_lines
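A minimal usage sketch for merge_close_lines, assuming the input lines are (rho, theta) pairs such as those produced by a Hough transform; the sample values are made up, and None is passed for the unused self parameter purely for illustration. It groups lines whose |rho| values fall within threshold of each other and keeps the median-|rho| line of each group.

import numpy as np
from sklearn.cluster import DBSCAN

# Hypothetical (rho, theta) lines: two pairs of near-duplicates.
lines = [(100.0, 0.01), (102.0, 0.02), (250.0, 1.55), (251.5, 1.56)]
merged = merge_close_lines(None, lines, threshold=5.0)
print(merged)  # one representative (rho, theta) per group of nearby rho values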
def dbscan(similarity, static_fc, sub):
    dbscan = DBSCAN(metric="euclidean").fit_predict(similarity[sub])
    plt.plot(np.asarray(dbscan) + 2)
    plt.title("State transitions", fontsize=20)
    plot_matrix(static_fc[sub])
    plt.title("Static connectivity", fontsize=20)
    plot_matrix(similarity[sub])
    plt.title("Similarity matrix", fontsize=20)
    n = 1
    for cluster in range(-1, dbscan.max() + 1):
        label = dbscan == cluster
        percent = np.sum(label / len(label) * 100)
        max_seq = maximum_sequence(label, cluster)
        mean_mat = np.mean(all_FC_sl[1][label], axis=0)
        # plot_matrix(mean_mat)
        mean_matrices.append(mean_mat)
        plot_matrix(mean_mat, auto_fit=True, vmin=static_fc[sub].min(),
                    vmax=static_fc[sub].max())  # , axes=ax
        plt.title("State %d" % n, fontsize=20)
        plt.suptitle("Percent of occurrence: %d%%" % percent, backgroundcolor="white")
        n += 1
def createClusters(self, minMagnitude=0, treeR=22, leafNum=190, neighborR=22,
                   timeScale=10, eps=18, minPts=90, delta=1.0):
    # self.loadVideo()
    self.loadHMM()
    self._print('Created ' + self.labeledCoordsFile)
    coords = self.obj.retDBScanMatrix(minMagnitude)
    np.save(self.localClusterDirectory + 'RawCoords.npy', coords)
    # subprocess.call(['rclone', 'copy', self.localClusterDirectory + 'RawCoordsFile.npy',
    #                  self.cloudClusterDirectory], stderr=self.fnull)
    sortData = coords[coords[:, 0].argsort()][:, 0:3]  # sort data by time for batch processing, throwing out 4th column (magnitude)
    numBatches = int(sortData[-1, 0] / delta / 3600) + 1  # delta is number of hours to batch together; can be a fraction
    sortData[:, 0] = sortData[:, 0] * timeScale  # scale time so that time distances between transitions are comparable to spatial differences
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)

    # Calculate clusters in batches to avoid RAM overuse
    curr_label = 0  # labels for each batch start from zero - need to offset these
    print('Calculating clusters in ' + str(numBatches) + ' total batches', file=sys.stderr)
    for i in range(numBatches):
        print('Batch: ' + str(i), file=sys.stderr)
        min_time, max_time = i * delta * timeScale * 3600, (i + 1) * delta * timeScale * 3600  # 3600 seconds per hour, rescaled by timeScale
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=treeR, metric='minkowski', p=2, algorithm='kd_tree',
                             leaf_size=leafNum, n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index], neighborR, 'distance')
        sub_label = DBSCAN(eps=eps, min_samples=minPts, metric='precomputed',
                           n_jobs=24).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels

    sortData[:, 0] = sortData[:, 0] / timeScale
    self.labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.localClusterDirectory + self.labeledCoordsFile, self.labeledCoords)
    subprocess.call(['rclone', 'copy', self.localClusterDirectory + self.labeledCoordsFile,
                     self.cloudClusterDirectory], stderr=self.fnull)
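The batching above builds a sparse radius-neighbors graph and hands it to DBSCAN with metric='precomputed'. A self-contained sketch of that pattern on synthetic (t, x, y) points follows; the blob locations, the random seed, and the eps/min_samples values are illustrative only, not the parameters used by this class.

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

# Synthetic (t, x, y) points: two dense blobs plus sparse uniform noise.
rng = np.random.default_rng(0)
blob_a = rng.normal(loc=[100, 50, 50], scale=2, size=(200, 3))
blob_b = rng.normal(loc=[500, 300, 200], scale=2, size=(200, 3))
noise = rng.uniform(low=0, high=600, size=(50, 3))
points = np.vstack([blob_a, blob_b, noise])

# Sparse distance graph within a fixed radius, then DBSCAN on the precomputed graph;
# pairs missing from the sparse graph are treated as farther apart than eps.
nn = NearestNeighbors(radius=22, metric='minkowski', p=2, algorithm='kd_tree').fit(points)
dist_graph = nn.radius_neighbors_graph(points, 22, mode='distance')
labels = DBSCAN(eps=18, min_samples=20, metric='precomputed').fit_predict(dist_graph)
print('clusters found:', labels.max() + 1, '| noise points:', np.sum(labels == -1))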
def Signatures():

    # a few definitions ...
    tph_bins = array(['TPH_C06', 'TPH_C07', 'TPH_C08', 'TPH_C09-C10', 'TPH_C11-C12',
                      'TPH_C13-C14', 'TPH_C15-C16', 'TPH_C17-C18', 'TPH_C19-C20',
                      'TPH_C21-C22', 'TPH_C23-C24', 'TPH_C25-C28', 'TPH_C29-C32',
                      'TPH_C33-C36'])
    fuels = array(['gasoline', 'diesel', 'kerosene', 'bunker C', 'heavy fuel oil', 'crude oil'])
    stretch = 5.            # vertical exaggeration factor, used to calculate distance matrices
    report = linspace(start=10., stop=90., num=9, endpoint=True)  # percentile classes used to process distance matrices
    N = 100                 # number of samples to include in each synthetic reference fuel population
    num_K_clusters = 6      # number of K-means clusters to assign
    eps = 0.1               # difference tolerance, DBSCAN cluster analysis
    min_samples = 10        # minimum number of samples for DBSCAN clusters

    # read data sets ...
    soil_TPH_df = ReadSoilData(tph_bins, 0.)    # consider all reported samples in data sets, including those with values of 0.
    print 'Read and processed all site soil data.'
    locations_df = read_csv('survey.txt', sep='\t')
    print 'Read boring locations.'
    soil_TPH_df = merge(locations_df, soil_TPH_df, on='Location ID', how='inner')
    elev = array(soil_TPH_df['Surface Elevation(ft-msl)'] - soil_TPH_df['Depth'])
    soil_TPH_df.insert(5, 'elev', elev)
    print 'Merged soil sample survey data with TPH data set.'

    # conduct cluster analyses to find general patterns in TPH data
    X = soil_TPH_df[tph_bins].values                                            # define feature subset
    k_means = KMeans(init='k-means++', n_clusters=num_K_clusters, n_init=25)    # K-means cluster analysis
    z = k_means.fit_predict(X)
    soil_TPH_df['kmeans_group'] = z                                             # append group indices to soil_TPH data frame
    centroids_df = DataFrame(k_means.cluster_centers_, columns=tph_bins)        # note cluster centroids and write to output file
    centroids_df.to_csv('centroids.csv')
    z = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)                 # DBSCAN cluster analysis
    soil_TPH_df['dbscan_group'] = z
    num_d_clusters = z.max() + 2
    print 'Conducted cluster analyses.'

    # tag data points using SVM algorithm on TPH data
    fuel_refs_df = read_csv('fuel_ref.txt', sep='\t')
    print 'Read fuel reference compositions.'
    training_set_df = CreateTraining(fuels, tph_bins, N, fuel_refs_df)  # generate synthetic reference fuel populations (for training sets)
    X = training_set_df[tph_bins].values    # define feature subset of training set
    y = training_set_df['tag'].values       # define targets of training set
    C = 1.0                                 # fit model (C = SVM regularization parameter)
    lin_svc = svm.LinearSVC(C=C).fit(X, y)
    Z = soil_TPH_df[tph_bins].values        # use model to classify the test set
    z = lin_svc.predict(Z)
    soil_TPH_df['svm_predict'] = z          # append fuel 'tags' to soil_TPH data frame
    print 'Conducted support-vector-machine classification analysis.'

    # write output files (for both clustering and SVM) ...
    soil_TPH_df.to_csv('soil_TPH.csv')      # write fully processed soil hydrocarbon data sets to output files
    for fuel_type in fuels:
        soil_TPH_df[soil_TPH_df['svm_predict'] == fuel_type].to_csv(fuel_type + '.csv')    # write output files by tagged signature
    for i in xrange(num_K_clusters):
        soil_TPH_df[soil_TPH_df['kmeans_group'] == i].to_csv('kgroup_' + str(i) + '.csv')  # write output files by K-means group index
    for i in xrange(num_d_clusters):
        soil_TPH_df[soil_TPH_df['dbscan_group'] == i - 1].to_csv('dgroup_' + str(i) + '.csv')  # write output files by DBSCAN group index

    # compare distribution of point-to-point distances, within classes and between classes, as a measure of randomness of scatter
    for i, fuel_type in enumerate(fuels):   # distance arrays, by tag (i.e., SVM designation)
        points = soil_TPH_df[soil_TPH_df['svm_predict'] == fuel_type][['Easting', 'Northing', 'elev']]
        points['elev'] *= stretch
        percents = DistDistrib(points, report)
        if i:
            dist_matrix = dstack((dist_matrix, percents))
        else:
            dist_matrix = percents
    tag_df = DataFrame(transpose(dist_matrix[0]))
    tag_df.columns = report.astype(str)
    tag_df.insert(0, 'category', fuels)

    for i in xrange(num_K_clusters):        # distance arrays, by cluster: K-means designation
        points = soil_TPH_df[soil_TPH_df['kmeans_group'] == i][['Easting', 'Northing', 'elev']]
        points['elev'] *= stretch
        percents = DistDistrib(points, report)
        if i:
            dist_matrix = dstack((dist_matrix, percents))
        else:
            dist_matrix = percents
    kcluster_df = DataFrame(transpose(dist_matrix[0]))
    kcluster_df.columns = report.astype(str)

    for i in xrange(num_d_clusters):        # distance arrays, by cluster: DBSCAN designation
        points = soil_TPH_df[soil_TPH_df['dbscan_group'] == i - 1][['Easting', 'Northing', 'elev']]
        points['elev'] *= stretch
        percents = DistDistrib(points, report)
        if i:
            dist_matrix = dstack((dist_matrix, percents))
        else:
            dist_matrix = percents
    dcluster_df = DataFrame(transpose(dist_matrix[0]))
    dcluster_df.columns = report.astype(str)

    # distance array for all soil samples
    points = soil_TPH_df[['Easting', 'Northing', 'elev']]
    points['elev'] *= stretch
    percents = DistDistrib(points, report)
    all_df = DataFrame(percents.reshape((1, -1)), columns=report.astype(str))
    all_df.columns = report.astype(str)

    # summarize distances by cluster
    kcluster_all_df = kcluster_df.append(all_df, ignore_index=True)
    kcluster_all_df.to_csv('report_k_cluster.csv')
    dcluster_all_df = dcluster_df.append(all_df, ignore_index=True)
    dcluster_all_df.to_csv('report_d_cluster.csv')

    # summarize distances by SVM tags
    all_df.insert(0, 'category', 'ALL')
    tag_all_df = tag_df.append(all_df, ignore_index=True)
    tag_all_df.to_csv('report_tag.csv')
    print 'Analyzed sample-to-sample distances among sets.'

    print 'Done.'
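DBSCAN marks noise points with the label -1, which is why Signatures sets num_d_clusters = z.max() + 2 and indexes the groups with i - 1, so the noise group gets its own output file. A tiny illustration of that convention follows; the sample values are made up and the eps/min_samples settings are only for the example.

import numpy as np
from sklearn.cluster import DBSCAN

z = DBSCAN(eps=0.5, min_samples=2).fit_predict(np.array([[0.0], [0.1], [5.0], [5.1], [99.0]]))
print(z)                       # e.g. [ 0  0  1  1 -1]; -1 marks noise
num_d_clusters = z.max() + 2   # +2 so the noise group (-1) is counted alongside clusters 0..max
for i in range(num_d_clusters):
    print('dgroup', i - 1, ':', int(np.sum(z == i - 1)), 'samples')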
def createClusters(self, minMagnitude=0, treeR=22, leafNum=190, neighborR=22,
                   timeScale=10, eps=18, minPts=90, delta=1.0, Nclips=200,
                   delta_xy=100, delta_t=60, smallLimit=500):
    self.loadVideo()
    self.loadHMM()
    self._print('Clustering HMM transitions using DBScan')
    coords = self.obj.retDBScanMatrix(minMagnitude)
    np.save(self.localClusterDirectory + 'RawCoords.npy', coords)
    # subprocess.call(['rclone', 'copy', self.localClusterDirectory + 'RawCoordsFile.npy',
    #                  self.cloudClusterDirectory], stderr=self.fnull)
    sortData = coords[coords[:, 0].argsort()][:, 0:3]  # sort data by time for batch processing, throwing out 4th column (magnitude)
    numBatches = int(sortData[-1, 0] / delta / 3600) + 1  # delta is number of hours to batch together; can be a fraction
    sortData[:, 0] = sortData[:, 0] * timeScale  # scale time so that time distances between transitions are comparable to spatial differences
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)

    # Calculate clusters in batches to avoid RAM overuse
    curr_label = 0  # labels for each batch start from zero - need to offset these
    print('Calculating clusters in ' + str(numBatches) + ' total batches', file=sys.stderr)
    for i in range(numBatches):
        print('Batch: ' + str(i), file=sys.stderr)
        min_time, max_time = i * delta * timeScale * 3600, (i + 1) * delta * timeScale * 3600  # 3600 seconds per hour, rescaled by timeScale
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=treeR, metric='minkowski', p=2, algorithm='kd_tree',
                             leaf_size=leafNum, n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index], neighborR, 'distance')
        sub_label = DBSCAN(eps=eps, min_samples=minPts, metric='precomputed',
                           n_jobs=24).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels

    sortData[:, 0] = sortData[:, 0] / timeScale
    self.labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.localClusterDirectory + self.labeledCoordsFile, self.labeledCoords)
    subprocess.call(['rclone', 'copy', self.localClusterDirectory + self.labeledCoordsFile,
                     self.cloudClusterDirectory], stderr=self.fnull)

    uniqueLabels = set(self.labeledCoords[:, 3])
    uniqueLabels.remove(-1)
    print(str(self.labeledCoords[self.labeledCoords[:, 3] != -1].shape[0]) +
          ' HMM transitions assigned to ' + str(len(uniqueLabels)) + ' clusters',
          file=sys.stderr)

    df = pd.DataFrame(self.labeledCoords, columns=['T', 'X', 'Y', 'LID'])
    clusterData = df.groupby('LID').apply(
        lambda x: pd.Series({
            'projectID': self.projectID,
            'videoID': self.baseName,
            'N': x['T'].count(),
            't': int(x['T'].mean()),
            'X': int(x['X'].mean()),
            'Y': int(x['Y'].mean()),
            't_span': int(x['T'].max() - x['T'].min()),
            'X_span': int(x['X'].max() - x['X'].min()),
            'Y_span': int(x['Y'].max() - x['Y'].min()),
            'ManualAnnotation': 'No',
            'ManualLabel': '',
            'MLLabel': ''
        }))
    clusterData['X_depth'] = df.apply(
        lambda row: (self.transM[0][0] * row.X + self.transM[0][1] * row.Y + self.transM[0][2]) /
                    (self.transM[2][0] * row.X + self.transM[2][1] * row.Y + self.transM[2][2]),
        axis=1)
    clusterData['Y_depth'] = df.apply(
        lambda row: (self.transM[1][0] * row.X + self.transM[1][1] * row.Y + self.transM[1][2]) /
                    (self.transM[2][0] * row.X + self.transM[2][1] * row.Y + self.transM[2][2]),
        axis=1)
    clusterData.to_csv(self.localClusterDirectory + self.clusterFile, sep='\t')
    clusterData = pd.read_csv(self.localClusterDirectory + self.clusterFile, sep='\t', header=0)

    # Identify rows for manual labeling
    manualClips = 0
    smallClips = 0
    cap = cv2.VideoCapture(self.localMasterDirectory + self.videofile)
    framerate = cap.get(cv2.CAP_PROP_FPS)
    for row in clusterData.sample(n=clusterData.shape[0]).itertuples():
        if manualClips > Nclips:
            break
        LID, N, t, x, y = row.LID, row.N, row.t, row.X, row.Y
        if (x - delta_xy < 0 or x + delta_xy >= self.height or
                y - delta_xy < 0 or y + delta_xy >= self.width or
                LID == -1 or
                framerate * t - delta_t < 0 or framerate * t + delta_t >= self.frames):
            continue
        if smallClips > Nclips / 20:
            continue
        clusterData.loc[clusterData.LID == LID, 'ManualAnnotation'] = 'Yes'
        manualClips += 1
        if N < smallLimit:
            smallClips += 1
    clusterData.to_csv(self.localClusterDirectory + self.clusterFile, sep='\t')
    subprocess.call(['rclone', 'sync', self.localClusterDirectory, self.cloudClusterDirectory],
                    stderr=self.fnull)
    self.clusterData = clusterData
    self.createClusterClips()
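The X_depth and Y_depth columns above apply a 3x3 projective transform (self.transM) row by row through DataFrame.apply. Below is a vectorized sketch of the same mapping, assuming transM is a 3x3 numpy array; the helper name project_points is illustrative and not part of the original class.

import numpy as np

def project_points(transM, xs, ys):
    """Apply a 3x3 projective transform to pixel coordinates (illustrative helper)."""
    pts = np.stack([xs, ys, np.ones_like(xs)], axis=0)   # homogeneous coordinates, shape (3, n)
    mapped = transM @ pts                                # shape (3, n)
    return mapped[0] / mapped[2], mapped[1] / mapped[2]  # normalized (X_depth, Y_depth)

# e.g. x_depth, y_depth = project_points(np.asarray(transM), df['X'].to_numpy(), df['Y'].to_numpy())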
def load_traj(dname):
    # ids start from 1
    gr = './data/' + dname + '_group.txt'
    file = open(gr, 'a')  # note: this handle is never used
    g = open(gr, 'wb')
    # data (num_visit, [u, l, t]), +1 to avoid 0 as padding
    # poi (num_loc, [l, lat, lon])
    group = []  # (user, *num, ?, [id, lat, lon])
    poi = np.load('./data/' + dname + '_POI.npy')  # location id starts from 1
    data = np.load('./data/' + dname + '_data.npy') + 1
    data = data[np.argsort(data[:, 0])]  # sort by user
    delta_t = 60 * 24 * 1  # min * hour * day
    trajs_tmp, labels_tmp, traj, his_seq, tmp, u_id = [], [], [], np.array([]), [], 1
    train_trajs, test_trajs, train_labels, test_labels = [], [], [], []
    # re_user_id = 1
    for item in data:  # item ([u, l, t])
        if u_id == item[0]:  # collect all records for user u_id
            tmp.append(item)
        else:  # build the time-ranked sequence and move on to the next user
            if len(tmp) == 0:  # if u_id does not appear in the data, continue
                continue
            tmp = np.array(tmp)
            his_seq = tmp[np.argsort(tmp[:, 2])].copy()  # sort by time, (num_id, [u, l, t])
            # his_seq[:, 0] = re_user_id  # delete redundant user ids
            visited_locs = his_seq[:, 1]
            if len(visited_locs) == 0:
                continue
            loc_ids = poi[:, 0]
            pos = []
            for visited_loc in visited_locs:
                pos.append(poi[np.where(loc_ids == visited_loc)][0])
            pos = np.array(pos)  # POI rows of all the user's visited points (repeated)
            DB = DBSCAN(eps=0.002, min_samples=10).fit(pos[:, 1:]).labels_  # eps 0.01 -- dis 0.84174
            num_group = DB.max() + 1  # cluster labels run from 0 to max
            neg_num = len(DB[np.where(DB == -1)])
            print(DB)
            if len(DB) < neg_num * 2:
                print('no!')
                continue
            base = []  # (num, ?, [id, lat, lon])
            region, reg_tmp = [], []
            for num in range(num_group):
                base.append(pos[np.where(DB == num)].tolist())
                for point in poi.tolist():
                    for every in base[num]:
                        # collect all POIs within the region: closer than the threshold to at least one base point
                        # compare lat/lon only (point[1:]); including the id column would dominate the distance
                        if euclidean(point[1:], every[1:]) < 0.005:  # half it here
                            reg_tmp.append(point[0])
                            break
                region.append(reg_tmp)  # region (num, ?)
                print(len(reg_tmp))
                reg_tmp = []
            # build trajectories from the historical sequence
            for j, each in enumerate(his_seq):  # each ([u, l, t])
                each = each.tolist()
                if j == 0:
                    traj.append(each)  # traj (len_traj, [u, l, t])
                    continue
                if each[2] - his_seq[j - 1, 2] <= delta_t:
                    traj.append(each)
                else:
                    if len(traj) > 3:  # trajectories with fewer than 3 check-ins are removed
                        # (?num_traj, ?len_traj-1, [u, l, t])
                        trajs_tmp.append(torch.LongTensor(traj[:-1]))
                        labels_tmp.append(traj[-1][1])  # location of the last check-in
                    traj = [each]
            if len(traj) > 3:
                trajs_tmp.append(torch.LongTensor(traj[:-1]))
                labels_tmp.append(traj[-1][1])
            traj = []
            tmp = []
            tmp.append(item)
            # users with fewer than 5 trajectories are removed
            if len(trajs_tmp) > 5:
                train_trajs = train_trajs + trajs_tmp[:-1]
                test_trajs = test_trajs + trajs_tmp[-1:]
                train_labels = train_labels + labels_tmp[:-1]
                test_labels = test_labels + labels_tmp[-1:]
                # re_user_id += 1
            trajs_tmp = []
            labels_tmp = []
            u_id += 1
    return [train_trajs, test_trajs], [train_labels, test_labels]  # (N, *len, 3), (N)
ax.set_zlabel('Z', fontdict={'size': 10, 'color': 'red'})
ax.set_ylabel('Y', fontdict={'size': 10, 'color': 'red'})
ax.set_xlabel('X', fontdict={'size': 10, 'color': 'red'})
ax.view_init(elev=30, azim=20)
plt.title("DBSCAN Result")
plt.show()

# print("TSNEing")
# tsne = TSNE(n_components=2)
# tsne_result = tsne.fit_transform(three_data_result)
# print(tsne_result.shape)
# plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=db_classification_result)
# plt.title("DBSCAN After TSNE")
# plt.show()

print("Number of clusters: %d" % (db_classification_result.max() + 1))

# ---- Hierarchical ----
start = time.perf_counter()  # time.clock() was removed in Python 3.8; perf_counter() is the replacement
hierarchical_result = AgglomerativeClustering(
    n_clusters=4, linkage="complete").fit_predict(three_data)
elapsed = time.perf_counter() - start
print("Hierarchical time consumed ", elapsed)
np.save("Hierarchical/hierarchical_complete_class_result", hierarchical_result)

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(three_data[:, 0], three_data[:, 1], three_data[:, 2], c=hierarchical_result)
def _createClusters(self):
    print(' Creating clusters from HMM transitions,,Time: ' + str(datetime.datetime.now()))

    # Load in HMM data
    hmmObj = HA(self.videoObj.localHMMFile)

    # Convert into coords object and save it
    coords = hmmObj.retDBScanMatrix(self.projFileManager.minMagnitude)
    np.save(self.videoObj.localRawCoordsFile, coords)

    # Run data in batches to avoid RAM overuse
    sortData = coords[coords[:, 0].argsort()][:, 0:3]  # sort data by time for batch processing, throwing out 4th column (magnitude)
    numBatches = int(sortData[-1, 0] / self.projFileManager.delta / 3600) + 1  # delta is number of hours to batch together; can be a fraction
    sortData[:, 0] = sortData[:, 0] * self.projFileManager.timeScale  # scale time so that time distances between transitions are comparable to spatial differences
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)  # initialize labels

    # Calculate clusters in batches to avoid RAM overuse
    curr_label = 0  # labels for each batch start from zero - need to offset these
    print(' ' + str(numBatches) + ' total batches. On batch: ', end='', flush=True)
    for i in range(numBatches):
        print(str(i) + ',', end='', flush=True)
        min_time = i * self.projFileManager.delta * self.projFileManager.timeScale * 3600
        max_time = (i + 1) * self.projFileManager.delta * self.projFileManager.timeScale * 3600  # 3600 seconds per hour, rescaled by timeScale
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=self.projFileManager.treeR, metric='minkowski', p=2,
                             algorithm='kd_tree', leaf_size=self.projFileManager.leafNum,
                             n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index],
                                        self.projFileManager.neighborR, 'distance')
        sub_label = DBSCAN(eps=self.projFileManager.eps, min_samples=self.projFileManager.minPts,
                           metric='precomputed', n_jobs=self.workers).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels
    print()

    # Concatenate and save information
    sortData[:, 0] = sortData[:, 0] / self.projFileManager.timeScale
    labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.videoObj.localLabeledCoordsFile, labeledCoords)

    print(' Concatenating and summarizing clusters,,Time: ' + str(datetime.datetime.now()))
    df = pd.DataFrame(labeledCoords, columns=['T', 'X', 'Y', 'LID'])
    clusterData = df.groupby('LID').apply(
        lambda x: pd.Series({
            'projectID': self.lp.projectID,
            'videoID': self.videoObj.baseName,
            'N': x['T'].count(),
            't': int(x['T'].mean()),
            'X': int(x['X'].mean()),
            'Y': int(x['Y'].mean()),
            't_span': int(x['T'].max() - x['T'].min()),
            'X_span': int(x['X'].max() - x['X'].min()),
            'Y_span': int(x['Y'].max() - x['Y'].min()),
            'ManualAnnotation': 'No',
            'ManualLabel': '',
            'ClipCreated': 'No',
            'DepthChange': np.nan,
        }))
    clusterData['TimeStamp'] = clusterData.apply(
        lambda row: self.videoObj.startTime + datetime.timedelta(seconds=int(row.t)),
        axis=1)
    clusterData['ClipName'] = clusterData.apply(
        lambda row: '__'.join([str(x) for x in [self.lp.projectID, self.videoObj.baseName,
                                                row.name, row.N, row.t, row.X, row.Y]]),
        axis=1)

    # Identify clusters to make clips for
    # self._print('Identifying clusters to make clips for', log=False)
    delta_xy = self.projFileManager.delta_xy
    delta_t = self.projFileManager.delta_t
    smallClips, clipsCreated = 0, 0  # keep track of clips with a small number of pixel changes
    for row in clusterData.sample(n=clusterData.shape[0]).itertuples():  # randomly go through the dataframe
        LID, N, t, x, y, time = row.Index, row.N, row.t, row.X, row.Y, row.TimeStamp
        if (x - delta_xy < 0 or x + delta_xy >= self.videoObj.height or
                y - delta_xy < 0 or y + delta_xy >= self.videoObj.width):
            continue
        # Check temporal compatibility (part a):
        elif self.videoObj.framerate * t - delta_t < 0 or LID == -1:
            continue
        # Check temporal compatibility (part b):
        elif time < self.lightsOnTime or time > self.lightsOffTime:
            continue
        else:
            clusterData.loc[clusterData.index == LID, 'ClipCreated'] = 'Yes'
            if N < self.projFileManager.smallLimit:
                if smallClips > self.videoObj.nManualLabelClips / 20:
                    continue
                smallClips += 1
            if clipsCreated < self.videoObj.nManualLabelClips:
                clusterData.loc[clusterData.index == LID, 'ManualAnnotation'] = 'Yes'
                clipsCreated += 1
    clusterData.to_csv(self.videoObj.localLabeledClustersFile, sep=',')
    self.clusterData = clusterData
g_all_frag_clusters["fs_cluster"][clustering[i]] = [g_all_fs_frag_descrs[i]] # 处理不含FS的分段 for i in range(non_fs_frag_num): frag_t = g_all_non_fs_frag_descrs[i].fragment_type if frag_t in g_all_frag_clusters["non_fs_cluster"]: g_all_frag_clusters["non_fs_cluster"][frag_t].append(g_all_non_fs_frag_descrs[i]) else: g_all_frag_clusters["non_fs_cluster"][frag_t] = [g_all_non_fs_frag_descrs[i]] # check_all_clusters() # 大类分裂 max_fs_cluster_num = clustering.max() max_non_fs_cluster_num = 9 # 待删除的超大小限制的类名 del_fs_cluster_array = [] del_non_fs_cluster_array = [] # 分裂出的新类 new_fs_cluster_array = [] new_non_fs_cluster_array = [] # 含FS for (clu_name, cluster) in g_all_frag_clusters["fs_cluster"].items(): # 如果类中只有一个分段 if len(cluster) == 1: continue
def _createClusters(self):
    print(' Creating clusters from HMM transitions,,Time: ' + str(datetime.datetime.now()))

    # Load in HMM data
    hmmObj = HA(self.output_directory + self.baseName + '_hmm')

    # Convert into coords object and save it
    coords = hmmObj.retDBScanMatrix(self.minMagnitude)
    np.save(self.output_directory + self.baseName + '_coords.npy', coords)

    # Run data in batches to avoid RAM overuse
    sortData = coords[coords[:, 0].argsort()][:, 0:3]  # sort data by time for batch processing, throwing out 4th column (magnitude)
    numBatches = int(sortData[-1, 0] / self.delta / 3600) + 1  # delta is number of hours to batch together; can be a fraction
    sortData[:, 0] = sortData[:, 0] * self.timeScale  # scale time so that time distances between transitions are comparable to spatial differences
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)  # initialize labels

    # Calculate clusters in batches to avoid RAM overuse
    curr_label = 0  # labels for each batch start from zero - need to offset these
    print(' ' + str(numBatches) + ' total batches. On batch: ', end='', flush=True)
    for i in range(numBatches):
        print(str(i) + ',', end='', flush=True)
        min_time = i * self.delta * self.timeScale * 3600
        max_time = (i + 1) * self.delta * self.timeScale * 3600  # 3600 seconds per hour, rescaled by timeScale
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=self.treeR, metric='minkowski', p=2, algorithm='kd_tree',
                             leaf_size=self.leafNum, n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index], self.neighborR, 'distance')
        sub_label = DBSCAN(eps=self.eps, min_samples=self.minPts, metric='precomputed',
                           n_jobs=self.workers).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels
    print()

    # Concatenate and save information
    sortData[:, 0] = sortData[:, 0] / self.timeScale
    labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.output_directory + self.baseName + '.labeledCoords.npy', labeledCoords)

    print(' Concatenating and summarizing clusters,,Time: ' + str(datetime.datetime.now()))
    df = pd.DataFrame(labeledCoords, columns=['T', 'X', 'Y', 'LID'])
    clusterData = df.groupby('LID').apply(
        lambda x: pd.Series({
            'N': x['T'].count(),
            't': int(x['T'].mean()),
            'X': int(x['X'].mean()),
            'Y': int(x['Y'].mean()),
            't_span': int(x['T'].max() - x['T'].min()),
            'X_span': int(x['X'].max() - x['X'].min()),
            'Y_span': int(x['Y'].max() - x['Y'].min()),
        }))
    clusterData.to_csv(self.output_directory + self.baseName + '.clusters.csv', sep=',')