def group_region(fname, traj, u_dim):
    poi = np.load('./data/' + fname + '_POI.npy')  # location id starts from 1
    gr = './data/' + fname + '_group.txt'
    file = open(gr, 'a')  # note: this handle is never used; the pickle is written through g below
    g = open(gr, 'wb')
    group = []  # (user, *num, ?, [id, lat, lon])
    for i in range(u_dim):
        user = []
        for item in traj:
            if item[0, 0].item() == i + 1:  # the user id this trajectory belongs to is i + 1
                user += item[:, 1].numpy().tolist()  # visited points in this trajectory
        if not user:
            continue
        locs = poi[:, 0]
        pos = []
        for item in user:
            pos.append(poi[np.where(locs == item)][0])
        pos = np.array(pos)  # POI rows of all the user's visited points (repeated)
        DB = DBSCAN(eps=0.002, min_samples=10).fit(pos[:, 1:]).labels_  # eps 0.01 -- dis 0.84174
        base = []  # (num, ?, [id, lat, lon])
        region, tmp = [], []
        num_group = DB.max() + 1  # cluster labels run from 0 to max
        for num in range(num_group):
            base.append(pos[np.where(DB == num)].tolist())
            for point in poi.tolist():
                for each in base[num]:
                    # collect all POIs within the region: closer than the threshold to at least one base point
                    # compare lat/lon only (point[1:]); including the id column would dominate the distance
                    if euclidean(point[1:], each[1:]) < 0.001:  # half it here
                        tmp.append(point[0])
                        break
            region.append(tmp)  # region (num, ?)
            print(len(tmp))
            tmp = []
        print('...')
        group.append(region)
    pickle.dump(group, g)
    g.close()
    return group  # (user, *num, ?)
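group_region runs DBSCAN directly on raw (lat, lon) values, so eps=0.002 is expressed in degrees rather than meters (hence the "eps 0.01 -- dis 0.84174" note). The rough converter below is not part of the original code; it uses the ~111.32 km-per-degree approximation only to make the implied spatial radius explicit, and the function name is illustrative.

import math

def degrees_eps_to_km(eps_degrees, latitude_deg=0.0):
    """Rough conversion of a DBSCAN eps given in lat/lon degrees to kilometers.

    One degree of latitude is about 111.32 km; one degree of longitude shrinks
    with cos(latitude). This is an approximation, valid only for small eps.
    """
    lat_km = eps_degrees * 111.32
    lon_km = eps_degrees * 111.32 * math.cos(math.radians(latitude_deg))
    return lat_km, lon_km

# e.g. eps=0.002 corresponds to roughly 0.22 km north-south at the equator
print(degrees_eps_to_km(0.002))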
def merge_close_lines(self, lines, threshold):
    lines = np.asarray(lines)
    distances = np.reshape(np.abs(lines[:, 0]), (len(lines), 1))
    clusters = DBSCAN(eps=threshold, min_samples=1).fit_predict(distances)
    merged_lines = []
    for cluster_id in range(0, clusters.max() + 1):
        lines_to_merge = np.asarray(lines[clusters == cluster_id])
        idxs = np.array(range(0, lines_to_merge.shape[0]))
        res = np.argsort(np.abs(lines_to_merge[:, 0]))
        sorted_lines = lines_to_merge[res]
        sorted_idxs = idxs[res]
        merged_lines.append(lines_to_merge[sorted_idxs[int(lines_to_merge.shape[0] / 2)], :])
    return merged_lines
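A minimal usage sketch for merge_close_lines, assuming the input lines are (rho, theta) pairs such as those produced by a Hough transform; the sample values are made up, and None is passed for the unused self parameter purely for illustration. It groups lines whose |rho| values fall within threshold of each other and keeps the median-|rho| line of each group.

import numpy as np
from sklearn.cluster import DBSCAN

# Hypothetical (rho, theta) lines: two pairs of near-duplicates.
lines = [(100.0, 0.01), (102.0, 0.02), (250.0, 1.55), (251.5, 1.56)]
merged = merge_close_lines(None, lines, threshold=5.0)
print(merged)  # one representative (rho, theta) per group of nearby rho values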
def dbscan(similarity, static_fc, sub):
    dbscan = DBSCAN(metric="euclidean").fit_predict(similarity[sub])
    plt.plot(np.asarray(dbscan) + 2)
    plt.title("State transitions", fontsize=20)
    plot_matrix(static_fc[sub])
    plt.title("Static connectivity", fontsize=20)
    plot_matrix(similarity[sub])
    plt.title("Similarity matrix", fontsize=20)
    n = 1
    for cluster in range(-1, dbscan.max() + 1):
        label = dbscan == cluster
        percent = np.sum(label / len(label) * 100)
        max_seq = maximum_sequence(label, cluster)
        mean_mat = np.mean(all_FC_sl[1][label], axis=0)
        # plot_matrix(mean_mat)
        mean_matrices.append(mean_mat)
        plot_matrix(mean_mat, auto_fit=True, vmin=static_fc[sub].min(),
                    vmax=static_fc[sub].max())  # , axes=ax
        plt.title("State %d" % n, fontsize=20)
        plt.suptitle("Percent of occurrence: %d%%" % percent, backgroundcolor="white")
        n += 1
def createClusters(self, minMagnitude=0, treeR=22, leafNum=190, neighborR=22,
                   timeScale=10, eps=18, minPts=90, delta=1.0):
    # self.loadVideo()
    self.loadHMM()
    self._print('Created ' + self.labeledCoordsFile)
    coords = self.obj.retDBScanMatrix(minMagnitude)
    np.save(self.localClusterDirectory + 'RawCoords.npy', coords)
    # subprocess.call(['rclone', 'copy', self.localClusterDirectory + 'RawCoordsFile.npy',
    #                  self.cloudClusterDirectory], stderr=self.fnull)
    sortData = coords[coords[:, 0].argsort()][:, 0:3]  # sort data by time for batch processing, throwing out 4th column (magnitude)
    numBatches = int(sortData[-1, 0] / delta / 3600) + 1  # delta is number of hours to batch together; can be a fraction
    sortData[:, 0] = sortData[:, 0] * timeScale  # scale time so that time distances between transitions are comparable to spatial differences
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)

    # Calculate clusters in batches to avoid RAM overuse
    curr_label = 0  # labels for each batch start from zero - need to offset these
    print('Calculating clusters in ' + str(numBatches) + ' total batches', file=sys.stderr)
    for i in range(numBatches):
        print('Batch: ' + str(i), file=sys.stderr)
        min_time, max_time = i * delta * timeScale * 3600, (i + 1) * delta * timeScale * 3600  # 3600 seconds per hour, rescaled by timeScale
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=treeR, metric='minkowski', p=2, algorithm='kd_tree',
                             leaf_size=leafNum, n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index], neighborR, 'distance')
        sub_label = DBSCAN(eps=eps, min_samples=minPts, metric='precomputed',
                           n_jobs=24).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels

    sortData[:, 0] = sortData[:, 0] / timeScale
    self.labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.localClusterDirectory + self.labeledCoordsFile, self.labeledCoords)
    subprocess.call(['rclone', 'copy', self.localClusterDirectory + self.labeledCoordsFile,
                     self.cloudClusterDirectory], stderr=self.fnull)
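The batching above builds a sparse radius-neighbors graph and hands it to DBSCAN with metric='precomputed'. A self-contained sketch of that pattern on synthetic (t, x, y) points follows; the blob locations, the random seed, and the eps/min_samples values are illustrative only, not the parameters used by this class.

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

# Synthetic (t, x, y) points: two dense blobs plus sparse uniform noise.
rng = np.random.default_rng(0)
blob_a = rng.normal(loc=[100, 50, 50], scale=2, size=(200, 3))
blob_b = rng.normal(loc=[500, 300, 200], scale=2, size=(200, 3))
noise = rng.uniform(low=0, high=600, size=(50, 3))
points = np.vstack([blob_a, blob_b, noise])

# Sparse distance graph within a fixed radius, then DBSCAN on the precomputed graph;
# pairs missing from the sparse graph are treated as farther apart than eps.
nn = NearestNeighbors(radius=22, metric='minkowski', p=2, algorithm='kd_tree').fit(points)
dist_graph = nn.radius_neighbors_graph(points, 22, mode='distance')
labels = DBSCAN(eps=18, min_samples=20, metric='precomputed').fit_predict(dist_graph)
print('clusters found:', labels.max() + 1, '| noise points:', np.sum(labels == -1))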
def Signatures():

    # a few definitions ...
    tph_bins = array(['TPH_C06', 'TPH_C07', 'TPH_C08', 'TPH_C09-C10', 'TPH_C11-C12',
                      'TPH_C13-C14', 'TPH_C15-C16', 'TPH_C17-C18', 'TPH_C19-C20',
                      'TPH_C21-C22', 'TPH_C23-C24', 'TPH_C25-C28', 'TPH_C29-C32',
                      'TPH_C33-C36'])
    fuels = array(['gasoline', 'diesel', 'kerosene', 'bunker C', 'heavy fuel oil', 'crude oil'])
    stretch = 5.            # vertical exaggeration factor, used to calculate distance matrices
    report = linspace(start=10., stop=90., num=9, endpoint=True)  # percentile classes used to process distance matrices
    N = 100                 # number of samples to include in each synthetic reference fuel population
    num_K_clusters = 6      # number of K-means clusters to assign
    eps = 0.1               # difference tolerance, DBSCAN cluster analysis
    min_samples = 10        # minimum number of samples for DBSCAN clusters

    # read data sets ...
    soil_TPH_df = ReadSoilData(tph_bins, 0.)    # consider all reported samples in data sets, including those with values of 0.
    print 'Read and processed all site soil data.'
    locations_df = read_csv('survey.txt', sep='\t')
    print 'Read boring locations.'
    soil_TPH_df = merge(locations_df, soil_TPH_df, on='Location ID', how='inner')
    elev = array(soil_TPH_df['Surface Elevation(ft-msl)'] - soil_TPH_df['Depth'])
    soil_TPH_df.insert(5, 'elev', elev)
    print 'Merged soil sample survey data with TPH data set.'

    # conduct cluster analyses to find general patterns in TPH data
    X = soil_TPH_df[tph_bins].values                                            # define feature subset
    k_means = KMeans(init='k-means++', n_clusters=num_K_clusters, n_init=25)    # K-means cluster analysis
    z = k_means.fit_predict(X)
    soil_TPH_df['kmeans_group'] = z                                             # append group indices to soil_TPH data frame
    centroids_df = DataFrame(k_means.cluster_centers_, columns=tph_bins)        # note cluster centroids and write to output file
    centroids_df.to_csv('centroids.csv')
    z = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)                 # DBSCAN cluster analysis
    soil_TPH_df['dbscan_group'] = z
    num_d_clusters = z.max() + 2
    print 'Conducted cluster analyses.'

    # tag data points using SVM algorithm on TPH data
    fuel_refs_df = read_csv('fuel_ref.txt', sep='\t')
    print 'Read fuel reference compositions.'
    training_set_df = CreateTraining(fuels, tph_bins, N, fuel_refs_df)  # generate synthetic reference fuel populations (for training sets)
    X = training_set_df[tph_bins].values    # define feature subset of training set
    y = training_set_df['tag'].values       # define targets of training set
    C = 1.0                                 # fit model (C = SVM regularization parameter)
    lin_svc = svm.LinearSVC(C=C).fit(X, y)
    Z = soil_TPH_df[tph_bins].values        # use model to classify the test set
    z = lin_svc.predict(Z)
    soil_TPH_df['svm_predict'] = z          # append fuel 'tags' to soil_TPH data frame
    print 'Conducted support-vector-machine classification analysis.'

    # write output files (for both clustering and SVM) ...
    soil_TPH_df.to_csv('soil_TPH.csv')      # write fully processed soil hydrocarbon data sets to output files
    for fuel_type in fuels:
        soil_TPH_df[soil_TPH_df['svm_predict'] == fuel_type].to_csv(fuel_type + '.csv')    # write output files by tagged signature
    for i in xrange(num_K_clusters):
        soil_TPH_df[soil_TPH_df['kmeans_group'] == i].to_csv('kgroup_' + str(i) + '.csv')  # write output files by K-means group index
    for i in xrange(num_d_clusters):
        soil_TPH_df[soil_TPH_df['dbscan_group'] == i - 1].to_csv('dgroup_' + str(i) + '.csv')  # write output files by DBSCAN group index

    # compare distribution of point-to-point distances, within classes and between classes, as a measure of randomness of scatter
    for i, fuel_type in enumerate(fuels):   # distance arrays, by tag (i.e., SVM designation)
        points = soil_TPH_df[soil_TPH_df['svm_predict'] == fuel_type][['Easting', 'Northing', 'elev']]
        points['elev'] *= stretch
        percents = DistDistrib(points, report)
        if i:
            dist_matrix = dstack((dist_matrix, percents))
        else:
            dist_matrix = percents
    tag_df = DataFrame(transpose(dist_matrix[0]))
    tag_df.columns = report.astype(str)
    tag_df.insert(0, 'category', fuels)

    for i in xrange(num_K_clusters):        # distance arrays, by cluster: K-means designation
        points = soil_TPH_df[soil_TPH_df['kmeans_group'] == i][['Easting', 'Northing', 'elev']]
        points['elev'] *= stretch
        percents = DistDistrib(points, report)
        if i:
            dist_matrix = dstack((dist_matrix, percents))
        else:
            dist_matrix = percents
    kcluster_df = DataFrame(transpose(dist_matrix[0]))
    kcluster_df.columns = report.astype(str)

    for i in xrange(num_d_clusters):        # distance arrays, by cluster: DBSCAN designation
        points = soil_TPH_df[soil_TPH_df['dbscan_group'] == i - 1][['Easting', 'Northing', 'elev']]
        points['elev'] *= stretch
        percents = DistDistrib(points, report)
        if i:
            dist_matrix = dstack((dist_matrix, percents))
        else:
            dist_matrix = percents
    dcluster_df = DataFrame(transpose(dist_matrix[0]))
    dcluster_df.columns = report.astype(str)

    # distance array for all soil samples
    points = soil_TPH_df[['Easting', 'Northing', 'elev']]
    points['elev'] *= stretch
    percents = DistDistrib(points, report)
    all_df = DataFrame(percents.reshape((1, -1)), columns=report.astype(str))
    all_df.columns = report.astype(str)

    # summarize distances by cluster
    kcluster_all_df = kcluster_df.append(all_df, ignore_index=True)
    kcluster_all_df.to_csv('report_k_cluster.csv')
    dcluster_all_df = dcluster_df.append(all_df, ignore_index=True)
    dcluster_all_df.to_csv('report_d_cluster.csv')

    # summarize distances by SVM tags
    all_df.insert(0, 'category', 'ALL')
    tag_all_df = tag_df.append(all_df, ignore_index=True)
    tag_all_df.to_csv('report_tag.csv')
    print 'Analyzed sample-to-sample distances among sets.'

    print 'Done.'
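DBSCAN marks noise points with the label -1, which is why Signatures sets num_d_clusters = z.max() + 2 and indexes the groups with i - 1, so the noise group gets its own output file. A tiny illustration of that convention follows; the sample values are made up and the eps/min_samples settings are only for the example.

import numpy as np
from sklearn.cluster import DBSCAN

z = DBSCAN(eps=0.5, min_samples=2).fit_predict(np.array([[0.0], [0.1], [5.0], [5.1], [99.0]]))
print(z)                       # e.g. [ 0  0  1  1 -1]; -1 marks noise
num_d_clusters = z.max() + 2   # +2 so the noise group (-1) is counted alongside clusters 0..max
for i in range(num_d_clusters):
    print('dgroup', i - 1, ':', int(np.sum(z == i - 1)), 'samples')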
def createClusters(self, minMagnitude=0, treeR=22, leafNum=190, neighborR=22,
                   timeScale=10, eps=18, minPts=90, delta=1.0, Nclips=200,
                   delta_xy=100, delta_t=60, smallLimit=500):
    self.loadVideo()
    self.loadHMM()
    self._print('Clustering HMM transitions using DBScan')
    coords = self.obj.retDBScanMatrix(minMagnitude)
    np.save(self.localClusterDirectory + 'RawCoords.npy', coords)
    # subprocess.call(['rclone', 'copy', self.localClusterDirectory + 'RawCoordsFile.npy',
    #                  self.cloudClusterDirectory], stderr=self.fnull)
    sortData = coords[coords[:, 0].argsort()][:, 0:3]  # sort data by time for batch processing, throwing out 4th column (magnitude)
    numBatches = int(sortData[-1, 0] / delta / 3600) + 1  # delta is number of hours to batch together; can be a fraction
    sortData[:, 0] = sortData[:, 0] * timeScale  # scale time so that time distances between transitions are comparable to spatial differences
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)

    # Calculate clusters in batches to avoid RAM overuse
    curr_label = 0  # labels for each batch start from zero - need to offset these
    print('Calculating clusters in ' + str(numBatches) + ' total batches', file=sys.stderr)
    for i in range(numBatches):
        print('Batch: ' + str(i), file=sys.stderr)
        min_time, max_time = i * delta * timeScale * 3600, (i + 1) * delta * timeScale * 3600  # 3600 seconds per hour, rescaled by timeScale
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=treeR, metric='minkowski', p=2, algorithm='kd_tree',
                             leaf_size=leafNum, n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index], neighborR, 'distance')
        sub_label = DBSCAN(eps=eps, min_samples=minPts, metric='precomputed',
                           n_jobs=24).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels

    sortData[:, 0] = sortData[:, 0] / timeScale
    self.labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.localClusterDirectory + self.labeledCoordsFile, self.labeledCoords)
    subprocess.call(['rclone', 'copy', self.localClusterDirectory + self.labeledCoordsFile,
                     self.cloudClusterDirectory], stderr=self.fnull)

    uniqueLabels = set(self.labeledCoords[:, 3])
    uniqueLabels.remove(-1)
    print(str(self.labeledCoords[self.labeledCoords[:, 3] != -1].shape[0]) +
          ' HMM transitions assigned to ' + str(len(uniqueLabels)) + ' clusters',
          file=sys.stderr)

    df = pd.DataFrame(self.labeledCoords, columns=['T', 'X', 'Y', 'LID'])
    clusterData = df.groupby('LID').apply(
        lambda x: pd.Series({
            'projectID': self.projectID,
            'videoID': self.baseName,
            'N': x['T'].count(),
            't': int(x['T'].mean()),
            'X': int(x['X'].mean()),
            'Y': int(x['Y'].mean()),
            't_span': int(x['T'].max() - x['T'].min()),
            'X_span': int(x['X'].max() - x['X'].min()),
            'Y_span': int(x['Y'].max() - x['Y'].min()),
            'ManualAnnotation': 'No',
            'ManualLabel': '',
            'MLLabel': ''
        }))
    clusterData['X_depth'] = df.apply(
        lambda row: (self.transM[0][0] * row.X + self.transM[0][1] * row.Y + self.transM[0][2]) /
                    (self.transM[2][0] * row.X + self.transM[2][1] * row.Y + self.transM[2][2]),
        axis=1)
    clusterData['Y_depth'] = df.apply(
        lambda row: (self.transM[1][0] * row.X + self.transM[1][1] * row.Y + self.transM[1][2]) /
                    (self.transM[2][0] * row.X + self.transM[2][1] * row.Y + self.transM[2][2]),
        axis=1)
    clusterData.to_csv(self.localClusterDirectory + self.clusterFile, sep='\t')
    clusterData = pd.read_csv(self.localClusterDirectory + self.clusterFile, sep='\t', header=0)

    # Identify rows for manual labeling
    manualClips = 0
    smallClips = 0
    cap = cv2.VideoCapture(self.localMasterDirectory + self.videofile)
    framerate = cap.get(cv2.CAP_PROP_FPS)
    for row in clusterData.sample(n=clusterData.shape[0]).itertuples():
        if manualClips > Nclips:
            break
        LID, N, t, x, y = row.LID, row.N, row.t, row.X, row.Y
        if (x - delta_xy < 0 or x + delta_xy >= self.height or
                y - delta_xy < 0 or y + delta_xy >= self.width or
                LID == -1 or
                framerate * t - delta_t < 0 or framerate * t + delta_t >= self.frames):
            continue
        if smallClips > Nclips / 20:
            continue
        clusterData.loc[clusterData.LID == LID, 'ManualAnnotation'] = 'Yes'
        manualClips += 1
        if N < smallLimit:
            smallClips += 1
    clusterData.to_csv(self.localClusterDirectory + self.clusterFile, sep='\t')
    subprocess.call(['rclone', 'sync', self.localClusterDirectory, self.cloudClusterDirectory],
                    stderr=self.fnull)
    self.clusterData = clusterData
    self.createClusterClips()
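The X_depth and Y_depth columns above apply a 3x3 projective transform (self.transM) row by row through DataFrame.apply. Below is a vectorized sketch of the same mapping, assuming transM is a 3x3 numpy array; the helper name project_points is illustrative and not part of the original class.

import numpy as np

def project_points(transM, xs, ys):
    """Apply a 3x3 projective transform to pixel coordinates (illustrative helper)."""
    pts = np.stack([xs, ys, np.ones_like(xs)], axis=0)   # homogeneous coordinates, shape (3, n)
    mapped = transM @ pts                                # shape (3, n)
    return mapped[0] / mapped[2], mapped[1] / mapped[2]  # normalized (X_depth, Y_depth)

# e.g. x_depth, y_depth = project_points(np.asarray(transM), df['X'].to_numpy(), df['Y'].to_numpy())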
def load_traj(dname):
    # ids start from 1
    gr = './data/' + dname + '_group.txt'
    file = open(gr, 'a')  # note: this handle is never used
    g = open(gr, 'wb')
    # data (num_visit, [u, l, t]), +1 to avoid 0 as padding
    # poi (num_loc, [l, lat, lon])
    group = []  # (user, *num, ?, [id, lat, lon])
    poi = np.load('./data/' + dname + '_POI.npy')  # location id starts from 1
    data = np.load('./data/' + dname + '_data.npy') + 1
    data = data[np.argsort(data[:, 0])]  # sort by user
    delta_t = 60 * 24 * 1  # min * hour * day
    trajs_tmp, labels_tmp, traj, his_seq, tmp, u_id = [], [], [], np.array([]), [], 1
    train_trajs, test_trajs, train_labels, test_labels = [], [], [], []
    # re_user_id = 1
    for item in data:  # item ([u, l, t])
        if u_id == item[0]:  # collect all records for user u_id
            tmp.append(item)
        else:  # build the time-ranked sequence and move on to the next user
            if len(tmp) == 0:  # if u_id does not appear in the data, continue
                continue
            tmp = np.array(tmp)
            his_seq = tmp[np.argsort(tmp[:, 2])].copy()  # sort by time, (num_id, [u, l, t])
            # his_seq[:, 0] = re_user_id  # delete redundant user ids
            visited_locs = his_seq[:, 1]
            if len(visited_locs) == 0:
                continue
            loc_ids = poi[:, 0]
            pos = []
            for visited_loc in visited_locs:
                pos.append(poi[np.where(loc_ids == visited_loc)][0])
            pos = np.array(pos)  # POI rows of all the user's visited points (repeated)
            DB = DBSCAN(eps=0.002, min_samples=10).fit(pos[:, 1:]).labels_  # eps 0.01 -- dis 0.84174
            num_group = DB.max() + 1  # cluster labels run from 0 to max
            neg_num = len(DB[np.where(DB == -1)])
            print(DB)
            if len(DB) < neg_num * 2:
                print('no!')
                continue
            base = []  # (num, ?, [id, lat, lon])
            region, reg_tmp = [], []
            for num in range(num_group):
                base.append(pos[np.where(DB == num)].tolist())
                for point in poi.tolist():
                    for every in base[num]:
                        # collect all POIs within the region: closer than the threshold to at least one base point
                        # compare lat/lon only (point[1:]); including the id column would dominate the distance
                        if euclidean(point[1:], every[1:]) < 0.005:  # half it here
                            reg_tmp.append(point[0])
                            break
                region.append(reg_tmp)  # region (num, ?)
                print(len(reg_tmp))
                reg_tmp = []
            # build trajectories from the historical sequence
            for j, each in enumerate(his_seq):  # each ([u, l, t])
                each = each.tolist()
                if j == 0:
                    traj.append(each)  # traj (len_traj, [u, l, t])
                    continue
                if each[2] - his_seq[j - 1, 2] <= delta_t:
                    traj.append(each)
                else:
                    if len(traj) > 3:  # trajectories with fewer than 3 check-ins are removed
                        # (?num_traj, ?len_traj-1, [u, l, t])
                        trajs_tmp.append(torch.LongTensor(traj[:-1]))
                        labels_tmp.append(traj[-1][1])  # location of the last check-in
                    traj = [each]
            if len(traj) > 3:
                trajs_tmp.append(torch.LongTensor(traj[:-1]))
                labels_tmp.append(traj[-1][1])
            traj = []
            tmp = []
            tmp.append(item)
            # users with fewer than 5 trajectories are removed
            if len(trajs_tmp) > 5:
                train_trajs = train_trajs + trajs_tmp[:-1]
                test_trajs = test_trajs + trajs_tmp[-1:]
                train_labels = train_labels + labels_tmp[:-1]
                test_labels = test_labels + labels_tmp[-1:]
                # re_user_id += 1
            trajs_tmp = []
            labels_tmp = []
            u_id += 1
    return [train_trajs, test_trajs], [train_labels, test_labels]  # (N, *len, 3), (N)
ax.set_zlabel('Z', fontdict={'size': 10, 'color': 'red'})
ax.set_ylabel('Y', fontdict={'size': 10, 'color': 'red'})
ax.set_xlabel('X', fontdict={'size': 10, 'color': 'red'})
ax.view_init(elev=30, azim=20)
plt.title("DBSCAN Result")
plt.show()

# print("TSNEing")
# tsne = TSNE(n_components=2)
# tsne_result = tsne.fit_transform(three_data_result)
# print(tsne_result.shape)
# plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=db_classification_result)
# plt.title("DBSCAN After TSNE")
# plt.show()

print("Number of clusters: %d" % (db_classification_result.max() + 1))

# ---- Hierarchical ----
start = time.perf_counter()  # time.clock() was removed in Python 3.8; perf_counter() is the replacement
hierarchical_result = AgglomerativeClustering(
    n_clusters=4, linkage="complete").fit_predict(three_data)
elapsed = time.perf_counter() - start
print("Hierarchical time consumed ", elapsed)
np.save("Hierarchical/hierarchical_complete_class_result", hierarchical_result)

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(three_data[:, 0], three_data[:, 1], three_data[:, 2], c=hierarchical_result)
def _createClusters(self):
    print(' Creating clusters from HMM transitions,,Time: ' + str(datetime.datetime.now()))

    # Load in HMM data
    hmmObj = HA(self.videoObj.localHMMFile)

    # Convert into coords object and save it
    coords = hmmObj.retDBScanMatrix(self.projFileManager.minMagnitude)
    np.save(self.videoObj.localRawCoordsFile, coords)

    # Run data in batches to avoid RAM overuse
    sortData = coords[coords[:, 0].argsort()][:, 0:3]  # sort data by time for batch processing, throwing out 4th column (magnitude)
    numBatches = int(sortData[-1, 0] / self.projFileManager.delta / 3600) + 1  # delta is number of hours to batch together; can be a fraction
    sortData[:, 0] = sortData[:, 0] * self.projFileManager.timeScale  # scale time so that time distances between transitions are comparable to spatial differences
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)  # initialize labels

    # Calculate clusters in batches to avoid RAM overuse
    curr_label = 0  # labels for each batch start from zero - need to offset these
    print(' ' + str(numBatches) + ' total batches. On batch: ', end='', flush=True)
    for i in range(numBatches):
        print(str(i) + ',', end='', flush=True)
        min_time = i * self.projFileManager.delta * self.projFileManager.timeScale * 3600
        max_time = (i + 1) * self.projFileManager.delta * self.projFileManager.timeScale * 3600  # 3600 seconds per hour, rescaled by timeScale
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=self.projFileManager.treeR, metric='minkowski', p=2,
                             algorithm='kd_tree', leaf_size=self.projFileManager.leafNum,
                             n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index],
                                        self.projFileManager.neighborR, 'distance')
        sub_label = DBSCAN(eps=self.projFileManager.eps, min_samples=self.projFileManager.minPts,
                           metric='precomputed', n_jobs=self.workers).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels
    print()

    # Concatenate and save information
    sortData[:, 0] = sortData[:, 0] / self.projFileManager.timeScale
    labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.videoObj.localLabeledCoordsFile, labeledCoords)

    print(' Concatenating and summarizing clusters,,Time: ' + str(datetime.datetime.now()))
    df = pd.DataFrame(labeledCoords, columns=['T', 'X', 'Y', 'LID'])
    clusterData = df.groupby('LID').apply(
        lambda x: pd.Series({
            'projectID': self.lp.projectID,
            'videoID': self.videoObj.baseName,
            'N': x['T'].count(),
            't': int(x['T'].mean()),
            'X': int(x['X'].mean()),
            'Y': int(x['Y'].mean()),
            't_span': int(x['T'].max() - x['T'].min()),
            'X_span': int(x['X'].max() - x['X'].min()),
            'Y_span': int(x['Y'].max() - x['Y'].min()),
            'ManualAnnotation': 'No',
            'ManualLabel': '',
            'ClipCreated': 'No',
            'DepthChange': np.nan,
        }))
    clusterData['TimeStamp'] = clusterData.apply(
        lambda row: self.videoObj.startTime + datetime.timedelta(seconds=int(row.t)),
        axis=1)
    clusterData['ClipName'] = clusterData.apply(
        lambda row: '__'.join([str(x) for x in [self.lp.projectID, self.videoObj.baseName,
                                                row.name, row.N, row.t, row.X, row.Y]]),
        axis=1)

    # Identify clusters to make clips for
    # self._print('Identifying clusters to make clips for', log=False)
    delta_xy = self.projFileManager.delta_xy
    delta_t = self.projFileManager.delta_t
    smallClips, clipsCreated = 0, 0  # keep track of clips with a small number of pixel changes
    for row in clusterData.sample(n=clusterData.shape[0]).itertuples():  # randomly go through the dataframe
        LID, N, t, x, y, time = row.Index, row.N, row.t, row.X, row.Y, row.TimeStamp
        if (x - delta_xy < 0 or x + delta_xy >= self.videoObj.height or
                y - delta_xy < 0 or y + delta_xy >= self.videoObj.width):
            continue
        # Check temporal compatibility (part a):
        elif self.videoObj.framerate * t - delta_t < 0 or LID == -1:
            continue
        # Check temporal compatibility (part b):
        elif time < self.lightsOnTime or time > self.lightsOffTime:
            continue
        else:
            clusterData.loc[clusterData.index == LID, 'ClipCreated'] = 'Yes'
            if N < self.projFileManager.smallLimit:
                if smallClips > self.videoObj.nManualLabelClips / 20:
                    continue
                smallClips += 1
            if clipsCreated < self.videoObj.nManualLabelClips:
                clusterData.loc[clusterData.index == LID, 'ManualAnnotation'] = 'Yes'
                clipsCreated += 1
    clusterData.to_csv(self.videoObj.localLabeledClustersFile, sep=',')
    self.clusterData = clusterData
g_all_frag_clusters["fs_cluster"][clustering[i]] = [g_all_fs_frag_descrs[i]] # 处理不含FS的分段 for i in range(non_fs_frag_num): frag_t = g_all_non_fs_frag_descrs[i].fragment_type if frag_t in g_all_frag_clusters["non_fs_cluster"]: g_all_frag_clusters["non_fs_cluster"][frag_t].append(g_all_non_fs_frag_descrs[i]) else: g_all_frag_clusters["non_fs_cluster"][frag_t] = [g_all_non_fs_frag_descrs[i]] # check_all_clusters() # 大类分裂 max_fs_cluster_num = clustering.max() max_non_fs_cluster_num = 9 # 待删除的超大小限制的类名 del_fs_cluster_array = [] del_non_fs_cluster_array = [] # 分裂出的新类 new_fs_cluster_array = [] new_non_fs_cluster_array = [] # 含FS for (clu_name, cluster) in g_all_frag_clusters["fs_cluster"].items(): # 如果类中只有一个分段 if len(cluster) == 1: continue
def _createClusters(self):
    print(' Creating clusters from HMM transitions,,Time: ' + str(datetime.datetime.now()))

    # Load in HMM data
    hmmObj = HA(self.output_directory + self.baseName + '_hmm')

    # Convert into coords object and save it
    coords = hmmObj.retDBScanMatrix(self.minMagnitude)
    np.save(self.output_directory + self.baseName + '_coords.npy', coords)

    # Run data in batches to avoid RAM overuse
    sortData = coords[coords[:, 0].argsort()][:, 0:3]  # sort data by time for batch processing, throwing out 4th column (magnitude)
    numBatches = int(sortData[-1, 0] / self.delta / 3600) + 1  # delta is number of hours to batch together; can be a fraction
    sortData[:, 0] = sortData[:, 0] * self.timeScale  # scale time so that time distances between transitions are comparable to spatial differences
    labels = np.zeros(shape=(sortData.shape[0], 1), dtype=sortData.dtype)  # initialize labels

    # Calculate clusters in batches to avoid RAM overuse
    curr_label = 0  # labels for each batch start from zero - need to offset these
    print(' ' + str(numBatches) + ' total batches. On batch: ', end='', flush=True)
    for i in range(numBatches):
        print(str(i) + ',', end='', flush=True)
        min_time = i * self.delta * self.timeScale * 3600
        max_time = (i + 1) * self.delta * self.timeScale * 3600  # 3600 seconds per hour, rescaled by timeScale
        hour_range = np.where((sortData[:, 0] > min_time) & (sortData[:, 0] <= max_time))
        min_index, max_index = hour_range[0][0], hour_range[0][-1] + 1
        X = NearestNeighbors(radius=self.treeR, metric='minkowski', p=2, algorithm='kd_tree',
                             leaf_size=self.leafNum, n_jobs=24).fit(sortData[min_index:max_index])
        dist = X.radius_neighbors_graph(sortData[min_index:max_index], self.neighborR, 'distance')
        sub_label = DBSCAN(eps=self.eps, min_samples=self.minPts, metric='precomputed',
                           n_jobs=self.workers).fit_predict(dist)
        new_labels = int(sub_label.max()) + 1
        sub_label[sub_label != -1] += curr_label
        labels[min_index:max_index, 0] = sub_label
        curr_label += new_labels
    print()

    # Concatenate and save information
    sortData[:, 0] = sortData[:, 0] / self.timeScale
    labeledCoords = np.concatenate((sortData, labels), axis=1).astype('int64')
    np.save(self.output_directory + self.baseName + '.labeledCoords.npy', labeledCoords)

    print(' Concatenating and summarizing clusters,,Time: ' + str(datetime.datetime.now()))
    df = pd.DataFrame(labeledCoords, columns=['T', 'X', 'Y', 'LID'])
    clusterData = df.groupby('LID').apply(
        lambda x: pd.Series({
            'N': x['T'].count(),
            't': int(x['T'].mean()),
            'X': int(x['X'].mean()),
            'Y': int(x['Y'].mean()),
            't_span': int(x['T'].max() - x['T'].min()),
            'X_span': int(x['X'].max() - x['X'].min()),
            'Y_span': int(x['Y'].max() - x['Y'].min()),
        }))
    clusterData.to_csv(self.output_directory + self.baseName + '.clusters.csv', sep=',')