Example #1
def dbscan(similarity, concepts=2, euclid=False):
    if euclid:
        model = DBSCAN(eps=0.6, min_samples=10, algorithm='auto', leaf_size=30)
        return model.fit_predict(similarity)
    else:
        model = DBSCAN(eps=0.6, min_samples=10, metric='precomputed', algorithm='auto', leaf_size=30)
        return model.fit_predict(1 - similarity)
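A minimal usage sketch for the wrapper above, assuming similarity holds pairwise similarities in [0, 1]; the feature matrix and helper calls below are illustrative, not from the original source.

# Hypothetical usage of the dbscan() wrapper above (sketch only).
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

features = np.random.rand(100, 20)                 # assumed feature matrix
similarity = cosine_similarity(features)           # pairwise similarities in [0, 1]
labels_precomputed = dbscan(similarity)            # precomputed branch: clusters 1 - similarity
labels_euclidean = dbscan(features, euclid=True)   # Euclidean branch: clusters the raw features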
def plot_dbscan():
    X, y = make_blobs(random_state=0, n_samples=12)

    dbscan = DBSCAN()
    clusters = dbscan.fit_predict(X)
    clusters

    fig, axes = plt.subplots(3, 4, figsize=(11, 8), subplot_kw={'xticks': (), 'yticks': ()})
    # Plot clusters as red, green and blue, and outliers (-1) as white
    colors = ['r', 'g', 'b']
    markers = ['o', '^', 'v']

    # iterate over settings of min_samples and eps
    for i, min_samples in enumerate([2, 3, 5]):
        for j, eps in enumerate([1, 1.5, 2, 3]):
            # instantiate DBSCAN with a particular setting
            dbscan = DBSCAN(min_samples=min_samples, eps=eps)
            # get cluster assignments
            clusters = dbscan.fit_predict(X)
            print("min_samples: %d eps: %f  cluster: %s" % (min_samples, eps, clusters))
            if np.any(clusters == -1):
                c = ['w'] + colors
                m = ['o'] + markers
            else:
                c = colors
                m = markers
            discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j], c=c, s=8, markers=m)
            inds = dbscan.core_sample_indices_
            # visualize core samples and clusters.
            if len(inds):
                discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds],
                                 ax=axes[i, j], s=15, c=colors,
                                 markers=markers)
            axes[i, j].set_title("min_samples: %d eps: %.1f" % (min_samples, eps))
    fig.tight_layout()
Example #3
    def search_charges(self, data, z=0, threshold = 30):
        A = deriv(data,z)
        
        print 'Searching charges...'
        time0 = time.time()        
        
        det = A[3]*A[5]-A[4]**2

        dx = -(A[1]*A[5]-A[2]*A[4])/det
        dy = -(A[2]*A[3]-A[1]*A[4])/det
        
        datamax = A[0]+A[1]*dx+A[2]*dy+A[3]*dx**2/2+A[4]*dx*dy+A[5]*dy**2/2        
        t = np.where((np.abs(dx) < 1)*(np.abs(dy) < 1)*(np.abs(datamax) > threshold)*(det > 0))        
        
        x = np.array([t[1]+dx[t], t[0]+dy[t]]).T
        
        db = DBSCAN(min_samples = 1, eps = 1)
        db.fit_predict(x)
        
        n_charges = np.max(db.labels_)+1
        qi = np.zeros(n_charges)
        xi = np.zeros((3,n_charges))
        
        for i in range(0, n_charges):
            xi[0:2,i] = np.mean(x[db.labels_ == i,:], axis=0)
            qi[i] = np.mean(datamax[t][db.labels_ == i])
        
        
        self.set_charges(qi,xi)
        print 'Done! Elapsed time: '+str(time.time()-time0)
        return self
Example #4
    def _fit_dbscan(self, x):
        # clustering
        for r in xrange(self.repeats):
            # info (k, the cluster count, is only known after the fit below)
            if self.debug is True:
                print '\t[%s][r:%d]' % (self.clus_type, r + 1),

            # fit and evaluate model
            model = DBSCAN(eps=1.0, min_samples=100)
            model.fit_predict(x)
            k = len(set(model.labels_)) - (1 if -1 in model.labels_ else 0)
            self._labels[r] = model.labels_
            self._parameters[r] = model.core_sample_indices_

            # build equivalent gmm
            model_gmm = GMM(n_components=k, covariance_type="full")
            model_gmm.means_ = model.core_sample_indices_
            model_gmm.covars_ = sp.ones(
                (k, self.input_dim)) * self.sigma_factor
            model_gmm.weights_ = sp.array(
                [(self._labels[r] == i).sum() for i in xrange(k)])

            # evaluate goodness of fit
            self._ll[r] = model_gmm.score(x).sum()
            if self.gof_type == 'aic':
                self._gof[r] = model_gmm.aic(x)
            if self.gof_type == 'bic':
                self._gof[r] = model_gmm.bic(x)

            # debug info
            if self.debug is True:
                print self._gof[r]
Example #5
	def current_datapoints_dbscan(self):
		"""
		Method clusters points-outliers (after current_datapoints_threshold_filter and current_datapoints_outliers_filter) into slice-clusters using DBSCAN.
		Returns dict of slice-clusters - base for event-candidates. Uses self.eps attribute to estimate cluster boundaries.
		"""
		nets = self.current_datapoints.keys()
		ids = concatenate([self.current_datapoints[x]['ids'] for x in nets])
		coords = concatenate([self.current_datapoints[x]['array'] for x in nets])
		weights = concatenate([self.current_datapoints[x]['weights'] for x in nets])
		if len(ids) > 0:
			clustering = DBSCAN(eps=self.eps, min_samples=5)
			labels = clustering.fit_predict(coords)
			core_ids = ids[clustering.core_sample_indices_]
			ids = ids[labels > -1]
			coords = coords[labels > -1]
			weights = weights[labels > -1]
			labels = labels[labels > -1]
			ret_tab = {}
			for i in range(len(labels)):
				try:
					ret_tab[labels[i]].append({'id':ids[i], 'lng':coords[i,0], 'lat':coords[i,1], 'weight':weights[i], 'is_core':ids[i] in core_ids})
				except KeyError:
					ret_tab[labels[i]] = [{'id':ids[i], 'lng':coords[i,0], 'lat':coords[i,1], 'weight':weights[i], 'is_core':ids[i] in core_ids}]
			return ret_tab
		else:
			return {}
def DBScan_Flux(phots, ycenters, xcenters, dbsClean=0, useTheForce=False):
    """Class methods are similar to regular functions.

    Note:
        Do not include the `self` parameter in the ``Args`` section.

    Args:
        param1: The first parameter.
        param2: The second parameter.

    Returns:
        True if successful, False otherwise.

    """
    
    dbsPhots    = DBSCAN()#n_jobs=-1)
    stdScaler   = StandardScaler()
    
    phots       = np.copy(phots.ravel())
    phots[~np.isfinite(phots)] = np.median(phots[np.isfinite(phots)])
    
    featuresNow = np.transpose([stdScaler.fit_transform(ycenters[:,None]).ravel(), \
                                stdScaler.fit_transform(xcenters[:,None]).ravel(), \
                                stdScaler.fit_transform(phots[:,None]).ravel()   ] )
    
    # print(featuresNow.shape)
    dbsPhotsPred= dbsPhots.fit_predict(featuresNow)
    
    return dbsPhotsPred == dbsClean
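A hedged usage sketch for DBScan_Flux; the synthetic arrays below stand in for real photometry and centroid time series.

import numpy as np

n_obs = 1000
phots = np.random.normal(1.0, 0.01, n_obs)        # assumed flux measurements
ycenters = np.random.normal(15.0, 0.1, n_obs)     # assumed y centroid positions
xcenters = np.random.normal(15.0, 0.1, n_obs)     # assumed x centroid positions
clean_mask = DBScan_Flux(phots, ycenters, xcenters)  # True where the DBSCAN label equals dbsClean (0)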
Example #7
def _cluster(params):
    cls = None
    method = sh.getConst('method')
    if method=='kmedoid':
        assert False
        # from kmedoid import kmedsoid
        # cls = kmedoid
    elif method=='dbscan':
        from sklearn.cluster import DBSCAN
        cls = DBSCAN(eps=params['eps'],min_samples=params['min_samples'],
                     metric='precomputed')
    else:
        assert False, 'FATAL: unknown cluster method'

    ##
    mat = sh.getConst('mat')
    labels = cls.fit_predict(mat)
    nLabels = len(set(labels))

    ##
    sil = None; cal = None
    if (nLabels >= 2)and(nLabels <= len(labels)-1):
        sil = met.silhouette_score(mat,labels,'precomputed')
        cal = met.calinski_harabaz_score(mat,labels)
    perf = dict(silhouette_score=sil,calinski_harabaz_score=cal)

    return (labels,perf)
Example #8
def cluster_dbscan(matrix, distance_measure="sts", eps=1):
    """Clusters the distance matrix for a given epsilon value, if distance
    measure is sts. Other distance measures are: [‘cityblock’, ‘cosine’, 
    ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’, ‘braycurtis’, ‘canberra’, 
    ‘chebyshev’, ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, 
    ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, 
    ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’]

    Parameters
    ----------
    matrix: np.matrix
        The input matrix. If distance measure is sts, this should be the sts
        distance matrix. If other distance, this should be the time-series
        matrix of size ngenes x nsamples.
    distance_measure: str
        The distance measure, default is sts, short time-series distance.
        Any distance measure available in scikit-learn is available here.
        Note: multiple time series are NOT supported for distances other than
        "sts".

    Returns
    -------
    cluster_labels: list of int
        A list of size ngenes that defines cluster membership.
    """
    if (distance_measure == "sts"):
        dbs = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
    else:
        dbs = DBSCAN(eps=eps, metric=distance_measure, min_samples=2)
    cluster_labels = dbs.fit_predict(matrix)
    return cluster_labels
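A brief usage sketch; the random matrix and the Euclidean pairwise distances below are stand-ins for real data, not part of the original example.

import numpy as np
from sklearn.metrics import pairwise_distances

ts = np.random.rand(50, 10)                   # assumed ngenes x nsamples matrix
labels_euclidean = cluster_dbscan(ts, distance_measure="euclidean", eps=1)

dist = pairwise_distances(ts)                 # stand-in for an STS distance matrix
labels_precomputed = cluster_dbscan(dist, distance_measure="sts", eps=1)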
Example #9
def cluster_DBSCAN(args):
	"""
	Clustering with DBSCAN: vectorizes the input graphs, projects them to a lower-dimensional space with TruncatedSVD, and clusters the projected instances.
	"""
	#load data
	g_it = node_link_data.node_link_data_to_eden(input = args.input_file, input_type = "file")
	vec = graph.Vectorizer(r = args.radius,d = args.distance, nbits = args.nbits)
	logger.info('Vectorizer: %s' % vec)

	X = vec.transform(g_it, n_jobs = args.n_jobs)
	logger.info('Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz() / X.shape[0]))
	
	#project to lower dimensional space to use clustering algorithms
	transformer = TruncatedSVD(n_components=args.n_components)
	X_dense=transformer.fit_transform(X)

	#log statistics on data
	logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance' % (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0]))

	#clustering
	clustering_algo = DBSCAN(eps = args.eps)
	y = clustering_algo.fit_predict(X_dense)
	msg = 'Predictions statistics: '
	msg += util.report_base_statistics(y)
	logger.info(msg)

	#save model for vectorizer
	out_file_name = "vectorizer"
	eden_io.dump(vec, output_dir_path = args.output_dir_path, out_file_name = out_file_name)
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)

	#save result
	out_file_name = "labels"
	eden_io.store_matrix(matrix = y, output_dir_path = args.output_dir_path, out_file_name = out_file_name, output_format = "text")
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)
def get_clusters(tracks):
    neighbors = g.m.neighborsSpin.value()
    dist = g.m.neighborDistanceSpin.value()
    data = np.array([[tr['mean_x'], tr['mean_y']] for tr in tracks])
    scanner = DBSCAN(eps=dist, min_samples=neighbors)
    ids = scanner.fit_predict(data)
    return ids
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """

    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)

    dbs = DBSCAN()

    db = dbs.fit(scaled)
    outliers = dbs.fit_predict(scaled)

    df_o = df.ix[np.nonzero(outliers)]

    return db, df_o
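Since fit_predict labels noise points as -1, the np.nonzero selection above isolates the noise only when every clustered point lands in cluster 0. A more general selection (a sketch, not from the original) is:

# Select rows DBSCAN marks as noise, regardless of how many clusters it finds.
df_noise = df[outliers == -1]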
Example #12
def cluster_with_dbscan(vectors, epsilon=0.5, min_samples=5, distances=None, metric="euclidean"):
    # precomputing our distances will be faster as we can use multiple cores
    if distances is None:
        distances = pairwise_distances(vectors, n_jobs=-1, metric=metric)

    dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric="precomputed")
    return dbscan.fit_predict(distances)
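A short usage sketch; the random vectors are illustrative only.

import numpy as np

vectors = np.random.rand(500, 16)   # assumed n x d feature matrix
labels = cluster_with_dbscan(vectors, epsilon=0.4, min_samples=4, metric="cosine")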
Example #13
 def cluster_lvl1(self, data):
     db = DBSCAN(eps=2., min_samples=2, metric='precomputed')
     processed = np.float32(np.vstack([
         np.mgrid[:self.map_height, :self.map_width].reshape(2, -1),
         data.ravel()
     ])).T
     dist = self.distances_for_lvl1(processed)
     return db.fit_predict(dist).reshape(self.map_height, self.map_width)
Example #14
    def regroup(self, maxdistance, minsize, algo = 'auto'):

        self.__loginfo('Regrouping')
        dbsfit = DBSCAN(eps=maxdistance, min_samples=minsize, algorithm=algo).fit(self.primarylist)
        dbsresult = dbsfit.fit_predict(self.primarylist)
        grouplist = []
        for grouplabel in dbsresult:
            if not grouplabel in grouplist: grouplist.append(grouplabel)
        self.__loginfo('Group label count: %s' % len(grouplist))
Example #15
def main(datafile, feature1, feature2, normalize, clusteroutput, percentile, copula):
    X, features = read_sah_h5(datafile, just_good=False)
    if 'id' not in features:
        ids = np.arange(len(X))
    else:
        ids = X[:, features.index('id')]
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]
    D = np.column_stack([x, y])

    idx = np.random.randint(len(X), size=10000)

    D = D[idx]
    ids = ids[idx]

    if normalize:
        mean = np.average(D, axis=0)
        std = np.std(D, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0 # Avoid NaNs
        Dnorm = (D - mean) / std
    elif copula:
        Dnorm = np.column_stack([copula_transform(f) for f in D.T])
    else:
        Dnorm = D

    kmeans = MiniBatchKMeans(n_clusters=50)
    gmm = GMM(n_components=200, covariance_type='full', verbose=True)
    #C = gmm.fit_predict(Dnorm)
    dbscan = DBSCAN(eps=100.0, min_samples=1)
    C = dbscan.fit_predict(Dnorm)
    print C

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(D[:, 0], D[:, 1], color=pl.cm.spectral(C.astype(float) / np.max(C)))

#    for c in np.unique(C):
#        pl.bar(0, 0, lw=0, ec='none',
#            fc=pl.cm.spectral(float(c) / np.max(C)), label='Cluster %d' % c)
#    pl.legend(loc='upper left')

    if percentile > 0:
        pl.xlim(
            scoreatpercentile(x, percentile),
            scoreatpercentile(x, 100-percentile)
        )
        pl.ylim(
            scoreatpercentile(y, percentile),
            scoreatpercentile(y, 100-percentile)
        )

    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
def clusterize(_features):
    import sklearn
    from sklearn.cluster import DBSCAN
    
    est = DBSCAN()
    
    Y = est.fit_predict(_features[:,2:])
    
    y_pred = [(i==major_index(Y)) for i in Y]
    
    return np.c_[_features[:,0], _features[:,1], y_pred]
Example #17
def dbscan_clusterize(regions, eps, min_samples):
    if len(regions) < min_samples:
        return [], regions

    samples = np.array([[r.cx, r.cy] for r in regions])

    clustering = DBSCAN(eps=eps, min_samples=min_samples)
    clusters_ids = clustering.fit_predict(samples)
    clusters, noise = convert_clusters(regions, clusters_ids)

    return clusters, noise
Example #18
def dbscan_cluster(docs, eps=None):
    vectr = Vectorizer()
    docs = [clean(d) for d in docs]
    vecs = vectr.vectorize(docs, train=True)

    if eps is None:
        dist_mat = build_dist_mat(vecs)
        eps = estimate_eps(dist_mat)[0]

    m = DBSCAN(min_samples=3, metric='euclidean', eps=eps)
    labels = m.fit_predict(vecs)
    return labels
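build_dist_mat and estimate_eps are not shown here. A common stand-in for the eps estimate (an assumption, not the original helpers, and returning a single value rather than a sequence) is a high percentile of each point's nearest-neighbor distance:

import numpy as np
from sklearn.metrics import pairwise_distances

def estimate_eps_sketch(vecs, percentile=90):
    # Distance from each point to its nearest other point, then a high percentile as eps.
    d = pairwise_distances(vecs)
    np.fill_diagonal(d, np.inf)
    return np.percentile(d.min(axis=1), percentile)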
def DBSCAN_cluster(init_ds,ts_flag=False):
    '''
    Parameters: init_ds - 2D list of data
                ts_flag - boolean specifying if the first column of init_ds is a datetime object or not
    Returns: 2D list with additional column denoting which cluster said row falls into
    '''

    if ts_flag:
        init_ds = [i[1:] for i in init_ds]

    dbscn = DBSCAN()
    labels = dbscn.fit_predict(init_ds)

    return [init_ds[i]+[labels[i]] for i in range(len(init_ds))]
Example #20
def evaluate_clustering():

    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    matrix_size = len(similarity_matrix)
    print('got matrix')

    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')

    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('dbscan')

    distance_matrix = np.ndarray((matrix_size, matrix_size))
    for i in range(matrix_size):
        for j in range(matrix_size):
            distance_matrix[i, j] = 1 - similarity_matrix[i, j]

    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])

    print('created distance matrix')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)

    print(cluster_map1)
    print(cluster_map2)

    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='euclidean')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='euclidean')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    print('Number of clusters Affinity Propagation: %d' % len(cluster_map1))
    print('Number of clusters DBSCAN: %d' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silhouette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silhouette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)
def cv_iteration(n_jobs=2, eps=1., min_samples=30, metric='euclidean', algorithm='brute', leaf_size=30, p=2.):
    X, y_train, _ = load_data()
    scores = []
    cms = []  # confusion matrices
    cluster_sizes = []
    model = DBSCAN(n_jobs=n_jobs, eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm, leaf_size=leaf_size, p=p)
    predictions = model.fit_predict(X)
    score, confusion_matrix = scoring_function(y_train, predictions)
    scores.append(score)
    cms.append(serialise_confusion_matrix(confusion_matrix))
    cluster_sizes.append(serialise_confusion_matrix(np.unique(predictions, return_counts=True)))
    return {'result': scores,
           'confusion_matrices': eval(str(cms)),
           'score_name': string_enhancer(str(scoring_function)),
           'cluster_sizes': eval(str(cluster_sizes))}
def main(datafile, normalize, ndims, copula, clusteroutput, subsample):
    X, features = read_sah_h5(datafile)
    I, all_features = read_sah_h5(datafile, just_good=False)
    if 'id' in all_features:
        ids = X[:, all_features.index('id')]
    else:
        ids = np.arange(len(X)).astype(int)

    Xorig = X
    if normalize:
        mean = np.average(X, axis=0)
        std = np.std(X, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0 # Avoid NaNs
        X = (X - mean) / std

    idx = np.random.randint(len(X), size=subsample)

    X = X[idx]
    ids = ids[idx]

    if copula:
        X = np.column_stack([copula_transform(x) for x in X.T])

    # I added this for the time/freq clustering
    # to emphasize the frequency feature
    # X[:, 1] *= 1e-3

    Y = bh_sne(X, d=ndims)

    dbscan = DBSCAN(eps=1.75, min_samples=5)
    C = dbscan.fit_predict(Y)

    tree = ExtraTreesClassifier(n_estimators=100)
    tree.fit(X, C)
    for f, i in zip(features, tree.feature_importances_):
        print '%s: %f' % (f, i)

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(Y[:, 0], Y[:, 1], color=pl.cm.spectral(C.astype(float) / np.max(C)))

    for c in np.unique(C):
        pl.bar(0, 0, lw=0, ec='none', fc=pl.cm.spectral(float(c) / np.max(C)), label='Cluster %d' % c)
    pl.legend()

    pl.show()
Example #23
 def predict_window_bps(self, pkt_featurizer):
     self.add_to_windows(pkt_featurizer)
     windows_bps = map(self.to_bytes_sec, self.windows)
     windows_bps_shaped = np.array(windows_bps).reshape(-1,1)
     dbscan = DBSCAN()
     labels = dbscan.fit_predict(windows_bps_shaped)
     predict_window_bps = labels[-1] == -1
     if self.plot:
         x_range = map(self.to_mid_time, self.windows)
         self.plot_1d_dbscan(windows_bps_shaped, labels, x_range, self.windows_fig,
                             "Mid Time of Window", "Average bytes/sec", "Windowed Bps Clustering")
                             
     self.windows[-1].pop()
     if not self.windows[-1]:
         self.windows.pop()
     return predict_window_bps
Example #24
 def cluster_characters(self):
     """On the basis of co-occurrences of characters in scenes,
     performs a clustering to assign characters to different
     groups."""
     cooccurences = np.zeros((len(self.characters), len(self.characters)))
     for scene in self:
         for character_i in scene.characters:
             for character_j in scene.characters:
                 cooccurences[character_i.id, character_j.id] += 1.0
                 cooccurences[character_j.id, character_i.id] = cooccurences[
                     character_i.id, character_j.id]
     cooccurences = cooccurences / cooccurences.sum()
     clusterer = DBSCAN(eps=cooccurences.mean(), min_samples=1)
     clustering = clusterer.fit_predict(cooccurences)
     for character in self.characters:
         # check if this propagates
         character.cluster = clustering[character.id]
Example #25
def clusterize(_features):
    def major_index(l):
        from collections import defaultdict

        d = defaultdict(int)
        for item in l:
            d[item] += 1

        return max(d.iteritems(), key=lambda x: x[1])[0]

    est = DBSCAN()

    Y = est.fit_predict(_features[:, 2:])

    y_pred = [(i == major_index(Y)) for i in Y]

    return np.c_[_features[:, 0], _features[:, 1], y_pred]
    def cluster_subgraphs(self, matrix, nth_neighbor=1):
        '''
        get each point's distance to its nearest non-identical neighbor with NearestNeighbors,
        then take the self.dbscan_range quantile of those distances and use it as eps for DBSCAN
        '''
        neigh = NearestNeighbors(n_neighbors=nth_neighbor+1, metric='euclidean')
        neigh.fit(matrix)
        dist, indices = neigh.kneighbors(matrix)
        #dist = np.median(dist[:, nth_neighbor], axis=0) # 1 is the Nth neigh


        if self.min_clustersize < 1.0:
            minsamp = matrix.shape[0]*self.min_clustersize
            #print minsamp,matrix.shape, self.min_clustersize
        else:
            minsamp = self.min_clustersize

        def distances_select_first_non_id_neighbor(distances):
            x,y = distances.nonzero()
            _, idd = np.unique(x, return_index=True)

            """
            for i,e in enumerate(zip(list(x), list(y))):
                print e, distances[e]
                if i in idd:
                    print "!!!"
            print idd
            """
            return distances[ x[idd],y[idd]]


        #dists =  distances_select_NTH_non_id_neighbor(dist,2)
        dists =  distances_select_first_non_id_neighbor(dist)
        #dist = np.median(dists)
        dists=np.sort(dists)
        idx=int(len(dists)*self.dbscan_range)
        dist=dists[idx]
        if self.debug:
            print "name_subgraph: choosing dist %d of %d" % (idx, len(dists))



        # get the clusters
        scan = DBSCAN(eps=dist, min_samples=minsamp)
        return scan.fit_predict(matrix)
Example #27
    def split_dbscan(self, eps, min_samples):

        # Extract dataset from files
        dataset = [f.dataset for f in self.files]

        # Initialize classifier
        classifier = DBSCAN(eps=eps, min_samples=min_samples)

        # Fit dataset
        index = classifier.fit_predict(dataset)

        # one slot per cluster label plus an extra slot for noise (-1)
        count = max(index) + 2

        clusters = [Cluster(self.directory, self.name + '-' + str(i)) for i in range(count)]
        clusters[count - 1].name = 'na'
        for i in range(0, len(self.files), 1):
            # index[i] % count maps the noise label -1 into the last ('na') cluster
            clusters[index[i] % count].add_file(self.files[i])

        return clusters
Example #28
class DBScan:
    """
    DBScan model.
    """
    def __init__(self, features, eps, min_samples):
        """
        Initialisation method for DBScan
        :param features: trajectory feature to learn from
        :type features: list()
        """
        self.features = features
        self.model = DBSCAN(eps=eps, min_samples=min_samples)
        self.cluster_labels = []

    def fit_predict(self):
        """
        Fits the model to the data and return the cluster labels.
        """
        self.cluster_labels = self.model.fit_predict(self.features)
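A short usage sketch for the wrapper class above; the feature values are illustrative.

trajectory_features = [[0.10, 0.20], [0.11, 0.19], [5.00, 5.10]]  # assumed 2-D features
model = DBScan(trajectory_features, eps=0.5, min_samples=2)
model.fit_predict()
print(model.cluster_labels)  # e.g. [0, 0, -1]: two clustered points and one noise point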
Example #29
def main():
    print ""
    start_time = time.time()

    vectorized_data_words, vectorized_class_labels = get_sample_data(False)

    data_proc_time = time.time() - start_time
    print "Data processing took " + str(data_proc_time) + " seconds"

    print "Clustering data..."
    # estimator = KMeans(n_clusters = int(num_unique_classes(vectorized_class_labels)) )
    # estimator = DBSCAN(min_samples=1)
    estimator = DBSCAN(eps=0.53, min_samples=2, metric="cosine", algorithm="brute")
    prediction = estimator.fit_predict(vectorized_data_words)

    print "Clustering took " + str(time.time() - start_time - data_proc_time) + " seconds"
    cluster_quality(prediction, vectorized_data_words, vectorized_class_labels)

    print "Total running time: " + str(time.time() - start_time) + " seconds"
    print ""
    def get_clusters(self, eps_range=[0.001, 0.002, 0.003, 0.0035, 0.004, 0.0045, 0.005, 0.0055, 0.006, 0.007, 0.008, 0.009, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, .1]):
        best_num_clusters = 0
        best_fitted_response, best_cluster = 0, None
        for epsilon in eps_range:
            clusterer = DBSCAN(eps=epsilon, **self.kwargs)
            fitted_response = clusterer.fit_predict(self.feature_vectors)

            # Check if there are more clusters
            num_unique_responses = len(set(fitted_response))
            if num_unique_responses > best_num_clusters:
                best_num_clusters = num_unique_responses
                best_cluster = clusterer
                best_fitted_response = fitted_response

        if not best_cluster.components_.shape[0]:
            similarities = [1] * self.feature_vectors.shape[0]
        else:
            similarities = np.max(self.feature_vectors.dot(best_cluster.components_.T), axis = 1)

        return best_fitted_response, similarities.tolist()
def train(args, model, device, optimizer, exp_dir):
    #change rho value according to the number of training samples
    if args.trainsize > 2000 and args.trainsize <= 6000:
        rho = 1.7e-3
    elif args.trainsize > 6000 and args.trainsize <= 8000:
        rho = 1.5e-3
    elif args.trainsize > 8000 and args.trainsize <= 10000:
        rho = 1.3e-3
    elif args.trainsize > 10000 and args.trainsize <= 12000:
        rho = 1.1e-3
    elif args.trainsize > 12000 and args.trainsize <= 14000:
        rho = 0.9e-3
    elif args.trainsize > 14000 and args.trainsize <= 16000:
        rho = 0.7e-3
    else:
        rho = args.rho

    #start episodic training
    total_NMI = np.zeros(args.iteration)
    total_AMI = np.zeros(args.iteration)
    total_SMI = np.zeros(args.iteration)
    total_ACCU = np.zeros(args.iteration + 1)

    #Testing before self-training
    print('Testing before self-training')
    accu = test(args, model, device)
    total_ACCU[0] = accu

    for iter_n in range(args.iteration):

        #generate data loader
        extraction_loader = DataLoader(
            dataset.Omniglot(
                root=args.data_dir,
                train=True,
                size=args.trainsize,
                transform=transforms.Compose([
                    transforms.Resize(32),
                    #transforms.Grayscale(num_output_channels=3),
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307, ), (0.3081, ))
                ])),
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=4,
            pin_memory=True)
        #extract all data features
        train_features, target_labels = extract_features(
            model=model, data_loader=extraction_loader, device=device)

        #rerank to get the jaccard distance
        rerank_dist = re_ranking(features=train_features,
                                 MemorySave=args.memory_save)

        #build the DBSCAN model
        tri_mat = np.triu(rerank_dist, 1)  # tri_mat.dim=2
        tri_mat = tri_mat[np.nonzero(tri_mat)]  # tri_mat.dim=1
        tri_mat = np.sort(tri_mat, axis=None)
        top_num = np.round(rho * tri_mat.size).astype(int)
        eps = tri_mat[:top_num].mean()
        print('eps in cluster: {:.3f}'.format(eps))
        cluster = DBSCAN(eps=eps,
                         min_samples=4,
                         metric='precomputed',
                         n_jobs=8)

        # select & cluster images as training set of this episode
        print('Clustering and labeling...')
        train_features = train_features.cpu().numpy()
        labels = cluster.fit_predict(rerank_dist)

        #calculate NMI of the chosen data points of the current episode
        TL = target_labels
        list_true = [int(TL[i].cpu().numpy()) for i in range(len(TL))]
        list_pred = labels.tolist()

        NMI = nmi_withGT(list_pred, list_true)
        AMI = ami_withGT(list_pred, list_true)

        SMI = sampling_NMI_withGT(list_pred, list_true)
        total_NMI[iter_n] = NMI
        total_AMI[iter_n] = AMI
        total_SMI[iter_n] = SMI

        num_ids = len(set(labels)) - 1
        #generate new dataset
        new_dataset = []

        unique_labels, label_count = np.unique(labels, return_counts=True)

        for i in range(len(extraction_loader.dataset.splittxt)):
            idd = np.where(unique_labels == labels[i])[0][0]

            if labels[i] == -1 or label_count[idd] < 6:
                continue

            new_dataset.append(
                (extraction_loader.dataset.splittxt[i], labels[i], 0))

        LL = [new_dataset[i][1] for i in range(len(new_dataset))]
        print(np.unique(LL, return_counts=True))

        print(
            'Iteration {} has {} training ids, {} training images, NMI is {}, AMI is {}, SMI is {}'
            .format(iter_n + 1, num_ids, len(new_dataset), NMI, AMI, SMI))

        #training dataloader
        BS = args.batch_size * args.ims_per_id
        train_loader = DataLoader(
            dataset.Omniglot_clustering(root=args.data_dir,
                                        dat_set=new_dataset,
                                        transform=transforms.Compose([
                                            transforms.Resize(32),
                                            transforms.ToTensor(),
                                            transforms.Normalize((0.1307, ),
                                                                 (0.3081, ))
                                        ])),
            batch_size=BS,
            num_workers=4,
            sampler=RandomIdentitySampler(new_dataset, args.ims_per_id),
            pin_memory=True,
            drop_last=True)

        #training with prototypical learning methods
        for ep in range(args.epochs):
            # Adjust Learning Rate
            adjust_lr_exp(optimizer, args.base_lr, ep + 1, args.epochs,
                          args.exp_decay_at_epoch)

            model.train()

            protoacc_meter = AverageMeter()
            protoloss_meter = AverageMeter()
            ep_st = time.time()

            for data, target in tqdm(train_loader):
                #pdb.set_trace()
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                feat, x_hat = model(data)

                protoloss, acc = loss_fn(feat,
                                         target=target,
                                         n_support=args.train_shot)
                protoloss = protoloss.to(device)

                protoloss.backward()
                optimizer.step()

                protoacc_meter.update(acc.item())
                protoloss_meter.update(protoloss.item())

            #Epoch log
            time_log = 'Ep {}, {:.2f}s'.format(
                ep,
                time.time() - ep_st,
            )

            loss_log = (', acc {:.2%}, protoloss {:.4f}'.format(
                protoacc_meter.avg, protoloss_meter.avg))

            final_log = time_log + loss_log

            print(final_log)

        #adjust learning rate back to initialized learning rate
        print('Learning rate adjusted back to base learning rate {:.10f}'.
              format(args.base_lr))
        for g in optimizer.param_groups:
            g['lr'] = args.base_lr

        accu = test(args, model, device)

        total_ACCU[iter_n + 1] = accu

    print('total NMI value is, ', total_NMI)
    print('total AMI value is, ', total_AMI)
    print('total SMI value is, ', total_SMI)
    print('total ACCU value is, ', total_ACCU)
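The eps heuristic inside train() above (average the smallest rho fraction of the pairwise distances in the upper triangle of the reranked distance matrix) can be isolated into a small sketch; any square distance matrix can stand in for rerank_dist.

import numpy as np

def estimate_eps_from_rho(dist_matrix, rho=1.7e-3):
    # Mirror of the eps computation in train(): keep the upper triangle,
    # sort the distances, and average the smallest rho fraction of them.
    tri = np.triu(dist_matrix, 1)
    tri = tri[np.nonzero(tri)]
    tri = np.sort(tri, axis=None)
    top_num = np.round(rho * tri.size).astype(int)
    return tri[:top_num].mean()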
Example #32
class Hdbscan(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    '''
        Primitive that applies Hierarchical Density-Based Clustering or Density-Based Clustering 
        algorithms. This is an unsupervised clustering primitive, but it has been
        represented as a supervised classification problem to produce a compliant primitive.

        Training inputs: D3M dataset with features and labels, and D3M indices
        Outputs: D3M dataset with predicted labels and D3M indices
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "ca014488-6004-4b54-9403-5920fbe5a834",
        'version':
        __version__,
        'name':
        "hdbscan",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Clustering'],
        'source': {
            'name':
            __author__,
            'contact':
            __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Unsupervised",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.29.14',
        }, {
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/NewKnowledge/D3M-Unsupervised.git@{git_commit}#egg=D3MUnsupervised'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.clustering.hdbscan.Hdbscan',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DBSCAN,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        if self.hyperparams['algorithm'] == 'HDBSCAN':
            self.clf = hdbscan.HDBSCAN(
                min_cluster_size=self.hyperparams['min_cluster_size'],
                min_samples=self.hyperparams['min_samples'],
                cluster_selection_method=self.
                hyperparams['cluster_selection_method'])
        else:
            self.clf = DBSCAN(eps=self.hyperparams['eps'],
                              min_samples=self.hyperparams['min_samples'])

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe with attached metadata for semi-supervised or unsupervised data

        Returns
        ----------
        Outputs
            The output depends on the required_output hyperparameter and is either a dataframe containing a single column 
            where each entry is the cluster ID, or the input dataframe with the cluster ID of each row added as an additional feature.
        """

        # find target and index variables
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        X_test = inputs.copy()
        if len(index):
            X_test = X_test.drop(columns=list(inputs)[index[0]])
        if len(target_names):
            X_test = X_test.drop(columns=target_names)
        X_test = X_test.values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs,
                                                 np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        if self.hyperparams['required_output'] == 'feature':

            hdb_df = d3m_DataFrame(
                pandas.DataFrame(self.clf.fit_predict(X_test),
                                 columns=['cluster_labels']))

            # add metadata for the appended cluster label column ('cluster_labels')
            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = 'cluster_labels'
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            df_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 1
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(utils_cp.append_columns(inputs, hdb_df))
        else:

            hdb_df = d3m_DataFrame(
                pandas.DataFrame(self.clf.fit_predict(X_test),
                                 columns=[target_names[0]]))

            hdb_df = pandas.concat([inputs.d3mIndex, hdb_df], axis=1)

            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
            col_dict['structural_type'] = type(1)
            col_dict['name'] = target_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 1), col_dict)

            df_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 2
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(hdb_df)
Example #33
class DBScanCluster:
    def __init__(self, epsilon=0.3, min_pts=10):
        """ function: contructor
            --------------------
            instantiate a dbscan clustering algorithm
        """
        self.name = "dbscan"
        self.dbsc = None
        self.epsilon = epsilon
        self.min_pts = min_pts
        self.clusters = dict([])

    ###########################################################################
    ######### general helper functions for measuring cluster quality ##########
    ###########################################################################

    def __compute_variance(self):
        """ function: compute_variance
            --------------------------
            compute the variance/skew across cluster sizes

            returns: variance of the clustering
        """
        mean, variance = 0.0, 0.0
        for key, cluster in self.clusters.iteritems():
            mean += len(cluster)
        mean /= len(self.clusters)
        for key, cluster in self.clusters.iteritems():
            variance += (len(cluster) - mean)**2
        variance /= len(self.clusters)
        return variance

    def __compute_entropy(self, dataset):
        """ function: compute_entropy
            -------------------------
            compute the entropy of @self.vectors or @self.clusters

            returns: entropy scores of the dataset
        """
        entropy = 0.0
        for key, cluster in dataset.iteritems():
            length = len(cluster)
            factor = float(length) / len(self.vectors)
            temp = 0.0
            for topic in self.topics:
                inner = 0.0
                for fv in cluster:
                    if topic in fv.topics:
                        inner += 1.0
                if inner > 0:
                    inner /= float(length)
                    temp += -inner * log(inner, 2)
            entropy += factor * temp
        return entropy

    ###########################################################################
    ################ mains to generate and test the clustering ################
    ###########################################################################

    def generate_clusters(self, feature_vectors):
        """ function: generate_clusters
            ---------------------------
            generate DBSCAN clusters for feature vectors

            :param feature_vectors: set of features to construct model
        """
        # generate clusters
        cluster_start = time.time()
        fv_space, topic_space = [], []
        for key, fv in feature_vectors.iteritems():
            fv_space.append(fv.vector)
            topic_space.append(fv.topics)
        self.dbsc = DBSCAN(eps=self.epsilon, min_samples=self.min_pts)
        clusters = self.dbsc.fit_predict(fv_space)
        # split dataset based on clusters
        for i, index in enumerate(clusters):
            if not self.clusters.has_key(index):
                self.clusters[index] = []
            self.clusters[index].append(feature_vectors[i])
        cluster_time = time.time() - cluster_start
        # set object members for entropy calculation
        self.vectors = feature_vectors
        self.topics = set().union(*topic_space)
        # compute entropy of clusters + gain
        all = self.vectors.values()
        before = self.__compute_entropy({0: self.vectors.values()})
        after = self.__compute_entropy(self.clusters)
        print "Entropy Before Clustering:", before
        print "Entropy After Clustering :", after
        print "Overall Gain in Entropy:", before - after
        # compute variance of cluster sizes + time
        print "Clustering Variance:", self.__compute_variance()
        print "Time for Clustering:", cluster_time, "seconds"
        # reset clusters
        self.clusters = dict([])
Example #34
def main(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    cudnn.benchmark = True

    # Create data loaders
    assert args.num_instances > 1, "num_instances should be greater than 1"
    assert args.batch_size % args.num_instances == 0, \
        'num_instances should divide batch_size'
    if args.height is None or args.width is None:
        args.height, args.width = (144, 56) if args.arch == 'inception' else \
            (256, 128)

    # get source data
    src_dataset, src_extfeat_loader = \
        get_source_data(args.src_dataset, args.data_dir, args.height,
                        args.width, args.batch_size, args.workers)
    # get target data
    tgt_dataset, num_classes, tgt_extfeat_loader, test_loader = \
        get_data(args.tgt_dataset, args.data_dir, args.height,
                 args.width, args.batch_size, args.workers)

    # Create model
    # Hacking here to let the classifier be the number of source ids
    if args.src_dataset == 'dukemtmc':
        model = models.create(args.arch, num_classes=632, pretrained=False)
        coModel = models.create(args.arch, num_classes=632, pretrained=False)
    elif args.src_dataset == 'market1501':
        model = models.create(args.arch, num_classes=676, pretrained=False)
        coModel = models.create(args.arch, num_classes=676, pretrained=False)
    elif args.src_dataset == 'msmt17':
        model = models.create(args.arch, num_classes=1041, pretrained=False)
        coModel = models.create(args.arch, num_classes=1041, pretrained=False)
    elif args.src_dataset == 'cuhk03':
        model = models.create(args.arch, num_classes=1230, pretrained=False)
        coModel = models.create(args.arch, num_classes=1230, pretrained=False)
    else:
        raise RuntimeError('Please specify the number of classes (ids) of the network.')

    # Load from checkpoint
    if args.resume:
        print('Resuming checkpoints from finetuned model on another dataset...\n')
        checkpoint = load_checkpoint(args.resume)
        model.load_state_dict(checkpoint['state_dict'], strict=False)
        coModel.load_state_dict(checkpoint['state_dict'], strict=False)
    else:
        raise RuntimeWarning('Not using a pre-trained model.')
    model = nn.DataParallel(model).cuda()
    coModel = nn.DataParallel(coModel).cuda()

    evaluator = Evaluator(model, print_freq=args.print_freq)
    # evaluator.evaluate(test_loader, tgt_dataset.query, tgt_dataset.gallery)
    # if args.evaluate: return

    # Criterion
    criterion = [
        TripletLoss(args.margin, args.num_instances, isAvg=False, use_semi=False).cuda(),
        TripletLoss(args.margin, args.num_instances, isAvg=False, use_semi=False).cuda(),
    ]

    # Optimizer
    optimizer = torch.optim.Adam(
        model.parameters(), lr=args.lr
    )
    coOptimizer = torch.optim.Adam(
        coModel.parameters(), lr=args.lr
    )

    optims = [optimizer, coOptimizer]

    # training stage transformer on input images
    normalizer = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_transformer = T.Compose([
        T.Resize((args.height, args.width)),
        T.RandomHorizontalFlip(),
        T.ToTensor(), normalizer,
        T.RandomErasing(probability=0.5, sh=0.2, r1=0.3)
    ])

    # # Start training
    for iter_n in range(args.iteration):
        if args.lambda_value == 0:
            source_features = 0
        else:
            # get source datas' feature
            source_features, _ = extract_features(model, src_extfeat_loader, print_freq=args.print_freq, numStripe=None)
            # synchronize feature order with src_dataset.train
            source_features = torch.cat([source_features[f].unsqueeze(0) for f, _, _ in src_dataset.train], 0)

            # extract training images' features
        print('Iteration {}: Extracting Target Dataset Features...'.format(iter_n + 1))
        target_features, _ = extract_features(model, tgt_extfeat_loader, print_freq=args.print_freq, numStripe=None)
        # synchronize feature order with dataset.train
        target_features = torch.cat([target_features[f].unsqueeze(0) for f, _, _ in tgt_dataset.trainval], 0)
        # calculate distance and rerank result
        print('Calculating feature distances...')
        target_features = target_features.numpy()
        rerank_dist = re_ranking(source_features, target_features, lambda_value=args.lambda_value)
        if iter_n == 0:
            # DBSCAN cluster
            tri_mat = np.triu(rerank_dist, 1)  # tri_mat.dim=2
            tri_mat = tri_mat[np.nonzero(tri_mat)]  # tri_mat.dim=1
            tri_mat = np.sort(tri_mat, axis=None)
            top_num = np.round(args.rho * tri_mat.size).astype(int)
            eps = tri_mat[:top_num].mean()
            print('eps in cluster: {:.3f}'.format(eps))
            cluster = DBSCAN(eps=eps, min_samples=4, metric='precomputed', n_jobs=8)

        # select & cluster images as training set of this epochs
        print('Clustering and labeling...')
        labels = cluster.fit_predict(rerank_dist)
        num_ids = len(set(labels)) - 1
        print('Iteration {} has {} training ids'.format(iter_n + 1, num_ids))
        # generate new dataset
        new_dataset, unknown_dataset = [], []
        # assign label for target ones
        unknownLab = labelNoise(torch.from_numpy(target_features), torch.from_numpy(labels))
        # unknownFeats = target_features[labels==-1,:]
        unCounter, index = 0, 0
        from collections import defaultdict
        realIDs, fakeIDs = defaultdict(list), []
        for (fname, realPID, cam), label in zip(tgt_dataset.trainval, labels):
            if label == -1:
                unknown_dataset.append((fname, int(unknownLab[unCounter]), cam))  # unknown data
                fakeIDs.append(int(unknownLab[unCounter]))
                realIDs[realPID].append(index)
                unCounter += 1
                index += 1
                continue
            # no need to change the code in trainer.py's _parsing_input function or the sampler function after adding 0
            new_dataset.append((fname, label, cam))
            fakeIDs.append(label)
            realIDs[realPID].append(index)
            index += 1
        print('Iteration {} has {} training images'.format(iter_n + 1, len(new_dataset)))
        precision, recall, fscore = calScores(realIDs, np.asarray(fakeIDs))  # fakeIDs does not contain -1
        print('precision:{}, recall:{}, fscore: {}'.format(100 * precision, 100 * recall, fscore))

        train_loader = DataLoader(
            Preprocessor(new_dataset, root=tgt_dataset.images_dir, transform=train_transformer),
            batch_size=args.batch_size, num_workers=4,
            sampler=RandomIdentitySampler(new_dataset, args.num_instances),
            pin_memory=True, drop_last=True
        )
        # hard samples
        # noiseImgs = [name[1] for name in unknown_dataset]
        # saveAll(noiseImgs, tgt_dataset.images_dir, 'noiseImg')
        # import ipdb; ipdb.set_trace()
        unLoader = DataLoader(
            Preprocessor(unknown_dataset, root=tgt_dataset.images_dir, transform=train_transformer),
            batch_size=args.batch_size, num_workers=4,
            sampler=RandomIdentitySampler(unknown_dataset, args.num_instances),
            pin_memory=True, drop_last=True
        )
        # train model with new generated dataset
        trainer = RCoTeaching(
           model, coModel, train_loader, unLoader, criterion, optims
        )

        # Start training
        for epoch in range(args.epochs):
            trainer.train(epoch, remRate=0.2 + (0.8 / args.iteration) * (1 + iter_n))

        # test only
        rank_score = evaluator.evaluate(test_loader, tgt_dataset.query, tgt_dataset.gallery)
        # print('co-model:\n')
        # rank_score = evaluatorB.evaluate(test_loader, tgt_dataset.query, tgt_dataset.gallery)

    # Evaluate
    rank_score = evaluator.evaluate(test_loader, tgt_dataset.query, tgt_dataset.gallery)
    save_checkpoint({
        'state_dict': model.module.state_dict(),
        'epoch': epoch + 1, 'best_top1': rank_score.market1501[0],
    }, True, fpath=osp.join(args.logs_dir, 'RCT.pth'))
    return rank_score.map, rank_score.market1501[0]
Example #35
"""
Created on Tue Apr  2 17:16:08 2019

@author: cankozan
"""
import pandas as pd
import numpy as np

df = pd.read_excel('data.xlsx')
df = df.dropna()

from sklearn.cluster import KMeans

km = KMeans(n_clusters=6)

km.fit(df.iloc[:, :2])
labeled = km.predict(df.iloc[:, :2])

from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=6)
labeled_ac = ac.fit_predict(df.iloc[:, :2])

from scipy.cluster.hierarchy import dendrogram, linkage
z = linkage(df.iloc[:, :2])
dendrogram(z)

from sklearn.cluster import DBSCAN

dbs = DBSCAN(eps=10, min_samples=2)
labeled_dbs = dbs.fit_predict(df.iloc[:, :2])
Example #36
#kMeans clustering
from sklearn.cluster import KMeans
km = KMeans(init='random', max_iter=150, n_clusters=2, random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0], X[y_km == 0, 1], c='green')
plt.scatter(X[y_km == 1, 0], X[y_km == 1, 1], c='red')
plt.title("KMeans")
plt.show()

#Agglomerative Clustering with complete linkage
from sklearn.cluster import AgglomerativeClustering
aggcl = AgglomerativeClustering(n_clusters=2, linkage='complete')
y_agcl = aggcl.fit_predict(X)

plt.scatter(X[y_agcl == 0, 0], X[y_agcl == 0, 1], c='green')
plt.scatter(X[y_agcl == 1, 0], X[y_agcl == 1, 1], c='red')
plt.title("Aggolomerative Clustering")
plt.show()

#Demonstrating clustering using a density-based approach
from sklearn.cluster import DBSCAN
dbs = DBSCAN(eps=0.2, min_samples=5)
y_dbs = dbs.fit_predict(X)

plt.scatter(X[y_dbs == 0, 0], X[y_dbs == 0, 1], c='green')
plt.scatter(X[y_dbs == 1, 0], X[y_dbs == 1, 1], c='red')
plt.title("Density based(DBSCAN) Clustering")
plt.show()
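The scatter calls above only draw points assigned to clusters 0 and 1; noise points (label -1) are omitted. A small follow-up sketch, reusing the same X and y_dbs, would show them as well:

plt.scatter(X[y_dbs == -1, 0], X[y_dbs == -1, 1], c='gray')  # points DBSCAN flags as noise
plt.title("DBSCAN noise points")
plt.show()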
Example #37
def test_weighted_dbscan():
    # ensure sample_weight is validated
    with pytest.raises(ValueError):
        dbscan([[0], [1]], sample_weight=[2])
    with pytest.raises(ValueError):
        dbscan([[0], [1]], sample_weight=[2, 3, 4])

    # ensure sample_weight has an effect
    assert_array_equal([],
                       dbscan([[0], [1]], sample_weight=None,
                              min_samples=6)[0])
    assert_array_equal([],
                       dbscan([[0], [1]], sample_weight=[5, 5],
                              min_samples=6)[0])
    assert_array_equal([0],
                       dbscan([[0], [1]], sample_weight=[6, 5],
                              min_samples=6)[0])
    assert_array_equal([0, 1],
                       dbscan([[0], [1]], sample_weight=[6, 6],
                              min_samples=6)[0])

    # points within eps of each other:
    assert_array_equal([0, 1],
                       dbscan([[0], [1]],
                              eps=1.5,
                              sample_weight=[5, 1],
                              min_samples=6)[0])
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal([],
                       dbscan([[0], [1]],
                              sample_weight=[5, 0],
                              eps=1.5,
                              min_samples=6)[0])
    assert_array_equal([0, 1],
                       dbscan([[0], [1]],
                              sample_weight=[5.9, 0.1],
                              eps=1.5,
                              min_samples=6)[0])
    assert_array_equal([0, 1],
                       dbscan([[0], [1]],
                              sample_weight=[6, 0],
                              eps=1.5,
                              min_samples=6)[0])
    assert_array_equal([],
                       dbscan([[0], [1]],
                              sample_weight=[6, -1],
                              eps=1.5,
                              min_samples=6)[0])

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert len(label1) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D,
                           sample_weight=sample_weight,
                           metric="precomputed")
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    core4 = est.core_sample_indices_
    label4 = est.labels_
    assert_array_equal(core1, core4)
    assert_array_equal(label1, label4)

    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    core5 = est.core_sample_indices_
    assert_array_equal(core1, core5)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)
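# Added sketch (not part of the test above): the same sample_weight semantics
# on a tiny hand-made example, with integer weights acting like repeating a
# point. Values here are illustrative only.
import numpy as np
from sklearn.cluster import DBSCAN

X_demo = np.array([[0.0], [1.0], [5.0]])
labels_weighted = DBSCAN(eps=1.5, min_samples=6).fit_predict(
    X_demo, sample_weight=[5, 1, 1])
print(labels_weighted)  # expected [0, 0, -1]: weights 5 + 1 reach min_samples near x=0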
Example #38
0
class LexRank(object):
    def __init__(self,
                 similarity='cosine',
                 decay_window=20,
                 decay_alpha=0.25,
                 clustering='dbscan',
                 tagger='twitter',
                 useful_tags=[
                     'Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb',
                     'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix',
                     'Alpha', 'Number'
                 ],
                 delimiters=['. ', '\n', '.\n'],
                 min_token_length=2,
                 stopwords=stopwords_ko,
                 no_below_word_count=2,
                 no_above_word_portion=0.85,
                 max_dictionary_size=None,
                 min_cluster_size=2,
                 similarity_threshold=0.85,
                 matrix_smoothing=False,
                 n_clusters=None,
                 compactify=True,
                 **kwargs):
        self.decay_window = decay_window
        self.decay_alpha = decay_alpha
        if similarity == 'cosine':  # very, very slow :(
            self.vectorizer = DictVectorizer()
            self.uniform_sim = self._sim_cosine
        elif similarity == 'jaccard':
            self.uniform_sim = self._sim_jaccard
        elif similarity == 'normalized_cooccurrence':
            self.uniform_sim = self._sim_normalized_cooccurrence
        else:
            raise LexRankError(
                "available similarity functions are: cosine, jaccard, normalized_cooccurrence"
            )
        self.sim = lambda sentence1, sentence2: self.decay(
            sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
        self.factory = SentenceFactory(tagger=tagger,
                                       useful_tags=useful_tags,
                                       delimiters=delimiters,
                                       min_token_length=min_token_length,
                                       stopwords=stopwords,
                                       **kwargs)
        if clustering == 'birch':
            self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
            self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
        elif clustering == 'dbscan':
            self._dbscan = DBSCAN()
            self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
        elif clustering == 'affinity':
            self._affinity = AffinityPropagation()
            self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
        elif clustering is None:
            self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
        else:
            raise LexRankError(
                "available clustering algorithms are: birch, dbscan, affinity, no-clustering(use `None`)"
            )
        self.no_below_word_count = no_below_word_count
        self.no_above_word_portion = no_above_word_portion
        self.max_dictionary_size = max_dictionary_size
        self.similarity_threshold = similarity_threshold
        self.min_cluster_size = min_cluster_size
        self.matrix_smoothing = matrix_smoothing
        self.compactify = compactify

    def summarize(self, text):
        self.sentences = self.factory.text2sentences(text)
        self.num_sentences = len(self.sentences)
        self.corpus = SentenceCorpus(self.sentences, self.no_below_word_count,
                                     self.no_above_word_portion,
                                     self.max_dictionary_size)
        self.model = TfidfModel(self.corpus.bows,
                                id2word=self.corpus.dictionary,
                                normalize=True)
        self.tfidfs = self.model[self.corpus.bows]
        self._inject_tfidfs()
        self._build_matrix()
        self._clustering()
        if self.compactify:
            self._compactify()
        self.graphs = []
        for i in range(self.num_clusters):
            graph = self.sentences2graph(self.clusters[i])
            pagerank = networkx.pagerank(graph, weight='weight')
            self.clusters[i] = sorted(pagerank, key=pagerank.get, reverse=True)
            self.graphs.append(graph)

    def _sim_jaccard(self, sentence1, sentence2):
        if sentence1 == sentence2:
            return 1
        p = sum((sentence1.counter & sentence2.counter).values())
        q = sum((sentence1.counter | sentence2.counter).values())
        return p / q if q else 0

    def _sim_cosine(self, sentence1, sentence2):
        if sentence1 == sentence2:
            return 1
        sentence1_tfidf = {
            word_id: tfidf
            for word_id, tfidf in sentence1.tfidf
        }
        sentence2_tfidf = {
            word_id: tfidf
            for word_id, tfidf in sentence2.tfidf
        }
        vector1, vector2 = self.vectorizer.fit_transform(
            [sentence1_tfidf, sentence2_tfidf]).toarray()
        return vector1.dot(vector2)

    def _sim_normalized_cooccurrence(self, sentence1, sentence2):
        if sentence1 == sentence2:
            return 1
        return len(set(sentence1.tokens) & set(sentence2.tokens)) / (
            math.log(len(sentence1.tokens)) + math.log(len(sentence2.tokens)))

    def decay(self, sentence1, sentence2):
        distance = abs(sentence1.index - sentence2.index)
        closeness = max(self.decay_window - distance, 0) / self.decay_window
        return math.pow(closeness, self.decay_alpha)

    def _inject_tfidfs(self):
        for index in range(self.num_sentences):
            bow = self.corpus.bows[index]
            self.sentences[index].bow = bow
            self.sentences[index].tfidf = self.model[bow]

    def _build_matrix(self):
        self.matrix = np.zeros((self.num_sentences, self.num_sentences))
        for sentence1 in self.sentences:
            for sentence2 in self.sentences:
                self.matrix[sentence1.index,
                            sentence2.index] = self.sim(sentence1, sentence2)
        if self.matrix_smoothing:
            for index in range(self.num_sentences):
                self.matrix[index, index] = 0
                self.matrix[index, index] = max(self.matrix[index])

    def sentences2graph(self, sentences):
        graph = networkx.Graph()
        graph.add_nodes_from(sentences)
        for sentence1 in sentences:
            for sentence2 in sentences:
                weight = self.matrix[sentence1.index, sentence2.index]
                if weight:
                    graph.add_edge(sentence1, sentence2, weight=weight)
        return graph

    def _clustered(self):
        self.clusters = [
            cluster for cluster in self.clusters
            if len(cluster) >= self.min_cluster_size
        ]
        self.num_clusters = len(self.clusters)
        self.clusters = sorted(self.clusters,
                               key=lambda cluster: len(cluster),
                               reverse=True)

    def _clustering(self):
        cls = self._clusterer(self.matrix)
        bucket = {}
        for index in range(len(cls)):
            key = str(cls[index])
            if key not in bucket:
                bucket[key] = []
            bucket[key].append(self.sentences[index])
        self.clusters = bucket.values()
        self._clustered()

    def _compactify(self):
        clusters = []
        for cluster in self.clusters:
            compact_cluster = []
            cluster_size = len(cluster)
            for i in range(cluster_size):
                cluster[i].duplicated = False
            for i in range(cluster_size):
                if cluster[i].duplicated:
                    continue
                for j in range(i + 1, cluster_size):
                    if cluster[j].duplicated:
                        continue
                    if self.uniform_sim(
                            cluster[i],
                            cluster[j]) > self.similarity_threshold:
                        cluster[j].duplicated = True
                compact_cluster.append(cluster[i])
            clusters.append(compact_cluster)
        self.clusters = clusters
        self._clustered()

    def _verbose(self):
        summaries = sorted(self.summaries, key=lambda sentence: sentence.index)
        return [sentence.text for sentence in summaries]

    def probe(self, k=None):
        if not hasattr(self, 'clusters'):
            raise LexRankError("summarize it first")
        if not k:
            k = max(2, self.num_clusters)
        if k < 0:
            raise LexRankError(
                "appropriate value for `k`: float(0 ~ 1) for compress rate, or natural number for exact number of sentences"
            )
        if k > self.num_sentences:
            raise LexRankError("this will not give a summarization")
        if k < 1:
            k = int(self.num_sentences * k)
        self.summaries = []
        ends = np.array([len(cluster) for cluster in self.clusters])
        drones = np.zeros(ends.shape)
        for i in range(self.num_clusters):
            self.summaries.append(self.clusters[i][0])
            drones[i] += 1
            if len(self.summaries) == k:
                return self._verbose()
        while True:
            branch = np.array([drones + 1, ends]).min(axis=0) / ends
            leach = int(branch.argmin())
            drone = int(drones[leach])
            self.summaries.append(self.clusters[leach][drone])
            drones[leach] += 1
            if len(self.summaries) == k:
                return self._verbose()
def main(argv):
    global chatty

    start = ""
    end = ""
    duration = ""

    bisection = 0
    bisection_max = 10
    min_step = datetime.timedelta(minutes=5)

    try:
        opts, args = getopt.getopt(
            argv, "hqs:d:t:",
            ["help", "quiet", "start=", "duration=", "steps="])
    except getopt.GetoptError:
        eprint('Error: Unrecognized option!')
        eprint(
            'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
        )
        eprint('Use -h for help.')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
            print('  -h:                    print this text')
            print(
                '  -s <start>:            start date and time in elasticsearch time format'
            )
            print('  -d <duration>:         duration format <number>[mhdw]')
            print('  -t <steps>:            steps format <number>[mhdw]')
            print(
                '                         If the Elasticsearch query isn\'t answered in time, the step size will be adjusted automatically by bisection.'
            )
            print('  -q:                    no output except errors')
            print('  --help:                same as -h')
            print('  --start <start>:       same as -s')
            print('  --duration <duration>: same as -d')
            print('  --steps <steps>:       same as -t')
            print('  --quiet:               same as -q')
            print('')
            print(
                ' If start and duration are omitted, the last 24 hours will be used.'
            )
            sys.exit(0)
        elif opt in ("-q", "--quiet"):
            chatty = False
        elif opt in ("-s", "--start"):
            try:
                start_dt = datetime.datetime.strptime(arg, "%Y-%m-%dT%H:%M")
                start = arg
            except ValueError:
                eprint('Error: Invalid option start!')
                eprint(
                    'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                )
                eprint('<start> format: yyyy-mm-ddThh:mm')
                eprint('Use -h for help.')
                sys.exit(2)
        elif opt in ("-d", "--duration"):
            match = re.match(r"(\d+)([mhdw])$", arg)
            if match:
                (x, c) = match.groups()
                try:
                    y = int(x)
                except:
                    eprint('Error: Invalid option duration!')
                    eprint(
                        'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                    )
                    eprint('<duration> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                if c == 'm':
                    delta = datetime.timedelta(minutes=y)
                elif c == 'h':
                    delta = datetime.timedelta(hours=y)
                elif c == 'd':
                    delta = datetime.timedelta(days=y)
                elif c == 'w':
                    delta = datetime.timedelta(days=(7 * y))
                else:
                    eprint('Error: Invalid option duration!')
                    eprint(
                        'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                    )
                    eprint('<duration> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                duration = arg
            else:
                eprint('Error: Invalid option duration!')
                eprint(
                    'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                )
                eprint('<duration> format: <number>[mhdw]')
                eprint('Use -h for help.')
                sys.exit(2)
        elif opt in ("-t", "--steps"):
            match = re.match(r"(\d+)([mhdw])$", arg)
            if match:
                (x, c) = match.groups()
                try:
                    y = int(x)
                except:
                    eprint('Error: Invalid option steps!')
                    eprint(
                        'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                    )
                    eprint('<steps> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                if c == 'm':
                    step = datetime.timedelta(minutes=y)
                elif c == 'h':
                    step = datetime.timedelta(hours=y)
                elif c == 'd':
                    step = datetime.timedelta(days=y)
                elif c == 'w':
                    step = datetime.timedelta(days=(7 * y))
                else:
                    eprint('Error: Invalid option steps!')
                    eprint(
                        'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                    )
                    eprint('<steps> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
            else:
                eprint('Error: Invalid option steps!')
                eprint(
                    'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                )
                eprint('<steps> format: <number>[mhdw]')
                eprint('Use -h for help.')
                sys.exit(2)

    if ((start == "") and (duration == "")):
        end_dt = datetime.datetime.now()
        end = end_dt.strftime("%Y-%m-%dT%H:%M")
        delta = datetime.timedelta(days=1)
        start_dt = end_dt - delta
        start = start_dt.strftime("%Y-%m-%dT%H:%M")
    elif ((start == "") or (duration == "")):
        eprint('Error: Invalid option combination!')
        eprint('Start and duration must both be specified or omitted.')
        eprint(
            'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
        )
        eprint('Use -h for help.')
        sys.exit(2)
    else:
        end_dt = start_dt + delta
        end = end_dt.strftime("%Y-%m-%dT%H:%M")

    if ('step' not in vars()):
        step = delta

    if (end_dt - start_dt) < step:
        step = end_dt - start_dt

    es = elasticsearch.Elasticsearch(nodes)

    elasticsearch_version = get_elasticsearch_major_version(es)

    axes = [
        'entropy_peer_sessions', 'entropy_peer_packets', 'entropy_peer_bytes',
        'entropy_sport_sessions', 'entropy_sport_packets',
        'entropy_sport_bytes', 'entropy_dport_sessions',
        'entropy_dport_packets', 'entropy_dport_bytes'
    ]

    host = {}
    peer = {}
    sport = {}
    dport = {}
    '''
    During data generation slight differences in timing
    may cause some hosts not to be listed in all dictionaries.
    These hosts will be ignored.
    '''
    ignore_hosts = []

    sets = [[
        'peer (1)', peer, "src", "SourceAddress", "dst", "DestinationAddress"
    ], ['peer (2)', peer, "dst", "DestinationAddress", "src", "SourceAddress"
        ], ['sport (1)', sport, "src", "SourceAddress", "sport", "SourcePort"],
            [
                'sport (2)', sport, "dst", "DestinationAddress", "dport",
                "DestinationPort"
            ],
            [
                'dport (1)', dport, "src", "SourceAddress", "dport",
                "DestinationPort"
            ],
            [
                'dport (2)', dport, "dst", "DestinationAddress", "sport",
                "SourcePort"
            ]]

    for s in sets:
        moving_start_dt = start_dt
        moving_end_dt = min(start_dt + step, end_dt)
        while True:
            try:
                start = moving_start_dt.strftime("%Y-%m-%dT%H:%M:%S")
                end = moving_end_dt.strftime("%Y-%m-%dT%H:%M:%S")
                print_or_quiet('%s Fetching %s data ... %s - %s' %
                               (strftime('%H:%M:%S'), s[0], start, end))
                query_into_dictionary(es, elasticsearch_version, start, end,
                                      host, s[1], s[2], s[3], s[4], s[5])
                moving_start_dt = moving_end_dt
                moving_end_dt += step
                if moving_start_dt >= end_dt:
                    break
                if moving_end_dt > end_dt:
                    moving_end_dt = end_dt
            except elasticsearch.exceptions.ConnectionTimeout as esect:
                bisection += 1
                if bisection <= bisection_max:
                    step = step // 2
                    if step < min_step:
                        eprint(
                            'Elasticsearch Connection Timeout. Minimum timeframe reached. Exiting ...'
                        )
                        sys.exit(3)
                    eprint(
                        'Elasticsearch Connection Timeout. Halving step size.')
                    moving_end_dt = moving_start_dt + step
                else:
                    eprint(
                        'Elasticsearch Connection Timeout (%d timeouts). Exiting ...'
                        % bisection)
                    sys.exit(3)

    print_or_quiet('%s Calculating sums ...' % (strftime('%H:%M:%S')))
    calculate_sums(host, 'peer', peer, ignore_hosts)
    calculate_sums(host, 'sport', sport, ignore_hosts)
    calculate_sums(host, 'dport', dport, ignore_hosts)

    print_or_quiet('%s Removing incomplete hosts ...' % (strftime('%H:%M:%S')))
    for h in ignore_hosts:
        try:
            del host[h]
        except KeyError:
            # host may appear more than once
            pass

    print_or_quiet('%s Calculating entropy ...' % (strftime('%H:%M:%S')))
    calculate_entropy(host,
                      [[peer, 'peer'], [dport, 'dport'], [sport, 'sport']])

    print_or_quiet('%s Removing dictionaries ...' % (strftime('%H:%M:%S')))
    peer.clear()
    sport.clear()
    dport.clear()

    if (len(host) == 0):
        eprint("No data found. Exiting ...")
        sys.exit(1)

    print_or_quiet('%s Creating sample set ...' % (strftime('%H:%M:%S')))
    labels = host.keys()
    npa = create_np(host, axes)
    n_samples, n_features = npa.shape
    print_or_quiet('          samples: %d features:%d' %
                   (n_samples, n_features))

    #########################
    # MeanShift
    #########################
    print_or_quiet('%s Calculating bandwidth ...' % (strftime('%H:%M:%S')))
    bandwidth = estimate_bandwidth(npa,
                                   quantile=bandwidth_quantile,
                                   n_samples=bandwidth_n_samples,
                                   random_state=bandwidth_random_state,
                                   n_jobs=bandwidth_n_jobs)

    if (bandwidth == 0.00000000):
        eprint('Useless bandwidth. Exiting ...')
        sys.exit(3)

    print_or_quiet('%s Calculating MeanShift ...' % (strftime('%H:%M:%S')))
    ms = MeanShift(bandwidth=bandwidth,
                   bin_seeding=meanshift_bin_seeding,
                   cluster_all=meanshift_cluster_all,
                   n_jobs=meanshift_n_jobs)
    ms.fit(npa)
    n_clusters = len(numpy.unique(ms.labels_))

    print_or_quiet('%s Getting predictions ...' % (strftime('%H:%M:%S')))
    prediction = ms.predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)

    meanshift_output = {}
    for (ip, prediction) in ip_prediction:
        meanshift_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing MeanShift output file ...' %
                   (strftime('%H:%M:%S')))
    with open(meanshift_outputfile, 'w') as fp:
        json.dump(
            {str(key): value
             for key, value in meanshift_output.items()}, fp)

    #########################
    # KMeans
    #########################
    print_or_quiet('%s Calculating KMeans ...' % (strftime('%H:%M:%S')))
    km = KMeans(n_clusters=kmeans_n_clusters)
    km.fit(npa)

    print_or_quiet('%s Getting predictions ...' % (strftime('%H:%M:%S')))
    prediction = km.predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)

    kmeans_output = {}
    for (ip, prediction) in ip_prediction:
        kmeans_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing KMeans output file ...' %
                   (strftime('%H:%M:%S')))
    with open(kmeans_outputfile, 'w') as fp:
        json.dump(
            {str(key): value
             for key, value in kmeans_output.items()}, fp)

    #########################
    # AgglomerativeClustering
    #########################
    print_or_quiet('%s Calculating Agglomerative Clustering ...' %
                   (strftime('%H:%M:%S')))
    ac = AgglomerativeClustering(n_clusters=agglomerative_n_clusters,
                                 affinity=agglomerative_affinity,
                                 linkage=agglomerative_linkage)

    prediction = ac.fit_predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)

    agglomerative_output = {}
    for (ip, prediction) in ip_prediction:
        agglomerative_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing Agglomerative Clustering output file ...' %
                   (strftime('%H:%M:%S')))
    with open(agglomerative_outputfile, 'w') as fp:
        json.dump(
            {
                str(key): value
                for key, value in agglomerative_output.items()
            }, fp)

    #########################
    # DBSCAN
    #########################
    print_or_quiet('%s Calculating DBSCAN ...' % (strftime('%H:%M:%S')))
    db = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)

    prediction = db.fit_predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)

    dbscan_output = {}
    for (ip, prediction) in ip_prediction:
        dbscan_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing DBSCAN output file ...' %
                   (strftime('%H:%M:%S')))
    with open(dbscan_outputfile, 'w') as fp:
        json.dump(
            {str(key): value
             for key, value in dbscan_output.items()}, fp)
Example #40
0
'''
df1 = pd.read_csv('./GpsData/CHILD_GPS(한수1).csv')
df2 = pd.read_csv('./GpsData/CHILD_GPS(한수2).csv')
df3 = pd.read_csv('./GpsData/CHILD_GPS(김규1).csv')
df4 = pd.read_csv('./GpsData/CHILD_GPS(김규2).csv')
'''
df = pd.read_csv('./GpsData/CHILD_GPS(한수1).csv')

#Give columns names
df.columns = ['ChildKey', 'Time', 'latitude', 'longitude']
X = df[['latitude', 'longitude']]


distance_matrix = squareform(pdist(X, (lambda u, v: haversine(u, v))))

db = DBSCAN(eps=0.45, min_samples=3, metric='precomputed')
y_db = db.fit_predict(distance_matrix)

#Cluster info: label -1 marks noise points in X['cluster']
X['cluster'] = y_db
'''
X['color'] = np.where(X.cluster == -1, 'red', 'blue')
X.plot(kind='scatter',
       x='longitude',
       y='latitude',
       s=20,
       c=X['color'])
'''
'''
#Drop All of Noise Data
for i, row in X.iterrows():
    if row['cluster'] == -1:
Example #41
0
    def DBScanModel(self, X_train, X_test, y_train, y_test):
        # DBSCAN has no predict() for unseen data, so the test split is
        # clustered with a fresh fit_predict call.
        model = DBSCAN()
        model.fit(X_train)
        y_pred = model.fit_predict(X_test)
        return model, y_pred
Example #42
0
def dbscan(temp_list, hum_list, gas_list, label_list):

    temp = np.array(temp_list)
    label = []
    hum = np.array(hum_list)
    gas = np.array(gas_list)
    data_temp_y = []
    data_hum_y = []
    data_gas_y = []
    for i in label_list:
        data_label = get_label(i)
        data_temp_y.append(data_label[0])
        data_hum_y.append(data_label[1])
        data_gas_y.append(data_label[2])
    # split data
    # the labels must be converted here into separate temp / hum / gas label form

    data_temp_x = temp.astype(np.float64).reshape(-1, 1)
    data_hum_x = hum.astype(np.float64).reshape(-1, 1)
    data_gas_x = gas.astype(np.float64).reshape(-1, 1)

    # create model
    model_temp = DBSCAN(min_samples=10)
    model_hum = DBSCAN(min_samples=10)
    model_gas = DBSCAN(min_samples=10)

    # predict y
    pred_temp_y = model_temp.fit_predict(data_temp_x)
    pred_hum_y = model_hum.fit_predict(data_hum_x)
    pred_gas_y = model_gas.fit_predict(data_gas_x)

    pred_temp_y = np.where(pred_temp_y != -1, 0, pred_temp_y)
    pred_temp_y = np.where(pred_temp_y == -1, 1, pred_temp_y)

    pred_hum_y = np.where(pred_hum_y != -1, 0, pred_hum_y)
    pred_hum_y = np.where(pred_hum_y == -1, 1, pred_hum_y)

    pred_gas_y = np.where(pred_gas_y != -1, 0, pred_gas_y)
    pred_gas_y = np.where(pred_gas_y == -1, 1, pred_gas_y)

    unique_temp, counts_temp = np.unique(data_temp_y, return_counts=True)
    unique_hum, counts_hum = np.unique(data_hum_y, return_counts=True)
    unique_gas, counts_gas = np.unique(data_gas_y, return_counts=True)

    unique_temp_pred, counts_temp_pred = np.unique(pred_temp_y,
                                                   return_counts=True)
    unique_hum_pred, counts_hum_pred = np.unique(pred_hum_y,
                                                 return_counts=True)
    unique_gas_pred, counts_gas_pred = np.unique(pred_gas_y,
                                                 return_counts=True)

    print("temp:", dict(zip(unique_temp, counts_temp)),
          dict(zip(unique_temp_pred, counts_temp_pred)))
    print("hum:", dict(zip(unique_hum, counts_hum)),
          dict(zip(unique_hum_pred, counts_hum_pred)))
    print("gas:", dict(zip(unique_gas, counts_gas)),
          dict(zip(unique_gas_pred, counts_gas_pred)))

    print('temperature\'s accuracy is:',
          accuracy_score(data_temp_y, pred_temp_y))
    print('humidity\'s accuracy is:', accuracy_score(data_hum_y, pred_hum_y))
    print('gas\'s accuracy is:', accuracy_score(data_gas_y, pred_gas_y))
    for i in range(40, len(pred_temp_y)):
        temp_label = 0
        if pred_temp_y[i] == 1:
            temp_label += 4 * pred_temp_y[i]
        if pred_hum_y[i] == 1:
            temp_label += 2 * pred_hum_y[i]
        if pred_gas_y[i] == 1:
            temp_label += 1 * pred_gas_y[i]
        label.append(temp_label)

    return label

res_1 = df1.iloc[:,1:]

#pca = PCA(n_components=5)
#res = pca.fit_transform(res_1)
res = res_1



kmean_ncluster = 8
kmeans = KMeans(n_clusters=kmean_ncluster, init='k-means++', max_iter=400, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(res)

clustering = DBSCAN(eps=10, min_samples=2)
pred_y2 = clustering.fit_predict(res)

spectral_ncluster = 8
spectral = SpectralClustering(spectral_ncluster, eigen_solver='arpack', affinity="nearest_neighbors")
pred_y3 = spectral.fit_predict(res)

prep_data.check_result(res, pred_y)
prep_data.check_result(res, pred_y2)
prep_data.check_result(res, pred_y3)


view = pd.DataFrame(df_yara.loc[:,('yara')].copy())
view['kmeans'] = pd.DataFrame(pred_y)
view['dbscan'] = pd.DataFrame(pred_y2)
view['spectral_clustering'] = pd.DataFrame(pred_y3)
view['validation_groups'] = tmp['tag']
Example #44
0
# Importing the dataset
dataset = pd.read_csv('Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values

X = StandardScaler().fit_transform(X)
plt.scatter(X[:, 0], X[:, 1], s=100,
            color='blue')  # scatter the two scaled features
plt.grid()  # draw a grid on the plot
plt.show()

#eps and min_samples below were tuned empirically for this dataset
dbscan = DBSCAN(eps=0.4, min_samples=2, algorithm='kd_tree')

pred_y = dbscan.fit_predict(X)
max_label = np.max(pred_y)
plt.scatter(X[pred_y == -1, 0],
            X[pred_y == -1, 1],
            s=100,
            c='gray',
            label='Anomalies')

for i in range(0, max_label + 1):
    color = np.random.rand(3, )
    plt.scatter(X[pred_y == i, 0],
                X[pred_y == i, 1],
                s=100,
                c=color,
                label='Cluster' + str(i))
plt.title('Clusters of customers')
Example #45
0
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

X, v = make_blobs(n_samples=60, centers=4, cluster_std=.60, random_state=0)

plt.figure(figsize=(15,10))
plt.scatter(X[:,0], X[:,1])

N = len(X)
ind = np.arange(N)
print(ind)
for label, x, y in zip(ind,X[:,0], X[:,1]):
    plt.annotate(label,xy=(x,y))
    

clustering = DBSCAN(eps=.8, min_samples=5)  # change eps/min_samples to see a different clustering
etiqueta = clustering.fit_predict(X)

plt.figure(figsize=(15,10))
plt.xlim(-5,5)
plt.ylim(0,10)
plt.scatter(X[:,0], X[:,1], c=etiqueta)

N = len(X)
ind = np.arange(N)
for label, x, y in zip(ind, X[:,0], X[:,1]):
    plt.annotate(label,xy=(x,y))


def buscarVecinos(P,X,epsilon):
    N = len(X)
    vecinos = []
Example #46
0
# If there are at least min_samples many data points within a distance of eps to a given
# data point, that data point is classified as a core sample.

import matplotlib.pyplot as plt

import mglearn

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs, make_moons
from sklearn.preprocessing import StandardScaler


X, y = make_blobs(n_samples=20, random_state=0)

dbscan = DBSCAN()
clusters = dbscan.fit_predict(X)

print("Cluster membershipd:\n{0}".format(clusters)) # [-1, -1, ..., -1, -1] - all points were assigned to noise

# Increasing min_samples (going from top to bottom in the figure) means that fewer points will be core points, and 
# more points will be labeled as noise.

# The parameter eps is somewhat more important, as it determines what it means for
# points to be “close.” Setting eps to be very small will mean that no points are core
# samples, and may lead to all points being labeled as noise. Setting eps to be very large
# will result in all points forming a single cluster.

# While DBSCAN doesn’t require setting the number of clusters explicitly, setting eps
# implicitly controls how many clusters will be found.

mglearn.plots.plot_dbscan()
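# Added sketch: the imports above also pull in make_moons and StandardScaler,
# which are commonly used to show DBSCAN recovering the two half-moon clusters
# that centroid-based methods miss. A minimal, self-contained illustration;
# the parameter values are illustrative defaults, not prescribed by the text above.
X_moons, y_moons = make_moons(n_samples=200, noise=0.05, random_state=0)

# rescale to zero mean / unit variance so eps means the same thing on both axes
X_scaled = StandardScaler().fit_transform(X_moons)

moon_clusters = DBSCAN(eps=0.5, min_samples=5).fit_predict(X_scaled)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=moon_clusters, s=60)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()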
Example #47
0
# -*- coding: utf-8 -*-
"""
Created on Tue May 12 21:40:26 2020

@author: Dr. Taimoor
"""

import pandas as pd

corpus = pd.read_csv('D:\\Dataset.csv')
print(corpus)

data = corpus.iloc[:, [2, 3, 5]].values
print('\n Data', data)

from sklearn.cluster import DBSCAN

#You should know your number of desired clusters beforehand
#Keep tuning these two parameters until you get those clusters
#(a k-distance sketch for choosing eps follows after this example)
dbscan = DBSCAN(eps=500, min_samples=2)
print(dbscan)

#Cluster data
result = dbscan.fit_predict(data)

print('\n Multivariate Outliers labeled as -1', result)
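# Added sketch (not from the snippet above): instead of blind trial and error,
# a common heuristic for picking eps is to sort each point's distance to its
# k-th nearest neighbor (k = min_samples) and look for the "elbow" in the curve.
# `data` is the same array clustered above; everything else is illustrative.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

k = 2  # match min_samples=2 used above
nn = NearestNeighbors(n_neighbors=k).fit(data)
distances, _ = nn.kneighbors(data)

k_distances = np.sort(distances[:, -1])  # distance to the k-th neighbor, ascending
plt.plot(k_distances)
plt.xlabel('Points sorted by k-distance')
plt.ylabel('Distance to %d-th nearest neighbor' % k)
plt.show()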
Example #48
0
def plot_results(results,
                 samples,
                 phenotypes,
                 labels,
                 outdir,
                 filter_diff_thres=.2,
                 filter_response_thres=0,
                 response_grad_cutoff=None,
                 stat_test=None,
                 positive_filters_only=False,
                 log_yscale=False,
                 group_a='group A',
                 group_b='group B',
                 group_names=None,
                 tsne_ncell=10000,
                 regression=False,
                 clustering=None,
                 add_filter_response=False,
                 percentage_drop_cluster=.1,
                 min_cluster_freq=0.2,
                 show_filters=True):
    """ Plots the results of a CellCnn analysis.

    Args:
        - results :
            Dictionary containing the results of a CellCnn analysis.
        - samples :
            Samples from which to visualize the selected cell populations.
        - phenotypes :
            List of phenotypes corresponding to the provided `samples`.
        - labels :
            Names of measured markers.
        - outdir :
            Output directory where the generated plots will be stored.
        - filter_diff_thres :
            Threshold that defines which filters are most discriminative. Given an array
            ``filter_diff`` of average cell filter response differences between classes,
            sorted in decreasing order, keep a filter ``i, i > 0`` if it holds that
            ``filter_diff[i-1] - filter_diff[i] < filter_diff_thres * filter_diff[i-1]``.
            For regression problems, the array ``filter_diff`` contains Kendall's tau
            values for each filter.
        - filter_response_thres :
            Threshold for choosing a responding cell population. Default is 0.
        - response_grad_cutoff :
            Threshold on the gradient of the cell filter response CDF, might be useful for defining
            the selected cell population.
        - stat_test: None | 'ttest' | 'mannwhitneyu'
            Optionally, perform a statistical test on selected cell population frequencies between
            two groups and report the corresponding p-value on the boxplot figure
            (see plots description below). Default is None. Currently only used for binary
            classification problems.
        - group_a :
            Name of the first class.
        - group_b :
            Name of the second class.
        - group_names :
            List of names for the different phenotype classes.
        - positive_filters_only :
            If True, only consider filters associated with higher cell population frequency in the
            positive class.
        - log_yscale :
            If True, display the y-axis of the boxplot figure (see plots description below) in
            logarithmic scale.
        - clustering: None | 'dbscan' | 'louvain'
            Post-processing option for selected cell populations. Default is None.
        - tsne_ncell :
            Number of cells to include in t-SNE calculations and plots.
        - regression :
            Whether it is a regression problem.
        - show_filters :
            Whether to plot learned filter weights.

    Returns:
        A list with the indices and corresponding cell filter response thresholds of selected
        discriminative filters. \
        This function also produces a collection of plots for model interpretation.
        These plots are stored in `outdir`. They comprise the following:

        - clustered_filter_weights.pdf :
            Filter weight vectors from all trained networks that pass a validation accuracy
            threshold, grouped in clusters via hierarchical clustering. Each row corresponds to
            a filter. The last column(s) indicate the weight(s) connecting each filter to the output
            class(es). Indices on the y-axis indicate the filter cluster memberships, as a
            result of the hierarchical clustering procedure.
        - consensus_filter_weights.pdf :
            One representative filter per cluster is chosen (the filter with minimum distance to all
            other members of the cluster). We call these selected filters "consensus filters".
        - best_net_weights.pdf :
            Filter weight vectors of the network that achieved the highest validation accuracy.
        - filter_response_differences.pdf :
            Difference in cell filter response between classes for each consensus filter.
            To compute this difference for a filter, we first choose a filter-specific class, that's
            the class with highest output weight connection to the filter. Then we compute the
            average cell filter response (value after the pooling layer) for validation samples
            belonging to the filter-specific class (``v1``) and the average cell filter response
            for validation samples not belonging to the filter-specific class (``v0``).
            The difference is computed as ``v1 - v0``. For regression problems, we cannot compute
            a difference between classes. Instead we compute Kendall's rank correlation coefficient
            between the predictions of each individual filter (value after the pooling layer) and
            the true response values.
            This plot helps decide on a cutoff (``filter_diff_thres`` parameter) for selecting
            discriminative filters.
        - tsne_all_cells.png :
            Marker distribution overlaid on t-SNE map. 

        In addition, the following plots are produced for each selected filter (e.g. filter ``i``):

        - cdf_filter_i.pdf :
            Cumulative distribution function of cell filter response for filter ``i``. This plot
            helps decide on a cutoff (``filter_response_thres`` parameter) for selecting the
            responding cell population.

        - selected_population_distribution_filter_i.pdf :
            Histograms of univariate marker expression profiles for the cell population selected by
            filter ``i`` vs all cells.

        - selected_population_frequencies_filter_i.pdf :
            Boxplot of selected cell population frequencies in samples of the different classes,
            if running a classification problem. For regression settings, a scatter plot of selected
            cell population frequencies vs response variable is generated.

        - tsne_cell_response_filter_i.png :
            Cell filter response overlaid on t-SNE map.

        - tsne_selected_cells_filter_i.png :
            Marker distribution of selected cell population overlaid on t-SNE map.
    """

    # create the output directory
    mkdir_p(outdir)

    # number of measured markers
    nmark = samples[0].shape[1]

    if results['selected_filters'] is not None:
        print('Loading the weights of consensus filters.')
        filters = results['selected_filters']
    else:
        sys.exit('Consensus filters were not found.')

    if show_filters:
        plot_filters(results, labels, outdir)
    # get discriminative filter indices in consensus matrix
    keep_idx = discriminative_filters(
        results,
        outdir,
        filter_diff_thres,
        positive_filters_only=positive_filters_only,
        show_filters=show_filters)

    # encode the sample and sample-phenotype for each cell
    sample_sizes = []
    per_cell_ids = []
    for i, x in enumerate(samples):
        sample_sizes.append(x.shape[0])
        per_cell_ids.append(i * np.ones(x.shape[0]))
    # for each selected filter, plot the selected cell population
    x = np.vstack(samples)
    z = np.hstack(per_cell_ids)

    if results['scaler'] is not None:
        x = results['scaler'].transform(x)

    print('Computing t-SNE projection...')
    tsne_idx = np.random.choice(x.shape[0], tsne_ncell)
    x_for_tsne = x[tsne_idx].copy()
    x_tsne = TSNE(n_components=2).fit_transform(x_for_tsne)
    vmin, vmax = np.zeros(x.shape[1]), np.zeros(x.shape[1])
    for seq_index in range(x.shape[1]):
        vmin[seq_index] = np.percentile(x[:, seq_index], 1)
        vmax[seq_index] = np.percentile(x[:, seq_index], 99)
    fig_path = os.path.join(outdir, 'tsne_all_cells')
    plot_tsne_grid(x_tsne,
                   x_for_tsne,
                   fig_path,
                   labels=labels,
                   fig_size=(20, 20),
                   point_size=5)

    return_filters = []
    for i_filter in keep_idx:
        w = filters[i_filter, :nmark]
        b = filters[i_filter, nmark]
        g = np.sum(w.reshape(1, -1) * x, axis=1) + b
        g = g * (g > 0)

        # skip a filter if it does not select any cell
        if np.max(g) <= 0:
            continue

        ecdf = sm.distributions.ECDF(g)
        gx = np.linspace(np.min(g), np.max(g))
        gy = ecdf(gx)
        plt.figure()
        sns.set_style('whitegrid')
        a = plt.step(gx, gy)
        t = filter_response_thres
        # set a threshold to the CDF gradient?
        if response_grad_cutoff is not None:
            by = np.array(a[0].get_ydata())[::-1]
            bx = np.array(a[0].get_xdata())[::-1]
            b_diff_idx = np.where(by[:-1] - by[1:] >= response_grad_cutoff)[0]
            if len(b_diff_idx) > 0:
                t = bx[b_diff_idx[0] + 1]
        plt.plot((t, t), (np.min(gy), 1.), 'r--')
        plt.xlabel('Cell filter response')
        plt.ylabel('Cumulative distribution function (CDF)')
        sns.despine()
        plt.savefig(os.path.join(outdir, 'cdf_filter_%d.pdf' % i_filter),
                    format='pdf')
        plt.clf()
        plt.close()

        condition = g > t
        x1 = x[condition]
        z1 = z[condition]
        g1 = g[condition]

        # skip a filter if it does not select any cell with the new cutoff threshold
        if x1.shape[0] == 0:
            continue

        # else add the filters to selected filters
        return_filters.append((i_filter, t))
        # t-SNE plots for characterizing the selected cell population
        fig_path = os.path.join(outdir,
                                'tsne_cell_response_filter_%d.png' % i_filter)
        plot_2D_map(x_tsne, g[tsne_idx], fig_path, s=5)
        # overlay marker values on TSNE map for selected cells
        fig_path = os.path.join(outdir,
                                'tsne_selected_cells_filter_%d' % i_filter)
        g_tsne = g[tsne_idx]
        x_pos = x_for_tsne[g_tsne > t]
        x_tsne_pos = x_tsne[g_tsne > t]
        plot_tsne_selection_grid(x_tsne_pos,
                                 x_pos,
                                 x_tsne,
                                 vmin,
                                 vmax,
                                 fig_path=fig_path,
                                 labels=labels,
                                 fig_size=(20, 20),
                                 s=5,
                                 suffix='png')

        if clustering is None:
            suffix = 'filter_%d' % i_filter
            plot_selected_subset(x1, z1, x, labels, sample_sizes, phenotypes,
                                 outdir, suffix, stat_test, log_yscale,
                                 group_a, group_b, group_names, regression)
        else:
            if clustering == 'louvain':
                print('Creating a k-NN graph with %d/%d cells...' % (
                    x1.shape[0], x.shape[0]))
                k = 10
                G = create_graph(x1, k, g1, add_filter_response)
                print('Identifying cell communities...')
                cl = G.community_fastgreedy()
                clusters = np.array(cl.as_clustering().membership)
            else:
                print('Clustering using the dbscan algorithm...')
                eps = set_dbscan_eps(x1,
                                     os.path.join(outdir, 'kNN_distances.png'))
                cl = DBSCAN(eps=eps, min_samples=5, metric='l1')
                clusters = cl.fit_predict(x1)

            # discard outliers, i.e. clusters with very few cells
            c = Counter(clusters)
            cluster_ids = []
            min_cells = int(min_cluster_freq * x1.shape[0])
            for key, val in c.items():
                if (key != -1) and (val > min_cells):
                    cluster_ids.append(key)

            num_clusters = len(cluster_ids)
            scores = np.zeros(num_clusters)
            for j in range(num_clusters):
                cl_id = cluster_ids[j]
                scores[j] = np.mean(g1[clusters == cl_id])

            # keep the communities with high cell filter response
            sorted_idx = np.argsort(scores)[::-1]
            scores = scores[sorted_idx]
            keep_idx_comm = [sorted_idx[0]]
            for i in range(1, num_clusters):
                if (scores[i - 1] -
                        scores[i]) < percentage_drop_cluster * scores[i - 1]:
                    keep_idx_comm.append(sorted_idx[i])
                else:
                    break

            for j in keep_idx_comm:
                cl_id = cluster_ids[j]
                xc = x1[clusters == cl_id]
                zc = z1[clusters == cl_id]
                suffix = 'filter_%d_cluster_%d' % (i_filter, cl_id)
                plot_selected_subset(xc, zc, x, labels, sample_sizes,
                                     phenotypes, outdir, suffix, stat_test,
                                     log_yscale, group_a, group_b, group_names,
                                     regression)
    print('Done.\n')
    return return_filters
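# Added sketch (hypothetical usage, not part of the library code above): how
# plot_results might be called, using only parameters documented in its
# signature and docstring. `results`, `samples`, `phenotypes` and `labels` are
# assumed to come from a prior CellCnn analysis; the argument values are placeholders.
selected = plot_results(results, samples, phenotypes, labels,
                        outdir='cellcnn_plots',
                        filter_diff_thres=0.2,
                        filter_response_thres=0.3,
                        stat_test='mannwhitneyu',
                        group_a='healthy', group_b='diseased',
                        clustering='dbscan')
for i_filter, response_threshold in selected:
    print('filter %d selected at response threshold %.2f'
          % (i_filter, response_threshold))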
Example #49
0
#data = data[data[:,2]<1]

data = data[data[:, 3] < 15]
data = data[data[:, 3] > -15]

data = data[data[:, 4] < 15]
data = data[data[:, 4] > -15]

X = np.copy(data[:, 0:5])

X = StandardScaler().fit_transform(X)
#X = MinMaxScaler().fit_transform(X)
data_zs = np.copy(X)

clt = DBSCAN(eps=0.26, min_samples=13)
datalables = clt.fit_predict(data_zs)

r1 = pd.Series(datalables).value_counts()

print(r1)

datapro = np.column_stack((data, datalables))

highdata = datapro[datapro[:, 8] == 0]
lowdata = datapro[datapro[:, 8] == -1]

np.savetxt('highdata.txt', highdata)
np.savetxt('lowdata.txt', lowdata)

arcmin = 15
temp = [0 for i in range(arcmin * 2)]
class Detector(object):
    def __init__(self, config=detector_config):
        self.config = config
        self.grid_size = self.config["grid_size"]
        self.min_cluster_occupied_grids = self.config[
            "min_cluster_occupied_grids"]
        self.map_size = self.config["map_size"]
        self.num_grid = round(self.map_size / self.grid_size)
        self.shape = (self.num_grid, self.num_grid)
        self.location_to_index = build_location_to_index(self.config)
        self.index_to_location = build_index_to_location(self.config)

        self._confidence_map = np.zeros(self.shape, dtype=int)
        self._object_dict = {}
        self._object_label_waiting_list = []
        self._detected_object_number = 0
        # self.runtime = 0
        # self.num_pub = 0
        self.clustering_algo = DBSCAN(
            eps=self.config["neighborhood_size"],
            min_samples=self.config["neighborhood_min_samples"]
        )  # min sample=6 should be the FAST MODE
        logging.info('Global Map Created.')

    def _compute_object_overlap(self, obj1, obj2):

        x_min1, y_min1, x_max1, y_max1 = obj1["bounding_box"]
        x_min2, y_min2, x_max2, y_max2 = obj2["bounding_box"]

        intersection = max(0, (min(x_max1, x_max2) - max(x_min1, x_min2))) * \
                       max(0, (min(y_max1, y_max2) - max(y_min1, y_min2)))
        if intersection < 1e-6:
            return 0
        union = (y_max2 - y_min2) * (x_max2 - x_min2) + (y_max1 - y_min1) * (
            x_max1 - x_min1) - intersection
        return intersection / union

    def _compute_overlap(self, obj, cluster):
        # Plain intersection-over-union is not appropriate here, since the
        # object is represented by its bounding box: the IoU can be very small
        # even when the cluster lies entirely inside the box. Therefore we use
        # ``the intersection over the cluster``, which is still at most 1.

        current_points = cluster["occupied_grids"]
        current_weight = cluster["occupied_weight"]
        if "search_range" in obj:
            x_min, y_min, x_max, y_max = obj["search_range"]
        else:
            x_min, y_min, x_max, y_max = obj["bounding_box"]
        intersection_mask = np_logical_and_list(x_min <= current_points[:, 0],
                                                current_points[:, 0] <= x_max,
                                                y_min <= current_points[:, 1],
                                                current_points[:, 1] <= y_max)
        intersection = current_weight[intersection_mask].sum()
        union = current_weight.sum()
        return intersection / union

    def _remove_object(self, obj_label):
        if obj_label not in self._object_dict:
            return
        self._object_label_waiting_list.append(obj_label)
        self._object_dict.pop(obj_label)

    def _fit(self, points, weight):
        if len(points) == 0:
            return None
        labels = self.clustering_algo.fit_predict(points, sample_weight=weight)
        return labels

    def _find_cluster(self, points, weight, high, low, occupied_grids):
        if len(points) < 1:
            logging.info("No point found in receptive field.")
            return {}
        labels = self._fit(points, weight)
        cluster_num = labels.max()
        if cluster_num < 0:
            logging.info("No cluster found in receptive field.")
            return {}
        raw_cluster_dict = {
            i: {  # cluster info
                "points": points[labels == i],
                "weight": weight[labels == i],
                "high": high[labels == i],
                "low": low[labels == i],
                "mask": labels == i,
                "occupied_grids": occupied_grids[labels == i]
            }
            for i in range(0, cluster_num + 1)
        }
        return raw_cluster_dict

    def _process_cluster(self, raw_cluster_dict):
        cluster_properties = {}
        for label, cluster_info in raw_cluster_dict.items():
            # calculate basic properties of each cluster
            points = cluster_info["points"]
            weight = cluster_info["weight"]
            centroid = np.dot(weight, points) / weight.sum()
            length = sqrt(np.sum(np.square(points - centroid),
                                 axis=1).max()) * 2

            num_occupied_grids = len(points)
            cluster_properties[label] = {
                "area": num_occupied_grids * self.grid_size * self.grid_size,
                "length": length,
                "centroid": centroid,
                "density": weight.sum(),
                "occupied": cluster_info["occupied_grids"].shape[0],
                "high": cluster_info["high"].max(),
                "low": cluster_info["low"].min(),
                # "label": label,
                "occupied_grids": cluster_info["occupied_grids"],
                "occupied_weight": weight,
            }
        return cluster_properties

    def _get_new_label(self):
        if len(self._object_label_waiting_list) == 0:
            new_label = "Object {}".format(self._detected_object_number)
            self._detected_object_number += 1
        else:
            new_label = self._object_label_waiting_list.pop(0)
        return new_label

    def _create_object(self, cluster):
        label = self._get_new_label()
        self._object_dict[label] = DetectedObject(label, cluster, self.config)
        return label

    def _register_cluster(self, cluster_properties):
        modified_objects = set()
        checked_cluster = set()

        for label, obj in self._object_dict.items():  # for each object
            possible_clusters = {}
            if obj["status"] != LOST:
                continue

            for cluster_label, cluster in cluster_properties.items():
                overlap = self._compute_overlap(obj, cluster)
                if overlap > self.config["overlap_threshold"]:
                    possible_clusters[cluster_label] = overlap

            if possible_clusters:
                # pick the cluster with the largest overlap
                target_cluster_label = max(possible_clusters,
                                           key=possible_clusters.get)
                obj.update_property(cluster_properties[target_cluster_label])
                modified_objects.add(label)
                checked_cluster.add(target_cluster_label)

        for cluster_label, cluster in cluster_properties.items(
        ):  # for each cluster
            if cluster_label in checked_cluster: continue
            possible_objects = {}
            for label, obj in self._object_dict.items():  # for each object
                if label in modified_objects: continue
                overlap = self._compute_overlap(obj, cluster)
                if overlap > self.config["overlap_threshold"]:
                    possible_objects[label] = overlap
            if not possible_objects:  # No match existing object
                label = self._create_object(cluster)
            else:
                label = max(possible_objects,
                            key=possible_objects.get)  # object with the largest overlap
                self._object_dict[label].update_property(
                    cluster)  # _update_object(label, cluster)
            modified_objects.add(label)

        all_keys = set(self._object_dict.keys())
        not_modified_objects = all_keys.difference(modified_objects)

        for obj_label in not_modified_objects:
            should_remove = self._object_dict[obj_label].lost()
            if should_remove:
                self._remove_object(obj_label)

        all_objects = list(self._object_dict.items())
        for idx, (obj_label1, obj1) in enumerate(all_objects[:-1]):
            if obj_label1 not in self._object_dict: continue
            # compare each pair only once and never an object with itself
            for obj_label2, obj2 in all_objects[idx + 1:]:
                if obj_label2 not in self._object_dict: continue
                overlap = self._compute_object_overlap(obj1, obj2)
                if overlap > self.config["overlap_threshold"]:
                    if obj1["confidence"] < obj2["confidence"]:
                        self._remove_object(obj_label1)
                    else:
                        self._remove_object(obj_label2)
        return self._object_dict

    def _should_decrease_confidence(self, obj_cluster):
        should = False
        if len(obj_cluster["occupied_grids"]
               ) < self.min_cluster_occupied_grids:
            should = True
        if obj_cluster["length"] > self.config["max_object_length"]:
            should = True
        if obj_cluster["high"] > self.config["max_object_bottom_height"]:
            should = True
        if obj_cluster["status"] == LOST:
            should = True
        return should

    def _verify_objects(self):
        for obj_label, obj in self._object_dict.items():
            if self._should_decrease_confidence(obj):
                obj.decrease_confidence()
            else:
                obj.increase_confidence()

    @property
    def object_dict(self):
        return {
            obj_label: obj_info
            for obj_label, obj_info in self._object_dict.items()
            if obj_info["confidence"] > self.config["min_detected_confidence"]
        }

    @property
    def availiable_obejct_keys(self):
        ret = {}
        for obj_label, obj_info in self._object_dict.items():
            if obj_info["confidence"] > self.config["min_detected_confidence"]:
                index = obj_label.split(' ')[1]
                key = ord(index)  # assumes a single-digit object index
                ret[key] = obj_label
        return ret

    def update(self, input_dict):
        points, weight, high, low = input_dict["points"], input_dict["weight"], input_dict["high"], \
                                    input_dict["low"]

        occupied_grids = input_dict["indices"]
        if points is not None:
            raw_cluster_dict = self._find_cluster(points, weight, high, low,
                                                  occupied_grids)
            prop = self._process_cluster(raw_cluster_dict)
        else:
            prop = {}

        self._register_cluster(prop)

        self._verify_objects()

        return self._object_dict
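
# A tiny, self-contained illustration of the association rule used in
# _register_cluster above: an unmatched cluster is assigned to the existing
# object with the largest overlap, provided that overlap clears the threshold.
# The overlap values below are made up.
overlap_threshold = 0.3
possible_objects = {"Object 0": 0.42, "Object 2": 0.57, "Object 5": 0.31}
candidates = {k: v for k, v in possible_objects.items() if v > overlap_threshold}
if candidates:
    best_label = max(candidates, key=candidates.get)
    print(best_label)  # -> Object 2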
Example #51
0
"""le=LabelEncoder()
y=le.fit_transform(y)
le.transform(['M','B'])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,stratify=y,random_state=1)
pipe_lr=make_pipeline(StandardScaler(),PCA(n_components=2),LogisticRegression(random_state=1))
pipe_lr.fit(X_train,y_train)
y_pred=pipe_lr.predict(X_test)
print(pipe_lr.score(X_test,y_test))
"""
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
X, y = make_moons(n_samples=400, noise=0.05, random_state=0)
db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)
km = KMeans(n_clusters=2, random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1])
plt.show()

plt.scatter(X[y_db == 0, 0],
            X[y_db == 0, 1],
            c='lightblue',
            edgecolor='black',
            marker='o',
            s=40,
            label='cluster 1')
plt.scatter(X[y_db == 1, 0],
            X[y_db == 1, 1],
            c='red',
            edgecolor='black',
            marker='s',
            s=40,
            label='cluster 2')
plt.legend()
plt.show()
Example #52
0
    def detect_outliers(laptimes, divider):
        rng = max(laptimes) - min(laptimes)
        outlier_detection = DBSCAN(min_samples = 2, eps = rng/divider)

        return outlier_detection.fit_predict(np.array(laptimes).reshape(-1, 1))
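
# A self-contained sketch of the same idea as detect_outliers above: flag outlier
# lap times by clustering the 1-D values with eps tied to their spread. The lap
# times and divider below are made up for illustration.
import numpy as np
from sklearn.cluster import DBSCAN

laptimes = [81.2, 80.9, 81.5, 80.7, 95.3, 81.1, 80.8]  # seconds (synthetic)
divider = 10
rng = max(laptimes) - min(laptimes)
labels = DBSCAN(min_samples=2, eps=rng / divider).fit_predict(
    np.array(laptimes).reshape(-1, 1))
print(labels)  # the 95.3 s lap is labelled -1, i.e. treated as an outlier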
Example #53
0
def create_clusters(df, cluster_eps=0.03, min_samples=30):
    dbscan = DBSCAN(eps=cluster_eps, min_samples=min_samples)
    return dbscan.fit_predict(df)
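
# Hypothetical usage of create_clusters above; the DataFrame below is a made-up
# example with tightly scaled features so that eps=0.03 is a sensible radius.
# DBSCAN is imported here because the function body above relies on it.
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

rng = np.random.RandomState(0)
df = pd.DataFrame({
    "x": np.r_[rng.normal(0.2, 0.005, 50), rng.normal(0.8, 0.005, 50)],
    "y": np.r_[rng.normal(0.3, 0.005, 50), rng.normal(0.7, 0.005, 50)],
})
labels = create_clusters(df, cluster_eps=0.03, min_samples=30)
print(np.unique(labels, return_counts=True))  # two clusters of 50 points each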
Example #54
0
# Parameters: eps (maximum neighbourhood distance), min_samples (number of samples in a point's neighbourhood for it to be considered a core point)
dbscan = DBSCAN(eps=0.127, min_samples=20)

# List of algorithms to run
algorithms = (('DBSCAN', dbscan),)
cluster_predict = {}

# Loop over the algorithms
filas_tabla_res = []

print('----- Running ' + 'DBSCAN', end='')  # -----

# Start timing
t = time.time()
# Run the algorithm and assign the clusters
cluster_predict['DBSCAN'] = dbscan.fit_predict(X_normal)
tiempo = time.time() - t
# Print results
print(": {:.2f} seconds, ".format(tiempo), end='')
try:
    metric_CH = metrics.calinski_harabasz_score(X_normal, cluster_predict['DBSCAN'])
    print("Calinski-Harabasz Index: {:.3f}, ".format(metric_CH), end='')
except:
    print("----ERROR: No podemos calcular el índice Calinski-Harabasz---")
    metric_CH = -1
# Another quality measure, less efficient to compute:
# the Silhouette calculation can use a lot of RAM. With many data points, say more than 10k, a sample can be used instead, e.g. 20% (a sampled call is sketched right after this block)
if len(X) > 10000:
    muestra_silhoutte = 0.2
else:
    muestra_silhoutte = 1.0
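
# The sampled Silhouette computation described in the comment above might look
# like this: a sketch that reuses metrics, X, X_normal, cluster_predict and
# muestra_silhoutte from this example, and passes sklearn's sample_size
# parameter so the score is computed on a random subset.
try:
    metric_SC = metrics.silhouette_score(
        X_normal, cluster_predict['DBSCAN'],
        metric='euclidean',
        sample_size=int(muestra_silhoutte * len(X)),
        random_state=123456)
    print("Silhouette Coefficient: {:.5f}".format(metric_SC))
except:
    print("----ERROR: the Silhouette Coefficient could not be computed---")
    metric_SC = -1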
Example #55
0
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(xlst)

distances, indices = nbrs.kneighbors(xlst)

mean_dist = np.mean(distances)
print("MEAN NEAREST NEIGHBOR DISTANCE", np.mean(distances))

clustering = DBSCAN(eps=mean_dist * 4.0, min_samples=1)

xlst_cluster = xlst * 1.0

xlst_cluster[:, 0] *= 5.0

print('xlst shape', xlst.shape)

cluster_labels = clustering.fit_predict(xlst_cluster)

print('cl', cluster_labels)

cluster2chars = {}

for j in range(0, len(cluster_labels)):
    if not cluster_labels[j] in cluster2chars:
        cluster2chars[cluster_labels[j]] = []

    cluster2chars[cluster_labels[j]].append(xlst[j])

cluster2color = {}

for keyc in cluster2chars:
    randcolor = [
Example #56
0
epsilon_deviation = 0.33
change_width_for = {}
weight_min_width = 0.9
weight_max_width = 0.1
weight_max_deviation = 0.6
weight_min_deviation = 0.4
allowed_group_wind_speed_deviation = 0.13
min_points_in_group = 4
limit_elements_in_group_std = 1.6
limit_next_center_std = 0.5
right_diff_limit = 5

for group in data_set.groupby('discrete_ActivePower'):

    work_group = group[1].copy()
    work_group['dbscan_label'] = dbscan_alg_obj.fit_predict(
        work_group[['scaled_WindSpeed', 'discrete_ActivePower']])
    work_group = work_group[work_group['dbscan_label'] != -1].copy()

    if work_group.shape[0] <= n_kmeans_clusters:
        continue

    kmeans_alg_obj.fit(work_group[['scaled_WindSpeed',
                                   'discrete_ActivePower']])
    work_group['kmeans_label'] = kmeans_alg_obj.labels_
    label_positions = [
        i[0] for i in sorted(enumerate(kmeans_alg_obj.cluster_centers_[:, 0]),
                             key=lambda x: x[1])
    ]
    sorted_values = np.sort(kmeans_alg_obj.cluster_centers_[:, 0], axis=0)
    current_width = sorted_values[-1] - sorted_values[0]
Example #57
0
X.head()
Xscaled=StandardScaler().fit_transform(X) #array o/p
Xn=normalize(Xscaled) #array o/p
X=pd.DataFrame(Xn,columns=data.columns,index=data.index)
X.head()

obj=PCA(n_components=2,random_state=123)
Xcomp=obj.fit_transform(X)
Xcomp=pd.DataFrame(Xcomp, columns=['P1',"P2"],index=X.index)
Xcomp.head()

#model
model=DBSCAN(eps=0.3,min_samples=4)
y_pred = model.fit_predict(Xcomp)  # fit once and keep the cluster labels
labels = model.labels_

#plot
plt.scatter(Xcomp.iloc[:, 0], Xcomp.iloc[:, 1], c=y_pred, cmap='Paired')
plt.title("DBSCAN")
#summary
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print(' number of clusters: %d' % n_clusters_)
print(' number of noise points: %d' % n_noise_)
print("Silhouette Coefficient: %0.3f"% metrics.silhouette_score(Xcomp, labels))


# DBSCAN Clustering function
# def dbscan(X, eps, min_samples):
#     ss = StandardScaler()
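
# A possible completion of the commented-out helper above (an assumption, not the
# original author's code): standardise the features, then fit DBSCAN and return
# the cluster labels.
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

def dbscan(X, eps, min_samples):
    ss = StandardScaler()
    X_scaled = ss.fit_transform(X)
    return DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_scaled)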
Example #58
0
def main(_argv):
    # Hyperparameters
    max_cosine_distance = 0.4  # Used in deep sort
    nn_budget = None  # Used in deep sort
    nms_max_overlap = 1.0
    gathering_thresh = 3
    pixels_to_meter = 150  #300

    # Initialize person deep sort
    model_filename = 'model_data/mars-small128.pb'
    person_encoder = gdet.create_box_encoder(
        model_filename, batch_size=1
    )  # This encodes the data inside a bounding box into a vector
    person_metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance,
        nn_budget)  # Calculate cosine distance metric
    person_tracker = Tracker(person_metric)  # Initialize person tracker
    print("Person deep sort initialized")

    # Initialize group sort
    group_metric = nn_matching.NearestNeighborDistanceMetric(
        "euclidean", max_cosine_distance, nn_budget)
    group_tracker = GroupTracker(group_metric)
    print("Group deep sort initialized")

    # Initialize DBSCAN model for clustering
    dbscan_model = DBSCAN(eps=pixels_to_meter, min_samples=1)
    print("DBSCAN initialized")

    # load configuration for object detector
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    session = InteractiveSession(config=config)
    STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config(FLAGS)
    input_size = FLAGS.size
    video_path = FLAGS.video

    # Load object detection model
    saved_model_loaded = tf.saved_model.load(FLAGS.weights,
                                             tags=[tag_constants.SERVING])
    infer = saved_model_loaded.signatures['serving_default']
    print("Object detection model initialized")

    # Read in all class names from config
    class_names = utils.read_class_names(cfg.YOLO.CLASSES)
    # Only allowed_classes will be drawn
    allowed_classes = ['person']  #list(class_names.values())

    # Display/Visual things
    video = None
    out = None
    if not FLAGS.output:
        #video = VideoReader(video_path) # Initialize async reader for video
        video = cv2.VideoCapture(video_path)
    else:
        video = cv2.VideoCapture(video_path)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(video.get(cv2.CAP_PROP_FPS))
        codec = cv2.VideoWriter_fourcc(*FLAGS.output_format)
        out = cv2.VideoWriter(FLAGS.output, codec, fps, (width, height))

    # Initialize color map
    cmap = plt.get_cmap('tab20b')
    colors = [cmap(i)[:3] for i in np.linspace(0, 1, 20)]

    display_yolo = False  # Controls whether yolo bounding boxes are drawn
    display_person_track = True
    display_centroids = False  # Controls whether bounding box centroids are drawn
    display_groups = True
    video_written = False

    cv2.namedWindow("detection", cv2.WINDOW_NORMAL)
    cv2.resizeWindow("detection", 1280, 720)

    # Main loop
    while True:
        """ PERFORM READING OF FRAME """
        _, frame = video.read()
        if frame is None:
            print('Video has ended, restarting video...')

            if FLAGS.output:
                if not video_written:
                    print("Output video written!")
                    out.release()
                    video_written = True
                # When writing an output file, stop after a single pass.
                break

            # Otherwise restart the video and keep tracking.
            try:
                video.reset()
            except:
                video = cv2.VideoCapture(video_path)
            person_tracker.reset_tracks()
            group_tracker.reset_tracks()
            _, frame = video.read()

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Reformatting frame read from video
        frame_size = frame.shape[:2]
        image_data = cv2.resize(frame, (input_size, input_size))
        image_data = image_data / 255.
        image_data = image_data[np.newaxis, ...].astype(np.float32)
        start_time = time.time()
        """ END READING OF FRAME """
        """ PERFORM OBJECT DETECTION FOR PERSON """
        # Convert frame image data to tensorflow input matrix and perform prediction
        batch_data = tf.constant(image_data)
        pred_bbox = infer(batch_data)
        for key, value in pred_bbox.items():
            boxes = value[:, :, 0:4]
            pred_conf = value[:, :, 4:]

        # On predictions, run NMS to clean duplicate bounding boxes outputted by the model
        boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
            boxes=tf.reshape(boxes, (tf.shape(boxes)[0], -1, 1, 4)),
            scores=tf.reshape(
                pred_conf,
                (tf.shape(pred_conf)[0], -1, tf.shape(pred_conf)[-1])),
            max_output_size_per_class=50,
            max_total_size=50,
            iou_threshold=FLAGS.iou,
            score_threshold=FLAGS.score)

        # Convert data to numpy arrays and slice out unused elements
        num_objects = valid_detections.numpy()[0]
        bboxes = boxes.numpy()[0]
        bboxes = bboxes[0:int(num_objects)]
        scores = scores.numpy()[0]
        scores = scores[0:int(num_objects)]
        classes = classes.numpy()[0]
        classes = classes[0:int(num_objects)]

        # Get usable format of bounding boxes
        original_h, original_w, _ = frame.shape
        bboxes, bboxes_xyxy = utils.format_boxes(bboxes, original_h,
                                                 original_w)

        # Drawing YOLO detected bounding boxes
        if display_yolo:
            for j in range(0, len(bboxes_xyxy)):
                if classes[j] != 0:
                    continue
                box = bboxes_xyxy[j]
                cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]),
                              (0, 0, 255), 3)

        # Loop through objects and use class index to get class name, allow only classes in allowed_classes list
        names = []
        deleted_indx = []
        for i in range(num_objects):
            class_indx = int(classes[i])
            class_name = class_names[class_indx]
            if class_name not in allowed_classes:
                deleted_indx.append(i)
            else:
                names.append(class_name)
        names = np.array(names)
        count = len(names)
        if FLAGS.count:
            cv2.putText(frame, "Objects being tracked: {}".format(count),
                        (5, 35), cv2.FONT_HERSHEY_COMPLEX_SMALL, 2,
                        (0, 255, 0), 2)
            print("Objects being tracked: {}".format(count))
        # Delete detections that are not in allowed_classes
        bboxes = np.delete(bboxes, deleted_indx, axis=0)
        scores = np.delete(scores, deleted_indx, axis=0)
        """ END OBJECT DETECTION FOR PERSON """
        """ PERFORM PERSON DEEP SORT """
        # Encode YOLO person detections to feed to person_tracker
        person_features = person_encoder(frame, bboxes)
        person_detections = [
            Detection(bbox, score, class_name, feature)
            for bbox, score, class_name, feature in zip(
                bboxes, scores, names, person_features)
        ]

        # Run non-maxima supression
        boxs = np.array([d.tlwh for d in person_detections])
        scores = np.array([d.confidence for d in person_detections])
        classes = np.array([d.class_name for d in person_detections])
        indices = preprocessing.non_max_suppression(boxs, classes,
                                                    nms_max_overlap, scores)
        person_detections = [person_detections[i] for i in indices]

        # Execute the person tracker
        person_tracker.predict()
        person_tracker.update(person_detections)

        # Update tracks to get bounding boxes of people
        person_bboxes = []
        person_centroids = []
        for person_track in person_tracker.tracks:
            if (not person_track.is_confirmed()
                    or person_track.time_since_update > 3):
                continue
            person_bboxes.append([int(x) for x in person_track.to_tlbr()])
            person_centroids.append(utils.get_centroid(person_track.to_tlwh()))

            if display_person_track:
                bbox = person_track.to_tlbr()
                color = colors[int(person_track.track_id) % len(colors)]
                color = [i * 255 for i in color]
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), color, 2)
                cv2.rectangle(
                    frame, (int(bbox[0]), int(bbox[1] - 30)),
                    (int(bbox[0]) +
                     (len("person") + len(str(person_track.track_id))) * 17,
                     int(bbox[1])), color, -1)
                cv2.putText(frame, "person" + "-" + str(person_track.track_id),
                            (int(bbox[0]), int(bbox[1] - 10)), 0, 0.75,
                            (255, 255, 255), 2)
        """ END PERSON DEEP SORT """
        """ PERFORM GROUP SORT """
        # Detect and draw clusters of people too close together
        cluster_bboxes = []
        cluster_sizes = []
        if len(person_bboxes) != 0:
            cluster_assignments = dbscan_model.fit_predict(person_centroids)
            clusters = np.unique(cluster_assignments)

            for cluster in clusters:
                # Get array of centroids detected to be under current cluster
                row_ix = np.where(cluster_assignments == cluster)

                color = colors[random.randint(0, 10) % len(colors)]
                color = [j * 255 for j in color]

                point_arr = []
                for i in row_ix[0]:
                    point_arr.append(list(person_bboxes[i][0:2]))
                    point_arr.append(list(person_bboxes[i][2:4]))

                    if display_centroids:
                        frame = cv2.circle(
                            frame,
                            (person_centroids[i][0], person_centroids[i][1]),
                            20, color, -1)

                # Get bounding rectangle that covers group of people
                x, y, w, h = cv2.boundingRect(np.array(point_arr))

                # Skip all bounding rectangles that have height or width of 1
                if w != 1 and h != 1:
                    cluster_bboxes.append([x, y, w,
                                           h])  # Store bounding box of cluster
                    cluster_sizes.append(
                        len(point_arr) //
                        2)  # Store number of people in current cluster

        # Encode cluster data and feed to tracker
        group_features = []
        for i in range(0, len(cluster_bboxes)):
            cur_feature = np.array([cluster_sizes[i]])  # Get people in group

            # Get centre (mean position) of group
            # First get centroids of each bounding box in group
            bbox_centroids = np.apply_along_axis(utils.get_centroid, 1,
                                                 cluster_bboxes)
            mean = np.mean(bbox_centroids, axis=0)
            cur_feature = np.concatenate((cur_feature, mean))

            # Get variance
            variance = np.var(bbox_centroids, axis=0)
            cur_feature = np.concatenate((cur_feature, variance))
            group_features.append(cur_feature)

        group_features = np.array(group_features)
        group_detections = [
            GroupDetection(bbox, num_people,
                           feature) for bbox, num_people, feature in zip(
                               cluster_bboxes, cluster_sizes, group_features)
        ]

        # Call the Deep sort tracker
        group_tracker.predict()
        group_tracker.update(group_detections)

        # update tracks
        for group_track in group_tracker.tracks:
            # Skip a track if it is not confirmed, if 2 or more frames have passed
            # since it was last matched to a detection, or if the group is too small
            # to count as a gathering. Stale tracks are deleted inside tracker.update().
            if not group_track.is_confirmed() or group_track.time_since_update > 1 or \
                    group_track.num_people <= gathering_thresh:
                continue
            bbox = group_track.to_tlbr()
            class_name = group_track.get_class()

            # Draw detection box on screen
            blank_frame = np.zeros(frame.shape, np.uint8)
            cv2.rectangle(blank_frame, (int(bbox[0]), int(bbox[1])),
                          (int(bbox[2]), int(bbox[3])), (255, 0, 0),
                          cv2.FILLED)
            frame = cv2.addWeighted(frame, 1.0, blank_frame, 0.4, 1)
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                          (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)
            cv2.putText(
                frame,
                f"{group_track.track_id} | Size: {group_track.num_people}",
                tuple(group_track.get_centroid()), 0, 2, (255, 255, 255), 2)

            # TODO: if the track age exceeds a certain number of frames, issue an alert.
        """ END GROUP SORT """

        # Calculate frames per second of entire process
        fps = 1.0 / (time.time() - start_time)
        print("FPS: %.2f" % fps)
        result = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        if not FLAGS.dont_show:
            cv2.imshow("detection", result)

        # if output flag is set, save video file
        if FLAGS.output:
            out.write(result)

        key_press = cv2.waitKey(1) & 0xFF
        if key_press == ord('q') or key_press == 27:  # ESC key
            break
        elif key_press == ord('y'):
            display_yolo = not display_yolo
        elif key_press == ord('c'):
            display_centroids = not display_centroids
        elif key_press == ord('p'):
            display_person_track = not display_person_track

    cv2.destroyAllWindows()
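
# A stripped-down, self-contained sketch of the grouping step used in the main
# loop above: cluster person centroids with DBSCAN (eps in pixels, min_samples=1
# so every person belongs to some group) and report groups larger than
# gathering_thresh. The centroids below are made up.
import numpy as np
from sklearn.cluster import DBSCAN

pixels_to_meter = 150
gathering_thresh = 3
person_centroids = np.array([[100, 200], [180, 230], [220, 260], [150, 320],
                             [900, 500], [950, 520]])

labels = DBSCAN(eps=pixels_to_meter, min_samples=1).fit_predict(person_centroids)
for cluster in np.unique(labels):
    members = np.where(labels == cluster)[0]
    if len(members) > gathering_thresh:
        print("gathering of {} people at {}".format(
            len(members), person_centroids[members].tolist()))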
Example #59
0
from sklearn.cluster import DBSCAN
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

base = pd.read_csv('dataset/credit_card_clients.csv', header = 1)

base['BILL_TOTAL'] = base['BILL_AMT1'] + base['BILL_AMT2'] + base['BILL_AMT3']+ base['BILL_AMT4']+ base['BILL_AMT5']+ base['BILL_AMT6']

x = base.iloc[:,[1,25]].values
scaler = StandardScaler()
x = scaler.fit_transform(x)

dbscan = DBSCAN(eps = 0.37, min_samples = 4)
previsoes = dbscan.fit_predict(x)

# returns the unique cluster labels and their counts
unicos, quantidade = np.unique(previsoes, return_counts = True)

plt.scatter(x[previsoes == 0, 0], x[previsoes == 0, 1], s = 100, c = 'red', label = 'Cluster 1')
plt.scatter(x[previsoes == 1, 0], x[previsoes == 1, 1], s = 100, c = 'orange' , label = 'Cluster 2')
plt.scatter(x[previsoes == 2, 0], x[previsoes == 2, 1], s = 100, c = 'green', label = 'Cluster 3')
plt.xlabel('Credit limit')
plt.ylabel('Spending')
plt.legend()

lista_clientes = np.column_stack((base, previsoes))
lista_clientes = lista_clientes[lista_clientes[:, 26].argsort()]

Example #60
0
#
# for eps in np.linspace(0.001, 10, 1000):
#     for neighbours in range(1, 10):
#         dbscan = DBSCAN(eps=eps, min_samples=neighbours, metric='canberra', algorithm='brute')
#         y_predicted = dbscan.fit_predict(frame) + 1
#
#         score = accuracy_score(y_predicted, train_ys)
#         if score > mx_score:
#             mx_score = score
#             algo = dbscan
#
# print(mx_score)
# print(algo.get_params(True))

best_algo = DBSCAN(eps=1.0119099099099098, min_samples=9, metric='canberra', algorithm='brute')
pred = best_algo.fit_predict(frame) + 1

draw_clusters(train_xs, train_ys, 'Original dataset: Wine', save_name='original.png')
draw_clusters(train_xs, pred, 'Clusterized by DBSCAN dataset: Wine   |   Accuracy: 0.78', save_name='clusterized.png')

eps_xs = []
f_scores = []
silhouette_scores = []

for eps in np.linspace(0.001, 3, 500):
    algo = DBSCAN(eps=eps, min_samples=4, metric='canberra', algorithm='brute')
    predicted = algo.fit_predict(frame) + 1
    eps_xs.append(eps)
    f_score = accuracy_score(train_ys, predicted)
    f_scores.append(f_score)
    if len(set(algo.labels_ + 1)) == 1: