Example #1
def clusterMalwareNames(malwareNames):
    # strictly lexical clustering over malware-names
    wordCount = {}
    # create a distance matrix
    matrix = np.zeros((len(malwareNames), len(malwareNames)))
    for i in range(len(malwareNames)):
        for j in range(len(malwareNames)):
            if matrix[i, j] == 0.0:        
                matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j])
                matrix[j, i] = matrix[i, j]
    
    # Scikit-Learn's DBSCAN implementation to cluster the malware-names
    clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
    clust.fit(matrix)    
    
    preds = clust.labels_
    clabels = np.unique(preds)
    
    # create Word-Count Map
    for i in range(clabels.shape[0]):
        if clabels[i] < 0:
            continue
        
        cmem_ids = np.where(preds == clabels[i])[0]
        cmembers = []
        
        for cmem_id in cmem_ids:
            cmembers.append(malwareNames[cmem_id])
        
        wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
    return wordCount
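Example #1 calls two helpers, computeSimilarity and uniqueList, whose definitions are not shown. Below is a minimal sketch of plausible stand-ins, assuming the similarity function is meant to return a distance in [0, 1] (the eps=0.1 threshold suggests small values mean near-identical names); the real implementations may differ.

# Hypothetical stand-ins for the helpers used in Example #1; the originals
# are not shown, so these are illustrative guesses only.
from difflib import SequenceMatcher

def computeSimilarity(a, b):
    # Return a lexical distance in [0, 1]: 0 for identical names.
    return 1.0 - SequenceMatcher(None, a.lower(), b.lower()).ratio()

def uniqueList(items):
    # Drop duplicates while preserving order.
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]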
Example #2
def cluster():
    eps_set = 0.5 * np.arange(1, 7)
    npt_set = np.arange(1, 6)
    scores = []
    global res
    res = []
    for eps in eps_set:
        for npt in npt_set:
            est = DBSCAN(eps=eps, min_samples=npt)
            est.fit(x)
            ari = metrics.adjusted_rand_score(y, est.labels_)
            scores.append(ari)
            n_noise = len([ l for l in est.labels_ if l == -1])
            res.append((ari, np.max(est.labels_) + 1 , n_noise))
            print ari
    max_score = np.max(scores)
    max_idx = scores.index(max_score)
    max_eps = eps_set[max_idx / len(npt_set)]
    max_npt = npt_set[max_idx % len(npt_set)]
    print max_score, max_eps, max_npt
    scores = np.array(scores).reshape(len(eps_set), len(npt_set))
    pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
    pl.colorbar()
    pl.xticks(np.arange(len(npt_set)), npt_set)
    pl.yticks(np.arange(len(eps_set)), eps_set)
    pl.ylabel('eps')
    pl.xlabel('min_samples')
    pl.show()
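Example #2 scores every (eps, min_samples) pair with the adjusted Rand index, which needs ground-truth labels y. The sketch below shows the same sweep scored with the silhouette coefficient for the unsupervised case; it assumes a feature matrix x like the one fitted above and skips parameter pairs that give a degenerate labelling.

# Unsupervised variant of the sweep in Example #2, scored with the
# silhouette coefficient instead of ARI (no ground-truth labels needed).
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

def sweep_dbscan(x, eps_set=0.5 * np.arange(1, 7), npt_set=np.arange(2, 6)):
    best = (-1.0, None, None)
    for eps in eps_set:
        for npt in npt_set:
            labels = DBSCAN(eps=eps, min_samples=int(npt)).fit(x).labels_
            n_labels = len(set(labels))
            # silhouette_score needs between 2 and n_samples - 1 labels
            if n_labels < 2 or n_labels >= len(x):
                continue
            score = silhouette_score(x, labels)
            if score > best[0]:
                best = (score, eps, npt)
    return best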
Example #3
def find_tracks(data, eps=20, min_samples=20):
    """Applies the DBSCAN algorithm from scikit-learn to find tracks in the data.

    Parameters
    ----------
    data : array-like
        An array of (x, y, z, hits) data points
    eps : number, optional
        The maximum distance between adjacent points in a cluster
    min_samples : number, optional
        The min number of points in a cluster

    Returns
    -------
    tracks : list
        A list of tracks. Each track is an ndarray of points.

    """
    xyz = data[:, 0:3]
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(xyz)

    tracks = []
    for track in (np.where(dbs.labels_ == n)[0] for n in np.unique(dbs.labels_) if n != -1):
        tracks.append(data[track])

    return tracks
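A quick, hypothetical usage sketch for find_tracks on synthetic (x, y, z, hits) points; the two generated tracks and the parameter values are made up, and numpy plus scikit-learn's DBSCAN are assumed to be importable as in Example #3.

# Hypothetical usage of find_tracks from Example #3 on two well-separated
# synthetic line segments plus a random "hits" column.
import numpy as np

rng = np.random.RandomState(0)
t = np.linspace(0, 100, 200)
track_a = np.column_stack([t, t, t, rng.rand(200)])
track_b = np.column_stack([t + 500, -t, t, rng.rand(200)])
data = np.vstack([track_a, track_b])

tracks = find_tracks(data, eps=20, min_samples=20)
print(len(tracks))  # two tracks expected under these assumptions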
Example #4
def test():
    global est
    est = DBSCAN(eps=1, min_samples=1)
    est.fit(x)
    print est.labels_
    ari = metrics.adjusted_rand_score(y, est.labels_)
    print ari
Example #5
def train_dbscan():
	print "starting dbscan clustering..."
	model = DBSCAN(eps=dbs_eps, min_samples=dbs_min_samples, metric=dbs_metric, algorithm='auto')
	model.fit(X)
	
	core_points = model.core_sample_indices_
	if output_core_points:
		print "core points data index"
		print core_points
	print "num of core points %d" %(len(core_points))

	print "all points cluster index"
	cluster_index = model.labels_
	if output_cluster_members:
		#print cluster_index
		cluster_members = {}
		for i,c in enumerate(cluster_index):
			index_list = cluster_members.get(c, list())
			index_list.append(i)
			cluster_members[c] = index_list
		for cl, indx_list in cluster_members.iteritems():
			if cl >= 0:
				print "cluster index %d  size %d" %(cl, len(indx_list))
			else:
				print "noise points size %d" %(len(indx_list))
			print indx_list
	
	print "num of clusters %d" %(cluster_index.max() + 1)
Example #6
    def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):

        BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
        END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING

        data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

        labels = None
        if clusterType == 'kmeans':
            kmeans = KMeans(n_clusters=N_CLUSTERS)
            kmeans.fit(data)
            labels = kmeans.labels_
        elif clusterType == 'affinity_propagation':
            ap = AffinityPropagation(damping=0.75)
            ap.fit(data)
            labels = ap.labels_
            N_CLUSTERS = np.max(labels)+1
        elif clusterType == 'DBSCAN':
            dbscan = DBSCAN()
            dbscan.fit(data)
            labels = dbscan.labels_
            N_CLUSTERS = np.max(labels)+1
            print 'N_CLUSTERS=' + str(N_CLUSTERS)
        elif clusterType == 'AgglomerativeClustering':
            ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
            ac.fit(data)
            labels = ac.labels_
        else:
            print 'ERROR: clusterType: ' + clusterType + ' is not recognized'

        return (labels, N_CLUSTERS)
Example #7
def cluster_mappings(vector_inpath, do_pca=False, target_dim=100, indices_inpath=None, epsilon=2.5, min_s=20):
	# TODO: Clustering parameters
	# TODO: Metric: cosine similarity or euclidean distance
	print alt("Load mappings...")
	indices, model = load_mappings_from_model(vector_inpath)
	X = numpy.array([model[key] for key in indices])
	# del model
	if do_pca:
		print alt("Truncate vectors with PCA to %i dimensions..." %(target_dim))
		pca = PCA(n_components=target_dim)
		pca.fit(X)
		X = pca.transform(X)
	print alt("Cluster points...")
	# k = 2 * X[0].shape[0] - 1
	# min_pts = k + 1
	#dbscan = DBSCAN(eps=0.1, min_samples=20, metric='cosine',algorithm='brute')
	dbscan = DBSCAN(eps=epsilon, min_samples=min_s)
	dbscan.fit(X)
	labels = dbscan.labels_
	print get_cluster_size(labels)
	print alt("Finished clustering!")
	sscore = silhouette_score(X, labels)
	print("Silhouette Coefficient: %0.3f" %(sscore))
	if indices_inpath:
		resolve_indices(indices, labels, indices_inpath, model)
Example #8
 def dbscan_algo(self,cluster,X=None):
     
     if self.dMetric=='levenstein':
         clust = DBSCAN(eps=self.epsilon,min_samples=1,metric="precomputed")
         clust.fit(X)
     else:
         vectorizer = TfidfVectorizer().fit_transform(cluster)
         dataX = TfidfTransformer(norm='l1',smooth_idf=True,use_idf=True,sublinear_tf=False).fit_transform(vectorizer)
         clust = DBSCAN(eps=self.epsilon,metric="cosine",min_samples=3,algorithm='brute')
         clust.fit(dataX)
     
     companyNames = cluster
     
     preds = clust.labels_
     clabels = np.unique(preds)
     for i in range(clabels.shape[0]):
         if clabels[i] < 0:
             continue
         cmem_ids = np.where(preds==clabels[i])[0]
         cmembers = []
         for cmem_id in cmem_ids:
             cmembers.append(companyNames[cmem_id])
         clusteritems = ",".join(cmembers)
         print clusteritems
         if len(cmem_ids) > 1:
             self.result.write("Clustered: %s"%clusteritems)
             self.result.write('\n')
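The non-Levenshtein branch above vectorizes the strings with TF-IDF and clusters them with cosine distances. Here is a standalone sketch of that pattern on a toy list of names; the names and the eps value are illustrative only.

# Standalone sketch of the TF-IDF + cosine-distance branch of the method above.
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

names = ["Acme Corp", "Acme Corporation", "ACME corp.",
         "Globex Inc", "Globex Incorporated", "Initech"]
X = TfidfVectorizer().fit_transform(names)

labels = DBSCAN(eps=0.5, metric="cosine", min_samples=2,
                algorithm="brute").fit(X).labels_
for name, label in zip(names, labels):
    print("%d  %s" % (label, name))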
Example #9
    def on_squaremsg_received(self, msg):
        detected_squares = []
        for square_msg in msg.squares:
            detected_squares.append(TrackedSquare.from_msg(square_msg))

        self._prev_squares.append(detected_squares)
        
        all_squares = list(itertools.chain.from_iterable(self._prev_squares))
        square_centers = [list(s.center) + [s.hue] for s in all_squares]
        data = np.array(square_centers)

        ms = DBSCAN(eps=64, min_samples=3)
        ms.fit(data)
        labels = ms.labels_

        ts_msg = TrackedSquares()
        for i, s in enumerate(all_squares):
            label = np.int0(labels[i])
            if label < 0: 
                continue

            s.tracking_colour = TrackedSquare.TRACKING_COLOURS[label % len(TrackedSquare.TRACKING_COLOURS)]
            s.tracking_detected = True

            ts_msg.squares.append(s.to_msg())

        self._squares_pub.publish(ts_msg)
Example #10
    def cluster_dbscan(self, calpha=False, cluster_diameter=6, cluster_min_size=10):
        '''
        cluster the residues using the DBSCAN method. 
        The parameters here are neighborhood diameter (eps) and neighborhood 
        connectivity (min_samples).
        
        Returns a list of cluster labels, in which label ``-1`` means an outlier point,
        which doesn't belong to any cluster.
        '''

        if not self.positive_residues:
            return {}
        
        if calpha:
            data_atoms = self.positive_residues.select('ca')
        else:
            data_atoms = self.positive_residues.select('sidechain or ca').copy()
        
        assert (
                data_atoms.getHierView().numResidues() == 
                self.positive_residues.getHierView().numResidues()
                )
        
        OUTLIER_LABEL = -1
        
        db_clust = DBSCAN(eps=cluster_diameter, min_samples=cluster_min_size)
        db_clust.fit(data_atoms.getCoords())

        db_labels = db_clust.labels_.astype(int)
        #print db_labels, len(db_labels)
        if calpha:
            residue_labels = db_labels
        
        else:
            residues = list(data_atoms.getHierView().iterResidues())
            residue_labels = np.zeros(len(residues), dtype=int)
            
            def most_common(lst):
                lst = list(lst)
                return max(set(lst) or [OUTLIER_LABEL], key=lst.count)
            
            data_atoms.setBetas(db_labels)
            for i, res in enumerate(residues):
                atom_labels = res.getBetas()
                residue_labels[i] = most_common(atom_labels[atom_labels!=OUTLIER_LABEL])
                
        assert len(residue_labels) == self.positive_residues.getHierView().numResidues()
        
        residue_numbers = self.positive_residues.ca.getResnums()
        clusters = sorted(
                [residue_numbers[residue_labels==i] for i in
                    set(residue_labels) if i!=-1], 
                key=self.conf_sum, 
                reverse=True,
                )
        return dict(enumerate(clusters))
Example #11
def fit(fvecs, params):
	eps_ = int(params[0])
	min_s = int(params[1])
	metric_=params[2]
	# metric: "euclidean", "l1", "l2", "manhattan", "cosine", or "precomputed"

	model = DBSCAN(eps=eps_, min_samples=min_s, metric=metric_)
	model.fit(fvecs)
	print len(set(model.labels_))
	return model.labels_
Example #12
 def dbscan(self, eps=0.75, min_samples=3):
     """
     :param eps: max distance between two points in the same neighbourhood
     :param min_samples: number of points required to form a neighbourhood
     :return: a Partition built from the DBSCAN cluster labels
     """
     est = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples)
     est.fit(self.get_dm(False))
     return Partition(est.labels_)
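Examples #1, #12, and #14 all pass metric='precomputed' to DBSCAN. Below is a minimal sketch of that pattern using scikit-learn's pairwise_distances to build the matrix; the toy data and parameters are arbitrary.

# Minimal sketch of the metric='precomputed' pattern: build a full pairwise
# distance matrix, then hand it to DBSCAN.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

points = np.random.RandomState(0).rand(50, 3)
dm = pairwise_distances(points, metric="euclidean")  # shape (50, 50)
labels = DBSCAN(eps=0.3, min_samples=5, metric="precomputed").fit(dm).labels_
print(len(set(labels)) - (1 if -1 in labels else 0))  # number of clusters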
Example #13
def db(lngs, lats, city, cluster_diameter):
	city_area = city["area"]
	city_lng = city["lng"]
	city_lat = city["lat"]
	lngs = np.array(lngs)*math.cos(city_lat)

	dbscan = DBSCAN(metric='euclidean')
	dbscan.fit(np.array([lngs, lats]).transpose())
	cluster_labels = np.array(dbscan.labels_)

	return labels_to_index(cluster_labels)
Example #14
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
                  method='ap'):
    """Define clusters given the similarity matrix and the threshold."""
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    if method == 'dbscan':
        ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
    if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
                                 preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]
        if idxs.shape[0] > 1:
            sm = similarity_matrix[idxs][:, idxs]
            sm += sm.T + scipy.sparse.eye(sm.shape[0])

            # Hierarchical clustering
            if method == 'hc':
                dists = squareform(1 - sm.toarray())
                links = fastcluster.linkage(dists, method='ward')
                try:
                    clusters_ = fcluster(links, threshold, 'distance')
                except ValueError as err:
                    logging.critical(err)
                    clusters_ = np.zeros(1, dtype=int)

            # DBSCAN
            elif method == 'dbscan':
                db = ap.fit(1. - sm.toarray())
                # Number of clusters in labels, ignoring noise if present.
                clusters_ = db.labels_
                # n_clusters_ = len(set(clusters_)) - int(0 in clusters_)

            # AffinityPropagation
            # ap = AffinityPropagation(affinity='precomputed')
            elif method == 'ap':
                db = ap.fit(sm)
                clusters_ = db.labels_
            else:
                raise ValueError("clustering method %s unknown" % method)

            if np.min(clusters_) == 0:
                clusters_ += 1
            clusters_ += prev_max_clust
            clusters[idxs] = clusters_
            prev_max_clust = max(clusters_)
        else:  # connected component contains just 1 element
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust
    return np.array(extra.flatten(clusters))
Example #15
def score_sam(min_val, max_val, incr=1):
    sam_range = range(min_val, max_val, incr)
    scores = []
    for k in sam_range:
        db = DBSCAN(eps=2, min_samples=k)
        db.fit(X_scaled)
        if len(set(db.labels_)) > 1:
            scores.append(metrics.silhouette_score(X_scaled, db.labels_))
        else:
            scores.append(0)
    return scores
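A hypothetical follow-up to Example #15: pick the min_samples value with the highest silhouette score from the list it returns. This assumes X_scaled and the imports used in that example are in scope, and that the same range arguments are passed to score_sam.

# Pick the best min_samples from the scores returned by score_sam above.
import numpy as np

min_val, max_val = 2, 20          # illustrative range
scores = score_sam(min_val, max_val)
best_k = range(min_val, max_val)[int(np.argmax(scores))]
print("best min_samples: %d" % best_k)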
Example #16
	def simple_clustering(x):
		print alt("Current parameters: %s" %(str(x)))
		dbscan = DBSCAN(eps=x[0], min_samples=x[1], p=x[2])
		dbscan.fit(X)
		cluster_sizes = get_cluster_size(dbscan.labels_)
		print alt("Current cluster sizes: %s" %(cluster_sizes))
		sscore = silhouette_score(X, dbscan.labels_)
		tscore = (sscore / (len(cluster_sizes.keys()) - 1))
		print alt("Current value of objective function: %.5f" %(tscore))
		print "-" * 50
		return -1.0 * tscore
Example #17
 def dbscan(self):
     """
     Use DBSCAN to perform clustering in this chunk
     """
     # Set up DBSCAN
     db = DBSCAN(eps=self.neighborhood,
                 min_samples=self.min_members)
     # Perform the clustering
     db.fit(self.galaxy_coordinates)
     # save the labels
     np.savetxt(self.clusterfile,db.labels_,fmt='%d')
Example #18
def cluster_DB(classif_data, vect_data):
	db = DBSCAN()

	np_arr_train = np.array(vect_data["train_vect"])
	np_arr_label = np.array(classif_data["topics"])
	np_arr_test = np.array(vect_data["test_vect"])

	print "DB"

	db.fit(np_arr_train)
	sil_score = metrics.silhouette_score(np_arr_train, db.labels_, metric='euclidean')
	print sil_score
	return db.labels_
Example #19
class ClusterAnalysis:
    ''' To get results c.image_clusters(areas=True, only_synapses=True)
    '''
    DBSCAN_EPS = 6
    DB_SCAN_MIN_SAMPLES = 4
    MIN_CLUSTER_MEMBERS = 25
    def __init__(self, ves, mem, syn):
        self.ves = to_bool_arr(ves)
        self.mem = to_bool_arr(mem)
        self.syn = to_bool_arr(syn)

        self.psyn = self.syn #& self.mem
        self.nsyn = ~self.syn & self.mem
        self.psynpoints = np.asarray(zip(*self.psyn.nonzero()))

        self.db = DBSCAN(eps=self.DBSCAN_EPS, min_samples=self.DB_SCAN_MIN_SAMPLES)
        self.db.fit(self.psynpoints)

        self.clusters = self._get_clusters()

    def _get_clusters(self):
        clus = {}
        labels = self.db.labels_
        for label in set(labels):
            if label==-1:
                continue
            members = self.psynpoints[labels==label]
            if len(members)<self.MIN_CLUSTER_MEMBERS:
                continue
            clus[len(clus)] = Cluster(members, len(clus), self)
        return clus


    def image_clusters(self, show_only_one=None, areas=False, background=None, only_synapses=False):
        '''Background has to be a PIL.Image. Returns Image'''
        m = np.zeros_like(self.psyn)
        for i, c in self.clusters.items():
            if show_only_one is not None and i!=show_only_one:
                continue
            if only_synapses and not c.synapse:
                continue
            c = c.members if not areas else c.area
            m[c[:,0], c[:,1]] = 1
        if not background:
            return Image.fromarray((m*255).astype(np.uint8))
        else:
            background = background.convert('RGB')
            bg = background.load()
            for p in zip(*m.T.nonzero()):
                bg[p] = (255,0,0)
            return background
Example #20
def main(args):
    linesFile = sys.stdin
    if len(args) > 1:
        linesFile = open(args[1], 'r')

    allFeatures = []
    allFilenames = []
    filename = linesFile.readline()
    while len(filename) > 0:
        dataStr = linesFile.readline()
        features = np.fromstring(dataStr, dtype=int, sep=' ')
        features = features[:len(features)//2]

        allFeatures.append(features)
        allFilenames.append(filename.strip())

        filename = linesFile.readline()
    print 'finished reading all filenames. clustering...'


    dbscan = DBSCAN(eps=1100, min_samples=2, random_state=np.random.RandomState(0))
    dbscan.fit(np.atleast_2d(allFeatures))
    print 'num clusters', len(set(dbscan.labels_))

    homeDir = os.path.expanduser('~')
    groupsDir = os.path.join(homeDir, 'groups')
    templatesDir = os.path.join(groupsDir, 'templates')

    shutil.rmtree(groupsDir)
    os.mkdir(groupsDir)
    os.mkdir(templatesDir)
    for label, filename in zip(dbscan.labels_, allFilenames):
        label = str(int(label))

        groupFolder = os.path.join(groupsDir, label)
        isFirstInstance = not os.path.isdir(groupFolder)
        if isFirstInstance:
            os.mkdir(groupFolder)

        originalImage = cv2.imread(os.path.join("/Users/huipeng/EO990RW8/", filename), 0)
        height, width = originalImage.shape
        resizedImage = cv2.resize(originalImage, (width/4, height/4))

        newFilename = os.path.join(groupFolder, filename + '.png')
        cv2.imwrite(newFilename, resizedImage)
        if isFirstInstance and label != '-1':
            newFilename = os.path.join(templatesDir, label + '_' + filename + '.png')
            cv2.imwrite(newFilename, resizedImage)

    print 'finished'
Example #21
    def _aglom_cluster(self):

        # https://github.com/overlap-ai/words2map/blob/master/words2map.py
        print ('_aglom_cluster')
        size = self.opts['size'] * self.opts['size']
        print (size)
        #cluster = AgglomerativeClustering(n_clusters=size)
        #cluster = Birch(n_clusters=size)
        cluster = DBSCAN(eps=0.3, min_samples=10)
        #X = self.X[:10000]
        #cluster.fit(X)
        
        cluster.fit(self.X)
        return cluster.labels_
Example #22
class MinHashDBSCAN():
    def __init__(self, eps=0.5, min_samples=5, 
        algorithm='auto', leaf_size=30, p=None, random_state=None, 
        fast=False, n_neighbors=5, radius=1.0,
        number_of_hash_functions=400,
        max_bin_size = 50, minimal_blocks_in_common = 1,
        shingle_size = 4, excess_factor = 5,
        number_of_cores=None, chunk_size=None):

        self.eps = eps
        self.min_samples = min_samples
        # self.metric = metric
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.random_state = random_state
        self.radius = radius
        self.fast = fast
        self.number_of_hash_functions = number_of_hash_functions
        self.max_bin_size = max_bin_size
        self.minimal_blocks_in_common = minimal_blocks_in_common
        self.shingle_size = shingle_size
        self.excess_factor = excess_factor
        self.number_of_cores = number_of_cores
        self.chunk_size = chunk_size
        self.n_neighbors = n_neighbors

        self._dbscan = DBSCAN(eps=self.eps, min_samples=min_samples, metric='precomputed',
                algorithm=self.algorithm, leaf_size=self.leaf_size, p=self.p, random_state=self.random_state)
        self.labels_ = None
        # only for compatible issues
    def fit(self, X, y=None):
        minHashNeighbors = MinHash(n_neighbors = self.n_neighbors, 
        radius = self.radius, fast = self.fast,
        number_of_hash_functions = self.number_of_hash_functions,
        max_bin_size = self.max_bin_size,
        minimal_blocks_in_common = self.minimal_blocks_in_common,
        shingle_size = self.shingle_size,
        excess_factor = self.excess_factor,
        number_of_cores = self.number_of_cores,
        chunk_size = self.chunk_size, similarity=False)
        minHashNeighbors.fit(X, y)
        graph_result = minHashNeighbors.kneighbors_graph(mode='distance')
        self._dbscan.fit(graph_result)
        self.labels_ = self._dbscan.labels_
    def fit_predict(self, X, y=None):
        self.fit(X, y)
        return self.labels_
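MinHashDBSCAN above feeds DBSCAN a sparse, precomputed k-nearest-neighbour distance graph. Below is a sketch of the same idea with scikit-learn's exact NearestNeighbors in place of the MinHash approximation; the data and parameters are made up.

# Sparse precomputed neighbour graph fed to DBSCAN, analogous to the class above.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(100, 8)
nn = NearestNeighbors(n_neighbors=10).fit(X)
graph = nn.kneighbors_graph(mode="distance")  # sparse (100, 100) matrix
labels = DBSCAN(eps=0.5, min_samples=5,
                metric="precomputed").fit(graph).labels_
print(len(set(labels)))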
Example #23
    def find_example_points(self):
        """Finds examplar data points for each cluster"""

        # Train DBSCAN
        dbscan = DBSCAN()
        dbscan.fit(self.IMAGE_STORE)

        labels = dbscan.labels_
        unique, indices = np.unique(labels, return_index=True)

        # Remove the 'noise' example data point
        index = np.where(unique == -1)
        unique = np.delete(unique, index)
        indices = np.delete(indices, index)

        return np.vstack((unique, self.IMAGE_STORE[indices]))
Example #24
def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True):
    db = DBSCAN(eps=eps, min_samples=min_samples)
    # sd_scaler = StandardScaler()
    res = dr.get_dataset_ensembl_info()
    outliers_id = []
    for g in genes:
        # scaled = sd_scaler.fit(data.loc[g, :])
        fit = db.fit(np.reshape(data.loc[g, :], (196, 1)))

        candidates = itemfreq(fit.labels_)

        try:
            class_zero = candidates[0][1]
            class_one = candidates[1][1]

            support = min(class_one, class_zero)

            if min_samples < support <= max_samples:
                info = [gene for gene in res if gene.ensemblgeneid == g][0]
                formatted_info = {"id": g, "name": info.genename, "type": info.genetype, "samples": str(support),
                                  "distance": "NA"}
                jinfo = json.dumps(formatted_info)
                jinfo += ","
                outliers_id.append(g)
                print("outlier found :" + g)
                if as_json:
                    yield (jinfo)
                else:
                    yield (formatted_info)
        except:
            pass
Example #25
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """

    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)

    dbs = DBSCAN()

    db = dbs.fit(scaled)
    outliers = dbs.fit_predict(scaled)

    df_o = df[outliers == -1]

    return db, df_o
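A hypothetical usage sketch for the dbscan_outliers function above on a toy DataFrame with two injected extreme rows; the column names and values are made up.

# Toy usage of dbscan_outliers: the two extreme rows should come back as noise.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(200, 2)), columns=["a", "b"])
df.loc[200] = [15.0, -15.0]   # injected outlier
df.loc[201] = [-20.0, 20.0]   # injected outlier

model, noise = dbscan_outliers(df)
print(noise)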
Example #26
def cluster_tweets(tweets):
    #TODO get TFIDF vector
    #do clustering
    ner_tags = [get_ner_tags(tweet).tolist() for tweet in tweets['tweet']]
    vectorizer = TfidfVectorizer(preprocessor=_dummy_preprocess, tokenizer=lambda x:x,
                                 binary=True,
                                 min_df=0, use_idf=True, smooth_idf=True)
    tfidf = vectorizer.fit_transform(ner_tags) 
    
    #ner_tags = [get_ner_tags(tweet) for tweet in tweets['tweet']]
    print "clustering started"
    t0 = time()
    #cluster = AgglomerativeClustering(n_clusters=3, affinity="cosine" )
    #cluster = MiniBatchKMeans(n_clusters=10, max_iter=100, batch_size=100) 
    #metric=sklearn.metrics.pairwise.cosine_distances
    cluster = DBSCAN(min_samples=2, eps=0.5)    
        
    clustered = cluster.fit(tfidf.todense())
       
    #clustered = cluster.fit(ner_tags)
    labels = clustered.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print "clustering finished in %.3f seconds"%(time()-t0)   
    print "%d clusters detected"%n_clusters_
    
    tweets['cluster'] = labels
    tweets['ner'] = ner_tags
    return tweets
Example #27
        def cluster_points(XY):
            """Find clusters of points in XY and return a list of the indices
            of the points in each cluster.
            """
            # use a density based sort to separate points into distinct
            # clusters, each one corresponding to a distinct root.
            # There is an edge case here in that roots can become
            # degenerate: as the roots come closer together, they will
            # be treated as a single root cluster at some non-zero
            # separation distance.
            db = DBSCAN(eps=r, min_samples=2)
            db.fit(XY)

            max_label = int(db.labels_.max())
            labels = range(max_label + 1)
            where_label = [np.where(db.labels_ == label) for label in labels]
            return where_label
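cluster_points closes over a radius r defined in an enclosing scope that is not shown. Below is a self-contained sketch of the same grouping on toy 2-D points, with the radius passed in explicitly.

# Self-contained variant of cluster_points with the neighbourhood radius
# passed in explicitly (it comes from an enclosing scope in the original).
import numpy as np
from sklearn.cluster import DBSCAN

def cluster_points_standalone(XY, r):
    db = DBSCAN(eps=r, min_samples=2).fit(XY)
    max_label = int(db.labels_.max())
    return [np.where(db.labels_ == label)[0] for label in range(max_label + 1)]

XY = np.array([[0.0, 0.0], [0.01, 0.0], [1.0, 1.0], [1.01, 1.0]])
print(cluster_points_standalone(XY, r=0.1))  # two clusters of two indices each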
Example #28
    def execute(self,data):
        dbsc=DBSCAN(eps=2.828,min_samples=2)
        #print data
        dbsc.fit(data)

        clusters= dbsc.labels_
        #print clusters
        clustering=get_clust_dict(clusters,data)
        #print clustering
        het=mean_inter_het(clustering)
        hom=mean_intra_hom(clustering)

        op = pd.DataFrame(columns=['het','hom'])
        op.het = [het]
        op.hom= [hom]
        op.to_csv('stat.csv', sep=',', encoding='utf-8')
        return data
Example #29
def update_location_centroid(point, cluster, max_distance, min_samples):
    """ Updates the centroid of a location cluster with another point

    Args:
        point (:obj:`Point`): Point to add to the cluster
        cluster (:obj:`list` of :obj:`Point`): Location cluster
        max_distance (float): Max neighbour distance
        min_samples (int): Minimum number of samples
    Returns:
        (:obj:`Point`, :obj:`list` of :obj:`Point`): Tuple with the location centroid
            and new point cluster (given cluster + given point)
    """
    cluster.append(point)
    points = [p.gen2arr() for p in cluster]

    # Estimates the epsilon
    eps = estimate_meters_to_deg(max_distance, precision=6)

    p_cluster = DBSCAN(eps=eps, min_samples=min_samples)
    p_cluster.fit(points)

    clusters = {}
    for i, label in enumerate(p_cluster.labels_):
        if label in clusters.keys():
            clusters[label].append(points[i])
        else:
            clusters[label] = [points[i]]

    centroids = []
    biggest_centroid_l = -float("inf")
    biggest_centroid = None

    for label, n_cluster in clusters.items():
        centroid = compute_centroid(n_cluster)
        centroids.append(centroid)

        if label >= 0 and len(n_cluster) >= biggest_centroid_l:
            biggest_centroid_l = len(n_cluster)
            biggest_centroid = centroid

    if biggest_centroid is None:
        biggest_centroid = compute_centroid(points)

    return biggest_centroid, cluster
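update_location_centroid relies on helpers (estimate_meters_to_deg, compute_centroid, and Point objects with a gen2arr method) that are not shown. Purely illustrative guesses at the first two follow; the real definitions may differ.

# Illustrative guesses at the helpers assumed by the function above.
import numpy as np

def estimate_meters_to_deg(meters, precision=6):
    # Rough conversion: one degree of latitude is about 111.32 km.
    return round(meters / 111320.0, precision)

def compute_centroid(points):
    # Mean position of an array-like of coordinate pairs.
    return np.mean(np.asarray(points), axis=0)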
Example #30
def FindClusters(cfg, cfg_old, eps=1.5, min_samples=6, periodic=False, box=[]):
    """
    Find density clusters in the configuration cfg.

    """
    if(not periodic):
        clf = DBSCAN(eps=eps, min_samples=min_samples)
    else:
        myMetric = metric.PeriodicMetric(box)
        algo = 'brute'
        clf = DBSCAN(eps=eps, min_samples=min_samples,
            algorithm=algo, metric=myMetric.Distance)

    clf.fit(cfg)

    # Plot clusters.
    mySet = set(clf.labels_)
    print(mySet)
    PlotClusters(cfg, cfg_old, clf.labels_, box)
Example #31
import pandas as pd

# Importing the dataset
from sklearn.datasets import load_iris
iris = load_iris()

from sklearn.cluster import DBSCAN
dbscan = DBSCAN()

print dbscan
"""
DBSCAN(eps=0.5, metric='euclidean', min_samples=5,
  random_state=111)
"""

dbscan.fit(iris.data)

dbscan.labels_

# Visualising the clusters
# as the data is 4-dimensional, we apply PCA for 2d plotting
from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(iris.data)
pca_2d = pca.transform(iris.data)
explained_variance = pca.explained_variance_ratio_

df_pca = pd.DataFrame(pca.components_,
                      columns=iris.feature_names,
                      index=['PC-1', 'PC-2'])
"""
#alternative way, fit_transform
Example #32
    row_ix = where(yhat == cluster)
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.show()

# k-means clustering
from numpy import unique
from sklearn.cluster import KMeans

X, _ = make_classification(n_samples=1000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           random_state=4)
model = KMeans(n_clusters=2)
model.fit(X)
yhat = model.predict(X)
clusters = unique(yhat)
for cluster in clusters:
    row_ix = where(yhat == cluster)
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.show()

# gaussian mixture clustering
from numpy import unique
from sklearn.mixture import GaussianMixture

X, _ = make_classification(n_samples=1000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
Example #33
def findLocalExtrema(da, highVal=0, lowVal=1000, eType='Low'):
    """
    Utility function to find local low/high field variable coordinates on a contour map. To classify as a local high, the data
    point must be greater than highVal, and to classify as a local low, the data point must be less than lowVal.
    Args:
        da: (:class:`xarray.DataArray`):
            Xarray data array containing the lat, lon, and field variable (ex. pressure) data values
        highVal (:class:`int`):
            Data value that the local high must be greater than to qualify as a "local high" location.
            Default highVal is 0.
        lowVal (:class:`int`):
            Data value that the local low must be less than to qualify as a "local low" location.
            Default lowVal is 1000.
        eType (:class:`str`):
            'Low' or 'High'
            Determines which extrema are being found: minimum or maximum, respectively.
            Default eType is 'Low'.
    Returns:
        clusterExtremas (:class:`list`):
            List of coordinate tuples in GPS form (lon in degrees, lat in degrees)
            that specify local low/high locations
    """

    # Create a 2D array of coordinates in the same shape as the field variable data
    # so each coordinate is easily mappable to a data value
    # ex:
    # (1, 1), (2, 1), (3, 1)
    # (1, 2)................
    # (1, 3)................
    lons, lats = np.meshgrid(np.array(da.lon), np.array(da.lat))
    coordarr = np.dstack((lons, lats))

    # Find all zeroes that also qualify as low or high values
    extremacoords = []

    if eType == 'Low':
        coordlist = np.argwhere(da.data < lowVal)
        extremacoords = [tuple(coordarr[x[0]][x[1]]) for x in coordlist]
    if eType == 'High':
        coordlist = np.argwhere(da.data > highVal)
        extremacoords = [tuple(coordarr[x[0]][x[1]]) for x in coordlist]

    if extremacoords == []:
        if eType == 'Low':
            warnings.warn(
                'No local extrema with data value less than given lowVal')
            return []
        if eType == 'High':
            warnings.warn(
                'No local extrema with data value greater than given highVal')
            return []

    # Clean up noisy data to find actual extrema

    # Use Density-based spatial clustering of applications with noise
    # to cluster and label coordinates
    db = DBSCAN(eps=10, min_samples=1)
    new = db.fit(extremacoords)
    labels = new.labels_

    # Create a dictionary of values with key being coordinate
    # and value being cluster label.
    coordsAndLabels = {label: [] for label in labels}
    for label, coord in zip(labels, extremacoords):
        coordsAndLabels[label].append(coord)

    # Initialize array of coordinates to be returned
    clusterExtremas = []

    # Iterate through the coordinates in each cluster
    for key in coordsAndLabels:

        # Create array to hold all the field variable values for that cluster
        datavals = []
        for coord in coordsAndLabels[key]:

            # Find pressure data at that coordinate
            cond = np.logical_and(coordarr[:, :, 0] == coord[0],
                                  coordarr[:, :, 1] == coord[1])
            x, y = np.where(cond)
            datavals.append(da.data[x[0]][y[0]])

        # Find the index of the smallest/greatest field variable value of each cluster
        if eType == 'Low':
            index = np.argmin(np.array(datavals))
        if eType == 'High':
            index = np.argmax(np.array(datavals))

        # Append the coordinate corresponding to that index to the array to be returned
        clusterExtremas.append(
            (coordsAndLabels[key][index][0], coordsAndLabels[key][index][1]))

    return clusterExtremas
Example #34
subset = lojas_df[[
    'encrypted_5_zipcode', 'produtos_vendidos', 'transacoes_total',
    'faturamento_total'
]].dropna()

#minmax
subset = (subset - subset.min()) / (subset.max() - subset.min())
#zscore
#subset = (subset - subset.mean()) / subset.std()

X = np.column_stack([
    subset.encrypted_5_zipcode, subset.produtos_vendidos,
    subset.transacoes_total, subset.faturamento_total
])

clustering = DBSCAN(metric='euclidean', eps=0.3)
clustering.fit(X)
metrics.silhouette_score(X, clustering.labels_, metric='euclidean')

pca = PCA(n_components=2)
reduced = pca.fit_transform(X)
plt.scatter(reduced[:, 0], reduced[:, 1], c=clustering.labels_)

sazon_set = lojas_df[[
    'encrypted_5_zipcode', 'periodo_0', 'periodo_1', 'periodo_2', 'periodo_3',
    'periodo_4'
]].dropna()
sazon_set = (sazon_set - sazon_set.min()) / (sazon_set.max() - sazon_set.min())

X = np.column_stack([
    sazon_set.encrypted_5_zipcode, sazon_set.periodo_0, sazon_set.periodo_1,
    sazon_set.periodo_2, sazon_set.periodo_3, sazon_set.periodo_4
Example #35
"""

from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

X, ytrue = make_blobs(n_samples=1000,
                      n_features=2,
                      cluster_std=0.01,
                      random_state=0)

# plt.scatter(X[:, 0], X[:, 1], c = y)

miModelo = DBSCAN(eps=0.5, min_samples=5)
miModelo.fit(X)
clusters = miModelo.labels_

plt.figure()
plt.subplot(1, 2, 1)
plt.scatter(X[:, 0], X[:, 1], c=ytrue, s=150)
plt.title('Color = ytrue')

plt.subplot(1, 2, 2)
plt.scatter(X[:, 0], X[:, 1], c=clusters, s=150)
plt.title('Color = clusters')

from sklearn.metrics import silhouette_score

sc = silhouette_score(X, clusters)
Example #36
    cv.drawContours(img_color, [cnt], 0, (0, 0, 0), 2)

cv.imshow("result", img_color)
cv.waitKey(0)
x = np.array(contours2)

contours22 = np.vstack(contours2).squeeze()

#print(contours22)
height = img_color.shape[0]
width = img_color.shape[1]
channels = img_color.shape[2]
radius = ((height / 8) + (width / 8)) / 2
df = pd.DataFrame(contours22)
model = DBSCAN(eps=radius, min_samples=3)
model.fit(df)
y_predict = model.fit_predict(df)
print(y_predict)

df[2] = y_predict
#print(df)

df = df[df[2] != -1]
del df[2]
print(df)
tuples = [tuple(x) for x in df.values]
#print(tuples)
for cnt in tuples:
    cv.circle(img_color, cnt, 0, (255, 255, 0), 2)

cv.imshow("result2", img_color)
Example #37
    def process_dbscan_jaccard(self):
        dbscan = DBSCAN(algorithm='ball_tree', metric='haversine')
        dbscan.fit(self.sparse_matrix)

        #Get the shape of Sparse matrix
        row, col = self.sparse_matrix.get_shape()

        #Calculate average point position for each cluster
        cluster = {}
        cluster_amount = {}
        for i in range(0, row):
            if dbscan.labels_[i] != -1:
                if dbscan.labels_[i] in cluster_amount:
                    cluster_amount[dbscan.labels_[i]] += 1
                else:
                    cluster_amount[dbscan.labels_[i]] = 1

                for j in range(0, col):
                    if self.sparse_matrix[i, j] == 1:
                        if dbscan.labels_[i] in cluster:
                            cluster[dbscan.labels_[i]][j] += 1
                        else:
                            cluster[dbscan.labels_[i]] = numpy.zeros(col)
                            cluster[dbscan.labels_[i]][j] = 1

        for key in cluster:
            cluster[key] = cluster[key] / cluster_amount[key]

        minimum_distance = {}
        for key in cluster:
            minimum_distance[key] = 999

        #Find the nearest row by euclidean to average point position as centroid for each cluster
        array_row = numpy.zeros(col)
        centroid = {}
        for i in range(0, row):
            if dbscan.labels_[i] != -1:
                for j in range(0, col):
                    if self.sparse_matrix[i, j] == 1:
                        array_row[j] = 1
                eu_dist = distance.euclidean(array_row,
                                             cluster[dbscan.labels_[i]])
                if eu_dist < minimum_distance[dbscan.labels_[i]]:
                    minimum_distance[dbscan.labels_[i]] = eu_dist
                    centroid[dbscan.labels_[i]] = i
            array_row = numpy.zeros(col)

        centroid_matrix = numpy.zeros((len(centroid), col))
        for i in range(0, len(centroid_matrix)):
            for j in range(0, col):
                centroid_matrix[i][j] = self.sparse_matrix[centroid[i], j]

        #Find overall jaccard score by centroid
        array_row = numpy.zeros(col)
        overall_distance = 0
        for i in range(0, row):
            if dbscan.labels_[i] != -1:
                for j in range(0, col):
                    if self.sparse_matrix[i, j] == 1:
                        array_row[j] = 1
                ja_distance = jaccard_score(centroid_matrix[dbscan.labels_[i]],
                                            array_row)
                overall_distance = overall_distance + ja_distance
            array_row = numpy.zeros(col)

        self.fout.write("DBSCAN overall distance:")
        self.fout.write("\n")
        self.fout.write(str(overall_distance))
        self.fout.write("\n")
        self.fout.flush()
        return dbscan.labels_
Example #38
def cluster(num_samples, num_clusters):
    x = 3.3 * np.random.randn(num_samples, 2)
    X = StandardScaler().fit_transform(x)

    flag = False

    if flag:
        t0 = time.time()
        km = MiniBatchKMeans(init='k-means++',
                             n_clusters=num_clusters,
                             batch_size=3 * num_clusters,
                             max_no_improvement=10,
                             verbose=0,
                             max_iter=100,
                             random_state=0)
        km.fit(X)
        t1 = time.time()

        print 'kmeans time taken : ', t1 - t0

    flag = True
    if flag:
        t0 = time.time()
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=10000)
        # bandwidth = 0.5
        print bandwidth
        print 'bandwidth estimation time : ', time.time() - t0

        ms = MeanShift(bandwidth=bandwidth,
                       bin_seeding=True,
                       min_bin_freq=100,
                       n_jobs=-1)
        ms.fit(X)
        t1 = time.time()
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_

        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique)

        print("number of estimated clusters : %d" % n_clusters_)
        print 'meanshift time taken : ', t1 - t0

    flag = False
    if flag:
        x = 3.3 * np.random.randn(num_samples, 2)
        X = StandardScaler().fit_transform(x)

        t0 = time.time()
        db = DBSCAN(eps=0.3, min_samples=100)
        db.fit(X)
        t1 = time.time()
        n_clusters_ = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
        print 'number of clusters:', n_clusters_

        print 'DBSCAN time taken : ', t1 - t0

    flag = False

    if flag:
        t0 = time.time()
        bm = Birch(threshold=0.3, n_clusters=None)
        bm.fit(X)
        t1 = time.time()
        print 'number of clusters:', np.unique(bm.labels_).size

        print 'Birch time taken : ', t1 - t0
Example #39
########################################################################################################################
eps = 0.3
ms = 20
'''
eps: DBSCAN parameter, the distance threshold of the eps-neighbourhood; samples farther
than eps from a point are not in its eps-neighbourhood. The default is 0.5; a suitable
value usually has to be picked from several candidates. If eps is too large, more points
fall inside the eps-neighbourhood of core objects, the number of clusters may shrink,
and samples that should not belong to the same cluster get merged into one. If it is too
small, the number of clusters may grow and samples of one cluster get split apart.
min_samples: DBSCAN parameter, the minimum number of samples in the eps-neighbourhood
required for a point to count as a core object. The default is 5; it usually has to be
tuned together with eps. For a fixed eps, if min_samples is too large there are too few
core objects, some samples that belong to a cluster may be labelled as noise, and the
number of clusters grows. If min_samples is too small, a large number of core objects
are produced, which can lead to too few clusters.
(A k-distance sketch for choosing eps follows this example.)
'''
dbscan = DBSCAN(eps=eps, min_samples=ms)
dbscan.fit(output3_value_embedded)
label_pred = dbscan.labels_
# while(label_pred.max()!=4):
#     print(label_pred.max())
#     if label_pred.max()>4:
#         eps=eps*1.05
#         ms=ms+1
#         print(eps,ms)
#         dbscan = DBSCAN(eps=eps, min_samples=ms)
#         dbscan.fit(output3_value_embedded)
#         label_pred = dbscan.labels_
#     if label_pred.max()<4:
#         eps = eps * 0.95
#         if ms>2:
#             ms=ms-1
#         print(eps,ms)
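The comment above notes that eps and min_samples usually have to be tuned together. One common heuristic, not part of the original snippet, is to plot the sorted distances to each point's k-th nearest neighbour and pick eps near the elbow; a minimal sketch, assuming output3_value_embedded and ms from the snippet above are in scope.

# k-distance ("elbow") sketch for choosing eps; not part of the original code.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

k = ms  # reuse the min_samples value chosen above
nn = NearestNeighbors(n_neighbors=k).fit(output3_value_embedded)
dists, _ = nn.kneighbors(output3_value_embedded)
kth = np.sort(dists[:, -1])  # distance to the k-th neighbour, sorted
plt.plot(kth)
plt.xlabel("points (sorted)")
plt.ylabel("distance to %d-th nearest neighbour" % k)
plt.show()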
Example #40
from sklearn.preprocessing import scale
from pandas.tools.plotting import scatter_matrix

dim_reduction=PCA()
Xc=dim_reduction.fit_transform(scale(X))
print 'variance explained by the last 2 components %0.1f'%(sum(dim_reduction.explained_variance_ratio_[-2:]* 100))
df = pd.DataFrame(Xc, columns =['comp_' + str(j+1) for j in range(10)])
first_two = df.plot(kind ='scatter', x ='comp_1', y ='comp_2', c ='DarkGray', s = 50) 
last_two = df.plot( kind ='scatter', x ='comp_9', y ='comp_10', c ='DarkGray', s = 50)
outlying=(Xc[:,-1] < -0.3) | (Xc[:,-2] < -1.0) 
print df[outlying] #Print outliers found out by PCA

#**************************Using Cluster Analysis***************************
from sklearn.cluster import DBSCAN
DB=DBSCAN(eps=2.5, min_samples=25, random_state=101)
DB.fit(Xc)
from collections import Counter
print Counter (DB.labels_),'\n'
print df[DB.labels_==-1]
# expected output: Counter({0: 414, -1: 28})
#*************************Using OneClassSVM******************************
from sklearn import svm
outliers_fraction =0.01 
nu_estimate=0.95*outliers_fraction+0.05
auto_detection=svm.OneClassSVM(kernel="rbf", gamma=0.01,degree=3,nu=nu_estimate)
auto_detection.fit(Xc)
evaluation=auto_detection.predict(Xc)
print df[evaluation==-1]


Example #41
class DataCleaner():
    """
	Class for outlier detection and data cleaning in preprocessing
	Implements sklearn.cluster.DBSCAN for compositional clustering, 
	and sklearn.ensemble.IsolationForest for greedy outlier flagging.
	Applies z-score threshold within composition clusters to screen 
	IsolationForest flags.
	
	Parameters
	----------
	data: dataset to process (pandas DataFrame)
	prop_dim: property dimension to screen for outliers
	comp_dims: composition dimensions for clustering and IsolationForest
	add_fit_dims: additional dimensions to use for outlier identification
	cluster_by: column to group by for clustering. If None, use DBSCAN to identify clusters
	DB_kw: kwargs to pass to DBSCAN instantiation
	IF_kw: kwargs to pass to IsolationForest instantiation
	"""
    def __init__(self,
                 data,
                 prop_dim,
                 comp_dims=None,
                 add_fit_dims=[],
                 cluster_by=None,
                 DB_kw={},
                 IF_kw={}):
        self.data = data
        self.set_prop_dim(prop_dim)
        self.set_comp_dims(comp_dims)
        self.add_fit_dims = add_fit_dims
        self.cluster_by = cluster_by
        self.random_state = np.random.RandomState(17)
        self.db = DBSCAN(**DB_kw)
        self.clf = IsolationForest(random_state=self.random_state, **IF_kw)

    def set_prop_dim(self, prop_dim):
        "set property dimension"
        self._prop_dim = prop_dim

    def get_prop_dim(self):
        "get property dimension"
        return self._prop_dim

    prop_dim = property(get_prop_dim, set_prop_dim)

    def set_comp_dims(self, comp_dims=None):
        """
		set composition dimensions used for clustering. Defaults to all valid elemental symbols in data columns
		
		Parameters
		----------
		comp_dims: list of columns in data to use as composition dimensions
		"""
        #if no comp dims specified, use all columns that are valid element symbols
        if comp_dims is None:
            comp_dims = []
            for col in self.data.columns:
                try:
                    mg.Element(col)
                    comp_dims.append(col)
                except ValueError:
                    pass
        self._comp_dims = comp_dims

    def get_comp_dims(self):
        "get composition dimensions"
        return self._comp_dims

    comp_dims = property(get_comp_dims, set_comp_dims)

    @property
    def comp_data(self):
        "composition data"
        return self.data[self.comp_dims]

    @property
    def fit_dims(self):
        "dimensions used for identifying outliers"
        return self.comp_dims + self.add_fit_dims + [self.prop_dim]

    def fit_data(self,
                 comp_scale=1,
                 prop_scale=1,
                 add_fit_scale=1,
                 cluster_by=None):
        "data used for identifying outliers"
        fit_data = self.data.copy()
        fit_data[self.comp_dims] = self.scaled_comp_data(
            scale=comp_scale, cluster_by=cluster_by).values

        ss = StandardScaler()
        if cluster_by is None:
            fit_data[self.prop_dim] = prop_scale * ss.fit_transform(
                fit_data[self.prop_dim].values[:, None])
            if len(self.add_fit_dims) > 0:
                fit_data[self.add_fit_dims] = add_fit_scale * ss.fit_transform(
                    fit_data[self.add_fit_dims])
        else:
            # scale within each cluster
            gdf = fit_data.groupby(cluster_by)
            for cluster, idx in gdf.groups.items():
                cdata = fit_data.loc[idx, :]
                fit_data.loc[idx,
                             self.prop_dim] = prop_scale * ss.fit_transform(
                                 cdata[self.prop_dim].values[:, None])
                if len(self.add_fit_dims) > 0:
                    fit_data.loc[
                        idx,
                        self.add_fit_dims] = add_fit_scale * ss.fit_transform(
                            cdata[self.add_fit_dims])

        return fit_data[self.fit_dims]

    def scaled_comp_data(self, scale=1, cluster_by=None):
        """
		scale composition dimensions such that largest-variance dimension has variance max_var
		"""
        ss = StandardScaler()
        if cluster_by is None:
            #get dimension with largest variance
            ref_dim = np.var(self.comp_data).idxmax()
            ss.fit(self.comp_data[ref_dim].values[:, None])
            #scale all comp dims with same scaler such that refdim has variance max_var
            scaled_comp_data = pd.DataFrame(scale *
                                            ss.transform(self.comp_data),
                                            columns=self.comp_dims)
        else:
            # scale within each cluster
            gdf = self.data.groupby(cluster_by)
            scaled_comp_data = self.comp_data.copy()
            for cluster, idx in gdf.groups.items():
                cdata = self.comp_data.loc[idx, :]
                #get dimension with largest variance
                ref_dim = np.var(cdata).idxmax()
                ss.fit(cdata[ref_dim].values[:, None])
                #scale all comp dims with same scaler such that refdim has variance max_var
                scaled_comp_data.loc[idx, :] = scale * ss.transform(cdata)

        return scaled_comp_data

    def fit(self, method, comp_scale=1, prop_scale=1):
        """
		fit DBSCAN and IsolationForest to data
		
		Parameters
		----------
		comp_scale: maximum compositional variance set by scale_composition
		"""

        if method == 'DBIFZ':

            if self.cluster_by is None:
                # fit DBSCAN to comp data for compositional clustering
                self.db.fit(self.scaled_comp_data(scale=comp_scale))

            # fit IsolationForest to iso data for greedy outlier flagging
            self.clf.fit(self.fit_data(comp_scale, prop_scale))

        elif method == 'DBSCAN':
            # nothing to do yet
            pass

    def predict(self,
                method,
                comp_scale=1,
                prop_scale=1,
                add_fit_scale=1,
                z_thresh=2):
        """
		predict outliers in data
		
		Parameters
		----------
		z_thresh: z-score threshold for intra-cluster outlier identification
		"""

        self.pred = pd.DataFrame()
        self.pred[self.prop_dim] = self.data[self.prop_dim]
        self.z_thresh = z_thresh

        if self.cluster_by is not None:
            # use provided column to group into clusters
            self.pred.loc[:, 'cluster_name'] = self.data[self.cluster_by]
            cluster_names = self.pred['cluster_name'].unique()
            clusters = np.arange(len(cluster_names))
            cluster_dict = dict(zip(cluster_names, clusters))
            self.pred['cluster'] = self.pred['cluster_name'].map(
                lambda x: cluster_dict[x])
            self.cluster_name = dict(zip(clusters, cluster_names))

        if method == 'DBIFZ':

            if self.cluster_by is None:
                # use DBSCAN to cluster by composition
                self.pred.loc[:, 'cluster'] = self.db.fit_predict(
                    self.scaled_comp_data(
                        comp_scale))  #db has no pure predict function
                self.pred['cluster_name'] = self.pred['cluster']
                clusters = self.pred['cluster'].unique()
                self.cluster_name = dict(zip(clusters, clusters))

            fit_data = self.fit_data(comp_scale, prop_scale, add_fit_scale)

            self.pred.loc[:, 'isolation_flag'] = self.clf.predict(fit_data)
            self.pred.loc[:, 'isolation_score'] = self.clf.decision_function(
                fit_data)

            #get z-scores for each cluster and cross-ref with isolation forest
            for i, cluster in enumerate(self.pred['cluster'].unique()):
                df = self.pred.loc[self.pred['cluster'] == cluster, :]
                self.pred.loc[self.pred['cluster'] == cluster,
                              'cluster_zscore'] = z_score(df[self.prop_dim])

            #set final outlier flag - if flagged by isolation forest and cluster z-score is outside z_thresh
            self.pred.loc[:, 'outlier_flag'] = np.where(
                (self.pred['isolation_flag'] == -1) &
                (np.abs(self.pred['cluster_zscore']) > z_thresh), -1, 0)

        elif method == 'DBSCAN':

            if self.cluster_by is None:
                # apply DBSCAN to comp dims and prop dim to cluster and identify outliers
                fit_data = self.fit_data(comp_scale, prop_scale, add_fit_scale)
                self.pred.loc[:, 'cluster'] = self.db.fit_predict(
                    fit_data)  #db has no pure predict function
                self.pred['cluster_name'] = self.pred['cluster']
                clusters = self.pred['cluster'].unique()
                self.cluster_name = dict(zip(clusters, clusters))
                # cluster -1 is outliers
                self.pred['outlier_flag'] = self.pred['cluster'].map(
                    lambda x: -1 if x == -1 else 0)

            else:
                # apply DBSCAN within each provided cluster to identify outliers
                fit_data = self.fit_data(comp_scale,
                                         prop_scale,
                                         add_fit_scale,
                                         cluster_by=self.cluster_by)
                for cluster, idx in self.data.groupby(
                        self.cluster_by).groups.items():
                    cdata = fit_data.loc[idx, :]
                    self.pred.loc[idx,
                                  'DB_cluster'] = self.db.fit_predict(cdata)
                # cluster -1 is outliers
                self.pred['outlier_flag'] = self.pred['DB_cluster'].map(
                    lambda x: -1 if x == -1 else 0)

            #get z-scores for each cluster
            for i, cluster in enumerate(self.pred['cluster'].unique()):
                df = self.pred.loc[self.pred['cluster'] == cluster, :]
                self.pred.loc[self.pred['cluster'] == cluster,
                              'cluster_zscore'] = z_score(df[self.prop_dim])

            # set IF columns for compatibility
            self.pred.loc[:, 'isolation_flag'] = 1
            self.pred.loc[:, 'isolation_score'] = 0

        #include scaled fit_data in pred
        for col in fit_data.columns:
            self.pred[f'{col}_fit'] = fit_data[col]
        #return self.pred

    def fit_predict(self,
                    method,
                    comp_scale=1,
                    prop_scale=1,
                    add_fit_scale=1,
                    z_thresh=2):
        """combine fit and predict functions"""
        self.fit(method, comp_scale, prop_scale)
        self.predict(method, comp_scale, prop_scale, add_fit_scale, z_thresh)
        #return self.pred

    def remove_outliers(self):
        """remove outliers identified by fit_predict"""
        self.clean_data = self.data[self.pred['outlier_flag'] != -1]
        #return self.clean_data

    @property
    def data_pred(self):
        "data joined with prediction results"
        return self.data.join(self.pred.drop(labels=self.prop_dim, axis=1))

    @property
    def outliers(self):
        "outlier data rows"
        return self.data_pred[self.data_pred['outlier_flag'] == -1]

    @property
    def inliers(self):
        "inlier data rows"
        return self.data_pred[self.data_pred['outlier_flag'] != -1]

    def set_DB_params(self, **params):
        """set DBSCAN parameters"""
        self.db.set_params(**params)

    def set_IF_params(self, **params):
        """set IsolationForest parameters"""
        self.clf.set_params(**params)

    def scatter_slices(self,
                       slice_axis,
                       slice_starts,
                       slice_widths,
                       tern_axes,
                       color_col=None,
                       vmin=None,
                       vmax=None,
                       cmap=plt.cm.viridis,
                       data_filter=None,
                       **scatter_kwargs):
        if color_col is None:
            color_col = self.prop_dim

        if data_filter is not None:
            data = data_filter(self.data_pred)
        else:
            data = self.data_pred

        #get vmin and vmax
        if vmin is None:
            vmin = data[color_col].min()
        if vmax is None:
            vmax = data[color_col].max()

        #plot all
        axes = scatter_slices(data,
                              color_col,
                              slice_axis,
                              slice_starts,
                              slice_widths,
                              tern_axes,
                              cmap=cmap,
                              vmin=vmin,
                              vmax=vmax,
                              **scatter_kwargs)

    def scatter_slice_highlight(self,
                                slice_axis,
                                slice_starts,
                                slice_widths,
                                tern_axes,
                                color_col=None,
                                vmin=None,
                                vmax=None,
                                cmap=plt.cm.viridis,
                                data_filter=None,
                                **scatter_kwargs):
        """
		plot all data points with outliers highlighted in red. color determined by value of prop_dim
		
		Parameters
		----------
		
		slice_axis: composition dimension on which to slice
		slice_starts: values of slice_axis at which to start slices
		slice_widths: widths of slices in slice_axis dimension. Single value or list
		tern_axes: composition dimensions for ternary plot axes (order: right, top, left)
		cmap: colormap for prop_dim values
		scatter_kwargs: kwargs to pass to helpers.plotting.scatter_slices
		"""
        if color_col is None:
            color_col = self.prop_dim

        if data_filter is not None:
            data = data_filter(self.data_pred)
        else:
            data = self.data_pred

        #get vmin and vmax
        if vmin is None:
            vmin = data[color_col].min()
        if vmax is None:
            vmax = data[color_col].max()

        inliers = data[data['outlier_flag'] == 0]
        outliers = data[data['outlier_flag'] == -1]

        #plot inliers
        axes = scatter_slices(inliers,
                              color_col,
                              slice_axis,
                              slice_starts,
                              slice_widths,
                              tern_axes,
                              cmap=cmap,
                              vmin=vmin,
                              vmax=vmax,
                              **scatter_kwargs)
        #plot outliers
        scatter_slices(outliers,
                       color_col,
                       slice_axis,
                       slice_starts,
                       slice_widths,
                       tern_axes,
                       cmap=cmap,
                       axes=axes,
                       vmin=vmin,
                       vmax=vmax,
                       colorbar=False,
                       s=20,
                       marker='d',
                       edgecolors='r',
                       linewidths=0.8,
                       **scatter_kwargs)

    def scatter_slice_clusters(self,
                               slice_axis,
                               slice_starts,
                               slice_widths,
                               tern_axes,
                               cmap=plt.cm.plasma,
                               **scatter_kwargs):
        """
		plot all data points with cluster shown by color
		
		Parameters
		----------
		slice_axis: composition dimension on which to slice
		slice_starts: values of slice_axis at which to start slices
		slice_widths: widths of slices in slice_axis dimension. Single value or list
		tern_axes: composition dimensions for ternary plot axes (order: right, top, left)
		cmap: colormap for cluster values
		scatter_kwargs: kwargs to pass to helpers.plotting.scatter_slices
		"""
        #make norm for discrete colormap
        clusters = list(self.cluster_name.keys())
        cluster_names = list(self.cluster_name.values())
        n_clusters = len(self.pred['cluster'].unique())
        bounds = np.arange(min(clusters) - 0.5, max(clusters) + 0.51)
        norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

        scatter_slices(self.data_pred,
                       'cluster',
                       slice_axis,
                       slice_starts,
                       slice_widths,
                       tern_axes,
                       cmap=cmap,
                       norm=norm,
                       cb_kwargs={
                           'norm': norm,
                           'ticks': clusters,
                           'tickformat': '%.0f',
                           'ticklabels': cluster_names
                       },
                       **scatter_kwargs)

    def scatter_slice_outliers(self,
                               slice_axis,
                               slice_starts,
                               slice_widths,
                               tern_axes,
                               cmap=plt.cm.viridis,
                               **scatter_kwargs):
        """
		plot outliers only
		
		Parameters
		----------
		slice_axis: composition dimension on which to slice
		slice_starts: values of slice_axis at which to start slices
		slice_widths: widths of slices in slice_axis dimension. Single value or list
		tern_axes: composition dimensions for ternary plot axes (order: right, top, left)
		cmap: colormap for prop_dim values
		scatter_kwargs: kwargs to pass to helpers.plotting.scatter_slices
		"""
        axes = scatter_slices(self.outliers,
                              self.prop_dim,
                              slice_axis,
                              slice_starts,
                              slice_widths,
                              tern_axes,
                              cmap=cmap,
                              **scatter_kwargs)
        return axes

    def scatter_slice_inliers(self,
                              slice_axis,
                              slice_starts,
                              slice_widths,
                              tern_axes,
                              cmap=plt.cm.viridis,
                              **scatter_kwargs):
        """
		plot inliers only
		
		Parameters
		----------
		slice_axis: composition dimension on which to slice
		slice_starts: values of slice_axis at which to start slices
		slice_widths: widths of slices in slice_axis dimension. Single value or list
		tern_axes: composition dimensions for ternary plot axes (order: right, top, left)
		cmap: colormap for prop_dim values
		scatter_kwargs: kwargs to pass to helpers.plotting.scatter_slices
		"""
        axes = scatter_slices(self.inliers,
                              self.prop_dim,
                              slice_axis,
                              slice_starts,
                              slice_widths,
                              tern_axes,
                              cmap=cmap,
                              **scatter_kwargs)
        return axes

    def cluster_hist(self, ncols=2, cluster_by=None):
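        """
		Plot a histogram of cluster z-scores for each cluster, with outliers and inliers stacked.
		
		Parameters
		----------
		ncols: number of columns for subplot grid
		cluster_by: column to use for grouping. If None, use clusters assigned by fit_predict
		"""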

        if cluster_by is None:
            clusters = list(self.cluster_name.keys())
        else:
            gdf = self.data_pred.groupby(cluster_by)
            clusters = [k for k in gdf.groups.keys()]

        #print(clusters)
        nrows = int(np.ceil(len(clusters) / ncols))
        #print(nrows)
        fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 4, nrows * 3))
        for (i, cluster), ax in zip(enumerate(clusters), axes.ravel()):
            if cluster_by is None:
                df = self.data_pred.loc[self.data_pred['cluster'] ==
                                        cluster, :]
            else:
                idx = gdf.groups[cluster]
                df = self.data_pred.loc[idx, :]

            num_outliers = len(df[df['isolation_flag'] == -1])
            # try: #2d axes
            # ax = axes[int(i/ncols), i%ncols]
            # except IndexError: #1d axes
            # ax = axes[i]

            dfo = df[df['isolation_flag'] == -1]
            dfi = df[df['isolation_flag'] == 1]
            hist, bins = np.histogram(df['cluster_zscore'])
            if len(dfo) > 0:
                # if isolation forest outliers exist (method=DBIFZ)
                ax.hist([dfo['cluster_zscore'], dfi['cluster_zscore']],
                        alpha=0.8,
                        bins=bins,
                        histtype='barstacked',
                        label=[
                            'IsolationForest outliers',
                            'IsolationForest inliers'
                        ],
                        color=['#ff7f0e', '#1f77b4'])
                ax.legend()
            else:
                # if no isolation forest outliers (method=DBSCAN)
                dfo = df[df['outlier_flag'] == -1]
                dfi = df[df['outlier_flag'] == 0]
                ax.hist([dfo['cluster_zscore'], dfi['cluster_zscore']],
                        alpha=0.8,
                        bins=bins,
                        histtype='barstacked',
                        label=['Outliers', 'Inliers'],
                        color=['#ff7f0e', '#1f77b4'])
                ax.legend()

            if cluster_by is None:
                ax.set_title('Cluster {}'.format(self.cluster_name[cluster]))
            else:
                ax.set_title('Cluster {}'.format(cluster))
            ax.set_xlabel('Cluster Z-score')
            ax.set_ylabel('Frequency')

            #plot z-score threshold
            ax.axvline(-self.z_thresh, ls='--', c='r')
            ax.axvline(self.z_thresh, ls='--', c='r')

            # add second axis to show prop_dim values
            ax2 = ax.twiny()
            ax2.set_xlim(ax.get_xlim())
            ax2.set_xticks(ax.get_xticks())
            tick_vals = df[self.prop_dim].mean(
            ) + df[self.prop_dim].std() * ax.get_xticks()
            ax2.set_xticklabels(np.round(tick_vals, 1))
            ax2.set_xlabel(self.prop_dim)
        fig.tight_layout()

    def cluster_scatter(self,
                        x_col,
                        y_col,
                        plot_combined=False,
                        cluster_by=None,
                        flag_outliers=False,
                        ncols=2,
                        s=8,
                        data_filter=None,
                        sharex=False,
                        sharey=False,
                        basefontsize=11,
                        **scatter_kw):
        """
		Scatter plot for each cluster. 
		
		Args:
			x_col: x column
			y_col: y column
			plot_combined: if True, create an additional plot with all samples overlaid
			cluster_by: column to use for grouping. If None, use clusters assigned by fit_predict
			flag_outliers: if True, plot outliers in orange
			ncols: number of columns for subplot grid
			s: point size
			data_filter: function to filter data. Should apply to DataFrame and return filtered DataFrame.
				Ex: data_filter = lambda df: df[df['property']==value]
			sharex, sharey: kwargs for plt.subplots()
			scatter_kw: kw for plt.scatter()
		"""

        if data_filter is None:
            data = self.data_pred
        else:
            data = data_filter(self.data_pred)

        if cluster_by is None:
            clusters = list(self.cluster_name.keys())
        else:
            gdf = data.groupby(cluster_by)
            clusters = [k for k in gdf.groups.keys()]

        if plot_combined:
            num_plots = len(clusters) + 1
        else:
            num_plots = len(clusters)
        nrows = int(np.ceil(num_plots / ncols))

        fig, axes = plt.subplots(nrows,
                                 ncols,
                                 figsize=(ncols * 4, nrows * 3),
                                 sharex=sharex,
                                 sharey=sharey)

        for (i, cluster), ax in zip(enumerate(clusters), axes.ravel()):
            if cluster_by is None:
                df = data.loc[self.data_pred['cluster'] == cluster, :]
            else:
                idx = gdf.groups[cluster]
                df = data.loc[idx, :]

            if flag_outliers is False:
                ax.scatter(df[x_col], df[y_col], s=s, **scatter_kw)
            else:
                dfi = df[df['outlier_flag'] == 0]
                dfo = df[df['outlier_flag'] == -1]
                ax.scatter(dfi[x_col],
                           dfi[y_col],
                           label='Inliers',
                           s=s,
                           **scatter_kw)
                ax.scatter(dfo[x_col],
                           dfo[y_col],
                           label='Outliers',
                           s=s,
                           **scatter_kw)
                ax.legend(fontsize=basefontsize)

            if cluster_by is None:
                ax.set_title('Cluster {}'.format(self.cluster_name[cluster]),
                             fontsize=basefontsize + 1)
            else:
                ax.set_title('Cluster {}'.format(cluster),
                             fontsize=basefontsize + 1)
            ax.set_xlabel(x_col, fontsize=basefontsize)
            ax.set_ylabel(y_col, fontsize=basefontsize)
            ax.tick_params(axis='both',
                           which='major',
                           labelsize=basefontsize - 1)

        if plot_combined:
            # plot all clusters on same axes
            if flag_outliers is False:
                axes.ravel()[-1].scatter(data[x_col],
                                         data[y_col],
                                         s=s,
                                         **scatter_kw)
            else:
                dfi = data[data['outlier_flag'] == 0]
                dfo = data[data['outlier_flag'] == -1]
                axes.ravel()[-1].scatter(dfi[x_col],
                                         dfi[y_col],
                                         label='Inliers',
                                         s=s,
                                         **scatter_kw)
                axes.ravel()[-1].scatter(dfo[x_col],
                                         dfo[y_col],
                                         label='Outliers',
                                         s=s,
                                         **scatter_kw)
                axes.ravel()[-1].legend(fontsize=basefontsize)

            axes.ravel()[-1].set_xlabel(x_col, fontsize=basefontsize)
            axes.ravel()[-1].set_ylabel(y_col, fontsize=basefontsize)
            axes.ravel()[-1].set_title('All Clusters',
                                       fontsize=basefontsize + 1)
            axes.ravel()[-1].tick_params(axis='both',
                                         which='major',
                                         labelsize=basefontsize - 1)

        for ax in axes.ravel()[num_plots:]:
            # turn off unused axes
            ax.axis('off')

        if sharex:
            # ravel so we iterate over Axes objects, not rows of the 2D axes array
            for ax in axes[:-1, :].ravel():
                ax.set_xlabel('')
        if sharey:
            for ax in axes[:, 1:].ravel():
                ax.set_ylabel('')

        fig.tight_layout()

    def quat_plot(self,
                  ax=None,
                  figsize=(8, 6),
                  quat_axes=['Co', 'Fe', 'Zr', 'Y'],
                  label_kw={},
                  gridlines=True,
                  color_col=None,
                  colorbar=True,
                  cb_kw={},
                  s=3,
                  data_filter=None,
                  **scatter_kw):
        qax = QuaternaryAxes(ax=ax, figsize=figsize)
        qax.draw_axes()
        # default corner label kwargs
        label_kwargs = dict(offset=0.11, size=14)
        # update with user kwargs
        label_kwargs.update(label_kw)
        qax.label_corners(quat_axes, **label_kwargs)

        if color_col is None:
            color_col = self.prop_dim

        # Default colorbar kwargs
        cb_kwargs = {
            'label': color_col,
            'cbrect': [0.8, 0.1, 0.02, 0.65],
            'labelkwargs': {
                'size': 14
            },
            'tickparams': {
                'labelsize': 13
            }
        }
        # update with any user-specified kwargs
        cb_kwargs.update(cb_kw)

        if data_filter is not None:
            data = data_filter(self.data_pred)
        else:
            data = self.data_pred

        if 'vmin' not in scatter_kw.keys():
            scatter_kw['vmin'] = data[color_col].min()
        if 'vmax' not in scatter_kw.keys():
            scatter_kw['vmax'] = data[color_col].max()

        qax.scatter(data[quat_axes].values,
                    c=data[color_col],
                    s=s,
                    colorbar=colorbar,
                    cb_kwargs=cb_kwargs,
                    **scatter_kw)

        qax.axes_ticks(size=13, corners='rbt', offset=0.08)
        if gridlines == True:
            qax.gridlines(ls=':', LW=0.6)
        qax.ax.axis('off')

        return qax

    def quat_highlight(self,
                       ax=None,
                       figsize=(8, 6),
                       quat_axes=['Co', 'Fe', 'Zr', 'Y'],
                       label_kw={},
                       gridlines=True,
                       color_col=None,
                       cb_label=None,
                       data_filter=None,
                       **scatter_kw):
        qax = QuaternaryAxes(ax=ax, figsize=figsize)
        qax.draw_axes()
        # default corner label kwargs
        label_kwargs = dict(offset=0.11, size=14)
        # update with user kwargs
        label_kwargs.update(label_kw)
        qax.label_corners(quat_axes, **label_kwargs)

        if color_col is None:
            color_col = self.prop_dim
        if cb_label is None:
            cb_label = color_col

        if data_filter is not None:
            data = data_filter(self.data_pred)
        else:
            data = self.data_pred

        if 'vmin' not in scatter_kw.keys():
            scatter_kw['vmin'] = data[color_col].min()
        if 'vmax' not in scatter_kw.keys():
            scatter_kw['vmax'] = data[color_col].max()

        inliers = data[data['outlier_flag'] == 0]
        outliers = data[data['outlier_flag'] == -1]

        qax.scatter(inliers[quat_axes].values,
                    c=inliers[color_col],
                    s=3,
                    colorbar=True,
                    cb_kwargs={
                        'label': cb_label,
                        'cbrect': [0.8, 0.1, 0.02, 0.65],
                        'labelkwargs': {
                            'size': 14
                        },
                        'tickparams': {
                            'labelsize': 13
                        }
                    },
                    **scatter_kw)
        qax.scatter(outliers[quat_axes].values,
                    c=outliers[color_col],
                    s=6,
                    edgecolors='r',
                    linewidths=0.5,
                    **scatter_kw)

        qax.axes_ticks()
        if gridlines == True:
            qax.gridlines()
        qax.ax.axis('off')

        return qax

    def quat_clusters(self,
                      ax=None,
                      figsize=(8, 6),
                      quat_axes=['Co', 'Fe', 'Zr', 'Y'],
                      label_kw={},
                      gridlines=True,
                      cmap=plt.cm.plasma,
                      s=3,
                      colorbar=True,
                      cb_kw={},
                      **scatter_kw):
        qax = QuaternaryAxes(ax=ax, figsize=figsize)
        qax.draw_axes()
        # default corner label kwargs
        label_kwargs = dict(offset=0.11, size=14)
        # update with user kwargs
        label_kwargs.update(label_kw)
        qax.label_corners(quat_axes, **label_kwargs)

        vmin = self.pred['cluster'].min()
        vmax = self.pred['cluster'].max()

        #make norm for discrete colormap
        clusters = list(
            self.cluster_name.keys())  #pred['cluster'].unique().astype(int)
        cluster_names = list(self.cluster_name.values())
        n_clusters = len(clusters)
        bounds = np.arange(min(clusters) - 0.5, max(clusters) + 0.51)
        norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

        # Default colorbar kwargs
        if self.cluster_by is None:
            cb_label = 'Cluster'
        else:
            cb_label = self.cluster_by
        cb_kwargs = {
            'label': cb_label,
            'norm': norm,
            'ticks': clusters,
            'ticklabels': cluster_names,
            'cbrect': [0.8, 0.1, 0.02, 0.65],
            'labelkwargs': {
                'size': 14
            },
            'tickparams': {
                'labelsize': 13
            }
        }
        # update with any user-specified kwargs
        cb_kwargs.update(cb_kw)

        qax.scatter(self.data[quat_axes].values,
                    c=self.pred['cluster'],
                    s=s,
                    cmap=cmap,
                    norm=norm,
                    vmin=vmin,
                    vmax=vmax,
                    colorbar=colorbar,
                    cb_kwargs=cb_kwargs,
                    **scatter_kw)

        qax.axes_ticks(size=13, corners='rbt', offset=0.08)
        if gridlines == True:
            qax.gridlines(ls=':', LW=0.6)
        qax.ax.axis('off')
        return qax

    def reduce_comp_dims(self, kernel='poly', gamma=10, **kpca_kw):
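        """
		Reduce the composition dimensions to two kernel PCA components (v1, v2) and store the
		reduced and reconstructed coordinates in pred. 'O' and 'Ba' are excluded from the reduction.
		
		Parameters
		----------
		kernel: kernel for KernelPCA
		gamma: gamma parameter for KernelPCA
		kpca_kw: additional kwargs for KernelPCA
		"""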
        comp_dims = self.comp_dims.copy()
        if 'O' in comp_dims:
            comp_dims.remove('O')
        if 'Ba' in comp_dims:
            comp_dims.remove('Ba')
        print('Dimensions for KPCA reduction:', comp_dims)
        self.kpca_dims = comp_dims
        #self.reconstructed = self.data.copy()
        # reconstructed dims
        rc_dims = [f'{d}_kpca' for d in comp_dims]
        self.kpca = KernelPCA(kernel=kernel,
                              n_components=2,
                              fit_inverse_transform=True,
                              gamma=gamma,
                              **kpca_kw)

        # self.reduced = self.data_pred.copy()
        # write reduced dimensions to pred (can't write to data_pred - it is basically just a SQL view)
        self.pred['v1'] = 0
        self.pred['v2'] = 0
        self.pred[['v1', 'v2']] = self.kpca.fit_transform(self.data[comp_dims])
        self.pred[rc_dims] = pd.DataFrame(self.kpca.inverse_transform(
            self.pred[['v1', 'v2']]),
                                          index=self.pred.index)

        # self.reduced[self.prop_dim] = self.data[self.prop_dim].values
        # self.reduced['outlier_flag'] = self.pred['outlier_flag'].values
        # self.reduced['cluster'] = self.pred['cluster'].values

        error = np.linalg.norm(self.data[comp_dims].values -
                               self.pred[rc_dims].values,
                               ord=2)
        print('Reconstruction error:', error)

        #return self.reduced, error

    def quat_reconstruction(self,
                            ax=None,
                            figsize=(8, 6),
                            gridlines=True,
                            color_col=None,
                            cb_label=None,
                            s=3,
                            data_filter=None,
                            **scatter_kw):
        """
		Plot reconstructed composition data
		"""
        rc_dims = [f'{d}_kpca' for d in self.kpca_dims]
        # pass arguments by keyword so they match quat_plot's signature
        self.quat_plot(ax=ax, figsize=figsize, quat_axes=rc_dims,
                       gridlines=gridlines, color_col=color_col,
                       cb_kw={} if cb_label is None else {'label': cb_label},
                       s=s, data_filter=data_filter, **scatter_kw)

    def reduced_plot(self,
                     ax=None,
                     cmap=plt.cm.viridis,
                     vmin=None,
                     vmax=None,
                     cbar=True,
                     cbrect=[0.88, 0.12, 0.02, 0.75],
                     **kwargs):
        """
		scatter plot of prop_dim in reduced-dimension composition space
		
		Args:
		-----
		kwargs: kwargs to pass to plt.scatter
		"""
        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = plt.gcf()
        if vmin is None:
            vmin = self.reduced[self.prop_dim].min()
        if vmax is None:
            vmax = self.reduced[self.prop_dim].max()

        ax.scatter(self.reduced['v1'],
                   self.reduced['v2'],
                   c=self.reduced[self.prop_dim],
                   cmap=cmap,
                   vmin=vmin,
                   vmax=vmax,
                   **kwargs)
        if cbar == True:
            add_colorbar(fig=fig,
                         ax=ax,
                         cmap=cmap,
                         label=self.prop_dim,
                         vmin=vmin,
                         vmax=vmax,
                         subplots_adjust=dict(left=0.1, right=0.8),
                         cbrect=cbrect)
        ax.set_xlabel('$v_1$')
        ax.set_ylabel('$v_2$')

    def reduced_highlight_plot(self,
                               ax=None,
                               cmap=plt.cm.viridis,
                               vmin=None,
                               vmax=None,
                               s=8,
                               cbar=True,
                               cbrect=[0.88, 0.12, 0.02, 0.75],
                               **kwargs):
        """
		scatter plot of prop_dim in reduced-dimension composition space with outliers highlighted in red
		
		Args:
		-----
		ax: axis on which to plot. if None, create new axis
		cmap: colormap
		vmin: vmin for colormap
		vmax: vmax for colormap
		s: marker size
		cbar: if True, create a colorbar
		cbrect: colorbar rectangle: [left, bottom, width, height]
		kwargs: kwargs to pass to plt.scatter
		"""
        outliers = self.reduced.loc[self.reduced['outlier_flag'] == -1, :]
        inliers = self.reduced.loc[self.reduced['outlier_flag'] != -1, :]

        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = plt.gcf()
        if vmin is None:
            vmin = self.reduced[self.prop_dim].min()
        if vmax is None:
            vmax = self.reduced[self.prop_dim].max()

        ax.scatter(inliers['v1'],
                   inliers['v2'],
                   c=inliers[self.prop_dim],
                   cmap=cmap,
                   vmin=vmin,
                   vmax=vmax,
                   s=s,
                   **kwargs)
        ax.scatter(outliers['v1'],
                   outliers['v2'],
                   c=outliers[self.prop_dim],
                   cmap=cmap,
                   vmin=vmin,
                   vmax=vmax,
                   s=s * 2,
                   edgecolors='r',
                   linewidths=0.7,
                   **kwargs)
        if cbar == True:
            add_colorbar(fig=fig,
                         ax=ax,
                         cmap=cmap,
                         label=self.prop_dim,
                         vmin=vmin,
                         vmax=vmax,
                         subplots_adjust=dict(left=0.1, right=0.8),
                         cbrect=cbrect)
        ax.set_xlabel('$v_1$')
        ax.set_ylabel('$v_2$')

        return ax

    def reduced_inlier_plot(self,
                            ax=None,
                            cmap=plt.cm.viridis,
                            vmin=None,
                            vmax=None,
                            cbar=True,
                            cbrect=[0.88, 0.12, 0.02, 0.75],
                            **kwargs):
        inliers = self.reduced.loc[self.reduced['outlier_flag'] != -1, :]

        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = plt.gcf()
        if vmin is None:
            vmin = self.reduced[self.prop_dim].min()
        if vmax is None:
            vmax = self.reduced[self.prop_dim].max()

        ax.scatter(inliers['v1'],
                   inliers['v2'],
                   c=inliers[self.prop_dim],
                   cmap=cmap,
                   vmin=vmin,
                   vmax=vmax,
                   **kwargs)
        if cbar == True:
            add_colorbar(fig=fig,
                         ax=ax,
                         cmap=cmap,
                         label=self.prop_dim,
                         vmin=vmin,
                         vmax=vmax,
                         subplots_adjust=dict(left=0.1, right=0.8),
                         cbrect=cbrect)
        ax.set_xlabel('$v_1$')
        ax.set_ylabel('$v_2$')

        return ax

    def reduced_outlier_plot(self,
                             ax=None,
                             cmap=plt.cm.viridis,
                             vmin=None,
                             vmax=None,
                             cbar=True,
                             cbrect=[0.88, 0.12, 0.02, 0.75],
                             **kwargs):
        """
		scatter plot of prop_dim in reduced-dimension composition space with outliers highlighted in red
		
		Args:
		-----
		kwargs: kwargs to pass to plt.scatter
		"""
        outliers = self.reduced.loc[self.reduced['outlier_flag'] == -1, :]

        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = plt.gcf()
        if vmin is None:
            vmin = self.reduced[self.prop_dim].min()
        if vmax is None:
            vmax = self.reduced[self.prop_dim].max()

        ax.scatter(outliers['v1'],
                   outliers['v2'],
                   c=outliers[self.prop_dim],
                   cmap=cmap,
                   vmin=vmin,
                   vmax=vmax,
                   **kwargs)
        if cbar == True:
            add_colorbar(fig=fig,
                         ax=ax,
                         cmap=cmap,
                         label=self.prop_dim,
                         vmin=vmin,
                         vmax=vmax,
                         subplots_adjust=dict(left=0.1, right=0.8),
                         cbrect=cbrect)
        ax.set_xlabel('$v_1$')
        ax.set_ylabel('$v_2$')

        return ax

    def reduced_cluster_plot(self,
                             ax=None,
                             cmap=plt.cm.plasma,
                             cbar=True,
                             cbrect=[0.88, 0.12, 0.02, 0.75],
                             **kwargs):

        vmin = self.pred['cluster'].min()
        vmax = self.pred['cluster'].max()

        #make norm for discrete colormap
        clusters = list(self.cluster_name.keys())
        cluster_names = list(self.cluster_name.values())
        n_clusters = len(self.pred['cluster'].unique())
        bounds = np.arange(min(clusters) - 0.5, max(clusters) + 0.51)
        norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

        if ax is None:
            fig, ax = plt.subplots()
        else:
            fig = plt.gcf()
        ax.scatter(self.reduced['v1'],
                   self.reduced['v2'],
                   c=self.reduced['cluster'],
                   cmap=cmap,
                   norm=norm,
                   **kwargs)
        ax.set_xlabel('$v_1$')
        ax.set_ylabel('$v_2$')
        if cbar == True:
            add_colorbar(fig=fig,
                         ax=ax,
                         cmap=cmap,
                         norm=norm,
                         label='Cluster',
                         ticks=clusters,
                         ticklabels=cluster_names,
                         subplots_adjust=dict(left=0.1, right=0.8),
                         cbrect=cbrect)

        return ax

    def cluster_sample(self):
        if self.cluster_by == 'sample':
            return self.cluster_name
        if 'sample' in self.data.columns:
            cluster_sample = {}
            for cluster, cdf in self.data_pred.groupby('cluster'):
                cluster_sample[cluster] = list(cdf['sample'].unique())
            return cluster_sample
        else:
            raise Exception('Data does not contain sample column')
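# --- Hedged usage sketch (added for illustration) ---------------------------------
# A minimal sketch of how the class above might be used, assuming it is constructed
# elsewhere in this source. The class name `OutlierDetector`, the constructor
# arguments and the data frame `df` are placeholders/assumptions; the method calls
# (fit_predict, remove_outliers, cluster_hist, cluster_scatter) follow the
# definitions shown above, and 'DBIFZ' is the method name referenced in
# cluster_hist's comments.
od = OutlierDetector(data=df, prop_dim='sigma')          # hypothetical constructor
od.fit_predict(method='DBIFZ', z_thresh=2)               # cluster, then flag outliers
print(len(od.outliers), 'outliers out of', len(od.data_pred), 'rows')
od.remove_outliers()                                     # stores inliers on clean_data
od.cluster_hist(ncols=2)                                 # z-score histograms per cluster
od.cluster_scatter('Co', 'Fe', flag_outliers=True)       # per-cluster scatter plots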
Example #42
0
"""
Created on Wed Apr  1 16:36:12 2020

@author: Niloy
"""

#DB SCAN clustering
#essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#import dataset
dataset = pd.read_csv('dataset.csv', error_bad_lines=False)
X = dataset.loc[:, ['latitude1', 'longitude1']]

#import dbscan
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=5, min_samples=5)

model = dbscan.fit(X)
labels = model.labels_

from sklearn import metrics

sample_cores = np.zeros_like(labels, dtype=bool)
sample_cores[dbscan.core_sample_indices_] = True

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

print(metrics.silhouette_score(X, labels))
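# Hedged addition: silhouette_score raises a ValueError when DBSCAN returns fewer
# than two distinct labels (e.g. everything is noise or one big cluster), so a small
# guard makes the scoring above robust. Variable names follow the snippet above.
non_noise = set(labels) - {-1}
if len(non_noise) >= 2:
    mask = labels != -1
    print('silhouette (non-noise points):', metrics.silhouette_score(X[mask], labels[mask]))
else:
    print('Too few clusters for a silhouette score; clusters found:', len(non_noise))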
Example #43
0
# coding: utf-8

# In[1]:

import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# In[2]:

data = np.genfromtxt('kmeans.txt', delimiter=' ')

# In[3]:

model = DBSCAN(eps=1, min_samples=4)
model.fit(data)

# In[4]:

result = model.fit_predict(data)
result

# In[5]:

mark = ['or', 'ob', 'og', 'oy', 'ok', 'om']
for i, d in enumerate(data):
    plt.plot(d[0], d[1], mark[result[i]])

plt.show()

# In[ ]:
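# Hedged alternative (added for illustration): a single scatter colored by label
# scales to any number of clusters, whereas indexing the fixed 6-entry marker list
# above fails with more than 6 clusters and silently reuses 'om' for noise (-1).
noise = result == -1
plt.scatter(data[~noise, 0], data[~noise, 1], c=result[~noise], cmap='tab10', s=20)
plt.scatter(data[noise, 0], data[noise, 1], c='k', marker='x', s=30, label='noise')
plt.legend()
plt.show()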
Example #44
0
def clusterByDbscan(dataList, epsRadius, minSamples):
    """
    # DBSCAN: defines a cluster as the maximal set of density-connected points; it can carve
    #   regions of sufficiently high density into clusters and find clusters of arbitrary shape
    #   in noisy spatial data.
    # Density: the density of a point is the number of points inside the circle of radius EPS
    #   centered on that point.
    # Border point: a point whose density is below the given threshold min_samples.
    # Noise point: a point that is neither a core point nor a border point, i.e. a point of density 1.
    # DBSCAN clusters well here because it aggregates by density reachability within the chosen
    #   activity radius, but it only partitions the data set and does not compute cluster centers.
    dataList:[{"id","lat","lng",""},{"id","lat","lng",""}], must include the id, lat, lng fields
    epsRadius: within-cluster straight-line distance (km)
    minSamples: minimum number of points
    return:
    {'noiseIds': [24, 25], 'clusterSet': [{'clusterCoreIds': [20, 21, 26], 'clusterCenterId': 26.0, 'clusterAroundIds': []}, {'clusterCoreIds': [22, 23], 'clusterCenterId': 22.0, 'clusterAroundIds': []}]}
    """
    clusterInfo = {}
    if epsRadius and dataList:
        df = pd.DataFrame(dataList)
        df["lat"] = df["lat"].astype(float)
        df["lng"] = df["lng"].astype(float)
        X = df[["lat", "lng"]]
        distance_matrix = squareform(pdist(X, (lambda u, v: haversine(u, v))))
        # Example: use 0.5 km (500 m) as the density aggregation radius; great-circle (haversine)
        # distance is used to measure geographic distance for the clustering radius.
        # min_samples is set to 3 (a cluster point requires at least three people).
        # db = DBSCAN(eps=0.5, min_samples=3, metric='precomputed')
        # If metric is set to 'precomputed', X must be a precomputed pairwise distance matrix,
        # and fit will use that matrix directly; otherwise pass an (n_samples, n_features) array.
        db = DBSCAN(
            eps=epsRadius, min_samples=minSamples,
            metric='precomputed')  # metric='precomputed' works on the precomputed distance matrix, which saves memory
        db.fit(distance_matrix)  # train the model
        y = db.fit_predict(distance_matrix)  # predicted cluster labels
        # mark the indices of core samples as True
        coreSamplesMask = zeros_like(db.labels_, dtype=bool)
        coreSamplesMask[db.core_sample_indices_] = True
        #print(db.core_sample_indices_)
        # cluster labels (one per sample) and the number of clusters; samples labeled -1 are noise points
        clusterLabels = db.labels_
        uniqueClusterLabels = set(clusterLabels)
        nClusters = len(uniqueClusterLabels) - (-1 in clusterLabels)
        #print(nClusters)
        # noise points
        clusterInfo = {}
        offset_mask = (clusterLabels == -1)
        noiseIds = df.loc[offset_mask, ["id"]].values
        clusterInfo["noiseIds"] = noiseIds.flatten().tolist()

        clusterSet = []
        for i, clusterLabel in enumerate(uniqueClusterLabels):
            # clusterIndex is a True/False array; True marks samples belonging to this cluster
            clusterData = {}
            if clusterLabel != -1:
                clusterIndex = (clusterLabels == clusterLabel)
                # compute the center point of the cluster's core points
                clusterDf = df.loc[clusterIndex & coreSamplesMask,
                                   ["id", "lat", "lng"]]
                clusterCorePoints = df.loc[clusterIndex & coreSamplesMask,
                                           ["id"]].values
                clusterData["clusterCoreIds"] = clusterCorePoints.flatten(
                ).tolist()
                clusterData["clusterCenterId"] = calcClusterCenter(
                    clusterDf)[0]
                # border points
                aroundPoints = df.loc[(clusterIndex & ~coreSamplesMask),
                                      ["id"]].values
                clusterData["clusterAroundIds"] = aroundPoints.flatten(
                ).tolist()
                clusterSet.append(clusterData)
        clusterInfo["clusterSet"] = clusterSet

    return clusterInfo
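# Hedged usage sketch for clusterByDbscan (added for illustration). It assumes the
# haversine and calcClusterCenter helpers and the pandas/scipy imports used above are
# defined elsewhere in this source; the coordinates below are made-up sample data.
sample_points = [
    {"id": 1, "lat": 39.9042, "lng": 116.4074},
    {"id": 2, "lat": 39.9050, "lng": 116.4080},
    {"id": 3, "lat": 39.9045, "lng": 116.4071},
    {"id": 4, "lat": 31.2304, "lng": 121.4737},  # far away, likely noise
]
result = clusterByDbscan(sample_points, epsRadius=0.5, minSamples=3)
print(result)  # {'noiseIds': [...], 'clusterSet': [...]} as documented in the docstring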
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_diff_srv_rate',
    'dst_host_srv_diff_host_rate', 'service_ecr_i', 'service_private',
    'service_http', 'service_eco_i', 'service_other', 'service_ftp_data',
    'service_smtp', 'service_ftp', 'service_domain_u', 'service_telnet',
    'protocol_type_icmp', 'protocol_type_tcp', 'flag_SF', 'flag_S0',
    'protocol_type_udp', 'flag_REJ', 'flag_RSTR', 'flag_SH'
]

final_pandas_df_2 = final_pandas_df_1[impcol]
#final_pandas_df.info()
final_pandas_df = final_pandas_df_2[0:10000]
target = target_1[0:10000]
dbscan = DBSCAN(eps=3, algorithm='kd_tree', min_samples=5)

dbscan.fit(final_pandas_df)
#print(dbscan.labels_)

labels = dbscan.labels_
final_pandas_df['actual_response'] = target
final_pandas_df['predictions'] = labels

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
#final_pandas_df.to_csv('F:/sem3/big_data/final_project/New folder/sample_test50k.csv',sep = '\t')
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
dff = sqlCtx.createDataFrame(final_pandas_df)
print(type(dff))
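# Hedged addition: DBSCAN label ids are arbitrary, so compare them to the ground
# truth with a permutation-invariant score instead of raw accuracy. 'target' and
# 'labels' are the variables defined in the fragment above.
from sklearn.metrics import adjusted_rand_score

ari = adjusted_rand_score(target, labels)
n_noise = int((labels == -1).sum())
print('Adjusted Rand Index: %.3f, noise points: %d' % (ari, n_noise))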
     [1.5, 3.75], [1.75, 3.25], [2.0, 3.5], [3.0, 2.25], [3.5, 1.75],
     [3.75, 8.75], [3.95, 0.9], [4.0, 1.5], [2.5, 2.75], [2.25, 2.25],
     [2.0, 3.5], [2.75, 1.75], [4.5, 1.1], [5.0, 9.0], [8.75, 5.15],
     [8.0, 2.25], [8.25, 3.0], [8.5, 4.75], [8.5, 4.25], [8.25, 3.35],
     [7.0, 1.75], [8.0, 3.5], [6.0, 1.25], [5.5, 1.75], [5.25, 1.25],
     [4.9, 1.25], [5.0, 1.5], [7.5, 2.25], [7.75, 2.75], [6.75, 2.0],
     [6.25, 1.75], [4.5, 1.1], [3.0, 4.5], [7.0, 4.5], [5.0, 3.0], [4.0, 3.35],
     [6.0, 3.35], [4.25, 3.25], [5.75, 3.25], [3.5, 3.75], [6.5, 3.75],
     [3.25, 4.0], [6.75, 4.0], [3.75, 3.55], [6.25, 3.55], [4.75, 3.05],
     [5.25, 3.05], [4.5, 3.15], [5.5, 3.15], [4.0, 6.5], [4.0, 6.75],
     [4.0, 6.25], [3.75, 6.5], [4.25, 6.5], [4.25, 6.75], [3.75, 6.25],
     [6.0, 6.5], [6.0, 6.75], [6.0, 6.25], [5.75, 6.75], [5.75, 6.25],
     [6.25, 6.75], [6.25, 6.25], [9.5, 9.5], [2.5, 9.5], [1.0, 8.0]]
data = np.asarray(X)
dbscan = DBSCAN(eps=2, min_samples=10)
dbscan.fit(data)
pca = PCA(n_components=2).fit(data)
pca_2d = pca.transform(data)

cnt1 = cnt2 = cnt3 = cnt4 = cnt5 = cnt6 = cnt7 = cnt8 = cnt9 = cnt10 = cnt11 = cnt12 = 0
for i in range(0, pca_2d.shape[0]):
    if dbscan.labels_[i] == -1:
        c1 = pl.scatter(pca_2d[i, 0], pca_2d[i, 1], c='r', marker='x')
        cnt1 = cnt1 + 1
    elif dbscan.labels_[i] == 0:
        c2 = pl.scatter(pca_2d[i, 0], pca_2d[i, 1], c='g', marker=(8, 2))
        cnt2 = cnt2 + 1
    elif dbscan.labels_[i] == 1:
        c3 = pl.scatter(pca_2d[i, 0], pca_2d[i, 1], c='b', marker=(8, 2))
        cnt3 = cnt3 + 1
    elif dbscan.labels_[i] == 2:
def do_clustering(target_csv, cluster_method):
    num_cluster = 24
    df_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, target_csv + '.csv'),
                          index_col=0,
                          header=0,
                          encoding='utf-8-sig')
    df_data.index.name = 'short_code'
    print(df_data.iloc[:100])
    print(df_data.shape)

    start_time = time.time()
    if cluster_method == 0:
        clustering = DBSCAN(eps=0.3, min_samples=1000)
        clustering.fit(df_data)
        csv_name = 'clustered_dbscan_' + target_csv + '.csv'
    elif cluster_method == 1:
        clustering = OPTICS(min_samples=1000, metric='cosine')
        clustering.fit(df_data)
        csv_name = 'clustered_optics_' + target_csv + '.csv'
    elif cluster_method == 2:
        clustering = AgglomerativeClustering(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_ward_' + target_csv + '.csv'
    elif cluster_method == 3:
        clustering = AgglomerativeClustering(affinity='cosine',
                                             linkage='complete',
                                             n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_agglo_complete_' + target_csv + '.csv'
    elif cluster_method == 4:
        clustering = AgglomerativeClustering(affinity='cosine',
                                             linkage='single',
                                             n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_agglo_single_' + target_csv + '.csv'
    elif cluster_method == 5:
        clustering = Birch(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_birch_' + target_csv + '.csv'
    elif cluster_method == 6:
        clustering = KMeans(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_kmeans_' + target_csv + '.csv'
    elif cluster_method == 7:
        clustering = SpectralClustering(n_clusters=num_cluster,
                                        random_state=42,
                                        assign_labels='discretize')
        clustering.fit(df_data)
        csv_name = 'clustered_spectral_' + target_csv + '.csv'
    print("time elapsed for clustering: " + str(time.time() - start_time))
    print(clustering.get_params())
    print(clustering.labels_)
    count_percentage(clustering.labels_)
    result_df = pd.DataFrame(data=clustering.labels_,
                             index=df_data.index,
                             columns=['cluster'])

    start_time = time.time()
    print("calinski_harabasz_score: ",
          calinski_harabasz_score(df_data, result_df['cluster'].squeeze()))
    print("silhouette_score: ",
          silhouette_score(df_data, result_df['cluster'].squeeze()))
    print("davies_bouldin_score: ",
          davies_bouldin_score(df_data, result_df['cluster'].squeeze()))
    print("time elapsed for scoring: " + str(time.time() - start_time))
    result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name),
                     encoding='utf-8-sig')
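# Hedged usage sketch for do_clustering (added for illustration). CONFIG.CSV_PATH,
# count_percentage and the clustering/scoring imports are assumed to be defined
# elsewhere in this source; the csv basename 'embedded_posts' is a placeholder.
for method_id, name in [(0, 'DBSCAN'), (6, 'KMeans')]:
    print('running', name)
    do_clustering('embedded_posts', cluster_method=method_id)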
Example #48
0
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics

eps = 2
minpts = 2

# Create condensed distance vector
df = pd.read_csv('normalized_undersampled.csv')
dbs = DBSCAN(eps=eps, min_samples=minpts)
db = dbs.fit(df)
print(db)
labels_sklearn = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels_sklearn)) - (1 if -1 in labels_sklearn else 0)

# Evaluate
real_label = []
with open('CencusIncomeUndersampled.csv', 'r') as f:
    lines = f.readlines()
    count = 0
    for line in lines:
        x = line.split(',')
        if (x[-1] == '<=50K\n'):
            real_label.append(0.0)
        elif (x[-1] == '>50K\n'):
            real_label.append(1.0)

print(labels_sklearn)
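# Hedged addition: compare the DBSCAN labels to the parsed ground truth. The two
# sequences should have equal length (a header row or a missing trailing newline in
# the csv would throw them off), so guard before scoring. 'metrics' is imported above.
if len(real_label) == len(labels_sklearn):
    print('clusters found (excluding noise):', n_clusters_)
    print('adjusted rand index:', metrics.adjusted_rand_score(real_label, labels_sklearn))
    print('homogeneity:', metrics.homogeneity_score(real_label, labels_sklearn))
else:
    print('length mismatch between labels:', len(real_label), len(labels_sklearn))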
Example #49
0
k = int(input("Which k yielded the best silhouette score? "))

kmeans = KMeans(n_clusters=k)
kmeans.fit(dataset)
print(kmeans.labels_)
print(silhouette_score(dataset, kmeans.labels_, metric='euclidean'))
print("*****")

agglomerative = AgglomerativeClustering()
agglomerative.fit(dataset)
print(agglomerative.labels_)
print(silhouette_score(dataset, agglomerative.labels_, metric='euclidean'))
print("*****")

scan = DBSCAN(eps=0.5, min_samples=2)
scan.fit(dataset)
print(scan.labels_)
print(silhouette_score(dataset, scan.labels_, metric='euclidean'))
print("*****")

scan2 = DBSCAN(eps=0.55, min_samples=2)
scan2.fit(dataset)
print(scan2.labels_)
print(silhouette_score(dataset, scan2.labels_, metric='euclidean'))
print("*****")

scan3 = DBSCAN(eps=0.6, min_samples=3)
scan3.fit(dataset)
print(scan3.labels_)
print(silhouette_score(dataset, scan3.labels_, metric='euclidean'))
print("*****")
Example #50
0
def main():
    # get images
    image_1_path = e1.get()
    image_2_path = e2.get()
    try:
        image_1_RGB = plt.imread(image_1_path)
        image_2_RGB = plt.imread(image_2_path)
        color_tolerance = float(e4.get())
        cluster_tolerance = float(e3.get())
        pass
    except:
        state.set('ERROR')
        lstate.config(bg='#FF7F7F')
        window.update_idletasks()
        messagebox.showinfo(title='ERROR', message='Invalid input!')
        return None
        pass

    # update the state
    lstate.config(bg='#7FFF7F')
    window.update_idletasks()

    # show image
    state.set('Displaying images...')
    window.update_idletasks()
    img_open = Image.open(e1.get())
    img = img_open.resize((128, 64))
    img = ImageTk.PhotoImage(img)
    lp1.config(image=img)
    lp1.image = img
    window.update_idletasks()
    # show image
    img_open = Image.open(e2.get())
    img = img_open.resize((128, 64))
    img = ImageTk.PhotoImage(img)
    lp2.config(image=img)
    lp2.image = img
    window.update_idletasks()

    # resize to speed up
    image_1_RGB = Image.open(image_1_path)
    w_resize = 96
    h_resize = int(w_resize * image_1_RGB.size[1] / image_1_RGB.size[0])
    image_1_RGB = image_1_RGB.resize((w_resize, h_resize))
    image_1_RGB = np.array(image_1_RGB)
    # resize to speed up
    image_2_RGB = Image.open(image_2_path)
    w_resize = 96
    h_resize = int(w_resize * image_2_RGB.size[1] / image_2_RGB.size[0])
    image_2_RGB = image_2_RGB.resize((w_resize, h_resize))
    image_2_RGB = np.array(image_2_RGB)

    state.set('Converting RGB to LAB...')
    window.update_idletasks()
    image_1_LAB = cv2.cvtColor(image_1_RGB, cv2.COLOR_RGB2LAB)
    image_2_LAB = cv2.cvtColor(image_2_RGB, cv2.COLOR_RGB2LAB)

    # image 1
    state.set('Clustering image 1...')
    window.update_idletasks()
    dbscan1 = DBSCAN(eps=cluster_tolerance, min_samples=3)
    h_1, w_1, c_1 = image_1_LAB.shape
    image_1_data = image_1_LAB.reshape((h_1 * w_1, c_1))

    image_1_lab_data = []
    for data in image_1_data:
        image_1_lab_data.append(
            [data[0] * 100 / 255, data[1] - 128, data[2] - 128])
        pass
    image_1_lab_data = np.array(image_1_lab_data)

    dbscan1.fit(image_1_lab_data)
    labels = dbscan1.labels_
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
    # find the cluster center
    theme_1 = []
    cluster_area_1 = []
    for i in range(n_clusters_1):
        one_cluster = image_1_lab_data[labels == i]
        km = KMeans(n_clusters=1, max_iter=600)
        km.fit(one_cluster)
        theme_1.append(np.squeeze(km.cluster_centers_))
        cluster_area_1.append(len(one_cluster) / len(image_1_lab_data))
        pass
    theme_1 = np.array(theme_1)
    # show image
    uint8_theme_1 = []
    for theme in theme_1:
        uint8_theme_1.append(
            [theme[0] * 255 / 100, theme[1] + 128, theme[2] + 128])
        pass
    uint8_theme_1 = np.array(uint8_theme_1)

    pic_array = cv2.cvtColor(
        np.uint8(uint8_theme_1.reshape(1, len(uint8_theme_1), 3)),
        cv2.COLOR_LAB2RGB)
    pic_array = make_image(pic_array[0])
    pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB')
    img = ImageTk.PhotoImage(pic)
    lp1c.config(image=img)
    lp1c.image = img
    window.update_idletasks()

    # image 2
    state.set('Clustering image 2...')
    window.update_idletasks()
    dbscan2 = DBSCAN(eps=cluster_tolerance, min_samples=3)
    h_2, w_2, c_2 = image_2_LAB.shape
    image_2_data = image_2_LAB.reshape((h_2 * w_2, c_2))

    image_2_lab_data = []
    for data in image_2_data:
        image_2_lab_data.append(
            [data[0] * 100 / 255, data[1] - 128, data[2] - 128])
        pass
    image_2_lab_data = np.array(image_2_lab_data)

    dbscan2.fit(image_2_lab_data)
    labels = dbscan2.labels_
    n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0)
    # find the cluster center
    theme_2 = []
    cluster_area_2 = []
    for i in range(n_clusters_2):
        one_cluster = image_2_lab_data[labels == i]
        km = KMeans(n_clusters=1, max_iter=600)
        km.fit(one_cluster)
        theme_2.append(np.squeeze(km.cluster_centers_))
        cluster_area_2.append(len(one_cluster) / len(image_2_lab_data))
        pass
    theme_2 = np.array(theme_2)
    # show image
    uint8_theme_2 = []
    for theme in theme_2:
        uint8_theme_2.append(
            [theme[0] * 255 / 100, theme[1] + 128, theme[2] + 128])
        pass
    uint8_theme_2 = np.array(uint8_theme_2)

    pic_array = cv2.cvtColor(
        np.uint8(uint8_theme_2.reshape(1, len(uint8_theme_2), 3)),
        cv2.COLOR_LAB2RGB)
    pic_array = make_image(pic_array[0])
    pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB')
    img = ImageTk.PhotoImage(pic)
    lp2c.config(image=img)
    lp2c.image = img
    window.update_idletasks()

    state.set('Clustering finished')
    window.update_idletasks()

    def calc_chromatism(lab1, lab2):
        deltaL = lab1[0] - lab2[0]
        deltaA = lab1[1] - lab2[1]
        deltaB = lab1[2] - lab2[2]
        deltaE = (deltaL**2 + deltaA**2 + deltaB**2)**0.5
        return deltaE

    '''
    # image 1 area
    image1_color_area = []
    state.set('Computing per-color area fractions for image 1... '+str(0)+'%')
    window.update_idletasks()
    for i in range(n_clusters_1):
        num_same_pixs = 0
        L1 = theme_1[i][0]
        A1 = theme_1[i][1]
        B1 = theme_1[i][2]
        LAB1 = [L1, A1, B1]
        for j in range(0, h_1*w_1):
            L2 = image_1_lab_data[j][0]
            A2 = image_1_lab_data[j][1]
            B2 = image_1_lab_data[j][2]
            LAB2 = [L2, A2, B2]
            deltaE = calc_chromatism(LAB1, LAB2)
            if deltaE <= color_tolerance:
                num_same_pixs += 1
                pass
            pass
        area = num_same_pixs/(h_1*w_1)
        image1_color_area.append(area)
        state.set('Computing per-color area fractions for image 1... '+str(int(100*(i+1)/n_clusters_1))+'%')
        window.update_idletasks()
        pass
    #print(sum(image1_color_area))
    
    # image 2 area
    image2_color_area = []
    state.set('Computing per-color area fractions for image 2... '+str(0)+'%')
    window.update_idletasks()
    for i in range(n_clusters_2):
        num_same_pixs = 0
        L1 = theme_2[i][0]
        A1 = theme_2[i][1]
        B1 = theme_2[i][2]
        LAB1 = [L1, A1, B1]
        for j in range(0, h_2*w_2):
            L2 = image_2_lab_data[j][0]
            A2 = image_2_lab_data[j][1]
            B2 = image_2_lab_data[j][2]
            LAB2 = [L2, A2, B2]
            deltaE = calc_chromatism(LAB1, LAB2)
            if deltaE <= color_tolerance:
                num_same_pixs += 1
                pass
            pass
        area = num_same_pixs/(h_2*w_2)
        image2_color_area.append(area)
        state.set('Computing per-color area fractions for image 2... '+str(int(100*(i+1)/n_clusters_2))+'%')
        window.update_idletasks()
        pass
    #print(sum(image2_color_area))
    
    state.set('Area fraction computation finished')
    window.update_idletasks()
    '''
    '''
    image1_color_area
    image2_color_area
    cluster_area_1
    cluster_area_2
    '''

    Image_1_Area = cluster_area_1[:]
    Image_2_Area = cluster_area_2[:]
    print(np.sum(Image_1_Area))
    print(np.sum(Image_2_Area))

    state.set('Selecting common colors...')
    window.update_idletasks()
    common_color = []
    common_area = []
    common_color_A = []
    common_color_B = []
    for i in range(n_clusters_1):
        L1 = theme_1[i][0]
        A1 = theme_1[i][1]
        B1 = theme_1[i][2]
        LAB1 = [L1, A1, B1]
        for j in range(n_clusters_2):
            L2 = theme_2[j][0]
            A2 = theme_2[j][1]
            B2 = theme_2[j][2]
            LAB2 = [L2, A2, B2]
            deltaE = calc_chromatism(LAB1, LAB2)
            if deltaE <= color_tolerance:
                S1 = Image_1_Area[i] / (Image_1_Area[i] + Image_2_Area[j])
                S2 = Image_2_Area[j] / (Image_1_Area[i] + Image_2_Area[j])
                L3 = L1 * S1 + L2 * S2
                A3 = A1 * S1 + A2 * S2
                B3 = B1 * S1 + B2 * S2
                L1 = round(L1, 3)
                A1 = round(A1, 3)
                B1 = round(B1, 3)
                L2 = round(L2, 3)
                A2 = round(A2, 3)
                B2 = round(B2, 3)
                L3 = round(L3, 3)
                A3 = round(A3, 3)
                B3 = round(B3, 3)
                LAB1 = [L1, A1, B1]
                LAB2 = [L2, A2, B2]
                LAB3 = [L3, A3, B3]
                common_color_A.append(LAB1)
                common_color_B.append(LAB2)
                common_color.append(LAB3)
                common_area.append((Image_1_Area[i], Image_2_Area[j]))
                pass
            pass
        pass

    #print(common_color)
    #print(common_area)
    state.set('Common color selection finished')
    window.update_idletasks()

    title = ' ' * 22 + 'LAB' + ' ' * (
        48 - 3) + 'A' + ' ' * 48 + 'B' + ' ' * 32 + 'Std Color'
    listbox.delete(0, tk.END)
    listbox.insert(tk.END, title)
    window.update_idletasks()

    result_info = []
    for i in range(len(common_color)):
        #info = '{:4d}'.format(i+1) + ' '*4
        info = '[{:3.3f} {:3.3f} {:3.3f}]'.format(common_color[i][0], \
                 common_color[i][1], common_color[i][2])
        info += ' ' * (28 - len(info))
        info += '{:3.2f}'.format(100 * common_area[i][0]) + '%' + ' ' * 4
        info += '[{:3.3f} {:3.3f} {:3.3f}]'.format(common_color_A[i][0], \
                 common_color_A[i][1], common_color_A[i][2])
        info += ' ' * (64 - len(info))
        info += '{:3.2f}'.format(100 * common_area[i][1]) + '%' + ' ' * 4
        info += '[{:3.3f} {:3.3f} {:3.3f}]'.format(common_color_B[i][0], \
                 common_color_B[i][1], common_color_B[i][2])
        info += ' ' * (100 - len(info))
        selected_std_color = select_std_color(common_color[i])
        info += selected_std_color

        res = (selected_std_color, info)
        result_info.append(res)

        pass

    colors = []
    dict_colors = {}
    nums = []
    for i in range(len(result_info)):
        colors.append(result_info[i][0])
        colors_set = set(colors)
        pass
    for color in colors_set:
        num = colors.count(color)
        if str(num) not in dict_colors.keys():
            nums.append(num)
            dict_colors[str(num)] = [color]
            pass
        else:
            dict_colors[str(num)].append(color)
            pass
        pass
    #print(dict_colors)

    index = 0
    while dict_colors != {}:
        num = max(nums)
        key = str(num)
        for color in dict_colors[key]:
            LAB1 = std_colors[color]
            num_same_pixs = 0
            for n in range(0, h_1 * w_1):
                L2 = image_1_lab_data[n][0]
                A2 = image_1_lab_data[n][1]
                B2 = image_1_lab_data[n][2]
                LAB2 = [L2, A2, B2]
                deltaE = calc_chromatism(LAB1, LAB2)
                if deltaE <= color_tolerance:
                    num_same_pixs += 1
                    pass
                pass
            area_A = num_same_pixs / (h_1 * w_1)
            num_same_pixs = 0
            for n in range(0, h_2 * w_2):
                L2 = image_2_lab_data[n][0]
                A2 = image_2_lab_data[n][1]
                B2 = image_2_lab_data[n][2]
                LAB2 = [L2, A2, B2]
                deltaE = calc_chromatism(LAB1, LAB2)
                if deltaE <= color_tolerance:
                    num_same_pixs += 1
                    pass
                pass
            area_B = num_same_pixs / (h_2 * w_2)
            area = [round(100 * area_A, 2), round(100 * area_B, 2)]
            for color_info in result_info:
                if color_info[0] == color:
                    index += 1
                    std_color_pic = [[[
                        std_colors[color_info[0]][0] * 255 / 100,
                        std_colors[color_info[0]][1] + 128,
                        std_colors[color_info[0]][2] + 128
                    ]]]
                    RGB_pic = cv2.cvtColor(np.uint8(std_color_pic),
                                           cv2.COLOR_LAB2RGB)
                    RGB = np.squeeze(RGB_pic)
                    listbox.insert(tk.END, ' ')
                    c = '#'
                    R = RGB[0]
                    R = hex(R)
                    R = str(R)[2:]
                    if len(R) == 1:
                        R += '0'
                        pass
                    G = RGB[1]
                    G = hex(G)
                    G = str(G)[2:]
                    if len(G) == 1:
                        G += '0'
                        pass
                    B = RGB[2]
                    B = hex(B)
                    B = str(B)[2:]
                    if len(B) == 1:
                        B += '0'
                        pass
                    c += R + G + B
                    #print(c)
                    listbox.itemconfig(tk.END, bg=c)
                    info = '{:4d}'.format(index) + ' ' * 4
                    info += color_info[1][:]
                    info += ' ' * 4 + '[{:3.2f}% {:3.2f}%]'.format(
                        area[0], area[1])
                    listbox.insert(tk.END, info)
                    window.update_idletasks()
                    pass
                pass
            pass
        del dict_colors[key]
        nums.remove(num)
        pass

    scrollbar.config(command=listbox.yview)
    window.update_idletasks()

    pass
Example #51
0
while n <= 2:
    labImg = cv.pyrDown(labImg)
    n = n + 1

#Squash image feature vector, 3 channels
featureImg = np.reshape(labImg, ([-1, 3]))
row, col, ch = labImg.shape

#flower image
#db = DBSCAN(eps = 5, min_samples = 10, metric = 'euclidean', algorithm = 'auto')
#Highway image
#db = DBSCAN(eps = 5, min_samples = 5, metric = 'euclidean', algorithm = 'brute')
#cars image
db = DBSCAN(eps=5, min_samples=10, metric='euclidean', algorithm='auto')

db.fit(featureImg)
labels = db.labels_
components = db.components_

indices = np.dstack(np.indices(labImg.shape[:2]))
xyColors = np.concatenate((labImg, indices), axis=-1)
featureImage2 = np.reshape(xyColors, ([-1, 5]))
db.fit(featureImage2)
labels2 = db.labels_
components2 = db.components_

figureSize = 10
plt.figure(figsize=(figureSize, figureSize))
plt.subplot(1, 2, 1)
plt.imshow(image)
plt.subplot(1, 2, 2)
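# --- Sketch (not part of the original listing) ---
# The listing above stops right after selecting the second subplot. One way
# to finish the figure is to reshape the per-pixel labels back onto the image
# grid and display them; noise points (label -1) simply get their own color.
# This reuses labels2, row, col and plt from the listing and assumes labels2
# has one entry per pixel of the downsampled labImg.
segmented = np.reshape(labels2, (row, col))
plt.imshow(segmented, cmap='tab20')
plt.title('DBSCAN labels (Lab + x,y features)')
plt.show()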
Example #52
0
def cluster(X):

    dbscan = DBSCAN(metric='precomputed')
    db = dbscan.fit(X)
    print(db.labels_)
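# --- Sketch (not part of the original listing) ---
# cluster() above expects a precomputed distance matrix. A minimal way to
# build one from raw feature vectors is scikit-learn's pairwise_distances;
# the toy points below are made up purely for illustration, and DBSCAN is
# assumed to be imported in the same module as cluster().
import numpy as np
from sklearn.metrics import pairwise_distances

points = np.random.RandomState(0).rand(20, 3)        # 20 toy points in 3-D
D = pairwise_distances(points, metric='euclidean')   # symmetric (20, 20) matrix
cluster(D)                                            # prints one label per point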
Example #53
0
def main(args):
    pnet, rnet, onet = create_network_face_detection(args.gpu_memory_fraction)

    with tf.Graph().as_default(), tf.device('/device:GPU:0'):

        with tf.Session() as sess:
            facenet.load_model(args.model)

            image_list = load_images_from_folder(args.data_dir)
            images = align_data(image_list, args.image_size, args.margin, pnet,
                                rnet, onet)

            images_placeholder = sess.graph.get_tensor_by_name("input:0")
            embeddings = sess.graph.get_tensor_by_name("embeddings:0")
            phase_train_placeholder = sess.graph.get_tensor_by_name(
                "phase_train:0")
            feed_dict = {
                images_placeholder: images,
                phase_train_placeholder: False
            }
            emb = sess.run(embeddings, feed_dict=feed_dict)

            nrof_images = len(images)

            matrix = np.zeros((nrof_images, nrof_images))

            print('')
            # Print distance matrix
            print('Distance matrix')
            print('    ', end='')
            for i in range(nrof_images):
                print('    %1d     ' % i, end='')
            print('')
            for i in range(nrof_images):
                print('%1d  ' % i, end='')
                for j in range(nrof_images):
                    dist = np.sqrt(
                        np.sum(np.square(np.subtract(emb[i, :], emb[j, :]))))
                    matrix[i][j] = dist
                    print('  %1.4f  ' % dist, end='')
                print('')

            print('')

            # DBSCAN does not require the number of clusters to be specified in advance.
            db = DBSCAN(eps=args.cluster_threshold,
                        min_samples=args.min_cluster_size,
                        metric='precomputed')
            db.fit(matrix)
            labels = db.labels_

            # get number of clusters
            no_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            print('No of clusters:', no_clusters)

            if no_clusters > 0:
                if args.largest_cluster_only:
                    largest_cluster = 0
                    for i in range(no_clusters):
                        print('Cluster {}: {}'.format(
                            i,
                            np.nonzero(labels == i)[0]))
                        if len(np.nonzero(labels == i)[0]) > len(
                                np.nonzero(labels == largest_cluster)[0]):
                            largest_cluster = i
                    print('Saving largest cluster (Cluster: {})'.format(
                        largest_cluster))
                    cnt = 1
                    for i in np.nonzero(labels == largest_cluster)[0]:
                        misc.imsave(
                            os.path.join(args.out_dir,
                                         str(cnt) + '.png'), images[i])
                        cnt += 1
                else:
                    print('Saving all clusters')
                    for i in range(no_clusters):
                        cnt = 1
                        print('Cluster {}: {}'.format(
                            i,
                            np.nonzero(labels == i)[0]))
                        path = os.path.join(args.out_dir, str(i))
                        if not os.path.exists(path):
                            os.makedirs(path)
                        for j in np.nonzero(labels == i)[0]:
                            misc.imsave(
                                os.path.join(path,
                                             str(cnt) + '.png'), images[j])
                            cnt += 1
Example #54
0
def DBSCAN_Clusterization(X, EPS, MIN_SAMPLES):

    DBClusters = DBSCAN(eps=EPS,
                        min_samples=MIN_SAMPLES,
                        metric='euclidean',
                        algorithm='auto')  #'kd_tree')
    DBClusters.fit(X)
    #DBClusters.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(
        DBClusters.labels_)) - (1 if -1 in DBClusters.labels_ else 0)
    core_samples = np.zeros_like(DBClusters.labels_, dtype=bool)
    core_samples[DBClusters.core_sample_indices_] = True

    # PRINT CLUSTERS & # of CLUSTERS
    #    print("Clusters:"+str(DBClusters.labels_))
    #
    #    print('Estimated number of clusters: %d' % n_clusters_)

    clusters = [X[DBClusters.labels_ == i] for i in range(n_clusters_)]
    outliers = X[DBClusters.labels_ == -1]

    if plot:
        plt.clf()
        # Plot Outliers
        plt.scatter(outliers[:, 0],
                    outliers[:, 1],
                    c="black",
                    label="Outliers")

    # Plot Clusters
    cmap = get_cmap(len(clusters))
    x_clusters = [None] * len(clusters)
    y_clusters = [None] * len(clusters)
    #colors = [0]
    colors = "bgrcmykw"
    color_index = 0
    for i in range(len(clusters)):
        x_clusters[i] = []
        y_clusters[i] = []
        # print("Tamano Cluster "+ str(i) + ": " + str(len(clusters[i])))
        for j in range(len(clusters[i])):
            x_clusters[i].append(clusters[i][j][0])
            y_clusters[i].append(clusters[i][j][1])

    #
        if plot:
            plt.scatter(x_clusters[i],
                        y_clusters[i],
                        label="Cluster %d" % i,
                        s=8**2,
                        c=colors[color_index])  #c=cmap(i))
        if color_index == len(colors) - 1:
            color_index = 0
        else:
            color_index += 1

    if plot:
        #plot the Clusters
        #plt.title("Clusters Vs Serving UABS")
        plt.scatter(x2, y2, c="yellow", label="UABSs",
                    s=10**2)  #plot UABS new position
        plt.xlabel('x (meters)', fontsize=16)
        plt.ylabel('y (meters)', fontsize=16)
        plt.legend(loc='upper center',
                   bbox_to_anchor=(0.5, -0.15),
                   fancybox=True,
                   shadow=True,
                   ncol=5)
        plt.savefig(
            "Graph_Clustered_UOS_Scenario {}s.pdf".format(simulation_time),
            format='pdf',
            dpi=1000)
        plt.show()

    return clusters, x_clusters, y_clusters
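# --- Sketch (not part of the original listing) ---
# DBSCAN_Clusterization() depends on several module-level names from the
# surrounding script (plot, x2, y2, simulation_time, get_cmap). A minimal way
# to exercise just the clustering path is to disable plotting, provide a
# stand-in for get_cmap (which is called even when plotting is off), and feed
# a small synthetic 2-D point set; everything below is made up for
# illustration only.
import numpy as np
import matplotlib.pyplot as plt

plot = False                                    # skip the matplotlib branches

def get_cmap(n, name='hsv'):
    # hypothetical stand-in for the script's own helper
    return plt.get_cmap(name, n)

rng = np.random.RandomState(1)
users = np.vstack([rng.normal(loc, 5.0, size=(30, 2))
                   for loc in ((0, 0), (100, 100))])  # two fake hotspots

clusters, xs, ys = DBSCAN_Clusterization(users, EPS=15, MIN_SAMPLES=5)
print("clusters found:", len(clusters))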
Example #55
0
    y = np.reshape(chunk["victim_position_y"].values, (-1, 1)) if y is None else \
        np.vstack((y, np.reshape(chunk["victim_position_y"].values, (-1, 1))))
    chunk = data.get_chunk(CHUNKSIZE)

total_data_count = X.shape[0]
print(total_data_count)

training_data = np.array(list(zip(X, y))).reshape(-1, 2)
eps_range = list(range(3500, 4000, 200))
sample_range = list(range(100, 300, 25))

# for e in eps_range:
#     for s in sample_range:
model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES)
model.fit(training_data)

# visualization
group_size = max(model.labels_)
# if group_size <= 10:
#     print("Break One s:{} e:{} max:{}".format(e, s, max(model.labels_)))
#     break
#
# if group_size > 100:
#     print("Continue One s:{} e:{} max:{}".format(e, s, max(model.labels_)))
#     continue

for i in range(group_size + 1):   # cluster labels run from 0 to group_size
    color = "#" + "".join([hex_range[randint(0, 15)] for _ in range(6)])
    curr_x = [x for idx, x in enumerate(X) if model.labels_[idx] == i]
Example #56
0
"""
8.6.3 基于密度的空间聚类
"""

from sklearn import datasets as dss
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['FangSong']
plt.rcParams['axes.unicode_minus'] = False

X, y = dss.make_moons(n_samples=1000, noise=0.05)
dbs_1 = DBSCAN()  # defaults: eps (core-sample radius) 0.5, min_samples 5
dbs_2 = DBSCAN(eps=0.2)  # eps 0.2, min_samples 5
dbs_3 = DBSCAN(eps=0.1)  # eps 0.1, min_samples 5

dbs_1.fit(X)
dbs_2.fit(X)
dbs_3.fit(X)

plt.subplot(131)
plt.title('eps=0.5')
plt.scatter(X[:, 0], X[:, 1], c=dbs_1.labels_)
plt.subplot(132)
plt.title('eps=0.2')
plt.scatter(X[:, 0], X[:, 1], c=dbs_2.labels_)
plt.subplot(133)
plt.title('eps=0.1')
plt.scatter(X[:, 0], X[:, 1], c=dbs_3.labels_)
plt.show()
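# --- Sketch (not part of the original listing) ---
# The three eps values above are compared by eye. A more systematic way to
# pick eps is the k-distance plot: sort every point's distance to its k-th
# nearest neighbor (k = min_samples) and look for the "elbow". This reuses
# the moons data X and plt from above.
import numpy as np
from sklearn.neighbors import NearestNeighbors

k = 5                                       # DBSCAN's default min_samples
nn = NearestNeighbors(n_neighbors=k).fit(X)
distances, _ = nn.kneighbors(X)             # shape (n_samples, k)
k_dist = np.sort(distances[:, -1])          # distance to the k-th neighbor

plt.plot(k_dist)
plt.xlabel('points sorted by k-distance')
plt.ylabel('distance to {}-th nearest neighbor'.format(k))
plt.show()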
Example #57
0
from sklearn.cluster import KMeans, DBSCAN
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

X,y = datasets.make_moons(n_samples=1500, noise=.05)

x1 = X[:,0]
x2 = X[:,1]

print("This is the dataset we want to classify with DBSCAN!")
plt.scatter(x1,x2,s=5)
plt.show()

#results with DBSCAN algorithm
dbscan = DBSCAN(eps=0.1)
dbscan.fit(X)
y_pred = dbscan.labels_.astype(int)

colors = np.array(['#ff0000', '#00ff00'])

print("These are the clusters with DBSCAN!")
plt.scatter(x1,x2,s=5,color=colors[y_pred])
plt.show()

#results with K-Means Clustering
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
y_pred = kmeans.labels_.astype(int)

colors = np.array(['#ff0000', '#00ff00'])
Example #58
0
    x = pd.DataFrame(preprocessing.scale(x_original))
elif preprocess == 'norm':
    x = pd.DataFrame(preprocessing.MinMaxScaler().fit_transform(x_original))
else:
    x = x_original
print('done')

# Assign column names to the feature matrix
x.columns = features

print('features:', features)

print('=-----------------------------------------=')
print('=               clustering                =')
print('=-----------------------------------------=')
for eps in range(1, 21):
    for min_samples in range(0, 201, 10):
        if min_samples == 0:
            min_samples = min_samples + 1
        print("eps:", eps)
        print("min_samples:", min_samples)
        est = DBSCAN(eps=eps * 0.1, min_samples=min_samples)
        est.fit(x)
        col_name = 'labels_' + str(eps) + '_' + str(min_samples)
        df[col_name] = est.labels_
        num_clusters = max(est.labels_) + 1
        print("number of clusters:", num_clusters)
        print('=-----------------------------------------=')

df.to_csv('datasets/clustered_data.csv')
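# --- Sketch (not part of the original listing) ---
# The loop above only prints the number of clusters per (eps, min_samples)
# pair. One way to rank the settings afterwards is the silhouette score,
# skipping runs that produced fewer than two clusters; this reuses the x and
# df built above.
from sklearn import metrics

best = None
for col in [c for c in df.columns if c.startswith('labels_')]:
    labels = df[col].values
    if len(set(labels)) - (1 if -1 in labels else 0) < 2:
        continue                  # silhouette needs at least two clusters
    score = metrics.silhouette_score(x, labels)
    if best is None or score > best[1]:
        best = (col, score)
print('best setting:', best)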
Example #59
0
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 19 19:47:14 2020

@author: João Victor
"""
from sklearn import metrics
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.cluster import DBSCAN

wine = load_wine()
modelo = wine.target
dbscan = DBSCAN(eps=100, min_samples=50)
print(dbscan)
dbscan.fit(wine.data)
resultado = dbscan.labels_
print(modelo)
print(resultado)
print(metrics.adjusted_mutual_info_score(modelo, resultado))
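# --- Sketch (not part of the original listing) ---
# The wine features have very different ranges (proline is in the hundreds,
# hue around 1), so the Euclidean distances above are dominated by a few
# columns and a very large eps is needed. A common variation is to
# standardize the features first and use a much smaller eps; the values
# below are illustrative, not tuned.
from sklearn.preprocessing import StandardScaler

scaled = StandardScaler().fit_transform(wine.data)
dbscan_std = DBSCAN(eps=2.3, min_samples=5)
dbscan_std.fit(scaled)
print(metrics.adjusted_mutual_info_score(modelo, dbscan_std.labels_))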
Example #60
0
    def _group(self, cutoff_index=64, algorithm="DBSCAN"):
        """
        Groups elementary components using clustering algorithms.
        
        Parameters
        ----------
        cutoff_index : int
            Only the first ``cutoff_index`` elementary components are
            considered for grouping.
        algorithm : str
            String specifying the clustering algorithm to be applied. Possible
            values are ``DBSCAN`` or ``AffinityPropagation``.
        """
        print "Computing clustering with cutoff at i = {}...".format(cutoff_index)
        assert hasattr(self, "_wcorr")
        assert isinstance(algorithm, str) 
        assert algorithm in ("DBSCAN", "AffinityPropagation")
        
        # compute distance from correlation
        X = np.abs(self._wcorr[:cutoff_index,:cutoff_index] - 1.0)
        
        if algorithm == "DBSCAN":
            from sklearn.cluster import DBSCAN
            db = DBSCAN(min_samples=2, metric="precomputed")
            db.fit(X)
            labels = db.labels_
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = list(labels).count(-1)                    
        
        elif algorithm == "AffinityPropagation":
            from sklearn.cluster import AffinityPropagation
            af = AffinityPropagation(affinity="precomputed")
            af.fit(X)
            labels = af.labels_
            cluster_centers_indices = af.cluster_centers_indices_
            n_clusters = len(cluster_centers_indices)

        # print user info
        print "   Estimated number of clusters: {}".format(n_clusters)
        if algorithm == "DBSCAN":
            print "   Estimated number of noise points: {}".format(n_noise)
        from sklearn import metrics
        s_score = metrics.silhouette_score(X, labels, metric="precomputed")
        print "   Silhouette Coefficient: {:0.3f}".format(s_score)
        
        # extract cluster indices and power
        clusters = dict()
        _, _, total_power = self._compute_powers()
        for i in range(n_clusters):
            ind = np.where(labels == i)[0]
            relative_power = (self._s[ind]**2).sum() / total_power
            if relative_power > self.__power_threshold:
                print "   Relative power of cluster {0}: {1:0.3f}".format(i, relative_power)
            clusters[i] = [tuple(ind), relative_power]
        
        # extract clusters from noise
        noise_ind = np.where(labels == -1)[0]
        if algorithm == "DBSCAN":
            assert noise_ind.size == n_noise
        j = n_clusters
        for i, ind in enumerate(noise_ind):
            relative_power = self._s[ind]**2 / total_power
            if relative_power > self.__power_threshold:
                print "   Relative power of noise {0}: {1:0.3f}".format(i, relative_power)
                clusters[j] = [ind, relative_power]
                j += 1

        # set final number of clusters
        self._n_groups = len(clusters)
        
        # sort according to power
        powers = np.array(zip(*clusters.values())[1])
        sort_ind = np.argsort(powers)[::-1]
        self._group_power = powers[sort_ind]
        
        # compute groups
        if self._ndim == 1:
            self._ts_groups = np.zeros((self._n, self._n_groups))
        else:
            self._ts_groups = np.zeros(self._n + (self._n_groups, ))
        for i, ind in enumerate(sort_ind):
            indices = clusters[ind][0]
            if isinstance(indices, (tuple, list, np.ndarray)):
                if len(indices) > 1:
                    self._ts_groups[...,i] = self._ts_components[...,indices].sum(axis=-1)
                else:
                    self._ts_groups[...,i] = self._ts_components[...,indices]
            else:
                self._ts_groups[...,i] = self._ts_components[...,indices]
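# --- Sketch (not part of the original listing) ---
# The core of _group() above is turning a w-correlation matrix into a
# distance matrix (|corr - 1|) and handing it to DBSCAN with
# metric="precomputed". A standalone, Python 3 version of just that step on a
# made-up correlation matrix:
import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.RandomState(0)
wcorr = np.eye(6)
wcorr[:3, :3] = 0.9 + 0.05 * rng.rand(3, 3)    # first correlated group
wcorr[3:, 3:] = 0.8 + 0.05 * rng.rand(3, 3)    # second correlated group
wcorr = (wcorr + wcorr.T) / 2                  # keep the matrix symmetric
np.fill_diagonal(wcorr, 1.0)

X = np.abs(wcorr - 1.0)                        # correlation -> distance
db = DBSCAN(eps=0.3, min_samples=2, metric="precomputed").fit(X)
print(db.labels_)                              # same group -> same label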