def test_monotonic_likelihood():
    # We check that each step of variational inference without regularization
    # monotonically improves the lower bound on the training set.
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        bgmm = BayesianGaussianMixture(
            n_components=2 * n_components, covariance_type=covar_type,
            warm_start=True, max_iter=1, random_state=rng, tol=1e-4,
        )
        current_lower_bound = -np.inf
        # Do one training iteration at a time so we can make sure that the
        # training log likelihood increases after each iteration.
        for _ in range(500):
            prev_lower_bound = current_lower_bound
            current_lower_bound = bgmm.fit(X).lower_bound_
            assert_greater_equal(current_lower_bound, prev_lower_bound)

            if bgmm.converged_:
                break
        assert bgmm.converged_
def BayesianGaussianMixture(V, **kwargs):
    """Performs clustering on *V* by using Gaussian mixture models with
    variational inference. The function uses
    :class:`sklearn.mixture.BayesianGaussianMixture`. See the sklearn
    documentation for details.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg n_clusters: specifies the number of clusters.
    :type n_clusters: int
    """
    try:
        from sklearn.mixture import BayesianGaussianMixture
    except ImportError:
        raise ImportError('Use of this function (BayesianGaussianMixture) requires the '
                          'installation of sklearn.')

    n_components = kwargs.pop('n_components', None)
    if n_components is None:
        n_components = kwargs.pop('n_clusters', None)
        if n_components is None:
            n_components = 1

    n_init = kwargs.pop('n_init', 1)

    mixture = BayesianGaussianMixture(n_components=n_components, n_init=n_init,
                                      **kwargs)
    return mixture.fit_predict(V)
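A brief usage sketch for the wrapper above; the synthetic V below is a stand-in (an assumption) for the row-normalized eigenvectors the docstring refers to.

# Illustrative call only; in practice V comes from a spectral decomposition.
import numpy as np

V = np.random.RandomState(0).rand(100, 3)   # stand-in for eigenvectors
labels = BayesianGaussianMixture(V, n_clusters=4, random_state=0)
print(labels[:10])                           # cluster label per row of V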
def test_bayesian_mixture_fit_predict_n_init(): # Check that fit_predict is equivalent to fit.predict, when n_init > 1 X = np.random.RandomState(0).randn(1000, 5) gm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0) y_pred1 = gm.fit_predict(X) y_pred2 = gm.predict(X) assert_array_equal(y_pred1, y_pred2)
def test_check_covariance_precision(): # We check that the dot product of the covariance and the precision # matrices is identity. rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) n_components, n_features = 2 * rand_data.n_components, 2 # Computation of the full_covariance bgmm = BayesianGaussianMixture(n_components=n_components, max_iter=100, random_state=rng, tol=1e-3, reg_covar=0) for covar_type in COVARIANCE_TYPE: bgmm.covariance_type = covar_type bgmm.fit(rand_data.X[covar_type]) if covar_type == 'full': for covar, precision in zip(bgmm.covariances_, bgmm.precisions_): assert_almost_equal(np.dot(covar, precision), np.eye(n_features)) elif covar_type == 'tied': assert_almost_equal(np.dot(bgmm.covariances_, bgmm.precisions_), np.eye(n_features)) elif covar_type == 'diag': assert_almost_equal(bgmm.covariances_ * bgmm.precisions_, np.ones((n_components, n_features))) else: assert_almost_equal(bgmm.covariances_ * bgmm.precisions_, np.ones(n_components))
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check that a NotFittedError is raised if predict is called
            # before fit
            assert_raise_message(NotFittedError,
                                 "This BayesianGaussianMixture instance"
                                 " is not fitted yet. Call 'fit' with "
                                 "appropriate arguments before using "
                                 "this method.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert_greater_equal(adjusted_rand_score(Y, Y_pred), .95)
def fit(self, X, Y):
    # assume classes are numbered 0...K-1
    self.K = len(set(Y))
    self.gaussians = []
    self.p_y = np.zeros(self.K)
    for k in range(self.K):
        print("Fitting gmm", k)
        Xk = X[Y == k]
        self.p_y[k] = len(Xk)
        gmm = BayesianGaussianMixture(n_components=10)
        gmm.fit(Xk)
        self.gaussians.append(gmm)
    # normalize p(y)
    self.p_y /= self.p_y.sum()
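The fit above stores one BayesianGaussianMixture per class together with the class priors p(y); a minimal sketch of the matching predict step, assuming the same self.gaussians and self.p_y attributes, scores each sample under every class mixture and adds the log prior.

import numpy as np

def predict(self, X):
    # Hypothetical companion to fit(): generative classification with
    # per-class Bayesian GMMs. Assumes self.gaussians and self.p_y were
    # filled in by fit().
    N, K = len(X), self.K
    log_probs = np.zeros((N, K))
    for k, gmm in enumerate(self.gaussians):
        # score_samples returns log p(x | y=k) under the k-th mixture
        log_probs[:, k] = gmm.score_samples(X) + np.log(self.p_y[k])
    # pick the class with the highest joint log-probability
    return np.argmax(log_probs, axis=1)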
def test_bayesian_mixture_fit_predict(seed, max_iter, tol): rng = np.random.RandomState(seed) rand_data = RandomData(rng, scale=7) n_components = 2 * rand_data.n_components for covar_type in COVARIANCE_TYPE: bgmm1 = BayesianGaussianMixture(n_components=n_components, max_iter=max_iter, random_state=rng, tol=tol, reg_covar=0) bgmm1.covariance_type = covar_type bgmm2 = copy.deepcopy(bgmm1) X = rand_data.X[covar_type] Y_pred1 = bgmm1.fit(X).predict(X) Y_pred2 = bgmm2.fit_predict(X) assert_array_equal(Y_pred1, Y_pred2)
def kde_entropy_sklearn_gmm(points, n_est=None, n_components=None):
    """
    Use sklearn.neighbors.KernelDensity pdf to estimate entropy.

    Data is standardized before the KDE. Evaluation points are drawn from a
    Gaussian mixture model fitted to the original points.

    Fails for bimodal and dirichlet, similar to statsmodels kde.
    """
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5*sqrt(d))

    predictor = GMM(n_components=n_components, covariance_type='full',
                    #verbose=True,
                    max_iter=1000)
    predictor.fit(x)
    evaluation_points, _ = predictor.sample(n_est)

    logp = sklearn_log_density(x, evaluation_points=evaluation_points)
    H = -np.mean(logp)
    return H / LN2
def gmm_entropy(points, n_est=None, n_components=None): #from sklearn.mixture import GaussianMixture as GMM from sklearn.mixture import BayesianGaussianMixture as GMM n, d = points.shape # Default to the full set if n_est is None: n_est = n # reduce size of draw to n_est if n_est >= n: x = points else: x = points[permutation(n)[:n_est]] n = n_est if n_components is None: n_components = int(5*sqrt(d)) ## Standardization doesn't seem to help ## Note: sigma may be zero #x, mu, sigma = standardize(x) # if standardized predictor = GMM(n_components=n_components, covariance_type='full', #verbose=True, max_iter=1000) predictor.fit(x) eval_x, _ = predictor.sample(n_est) weight_x = predictor.score_samples(eval_x) H = -np.mean(weight_x) #with np.errstate(divide='ignore'): H = H + np.sum(np.log(sigma)) # if standardized dH = 0. ## cross-check against own calcs #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_) #print("alt", H, alt.entropy()) #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T) return H / LN2, dH / LN2
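Both entropy estimators above return -E[log p(x)] converted to bits via LN2; a small sanity check, assuming gmm_entropy is importable from this module, compares the estimate against the analytic entropy of a standard normal.

import numpy as np

# Analytic entropy of a d-dimensional standard normal, in bits, for comparison
# with the Monte-Carlo/GMM estimate returned by gmm_entropy above.
def normal_entropy_bits(d):
    return 0.5 * d * np.log(2 * np.pi * np.e) / np.log(2)

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    points = rng.randn(4000, 2)          # samples from N(0, I_2)
    H_est, dH = gmm_entropy(points)      # estimate in bits
    print("estimated:", H_est, "analytic:", normal_entropy_bits(2))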
def test_bayesian_mixture_precisions_prior_initialisation(): rng = np.random.RandomState(0) n_samples, n_features = 10, 2 X = rng.rand(n_samples, n_features) # Check raise message for a bad value of degrees_of_freedom_prior bad_degrees_of_freedom_prior_ = n_features - 1. bgmm = BayesianGaussianMixture( degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng) assert_raise_message( ValueError, "The parameter 'degrees_of_freedom_prior' should be " "greater than %d, but got %.3f." % (n_features - 1, bad_degrees_of_freedom_prior_), bgmm.fit, X) # Check correct init for a given value of degrees_of_freedom_prior degrees_of_freedom_prior = rng.rand() + n_features - 1. bgmm = BayesianGaussianMixture( degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng).fit(X) assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_) # Check correct init for the default value of degrees_of_freedom_prior degrees_of_freedom_prior_default = n_features bgmm = BayesianGaussianMixture( degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng).fit(X) assert_almost_equal(degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_) # Check correct init for a given value of covariance_prior covariance_prior = { 'full': np.cov(X.T, bias=1) + 10, 'tied': np.cov(X.T, bias=1) + 5, 'diag': np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3, 'spherical': rng.rand() } bgmm = BayesianGaussianMixture(random_state=rng) for cov_type in ['full', 'tied', 'diag', 'spherical']: bgmm.covariance_type = cov_type bgmm.covariance_prior = covariance_prior[cov_type] bgmm.fit(X) assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_) # Check raise message for a bad spherical value of covariance_prior bad_covariance_prior_ = -1. bgmm = BayesianGaussianMixture(covariance_type='spherical', covariance_prior=bad_covariance_prior_, random_state=rng) assert_raise_message( ValueError, "The parameter 'spherical covariance_prior' " "should be greater than 0., but got %.3f." % bad_covariance_prior_, bgmm.fit, X) # Check correct init for the default value of covariance_prior covariance_prior_default = { 'full': np.atleast_2d(np.cov(X.T)), 'tied': np.atleast_2d(np.cov(X.T)), 'diag': np.var(X, axis=0, ddof=1), 'spherical': np.var(X, axis=0, ddof=1).mean() } bgmm = BayesianGaussianMixture(random_state=0) for cov_type in ['full', 'tied', 'diag', 'spherical']: bgmm.covariance_type = cov_type bgmm.fit(X) assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_)
def find_nuclear_contours(cell_rois, cell_stack, nuclear_stack, um_to_px): nuclear_rois = {} unknown_rois = {} nuclear_centroids = {} status_tags = {} for ID, cell_roi in cell_rois.items(): print(f'ID: {ID}') # although, 8 seems pyknotic # 45, and 41, are instances of multis ids = [7] ids = [id - 1 for id in ids] #if ID not in ids: continue cell_contours = cell_roi['contours'] nuclear_contours = [] unknown_contours = [] nuclear_centroids_list = [] status_tags_list = [] for tp, cell_contour in enumerate(cell_contours): #if tp > 0: break # Acquire cellular slice and offset. cell_slice, cell_offset = get_slice_and_offset(cell_contour) # Use slice to acquire cell subimage. cell_subimg = cell_stack[tp][cell_slice] tmp = cell_contour - np.array([cell_offset[::-1]]) cx, cy = extract_improved_cell_centroid(cell_subimg, tmp) cell_offset = cell_slice[0].start, cell_slice[1].start nuclear_centroids_list.append(np.array([[cx, cy]]) + np.array([cell_offset[::-1]])) nuc_subimg = nuclear_stack[tp][cell_slice] cell_mask = np.zeros_like(cell_subimg) tmp = cell_contour - np.array([cell_offset[::-1]]) cv2.drawContours(cell_mask, [tmp], -1, 255, -1) nuc_subimg[cell_mask == 0] = 0 # Remove everything outside of the cell. nuc_subimg[nuc_subimg < np.percentile(nuc_subimg, 10)] = 0 nuc_subimg = rmp.subtract(nuc_subimg) nuc_subimg[nuc_subimg < np.percentile(nuc_subimg, 30)] = 0 nuc_subimg = morphology.grey_opening(nuc_subimg, size=(3,3)) DELTA = um_to_px(10) mask = np.zeros_like(nuc_subimg) cv2.circle(mask, (cx, cy), DELTA, 255, -1) nuc_subimg[mask == 0] = 0 median_filtered = skimage.filters.median(nuc_subimg) # Median filtering leads to pixels with 0 value having much larger nonzero values. Ensure the pixels # with low values continue to be low. median_filtered[nuc_subimg <= np.percentile(nuc_subimg, 5)] = 0 nuc_subimg = median_filtered def plt_center(): plt.plot(cx, cy, 'r.') def fail_func(tag): nuclear_contours.append(np.empty(0)) unknown_contours.append(np.empty(0)) status_tags_list.append(tag) COMP = 5 # 1D GMM AREA # Fit only to the nonzero portion of the image to attain a sharper fit. gmm = BayesianGaussianMixture(n_components=COMP) nonzero_nuc_subimg = nuc_subimg[nuc_subimg != 0] if nonzero_nuc_subimg.size == 0: fail_func('insufficient') continue try: gmm.fit(nonzero_nuc_subimg.reshape(-1, 1)) except ValueError: fail_func('insufficient-2') continue #visualize('nuc_subimg', nuc_subimg) gpred = gmm.predict(nuc_subimg.reshape(-1, 1)).reshape(nuc_subimg.shape).astype(np.uint8) label_of_min = np.argmin(gmm.means_) # If the minimum's label is not zero, then switch it to zero. if label_of_min != 0: # Temporarily set zero-label to value guaranteed to exceed total Gaussian count. tmp = COMP + 1 gpred[gpred == 0] = tmp # Ensure the minimum label is zero. gpred[gpred == label_of_min] = 0 # Ensure whatever label was zero is now set to the minimum label. gpred[gpred == tmp] = label_of_min # Switch the means to reflect the change. tmp = np.copy(gmm.means_[0]) gmm.means_[0] = gmm.means_[label_of_min] gmm.means_[label_of_min] = tmp # Ensure whatever was background in the nuclear image remains as background. Without this line, it is possible # that GMM prediction, which was fed nonzero pixels only, would incorrectly classify background. This is necessary. gpred[nuc_subimg == 0] = 0 #visualize(f'{ID+1}-T{tp+1}gpred', gpred, plt_center) # Extract contours and choose the one closest to the SCE. 
P = np.ones_like(nuc_subimg, dtype=np.uint8) P[gpred == 0] = 0 _, contours, _ = cv2.findContours(P, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) compute_roi_centroid_dist = lambda contour: compute_dist(np.array([cx, cy]), contour) try: nuclear_contour = min(contours, key=compute_roi_centroid_dist) except ValueError: fail_func('unfound') continue #visualize('gpred', gpred) # Upon having found the closest contour, remove rest from image. Q = np.zeros_like(nuc_subimg, dtype=np.uint8) cv2.drawContours(Q, [nuclear_contour], -1, 255, -1) #visualize('Q', Q) gpred[Q == 0] = 0 # Check for pyknotic. if gpred[cy, cx] == 0: fail_func('pyknotic') continue # Crop the image. x, y, w, h = cv2.boundingRect(nuclear_contour) max_y, max_x = gpred.shape crop_y, crop_x = max(0, y-1), max(0, x-1) cropped_gpred = gpred[crop_y:min(max_y, y+h+1), crop_x:min(max_x, x+w+1)] # Update cellular offset and SCE. cell_offset += np.array([crop_y, crop_x]) cx -= crop_x cy -= crop_y #visualize('gpred', gpred) img, label_num = skimage.measure.label(cropped_gpred, return_num=True) if label_num == 1: pass #print(np.where(img == 1)) img = img.astype(np.uint8) # Add one to label number because 0 is not included in count. 0 is background. label_num += 1 def traverse_row_left_to_right(row, inward_links, incidence_counts): # Moving left-to-right, find each index where its right-neighbor differs from it. diffs = np.where(row[:-1] != row[1:])[0] # Make tuples of these labels and their right-neighbors; of the label that changes upon # right-traversal, and what it changes to. transitions = list(zip(row[diffs], row[diffs+1])) # The directed graph: labels_within = [0] for left, right in transitions: incidence_counts[left, right] += 1 if right not in labels_within: inward_links[left].add(right) labels_within.append(right) def traverse_rows(img, inward_links, incidence_counts): for row in img: traverse_row_left_to_right(row, inward_links, incidence_counts) # Now reverse row and do same. traverse_row_left_to_right(row[::-1], inward_links, incidence_counts) def build_graphs(img, label_num): inward_links = defaultdict(set) incidence_counts = np.zeros((label_num,) * 2) traverse_rows(img, inward_links, incidence_counts) # Transpose image and to do same for columns. traverse_rows(img.T, inward_links, incidence_counts) # Turn incidence counts into proportions. incidence_proportions = np.array([c / np.sum(c) for c in incidence_counts]) return inward_links, incidence_proportions #visualize('img', img) inward_links, incidence_proportions = build_graphs(img, label_num) ''' _labels = sorted(inward_links.keys()) for l in _labels: print(f'{l}: {inward_links[l]}') for ix, row in enumerate(incidence_proportions): _row = [str(round(x, 2)) for x in row] print(f'{ix}: {_row}') ''' # Determine which label is to be taken as the background interface. bg_interface_label = np.argmax(incidence_proportions[0]) # Now will follow a series of rules that use the graph information to properly setup the subsequent portions. #visualize('img', img, plt_center) # Ensure that any ROI that is fully contained within another label is coalesced into that label. for label in range(label_num): # If the label has no inward links, then it is fully contained in another label. if label not in inward_links: # Find which label it is contained inside. for _label, links in inward_links.items(): if label in links: #print(f'{label} not found. It is inside {_label}.') img[img == label] = _label break # Ensure everything that is not background or background interface is same label. 
# Multiplying by 2 here because: a) need label to be different than the background interface label, # and it helps visualization for the values to be separated. signal_label = bg_interface_label * 2 img[np.logical_and(img != 0, img != bg_interface_label)] = signal_label # Find all signal ROIs. Keep one closest to SCE and eliminate rest. _img = np.copy(img) _img[img == bg_interface_label] = 0 _, contours, _ = cv2.findContours(_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) compute_roi_centroid_dist = lambda contour: compute_dist(np.array([cx, cy]), contour) #visualize('img', _img) try: contour = min(contours, key=compute_roi_centroid_dist) except: fail_func('strange-1') continue # Eliminate any other signal ROIs. mask = np.zeros_like(img) cv2.drawContours(mask, [contour], -1, 255, -1) img[np.logical_and(mask == 0, img != bg_interface_label)] = 0 # Check that SCE is on signal. If not, strange. sce_label = img[cy, cx] if sce_label in [0, bg_interface_label]: fail_func('strange-2') continue # Will now compute the centroid of the signal ROI to expand around for attaining the nuclear ROI estimate. # To better estimate the centroid of the nuclear region, will erode the signal ROI. This will remove portions # of the ROI which may come from either other nuclei or other non-nuclear regions that make it non-circular. # These portions would skew the centroid estimate. # Acquire the contour's bounding rectangle. x, y, w, h = cv2.boundingRect(contour) min_dim = min(w, h) # Will use a 3x3 structuring element to erode. To be conservative, only erode a quarter of the minimum dimension # of the bounding rectangle. Erosion will function on both sides of the ROI, so eroding half of the minimum dimension # on either side would make the ROI vanish. Eroding half of half will ensure the ROI persists. This is done somewhat # heuristically to improve results. iter = int(min_dim / 4) #visualize('mask', mask) mask = binary_erosion(mask, structure=np.array([[0,1,0],[1,1,1],[0,1,0]]), iterations=iter).astype(np.uint8) _, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # Try to find a contour after erosion. If this fails, continue to use contour extracted before erosion. try: contour = contours[0] except IndexError: contour = contour #visualize('postmask', mask) # Compute the centroid of the chosen signal ROI. my, mx = map(int, [np.mean(contour[:, :, 1]), np.mean(contour[:, :, 0])]) # Now begin drawing concentric circles centered at this centroid with increasing radius. # When the circle overlaps background, either the background interface or otherwise, determine that # it has encompassed the full extent of the nucleus that will be measured. center = mx, my mask = np.zeros_like(img) radius = 0 # 2 is currently used as the signal ROI label. non_signal_labels = np.empty(0) while non_signal_labels.size == 0: radius += 1 cv2.circle(mask, center, radius, 255, -1) non_signal_labels = img[np.logical_and(mask != 0, img != signal_label)] # Subtract 1 as it just failed the loop condition. radius -= 1 # After fitting a circle, try to expand into an ellipse. # First try one axis, then the other. radius1 = radius mask = np.zeros_like(img) non_signal_labels = np.empty(0) while non_signal_labels.size == 0: radius1 += 1 cv2.ellipse(mask, center, (radius1, radius), 0, 0, 360, 255, -1) non_signal_labels = img[np.logical_and(mask != 0, img != signal_label)] # Subtract 1 as it just failed the loop condition. 
radius1 -= 1 radius2 = radius mask = np.zeros_like(img) non_signal_labels = np.empty(0) while non_signal_labels.size == 0: radius2 += 1 cv2.ellipse(mask, center, (radius1, radius2), 0, 0, 360, 255, -1) non_signal_labels = img[np.logical_and(mask != 0, img != signal_label)] # Subtract 1 as it just failed the loop condition. radius2 -= 1 _, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) nuclear_contour = contours[0] if nuclear_contour.size == 0: fail_func('unfound') continue # If SCE is not within nuclear contour, do not choose contour. if cv2.pointPolygonTest(nuclear_contour, (cx, cy), False) != 1: fail_func('external-SCE') continue def plt_m(): plt.plot(mx, my, 'b.') # Now find all ROIs that are not background or interface. Choose one closest to SCE. #visualize('img', img, plt_m) # Now specify the indeterminate region. This will be entire contour, sans background. img[img != 0] = 255 _, contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) unknown_contour = contours[0] #itercount = max(1, int(DELTA / 10)) # Currently leaving out the containing own centroid portion. # If made it this far, believe that we've extracted the SCE containing contour. # NOT E: this could have failed. Need to have additional steps to account for failure. # Will need to check for own centroid. # If all contained are of either this label or whatever other label is found, then have found a region containing this one. #mask1 = np.zeros_like(gpred) #mask2 = np.zeros_like(gpred) # Draw contour recently found, dilate it, and re-extract. #cv2.drawContours(mask1, [contour], -1, 255, -1) #cv2.drawContours(mask2, [contour], -1, 255, -1) #mask2 = binary_dilation(mask2, structure=np.ones((3, 3))).astype(np.uint8) #xor_mask = np.logical_xor(mask1, mask2) # Visualize XOR #tmp = np.copy(gpred) #tmp[np.logical_not(xor_mask)] = 0 #visualize('gpred', gpred) #visualize('xor', tmp) #S = np.copy(tmp) #visualize('pre', pred2) #S = binary_opening(S) #S = binary_dilation(S, iterations=itercount).astype(np.uint8) #S[cell_mask == 0] = 0 #_, unknown_contour_estimates, _ = cv2.findContours(S, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) unknown_contours.append(unknown_contour + np.array([cell_offset[::-1]])) nuclear_contours.append(nuclear_contour + np.array([cell_offset[::-1]])) status_tags_list.append('') nuclear_rois[ID] = nuclear_contours unknown_rois[ID] = unknown_contours nuclear_centroids[ID] = nuclear_centroids_list status_tags[ID] = status_tags_list return nuclear_rois, unknown_rois, nuclear_centroids, status_tags
def _mean(data): # aggregated data gm = BayesianGaussianMixture(n_components=2) gm.fit(data) return gm.means_
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs  # samples_generator was removed from sklearn
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture
from copy import deepcopy

aic = []
bic = []
n_com = 12
data, label = make_blobs(n_samples=300, n_features=2, centers=10)
gmm_without = GaussianMixture(n_components=4).fit_predict(data)
gmm_vbem = BayesianGaussianMixture(n_components=10).fit_predict(data)
plt.scatter(data[:, 0], data[:, 1], c=label)
plt.show()

low = 99999
for n in range(1, n_com + 1):
    gmm = GaussianMixture(n_components=n)
    gmm.fit(data)
    aic.append(gmm.aic(data))
    if aic[-1] < low:
        low = aic[-1]
        model = deepcopy(gmm)

low2 = 99999
for n in range(1, n_com + 1):
    gmm = GaussianMixture(n_components=n)
    gmm.fit(data)
class Pyxelate: CONVOLUTIONS = np.array( [[[2, 2], [2, 2]], [[11, -1], [-1, -1]], [[-1, 11], [-1, -1]], [[-1, -1], [11, -1]], [[-1, -1], [-1, 11]], [[5, 5], [-1, -1]], [[-1, -1], [5, 5]], [[5, -1], [5, -1]], [[-1, 5], [-1, 5]], [[5, -1], [-1, 5]], [[-1, 5], [5, -1]], [[-1, 3], [3, 3]], [[3, -1], [3, 3]], [[3, 3], [-1, 3]], [[3, 3], [3, -1]]], dtype="int") SOLUTIONS = np.array([ [[1, 1], [1, 1]], [[0, 1], [1, 1]], [[1, 0], [1, 1]], [[1, 1], [0, 1]], [[1, 1], [1, 0]], [[1, 1], [0, 0]], [[0, 0], [1, 1]], [[1, 0], [1, 0]], [[0, 1], [0, 1]], [[1, 0], [1, 0]], [[0, 1], [0, 1]], [[1, 0], [0, 0]], [[0, 1], [0, 0]], [[0, 0], [1, 0]], [[0, 0], [0, 1]], ], dtype="bool") ITER = 2 def __init__(self, height, width, color=8, dither=True, alpha=.6, regenerate_palette=True, keyframe=.6, sensitivity=.07, random_state=0): """Create instance for generating similar pixel arts.""" self.height = int(height) self.width = int(width) if self.width < 1 or self.height < 1: raise ValueError("Result can not be smaller than 1x1 pixels.") self.color = int(color) if self.color < 2: raise ValueError("The minimum number of colors is 2.") elif self.color > 32: raise ValueError("The maximum number of colors is 32.") if dither: self.dither = 1 / (self.color + 1) else: self.dither = 0. self.alpha = float(alpha) # threshold for opacity self.regenerate_palette = bool(regenerate_palette) self.keyframe = keyframe # threshold for differences between keyframes self.sensitivity = sensitivity # threshold for differences between parts of keyframes # BGM self.is_fitted = False self.random_state = int(random_state) self.model = BayesianGaussianMixture( n_components=self.color, max_iter=256, covariance_type="tied", weight_concentration_prior_type="dirichlet_distribution", mean_precision_prior=1. / 256., warm_start=False, random_state=self.random_state) def convert(self, image): """Generate pixel art from image""" return self._convert(image, False, False) def _convert(self, image, override_adapthist=False, override_dither=False): """Generate pixel art from image or sequence of images""" # does the image have alpha channel? 
if self._is_transparent(image): # remove artifacts from transparent edges image = self._dilate(image) # create alpha mask mask = resize(image[:, :, 3], (self.height, self.width), anti_aliasing=True) # mask for colors color_mask = resize(image[:, :, 3], (32, 32), anti_aliasing=False).ravel() else: mask = None color_mask = None # apply adaptive contrast if not override_adapthist: image = self._fix_hist(image) # create sample for finding palette if self.regenerate_palette or not self.is_fitted: examples = resize(image[:, :, :3], (32, 32), anti_aliasing=False).reshape(-1, 3).astype("int") if color_mask is not None: # transparent colors should be ignored examples = examples[color_mask >= self.alpha] self._fit_model(examples) # resize image to 4 times the desired width and height image = resize( image[:, :, :3], (self.height * self.ITER * 2, self.width * self.ITER * 2), anti_aliasing=True) # generate pixelated image with desired width / height t = time() image = self._reduce(image) print('SS:', time() - t) # apply palette height, width, depth = image.shape reshaped = np.reshape(image, (height * width, depth)) probs = self.model.predict_proba(reshaped) y = np.argmax(probs, axis=1) # increase hue and snap color values to multiples of 8 palette = rgb2hsv(self.model.means_.reshape(-1, 1, 3)) palette[:, :, 1] *= 1.14 # empirical magic number palette = hsv2rgb(palette).reshape(self.color, 3) // 8 * 8 palette[palette == 248] = 255 # clamping // 8 * 8 would rarely allow 255 values # generate recolored image image = palette[y] # apply dither over threshold if it's not zero if not override_dither and self.dither: # get second best probability by removing the best one probs[np.arange(len(y)), y] = 0 # get new best and values v = np.max(probs, axis=1) > self.dither y = np.argmax(probs, axis=1) # replace every second pixel with second best color pad = not bool(width % 2) if pad: # make sure to alternate between starting positions # bottleneck for i in range(0, len(image), 2): i += (i // width) % 2 if v[i]: image[i] = palette[y[i]] else: i = np.argwhere(v[::2]) * 2 image[i] = palette[y[i]] image = np.reshape(image, (height, width, depth)) if mask is not None: # use transparency from original image, but make it either 0 or 255 mask[mask >= self.alpha] = 255 mask[mask < self.alpha] = 0 image = np.dstack( (image, mask)) # result has lost its alpha channel return np.clip(image.astype("int"), 0, 255).astype("uint8") def convert_sequence(self, images): """Generates sequence of pixel arts from a list of images""" try: _ = np.array(images, dtype=float) except ValueError: # image sizes are different == setting an array element with a sequence raise ValueError("Shape of images in list are different.") # apply adaptive histogram on each images = [self._fix_hist(image) for image in images] transparent = self._is_transparent(images[0]) keyframe_limit = self.keyframe * np.prod(images[0].shape) * 255. sensitivity_limit = self.sensitivity * 255. diff_images, key_frames = [], [] # create new images that are just the differences between sequences for image in images: # add first image if diff_images: diff = np.abs(image[:, :, :3] - diff_images[-1][:, :, :3]) # image is not too different, from previous one, create mask if np.sum(diff) < keyframe_limit: diff = resize(np.mean(diff, axis=2), (self.height, self.width), anti_aliasing=True) over, under = diff > sensitivity_limit, diff <= sensitivity_limit diff[over], diff[under] = 255, 0. 
diff = resize(diff, (image.shape[0], image.shape[1]), anti_aliasing=False) # was the image already transparent? if transparent: image[:, :, 3] = diff else: image = np.dstack((image, diff)) key_frames.append(False) else: key_frames.append(True) else: key_frames.append(True) # add transparency layer for keyframes also, for easier broadcasting if not self._is_transparent(image): image = np.dstack( (image, np.ones((image.shape[0], image.shape[1])))) diff_images.append(image) # create a palette from all images if possible if self.regenerate_palette: warnings.warn( "using regenerate_palette=True will result in flickering, as the palette will be regenerated for each image!", Warning) else: self._palette_from_list(diff_images) # merge keyframes and differences last = None for image, key in zip(diff_images, key_frames): current = self._convert(image, True, ~key) # pyxelate keyframe / change if last is None: last = current else: # merge differences to previous images mask = ~np.logical_xor(last[:, :, 3], current[:, :, 3]) last[mask] = current[mask] # generator yield last.copy() def _palette_from_list(self, images): """Fit model to find palette using all images in list at once""" transparency = self._is_transparent(images[0]) examples = [] color_masks = [] # sample from all images for image in images: examples.append( resize(image[:, :, :3], (16, 16), anti_aliasing=False).reshape(-1, 3).astype("int")) if transparency: color_masks.append( resize(images[0][:, :, 3], (16, 16), anti_aliasing=False)) # concatenate to a single matrix examples = np.concatenate(examples) if transparency: # transparent colors should be ignored color_masks = np.concatenate(color_masks).ravel() examples = examples[color_masks >= self.alpha] self._fit_model(examples) def _fit_model(self, X): """Fit model while suppressing warnings from sklearn""" converge = True with warnings.catch_warnings(record=True) as w: # fit model self.model.fit(X) if w and w[-1].category == ConvergenceWarning: warnings.filterwarnings('ignore', category=ConvergenceWarning) converge = False if not converge: warnings.warn( "the model has failed to converge, try a different number of colors for better results!", Warning) self.is_fitted = True def _reduce(self, image): """Apply convolutions on image ITER times and generate a smaller image based on the highest magnitude of gradients""" # self is visible to decorated function @adapt_rgb(each_channel) def _wrapper(dim): # apply median filter for noise reduction dim = median(dim, square(4)) for n in range(self.ITER): h, w = dim.shape h, w = h // 2, w // 2 tt = time() flatten = view_as_blocks(dim, (2, 2)).reshape(-1, 2, 2) print('flatten:', time() - tt) # bottleneck tt = time() # slow one upper new_image = np.fromiter( (self._reduce_conv(f) for f in flatten), flatten.dtype).reshape((h, w)) print('bottleneck:', time() - tt) if n < self.ITER - 1: dim = new_image.copy() return new_image return _wrapper(image) def _reduce_conv(self, f): # slow one lower """The actual function that selects the right pixels based on the gradients 2x2 square""" return np.mean(f[self.SOLUTIONS[np.argmax( np.sum(np.multiply(self.CONVOLUTIONS, f.reshape(-1, 2, 2)).reshape(-1, 4), axis=1))]]) def _dilate(self, image): """Dilate semi-transparent edges to remove artifacts (unwanted edges, caused by transparent pixels having different colors)""" @adapt_rgb(each_channel) def _wrapper(dim): return dilation(dim, selem=square(4)) # use dilated pixels for semi-transparent ones mask = image[:, :, 3] alter = _wrapper(image[:, :, :3]) image[:, :, 
:3][mask < self.alpha] = alter[mask < self.alpha] return image @staticmethod def _fix_hist(image): """Apply adaptive histogram""" image = equalize_adapthist( image) * 255 * 1.14 # empirical magic number image[image <= 8.] = 0. return image @staticmethod def _is_transparent(image): """Returns True if there is an additional dimension for transparency""" return bool(image.shape[2] == 4)
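A minimal usage sketch for the Pyxelate class above; the image path and the skimage I/O calls are illustrative placeholders, and only the constructor arguments and convert() come from the class itself.

# Hypothetical driver script for Pyxelate (file names are placeholders).
from skimage import io

img = io.imread("input.png")                       # assumed input file
pyx = Pyxelate(height=64, width=64, color=8, dither=True)
pixel_art = pyx.convert(img)                       # returns a uint8 image array
io.imsave("output.png", pixel_art)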
def all_classifier_models(): models = [] metrix = [] c_report = [] train_accuracy = [] test_accuracy = [] models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr'))) models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis())) models.append(('KNeighborsClassifier', KNeighborsClassifier())) models.append(('DecisionTreeClassifier', DecisionTreeClassifier())) models.append(('GaussianNB', GaussianNB())) models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100))) models.append(('SVM', SVC(gamma='auto'))) models.append(('Linear_SVM', LinearSVC())) models.append(('XGB', XGBClassifier())) models.append(('SGD', SGDClassifier())) models.append(('Perceptron', Perceptron())) models.append(('ExtraTreeClassifier', ExtraTreeClassifier())) models.append(('OneClassSVM', OneClassSVM(gamma = 'auto'))) models.append(('NuSVC', NuSVC())) models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1))) models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0))) models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0))) models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1)))) models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1)))) models.append(('LogisticRegressionCV', LogisticRegressionCV())) models.append(('RidgeClassifierCV', RidgeClassifierCV())) models.append(('RidgeClassifier', RidgeClassifier())) models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier())) models.append(('GaussianProcessClassifier', GaussianProcessClassifier())) models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier())) estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))] models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()))) clf1 = LogisticRegression(multi_class='multinomial', random_state=1) clf2 = RandomForestClassifier(n_estimators=50, random_state=1) clf3 = GaussianNB() models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard'))) models.append(('AdaBoostClassifier', AdaBoostClassifier())) models.append(('GradientBoostingClassifier', GradientBoostingClassifier())) models.append(('BaggingClassifier', BaggingClassifier())) models.append(('ExtraTreesClassifier', ExtraTreesClassifier())) models.append(('CategoricalNB', CategoricalNB())) models.append(('ComplementNB', ComplementNB())) models.append(('BernoulliNB', BernoulliNB())) models.append(('MultinomialNB', MultinomialNB())) models.append(('CalibratedClassifierCV', CalibratedClassifierCV())) models.append(('LabelPropagation', LabelPropagation())) models.append(('LabelSpreading', LabelSpreading())) models.append(('NearestCentroid', NearestCentroid())) models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis())) models.append(('GaussianMixture', GaussianMixture())) models.append(('BayesianGaussianMixture', BayesianGaussianMixture())) test_accuracy= [] names = [] for name, model in models: try: m = model m.fit(X_train, y_train) y_pred = m.predict(X_test) train_acc = round(m.score(X_train, y_train) * 100, 2) test_acc = metrics.accuracy_score(y_test,y_pred) *100 c_report.append(classification_report(y_test, 
y_pred)) test_accuracy.append(test_acc) names.append(name) metrix.append([name, train_acc, test_acc]) except: print("Exception Occurred :",name) return metrix,test_accuracy,names
## Test BGM for one appliance appl = 'WHE' # Create vector with P and Q values and plot them P = d[appl].P[init:end].values Q = d[appl].Q[init:end].values X = np.transpose([P, Q]) plt.plot(d[appl].P[init:end], d[appl].Q[init:end],'o', alpha=0.1) # Normalize X sscl = StandardScaler().fit(X) X = sscl.transform(X) # Apply clusterer bgm = BayesianGaussianMixture(n_components=33, covariance_type='full', weight_concentration_prior_type='dirichlet_distribution', random_state=42).fit(X) y_pred = bgm.predict(X) # Plot clusters with X unnormalized X = sscl.inverse_transform(X) plt.figure() plt.scatter(X[:,0],X[:,1], color=colors[y_pred]) means = sscl.inverse_transform(bgm.means_) medians = get_medians(X, y_pred) # plt.plot(means[:,0],means[:,1],'kx') plt.plot(medians[:,0],medians[:,1],'kx') # TODO: Compare mean with ground truth plt.figure() plt.plot(P)
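get_medians is called above but not shown; a plausible implementation (an assumption about its behaviour, not the original helper) returns the per-cluster median of the points, one row per predicted label.

import numpy as np

def get_medians(X, labels):
    # median of the points assigned to each cluster label, in label order
    return np.array([np.median(X[labels == k], axis=0)
                     for k in np.unique(labels)])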
def em_stereo(self,n_component=1,dp=True,thresh_hold=0.4): self.num_params = 0 #The range of len(params) _step = 0 for var_idx in tqdm(range(len(self.merge_var[0]))): for x_v in range(len(self.merge_var[0][var_idx])): print('Step %d'%_step,end='\r') _step += 1 try: for y_v in range(len(self.merge_var[0][var_idx][x_v])): #print('cluster weights ....%d'%var_idx) dist = [] for task_idx in range(len(self.merge_var)): nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v][y_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v][y_v])),200) dist.append(nor) dist = np.array(np.asmatrix(np.concatenate(dist)).T) if dp: print('Initializing DPGMM%d ... '%_step,end='\r') gmm = DPGMM( max_iter=1000, n_components=n_component, covariance_type='spherical') else: gmm = GMM( max_iter=200, n_components=n_component, covariance_type='spherical') gmm.fit(dist) new_idx_list = [] for task_idx in range(len(self.merge_var)): #if dp: #Strategy 1. Set threshold predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1)) f_ = True while f_: #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)): if gmm.weights_[np.argmax(predict_probability)] > thresh_hold: new_idx = np.argmax(predict_probability) f_ = False else: predict_probability[0][np.argmax(predict_probability)] = 0.0 self.num_params += 1 #else: # new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1)) # if new_idx in new_idx_list: self.num_params += 1 new_idx_list.append(new_idx) self.merge_var[task_idx][var_idx][x_v][y_v] = gmm.means_[new_idx] self.merge_uncertainty[task_idx][var_idx][x_v][y_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0) except TypeError: dist = [] for task_idx in range(len(self.merge_var)): nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v])),200) dist.append(nor) dist = np.array(np.asmatrix(np.concatenate(dist)).T) if dp: print('Initializing DPGMM%d ... '%_step,end='\r') gmm = DPGMM( max_iter=200, n_components=n_component, covariance_type='spherical') else: gmm = GMM( max_iter=200, n_components=n_component, covariance_type='spherical') gmm.fit(dist) new_idx_list = [] for task_idx in range(len(self.merge_var)): #if dp: #Strategy 1. Set threshold predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1)) f_ = True while f_: #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)): if gmm.weights_[np.argmax(predict_probability)] > thresh_hold: new_idx = np.argmax(predict_probability) f_ = False else: predict_probability[0][np.argmax(predict_probability)] = 0.0 self.num_params += 1 #else: # new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1)) # if new_idx in new_idx_list: # self.num_params += 1 new_idx_list.append(new_idx) self.merge_var[task_idx][var_idx][x_v] = gmm.means_[new_idx] self.merge_uncertainty[task_idx][var_idx][x_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)
def _make_bgmm(self, label, X, Y): indices = np.where(Y == label) bgm = BayesianGaussianMixture(n_components=3, max_iter=200, tol=1e-3) bgm.fit(X[indices]) return bgm
from sklearn.mixture import GaussianMixture
"""
GMM: Gaussian mixture model
n_components: number of Gaussian components
covariance_type: how the component covariances are constrained;
                 'full' -> each component has its own unconstrained covariance
"""
data = []  # placeholder: fill with an (n_samples, n_features) array before fitting
g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000)
g.fit(data)
print('component weights:\t', g.weights_[0])
print('means:\n', g.means_, '\n')
print('covariances:\n', g.covariances_, '\n')

# A DPGMM (BayesianGaussianMixture with a Dirichlet process prior) can adjust
# the effective number of components automatically.
from sklearn.mixture import BayesianGaussianMixture
dpgmm = BayesianGaussianMixture(
    n_components=3, covariance_type='full', max_iter=1000, n_init=5,
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=10)
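A small self-contained check of the point made in the comment above: with a Dirichlet-process prior the mixture can leave some of its n_components essentially unused. make_blobs and the thresholds below are illustrative choices, not part of the original snippet.

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.mixture import BayesianGaussianMixture

X, _ = make_blobs(n_samples=500, centers=2, random_state=0)
dpgmm = BayesianGaussianMixture(
    n_components=5, weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=0.1, max_iter=500, random_state=0).fit(X)

# components with negligible weight have effectively been pruned away
print("weights:", np.round(dpgmm.weights_, 3))
print("effective components:", np.sum(dpgmm.weights_ > 0.01))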
from sklearn.mixture import BayesianGaussianMixture from sklearn.linear_model import LogisticRegression dataset = loaddataset(train_file) testset = loaddataset(test_file) names = [ "Nearest Neighbors", "RBF SVM", "Decision Tree", "Random Forest", "AdaBoost", ] ab = AdaBoostClassifier(random_state=1) bgm = BayesianGaussianMixture(random_state=1) dt = DecisionTreeClassifier(random_state=1) gb = GradientBoostingClassifier(random_state=1) lr = LogisticRegression(random_state=1) rf = RandomForestClassifier(random_state=1) classifiers = [ KNeighborsClassifier(3), GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True), RandomForestClassifier(random_state=1), GaussianNB(), QuadraticDiscriminantAnalysis() ] svcl = LinearSVC(random_state=1) svcg = SVC(random_state=1)
            alpha=0.5, clip_box=ax.bbox)
ax.add_artist(e)

ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.title("GMM", fontsize=18)
plt.grid(True)

# DPGMM
dpgmm = BayesianGaussianMixture(
    n_components=n_compenents, covariance_type="full", max_iter=100, n_init=5,
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=100)
dpgmm.fit(x)
centers = dpgmm.means_
covs = dpgmm.covariances_
print("DPGMM means =\n", centers)
print("DPGMM covariances =\n", covs)
y_hat = dpgmm.predict(x)

ax = plt.subplot(212)
grid_hat = dpgmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
plt.scatter(x[:, 0], x[:, 1], c=y, s=30, cmap=cm, marker='o')
for k, e in enumerate(ells): a.add_artist(e) e.set_facecolor((1 - w[k], 1 - w[k], 1 - w[k])) a.set_xlim(np.min(x1) - np.min(x1) / 10, np.max(x1) + np.min(x1) / 10) a.set_ylim(np.min(x2) - np.min(x2) / 10, np.max(x2) + np.min(x2) / 10) plt.scatter(x1, x2) plt.savefig('plot_{}.png'.format(iteration)) plt.show() plt.close() estimators = [ ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$", BayesianGaussianMixture( weight_concentration_prior_type="dirichlet_process", n_components=2 * 5, reg_covar=0, init_params='random', max_iter=10, mean_precision_prior=.8, random_state=3), [1], 10), ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$", BayesianGaussianMixture( weight_concentration_prior_type="dirichlet_process", n_components=2 * 5, reg_covar=0, init_params='random', max_iter=15, mean_precision_prior=.8, random_state=3), [1], 15), ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$", BayesianGaussianMixture( weight_concentration_prior_type="dirichlet_process", n_components=2 * 5, reg_covar=0, init_params='random', max_iter=20, mean_precision_prior=.8, random_state=3), [1], 20), ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$",
x_train_unlabeled = train_unlabeled #Switch to numpy # Preprocessing X x_train = [] x_train_labeled = np.array(x_train_labeled) x_train_unlabeled = np.array(x_train_unlabeled) x_train.extend(x_train_labeled) x_train.extend(x_train_unlabeled) x_test = np.array(test) # Preprocessing y y_train_labeled = np.array(y_train_labeled) ones = -1 * np.ones(21000) ones = np.array(ones) y_train = np.concatenate((y_train_labeled, ones)).astype(int) # # GMM # GMM = BayesianGaussianMixture(n_components=10, random_state=0) # GMM.fit(x_train, y_train) # y_pred = GMM.predict(x_test) # Bayesian GMM BGMM = BayesianGaussianMixture(n_components=10, random_state=0) BGMM.fit(x_train, y_train) y_pred = BGMM.predict(x_test) # output results d = {'Id': test.index, 'y': y_pred} output = pd.DataFrame(d) output.to_csv('output5.csv', index=False)
def test_compare_covar_type(): # We can compare the 'full' precision with the other cov_type if we apply # 1 iter of the M-step (done during _initialize_parameters). rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) X = rand_data.X['full'] n_components = rand_data.n_components for prior_type in PRIOR_TYPE: # Computation of the full_covariance bgmm = BayesianGaussianMixture( weight_concentration_prior_type=prior_type, n_components=2 * n_components, covariance_type='full', max_iter=1, random_state=0, tol=1e-7) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X, np.random.RandomState(0)) full_covariances = ( bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis]) # Check tied_covariance = mean(full_covariances, 0) bgmm = BayesianGaussianMixture( weight_concentration_prior_type=prior_type, n_components=2 * n_components, covariance_type='tied', max_iter=1, random_state=0, tol=1e-7) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X, np.random.RandomState(0)) tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) # Check diag_covariance = diag(full_covariances) bgmm = BayesianGaussianMixture( weight_concentration_prior_type=prior_type, n_components=2 * n_components, covariance_type='diag', max_iter=1, random_state=0, tol=1e-7) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X, np.random.RandomState(0)) diag_covariances = (bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis]) assert_almost_equal( diag_covariances, np.array([np.diag(cov) for cov in full_covariances])) # Check spherical_covariance = np.mean(diag_covariances, 0) bgmm = BayesianGaussianMixture( weight_concentration_prior_type=prior_type, n_components=2 * n_components, covariance_type='spherical', max_iter=1, random_state=0, tol=1e-7) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X, np.random.RandomState(0)) spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_ assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1))
# Plot the latent space latent_vectors = chain_call(m1.latent_sample, X_test, 1000) #%% # verify sklearn gaussian mixture? from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.manifold import TSNE from matplotlib import pyplot as plt from sklearn.mixture import BayesianGaussianMixture pca = PCA(2) # pca = TSNE(2) X_pca = pca.fit_transform(latent_vectors) kmeans = BayesianGaussianMixture(10, tol=1e-6, max_iter=1000) pred = kmeans.fit_predict(X_pca) print(purity_score(y_test, pred)) #%% df_latent = pd.DataFrame({ "x1": X_pca[:, 0], "x2": X_pca[:, 1], "cat": ["pred_{}".format(i) for i in y_test], "kmeans": ["pred_{}".format(i) for i in pred], }) plt.figure(figsize=(10, 10)) sns.scatterplot(data=df_latent, x="x1", y="x2", hue="cat") plt.figure(figsize=(10, 10)) sns.scatterplot(data=df_latent, x="x1", y="x2", hue="kmeans")
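purity_score is used above but not defined in this snippet; a common definition (an assumption, not necessarily the original helper) takes the majority true class inside each predicted cluster.

import numpy as np
from sklearn.metrics.cluster import contingency_matrix

def purity_score(y_true, y_pred):
    # rows: true classes, columns: predicted clusters
    cm = contingency_matrix(y_true, y_pred)
    # fraction of samples that fall in the majority class of their cluster
    return np.sum(np.amax(cm, axis=0)) / np.sum(cm)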
def gets_best_model(self, X, target): best_classifiers = [] outer_cv = StratifiedKFold(n_splits=self.num_folds, shuffle=True, random_state=1) model_factory = [ AdaBoostClassifier(), BaggingClassifier(), BayesianGaussianMixture(), BernoulliNB(), CalibratedClassifierCV(), CatBoostClassifier(verbose=False), DecisionTreeClassifier(), ExtraTreesClassifier(), GaussianMixture(), GaussianNB(), GradientBoostingClassifier(), KNeighborsClassifier(), LinearDiscriminantAnalysis(), LogisticRegression(max_iter=1000), LogisticRegressionCV(max_iter=1000), MLPClassifier(), QuadraticDiscriminantAnalysis(), RandomForestClassifier(), SGDClassifier() ] logging.basicConfig(filename="ml_dl_toolbox_logfilename.log", level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S') scoring = ('accuracy', 'neg_mean_squared_error') try: for el in model_factory: el.seed = self.seed scores = cross_validate(el, X.drop(target, axis=1), X[target], cv=outer_cv, n_jobs=-1, scoring=scoring) scores = abs( np.sqrt( np.mean(scores['test_neg_mean_squared_error']) * -1)) / np.mean(scores['test_accuracy']) score_description = [ el, '{el}'.format(el=el.__class__.__name__), "%0.5f" % scores ] best_classifiers.append(score_description) best_model = pd.DataFrame( best_classifiers, columns=["Algorithm", "Model", "RMSE/Accuracy"]).sort_values("RMSE/Accuracy", axis=0, ascending=True) best_model = best_model.reset_index() except OSError: logging.error('Check data structure') else: logging.info('Best fitting algorithm: ' + best_model["Model"][0] + " RMSE/Accuracy: " + best_model["RMSE/Accuracy"][0]) return best_model["Algorithm"][0]
np.random.seed(1)

X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]

scaler = StandardScaler()
X = scaler.fit_transform(X)

# 1 corresponds to data_thr.rate and 4=5-1 to data_thr.rateC
w = w / np.sqrt(scaler.var_[1:])
# w = np.exp(-np.exp(3 * w.mean(axis=1)))
w = 1. / w.mean(axis=1) ** 2

Html_file = open("gmm_sklearn_files/gmm3_sklearn.html", "w")

# Note: the released sklearn API names these priors weight_concentration_prior
# (the Dirichlet "alpha") and mean_precision_prior (the "beta").
gmm = BayesianGaussianMixture(n_components=3, weight_concentration_prior=0.1,
                              mean_precision_prior=1, n_init=5)
gmm.fit(X)  # , weights=w) not implemented in sklearn yet

preds = gmm.predict(X)
probs = gmm.predict_proba(X)

data_thr['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds))+1]

covs = gmm.covariances_
means = gmm.means_

# transform cov for non-standardized data:
covs = np.array([np.dot(np.diag(np.sqrt(scaler.var_)),
def main_process(img_np: np.ndarray, ccs: List[np.ndarray], text_lines: List[Tuple[int, int, int, int]], cc2textline_assignment): if len(ccs) == 0: return final_mask = np.zeros_like(ccs[0]) dpgmm = BayesianGaussianMixture(n_components=5, covariance_type='diag') kern = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) text_line_colors = defaultdict(list) #print(cc2textline_assignment) for i, cc in enumerate(tqdm(ccs)): if np.sum(cv2.bitwise_and(cc, final_mask)) > 0: final_mask = cv2.bitwise_or(final_mask, cc) continue pixels = img_np[cv2.erode(cc, kern) > 127] #img_np[cc > 127]# if len(pixels) < 5: final_mask = cv2.bitwise_or(final_mask, cc) continue cls1 = dpgmm.fit(pixels) # print(cls1.means_) # print(cls1.weights_) # print(np.sqrt(cls1.covariances_)) cls1_top2_mean, cls1_top2_stddev, cls1_k = find_top_k_dpgmm(cls1, 2) # cls1_top2_mean[0][0] = 4 # cls1_top2_mean[0][1] = 4 # cls1_top2_mean[0][2] = 4 # cls1_top2_mean[1][0] = 253 # cls1_top2_mean[1][1] = 253 # cls1_top2_mean[1][2] = 253 # cls1_top2_stddev[0][0] = 5 # cls1_top2_stddev[0][1] = 5 # cls1_top2_stddev[0][2] = 5 # cls1_top2_stddev[1][0] = 5 # cls1_top2_stddev[1][1] = 5 # cls1_top2_stddev[1][2] = 5 # if cls1_top2_mean[0][0] > 100 : # breakpoint() cls1_top2_stddev_ext = np.round(cls1_top2_stddev * COLOR_RANGE_SIGMA) # if i == 2 : # breakpoint() top1_mask = extend_cc_region(img_np, cc, cls1_top2_mean[0], cls1_top2_stddev_ext[0]) top2_mask = extend_cc_region(img_np, cc, cls1_top2_mean[1], cls1_top2_stddev_ext[1]) # area1 = int(top1_mask.sum()) # area2 = int(top2_mask.sum()) # if area1 == 0 or area2 == 0 : # breakpoint() # continue # area_cc = int(cc.sum()) # if abs(area1 - area_cc) < abs(area2 - area_cc) : # D = top1_mask # selected_idx = 0 # else : # D = top2_mask # selected_idx = 1 # intersect_area1 = int(cv2.bitwise_and(cc, top1_mask).sum()) # intersect_area2 = int(cv2.bitwise_and(cc, top2_mask).sum()) iou1 = cv2.bitwise_and(cc, top1_mask).sum() / cv2.bitwise_or( cc, top1_mask).sum() iou2 = cv2.bitwise_and(cc, top2_mask).sum() / cv2.bitwise_or( cc, top2_mask).sum() if iou1 > iou2: D = top1_mask selected_idx = 0 if iou1 < 1e-1: #print(iou1) D = cc selected_idx = -1 else: D = top2_mask selected_idx = 1 if iou2 < 1e-1: #print(iou2) D = cc selected_idx = -1 # print(selected_idx, iou1, iou2) # save_rgb('text_mask_utils_tmp.png', cc) # save_rgb('text_mask_utils_tmp_color.png', img_np*(cc>0)[:,:,None]) # save_rgb('text_mask_utils_tmp_top2_mask.png', top2_mask) # save_rgb('text_mask_utils_tmp_top1_mask.png', top1_mask) # input('x') # if cc2textline_assignment[i] == 12 : # breakpoint() D = cv2.bitwise_or(cc, D) D = cv2.dilate(D, kern) # if cls1_top2_mean[selected_idx][0] < 100 : # breakpoint() final_mask = cv2.bitwise_or(final_mask, D) # seed_point_candidates_mask = cv2.inRange(cc_region, cls1_top2_mean[0] - cls1_top2_stddev_ext[0], cls1_top2_mean[0] + cls1_top2_stddev_ext[0]) # seed_point_candidates_y, seed_point_candidates_x = np.where(seed_point_candidates_mask > 127) # seed_point = (seed_point_candidates_x[0] + x, seed_point_candidates_y[0] + y) # D = np.zeros((cc.shape[0] + 2, cc.shape[1] + 2), dtype = np.uint8) # cv2.floodFill(img_np, D, seed_point, (255), cls1_top2_stddev_ext[0].tolist(), cls1_top2_stddev_ext[0].tolist(), cv2.FLOODFILL_MASK_ONLY) # D = D[1: -1, 1: -1] * 255 # final_mask = cv2.bitwise_or(final_mask, D) # now we find text color if selected_idx == -1: continue # skip text_color_value = cls1_top2_mean[selected_idx] text_color_stddev = cls1_top2_stddev[selected_idx] #print('color=', text_color_value, text_color_stddev) 
text_line_colors[cc2textline_assignment[i]].append(text_color_value) textline_img = np.copy(img_np) def get_textline_color(j, visited_text_lines: List[int]): visited_text_lines.append(j) (x, y, w, h) = text_lines[j] colors = text_line_colors[j] if not colors: #print(f'[!!!!!!!!!!!!] Textline {j} has no color assigned, seraching for closest') min_dist = 10000000000000000000 min_dist_k = -1 for k, (x2, y2, w2, h2) in enumerate(text_lines): d = rect_distance(x, y, x + w, y + h, x2, y2, x2 + w2, y2 + h2) if d < min_dist and k not in visited_text_lines: min_dist = d min_dist_k = k if min_dist_k == -1: print( f'[!!!!!!!!!!!!] Textline {j} has no color assigned and unable to find closest rectangle, defaulting to black' ) return np.zeros((3, ), dtype=np.uint8) return get_textline_color(min_dist_k, visited_text_lines) else: clr = np.round(np.mean(np.array(colors), axis=0)) return clr textline_colors = [] for j, (x, y, w, h) in enumerate(text_lines): # colors = text_line_colors[j] # if not colors : # print(f'[!!!!!!!!!!!!] Textline {j} has no color assigned') # cv2.rectangle(textline_img, (x, y), (x + w, y + h), (0, 255, 0), 2) # continue clr = get_textline_color(j, []).tolist() textline_colors.append(clr) cv2.rectangle(textline_img, (x, y), (x + w, y + h), clr, 2) #save_rgb('text_mask_utils_final_textlines.png', textline_img) #save_rgb('text_mask_utils_final_mask_masked.png', (final_mask > 127)[:,:,None] * img_np) #cv2.imwrite('text_mask_utils_final_mask.png', final_mask) return final_mask, textline_colors
'energy': np.double, 'liveness': np.double, 'loudness': np.double, 'mode': np.double, 'speechiness': np.double, 'tempo': np.double, 'valence': np.double, 'instrumentalness': np.double, } df = df.astype(types) X = df.drop(labels=['id ', 'artist', 'name', 'mode', 'tempo', 'loudness'], axis=1) # Initial model to get appropriate number of clusters bgmm = BayesianGaussianMixture( n_components=20, n_init=20, ) bgmm.fit(X) num_clusters = len([w for w in bgmm.weights_ if w > 0.05]) bgmm_2 = BayesianGaussianMixture(n_components=num_clusters, n_init=20) probs = pd.DataFrame(bgmm_2.fit(X).predict_proba(X)) results = pd.concat([df, probs], axis=1, sort=False) # inital results - fluctuate a lot and also the weightings seem to do bugger all. # TODO: # - REAAALLLY have a look at what features are actually important. instrumentallness is not one of them... # - DBSCAN is probs a pretty good way of getting the clusters actually - do a minimum value of like 10 songs
def cluster(idx_arr, h_arr, w_arr, data, cw, samplerate, minp, percentile, npc, ngaus, plot_steps=False): l = np.ones(h_arr.shape) if ((np.max(h_arr) - np.min(h_arr)) / np.max(h_arr)) > 0.25: # classify by height bgm = BayesianGaussianMixture(ngaus, max_iter=100, n_init=10) l = bgm.fit_predict(h_arr.reshape(-1, 1)) # if any of the clusters merged have very little height difference, merge them. if len(np.unique(l)) > 1: for ll in np.unique(l): if ((np.max(h_arr[l != ll]) - np.min(h_arr[l != ll])) / np.max(h_arr[l != ll])) < 0.25: l[l != ll] = np.max(l) + 1 if plot_steps == True: mean_eods, eod_times, _ = find_window(data, idx_arr, l, h_arr, rm=False) print('clustering based on hight') plot_all(data, ts, samplerate, ms, vs) # now cluster based on waveform al = np.ones(len(l)) * -1 # extract snippets snippets = np.stack([ data[int(idx - cw * samplerate / 2):int(idx + cw * samplerate / 2)] for idx in idx_arr ]) # keep track of the labels so that no labels are overwritten maxlab = 0 for hl in np.unique(l): if len(l[l == hl]) > minp: # extract snippets, idxs and hs for this hight cluster csnippets = StandardScaler().fit_transform(snippets[l == hl]) cidx_arr = idx_arr[l == hl] ch_arr = h_arr[l == hl] # extract relevant snippet features pca = PCA(npc).fit(csnippets).transform(csnippets) # determine good epsilon knn = np.sort(pairwise_distances(pca, pca))[:, minp] eps = np.percentile(knn, percentile) # cluster by EOD shape c = DBSCAN(eps=eps, min_samples=minp).fit(pca).labels_ if plot_steps == True: mean_eods, eod_times, _ = find_window(data, cidx_arr, c, ch_arr, rm=False) print('clustering on scaled eods') plot_all(data, ts, samplerate, ms, vs) # cluster again without scaling (sometimes this works better wrt scaling) csnippets_ns = snippets[l == hl] pca = PCA(npc).fit(csnippets_ns).transform(csnippets_ns) knn = np.sort(pairwise_distances(pca, pca))[:, minp] eps = np.percentile(knn, percentile) c_ns = DBSCAN(eps=eps, min_samples=minp).fit(pca).labels_ if plot_steps == True: mean_eods, eod_times = find_window(data, cidx_arr, c_ns, ch_arr, rm=False) print('clustering on non-scaled eods') plot_all(data, ts, samplerate, ms, vs) # merge results for scaling and without scaling _, _, _, c = merge_clusters(c, c_ns, cidx_arr, cidx_arr, ch_arr, data, samplerate) if plot_steps == True: mean_eods, eod_times = find_window(data, cidx_arr, c, ch_arr, rm=False) print('merged scale and non-scaled') plot_all(data, ts, samplerate, ms, vs) # update maxlab so that no clusters are overwritten c[c == -1] = -maxlab - 1 al[l == hl] = c + maxlab maxlab = np.max(al) + 1 # return the overall clusters (al) and the clusters based on hight (l) return al, l
value, vector = sp.linalg.eigh(cov)
width, height = value[0], value[1]
v = vector[0] / sp.linalg.norm(vector[0])
angle = 180 * np.arctan(v[1] / v[0]) / np.pi
e = Ellipse(xy=center, width=width, height=height, angle=angle,
            color=clrs[i], alpha=0.5, clip_box=ax.bbox)
ax.add_artist(e)

ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.title('GMM', fontsize=15)
plt.grid(b=True, ls=':', color='#606060')

# DPGMM
dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                weight_concentration_prior_type='dirichlet_process',
                                weight_concentration_prior=0.1)
dpgmm.fit(x)
centers = dpgmm.means_
covs = dpgmm.covariances_
print('DPGMM means = \n', centers)
print('DPGMM variances = \n', covs)
y_hat = dpgmm.predict(x)
print(y_hat)

ax = plt.subplot(212)
grid_hat = dpgmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')
for i, cc in enumerate(zip(centers, covs)):
# Parameters of the dataset random_state, n_components, n_features = 2, 3, 2 colors = np.array(['#0072B2', '#F0E442', '#D55E00']) covars = np.array([[[.7, .0], [.0, .1]], [[.5, .0], [.0, .1]], [[.5, .0], [.0, .1]]]) samples = np.array([200, 500, 200]) means = np.array([[.0, -.70], [.0, .0], [.0, .70]]) # mean_precision_prior= 0.8 to minimize the influence of the prior estimators = [("Finite mixture with a Dirichlet distribution\nprior and " r"$\gamma_0=$", BayesianGaussianMixture( weight_concentration_prior_type="dirichlet_distribution", n_components=2 * n_components, reg_covar=0, init_params='random', max_iter=1500, mean_precision_prior=.8, random_state=random_state), [0.001, 1, 1000]), ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$", BayesianGaussianMixture( weight_concentration_prior_type="dirichlet_process", n_components=2 * n_components, reg_covar=0, init_params='random', max_iter=1500, mean_precision_prior=.8, random_state=random_state), [1, 1000, 100000])] # Generate data
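The snippet above stops right after the "# Generate data" comment. A plausible continuation, sketched under the assumption that the data are drawn from the three Gaussians defined by means, covars and samples and that each estimator is refit once per concentration value; the plotting code of the original example is omitted and the print is only illustrative.

rng = np.random.RandomState(random_state)
X = np.vstack([
    rng.multivariate_normal(means[j], covars[j], samples[j])
    for j in range(n_components)
])

for title, estimator, concentrations_prior in estimators:
    for concentration in concentrations_prior:
        # Refit the same estimator with a different weight concentration prior.
        estimator.weight_concentration_prior = concentration
        estimator.fit(X)
        print(title, concentration, np.round(estimator.weights_, 2))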
def test_bayesian_mixture_precisions_prior_initialisation(): rng = np.random.RandomState(0) n_samples, n_features = 10, 2 X = rng.rand(n_samples, n_features) # Check raise message for a bad value of degrees_of_freedom_prior bad_degrees_of_freedom_prior_ = n_features - 1.0 bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng) assert_raise_message( ValueError, "The parameter 'degrees_of_freedom_prior' should be " "greater than %d, but got %.3f." % (n_features - 1, bad_degrees_of_freedom_prior_), bgmm.fit, X, ) # Check correct init for a given value of degrees_of_freedom_prior degrees_of_freedom_prior = rng.rand() + n_features - 1.0 bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng).fit(X) assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_) # Check correct init for the default value of degrees_of_freedom_prior degrees_of_freedom_prior_default = n_features bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng).fit(X) assert_almost_equal(degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_) # Check correct init for a given value of covariance_prior covariance_prior = { "full": np.cov(X.T, bias=1) + 10, "tied": np.cov(X.T, bias=1) + 5, "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3, "spherical": rng.rand(), } bgmm = BayesianGaussianMixture(random_state=rng) for cov_type in ["full", "tied", "diag", "spherical"]: bgmm.covariance_type = cov_type bgmm.covariance_prior = covariance_prior[cov_type] bgmm.fit(X) assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_) # Check raise message for a bad spherical value of covariance_prior bad_covariance_prior_ = -1.0 bgmm = BayesianGaussianMixture( covariance_type="spherical", covariance_prior=bad_covariance_prior_, random_state=rng ) assert_raise_message( ValueError, "The parameter 'spherical covariance_prior' " "should be greater than 0., but got %.3f." % bad_covariance_prior_, bgmm.fit, X, ) # Check correct init for the default value of covariance_prior covariance_prior_default = { "full": np.atleast_2d(np.cov(X.T)), "tied": np.atleast_2d(np.cov(X.T)), "diag": np.var(X, axis=0, ddof=1), "spherical": np.var(X, axis=0, ddof=1).mean(), } bgmm = BayesianGaussianMixture(random_state=0) for cov_type in ["full", "tied", "diag", "spherical"]: bgmm.covariance_type = cov_type bgmm.fit(X) assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_)
def BGMreport(path, visualize=1, cut_n=6):
    t2 = 15
    t3 = 0.07
    n_components = 3
    denses, _ = finddensefromcut(path, cut_n)
    maxd = []
    for dense in denses[(cut_n - 5):]:
        maxd.append(max(dense))
    lofd = len(denses[0])

    samples = list()
    for i in range((cut_n - 5), cut_n):  # sampling for BGM
        samples.append(np.array(tosample(denses[i])).reshape(-1, 1))

    allmeans = []
    allcovs = []
    allweights = []
    BGM45 = np.zeros((45))
    for i in range(5):
        BGM = BayesianGaussianMixture(
            n_components=n_components,
            covariance_type='spherical',
            weight_concentration_prior=0.000000000001,
            max_iter=500)
        BGM.fit(samples[i])
        means = np.reshape(BGM.means_, (-1, ))
        permu = np.argsort(means)
        means = means[permu]
        BGM45[i * 9 + 3:i * 9 + 6] = means
        allmeans.append(means)
        covs = BGM.covariances_
        covs = covs[permu]
        BGM45[i * 9 + 6:i * 9 + 9] = covs
        allcovs.append(covs)
        weights = BGM.weights_
        weights = weights[permu]
        BGM45[i * 9:i * 9 + 3] = weights * len(samples[i])
        allweights.append(weights)

    if visualize == 1:
        l = 0
        for i in range(cut_n - 5, cut_n):  # visualization
            l += 1
            plt.subplot(2, n_components, l), plt.plot(denses[i])
            X = np.linspace(0, lofd, num=200, endpoint=False)
            Ys = toGM(X, n_components, allmeans[l - 1], allcovs[l - 1], allweights[l - 1])
            for j in range(n_components):
                # plt.subplot(1, 5, l), plt.plot([allmeans[l - 1][j], allmeans[l - 1][j]], [0, 255])
                plt.subplot(2, n_components, l), plt.plot(X, len(samples[l - 1]) * Ys[j])
                # plt.subplot(2, n_components, l), plt.plot(X, Ys[j])
            plt.ylim(0, 255)
        plt.show()

    ans = np.zeros((12, ))
    pre = np.zeros((5, n_components))

    # Preprocess the fitted components to reduce the influence of peak overlap (far overlap and
    # near overlap): identify far/near overlap cases, suppress far-overlap peaks and amplify
    # near-overlap peaks. Ideally two far-overlapping peaks would be merged into a single peak at
    # the intermediate mean, but for now we simply suppress both, since such peaks are unlikely to
    # be monoclonal anyway. "Far overlap" means two components that are really the same peak in the
    # density plot, which the BGM split in two to fit the Gaussian model better; we suppress them
    # because components that can be split like this usually have a large covariance and are not sharp.
    for i in range(5):
        for j in range(n_components):
            for l in range(n_components):
                if j < l:
                    if allweights[i][j] / allweights[i][l] > 3 or allweights[i][j] / allweights[i][l] < 0.3333:
                        # ignore the pair when the weight difference is too large
                        continue
                    if (allcovs[i][j] / allweights[i][j] / allcovs[i][l] * allweights[i][l]
                            / abs(allmeans[i][j] - allmeans[i][l])
                            * mean(np.sqrt(allcovs[i][j]), np.sqrt(allcovs[i][l])) > 2
                            or allcovs[i][l] / allweights[i][l] / allcovs[i][j] * allweights[i][j]
                            / abs(allmeans[i][j] - allmeans[i][l])
                            * mean(np.sqrt(allcovs[i][j]), np.sqrt(allcovs[i][l])) > 2):
                        # If the covariance difference is large, the pair is excluded from the far-overlap
                        # case, because there should then be two visible peaks in the original density plot.
                        # This is the near-overlap situation: a sharp peak sitting on a mild one, which happens
                        # when a monoclonal peak has a polyclonal background peak. Here we amplify the sharp
                        # peak's weight when the covariance difference is large enough or the distance is
                        # close enough, so that it is detected as abnormal in the classification step.
                        if abs(allmeans[i][j] - allmeans[i][l]) < 3.5 * np.sqrt(max(allcovs[i][j], allcovs[i][l])):
                            neww = allweights[i][j] + allweights[i][l]
                            if allcovs[i][l] / allweights[i][l] / allcovs[i][j] * allweights[i][j] > 1 and allweights[i][j] > 0.15:
                                if allcovs[i][j] < 400:
                                    allweights[i][j] = neww
                            else:
                                if allcovs[i][l] < 400:
                                    allweights[i][l] = neww
                        continue
                    if (allcovs[i][j] / allweights[i][j] / len(samples[i]) < t3 / 2.5
                            or allcovs[i][l] / allweights[i][l] / len(samples[i]) < t3 / 2.5):
                        # if one of the peaks has a very small variance, this cannot be the far-overlap
                        # case, where the original peak is mild
                        continue
                    if allcovs[i][j] < 70 or allcovs[i][l] < 70:
                        continue
                    elif abs(allmeans[i][j] - allmeans[i][l]) < 3.5 * np.sqrt(max(allcovs[i][j], allcovs[i][l])):
                        # Far overlap: the original density plot has only one mild peak, which the GMM broke
                        # into two sharper peaks to fit the Gaussian curves more accurately. We suppress both
                        # peaks, so these two components cannot mark the column as abnormal.
                        pre[i][j] = pre[i][l] = 1

    for i in [0, 1, 2]:
        for j in [3, 4]:
            if maxd[i] < 50 or maxd[j] < 50:
                continue
            else:
                for k in range(len(allmeans[i])):
                    for l in range(len(allmeans[j])):
                        if pre[i][k] == 1 or pre[j][l] == 1:
                            continue
                        if abs(allmeans[i][k] - allmeans[j][l]) > lofd / t2:
                            continue
                        else:
                            if allweights[i][k] < 0.1 or allweights[j][l] < 0.1:
                                continue
                            else:
                                if (allcovs[i][k] / allweights[i][k] / len(samples[i]) > t3
                                        or allcovs[j][l] / allweights[j][l] / len(samples[j]) > t3):
                                    # the "t figure" measures the sharpness of the peak; variance alone is not
                                    # enough, the number of samples and the weights have to be considered too
                                    continue
                                else:
                                    ans[i * 2 + j - 2] = 1
                                    ans[7 + i] = 1
                                    ans[7 + j] = 1
                                    ans[0] = 1

    for i in range(5):
        for j in range(n_components):
            if pre[i][j] == 1:
                continue
            if maxd[i] < 80:
                continue
            elif allweights[i][j] < 0.05:
                continue
            if allcovs[i][j] / allweights[i][j] / len(samples[i]) > t3:  # t figure
                continue
            else:
                ans[7 + i] = 1
                ans[0] = 1

    return ans, BGM45
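The abnormality tests above repeatedly apply the same sharpness measure (the "t figure"): a component's covariance divided by its weight and by the number of sampled points, compared against the threshold t3. An illustrative helper that names this check; the function is not part of the original code.

def is_sharp_peak(cov, weight, n_samples, t3=0.07):
    # A peak counts as sharp (a candidate monoclonal peak) when its covariance,
    # normalised by its weight and the number of sampled points, is below t3.
    return cov / weight / n_samples <= t3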
def test_compare_covar_type(): # We can compare the 'full' precision with the other cov_type if we apply # 1 iter of the M-step (done during _initialize_parameters). rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) X = rand_data.X["full"] n_components = rand_data.n_components # Computation of the full_covariance bgmm = BayesianGaussianMixture( n_components=2 * n_components, covariance_type="full", max_iter=1, random_state=0, tol=1e-7 ) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X) full_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis] # Check tied_covariance = mean(full_covariances, 0) bgmm = BayesianGaussianMixture( n_components=2 * n_components, covariance_type="tied", max_iter=1, random_state=0, tol=1e-7 ) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X) tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) # Check diag_covariance = diag(full_covariances) bgmm = BayesianGaussianMixture( n_components=2 * n_components, covariance_type="diag", max_iter=1, random_state=0, tol=1e-7 ) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X) diag_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis] assert_almost_equal(diag_covariances, np.array([np.diag(cov) for cov in full_covariances])) # Check spherical_covariance = np.mean(diag_covariances, 0) bgmm = BayesianGaussianMixture( n_components=2 * n_components, covariance_type="spherical", max_iter=1, random_state=0, tol=1e-7 ) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X) spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_ assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1))
value, vector = sp.linalg.eigh(cov)
width, height = value[0], value[1]
v = vector[0] / sp.linalg.norm(vector[0])
angle = 180 * np.arctan(v[1] / v[0]) / np.pi
e = Ellipse(xy=center, width=width, height=height, angle=angle,
            color=clrs[i], alpha=0.5, clip_box=ax.bbox)
ax.add_artist(e)

ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.title('GMM', fontsize=15)
plt.grid(b=True, ls=':', color='#606060')

# DPGMM
dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                weight_concentration_prior_type='dirichlet_process',
                                weight_concentration_prior=0.1)
dpgmm.fit(x)
# The parameters of the Gaussians are treated as random variables that follow a Dirichlet process;
# the larger weight_concentration_prior is, the more the prior is taken into account, and the
# smaller it is, the closer the fit stays to the samples.
centers = dpgmm.means_
covs = dpgmm.covariances_
print('DPGMM means = \n', centers)
print('DPGMM variances = \n', covs)
y_hat = dpgmm.predict(x)
print(y_hat)

ax = plt.subplot(212)
grid_hat = dpgmm.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)
plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')
for i, cc in enumerate(zip(centers, covs)):
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]]) knn.predict(X_new) print(knn.predict_proba(X_new)) y_dist, y_pred_idx = knn.kneighbors(X_new, n_neighbors=1) y_pred = dbscan.labels_[dbscan.core_sample_indices_][y_pred_idx] y_pred[y_dist > 0.2] = -1 print(y_pred.ravel()) from sklearn.mixture import GaussianMixture gm = GaussianMixture(n_components=3, n_init=10) gm.fit(X) X_new, y_new = gm.sample(6) print("density:") print(gm.score_samples(X)) densities = gm.score_samples(X) density_threshold = np.percentile(densities, 4) anomalies = X[densities < density_threshold] print(gm.bic(X)) print(gm.aic(X)) from sklearn.mixture import BayesianGaussianMixture bgm = BayesianGaussianMixture(n_components=10, n_init=10) bgm.fit(X) print("weights:") print(np.round(bgm.weights_, 2))
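Unlike GaussianMixture, the BayesianGaussianMixture above can drive the weights of superfluous components towards zero, so the effective number of clusters can be read off the fitted weights. A small follow-up sketch; the 0.01 cutoff is an arbitrary illustrative threshold.

effective_k = np.sum(bgm.weights_ > 0.01)  # components that actually carry weight
print("effective number of clusters:", effective_k)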
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.model_selection import train_test_split
import sklearn
import pandas
import cv2

input_path = "/Users/Srikanth/PycharmProjects/DataMiningClass/datasets/face.jpg"
img = cv2.imread(input_path)
o_shape = img.shape
k = 2

new_data = img.reshape(-1, 3)
vgmm = BayesianGaussianMixture(n_components=k)
# vgmm = GaussianMixture(n_components=k)
vgmm = vgmm.fit(new_data)
cluster = vgmm.predict(new_data)

# Reshape the predicted labels back to the original image shape
cluster = cluster.reshape(o_shape[0], o_shape[1])

from matplotlib import pyplot
pyplot.imshow(cluster)
pyplot.show()
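A possible follow-up (not in the original script): render a colour-quantised version of the image by replacing every pixel with the mean colour of its mixture component. OpenCV loads images as BGR, so the result is converted to RGB before display.

import numpy as np

quantised = vgmm.means_[cluster].astype(np.uint8)  # (height, width, 3): mean colour per pixel
pyplot.imshow(cv2.cvtColor(quantised, cv2.COLOR_BGR2RGB))
pyplot.show()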
def wnn_entropy(points, k=None, weights=True, n_est=None, gmm=None): r""" Weighted Kozachenko-Leonenko nearest-neighbour entropy calculation. *k* is the number of neighbours to consider, with default $k=n^{1/3}$ *n_est* is the number of points to use for estimating the entropy, with default $n_\rm{est} = n$ *weights* is True for default weights, False for unweighted (using the distance to the kth neighbour only), or a vector of weights of length *k*. *gmm* is the number of gaussians to use to model the distribution using a gaussian mixture model. Default is 0, and the points represent an empirical distribution. Returns entropy H in bits and its uncertainty. Berrett, T. B., Samworth, R.J., Yuan, M., 2016. Efficient multivariate entropy estimation via k-nearest neighbour distances. https://arxiv.org/abs/1606.00304 """ from sklearn.neighbors import NearestNeighbors n, d = points.shape # Default to the full set if n_est is None: n_est = n # reduce size of draw to n_est if n_est >= n: x = points else: x = points[permutation(n)[:n_est]] n = n_est # Default k based on n if k is None: # Private communication: cube root of n is a good choice for k # Personal observation: k should be much bigger than d k = max(int(n**(1/3)), 3*d) # If weights are given then use them (setting the appropriate k), # otherwise use the default weights. if isinstance(weights, bool): weights = _wnn_weights(k, d, weights) else: k = len(weights) #print("weights", weights, sum(weights)) # select knn algorithm algorithm = 'auto' #algorithm = 'kd_tree' #algorithm = 'ball_tree' #algorithm = 'brute' n_components = 0 if gmm is None else gmm # H = 1/n sum_i=1^n sum_j=1^k w_j log E_{j,i} # E_{j,i} = e^-Psi(j) V_d (n-1) z_{j,i}^d = C z^d # logC = -Psi(j) + log(V_d) + log(n-1) # H = 1/n sum sum w_j logC + d/n sum sum w_j log(z) # = sum w_j logC + d/n sum sum w_j log(z) # = A + d/n B # H^2 = 1/n sum Psi = digamma(np.arange(1, k+1)) logVd = d/2*log(pi) - gammaln(1 + d/2) logC = -Psi + logVd + log(n-1) # TODO: standardizing points doesn't work. # Standardize the data so that distances conform. This is equivalent to # a u-substitution u = sigma x + mu, so the integral needs to be corrected # for dU = det(sigma) dx. Since the standardization squishes the dimensions # independently, sigma is a diagonal matrix, with the determinant equal to # the product of the diagonal elements. #x, mu, sigma = standardize(x) # Note: sigma may be zero #detDU = np.prod(sigma) detDU = 1. if n_components > 0: # Use Gaussian mixture to model the distribution from sklearn.mixture import GaussianMixture as GMM predictor = GMM(n_components=gmm, covariance_type='full') predictor.fit(x) eval_x, _ = predictor.sample(n_est) #weight_x = predictor.score_samples(eval_x) skip = 0 else: # Empirical distribution # TODO: should we use the full draw for kNN and a subset for eval points? # Choose a subset for evaluating the entropy estimate, if desired #print(n_est, n) #eval_x = x if n_est >= n else x[permutation(n)[:n_est]] eval_x = x #weight_x = 1 skip = 1 tree = NearestNeighbors(algorithm=algorithm, n_neighbors=k+skip) tree.fit(x) dist, _ind = tree.kneighbors(eval_x, n_neighbors=k+skip, return_distance=True) # Remove first column. Since test points are in x, the first column will # be a point from x with distance 0, and can be ignored. if skip: dist = dist[:, skip:] # Find log distances. This can be problematic for MCMC runs where a # step is rejected, and therefore identical points are in the distribution. # Ignore them by replacing these points with nan and using nanmean. 
# TODO: need proper analysis of duplicated points in MCMC chain dist[dist == 0] = nan logdist = log(dist) H_unweighted = logC + d*np.nanmean(logdist, axis=0) H = np.dot(H_unweighted, weights)[0] Hsq_k = np.nanmean((logC[-1] + d*logdist[:,-1])**2) # TODO: abs shouldn't be needed? if Hsq_k < H**2: print("warning: avg(H^2) < avg(H)^2") dH = sqrt(abs(Hsq_k - H**2)/n_est) #print("unweighted", H_unweighted) #print("weighted", H, Hsq_k, H**2, dH, detDU, LN2) return H * detDU / LN2, dH * detDU / LN2
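A quick sanity check for wnn_entropy (illustrative, and assuming the module-level imports the function relies on, e.g. digamma, gammaln and LN2, are in place): compare the estimate on samples from a d-dimensional standard normal with the analytic entropy 0.5*log2((2*pi*e)**d) bits.

import numpy as np

d, n = 4, 20000
points = np.random.RandomState(0).randn(n, d)
H_est, dH = wnn_entropy(points)
H_true = 0.5 * np.log2((2 * np.pi * np.e) ** d)
print("estimated %.3f +/- %.3f bits, analytic %.3f bits" % (H_est, dH, H_true))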
def cluster_vbgm(aligned_maps):
    # sample_by_features = np.vstack([xmap.flatten() for xmap in aligned_maps])
    embedding = embed(aligned_maps)
    clusterer = BayesianGaussianMixture(n_components=10)
    return clusterer.fit_predict(embedding)
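cluster_vbgm relies on an embed helper defined elsewhere in its codebase. A hypothetical stand-in, suggested by the commented-out line above, that flattens the aligned maps and reduces them with PCA before the mixture model is fitted; the real embed() may do something entirely different.

import numpy as np
from sklearn.decomposition import PCA

def embed(aligned_maps, n_components=10):
    # Hypothetical implementation for illustration only.
    sample_by_features = np.vstack([xmap.flatten() for xmap in aligned_maps])
    return PCA(n_components=n_components).fit_transform(sample_by_features)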