Code example #1
def test_monotonic_likelihood():
    # We check that each step of variational inference without regularization
    # monotonically improves the lower bound on the training data.
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        bgmm = BayesianGaussianMixture(
            n_components=2 * n_components,
            covariance_type=covar_type,
            warm_start=True,
            max_iter=1,
            random_state=rng,
            tol=1e-4,
        )
        current_lower_bound = -np.inf
        # Do one training iteration at a time so we can make sure that the
        # training log likelihood increases after each iteration.
        for _ in range(500):
            prev_lower_bound = current_lower_bound
            current_lower_bound = bgmm.fit(X).lower_bound_
            assert_greater_equal(current_lower_bound, prev_lower_bound)

            if bgmm.converged_:
                break
        assert bgmm.converged_
Code example #2
File: cluster.py  Project: fongchun/ProDy
def BayesianGaussianMixture(V, **kwargs):
    """Performs clustering on *V* by using Gaussian mixture models with variational inference. The function uses :func:`sklearn.micture.GaussianMixture`. See sklearn documents 
    for details.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg n_clusters: specifies the number of clusters. 
    :type n_clusters: int
    """

    try:
        from sklearn.mixture import BayesianGaussianMixture
    except ImportError:
        raise ImportError('Use of this function (BayesianGaussianMixture) requires the '
                          'installation of sklearn.')
    
    n_components = kwargs.pop('n_components', None)
    if n_components is None:
        n_components = kwargs.pop('n_clusters', None)
        if n_components is None:
            n_components = 1

    n_init = kwargs.pop('n_init', 1)

    mixture = BayesianGaussianMixture(n_components=n_components, n_init=n_init, **kwargs)

    return mixture.fit_predict(V)
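A minimal usage sketch for the wrapper above (not part of the original file): V stands in for row-normalized eigenvectors, and n_clusters is the keyword advertised in the docstring.

import numpy as np

# Synthetic stand-in for row-normalized eigenvectors (n_samples x n_modes).
rng = np.random.RandomState(0)
V = rng.rand(100, 3)
V /= np.linalg.norm(V, axis=1, keepdims=True)

labels = BayesianGaussianMixture(V, n_clusters=3)  # calls the wrapper defined above
print(labels[:10])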
Code example #3
def test_bayesian_mixture_fit_predict_n_init():
    # Check that fit_predict is equivalent to fit.predict, when n_init > 1
    X = np.random.RandomState(0).randn(1000, 5)
    gm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0)
    y_pred1 = gm.fit_predict(X)
    y_pred2 = gm.predict(X)
    assert_array_equal(y_pred1, y_pred2)
Code example #4
def test_check_covariance_precision():
    # We check that the dot product of the covariance and the precision
    # matrices is identity.
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components, n_features = 2 * rand_data.n_components, 2

    # Computation of the full_covariance
    bgmm = BayesianGaussianMixture(n_components=n_components,
                                   max_iter=100, random_state=rng, tol=1e-3,
                                   reg_covar=0)
    for covar_type in COVARIANCE_TYPE:
        bgmm.covariance_type = covar_type
        bgmm.fit(rand_data.X[covar_type])

        if covar_type == 'full':
            for covar, precision in zip(bgmm.covariances_, bgmm.precisions_):
                assert_almost_equal(np.dot(covar, precision),
                                    np.eye(n_features))
        elif covar_type == 'tied':
            assert_almost_equal(np.dot(bgmm.covariances_, bgmm.precisions_),
                                np.eye(n_features))

        elif covar_type == 'diag':
            assert_almost_equal(bgmm.covariances_ * bgmm.precisions_,
                                np.ones((n_components, n_features)))

        else:
            assert_almost_equal(bgmm.covariances_ * bgmm.precisions_,
                                np.ones(n_components))
Code example #5
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check that a NotFittedError is raised if we call predict before fit
            assert_raise_message(NotFittedError,
                                 "This BayesianGaussianMixture instance"
                                 " is not fitted yet. Call 'fit' with "
                                 "appropriate arguments before using "
                                 "this method.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert_greater_equal(adjusted_rand_score(Y, Y_pred), .95)
Code example #6
  def fit(self, X, Y):
    # assume classes are numbered 0...K-1
    self.K = len(set(Y))

    self.gaussians = []
    self.p_y = np.zeros(self.K)
    for k in range(self.K):
      print("Fitting gmm", k)
      Xk = X[Y == k]
      self.p_y[k] = len(Xk)
      gmm = BayesianGaussianMixture(n_components=10)
      gmm.fit(Xk)
      self.gaussians.append(gmm)
    # normalize p(y)
    self.p_y /= self.p_y.sum()
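The fit method above only stores the per-class mixtures and the class priors; the matching predict is not shown. A sketch of how it could look (assuming numpy is imported as np, as the snippet already requires): pick the class k that maximizes log p(x | y=k) + log p(y=k), with score_samples supplying the per-sample log-likelihood.

  def predict(self, X):
    # Sketch (not in the original): class-conditional log-likelihood plus log prior,
    # then argmax over classes.
    N = len(X)
    log_post = np.zeros((N, self.K))
    for k, gmm in enumerate(self.gaussians):
      log_post[:, k] = gmm.score_samples(X) + np.log(self.p_y[k])
    return np.argmax(log_post, axis=1)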
Code example #7
def test_bayesian_mixture_fit_predict(seed, max_iter, tol):
    rng = np.random.RandomState(seed)
    rand_data = RandomData(rng, scale=7)
    n_components = 2 * rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        bgmm1 = BayesianGaussianMixture(n_components=n_components,
                                        max_iter=max_iter, random_state=rng,
                                        tol=tol, reg_covar=0)
        bgmm1.covariance_type = covar_type
        bgmm2 = copy.deepcopy(bgmm1)
        X = rand_data.X[covar_type]

        Y_pred1 = bgmm1.fit(X).predict(X)
        Y_pred2 = bgmm2.fit_predict(X)
        assert_array_equal(Y_pred1, Y_pred2)
Code example #8
File: entropy.py  Project: bumps/bumps
def kde_entropy_sklearn_gmm(points, n_est=None, n_components=None):
    """
    Use sklearn.neighbors.KernelDensity pdf to estimate entropy.

    Data is standardized before KDE.

    Sample points are drawn from a Gaussian mixture model fitted to the original points.

    Fails for bimodal and dirichlet, similar to the statsmodels KDE.
    """
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5*sqrt(d))

    predictor = GMM(n_components=n_components, covariance_type='full',
                    #verbose=True,
                    max_iter=1000)
    predictor.fit(x)
    evaluation_points, _ = predictor.sample(n_est)

    logp = sklearn_log_density(x, evaluation_points=evaluation_points)
    H = -np.mean(logp)
    return H / LN2
Code example #9
File: entropy.py  Project: bumps/bumps
def gmm_entropy(points, n_est=None, n_components=None):
    #from sklearn.mixture import GaussianMixture as GMM
    from sklearn.mixture import BayesianGaussianMixture as GMM
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    if n_components is None:
        n_components = int(5*sqrt(d))

    ## Standardization doesn't seem to help
    ## Note: sigma may be zero
    #x, mu, sigma = standardize(x)   # if standardized
    predictor = GMM(n_components=n_components, covariance_type='full',
                    #verbose=True,
                    max_iter=1000)
    predictor.fit(x)
    eval_x, _ = predictor.sample(n_est)
    weight_x = predictor.score_samples(eval_x)
    H = -np.mean(weight_x)
    #with np.errstate(divide='ignore'): H = H + np.sum(np.log(sigma))   # if standardized
    dH = 0.
    ## cross-check against own calcs
    #alt = GaussianMixture(predictor.weights_, mu=predictor.means_, sigma=predictor.covariances_)
    #print("alt", H, alt.entropy())
    #print(np.vstack((weight_x[:10], alt.logpdf(eval_x[:10]))).T)
    return H / LN2, dH / LN2
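Both entropy helpers above depend on module-level names that are not shown (permutation, sqrt, LN2, sklearn_log_density). A self-contained sketch of the same idea, checked against the analytic entropy of a standard normal; the data, sample size and component count here are arbitrary illustration choices.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
d = 2
points = rng.randn(2000, d)  # standard normal in d dimensions

# Fit a Bayesian GMM, draw samples from it, and Monte-Carlo estimate the
# differential entropy as the average negative log-density, converted to bits.
gmm = BayesianGaussianMixture(n_components=5, covariance_type='full',
                              max_iter=1000, random_state=0).fit(points)
samples, _ = gmm.sample(2000)
H_bits = -np.mean(gmm.score_samples(samples)) / np.log(2)

# Analytic differential entropy of N(0, I_d) in bits, for comparison.
H_true = 0.5 * d * np.log2(2 * np.pi * np.e)
print(H_bits, H_true)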
Code example #10
def test_bayesian_mixture_precisions_prior_initialisation():
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2
    X = rng.rand(n_samples, n_features)

    # Check raise message for a bad value of degrees_of_freedom_prior
    bad_degrees_of_freedom_prior_ = n_features - 1.
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=bad_degrees_of_freedom_prior_,
        random_state=rng)
    assert_raise_message(
        ValueError, "The parameter 'degrees_of_freedom_prior' should be "
        "greater than %d, but got %.3f." %
        (n_features - 1, bad_degrees_of_freedom_prior_), bgmm.fit, X)

    # Check correct init for a given value of degrees_of_freedom_prior
    degrees_of_freedom_prior = rng.rand() + n_features - 1.
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=degrees_of_freedom_prior,
        random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior,
                        bgmm.degrees_of_freedom_prior_)

    # Check correct init for the default value of degrees_of_freedom_prior
    degrees_of_freedom_prior_default = n_features
    bgmm = BayesianGaussianMixture(
        degrees_of_freedom_prior=degrees_of_freedom_prior_default,
        random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior_default,
                        bgmm.degrees_of_freedom_prior_)

    # Check correct init for a given value of covariance_prior
    covariance_prior = {
        'full': np.cov(X.T, bias=1) + 10,
        'tied': np.cov(X.T, bias=1) + 5,
        'diag': np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3,
        'spherical': rng.rand()
    }

    bgmm = BayesianGaussianMixture(random_state=rng)
    for cov_type in ['full', 'tied', 'diag', 'spherical']:
        bgmm.covariance_type = cov_type
        bgmm.covariance_prior = covariance_prior[cov_type]
        bgmm.fit(X)
        assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_)

    # Check raise message for a bad spherical value of covariance_prior
    bad_covariance_prior_ = -1.
    bgmm = BayesianGaussianMixture(covariance_type='spherical',
                                   covariance_prior=bad_covariance_prior_,
                                   random_state=rng)
    assert_raise_message(
        ValueError, "The parameter 'spherical covariance_prior' "
        "should be greater than 0., but got %.3f." % bad_covariance_prior_,
        bgmm.fit, X)

    # Check correct init for the default value of covariance_prior
    covariance_prior_default = {
        'full': np.atleast_2d(np.cov(X.T)),
        'tied': np.atleast_2d(np.cov(X.T)),
        'diag': np.var(X, axis=0, ddof=1),
        'spherical': np.var(X, axis=0, ddof=1).mean()
    }

    bgmm = BayesianGaussianMixture(random_state=0)
    for cov_type in ['full', 'tied', 'diag', 'spherical']:
        bgmm.covariance_type = cov_type
        bgmm.fit(X)
        assert_almost_equal(covariance_prior_default[cov_type],
                            bgmm.covariance_prior_)
Code example #11
def find_nuclear_contours(cell_rois, cell_stack, nuclear_stack, um_to_px):
    nuclear_rois = {}
    unknown_rois = {}
    nuclear_centroids = {}
    status_tags = {}
    for ID, cell_roi in cell_rois.items():

        print(f'ID: {ID}')
        # although, 8 seems pyknotic
        # 45, and 41, are instances of multis
        ids = [7]
        ids = [id - 1 for id in ids]
        #if ID not in ids: continue

        cell_contours = cell_roi['contours']

        nuclear_contours = []
        unknown_contours = []
        nuclear_centroids_list = []
        status_tags_list = []

        for tp, cell_contour in enumerate(cell_contours):

            #if tp > 0: break

            # Acquire cellular slice and offset.
            cell_slice, cell_offset = get_slice_and_offset(cell_contour)

            # Use slice to acquire cell subimage.
            cell_subimg = cell_stack[tp][cell_slice]

            tmp = cell_contour - np.array([cell_offset[::-1]])
            cx, cy = extract_improved_cell_centroid(cell_subimg, tmp)

            cell_offset = cell_slice[0].start, cell_slice[1].start

            nuclear_centroids_list.append(np.array([[cx, cy]]) + np.array([cell_offset[::-1]]))

            nuc_subimg = nuclear_stack[tp][cell_slice]
            cell_mask = np.zeros_like(cell_subimg)
            tmp = cell_contour - np.array([cell_offset[::-1]])
            cv2.drawContours(cell_mask, [tmp], -1, 255, -1)
            nuc_subimg[cell_mask == 0] = 0

            # Remove everything outside of the cell.

            nuc_subimg[nuc_subimg < np.percentile(nuc_subimg, 10)] = 0
            nuc_subimg = rmp.subtract(nuc_subimg)
            nuc_subimg[nuc_subimg < np.percentile(nuc_subimg, 30)] = 0
            nuc_subimg = morphology.grey_opening(nuc_subimg, size=(3,3))

            DELTA = um_to_px(10) 
            mask = np.zeros_like(nuc_subimg)
            cv2.circle(mask, (cx, cy), DELTA, 255, -1)
            nuc_subimg[mask == 0] = 0

            median_filtered = skimage.filters.median(nuc_subimg)  
            # Median filtering leads to pixels with 0 value having much larger nonzero values. Ensure the pixels
            # with low values continue to be low.
            median_filtered[nuc_subimg <= np.percentile(nuc_subimg, 5)] = 0
            nuc_subimg = median_filtered

            def plt_center():
                plt.plot(cx, cy, 'r.')

            def fail_func(tag):
                nuclear_contours.append(np.empty(0))
                unknown_contours.append(np.empty(0))
                status_tags_list.append(tag)

            COMP = 5 
            # 1D GMM AREA

            # Fit only to the nonzero portion of the image to attain a sharper fit.
            gmm = BayesianGaussianMixture(n_components=COMP)
            nonzero_nuc_subimg = nuc_subimg[nuc_subimg != 0]

            if nonzero_nuc_subimg.size == 0:
                fail_func('insufficient')
                continue

            try: 
                gmm.fit(nonzero_nuc_subimg.reshape(-1, 1))
            except ValueError:
                fail_func('insufficient-2')
                continue

            #visualize('nuc_subimg', nuc_subimg)

            gpred = gmm.predict(nuc_subimg.reshape(-1, 1)).reshape(nuc_subimg.shape).astype(np.uint8)

            label_of_min = np.argmin(gmm.means_)

            

            # If the minimum's label is not zero, then switch it to zero.
            if label_of_min != 0:
                # Temporarily set zero-label to value guaranteed to exceed total Gaussian count.
                tmp = COMP + 1
                gpred[gpred == 0] = tmp 
                # Ensure the minimum label is zero.
                gpred[gpred == label_of_min] = 0
                # Ensure whatever label was zero is now set to the minimum label.
                gpred[gpred == tmp] = label_of_min
                # Switch the means to reflect the change.
                tmp = np.copy(gmm.means_[0])
                gmm.means_[0] = gmm.means_[label_of_min]
                gmm.means_[label_of_min] = tmp

            # Ensure whatever was background in the nuclear image remains as background. Without this line, it is possible
            # that GMM prediction, which was fed nonzero pixels only, would incorrectly classify background. This is necessary.
            gpred[nuc_subimg == 0] = 0

            #visualize(f'{ID+1}-T{tp+1}gpred', gpred, plt_center)

            # Extract contours and choose the one closest to the SCE.
            P = np.ones_like(nuc_subimg, dtype=np.uint8)
            P[gpred == 0] = 0

            _, contours, _ = cv2.findContours(P, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 
            compute_roi_centroid_dist = lambda contour: compute_dist(np.array([cx, cy]), contour)
            try: 
                nuclear_contour = min(contours, key=compute_roi_centroid_dist)

            except ValueError: 
                fail_func('unfound')
                continue

            #visualize('gpred', gpred)

            # Upon having found the closest contour, remove rest from image.
            Q = np.zeros_like(nuc_subimg, dtype=np.uint8)
            cv2.drawContours(Q, [nuclear_contour], -1, 255, -1)
            #visualize('Q', Q)
            gpred[Q == 0] = 0

            # Check for pyknotic.
            if gpred[cy, cx] == 0:
                fail_func('pyknotic')
                continue

            # Crop the image.
            x, y, w, h = cv2.boundingRect(nuclear_contour)

            max_y, max_x = gpred.shape
            crop_y, crop_x = max(0, y-1), max(0, x-1)
            cropped_gpred = gpred[crop_y:min(max_y, y+h+1), crop_x:min(max_x, x+w+1)]

            # Update cellular offset and SCE.
            cell_offset += np.array([crop_y, crop_x])
            cx -= crop_x
            cy -= crop_y

            #visualize('gpred', gpred)
            img, label_num = skimage.measure.label(cropped_gpred, return_num=True)

            if label_num == 1:
                pass
                #print(np.where(img == 1))
            img = img.astype(np.uint8)

            # Add one to label number because 0 is not included in count. 0 is background.
            label_num += 1

            def traverse_row_left_to_right(row, inward_links, incidence_counts):
                # Moving left-to-right, find each index where its right-neighbor differs from it.
                diffs = np.where(row[:-1] != row[1:])[0]
                # Make tuples of these labels and their right-neighbors; of the label that changes upon
                # right-traversal, and what it changes to.
                transitions = list(zip(row[diffs], row[diffs+1]))
                # The directed graph:
                labels_within = [0]
                for left, right in transitions:
                    incidence_counts[left, right] += 1
                    if right not in labels_within:
                        inward_links[left].add(right)
                        labels_within.append(right)

            def traverse_rows(img, inward_links, incidence_counts):
                for row in img:
                    traverse_row_left_to_right(row, inward_links, incidence_counts)
                    # Now reverse row and do same.
                    traverse_row_left_to_right(row[::-1], inward_links, incidence_counts)

            def build_graphs(img, label_num):
                inward_links = defaultdict(set)
                incidence_counts = np.zeros((label_num,) * 2)
                traverse_rows(img, inward_links, incidence_counts)
                # Transpose image and to do same for columns.
                traverse_rows(img.T, inward_links, incidence_counts)

                # Turn incidence counts into proportions.
                incidence_proportions = np.array([c / np.sum(c) for c in incidence_counts])
                return inward_links, incidence_proportions

            #visualize('img', img)
            inward_links, incidence_proportions = build_graphs(img, label_num)

            '''
            _labels = sorted(inward_links.keys())
            for l in _labels:
                print(f'{l}: {inward_links[l]}')

            for ix, row in enumerate(incidence_proportions):
                _row = [str(round(x, 2)) for x in row]
                print(f'{ix}: {_row}')
            '''

            # Determine which label is to be taken as the background interface.
            bg_interface_label = np.argmax(incidence_proportions[0])

            # Now will follow a series of rules that use the graph information to properly setup the subsequent portions.
            #visualize('img', img, plt_center)

            # Ensure that any ROI that is fully contained within another label is coalesced into that label.
            for label in range(label_num):
                # If the label has no inward links, then it is fully contained in another label.
                if label not in inward_links:
                    # Find which label it is contained inside.
                    for _label, links in inward_links.items():
                        if label in links:
                            #print(f'{label} not found. It is inside {_label}.')
                            img[img == label] = _label
                            break

            # Ensure everything that is not background or background interface is same label.
            # Multiplying by 2 here because: a) need label to be different than the background interface label,
            # and it helps visualization for the values to be separated.
            signal_label = bg_interface_label * 2 
                        
            img[np.logical_and(img != 0, img != bg_interface_label)] = signal_label

            # Find all signal ROIs. Keep one closest to SCE and eliminate rest.
            _img = np.copy(img)
            _img[img == bg_interface_label] = 0
            _, contours, _ = cv2.findContours(_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 
            compute_roi_centroid_dist = lambda contour: compute_dist(np.array([cx, cy]), contour)
            #visualize('img', _img)
            try: contour = min(contours, key=compute_roi_centroid_dist)
            except ValueError:
                fail_func('strange-1')
                continue

            # Eliminate any other signal ROIs. 
            mask = np.zeros_like(img)
            cv2.drawContours(mask, [contour], -1, 255, -1)
            img[np.logical_and(mask == 0, img != bg_interface_label)] = 0

            # Check that SCE is on signal. If not, strange. 
            sce_label = img[cy, cx]
            if sce_label in [0, bg_interface_label]:
                fail_func('strange-2')
                continue

            # Will now compute the centroid of the signal ROI to expand around for attaining the nuclear ROI estimate. 
            # To better estimate the centroid of the nuclear region, will erode the signal ROI. This will remove portions
            # of the ROI which may come from either other nuclei or other non-nuclear regions that make it non-circular. 
            # These portions would skew the centroid estimate.

            # Acquire the contour's bounding rectangle.
            x, y, w, h = cv2.boundingRect(contour)
            min_dim = min(w, h)

            # Will use a 3x3 structuring element to erode. To be conservative, only erode a quarter of the minimum dimension
            # of the bounding rectangle. Erosion will function on both sides of the ROI, so eroding half of the minimum dimension
            # on either side would make the ROI vanish. Eroding half of half will ensure the ROI persists. This is done somewhat
            # heuristically to improve results.

            n_iter = int(min_dim / 4)  # renamed to avoid shadowing the builtin iter()

            #visualize('mask', mask)
            mask = binary_erosion(mask, structure=np.array([[0,1,0],[1,1,1],[0,1,0]]), iterations=n_iter).astype(np.uint8)
            _, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 
            # Try to find a contour after erosion. If this fails, continue to use contour extracted before erosion.
            try: contour = contours[0]
            except IndexError: pass  # keep the contour extracted before erosion
            #visualize('postmask', mask)

            # Compute the centroid of the chosen signal ROI.
            my, mx =  map(int, [np.mean(contour[:, :, 1]), np.mean(contour[:, :, 0])])

            # Now begin drawing concentric circles centered at this centroid with increasing radius. 
            # When the circle overlaps background, either the background interface or otherwise, determine that
            # it has encompassed the full extent of the nucleus that will be measured.
            center = mx, my
            mask = np.zeros_like(img)
            radius = 0
            # 2 is currently used as the signal ROI label.
            non_signal_labels = np.empty(0)
            while non_signal_labels.size == 0:
                radius += 1
                cv2.circle(mask, center, radius, 255, -1)
                non_signal_labels = img[np.logical_and(mask != 0, img != signal_label)]

            # Subtract 1 as it just failed the loop condition.
            radius -= 1

            # After fitting a circle, try to expand into an ellipse.
            # First try one axis, then the other.
            radius1 = radius
            mask = np.zeros_like(img)
            non_signal_labels = np.empty(0)
            while non_signal_labels.size == 0:
                radius1 += 1
                cv2.ellipse(mask, center, (radius1, radius), 0, 0, 360, 255, -1)
                non_signal_labels = img[np.logical_and(mask != 0, img != signal_label)]

            # Subtract 1 as it just failed the loop condition.
            radius1 -= 1
            radius2 = radius
            mask = np.zeros_like(img)
            non_signal_labels = np.empty(0)
            while non_signal_labels.size == 0:
                radius2 += 1
                cv2.ellipse(mask, center, (radius1, radius2), 0, 0, 360, 255, -1)
                non_signal_labels = img[np.logical_and(mask != 0, img != signal_label)]

            # Subtract 1 as it just failed the loop condition.
            radius2 -= 1

            _, contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 
            nuclear_contour = contours[0]

            if nuclear_contour.size == 0:
                fail_func('unfound')
                continue

            # If SCE is not within nuclear contour, do not choose contour.
            if cv2.pointPolygonTest(nuclear_contour, (cx, cy), False) != 1:
                fail_func('external-SCE')
                continue

            def plt_m():
                plt.plot(mx, my, 'b.')


            # Now find all ROIs that are not background or interface. Choose one closest to SCE.
            #visualize('img', img, plt_m)

            # Now specify the indeterminate region. This will be entire contour, sans background.
            img[img != 0] = 255
            _, contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 
            unknown_contour = contours[0]

            #itercount = max(1, int(DELTA / 10))

            # Currently leaving out the containing own centroid portion.
            # If made it this far, believe that we've extracted the SCE containing contour.
            # NOTE: this could have failed. Additional steps would be needed to account for failure.
            # Will need to check for own centroid. 

            # If all contained are of either this label or whatever other label is found, then have found a region containing this one.
            #mask1 = np.zeros_like(gpred)
            #mask2 = np.zeros_like(gpred)

            # Draw contour recently found, dilate it, and re-extract.
            #cv2.drawContours(mask1, [contour], -1, 255, -1)
            #cv2.drawContours(mask2, [contour], -1, 255, -1)
            #mask2 = binary_dilation(mask2, structure=np.ones((3, 3))).astype(np.uint8)
            #xor_mask = np.logical_xor(mask1, mask2)

            # Visualize XOR
            #tmp = np.copy(gpred)
            #tmp[np.logical_not(xor_mask)] = 0
            #visualize('gpred', gpred)
            #visualize('xor', tmp)


            #S = np.copy(tmp)
            #visualize('pre', pred2)
            #S = binary_opening(S)
            #S = binary_dilation(S, iterations=itercount).astype(np.uint8)
            #S[cell_mask == 0] = 0
            #_, unknown_contour_estimates, _ = cv2.findContours(S, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 


            unknown_contours.append(unknown_contour + np.array([cell_offset[::-1]]))
            nuclear_contours.append(nuclear_contour + np.array([cell_offset[::-1]]))
            status_tags_list.append('')

        nuclear_rois[ID] = nuclear_contours
        unknown_rois[ID] = unknown_contours
        nuclear_centroids[ID] = nuclear_centroids_list
        status_tags[ID] = status_tags_list

    return nuclear_rois, unknown_rois, nuclear_centroids, status_tags
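The nuclear-segmentation step above boils down to fitting a 1-D Bayesian GMM to the nonzero pixel intensities and treating the lowest-mean component as background. A minimal, self-contained sketch of that idea on synthetic data (no OpenCV required); the image and component count are illustrative assumptions.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
# Synthetic image: dim background plus a bright 'nuclear' blob.
img = rng.normal(20, 5, size=(64, 64))
img[20:40, 20:40] += rng.normal(120, 10, size=(20, 20))
img = np.clip(img, 0, None)

# Fit only the nonzero intensities, as in the function above, for a sharper fit.
nonzero = img[img > 0].reshape(-1, 1)
gmm = BayesianGaussianMixture(n_components=3, random_state=0).fit(nonzero)

# The component with the smallest mean is taken to be background.
labels = gmm.predict(img.reshape(-1, 1)).reshape(img.shape)
background_label = np.argmin(gmm.means_)
mask = (labels != background_label).astype(np.uint8)
print(mask.sum(), "candidate signal pixels")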
Code example #12
 def _mean(data):  # aggregated data
     gm = BayesianGaussianMixture(n_components=2)
     gm.fit(data)
     return gm.means_
Code example #13
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs  # samples_generator was removed in newer scikit-learn
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture
from copy import deepcopy

aic = []
bic = []
n_com = 12

data, label = make_blobs(n_samples=300, n_features=2, centers=10)
gmm_without = GaussianMixture(n_components=4).fit_predict(data)
gmm_vbem = BayesianGaussianMixture(n_components=10).fit_predict(data)

plt.scatter(data[:, 0], data[:, 1], c=label)
plt.show()

low = 99999
for n in range(1, n_com + 1):
    gmm = GaussianMixture(n_components=n)
    gmm.fit(data)
    aic.append(gmm.aic(data))
    if aic[-1] < low:
        low = aic[-1]
        model = deepcopy(gmm)

low2 = 99999
for n in range(1, n_com + 1):
    gmm = GaussianMixture(n_components=n)
    gmm.fit(data)
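    # --- hypothetical continuation (not in the original snippet, which is cut off here) ---
    # By analogy with the AIC loop above, this second loop presumably tracked BIC:
    bic.append(gmm.bic(data))
    if bic[-1] < low2:
        low2 = bic[-1]
        model_bic = deepcopy(gmm)  # 'model_bic' is an assumed name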
Code example #14
File: pyxelate.py  Project: EgorVoron/pixelate-gpu
class Pyxelate:

    CONVOLUTIONS = np.array(
        [[[2, 2], [2, 2]], [[11, -1], [-1, -1]], [[-1, 11], [-1, -1]],
         [[-1, -1], [11, -1]], [[-1, -1], [-1, 11]], [[5, 5], [-1, -1]],
         [[-1, -1], [5, 5]], [[5, -1], [5, -1]], [[-1, 5], [-1, 5]],
         [[5, -1], [-1, 5]], [[-1, 5], [5, -1]], [[-1, 3], [3, 3]],
         [[3, -1], [3, 3]], [[3, 3], [-1, 3]], [[3, 3], [3, -1]]],
        dtype="int")

    SOLUTIONS = np.array([
        [[1, 1], [1, 1]],
        [[0, 1], [1, 1]],
        [[1, 0], [1, 1]],
        [[1, 1], [0, 1]],
        [[1, 1], [1, 0]],
        [[1, 1], [0, 0]],
        [[0, 0], [1, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [1, 0]],
        [[0, 1], [0, 1]],
        [[1, 0], [0, 0]],
        [[0, 1], [0, 0]],
        [[0, 0], [1, 0]],
        [[0, 0], [0, 1]],
    ],
                         dtype="bool")

    ITER = 2

    def __init__(self,
                 height,
                 width,
                 color=8,
                 dither=True,
                 alpha=.6,
                 regenerate_palette=True,
                 keyframe=.6,
                 sensitivity=.07,
                 random_state=0):
        """Create instance for generating similar pixel arts."""
        self.height = int(height)
        self.width = int(width)
        if self.width < 1 or self.height < 1:
            raise ValueError("Result can not be smaller than 1x1 pixels.")
        self.color = int(color)
        if self.color < 2:
            raise ValueError("The minimum number of colors is 2.")
        elif self.color > 32:
            raise ValueError("The maximum number of colors is 32.")
        if dither:
            self.dither = 1 / (self.color + 1)
        else:
            self.dither = 0.
        self.alpha = float(alpha)  # threshold for opacity
        self.regenerate_palette = bool(regenerate_palette)
        self.keyframe = keyframe  # threshold for differences between keyframes
        self.sensitivity = sensitivity  # threshold for differences between parts of keyframes

        # BGM
        self.is_fitted = False
        self.random_state = int(random_state)
        self.model = BayesianGaussianMixture(
            n_components=self.color,
            max_iter=256,
            covariance_type="tied",
            weight_concentration_prior_type="dirichlet_distribution",
            mean_precision_prior=1. / 256.,
            warm_start=False,
            random_state=self.random_state)

    def convert(self, image):
        """Generate pixel art from image"""
        return self._convert(image, False, False)

    def _convert(self, image, override_adapthist=False, override_dither=False):
        """Generate pixel art from image or sequence of images"""
        # does the image have alpha channel?
        if self._is_transparent(image):
            # remove artifacts from transparent edges
            image = self._dilate(image)
            # create alpha mask
            mask = resize(image[:, :, 3], (self.height, self.width),
                          anti_aliasing=True)
            # mask for colors
            color_mask = resize(image[:, :, 3], (32, 32),
                                anti_aliasing=False).ravel()
        else:
            mask = None
            color_mask = None

        # apply adaptive contrast
        if not override_adapthist:
            image = self._fix_hist(image)
        # create sample for finding palette
        if self.regenerate_palette or not self.is_fitted:
            examples = resize(image[:, :, :3], (32, 32),
                              anti_aliasing=False).reshape(-1, 3).astype("int")
            if color_mask is not None:
                # transparent colors should be ignored
                examples = examples[color_mask >= self.alpha]
            self._fit_model(examples)

        # resize image to 4 times the desired width and height
        image = resize(
            image[:, :, :3],
            (self.height * self.ITER * 2, self.width * self.ITER * 2),
            anti_aliasing=True)
        # generate pixelated image with desired width / height
        t = time()
        image = self._reduce(image)
        print('SS:', time() - t)

        # apply palette
        height, width, depth = image.shape
        reshaped = np.reshape(image, (height * width, depth))
        probs = self.model.predict_proba(reshaped)
        y = np.argmax(probs, axis=1)

        # increase hue and snap color values to multiples of 8
        palette = rgb2hsv(self.model.means_.reshape(-1, 1, 3))
        palette[:, :, 1] *= 1.14  # empirical magic number
        palette = hsv2rgb(palette).reshape(self.color, 3) // 8 * 8
        palette[palette ==
                248] = 255  # clamping // 8 * 8 would rarely allow 255 values

        # generate recolored image
        image = palette[y]

        # apply dither over threshold if it's not zero
        if not override_dither and self.dither:
            # get second best probability by removing the best one
            probs[np.arange(len(y)), y] = 0
            # get new best and values
            v = np.max(probs, axis=1) > self.dither
            y = np.argmax(probs, axis=1)

            # replace every second pixel with second best color
            pad = not bool(width % 2)
            if pad:
                # make sure to alternate between starting positions
                # bottleneck
                for i in range(0, len(image), 2):
                    i += (i // width) % 2
                    if v[i]:
                        image[i] = palette[y[i]]
            else:
                i = np.argwhere(v[::2]) * 2
                image[i] = palette[y[i]]

        image = np.reshape(image, (height, width, depth))
        if mask is not None:
            # use transparency from original image, but make it either 0 or 255
            mask[mask >= self.alpha] = 255
            mask[mask < self.alpha] = 0
            image = np.dstack(
                (image, mask))  # re-attach the alpha channel lost during processing

        return np.clip(image.astype("int"), 0, 255).astype("uint8")

    def convert_sequence(self, images):
        """Generates sequence of pixel arts from a list of images"""
        try:
            _ = np.array(images, dtype=float)
        except ValueError:
            # image sizes are different == setting an array element with a sequence
            raise ValueError("Shape of images in list are different.")

        # apply adaptive histogram on each
        images = [self._fix_hist(image) for image in images]

        transparent = self._is_transparent(images[0])
        keyframe_limit = self.keyframe * np.prod(images[0].shape) * 255.
        sensitivity_limit = self.sensitivity * 255.
        diff_images, key_frames = [], []

        # create new images that are just the differences between sequences
        for image in images:
            # add first image
            if diff_images:
                diff = np.abs(image[:, :, :3] - diff_images[-1][:, :, :3])
                # image is not too different, from previous one, create mask
                if np.sum(diff) < keyframe_limit:
                    diff = resize(np.mean(diff, axis=2),
                                  (self.height, self.width),
                                  anti_aliasing=True)
                    over, under = diff > sensitivity_limit, diff <= sensitivity_limit
                    diff[over], diff[under] = 255, 0.
                    diff = resize(diff, (image.shape[0], image.shape[1]),
                                  anti_aliasing=False)
                    # was the image already transparent?
                    if transparent:
                        image[:, :, 3] = diff
                    else:
                        image = np.dstack((image, diff))
                    key_frames.append(False)
                else:
                    key_frames.append(True)
            else:
                key_frames.append(True)
            # add transparency layer for keyframes also, for easier broadcasting
            if not self._is_transparent(image):
                image = np.dstack(
                    (image, np.ones((image.shape[0], image.shape[1]))))
            diff_images.append(image)

        # create a palette from all images if possible
        if self.regenerate_palette:
            warnings.warn(
                "using regenerate_palette=True will result in flickering, as the palette will be regenerated for each image!",
                Warning)
        else:
            self._palette_from_list(diff_images)

        # merge keyframes and differences
        last = None
        for image, key in zip(diff_images, key_frames):
            current = self._convert(image, True,
                                    not key)  # pyxelate keyframe / change (~key on a bool is always truthy)
            if last is None:
                last = current
            else:
                # merge differences to previous images
                mask = ~np.logical_xor(last[:, :, 3], current[:, :, 3])
                last[mask] = current[mask]
            # generator
            yield last.copy()

    def _palette_from_list(self, images):
        """Fit model to find palette using all images in list at once"""
        transparency = self._is_transparent(images[0])
        examples = []
        color_masks = []

        # sample from all images
        for image in images:
            examples.append(
                resize(image[:, :, :3], (16, 16),
                       anti_aliasing=False).reshape(-1, 3).astype("int"))
            if transparency:
                color_masks.append(
                    resize(images[0][:, :, 3], (16, 16), anti_aliasing=False))

        # concatenate to a single matrix
        examples = np.concatenate(examples)
        if transparency:
            # transparent colors should be ignored
            color_masks = np.concatenate(color_masks).ravel()
            examples = examples[color_masks >= self.alpha]
        self._fit_model(examples)

    def _fit_model(self, X):
        """Fit model while suppressing warnings from sklearn"""
        converge = True
        with warnings.catch_warnings(record=True) as w:
            # fit model
            self.model.fit(X)
            if w and w[-1].category == ConvergenceWarning:
                warnings.filterwarnings('ignore', category=ConvergenceWarning)
                converge = False
        if not converge:
            warnings.warn(
                "the model has failed to converge, try a different number of colors for better results!",
                Warning)
        self.is_fitted = True

    def _reduce(self, image):
        """Apply convolutions on image ITER times and generate a smaller image
		based on the highest magnitude of gradients"""

        # self is visible to decorated function
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            # apply median filter for noise reduction
            dim = median(dim, square(4))
            for n in range(self.ITER):
                h, w = dim.shape
                h, w = h // 2, w // 2
                tt = time()
                flatten = view_as_blocks(dim, (2, 2)).reshape(-1, 2, 2)
                print('flatten:', time() - tt)
                # bottleneck
                tt = time()
                # slow one upper
                new_image = np.fromiter(
                    (self._reduce_conv(f) for f in flatten),
                    flatten.dtype).reshape((h, w))
                print('bottleneck:', time() - tt)
                if n < self.ITER - 1:
                    dim = new_image.copy()
            return new_image

        return _wrapper(image)

    def _reduce_conv(self, f):
        # slow one lower
        """The actual function that selects the right pixels based on the gradients  2x2 square"""
        return np.mean(f[self.SOLUTIONS[np.argmax(
            np.sum(np.multiply(self.CONVOLUTIONS, f.reshape(-1, 2,
                                                            2)).reshape(-1, 4),
                   axis=1))]])

    def _dilate(self, image):
        """Dilate semi-transparent edges to remove artifacts
		(unwanted edges, caused by transparent pixels having different colors)"""
        @adapt_rgb(each_channel)
        def _wrapper(dim):
            return dilation(dim, selem=square(4))

        # use dilated pixels for semi-transparent ones
        mask = image[:, :, 3]
        alter = _wrapper(image[:, :, :3])
        image[:, :, :3][mask < self.alpha] = alter[mask < self.alpha]
        return image

    @staticmethod
    def _fix_hist(image):
        """Apply adaptive histogram"""
        image = equalize_adapthist(
            image) * 255 * 1.14  # empirical magic number
        image[image <= 8.] = 0.
        return image

    @staticmethod
    def _is_transparent(image):
        """Returns True if there is an additional dimension for transparency"""
        return bool(image.shape[2] == 4)
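A minimal usage sketch for the class above (not from the original project); skimage.io and the file names are illustrative assumptions.

from skimage import io

image = io.imread("input.png")                        # H x W x 3 (or x 4) uint8 array
pyx = Pyxelate(height=64, width=64, color=8, dither=True)
pixel_art = pyx.convert(image)
io.imsave("output.png", pixel_art)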
Code example #15
def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []
    
    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma = 'auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator = RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    
    test_accuracy= []
    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test,y_pred) *100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception:
            print("Exception occurred:", name)
    return metrix,test_accuracy,names
Code example #16
File: process_data.py  Project: WittmannF/NILM_LP
## Test BGM for one appliance
appl = 'WHE'

# Create vector with P and Q values and plot them
P = d[appl].P[init:end].values
Q = d[appl].Q[init:end].values
X = np.transpose([P, Q])

plt.plot(d[appl].P[init:end], d[appl].Q[init:end],'o', alpha=0.1)

# Normalize X
sscl = StandardScaler().fit(X)
X = sscl.transform(X)

# Apply clusterer
bgm = BayesianGaussianMixture(n_components=33, covariance_type='full', weight_concentration_prior_type='dirichlet_distribution', random_state=42).fit(X)
y_pred = bgm.predict(X)

# Plot clusters with X unnormalized
X = sscl.inverse_transform(X)
plt.figure()
plt.scatter(X[:,0],X[:,1], color=colors[y_pred])
means = sscl.inverse_transform(bgm.means_)
medians = get_medians(X, y_pred)
# plt.plot(means[:,0],means[:,1],'kx')
plt.plot(medians[:,0],medians[:,1],'kx')


# TODO: Compare mean with ground truth
plt.figure()
plt.plot(P)
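The script relies on a get_medians helper (and a colors array) that are not shown. A minimal sketch of what get_medians might look like, assuming it returns the per-cluster median of the unnormalized P/Q points:

import numpy as np

def get_medians(X, y_pred):
    # Hypothetical helper: median point of each predicted cluster.
    return np.array([np.median(X[y_pred == label], axis=0)
                     for label in np.unique(y_pred)])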
Code example #17
    def em_stereo(self,n_component=1,dp=True,thresh_hold=0.4):
        self.num_params = 0
        #The range of len(params)
        _step = 0
        for var_idx in tqdm(range(len(self.merge_var[0]))):

            for x_v in range(len(self.merge_var[0][var_idx])):
                print('Step %d'%_step,end='\r')
                _step += 1
                try:
                    
                    for y_v in range(len(self.merge_var[0][var_idx][x_v])):
                        #print('cluster weights ....%d'%var_idx)
                        dist = []
                        for task_idx in range(len(self.merge_var)):
                            nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v][y_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v][y_v])),200)
                            dist.append(nor)
                        
                        dist = np.array(np.asmatrix(np.concatenate(dist)).T)
                        if dp:
                            print('Initializing DPGMM%d ... '%_step,end='\r')
                            gmm = DPGMM( max_iter=1000,  n_components=n_component, covariance_type='spherical')
                        else:
                            gmm = GMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                        gmm.fit(dist)
                        new_idx_list = []
                        for task_idx in range(len(self.merge_var)):
                            #if dp:
                            #Strategy 1. Set threshold
                            predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1))
                            f_ = True
                            
                            while f_:
                                #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)):
                                if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                                    new_idx = np.argmax(predict_probability)
                                    f_ = False
                                else:
                                    predict_probability[0][np.argmax(predict_probability)] = 0.0
                                    self.num_params += 1
                            

                        #else:
                        #    new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1))
                        #    if new_idx in new_idx_list:
                                self.num_params += 1
                            new_idx_list.append(new_idx)
                            self.merge_var[task_idx][var_idx][x_v][y_v] = gmm.means_[new_idx]
                            self.merge_uncertainty[task_idx][var_idx][x_v][y_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)


                except TypeError:
                    dist = []
                    
                    
                    for task_idx in range(len(self.merge_var)):
                        nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v])),200)
                        dist.append(nor)
                    dist = np.array(np.asmatrix(np.concatenate(dist)).T)
                    if dp:
                        print('Initializing DPGMM%d ... '%_step,end='\r')
                        gmm = DPGMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                    else:
                        gmm = GMM( max_iter=200,  n_components=n_component, covariance_type='spherical')
                    gmm.fit(dist)
                    new_idx_list = []
                    for task_idx in range(len(self.merge_var)):
                        #if dp:
                        #Strategy 1. Set threshold
                        predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1))
                        f_ = True
                        while f_:
                            #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)):
                            if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                                new_idx = np.argmax(predict_probability)
                                f_ = False
                            else:
                                predict_probability[0][np.argmax(predict_probability)] = 0.0
                                self.num_params += 1

                    #else:
                    #    new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1))
                    #    if new_idx in new_idx_list:
                    #        self.num_params += 1
                        new_idx_list.append(new_idx)
                        self.merge_var[task_idx][var_idx][x_v] = gmm.means_[new_idx]
                        self.merge_uncertainty[task_idx][var_idx][x_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)
Code example #18
 def _make_bgmm(self, label, X, Y):
     indices = np.where(Y == label)
     bgm = BayesianGaussianMixture(n_components=3, max_iter=200, tol=1e-3)
     bgm.fit(X[indices])
     return bgm
Code example #19
from sklearn.mixture import GaussianMixture
"""
    GMM 高斯混合模型
        n_components    高斯分布的个数
        covariance_type 各个高斯分布的方差关系,
                        full -> 各个分布之间没有关系
"""
data = []
g = GaussianMixture(n_components=2,
                    covariance_type='full',
                    tol=1e-6,
                    max_iter=1000)
g.fit(data)

print('类别概率:\t', g.weights_[0])
print('均值:\n', g.means_, '\n')
print('方差:\n', g.covariances_, '\n')

# A DPGMM (BayesianGaussianMixture with a Dirichlet-process prior) can adjust the effective n_components automatically
from sklearn.mixture import BayesianGaussianMixture
dpgmm = BayesianGaussianMixture(
    n_components=3,
    covariance_type='full',
    max_iter=1000,
    n_init=5,
    weight_concentration_prior_type='dirichlet_process',
    weight_concentration_prior=10)
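Because the Dirichlet-process prior shrinks the weights of unneeded components toward zero, the effective number of clusters can be read off dpgmm.weights_ after fitting. A small sketch, using placeholder data (the original snippet does not show the data being fitted):

import numpy as np

X = np.random.RandomState(0).randn(500, 2)  # placeholder data for illustration
dpgmm.fit(X)

# Components whose mixing weight stays above a small threshold are the ones the
# model actually uses; the rest are effectively pruned by the prior.
active = np.sum(dpgmm.weights_ > 1e-2)
print(dpgmm.weights_.round(3), '->', active, 'active components')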
Code example #20
from sklearn.mixture import BayesianGaussianMixture
from sklearn.linear_model import LogisticRegression

dataset = loaddataset(train_file)
testset = loaddataset(test_file)

names = [
    "Nearest Neighbors",
    "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
]

ab = AdaBoostClassifier(random_state=1)
bgm = BayesianGaussianMixture(random_state=1)
dt = DecisionTreeClassifier(random_state=1)
gb = GradientBoostingClassifier(random_state=1)
lr = LogisticRegression(random_state=1)
rf = RandomForestClassifier(random_state=1)

classifiers = [
    KNeighborsClassifier(3),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    RandomForestClassifier(random_state=1),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

svcl = LinearSVC(random_state=1)
svcg = SVC(random_state=1)
Code example #21
File: 18.5.DPGMM.py  Project: zscFirefly/ML_From_xx
                    alpha=0.5,
                    clip_box=ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()

    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title("GMM", fontsize=18)
    plt.grid(True)

    #DPGMM
    dpgmm = BayesianGaussianMixture(
        n_components=n_compenents,
        covariance_type="full",
        max_iter=100,
        n_init=5,
        weight_concentration_prior_type="dirichlet_process",
        weight_concentration_prior=100)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print("DPGMM均值=:\n", centers)
    print("DPGMM方差=:\n", covs)
    y_hat = dpgmm.predict(x)

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], c=y, s=30, cmap=cm, marker='o')
Code example #22
    for k, e in enumerate(ells):
        a.add_artist(e)
        e.set_facecolor((1 - w[k], 1 - w[k], 1 - w[k]))

    a.set_xlim(np.min(x1) - np.min(x1) / 10, np.max(x1) + np.min(x1) / 10)
    a.set_ylim(np.min(x2) - np.min(x2) / 10, np.max(x2) + np.min(x2) / 10)
    plt.scatter(x1, x2)
    plt.savefig('plot_{}.png'.format(iteration))
    plt.show()
    plt.close()

estimators = [
    ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$",
     BayesianGaussianMixture(
        weight_concentration_prior_type="dirichlet_process",
        n_components=2 * 5, reg_covar=0, init_params='random',
        max_iter=10, mean_precision_prior=.8,
        random_state=3), [1], 10),
    ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$",
     BayesianGaussianMixture(
         weight_concentration_prior_type="dirichlet_process",
         n_components=2 * 5, reg_covar=0, init_params='random',
         max_iter=15, mean_precision_prior=.8,
         random_state=3), [1], 15),
    ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$",
         BayesianGaussianMixture(
            weight_concentration_prior_type="dirichlet_process",
            n_components=2 * 5, reg_covar=0, init_params='random',
            max_iter=20, mean_precision_prior=.8,
            random_state=3), [1], 20),
    ("Infinite mixture with a Dirichlet process\n prior and" r"$\gamma_0=$",
コード例 #23
0
x_train_unlabeled = train_unlabeled

#Switch to numpy
# Preprocessing X
x_train = []
x_train_labeled = np.array(x_train_labeled)
x_train_unlabeled = np.array(x_train_unlabeled)
x_train.extend(x_train_labeled)
x_train.extend(x_train_unlabeled)
x_test = np.array(test)

# Preprocessing y
y_train_labeled = np.array(y_train_labeled)
ones = -1 * np.ones(21000)
ones = np.array(ones)
y_train = np.concatenate((y_train_labeled, ones)).astype(int)

# # GMM
# GMM = BayesianGaussianMixture(n_components=10, random_state=0)
# GMM.fit(x_train, y_train)
# y_pred = GMM.predict(x_test)

# Bayesian GMM
BGMM = BayesianGaussianMixture(n_components=10, random_state=0)
BGMM.fit(x_train, y_train)
y_pred = BGMM.predict(x_test)

# output results
d = {'Id': test.index, 'y': y_pred}
output = pd.DataFrame(d)
output.to_csv('output5.csv', index=False)
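
# Note (assumption): BayesianGaussianMixture is unsupervised, so the y_train
# passed to fit() above is ignored. A sketch of mapping cluster ids to class
# labels by majority vote over the labeled training data (assuming the class
# labels are non-negative integers):
cluster_ids = BGMM.predict(x_train_labeled)
cluster_to_label = {
    c: np.bincount(y_train_labeled[cluster_ids == c].astype(int)).argmax()
    for c in np.unique(cluster_ids)
}
y_pred_labels = np.array([cluster_to_label.get(c, -1) for c in BGMM.predict(x_test)])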
コード例 #24
0
def test_compare_covar_type():
    # We can compare the 'full' precision with the other cov_type if we apply
    # 1 iter of the M-step (done during _initialize_parameters).
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    X = rand_data.X['full']
    n_components = rand_data.n_components

    for prior_type in PRIOR_TYPE:
        # Computation of the full_covariance
        bgmm = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type='full',
            max_iter=1,
            random_state=0,
            tol=1e-7)
        bgmm._check_initial_parameters(X)
        bgmm._initialize_parameters(X, np.random.RandomState(0))
        full_covariances = (
            bgmm.covariances_ *
            bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis])

        # Check tied_covariance = mean(full_covariances, 0)
        bgmm = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type='tied',
            max_iter=1,
            random_state=0,
            tol=1e-7)
        bgmm._check_initial_parameters(X)
        bgmm._initialize_parameters(X, np.random.RandomState(0))

        tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_
        assert_almost_equal(tied_covariance, np.mean(full_covariances, 0))

        # Check diag_covariance = diag(full_covariances)
        bgmm = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type='diag',
            max_iter=1,
            random_state=0,
            tol=1e-7)
        bgmm._check_initial_parameters(X)
        bgmm._initialize_parameters(X, np.random.RandomState(0))

        diag_covariances = (bgmm.covariances_ *
                            bgmm.degrees_of_freedom_[:, np.newaxis])
        assert_almost_equal(
            diag_covariances,
            np.array([np.diag(cov) for cov in full_covariances]))

        # Check spherical_covariance = np.mean(diag_covariances, 0)
        bgmm = BayesianGaussianMixture(
            weight_concentration_prior_type=prior_type,
            n_components=2 * n_components,
            covariance_type='spherical',
            max_iter=1,
            random_state=0,
            tol=1e-7)
        bgmm._check_initial_parameters(X)
        bgmm._initialize_parameters(X, np.random.RandomState(0))

        spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_
        assert_almost_equal(spherical_covariances,
                            np.mean(diag_covariances, 1))
コード例 #25
0
# Plot the latent space

latent_vectors = chain_call(m1.latent_sample, X_test, 1000)

#%%
# sanity check: cluster the latent space with an sklearn Bayesian Gaussian mixture
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from sklearn.mixture import BayesianGaussianMixture

pca = PCA(2)
# pca = TSNE(2)
X_pca = pca.fit_transform(latent_vectors)
kmeans = BayesianGaussianMixture(10, tol=1e-6, max_iter=1000)
pred = kmeans.fit_predict(X_pca)
print(purity_score(y_test, pred))

#%%
df_latent = pd.DataFrame({
    "x1": X_pca[:, 0],
    "x2": X_pca[:, 1],
    "cat": ["pred_{}".format(i) for i in y_test],
    "kmeans": ["pred_{}".format(i) for i in pred],
})
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df_latent, x="x1", y="x2", hue="cat")

plt.figure(figsize=(10, 10))
sns.scatterplot(data=df_latent, x="x1", y="x2", hue="kmeans")
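
# purity_score is not defined in this snippet; a common implementation (an
# assumption about what the author's helper computes) assigns each cluster its
# most frequent true label:
import numpy as np
from sklearn.metrics import confusion_matrix

def purity_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    return np.sum(np.amax(cm, axis=0)) / np.sum(cm)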
コード例 #26
0
 def gets_best_model(self, X, target):
     best_classifiers = []
     outer_cv = StratifiedKFold(n_splits=self.num_folds,
                                shuffle=True,
                                random_state=1)
     model_factory = [
         AdaBoostClassifier(),
         BaggingClassifier(),
         BayesianGaussianMixture(),
         BernoulliNB(),
         CalibratedClassifierCV(),
         CatBoostClassifier(verbose=False),
         DecisionTreeClassifier(),
         ExtraTreesClassifier(),
         GaussianMixture(),
         GaussianNB(),
         GradientBoostingClassifier(),
         KNeighborsClassifier(),
         LinearDiscriminantAnalysis(),
         LogisticRegression(max_iter=1000),
         LogisticRegressionCV(max_iter=1000),
         MLPClassifier(),
         QuadraticDiscriminantAnalysis(),
         RandomForestClassifier(),
         SGDClassifier()
     ]
     logging.basicConfig(filename="ml_dl_toolbox_logfilename.log",
                         level=logging.INFO,
                         format='%(asctime)s %(levelname)-8s %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
     scoring = ('accuracy', 'neg_mean_squared_error')
     try:
         for el in model_factory:
             el.seed = self.seed
             scores = cross_validate(el,
                                     X.drop(target, axis=1),
                                     X[target],
                                     cv=outer_cv,
                                     n_jobs=-1,
                                     scoring=scoring)
             scores = abs(
                 np.sqrt(
                     np.mean(scores['test_neg_mean_squared_error']) *
                     -1)) / np.mean(scores['test_accuracy'])
             score_description = [
                 el, '{el}'.format(el=el.__class__.__name__),
                 "%0.5f" % scores
             ]
             best_classifiers.append(score_description)
             best_model = pd.DataFrame(
                 best_classifiers,
                 columns=["Algorithm", "Model",
                          "RMSE/Accuracy"]).sort_values("RMSE/Accuracy",
                                                        axis=0,
                                                        ascending=True)
             best_model = best_model.reset_index()
     except OSError:
         logging.error('Check data structure')
     else:
         logging.info('Best fitting algorithm: ' + best_model["Model"][0] +
                      " RMSE/Accuracy: " + best_model["RMSE/Accuracy"][0])
         return best_model["Algorithm"][0]
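
# Note (assumption): BayesianGaussianMixture and GaussianMixture are unsupervised
# density models, so their cross-validated "accuracy" against `target` is not a
# meaningful classification score. A sketch for filtering the factory down to
# true classifiers before the loop above:
from sklearn.base import is_classifier

def keep_classifiers(model_factory):
    return [m for m in model_factory if is_classifier(m)]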
コード例 #27
0
ファイル: vbgmm_sklearn.py プロジェクト: ngoix/cyg-x1
np.random.seed(1)

X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]

scaler = StandardScaler()
X = scaler.fit_transform(X)

# 1 corresponds to data_thr.rate and 4=5-1 to data_thr.rateC
w = w / np.sqrt(scaler.var_[1:])
# w = np.exp(-np.exp(3 * w.mean(axis=1)))
w = 1. / w.mean(axis=1) ** 2

Html_file = open("gmm_sklearn_files/gmm3_sklearn.html", "w")
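# Note: alpha_prior / beta_prior in the constructor below come from a pre-release
# sklearn API; in released versions of sklearn the closest equivalents appear to
# be weight_concentration_prior and mean_precision_prior.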

gmm = BayesianGaussianMixture(n_components=3, alpha_prior=0.1, beta_prior=1,
                              n_init=5)
gmm.fit(X)  # , weights=w) not implemented in sklearn yet
preds = gmm.predict(X)
probs = gmm.predict_proba(X)

data_thr['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds))+1]

covs = gmm.covariances_
means = gmm.means_

# transform cov for non-standardizeed data:
covs = np.array([np.dot(np.diag(np.sqrt(scaler.var_)),
コード例 #28
0
def main_process(img_np: np.ndarray, ccs: List[np.ndarray],
                 text_lines: List[Tuple[int, int, int,
                                        int]], cc2textline_assignment):
    if len(ccs) == 0:
        return
    final_mask = np.zeros_like(ccs[0])
    dpgmm = BayesianGaussianMixture(n_components=5, covariance_type='diag')
    kern = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    text_line_colors = defaultdict(list)
    #print(cc2textline_assignment)
    for i, cc in enumerate(tqdm(ccs)):
        if np.sum(cv2.bitwise_and(cc, final_mask)) > 0:
            final_mask = cv2.bitwise_or(final_mask, cc)
            continue
        pixels = img_np[cv2.erode(cc, kern) > 127]  #img_np[cc > 127]#
        if len(pixels) < 5:
            final_mask = cv2.bitwise_or(final_mask, cc)
            continue
        cls1 = dpgmm.fit(pixels)
        # print(cls1.means_)
        # print(cls1.weights_)
        # print(np.sqrt(cls1.covariances_))
        cls1_top2_mean, cls1_top2_stddev, cls1_k = find_top_k_dpgmm(cls1, 2)
        # cls1_top2_mean[0][0] = 4
        # cls1_top2_mean[0][1] = 4
        # cls1_top2_mean[0][2] = 4
        # cls1_top2_mean[1][0] = 253
        # cls1_top2_mean[1][1] = 253
        # cls1_top2_mean[1][2] = 253
        # cls1_top2_stddev[0][0] = 5
        # cls1_top2_stddev[0][1] = 5
        # cls1_top2_stddev[0][2] = 5
        # cls1_top2_stddev[1][0] = 5
        # cls1_top2_stddev[1][1] = 5
        # cls1_top2_stddev[1][2] = 5
        # if cls1_top2_mean[0][0] > 100 :
        # 	breakpoint()
        cls1_top2_stddev_ext = np.round(cls1_top2_stddev * COLOR_RANGE_SIGMA)
        # if i == 2 :
        # 	breakpoint()
        top1_mask = extend_cc_region(img_np, cc, cls1_top2_mean[0],
                                     cls1_top2_stddev_ext[0])
        top2_mask = extend_cc_region(img_np, cc, cls1_top2_mean[1],
                                     cls1_top2_stddev_ext[1])
        # area1 = int(top1_mask.sum())
        # area2 = int(top2_mask.sum())
        # if area1 == 0 or area2 == 0 :
        # 	breakpoint()
        # 	continue
        # area_cc = int(cc.sum())
        # if abs(area1 - area_cc) < abs(area2 - area_cc) :
        # 	D = top1_mask
        # 	selected_idx = 0
        # else :
        # 	D = top2_mask
        # 	selected_idx = 1
        # intersect_area1 = int(cv2.bitwise_and(cc, top1_mask).sum())
        # intersect_area2 = int(cv2.bitwise_and(cc, top2_mask).sum())
        iou1 = cv2.bitwise_and(cc, top1_mask).sum() / cv2.bitwise_or(
            cc, top1_mask).sum()
        iou2 = cv2.bitwise_and(cc, top2_mask).sum() / cv2.bitwise_or(
            cc, top2_mask).sum()
        if iou1 > iou2:
            D = top1_mask
            selected_idx = 0
            if iou1 < 1e-1:
                #print(iou1)
                D = cc
                selected_idx = -1
        else:
            D = top2_mask
            selected_idx = 1
            if iou2 < 1e-1:
                #print(iou2)
                D = cc
                selected_idx = -1
        # print(selected_idx, iou1, iou2)
        # save_rgb('text_mask_utils_tmp.png', cc)
        # save_rgb('text_mask_utils_tmp_color.png', img_np*(cc>0)[:,:,None])
        # save_rgb('text_mask_utils_tmp_top2_mask.png', top2_mask)
        # save_rgb('text_mask_utils_tmp_top1_mask.png', top1_mask)
        # input('x')
        # if cc2textline_assignment[i] == 12 :
        # 	breakpoint()

        D = cv2.bitwise_or(cc, D)
        D = cv2.dilate(D, kern)
        # if cls1_top2_mean[selected_idx][0] < 100 :
        # 	breakpoint()
        final_mask = cv2.bitwise_or(final_mask, D)
        # seed_point_candidates_mask = cv2.inRange(cc_region, cls1_top2_mean[0] - cls1_top2_stddev_ext[0], cls1_top2_mean[0] + cls1_top2_stddev_ext[0])
        # seed_point_candidates_y, seed_point_candidates_x = np.where(seed_point_candidates_mask > 127)
        # seed_point = (seed_point_candidates_x[0] + x, seed_point_candidates_y[0] + y)
        # D = np.zeros((cc.shape[0] + 2, cc.shape[1] + 2), dtype = np.uint8)
        # cv2.floodFill(img_np, D, seed_point, (255), cls1_top2_stddev_ext[0].tolist(), cls1_top2_stddev_ext[0].tolist(), cv2.FLOODFILL_MASK_ONLY)
        # D = D[1: -1, 1: -1] * 255
        # final_mask = cv2.bitwise_or(final_mask, D)

        # now we find text color
        if selected_idx == -1:
            continue  # skip
        text_color_value = cls1_top2_mean[selected_idx]
        text_color_stddev = cls1_top2_stddev[selected_idx]
        #print('color=', text_color_value, text_color_stddev)
        text_line_colors[cc2textline_assignment[i]].append(text_color_value)

    textline_img = np.copy(img_np)

    def get_textline_color(j, visited_text_lines: List[int]):
        visited_text_lines.append(j)
        (x, y, w, h) = text_lines[j]
        colors = text_line_colors[j]
        if not colors:
            #print(f'[!!!!!!!!!!!!] Textline {j} has no color assigned, searching for closest')
            min_dist = 10000000000000000000
            min_dist_k = -1
            for k, (x2, y2, w2, h2) in enumerate(text_lines):
                d = rect_distance(x, y, x + w, y + h, x2, y2, x2 + w2, y2 + h2)
                if d < min_dist and k not in visited_text_lines:
                    min_dist = d
                    min_dist_k = k
            if min_dist_k == -1:
                print(
                    f'[!!!!!!!!!!!!] Textline {j} has no color assigned and unable to find closest rectangle, defaulting to black'
                )
                return np.zeros((3, ), dtype=np.uint8)
            return get_textline_color(min_dist_k, visited_text_lines)
        else:
            clr = np.round(np.mean(np.array(colors), axis=0))
            return clr

    textline_colors = []
    for j, (x, y, w, h) in enumerate(text_lines):
        # colors = text_line_colors[j]
        # if not colors :
        # 	print(f'[!!!!!!!!!!!!] Textline {j} has no color assigned')
        # 	cv2.rectangle(textline_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # 	continue
        clr = get_textline_color(j, []).tolist()
        textline_colors.append(clr)
        cv2.rectangle(textline_img, (x, y), (x + w, y + h), clr, 2)

    #save_rgb('text_mask_utils_final_textlines.png', textline_img)

    #save_rgb('text_mask_utils_final_mask_masked.png', (final_mask > 127)[:,:,None] * img_np)
    #cv2.imwrite('text_mask_utils_final_mask.png', final_mask)
    return final_mask, textline_colors
コード例 #29
0
    'energy': np.double,
    'liveness': np.double,
    'loudness': np.double,
    'mode': np.double,
    'speechiness': np.double,
    'tempo': np.double,
    'valence': np.double,
    'instrumentalness': np.double,
}
df = df.astype(types)
X = df.drop(labels=['id ', 'artist', 'name', 'mode', 'tempo', 'loudness'],
            axis=1)

# Initial model to get appropriate number of clusters
bgmm = BayesianGaussianMixture(
    n_components=20,
    n_init=20,
)
bgmm.fit(X)
num_clusters = len([w for w in bgmm.weights_ if w > 0.05])

bgmm_2 = BayesianGaussianMixture(n_components=num_clusters, n_init=20)

probs = pd.DataFrame(bgmm_2.fit(X).predict_proba(X))

results = pd.concat([df, probs], axis=1, sort=False)

# initial results - they fluctuate a lot and the weightings seem to have little effect.

# TODO:
#  - really examine which features are actually important; instrumentalness does not look like one of them...
#  - DBSCAN is probably a good way of getting the clusters - use a minimum cluster size of around 10 songs
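
# A minimal follow-up sketch (assumption): hard-assign each track to its most
# probable cluster and keep the assignment confidence alongside it.
results['cluster'] = probs.values.argmax(axis=1)
results['confidence'] = probs.values.max(axis=1)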
コード例 #30
0
def cluster(idx_arr,
            h_arr,
            w_arr,
            data,
            cw,
            samplerate,
            minp,
            percentile,
            npc,
            ngaus,
            plot_steps=False):

    l = np.ones(h_arr.shape)

    if ((np.max(h_arr) - np.min(h_arr)) / np.max(h_arr)) > 0.25:

        # classify by height
        bgm = BayesianGaussianMixture(ngaus, max_iter=100, n_init=10)
        l = bgm.fit_predict(h_arr.reshape(-1, 1))

        # if the remaining clusters have very little height difference, merge them.
        if len(np.unique(l)) > 1:
            for ll in np.unique(l):
                if ((np.max(h_arr[l != ll]) - np.min(h_arr[l != ll])) /
                        np.max(h_arr[l != ll])) < 0.25:
                    l[l != ll] = np.max(l) + 1

    if plot_steps == True:
        mean_eods, eod_times, _ = find_window(data,
                                              idx_arr,
                                              l,
                                              h_arr,
                                              rm=False)
        print('clustering based on height')
        plot_all(data, ts, samplerate, ms, vs)

    # now cluster based on waveform
    al = np.ones(len(l)) * -1

    # extract snippets
    snippets = np.stack([
        data[int(idx - cw * samplerate / 2):int(idx + cw * samplerate / 2)]
        for idx in idx_arr
    ])

    # keep track of the labels so that no labels are overwritten
    maxlab = 0

    for hl in np.unique(l):
        if len(l[l == hl]) > minp:

            # extract snippets, idxs and hs for this height cluster
            csnippets = StandardScaler().fit_transform(snippets[l == hl])
            cidx_arr = idx_arr[l == hl]
            ch_arr = h_arr[l == hl]

            # extract relevant snippet features
            pca = PCA(npc).fit(csnippets).transform(csnippets)

            # determine good epsilon
            knn = np.sort(pairwise_distances(pca, pca))[:, minp]
            eps = np.percentile(knn, percentile)

            # cluster by EOD shape
            c = DBSCAN(eps=eps, min_samples=minp).fit(pca).labels_

            if plot_steps == True:
                mean_eods, eod_times, _ = find_window(data,
                                                      cidx_arr,
                                                      c,
                                                      ch_arr,
                                                      rm=False)
                print('clustering on scaled eods')
                plot_all(data, ts, samplerate, ms, vs)

            # cluster again without scaling (sometimes this works better wrt scaling)
            csnippets_ns = snippets[l == hl]
            pca = PCA(npc).fit(csnippets_ns).transform(csnippets_ns)
            knn = np.sort(pairwise_distances(pca, pca))[:, minp]
            eps = np.percentile(knn, percentile)
            c_ns = DBSCAN(eps=eps, min_samples=minp).fit(pca).labels_

            if plot_steps == True:
                mean_eods, eod_times = find_window(data,
                                                   cidx_arr,
                                                   c_ns,
                                                   ch_arr,
                                                   rm=False)
                print('clustering on non-scaled eods')
                plot_all(data, ts, samplerate, ms, vs)

            # merge results for scaling and without scaling
            _, _, _, c = merge_clusters(c, c_ns, cidx_arr, cidx_arr, ch_arr,
                                        data, samplerate)

            if plot_steps == True:
                mean_eods, eod_times = find_window(data,
                                                   cidx_arr,
                                                   c,
                                                   ch_arr,
                                                   rm=False)
                print('merged scale and non-scaled')
                plot_all(data, ts, samplerate, ms, vs)

            # update maxlab so that no clusters are overwritten
            c[c == -1] = -maxlab - 1
            al[l == hl] = c + maxlab
            maxlab = np.max(al) + 1

    # return the overall clusters (al) and the clusters based on height (l)
    return al, l
コード例 #31
0
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180* np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('GMM', fontsize=15)
    plt.grid(b=True, ls=':', color='#606060')

    # DPGMM
    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                    weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print('DPGMM means = \n', centers)
    print('DPGMM covariances = \n', covs)
    y_hat = dpgmm.predict(x)
    print(y_hat)

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')

    for i, cc in enumerate(zip(centers, covs)):
コード例 #32
0
# Parameters of the dataset
random_state, n_components, n_features = 2, 3, 2
colors = np.array(['#0072B2', '#F0E442', '#D55E00'])

covars = np.array([[[.7, .0], [.0, .1]], [[.5, .0], [.0, .1]],
                   [[.5, .0], [.0, .1]]])
samples = np.array([200, 500, 200])
means = np.array([[.0, -.70], [.0, .0], [.0, .70]])

# mean_precision_prior= 0.8 to minimize the influence of the prior
estimators = [("Finite mixture with a Dirichlet distribution\nprior and "
               r"$\gamma_0=$",
               BayesianGaussianMixture(
                   weight_concentration_prior_type="dirichlet_distribution",
                   n_components=2 * n_components,
                   reg_covar=0,
                   init_params='random',
                   max_iter=1500,
                   mean_precision_prior=.8,
                   random_state=random_state), [0.001, 1, 1000]),
              ("Infinite mixture with a Dirichlet process\n prior and"
               r"$\gamma_0=$",
               BayesianGaussianMixture(
                   weight_concentration_prior_type="dirichlet_process",
                   n_components=2 * n_components,
                   reg_covar=0,
                   init_params='random',
                   max_iter=1500,
                   mean_precision_prior=.8,
                   random_state=random_state), [1, 1000, 100000])]

# Generate data
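# (The original example is truncated here; what follows is a minimal sketch, an
# assumption about the remaining steps: draw the dataset from the parameters
# above, then fit each estimator at each gamma_0 and inspect the learned weights.)
rng = np.random.RandomState(random_state)
X = np.vstack([rng.multivariate_normal(mean, covar, int(n))
               for mean, covar, n in zip(means, covars, samples)])

for title, estimator, concentrations in estimators:
    for gamma_0 in concentrations:
        estimator.weight_concentration_prior = gamma_0
        estimator.fit(X)
        print(title, gamma_0, np.round(estimator.weights_, 2))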
コード例 #33
0
def test_bayesian_mixture_precisions_prior_initialisation():
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 2
    X = rng.rand(n_samples, n_features)

    # Check raise message for a bad value of degrees_of_freedom_prior
    bad_degrees_of_freedom_prior_ = n_features - 1.0
    bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng)
    assert_raise_message(
        ValueError,
        "The parameter 'degrees_of_freedom_prior' should be "
        "greater than %d, but got %.3f." % (n_features - 1, bad_degrees_of_freedom_prior_),
        bgmm.fit,
        X,
    )

    # Check correct init for a given value of degrees_of_freedom_prior
    degrees_of_freedom_prior = rng.rand() + n_features - 1.0
    bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_)

    # Check correct init for the default value of degrees_of_freedom_prior
    degrees_of_freedom_prior_default = n_features
    bgmm = BayesianGaussianMixture(degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng).fit(X)
    assert_almost_equal(degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_)

    # Check correct init for a given value of covariance_prior
    covariance_prior = {
        "full": np.cov(X.T, bias=1) + 10,
        "tied": np.cov(X.T, bias=1) + 5,
        "diag": np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3,
        "spherical": rng.rand(),
    }

    bgmm = BayesianGaussianMixture(random_state=rng)
    for cov_type in ["full", "tied", "diag", "spherical"]:
        bgmm.covariance_type = cov_type
        bgmm.covariance_prior = covariance_prior[cov_type]
        bgmm.fit(X)
        assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_)

    # Check raise message for a bad spherical value of covariance_prior
    bad_covariance_prior_ = -1.0
    bgmm = BayesianGaussianMixture(
        covariance_type="spherical", covariance_prior=bad_covariance_prior_, random_state=rng
    )
    assert_raise_message(
        ValueError,
        "The parameter 'spherical covariance_prior' "
        "should be greater than 0., but got %.3f." % bad_covariance_prior_,
        bgmm.fit,
        X,
    )

    # Check correct init for the default value of covariance_prior
    covariance_prior_default = {
        "full": np.atleast_2d(np.cov(X.T)),
        "tied": np.atleast_2d(np.cov(X.T)),
        "diag": np.var(X, axis=0, ddof=1),
        "spherical": np.var(X, axis=0, ddof=1).mean(),
    }

    bgmm = BayesianGaussianMixture(random_state=0)
    for cov_type in ["full", "tied", "diag", "spherical"]:
        bgmm.covariance_type = cov_type
        bgmm.fit(X)
        assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_)
コード例 #34
0
ファイル: utils.py プロジェクト: Adamli12/M-protein
def BGMreport(path, visualize=1, cut_n=6):
    t2 = 15
    t3 = 0.07
    n_components = 3
    denses, _ = finddensefromcut(path, cut_n)
    maxd = []
    for dense in denses[(cut_n - 5):]:
        maxd.append(max(dense))
    lofd = len(denses[0])
    samples = list()
    for i in range((cut_n - 5), cut_n):  #sampling for BGM
        samples.append(np.array(tosample(denses[i])).reshape(-1, 1))
    allmeans = []
    allcovs = []
    allweights = []
    BGM45 = np.zeros((45))
    for i in range(5):
        BGM = BayesianGaussianMixture(
            n_components=n_components,
            covariance_type='spherical',
            weight_concentration_prior=0.000000000001,
            max_iter=500)
        BGM.fit(samples[i])
        means = np.reshape(BGM.means_, (-1, ))
        permu = np.argsort(means)
        means = means[permu]
        BGM45[i * 9 + 3:i * 9 + 6] = means
        allmeans.append(means)
        covs = BGM.covariances_
        covs = covs[permu]
        BGM45[i * 9 + 6:i * 9 + 9] = covs
        allcovs.append(covs)
        weights = BGM.weights_
        weights = weights[permu]
        BGM45[i * 9:i * 9 + 3] = weights * len(samples[i])
        allweights.append(weights)
    if visualize == 1:
        l = 0
        for i in range(cut_n - 5, cut_n):  #visualization
            l += 1
            plt.subplot(2, n_components, l), plt.plot(denses[i])
            X = np.linspace(0, lofd, num=200, endpoint=False)
            Ys = toGM(X, n_components, allmeans[l - 1], allcovs[l - 1],
                      allweights[l - 1])
            for j in range(n_components):
                #plt.subplot(1,5,l),plt.plot([allmeans[l-1][j],allmeans[l-1][j]],[0,255])
                plt.subplot(2, n_components,
                            l), plt.plot(X,
                                         len(samples[l - 1]) * Ys[j])
                #plt.subplot(2,n_components,l),plt.plot(X,Ys[j])
                plt.ylim(0, 255)
        plt.show()
    ans = np.zeros((12, ))
    pre = np.zeros((5, n_components))
    # Preprocess the data to avoid the influence of peak overlap (far overlap and
    # near overlap): identify far/near overlap cases, suppress far-overlap peaks
    # and amplify near-overlap peaks.
    # Ideally two far-overlapping peaks would be merged into a single peak with an
    # intermediate mean, but for now we simply suppress both, since they are then
    # unlikely to be monoclonal peaks. "Far overlap" means the two peaks are really
    # one peak in the plot that the BGM split in order to fit the Gaussian model
    # better; such split peaks usually have large covariance (they are not sharp),
    # so we suppress them here.
    for i in range(5):
        for j in range(n_components):
            for l in range(n_components):
                if j < l:
                    if allweights[i][j] / allweights[i][l] > 3 or allweights[
                            i][j] / allweights[i][
                                l] < 0.3333:  #ignore when weight difference is too large
                        continue
                    if allcovs[i][j] / allweights[i][j] / allcovs[i][
                            l] * allweights[i][l] / abs(
                                allmeans[i][j] - allmeans[i][l]
                            ) * mean(
                                np.sqrt(allcovs[i][j]), np.sqrt(allcovs[i][l])
                            ) > 2 or allcovs[i][l] / allweights[i][
                                l] / allcovs[i][j] * allweights[i][j] / abs(
                                    allmeans[i][j] - allmeans[i][l]
                                ) * mean(
                                    np.
                                    sqrt(allcovs[i][j]), np.sqrt(allcovs[i][l])
                                ) > 2:  # if the cov difference is large, the pair is excluded from the far-overlap case, because the original density plot should then show two separate peaks
                        # The near-overlap situation is a sharp peak sitting on a mild one; it happens when a monoclonal peak has a polyclonal background peak. Here we amplify the sharp peak's weight when the cov difference is large enough or the distance is close enough, so that it is detected as abnormal in the classification step
                        if abs(allmeans[i][j] -
                               allmeans[i][l]) < 3.5 * np.sqrt(
                                   max(allcovs[i][j], allcovs[i][l])):
                            neww = allweights[i][j] + allweights[i][l]
                            if allcovs[i][l] / allweights[i][l] / allcovs[i][
                                    j] * allweights[i][j] > 1 and allweights[
                                        i][j] > 0.15:
                                if allcovs[i][j] < 400:
                                    allweights[i][j] = neww
                            else:
                                if allcovs[i][l] < 400:
                                    allweights[i][l] = neww
                        continue
                    if allcovs[i][j] / allweights[i][j] / len(
                            samples[i]
                    ) < t3 / 2.5 or allcovs[i][l] / allweights[i][l] / len(
                            samples[i]
                    ) < t3 / 2.5:  # if one of the considered peaks has very small variance, this should not be the far-overlap situation, where the original peak is mild
                        continue
                    if allcovs[i][j] < 70 or allcovs[i][l] < 70:
                        continue
                    elif abs(allmeans[i][j] - allmeans[i][l]) < 3.5 * np.sqrt(
                            max(allcovs[i][j], allcovs[i][l])
                    ):  # far-overlap situation: there is only one mild peak in the original density plot, and the GMM breaks it into two sharper peaks to fit the Gaussian curves more accurately. Here we just suppress those peaks, so the two considered components cannot be used to flag the column as abnormal
                        pre[i][j] = pre[i][l] = 1
    for i in [0, 1, 2]:
        for j in [3, 4]:
            if maxd[i] < 50 or maxd[j] < 50:
                continue
            else:
                for k in range(len(allmeans[i])):
                    for l in range(len(allmeans[j])):
                        if pre[i][k] == 1 or pre[j][l] == 1:
                            continue
                        if abs(allmeans[i][k] - allmeans[j][l]) > lofd / t2:
                            continue
                        else:
                            if allweights[i][k] < 0.1 or allweights[j][l] < 0.1:
                                continue
                            else:
                                if allcovs[i][k] / allweights[i][k] / len(
                                        samples[i]
                                ) > t3 or allcovs[j][l] / allweights[j][l] / len(
                                        samples[j]
                                ) > t3:  ### the t figure represents the sharpness of the peak; variance alone is not enough, we also need to consider n_samples and the weights
                                    continue
                                else:
                                    ans[i * 2 + j - 2] = 1
                                    ans[7 + i] = 1
                                    ans[7 + j] = 1
                                    ans[0] = 1
    for i in range(5):
        for j in range(n_components):
            if pre[i][j] == 1:
                continue
            if maxd[i] < 80:
                continue
            elif allweights[i][j] < 0.05:
                continue
            if allcovs[i][j] / allweights[i][j] / len(
                    samples[i]) > t3:  ###t-figure
                continue
            else:
                ans[7 + i] = 1
                ans[0] = 1
    return ans, BGM45
コード例 #35
0
def test_compare_covar_type():
    # We can compare the 'full' precision with the other cov_type if we apply
    # 1 iter of the M-step (done during _initialize_parameters).
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    X = rand_data.X["full"]
    n_components = rand_data.n_components
    # Computation of the full_covariance
    bgmm = BayesianGaussianMixture(
        n_components=2 * n_components, covariance_type="full", max_iter=1, random_state=0, tol=1e-7
    )
    bgmm._check_initial_parameters(X)
    bgmm._initialize_parameters(X)
    full_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis]

    # Check tied_covariance = mean(full_covariances, 0)
    bgmm = BayesianGaussianMixture(
        n_components=2 * n_components, covariance_type="tied", max_iter=1, random_state=0, tol=1e-7
    )
    bgmm._check_initial_parameters(X)
    bgmm._initialize_parameters(X)

    tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_
    assert_almost_equal(tied_covariance, np.mean(full_covariances, 0))

    # Check diag_covariance = diag(full_covariances)
    bgmm = BayesianGaussianMixture(
        n_components=2 * n_components, covariance_type="diag", max_iter=1, random_state=0, tol=1e-7
    )
    bgmm._check_initial_parameters(X)
    bgmm._initialize_parameters(X)

    diag_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis]
    assert_almost_equal(diag_covariances, np.array([np.diag(cov) for cov in full_covariances]))

    # Check spherical_covariance = np.mean(diag_covariances, 0)
    bgmm = BayesianGaussianMixture(
        n_components=2 * n_components, covariance_type="spherical", max_iter=1, random_state=0, tol=1e-7
    )
    bgmm._check_initial_parameters(X)
    bgmm._initialize_parameters(X)

    spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_
    assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1))
コード例 #36
0
        value, vector = sp.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / sp.linalg.norm(vector[0])
        angle = 180* np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color=clrs[i], alpha=0.5, clip_box = ax.bbox)
        ax.add_artist(e)

    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('GMM', fontsize=15)
    plt.grid(b=True, ls=':', color='#606060')

    # DPGMM
    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                    weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1)
    dpgmm.fit(x)  # the Gaussian parameters are treated as random variables with a Dirichlet-process prior; the larger weight_concentration_prior is, the more the prior is taken into account, while smaller values follow the samples more closely
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print('DPGMM means = \n', centers)
    print('DPGMM covariances = \n', covs)
    y_hat = dpgmm.predict(x)
    print(y_hat)

    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')

    for i, cc in enumerate(zip(centers, covs)):
コード例 #37
0
ファイル: kmeans.py プロジェクト: chenxu0602/TensorFlow2.0
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
knn.predict(X_new)
print(knn.predict_proba(X_new))

y_dist, y_pred_idx = knn.kneighbors(X_new, n_neighbors=1)
y_pred = dbscan.labels_[dbscan.core_sample_indices_][y_pred_idx]
y_pred[y_dist > 0.2] = -1
print(y_pred.ravel())

from sklearn.mixture import GaussianMixture

gm = GaussianMixture(n_components=3, n_init=10)
gm.fit(X)

X_new, y_new = gm.sample(6)
print("density:")
print(gm.score_samples(X))

densities = gm.score_samples(X)
density_threshold = np.percentile(densities, 4)
anomalies = X[densities < density_threshold]

print(gm.bic(X))
print(gm.aic(X))

from sklearn.mixture import BayesianGaussianMixture

bgm = BayesianGaussianMixture(n_components=10, n_init=10)
bgm.fit(X)
print("weights:")
print(np.round(bgm.weights_, 2))
コード例 #38
0
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.model_selection import train_test_split
import sklearn

import pandas

import cv2

input_path = "/Users/Srikanth/PycharmProjects/DataMiningClass/datasets/face.jpg"
img = cv2.imread(input_path)

o_shape = img.shape
k = 2
new_data = img.reshape(-1, 3)
vgmm = BayesianGaussianMixture(n_components=k)
# vgmm = GaussianMixture(n_components=k)
vgmm = vgmm.fit(new_data)
cluater = vgmm.predict(new_data)



# Reshape the predicted labels back to the original image shape
cluater = cluater.reshape(o_shape[0], o_shape[1])
from matplotlib import pyplot
pyplot.imshow(cluater)
pyplot.show()
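
# An optional follow-up sketch (assumption): replace every pixel with the mean
# colour of its component to get a posterized view of the segmentation.
labels = vgmm.predict(new_data)
segmented = vgmm.means_[labels].reshape(o_shape).astype(img.dtype)
pyplot.imshow(cv2.cvtColor(segmented, cv2.COLOR_BGR2RGB))
pyplot.show()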




コード例 #39
0
ファイル: entropy.py プロジェクト: bumps/bumps
def wnn_entropy(points, k=None, weights=True, n_est=None, gmm=None):
    r"""
    Weighted Kozachenko-Leonenko nearest-neighbour entropy calculation.

    *k* is the number of neighbours to consider, with default $k=n^{1/3}$

    *n_est* is the number of points to use for estimating the entropy,
    with default $n_\rm{est} = n$

    *weights* is True for default weights, False for unweighted (using the
    distance to the kth neighbour only), or a vector of weights of length *k*.

    *gmm* is the number of gaussians to use to model the distribution using
    a gaussian mixture model.  Default is 0, and the points represent an
    empirical distribution.

    Returns entropy H in bits and its uncertainty.

    Berrett, T. B., Samworth, R.J., Yuan, M., 2016. Efficient multivariate
    entropy estimation via k-nearest neighbour distances.
    https://arxiv.org/abs/1606.00304
    """
    from sklearn.neighbors import NearestNeighbors
    n, d = points.shape

    # Default to the full set
    if n_est is None:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    # Default k based on n
    if k is None:
        # Private communication: cube root of n is a good choice for k
        # Personal observation: k should be much bigger than d
        k = max(int(n**(1/3)), 3*d)

    # If weights are given then use them (setting the appropriate k),
    # otherwise use the default weights.
    if isinstance(weights, bool):
        weights = _wnn_weights(k, d, weights)
    else:
        k = len(weights)
    #print("weights", weights, sum(weights))

    # select knn algorithm
    algorithm = 'auto'
    #algorithm = 'kd_tree'
    #algorithm = 'ball_tree'
    #algorithm = 'brute'

    n_components = 0 if gmm is None else gmm

    # H = 1/n sum_i=1^n sum_j=1^k w_j log E_{j,i}
    # E_{j,i} = e^-Psi(j) V_d (n-1) z_{j,i}^d = C z^d
    # logC = -Psi(j) + log(V_d) + log(n-1)
    # H = 1/n sum sum w_j logC + d/n sum sum w_j log(z)
    #   = sum w_j logC + d/n sum sum w_j log(z)
    #   = A + d/n B
    # H^2 = 1/n sum
    Psi = digamma(np.arange(1, k+1))
    logVd = d/2*log(pi) - gammaln(1 + d/2)
    logC = -Psi + logVd + log(n-1)

    # TODO: standardizing points doesn't work.
    # Standardize the data so that distances conform.  This is equivalent to
    # a u-substitution u = sigma x + mu, so the integral needs to be corrected
    # for dU = det(sigma) dx.  Since the standardization squishes the dimensions
    # independently, sigma is a diagonal matrix, with the determinant equal to
    # the product of the diagonal elements.
    #x, mu, sigma = standardize(x)  # Note: sigma may be zero
    #detDU = np.prod(sigma)
    detDU = 1.

    if n_components > 0:
        # Use Gaussian mixture to model the distribution
        from sklearn.mixture import GaussianMixture as GMM
        predictor = GMM(n_components=gmm, covariance_type='full')
        predictor.fit(x)
        eval_x, _ = predictor.sample(n_est)
        #weight_x = predictor.score_samples(eval_x)
        skip = 0
    else:
        # Empirical distribution
        # TODO: should we use the full draw for kNN and a subset for eval points?
        # Choose a subset for evaluating the entropy estimate, if desired
        #print(n_est, n)
        #eval_x = x if n_est >= n else x[permutation(n)[:n_est]]
        eval_x = x
        #weight_x = 1
        skip = 1

    tree = NearestNeighbors(algorithm=algorithm, n_neighbors=k+skip)
    tree.fit(x)
    dist, _ind = tree.kneighbors(eval_x, n_neighbors=k+skip, return_distance=True)
    # Remove first column. Since test points are in x, the first column will
    # be a point from x with distance 0, and can be ignored.
    if skip:
        dist = dist[:, skip:]
    # Find log distances.  This can be problematic for MCMC runs where a
    # step is rejected, and therefore identical points are in the distribution.
    # Ignore them by replacing these points with nan and using nanmean.
    # TODO: need proper analysis of duplicated points in MCMC chain
    dist[dist == 0] = nan
    logdist = log(dist)
    H_unweighted = logC + d*np.nanmean(logdist, axis=0)
    H = np.dot(H_unweighted, weights)[0]
    Hsq_k = np.nanmean((logC[-1] + d*logdist[:,-1])**2)
    # TODO: abs shouldn't be needed?
    if Hsq_k < H**2:
        print("warning: avg(H^2) < avg(H)^2")
    dH = sqrt(abs(Hsq_k - H**2)/n_est)
    #print("unweighted", H_unweighted)
    #print("weighted", H, Hsq_k, H**2, dH, detDU, LN2)
    return H * detDU / LN2, dH * detDU / LN2
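
# A minimal usage sketch (assumption: the rest of this module, i.e. its imports
# and helpers such as _wnn_weights, is available). For a d-dimensional standard
# normal the differential entropy is d/2 * log2(2*pi*e) bits, which the estimator
# should roughly reproduce.
def _demo_wnn_entropy():
    import numpy as np
    rng = np.random.RandomState(0)
    draw = rng.randn(2000, 3)
    H, dH = wnn_entropy(draw)
    H_exact = 0.5 * 3 * np.log2(2 * np.pi * np.e)
    print("estimated %.2f +/- %.2f bits, exact %.2f bits" % (H, dH, H_exact))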
コード例 #40
0
def cluster_vbgm(aligned_maps):
    # sample_by_features = np.vstack([xmap.flatten() for xmap in aligned_maps])
    embedding = embed(aligned_maps)
    clusterer = BayesianGaussianMixture(n_components=10)
    return clusterer.fit_predict(embedding)