Example #1
def test_mahalanobis_partial_mapping(mapping_type):
    mapping = mapping_type([0, 1])
    measure = measures.Mahalanobis(mapping=mapping)
    reduced_ui = CovarianceMatrix(np.diag([100, 10]))
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]],
                             [[11], [10]], np.linalg.inv(reduced_ui))
    mapping = np.array([0, 3])
    reduced_ui = CovarianceMatrix(np.diag([100, 10]))
    measure = measures.Mahalanobis(mapping=mapping)
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]],
                             [[11], [2]], np.linalg.inv(reduced_ui))

    mapping = mapping_type([0, 1])
    measure = measures.Mahalanobis(mapping=mapping, mapping2=mapping)
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]],
                             [[11], [10]], np.linalg.inv(reduced_ui))
    mapping = np.array([0, 3])
    measure = measures.Mahalanobis(mapping=mapping, mapping2=mapping)
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]],
                             [[11], [2]], np.linalg.inv(reduced_ui))

    mapping = mapping_type([0, 1])
    mapping2 = np.array([0, 3])
    measure = measures.Mahalanobis(mapping=mapping, mapping2=mapping2)
    assert measure(state_u, state_v) == \
        distance.mahalanobis([[10], [1]],
                             [[11], [2]], np.linalg.inv(reduced_ui))
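For reference, a minimal plain-SciPy sketch (with made-up numbers) of the identity these assertions rely on: restricting the state vectors to the mapped dimensions and inverting the correspondingly reduced covariance.

import numpy as np
from scipy.spatial import distance

u = np.array([10., 1., 2., 3.])
v = np.array([11., 2., 3., 10.])
cov = np.diag([100., 10., 50., 10.])

mapping = np.array([0, 3])
reduced_cov = cov[np.ix_(mapping, mapping)]   # pick mapped rows/columns
d = distance.mahalanobis(u[mapping], v[mapping], np.linalg.inv(reduced_cov))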
def calculate_unit_distances(session):
	preprocessing_file = h5py.File(experiment.dir + 'preprocessing_results.hdf5', 'r+')
	features, units, cluster_id = get_spike_features_and_unit_ids(session)
	unit_cms = np.zeros((len(units), len(features[0])))
	euc = np.zeros((len(units), len(units)))
	maho = np.zeros((len(units), len(units)))

	inv_cov = np.linalg.inv(np.cov(features, rowvar=False))

	for i, unit in enumerate(units):
		unit_spikes = np.where(cluster_id == unit)[0]
		unit_spike_coords = features[unit_spikes]
		unit_cms[i] = np.mean(unit_spike_coords, 0)

	for i in range(len(units)):
		for j in range(len(units)):
			if j == i:
				# On the diagonal, store each unit's distance from the origin
				euc[i, j] = np.linalg.norm(unit_cms[i])
				maho[i, j] = mahalanobis(unit_cms[i], np.zeros(len(features[0])), inv_cov)
			else:
				euc[i, j] = np.linalg.norm(unit_cms[i] - unit_cms[j])
				maho[i, j] = mahalanobis(unit_cms[i], unit_cms[j], inv_cov)

	group_grp = preprocessing_file[session.subExperiment.name + '/' + session.name + '/group_0/']
	group_grp.create_dataset("euclidian", data=euc)
	group_grp.create_dataset("mahalanobis", data=maho)
	return euc, maho
Example #3
 def ratio_of_mv_normals(self, ukf_1, ukf_2, obs):
     """take the ratio of two mv normal densities too small for python
     
     often we see densities <1e-16 which python can struggle to handle. 
     Its easier to do this by hand especially as many of the terms in
     the ratio cancel out reducing the chance of numeric errors.
     
     Parameters
     --------
     ukf_1, ukf_2 : cls
         original `ukf1` ukf model and candidate `ukf2` ukf model
     Returns
     -------
     
     ratio : float
         `ratio` of two mv normals. prob > 0
     """
     x1 = ukf_1.x
     p1 = ukf_1.p
     x2 = ukf_2.x
     p2 = ukf_2.p
     obs1 = np.matmul(ukf_1.k, obs)
     obs2 = np.matmul(ukf_2.k, obs)
     
     ratio = 1.
     ratio *= (np.linalg.det(p1) / np.linalg.det(p2)) ** (-0.5)  # -0.5, not -1/2, to avoid integer division under Python 2
     distance = -0.5 * (mahalanobis(obs2, x2, p2)**2 - mahalanobis(obs1, x1, p1)**2)
     if np.exp(distance) == np.inf:
         ratio *= 0.
     else:
         ratio *= np.exp(distance)
     return ratio
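Since the whole point of the routine is avoiding underflow, a log-space variant may be worth noting. A minimal sketch, not part of the original class; note that scipy's mahalanobis expects the inverse covariance, so the sketch inverts p1 and p2 explicitly:

import numpy as np
from scipy.spatial.distance import mahalanobis

def log_ratio_of_mv_normals(x1, p1, obs1, x2, p2, obs2):
    # log of the same ratio, computed without ever exponentiating
    _, logdet1 = np.linalg.slogdet(p1)
    _, logdet2 = np.linalg.slogdet(p2)
    d1 = mahalanobis(obs1, x1, np.linalg.inv(p1)) ** 2
    d2 = mahalanobis(obs2, x2, np.linalg.inv(p2)) ** 2
    return 0.5 * (logdet2 - logdet1) - 0.5 * (d2 - d1)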
def compute_dist_matrix_with_ubm(dataset_x, all_gmms, all_ubms, label_dict):
    dist_matrix = np.zeros(shape=(len(dataset_x), len(label_dict)))
    for subject_i in range(len(dataset_x)):
        cur_subject = dataset_x[subject_i]
        class_distances = []
        for class_i in range(len(label_dict)):
            class_gmm = all_gmms[class_i]
            min_dist = np.inf
            for gmm_comp in range(class_gmm.n_components):
                cur_dist = distance.mahalanobis(
                    cur_subject, class_gmm.means_[gmm_comp],
                    inv(class_gmm.covariances_)[gmm_comp])
                if cur_dist < min_dist:
                    min_dist = cur_dist
            class_ubm = all_ubms[class_i]
            ubm_min_dist = np.inf
            for gmm_comp in range(class_ubm.n_components):
                cur_dist = distance.mahalanobis(
                    cur_subject, class_ubm.means_[gmm_comp],
                    inv(class_ubm.covariances_)[gmm_comp])
                if cur_dist < ubm_min_dist:
                    ubm_min_dist = cur_dist
            class_distances.append(min_dist + (1 / ubm_min_dist))
        dist_matrix[subject_i, :] = class_distances
    return dist_matrix
Example #5
    def mahalanobis_distance_sq(self, vec_x: Union[vector, matrix]):
        """
        returns the square of the mahalanobis distance using mean and covariance of the distribution.
        d = (x - \mu)^T \Sigma^{-1} (x - \mu)
        -0.5*mahalanobis_distance_sq + log_normalization_term = logpdf

        :param vec_x: observation(s)
        :return: (array of) square of the mahalanobis distance.
        """
        if vec_x.ndim == 1:
            if self._is_cov_diag:
                dist = _mvn_isotropic_mahalanobis_dist_sq(vec_x, self._mu, self._cov)
            else:
                cov_inv = np.linalg.inv(self._cov)
                dist = mahalanobis(vec_x, self._mu, cov_inv)**2
        elif vec_x.ndim == 2:
            if self._is_cov_diag:
                dist = np.array([_mvn_isotropic_mahalanobis_dist_sq(x, self._mu, self._cov) for x in vec_x])
            else:
                cov_inv = np.linalg.inv(self._cov)
                dist = np.array([mahalanobis(x, self._mu, cov_inv)**2 for x in vec_x])
        else:
            raise NotImplementedError("unexpected input.")

        return dist
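A quick sanity check of the docstring's relation between the squared distance and the log-density, against scipy.stats (illustrative values):

import numpy as np
from scipy.spatial.distance import mahalanobis
from scipy.stats import multivariate_normal

mu = np.array([0.0, 1.0])
cov = np.array([[2.0, 0.3], [0.3, 1.0]])
x = np.array([1.0, -1.0])

d2 = mahalanobis(x, mu, np.linalg.inv(cov)) ** 2
log_norm = -0.5 * np.log(np.linalg.det(2 * np.pi * cov))
assert np.isclose(-0.5 * d2 + log_norm,
                  multivariate_normal(mean=mu, cov=cov).logpdf(x))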
Example #6
def detect_outliers_with_tsne(inliers, outliers, reuses=True):
    dataset = np.concatenate([inliers, outliers], axis=0)
    reduced_dataset = None

    if reuses is False:
        reduced_dataset = TSNE(n_components=2, random_state=0).fit_transform(dataset)
        np.save(REDUCED_DATASET_PATH, reduced_dataset)
    else:
        reduced_dataset = np.load(REDUCED_DATASET_PATH)

    reduced_inliers = reduced_dataset[:inliers.shape[0]]
    reduced_outliers = reduced_dataset[inliers.shape[0]:]

    # calculate a covariance matrix using reduced_inners
    inv_sigma = np.linalg.inv(np.cov(reduced_inliers, rowvar=False))
    print('inv_sigma', inv_sigma.shape)

    # calculate a mean using reduced_inners
    mean = np.mean(reduced_inliers, axis=0)

    with open(F_VALUE_PATH, 'w') as fout:
        for i in range(20):
            threshold = THRESHOLD + 0.1 * i
            r = sum(1 for outlier in reduced_outliers if mahalanobis(mean, outlier, inv_sigma) > threshold)
            b = sum(1 for inlier in reduced_inliers if mahalanobis(mean, inlier, inv_sigma) > threshold)

            # the number of predicted outliers
            n = r + b

            precision = r / n if n else 0.0
            recall = r / N_OUTLIERS
            f = (2 * precision * recall / (precision + recall)
                 if precision + recall else 0.0)
            # print('thr={},f={},p={},r={}'.format(threshold, f, precision, recall))
            fout.write('{} {} {} {}\n'.format(threshold, f, precision, recall))
Example #7
def mahalanobis_distance(text1, text2):
    vec1, vec2 = __get_vectors(text1, text2)
    intersection = set(vec1.keys()) & set(vec2.keys())
    intersection_vec1 = np.array([vec1[x] for x in intersection])
    intersection_vec2 = np.array([vec2[x] for x in intersection])
    unique_vec1 = np.array([vec1[x] for x in vec1.keys() - intersection])
    unique_vec2 = np.array([vec2[x] for x in vec2.keys() - intersection])

    intersection_covariance_matrix = __get_covariance_matrix(
        intersection_vec1, intersection_vec2)

    intersection_inverse_covariance = __get_pseudo_inverse(
        intersection_covariance_matrix)

    distance = mahalanobis(intersection_vec1, intersection_vec2,
                           intersection_inverse_covariance)

    auto_covariance1 = __get_covariance_matrix(unique_vec1, unique_vec1)
    auto_covariance2 = __get_covariance_matrix(unique_vec2, unique_vec2)

    distance += mahalanobis(unique_vec1, unique_vec1,
                            __get_pseudo_inverse(auto_covariance1))
    distance += mahalanobis(unique_vec2, unique_vec2,
                            __get_pseudo_inverse(auto_covariance2))
    return distance
Example #8
def multivariate_normal_m_dist(mean, cov_mat, n, m_dist_max):
    # NOTE: initialise success markers
    success = False

    # NOTE: draw initial random sample
    xy = np.random.multivariate_normal(mean, cov_mat, n)

    while not success:
        # NOTE: transform sample statistics to match population parameters
        cov_mat_samp = np.cov(xy.T)
        cholesky_sample = np.linalg.cholesky(cov_mat_samp)
        cholesky_sample_inverse = np.linalg.inv(cholesky_sample)
        cholesky_population = np.linalg.cholesky(cov_mat)
        for i in range(n):
            xy[i] = np.matmul(
                np.matmul(cholesky_sample_inverse, cholesky_population),
                (xy[i, :] - np.mean(xy, axis=0))) + mean

        # NOTE: remove outliers
        for i in range(n):
            m_dist = mahalanobis(xy[i, :], mean, np.linalg.inv(cov_mat))
            while m_dist > m_dist_max:
                xy[i, :] = np.random.multivariate_normal(mean, cov_mat, 1)
                m_dist = mahalanobis(xy[i, :], mean, np.linalg.inv(cov_mat))

        tol = 10.0
        sample_mean = np.mean(xy, axis=0)
        sample_cov_mat = np.cov(xy.T)
        if np.all(np.abs(sample_mean - mean) < tol) and np.all(
                np.abs(sample_cov_mat - cov_mat) < tol):
            success = True

    return xy
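The Cholesky step above recolors the sample so that its covariance matches the population covariance exactly; a standalone sketch of that identity (all values illustrative):

import numpy as np

rng = np.random.default_rng(0)
xy = rng.multivariate_normal([0.0, 0.0], [[2.0, 1.0], [1.0, 2.0]], 500)

target_cov = np.array([[1.0, 0.2], [0.2, 1.0]])
l_samp = np.linalg.cholesky(np.cov(xy.T))
l_pop = np.linalg.cholesky(target_cov)
# (L_pop @ L_samp^-1) maps the sample covariance onto the target covariance
recolored = (xy - xy.mean(axis=0)) @ (l_pop @ np.linalg.inv(l_samp)).T
print(np.cov(recolored.T))  # ~= target_cov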
Example #9
def test_mahalanobis_full_mapping(mapping_type):
    mapping = mapping_type(np.arange(len(u)))
    measure = measures.Mahalanobis(mapping=mapping)
    assert measure(state_u,
                   state_v) == distance.mahalanobis(u, v, np.linalg.inv(ui))
    measure = measures.Mahalanobis(mapping=mapping, mapping2=mapping)
    assert measure(state_u,
                   state_v) == distance.mahalanobis(u, v, np.linalg.inv(ui))
Example #10
def MStep(X, r, z, dof, updateDof=False):
    K, N = r.shape[1], X.shape[0]
    D = X.shape[1]
    mu = []
    sigma = []
    pi = []
    dof_new = []
    for i in range(K):
        rk = r[:, i]
        zk = z[:, i]
        w = (rk * zk).reshape(-1, 1)  # N * 1
        mu_k = np.sum(w * X, axis=0) / np.sum(w)
        sigma_k = UpdateSigma(X, mu_k, r, z, i)
        pi_k = np.sum(rk) / N

        mu.append(mu_k.ravel())
        sigma.append(sigma_k)
        pi.append(pi_k)

    if updateDof:
        L = np.zeros((N, K))
        for i in range(K):
            sigma_k = sigma[i]
            logdet = 0.5 * math.log(sl.det(sigma_k))
            logmix = math.log(pi[i])
            distances = []
            inv_sigma_k = sl.inv(sigma_k)
            for j in range(len(X)):
                xj = X[j]
                distances.append(
                    mahalanobis(xj.ravel(), mu[i].ravel(), inv_sigma_k)**2)
            distances = np.array(distances).ravel()
            L[:, i] = GetLk(dof[i], logmix, logdet, distances, X.shape[1])

        for i in range(K):
            sigma_k = sigma[i]
            logdet = 0.5 * math.log(sl.det(sigma_k))
            logmix = math.log(pi[i])
            distances = []
            inv_sigma_k = sl.inv(sigma_k)
            for j in range(len(X)):
                xj = X[j]
                distances.append(
                    mahalanobis(xj.ravel(), mu[i].ravel(), inv_sigma_k)**2)
            distances = np.array(distances).ravel()

            bnds = ((0.1, 200), )
            x_init = (dof[i], )
            res = so.minimize(dofFunc,
                              x0=x_init,
                              args=(logmix, logdet, distances, X.shape[1], L,
                                    i),
                              bounds=bnds)
            dof_new.append(res.x)
    else:
        dof_new = dof

    return np.array(mu), np.array(sigma), np.array(pi), np.array(dof_new)
Example #11
def get_caliper(trt_compare, trtinfo, binid, median, percentile):
    looking_for = 30
    trt_more = [trt_compare.values]
    if trt_compare.shape[0] < looking_for:
        for neighbor in get_neighbors(trtinfo['bindf'], binid, his2ft.binners,
                                      trtinfo['levels'], looking_for):
            node = trtinfo['drugbins'].get_node("/" + neighbor)
            nodelab = node[:, 1] == trtinfo['trt']

            trt_more.append(trtinfo['scaler'].transform(node[:,
                                                             6:][nodelab, :]))

    trt_more = pd.DataFrame(np.vstack(trt_more))
    trt_more.index = list(trt_compare.index) + list(
        set(np.arange(2 * trt_more.shape[0])) -
        set(trt_compare.index))[:(trt_more.shape[0] - trt_compare.shape[0])]
    trtdist = pd.DataFrame()
    cutoff = 10

    if trt_compare.shape[0] > 10000:
        tshape = trt_compare.shape[0]
        print("bigboy! ", tshape)
        dists = []
        for k in range(int(np.ceil(tshape / 1000.0))):  # chunk in blocks of 1000 rows
            x = trt_compare.iloc[
                k * 1000:min(tshape, (k + 1) * 1000), :].apply(
                    lambda x: trt_more.drop(x.name, axis=0).iloc[
                        np.random.choice(tshape - 1, 500, replace=False), :]
                    .apply(lambda y: mahalanobis(x, y, trtinfo['prec']),
                           axis=1),
                    axis=1)
            dists.append(
                x.apply(lambda q: np.percentile(q[~pd.isnull(q)], percentile),
                        axis=1))
        return np.median(np.hstack(dists))

    elif trt_compare.shape[0] > 1000:
        tshape = trt_compare.shape[0]
        trtdist = trt_compare.apply(
            lambda x: trt_more.drop(x.name, axis=0).iloc[np.random.choice(
                tshape - 1, 100, replace=False), :].apply(
                    lambda y: mahalanobis(x, y, trtinfo['prec']), axis=1),
            axis=1)

    elif trt_more.shape[0] >= 30:
        trtdist = trt_compare.apply(
            lambda x: trt_more.drop(x.name, axis=0).apply(
                lambda y: mahalanobis(x, y, trtinfo['prec']), axis=1),
            axis=1)
    if median:
        return trtdist.apply(
            lambda q: np.percentile(q[~pd.isnull(q)], percentile),
            axis=1).median()
    else:
        return np.percentile(trtdist.stack(), percentile)
Example #12
    def reassignLabels(self, X, threshold):
        Z = self.post_gmm_encode(X, transform=False)

        preds = self.kmeans.pred

        for i in range(len(preds)):
            if (distance.mahalanobis(Z[i], self.kmeans.means[0],
                                     self.kmeans.covs[0]) > threshold
                ) and (distance.mahalanobis(Z[i], self.kmeans.means[1],
                                            self.kmeans.covs[1]) > threshold):
                preds[i] = 2

        self.kmeans.pred = preds
Example #13
    def z_score(self, x):
        """Computes the Mahalanobis distance of `x` from the center of this Gaussian. In the 1D case
        this reduces to computing an absolute z-score.

        NOTE: This function is vectorized if you pass multiple points as `x`.

        >>> Gaussian(2, 4).z_score(6)
        2.0

        >>> Gaussian(2, 4).z_score([0, 3, 6])
        array([1. , 0.5, 2. ])

        >>> Gaussian(pd.Series([2, 0, 0]), pd.DataFrame([ \
                [ 1.5, -0.5, -0.5], \
                [-0.5,  1.5, -0.5], \
                [-0.5, -0.5,  1.5] \
            ])).z_score(pd.Series([0, 1, 0]))
        1.7320508075688763

        >>> Gaussian(pd.Series([2, 0, 0]), pd.DataFrame([ \
                [ 1.5, -0.5, -0.5], \
                [-0.5,  1.5, -0.5], \
                [-0.5, -0.5,  1.5] \
            ])).z_score([[0, 1, 0], [1, 0, 0]])
        array([1.73205081, 1.        ])

        >>> Gaussian(pd.Series([0, 1], index=['a', 'b']), [1, 2]).z_score([1, 3])
        1.7320508075688772

        >>> Gaussian(pd.Series([0, 1], index=['a', 'b']), [1, 2]) \
                .z_score(pd.DataFrame([[3, 1], [5, 2]], columns=['b', 'a'], index=[0, 1]))
        0    1.732051
        1    3.464102
        dtype: float64

        """
        cov_inv = np.linalg.pinv(self.__covariance)
        if self.__should_vectorize(x):
            if isinstance(x, pd.DataFrame):
                x = x[self.__mean.index]
                return x.apply(lambda row: mahalanobis(self.__mean, row, cov_inv),
                               axis=1)
            else:
                return np.array(
                    [mahalanobis(self.__mean, x_i, cov_inv) for x_i in x])
        else:
            # Sort `x` labels to match `mean` indexing.
            if self.__has_similar_labels(x):
                x = x[self.__mean.index]
            return mahalanobis(self.__mean, x, cov_inv)
    def y_vec(self, centers, w):
        if self.eta is None:
            if self.sparse:
                sparse_reg = np.zeros(self.n_clusters)
                # scale_param = np.tanh(self.iteration_num / 300) if self.iteration_num > 700 else 0
                scale_param = 1e-5
                reg = utils.get_sparse_reg(centers, self.process_label)
                sparse_reg[self.process_label] = scale_param * reg

                return np.array([
                    mahalanobis(w, centers[label], self.Gammas_inv[label]) +
                    sparse_reg[label] + self.noise.fabric(self.iteration_num)
                    for label in range(self.n_clusters)
                ])
            else:
                return np.array([
                    mahalanobis(w, centers[label], self.Gammas_inv[label]) +
                    self.noise.fabric(self.iteration_num)
                    for label in range(self.n_clusters)
                ])
        else:
            if self.sparse:
                sparse_reg = np.zeros(self.n_clusters)

                reg_1 = utils.get_sparse_reg(centers, self.process_label)
                scale_param_1 = 1e-5
                # scale_param_1 = 1e-5 * np.tanh(self.iteration_num / 300) if self.iteration_num > 700 else 0
                sparse_reg[self.process_label] += scale_param_1 * reg_1

                reg_2 = utils.get_sparse_reg_2(centers, self.process_label)
                scale_param_2 = 1e-3 * np.tanh(
                    self.iteration_num /
                    300) if self.iteration_num > 700 else 0
                # scale_param_2 = 1e-3
                sparse_reg[self.process_label] += scale_param_2 * reg_2

                return np.array([
                    mahalanobis(w, centers[label],
                                np.linalg.inv(self.Gammas[label])) +
                    sparse_reg[label] + self.noise.fabric(self.iteration_num)
                    for label in range(self.n_clusters)
                ])
            else:
                return np.array([
                    mahalanobis(w, centers[label],
                                np.linalg.inv(self.Gammas[label])) +
                    self.noise.fabric(self.iteration_num)
                    for label in range(self.n_clusters)
                ])
Example #15
def mahalanobisR(myRow, inData, covariance):
    """
    Find the squared Mahalanobis distance between the given row and the given
    data; the data can be an array or a single row.
    Requires the covariance matrix, not the inverted covariance matrix.
    """
    IC = covariance.values if isinstance(covariance, pd.DataFrame) else covariance
    IC = sp.linalg.inv(IC)

    m = []
    if len(inData.shape) == 1:
        return mahalanobis(inData, myRow, IC) ** 2
    for i in range(inData.shape[0]):
        m.append(mahalanobis(inData.iloc[i, :], myRow, IC) ** 2)
    return m
Example #16
 def _call(self, ds):
     """
     The method use a dataset and computes the mahalanobis distance from the trained 
     distribution.
     It uses the pvalue as a threshold to calculate how many volumes are 
     mahalanobis-distant from the training distribution.
     
     Parameters
     ----------
     ds: pymvpa dataset. Testing dataset
     
     Returns
     -------
     dataset: a dataset with the number of volumes with a m-distance below the threshold
     """
     
     distances = []
     
     mean_ = self.params['mean']
     icov_ = self.params['icov']
       
     for ex in ds:   
         dist_ = mahalanobis(mean_, ex, icov_)
         distances.append(dist_)
     
     chi_sq = scipy.stats.distributions.chi2(mean_.shape[0])
     m_value = chi_sq.isf(self.p)
     
     distances = np.array(distances)
     value = np.count_nonzero((distances ** 2) < m_value)
     
     #space = self.get_space()
     return Dataset(np.array([value]))
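The thresholding leans on the fact that squared Mahalanobis distances of samples from a d-dimensional Gaussian are chi-square distributed with d degrees of freedom, so the cutoff for a tail probability p comes from the inverse survival function (illustrative numbers):

import scipy.stats

d, p = 10, 0.05
m_value = scipy.stats.distributions.chi2(d).isf(p)
# a volume counts as "close" if its squared distance falls below m_value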
Example #17
    def scoring(err, mu, sigma):

        scores = []
        for e in err:
            scores.append(mahalanobis(e, mu, sigma))

        return scores
Example #18
    def _get_node_distance_matrix(self, datapoint, som_array):
        """Get distance of datapoint and node using Euclidean distance.

        Parameters
        ----------
        datapoint : np.array, shape=(X.shape[1])
            Datapoint = one row of the dataset `X`
        som_array : np.array
            Weight vectors of the SOM,
            shape = (self.n_rows, self.n_columns, X.shape[1])

        Returns
        -------
        distmat : np.array of float
            Distance between datapoint and each SOM node

        """
        # algorithms on the full matrix
        if self.distance_metric == "euclidean":
            return np.linalg.norm(som_array - datapoint, axis=2)

        # node-by-node algorithms
        distmat = np.zeros((self.n_rows, self.n_columns))
        if self.distance_metric == "manhattan":
            for node in self.node_list_:
                distmat[node] = dist.cityblock(
                    som_array[node[0], node[1]], datapoint)

        elif self.distance_metric == "mahalanobis":
            for node in self.node_list_:
                som_node = som_array[node[0], node[1]]
                cov = np.cov(np.stack((datapoint, som_node), axis=0),
                             rowvar=False)
                cov_pinv = np.linalg.pinv(cov)   # pseudo-inverse
                distmat[node] = dist.mahalanobis(
                    datapoint, som_node, cov_pinv)

        elif self.distance_metric == "tanimoto":
            # Note that this is a binary distance measure.
            # Therefore, the vectors have to be converted.
            # Source: Melssen 2006, Supervised Kohonen networks for
            #         classification problems
            # VERY SLOW ALGORITHM!!!
            threshold = 0.5
            for node in self.node_list_:
                som_node = som_array[node[0], node[1]]
                distmat[node] = dist.rogerstanimoto(
                    binarize(datapoint.reshape(1, -1), threshold=threshold,
                             copy=True),
                    binarize(som_node.reshape(1, -1), threshold=threshold,
                             copy=True))

        elif self.distance_metric == "spectralangle":
            for node in self.node_list_:
                distmat[node] = np.arccos(np.divide(
                    np.dot(som_array[node[0], node[1]], datapoint),
                    np.multiply(np.linalg.norm(som_array),
                                np.linalg.norm(datapoint))))

        return distmat
	def compute_distance_hmd(self, domain):
		meaningful_word_ratio = domain.get_linguistic_feature_set().get_meaningful_word_ratio()
		one_gram_normality_score = domain.get_linguistic_feature_set().get_one_gram_normality_score()
		two_gram_normality_score = domain.get_linguistic_feature_set().get_two_gram_normality_score()
		three_gram_normality_score = domain.get_linguistic_feature_set().get_three_gram_normality_score()
		four_gram_normality_score = domain.get_linguistic_feature_set().get_four_gram_normality_score()
		five_gram_normality_score = domain.get_linguistic_feature_set().get_five_gram_normality_score()

		current_sample = numpy.array([meaningful_word_ratio, one_gram_normality_score, two_gram_normality_score, three_gram_normality_score])

		timestr = time.strftime('%Y-%m-%d', time.localtime(time.time()))
		f = open(parent_path + '/trails/' + timestr + ".csv", "a")
		for score in (meaningful_word_ratio, one_gram_normality_score,
					  two_gram_normality_score, three_gram_normality_score,
					  four_gram_normality_score, five_gram_normality_score):
			f.write(str(score))
			f.write('\t')
		f.close()

		for i in range(len(current_sample)):
			if self._centroid[i] < current_sample[i]:
				# If current_sample[i] is larger than centroid[i], clamp it to the
				# centroid: scores above the centroid indicate non-DGA behaviour.
				current_sample[i] = self._centroid[i]

		distance = mahalanobis(current_sample, self._centroid, self._cov_inv)

		return distance
	def compute_distance_bmd(self, domain):
		meaningful_word_ratio = domain.get_linguistic_feature_set().get_meaningful_word_ratio()
		one_gram_normality_score = domain.get_linguistic_feature_set().get_one_gram_normality_score()
		two_gram_normality_score = domain.get_linguistic_feature_set().get_two_gram_normality_score()
		three_gram_normality_score = domain.get_linguistic_feature_set().get_three_gram_normality_score()
		four_gram_normality_score = domain.get_linguistic_feature_set().get_four_gram_normality_score()
		five_gram_normality_score = domain.get_linguistic_feature_set().get_five_gram_normality_score()

		current_sample = numpy.array([meaningful_word_ratio, one_gram_normality_score, two_gram_normality_score, three_gram_normality_score])
		
		f=open("./new2/result_bmd.csv","a")
		f.writelines(str(meaningful_word_ratio))
		f.writelines('\t')
		f.writelines( str(one_gram_normality_score))
		f.writelines('\t')
		f.writelines( str(two_gram_normality_score))
		f.writelines('\t')
		f.writelines( str(three_gram_normality_score))
		f.writelines('\t')
		f.writelines( str(four_gram_normality_score))
		f.writelines('\t')
		f.writelines( str(five_gram_normality_score))
		f.writelines('\t')
		f.close()

		for i in range(len(current_sample)):
			if self._centroid[i] < current_sample[i]:
				# If current_sample[i] is larger than centroid[i], clamp it to the
				# centroid: scores above the centroid indicate non-DGA behaviour.
				current_sample[i] = self._centroid[i]

		distance = mahalanobis(current_sample, self._centroid, self._cov_inv)

		return distance
Example #21
def gaussian_weights(bundle, n_points=100, return_mahalnobis=False):
    """
    Calculate weights for each streamline/node in a bundle, based on a
    Mahalanobis distance from the mean of the bundle, at that node

    Parameters
    ----------
    bundle : array or list
        If this is a list, assume that it is a list of streamline coordinates
        (each entry is a 2D array, of shape n by 3). If this is an array, this
        is a resampled version of the streamlines, with equal number of points
        in each streamline.
    n_points : int, optional
        The number of points to resample to. *If the `bundle` is an array, this
        input is ignored*. Default: 100.
    return_mahalnobis : bool, optional
        Whether to return the Mahalanobis distances themselves instead of the
        normalized inverse-distance weights. Default: False.

    Returns
    -------
    w : array of shape (n_streamlines, n_points)
        Weights for each node in each streamline, calculated as its relative
        inverse of the Mahalanobis distance, relative to the distribution of
        coordinates at that node position across streamlines.
    """
    if isinstance(bundle, list) or isinstance(bundle, dts.Streamlines):
        # if you got a list, assume that it needs to be resampled:
        bundle = _resample_bundle(bundle, n_points)
    else:
        if bundle.shape[-1] != 3:
            e_s = "Input must be shape (n_streamlines, n_points, 3)"
            raise ValueError(e_s)
        n_points = bundle.shape[1]

    w = np.zeros((bundle.shape[0], n_points))
    # If there's only one fiber here, it gets the entire weighting:
    if bundle.shape[0] == 1:
        return np.array([1])

    for node in range(bundle.shape[1]):
        # This should come back as a 3D covariance matrix with the spatial
        # variance covariance of this node across the different streamlines
        # This is a 3-by-3 array:
        node_coords = bundle[:, node]
        c = np.cov(node_coords.T, ddof=0)
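        # Reorganize as an upper diagonal matrix for expected Mahalanobis input: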
        c = np.array([[c[0, 0], c[0, 1], c[0, 2]],
                      [0, c[1, 1], c[1, 2]],
                      [0, 0, c[2, 2]]])
        # Calculate the mean or median of this node as well
        # delta = node_coords - np.mean(node_coords, 0)
        m = np.mean(node_coords, 0)
        # Weights are the inverse of the Mahalanobis distance
        for fn in range(bundle.shape[0]):
            # calculate Mahalanobis for node on fiber[fn]
            w[fn, node] = mahalanobis(node_coords[fn], m, np.linalg.inv(c))
    if return_mahalnobis:
        return w
    # weighting is inverse to the distance (the further you are, the less you
    # should be weighted)
    w = 1 / w
    # Normalize before returning, so that the weights in each node sum to 1:
    return w / np.sum(w, 0)
Example #22
    def Cal_FPR(self, config):
        N_ood = len(X_ood)  # 10000
        X_ood = np.pad(X_ood, ((0, 0), (2, 2), (2, 2), (0, 0)), 'constant')  # Adding the padding to the dataset
        f_of_x_ood = np.array(sess.run(fullc2, feed_dict={x: X_ood, keep_prob: 1.0}))
        label_of_x_ood = np.array(range(N_ood))
        for i in range(N_ood):
            temp = [None] * num_of_labels
            for label in range(num_of_labels):
                temp[label] = list()
                u = np.reshape(f_of_x_ood[i], (1, num_of_neurons))
                v = np.reshape(mu_hat[label], (1, num_of_neurons))
                temp[label].append(distance.mahalanobis(u, v, np.linalg.inv(sigma_hat)) ** 2)
            m_dist_data_of_x_ood = np.array(temp)
            index = np.argmin(m_dist_data_of_x_ood, 0)  # finding index of the closest label
            confidence_score_of_x_ood = m_max[index] - m_dist_data_of_x_ood[index]  # computing confidence score
            if confidence_score_of_x_ood > threshold:
                label_of_x_ood[i] = index  # classifying in-distribution data
            else:
                label_of_x_ood[i] = ood_index  # classifying out-of-distribution data
        num_of_in_distribution = 0
        for i in range(N_ood):
            if label_of_x_ood[i] != ood_index:
                num_of_in_distribution = num_of_in_distribution + 1
        fpr = num_of_in_distribution / N_ood
        print('FPR on out-of-distribution(EMNIST): {:.4f}'.format(fpr), end='\n')

Example #23
 def Cal_TPR(self, config):
     N_data = len(data)  # 10000
     f_x = np.array(sess.run(fullc2, feed_dict={x: data, keep_prob: 1.0}))
     label_of_x_test = np.array(range(N_test))
     pred_x = np.array(range(N_test))
     for i in range(N_data):
         temp = [None] * num_of_labels
         for label in range(num_of_labels):
             temp[label] = list()
             u = np.reshape(f_x[i], (1, num_of_neurons))
             v = np.reshape(mu_hat[label], (1, num_of_neurons))
             temp[label].append(distance.mahalanobis(u, v, np.linalg.inv(sigma_hat)) ** 2)
         m_dist_data_of_x_test = np.array(temp)
         index = np.argmin(m_dist_data_of_x_test, 0)  # finding index of the closest label
         confidence_score_of_x_test = m_max[index] - m_dist_data_of_x_test[index]  # computing confidence score
         if confidence_score_of_x_test > threshold:
             label_of_x_test[i] = index // 2  # classifying in-distribution data
         else:
             label_of_x_test[i] = ood_index  # classifying out-of-distribution data
     num_of_in_distribution = 0
     num_of_correctly_classified = 0
     accuracy_on_in_distribution = 0.0
     for i in range(N_test):
         if label_of_x_test[i] != ood_index:
             num_of_in_distribution = num_of_in_distribution + 1
             if label_of_x_test[i] == target_label_of_x_test[i]:
                 num_of_correctly_classified = num_of_correctly_classified + 1
     accuracy_on_in_distribution = num_of_correctly_classified / num_of_in_distribution
     tpr = num_of_in_distribution / N_test
     print('Classification accuracy on in-distribution: {:.4f}'.format(accuracy_on_in_distribution))
     print('TPR on in-distribution(MNIST): {:.4f}'.format(tpr), end='\n')
Example #24
def genSimilarComposition(pulsePeriod, pieceDur, strokeModels = None, iAudioFile = None, iPos = None, invC = None):
    if strokeModels is None:
        strokeSeq = None
        ts = None 
        opulsePos = None
    else:
        testFeatFull = getFeatSequence(iAudioFile,iPos)
        testFeat = testFeatFull['pmfcc']
        print(testFeat.shape)
        Npulse = testFeat.shape[0]
        Ndata = len(strokeModels)
        strokeSeq = np.array([])
        ts = np.array([])
        tscurr = 0.0
        opulsePos = np.arange(0,pieceDur,pulsePeriod)
        for k in range(Npulse):
            ftIn = testFeat[k,params.selectInd]
            distVal = 1e6*np.ones(Ndata)
            ts = np.append(ts,tscurr)
            tscurr = tscurr + pulsePeriod
            for p in range(Ndata):
                ftOut = strokeModels[p]['feat']['pmfcc'][0][params.selectInd]
                distVal[p] = DS.mahalanobis(ftIn,ftOut,invC)
            strokeSeq = np.append(strokeSeq,np.argmin(distVal))
    return strokeSeq, ts, opulsePos
Example #25
def mahalanobis_distances(df, axis=0):
    '''
    Returns a pandas Series with Mahalanobis distances for each sample on the
    axis.

    Note: does not work well when # of observations < # of dimensions.
    It will either return NaN values
    or (in the extreme case) fail with a singular-matrix LinAlgError.

    Args:
        df: pandas DataFrame with columns to run diagnostics on
        axis: 0 to find outlier rows, 1 to find outlier columns
    '''
    df = df.transpose() if axis == 1 else df
    means = df.mean()
    try:
        inv_cov = np.linalg.inv(df.cov())
    except LinAlgError:
        return pd.Series([np.nan] * len(df.index), df.index,
                         name='Mahalanobis')
    dists = []
    for i, sample in df.iterrows():
        dists.append(mahalanobis(sample, means, inv_cov))

    return pd.Series(dists, df.index, name='Mahalanobis')
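A hypothetical usage sketch for the function above (the 0.95 quantile cutoff is arbitrary):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.default_rng(0).normal(size=(50, 3)),
                  columns=['a', 'b', 'c'])
scores = mahalanobis_distances(df)        # one distance per row
outliers = scores[scores > scores.quantile(0.95)]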
Example #26
    def pdf(x):
        x = np.asarray(x).ravel()

        assert len(x) == d, "Incorrect dimensionality. The input data must " \
            "be %d-dimensional." % d

        return scale_factor * m.exp(-0.5*mahalanobis(x, mu, inv_sigma))
Example #28
    def _test(self, means: NDArray, cvars: NDArray) -> None:

        embeddings, artifacts = self._embed("test")
        b, c, h, w = embeddings.shape
        embeddings = embeddings.reshape(b, c, h * w)

        distances = []
        for i in tqdm(range(h * w), desc=f"{self.cfg.params.category} - compute distance"):
            mean = means[:, i]
            cvar_inv = np.linalg.inv(cvars[:, :, i])
            distance = [mahalanobis(e[:, i], mean, cvar_inv) for e in embeddings]
            distances.append(distance)

        img_h = self.cfg.params.height
        img_w = self.cfg.params.width
        amaps = torch.tensor(np.array(distances), dtype=torch.float32)
        amaps = amaps.permute(1, 0).view(b, h, w).unsqueeze(dim=1)  # (b, 1, h, w)
        amaps = F.interpolate(amaps, size=(img_h, img_w), mode="bilinear", align_corners=False)
        amaps = mean_smoothing(amaps)
        amaps = (amaps - amaps.min()) / (amaps.max() - amaps.min())
        amaps = amaps.squeeze().numpy()

        roc_score = compute_roc_score(amaps, np.array(artifacts["mask"]), artifacts["stem"])
        pro_score = compute_pro_score(amaps, np.array(artifacts["mask"]))
        mlflow.log_metrics({"roc_score": roc_score, "pro_score": pro_score})
        draw_roc_and_pro_curve(roc_score, pro_score)
        savegif(
            np.array(artifacts["image"]),
            amaps,
            np.array(artifacts["mask"]),
            artifacts["stem"],
        )
def get_anomalous_values(data, window_size, prob=0.99):
    """
    Return a list of anomalous values, i.e. the ones whose Mahalanobis
    distance from the expected multivariate average exceeds the threshold
    implied by prob. Both the multivariate average and the Mahalanobis
    distance are calculated over a moving window, i.e. computed from
    window_size neighbours, moving the window for each value of the series.

    data : pandas.core.frame.DataFrame
    window_size : int
    prob : float

    return: list
    """

    # under the normal hypothesis, the Mahalanobis distance is chi-squared
    # distributed (here with two degrees of freedom)
    threshold = np.sqrt(-2 * np.log(1 - prob))

    # slide the window over each point, and report an anomaly if the
    # Mahalanobis distance of the idx-th point from its window average
    # exceeds the threshold
    return [(p['idx'], p['value']) for p in nd_rolling(data, window_size)
            if mahalanobis(p['value'], p['window_avg'],
                           np.linalg.inv(p['window_cov'])) > threshold]
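The threshold expression comes from the chi-square distribution with 2 degrees of freedom (the bivariate case), whose CDF inverts in closed form; a short derivation under that assumption:

P(D^2 \le t) = 1 - e^{-t/2} = p
\quad\Longrightarrow\quad
t = -2\ln(1 - p),
\qquad
D_{\mathrm{thr}} = \sqrt{-2\ln(1 - p)}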
Example #30
def KullbackLeiberDivergence(CoefficientA, CoefficientB, CoefficientC, Mean,
                             Sample):

    distance = (ds.mahalanobis(Mean, Sample, CoefficientA))**2
    divergence = CoefficientC + distance - CoefficientB - len(Mean)

    return np.sqrt(divergence / 2)
Example #31
def _mahalanobis_distances_scipy(m, SI, X):
	n = X.shape[0]
	mahal = np.zeros(n)
	for i in range(n):
		x = X[i, :]
		mahal[i] = distance.mahalanobis(x, m, SI)
	return mahal
Example #33
 def metrykaMahalanobisa(self, array1, array2, macierzKowariancji):
     r"""
     Computes the Mahalanobis distance between two n-vectors ``array1`` and
     ``array2``, which is defined as

     .. math::

        \sqrt{ (u-v) V^{-1} (u-v)^T }

     where ``V`` is the covariance matrix. Note that ``macierzKowariancji``
     must be the *inverse* of ``V``.

     Parameters
     ----------
     array1 : ndarray
         An :math:`n`-dimensional vector.
     array2 : ndarray
         An :math:`n`-dimensional vector.
     macierzKowariancji : ndarray
         The inverse of the covariance matrix.

     Returns
     -------
     d : double
         The Mahalanobis distance between ``array1`` and ``array2``.
     """
     return mahalanobis(array1, array2, macierzKowariancji)
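A quick check (with made-up numbers) that scipy's mahalanobis reproduces the docstring formula when handed the inverse covariance:

import numpy as np
from scipy.spatial.distance import mahalanobis

u = np.array([2.0, 0.0])
v = np.array([0.0, 1.0])
V = np.array([[2.0, 0.5], [0.5, 1.0]])
VI = np.linalg.inv(V)

diff = u - v
assert np.isclose(mahalanobis(u, v, VI), np.sqrt(diff @ VI @ diff))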
Example #34
def compute_compatibility(observations, predictions):
    """ 
	Individual Compatibility Test
	"""
    compatibility = {'d2': None, 'IC': None}
    compatibility['d2'] = np.zeros(shape=(observations['M'], predictions['N']))
    compatibility['IC'] = np.zeros(shape=(observations['M'], predictions['N']))

    # Compute Individual Squared Mahalanobis Distances
    for i in range(observations['M']):
        z = observations['z'][i]
        R = observations['R_covariance'][i]
        # R = [1]
        for j in range(predictions['N']):
            C = np.add(predictions['H_P_H'][i], R)
            C_inverse = np.linalg.inv(C)
            # C_inverse = [1]
            # print(z,R,C, predictions['h_map_fn'][j])
            # scipy returns the (non-squared) distance; square it for d2
            compatibility['d2'][i][j] = mahalanobis(z,
                                                    predictions['h_map_fn'][j],
                                                    C_inverse) ** 2

    # Check Mahalanobis Distance against critical values from a Chi2 Distribution.
    for i in range(observations['M']):
        for j in range(predictions['N']):
            if (compatibility['d2'][i][j] < chi2.isf(q=0.01, df=2)):
                compatibility['IC'][i][j] = 1
            else:
                compatibility['IC'][i][j] = 0

    return compatibility
Example #35
    def __call__(self, state1, state2):
        r"""Calculate the Mahalanobis distance between a pair of state objects

        Parameters
        ----------
        state1 : :class:`~.State`
        state2 : :class:`~.State`

        Returns
        -------
        float
            Mahalanobis distance between a pair of input :class:`~.State`
            objects

        """
        if self.mapping is not None:
            u = state1.state_vector[self.mapping]
            v = state2.state_vector[self.mapping]
            # extract the mapped covariance data
            rows = np.array(self.mapping, dtype=np.intp)
            columns = np.array(self.mapping, dtype=np.intp)
            cov = state1.covar[rows[:, np.newaxis], columns]
        else:
            u = state1.state_vector
            v = state2.state_vector
            cov = state1.covar

        vi = np.linalg.inv(cov)

        return distance.mahalanobis(u, v, vi)
def calc_distance(tracks, means, covs, remeasure):
    f_point = tracks.get_point_measurement()
    if remeasure:
        f_point.append(tracks.get_vdur())
    c = np.linalg.inv(np.asarray(covs))  # inverse covariance for mahalanobis
    dist = mahalanobis(f_point, means, c)
    return dist
Example #37
def distance(vector1, vector2, alpha=2, metric='euclidean'):
    '''
    Helper function that calculates the distance (norm) between two vectors
    under the chosen metric.

    :param vector1: a vector
    :type vector1: list of doubles
    :param vector2: a vector
    :type vector2: list of doubles
    :param metric: euclidean, mahalanobis, seuclidean, cityblock, hamming
    :type metric: string
    :return: the norm between vector1 and vector2
    '''

    mp.dps = 50
    alpha = mpf(1.0 * alpha)
    vector1 = matrix(numpy.array(vector1))
    vector2 = matrix(numpy.array(vector2))

    if metric == 'euclidean':
        vector_norm = distances.euclidean(vector1, vector2)
    elif metric == 'mahalanobis':
        vi = numpy.linalg.inv(
            numpy.cov(numpy.concatenate((vector1, vector2)).T))
        vector_norm = distances.mahalanobis(vector1, vector2, vi)
    elif metric == 'seuclidean':
        vector_norm = distances.seuclidean(vector1, vector2)
    elif metric == 'cityblock':
        vector_norm = distances.cityblock(vector1, vector2)
    elif metric == 'hamming':
        vector_norm = distances.hamming(vector1, vector2)
    else:
        print("Unknown metric")
        return None

    return vector_norm
Example #38
def mahalanobis_distance(a, b):
    """ uses the scipy mahalanobis distance to calculate the distance
    between two arrays; note that scipy expects the *inverse* covariance. """
    x = np.array(a)
    y = np.array(b)
    z = np.vstack((x, y))
    # The two-sample covariance is singular (rank one), so use the
    # pseudo-inverse rather than passing the covariance itself.
    vi = np.linalg.pinv(np.cov(z.T))
    return distance.mahalanobis(x, y, vi)
Example #39
def greylvldistancemetric(x1, x2, cov, k, n, img):
    
    grads = getgradientsalong(x1,n,img,k)
                            
    sumgrads = sum(np.abs(grads))
    if sumgrads != 0:
        grads = grads/float(sumgrads)
        
    return ssd.mahalanobis(x2,grads,cov)
Example #40
def distancePV(sample, mask, params_tissue1, params_tissue2, distance):
    from scipy.spatial.distance import mahalanobis,euclidean
    import numpy as np

    # Direction vector between pure tissues
    d_vect = np.ravel(params_tissue2[0] - params_tissue1[0]).T
    mu1 = np.ravel(params_tissue1[0])
    mu2 = np.ravel(params_tissue2[0])
    SI1 = params_tissue1[1].getI()
    SI2 = params_tissue2[1].getI()

    if distance=='mahalanobis':
        norm = np.array( [ 1/(1+ mahalanobis(pix,mu2,SI2)/ mahalanobis(pix,mu1,SI1)) for pix in sample[mask==1] ] )
    elif distance=='dummy':
        norm = mask*0.5
    else:
        norm = np.array( [ 1/(1+ euclidean(pix,mu2)/ euclidean(pix,mu1)) for pix in sample[mask==1] ] )
    result = np.zeros( np.shape( mask ) )
    result[mask==1] = norm
    return result
Example #41
def closest_mahalanobis(pt, W, cov, k):
	min_is = [-1] * k
	min_dists = [float('inf')] * k
	for i in range(W.shape[1]):
		dist = distance.mahalanobis(pt, W[:, i], cov)

		max_i = max(range(len(min_dists)), key=min_dists.__getitem__)
		if dist < min_dists[max_i]:
			min_dists[max_i] = dist
			min_is[max_i] = i
	return min_is
Example #42
 def fit(self, X):
     """Prints some summary stats (if verbose is on) and returns the indices of what it considers to be extreme"""
     self.mcd.fit(X)
     d = np.array([distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_)
                   for p in X])  # Mahalanobis distance values
     self.d2 = d ** 2  # MD squared
     n, self.degrees_of_freedom_ = X.shape
     self.iextreme_values = (self.d2 > self.chi2.ppf(0.995, self.degrees_of_freedom_))
     if self.verbose:
         print("%.3f proportion of outliers at %.3f%% chi2 percentile, "
               % (self.iextreme_values.sum() / float(n), self.chi2_percentile))
         print("with support fraction %.2f." % self.support_fraction)
     return self
Example #43
 def mah_dist(self, new_pat, old_pats, min_limit):
     old_gaps = []
     new_gap = self.convert_pattern_to_gaps(new_pat)
     for pat in old_pats:
         old_gaps.append(self.convert_pattern_to_gaps(pat))
     old_arr = numpy.array(old_gaps)
     cov = numpy.cov(old_arr, rowvar=0)
     cov_inv = numpy.linalg.pinv(cov)
     mean = numpy.mean(old_gaps, axis=0)
     dist = numpy.sqrt(distance.mahalanobis(new_gap, mean, cov_inv))
     print('dist = ', dist)
     return dist < min_limit
Example #44
    def SetCOINdist_2D(self, fit, distances, means, dist_type):

        # loop over the population fitness values
        count = 0

        # calculate the pseudo-inverse covariance when needed
        if dist_type == "mahalanobis":
            x = []
            y = []
            for i in fit:
                x.append(i[0][0])
                y.append(i[0][1])
            covar = np.linalg.pinv(np.cov(np.array(x), np.array(y)))

        for f in fit:
            # distance from this fitness value to the nearest mean
            shortest = float("inf")

            for i in means:
                if dist_type == "euclidean":
                    a = distance.euclidean(i, f[0])
                else:
                    a = distance.mahalanobis(i, f[0], covar)

                if a <= shortest:
                    shortest = a

            # the shortest distance to one of the points
            distances[count] = shortest
            count += 1
Example #45
    def module3(self):

        # define the vectors
        vector1 = np.array([1, 1]).astype(np.float64)
        vector2 = np.array([2, 3]).astype(np.float64)

        con = np.vstack((vector1, vector2))

        # compute the inverse matrix (vi is passed as the inverse covariance)
        vi = np.linalg.inv(con.T)
        print(vi)

        vector_norm = distances.mahalanobis(vector1, vector2, vi)
        print(vector_norm)
Example #46
def test_mah_dist(true_pat, false_pat, err_bound):
    n_error_true = 0
    n_error_false = 0
    for i, t_pat in enumerate(true_pat):
        # leave-one-out: exclude the i-th pattern from the reference set
        ex_pat = [p for j, p in enumerate(true_pat) if i != j]
        m = np.mean(ex_pat, axis=0)
        cov = np.cov(ex_pat, rowvar=0)
        inv_cov = np.linalg.pinv(cov)
        mah_dist = dist.mahalanobis(t_pat, m, inv_cov)
        if mah_dist > err_bound:
            n_error_true += 1

    m = np.mean(true_pat, axis=0)
    cov = np.cov(true_pat, rowvar=0)
    inv_cov = np.linalg.pinv(cov)
    for f_pat in false_pat:
        mah_dist = dist.mahalanobis(f_pat, m, inv_cov)
        if mah_dist < err_bound:
            n_error_false += 1

    frr = float(n_error_true) / float(len(true_pat))
    far = float(n_error_false) / float(len(false_pat))
    return far, frr
Example #47
def opt_alpha(alpha):
    e1 = alpha[0]*v_eff + alpha[1]*w_eff
    e2 = alpha[2]*v_eff + alpha[3]*w_eff
    
    v = v_eff + np.random.normal(e1)
    w = w_eff + np.random.normal(e2)
    
    all_v = np.vstack((v, w))
    
    cov_mat = np.cov(wld_vel.T)
    
    total_dist = 0.0
    for i in range(len(all_v.T)):
        total_dist += sp_dist.mahalanobis(all_v.T[i], wld_vel[i], np.linalg.inv(cov_mat))
    
    return total_dist
    def find_best_position(self, sampled_profile, point_index):
        '''
        Slide the stored model profile along the sampled profile and return
        the position with the smallest Mahalanobis distance.

        :param sampled_profile: profile of length 2m + 1
        :param point_index: index of the model point (model profile length 2k + 1)
        :return: the best position within the sampled profile
        '''
        model_length = len(self.means_points_model[point_index])
        sampled_profile_length = len(sampled_profile)
        min_value = float("inf")
        min_index = 0
        for i in range(0, sampled_profile_length - model_length + 1):
            sampled_profile_part = sampled_profile[i:i + model_length]
            distance = mahalanobis(sampled_profile_part, self.means_points_model[point_index],
                                   self.inverse_covariance_points_model[point_index])
            if distance < min_value:
                min_value = distance
                min_index = i
        return self.k + min_index
Example #50
def mahala_fcn(x, y):
    '''

    Parameters
    ----------

    x - numpy.ndarray
        A 1D array

    y - numpy.ndarray
        A 1D array

    '''

    cov = np.cov(list(zip(x, y)))
    try:
        icov = np.linalg.inv(cov)
    except np.linalg.LinAlgError:
        # regularize the diagonal (np.eye's k argument is an offset, not a scale)
        icov = np.linalg.inv(cov + 1e-3 * np.eye(cov.shape[0], cov.shape[1]))

    val = mahalanobis(x, y, icov)

    return np.sqrt(val)
Example #51
def init_d2_weighting(data, num_comp):

    num_obs = data.shape[0]

    cov_inv = np.linalg.inv(np.cov(data, rowvar=0))

    select_prob = np.ones(num_obs) / num_obs
    shortest_dist = np.inf * np.ones(num_obs)
    centroid = np.ones(num_comp, dtype=int)  # will hold integer indices into data

    for k in range(num_comp):
        # Select a random data point as centroid (index of the drawn point)
        centroid[k] = np.nonzero(multinomial(1, select_prob))[0][0]

        # Recompute distances
        for i, d in enumerate(shortest_dist):
            d_new = mahalanobis(data[centroid[k], :], data[i, :], cov_inv)
            if d_new < d: shortest_dist[i] = d_new

        select_prob = normalize_logspace(
            shortest_dist.reshape(1, len(shortest_dist)) ** 2)
        select_prob = select_prob.flatten()

    return centroid
Example #52
import numpy as np
import pylab as pl
import scipy.spatial.distance as dist
def plotSamples(x, y, z=None):
    stars = np.matrix([[3., -2., 0.], [3., 2., 0.]])
    if z is not None:
        x, y = z * np.matrix([x, y])
        stars = z * stars
    pl.scatter(x, y, s=10)  # plot the random Gaussian points
    pl.scatter(np.array(stars[0]), np.array(stars[1]), s=200, marker='*', color='r')  # plot the three marked points
    pl.axhline(linewidth=2, color='g')  # draw the x axis
    pl.axvline(linewidth=2, color='g')  # draw the y axis
    pl.axis('equal')
    pl.axis([-5, 5, -5, 5])
    pl.show()
# generate random points from a Gaussian distribution
mean = [0, 0]  # mean
cov = [[2, 1], [1, 2]]  # covariance
x, y = np.random.multivariate_normal(mean, cov, 1000).T
plotSamples(x, y)
covMat = np.matrix(np.cov(x, y))  # covariance matrix of x and y
Z = np.linalg.cholesky(covMat).I  # whitening (affine) matrix
plotSamples(x, y, Z)
# compute the Mahalanobis distances
print('\nMahalanobis distances to the origin:')
print(dist.mahalanobis([0, 0], [3, 3], covMat.I), dist.mahalanobis([0, 0], [-2, 2], covMat.I))
# compute the Euclidean distances after the transformation
dots = (Z * np.matrix([[3, -2, 0], [3, 2, 0]])).T
print('\nEuclidean distances to the origin after the transformation:')
print(dist.minkowski([0, 0], np.array(dots[0]), 2), dist.minkowski([0, 0], np.array(dots[1]), 2))
def dbScanDistance(a, b):
	# scipy's mahalanobis expects the *inverse* covariance; the two-sample
	# covariance is singular, so use the pseudo-inverse
	covInv = np.linalg.pinv(np.cov(np.vstack((a, b)).T))
	return mahalanobis(a, b, covInv)
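A hypothetical usage sketch, passing the callable above to scikit-learn's DBSCAN as a custom metric (the eps value is arbitrary):

import numpy as np
from sklearn.cluster import DBSCAN

points = np.random.default_rng(0).normal(size=(100, 3))
labels = DBSCAN(eps=2.0, metric=dbScanDistance).fit_predict(points)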
 def y_vec(self, centers, w):
     return np.array([mahalanobis(w, centers[label], np.linalg.inv(self.Gammas[label])) + self.noise.fabric(
             self.iteration_num) for label in range(self.n_clusters)])
Example #55
        example_dist[label]['i_cov'] = cov_.precision_
        print('Inverted covariance estimated...')
        
    # Get predictions of target dataset (unlabelled)
    # We simply apply classifier to target dataset
    classifier_prediction_tar = results['predictions']

    
    mahalanobis_values = np.zeros_like(ds_tar.targets, dtype=np.float)
    distances = dict()
    
    # For each class it is computed the distance of samples from class distribution
    for c in np.unique(classifier_prediction_tar):
        distances[c] = []
        for j, ex in enumerate(ds_tar.samples):
            dist_ = mahalanobis(example_dist[c]['mean'], ex, example_dist[c]['i_cov'])
            distances[c].append(dist_)
            # If the class matches the classifier prediction we store it.
            # It makes no sense to store two arrays, but for now it is done this way!
            ## TODO: Create only one vector and filter it afterwards
            if c == classifier_prediction_tar[j]:
                mahalanobis_values[j] = dist_
        
        distances[c] = np.array(distances[c]) ** 2
    '''
    Squared Mahalanobis distance is similar to a chi square distribution with 
    degrees of freedom equal to the number of features.
    '''
    
    mahalanobis_values = np.array(mahalanobis_values) ** 2
    
 def cluster_decision(self, point):
     return np.argmin(
             [mahalanobis(self.cluster_centers_[label], point, np.linalg.inv(self.Gammas[label])) for label in
              range(self.n_clusters)])
Example #57
def gaussian_weights(bundle, n_points=100, return_mahalnobis=False,
                     stat=np.mean):
    """
    Calculate weights for each streamline/node in a bundle, based on a
    Mahalanobis distance from the core the bundle, at that node (mean, per
    default).

    Parameters
    ----------
    bundle : Streamlines
        The streamlines to weight.
    n_points : int, optional
        The number of points to resample to. *If the `bundle` is an array, this
        input is ignored*. Default: 100.

    Returns
    -------
    w : array of shape (n_streamlines, n_points)
        Weights for each node in each streamline, calculated as its relative
        inverse of the Mahalanobis distance, relative to the distribution of
        coordinates at that node position across streamlines.
    """
    # Resample to same length for each streamline:
    bundle = set_number_of_points(bundle, n_points)

    # This is the output
    w = np.zeros((len(bundle), n_points))

    # If there's only one fiber here, it gets the entire weighting:
    if len(bundle) == 1:
        if return_mahalnobis:
            return np.array([np.nan])
        else:
            return np.array([1])

    for node in range(n_points):
        # This should come back as a 3D covariance matrix with the spatial
        # variance covariance of this node across the different streamlines
        # This is a 3-by-3 array:
        node_coords = bundle.data[node::n_points]
        c = np.cov(node_coords.T, ddof=0)
        # Reorganize as an upper diagonal matrix for expected Mahalnobis input:
        c = np.array([[c[0, 0], c[0, 1], c[0, 2]],
                      [0, c[1, 1], c[1, 2]],
                      [0, 0, c[2, 2]]])
        # Calculate the mean or median of this node as well
        # delta = node_coords - np.mean(node_coords, 0)
        m = stat(node_coords, 0)
        # Weights are the inverse of the Mahalanobis distance
        for fn in range(len(bundle)):
            # In the special case where all the streamlines have the exact same
            # coordinate in this node, the covariance matrix is all zeros, so
            # we can't calculate the Mahalnobis distance, we will instead give
            # each streamline an identical weight, equal to the number of
            # streamlines:
            if np.allclose(c, 0):
                w[:, node] = len(bundle)
                break
            # Otherwise, go ahead and calculate Mahalanobis for node on
            # fiber[fn]:
            w[fn, node] = mahalanobis(node_coords[fn], m, np.linalg.inv(c))
    if return_mahalnobis:
        return w
    # weighting is inverse to the distance (the further you are, the less you
    # should be weighted)
    w = 1 / w
    # Normalize before returning, so that the weights in each node sum to 1:
    return w / np.sum(w, 0)
Example #58
precision = np.load('precision_test.npy')  # Needed for finding best conformer
lowest_dist = float('inf')
lowest_pose = []  # Best pose, saved for later output
for i in range(n_cycles):
    params_new = tecto.update_get_connection()
    params[i] = params_new

    # Find the distance to ideal tetraloop-receptor params
    diff = PARAMS_TLR - params_new
    # Make sure the diff angles are within (-pi, pi]
    # (use j here: reusing i would clobber the outer loop index)
    for j in range(3, 6):  # index 3,4,5 are angles, others are distances
        if diff[j] > np.pi:
            diff[j] -= 2 * np.pi
        elif diff[j] <= -np.pi:
            diff[j] += 2 * np.pi
    dist = mahalanobis(diff, np.zeros(6), precision)
    if dist < lowest_dist:
        lowest_dist = dist
        lowest_pose = [tecto.pose1.copy(), tecto.pose2.copy()]

###### Likelihood Computation ######
# Fold the angles in params into the proper range, such that
# they are centered at the mean.
N_CYCLE_FOLD_ANGLE = 10
for j in range(N_CYCLE_FOLD_ANGLE):
    mean = np.mean(params, axis=0)
    for i in range(3, 6):  # index 3,4,5 are angles, others are distances
        params[:, i][params[:, i] > mean[i] + np.pi] -= 2 * np.pi
        params[:, i][params[:, i] < mean[i] - np.pi] += 2 * np.pi
        if PARAMS_TLR[i] > mean[i] + np.pi:
            PARAMS_TLR[i] -= 2 * np.pi  # fold down, matching the row folding above
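The folding idiom above can be captured in a small helper; a sketch, not part of the original script:

import numpy as np

def fold_angle(theta, center=0.0):
    # Map theta into the interval (center - pi, center + pi]
    return theta - 2 * np.pi * np.ceil((theta - center - np.pi) / (2 * np.pi))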
Example #59
def remove_outliers(treeList, strategy, outpath, e, summary):
    print("the strategy is: " + strategy)
    if len(treeList) < 10:
        print("number of trees is " + str(len(treeList)) + ". This is not enough for outlier removal!")
        return treeList
    if strategy == "consensus10" or strategy == "consensus3":
        ftmp = findMRL(treeList, e, outpath, summary)
        ref_tree = dendropy.Tree.get(path=ftmp, schema="newick")
        treeList.append(ref_tree)
        d = list()

        for tree in treeList:
            tree.encode_bipartitions()
            ref_tree.encode_bipartitions()
            res = treecompare.false_positives_and_negatives(ref_tree, tree)
            d.append(res[1])
        if strategy == "consensus3":
            mean = np.mean(d)
            #             mean = mstats.mode(d)
            #             mean = mean[0]
            print "the mean distance to consensus tree was: " + str(mean)
            st = np.std(d)
            print "the std of distances to consensus tree was: " + str(st)
            for i in range(len(d) - 1, 0, -1):
                if d[i] > mean + 2.0 * st:
                    print "deleting " + str(i) + "th tree!"
                    print "d[i] to delete: " + str(d[i])
                    del treeList[i]
        else:
            sortIdx = np.argsort(d, 0)
            print len(sortIdx)
            print sortIdx
            m = int(len(sortIdx) / 4.0)
            print "deleting " + str(m) + " of the trees"
            idx = sorted([x for x in sortIdx[len(sortIdx) - m : len(sortIdx)]], reverse=True)
            print idx
            print d
            for i in idx:
                print "deleting the tree " + str(i) + "the. The distance to consensus tree was: " + str(d[i])
                del treeList[i]
    elif strategy == "pairwise1" or strategy == "pairwise2" or strategy == "pairwise3":
        D = np.ndarray(shape=(len(treeList), len(treeList)), dtype=float)
        for i in range(0, len(treeList)):
            D[i][i] = 0.0
            for j in range(i + 1, len(treeList)):
                tree1 = treeList[i]
                tree2 = treeList[j]
                tree1.encode_bipartitions()
                tree2.encode_bipartitions()
                res1 = treecompare.false_positives_and_negatives(tree1, tree2)
                D[i][j] = res1[1]
                D[j][i] = res1[0]
        if strategy == "pairwise1":
            d = np.mean(D, 1)

            C = np.cov(D)
            v = [distance.mahalanobis(D[:, i], d, C) for i in range(0, len(treeList))]
            print(v)
            sortIdx = np.argsort(v, 0)
            m = int(len(sortIdx) * 0.15)
            idx = sorted([x for x in sortIdx[len(sortIdx) - m : len(sortIdx)]], reverse=True)
            for i in idx:
                print("deleting tree " + str(i) + ". The distance to the consensus tree was: " + str(v[i]))
                del treeList[i]
        elif strategy == "pairwise3":
            d = np.mean(D, 0)

            sortIdx = np.argsort(d, 0)
            print(len(sortIdx))
            print(sortIdx)
            m = int(len(sortIdx) / 5.0)
            print("deleting " + str(m) + " of the trees")
            idx = sorted([x for x in sortIdx[len(sortIdx) - m : len(sortIdx)]], reverse=True)
            print(idx)
            print(d)
            for i in idx:
                print("deleting tree " + str(i) + ". The distance to the consensus tree was: " + str(d[i]))
                del treeList[i]
        else:
            d = np.mean(D, 0)
            print(d)
            mean = np.mean(d)
            st = np.std(d)
            idx = list()
            for k in range(len(d) - 1, 0, -1):
                if d[k] > mean + 1.5 * st:
                    print("deleting tree " + str(k) + ". The distance to the consensus tree was: " + str(d[k]))
                    del treeList[k]

    return treeList