Esempio n. 1
0
    def __init__(self, data, index=None, pca=False, min_std=10e-9):
        """Prepare the training data from a raw feature matrix.

        The data is z-scored, reduced to the columns selected by the
        internal mask via ``self.filter`` and, optionally, projected
        with a PCA.

        Args:
            data: feature matrix; it is summed along axis 0 to find
                unusable columns (assumes rows are samples and columns
                are features -- TODO confirm against callers).
            index: optional column selector. When given, only the
                columns it addresses are kept in the mask.
            pca: if true, fit ``PCA(..., minfrac=0.05)`` on the filtered
                data and store its projection as the training data.
            min_std: accepted for interface compatibility; it is not
                used anywhere in this method.
        """
        self.data = data
        self._pca = None

        self._zs = ZScore(data, replace_inf=True)
        normalized = self._zs.normalize(data)

        # A column whose sum is NaN contains at least one NaN entry.
        # Presumably zero-variance columns end up as NaN after z-scoring
        # and are therefore dropped here too -- verify against ZScore.
        usable_columns = ~np.isnan(normalized.sum(axis=0))

        if index is None:
            selected = np.ones(usable_columns.shape).astype(bool)
        else:
            selected = np.zeros(usable_columns.shape).astype(bool)
            selected[index] = True

        # Boolean multiplication acts as an element-wise logical AND.
        selected *= usable_columns
        # The mask is stored before self.filter() runs, as in the
        # original statement order.
        self._mask = selected

        filtered = self.filter(normalized)

        if not pca:
            self.traindata = filtered
            return

        self._pca = PCA(filtered, minfrac=0.05)
        self.traindata = self._pca.project(filtered)
Esempio n. 2
0
    def __call__(self):
        """Return the Euclidean distance of each sample to the example mean.

        ``self.data`` is z-scored with a ``ZScore`` instance built from
        ``self.data`` itself; ``self.treedata`` is normalized with that
        same instance. Both arrays then pass through ``filter_nans``,
        the example rows are averaged along axis 0, and the distance
        between every row of the normalized data and that mean vector
        is computed.

        Returns:
            One distance per row of the filtered, z-scored data.

        Raises:
            SortingError: if ``self.treedata`` is ``None``.
        """
        if self.treedata is None:
            raise SortingError("No examples for similarity measure available!")

        zs = ZScore(self.data, replace_inf=True)
        data_zs = zs.normalize(self.data)

        # Normalize the examples with the same ZScore instance so that
        # both arrays share one scaling.
        mu = zs.normalize(self.treedata)
        # NOTE(review): filter_nans is defined elsewhere; presumably it
        # drops the features that are NaN in either array -- confirm.
        data_zs, mu = filter_nans(data_zs, mu)

        mu = mu.mean(axis=0)

        # Broadcasting subtracts the mean vector from every row at once,
        # replacing the former per-row Python loop with one reduction.
        return np.sqrt(np.square(data_zs - mu).sum(axis=1))
Esempio n. 3
0
    def __call__(self):
        """Return the negated cosine similarity of each sample to the example mean.

        ``self.data`` is z-scored with a ``ZScore`` instance built from
        ``self.data``; ``self.treedata`` is normalized with that same
        instance. Both arrays pass through ``filter_nans`` and the
        example rows are averaged along axis 0. The cosine between each
        data row and that mean vector is multiplied by -1.0 before it is
        returned (presumably so that more similar rows sort first in
        ascending order -- confirm with callers).

        No guard exists for a zero-length vector: if the mean vector or
        a data row has norm 0, the division yields NaN/inf for the
        affected entries.

        Returns:
            One value per row of the filtered, z-scored data.

        Raises:
            SortingError: if ``self.treedata`` is ``None`` or has fewer
                than two columns.
        """
        examples = self.treedata
        if examples is None:
            raise SortingError("No examples for similarity measure available!")
        if examples.shape[1] < 2:
            raise SortingError(
                "CosineSimilarity needs at least 2 features for sorting")

        scaler = ZScore(self.data, replace_inf=True)
        samples = scaler.normalize(self.data)

        # The examples go through the same ZScore instance as the data.
        centre = scaler.normalize(examples)
        samples, centre = filter_nans(samples, centre)
        centre = centre.mean(axis=0)

        centre_norm = np.sqrt((centre**2).sum())
        sample_norms = np.sqrt((samples**2).sum(axis=1))
        dot_products = np.sum(centre * samples, axis=1)

        cosine = dot_products / (centre_norm * sample_norms)
        return -1.0 * cosine