Esempio n. 1
0
    def __init__(self, data, index=None, pca=False, min_std=10e-9):
        """Z-score the data, build the column mask and store the train data.

        Args:
            data: array that is indexed as ``data[:, mask]``, so it is
                expected to be 2-d (rows are presumably samples, columns
                features -- confirm against the callers).
            index: optional selector applied to the boolean column mask.
                With ``None`` every column is a candidate, otherwise only
                the columns addressed by ``index``.
            pca: when true, a ``PCA`` is fitted on the filtered data and
                its projection is stored as ``traindata``.
            min_std: accepted in the signature but not referenced in this
                method.
        """
        self.data = data
        self._pca = None

        self._zs = ZScore(data, replace_inf=True)
        zscored = self._zs.normalize(data)

        # Columns whose sum over all rows evaluates to NaN are dropped.
        # NOTE(review): presumably zero-variance columns turn into NaN
        # during z-scoring and are caught here as well -- confirm in ZScore.
        finite_cols = ~np.isnan(zscored.sum(axis=0))

        if index is None:
            selected = np.ones(finite_cols.shape, dtype=bool)
        else:
            selected = np.zeros(finite_cols.shape, dtype=bool)
            selected[index] = True

        # keep a column only if it is both requested and NaN free
        self._mask = selected & finite_cols

        filtered = self.filter(zscored)

        if not pca:
            self.traindata = filtered
        else:
            self._pca = PCA(filtered, minfrac=0.05)
            self.traindata = self._pca.project(filtered)
Esempio n. 2
0
    def __call__(self):
        """Return the Euclidean distance of each sample to the mean example.

        ``self.data`` and ``self.treedata`` are both normalized with a
        ``ZScore`` built from ``self.data``, passed through ``filter_nans``
        and the column-wise mean of the normalized examples is used as the
        reference point.

        Returns:
            Array with one distance per row of the filtered data.

        Raises:
            SortingError: if ``self.treedata`` is None.
        """
        if self.treedata is None:
            raise SortingError("No examples for similarity measure available!")

        # z-scoring
        zs = ZScore(self.data, replace_inf=True)
        data_zs = zs.normalize(self.data)

        # the examples are scaled with the statistics of the full data set
        mu = zs.normalize(self.treedata)
        data_zs, mu = filter_nans(data_zs, mu)

        # mean of the z-scored examples, one value per remaining column
        mu = mu.mean(axis=0)

        # Broadcasting subtracts the mean from every row at once, which
        # replaces the former per-row Python loop.
        diff = np.asarray(data_zs) - np.asarray(mu)
        return np.sqrt((diff ** 2).sum(axis=1))
Esempio n. 3
0
    def __call__(self):
        """Return the Euclidean distance of each sample to the mean example.

        ``self.data`` and ``self.treedata`` are both normalized with a
        ``ZScore`` built from ``self.data``, passed through ``filter_nans``
        and the column-wise mean of the normalized examples is used as the
        reference point.

        Returns:
            Array with one distance per row of the filtered data.

        Raises:
            SortingError: if ``self.treedata`` is None.
        """
        if self.treedata is None:
            raise SortingError("No examples for similarity measure available!")

        # z-scoring
        zs = ZScore(self.data, replace_inf=True)
        data_zs = zs.normalize(self.data)

        # the examples are scaled with the statistics of the full data set
        mu = zs.normalize(self.treedata)
        data_zs, mu = filter_nans(data_zs, mu)

        # mean of the z-scored examples, one value per remaining column
        mu = mu.mean(axis=0)

        # Broadcasting subtracts the mean from every row at once, which
        # replaces the former per-row Python loop.
        diff = np.asarray(data_zs) - np.asarray(mu)
        return np.sqrt((diff ** 2).sum(axis=1))
Esempio n. 4
0
    def __init__(self, data, index=None, pca=False, min_std=10e-9):
        """Z-score the data, build the column mask and store the train data.

        Args:
            data: array that is indexed as ``data[:, mask]``, so it is
                expected to be 2-d (rows are presumably samples, columns
                features -- confirm against the callers).
            index: optional selector applied to the boolean column mask.
                With ``None`` every column is a candidate, otherwise only
                the columns addressed by ``index``.
            pca: when true, a ``PCA`` is fitted on the filtered data and
                its projection is stored as ``traindata``.
            min_std: accepted in the signature but not referenced in this
                method.
        """
        self.data = data
        self._pca = None

        self._zs = ZScore(data, replace_inf=True)
        zscored = self._zs.normalize(data)

        # Columns whose sum over all rows evaluates to NaN are dropped.
        # NOTE(review): presumably zero-variance columns turn into NaN
        # during z-scoring and are caught here as well -- confirm in ZScore.
        finite_cols = ~np.isnan(zscored.sum(axis=0))

        if index is None:
            selected = np.ones(finite_cols.shape, dtype=bool)
        else:
            selected = np.zeros(finite_cols.shape, dtype=bool)
            selected[index] = True

        # keep a column only if it is both requested and NaN free
        self._mask = selected & finite_cols

        filtered = self.filter(zscored)

        if not pca:
            self.traindata = filtered
        else:
            self._pca = PCA(filtered, minfrac=0.05)
            self.traindata = self._pca.project(filtered)
Esempio n. 5
0
    def __call__(self):
        """Return the negated cosine similarity of each sample to the mean
        example.

        ``self.data`` and ``self.treedata`` are both normalized with a
        ``ZScore`` built from ``self.data`` and passed through
        ``filter_nans``. The column-wise mean of the normalized examples is
        the reference vector. The similarity is multiplied by -1.0 before
        it is returned.

        Raises:
            SortingError: if ``self.treedata`` is None or has fewer than two
                columns.
        """
        examples = self.treedata
        if examples is None:
            raise SortingError("No examples for similarity measure available!")
        if examples.shape[1] < 2:
            raise SortingError(("CosineSimilarity needs at least 2 "
                                "features for sorting"))

        # z-scoring, the examples are scaled with the statistics of the data
        scaler = ZScore(self.data, replace_inf=True)
        samples = scaler.normalize(self.data)
        reference = scaler.normalize(self.treedata)
        samples, reference = filter_nans(samples, reference)

        # z-scored mean value of the examples
        reference = reference.mean(axis=0)

        reference_norm = np.sqrt((reference**2).sum())
        sample_norms = np.sqrt((samples**2).sum(axis=1))
        dot_products = np.sum(reference * samples, axis=1)

        cosine = dot_products / (reference_norm * sample_norms)
        return -1.0 * cosine
Esempio n. 6
0
    def __call__(self):
        """Return the negated cosine similarity of each sample to the mean
        example.

        ``self.data`` and ``self.treedata`` are both normalized with a
        ``ZScore`` built from ``self.data`` and passed through
        ``filter_nans``. The column-wise mean of the normalized examples is
        the reference vector. The similarity is multiplied by -1.0 before
        it is returned.

        Raises:
            SortingError: if ``self.treedata`` is None or has fewer than two
                columns.
        """
        examples = self.treedata
        if examples is None:
            raise SortingError("No examples for similarity measure available!")
        if examples.shape[1] < 2:
            raise SortingError(("CosineSimilarity needs at least 2 "
                                "features for sorting"))

        # z-scoring, the examples are scaled with the statistics of the data
        scaler = ZScore(self.data, replace_inf=True)
        samples = scaler.normalize(self.data)
        reference = scaler.normalize(self.treedata)
        samples, reference = filter_nans(samples, reference)

        # z-scored mean value of the examples
        reference = reference.mean(axis=0)

        reference_norm = np.sqrt((reference**2).sum())
        sample_norms = np.sqrt((samples**2).sum(axis=1))
        dot_products = np.sum(reference * samples, axis=1)

        cosine = dot_products / (reference_norm * sample_norms)
        return -1.0 * cosine
Esempio n. 7
0
class PreProcessor(object):
    """Normalize data and restrict it to a set of usable columns.

    The data is z-scored and every column whose z-scored sum is NaN is
    masked out (presumably this also covers zero-variance columns --
    confirm against ``ZScore``). If ``index`` is None all columns are
    candidates, otherwise only those addressed by ``index``, which can be
    an integer or a list of integers. Optionally the filtered data is
    projected with a ``PCA``.
    """

    def __init__(self, data, index=None, pca=False, min_std=10e-9):
        """Fit the z-scoring, build the column mask and store ``traindata``.

        Args:
            data: array indexed as ``data[:, mask]``.
            index: optional selector for the columns to keep.
            pca: when true, fit a ``PCA`` on the filtered data and store
                its projection as ``traindata``.
            min_std: accepted in the signature but not referenced in this
                class.
        """
        self.data = data
        self._pca = None

        self._zs = ZScore(data, replace_inf=True)
        zscored = self._zs.normalize(data)

        # columns whose sum over all rows evaluates to NaN are dropped
        finite_cols = ~np.isnan(zscored.sum(axis=0))

        if index is None:
            selected = np.ones(finite_cols.shape, dtype=bool)
        else:
            selected = np.zeros(finite_cols.shape, dtype=bool)
            selected[index] = True

        # keep a column only if it is both requested and NaN free
        self._mask = selected & finite_cols

        filtered = self.filter(zscored)

        if not pca:
            self.traindata = filtered
        else:
            self._pca = PCA(filtered, minfrac=0.05)
            self.traindata = self._pca.project(filtered)

    @property
    def std(self):
        """Column-wise standard deviation of the raw (unnormalized) data."""
        raw = self.data
        return raw.std(axis=0)

    @property
    def mean(self):
        """Column-wise mean of the raw (unnormalized) data."""
        raw = self.data
        return raw.mean(axis=0)

    @property
    def mask(self):
        """Boolean column mask that ``filter`` applies."""
        return self._mask

    @property
    def nfeatures(self):
        """Number of columns of the train data."""
        _, ncols = self.traindata.shape[0], self.traindata.shape[1]
        return ncols

    @property
    def nsamples(self):
        """Number of rows of the raw data."""
        return self.data.shape[0]

    def normalize(self, data):
        """Normalize ``data`` with the z-scoring fitted in ``__init__``."""
        return self._zs.normalize(data)

    def filter(self, data):
        """Return only the columns of ``data`` that the mask keeps."""
        keep = self._mask
        return data[:, keep]

    def __call__(self, data):
        """Normalize and filter ``data``; project it if a PCA was fitted."""
        prepared = self.filter(self.normalize(data))
        if self._pca is not None:
            return self._pca.project(prepared)
        return prepared
Esempio n. 8
0
class PreProcessor(object):
    """Normalize data and restrict it to a set of usable columns.

    The data is z-scored and every column whose z-scored sum is NaN is
    masked out (presumably this also covers zero-variance columns --
    confirm against ``ZScore``). If ``index`` is None all columns are
    candidates, otherwise only those addressed by ``index``, which can be
    an integer or a list of integers. Optionally the filtered data is
    projected with a ``PCA``.
    """
    def __init__(self, data, index=None, pca=False, min_std=10e-9):
        """Fit the z-scoring, build the column mask and store ``traindata``.

        Args:
            data: array indexed as ``data[:, mask]``.
            index: optional selector for the columns to keep.
            pca: when true, fit a ``PCA`` on the filtered data and store
                its projection as ``traindata``.
            min_std: accepted in the signature but not referenced in this
                class.
        """
        self.data = data
        self._pca = None

        self._zs = ZScore(data, replace_inf=True)
        zscored = self._zs.normalize(data)

        # columns whose sum over all rows evaluates to NaN are dropped
        finite_cols = ~np.isnan(zscored.sum(axis=0))

        if index is None:
            selected = np.ones(finite_cols.shape, dtype=bool)
        else:
            selected = np.zeros(finite_cols.shape, dtype=bool)
            selected[index] = True

        # keep a column only if it is both requested and NaN free
        self._mask = selected & finite_cols

        filtered = self.filter(zscored)

        if not pca:
            self.traindata = filtered
        else:
            self._pca = PCA(filtered, minfrac=0.05)
            self.traindata = self._pca.project(filtered)

    @property
    def std(self):
        """Column-wise standard deviation of the raw (unnormalized) data."""
        raw = self.data
        return raw.std(axis=0)

    @property
    def mean(self):
        """Column-wise mean of the raw (unnormalized) data."""
        raw = self.data
        return raw.mean(axis=0)

    @property
    def mask(self):
        """Boolean column mask that ``filter`` applies."""
        return self._mask

    @property
    def nfeatures(self):
        """Number of columns of the train data."""
        _, ncols = self.traindata.shape[0], self.traindata.shape[1]
        return ncols

    @property
    def nsamples(self):
        """Number of rows of the raw data."""
        return self.data.shape[0]

    def normalize(self, data):
        """Normalize ``data`` with the z-scoring fitted in ``__init__``."""
        return self._zs.normalize(data)

    def filter(self, data):
        """Return only the columns of ``data`` that the mask keeps."""
        keep = self._mask
        return data[:, keep]

    def __call__(self, data):
        """Normalize and filter ``data``; project it if a PCA was fitted."""
        prepared = self.filter(self.normalize(data))
        if self._pca is not None:
            return self._pca.project(prepared)
        return prepared