Esempio n. 1
0
    def __init__(self, traindata, testdata):
        """Load both CSV tables, clean them and z-score normalize them.

        Args:
            traindata: CSV source of the training set, forwarded unchanged
                to ``np.recfromcsv``.
            testdata: CSV source of the test set, forwarded unchanged to
                ``np.recfromcsv``.

        Sets ``feature_names``, ``_mask``, ``_zs``, ``traindata`` and
        ``testdata`` on the instance.
        """
        # NOTE(review): np.recfromcsv is no longer part of the main NumPy
        # namespace as of NumPy 2.0 (np.genfromtxt is the documented
        # replacement) -- confirm which NumPy version this project pins.
        train_rec = np.recfromcsv(traindata)
        test_rec = np.recfromcsv(testdata)

        # Feature names are the training column names minus their first
        # two characters.
        self.feature_names = [name[2:] for name in train_rec.dtype.names]
        n_features = len(self.feature_names)

        # Reinterpret the records as plain 2-D float tables.
        # Assumes every column was parsed as float64 -- TODO confirm.
        train = train_rec.view(float).reshape((-1, n_features))
        test = test_rec.view(float).reshape((-1, n_features))

        # Keep only the columns that are NaN-free and have non-zero
        # standard deviation in the training data.
        nan_free_columns = ~np.isnan(train.sum(axis=0))
        varying_columns = train.std(axis=0) > 0.0
        self._mask = nan_free_columns & varying_columns

        # NOTE(review): the row mask is computed on *all* test columns,
        # i.e. before the column filter is applied, so a NaN in a column
        # that is dropped anyway still discards the row -- confirm intent.
        nan_free_rows = ~np.isnan(test.sum(axis=1))

        train = train[:, self._mask]
        test = test[:, self._mask]
        test = test[nan_free_rows, :]

        # ZScore is defined elsewhere; it is built from the filtered
        # training data only and then applied to both tables.
        self._zs = ZScore(train)
        self.traindata = self.normalize(train)
        self.testdata = self.normalize(test)
Esempio n. 2
0
class PreProcessor(object):
    """Load, clean and z-score normalize a training/test pair of CSV tables.

    Columns that contain NaN or have zero standard deviation in the
    training data are dropped from both tables; test rows containing NaN
    are dropped as well. The normalization object is built from the
    filtered training data and reused for every later ``normalize`` call.
    """

    def __init__(self, traindata, testdata):
        """Read both CSV sources and populate the cleaned, normalized tables.

        Args:
            traindata: CSV source of the training set, forwarded unchanged
                to ``np.recfromcsv``.
            testdata: CSV source of the test set, forwarded unchanged to
                ``np.recfromcsv``.
        """
        # NOTE(review): np.recfromcsv is no longer part of the main NumPy
        # namespace as of NumPy 2.0 (np.genfromtxt is the documented
        # replacement) -- confirm which NumPy version this project pins.
        train_rec = np.recfromcsv(traindata)
        test_rec = np.recfromcsv(testdata)

        # Feature names are the training column names minus their first
        # two characters.
        self.feature_names = [name[2:] for name in train_rec.dtype.names]
        n_features = len(self.feature_names)

        # Reinterpret the records as plain 2-D float tables.
        # Assumes every column was parsed as float64 -- TODO confirm.
        train = train_rec.view(float).reshape((-1, n_features))
        test = test_rec.view(float).reshape((-1, n_features))

        # Keep only the columns that are NaN-free and have non-zero
        # standard deviation in the training data.
        nan_free_columns = ~np.isnan(train.sum(axis=0))
        varying_columns = train.std(axis=0) > 0.0
        self._mask = nan_free_columns & varying_columns

        # NOTE(review): the row mask is computed on *all* test columns,
        # i.e. before the column filter is applied -- confirm intent.
        nan_free_rows = ~np.isnan(test.sum(axis=1))

        train = train[:, self._mask]
        test = test[:, self._mask]
        test = test[nan_free_rows, :]

        # ZScore is defined elsewhere; it is built from the filtered
        # training data only and then applied to both tables.
        self._zs = ZScore(train)
        self.traindata = self.normalize(train)
        self.testdata = self.normalize(test)

    def index(self, items):
        """Return the positions of ``items`` within ``feature_names``.

        ``items`` may be a list/tuple of names or a single name; a single
        name is treated as a one-element sequence. The result is always a
        list. An unknown name raises ``ValueError`` (from ``list.index``).
        """
        if isinstance(items, (list, tuple)):
            names = items
        else:
            names = (items,)
        return [self.feature_names.index(name) for name in names]

    def normalize(self, data):
        """Normalize ``data`` with the stored ``_zs`` object."""
        return self._zs.normalize(data)

    def filter(self, data):
        """Return only the columns of ``data`` selected by ``_mask``."""
        return data[:, self._mask]

    @property
    def ranges(self):
        """(min, max) pairs of the first two columns of ``testdata``.

        Returned as an array with one row per column.
        """
        return np.array(
            [
                (self.testdata[:, column].min(), self.testdata[:, column].max())
                for column in (0, 1)
            ]
        )

    def __call__(self, data):
        """Apply the column filter, then the normalization, to ``data``."""
        filtered = self.filter(data)
        return self.normalize(filtered)
Esempio n. 3
0
class PreProcessor(object):
    """Prepare a training/test pair of CSV tables for a classifier.

    The training table decides which columns survive (no NaN, non-zero
    standard deviation) and provides the data the ``ZScore`` object is
    built from; the test table is filtered and normalized the same way
    after its NaN-containing rows have been removed.
    """

    def __init__(self, traindata, testdata):
        """Read, clean and normalize both tables.

        Args:
            traindata: CSV source of the training set, passed straight to
                ``np.recfromcsv``.
            testdata: CSV source of the test set, passed straight to
                ``np.recfromcsv``.
        """
        # NOTE(review): np.recfromcsv is no longer part of the main NumPy
        # namespace as of NumPy 2.0 (np.genfromtxt is the documented
        # replacement) -- confirm which NumPy version this project pins.
        raw_train = np.recfromcsv(traindata)
        raw_test = np.recfromcsv(testdata)

        # Drop the first two characters of every training column name.
        self.feature_names = [col[2:] for col in raw_train.dtype.names]
        width = len(self.feature_names)

        # Flatten the records into (rows, width) float tables.
        # Assumes every column was parsed as float64 -- TODO confirm.
        train_table = raw_train.view(float).reshape((-1, width))
        test_table = raw_test.view(float).reshape((-1, width))

        # A column is kept when its training sum is not NaN and its
        # training standard deviation is strictly positive.
        column_is_finite = ~np.isnan(train_table.sum(axis=0))
        column_varies = train_table.std(axis=0) > 0.0
        self._mask = column_is_finite & column_varies

        train_table = train_table[:, self._mask]

        # NOTE(review): rows are judged on all test columns, including
        # the ones removed by the column filter -- confirm intent.
        row_is_finite = ~np.isnan(test_table.sum(axis=1))
        test_table = test_table[:, self._mask][row_is_finite, :]

        # ZScore is defined elsewhere; built from training data only.
        self._zs = ZScore(train_table)
        self.traindata = self.normalize(train_table)
        self.testdata = self.normalize(test_table)

    def index(self, items):
        """Look up the positions of one or more names in ``feature_names``.

        Anything that is not a list or tuple is treated as a single name.
        Always returns a list; ``list.index`` raises ``ValueError`` for a
        name that is not present.
        """
        wanted = items if isinstance(items, (list, tuple)) else (items, )
        lookup = self.feature_names.index
        return [lookup(entry) for entry in wanted]

    def normalize(self, data):
        """Delegate to the ``normalize`` method of the stored ``_zs``."""
        return self._zs.normalize(data)

    def filter(self, data):
        """Select the columns of ``data`` flagged in ``_mask``."""
        return data[:, self._mask]

    @property
    def ranges(self):
        """Array of (min, max) rows for test columns 0 and 1."""
        bounds = []
        for column in (0, 1):
            bounds.append((self.testdata[:, column].min(),
                           self.testdata[:, column].max()))
        return np.array(bounds)

    def __call__(self, data):
        """Filter the columns of ``data`` and normalize the result."""
        selected = self.filter(data)
        return self.normalize(selected)
Esempio n. 4
0
    def __init__(self, traindata, testdata):
        """Read, clean and normalize the training and test CSV tables.

        Args:
            traindata: CSV source of the training set, passed straight to
                ``np.recfromcsv``.
            testdata: CSV source of the test set, passed straight to
                ``np.recfromcsv``.

        Sets ``feature_names``, ``_mask``, ``_zs``, ``traindata`` and
        ``testdata`` on the instance.
        """
        # NOTE(review): np.recfromcsv is no longer part of the main NumPy
        # namespace as of NumPy 2.0 (np.genfromtxt is the documented
        # replacement) -- confirm which NumPy version this project pins.
        raw_train = np.recfromcsv(traindata)
        raw_test = np.recfromcsv(testdata)

        # Drop the first two characters of every training column name.
        self.feature_names = [col[2:] for col in raw_train.dtype.names]
        width = len(self.feature_names)

        # Flatten the records into (rows, width) float tables.
        # Assumes every column was parsed as float64 -- TODO confirm.
        train_table = raw_train.view(float).reshape((-1, width))
        test_table = raw_test.view(float).reshape((-1, width))

        # A column is kept when its training sum is not NaN and its
        # training standard deviation is strictly positive.
        column_is_finite = ~np.isnan(train_table.sum(axis=0))
        column_varies = train_table.std(axis=0) > 0.0
        self._mask = column_is_finite & column_varies

        train_table = train_table[:, self._mask]

        # NOTE(review): rows are judged on all test columns, including
        # the ones removed by the column filter -- confirm intent.
        row_is_finite = ~np.isnan(test_table.sum(axis=1))
        test_table = test_table[:, self._mask][row_is_finite, :]

        # ZScore is defined elsewhere; built from training data only.
        self._zs = ZScore(train_table)
        self.traindata = self.normalize(train_table)
        self.testdata = self.normalize(test_table)
Esempio n. 5
0
# One-class SVM script: fit on the metaphase training set, then classify the
# mixed data set after applying the same column filter and normalization.
#
# Earlier configuration, kept for reference:
# rosize = 236, intensity=222, std=223 (-1 because one column will be removed)
# features = [235, 221] # one column is removed later
# tset = np.loadtxt("./trainingset.csv", delimiter=",")
# mixed = np.loadtxt("./all.csv", delimiter=",")

# NOTE(review): the index note below repeats the one above and may not apply
# to the reduced feature set loaded here -- confirm.
# rosize = 236, intensity=222, std=223 (-1 because one column will be removed)
# `features` is not read anywhere in the visible code; presumably the column
# indices of the two features of interest -- TODO confirm.
features = [17, 6]
# tset is the training table, mixed the table to classify; both are read as
# comma-separated numeric data.
tset = np.loadtxt("data/reduced_featuresset/metaphase.csv", delimiter=",")
mixed = np.loadtxt("data/reduced_featuresset/all.csv", delimiter=",")


# filter_mask is defined elsewhere; its result is derived from the training
# table only and used as the column selector for both tables.
mask = filter_mask(tset)
tset = tset[:, mask]
mixed = mixed[:, mask]

# ZScore is defined elsewhere; it is built from the filtered training table
# and then used to normalize both tables.
zs = ZScore(tset)
tset = zs.normalize(tset)
mixed = zs.normalize(mixed)

# fit the model on the training table only
clf = svm.OneClassSVM(nu=0.001, kernel="rbf", gamma=0.01)
clf.fit(tset)


# Predicted labels; assuming `svm` is scikit-learn's module, +1 marks inliers
# and -1 marks outliers.
ytset = clf.predict(tset)
ymixed = clf.predict(mixed)

# Number of training samples labelled -1.
n_error_train = ytset[ytset == -1].size
# Number of samples of the mixed table labelled +1.
n_error_outliers = ymixed[ymixed == 1].size

# Minimum of the first column of `mixed`, lowered by 1.
xmin = mixed[:, 0].min()-1
Esempio n. 6
0
# One-class SVM script: fit on the metaphase training set, then classify the
# mixed data set after applying the same column filter and normalization.
#
# Earlier configuration, kept for reference:
# rosize = 236, intensity=222, std=223 (-1 because one column will be removed)
# features = [235, 221] # one column is removed later
# tset = np.loadtxt("./trainingset.csv", delimiter=",")
# mixed = np.loadtxt("./all.csv", delimiter=",")

# NOTE(review): the index note below repeats the one above and may not apply
# to the reduced feature set loaded here -- confirm.
# rosize = 236, intensity=222, std=223 (-1 because one column will be removed)
# `features` is not read anywhere in the visible code; presumably the column
# indices of the two features of interest -- TODO confirm.
features = [17, 6]
# tset is the training table, mixed the table to classify; both are read as
# comma-separated numeric data.
tset = np.loadtxt("data/reduced_featuresset/metaphase.csv", delimiter=",")
mixed = np.loadtxt("data/reduced_featuresset/all.csv", delimiter=",")

# filter_mask is defined elsewhere; its result is derived from the training
# table only and used as the column selector for both tables.
mask = filter_mask(tset)
tset = tset[:, mask]
mixed = mixed[:, mask]

# ZScore is defined elsewhere; it is built from the filtered training table
# and then used to normalize both tables.
zs = ZScore(tset)
tset = zs.normalize(tset)
mixed = zs.normalize(mixed)

# fit the model on the training table only
clf = svm.OneClassSVM(nu=0.001, kernel="rbf", gamma=0.01)
clf.fit(tset)

# Predicted labels; assuming `svm` is scikit-learn's module, +1 marks inliers
# and -1 marks outliers.
ytset = clf.predict(tset)
ymixed = clf.predict(mixed)

# Number of training samples labelled -1.
n_error_train = ytset[ytset == -1].size
# Number of samples of the mixed table labelled +1.
n_error_outliers = ymixed[ymixed == 1].size

# Range of the first column of `mixed`, widened by 1 on each side.
xmin = mixed[:, 0].min() - 1
xmax = mixed[:, 0].max() + 1