def __init__(self, traindata, testdata):
    """Load the training and test CSV tables and z-score both.

    Parameters
    ----------
    traindata, testdata
        Sources accepted by ``np.genfromtxt`` (e.g. a path or an open
        file) holding comma-separated values with one header row.  Both
        tables must have the same columns in the same order.

    Attributes set
    --------------
    feature_names
        Header names of the training table with their first two
        characters removed.  Covers *all* columns, i.e. it is not reduced
        by ``_mask``.
    _mask
        Boolean column mask: True for training columns that contain no
        NaN and have non-zero standard deviation.
    _zs
        ``ZScore`` instance built from the filtered training data.
    traindata, testdata
        Filtered and normalized 2-D float arrays.

    Raises
    ------
    ValueError
        If the two tables do not have the same number of columns.
    """
    # np.recfromcsv is gone from the main namespace in NumPy 2, so read
    # through genfromtxt directly.  dtype=float guarantees that every field
    # is a float64, which the flat ``view(float)`` below relies on (an
    # integer-looking column would otherwise be reinterpreted bytewise).
    train_table, test_table = [
        np.atleast_1d(
            np.genfromtxt(
                source,
                delimiter=",",
                names=True,
                dtype=float,
                case_sensitive="lower",
            )
        )
        for source in (traindata, testdata)
    ]

    self.feature_names = [name[2:] for name in train_table.dtype.names]
    nftrs = len(self.feature_names)
    if len(test_table.dtype.names) != nftrs:
        raise ValueError(
            "test data has %d columns, training data has %d"
            % (len(test_table.dtype.names), nftrs)
        )

    traindata = train_table.view(float).reshape((-1, nftrs))
    testdata = test_table.view(float).reshape((-1, nftrs))

    # Keep only columns that are NaN-free and have non-zero variance in the
    # training data.
    column_has_nan = np.isnan(traindata.sum(axis=0))
    self._mask = ~column_has_nan & (traindata.std(axis=0) > 0.0)
    traindata = traindata[:, self._mask]

    # Drop test rows containing NaN, judged on the *kept* columns only.
    # Checking before the column filter would discard every test row as
    # soon as one of the removed columns is NaN throughout.
    testdata = testdata[:, self._mask]
    testdata = testdata[~np.isnan(testdata.sum(axis=1)), :]

    self._zs = ZScore(traindata)
    self.traindata = self.normalize(traindata)
    self.testdata = self.normalize(testdata)
class PreProcessor(object):
    """Load train/test feature tables from CSV, filter columns, z-score.

    The column filter and the ``ZScore`` normalizer are both derived from
    the training data and then applied to the test data and to any array
    passed to the instance later on.
    """

    def __init__(self, traindata, testdata):
        """Read both CSV tables, filter them and normalize them.

        Parameters
        ----------
        traindata, testdata
            Sources accepted by ``np.genfromtxt`` (e.g. a path or an open
            file) holding comma-separated values with one header row.
            Both tables must have the same columns in the same order.

        Raises
        ------
        ValueError
            If the two tables do not have the same number of columns.
        """
        train_table = self._read_table(traindata)
        test_table = self._read_table(testdata)

        # Header names lose their first two characters.  The list covers
        # all columns of the file; it is not reduced by ``_mask``.
        self.feature_names = [name[2:] for name in train_table.dtype.names]
        nftrs = len(self.feature_names)
        if len(test_table.dtype.names) != nftrs:
            raise ValueError(
                "test data has %d columns, training data has %d"
                % (len(test_table.dtype.names), nftrs)
            )

        traindata = train_table.view(float).reshape((-1, nftrs))
        testdata = test_table.view(float).reshape((-1, nftrs))

        self._mask = self._valid_columns(traindata)
        traindata = traindata[:, self._mask]
        # Filter columns first, then rows: a NaN in a column that is being
        # removed anyway must not cost the test set a row.
        testdata = self._drop_nan_rows(testdata[:, self._mask])

        self._zs = ZScore(traindata)
        self.traindata = self.normalize(traindata)
        self.testdata = self.normalize(testdata)

    @staticmethod
    def _read_table(source):
        """Read a headed CSV into a 1-D structured array of float64 fields."""
        # np.recfromcsv is gone from the main namespace in NumPy 2, so go
        # through genfromtxt.  dtype=float keeps every field a float64,
        # which the flat ``view(float)`` in __init__ relies on.
        table = np.genfromtxt(
            source,
            delimiter=",",
            names=True,
            dtype=float,
            case_sensitive="lower",
        )
        # A file with a single data row comes back 0-dimensional.
        return np.atleast_1d(table)

    @staticmethod
    def _valid_columns(data):
        """Return a boolean mask of NaN-free columns with non-zero std."""
        column_has_nan = np.isnan(data.sum(axis=0))
        return ~column_has_nan & (data.std(axis=0) > 0.0)

    @staticmethod
    def _drop_nan_rows(data):
        """Return ``data`` without the rows whose sum is NaN."""
        return data[~np.isnan(data.sum(axis=1)), :]

    def index(self, items):
        """Return the positions of one or several feature names.

        ``items`` may be a single name or a list/tuple of names.  The
        result is always a list.  Positions refer to ``feature_names``,
        i.e. to the column order of the CSV before ``_mask`` is applied.
        Raises ``ValueError`` for an unknown name.
        """
        if not isinstance(items, (list, tuple)):
            items = (items,)
        return [self.feature_names.index(item) for item in items]

    def normalize(self, data):
        """Normalize ``data`` with the ZScore built from the training set."""
        return self._zs.normalize(data)

    def filter(self, data):
        """Keep only the columns selected by the training-data mask."""
        return data[:, self._mask]

    @property
    def ranges(self):
        """(min, max) of the first two columns of the normalized test data.

        Returned as a 2x2 array, one row per column.
        """
        return np.array(
            [
                (self.testdata[:, 0].min(), self.testdata[:, 0].max()),
                (self.testdata[:, 1].min(), self.testdata[:, 1].max()),
            ]
        )

    def __call__(self, data):
        """Filter the columns of ``data`` and normalize the result."""
        return self.normalize(self.filter(data))
class PreProcessor(object):
    """Prepare training and test feature tables for a classifier.

    Both tables are read from CSV, reduced to the columns that are usable
    in the training data, and z-scored with statistics taken from the
    training data.  The instance can afterwards be called on further
    arrays to apply the same column filter and normalization.
    """

    def __init__(self, traindata, testdata):
        """Read, filter and normalize the two tables.

        Parameters
        ----------
        traindata, testdata
            Sources accepted by ``np.genfromtxt`` (e.g. a path or an open
            file) holding comma-separated values with one header row.
            Both tables must have the same columns in the same order.

        Raises
        ------
        ValueError
            If the two tables do not have the same number of columns.
        """
        train_table = self._read_table(traindata)
        test_table = self._read_table(testdata)

        # The first two characters of every header name are cut off.
        # feature_names describes the unfiltered column layout.
        self.feature_names = [name[2:] for name in train_table.dtype.names]
        nftrs = len(self.feature_names)
        if len(test_table.dtype.names) != nftrs:
            raise ValueError(
                "test data has %d columns, training data has %d"
                % (len(test_table.dtype.names), nftrs)
            )

        traindata = train_table.view(float).reshape((-1, nftrs))
        testdata = test_table.view(float).reshape((-1, nftrs))

        self._mask = self._valid_columns(traindata)
        traindata = traindata[:, self._mask]
        # Row filtering happens after column filtering so that NaNs in
        # discarded columns cannot wipe out test rows.
        testdata = self._drop_nan_rows(testdata[:, self._mask])

        self._zs = ZScore(traindata)
        self.traindata = self.normalize(traindata)
        self.testdata = self.normalize(testdata)

    @staticmethod
    def _read_table(source):
        """Read a headed CSV into a 1-D structured array of float64 fields."""
        # genfromtxt replaces np.recfromcsv, which NumPy 2 no longer
        # exposes in its main namespace.  Forcing dtype=float makes the
        # flat ``view(float)`` in __init__ valid for integer-looking
        # columns as well.
        table = np.genfromtxt(
            source,
            delimiter=",",
            names=True,
            dtype=float,
            case_sensitive="lower",
        )
        # Guard against the 0-d result of a file with one data row.
        return np.atleast_1d(table)

    @staticmethod
    def _valid_columns(data):
        """Return a boolean mask of NaN-free columns with non-zero std."""
        column_has_nan = np.isnan(data.sum(axis=0))
        return ~column_has_nan & (data.std(axis=0) > 0.0)

    @staticmethod
    def _drop_nan_rows(data):
        """Return ``data`` without the rows whose sum is NaN."""
        return data[~np.isnan(data.sum(axis=1)), :]

    def index(self, items):
        """Look up the column position of each given feature name.

        Accepts a single name or a list/tuple of names and always returns
        a list.  Positions are those in ``feature_names`` (the CSV column
        order before ``_mask`` is applied).  An unknown name raises
        ``ValueError``.
        """
        if not isinstance(items, (list, tuple)):
            items = (items,)
        return [self.feature_names.index(item) for item in items]

    def normalize(self, data):
        """Apply the training-set ZScore normalization to ``data``."""
        return self._zs.normalize(data)

    def filter(self, data):
        """Select the columns of ``data`` that passed the training mask."""
        return data[:, self._mask]

    @property
    def ranges(self):
        """2x2 array of (min, max) for test-data columns 0 and 1."""
        return np.array(
            [
                (self.testdata[:, 0].min(), self.testdata[:, 0].max()),
                (self.testdata[:, 1].min(), self.testdata[:, 1].max()),
            ]
        )

    def __call__(self, data):
        """Shortcut for ``normalize(filter(data))``."""
        return self.normalize(self.filter(data))
def __init__(self, traindata, testdata):
    """Read the training and test CSV tables, filter and z-score them.

    Parameters
    ----------
    traindata, testdata
        Sources accepted by ``np.genfromtxt`` (e.g. a path or an open
        file) holding comma-separated values with one header row.  Both
        tables must have the same columns in the same order.

    Sets ``feature_names`` (training header names minus their first two
    characters, unfiltered), ``_mask`` (boolean mask of usable training
    columns), ``_zs`` (``ZScore`` built from the filtered training data)
    and the filtered, normalized ``traindata`` / ``testdata`` arrays.

    Raises
    ------
    ValueError
        If the two tables do not have the same number of columns.
    """

    def read_table(source):
        # genfromtxt stands in for np.recfromcsv, which NumPy 2 no longer
        # exposes in its main namespace.  dtype=float makes every field a
        # float64 so the flat ``view(float)`` below is valid even for
        # integer-looking columns; atleast_1d covers single-row files.
        return np.atleast_1d(
            np.genfromtxt(
                source,
                delimiter=",",
                names=True,
                dtype=float,
                case_sensitive="lower",
            )
        )

    train_table = read_table(traindata)
    test_table = read_table(testdata)

    self.feature_names = [name[2:] for name in train_table.dtype.names]
    nftrs = len(self.feature_names)
    if len(test_table.dtype.names) != nftrs:
        raise ValueError(
            "test data has %d columns, training data has %d"
            % (len(test_table.dtype.names), nftrs)
        )

    traindata = train_table.view(float).reshape((-1, nftrs))
    testdata = test_table.view(float).reshape((-1, nftrs))

    # Usable columns: no NaN and non-zero variance in the training data.
    nan_free = ~np.isnan(traindata.sum(axis=0))
    varying = traindata.std(axis=0) > 0.0
    self._mask = nan_free & varying
    traindata = traindata[:, self._mask]

    # Remove NaN rows from the test data only after the column filter, so
    # NaNs confined to discarded columns do not eliminate test rows.
    testdata = testdata[:, self._mask]
    row_ok = ~np.isnan(testdata.sum(axis=1))
    testdata = testdata[row_ok, :]

    self._zs = ZScore(traindata)
    self.traindata = self.normalize(traindata)
    self.testdata = self.normalize(testdata)
# rosize = 236, intesity=222, std=223 (-1 because one columw will be removed) # features = [235, 221] # one column is remove later # tset = np.loadtxt("./trainingset.csv", delimiter=",") # mixed = np.loadtxt("./all.csv", delimiter=",") # rosize = 236, intesity=222, std=223 (-1 because one columw will be removed) features = [17, 6] tset = np.loadtxt("data/reduced_featuresset/metaphase.csv", delimiter=",") mixed = np.loadtxt("data/reduced_featuresset/all.csv", delimiter=",") mask = filter_mask(tset) tset = tset[:, mask] mixed = mixed[:, mask] zs = ZScore(tset) tset = zs.normalize(tset) mixed = zs.normalize(mixed) # fit the model clf = svm.OneClassSVM(nu=0.001, kernel="rbf", gamma=0.01) clf.fit(tset) ytset = clf.predict(tset) ymixed = clf.predict(mixed) n_error_train = ytset[ytset == -1].size n_error_outliers = ymixed[ymixed == 1].size xmin = mixed[:, 0].min()-1
# rosize = 236, intesity=222, std=223 (-1 because one columw will be removed) # features = [235, 221] # one column is remove later # tset = np.loadtxt("./trainingset.csv", delimiter=",") # mixed = np.loadtxt("./all.csv", delimiter=",") # rosize = 236, intesity=222, std=223 (-1 because one columw will be removed) features = [17, 6] tset = np.loadtxt("data/reduced_featuresset/metaphase.csv", delimiter=",") mixed = np.loadtxt("data/reduced_featuresset/all.csv", delimiter=",") mask = filter_mask(tset) tset = tset[:, mask] mixed = mixed[:, mask] zs = ZScore(tset) tset = zs.normalize(tset) mixed = zs.normalize(mixed) # fit the model clf = svm.OneClassSVM(nu=0.001, kernel="rbf", gamma=0.01) clf.fit(tset) ytset = clf.predict(tset) ymixed = clf.predict(mixed) n_error_train = ytset[ytset == -1].size n_error_outliers = ymixed[ymixed == 1].size xmin = mixed[:, 0].min() - 1 xmax = mixed[:, 0].max() + 1