def testReconstructSignal(self):
    """
    Check that reconstructing a random matrix from its wavelet features
    gives back the original matrix (norm of the difference below 10**-6).
    """
    tol = 10**-6
    waveletStr = "db4"
    mode = "cpd"
    level = 10
    numExamples, numFeatures = 100, 16

    X = numpy.random.rand(numExamples, numFeatures)

    # Decomposition of the first row; it is passed on to reconstructSignal
    C = pywt.wavedec(X[0, :], waveletStr, mode, level=level)
    Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
    X2 = MetabolomicsUtils.reconstructSignal(X, Xw, waveletStr, mode, C)

    reconstructionError = numpy.linalg.norm(X - X2)
    self.assertTrue(reconstructionError < tol)
def testGetWaveletFeaturesTest(self):
    """
    See if we can reproduce the data from the wavelet decomposition: first
    for a single row using pywt directly, then for the whole of X via
    getWaveletFeatures with the 'db4', 'db8' and 'haar' wavelets.
    """
    X, X2, Xs, Xopls, YList, df = MetabolomicsUtils.loadData()

    tol = 10**-6
    level = 10
    mode = "zpd"
    waveletStr = 'db4'

    # Single row: decompose and immediately reconstruct
    C = pywt.wavedec(X[0, :], waveletStr, level=level, mode=mode)
    X0 = pywt.waverec(C, waveletStr, mode)
    self.assertTrue(numpy.linalg.norm(X0 - X[0, :]) < tol)

    def reconstructSignal(X, Xw, waveletStr, level, mode, C):
        """
        Rebuild every row of X from the matching row of Xw. Each row of Xw
        is cut into consecutive pieces with the same lengths as the arrays
        in C, and the pieces are handed to pywt.waverec.
        """
        Xrecstr = numpy.zeros(X.shape)

        for rowInd in range(Xw.shape[0]):
            pieces = []
            start = 0
            for coeffs in C:
                end = start + len(coeffs)
                pieces.append(Xw[rowInd, start:end])
                start = end

            Xrecstr[rowInd, :] = pywt.waverec(tuple(pieces), waveletStr, mode)

        return Xrecstr

    #Now do the same for the whole X, one wavelet at a time
    for waveletStr in ('db4', 'db8', 'haar'):
        C = pywt.wavedec(X[0, :], waveletStr, level=level, mode=mode)
        Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
        Xrecstr = reconstructSignal(X, Xw, waveletStr, level, mode, C)
        self.assertTrue(numpy.linalg.norm(X - Xrecstr) < tol)
def testFilterWavelet(self):
    """
    Check filterWavelet on the wavelet features of a random matrix.

    The call must return N indices, the columns at those indices must be
    the same in the filtered and unfiltered features, and every other
    column of the filtered features must be (numerically) zero.
    """
    numExamples = 100
    numFeatures = 16
    X = numpy.random.rand(numExamples, numFeatures)

    level = 10
    mode = "cpd"
    waveletStr = "db4"
    Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)

    N = 10
    Xw2, inds = MetabolomicsUtils.filterWavelet(Xw, N)

    tol = 10**-6
    # assertEqual rather than the assertEquals alias, which was deprecated
    # and is no longer available in Python 3.12+
    self.assertEqual(inds.shape[0], N)
    self.assertTrue(numpy.linalg.norm(Xw[:, inds] - Xw2[:, inds]) < tol)

    # All columns that were not selected must have been zeroed
    zeroInds = numpy.setdiff1d(numpy.arange(Xw.shape[1]), inds)
    self.assertTrue(numpy.linalg.norm(Xw2[:, zeroInds]) < tol)
def testCreateIndicatorLabels(self):
    """
    Check createIndicatorLabel for cortisol, testosterone and IGF1.

    First, for the non-NaN concentrations of each hormone, every row of the
    indicator matrix must sum to one. Second, the testosterone indicators
    are compared row by row with the "Ind.Testo.*" columns of
    data.RMN.total.6.txt (rows with a NaN on either side are skipped). The
    equivalent comparisons for cortisol and IGF1 are disabled.
    """
    metaUtils = MetabolomicsUtils()
    X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()

    YCortisol = YCortisol[numpy.logical_not(numpy.isnan(YCortisol))]
    YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])

    YTesto = YTesto[numpy.logical_not(numpy.isnan(YTesto))]
    YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])

    YIgf1 = YIgf1[numpy.logical_not(numpy.isnan(YIgf1))]
    YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])

    # Each example must be assigned to exactly one indicator column
    for indicators in (YCortisolIndicators, YTestoIndicators, YIgf1Indicators):
        s = numpy.sum(indicators, 1)
        nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

    #Now compare to those labels in the file
    # Reload to get the unfiltered label vectors back. loadData is unpacked
    # into eight items here, matching the call at the top of this test.
    X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()

    dataDir = PathDefaults.getDataDir() + "metabolomic/"
    fileName = dataDir + "data.RMN.total.6.txt"
    data = pandas.read_csv(fileName, delimiter=",")

    # NOTE: the cortisol comparison against the file is disabled; the
    # indicators are still computed so that both code paths are exercised.
    YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])
    YCortisolIndicators2 = numpy.array(data[["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]])
    #nptst.assert_almost_equal(YCortisolIndicators2[i, :], YCortisolIndicators[i, :])

    YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])
    YTestoIndicators2 = numpy.array(data[["Ind.Testo.1", "Ind.Testo.2", "Ind.Testo.3"]])

    for i in range(YTestoIndicators.shape[0]):
        # Only compare rows where both the concentration and the file's
        # indicators are available
        if not numpy.isnan(YTesto[i]) and not numpy.isnan(YTestoIndicators2[i, :]).any():
            nptst.assert_almost_equal(YTestoIndicators2[i, :], YTestoIndicators[i, :])

    # NOTE: the IGF1 comparison against the file is disabled as well.
    YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])
    YIgf1Indicators2 = numpy.array(data[["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]])
    #nptst.assert_almost_equal(YIgf1Indicators2[i, :], YIgf1Indicators[i, :])
def testScoreLabel(self):
    """
    Check the orderings of the columns returned by scoreLabels on random
    labels, and that constant labels give a score matrix of all ones.
    """
    numExamples = 10
    bounds = numpy.array([0, 0.2, 0.8, 1.0])

    Y = numpy.random.rand(numExamples)
    YScores = MetabolomicsUtils.scoreLabels(Y, bounds)

    orderByY = numpy.argsort(Y)
    orderByMidDistance = numpy.argsort(numpy.abs(Y - 0.5))
    orderFirstCol = numpy.argsort(YScores[:, 0])
    orderMiddleCol = numpy.argsort(YScores[:, 1])
    orderLastCol = numpy.argsort(YScores[:, -1])

    # Last column is ordered like Y, first column in the reverse order
    self.assertTrue(numpy.all(orderByY == orderLastCol))
    self.assertTrue(numpy.all(orderByY == orderFirstCol[::-1]))
    # Middle column is ordered by decreasing distance of Y from 0.5
    self.assertTrue(numpy.all(orderByMidDistance == orderMiddleCol[::-1]))

    #Test we don't get problems when Y has the same values
    Y = numpy.ones(numExamples)
    YScores = MetabolomicsUtils.scoreLabels(Y, bounds)
    expectedScores = numpy.ones((Y.shape[0], 3))
    self.assertTrue(numpy.all(YScores == expectedScores))
def testLoadData(self):
    """
    Smoke test for loadData: the result must unpack into eight items, the
    fourth of which itself unpacks into three.
    """
    loaded = MetabolomicsUtils().loadData()
    X, XStd, X2, Xopls, YCortisol, YTesto, YIgf1, ages = loaded
    XoplsCortisol, XoplsTesto, XoplsIgf1 = Xopls
# Script set-up for the bipartite ranking experiments on the metabolomics data:
# loads the data and builds the feature sets (raw, PCA, wavelet) to learn from.
# NOTE(review): logging, sys, numpy, os and PathDefaults are used below but are
# not imported in the visible part of this script - presumably imported above.
from wallhack.metabolomics.MetabolomicsUtils import MetabolomicsUtils
from socket import gethostname
from sklearn.decomposition import PCA

""" Run a variety of bipartite ranking on the metabolomics data """

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
# Fixed seed so that runs are repeatable
numpy.random.seed(21)
# Runs the external `taskset` command on this process with the affinity mask
# 0xffffffff. NOTE(review): presumably undoes a restricted CPU affinity set by
# an imported library - confirm; this only works where `taskset` is installed.
os.system('taskset -p 0xffffffff %d' % os.getpid())

dataDir = PathDefaults.getDataDir() + "metabolomic/"
metaUtils = MetabolomicsUtils()
X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()

#We model 99.1% of the spectrum with 100 eigenvectors
pca = PCA(n_components=100)
XPca = pca.fit_transform(X)

# Wavelet features of X for three wavelets, all with the same level and mode
mode = "cpd"
level = 10
XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, 'db4', level, mode)
XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, 'db8', level, mode)
XwHaar = MetabolomicsUtils.getWaveletFeatures(X, 'haar', level, mode)

# Feature sets keyed by name
dataDict = {}
dataDict["raw"] = X
dataDict["pca"] = XPca
# Exploratory script: prints hormone concentrations next to their three
# indicator columns, sorted by concentration, so the level boundaries can be
# read off.
import numpy
from wallhack.metabolomics.MetabolomicsUtils import MetabolomicsUtils

# NOTE(review): loadData is unpacked into three items here, whereas other code
# unpacks eight - presumably written against an older MetabolomicsUtils; confirm.
X, X2, df = MetabolomicsUtils.loadData()

#Just figure out the boundaries of the levels
# Raise the print threshold so that arrays up to 3000 elements print in full
numpy.set_printoptions(threshold=3000)

labelNames = ["IGF1.val", "Cortisol.val", "Testosterone.val"]
labelNames2 = ["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]

YList = MetabolomicsUtils.createLabelList(df, labelNames)
# NOTE(review): YList2 is not used in the visible part of this script
YList2 = MetabolomicsUtils.createLabelList(df, labelNames2)

# First entry of YList; presumably the "IGF1.val" labels and the row indices
# they were taken from - confirm against createLabelList
Y, inds = YList[0]
Y1 = numpy.array(df.rx(labelNames2[0])).ravel()[inds]
Y2 = numpy.array(df.rx(labelNames2[1])).ravel()[inds]
Y3 = numpy.array(df.rx(labelNames2[2])).ravel()[inds]

# Sort by concentration and put the three indicator columns alongside
inds = numpy.argsort(Y)
YY = numpy.c_[Y[inds], Y1[inds]]
YY = numpy.c_[YY, Y2[inds]]
YY = numpy.c_[YY, Y3[inds]]
print(YY)

# Same again for the second entry of YList with the cortisol indicator columns
labelNames2 = ["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]
YList2 = MetabolomicsUtils.createLabelList(df, labelNames2)

Y, inds = YList[1]
Y1 = numpy.array(df.rx(labelNames2[0])).ravel()[inds]
Y2 = numpy.array(df.rx(labelNames2[1])).ravel()[inds]
Y3 = numpy.array(df.rx(labelNames2[2])).ravel()[inds]
def saveResults(self):
    """
    Compute the results and save them for a particular hormone. Does so for
    all learners.

    For every hormone in self.hormoneDict, every indicator column returned
    by createIndicatorLabel and every feature set in self.dataDict, the
    examples with a non-NaN hormone concentration are selected, the ages
    are appended as an extra column, the features are standardised and each
    enabled learner (the self.run* flags) is run via self.saveResult. The
    two L1 SVM learners additionally save their weight vectors via
    self.saveWeightVectorResults.

    Result files are written under self.resultsDir and are named
    "<Learner>-<hormoneName>-<indicator index>-<dataName>.npy".
    """
    metaUtils = MetabolomicsUtils()
    logging.debug("Running on hormones: " + str(self.hormoneDict.keys()))

    for hormoneName, hormoneConc in self.hormoneDict.items():
        nonNaInds = numpy.logical_not(numpy.isnan(hormoneConc))
        hormoneIndicators = metaUtils.createIndicatorLabel(hormoneConc, metaUtils.boundsDict[hormoneName])

        for i in range(hormoneIndicators.shape[1]):
            #Make labels -1/+1
            # The builtin int is used because the numpy.int alias no longer
            # exists in current NumPy releases; it was the same type.
            Y = numpy.array(hormoneIndicators[nonNaInds, i], int)*2-1

            for dataName, dataFeatures in self.dataDict.items():
                X = dataFeatures[nonNaInds, :]
                X = numpy.c_[X, self.ages[nonNaInds]]
                X = Standardiser().standardiseArray(X)

                # Part of the file name shared by every learner for this
                # (hormone, indicator, feature set) combination
                suffix = "-" + hormoneName + "-" + str(i) + "-" + dataName + ".npy"

                if self.runCartTreeRank:
                    fileName = self.resultsDir + "CartTreeRank" + suffix
                    self.saveResult(X, Y, self.cartTreeRank, self.cartTreeRankParams, fileName)

                if self.runRbfSvmTreeRank:
                    fileName = self.resultsDir + "RbfSvmTreeRank" + suffix
                    self.saveResult(X, Y, self.rbfSvmTreeRank, self.rbfSvmTreeRankParams, fileName)

                if self.runL1SvmTreeRank:
                    fileName = self.resultsDir + "L1SvmTreeRank" + suffix
                    self.saveResult(X, Y, self.l1SvmTreeRank, self.l1SvmTreeRankParams, fileName)

                    #For this SVM save the weight vector
                    weightsFileName = self.resultsDir + "WeightsL1SvmTreeRank" + suffix
                    self.saveWeightVectorResults(X, Y, self.l1SvmTreeRank, self.l1SvmTreeRankParams, weightsFileName)

                if self.runCartTreeRankForest:
                    fileName = self.resultsDir + "CartTreeRankForest" + suffix
                    self.saveResult(X, Y, self.cartTreeRankForest, self.cartTreeRankForestParams, fileName)

                if self.runRbfSvmTreeRankForest:
                    fileName = self.resultsDir + "RbfSvmTreeRankForest" + suffix
                    self.saveResult(X, Y, self.rbfSvmTreeRankForest, self.rbfSvmTreeRankForestParams, fileName)

                if self.runL1SvmTreeRankForest:
                    fileName = self.resultsDir + "L1SvmTreeRankForest" + suffix
                    self.saveResult(X, Y, self.l1SvmTreeRankForest, self.l1SvmTreeRankForestParams, fileName)

                    #For this SVM save the weight vector
                    weightsFileName = self.resultsDir + "WeightsL1SvmTreeRankForest" + suffix
                    self.saveWeightVectorResults(X, Y, self.l1SvmTreeRankForest, self.l1SvmTreeRankForestParams, weightsFileName)

                if self.runRankBoost:
                    fileName = self.resultsDir + "RankBoost" + suffix
                    self.saveResult(X, Y, self.rankBoost, self.rankBoostParams, fileName)

                if self.runRankSVM:
                    fileName = self.resultsDir + "RankSVM" + suffix
                    self.saveResult(X, Y, self.rankSVM, self.rankSVMParams, fileName)

    logging.debug("All done. See you around!")
# Script set-up for analysing the metabolomics spectra: loads the data, fixes
# the wavelet settings and computes the eigenvalues of XStd^T XStd.
# NOTE(review): logging, sys and numpy are used below but are not imported in
# the visible part of this script - presumably imported above.
import pywt
from wallhack.metabolomics.MetabolomicsUtils import MetabolomicsUtils
from sandbox.util.PathDefaults import PathDefaults
from socket import gethostname
import matplotlib
# The backend has to be selected before pyplot is imported
matplotlib.use("GTK3Agg")
import matplotlib.pyplot as plt
from sandbox.data.Standardiser import Standardiser

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
# Fixed seed so that runs are repeatable
numpy.random.seed(21)
numpy.set_printoptions(linewidth=160, precision=3, suppress=True)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
metaUtils = MetabolomicsUtils()
X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()

# Wavelet settings and per-level accumulators (one entry per level up to maxLevel)
waveletStr = 'db4'
mode = "cpd"
maxLevel = 10
errors = numpy.zeros(maxLevel)
numFeatures = numpy.zeros(maxLevel)

level = 10
waveletStrs = ["haar", "db4", "db8"]

plt.figure(0)

# Eigenvalues of the symmetric matrix XStd^T XStd, sorted in decreasing order.
# V is left in the order returned by eigh, so its columns no longer line up
# with w after the sort and flip.
C = XStd.T.dot(XStd)
w, V = numpy.linalg.eigh(C)
w = numpy.flipud(numpy.sort(w))
# Script set-up for processing the saved metabolomics results: loads the data
# and lists the feature sets and algorithms to work with.
# NOTE(review): sys and numpy are used below but are not imported in the
# visible part of this script - presumably imported elsewhere in the file.
import logging
import datetime
import matplotlib
# The backend has to be selected before pyplot is imported
matplotlib.use("GTK3Agg")
import matplotlib.pyplot as plt
from sandbox.util.PathDefaults import PathDefaults
from sandbox.util.Latex import Latex
from wallhack.metabolomics.MetabolomicsUtils import MetabolomicsUtils
from wallhack.metabolomics.MetabolomicsExpHelper import MetabolomicsExpHelper

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
numpy.set_printoptions(suppress=True, precision=3)

resultsDir = PathDefaults.getOutputDir() + "metabolomics/"
figureDir = resultsDir + "Figures/"

metaUtils = MetabolomicsUtils()
X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()

# The helper is given an empty data dictionary here
dataDict = {}
# Seed from the current microsecond, so the random state differs between runs
numpy.random.seed(datetime.datetime.now().microsecond)
helper = MetabolomicsExpHelper(dataDict, YCortisol, YTesto, YIgf1, ages)

# Names of the feature sets
dataNames =[]
dataNames.extend(["raw", "pca", "Db4", "Db8", "Haar"])
#algorithms = ["CartTreeRank", "CartTreeRankForest", "L1SvmTreeRank", "L1SvmTreeRankForest", "RbfSvmTreeRank", "RbfSvmTreeRankForest", "RankBoost", "RankSVM"]
# Algorithms considered, with abbreviations in the same order
algorithms = ["CartTreeRankForest", "L1SvmTreeRankForest", "RbfSvmTreeRankForest", "RankBoost", "RankSVM"]
algorithmsAbbr = ["CART-TRF", "L1-TRF", "RBF-TRF", "RB", "RSVM"]

hormoneNameIndicators = []
# NOTE(review): only the first statement of this loop is visible here; the
# loop presumably continues and fills hormoneNameIndicators - confirm.
for i, (hormoneName, hormoneConc) in enumerate(helper.hormoneDict.items()):
    hormoneIndicators = metaUtils.createIndicatorLabel(hormoneConc, metaUtils.boundsDict[hormoneName])