def __init__(self, YList, X, featuresName, ages, args):
    """
    Store the data and settings and build the lists of leaf rank generators.

    :param YList: The list of concentrations
    :param X: Stored as self.X
    :param featuresName: Stored as self.featuresName
    :param ages: Stored as self.ages
    :param args: Passed to the parent constructor and stored as self.args
    """
    super(MetabolomicsExpRunner, self).__init__(args=args)

    # Inputs
    self.X = X
    self.YList = YList
    self.featuresName = featuresName
    self.args = args
    self.ages = ages

    # Fixed experiment settings
    self.maxDepth = 10
    self.numTrees = 10
    self.sampleSize = 1.0
    self.sampleReplace = True
    self.folds = 5
    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

    # Each entry pairs a generator with the name used to label it
    self.leafRankGenerators = [
        (LinearSvmGS.generate(), "SVM"),
        (SvcGS.generate(), "RBF-SVM"),
        (DecisionTree.generate(), "CART"),
    ]
    self.pcaLeafRankGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]

    # Here the generate functions themselves are stored, not their results
    self.funcLeafRankGenerators = [
        (LinearSvmFGs.generate, "SVMF"),
        (SvcFGs.generate, "RBF-SVMF"),
        (DecisionTreeF.generate, "CARTF"),
    ]

    # Store all the label vectors and their missing values
    igf1Inds, cortisolInds, testoInds = MetabolomicsUtils.createIndicatorLabels(YList)
    self.hormoneInds = [igf1Inds, cortisolInds, testoInds]
    self.hormoneNames = MetabolomicsUtils.getLabelNames()
def computeRankMetrics(self, X, Y, indexList, bestLearners, standardiserY, labelIndex):
    """
    Train bestLearners[i] on the i-th train split of indexList, predict the
    matching test split and compute AUC values from the unstandardised
    predictions and labels.

    :param X: The examples, indexed by rows
    :param Y: The labels
    :param indexList: A sequence of (train indices, test indices) pairs
    :param bestLearners: One learner per entry of indexList
    :param standardiserY: Used to unstandardise predicted and test labels
    :param labelIndex: Selects the bounds in self.boundsList
    :return: An array with one row per split and
        self.boundsList[labelIndex].shape[0]-1 columns of AUC values
    """
    # Some code to do ranking using the learner predictors
    bounds = self.boundsList[labelIndex]
    numScores = bounds.shape[0] - 1
    rankMetrics = numpy.zeros((len(indexList), numScores))

    for i, (idxtr, idxts) in enumerate(indexList):
        logging.info("Iteration " + str(i))

        trainX, testX = X[idxtr, :], X[idxts, :]
        trainY, testY = Y[idxtr], Y[idxts]

        learner = bestLearners[i]
        learner.learnModel(trainX, trainY)
        predY = learner.predict(testX)
        gc.collect()

        # Now output the sets of ranked scores, back in the original label units
        predY = standardiserY.unstandardiseArray(predY)
        testY = standardiserY.unstandardiseArray(testY)

        YScores = MetabolomicsUtils.scoreLabels(predY, bounds)
        YIndList = MetabolomicsUtils.createIndicatorLabel(testY, bounds)

        for j in range(numScores):
            rankMetrics[i, j] = Evaluator.auc(YScores[:, j], YIndList[j])

    logging.debug(rankMetrics)

    return rankMetrics
def __init__(self, YList, X, featuresName, ages, args):
    """
    Store the data and settings and build the lists of leaf rank generators.

    :param YList: The list of concentrations
    :param X: Stored as self.X
    :param featuresName: Stored as self.featuresName
    :param ages: Stored as self.ages
    :param args: Passed to the parent constructor and stored as self.args
    """
    super(MetabolomicsExpRunner, self).__init__(args=args)

    # Inputs
    self.X = X
    self.YList = YList
    self.featuresName = featuresName
    self.args = args
    self.ages = ages

    # Fixed experiment settings
    self.maxDepth = 5
    self.numTrees = 10
    self.folds = 3
    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

    # Disabled alternatives: (SvcGS.generate(), "SVC") and
    # (LinearSvmGS.generate(), "LinearSVM")
    self.leafRankGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]

    # Disabled alternatives: (LinearSvmFGs.generate, "SVMF") and
    # (DecisionTreeF.generate, "CARTF")
    self.funcLeafRankGenerators = [(SvcFGs.generate, "SVCF")]

    # Store all the label vectors and their missing values
    igf1Inds, cortisolInds, testoInds = MetabolomicsUtils.createIndicatorLabels(YList)
    self.hormoneInds = [igf1Inds, cortisolInds, testoInds]
    self.hormoneNames = MetabolomicsUtils.getLabelNames()
def __init__(self, df, X, featuresName, ages, args):
    """
    Store the data and derive the label list and bounds used by the
    regression experiments.

    :param df: The data frame from which the labels are extracted
    :param X: Stored as self.X
    :param featuresName: Stored as self.featuresName
    :param ages: Stored as self.ages
    :param args: Passed to the parent constructor and stored as self.args
    """
    super(MetabolomicsRegExpRunner, self).__init__(args=args)

    self.df, self.X = df, X
    self.featuresName, self.ages = featuresName, ages
    self.args = args

    # The label list is built from the data frame using the label names
    labelNames = MetabolomicsUtils.getLabelNames()
    self.labelNames = labelNames
    self.YList = MetabolomicsUtils.createLabelList(df, labelNames)
    self.boundsList = MetabolomicsUtils.getBounds()

    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"
def meanAUC(self, predY, testY, labelIndex, standardiserY):
    """
    Compute the mean of the AUC values obtained by scoring the unstandardised
    predictions against indicator labels of the unstandardised test labels.

    :param predY: The predicted labels (standardised)
    :param testY: The test labels (standardised)
    :param labelIndex: Selects the bounds in self.boundsList
    :param standardiserY: Used to unstandardise predY and testY
    :return: The mean over the self.boundsList[labelIndex].shape[0]-1 AUCs
    """
    bounds = self.boundsList[labelIndex]

    # Scores and indicators are computed in the original label units
    rawPredY = standardiserY.unstandardiseArray(predY)
    rawTestY = standardiserY.unstandardiseArray(testY)

    YScores = MetabolomicsUtils.scoreLabels(rawPredY, bounds)
    YIndList = MetabolomicsUtils.createIndicatorLabel(rawTestY, bounds)

    numScores = bounds.shape[0] - 1
    aucs = numpy.zeros(numScores)

    for j in range(numScores):
        aucs[j] = Evaluator.auc(YScores[:, j], YIndList[j])

    return numpy.mean(aucs)
def testReconstructSignal(self):
    """
    Check that reconstructing random data from its wavelet features gives
    back the original array up to a small tolerance.
    """
    waveletStr, mode, level = "db4", "cpd", 10
    numExamples, numFeatures = 100, 16
    tol = 10**-6

    X = numpy.random.rand(numExamples, numFeatures)

    # The decomposition of the first row is passed to reconstructSignal
    C = pywt.wavedec(X[0, :], waveletStr, mode, level=level)
    Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
    X2 = MetabolomicsUtils.reconstructSignal(X, Xw, waveletStr, mode, C)

    error = numpy.linalg.norm(X - X2)
    self.assertTrue(error < tol)
def testCreateIndicatorLabels(self):
    """
    Check that the three indicator vectors of each hormone sum to one for
    every example, then log the absolute difference between the computed
    indicators and the indicator columns stored in the data frame.
    """
    numpy.set_printoptions(threshold=3000)
    X, X2, Xs, Xopls, YList, df = MetabolomicsUtils.loadData()

    # Only the second element of each YList entry is used below, to index
    # the label columns read from the data frame
    _, inds1 = YList[0]
    _, inds2 = YList[1]
    _, inds3 = YList[2]

    YIgf1Inds, YICortisolInds, YTestoInds = MetabolomicsUtils.createIndicatorLabels(YList)

    for indicators in (YIgf1Inds, YICortisolInds, YTestoInds):
        s = indicators[0] + indicators[1] + indicators[2]
        self.assertTrue((s == numpy.ones(s.shape[0])).all())

    # Now compare to those labels in the file
    labelNames = [
        "Ind.Testo.1", "Ind.Testo.2", "Ind.Testo.3",
        "Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3",
        "Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3",
    ]

    # (computed indicators, indices, offset of the first column in labelNames)
    comparisons = [
        (YIgf1Inds, inds1, 6),
        (YICortisolInds, inds2, 3),
        (YTestoInds, inds3, 0),
    ]

    for indicators, inds, offset in comparisons:
        for k in range(3):
            Y = numpy.array(df.rx(labelNames[offset + k])).ravel()[inds]
            logging.debug(numpy.sum(numpy.abs(indicators[k] - Y)))
def testGetWaveletFeaturesTest(self):
    """
    Check that the loaded data can be reproduced from its wavelet
    decomposition, first for a single row and then for the whole array
    using the features from getWaveletFeatures with several wavelets.
    """
    # See if we can reproduce the data from the wavelet
    X, X2, Xs, Xopls, YList, df = MetabolomicsUtils.loadData()

    mode = "zpd"
    level = 10
    tol = 10**-6

    firstWavelet = 'db4'
    C = pywt.wavedec(X[0, :], firstWavelet, level=level, mode=mode)
    X0 = pywt.waverec(C, firstWavelet, mode)
    self.assertTrue(numpy.linalg.norm(X0 - X[0, :]) < tol)

    def reconstructSignal(X, Xw, waveletStr, level, mode, C):
        # C is only used for the length of each coefficient array, which
        # says how to cut a row of Xw back into separate coefficient arrays
        sizes = [len(c) for c in C]
        Xrecstr = numpy.zeros(X.shape)

        for i in range(Xw.shape[0]):
            coeffs = []
            start = 0

            for size in sizes:
                coeffs.append(Xw[i, start:start + size])
                start += size

            Xrecstr[i, :] = pywt.waverec(tuple(coeffs), waveletStr, mode)

        return Xrecstr

    # Now do the same for the whole X
    for waveletStr in ('db4', 'db8', 'haar'):
        C = pywt.wavedec(X[0, :], waveletStr, level=level, mode=mode)
        Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
        Xrecstr = reconstructSignal(X, Xw, waveletStr, level, mode, C)
        self.assertTrue(numpy.linalg.norm(X - Xrecstr) < tol)
def testFilterWavelet(self):
    """
    Check filterWavelet on the wavelet features of random data: it must
    return N indices, leave the selected columns unchanged and set all
    the other columns to (numerically) zero.
    """
    numExamples = 100
    numFeatures = 16
    X = numpy.random.rand(numExamples, numFeatures)

    level = 10
    mode = "cpd"
    waveletStr = "db4"
    # C is not used by the assertions below
    C = pywt.wavedec(X[0, :], waveletStr, mode, level=level)
    Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)

    N = 10
    Xw2, inds = MetabolomicsUtils.filterWavelet(Xw, N)

    tol = 10**-6
    # assertEquals is a deprecated alias which was removed in Python 3.12
    self.assertEqual(inds.shape[0], N)
    self.assertTrue(numpy.linalg.norm(Xw[:, inds] - Xw2[:, inds]) < tol)

    # Every column which was not selected must have been zeroed
    zeroInds = numpy.setdiff1d(numpy.arange(Xw.shape[1]), inds)
    self.assertTrue(numpy.linalg.norm(Xw2[:, zeroInds]) < tol)
def testScoreLabel(self):
    """
    Check the orderings induced by the columns of scoreLabels on random
    labels, and that constant labels give scores which are all one.
    """
    numExamples = 10
    bounds = numpy.array([0, 0.2, 0.8, 1.0])

    Y = numpy.random.rand(numExamples)
    YScores = MetabolomicsUtils.scoreLabels(Y, bounds)

    orderY = numpy.argsort(Y)
    orderFirst = numpy.argsort(YScores[:, 0])
    orderLast = numpy.argsort(YScores[:, -1])
    orderCentreDist = numpy.argsort(numpy.abs(Y - 0.5))
    orderMiddle = numpy.argsort(YScores[:, 1])

    # The last column ranks like Y and the first column in reverse, while
    # the middle column ranks in reverse of the distance from 0.5
    self.assertTrue((orderY == orderLast).all())
    self.assertTrue((orderY == orderFirst[::-1]).all())
    self.assertTrue((orderCentreDist == orderMiddle[::-1]).all())

    # Test we don't get problems when Y has the same values
    Y = numpy.ones(numExamples)
    YScores = MetabolomicsUtils.scoreLabels(Y, bounds)

    expected = numpy.ones((Y.shape[0], 3))
    self.assertTrue((YScores == expected).all())
# NOTE(review): this first statement looks like the tail of a method whose
# header lies outside this excerpt - confirm against the full file.
self.saveResults(self.leafRankGenerators, True)

def run2(self):
    """
    Log the module name and the parent and current process ids, then call
    saveResults with self.funcLeafRankGenerators and False.
    """
    logging.debug('module name:' + __name__)
    logging.debug('parent process:' + str(os.getppid()))
    logging.debug('process id:' + str(os.getpid()))

    self.saveResults(self.funcLeafRankGenerators, False)

# Script section: configure logging, load the data and compute the wavelet
# features
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
# Fixed seed for the steps below
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

# NOTE(review): waveletStr is not used in the visible part of the script,
# the calls below pass the wavelet names as literals
waveletStr = 'db4'
mode = "cpd"
level = 10
XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, 'db4', level, mode)
XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, 'db8', level, mode)
XwHaar = MetabolomicsUtils.getWaveletFeatures(X, 'haar', level, mode)

# Only the db4 features are added to the list of datasets
dataList = []
dataList.extend([(XwDb4, "db4")])

lock = multiprocessing.Lock()

# Reseed using the current microsecond so that whatever follows is not tied
# to the fixed seed above
numpy.random.seed(datetime.datetime.now().microsecond)
#numpy.random.seed(21)
# NOTE(review): this first statement looks like the tail of a method whose
# header lies outside this excerpt - confirm against the full file.
self.saveResults(self.funcLeafRankGenerators, "func")

def runPCA(self):
    """
    Log the module name and the parent and current process ids, then call
    saveResults with self.pcaLeafRankGenerators and the string "pca".
    """
    logging.debug('module name:' + __name__)
    logging.debug('parent process:' + str(os.getppid()))
    logging.debug('process id:' + str(os.getpid()))

    self.saveResults(self.pcaLeafRankGenerators, "pca")

# Script section: configure logging, load the data and compute the wavelet
# features
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
# Fixed seed for the steps below
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

mode = "cpd"
level = 10
XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, 'db4', level, mode)
XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, 'db8', level, mode)
XwHaar = MetabolomicsUtils.getWaveletFeatures(X, 'haar', level, mode)

#Filter the wavelets
Ns = [10, 25, 50, 75, 100]
dataList = []

# One dataset per value of N, made of the columns of the filtered db4
# features which are indexed by the inds returned from filterWavelet
for i in range(len(Ns)):
    N = Ns[i]
    XwDb4F, inds = MetabolomicsUtils.filterWavelet(XwDb4, N)
    dataList.append((XwDb4F[:, inds], "Db4-" + str(N)))
from rpy2.robjects.packages import importr
from socket import gethostname
import matplotlib.pyplot as plt
from apgl.data.Standardiser import Standardiser

# Script section: configure logging and printing, import the R libraries,
# load the data and compute the wavelet features
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
# Fixed seed, so that the random matrix Xr below is reproducible
numpy.random.seed(21)
numpy.set_printoptions(linewidth=160, precision=3, suppress=True)

# R packages accessed through rpy2; the R option warn is set to 1
treeRankLib = importr('TreeRank')
baseLib = importr('base')
baseLib.options(warn=1)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

YIgf1Inds, YICortisolInds, YTestoInds = MetabolomicsUtils.createIndicatorLabels(YList)

mode = "cpd"
level = 10
XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, 'db4', level, mode)
XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, 'db8', level, mode)
XwHaar = MetabolomicsUtils.getWaveletFeatures(X, 'haar', level, mode)

#Plot the correlation of the raw spectrum above x percent
# Xr is a random matrix with the same shape as Xs, listed next to the real
# datasets under the name "random"
Xr = numpy.random.rand(Xs.shape[0], Xs.shape[1])
datasets = [(Xr, "random"), (Xs, "raw"), (XwHaar, "haar"), (XwDb4, "db4"), (XwDb8, "db8")]

# Values from 0 to 1 in steps of 0.01
corLims = numpy.arange(0, 1.01, 0.01)
from apgl.util.PathDefaults import PathDefaults
from exp.metabolomics.MetabolomicsUtils import MetabolomicsUtils
import numpy
import pywt

# Load the synthetic control data, label the retained examples and print the
# shapes of the data, its wavelet features and the coefficient arrays of the
# first example
dataDir = PathDefaults.getDataDir() + "functional/"
fileName = dataDir + "synthetic_control.data"

X = numpy.loadtxt(fileName)

#Ignore first 200 examples
X = X[200:, :]

Y = numpy.zeros(X.shape[0])
Y[0:200] = -1 #Increasing trend and decreasing trend
Y[200:] = 1 #Upward shift and downward shift

#Compute wavelets
waveletStr = "db2"
level = 2
mode = "cpd"
Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)

print(X.shape)
print(Xw.shape)

# Decompose the first example directly to see the size of each array of
# coefficients
C = pywt.wavedec(X[0, :], waveletStr, mode, level)

for c in C:
    print(c.shape)
# NOTE(review): this first statement looks like the tail of a method whose
# header lies outside this excerpt - confirm against the full file.
self.saveResults(self.leafRankGenerators, True)

def run2(self):
    """
    Log the module name and the parent and current process ids, then call
    saveResults with self.funcLeafRankGenerators and False.
    """
    logging.debug("module name:" + __name__)
    logging.debug("parent process:" + str(os.getppid()))
    logging.debug("process id:" + str(os.getpid()))

    self.saveResults(self.funcLeafRankGenerators, False)

# Script section: configure logging, load the data and compute the wavelet
# features
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
# Fixed seed for the steps below
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

# NOTE(review): waveletStr is not used in the visible part of the script,
# the calls below pass the wavelet names as literals
waveletStr = "db4"
mode = "cpd"
level = 10
XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, "db4", level, mode)
XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, "db8", level, mode)
XwHaar = MetabolomicsUtils.getWaveletFeatures(X, "haar", level, mode)

# Only the db4 features are added to the list of datasets
dataList = []
dataList.extend([(XwDb4, "db4")])

lock = multiprocessing.Lock()

# Reseed using the current microsecond so that whatever follows is not tied
# to the fixed seed above
numpy.random.seed(datetime.datetime.now().microsecond)
# numpy.random.seed(21)
import numpy
from exp.metabolomics.MetabolomicsUtils import MetabolomicsUtils

X, X2, df = MetabolomicsUtils.loadData()

#Just figure out the boundaries of the levels
numpy.set_printoptions(threshold=3000)

# Concentration columns and the three IGF1 indicator columns of the data frame
labelNames = ["IGF1.val", "Cortisol.val", "Testosterone.val"]
labelNames2 = ["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]

YList = MetabolomicsUtils.createLabelList(df, labelNames)
# NOTE(review): YList2 is not read in the visible part of the script
YList2 = MetabolomicsUtils.createLabelList(df, labelNames2)

# IGF1 concentrations and the indices used to select the matching entries of
# the indicator columns
Y, inds = YList[0]
Y1 = numpy.array(df.rx(labelNames2[0])).ravel()[inds]
Y2 = numpy.array(df.rx(labelNames2[1])).ravel()[inds]
Y3 = numpy.array(df.rx(labelNames2[2])).ravel()[inds]

# Print the concentrations in increasing order next to the three indicator
# columns, so that the values at which the indicators change are visible
inds = numpy.argsort(Y)
YY = numpy.c_[Y[inds], Y1[inds]]
YY = numpy.c_[YY, Y2[inds]]
YY = numpy.c_[YY, Y3[inds]]
print(YY)

# Same extraction for cortisol, using the second entry of YList
labelNames2 = ["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]
YList2 = MetabolomicsUtils.createLabelList(df, labelNames2)

Y, inds = YList[1]
Y1 = numpy.array(df.rx(labelNames2[0])).ravel()[inds]
Y2 = numpy.array(df.rx(labelNames2[1])).ravel()[inds]
Y3 = numpy.array(df.rx(labelNames2[2])).ravel()[inds]
from rpy2.robjects.packages import importr
from socket import gethostname
import matplotlib.pyplot as plt
from apgl.data.Standardiser import Standardiser

# Script section: configure logging and printing, import the R libraries and
# load the data
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
# Fixed seed for the steps below
numpy.random.seed(21)
numpy.set_printoptions(linewidth=160, precision=3, suppress=True)

# R packages accessed through rpy2; the R option warn is set to 1
treeRankLib = importr('TreeRank')
baseLib = importr('base')
baseLib.options(warn=1)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

# NOTE(review): this value of waveletStr is replaced by the loop variable below
waveletStr = 'db4'
mode = "cpd"
maxLevel = 10
# One entry per level up to maxLevel
errors = numpy.zeros(maxLevel)
numFeatures = numpy.zeros(maxLevel)

level = 10
waveletStrs = ["haar", "db4", "db8"]

#The variances are very similar across different wavelets
for waveletStr in waveletStrs:
    # Wavelet features of X, passed through Standardiser.centreArray
    Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
    standardiser = Standardiser()
    Xw = standardiser.centreArray(Xw)