def printBICresults(params, expNameBefCl, expNameAfterCl, modelToRun,
                    nrClustList, runAllExpFunc, runPart='R'):
    """Compute (or load) BIC/AIC for several cluster counts and plot them.

    For every entry of ``nrClustList`` the experiment named
    ``'<expNameBefCl>Cl<nrClust><expNameAfterCl>'`` is run through
    ``evaluationFramework.runModels`` and the ``bic``/``aic`` values found in
    ``res[0]['std']`` are collected. Cluster counts whose ``'std'`` result is
    empty stay NaN and are excluded from the summary and from the plot. The
    collected values are pickled to ``resfiles/BICres_<...>.npz`` and the
    figure is written to ``resfiles/BICres_<...>.png``.

    ``params`` is modified in place: ``nrClust``, ``runPartStd``,
    ``runPartMain`` and ``plotTrajParams['expName']`` are overwritten for
    every cluster count.

    Args:
        params: experiment parameter dict; must contain ``'plotTrajParams'``.
        expNameBefCl: experiment-name part placed before ``Cl<nrClust>``.
        expNameAfterCl: experiment-name part placed after ``Cl<nrClust>``.
        modelToRun: forwarded unchanged to ``evaluationFramework.runModels``.
        nrClustList: sequence of integer cluster counts to evaluate.
        runAllExpFunc: forwarded unchanged to
            ``evaluationFramework.runModels``.
        runPart: ``'R'`` (default, the previous hard-coded behaviour) runs the
            experiments and saves the results; ``'L'`` loads the results that
            a previous ``'R'`` call saved.

    Returns:
        The figure object created with ``pl.figure()``.

    Raises:
        ValueError: if ``runPart`` is neither ``'R'`` nor ``'L'``, or if no
            BIC value is available for any cluster count.
    """
    fileName = 'resfiles/BICres_%s%s' % (expNameBefCl, expNameAfterCl)
    bicAllFileName = '%s.npz' % fileName
    figBicFileName = '%s.png' % fileName

    if runPart == 'R':
        bic = np.nan * np.ones(len(nrClustList), float)
        aic = np.nan * np.ones(len(nrClustList), float)

        # run the experiment once for every candidate number of clusters
        for nrClustIndex, nrClustCurr in enumerate(nrClustList):
            expName = '%sCl%d%s' % (expNameBefCl, nrClustCurr, expNameAfterCl)
            params['plotTrajParams']['expName'] = expName
            params['nrClust'] = nrClustCurr
            # [initClust, modelFit, aic/bic, plotBlender, sampleTraj]
            params['runPartStd'] = [
                'Non-enforcing', 'Non-enforcing', 'Non-enforcing', 'I', 'I'
            ]
            params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]

            _, res = evaluationFramework.runModels(
                params, expName, modelToRun, runAllExpFunc)

            stdRes = res[0]['std']
            if stdRes:
                print('bic', stdRes['bic'])
                print('aic', stdRes['aic'])
                bic[nrClustIndex] = stdRes['bic']
                aic[nrClustIndex] = stdRes['aic']

            # drop the references to this run's results before the next run
            stdRes = None
            res = None
            gc.collect()
            print('garbage collector called')
            sys.stdout.flush()

        dataStruct = dict(aic=aic, bic=bic, nrClustList=nrClustList)
        # context manager guarantees the file is flushed and closed
        with open(bicAllFileName, 'wb') as outFile:
            pickle.dump(dataStruct, outFile, pickle.HIGHEST_PROTOCOL)
    elif runPart == 'L':
        # NOTE: pickle must only be used on trusted, locally produced files
        with open(bicAllFileName, 'rb') as inFile:
            dataStruct = pickle.load(inFile)
        aic = dataStruct['aic']
        bic = dataStruct['bic']
        # the stored values are aligned with the cluster list saved with them
        nrClustList = dataStruct.get('nrClustList', nrClustList)
    else:
        raise ValueError('need to either load file or run the experiment')

    # keep only the cluster counts for which a BIC value exists
    foundInd = np.logical_not(np.isnan(bic))
    bicFound = bic[foundInd]
    aicFound = aic[foundInd]
    nrClustListFound = np.array(nrClustList)[foundInd]

    if bicFound.size == 0:
        raise ValueError(
            'no BIC/AIC values were found for any entry of nrClustList')

    minBICInd = np.argmin(bicFound)
    minBIC = bicFound[minBICInd]
    nrClustMinBIC = nrClustListFound[minBICInd]

    minAICInd = np.argmin(aicFound)
    minAIC = aicFound[minAICInd]
    nrClustMinAIC = nrClustListFound[minAICInd]

    print('bicFound, nrClustListBicFound', bicFound, nrClustListFound)
    print('minBIC, nrClustMinBIC', minBIC, nrClustMinBIC)
    print('aicFound, nrClustListAicFound', aicFound, nrClustListFound)
    print('minAIC, nrClustMinAIC', minAIC, nrClustMinAIC)

    fig = pl.figure()
    colors = ['r', 'g']
    pl.plot(nrClustListFound, bicFound, label='BIC', color=colors[0])
    pl.plot(nrClustListFound, aicFound, label='AIC', color=colors[1])

    # mark the minimum of each criterion with a dot
    size = 50
    pl.scatter([nrClustMinBIC, nrClustMinAIC], [minBIC, minAIC],
               color=colors, s=size)

    fontsize = 14
    pl.legend(fontsize=fontsize)
    pl.xlabel('Number of clusters', fontsize=fontsize)
    pl.ylabel('Criterion Value', fontsize=fontsize)

    # axis limits are derived from the first nrClustToPlot found entries only
    nrClustToPlot = 8
    pl.xlim([1, 2 + nrClustToPlot])
    allPlottedBicAicValues = np.concatenate(
        (bicFound[:nrClustToPlot], aicFound[:nrClustToPlot]))
    yMax = np.max(allPlottedBicAicValues, axis=0)
    yMin = np.min(allPlottedBicAicValues, axis=0)
    yDelta = (yMax - yMin) / 6
    pl.ylim([yMin - yDelta, yMax + yDelta])
    pl.xticks(range(2, 3 + nrClustToPlot), fontsize=fontsize)
    pl.yticks(fontsize=fontsize)

    fig.savefig(figBicFileName, dpi=100)

    return fig
def main():
    """Generate two synthetic diseases and fit them one after the other.

    Builds 2 functional units with 3 biomarkers each, generates data for
    disease one from a ``ParHierModel`` and fits it via
    ``evaluationFramework.runModels``. Disease two is generated from a second
    ``ParHierModel``, restricted to a subset of the biomarkers, and then
    fitted with ``filePathUnitModels`` pointing at the ``unitModels.npz`` path
    of the disease-one experiment.

    Reads the module-level ``args`` (``runIndex``, ``nrProc``, ``cluster``,
    ``penalty``, ``modelToRun``) and mutates the module-level
    ``plotTrajParams`` dict.
    """
    nrSubjLong = 100
    nrTimepts = 4

    lowerAgeLim = 60
    upperAgeLim = 80

    shiftsLowerLim = -13
    shiftsUpperLim = 10

    outFolder = 'resfiles/synth/'
    expName = 'synth1'
    fileName = '%s.npz' % expName
    forceRegenerate = False

    params = {}

    nrFuncUnits = 2
    nrBiomkInFuncUnits = 3
    nrBiomk = nrBiomkInFuncUnits * nrFuncUnits

    # biomarkers are assigned to units round-robin, i.e. [0, 1, 0, 1, 0, 1]
    mapBiomkToFuncUnits = np.array(
        list(range(nrFuncUnits)) * nrBiomkInFuncUnits)
    print('mapBiomkToFuncUnits', mapBiomkToFuncUnits)

    plotTrajParams['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    plotTrajParams['labels'] = ['b%d' % n for n in range(nrBiomk)]
    plotTrajParams['nrRowsFuncUnit'] = 2
    plotTrajParams['nrColsFuncUnit'] = 3
    # one evenly spaced hue per biomarker
    plotTrajParams['colorsTraj'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]
    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = False

    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penalty'] = args.penalty
    params['nrFuncUnits'] = nrFuncUnits
    params['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits

    ##### disease agnostic parameters ###########
    # params of individual biomarkers, one row of 4 values per biomarker
    thetas = np.zeros((nrBiomk, 4), float)
    thetas[:, 0] = 1
    thetas[:, 1] = 10
    thetas[:, 3] = 0
    for f in range(nrFuncUnits):
        # spread column 2 evenly over [0.2, 0.9] within every unit
        thetas[mapBiomkToFuncUnits == f, 2] = np.linspace(
            0.2, 0.9, num=nrBiomkInFuncUnits, endpoint=True)

    sigmaB = 0.1 * np.ones(nrBiomk)

    ##### disease specific parameters ###########
    # params of the dysfunctional trajectories - disease 1
    dysfuncParamsDisOne = np.zeros((nrFuncUnits, 4), float)
    dysfuncParamsDisOne[:, 0] = 1  # ak
    dysfuncParamsDisOne[:, 1] = 0.3  # bk
    dysfuncParamsDisOne[:, 2] = [-3, 7]  # ck
    dysfuncParamsDisOne[:, 3] = 0  # dk

    synthModelDisOne = ParHierModel.ParHierModel(
        dysfuncParamsDisOne, thetas, mapBiomkToFuncUnits, sigmoidFunc, sigmaB)

    paramsDisOne = genSynthData.generateDataJMD(
        nrSubjLong, nrBiomk, nrTimepts, lowerAgeLim, upperAgeLim,
        shiftsLowerLim, shiftsUpperLim, synthModelDisOne, outFolder, fileName,
        forceRegenerate, params)

    paramsDisOne['plotTrajParams']['diagNrs'] = np.unique(
        paramsDisOne['diag'])
    paramsDisOne['plotTrajParams']['trueParams'] = paramsDisOne['trueParams']

    if forceRegenerate:
        synthPlotter = Plotter.PlotterJDM(paramsDisOne['plotTrajParams'])
        fig = synthPlotter.plotTrajData(
            paramsDisOne['longData'], paramsDisOne['longDiag'],
            paramsDisOne['trueParams']['dpsLong'], synthModelDisOne,
            replaceFigMode=True)
        fig.savefig('%s/synth1Dis1GenData.png' % outFolder)

    # params of the dysfunctional trajectories - disease 2
    dysfuncParamsDisTwo = copy.deepcopy(dysfuncParamsDisOne)
    dysfuncParamsDisTwo[:, 1] = 1
    dysfuncParamsDisTwo[:, 2] = [8, -4]

    synthModelDisTwo = ParHierModel.ParHierModel(
        dysfuncParamsDisTwo, thetas, mapBiomkToFuncUnits, sigmoidFunc, sigmaB)

    paramsDisTwo = copy.deepcopy(paramsDisOne)

    # NOTE(review): both diseases pass the same outFolder/fileName; confirm
    # that generateDataJMD does not reuse disease-one data from that file.
    paramsDisTwo = genSynthData.generateDataJMD(
        nrSubjLong, nrBiomk, nrTimepts, lowerAgeLim, upperAgeLim,
        shiftsLowerLim, shiftsUpperLim, synthModelDisTwo, outFolder, fileName,
        forceRegenerate, paramsDisTwo)

    # for disease two, only keep the second biomarker in each functional unit
    indBiomkInDiseaseTwo = np.array(range(nrFuncUnits, (2 * nrFuncUnits)))
    print('indBiomkInDiseaseTwo', indBiomkInDiseaseTwo)
    paramsDisTwo['Xtrue'] = paramsDisTwo['X']
    paramsDisTwo['Ytrue'] = paramsDisTwo['Y']
    paramsDisTwo['X'] = [paramsDisTwo['X'][b] for b in indBiomkInDiseaseTwo]
    paramsDisTwo['Y'] = [paramsDisTwo['Y'][b] for b in indBiomkInDiseaseTwo]
    paramsDisTwo['mapBiomkToFuncUnits'] = np.array(
        [mapBiomkToFuncUnits[b] for b in indBiomkInDiseaseTwo])

    # for disease two, also keep X and Y in the full-biomarker layout, with
    # an empty array per subject for every biomarker that was dropped
    keptBiomk = set(indBiomkInDiseaseTwo.tolist())
    XemptyListsAllBiomk = []
    YemptyListsAllBiomk = []
    for b in range(nrBiomk):
        if b in keptBiomk:
            xCurr = [paramsDisTwo['Xtrue'][b][s] for s in range(nrSubjLong)]
            yCurr = [paramsDisTwo['Ytrue'][b][s] for s in range(nrSubjLong)]
        else:
            xCurr = [np.array([]) for _ in range(nrSubjLong)]
            yCurr = [np.array([]) for _ in range(nrSubjLong)]
        XemptyListsAllBiomk.append(xCurr)
        YemptyListsAllBiomk.append(yCurr)

    paramsDisTwo['XemptyListsAllBiomk'] = XemptyListsAllBiomk
    paramsDisTwo['YemptyListsAllBiomk'] = YemptyListsAllBiomk

    plotParamsDisTwo = paramsDisTwo['plotTrajParams']
    plotParamsDisTwo['diagNrs'] = np.unique(paramsDisTwo['diag'])
    plotParamsDisTwo['trueParams'] = paramsDisTwo['trueParams']
    # restrict the true trajectories and the labels to the kept biomarkers
    plotParamsDisTwo['trueParams']['trueTrajPredXB'] = \
        plotParamsDisTwo['trueParams']['trueTrajPredXB'][
            :, indBiomkInDiseaseTwo]
    plotParamsDisTwo['labels'] = \
        [[plotParamsDisTwo['labels'][b]] for b in indBiomkInDiseaseTwo]

    if forceRegenerate:
        synthPlotter = Plotter.PlotterJDM(paramsDisTwo['plotTrajParams'])
        fig = synthPlotter.plotTrajData(
            paramsDisTwo['longData'], paramsDisTwo['longDiag'],
            paramsDisTwo['trueParams']['dpsLong'], synthModelDisTwo,
            replaceFigMode=True)
        fig.savefig('%s/synth1Dis2GenData.png' % outFolder)

    # Integer-valued penalties are written without decimals in the experiment
    # name. The tolerance test must apply to the absolute difference; the
    # previous code took abs() of the boolean comparison instead.
    if np.abs(args.penalty - int(args.penalty)) < 0.00001:
        expName = '%sPen%d' % (expName, args.penalty)
    else:
        expName = '%sPen%.1f' % (expName, args.penalty)

    paramsDisOne['runPartStd'] = ['L', 'L']
    paramsDisOne['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    paramsDisOne['masterProcess'] = args.runIndex == 0

    expNameDisOne = '%sDisOne' % expName
    evaluationFramework.runModels(
        paramsDisOne, expNameDisOne, args.modelToRun, runAllExpSynth)

    # disease two is pointed at the unit-model file of the disease-one run
    paramsDisTwo['filePathUnitModels'] = '%s/%s_JMD/unitModels.npz' % (
        outFolder, expNameDisOne)

    paramsDisTwo['runPartStd'] = ['R', 'R']
    paramsDisTwo['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    paramsDisTwo['masterProcess'] = args.runIndex == 0

    modelDisTwo = 16
    expNameDisTwo = '%sDisTwo' % expName
    evaluationFramework.runModels(
        paramsDisTwo, expNameDisTwo, modelDisTwo, runAllExpSynth)
def main():
    """Generate two synthetic diseases, merge them and fit a joint model.

    Builds 2 functional units with 3 biomarkers each and generates data for
    disease one (``CTL``/``AD``, 100 subjects) and disease two
    (``CTL2``/``PCA``, 50 subjects) from two ``ParHierModel`` instances. For
    disease two only a subset of the biomarkers is kept; the others are
    represented by empty arrays. The two datasets are then concatenated into a
    single ``params`` dict which is passed to
    ``evaluationFramework.runModels``.

    Reads the module-level ``args`` (``expName``, ``regData``, ``runIndex``,
    ``nrProc``, ``cluster``, ``penalty``, ``runPartStd``, ``modelToRun``) and
    mutates the module-level ``plotTrajParams`` dict.

    Raises:
        ValueError: if ``args.expName`` is neither ``'synth1'`` nor
            ``'synth2'``.
    """
    nrSubjLong = 100
    nrTimepts = 4

    shiftsLowerLim = -13
    shiftsUpperLim = 10

    outFolder = 'resfiles/synth/'
    expName = args.expName
    fileName = '%s.npz' % expName
    regenerateData = args.regData

    params = {}

    nrFuncUnits = 2
    nrBiomkInFuncUnits = 3
    nrDis = 2
    nrBiomk = nrBiomkInFuncUnits * nrFuncUnits

    # biomarkers are assigned to units round-robin, i.e. [0, 1, 0, 1, 0, 1]
    mapBiomkToFuncUnits = np.array(
        list(range(nrFuncUnits)) * nrBiomkInFuncUnits)
    print('mapBiomkToFuncUnits', mapBiomkToFuncUnits)

    # indices of the biomarkers in every unit, followed by one extra entry
    # that needs to stay empty
    biomkInFuncUnit = [
        np.where(mapBiomkToFuncUnits == u)[0] for u in range(nrFuncUnits)
    ]
    biomkInFuncUnit.append(np.array([]))

    plotTrajParams['biomkInFuncUnit'] = biomkInFuncUnit
    plotTrajParams['labels'] = ['biomarker %d' % n for n in range(nrBiomk)]
    plotTrajParams['nrRowsFuncUnit'] = 3
    plotTrajParams['nrColsFuncUnit'] = 4
    # evenly spaced hues: one per biomarker and one per functional unit
    plotTrajParams['colorsTrajBiomkB'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]
    plotTrajParams['colorsTrajUnitsU'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrFuncUnits, endpoint=False)
    ]
    # other values that appeared here: 'zScoreTraj', 'zScoreEarlyStageTraj'
    plotTrajParams['yNormMode'] = 'unscaled'
    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = True

    params['unitNames'] = ['Unit%d' % f for f in range(nrFuncUnits)]
    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penalty'] = args.penalty
    params['penaltyUnits'] = 20
    params['penaltyDis'] = 1
    params['nrFuncUnits'] = nrFuncUnits
    params['nrFuncUnitsImgOnly'] = nrFuncUnits
    params['biomkInFuncUnit'] = biomkInFuncUnit
    params['nrBiomkDisModel'] = nrFuncUnits
    params['nrExtraBiomk'] = 0

    # these parameters are specific for the Joint Model of Disease (JMD)
    params['nrGlobIterUnit'] = 10
    params['iterParamsUnit'] = 50
    params['nrGlobIterDis'] = 10
    params['iterParamsDis'] = 50

    # by default we have no priors
    params['priors'] = None

    ####### set priors for specific models #########
    params['priorsUnitModelsMarcoModel'] = [
        dict(prior_length_scale_mean_ratio=0.05,
             prior_length_scale_std=1e-6,
             prior_sigma_mean=0.5,
             prior_sigma_std=1e-3,
             prior_eps_mean=0.1,
             prior_eps_std=1e-6) for u in range(nrFuncUnits)
    ]

    transitionTimePriorMean = 1  # in DPS 0-1 space, prior mean
    transitionTimePriorMin = 0.1
    transitionTimePriorMax = 10

    bPriorShape, bPriorRate = getGammShapeRateFromTranTime(
        transitionTimePriorMean, transitionTimePriorMin,
        transitionTimePriorMax)

    params['priorsDisModels'] = [
        dict(meanA=1, stdA=1e-5, meanD=0, stdD=1e-5, shapeB=bPriorShape,
             rateB=bPriorRate, timeShiftStd=15) for d in range(nrDis)
    ]
    params['priorsUnitModels'] = [None for d in range(nrDis)]

    ##### disease agnostic parameters ###########
    # params of individual biomarkers, one row of 4 values per biomarker
    thetas = np.zeros((nrBiomk, 4), float)
    thetas[:, 0] = 1
    thetas[:, 3] = 0
    for f in range(nrFuncUnits):
        # spread column 2 evenly over [0.2, 0.9] within every unit
        thetas[mapBiomkToFuncUnits == f, 2] = np.linspace(
            0.2, 0.9, num=nrBiomkInFuncUnits, endpoint=True)

    # set first functional unit to have traj with lower slopes
    thetas[mapBiomkToFuncUnits == 0, 1] = 5
    thetas[mapBiomkToFuncUnits == 1, 1] = 10

    if args.expName == 'synth1':
        sigmaB = 0.05 * np.ones(nrBiomk)
    elif args.expName == 'synth2':
        sigmaB = 0.01 * np.ones(nrBiomk)
    else:
        raise ValueError('expName should be synth1 or synth2')

    # scale every biomarker with mean and std: row 0 stays 0, row 1 is 1
    scalingBiomk2B = np.zeros((2, nrBiomk))
    scalingBiomk2B[1, :] = 1

    ##### disease 1 - disease specific parameters ###########
    # params of the dysfunctional trajectories
    dysfuncParamsDisOne = np.zeros((nrFuncUnits, 4), float)
    dysfuncParamsDisOne[:, 0] = 1  # ak
    dysfuncParamsDisOne[:, 1] = [0.3, 0.2]  # bk
    dysfuncParamsDisOne[:, 2] = [-4, 6]  # ck
    dysfuncParamsDisOne[:, 3] = 0  # dk

    synthModelDisOne = ParHierModel.ParHierModel(
        dysfuncParamsDisOne, thetas, mapBiomkToFuncUnits, sigmoidFunc, sigmaB)

    paramsDisOne = copy.deepcopy(params)

    paramsDisOne = genSynthData.generateDataJMD(
        nrSubjLong, nrBiomk, nrTimepts, shiftsLowerLim, shiftsUpperLim,
        synthModelDisOne, outFolder, fileName, regenerateData, paramsDisOne,
        scalingBiomk2B, ctlDiagNr=CTL, patDiagNr=AD)

    replaceFigMode = True

    if regenerateData:
        synthPlotter = Plotter.PlotterJDM(paramsDisOne['plotTrajParams'])
        fig = synthPlotter.plotTrajDataMarcoFormat(
            paramsDisOne['X'], paramsDisOne['Y'], paramsDisOne['diag'],
            synthModelDisOne, paramsDisOne['trueParamsDis'],
            replaceFigMode=replaceFigMode)
        fig.savefig('%s/%sDis1GenData.png' % (outFolder, expName))

    ##### disease 2 - disease specific parameters ###########
    # params of the dysfunctional trajectories: same as disease one (bk
    # included) except for ck, whose two values are swapped
    dysfuncParamsDisTwo = copy.deepcopy(dysfuncParamsDisOne)
    dysfuncParamsDisTwo[:, 2] = [6, -4]

    synthModelDisTwo = ParHierModel.ParHierModel(
        dysfuncParamsDisTwo, thetas, mapBiomkToFuncUnits, sigmoidFunc, sigmaB)

    paramsDisTwo = copy.deepcopy(paramsDisOne)
    nrSubjLongDisTwo = 50
    nrTimeptsDisTwo = 4

    # NOTE(review): both diseases pass the same outFolder/fileName; confirm
    # that generateDataJMD does not reuse disease-one data from that file.
    paramsDisTwo = genSynthData.generateDataJMD(
        nrSubjLongDisTwo, nrBiomk, nrTimeptsDisTwo, shiftsLowerLim,
        shiftsUpperLim, synthModelDisTwo, outFolder, fileName, regenerateData,
        paramsDisTwo, scalingBiomk2B, ctlDiagNr=CTL2, patDiagNr=PCA)

    # for disease two, only keep the second biomarker in each functional unit
    indBiomkInDiseaseTwo = np.array(range(nrFuncUnits, (2 * nrFuncUnits)))
    print('indBiomkInDiseaseTwo', indBiomkInDiseaseTwo)
    paramsDisTwo['Xtrue'] = paramsDisTwo['X']
    paramsDisTwo['Ytrue'] = paramsDisTwo['Y']

    # for disease two, keep X, Y and the visit indices in the full-biomarker
    # layout, with an empty array per subject for every dropped biomarker
    keptBiomk = set(indBiomkInDiseaseTwo.tolist())
    subjRangeDisTwo = range(nrSubjLongDisTwo)
    XemptyListsAllBiomk = []
    YemptyListsAllBiomk = []
    visitIndicesDisTwoMissing = []
    for b in range(nrBiomk):
        if b in keptBiomk:
            xCurr = [paramsDisTwo['Xtrue'][b][s] for s in subjRangeDisTwo]
            yCurr = [paramsDisTwo['Ytrue'][b][s] for s in subjRangeDisTwo]
            visitCurr = [
                paramsDisTwo['visitIndices'][b][s] for s in subjRangeDisTwo
            ]
        else:
            xCurr = [np.array([]) for _ in subjRangeDisTwo]
            yCurr = [np.array([]) for _ in subjRangeDisTwo]
            visitCurr = [np.array([]) for _ in subjRangeDisTwo]
        XemptyListsAllBiomk.append(xCurr)
        YemptyListsAllBiomk.append(yCurr)
        visitIndicesDisTwoMissing.append(visitCurr)

    paramsDisTwo['XemptyListsAllBiomk'] = XemptyListsAllBiomk
    paramsDisTwo['YemptyListsAllBiomk'] = YemptyListsAllBiomk
    paramsDisTwo['visitIndicesMissing'] = visitIndicesDisTwoMissing

    if regenerateData:
        # one figure with all biomarkers, one with the dropped ones missing
        synthPlotter = Plotter.PlotterJDM(paramsDisTwo['plotTrajParams'])
        fig = synthPlotter.plotTrajDataMarcoFormat(
            paramsDisTwo['Xtrue'], paramsDisTwo['Ytrue'],
            paramsDisTwo['diag'], synthModelDisTwo,
            paramsDisTwo['trueParamsDis'], replaceFigMode=replaceFigMode)
        fig.savefig('%s/%sDis2GenDataFull.png' % (outFolder, expName))

        synthPlotter = Plotter.PlotterJDM(paramsDisTwo['plotTrajParams'])
        fig = synthPlotter.plotTrajDataMarcoFormat(
            paramsDisTwo['XemptyListsAllBiomk'],
            paramsDisTwo['YemptyListsAllBiomk'], paramsDisTwo['diag'],
            synthModelDisTwo, paramsDisTwo['trueParamsDis'],
            replaceFigMode=replaceFigMode)
        fig.savefig('%s/%sDis2GenDataMissing.png' % (outFolder, expName))

    ############### now merge the two datasets ############

    # add the biomarkers from the second dataset, same format as dataset 1
    # but with missing entries; params aliases paramsDisOne from here on
    params = paramsDisOne
    for b in range(nrBiomk):
        params['X'][b] += paramsDisTwo['XemptyListsAllBiomk'][b]
        params['Y'][b] += paramsDisTwo['YemptyListsAllBiomk'][b]
        params['visitIndices'][b] += paramsDisTwo['visitIndicesMissing'][b]

    # RIDs must be different, so disease-two RIDs are offset by nrSubjLong
    params['RID'] = np.concatenate(
        (params['RID'], nrSubjLong + paramsDisTwo['RID']), axis=0)

    # this is the full vector of diagnoses for all diseases
    params['diag'] = np.concatenate(
        (paramsDisOne['diag'], paramsDisTwo['diag']), axis=0)
    params['plotTrajParams']['diag'] = params['diag']

    params['trueParamsDis'] = [
        params['trueParamsDis'], paramsDisTwo['trueParamsDis']
    ]

    for f in range(nrFuncUnits):
        params['trueParamsFuncUnits'][f]['subShiftsS'] = np.concatenate(
            (params['trueParamsFuncUnits'][f]['subShiftsS'],
             paramsDisTwo['trueParamsFuncUnits'][f]['subShiftsS']), axis=0)

    # map which diagnoses belong to which disease
    # first disease has CTL+AD, second disease has CTL2+PCA
    params['diagsSetInDis'] = [np.array([CTL, AD]), np.array([CTL2, PCA])]
    params['disLabels'] = ['Dis0', 'Dis1']
    params['otherBiomkPerDisease'] = [[], []]

    # one boolean subject mask per disease
    params['binMaskSubjForEachDisD'] = [
        np.in1d(params['diag'], params['diagsSetInDis'][disNr])
        for disNr in range(nrDis)
    ]

    # sanity checks on the merged dataset
    assert params['diag'].shape[0] == len(params['X'][0])
    assert np.sum(params['binMaskSubjForEachDisD'][0]) == len(
        params['trueParamsDis'][0]['subShiftsS'])
    assert params['diag'].shape[0] == len(
        params['trueParamsFuncUnits'][0]['subShiftsS'])

    params['runPartStd'] = args.runPartStd
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    params['masterProcess'] = args.runIndex == 0

    evaluationFramework.runModels(
        params, expName, args.modelToRun, runAllExpSynth)
def launchADNIthick(runIndex, nrProcesses, modelToRun):
    """Prepare the ADNI cortical-thickness data and run the models on it.

    Loads the info and data files with prefix ``cortThickADNI3Scans`` from
    ``../data/ADNI/``, drops the data columns that contain ``maxNrZeros`` or
    more zeros, converts data and age to Z-scores with respect to the
    baseline controls (``diag == CTL`` and ``scanTimepts == 1``), fills the
    module-level ``params`` and ``plotTrajParams`` dicts and calls
    ``evaluationFramework.runModels`` with ``adniDEM.runAllExpADNI``. On the
    master process the results are then passed to ``printResADNIthick``.

    Relies on the module-level ``args``, ``params`` (which must already
    contain ``'nrClust'``) and ``plotTrajParams`` (which must already contain
    ``'freesurfPath'``).

    Args:
        runIndex: index of this process; index 0 is the master process.
        nrProcesses: not used inside this function.
        modelToRun: stored in ``params['modelToRun']`` and forwarded to
            ``evaluationFramework.runModels``.
    """
    inputPrefix = 'cortThickADNI3Scans'
    inputFileDataFull = '../data/ADNI/%sData.npz' % inputPrefix
    inputFileInfo = '../data/ADNI/%sInfo.npz' % inputPrefix
    print(inputFileInfo)
    sys.stdout.flush()

    # NOTE: pickle must only be used on trusted, locally produced files.
    # The context manager closes the file handle once loading is done.
    with open(inputFileInfo, 'rb') as infoFile:
        infoStruct = pickle.load(infoFile)

    print('will enter readDataFile')
    dataStruct = readDataFile(inputFileDataFull, args.cluster)

    print('compiling parameters')
    sys.stdout.flush()

    data = dataStruct['avghData']
    diag = np.array(np.squeeze(infoStruct['diag']), int)
    scanTimepts = np.squeeze(infoStruct['scanTimepts'])
    partCode = np.squeeze(infoStruct['partCode'])
    ageAtScan = np.squeeze(infoStruct['ageAtScan'])
    pointIndices = dataStruct['pointIndices']
    cogTests = infoStruct['cogTests']

    assert (not np.any(np.isnan(data)))
    print('diag', np.unique(diag), diag)

    # only keep the columns that have fewer than maxNrZeros zero entries
    maxNrZeros = 5
    selectedBiomk = np.sum(data == 0, 0) < maxNrZeros
    data = data[:, selectedBiomk]
    pointIndices = pointIndices[selectedBiomk]

    # calculate Z-scores at each point w.r.t controls at baseline
    controlBlInd = np.logical_and(diag == CTL, scanTimepts == 1)
    meanCTL = np.mean(data[controlBlInd], 0)
    stdCTL = np.std(data[controlBlInd], 0)
    dataZ = (data - meanCTL[None, :]) / stdCTL[None, :]

    meanAgeCTL = np.mean(ageAtScan[controlBlInd], 0)
    stdAgeCTL = np.std(ageAtScan[controlBlInd], 0)
    ageAtScanZ = (ageAtScan - meanAgeCTL) / stdAgeCTL

    # diagnostics: rows/columns with NaN Z-scores and zero-variance columns
    (rowInd, colInd) = np.where(np.isnan(dataZ))
    rowIndUnq = np.unique(rowInd)
    colIndUnq = np.unique(colInd)
    print(rowIndUnq, colIndUnq)
    print(np.where(stdCTL == 0))
    print(data.shape)
    sys.stdout.flush()

    data = dataZ
    assert (not np.any(np.isnan(data)))

    # columns ordered by mean Z-score among AD rows, lowest first
    dataAD = data[diag == AD, :]
    indMaxAbnormality = np.argsort(np.mean(dataAD, 0))  # lowest cortical thickness
    print(indMaxAbnormality)

    sortedByPvalInd, labels, names = testMeanBiomkValue(
        data, diag, pointIndices, plotTrajParams)
    assert (sortedByPvalInd.shape[0] == data.shape[1])

    print(infoStruct['cogTestsLabels'])
    sys.stdout.flush()

    params['data'] = data
    params['diag'] = diag
    params['scanTimepts'] = scanTimepts
    params['partCode'] = partCode
    params['ageAtScan'] = ageAtScanZ
    params['biomkDir'] = DECR
    params['modelToRun'] = modelToRun
    params['cogTests'] = np.squeeze(cogTests)  # CDRSOB, ADAS13, MMSE, RAVLT
    # make MMSE and RAVLT have increasing scores from CTL->AD
    params['cogTests'][:, [2, 3]] *= -1
    params['datasetFull'] = 'adniThick'
    # if true then don't model progression speed, only time shift
    params['fixSpeed'] = False

    # map points that have been removed to the closest included points
    # (nearestNeighbours). also find the adjacency list for the MRF and
    # another subset of 10k points for initial clustering
    runPartNN = 'L'
    plotTrajParams['nearestNeighbours'], params['adjList'], \
        params['nearNeighInitClust'], params['initClustSubsetInd'] = \
        findNearNeigh(runPartNN, params['datasetFull'], pointIndices,
                      plotTrajParams['freesurfPath'], indMaxAbnormality)

    sys.stdout.flush()

    # all per-scan arrays must have the same number of rows
    assert (params['data'].shape[0] == params['diag'].shape[0] ==
            params['scanTimepts'].shape[0] == params['partCode'].shape[0] ==
            params['ageAtScan'].shape[0] == params['cogTests'].shape[0])

    # sets an uninformative or informative prior
    priorNr = setPrior(params, args.informPrior, mean_gamma_alpha=1,
                       std_gamma_alpha=0.3, mu_beta=0, std_beta=5)

    expName = 'adniThFWHM%dInit%sCl%dPr%dRa%dMrf%d' % (
        args.fwhmLevel, args.initClustering, params['nrClust'], priorNr,
        args.rangeFactor, args.alphaMRF)

    plotTrajParams['sortedByPvalInd'] = sortedByPvalInd
    plotTrajParams['pointIndices'] = pointIndices
    plotTrajParams['labels'] = labels
    plotTrajParams['names'] = names
    plotTrajParams['expName'] = expName
    plotTrajParams['ageTransform'] = (meanAgeCTL, stdAgeCTL)
    plotTrajParams['datasetFull'] = params['datasetFull']
    params['plotTrajParams'] = plotTrajParams

    # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
    params['runPartStd'] = ['L', 'Non-enforcing', 'I', 'R', 'L']
    params['runPartMain'] = ['I', 'I', 'I']  # [mainPart, plot, stage]
    params['runPartCogCorr'] = ['R']
    params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartDirDiag'] = ['R', 'R', 'I']
    params['runPartStaging'] = ['L', 'L', 'I']
    params['runPartDiffDiag'] = ['R', 'R', 'I']
    params['runPartConvPred'] = ['I', 'I', 'I']
    params['runPartCVNonOverlap'] = ['I']
    params['runPartCVNonOverlapMain'] = ['L', 'L', 'I', 'I', 'I']

    params['masterProcess'] = runIndex == 0

    if params['masterProcess']:
        # The master process uses different flags for these two entries only;
        # every other run-part flag has the same value as set above.
        # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
        params['runPartStd'] = ['R', 'R', 'R', 'R', 'R']
        params['runPartCVNonOverlapMain'] = ['R', 'R', 'I', 'R', 'R']

    runAllExpFunc = adniDEM.runAllExpADNI
    modelNames, res = evaluationFramework.runModels(
        params, expName, modelToRun, runAllExpFunc)

    if params['masterProcess']:
        printResADNIthick(modelNames, res, plotTrajParams)

    # NOTE(review): the three values below are not consumed by any visible
    # code. They match the arguments of printBICresults, so a BIC sweep call
    # may have been dropped here - confirm before removing them.
    expNameBefCl = 'adniThFWHM%dInit%s' % (args.fwhmLevel,
                                           args.initClustering)
    expNameAfterCl = 'Pr%dRa%dMrf%d' % (args.informPrior, args.rangeFactor,
                                        args.alphaMRF)
    nrClustList = [2, 3, 4, 5, 6, 7, 8, 9, 10]
# Fragment of a launcher whose enclosing definition is not visible here:
# it sets the run-part flags on the master process, runs the models and then
# chooses the forecast output file, start date and horizon.
if params['masterProcess']:
    # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
    params['runPartStd'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartMain'] = ['R', 'R', 'R'] # [mainPart, plot, stage]
    params['runPartCogCorr'] = ['I']
    params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartDirDiag'] = ['R', 'R', 'I']
    params['runPartStaging'] = ['L', 'L', 'I']
    params['runPartDiffDiag'] = ['R', 'R', 'I']
    params['runPartConvPred'] = ['I', 'I', 'I']
    params['runPartCVNonOverlap'] = ['I']
    params['runPartCVNonOverlapMain'] = ['R', 'R', 'I', 'R', 'R']

runAllExpFunc = runAllExpTADPOLE
modelNames, res = evaluationFramework.runModels(params, expName, modelToRun, runAllExpFunc)

# now generate forecast
print('Generating forecast ... ')
teamName = 'DIVE6'
if args.leaderboard:
    # leaderboard submission: forecast starts on 2010-05-01
    outputFile = 'TADPOLE_Submission_Leaderboard_D3_%s.csv' % teamName
    predStartDate = datetime.date(2010, 5, 1)
    nrYearsToPred = 7
    nrMonthsToPred = 12*nrYearsToPred # 7 years
else:
    # non-leaderboard submission: forecast starts on 2018-01-01
    outputFile = 'TADPOLE_Submission_D3_%s.csv' % teamName
    predStartDate = datetime.date(2018, 1, 1)
    nrYearsToPred = 5
    nrMonthsToPred = 12*nrYearsToPred # 5 years
def main():
    """Configure and launch the joint TADPOLE + DRC experiment.

    Loads the pre-processed dataset (regenerating it through ``prepareData``
    when the file is missing or ``args.regData`` is set), fills the ``params``
    dictionary (data, functional-unit layout, priors, placeholder "true"
    parameters, plotting options) and hands it to
    ``evaluationFramework.runModels`` together with ``runAllExpTadpoleDrc``.
    The master process (``args.runIndex == 0``) prints the results afterwards.

    Reads the module-level ``args`` and mutates the module-level
    ``plotTrajParams``.

    Raises:
        ValueError: if a subject has no measurement in any biomarker.
    """
    # don't turn this on unless I add cognitive markers in the DRC dataset.
    addExtraBiomk = False

    np.random.seed(1)
    random.seed(1)
    pd.set_option('display.max_columns', 50)
    tinyData = args.tinyData

    finalDataFile = 'data_processed/tadDrc.npz'
    expName = 'tadDrc'
    if args.tinyData:
        finalDataFile = finalDataFile.split('.')[0] + 'Tiny.npz'
        expName = expName.split('.')[0] + 'Tiny'
    if addExtraBiomk:
        finalDataFile = finalDataFile.split('.')[0] + 'Cog.npz'
        expName = expName.split('.')[0] + 'Cog'

    regenerateData = (not os.path.isfile(finalDataFile)) or args.regData
    if regenerateData:
        prepareData(finalDataFile, tinyData, addExtraBiomk)

    # Despite the .npz extension the file is a pickle; close the handle once
    # the structure has been read.
    with open(finalDataFile, 'rb') as dataFile:
        ds = pickle.load(dataFile)

    dataDfAll = ds['dataDfAll']
    X = ds['X']
    Y = ds['Y']
    RID = np.array(ds['RID'], int)
    labels = ds['list_biomarkers']
    diag = ds['diag']

    params = {}

    # One extra biomarker per functional unit when AV1451 markers are present.
    av45InListBiomk = any(x.startswith('AV1451') for x in labels)
    if av45InListBiomk:
        nrBiomkInFuncUnits = 5
    else:
        nrBiomkInFuncUnits = 4

    nrDis = 2  # nr of diseases
    params['nrDis'] = nrDis

    # change the order of the functional units so that the hippocampus and
    # occipital are fitted first
    unitPermutation = [5, 3, 2, 1, 4, 0]
    nrFuncUnits = 6
    mapBiomkToFuncUnits = np.array(unitPermutation * nrBiomkInFuncUnits)
    nrExtraBiomk = 0
    if addExtraBiomk:
        nrExtraBiomk = 5
        nrFuncUnits += nrExtraBiomk
        # each extra cognitive marker gets its own functional unit
        mapBiomkToFuncUnits = np.array(
            (unitPermutation * nrBiomkInFuncUnits)
            + list(range(nrFuncUnits - nrExtraBiomk, nrFuncUnits)))

    unitNames = [label.split(' ')[-1] for label in labels]
    unitNames = [unitNames[i] for i in unitPermutation]
    if addExtraBiomk:
        extraBiomkNames = ['ADAS13', 'CDRSB', 'RAVLT', 'MMSE', 'FAQ']
        unitNames += extraBiomkNames
        assert len(extraBiomkNames) == nrExtraBiomk

    nrBiomk = mapBiomkToFuncUnits.shape[0]
    # Indices of the biomarkers in each functional unit, plus one trailing
    # entry that needs to be left empty.
    biomkInFuncUnit = [
        np.where(mapBiomkToFuncUnits == u)[0] for u in range(nrFuncUnits)
    ]
    biomkInFuncUnit.append(np.array([]))

    plotTrajParams['biomkInFuncUnit'] = biomkInFuncUnit
    plotTrajParams['labels'] = labels
    plotTrajParams['nrRowsFuncUnit'] = 3
    plotTrajParams['nrColsFuncUnit'] = 4
    plotTrajParams['colorsTrajBiomkB'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]
    plotTrajParams['colorsTrajUnitsU'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrFuncUnits, endpoint=False)
    ]
    plotTrajParams['nrBiomk'] = nrBiomk
    params['nrBiomk'] = nrBiomk

    # other modes used previously: 'zScoreTraj', 'zScoreEarlyStageTraj'
    plotTrajParams['yNormMode'] = 'unscaled'

    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = False

    params['nrFuncUnitsImgOnly'] = nrFuncUnits - nrExtraBiomk
    params['unitNames'] = unitNames
    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penaltyUnits'] = args.penalty
    params['penaltyDis'] = args.penalty
    params['nrFuncUnits'] = nrFuncUnits
    params['biomkInFuncUnit'] = biomkInFuncUnit
    params['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    params['labels'] = labels
    params['nrExtraBiomk'] = nrExtraBiomk

    params['X'] = X
    params['Y'] = Y
    params['RID'] = RID
    params['diag'] = diag
    params['plotTrajParams']['diag'] = params['diag']
    params['Xvalid'] = ds['Xvalid']
    params['Yvalid'] = ds['Yvalid']
    params['RIDvalid'] = ds['RIDvalid']
    params['diagValid'] = ds['diagValid']
    params['dataDfAll'] = dataDfAll
    params['visitIndices'] = ds['visitIndices']
    params['visitIndicesValid'] = ds['visitIndicesValid']

    # by default we have no priors
    params['priors'] = None

    # Sanity check: every subject must have at least one measurement in some
    # biomarker. The number of biomarkers is taken from X itself instead of a
    # hard-coded 30, which only matched the 6 units x 5 modalities layout.
    for s in range(len(X[0])):
        entriesCurrSubj = [X[b][s].shape[0] > 0 for b in range(len(X))]
        if not any(entriesCurrSubj):
            print(s, entriesCurrSubj)
            raise ValueError(
                'subject at index %d has no measurements in any biomarker' % s)

    print(labels)

    ############# set priors for specific models ################

    params['priorsUnitModelsMarcoModel'] = [
        dict(
            prior_length_scale_mean_ratio=0.05,
            prior_length_scale_std=1e-6,
            prior_sigma_mean=0.5,
            prior_sigma_std=1e-3,
            prior_eps_mean=0.1,
            prior_eps_std=1e-6) for u in range(nrFuncUnits)
    ]

    transitionTimePriorMean = 1  # in DPS 0-1 space, prior mean
    transitionTimePriorMin = 0.9
    transitionTimePriorMax = 1.1
    bPriorShape, bPriorRate = getGammShapeRateFromTranTime(
        transitionTimePriorMean, transitionTimePriorMin,
        transitionTimePriorMax)

    transitionTimePriorMeanAD = 0.1  # using months instead of years
    transitionTimePriorMinAD = 0.09
    transitionTimePriorMaxAD = 0.11
    bPriorShapeDisAD, bPriorRateDisAD = getGammShapeRateFromTranTime(
        transitionTimePriorMeanAD, transitionTimePriorMinAD,
        transitionTimePriorMaxAD)
    _, bPriorStdAD = getMeanStdBFromTranTime(transitionTimePriorMeanAD,
                                             transitionTimePriorMinAD,
                                             transitionTimePriorMaxAD)

    transitionTimePriorMeanPCA = 500
    transitionTimePriorMinPCA = 400
    transitionTimePriorMaxPCA = 600
    bPriorShapeDisPCA, bPriorRateDisPCA = getGammShapeRateFromTranTime(
        transitionTimePriorMeanPCA, transitionTimePriorMinPCA,
        transitionTimePriorMaxPCA)
    _, bPriorStdPCA = getMeanStdBFromTranTime(transitionTimePriorMeanPCA,
                                              transitionTimePriorMinPCA,
                                              transitionTimePriorMaxPCA)

    params['priorsDisModels'] = [
        # priors for tAD
        dict(meanA=1, stdA=1e-20, meanD=0, stdD=1e-20,
             shapeB=bPriorShapeDisAD, rateB=bPriorRateDisAD,
             stdPerturbB=bPriorStdAD, timeShiftStd=20000),
        # priors for PCA
        dict(meanA=1, stdA=1e-20, meanD=0, stdD=1e-20,
             shapeB=bPriorShapeDisPCA, rateB=bPriorRateDisPCA,
             stdPerturbB=bPriorStdPCA, timeShiftStd=20000),
    ]

    params['priorsUnitModels'] = [
        dict(meanA=1, stdA=1e-5, meanD=0, stdD=1e-5,
             shapeB=bPriorShape, rateB=bPriorRate, timeShiftStd=20000)
        for u in range(nrFuncUnits - nrExtraBiomk)
    ]
    if nrExtraBiomk > 0:
        params['priorsUnitModelsLinear'] = [
            dict(meanA=1, stdA=0.1, meanB=0, stdB=0.1, timeShiftStd=20000)
            for u in range(nrExtraBiomk)
        ]
        params['priorsUnitModels'] += params['priorsUnitModelsLinear']

    bPriorShapeNoDKT, bPriorRateNoDKT = getGammShapeRateFromTranTime(
        transitionTimePriorMean=50,
        transitionTimePriorMin=40,
        transitionTimePriorMax=60)
    params['priorsNoDKTSigmoid'] = dict(meanA=1, stdA=1e-5, meanD=0,
                                        stdD=1e-5, shapeB=bPriorShapeNoDKT,
                                        rateB=bPriorRateNoDKT,
                                        timeShiftStd=20000)

    ######################

    nrBiomkDisModel = nrFuncUnits
    params['nrBiomkDisModel'] = nrBiomkDisModel

    if addExtraBiomk:
        # NOTE(review): unitNames already contains the five extra names at
        # this point and only three labels are appended here - confirm the
        # intent before enabling addExtraBiomk.
        params['plotTrajParams']['unitNames'] = unitNames + labels[-3:]
    else:
        params['plotTrajParams']['unitNames'] = unitNames

    # map which diagnoses belong to which disease
    # first disease has CTL+MCI+AD+AD2, second disease has CTL2+PCA
    params['diagsSetInDis'] = [
        np.array([CTL, MCI, AD, AD2]),
        np.array([CTL2, PCA])
    ]
    params['disLabels'] = ['tAD', 'PCA']

    params['binMaskSubjForEachDisD'] = [
        np.in1d(params['diag'], params['diagsSetInDis'][disNr])
        for disNr in range(nrDis)
    ]

    # No ground truth exists for real data, so the "true" parameters below
    # are constant placeholders.
    eps = 0.001
    nrXPoints = 50
    subShiftsS = np.zeros(RID.shape[0])
    trueDysfuncXsX = np.linspace(0, 1, nrXPoints)
    trueTrajFromDysXB = eps * np.ones((nrXPoints, nrBiomk))
    trueLineSpacedDPSsX = np.linspace(-10, 10, nrXPoints)
    trueTrajPredXB = eps * np.ones((nrXPoints, nrBiomk))
    trueDysTrajFromDpsXU = eps * np.ones((nrXPoints, nrBiomkDisModel))

    scalingBiomk2B = np.zeros((2, nrBiomk))
    scalingBiomk2B[1, :] = 1

    trueParamsFuncUnits = [
        dict(xsX=trueDysfuncXsX,
             ysXB=trueTrajFromDysXB[:, biomkInFuncUnit[f]],
             subShiftsS=subShiftsS,
             scalingBiomk2B=scalingBiomk2B[:, biomkInFuncUnit[f]])
        for f in range(nrFuncUnits)
    ]

    # disease specific
    trueParamsDis = [
        dict(xsX=trueLineSpacedDPSsX,
             ysXU=trueDysTrajFromDpsXU,
             ysXB=trueTrajPredXB,
             subShiftsS=np.zeros(
                 np.sum(np.in1d(params['diag'], params['diagsSetInDis'][d]))),
             scalingBiomk2B=scalingBiomk2B)
        for d in range(nrDis)
    ]

    # for DKT DPMs
    params['trueParamsFuncUnits'] = trueParamsFuncUnits
    params['trueParamsDis'] = trueParamsDis

    # simpler non-DKT DPMs
    params['trueParams'] = dict(xsX=trueLineSpacedDPSsX,
                                ysXU=trueTrajPredXB,
                                ysXB=trueTrajPredXB,
                                subShiftsS=subShiftsS,
                                scalingBiomk2B=scalingBiomk2B)
    params['plotTrajParams']['trueParams'] = params['trueParams']

    print('diag', params['diag'].shape[0])
    print('X[0]', len(params['X'][0]))
    assert params['diag'].shape[0] == len(params['X'][0])

    params['runPartStd'] = args.runPartStd
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]

    params['masterProcess'] = args.runIndex == 0

    modelNames, res = evaluationFramework.runModels(params, expName,
                                                    args.modelToRun,
                                                    runAllExpTadpoleDrc)

    if params['masterProcess']:
        printRes(modelNames, res, plotTrajParams, params)
def main():
    """Configure and launch the TADPOLE experiment on the tiny dataset.

    Regenerates the dataset through ``prepareData``, loads it, fills the
    ``params`` dictionary (data, functional-unit layout, zero-valued "true"
    parameter placeholders, plotting options) and runs
    ``evaluationFramework.runModels`` with ``runAllExpTadpoleDrc``.

    Reads the module-level ``args`` and mutates the module-level
    ``plotTrajParams``.
    """
    np.random.seed(1)
    random.seed(1)
    pd.set_option('display.max_columns', 50)
    tinyData = True
    regenerateData = True

    if tinyData:
        finalDataFile = 'tadpoleTiny.npz'
    else:
        finalDataFile = 'tadpoleFinalDataWithRegParams.npz'

    if regenerateData:
        prepareData(finalDataFile, tinyData)

    # Despite the .npz extension the file is a pickle; close the handle once
    # the structure has been read.
    with open(finalDataFile, 'rb') as dataFile:
        ds = pickle.load(dataFile)

    X = ds['X']
    Y = ds['Y']
    RID = np.array(ds['RID'])
    labels = ds['list_biomarkers']
    diag = ds['diag']

    expName = 'tadpole'

    params = {}

    nrFuncUnits = 2
    nrBiomkInFuncUnits = 5
    nrBiomk = len(labels)

    # Order in which the functional units are fitted (identity here).
    unitPermutation = [0, 1]
    mapBiomkToFuncUnits = np.array(unitPermutation * nrBiomkInFuncUnits)

    unitNames = [label.split(' ')[-1] for label in labels]
    unitNames = [unitNames[i] for i in unitPermutation]

    plotTrajParams['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    plotTrajParams['labels'] = labels
    plotTrajParams['nrRowsFuncUnit'] = 3
    plotTrajParams['nrColsFuncUnit'] = 3
    plotTrajParams['colorsTraj'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]
    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = False

    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penalty'] = args.penalty
    params['nrFuncUnits'] = nrFuncUnits
    params['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    params['labels'] = labels

    params['X'] = X
    params['Y'] = Y
    params['RID'] = RID
    params['diag'] = diag
    params['plotTrajParams']['diag'] = params['diag']

    nrXPoints = 50
    nrDis = 2  # nr of diseases
    # No ground truth exists for real data, so the "true" parameters are
    # zero-valued placeholders.
    trueDysfuncXsX = np.linspace(0, 1, nrXPoints)
    params['trueParams'] = {
        'subShiftsTrueMarcoFormatS': np.zeros(RID.shape[0]),
        'trueSubjDysfuncScoresSU': np.zeros((RID.shape[0], nrFuncUnits)),
        'trueDysfuncXsX': trueDysfuncXsX,
        'trueTrajXB': np.zeros((nrXPoints, nrBiomk)),
        'trueTrajFromDysXB': np.zeros((nrXPoints, nrBiomk)),
        'trueXsTrajX': trueDysfuncXsX,
        'trueLineSpacedDPSsX': np.linspace(-10, 10, nrXPoints),
        'trueDysTrajFromDpsXU': [
            np.zeros((nrXPoints, nrFuncUnits)) for d in range(nrDis)
        ],
    }
    params['plotTrajParams']['trueParams'] = params['trueParams']
    params['plotTrajParams']['unitNames'] = unitNames

    # map which diagnoses belong to which disease
    # first disease has CTL+MCI+AD, second disease has CTL2+PCA
    params['diagsSetInDis'] = [
        np.array([CTL, MCI, AD]),
        np.array([CTL2, PCA])
    ]
    params['disLabels'] = ['tAD', 'PCA']

    print('diag', params['diag'].shape[0])
    print('X[0]', len(params['X'][0]))
    assert params['diag'].shape[0] == len(params['X'][0])

    # Integer-valued penalties are written without decimals in the experiment
    # name. The closing parenthesis of np.abs used to sit after the
    # comparison, so the absolute value was taken of a boolean and every
    # negative non-integer penalty was formatted (and truncated) with %d.
    if np.abs(args.penalty - int(args.penalty)) < 0.00001:
        expName = '%sPen%d' % (expName, args.penalty)
    else:
        expName = '%sPen%.1f' % (expName, args.penalty)

    params['runPartStd'] = ['R', 'R']
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]

    params['masterProcess'] = args.runIndex == 0

    modelNames, res = evaluationFramework.runModels(params, expName,
                                                    args.modelToRun,
                                                    runAllExpTadpoleDrc)
def _collectVoxelAssignStats(resList):
    """Return (means, stds) of the correct-voxel-assignment scores in resList."""
    meanValues = [
        res[0]['resComp']['voxelCorrectAssignMean'] for res in resList
    ]
    stdValues = [
        res[0]['resComp']['voxelCorrectAssignStd'] for res in resList
    ]
    return meanValues, stdValues


def launchSynth(runIndex, nrProcesses, modelToRun):
    """Run the synthetic-data experiments and plot the assignment accuracy.

    Up to three experiments are run, selected by ``args.expToRun``
    (0 runs all of them):

    1. vary the distance between the trajectory centres,
    2. vary the number of generated clusters,
    3. vary the number of subjects.

    Each experiment has 8 steps; ``args.stepToRun == 0`` runs all of them,
    any other value runs only step ``args.stepToRun - 1``. For every step the
    data is generated (or loaded) under ``resfiles/synth/<experiment><step>/``,
    the model is fitted through ``evaluationFramework.runModels`` and one
    summary figure per experiment is saved in ``resfiles/synth/``.

    Reads the module-level ``args`` and mutates the module-level ``params``
    and ``plotTrajParams``.

    Args:
        runIndex: index of this process; index 0 is the master process.
        nrProcesses: number of processes, stored in ``params['nrProcesses']``.
        modelToRun: model identifier forwarded to ``runModels``.
    """
    runAllExpFunc = runAllExpSynth

    trajFuncDict = {'lin': linearFunc, 'sig': sigmoidFunc}

    forceRegenerate = False

    ############# define default parameters #################################
    nrSubjDef = 300
    # not used directly, relevant for when I use real data as I can map then
    # to the actual freesurfer vertices
    nrBiomk = 1000
    nrClustToGenDef = 3  # number of clusters to generate data from
    nrClustToFit = args.nrClust
    nrTimepts = 4
    trajFunc = trajFuncDict['sig']

    lowerAgeLim = 40
    upperAgeLim = 80
    dpsLowerLimit = -1
    dpsUpperLimit = 2
    dpsIntervalDef = dpsUpperLimit - dpsLowerLimit
    avgStdScaleFactor = 1

    # The trajectories are sigmoids with params [a,b,c,d]: minimum d,
    # maximum a+d, slope a*b/4 and slope maximum attained at center c
    #   f(s|theta = [a,b,c,d]) = a/(1+exp(-b(s-c)))+d

    slopeLowerLim = -2
    slopeUpperLim = -2
    slopeInterval = slopeUpperLim - slopeLowerLim

    trajMinLowerLim = -5
    trajMinUpperLim = -5
    trajMinInterval = trajMinUpperLim - trajMinLowerLim

    # +/- 10 years shifts on avg, average rate 1+/-0.4
    covSubjShifts = np.array([[0.05, 0], [0, 10]])

    makeThetaIdentifFunc = VoxelDPM.makeThetasIdentif

    ############### set parameters ###########################################
    params['runIndex'] = runIndex
    params['nrProcesses'] = nrProcesses
    params['modelToRun'] = modelToRun
    params['cluster'] = args.cluster
    params['biomkDir'] = DECR
    params['initClustering'] = 'k-means'
    params['rangeFactor'] = float(args.rangeFactor)
    params['pointIndices'] = np.array(range(nrBiomk), int)
    # if true then don't model progression speed, only time shift
    params['fixSpeed'] = False

    plotTrajParams['sortedByPvalInd'] = range(nrBiomk)
    plotTrajParams['pointIndices'] = params['pointIndices']
    plotTrajParams['labels'] = np.zeros(nrBiomk, int)
    plotTrajParams['names'] = ['v']
    params['plotTrajParams'] = plotTrajParams

    ################ set up the checkpoints ##########
    # R - run that part
    # L - load from saved file
    # I - ignore part
    # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
    params['runPartStd'] = ['R', 'R', 'R', 'I', 'I']
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]

    params['masterProcess'] = runIndex == 0

    # assign initClustSubsetInd and nearNeighInitClust (identity maps)
    params['initClustSubsetInd'] = np.array(range(nrBiomk))
    params['nearNeighInitClust'] = np.array(range(nrBiomk))

    if params['masterProcess']:
        # [initClust, pointIndices, modelFit, AIC/BIC, checkers/visual]
        params['runPartStd'] = ['L', 'L', 'L', 'I', 'I']
        # [mainPart, plot, stage]
        params['runPartMain'] = ['R', 'I', 'I']

    params['compareTrueParamsFunc'] = compareWithTrueParams

    # makes changes to params: sets an informative or uninformative prior
    setPrior(params, args.informPrior)

    nrSteps = 8
    if args.stepToRun == 0:
        stepsList = list(range(nrSteps))
    else:
        stepsList = [args.stepToRun - 1]

    plotterObj = PlotterVDPM.PlotterVDPMSynth()

    ###################### vary trajectory centers ###########################

    # copy state of params and plotTrajParams
    paramsLocal = copy.deepcopy(params)
    plotTrajParamsLocal = copy.deepcopy(plotTrajParams)
    paramsLocal['plotTrajParams'] = plotTrajParamsLocal

    resList = []

    if args.expToRun == 1 or args.expToRun == 0:
        # Scale every factor by the default interval. The previous
        # `dpsIntervalDef * [...]` multiplied a list by an int, which repeats
        # the list instead of scaling it, so the raw factors were used as
        # intervals. Data cached from earlier runs has to be regenerated.
        dpsIntervalList = [
            dpsIntervalDef * factor
            for factor in (5, 2, 1.5, 1, 0.7, 0.5, 0.3, 0.1)
        ]

        for i in stepsList:
            np.random.seed(1)
            expFolderShort = 'trajCent%d' % i
            expFolder = 'resfiles/synth/%s' % expFolderShort
            os.makedirs(expFolder, exist_ok=True)
            expNameShort = 'data'
            dataFileName = '%s/%s.npz' % (expFolder, expNameShort)
            paramsLocal['dataset'] = expNameShort
            paramsLocal['datasetFull'] = 'synth%s' % expNameShort

            dpsIntervalCurr = dpsIntervalList[i]

            thetasTrueCurr = generateThetas(nrClustToGenDef, trajMinLowerLim,
                                            trajMinInterval, slopeLowerLim,
                                            slopeInterval, dpsLowerLimit,
                                            dpsIntervalCurr)

            covPerturbedCurr = [
                np.diag([
                    0, thetasTrueCurr[c3, 1]**2 / 15, dpsIntervalDef / 70, 0
                ]) for c3 in range(nrClustToGenDef)
            ]

            # generate perturbed traj from clusters for each biomk
            # generate rand clust with uniform prob each
            clustAssignTrueB, thetasPerturbed = genClustAssThetasPerturb(
                nrBiomk, nrClustToGenDef, thetasTrueCurr, covPerturbedCurr)

            paramsLocal = generateClustData(
                nrSubjDef, nrBiomk, nrClustToGenDef, nrTimepts, trajFunc,
                thetasTrueCurr, thetasPerturbed, clustAssignTrueB,
                lowerAgeLim, upperAgeLim, covSubjShifts, avgStdScaleFactor,
                dataFileName, forceRegenerate, makeThetaIdentifFunc,
                paramsLocal)

            # fit as many clusters as were generated
            paramsLocal['nrClust'] = nrClustToGenDef
            expName = 'synth/%s/init%sCl%dPr%dRa%d' % \
                (expFolderShort, args.initClustering, nrClustToGenDef,
                 args.informPrior, args.rangeFactor)
            plotTrajParamsLocal['expName'] = expName
            paramsLocal['plotTrajParams'] = plotTrajParamsLocal

            modelNames, res = evaluationFramework.runModels(
                paramsLocal, expName, modelToRun, runAllExpFunc)
            resList += [res]

        xLabelStr = 'Distance between trajectories'
        voxelCorrectAssignMeanValues, voxelCorrectAssignStdValues = \
            _collectVoxelAssignStats(resList)

        fig = plotterObj.plotSynthResOneExp(
            voxelCorrectAssignMeanValues, voxelCorrectAssignStdValues,
            [dpsIntervalList[i] for i in stepsList], xLabelStr)
        fig.savefig('resfiles/synth/correctVertices_trajCent.png', dpi=100)

    ###################### vary number of clusters ###########################

    # copy state of params and plotTrajParams
    paramsLocal = copy.deepcopy(params)
    plotTrajParamsLocal = copy.deepcopy(plotTrajParams)
    paramsLocal['plotTrajParams'] = plotTrajParamsLocal

    resList = []

    if args.expToRun == 2 or args.expToRun == 0:
        nrClustToGenList = [2, 3, 5, 10, 15, 20, 50, 100]

        for i in stepsList:
            np.random.seed(1)
            expFolderShort = 'nrClust%d' % i
            expFolder = 'resfiles/synth/%s' % expFolderShort
            os.makedirs(expFolder, exist_ok=True)
            nrClustToGenCurr = nrClustToGenList[i]
            expNameShort = 'data'
            dataFileName = '%s/%s.npz' % (expFolder, expNameShort)
            paramsLocal['dataset'] = expNameShort
            paramsLocal['datasetFull'] = 'synth%s' % expNameShort

            # Pick the first subplot layout with room for all clusters; the
            # appended last index is the fallback when none is large enough.
            potentialRowsIndCurr = [
                j for j in range(len(nrImgMaxList))
                if nrImgMaxList[j] >= nrClustToGenCurr
            ] + [len(nrImgMaxList) - 1]
            print(potentialRowsIndCurr)
            nrRowsCurr, nrColsCurr = rowsColsList[potentialRowsIndCurr[0]]
            plotTrajParamsLocal['nrRows'] = nrRowsCurr
            plotTrajParamsLocal['nrCols'] = nrColsCurr
            print('nrRowsCurr', nrRowsCurr)
            print('nrColsCurr', nrColsCurr)

            plotTrajParamsLocal['clustHuePoints'] = np.linspace(
                0, 1, nrClustToGenCurr, endpoint=False)
            plotTrajParamsLocal['clustCols'] = [
                colorsys.hsv_to_rgb(hue, 1, 1)
                for hue in plotTrajParamsLocal['clustHuePoints']
            ]
            plotTrajParamsLocal['legendColsClust'] = min(
                [nrClustToGenCurr, 4])
            print(plotTrajParamsLocal['clustHuePoints'])

            thetasTrueCurr = generateThetas(nrClustToGenCurr, trajMinLowerLim,
                                            trajMinInterval, slopeLowerLim,
                                            slopeInterval, dpsLowerLimit,
                                            dpsIntervalDef)

            covPerturbedCurr = [
                np.diag([
                    0, thetasTrueCurr[c3, 1]**2 / 15, dpsIntervalDef / 70, 0
                ]) for c3 in range(nrClustToGenCurr)
            ]

            # generate perturbed traj from clusters for each biomk
            # generate rand clust with uniform prob each
            clustAssignTrueB, thetasPerturbed = genClustAssThetasPerturb(
                nrBiomk, nrClustToGenCurr, thetasTrueCurr, covPerturbedCurr)

            paramsLocal = generateClustData(
                nrSubjDef, nrBiomk, nrClustToGenCurr, nrTimepts, trajFunc,
                thetasTrueCurr, thetasPerturbed, clustAssignTrueB,
                lowerAgeLim, upperAgeLim, covSubjShifts, avgStdScaleFactor,
                dataFileName, forceRegenerate, makeThetaIdentifFunc,
                paramsLocal)

            paramsLocal['nrClust'] = nrClustToGenCurr
            expName = 'synth/%s/init%sCl%dPr%dRa%d' % \
                (expFolderShort, args.initClustering, nrClustToGenCurr,
                 args.informPrior, args.rangeFactor)
            plotTrajParamsLocal['expName'] = expName
            paramsLocal['plotTrajParams'] = plotTrajParamsLocal

            modelNames, res = evaluationFramework.runModels(
                paramsLocal, expName, modelToRun, runAllExpFunc)
            resList += [res]

        xLabelStr = 'Number of clusters'
        voxelCorrectAssignMeanValues, voxelCorrectAssignStdValues = \
            _collectVoxelAssignStats(resList)

        fig = plotterObj.plotSynthResOneExp(
            voxelCorrectAssignMeanValues, voxelCorrectAssignStdValues,
            [nrClustToGenList[i] for i in stepsList], xLabelStr,
            makeInts=True)
        fig.savefig('resfiles/synth/correctVertices_nrClust.png', dpi=100)

    ###################### vary number of subjects ###########################

    # copy state of params and plotTrajParams
    paramsLocal = copy.deepcopy(params)
    plotTrajParamsLocal = copy.deepcopy(plotTrajParams)

    resList = []

    if args.expToRun == 3 or args.expToRun == 0:
        nrSubjList = [1000, 500, 250, 100, 75, 50, 35, 20]

        for i in stepsList:
            np.random.seed(1)
            expFolderShort = 'nrSubj%d' % i
            expFolder = 'resfiles/synth/%s' % expFolderShort
            os.makedirs(expFolder, exist_ok=True)
            nrSubjCurr = nrSubjList[i]
            expNameShort = 'data'
            dataFileName = '%s/%s.npz' % (expFolder, expNameShort)
            paramsLocal['dataset'] = expNameShort
            paramsLocal['datasetFull'] = 'synth%s' % expNameShort

            thetasTrueCurr = generateThetas(nrClustToGenDef, trajMinLowerLim,
                                            trajMinInterval, slopeLowerLim,
                                            slopeInterval, dpsLowerLimit,
                                            dpsIntervalDef)

            covPerturbedCurr = [
                np.diag([
                    0, thetasTrueCurr[c3, 1]**2 / 15, dpsIntervalDef / 70, 0
                ]) for c3 in range(nrClustToGenDef)
            ]

            clustAssignTrueB, thetasPerturbed = genClustAssThetasPerturb(
                nrBiomk, nrClustToGenDef, thetasTrueCurr, covPerturbedCurr)

            paramsLocal = generateClustData(
                nrSubjCurr, nrBiomk, nrClustToGenDef, nrTimepts, trajFunc,
                thetasTrueCurr, thetasPerturbed, clustAssignTrueB,
                lowerAgeLim, upperAgeLim, covSubjShifts, avgStdScaleFactor,
                dataFileName, forceRegenerate, makeThetaIdentifFunc,
                paramsLocal)

            paramsLocal['nrClust'] = paramsLocal['trueNrClust']
            # NOTE(review): the name uses args.nrClust although the fit uses
            # trueNrClust; kept as is so existing result paths stay valid.
            expName = 'synth/%s/init%sCl%dPr%dRa%d' % \
                (expFolderShort, args.initClustering, nrClustToFit,
                 args.informPrior, args.rangeFactor)
            # Store the name in the local copy handed to the model, as the
            # other two experiments do (it used to go to the global dict).
            plotTrajParamsLocal['expName'] = expName
            paramsLocal['plotTrajParams'] = plotTrajParamsLocal

            modelNames, res = evaluationFramework.runModels(
                paramsLocal, expName, modelToRun, runAllExpFunc)
            resList += [res]

        xLabelStr = 'Number of Subjects'
        voxelCorrectAssignMeanValues, voxelCorrectAssignStdValues = \
            _collectVoxelAssignStats(resList)

        fig = plotterObj.plotSynthResOneExp(voxelCorrectAssignMeanValues,
                                            voxelCorrectAssignStdValues,
                                            [nrSubjList[i] for i in stepsList],
                                            xLabelStr,
                                            makeInts=True,
                                            adjLeft=0.2)
        fig.savefig('resfiles/synth/correctVertices_nrSubj.png', dpi=100)
def main():
    """Generate a small synthetic dataset and fit the models on it.

    Builds a ``ParHierModel`` with 2 functional units of 3 biomarkers each,
    generates (or loads) the data through ``genSynthData.generateDataJMD`` and
    runs ``evaluationFramework.runModels`` with ``runAllExpSynth``.

    Reads the module-level ``args`` and mutates the module-level
    ``plotTrajParams``.
    """
    nrSubjLong = 100
    nrTimepts = 4

    lowerAgeLim = 60
    upperAgeLim = 80

    shiftsLowerLim = -13
    shiftsUpperLim = 10

    outFolder = 'resfiles/synth/'

    expName = 'synth1'
    fileName = '%s.npz' % expName

    forceRegenerate = False

    params = {}

    nrFuncUnits = 2
    nrBiomkInFuncUnits = 3

    nrBiomk = nrBiomkInFuncUnits * nrFuncUnits
    # cycles through the units, i.e. [0,1,0,1,0,1] for the values above
    mapBiomkToFuncUnits = np.array(
        list(range(nrFuncUnits)) * nrBiomkInFuncUnits)
    print('mapBiomkToFuncUnits', mapBiomkToFuncUnits)

    # params of the dysfunctional trajectories (in the disease specific model)
    dysfuncParams = np.zeros((nrFuncUnits, 4), float)
    dysfuncParams[:, 0] = 1  # ak
    dysfuncParams[:, 1] = 0.3  # bk
    dysfuncParams[:, 2] = [-3, 7]  # ck
    dysfuncParams[:, 3] = 0  # dk

    # params of individual biomarkers
    thetas = np.zeros((nrBiomk, 4), float)
    thetas[:, 0] = 1
    thetas[:, 1] = 10
    thetas[:, 3] = 0
    # spread the third parameter of the biomarkers within each unit evenly
    # over [0.2, 0.9]
    for f in range(nrFuncUnits):
        thetas[mapBiomkToFuncUnits == f, 2] = np.linspace(
            0.2, 0.9, num=nrBiomkInFuncUnits, endpoint=True)

    sigmaB = 0.1 * np.ones(nrBiomk)

    synthModel = ParHierModel.ParHierModel(dysfuncParams, thetas,
                                           mapBiomkToFuncUnits, sigmoidFunc,
                                           sigmaB)

    params = genSynthData.generateDataJMD(nrSubjLong, nrBiomk, nrTimepts,
                                          lowerAgeLim, upperAgeLim,
                                          shiftsLowerLim, shiftsUpperLim,
                                          synthModel, outFolder, fileName,
                                          forceRegenerate, params)

    plotTrajParams['diagNrs'] = np.unique(params['diag'])
    plotTrajParams['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    plotTrajParams['trueParams'] = params['trueParams']
    plotTrajParams['labels'] = ['b%d' % n for n in range(nrBiomk)]
    plotTrajParams['nrRowsFuncUnit'] = 2
    plotTrajParams['nrColsFuncUnit'] = 2
    plotTrajParams['colorsTraj'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]
    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = False

    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penalty'] = args.penalty
    params['nrFuncUnits'] = nrFuncUnits
    params['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits

    if forceRegenerate:
        synthPlotter = Plotter.PlotterJDM(plotTrajParams)
        fig = synthPlotter.plotTrajData(params['longData'],
                                        params['longDiag'],
                                        params['trueParams']['dpsLong'],
                                        synthModel,
                                        replaceFigMode=True)
        fig.savefig('%s/synth1GeneratedData.png' % outFolder)

    # Integer-valued penalties are written without decimals in the experiment
    # name. The closing parenthesis of np.abs used to sit after the
    # comparison, so the absolute value was taken of a boolean and every
    # negative non-integer penalty was formatted (and truncated) with %d.
    if np.abs(args.penalty - int(args.penalty)) < 0.00001:
        expName = '%sPen%d' % (expName, args.penalty)
    else:
        expName = '%sPen%.1f' % (expName, args.penalty)

    params['runPartStd'] = ['L', 'R']
    # [mainPart, plot, stage]
    params['runPartMain'] = ['R', 'R', 'I']

    params['masterProcess'] = args.runIndex == 0

    modelNames, res = evaluationFramework.runModels(params, expName,
                                                    args.modelToRun,
                                                    runAllExpSynth)
def launchTadpole(runIndex, nrProcesses, modelToRun, doProcess=False):
    """Prepare the TADPOLE data, run the model and write the forecast file.

    Steps, in order:
      1. obtain the parsed data, either by re-parsing the TADPOLE spreadsheets
         (``doProcess`` true, which also rewrites the checkpoint) or by loading
         the pickled checkpoint,
      2. drop biomarkers whose std is NaN in any diagnostic group or zero in
         controls, z-score against controls and drop rows with |z| > 50,
      3. flip/rescale biomarkers through makeBiomksDecr and keep a fixed list
         of biomarkers,
      4. fill the module-level ``params`` and ``plotTrajParams`` dictionaries
         and call evaluationFramework.runModels,
      5. build the forecast with makeTadpoleForecast and write it with
         writeTadpoleSubmission.

    Reads the module-level ``args`` (leaderboard, informPrior, initClustering,
    rangeFactor) and expects ``params['nrClust']`` to be set already.

    Parameters:
      runIndex: index of this run; run 0 is flagged as the master process.
      nrProcesses: not used inside this function.
      modelToRun: forwarded to evaluationFramework.runModels and stored in
        params['modelToRun'].
      doProcess: if true, re-parse the raw csv files and overwrite the
        checkpoint; if false (default, the previous hard-coded behaviour),
        load the existing checkpoint.
    """
    global params

    # NOTE: despite the .npz extension these checkpoints are pickle files
    if args.leaderboard == 0:
        outFileCheckpoint2 = 'tadpoleDf2.npz'
    else:
        outFileCheckpoint2 = 'tadpoleDf2Ldb.npz'

    if doProcess:
        if args.leaderboard == 0:
            inputFileData = '../data/ADNI/challenge_training_data/neil_repo/TADPOLE_D1_D2.csv'
            sys.stdout.flush()

            print('loading data file')
            df = pd.read_csv(inputFileData, low_memory=False)
            df = cleanTadpoleData(df)
            data, diag, labels, scanTimepts, partCode, ageAtScan, dataDf, monthsSinceRefTime, \
                examDates, predInd = parseTadpoleData(df)
        else:
            print('loading data file')
            inputFileDataD1D2 = '../data/ADNI/challenge_training_data/neil_repo/TADPOLE_D1_D2.csv'
            df = pd.read_csv(inputFileDataD1D2, low_memory=False)
            df = cleanTadpoleData(df)

            inputFileDataLB = '../data/ADNI/challenge_training_data/neil_repo/evaluation/TADPOLE_LB1_LB2.csv'
            dfLB = pd.read_csv(inputFileDataLB, low_memory=False)

            # this function runs exactly as in the normal submission, no difference here for leaderboard
            data, diag, labels, scanTimepts, partCode, ageAtScan, dataDf, monthsSinceRefTime, \
                examDates, _ = parseTadpoleData(df)

            # keep only the rows flagged as LB1 or LB2 in the leaderboard file
            filterMaskLB12 = np.logical_or(dfLB.LB1 == 1, dfLB.LB2 == 1)
            assert data.shape[0] == dfLB.shape[0]

            data = data[filterMaskLB12, :]
            diag = diag[filterMaskLB12]
            scanTimepts = scanTimepts[filterMaskLB12]
            partCode = partCode[filterMaskLB12]
            ageAtScan = ageAtScan[filterMaskLB12]
            dataDf = dataDf[filterMaskLB12]
            dataDf.reset_index(drop=True, inplace=True)
            monthsSinceRefTime = monthsSinceRefTime[filterMaskLB12]
            examDates = examDates[filterMaskLB12]

            # .values instead of .as_matrix(), which was removed in pandas 1.0
            predInd = dfLB.RID[dfLB.LB2 == 1].values

        dataStruct = dict(data=data, diag=diag, labels=labels, scanTimepts=scanTimepts,
                          partCode=partCode, ageAtScan=ageAtScan, dataDf=dataDf,
                          monthsSinceRefTime=monthsSinceRefTime, examDates=examDates,
                          predInd=predInd)
        with open(outFileCheckpoint2, 'wb') as checkpointFile:
            pickle.dump(dataStruct, checkpointFile, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # pickle must only be used on trusted, locally produced checkpoints
        with open(outFileCheckpoint2, 'rb') as checkpointFile:
            dataStruct = pickle.load(checkpointFile)

        data = dataStruct['data']
        diag = dataStruct['diag']
        labels = dataStruct['labels']
        scanTimepts = dataStruct['scanTimepts']
        partCode = dataStruct['partCode']
        ageAtScan = dataStruct['ageAtScan']
        monthsSinceRefTime = dataStruct['monthsSinceRefTime']
        examDates = dataStruct['examDates']
        predInd = dataStruct['predInd']

    print('compiling parameters')
    sys.stdout.flush()

    print('diag', np.unique(diag), diag)

    # ---- biomarker selection and Z-scores w.r.t. controls ----
    controlInd = diag == CTL
    stdBiomk = np.nanstd(data[controlInd], 0)

    # a NaN std in any diagnostic group excludes the biomarker
    biomkMaskCTL = np.isnan(stdBiomk)
    biomkMaskAD = np.isnan(np.nanstd(data[diag == AD], 0))
    biomkMaskMCI = np.isnan(np.nanstd(data[diag == MCI], 0))
    mask = np.logical_or(np.logical_or(biomkMaskCTL, biomkMaskMCI), biomkMaskAD)

    # a zero std in controls would make the Z-score division blow up
    selectedBiomk = np.logical_not(np.logical_or(mask, stdBiomk == 0))

    print(data.shape)
    data = data[:, selectedBiomk]
    labels = labels[selectedBiomk]
    pointIndices = np.arange(data.shape[1])
    print(data.shape)

    # calculate Z-scores
    meanCTL = np.nanmean(data[controlInd], 0)
    stdCTL = np.nanstd(data[controlInd], 0)
    data = (data - meanCTL[None, :]) / stdCTL[None, :]

    # drop every visit that has at least one extreme Z-score
    outlierRows, _ = np.where(np.abs(data) > 50)
    filterMask = np.ones(data.shape[0], bool)
    filterMask[outlierRows] = 0

    data = data[filterMask]
    diag = diag[filterMask]
    scanTimepts = scanTimepts[filterMask]
    partCode = partCode[filterMask]
    ageAtScan = ageAtScan[filterMask]
    monthsSinceRefTime = monthsSinceRefTime[filterMask]
    examDates = examDates[filterMask]

    # make all biomarkers decreasing by flipping their signs if necessary
    # also perform a t-test to see which ones are most informative, sort them by pvalue (i.e. sortedByPvalInd)
    # the new data is re-scaled
    data, sortedByPvalInd, biomkScaleExtra, pVals = makeBiomksDecr(data, diag, labels)

    # multiply the scaling we did from controls with (-1) if the biomk had the sign flipped
    stdBiomkRescale = biomkScaleExtra * stdCTL

    assert(sortedByPvalInd.shape[0] == data.shape[1])

    sys.stdout.flush()

    params['data'] = data
    params['diag'] = diag
    params['scanTimepts'] = scanTimepts
    params['partCode'] = partCode
    params['ageAtScan'] = ageAtScan
    params['biomkDir'] = DECR
    params['modelToRun'] = modelToRun
    params['datasetFull'] = 'tadpole'
    params['labels'] = labels
    params['predInd'] = predInd
    params['examDates'] = examDates

    print('outFileCheckpoint2', outFileCheckpoint2)
    print('d2Ind', np.unique(predInd), np.unique(predInd).shape)

    # The subject subsampling that used this draw is disabled, but the seed and
    # the draw itself are kept so that the global NumPy RNG is left in exactly
    # the same state as before for everything that runs afterwards.
    unqPartCode = np.unique(params['partCode'])
    nrPartToSample = 100
    np.random.seed(3)
    np.random.choice(unqPartCode, nrPartToSample)

    # ---- keep only a fixed set of biomarkers ----
    keptLabels = (b'FDG', b'AV45', b'CDRSB', b'ADAS13', b'Ventricles',
                  b'Hippocampus', b'WholeBrain', b'Entorhinal', b'MidTemp',
                  b'ABETA_UPENNBIOMK9_04_19_17', b'TAU_UPENNBIOMK9_04_19_17',
                  b'PTAU_UPENNBIOMK9_04_19_17')
    indices = [i for i, label in enumerate(labels) if label in keptLabels]

    print('labels', labels[indices])
    print(np.nanstd(data, axis=0)[indices])

    data = params['data'][:, indices]
    params['data'] = data
    labels = labels[indices]
    params['labels'] = labels
    nrBiomk = params['data'].shape[1]
    print('data.shape', params['data'].shape)
    meanCTL = meanCTL[indices]
    stdBiomkRescale = stdBiomkRescale[indices]
    print(stdBiomkRescale)
    print('flippedBiomk', labels[stdBiomkRescale < 0])
    # double argsort turns the kept entries into ranks 0..len(indices)-1
    sortedByPvalInd = np.argsort(np.argsort(sortedByPvalInd[indices]))

    # no points were removed, so every neighbour / subset mapping is the identity
    plotTrajParams['nearestNeighbours'] = np.arange(nrBiomk)
    params['adjList'] = np.nan
    params['nearNeighInitClust'] = np.arange(nrBiomk)
    params['initClustSubsetInd'] = np.arange(nrBiomk)
    params['meanBiomkRescale'] = meanCTL  # for rescaling back if necessary
    params['stdBiomkRescale'] = stdBiomkRescale
    params['fixSpeed'] = True  # if true then don't model progression speed, only time shift

    sys.stdout.flush()

    assert(params['data'].shape[0] == params['diag'].shape[0] ==
           params['scanTimepts'].shape[0] == params['partCode'].shape[0] ==
           params['ageAtScan'].shape[0])

    # sets an uninformative or informative prior
    priorNr = setPrior(params, args.informPrior, mean_gamma_alpha=1,
                       std_gamma_alpha=0.3, mu_beta=0, std_beta=5)

    suffix = 'Ldb' if args.leaderboard else ''

    expName = 'tadpoleInit%sCl%dPr%dRa%d%s' % (args.initClustering,
        params['nrClust'], priorNr, args.rangeFactor, suffix)
    plotTrajParams['sortedByPvalInd'] = sortedByPvalInd
    plotTrajParams['pointIndices'] = pointIndices
    plotTrajParams['expName'] = expName
    plotTrajParams['ageTransform'] = (0, 1)  # no age normalisation was necessary
    plotTrajParams['datasetFull'] = params['datasetFull']
    plotTrajParams['labels'] = labels

    params['plotTrajParams'] = plotTrajParams

    # ---- run flags for the non-master processes ----
    # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
    params['runPartStd'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    params['runPartCogCorr'] = ['I']
    params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'L']
    params['runPartDirDiag'] = ['R', 'R', 'I']
    params['runPartStaging'] = ['L', 'L', 'I']
    params['runPartDiffDiag'] = ['R', 'R', 'I']
    params['runPartConvPred'] = ['I', 'I', 'I']
    params['runPartCVNonOverlap'] = ['R']
    params['runPartCVNonOverlapMain'] = ['L', 'L', 'I', 'I', 'L']

    params['masterProcess'] = runIndex == 0
    if params['masterProcess']:
        # ---- run flags for the master process (runIndex 0) ----
        # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
        params['runPartStd'] = ['L', 'L', 'I', 'I', 'I']
        params['runPartMain'] = ['R', 'R', 'R']  # [mainPart, plot, stage]
        params['runPartCogCorr'] = ['I']
        params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'I']
        params['runPartDirDiag'] = ['R', 'R', 'I']
        params['runPartStaging'] = ['L', 'L', 'I']
        params['runPartDiffDiag'] = ['R', 'R', 'I']
        params['runPartConvPred'] = ['I', 'I', 'I']
        params['runPartCVNonOverlap'] = ['I']
        params['runPartCVNonOverlapMain'] = ['R', 'R', 'I', 'R', 'R']

    runAllExpFunc = runAllExpTADPOLE
    modelNames, res = evaluationFramework.runModels(params, expName,
        modelToRun, runAllExpFunc)

    # now generate forecast
    print('Generating forecast ... ')
    teamName = 'DIVE6'
    if args.leaderboard:
        outputFile = 'TADPOLE_Submission_Leaderboard_%s.csv' % teamName
        predStartDate = datetime.date(2010, 5, 1)
        nrYearsToPred = 7
    else:
        outputFile = 'TADPOLE_Submission_%s.csv' % teamName
        predStartDate = datetime.date(2018, 1, 1)
        nrYearsToPred = 5
    nrMonthsToPred = 12 * nrYearsToPred

    resCurrModel = res[0]['std']
    predAdasAllSubj, predVentsAllSubj, predDiagAllSubj = makeTadpoleForecast(
        predStartDate, nrYearsToPred, nrMonthsToPred, resCurrModel, params)

    # write forecast to file
    writeTadpoleSubmission(predAdasAllSubj, predVentsAllSubj, predDiagAllSubj,
                           outputFile, nrMonthsToPred, predStartDate, params)