Exemple #1
0
def printBICresults(params, expNameBefCl, expNameAfterCl, modelToRun,
                    nrClustList, runAllExpFunc):
    """Run the model for each number of clusters and plot the BIC/AIC curves.

    For every entry of ``nrClustList`` the experiment named
    ``<expNameBefCl>Cl<nrClust><expNameAfterCl>`` is run through
    ``evaluationFramework.runModels`` and the BIC/AIC values found in
    ``res[0]['std']`` are collected (entries without a result stay NaN and are
    ignored afterwards). The collected values are pickled to
    ``resfiles/BICres_<expNameBefCl><expNameAfterCl>.npz`` and a plot of both
    criteria, with their minima marked, is saved with the same base name and a
    ``.png`` extension.

    ``params`` is modified in place: ``params['plotTrajParams']['expName']``,
    ``params['nrClust']``, ``params['runPartStd']`` and
    ``params['runPartMain']`` are overwritten on every iteration.

    Args:
        params: experiment parameter dict; must contain a 'plotTrajParams' dict.
        expNameBefCl: part of the experiment name before the ``Cl<n>`` token.
        expNameAfterCl: part of the experiment name after the ``Cl<n>`` token.
        modelToRun: forwarded unchanged to ``evaluationFramework.runModels``.
        nrClustList: sequence of cluster counts to evaluate.
        runAllExpFunc: forwarded unchanged to ``evaluationFramework.runModels``.

    Returns:
        The matplotlib figure containing the BIC/AIC plot.

    Raises:
        ValueError: if ``runPart`` is neither 'R' nor 'L', or if no BIC value
            was found for any of the requested cluster counts.
    """
    fileName = 'resfiles/BICres_%s%s' % (expNameBefCl, expNameAfterCl)
    # NOTE: despite the .npz extension this file is written/read with pickle.
    bicAllFileName = '%s.npz' % fileName
    figBicFileName = '%s.png' % fileName

    # 'R' = run every experiment and save the criteria, 'L' = load saved ones
    runPart = 'R'
    if runPart == 'R':

        bic = np.full(len(nrClustList), np.nan)
        aic = np.full(len(nrClustList), np.nan)

        # go through every number of clusters requested for this experiment
        for nrClustIndex, nrClustCurr in enumerate(nrClustList):
            expName = '%sCl%d%s' % (expNameBefCl, nrClustCurr, expNameAfterCl)
            params['plotTrajParams']['expName'] = expName

            params['nrClust'] = nrClustCurr
            # [initClust, modelFit, aic/bic, plotBlender, sampleTraj]
            # NOTE(review): 'Non-enforcing' is not one of the 'R'/'L'/'I' codes
            # used for the other run-part lists in this file -- confirm that
            # runModels accepts it.
            params['runPartStd'] = [
                'Non-enforcing', 'Non-enforcing', 'Non-enforcing', 'I', 'I'
            ]
            params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]

            _, res = evaluationFramework.runModels(params, expName,
                                                   modelToRun, runAllExpFunc)

            # an empty/falsy 'std' entry means no result for this cluster count
            if res[0]['std']:
                print('bic', res[0]['std']['bic'])
                print('aic', res[0]['std']['aic'])
                bic[nrClustIndex] = res[0]['std']['bic']
                aic[nrClustIndex] = res[0]['std']['aic']

            # drop the results before the next run to keep peak memory down
            res = None
            gc.collect()
            print('garbage collector called')
            sys.stdout.flush()

        dataStruct = dict(aic=aic, bic=bic, nrClustList=nrClustList)
        # context manager so the handle is flushed and closed deterministically
        with open(bicAllFileName, 'wb') as outFile:
            pickle.dump(dataStruct, outFile, pickle.HIGHEST_PROTOCOL)

    elif runPart == 'L':
        # SECURITY: pickle.load executes arbitrary code; only load files that
        # were written by this function.
        with open(bicAllFileName, 'rb') as inFile:
            dataStruct = pickle.load(inFile)
        aic = dataStruct['aic']
        bic = dataStruct['bic']
    else:
        raise ValueError('need to either load file or run the experiment')

    # keep only the cluster counts for which a BIC value is available
    foundInd = np.logical_not(np.isnan(bic))
    if not foundInd.any():
        # np.argmin below would fail on an empty array with a cryptic message
        raise ValueError(
            'no BIC value was found for any of the requested cluster numbers')

    bicFound = bic[foundInd]
    aicFound = aic[foundInd]
    nrClustListFound = np.array(nrClustList)[foundInd]

    minBICInd = np.argmin(bicFound)
    minBIC = bicFound[minBICInd]
    nrClustMinBIC = nrClustListFound[minBICInd]

    minAICInd = np.argmin(aicFound)
    minAIC = aicFound[minAICInd]
    nrClustMinAIC = nrClustListFound[minAICInd]

    print('bicFound, nrClustListBicFound', bicFound, nrClustListFound)
    print('minBIC, nrClustMinBIC', minBIC, nrClustMinBIC)

    print('aicFound, nrClustListAicFound', aicFound, nrClustListFound)
    print('minAIC, nrClustMinAIC', minAIC, nrClustMinAIC)

    fig = pl.figure()
    colors = ['r', 'g']  # [BIC, AIC]

    pl.plot(nrClustListFound, bicFound, label='BIC', color=colors[0])
    pl.plot(nrClustListFound, aicFound, label='AIC', color=colors[1])

    # mark the minimum of each criterion with a dot in the matching colour
    size = 50
    pl.scatter([nrClustMinBIC, nrClustMinAIC], [minBIC, minAIC],
               color=colors,
               s=size)

    fontsize = 14

    pl.legend(fontsize=fontsize)
    pl.xlabel('Number of clusters', fontsize=fontsize)
    pl.ylabel('Criterion Value', fontsize=fontsize)
    nrClustToPlot = 8
    pl.xlim([1, 2 + nrClustToPlot])

    # y-limits are derived from the first nrClustToPlot found values only,
    # padded by a sixth of their range on each side
    allPlottedBicAicValues = np.concatenate(
        (bicFound[:nrClustToPlot], aicFound[:nrClustToPlot]))
    yMax = np.max(allPlottedBicAicValues, axis=0)
    yMin = np.min(allPlottedBicAicValues, axis=0)
    yDelta = (yMax - yMin) / 6

    pl.ylim([yMin - yDelta, yMax + yDelta])
    pl.xticks(range(2, 3 + nrClustToPlot), fontsize=fontsize)
    pl.yticks(fontsize=fontsize)

    fig.savefig(figBicFileName, dpi=100)

    return fig
Exemple #2
0
def main():
    """Generate two synthetic diseases and run the models on each of them.

    Builds a synthetic setup with 2 functional units of 3 biomarkers each,
    generates data for "disease one" and "disease two" via
    ``genSynthData.generateDataJMD`` and runs
    ``evaluationFramework.runModels`` first on disease one (model
    ``args.modelToRun``) and then on disease two (model 16), which is pointed
    at the unit models saved by the disease-one experiment.

    Reads the module-level ``args`` (runIndex, nrProc, cluster, penalty,
    modelToRun) and writes into the module-level ``plotTrajParams`` dict.
    """
    nrSubjLong = 100
    nrTimepts = 4

    lowerAgeLim = 60
    upperAgeLim = 80

    shiftsLowerLim = -13
    shiftsUpperLim = 10

    outFolder = 'resfiles/synth/'

    expName = 'synth1'
    fileName = '%s.npz' % expName

    forceRegenerate = False

    params = {}

    nrFuncUnits = 2
    nrBiomkInFuncUnits = 3

    nrBiomk = nrBiomkInFuncUnits * nrFuncUnits
    # biomarkers are assigned to functional units round-robin; with the values
    # above this gives [0, 1, 0, 1, 0, 1]
    mapBiomkToFuncUnits = np.array(
        list(range(nrFuncUnits)) * nrBiomkInFuncUnits)
    print('mapBiomkToFuncUnits', mapBiomkToFuncUnits)

    plotTrajParams['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    plotTrajParams['labels'] = ['b%d' % n for n in range(nrBiomk)]
    plotTrajParams['nrRowsFuncUnit'] = 2
    plotTrajParams['nrColsFuncUnit'] = 3
    # one evenly spaced hue per biomarker
    plotTrajParams['colorsTraj'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]

    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = False

    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penalty'] = args.penalty
    params['nrFuncUnits'] = nrFuncUnits
    params['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits

    ##### disease agnostic parameters ###########
    # params of individual biomarkers, one row of 4 values per biomarker
    thetas = np.zeros((nrBiomk, 4), float)
    thetas[:, 0] = 1
    thetas[:, 1] = 10
    thetas[:, 3] = 0
    # within each functional unit spread column 2 evenly over [0.2, 0.9]
    for f in range(nrFuncUnits):
        thetas[mapBiomkToFuncUnits == f,
               2] = np.linspace(0.2,
                                0.9,
                                num=nrBiomkInFuncUnits,
                                endpoint=True)

    sigmaB = 0.1 * np.ones(nrBiomk)

    ##### disease specific parameters ###########
    # params of the dysfunctional trajectories - disease 1
    dysfuncParamsDisOne = np.zeros((nrFuncUnits, 4), float)
    dysfuncParamsDisOne[:, 0] = 1  # ak
    dysfuncParamsDisOne[:, 1] = 0.3  # bk
    dysfuncParamsDisOne[:, 2] = [-3, 7]  # ck
    dysfuncParamsDisOne[:, 3] = 0  # dk

    synthModelDisOne = ParHierModel.ParHierModel(dysfuncParamsDisOne, thetas,
                                                 mapBiomkToFuncUnits,
                                                 sigmoidFunc, sigmaB)

    paramsDisOne = genSynthData.generateDataJMD(nrSubjLong, nrBiomk, nrTimepts,
                                                lowerAgeLim, upperAgeLim,
                                                shiftsLowerLim, shiftsUpperLim,
                                                synthModelDisOne, outFolder,
                                                fileName, forceRegenerate,
                                                params)

    paramsDisOne['plotTrajParams']['diagNrs'] = np.unique(paramsDisOne['diag'])
    paramsDisOne['plotTrajParams']['trueParams'] = paramsDisOne['trueParams']

    if forceRegenerate:
        synthPlotter = Plotter.PlotterJDM(paramsDisOne['plotTrajParams'])
        fig = synthPlotter.plotTrajData(paramsDisOne['longData'],
                                        paramsDisOne['longDiag'],
                                        paramsDisOne['trueParams']['dpsLong'],
                                        synthModelDisOne,
                                        replaceFigMode=True)
        fig.savefig('%s/synth1Dis1GenData.png' % outFolder)

    # params of the dysfunctional trajectories - disease 2
    dysfuncParamsDisTwo = copy.deepcopy(dysfuncParamsDisOne)
    dysfuncParamsDisTwo[:, 1] = 1
    dysfuncParamsDisTwo[:, 2] = [8, -4]

    synthModelDisTwo = ParHierModel.ParHierModel(dysfuncParamsDisTwo, thetas,
                                                 mapBiomkToFuncUnits,
                                                 sigmoidFunc, sigmaB)

    paramsDisTwo = copy.deepcopy(paramsDisOne)

    # NOTE(review): disease two is generated with the same outFolder/fileName
    # as disease one; if generateDataJMD reloads that file when
    # forceRegenerate is False, both diseases would share data -- confirm.
    paramsDisTwo = genSynthData.generateDataJMD(nrSubjLong, nrBiomk, nrTimepts,
                                                lowerAgeLim, upperAgeLim,
                                                shiftsLowerLim, shiftsUpperLim,
                                                synthModelDisTwo, outFolder,
                                                fileName, forceRegenerate,
                                                paramsDisTwo)

    # for disease two, only keep the second biomarker in each functional unit
    indBiomkInDiseaseTwo = np.array(range(nrFuncUnits, (2 * nrFuncUnits)))
    print('indBiomkInDiseaseTwo', indBiomkInDiseaseTwo)
    # keep the full data under *true, then restrict X/Y to the kept biomarkers
    paramsDisTwo['Xtrue'] = paramsDisTwo['X']
    paramsDisTwo['Ytrue'] = paramsDisTwo['Y']
    paramsDisTwo['X'] = [paramsDisTwo['X'][b] for b in indBiomkInDiseaseTwo]
    paramsDisTwo['Y'] = [paramsDisTwo['Y'][b] for b in indBiomkInDiseaseTwo]
    paramsDisTwo['mapBiomkToFuncUnits'] = np.array(
        [mapBiomkToFuncUnits[b] for b in indBiomkInDiseaseTwo])

    # for disease two, also build X and Y with an entry for every biomarker:
    # kept biomarkers carry the true data, the others an empty array for each
    # subject
    XemptyListsAllBiomk = []
    YemptyListsAllBiomk = []
    for b in range(nrBiomk):
        if b in indBiomkInDiseaseTwo:
            xSubj = [paramsDisTwo['Xtrue'][b][s] for s in range(nrSubjLong)]
            ySubj = [paramsDisTwo['Ytrue'][b][s] for s in range(nrSubjLong)]
        else:
            xSubj = [np.array([]) for _ in range(nrSubjLong)]
            ySubj = [np.array([]) for _ in range(nrSubjLong)]
        XemptyListsAllBiomk.append(xSubj)
        YemptyListsAllBiomk.append(ySubj)

    paramsDisTwo['XemptyListsAllBiomk'] = XemptyListsAllBiomk
    paramsDisTwo['YemptyListsAllBiomk'] = YemptyListsAllBiomk

    paramsDisTwo['plotTrajParams']['diagNrs'] = np.unique(paramsDisTwo['diag'])
    paramsDisTwo['plotTrajParams']['trueParams'] = paramsDisTwo['trueParams']
    # restrict the true trajectories to the columns of the kept biomarkers
    paramsDisTwo['plotTrajParams']['trueParams']['trueTrajPredXB'] = \
      paramsDisTwo['plotTrajParams']['trueParams']['trueTrajPredXB'][:,indBiomkInDiseaseTwo]

    # each kept label is wrapped in its own single-element list
    paramsDisTwo['plotTrajParams']['labels'] = \
      [[paramsDisTwo['plotTrajParams']['labels'][b]] for b in indBiomkInDiseaseTwo]

    if forceRegenerate:
        synthPlotter = Plotter.PlotterJDM(paramsDisTwo['plotTrajParams'])
        fig = synthPlotter.plotTrajData(paramsDisTwo['longData'],
                                        paramsDisTwo['longDiag'],
                                        paramsDisTwo['trueParams']['dpsLong'],
                                        synthModelDisTwo,
                                        replaceFigMode=True)
        fig.savefig('%s/synth1Dis2GenData.png' % outFolder)

    # integer penalties are written without decimals, others with one decimal.
    # The absolute value must wrap the difference, not the comparison;
    # otherwise any penalty below its truncated integer (e.g. -1.5) is
    # wrongly treated as an integer.
    if np.abs(args.penalty - int(args.penalty)) < 0.00001:
        expName = '%sPen%d' % (expName, args.penalty)
    else:
        expName = '%sPen%.1f' % (expName, args.penalty)

    paramsDisOne['runPartStd'] = ['L', 'L']
    paramsDisOne['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    paramsDisOne['masterProcess'] = args.runIndex == 0

    expNameDisOne = '%sDisOne' % expName
    modelNames, res = evaluationFramework.runModels(paramsDisOne,
                                                    expNameDisOne,
                                                    args.modelToRun,
                                                    runAllExpSynth)

    # disease two is pointed at the unit models file of the disease-one run
    paramsDisTwo['filePathUnitModels'] = '%s/%s_JMD/unitModels.npz' % (
        outFolder, expNameDisOne)

    paramsDisTwo['runPartStd'] = ['R', 'R']
    paramsDisTwo['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    paramsDisTwo['masterProcess'] = args.runIndex == 0

    modelDisTwo = 16
    expNameDisTwo = '%sDisTwo' % expName
    modelNames, res = evaluationFramework.runModels(paramsDisTwo,
                                                    expNameDisTwo, modelDisTwo,
                                                    runAllExpSynth)
Exemple #3
0
def main():
    """Generate two synthetic diseases, merge them and run the models once.

    Builds a setup with 2 functional units of 3 biomarkers each, generates a
    dataset for disease one (100 subjects, diagnoses CTL/AD) and for disease
    two (50 subjects, diagnoses CTL2/PCA) via
    ``genSynthData.generateDataJMD``, blanks out all but two biomarkers for
    the disease-two subjects, appends them to the disease-one dataset and
    runs ``evaluationFramework.runModels`` on the merged parameters.

    Reads the module-level ``args`` (expName, regData, runIndex, nrProc,
    cluster, penalty, runPartStd, modelToRun) and writes into the
    module-level ``plotTrajParams`` dict.

    Raises:
        ValueError: if ``args.expName`` is neither 'synth1' nor 'synth2'.
    """

    nrSubjLong = 100
    nrTimepts = 4

    # NOTE(review): the two age limits are not used anywhere below.
    lowerAgeLim = 60
    upperAgeLim = 80

    shiftsLowerLim = -13
    shiftsUpperLim = 10

    outFolder = 'resfiles/synth/'

    expName = args.expName
    fileName = '%s.npz' % expName

    regenerateData = args.regData

    params = {}

    nrFuncUnits = 2
    nrBiomkInFuncUnits = 3
    nrDis = 2

    nrBiomk = nrBiomkInFuncUnits * nrFuncUnits
    mapBiomkToFuncUnits = np.array(
        list(range(nrFuncUnits)) * nrBiomkInFuncUnits)
    # round-robin assignment; with the values above this is [0, 1, 0, 1, 0, 1]
    print('mapBiomkToFuncUnits', mapBiomkToFuncUnits)

    # one index array per functional unit, plus one extra trailing slot
    biomkInFuncUnit = [0 for u in range(nrFuncUnits + 1)]
    for u in range(nrFuncUnits):
        biomkInFuncUnit[u] = np.where(mapBiomkToFuncUnits == u)[0]

    biomkInFuncUnit[nrFuncUnits] = np.array(
        [])  # need to leave this as empty list

    plotTrajParams['biomkInFuncUnit'] = biomkInFuncUnit
    plotTrajParams['labels'] = ['biomarker %d' % n for n in range(nrBiomk)]
    plotTrajParams['nrRowsFuncUnit'] = 3
    plotTrajParams['nrColsFuncUnit'] = 4
    # evenly spaced hues: one colour per biomarker and one per functional unit
    plotTrajParams['colorsTrajBiomkB'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]
    plotTrajParams['colorsTrajUnitsU'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrFuncUnits, endpoint=False)
    ]

    # plotTrajParams['yNormMode'] = 'zScoreTraj'
    # plotTrajParams['yNormMode'] = 'zScoreEarlyStageTraj'
    plotTrajParams['yNormMode'] = 'unscaled'

    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = True

    params['unitNames'] = ['Unit%d' % f for f in range(nrFuncUnits)]

    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penalty'] = args.penalty
    params['penaltyUnits'] = 20
    params['penaltyDis'] = 1
    params['nrFuncUnits'] = nrFuncUnits
    params['nrFuncUnitsImgOnly'] = nrFuncUnits
    params['biomkInFuncUnit'] = biomkInFuncUnit
    params['nrBiomkDisModel'] = nrFuncUnits
    params['nrExtraBiomk'] = 0

    params[
        'nrGlobIterUnit'] = 10  # these parameters are specific for the Joint Model of Disease (JMD)
    params['iterParamsUnit'] = 50
    params['nrGlobIterDis'] = 10
    params['iterParamsDis'] = 50

    # # params['unitModelObjList'] = MarcoModel.GP_progression_model
    # params['unitModelObjList'] = SigmoidModel.SigmoidModel
    # params['disModelObj'] = SigmoidModel.SigmoidModel

    # by default we have no priors
    params['priors'] = None

    ####### set priors for specific models #########

    # params['priors'] = dict(prior_length_scale_mean_ratio=0.33, # mean_length_scale = (self.maxX-self.minX)/3
    #     prior_length_scale_std=1e-4, prior_sigma_mean=2,prior_sigma_std = 1e-3,
    #     prior_eps_mean = 1, prior_eps_std = 1e-2)
    # params['priors'] = dict(prior_length_scale_mean_ratio=0.9,  # mean_length_scale = (self.maxX-self.minX)/3
    #                             prior_length_scale_std=1e-4, prior_sigma_mean=3, prior_sigma_std=1e-3,
    #                             prior_eps_mean=0.1, prior_eps_std=1e-6)

    # one identical prior dict per functional unit
    params['priorsUnitModelsMarcoModel'] = [
        dict(
            prior_length_scale_mean_ratio=
            0.05,  # mean_length_scale = (self.maxX-self.minX)/3
            prior_length_scale_std=1e-6,
            prior_sigma_mean=0.5,
            prior_sigma_std=1e-3,
            prior_eps_mean=0.1,
            prior_eps_std=1e-6) for u in range(nrFuncUnits)
    ]

    transitionTimePriorMean = 1  # in DPS 0-1 space, prior mean
    transitionTimePriorMin = 0.1
    transitionTimePriorMax = 10

    bPriorShape, bPriorRate = getGammShapeRateFromTranTime(
        transitionTimePriorMean, transitionTimePriorMin,
        transitionTimePriorMax)

    # one identical prior dict per disease
    params['priorsDisModels'] = [
        dict(meanA=1,
             stdA=1e-5,
             meanD=0,
             stdD=1e-5,
             shapeB=bPriorShape,
             rateB=bPriorRate,
             timeShiftStd=15) for d in range(nrDis)
    ]
    params['priorsUnitModels'] = [None for d in range(nrDis)]

    ##### disease agnostic parameters ###########
    # params of individual biomarkers, one row of 4 values per biomarker
    thetas = np.zeros((nrBiomk, 4), float)
    thetas[:, 0] = 1
    thetas[:, 3] = 0
    # within each functional unit spread column 2 evenly over [0.2, 0.9]
    for f in range(nrFuncUnits):
        thetas[mapBiomkToFuncUnits == f,
               2] = np.linspace(0.2,
                                0.9,
                                num=nrBiomkInFuncUnits,
                                endpoint=True)

    # set first functional unit to have traj with lower slopes
    thetas[mapBiomkToFuncUnits == 0, 1] = 5
    thetas[mapBiomkToFuncUnits == 1, 1] = 10
    # thetas[mapBiomkToFuncUnits == 2, 1] = 7

    # sigmaB depends on the experiment: 0.05 for synth1, 0.01 for synth2
    if args.expName == 'synth1':
        sigmaB = 0.05 * np.ones(nrBiomk)
    elif args.expName == 'synth2':
        sigmaB = 0.01 * np.ones(nrBiomk)
    else:
        raise ValueError('expName should be synth1 or synth2')

    # scale every biomarker with mean and std.
    # presumably row 0 is the mean and row 1 the std -- TODO confirm against
    # generateDataJMD; as set here row 0 stays 0 and row 1 is 1
    scalingBiomk2B = np.zeros((2, nrBiomk))
    # scalingBiomk2B[:, 0] = [200, 100] # mean +/- std
    # scalingBiomk2B[:, 0] = [200, 100]  # mean +/- std
    #
    # scalingBiomk2B[:, 1] = [-20, 3]  # mean +/- std
    # scalingBiomk2B[:, 1] = [-20, 3]  # mean +/- std
    #
    # scalingBiomk2B[:, 2:4] = scalingBiomk2B[:, 0:2]
    # scalingBiomk2B[:, 4:6] = scalingBiomk2B[:, 0:2]

    scalingBiomk2B[1, :] = 1

    ##### disease 1 - disease specific parameters ###########

    # params of the dysfunctional trajectories
    dysfuncParamsDisOne = np.zeros((nrFuncUnits, 4), float)
    dysfuncParamsDisOne[:, 0] = 1  # ak
    dysfuncParamsDisOne[:, 1] = [0.3, 0.2]  # bk
    dysfuncParamsDisOne[:, 2] = [-4, 6]  # ck
    dysfuncParamsDisOne[:, 3] = 0  # dk

    synthModelDisOne = ParHierModel.ParHierModel(dysfuncParamsDisOne, thetas,
                                                 mapBiomkToFuncUnits,
                                                 sigmoidFunc, sigmaB)

    paramsDisOne = copy.deepcopy(params)

    paramsDisOne = genSynthData.generateDataJMD(nrSubjLong,
                                                nrBiomk,
                                                nrTimepts,
                                                shiftsLowerLim,
                                                shiftsUpperLim,
                                                synthModelDisOne,
                                                outFolder,
                                                fileName,
                                                regenerateData,
                                                paramsDisOne,
                                                scalingBiomk2B,
                                                ctlDiagNr=CTL,
                                                patDiagNr=AD)

    # paramsDisOne['plotTrajParams']['trueParams'] = paramsDisOne['trueParams']

    replaceFigMode = True

    # the generated data is only plotted when it was regenerated in this run
    if regenerateData:
        synthPlotter = Plotter.PlotterJDM(paramsDisOne['plotTrajParams'])
        fig = synthPlotter.plotTrajDataMarcoFormat(
            paramsDisOne['X'],
            paramsDisOne['Y'],
            paramsDisOne['diag'],
            synthModelDisOne,
            paramsDisOne['trueParamsDis'],
            replaceFigMode=replaceFigMode)
        fig.savefig('%s/%sDis1GenData.png' % (outFolder, expName))

    ##### disease 2 - disease specific parameters ###########

    # params of the dysfunctional trajectories: same bk as disease one, with
    # the two ck values changed to [6, -4]
    dysfuncParamsDisTwo = copy.deepcopy(dysfuncParamsDisOne)
    dysfuncParamsDisTwo[:, 1] = [0.3, 0.2]  # bk
    dysfuncParamsDisTwo[:, 2] = [6, -4]

    synthModelDisTwo = ParHierModel.ParHierModel(dysfuncParamsDisTwo, thetas,
                                                 mapBiomkToFuncUnits,
                                                 sigmoidFunc, sigmaB)

    paramsDisTwo = copy.deepcopy(paramsDisOne)
    nrSubjLongDisTwo = 50
    nrTimeptsDisTwo = 4

    # NOTE(review): disease two is generated with the same outFolder/fileName
    # as disease one; if generateDataJMD reloads that file when
    # regenerateData is False, both diseases would share data -- confirm.
    paramsDisTwo = genSynthData.generateDataJMD(nrSubjLongDisTwo,
                                                nrBiomk,
                                                nrTimeptsDisTwo,
                                                shiftsLowerLim,
                                                shiftsUpperLim,
                                                synthModelDisTwo,
                                                outFolder,
                                                fileName,
                                                regenerateData,
                                                paramsDisTwo,
                                                scalingBiomk2B,
                                                ctlDiagNr=CTL2,
                                                patDiagNr=PCA)

    # for disease two, only keep the second biomarker in each functional unit
    # (indices nrFuncUnits .. 2*nrFuncUnits-1, i.e. [2, 3] here)
    indBiomkInDiseaseTwo = np.array(range(nrFuncUnits, (2 * nrFuncUnits)))
    print('indBiomkInDiseaseTwo', indBiomkInDiseaseTwo)
    paramsDisTwo['Xtrue'] = paramsDisTwo['X']
    paramsDisTwo['Ytrue'] = paramsDisTwo['Y']

    # for disease two, change the format of the X and Y arrays, add the missing biomarkers with empty lists
    # (kept biomarkers carry the true data, all others an empty array for each subject)
    XemptyListsAllBiomk = [0 for _ in range(nrBiomk)]
    YemptyListsAllBiomk = [0 for _ in range(nrBiomk)]
    visitIndicesDisTwoMissing = [0 for _ in range(nrBiomk)]
    for b in range(nrBiomk):
        XemptyListsAllBiomk[b] = [0 for _ in range(nrSubjLongDisTwo)]
        YemptyListsAllBiomk[b] = [0 for _ in range(nrSubjLongDisTwo)]
        visitIndicesDisTwoMissing[b] = [0 for _ in range(nrSubjLongDisTwo)]

        for s in range(nrSubjLongDisTwo):
            if b in indBiomkInDiseaseTwo:
                XemptyListsAllBiomk[b][s] = paramsDisTwo['Xtrue'][b][s]
                YemptyListsAllBiomk[b][s] = paramsDisTwo['Ytrue'][b][s]
                visitIndicesDisTwoMissing[b][s] = paramsDisTwo['visitIndices'][
                    b][s]
            else:
                XemptyListsAllBiomk[b][s] = np.array([])
                YemptyListsAllBiomk[b][s] = np.array([])
                visitIndicesDisTwoMissing[b][s] = np.array([])

    paramsDisTwo['XemptyListsAllBiomk'] = XemptyListsAllBiomk
    paramsDisTwo['YemptyListsAllBiomk'] = YemptyListsAllBiomk
    paramsDisTwo['visitIndicesMissing'] = visitIndicesDisTwoMissing

    # plot disease two twice: with the full data and with the missing entries
    if regenerateData:
        synthPlotter = Plotter.PlotterJDM(paramsDisTwo['plotTrajParams'])
        fig = synthPlotter.plotTrajDataMarcoFormat(
            paramsDisTwo['Xtrue'],
            paramsDisTwo['Ytrue'],
            paramsDisTwo['diag'],
            synthModelDisTwo,
            paramsDisTwo['trueParamsDis'],
            replaceFigMode=replaceFigMode)
        fig.savefig('%s/%sDis2GenDataFull.png' % (outFolder, expName))

        synthPlotter = Plotter.PlotterJDM(paramsDisTwo['plotTrajParams'])
        fig = synthPlotter.plotTrajDataMarcoFormat(
            paramsDisTwo['XemptyListsAllBiomk'],
            paramsDisTwo['YemptyListsAllBiomk'],
            paramsDisTwo['diag'],
            synthModelDisTwo,
            paramsDisTwo['trueParamsDis'],
            replaceFigMode=replaceFigMode)
        fig.savefig('%s/%sDis2GenDataMissing.png' % (outFolder, expName))

    ############### now merge the two datasets ############

    # add the biomarkers from the second dataset, same format as dataset 1
    # but with missing entries
    # NOTE: params is an alias of paramsDisOne from here on, so the in-place
    # += below also extends the per-biomarker entries held by paramsDisOne
    params = paramsDisOne
    for b in range(nrBiomk):
        params['X'][b] += paramsDisTwo['XemptyListsAllBiomk'][b]
        params['Y'][b] += paramsDisTwo['YemptyListsAllBiomk'][b]
        params['visitIndices'][b] += paramsDisTwo['visitIndicesMissing'][b]

    # print('visitIndicesDisTwoMissing', visitIndicesDisTwoMissing)
    # print(adssa)

    # disease-two RIDs are offset by nrSubjLong before being appended
    params['RID'] = np.concatenate(
        (params['RID'], nrSubjLong + paramsDisTwo['RID']),
        axis=0)  # RIDs must be different

    # this is the full vector of diagnoses for all diseases
    params['diag'] = np.concatenate(
        (paramsDisOne['diag'], paramsDisTwo['diag']), axis=0)
    params['plotTrajParams']['diag'] = params['diag']

    # true disease parameters become a two-element list: [disease one, disease two]
    params['trueParamsDis'] = [
        params['trueParamsDis'], paramsDisTwo['trueParamsDis']
    ]

    # append the disease-two subject shifts to those of disease one, per unit
    for f in range(nrFuncUnits):
        params['trueParamsFuncUnits'][f]['subShiftsS'] = np.concatenate(
            (params['trueParamsFuncUnits'][f]['subShiftsS'],
             paramsDisTwo['trueParamsFuncUnits'][f]['subShiftsS']),
            axis=0)

    # map which diagnoses belong to which disease
    # first disease has CTL+AD, second disease has CTL2+PCA
    params['diagsSetInDis'] = [np.array([CTL, AD]), np.array([CTL2, PCA])]
    params['disLabels'] = ['Dis0', 'Dis1']
    params['otherBiomkPerDisease'] = [[], []]

    # one boolean mask per disease, True where the diagnosis is in that
    # disease's diagnosis set
    params['binMaskSubjForEachDisD'] = [
        np.in1d(params['diag'], params['diagsSetInDis'][disNr])
        for disNr in range(nrDis)
    ]

    # sanity checks on the merged sizes
    assert params['diag'].shape[0] == len(params['X'][0])
    assert np.sum(params['binMaskSubjForEachDisD'][0]) == len(
        params['trueParamsDis'][0]['subShiftsS'])
    assert params['diag'].shape[0] == len(
        params['trueParamsFuncUnits'][0]['subShiftsS'])

    # if np.abs(args.penalty - int(args.penalty) < 0.00001):
    #   expName = '%sPen%d' % (expName, args.penalty)
    # else:
    #   expName = '%sPen%.1f' % (expName, args.penalty)

    params['runPartStd'] = args.runPartStd
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    params['masterProcess'] = args.runIndex == 0

    # NOTE(review): expNameDisOne is not used; runModels receives expName
    expNameDisOne = '%s' % expName
    modelNames, res = evaluationFramework.runModels(params, expName,
                                                    args.modelToRun,
                                                    runAllExpSynth)
Exemple #4
0
def launchADNIthick(runIndex, nrProcesses, modelToRun):
    """Prepare the ADNI cortical-thickness data and run the requested model.

    Loads the pickled info file and the data file for the
    'cortThickADNI3Scans' dataset, drops measurement points with too many
    zero values, z-scores data and age with respect to the baseline scans of
    controls, fills the module-level ``params`` and ``plotTrajParams`` dicts
    and calls ``evaluationFramework.runModels``. On the master process
    (``runIndex == 0``) the results are additionally printed with
    ``printResADNIthick``.

    Reads the module-level ``args`` (cluster, informPrior, fwhmLevel,
    initClustering, rangeFactor, alphaMRF) and expects ``params['nrClust']``
    and ``plotTrajParams['freesurfPath']`` to be set before the call.

    Args:
        runIndex: index of this process; 0 marks the master process.
        nrProcesses: not used in the visible body of this function.
        modelToRun: stored in ``params['modelToRun']`` and forwarded to
            ``evaluationFramework.runModels``.

    Raises:
        AssertionError: if the data contain NaN before or after z-scoring, or
            if the per-scan arrays do not all have the same length.
    """
    inputPrefix = 'cortThickADNI3Scans'
    inputFileDataFull = '../data/ADNI/%sData.npz' % inputPrefix
    inputFileInfo = '../data/ADNI/%sInfo.npz' % inputPrefix
    print(inputFileInfo)
    sys.stdout.flush()
    # SECURITY: pickle.load executes arbitrary code; only use trusted files.
    # The context manager closes the handle that was previously leaked.
    with open(inputFileInfo, 'rb') as infoFile:
        infoStruct = pickle.load(infoFile)

    print('will enter readDataFile')
    dataStruct = readDataFile(inputFileDataFull, args.cluster)

    print('compiling parameters')
    sys.stdout.flush()
    data = dataStruct['avghData']
    diag = np.array(np.squeeze(infoStruct['diag']), int)
    scanTimepts = np.squeeze(infoStruct['scanTimepts'])
    partCode = np.squeeze(infoStruct['partCode'])
    ageAtScan = np.squeeze(infoStruct['ageAtScan'])
    pointIndices = dataStruct['pointIndices']
    cogTests = infoStruct['cogTests']
    assert (not np.any(np.isnan(data)))

    print('diag', np.unique(diag), diag)

    # keep only the columns that are zero in fewer than maxNrZeros rows
    maxNrZeros = 5
    selectedBiomk = np.sum(data == 0, 0) < maxNrZeros

    data = data[:, selectedBiomk]
    pointIndices = pointIndices[selectedBiomk]

    # calculate Z-scores at each point w.r.t controls at baseline
    controlBlInd = np.logical_and(diag == CTL, scanTimepts == 1)
    meanCTL = np.mean(data[controlBlInd], 0)
    stdCTL = np.std(data[controlBlInd], 0)
    dataZ = (data - meanCTL[None, :]) / stdCTL[None, :]

    # age is z-scored against the same control baseline scans
    meanAgeCTL = np.mean(ageAtScan[controlBlInd], 0)
    stdAgeCTL = np.std(ageAtScan[controlBlInd], 0)
    ageAtScanZ = (ageAtScan - meanAgeCTL) / stdAgeCTL

    # diagnostics: where z-scoring produced NaN and which columns have a zero
    # control std; the assert below aborts if any NaN is present
    (rowInd, colInd) = np.where(np.isnan(dataZ))

    rowIndUnq = np.unique(rowInd)
    colIndUnq = np.unique(colInd)

    print(rowIndUnq, colIndUnq)
    print(np.where(stdCTL == 0))
    print(data.shape)
    sys.stdout.flush()

    data = dataZ
    assert (not np.any(np.isnan(data)))

    # column indices sorted by ascending mean z-score over the AD scans
    dataAD = data[diag == AD, :]
    indMaxAbnormality = np.argsort(np.mean(dataAD,
                                           0))  # lowest cortical thickness
    print(indMaxAbnormality)

    sortedByPvalInd, labels, names = testMeanBiomkValue(
        data, diag, pointIndices, plotTrajParams)
    #doTtest(data, diag, pointIndices)

    assert (sortedByPvalInd.shape[0] == data.shape[1])

    print(infoStruct['cogTestsLabels'])
    sys.stdout.flush()

    params['data'] = data
    params['diag'] = diag
    params['scanTimepts'] = scanTimepts
    params['partCode'] = partCode
    params['ageAtScan'] = ageAtScanZ
    params['biomkDir'] = DECR
    params['modelToRun'] = modelToRun
    params['cogTests'] = np.squeeze(cogTests)  # CDRSOB, ADAS13, MMSE, RAVLT
    params['cogTests'][:, [
        2, 3
    ]] *= -1  # make MMSE and RAVLT have increasing scores from CTL->AD
    # params['acqDate'] = infoStruct['acqDate']
    params['datasetFull'] = 'adniThick'
    params[
        'fixSpeed'] = False  # if true then don't model progression speed, only time shift

    # map points that have been removed to the closest included points (nearestNeighbours).
    # also find the adjacency list for the MRF and another subset of 10k points for
    # initial clustering
    runPartNN = 'L'
    plotTrajParams['nearestNeighbours'], params['adjList'], \
      params['nearNeighInitClust'], params['initClustSubsetInd'] = findNearNeigh(runPartNN,
      params['datasetFull'], pointIndices, plotTrajParams['freesurfPath'], indMaxAbnormality)

    sys.stdout.flush()
    # every per-scan array must have one entry per row of the data matrix
    assert (params['data'].shape[0] == params['diag'].shape[0] ==
            params['scanTimepts'].shape[0] == params['partCode'].shape[0] ==
            params['ageAtScan'].shape[0] == params['cogTests'].shape[0])

    # sets an uninformative or informative prior
    priorNr = setPrior(params,
                       args.informPrior,
                       mean_gamma_alpha=1,
                       std_gamma_alpha=0.3,
                       mu_beta=0,
                       std_beta=5)

    expName = 'adniThFWHM%dInit%sCl%dPr%dRa%dMrf%d' % (
        args.fwhmLevel, args.initClustering, params['nrClust'], priorNr,
        args.rangeFactor, args.alphaMRF)
    plotTrajParams['sortedByPvalInd'] = sortedByPvalInd
    plotTrajParams['pointIndices'] = pointIndices
    plotTrajParams['labels'] = labels
    plotTrajParams['names'] = names
    plotTrajParams['expName'] = expName
    plotTrajParams['ageTransform'] = (meanAgeCTL, stdAgeCTL)
    plotTrajParams['datasetFull'] = params['datasetFull']

    params['plotTrajParams'] = plotTrajParams

    # default run parts for every process; the master process overrides them
    # further below
    # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
    # NOTE(review): 'Non-enforcing' is not one of the 'R'/'L'/'I' codes used
    # in the other run-part lists -- confirm that runModels accepts it.
    params['runPartStd'] = ['L', 'Non-enforcing', 'I', 'R', 'L']
    params['runPartMain'] = ['I', 'I', 'I']  # [mainPart, plot, stage]
    params['runPartCogCorr'] = ['R']
    params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartDirDiag'] = ['R', 'R', 'I']
    params['runPartStaging'] = ['L', 'L', 'I']
    params['runPartDiffDiag'] = ['R', 'R', 'I']
    params['runPartConvPred'] = ['I', 'I', 'I']
    params['runPartCVNonOverlap'] = ['I']
    params['runPartCVNonOverlapMain'] = ['L', 'L', 'I', 'I', 'I']
    params['masterProcess'] = runIndex == 0

    # visRegions(data, diag, ageAtScan, plotTrajParams)
    #
    # visData(data, diag, ageAtScan, plotTrajParams,sortedByPvalInd)

    # makeAvgBiomkMaps(data, diag, ageAtScan, plotTrajParams,
    #   'adniTh', args.fwhmLevel, plotTrajParams['diagLabels'])

    # (longData, longDiagAllTmpts, longDiag, longScanTimepts, longPartCode, longAgeAtScan,
    #  uniquePartCodeInverse, crossData, crossDiag, scanTimepts, crossPartCode, crossAgeAtScan) = \
    #   createLongData(data, diag, scanTimepts, partCode, ageAtScan)
    #
    # unqDiag = np.unique(longDiag)
    # nrScans = np.zeros(longDiag.shape, float)
    # nrSubjLong = longDiag.shape[0]
    # for s in range(nrSubjLong):
    #   nrScans[s] = longData[s].shape[0]
    #
    # longAgeAtBlScan = np.array([longAgeAtScan[s][0] for s in range(nrSubjLong)])
    #
    # for d in range(unqDiag.shape[0]):
    #   print('%s nrSubj %d' % (plotTrajParams['diagLabels'][unqDiag[d]],
    #     np.sum(longDiag == unqDiag[d], axis=0)))
    #   print('%s nrScans %f' % (plotTrajParams['diagLabels'][unqDiag[d]],
    #     np.mean(nrScans[longDiag == unqDiag[d]])))
    #   print('%s ageAtBlScan %f' % (plotTrajParams['diagLabels'][unqDiag[d]], np.mean(longAgeAtBlScan[longDiag == unqDiag[d]])))

    if params['masterProcess']:
        # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
        params['runPartStd'] = ['R', 'R', 'R', 'R', 'R']
        params['runPartMain'] = ['I', 'I', 'I']  # [mainPart, plot, stage]
        params['runPartCogCorr'] = ['R']
        params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'I']
        params['runPartDirDiag'] = ['R', 'R', 'I']
        params['runPartStaging'] = ['L', 'L', 'I']
        params['runPartDiffDiag'] = ['R', 'R', 'I']
        params['runPartConvPred'] = ['I', 'I', 'I']
        params['runPartCVNonOverlap'] = ['I']
        params['runPartCVNonOverlapMain'] = ['R', 'R', 'I', 'R', 'R']

    runAllExpFunc = adniDEM.runAllExpADNI
    modelNames, res = evaluationFramework.runModels(params, expName,
                                                    modelToRun, runAllExpFunc)

    if params['masterProcess']:
        printResADNIthick(modelNames, res, plotTrajParams)

        # NOTE(review): the three values below are not used in the visible
        # part of this function; presumably they feed a BIC/AIC comparison
        # over cluster counts -- confirm. Also note that the 'Pr%d' token here
        # uses args.informPrior, whereas expName above uses priorNr.
        expNameBefCl = 'adniThFWHM%dInit%s' % (args.fwhmLevel,
                                               args.initClustering)
        expNameAfterCl = 'Pr%dRa%dMrf%d' % (args.informPrior, args.rangeFactor,
                                            args.alphaMRF)
        # nrClustList = range(2, 30)
        #nrClustList = [2,3,4,5,6,7,8,9,10,12,15,18,20,25,30,35,40,50]
        nrClustList = [2, 3, 4, 5, 6, 7, 8, 9, 10]
  if params['masterProcess']:
    # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
    params['runPartStd'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartMain'] = ['R', 'R', 'R']  # [mainPart, plot, stage]
    params['runPartCogCorr'] = ['I']
    params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartDirDiag'] = ['R', 'R', 'I']
    params['runPartStaging'] = ['L', 'L', 'I']
    params['runPartDiffDiag'] = ['R', 'R', 'I']
    params['runPartConvPred'] = ['I', 'I', 'I']
    params['runPartCVNonOverlap'] = ['I']
    params['runPartCVNonOverlapMain'] = ['R', 'R', 'I', 'R', 'R']

  runAllExpFunc = runAllExpTADPOLE
  modelNames, res = evaluationFramework.runModels(params, expName, modelToRun, runAllExpFunc)

  # now generate forecast
  print('Generating forecast ... ')
  teamName = 'DIVE6'
  if args.leaderboard:
    outputFile = 'TADPOLE_Submission_Leaderboard_D3_%s.csv' % teamName
    predStartDate = datetime.date(2010, 5, 1)
    nrYearsToPred = 7
    nrMonthsToPred = 12*nrYearsToPred  # 5 years
  else:
    outputFile = 'TADPOLE_Submission_D3_%s.csv' % teamName
    predStartDate = datetime.date(2018, 1, 1)
    nrYearsToPred = 5
    nrMonthsToPred = 12*nrYearsToPred  # 7 years
Exemple #6
0
def main():
    """Configure and launch the joint tAD/PCA experiment on the TADPOLE+DRC data.

    Loads the pre-processed dataset (regenerating it first when the file is
    missing or ``args.regData`` is set), fills in the module-level
    ``plotTrajParams`` and a fresh ``params`` dictionary (biomarker to
    functional-unit mapping, priors, placeholder "true" parameters,
    checkpoint flags), runs the models through
    ``evaluationFramework.runModels`` and, on the master process
    (``args.runIndex == 0``), prints the results with ``printRes``.

    Reads the module-level ``args`` and mutates the module-level
    ``plotTrajParams``.

    Raises:
        ValueError: if some subject has no measurements in any biomarker.
    """

    # don't turn this on unless I add cognitive markers in the DRC dataset.
    addExtraBiomk = False

    np.random.seed(1)
    random.seed(1)
    pd.set_option('display.max_columns', 50)
    tinyData = args.tinyData

    finalDataFile = 'data_processed/tadDrc.npz'
    expName = 'tadDrc'

    if args.tinyData:
        finalDataFile = finalDataFile.split('.')[0] + 'Tiny.npz'
        expName = expName.split('.')[0] + 'Tiny'

    if addExtraBiomk:
        finalDataFile = finalDataFile.split('.')[0] + 'Cog.npz'
        expName = expName.split('.')[0] + 'Cog'

    regenerateData = (not os.path.isfile(finalDataFile)) or args.regData
    if regenerateData:
        prepareData(finalDataFile, tinyData, addExtraBiomk)

    # The file is read with pickle (despite the .npz extension); only load
    # data files produced by this pipeline, never untrusted ones.
    with open(finalDataFile, 'rb') as dataFile:
        ds = pickle.load(dataFile)
    dataDfAll = ds['dataDfAll']
    X = ds['X']
    Y = ds['Y']
    RID = np.array(ds['RID'], int)
    labels = ds['list_biomarkers']
    diag = ds['diag']

    params = {}

    # one more biomarker per functional unit when AV1451 markers are present
    av45InListBiomk = any(
        x.startswith('AV1451') for x in ds['list_biomarkers'])
    if av45InListBiomk:
        nrBiomkInFuncUnits = 5
    else:
        nrBiomkInFuncUnits = 4

    nrDis = 2  # nr of diseases
    params['nrDis'] = nrDis

    # change the order of the functional units so that the hippocampus and occipital are fitted first
    unitPermutation = [5, 3, 2, 1, 4, 0]

    nrFuncUnits = 6
    mapBiomkToFuncUnits = np.array((unitPermutation * nrBiomkInFuncUnits))
    nrExtraBiomk = 0

    if addExtraBiomk:
        nrExtraBiomk = 5
        # each extra cognitive marker gets its own functional unit
        nrFuncUnits += nrExtraBiomk

        mapBiomkToFuncUnits = np.array(
            (unitPermutation * nrBiomkInFuncUnits) +
            list(range(nrFuncUnits - nrExtraBiomk, nrFuncUnits)))

    unitNames = [l.split(' ')[-1] for l in labels]
    unitNames = [unitNames[i] for i in unitPermutation]
    if addExtraBiomk:
        extraBiomkNames = ['ADAS13', 'CDRSB', 'RAVLT', 'MMSE', 'FAQ']
        unitNames += extraBiomkNames
        assert len(extraBiomkNames) == nrExtraBiomk

    nrBiomk = mapBiomkToFuncUnits.shape[0]
    # entry u holds the indices of the biomarkers mapped to functional unit u
    biomkInFuncUnit = [0 for u in range(nrFuncUnits + 1)]
    for u in range(nrFuncUnits):
        biomkInFuncUnit[u] = np.where(mapBiomkToFuncUnits == u)[0]

    # the extra last entry needs to be left as an empty list
    biomkInFuncUnit[nrFuncUnits] = np.array([])

    plotTrajParams['biomkInFuncUnit'] = biomkInFuncUnit
    plotTrajParams['labels'] = labels
    plotTrajParams['nrRowsFuncUnit'] = 3
    plotTrajParams['nrColsFuncUnit'] = 4
    plotTrajParams['colorsTrajBiomkB'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]
    plotTrajParams['colorsTrajUnitsU'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrFuncUnits, endpoint=False)
    ]
    plotTrajParams['nrBiomk'] = nrBiomk
    params['nrBiomk'] = nrBiomk

    # alternatives: 'zScoreTraj', 'zScoreEarlyStageTraj'
    plotTrajParams['yNormMode'] = 'unscaled'

    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = False

    params['nrFuncUnitsImgOnly'] = nrFuncUnits - nrExtraBiomk
    params['unitNames'] = unitNames
    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penaltyUnits'] = args.penalty
    params['penaltyDis'] = args.penalty
    params['nrFuncUnits'] = nrFuncUnits
    params['biomkInFuncUnit'] = biomkInFuncUnit
    params['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    params['labels'] = labels
    params['nrExtraBiomk'] = nrExtraBiomk

    params['X'] = X
    params['Y'] = Y
    params['RID'] = RID
    params['diag'] = diag
    params['plotTrajParams']['diag'] = params['diag']
    params['Xvalid'] = ds['Xvalid']
    params['Yvalid'] = ds['Yvalid']
    params['RIDvalid'] = ds['RIDvalid']
    params['diagValid'] = ds['diagValid']
    params['dataDfAll'] = dataDfAll
    params['visitIndices'] = ds['visitIndices']
    params['visitIndicesValid'] = ds['visitIndicesValid']

    # by default we have no priors
    params['priors'] = None

    # sanity check: every subject must have data in at least one biomarker
    for s in range(len(X[0])):
        entriesCurrSubj = [X[b][s].shape[0] > 0 for b in range(len(X))]
        if not any(entriesCurrSubj):
            raise ValueError(
                'subject index %d has no measurements in any biomarker: %s' %
                (s, entriesCurrSubj))

    print(labels)

    ############# set priors for specific models ################

    params['priorsUnitModelsMarcoModel'] = [
        dict(
            prior_length_scale_mean_ratio=
            0.05,  # mean_length_scale = (self.maxX-self.minX)/3
            prior_length_scale_std=1e-6,
            prior_sigma_mean=0.5,
            prior_sigma_std=1e-3,
            prior_eps_mean=0.1,
            prior_eps_std=1e-6) for u in range(nrFuncUnits)
    ]

    transitionTimePriorMean = 1  # in DPS 0-1 space, prior mean
    transitionTimePriorMin = 0.9
    transitionTimePriorMax = 1.1

    bPriorShape, bPriorRate = getGammShapeRateFromTranTime(
        transitionTimePriorMean, transitionTimePriorMin,
        transitionTimePriorMax)

    transitionTimePriorMeanAD = 0.1  # using months instead of years
    transitionTimePriorMinAD = 0.09
    transitionTimePriorMaxAD = 0.11

    bPriorShapeDisAD, bPriorRateDisAD = getGammShapeRateFromTranTime(
        transitionTimePriorMeanAD, transitionTimePriorMinAD,
        transitionTimePriorMaxAD)

    _, bPriorStdAD = getMeanStdBFromTranTime(transitionTimePriorMeanAD,
                                             transitionTimePriorMinAD,
                                             transitionTimePriorMaxAD)

    transitionTimePriorMeanPCA = 500
    transitionTimePriorMinPCA = 400
    transitionTimePriorMaxPCA = 600

    bPriorShapeDisPCA, bPriorRateDisPCA = getGammShapeRateFromTranTime(
        transitionTimePriorMeanPCA, transitionTimePriorMinPCA,
        transitionTimePriorMaxPCA)

    _, bPriorStdPCA = getMeanStdBFromTranTime(transitionTimePriorMeanPCA,
                                              transitionTimePriorMinPCA,
                                              transitionTimePriorMaxPCA)

    params['priorsDisModels'] = [0, 0]
    # priors for tAD
    params['priorsDisModels'][0] = dict(meanA=1,
                                        stdA=1e-20,
                                        meanD=0,
                                        stdD=1e-20,
                                        shapeB=bPriorShapeDisAD,
                                        rateB=bPriorRateDisAD,
                                        stdPerturbB=bPriorStdAD,
                                        timeShiftStd=20000)
    # priors for PCA
    params['priorsDisModels'][1] = dict(meanA=1,
                                        stdA=1e-20,
                                        meanD=0,
                                        stdD=1e-20,
                                        shapeB=bPriorShapeDisPCA,
                                        rateB=bPriorRateDisPCA,
                                        stdPerturbB=bPriorStdPCA,
                                        timeShiftStd=20000)

    # one prior per imaging functional unit
    params['priorsUnitModels'] = [
        dict(meanA=1,
             stdA=1e-5,
             meanD=0,
             stdD=1e-5,
             shapeB=bPriorShape,
             rateB=bPriorRate,
             timeShiftStd=20000) for u in range(nrFuncUnits - nrExtraBiomk)
    ]

    if nrExtraBiomk > 0:
        # the extra (non-imaging) units get a different set of prior keys
        params['priorsUnitModelsLinear'] = [
            dict(meanA=1, stdA=0.1, meanB=0, stdB=0.1, timeShiftStd=20000)
            for u in range(nrExtraBiomk)
        ]
        params['priorsUnitModels'] += params['priorsUnitModelsLinear']

    bPriorShapeNoDKT, bPriorRateNoDKT = getGammShapeRateFromTranTime(
        transitionTimePriorMean=50,
        transitionTimePriorMin=40,
        transitionTimePriorMax=60)
    params['priorsNoDKTSigmoid'] = dict(meanA=1,
                                        stdA=1e-5,
                                        meanD=0,
                                        stdD=1e-5,
                                        shapeB=bPriorShapeNoDKT,
                                        rateB=bPriorRateNoDKT,
                                        timeShiftStd=20000)

    ######################

    nrBiomkDisModel = nrFuncUnits
    params['nrBiomkDisModel'] = nrBiomkDisModel

    if addExtraBiomk:
        # NOTE(review): unitNames already contains the 5 extra marker names
        # at this point; appending labels[-3:] looks stale -- confirm before
        # enabling addExtraBiomk.
        params['plotTrajParams']['unitNames'] = unitNames + labels[-3:]
    else:
        params['plotTrajParams']['unitNames'] = unitNames

    # map which diagnoses belong to which disease
    # first disease has CTL+AD, second disease has CTL2+PCA
    params['diagsSetInDis'] = [
        np.array([CTL, MCI, AD, AD2]),
        np.array([CTL2, PCA])
    ]
    params['disLabels'] = ['tAD', 'PCA']

    # boolean mask over subjects for each disease
    params['binMaskSubjForEachDisD'] = [
        np.in1d(params['diag'], params['diagsSetInDis'][disNr])
        for disNr in range(nrDis)
    ]

    # Placeholder "true" parameters: constant (eps) trajectories and zero
    # subject shifts, built here and stored in params below.
    eps = 0.001
    nrXPoints = 50
    subShiftsS = np.zeros(RID.shape[0])
    trueDysfuncXsX = np.linspace(0, 1, nrXPoints)
    trueTrajFromDysXB = eps * np.ones((nrXPoints, nrBiomk))

    trueLineSpacedDPSsX = np.linspace(-10, 10, nrXPoints)
    trueTrajPredXB = eps * np.ones((nrXPoints, nrBiomk))
    trueDysTrajFromDpsXU = eps * np.ones((nrXPoints, nrBiomkDisModel))

    scalingBiomk2B = np.zeros((2, nrBiomk))
    scalingBiomk2B[1, :] = 1

    trueParamsFuncUnits = [0 for _ in range(nrFuncUnits)]
    for f in range(nrFuncUnits):
        trueParamsFuncUnits[f] = dict(
            xsX=trueDysfuncXsX,
            ysXB=trueTrajFromDysXB[:, biomkInFuncUnit[f]],
            subShiftsS=subShiftsS,
            scalingBiomk2B=scalingBiomk2B[:, biomkInFuncUnit[f]])

    # disease specific
    trueParamsDis = [0 for _ in range(nrDis)]
    for d in range(nrDis):
        trueParamsDis[d] = dict(xsX=trueLineSpacedDPSsX,
                                ysXU=trueDysTrajFromDpsXU,
                                ysXB=trueTrajPredXB,
                                subShiftsS=np.zeros(
                                    np.sum(
                                        np.in1d(params['diag'],
                                                params['diagsSetInDis'][d]))),
                                scalingBiomk2B=scalingBiomk2B)

    # for DKT DPMs
    params['trueParamsFuncUnits'] = trueParamsFuncUnits
    params['trueParamsDis'] = trueParamsDis

    # simpler non-DKT DPMs
    params['trueParams'] = dict(xsX=trueLineSpacedDPSsX,
                                ysXU=trueTrajPredXB,
                                ysXB=trueTrajPredXB,
                                subShiftsS=subShiftsS,
                                scalingBiomk2B=scalingBiomk2B)
    params['plotTrajParams']['trueParams'] = params['trueParams']

    print('diag', params['diag'].shape[0])
    print('X[0]', len(params['X'][0]))
    assert params['diag'].shape[0] == len(params['X'][0])

    params['runPartStd'] = args.runPartStd
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    params['masterProcess'] = args.runIndex == 0

    modelNames, res = evaluationFramework.runModels(params, expName,
                                                    args.modelToRun,
                                                    runAllExpTadpoleDrc)

    if params['masterProcess']:
        printRes(modelNames, res, plotTrajParams, params)
Exemple #7
0
def main():
    """Configure and launch the two-unit TADPOLE experiment.

    Regenerates the (tiny) dataset with ``prepareData``, loads it, fills in
    the module-level ``plotTrajParams`` and a fresh ``params`` dictionary
    (functional-unit mapping, zero-valued placeholder "true" parameters,
    diagnosis groups, checkpoint flags) and runs the models through
    ``evaluationFramework.runModels``.

    Reads the module-level ``args`` and mutates the module-level
    ``plotTrajParams``.
    """

    np.random.seed(1)
    random.seed(1)
    pd.set_option('display.max_columns', 50)
    tinyData = True
    regenerateData = True
    if tinyData:
        finalDataFile = 'tadpoleTiny.npz'
    else:
        finalDataFile = 'tadpoleFinalDataWithRegParams.npz'

    if regenerateData:
        prepareData(finalDataFile, tinyData)

    # The file is read with pickle (despite the .npz extension); only load
    # data files produced by this pipeline, never untrusted ones.
    with open(finalDataFile, 'rb') as dataFile:
        ds = pickle.load(dataFile)
    X = ds['X']
    Y = ds['Y']
    RID = np.array(ds['RID'])
    labels = ds['list_biomarkers']
    diag = ds['diag']

    expName = 'tadpole'

    params = {}

    nrFuncUnits = 2
    nrBiomkInFuncUnits = 5

    nrBiomk = len(labels)

    # The unit order fixes the order in which the units are fitted; the
    # mapping repeats it once per biomarker within a unit,
    # i.e. [0, 1, 0, 1, ...].
    unitPermutation = [0, 1]
    mapBiomkToFuncUnits = np.array(unitPermutation * nrBiomkInFuncUnits)
    unitNames = [l.split(' ')[-1] for l in labels]
    unitNames = [unitNames[i] for i in unitPermutation]

    plotTrajParams['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    plotTrajParams['labels'] = labels
    plotTrajParams['nrRowsFuncUnit'] = 3
    plotTrajParams['nrColsFuncUnit'] = 3
    plotTrajParams['colorsTraj'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]

    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = False

    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penalty'] = args.penalty
    params['nrFuncUnits'] = nrFuncUnits
    params['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    params['labels'] = labels

    params['X'] = X
    params['Y'] = Y
    params['RID'] = RID
    params['diag'] = diag
    params['plotTrajParams']['diag'] = params['diag']

    # Placeholder "true" parameters: all-zero arrays of the shapes built below.
    nrXPoints = 50
    nrDis = 2  # nr of diseases
    params['trueParams'] = {}
    params['trueParams']['subShiftsTrueMarcoFormatS'] = np.zeros(RID.shape[0])
    params['trueParams']['trueSubjDysfuncScoresSU'] = np.zeros(
        (RID.shape[0], nrFuncUnits))
    params['trueParams']['trueDysfuncXsX'] = np.linspace(0, 1, nrXPoints)
    params['trueParams']['trueTrajXB'] = np.zeros((nrXPoints, nrBiomk))
    params['trueParams']['trueTrajFromDysXB'] = np.zeros((nrXPoints, nrBiomk))
    params['trueParams']['trueXsTrajX'] = params['trueParams'][
        'trueDysfuncXsX']

    params['trueParams']['trueLineSpacedDPSsX'] = np.linspace(
        -10, 10, nrXPoints)
    params['trueParams']['trueDysTrajFromDpsXU'] = [
        np.zeros((nrXPoints, nrFuncUnits)) for d in range(nrDis)
    ]

    params['plotTrajParams']['trueParams'] = params['trueParams']
    params['plotTrajParams']['unitNames'] = unitNames

    # map which diagnoses belong to which disease
    # first disease has CTL+AD, second disease has CTL2+PCA
    params['diagsSetInDis'] = [np.array([CTL, MCI, AD]), np.array([CTL2, PCA])]
    params['disLabels'] = ['tAD', 'PCA']

    print('diag', params['diag'].shape[0])
    print('X[0]', len(params['X'][0]))
    assert params['diag'].shape[0] == len(params['X'][0])

    # Integer-valued penalties are formatted without decimals. The absolute
    # value must wrap the difference, not the comparison, otherwise any
    # negative fractional penalty would be treated as an integer.
    if np.abs(args.penalty - int(args.penalty)) < 0.00001:
        expName = '%sPen%d' % (expName, args.penalty)
    else:
        expName = '%sPen%.1f' % (expName, args.penalty)

    params['runPartStd'] = ['R', 'R']
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
    params['masterProcess'] = args.runIndex == 0

    modelNames, res = evaluationFramework.runModels(params, expName,
                                                    args.modelToRun,
                                                    runAllExpTadpoleDrc)
Exemple #8
0
def launchSynth(runIndex, nrProcesses, modelToRun):
    """Run the synthetic-data experiments and save one summary figure each.

    Up to three experiments are run, selected by ``args.expToRun``
    (0 runs all of them):

    1. varying the distance between trajectory centres,
    2. varying the number of clusters used to generate the data,
    3. varying the number of subjects.

    Each experiment has 8 steps; ``args.stepToRun == 0`` runs all of them,
    any other value runs only step ``args.stepToRun - 1``. For every step
    synthetic data is generated under ``resfiles/synth/``, the models are run
    through ``evaluationFramework.runModels`` and the
    ``voxelCorrectAssignMean`` / ``voxelCorrectAssignStd`` results are
    plotted against the varied quantity.

    Reads the module-level ``args`` and mutates the module-level ``params``
    and ``plotTrajParams``.

    Args:
        runIndex: index of the current process; 0 marks the master process.
        nrProcesses: stored in ``params['nrProcesses']``.
        modelToRun: stored in ``params['modelToRun']`` and passed on to
            ``evaluationFramework.runModels``.
    """

    runAllExpFunc = runAllExpSynth

    trajFuncDict = {'lin': linearFunc, 'sig': sigmoidFunc}

    forceRegenerate = False

    ############# define default parameters #####################################

    nrSubjDef = 300
    nrBiomk = 1000
    # not used directly, relevant for when I use real data as I can map then to the actual freesurfer vertices
    nrClustToGenDef = 3  # number of clusters to generate data from
    nrClustToFit = args.nrClust
    nrTimepts = 4
    # Sigmoidal trajectory with params [a,b,c,d]: minimum d, maximum a+d,
    # slope a*b/4 with the maximum slope attained at the centre c:
    # f(s|theta = [a,b,c,d]) = a/(1+exp(-b(s-c)))+d
    trajFunc = trajFuncDict['sig']

    lowerAgeLim = 40
    upperAgeLim = 80
    dpsLowerLimit = -1
    dpsUpperLimit = 2
    dpsIntervalDef = dpsUpperLimit - dpsLowerLimit

    avgStdScaleFactor = 1

    slopeLowerLim = -2
    slopeUpperLim = -2
    slopeInterval = slopeUpperLim - slopeLowerLim

    trajMinLowerLim = -5
    trajMinUpperLim = -5
    trajMinInterval = trajMinUpperLim - trajMinLowerLim

    # +/- 10 years shifts on avg, average rate 1+/-0.4
    covSubjShifts = np.array([[0.05, 0], [0, 10]])

    makeThetaIdentifFunc = VoxelDPM.makeThetasIdentif

    ############### set parameters ###############################################

    params['runIndex'] = runIndex
    params['nrProcesses'] = nrProcesses
    params['modelToRun'] = modelToRun
    params['cluster'] = args.cluster
    params['biomkDir'] = DECR
    params['initClustering'] = 'k-means'
    params['rangeFactor'] = float(args.rangeFactor)
    params['pointIndices'] = np.array(range(nrBiomk), int)
    # if true then don't model progression speed, only time shift
    params['fixSpeed'] = False

    plotTrajParams['sortedByPvalInd'] = range(nrBiomk)
    plotTrajParams['pointIndices'] = params['pointIndices']
    plotTrajParams['labels'] = np.zeros(nrBiomk, int)
    plotTrajParams['names'] = ['v']
    params['plotTrajParams'] = plotTrajParams

    ################ set up the checkpoints ##########
    # R - run that part
    # L - load from saved file
    # I - ignore part

    # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
    params['runPartStd'] = ['R', 'R', 'R', 'I', 'I']
    params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]

    params['masterProcess'] = runIndex == 0

    # assign initClustSubsetInd and nearNeighInitClust
    params['initClustSubsetInd'] = np.array(
        range(nrBiomk))  # set to identity map
    params['nearNeighInitClust'] = np.array(
        range(nrBiomk))  # set to identity map

    if params['masterProcess']:
        # the master process loads the saved results instead of re-running
        # [initClust, pointIndices, modelFit, AIC/BIC, checkers/visual]
        params['runPartStd'] = ['L', 'L', 'L', 'I', 'I']
        # [mainPart, plot, stage]
        params['runPartMain'] = ['R', 'I', 'I']

    params['compareTrueParamsFunc'] = compareWithTrueParams

    # makes changes to params: sets an informative or uninformative prior
    setPrior(params, args.informPrior)

    nrSteps = 8
    if args.stepToRun == 0:
        stepsList = list(range(nrSteps))
    else:
        stepsList = [args.stepToRun - 1]

    ###################### vary trajectory centers ###############################
    # copy state of params and plotTrajParams
    paramsLocal = copy.deepcopy(params)
    plotTrajParamsLocal = copy.deepcopy(plotTrajParams)
    paramsLocal['plotTrajParams'] = plotTrajParamsLocal
    resList = []

    plotterObj = PlotterVDPM.PlotterVDPMSynth()

    if args.expToRun == 1 or args.expToRun == 0:
        # Scale the default DPS interval by each factor. Multiplying the int
        # directly by the list would only repeat the list.
        dpsIntervalList = [
            dpsIntervalDef * factor
            for factor in [5, 2, 1.5, 1, 0.7, 0.5, 0.3, 0.1]
        ]
        for i in stepsList:
            np.random.seed(1)
            expFolderShort = 'trajCent%d' % i
            expFolder = 'resfiles/synth/%s' % expFolderShort
            os.makedirs(expFolder, exist_ok=True)
            expNameShort = 'data'
            dataFileName = '%s/%s.npz' % (expFolder, expNameShort)
            paramsLocal['dataset'] = expNameShort
            paramsLocal['datasetFull'] = 'synth%s' % expNameShort
            dpsIntervalCurr = dpsIntervalList[i]

            thetasTrueCurr = generateThetas(nrClustToGenDef, trajMinLowerLim,
                                            trajMinInterval, slopeLowerLim,
                                            slopeInterval, dpsLowerLimit,
                                            dpsIntervalCurr)

            covPerturbedCurr = [
                np.diag(
                    [0, thetasTrueCurr[c3, 1]**2 / 15, dpsIntervalDef / 70, 0])
                for c3 in range(nrClustToGenDef)
            ]

            # generate perturbed traj from clusters for each biomk
            # generate rand clust with uniform prob each
            clustAssignTrueB, thetasPerturbed = genClustAssThetasPerturb(
                nrBiomk, nrClustToGenDef, thetasTrueCurr, covPerturbedCurr)

            paramsLocal = generateClustData(
                nrSubjDef, nrBiomk, nrClustToGenDef, nrTimepts, trajFunc,
                thetasTrueCurr, thetasPerturbed, clustAssignTrueB, lowerAgeLim,
                upperAgeLim, covSubjShifts, avgStdScaleFactor, dataFileName,
                forceRegenerate, makeThetaIdentifFunc, paramsLocal)

            # fit as many clusters as were used to generate the data
            nrClustToFitCurr = nrClustToGenDef

            paramsLocal['nrClust'] = nrClustToGenDef
            expName = 'synth/%s/init%sCl%dPr%dRa%d' % \
                      (expFolderShort, args.initClustering, nrClustToFitCurr, args.informPrior, args.rangeFactor)
            plotTrajParamsLocal['expName'] = expName
            paramsLocal['plotTrajParams'] = plotTrajParamsLocal

            _, res = evaluationFramework.runModels(
                paramsLocal, expName, modelToRun, runAllExpFunc)
            resList += [res]

        xLabelStr = 'Distance between trajectories'
        # one entry per executed step, in the order of stepsList
        voxelCorrectAssignMeanValues = [
            res[0]['resComp']['voxelCorrectAssignMean'] for res in resList
        ]
        voxelCorrectAssignStdValues = [
            res[0]['resComp']['voxelCorrectAssignStd'] for res in resList
        ]
        fig = plotterObj.plotSynthResOneExp(
            voxelCorrectAssignMeanValues, voxelCorrectAssignStdValues,
            [dpsIntervalList[i] for i in stepsList], xLabelStr)
        fig.savefig('resfiles/synth/correctVertices_trajCent.png', dpi=100)

    ###################### vary number of clusters ###############################
    # copy state of params and plotTrajParams
    paramsLocal = copy.deepcopy(params)
    plotTrajParamsLocal = copy.deepcopy(plotTrajParams)
    paramsLocal['plotTrajParams'] = plotTrajParamsLocal
    resList = []

    if args.expToRun == 2 or args.expToRun == 0:
        nrClustToGenList = [2, 3, 5, 10, 15, 20, 50, 100]
        for i in stepsList:
            np.random.seed(1)
            expFolderShort = 'nrClust%d' % i
            expFolder = 'resfiles/synth/%s' % expFolderShort
            os.makedirs(expFolder, exist_ok=True)
            nrClustToGenCurr = nrClustToGenList[i]
            expNameShort = 'data'
            dataFileName = '%s/%s.npz' % (expFolder, expNameShort)
            paramsLocal['dataset'] = expNameShort
            paramsLocal['datasetFull'] = 'synth%s' % expNameShort

            # first plot layout whose capacity fits the number of clusters,
            # falling back to the last layout when none does
            potentialRowsIndCurr = [
                j for j in range(len(nrImgMaxList))
                if nrImgMaxList[j] >= nrClustToGenCurr
            ] + [len(nrImgMaxList) - 1]
            print(potentialRowsIndCurr)
            nrRowsCurr, nrColsCurr = rowsColsList[potentialRowsIndCurr[0]]
            plotTrajParamsLocal['nrRows'] = nrRowsCurr
            plotTrajParamsLocal['nrCols'] = nrColsCurr
            print('nrRowsCurr', nrRowsCurr)
            print('nrColsCurr', nrColsCurr)
            plotTrajParamsLocal['clustHuePoints'] = np.linspace(
                0, 1, nrClustToGenCurr, endpoint=False)
            plotTrajParamsLocal['clustCols'] = [
                colorsys.hsv_to_rgb(hue, 1, 1)
                for hue in plotTrajParamsLocal['clustHuePoints']
            ]
            plotTrajParamsLocal['legendColsClust'] = min([nrClustToGenCurr, 4])
            print(plotTrajParamsLocal['clustHuePoints'])

            thetasTrueCurr = generateThetas(nrClustToGenCurr, trajMinLowerLim,
                                            trajMinInterval, slopeLowerLim,
                                            slopeInterval, dpsLowerLimit,
                                            dpsIntervalDef)

            covPerturbedCurr = [
                np.diag(
                    [0, thetasTrueCurr[c3, 1]**2 / 15, dpsIntervalDef / 70, 0])
                for c3 in range(nrClustToGenCurr)
            ]

            # generate perturbed traj from clusters for each biomk
            # generate rand clust with uniform prob each
            clustAssignTrueB, thetasPerturbed = genClustAssThetasPerturb(
                nrBiomk, nrClustToGenCurr, thetasTrueCurr, covPerturbedCurr)

            paramsLocal = generateClustData(
                nrSubjDef, nrBiomk, nrClustToGenCurr, nrTimepts, trajFunc,
                thetasTrueCurr, thetasPerturbed, clustAssignTrueB, lowerAgeLim,
                upperAgeLim, covSubjShifts, avgStdScaleFactor, dataFileName,
                forceRegenerate, makeThetaIdentifFunc, paramsLocal)

            paramsLocal['nrClust'] = nrClustToGenCurr
            expName = 'synth/%s/init%sCl%dPr%dRa%d' % \
                      (expFolderShort, args.initClustering, nrClustToGenCurr, args.informPrior, args.rangeFactor)
            plotTrajParamsLocal['expName'] = expName
            paramsLocal['plotTrajParams'] = plotTrajParamsLocal

            _, res = evaluationFramework.runModels(
                paramsLocal, expName, modelToRun, runAllExpFunc)
            resList += [res]

        xLabelStr = 'Number of clusters'
        voxelCorrectAssignMeanValues = [
            res[0]['resComp']['voxelCorrectAssignMean'] for res in resList
        ]
        voxelCorrectAssignStdValues = [
            res[0]['resComp']['voxelCorrectAssignStd'] for res in resList
        ]
        fig = plotterObj.plotSynthResOneExp(
            voxelCorrectAssignMeanValues,
            voxelCorrectAssignStdValues,
            [nrClustToGenList[i] for i in stepsList],
            xLabelStr,
            makeInts=True)
        fig.savefig('resfiles/synth/correctVertices_nrClust.png', dpi=100)

    ###################### vary number of subjects ################################

    # copy state of params and plotTrajParams
    paramsLocal = copy.deepcopy(params)
    plotTrajParamsLocal = copy.deepcopy(plotTrajParams)
    paramsLocal['plotTrajParams'] = plotTrajParamsLocal
    resList = []

    if args.expToRun == 3 or args.expToRun == 0:
        nrSubjList = [1000, 500, 250, 100, 75, 50, 35, 20]
        for i in stepsList:
            np.random.seed(1)
            expFolderShort = 'nrSubj%d' % i
            expFolder = 'resfiles/synth/%s' % expFolderShort
            os.makedirs(expFolder, exist_ok=True)
            nrSubjCurr = nrSubjList[i]
            expNameShort = 'data'
            dataFileName = '%s/%s.npz' % (expFolder, expNameShort)
            paramsLocal['dataset'] = expNameShort
            paramsLocal['datasetFull'] = 'synth%s' % expNameShort

            thetasTrueCurr = generateThetas(nrClustToGenDef, trajMinLowerLim,
                                            trajMinInterval, slopeLowerLim,
                                            slopeInterval, dpsLowerLimit,
                                            dpsIntervalDef)

            covPerturbedCurr = [
                np.diag(
                    [0, thetasTrueCurr[c3, 1]**2 / 15, dpsIntervalDef / 70, 0])
                for c3 in range(nrClustToGenDef)
            ]

            clustAssignTrueB, thetasPerturbed = genClustAssThetasPerturb(
                nrBiomk, nrClustToGenDef, thetasTrueCurr, covPerturbedCurr)

            paramsLocal = generateClustData(
                nrSubjCurr, nrBiomk, nrClustToGenDef, nrTimepts, trajFunc,
                thetasTrueCurr, thetasPerturbed, clustAssignTrueB, lowerAgeLim,
                upperAgeLim, covSubjShifts, avgStdScaleFactor, dataFileName,
                forceRegenerate, makeThetaIdentifFunc, paramsLocal)

            paramsLocal['nrClust'] = paramsLocal['trueNrClust']
            expName = 'synth/%s/init%sCl%dPr%dRa%d' % \
                      (expFolderShort, args.initClustering, nrClustToFit, args.informPrior, args.rangeFactor)
            # set the name on the local copy that is actually handed to the
            # models, as the other two experiments do
            plotTrajParamsLocal['expName'] = expName
            paramsLocal['plotTrajParams'] = plotTrajParamsLocal

            _, res = evaluationFramework.runModels(
                paramsLocal, expName, modelToRun, runAllExpFunc)
            resList += [res]

        xLabelStr = 'Number of Subjects'
        voxelCorrectAssignMeanValues = [
            res[0]['resComp']['voxelCorrectAssignMean'] for res in resList
        ]
        voxelCorrectAssignStdValues = [
            res[0]['resComp']['voxelCorrectAssignStd'] for res in resList
        ]
        fig = plotterObj.plotSynthResOneExp(voxelCorrectAssignMeanValues,
                                            voxelCorrectAssignStdValues,
                                            [nrSubjList[i] for i in stepsList],
                                            xLabelStr,
                                            makeInts=True,
                                            adjLeft=0.2)
        fig.savefig('resfiles/synth/correctVertices_nrSubj.png', dpi=100)
Exemple #9
0
def main():
    """Generate the 'synth1' synthetic dataset and run the selected model on it.

    Builds a ``ParHierModel`` with 2 functional units of 3 biomarkers each,
    obtains the synthetic data through ``genSynthData.generateDataJMD``, fills
    in the module-level ``plotTrajParams`` dict and a local ``params`` dict,
    and finally calls ``evaluationFramework.runModels`` with
    ``runAllExpSynth``.

    Reads the module-level ``args`` namespace (``runIndex``, ``nrProc``,
    ``cluster``, ``penalty``, ``modelToRun``) and mutates the module-level
    ``plotTrajParams`` in place. Returns nothing.
    """
    nrSubjLong = 100
    nrTimepts = 4

    lowerAgeLim = 60
    upperAgeLim = 80

    shiftsLowerLim = -13
    shiftsUpperLim = 10

    outFolder = 'resfiles/synth/'

    expName = 'synth1'
    fileName = '%s.npz' % expName

    forceRegenerate = False

    params = {}

    nrFuncUnits = 2
    nrBiomkInFuncUnits = 3

    nrBiomk = nrBiomkInFuncUnits * nrFuncUnits
    # biomarker -> functional-unit assignment, cycling through the units;
    # with the settings above this is [0, 1, 0, 1, 0, 1]
    mapBiomkToFuncUnits = np.array(
        list(range(nrFuncUnits)) * nrBiomkInFuncUnits)
    print('mapBiomkToFuncUnits', mapBiomkToFuncUnits)

    # params of the dysfunctional trajectories (in the disease specific model)
    dysfuncParams = np.zeros((nrFuncUnits, 4), float)
    dysfuncParams[:, 0] = 1  # ak
    dysfuncParams[:, 1] = 0.3  # bk
    dysfuncParams[:, 2] = [-3, 7]  # ck
    dysfuncParams[:, 3] = 0  # dk

    # params of individual biomarkers
    thetas = np.zeros((nrBiomk, 4), float)
    thetas[:, 0] = 1
    thetas[:, 1] = 10
    thetas[:, 3] = 0
    # within each functional unit, spread the third parameter evenly
    # over [0.2, 0.9]
    for f in range(nrFuncUnits):
        thetas[mapBiomkToFuncUnits == f, 2] = np.linspace(
            0.2, 0.9, num=nrBiomkInFuncUnits, endpoint=True)

    sigmaB = 0.1 * np.ones(nrBiomk)
    synthModel = ParHierModel.ParHierModel(dysfuncParams, thetas,
                                           mapBiomkToFuncUnits, sigmoidFunc,
                                           sigmaB)

    params = genSynthData.generateDataJMD(nrSubjLong, nrBiomk, nrTimepts,
                                          lowerAgeLim, upperAgeLim,
                                          shiftsLowerLim, shiftsUpperLim,
                                          synthModel, outFolder, fileName,
                                          forceRegenerate, params)

    plotTrajParams['diagNrs'] = np.unique(params['diag'])
    plotTrajParams['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits
    plotTrajParams['trueParams'] = params['trueParams']
    plotTrajParams['labels'] = ['b%d' % n for n in range(nrBiomk)]
    plotTrajParams['nrRowsFuncUnit'] = 2
    plotTrajParams['nrColsFuncUnit'] = 2
    # one evenly spaced hue per biomarker
    plotTrajParams['colorsTraj'] = [
        colorsys.hsv_to_rgb(hue, 1, 1)
        for hue in np.linspace(0, 1, num=nrBiomk, endpoint=False)
    ]

    # if False, plot estimated traj. in separate plot from true traj.
    plotTrajParams['allTrajOverlap'] = False

    params['runIndex'] = args.runIndex
    params['nrProc'] = args.nrProc
    params['cluster'] = args.cluster
    params['plotTrajParams'] = plotTrajParams
    params['penalty'] = args.penalty
    params['nrFuncUnits'] = nrFuncUnits
    params['mapBiomkToFuncUnits'] = mapBiomkToFuncUnits

    if forceRegenerate:
        synthPlotter = Plotter.PlotterJDM(plotTrajParams)
        fig = synthPlotter.plotTrajData(params['longData'],
                                        params['longDiag'],
                                        params['trueParams']['dpsLong'],
                                        synthModel,
                                        replaceFigMode=True)
        fig.savefig('%s/synth1GeneratedData.png' % outFolder)

    # Integer-valued penalties are written without decimals in the experiment
    # name. The absolute value must wrap the difference only (not the whole
    # comparison), otherwise negative fractional penalties pass as integers.
    if np.abs(args.penalty - int(args.penalty)) < 0.00001:
        expName = '%sPen%d' % (expName, args.penalty)
    else:
        expName = '%sPen%.1f' % (expName, args.penalty)

    params['runPartStd'] = ['L', 'R']
    # [mainPart, plot, stage]
    params['runPartMain'] = ['R', 'R', 'I']

    params['masterProcess'] = args.runIndex == 0

    modelNames, res = evaluationFramework.runModels(params, expName,
                                                    args.modelToRun,
                                                    runAllExpSynth)
Exemple #10
0
def launchTadpole(runIndex, nrProcesses, modelToRun, doProcess=False):
  """Prepare the TADPOLE data, fit the model and write a forecast CSV.

  Steps performed:
    1. optionally (``doProcess``) parse the raw TADPOLE CSV file(s) and store
       the result in a pickle checkpoint;
    2. load the checkpoint, drop biomarkers whose standard deviation is NaN in
       any of the CTL/MCI/AD groups or zero in controls, z-score the rest
       w.r.t. controls and drop rows with any |z| > 50;
    3. call ``makeBiomksDecr`` and keep only a fixed list of 12 biomarkers;
    4. fill in the module-level ``params`` / ``plotTrajParams`` dicts and call
       ``evaluationFramework.runModels`` with ``runAllExpTADPOLE``;
    5. call ``makeTadpoleForecast`` and ``writeTadpoleSubmission``.

  Reads the module-level ``args`` namespace (``leaderboard``, ``informPrior``,
  ``initClustering``, ``rangeFactor``).

  Args:
    runIndex: index of this run; index 0 is treated as the master process.
    nrProcesses: not used in this function.
    modelToRun: forwarded to ``evaluationFramework.runModels`` and stored in
      ``params['modelToRun']``.
    doProcess: if true, rebuild the checkpoint from the raw CSV files before
      loading it. Defaults to False, i.e. only load an existing checkpoint.
  """
  global params

  # NOTE: despite the .npz extension these checkpoints are pickle files
  if args.leaderboard == 0:
    outFileCheckpoint2 = 'tadpoleDf2.npz'
  else:
    outFileCheckpoint2 = 'tadpoleDf2Ldb.npz'

  if doProcess:
    inputFileDataD1D2 = '../data/ADNI/challenge_training_data/neil_repo/TADPOLE_D1_D2.csv'
    print('loading data file')
    sys.stdout.flush()
    df = pd.read_csv(inputFileDataD1D2, low_memory=False)
    df = cleanTadpoleData(df)

    # the parsing runs exactly as in the normal submission, also for leaderboard
    data, diag, labels, scanTimepts, partCode, ageAtScan, dataDf, monthsSinceRefTime, \
      examDates, predInd = parseTadpoleData(df)

    if args.leaderboard != 0:
      inputFileDataLB = '../data/ADNI/challenge_training_data/neil_repo/evaluation/TADPOLE_LB1_LB2.csv'
      dfLB = pd.read_csv(inputFileDataLB, low_memory=False)

      # keep only the visits that belong to LB1 or LB2
      filterMaskLB12 = np.logical_or(dfLB.LB1 == 1, dfLB.LB2 == 1)
      assert data.shape[0] == dfLB.shape[0]

      data = data[filterMaskLB12,:]
      diag = diag[filterMaskLB12]
      scanTimepts = scanTimepts[filterMaskLB12]
      partCode = partCode[filterMaskLB12]
      ageAtScan = ageAtScan[filterMaskLB12]
      dataDf = dataDf[filterMaskLB12].reset_index(drop=True)
      monthsSinceRefTime = monthsSinceRefTime[filterMaskLB12]
      examDates = examDates[filterMaskLB12]
      # Series.as_matrix() was removed in pandas 1.0; .values is equivalent
      predInd = dfLB.RID[dfLB.LB2 == 1].values

    dataStruct = dict(data=data, diag=diag, labels=labels, scanTimepts=scanTimepts,
      partCode=partCode, ageAtScan=ageAtScan, dataDf=dataDf,
      monthsSinceRefTime=monthsSinceRefTime, examDates=examDates, predInd=predInd)
    with open(outFileCheckpoint2, 'wb') as checkpointFile:
      pickle.dump(dataStruct, checkpointFile, protocol=pickle.HIGHEST_PROTOCOL)

  # NOTE(review): unpickling can execute arbitrary code; only load checkpoints
  # that were produced by this script.
  with open(outFileCheckpoint2, 'rb') as checkpointFile:
    dataStruct = pickle.load(checkpointFile)
  data = dataStruct['data']
  diag = dataStruct['diag']
  labels = dataStruct['labels']
  scanTimepts = dataStruct['scanTimepts']
  partCode = dataStruct['partCode']
  ageAtScan = dataStruct['ageAtScan']
  monthsSinceRefTime = dataStruct['monthsSinceRefTime']
  examDates = dataStruct['examDates']
  predInd = dataStruct['predInd']

  print('compiling parameters')
  sys.stdout.flush()

  print('diag', np.unique(diag), diag)

  # discard biomarkers whose std is NaN in any diagnostic group,
  # or zero in controls (these cannot be z-scored)
  controlInd = diag == CTL
  stdBiomk = np.nanstd(data[controlInd], 0)
  biomkMaskCTL = np.isnan(stdBiomk)
  biomkMaskAD = np.isnan(np.nanstd(data[diag == AD], 0))
  biomkMaskMCI = np.isnan(np.nanstd(data[diag == MCI], 0))
  mask = np.logical_or(np.logical_or(biomkMaskCTL, biomkMaskMCI), biomkMaskAD)
  selectedBiomk = np.logical_not(np.logical_or(mask, stdBiomk == 0))

  print(data.shape)
  data = data[:, selectedBiomk]
  labels = labels[selectedBiomk]
  pointIndices = np.arange(data.shape[1])
  print(data.shape)

  # calculate Z-scores at each point w.r.t. controls
  meanCTL = np.nanmean(data[controlInd], 0)
  stdCTL = np.nanstd(data[controlInd], 0)
  dataZ = (data - meanCTL[None,:])/stdCTL[None,:]
  data = dataZ

  # drop every row that contains at least one extreme outlier (|z| > 50)
  outlierRows, outlierCols = np.where(np.abs(dataZ) > 50)
  filterMask = np.ones(data.shape[0], bool)
  filterMask[outlierRows] = False
  data = data[filterMask]
  diag = diag[filterMask]
  scanTimepts = scanTimepts[filterMask]
  partCode = partCode[filterMask]
  ageAtScan = ageAtScan[filterMask]
  monthsSinceRefTime = monthsSinceRefTime[filterMask]
  examDates = examDates[filterMask]

  # make all biomarkers decreasing by flipping their signs if necessary
  # also perform a t-test to see which ones are most informative, sort them by pvalue (i.e. sortedByPvalInd)
  # the new data is re-scaled
  data, sortedByPvalInd, biomkScaleExtra, pVals = makeBiomksDecr(data, diag, labels)

  # multiply the scaling we did from controls with (-1) if the biomk had the sign flipped
  stdBiomkRescale = biomkScaleExtra * stdCTL

  assert(sortedByPvalInd.shape[0] == data.shape[1])

  sys.stdout.flush()

  params['data'] = data
  params['diag'] = diag
  params['scanTimepts'] = scanTimepts
  params['partCode'] = partCode
  params['ageAtScan'] = ageAtScan
  params['biomkDir'] = DECR
  params['modelToRun'] = modelToRun
  params['datasetFull'] = 'tadpole'
  params['labels'] = labels
  params['predInd'] = predInd
  params['examDates'] = examDates

  print('outFileCheckpoint2', outFileCheckpoint2)
  print('d2Ind', np.unique(predInd), np.unique(predInd).shape)

  # A 100-participant subsample used to be drawn here for quick testing. The
  # subsample itself is currently unused, but the seed and the draw are kept
  # so that the global NumPy RNG state seen by the code below is unchanged.
  unqPartCode = np.unique(params['partCode'])
  nrPartToSample = 100
  np.random.seed(3)
  np.random.choice(unqPartCode, nrPartToSample)

  # keep only a fixed subset of biomarkers
  biomkToKeep = (
    b'FDG', b'AV45', b'CDRSB', b'ADAS13', b'Ventricles',
    b'Hippocampus', b'WholeBrain', b'Entorhinal', b'MidTemp', b'ABETA_UPENNBIOMK9_04_19_17',
    b'TAU_UPENNBIOMK9_04_19_17', b'PTAU_UPENNBIOMK9_04_19_17',
  )
  indices = [i for i, label in enumerate(labels) if label in biomkToKeep]

  print('labels', labels[indices])
  print(np.nanstd(data,axis=0)[indices])
  data = params['data'][:,indices]
  params['data'] = data
  labels = labels[indices]
  params['labels'] = labels
  nrBiomk = params['data'].shape[1]
  print('data.shape', params['data'].shape)
  meanCTL = meanCTL[indices]
  stdBiomkRescale = stdBiomkRescale[indices]
  print(stdBiomkRescale)
  print('flippedBiomk', labels[stdBiomkRescale < 0])
  # re-rank the p-value ordering among the biomarkers that were kept
  sortedByPvalInd = np.argsort(np.argsort(sortedByPvalInd[indices]))

  # nearest-neighbour / initial-clustering index maps are set to the identity
  # over the kept biomarkers; no adjacency list is provided
  plotTrajParams['nearestNeighbours'] = np.arange(nrBiomk)
  params['adjList'] = np.nan
  params['nearNeighInitClust'] = np.arange(nrBiomk)
  params['initClustSubsetInd'] = np.arange(nrBiomk)
  params['meanBiomkRescale'] = meanCTL # for rescaling back if necessary
  params['stdBiomkRescale'] = stdBiomkRescale
  params['fixSpeed'] = True # if true then don't model progression speed, only time shift

  sys.stdout.flush()
  assert(params['data'].shape[0] == params['diag'].shape[0] ==
    params['scanTimepts'].shape[0] == params['partCode'].shape[0] ==
    params['ageAtScan'].shape[0])

  # sets an uninformative or informative prior
  priorNr = setPrior(params, args.informPrior, mean_gamma_alpha=1,
    std_gamma_alpha=0.3, mu_beta=0, std_beta=5)

  suffix = ''
  if args.leaderboard:
    suffix = 'Ldb'

  expName = 'tadpoleInit%sCl%dPr%dRa%d%s' % (args.initClustering, params['nrClust'],
    priorNr, args.rangeFactor, suffix)
  plotTrajParams['sortedByPvalInd'] = sortedByPvalInd
  plotTrajParams['pointIndices'] = pointIndices
  plotTrajParams['expName'] = expName
  plotTrajParams['ageTransform'] = (0, 1) # no age normalisation was necessary
  plotTrajParams['datasetFull'] = params['datasetFull']
  plotTrajParams['labels'] = labels

  params['plotTrajParams'] = plotTrajParams

  # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
  params['runPartStd'] = ['L', 'L', 'I', 'I', 'I']
  params['runPartMain'] = ['R', 'I', 'I']  # [mainPart, plot, stage]
  params['runPartCogCorr'] = ['I']
  params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'L']
  params['runPartDirDiag'] = ['R', 'R', 'I']
  params['runPartStaging'] = ['L', 'L', 'I']
  params['runPartDiffDiag'] = ['R', 'R', 'I']
  params['runPartConvPred'] = ['I', 'I', 'I']
  params['runPartCVNonOverlap'] = ['R']
  params['runPartCVNonOverlapMain'] = ['L', 'L', 'I', 'I', 'L']
  params['masterProcess'] = runIndex == 0

  # the master process (runIndex 0) uses its own run-part configuration
  if params['masterProcess']:
    # [initClust, modelFit, AIC/BIC, blender, theta_sampling]
    params['runPartStd'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartMain'] = ['R', 'R', 'R']  # [mainPart, plot, stage]
    params['runPartCogCorr'] = ['I']
    params['runPartCogCorrMain'] = ['L', 'L', 'I', 'I', 'I']
    params['runPartDirDiag'] = ['R', 'R', 'I']
    params['runPartStaging'] = ['L', 'L', 'I']
    params['runPartDiffDiag'] = ['R', 'R', 'I']
    params['runPartConvPred'] = ['I', 'I', 'I']
    params['runPartCVNonOverlap'] = ['I']
    params['runPartCVNonOverlapMain'] = ['R', 'R', 'I', 'R', 'R']

  runAllExpFunc = runAllExpTADPOLE
  modelNames, res = evaluationFramework.runModels(params, expName, modelToRun, runAllExpFunc)

  # now generate forecast
  print('Generating forecast ... ')
  teamName = 'DIVE6'
  if args.leaderboard:
    outputFile = 'TADPOLE_Submission_Leaderboard_%s.csv' % teamName
    predStartDate = datetime.date(2010, 5, 1)
    nrYearsToPred = 7
  else:
    outputFile = 'TADPOLE_Submission_%s.csv' % teamName
    predStartDate = datetime.date(2018, 1, 1)
    nrYearsToPred = 5
  nrMonthsToPred = 12*nrYearsToPred

  resCurrModel = res[0]['std']

  predAdasAllSubj, predVentsAllSubj, predDiagAllSubj = makeTadpoleForecast(predStartDate,
    nrYearsToPred, nrMonthsToPred, resCurrModel, params)

  # write forecast to file
  writeTadpoleSubmission(predAdasAllSubj, predVentsAllSubj, predDiagAllSubj, outputFile,
    nrMonthsToPred, predStartDate, params)