Exemple #1
0
def LoadUKBBData(
        training_data_fraction,
        dis_index,
        filepath='path/to/clinical/data/UKBB_HPO.pth',
        exclude_path='path/to/clinical/data/ukb_withdrawn_current.txt',
        sampler_path='path/to/clinical/samplers'):
    """Load a clinical dataset and the stored sampler for one disease.

    The dataset is read from ``filepath``. A sampler conditioned on
    ``dis_index`` is built, restored from its stored file and converted to
    an unconditional sampler. If any of those steps raises ``KeyError``, a
    sampler without diagnosis conditioning is built and restored from the
    same file instead. Finally, the samples listed in ``exclude_path`` are
    dropped from the sampler.

    Args:
        training_data_fraction: Passed through to ``ClinicalDatasetSampler``.
        dis_index: Disease identifier string; ``':'`` is replaced by ``'_'``
            to form the stored sampler's file name.
        filepath: Location of the stored clinical dataset.
        exclude_path: Header-less CSV whose first column lists the samples
            to drop (presumably withdrawn participants, judging by the
            default file name -- confirm).
        sampler_path: Directory that holds the ``Sampler_<id>`` files. A
            trailing path separator is optional.

    Returns:
        A ``(clinData, sampler)`` tuple.
    """
    # Local import: the file's top-level import block is not visible here.
    import os

    clinData = ClinicalDataset()
    clinData.ReadFromDisk(filepath)

    # os.path.join adds a separator only when one is missing, so the default
    # directory (which has no trailing slash) no longer fuses with the file
    # name, while callers that pass 'dir/' get the same path as before.
    sampler_file = os.path.join(sampler_path,
                                'Sampler_' + dis_index.replace(':', '_'))

    try:
        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         conditionSamplingOnDx=[dis_index],
                                         returnArrays='Torch')
        sampler.ReadFromDisk(sampler_file)
        sampler.ConvertToUnconditional()
    except KeyError:
        # NOTE(review): presumably raised when dis_index is not a diagnosis
        # in this dataset -- confirm. Fall back to an unconditioned sampler.
        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         returnArrays='Torch')
        sampler.ReadFromDisk(sampler_file)

    # The first CSV column becomes the index; its values are the sample
    # identifiers handed to DropSamples.
    excluded = np.array(
        pd.read_csv(exclude_path, header=None, index_col=0).index)
    sampler.DropSamples(excluded)
    return clinData, sampler
# Read the clinical dataset from disk and keep only the HPO terms annotated
# to the disease.
clinData = ClinicalDataset()
clinData.ReadFromDisk('path/to/clinical/record/dataset')
annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID']
clinData.IncludeOnly(annotated_terms)

# Cap the model rank at (number of annotated HPO terms - 1).
rank = min(rank, len(annotated_terms) - 1)

# Restore the stored dataset sampler for this disease.
sampler = ClinicalDatasetSampler(clinData,
                                 training_data_fraction,
                                 conditionSamplingOnDx=[dis_index],
                                 returnArrays='Torch')
sampler_file = ('path/to/clinical/dataset/samplers/' + 'Sampler_' +
                dis_index.replace(':', '_'))
sampler.ReadFromDisk(sampler_file)

# Choose the covariates: 'ALL' leaves the sampler as loaded, 'NULL' selects
# none, and anything else is read as a comma-separated list of names.
if covariate_set != 'ALL':
    selected_covariates = ([] if covariate_set == 'NULL' else
                           covariate_set.split(','))
    sampler.SubsetCovariates(selected_covariates)

# Continue only when no model file for this trial exists yet in the Models
# directory, i.e. the model has not been fit before.
model_file = 'trialNum_' + trial + '.pth'
model_direc = direcPrefix + outputFileDirec + '/Models/'
if model_file not in os.listdir(model_direc):

    sampler.ConvertToUnconditional()
    validation_sampler = sampler.GenerateValidationSampler(validation_fraction)
Exemple #3
0
    # Use the revised disease-to-term annotations when the two
    # 'Revised ... Converged' columns of this disease's model-table row sum
    # to a positive value; otherwise use the original annotations.
    if model_table.loc[dis_index][[
            'Revised Converged [0.02, 2000]',
            'Revised Increase LR Converged [0.05, 4000]'
    ]].sum() > 0:
        annotated_terms = revised_dis_to_term.loc[dis_index]['HPO_ICD10_ID']
    else:
        annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID']
    max_rank = model_table.loc[dis_index]['Rank']
    clinData.IncludeOnly(annotated_terms)

    # load the sampler
    # NOTE(review): 'path/to/samplers' has no trailing separator, so the
    # file name is appended directly to it -- confirm the real path.
    sampler = ClinicalDatasetSampler(clinData,
                                     training_data_fraction,
                                     conditionSamplingOnDx=[dis_index],
                                     returnArrays='Torch')
    sampler.ReadFromDisk('path/to/samplers' + 'Sampler_' +
                         dis_index.replace(':', '_'))

    # set the covariates
    # Every branch now tests the covariate string stored in the model table.
    # The 'ALL' check previously read the unrelated name `covariate_set`,
    # which could disagree with the table value that is actually split below.
    model_covariates = model_table.loc[dis_index]['Covariates']
    if model_covariates == 'NULL':
        sampler.SubsetCovariates([])
    elif model_covariates != 'ALL':
        sampler.SubsetCovariates(model_covariates.split(','))

    # load the top performing model
    sampler.ConvertToUnconditional()
    bestVLPIModel = vLPI(sampler, max_rank)
    bestVLPIModel.LoadModel('/path/to/models/' + dis_index.replace(':', '_') +
                            '.pth')

    #compute the latent phenotypes and perplexities for the top performing model
Exemple #4
0
    # Fetch this disease's row of the model table once; the annotated term
    # sets and maximum model ranks below are all read from it.
    disease_row = model_table.loc[dis_index]
    annotated_terms_ucsf = disease_row['Annotated HPO Terms']
    annotated_terms_ukbb = disease_row['Annotated HPO Terms UKBB']
    max_rank_ucsf = disease_row['UCSF Max. Model Rank']
    max_rank_ukbb = disease_row['UKBB Max. Model Rank']

    # Restrict each dataset to its annotated terms. The UCSF/UKBB dataset
    # and the UKBB dataset both use the UKBB term set.
    ucsfDataset_HPO.IncludeOnly(annotated_terms_ucsf)
    ucsfDataset_UKBB.IncludeOnly(annotated_terms_ukbb)
    ukbbDataset.IncludeOnly(annotated_terms_ukbb)

    # Sampler over the UCSF dataset restricted to the UCSF term set.
    sampler_hpo = ClinicalDatasetSampler(ucsfDataset_HPO,
                                         training_data_fraction,
                                         conditionSamplingOnDx=[dis_index],
                                         returnArrays='Torch')
    # Both UCSF samplers are restored from the same stored sampler file.
    ucsf_sampler_file = ('path/to/samplers/UCSF/' + 'Sampler_' +
                         dis_index.replace(':', '_'))
    sampler_hpo.ReadFromDisk(ucsf_sampler_file)

    # Sampler over the UCSF dataset restricted to the UKBB term set.
    sampler_ucsf_ukbb = ClinicalDatasetSampler(
        ucsfDataset_UKBB,
        training_data_fraction,
        conditionSamplingOnDx=[dis_index],
        returnArrays='Torch')
    sampler_ucsf_ukbb.ReadFromDisk(ucsf_sampler_file)

    if model_table.loc[dis_index]['Covariate Set'] == 'NULL':
        sampler_hpo.SubsetCovariates([])
        sampler_ucsf_ukbb.SubsetCovariates([])
        ukbb_sampler.SubsetCovariates([])
    elif covariate_set != 'ALL':
        sampler_hpo.SubsetCovariates(