def LoadUKBBData(training_data_fraction,
                 dis_index,
                 filepath='path/to/clinical/data/UKBB_HPO.pth',
                 exclude_path='path/to/clinical/data/ukb_withdrawn_current.txt',
                 sampler_path='path/to/clinical/samplers'):
    """Load the UKBB clinical dataset and the stored sampler for one disease.

    Parameters
    ----------
    training_data_fraction:
        Passed straight through to ``ClinicalDatasetSampler``.
    dis_index:
        Disease identifier string. Any ``':'`` is replaced by ``'_'`` to form
        the sampler file name ``Sampler_<id>``.
    filepath:
        File handed to ``ClinicalDataset.ReadFromDisk``.
    exclude_path:
        Header-less CSV whose first column lists the sample IDs that are
        dropped from the sampler.
    sampler_path:
        Directory holding the stored sampler files. A trailing separator is
        optional.

    Returns
    -------
    tuple
        ``(clinData, sampler)``.
    """
    # Function-scope import so this block does not depend on the module
    # header already importing ``os``.
    import os

    clinData = ClinicalDataset()
    clinData.ReadFromDisk(filepath)

    # Build the path once. os.path.join inserts the separator only when
    # ``sampler_path`` lacks one, so both 'dir' and 'dir/' resolve to
    # 'dir/Sampler_<id>' (the old bare concatenation produced
    # 'dirSampler_<id>' for the default argument).
    sampler_file = os.path.join(sampler_path,
                                'Sampler_' + dis_index.replace(':', '_'))

    try:
        # Preferred path: sampler conditioned on the disease diagnosis, then
        # converted to an unconditional sampler.
        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         conditionSamplingOnDx=[dis_index],
                                         returnArrays='Torch')
        sampler.ReadFromDisk(sampler_file)
        sampler.ConvertToUnconditional()
    except KeyError:
        # Fallback: build the sampler without conditioning.
        # NOTE(review): presumably the KeyError means ``dis_index`` is not a
        # diagnosis code in this dataset -- confirm against the sampler class.
        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         returnArrays='Torch')
        sampler.ReadFromDisk(sampler_file)

    # The index of the exclusion file (its first column) lists the samples
    # to remove.
    excluded = np.array(
        pd.read_csv(exclude_path, header=None, index_col=0).index)
    sampler.DropSamples(excluded)
    return clinData, sampler
# read the hpo terms from disk dis_to_term = pd.read_pickle(input_hpo_file) #load the dataset from disk, include only the HPO terms annotated to the the disease clinData = ClinicalDataset() clinData.ReadFromDisk('path/to/clinical/record/dataset') annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID'] clinData.IncludeOnly(annotated_terms) #make sure the maximum rank of the model is less than the number of annotated HPO terms if (len(annotated_terms) - 1) < rank: rank = len(annotated_terms) - 1 ## load the stored dataset sampler sampler = ClinicalDatasetSampler(clinData, training_data_fraction, conditionSamplingOnDx=[dis_index], returnArrays='Torch') sampler.ReadFromDisk('path/to/clinical/dataset/samplers/' + 'Sampler_' + dis_index.replace(':', '_')) #set the covariates if covariate_set == 'NULL': sampler.SubsetCovariates([]) elif covariate_set != 'ALL': sampler.SubsetCovariates(covariate_set.split(',')) #make sure the model hasn't been fit before. If not, then fit it and write to disk. if 'trialNum_' + trial + '.pth' not in os.listdir(direcPrefix + outputFileDirec + '/Models/'):
clinData=ClinicalDataset() clinData.ReadFromDisk('path/to/clinical/dataset') results_table={'OMIM_ICD_ID':[],'Avg Component Weighted R^2':[],'Component Weighted R^2 Matrix':[],'Cluster Labels':[],'Num Replicates, Top Model':[],'Meets Criteria':[]} try: os.mkdir(output_file_prefix+'_Figures') except FileExistsError: pass for dis_index in set(allowed_diseases).intersection(dis_to_term.index): try: print('Computing matrix similarities for '+dis_index) sampler=ClinicalDatasetSampler(clinData,training_data_fraction,conditionSamplingOnDx = [dis_index],returnArrays='Torch') sampler.ReadFromDisk('path/to/dataset/samplers'+'Sampler_'+dis_index.replace(':','_')) sampler.ConvertToUnconditional() all_procrustes_scores=[] procrustes_score_matrix = np.ones((num_trials,num_trials)) for trial_pair in itertools.combinations(range(1,num_trials+1), 2): vlpi_1=vLPI(sampler,max_rank) vlpi_1.LoadModel('path/to/latent/pheno/models/'+input_direc+'MendelianDiseaseIndex_'+dis_index.replace(':','_')+'/Models/trialNum_'+str(trial_pair[0])+'.pth') vlpi_2=vLPI(sampler,max_rank) vlpi_2.LoadModel('path/to/latent/pheno/models/'+input_direc+'MendelianDiseaseIndex_'+dis_index.replace(':','_')+'/Models/trialNum_'+str(trial_pair[1])+'.pth') risk_matrix_1=vlpi_1.ReturnComponents() risk_matrix_2=vlpi_2.ReturnComponents()
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``.

    Raises ``FileNotFoundError`` (from ``open``) when the file is missing.
    """
    # NOTE(security): pickle.load can execute arbitrary code. The files read
    # here are model files downloaded from the project's GitHub repository;
    # never point the model directory at files from an untrusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _load_or_download(loader, local_path, url, download_dir, notice):
    """Return ``loader(local_path)``, downloading the file first if missing.

    ``loader`` is tried once. If it raises ``FileNotFoundError``, ``notice``
    is printed, ``url`` is downloaded into ``download_dir`` with
    ``wget.download``, and ``loader`` is tried a second time (a second
    failure propagates to the caller).
    """
    try:
        return loader(local_path)
    except FileNotFoundError:
        print(notice)
        wget.download(url, out=download_dir)
        return loader(local_path)


def main():
    """Command-line entry point: impute a cryptic phenotype into a dataset.

    Steps, in order:

    1. Parse the command line. The special phenotype ``KEY`` prints the
       phenotype table and exits.
    2. Read the clinical data file into a ``ClinicalDataset``.
    3. Load the required model files from the model directory, downloading
       any that are missing from the project's GitHub repository.
    4. Compute the phenotype scores and write a tab-separated table
       (index ``Subject_ID``, one column named after the phenotype) to the
       requested output file.

    With the ``ICD10-CM`` encoding the vLPI model is applied directly after
    mapping ICD10 codes to HPO terms. With ``ICD10-UKBB`` either the single
    top model (``--use_best``) or the mean over all bagged models is used.

    Invalid ``encoding`` / ``cryptic_phenotype`` values are reported through
    ``parser.error`` (usage message on stderr, exit status 2) rather than
    ``assert``, which is stripped when Python runs with ``-O``.
    """
    #fixed data loaded into memory
    dis_table = pd.read_csv(DATA_PATH + "TargetDiseaseCodes.txt",
                            sep='\t',
                            index_col="CODE")
    ukbb_model_table = pd.read_pickle(DATA_PATH + "ICD10-UKBB_ModelTable.pth")

    # Shared by the positional argument's help text and the validation error.
    phenotype_help = (
        "Disease cryptic phenotype to be imputed. Must be in the following list: {0:s}. To see a key for the cryptic phenotypes, provide the argument KEY instead."
        .format(', '.join(list(dis_table.index))))

    parser = argparse.ArgumentParser(
        description=
        'Imputes the cryptic phenotypes analyzed in Blair et al. 2020 into arbitrary clinical datasets.'
    )
    parser.add_argument(
        "encoding",
        help="ICD encoding. Must be either 'ICD10-CM' or 'ICD10-UKBB'.",
        type=str)
    parser.add_argument(
        "datafile",
        help=
        "Path to the datafile containing the clinical information. Note, the software expects a tab-delimitted text file with two columns. The first column contains a unique ID for every subject. The second column contains a comma-separated list of diagnosed ICD10 codes. DO NOT include a header.",
        type=str)
    parser.add_argument("cryptic_phenotype", help=phenotype_help, type=str)
    parser.add_argument("output_file",
                        help="Path to the output file.",
                        type=str)
    # The original help text here was a copy of the cryptic_phenotype help;
    # it now describes what the flag actually does.
    parser.add_argument(
        "--use_best",
        help=
        "Use only the top performing imputation model instead of averaging the predictions of the bagged models. Only affects the 'ICD10-UKBB' encoding.",
        action="store_true")
    parser.add_argument(
        "--model_path",
        help=
        "By default, the program downlads and saves models to the same directory as the software package. This might not be allowed in all settings, so you can specify an alternative path to store models using this option.",
        type=str)

    args = parser.parse_args()

    if args.cryptic_phenotype == 'KEY':
        print(dis_table)
        sys.exit()

    if args.encoding not in ('ICD10-CM', 'ICD10-UKBB'):
        parser.error(
            "Encoding not recognized. Please use 'ICD10-CM' or 'ICD10-UKBB'.")
    if args.cryptic_phenotype not in dis_table.index:
        parser.error(phenotype_help)

    disease_code = dis_table.loc[args.cryptic_phenotype]['OMIM_HPO_ID']
    # File-system / URL friendly form of the disease code.
    code_tag = disease_code.replace(':', '_')

    #initialize the ClinicalDataset class
    if args.encoding == 'ICD10-CM':
        currentClinicalDataset = ClinicalDataset()
    else:
        currentClinicalDataset = ClinicalDataset(ICDFilePaths=[
            ICD_PATH + 'icd10_ukbb.txt', ICD_PATH + 'ICD10_Chapters.txt'
        ])

    #read the dataset into memory
    currentClinicalDataset.ReadDatasetFromFile(args.datafile,
                                               1,
                                               indexColumn=0,
                                               hasHeader=False,
                                               chunkSize=50000)

    #set up the model directories if they do not already exist
    if args.model_path is not None:
        model_root = args.model_path
        # endswith() also copes with an empty string, where indexing [-1]
        # raised IndexError.
        if not model_root.endswith('/'):
            model_root += '/'
    else:
        model_root = pkg_resources.resource_filename('CrypticPhenoImpute',
                                                     'Models/')

    # makedirs also creates model_root itself when it is missing.
    os.makedirs(model_root + 'ICD10UKBB_Models', exist_ok=True)
    os.makedirs(model_root + 'ICD10CM_Models', exist_ok=True)

    models_url = "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/"

    #if using ICD10-CM, use the vlpi model directly. Requires translating from ICD10-CM into HPO terms
    if args.encoding == 'ICD10-CM':
        #load the HPO term table
        hpo_table = pd.read_csv(DATA_PATH + "HPOTable.txt",
                                sep='\t',
                                index_col="HPO_ICD10_ID")
        model_table = pd.read_csv(DATA_PATH + "ModelTable.txt",
                                  sep='\t',
                                  index_col="Disease ID")
        model_row = model_table.loc[disease_code]

        disease_hpo = model_row['Annotated HPO Terms'].split(',')
        hpo_icd10_map = {
            hpo: hpo_table.loc[hpo]['ICD10'].split(';')
            for hpo in disease_hpo
        }
        # Invert HPO -> [ICD10] into ICD10 -> [HPO].
        icd10_HPO_map = {}
        for hpo, icd_codes in hpo_icd10_map.items():
            for icd in icd_codes:
                icd10_HPO_map.setdefault(icd, []).append(hpo)

        currentClinicalDataset.ConstructNewDataArray(icd10_HPO_map)
        sampler = ClinicalDatasetSampler(currentClinicalDataset, 0.5)
        vlpi_model = vLPI(sampler, model_row['Max. Model Rank'])

        cm_dir = model_root + 'ICD10CM_Models/'
        _load_or_download(
            vlpi_model.LoadModel,
            cm_dir + '{0:s}.pth'.format(code_tag),
            models_url + 'ICD10CM_Models/{0:s}.pth'.format(code_tag),
            cm_dir,
            "\nDownloading model files from GitHub.")
        model_hpo_index = _load_or_download(
            _unpickle,
            cm_dir + '{0:s}_Index.pth'.format(code_tag),
            models_url + 'ICD10CM_Models/{0:s}_Index.pth'.format(code_tag),
            cm_dir,
            "\nDownloading index files from GitHub.")

        ######## This code corrects variations in the order in which symptoms are stored that occurred between an earlier and the current version of the ClinicalDataset class
        ######## Clearly, this is less than ideal, but it wasn't worth refitting all of the models for this small change in storage that could be corrected.
        symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()
        new_order = [
            currentClinicalDataset.dxCodeToDataIndexMap[x]
            for x in model_hpo_index.keys()
        ]
        symptom_array = (symptom_array.tocsr()[:, new_order]).tocoo()
        ########
        ########

        cp = vlpi_model.ComputeEmbeddings(
            dataArrays=(symptom_array, []))[:, model_row['Top Component']]

    # use the ICD10-UKBB encoding
    else:
        ukbb_dir = model_root + 'ICD10UKBB_Models/{0:s}'.format(code_tag)
        ukbb_url = models_url + 'ICD10UKBB_Models/{0:s}'.format(code_tag)
        os.makedirs(ukbb_dir, exist_ok=True)

        if args.use_best:
            top_dir = ukbb_dir + '/TopModel'
            os.makedirs(top_dir, exist_ok=True)

            # Header-less feature file; the codes are in column 0.
            features = _load_or_download(
                lambda path: pd.read_csv(path, sep='\t', header=None),
                ukbb_dir + '/TopModelFeatures.txt',
                ukbb_url + '/TopModelFeatures.txt',
                ukbb_dir,
                "\nDownloading feature file.")
            currentClinicalDataset.IncludeOnly(features[0].values)
            symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()

            top_model_name = ukbb_model_table.loc[disease_code]['Top_Model']
            model_dict = _load_or_download(
                _unpickle,
                top_dir + '/{0:s}'.format(top_model_name),
                ukbb_url + '/TopModel/{0:s}'.format(top_model_name),
                top_dir,
                "\nDownloading top performing imputation model.")
            cp = model_dict['Model'].predict(symptom_array)
        else:
            bagged_dir = ukbb_dir + '/BaggedModels'
            os.makedirs(bagged_dir, exist_ok=True)

            # This feature file has a header row; codes are in column 'ICD10'.
            features = _load_or_download(
                lambda path: pd.read_csv(path, sep='\t'),
                ukbb_dir + '/BaggedModelFeatures.txt',
                ukbb_url + '/BaggedModelFeatures.txt',
                ukbb_dir,
                "\nDownloading feature file.")
            currentClinicalDataset.IncludeOnly(features['ICD10'].values)
            symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()

            all_models = ukbb_model_table.loc[disease_code]['Bagged_Models']
            num_models = 0
            cp = np.zeros((currentClinicalDataset.numPatients))
            for model_string in all_models:
                model_dict = _load_or_download(
                    _unpickle,
                    bagged_dir + '/{0:s}'.format(model_string),
                    ukbb_url + '/BaggedModels/{0:s}'.format(model_string),
                    bagged_dir,
                    "\nDownloading bagged model: {0:s}.".format(model_string))
                cp += model_dict['Model'].predict(symptom_array)
                num_models += 1

            # Dividing by zero would silently write a column of NaN.
            if num_models == 0:
                raise ValueError(
                    "No bagged models are listed for {0:s}.".format(
                        disease_code))
            # Average the predictions of the bagged models.
            cp /= num_models

    # Both encodings write the same table layout.
    output_table = pd.DataFrame({
        'Subject_ID': currentClinicalDataset.data.index,
        args.cryptic_phenotype: cp
    })
    output_table.set_index('Subject_ID', inplace=True, drop=True)
    output_table.to_csv(args.output_file, sep='\t')
clinData.ReadFromDisk('path/to/clinical/dataset') #set the clinical dataset to only include annotated symptoms if model_table.loc[dis_index][[ 'Revised Converged [0.02, 2000]', 'Revised Increase LR Converged [0.05, 4000]' ]].sum() > 0: annotated_terms = revised_dis_to_term.loc[dis_index]['HPO_ICD10_ID'] else: annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID'] max_rank = model_table.loc[dis_index]['Rank'] clinData.IncludeOnly(annotated_terms) #load the sampler sampler = ClinicalDatasetSampler(clinData, training_data_fraction, conditionSamplingOnDx=[dis_index], returnArrays='Torch') sampler.ReadFromDisk('path/to/samplers' + 'Sampler_' + dis_index.replace(':', '_')) #set the covariates if model_table.loc[dis_index]['Covariates'] == 'NULL': sampler.SubsetCovariates([]) elif covariate_set != 'ALL': sampler.SubsetCovariates( model_table.loc[dis_index]['Covariates'].split(',')) #load the top peforming model sampler.ConvertToUnconditional() bestVLPIModel = vLPI(sampler, max_rank) bestVLPIModel.LoadModel('/path/to/models/' + dis_index.replace(':', '_') +
'Revised Increase LR Converged [0.05, 4000]' ]].sum() > 0: annotated_terms = revised_dis_to_term.loc[dis_index]['HPO_ICD10_ID'] else: annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID'] #set the max rank of the model max_rank = model_table.loc[dis_index]['Rank'] #make a local copy of the clinical data, inlude on the annotated symptoms localClinData = copy.deepcopy(clinData) localClinData.IncludeOnly(annotated_terms) #read the sampler for the disease from disk sampler = ClinicalDatasetSampler(localClinData, training_data_fraction, conditionSamplingOnDx=[dis_index], returnArrays='Torch') sampler.ReadFromDisk('path/to/samplers/' + 'Sampler_' + dis_index.replace(':', '_')) #set the covariates if model_table.loc[dis_index]['Covariates'] == 'NULL': sampler.SubsetCovariates([]) elif covariate_set != 'ALL': sampler.SubsetCovariates( model_table.loc[dis_index]['Covariates'].split(',')) #load the best latent phenotype model sampler.ConvertToUnconditional() bestVLPIModel = vLPI(sampler, max_rank) bestVLPIModel.LoadModel('path/to/best/models/' +
# ---- finish the simulated cryptic-phenotype figure and write it to disk ----
ax.legend(loc='best', frameon=False, fontsize=20)
ax.set_xlabel('Cryptic\n' + r'Phenotype ($\mathbf{Z}$)',
              fontsize=40,
              fontweight='bold')
ax.set_ylabel('Density\n' + r'$\log_{10}$-Scale',
              fontsize=40,
              fontweight='bold')
plt.savefig('cryptic_pheno_sim.svg')
plt.close()

# ---- load the simulated data into a ClinicalDataset ----
clinData = ClinicalDataset()

# Arbitrary code list: the first num_symptoms + 1 codes of the default
# codebook. The last entry is the one used for conditioning below.
disList = list(clinData.dxCodeToDataIndexMap.keys())[:num_symptoms + 1]
target_dx = disList[-1]
clinData.IncludeOnly(disList)

# Incidence columns first, then the target diagnosis reshaped to a single
# column and concatenated along axis 1.
dx_matrix = torch.cat(
    [simData['incidence_data'], simData['target_dis_dx'].reshape(-1, 1)],
    axis=1)
clinData.LoadFromArrays(dx_matrix,
                        simData['covariate_data'],
                        [],
                        catCovDicts=None,
                        arrayType='Torch')
clinData.ConditionOnDx([target_dx])

# ---- sampler: conditioned on the target diagnosis, then made unconditional --
sampler = ClinicalDatasetSampler(clinData,
                                 training_data_fraction,
                                 returnArrays='Torch',
                                 conditionSamplingOnDx=[target_dx])
sampler.ConvertToUnconditional()

# ---- model: reuse the packaged fit if it exists, otherwise fit and save ----
vlpiModel = vLPI(sampler, inf_rank)
try:
    vlpiModel.LoadModel('IllustrativeExample.pth')
except FileNotFoundError:
    inference_output = vlpiModel.FitModel(batch_size=200,
                                          errorTol=1.0 / num_samples)
    vlpiModel.PackageModel('IllustrativeExample.pth')

# Embeddings computed from the raw simulated arrays.
inferredCrypticPhenotypes = vlpiModel.ComputeEmbeddings(
    (simData['incidence_data'], simData['covariate_data']))
riskFunction = vlpiModel.ReturnComponents().ravel()

# Embeddings computed from the dataset's sparse matrix (rows requested in
# sorted index order, no covariates), flattened.
sorted_subject_index = clinData.data.index.sort_values()
latentPhenos = vlpiModel.ComputeEmbeddings(
    dataArrays=(clinData.ReturnSparseDataMatrix(sorted_subject_index),
                [])).ravel()

# ---- start the next figure ----
fig, ax = plt.subplots(figsize=(10, 8))
fig.tight_layout(pad=2)
# UKBB dataset and sampler for this disease (default paths of LoadUKBBData).
ukbbDataset, ukbb_sampler = LoadUKBBData(training_data_fraction, dis_index)

# Per-disease settings read from the model table.
dis_model_row = model_table.loc[dis_index]
annotated_terms_ucsf = dis_model_row['Annotated HPO Terms']
annotated_terms_ukbb = dis_model_row['Annotated HPO Terms UKBB']
max_rank_ucsf = dis_model_row['UCSF Max. Model Rank']
max_rank_ukbb = dis_model_row['UKBB Max. Model Rank']

# Restrict every dataset to its annotated terms: the UCSF HPO dataset uses
# the UCSF term list; both UKBB-encoded datasets use the UKBB term list.
ucsfDataset_HPO.IncludeOnly(annotated_terms_ucsf)
ucsfDataset_UKBB.IncludeOnly(annotated_terms_ukbb)
ukbbDataset.IncludeOnly(annotated_terms_ukbb)

# Both UCSF samplers are read from the same stored sampler file.
ucsf_sampler_file = ('path/to/samplers/UCSF/' + 'Sampler_' +
                     dis_index.replace(':', '_'))

sampler_hpo = ClinicalDatasetSampler(ucsfDataset_HPO,
                                     training_data_fraction,
                                     conditionSamplingOnDx=[dis_index],
                                     returnArrays='Torch')
sampler_hpo.ReadFromDisk(ucsf_sampler_file)

sampler_ucsf_ukbb = ClinicalDatasetSampler(ucsfDataset_UKBB,
                                           training_data_fraction,
                                           conditionSamplingOnDx=[dis_index],
                                           returnArrays='Torch')
sampler_ucsf_ukbb.ReadFromDisk(ucsf_sampler_file)

# A 'NULL' covariate set means the UCSF samplers use no covariates.
if model_table.loc[dis_index]['Covariate Set'] == 'NULL':
    sampler_hpo.SubsetCovariates([])
    sampler_ucsf_ukbb.SubsetCovariates([])
numLatentPhenotypes = 2

# ---- simulate the data ----
simulator = ClinicalDataSimulator(numberOfSymptoms,
                                  numLatentPhenotypes,
                                  rareDiseaseFrequency)
simulatedData = simulator.GenerateClinicalData(numberOfSamples)
incidence_array = simulatedData['incidence_data']
covariate_array = simulatedData['covariate_data']

# ---- load the data into a ClinicalDataset ----
clinicalData = ClinicalDataset()

# Shrink the dataset from its default codebook to numberOfSymptoms symptoms:
# the first numberOfSymptoms codes are renamed to the leading upper-case
# letters. (string.ascii_uppercase holds 26 letters, so at most 26 codes can
# be mapped this way.)
allICDCodes = list(clinicalData.dxCodeToDataIndexMap.keys())
icd_subset = allICDCodes[:numberOfSymptoms]
letter_names = string.ascii_uppercase[:numberOfSymptoms]
symptomConversionMap = {
    icd_code: letter for icd_code, letter in zip(icd_subset, letter_names)
}
clinicalData.ConstructNewDataArray(symptomConversionMap)

# Load the simulated arrays. When loading this way it is the caller's
# responsibility to make sure the input columns line up with the columns of
# the ClinicalDataset.
clinicalData.LoadFromArrays(incidence_array,
                            covariate_array,
                            [],
                            catCovDicts=None,
                            arrayType='Torch')

# ---- wrap the dataset in a sampler ----
training_data_fraction = 0.75
sampler = ClinicalDatasetSampler(clinicalData,
                                 training_data_fraction,
                                 returnArrays='Torch')

# ---- instantiate, fit and package the model ----
infNumberOfLatentPhenotypes = 10
vlpiModel = vLPI(sampler, infNumberOfLatentPhenotypes)
inference_output = vlpiModel.FitModel(batch_size=1000,
                                      errorTol=1.0 / numberOfSamples)
vlpiModel.PackageModel('ExampleModel.pth')

# ---- read results back out of the fitted model ----
inferredCrypticPhenotypes = vlpiModel.ComputeEmbeddings(
    (incidence_array, covariate_array))
riskFunction = vlpiModel.ReturnComponents()