def LoadUKBBData(
        training_data_fraction,
        dis_index,
        filepath='path/to/clinical/data/UKBB_HPO.pth',
        exclude_path='path/to/clinical/data/ukb_withdrawn_current.txt',
        sampler_path='path/to/clinical/samplers'):
    """Load the stored UKBB dataset and its sampler for one disease.

    The dataset is read from ``filepath``. A sampler conditioned on
    ``dis_index`` is built, restored from the stored sampler file and then
    converted to an unconditional sampler. If any of those steps raises
    ``KeyError``, a plain (unconditioned) sampler is built and restored from
    the same file instead. Finally, every subject ID found in the first
    column of ``exclude_path`` is dropped from the sampler.

    Parameters
    ----------
    training_data_fraction
        Passed straight through to ``ClinicalDatasetSampler``.
    dis_index : str
        Disease identifier; ':' is replaced by '_' to form the sampler
        file name.
    filepath : str
        Location of the stored ``ClinicalDataset``.
    exclude_path : str
        Header-less table whose first column lists the samples to drop.
    sampler_path : str
        Prefix that is concatenated directly with ``'Sampler_<dis_index>'``
        (no path separator is inserted).

    Returns
    -------
    tuple
        ``(dataset, sampler)``.
    """
    ukbb_dataset = ClinicalDataset()
    ukbb_dataset.ReadFromDisk(filepath)

    # Both branches below restore the sampler from the same file.
    sampler_file = sampler_path + 'Sampler_' + dis_index.replace(':', '_')

    try:
        ukbb_sampler = ClinicalDatasetSampler(
            ukbb_dataset,
            training_data_fraction,
            conditionSamplingOnDx=[dis_index],
            returnArrays='Torch')
        ukbb_sampler.ReadFromDisk(sampler_file)
        ukbb_sampler.ConvertToUnconditional()
    except KeyError:
        # Fall back to a sampler that is not conditioned on the diagnosis.
        ukbb_sampler = ClinicalDatasetSampler(
            ukbb_dataset, training_data_fraction, returnArrays='Torch')
        ukbb_sampler.ReadFromDisk(sampler_file)

    # Remove every subject listed in the exclusion file.
    exclusion_table = pd.read_csv(exclude_path, header=None, index_col=0)
    ukbb_sampler.DropSamples(np.array(exclusion_table.index))

    return ukbb_dataset, ukbb_sampler
# Create the per-disease output directory and its 'Models' subdirectory,
# tolerating directories that already exist.
outputFileDirec = 'MendelianDiseaseIndex_' + args.dis_index.replace(':', '_')
for _new_direc in (direcPrefix + outputFileDirec,
                   direcPrefix + outputFileDirec + '/Models'):
    try:
        os.mkdir(_new_direc)
    except FileExistsError:
        pass

# Read the HPO terms from disk.
dis_to_term = pd.read_pickle(input_hpo_file)

# Load the dataset from disk and keep only the HPO terms annotated to the
# disease.
clinData = ClinicalDataset()
clinData.ReadFromDisk('path/to/clinical/record/dataset')
annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID']
clinData.IncludeOnly(annotated_terms)

# Keep the maximum model rank below the number of annotated HPO terms.
rank = min(rank, len(annotated_terms) - 1)

# Load the stored dataset sampler.
sampler = ClinicalDatasetSampler(
    clinData,
    training_data_fraction,
    conditionSamplingOnDx=[dis_index],
    returnArrays='Torch')
sampler.ReadFromDisk(
    'path/to/clinical/dataset/samplers/' + 'Sampler_'
    + dis_index.replace(':', '_'))
tree=aggClustIntance.children_ new_order=recursive_sort(tree,distMat.shape[0],2*distMat.shape[0]-2) sorted_distMat = distMat[new_order] sorted_distMat=sorted_distMat[:,new_order] return sorted_distMat,new_order dis_to_term = pd.read_pickle(input_hpo_file) allowed_diseases = [x.strip() for x in open('path/to/list/of/diseases').readlines()] dis_names=pd.read_csv('path/to/diseaes/names',sep='\t') dis_names.set_index('Disease ID', drop=True, inplace=True) clinData=ClinicalDataset() clinData.ReadFromDisk('path/to/clinical/dataset') results_table={'OMIM_ICD_ID':[],'Avg Component Weighted R^2':[],'Component Weighted R^2 Matrix':[],'Cluster Labels':[],'Num Replicates, Top Model':[],'Meets Criteria':[]} try: os.mkdir(output_file_prefix+'_Figures') except FileExistsError: pass for dis_index in set(allowed_diseases).intersection(dis_to_term.index): try: print('Computing matrix similarities for '+dis_index) sampler=ClinicalDatasetSampler(clinData,training_data_fraction,conditionSamplingOnDx = [dis_index],returnArrays='Torch')
def _ensure_directory(path):
    """Create directory ``path`` (and missing parents) unless it exists."""
    os.makedirs(path, exist_ok=True)


def _load_or_download(loader, local_path, url, download_dir, message):
    """Return ``loader(local_path)``, fetching the file first if it is missing.

    ``loader`` is tried on ``local_path``. If it raises
    ``FileNotFoundError``, ``message`` is printed, ``url`` is downloaded into
    ``download_dir`` with ``wget`` and ``loader`` is called a second time.
    """
    try:
        return loader(local_path)
    except FileNotFoundError:
        print(message)
        wget.download(url, out=download_dir)
        return loader(local_path)


def _read_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # NOTE(review): unpickling executes arbitrary code and these files may
    # have just been downloaded; only use model sources that are trusted.
    with open(path, 'rb') as f:
        return pickle.load(f)


def main():
    """Command-line entry point: impute one cryptic phenotype into a dataset.

    Positional arguments are the ICD encoding ('ICD10-CM' or 'ICD10-UKBB'),
    the input data file, the cryptic phenotype code (or 'KEY' to print the
    phenotype table and exit) and the output file. Model files that are not
    found locally are downloaded into the model directory. The result is
    written to the output file as a tab-separated table indexed by
    'Subject_ID' with one column named after the cryptic phenotype.

    Invalid encodings or phenotype codes terminate the program through
    ``parser.error`` (usage message, exit status 2).
    """
    #fixed data loaded into memory
    dis_table = pd.read_csv(DATA_PATH + "TargetDiseaseCodes.txt",
                            sep='\t',
                            index_col="CODE")
    ukbb_model_table = pd.read_pickle(DATA_PATH + "ICD10-UKBB_ModelTable.pth")

    # Shared by the positional argument's help text and the validation error.
    phenotype_help = (
        "Disease cryptic phenotype to be imputed. Must be in the following "
        "list: {0:s}. To see a key for the cryptic phenotypes, provide the "
        "argument KEY instead.").format(', '.join(list(dis_table.index)))

    parser = argparse.ArgumentParser(
        description=
        'Imputes the cryptic phenotypes analyzed in Blair et al. 2020 into arbitrary clinical datasets.'
    )
    parser.add_argument(
        "encoding",
        help="ICD encoding. Must be either 'ICD10-CM' or 'ICD10-UKBB'.",
        type=str)
    parser.add_argument(
        "datafile",
        help=
        "Path to the datafile containing the clinical information. Note, the software expects a tab-delimitted text file with two columns. The first column contains a unique ID for every subject. The second column contains a comma-separated list of diagnosed ICD10 codes. DO NOT include a header.",
        type=str)
    parser.add_argument("cryptic_phenotype", help=phenotype_help, type=str)
    parser.add_argument("output_file",
                        help="Path to the output file.",
                        type=str)
    # The original help text here was a copy of the cryptic_phenotype help.
    parser.add_argument(
        "--use_best",
        help=
        "For the ICD10-UKBB encoding, impute with the single top-performing model instead of averaging the predictions of the bagged models. Has no effect for the ICD10-CM encoding.",
        action="store_true")
    parser.add_argument(
        "--model_path",
        help=
        "By default, the program downlads and saves models to the same directory as the software package. This might not be allowed in all settings, so you can specify an alternative path to store models using this option.",
        type=str)

    args = parser.parse_args()

    if args.cryptic_phenotype == 'KEY':
        print(dis_table)
        sys.exit()

    # Explicit checks instead of assert, which is removed under ``python -O``.
    if args.encoding not in ('ICD10-CM', 'ICD10-UKBB'):
        parser.error(
            "Encoding not recognized. Please use 'ICD10-CM' or 'ICD10-UKBB'.")
    if args.cryptic_phenotype not in dis_table.index:
        parser.error(phenotype_help)

    disease_code = dis_table.loc[args.cryptic_phenotype]['OMIM_HPO_ID']
    disease_tag = disease_code.replace(':', '_')

    #initialize the ClinicalDataset class
    if args.encoding == 'ICD10-CM':
        currentClinicalDataset = ClinicalDataset()
    else:
        currentClinicalDataset = ClinicalDataset(ICDFilePaths=[
            ICD_PATH + 'icd10_ukbb.txt', ICD_PATH + 'ICD10_Chapters.txt'
        ])

    #read the dataset into memory
    currentClinicalDataset.ReadDatasetFromFile(args.datafile,
                                               1,
                                               indexColumn=0,
                                               hasHeader=False,
                                               chunkSize=50000)

    #set up the model directories if they do not already exist
    if args.model_path is not None:
        if not args.model_path:
            parser.error("--model_path must not be an empty string.")
        MODEL_PATH = args.model_path
        if not MODEL_PATH.endswith('/'):
            MODEL_PATH += '/'
    else:
        MODEL_PATH = pkg_resources.resource_filename('CrypticPhenoImpute',
                                                     'Models/')

    _ensure_directory(MODEL_PATH)
    _ensure_directory(MODEL_PATH + 'ICD10UKBB_Models')
    _ensure_directory(MODEL_PATH + 'ICD10CM_Models')

    remote_models = "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/"

    #if using ICD10-CM, use the vlpi model directly. Requires translating from ICD10-CM into HPO terms
    if args.encoding == 'ICD10-CM':
        #load the HPO term table
        hpo_table = pd.read_csv(DATA_PATH + "HPOTable.txt",
                                sep='\t',
                                index_col="HPO_ICD10_ID")
        model_table = pd.read_csv(DATA_PATH + "ModelTable.txt",
                                  sep='\t',
                                  index_col="Disease ID")

        disease_hpo = model_table.loc[disease_code][
            'Annotated HPO Terms'].split(',')
        hpo_icd10_map = {
            hpo: hpo_table.loc[hpo]['ICD10'].split(';')
            for hpo in disease_hpo
        }
        # Invert the map: every ICD10 code points to the HPO terms listing it.
        icd10_HPO_map = {}
        for hpo, icd_codes in hpo_icd10_map.items():
            for icd in icd_codes:
                icd10_HPO_map.setdefault(icd, []).append(hpo)

        currentClinicalDataset.ConstructNewDataArray(icd10_HPO_map)
        sampler = ClinicalDatasetSampler(currentClinicalDataset, 0.5)

        vlpi_model = vLPI(sampler,
                          model_table.loc[disease_code]['Max. Model Rank'])

        cm_dir = MODEL_PATH + 'ICD10CM_Models/'
        _load_or_download(
            vlpi_model.LoadModel,
            cm_dir + '{0:s}.pth'.format(disease_tag),
            remote_models + 'ICD10CM_Models/{0:s}.pth'.format(disease_tag),
            cm_dir,
            "\nDownloading model files from GitHub.")
        model_hpo_index = _load_or_download(
            _read_pickle,
            cm_dir + '{0:s}_Index.pth'.format(disease_tag),
            remote_models +
            'ICD10CM_Models/{0:s}_Index.pth'.format(disease_tag),
            cm_dir,
            "\nDownloading index files from GitHub.")

        ######## This code corrects variations in the order in which symptoms are stored that occurred between an earlier and the current version of the ClinicalDataset class
        ######## Clearly, this is less than ideal, but it wasn't worth refitting all of the models for this small change in storage that could be corrected.
        symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()
        new_order = [
            currentClinicalDataset.dxCodeToDataIndexMap[x]
            for x in model_hpo_index.keys()
        ]
        symptom_array = (symptom_array.tocsr()[:, new_order]).tocoo()
        ########
        ########

        cp = vlpi_model.ComputeEmbeddings(
            dataArrays=(symptom_array,
                        []))[:, model_table.loc[disease_code]['Top Component']]

    # use the ICD10-UKBB encoding
    else:
        disease_dir = MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(disease_tag)
        remote_disease = remote_models + 'ICD10UKBB_Models/{0:s}'.format(
            disease_tag)
        _ensure_directory(disease_dir)

        if args.use_best:
            top_dir = disease_dir + '/TopModel'
            _ensure_directory(top_dir)

            features = _load_or_download(
                lambda path: pd.read_csv(path, sep='\t', header=None),
                disease_dir + '/TopModelFeatures.txt',
                remote_disease + '/TopModelFeatures.txt',
                disease_dir,
                "\nDownloading feature file.")
            currentClinicalDataset.IncludeOnly(features[0].values)
            symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()

            top_model = ukbb_model_table.loc[disease_code]['Top_Model']
            model_dict = _load_or_download(
                _read_pickle,
                top_dir + '/{0:s}'.format(top_model),
                remote_disease + '/TopModel/{0:s}'.format(top_model),
                top_dir,
                "\nDownloading top performing imputation model.")
            cp = model_dict['Model'].predict(symptom_array)

        else:
            bagged_dir = disease_dir + '/BaggedModels'
            _ensure_directory(bagged_dir)

            features = _load_or_download(
                lambda path: pd.read_csv(path, sep='\t'),
                disease_dir + '/BaggedModelFeatures.txt',
                remote_disease + '/BaggedModelFeatures.txt',
                disease_dir,
                "\nDownloading feature file.")
            currentClinicalDataset.IncludeOnly(features['ICD10'].values)
            symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()

            all_models = ukbb_model_table.loc[disease_code]['Bagged_Models']
            num_models = 0
            cp = np.zeros((currentClinicalDataset.numPatients))
            # Average the predictions of every bagged model.
            for model_string in all_models:
                model_dict = _load_or_download(
                    _read_pickle,
                    bagged_dir + '/{0:s}'.format(model_string),
                    remote_disease +
                    '/BaggedModels/{0:s}'.format(model_string),
                    bagged_dir,
                    "\nDownloading bagged model: {0:s}.".format(model_string))
                cp += model_dict['Model'].predict(symptom_array)
                num_models += 1
            cp /= num_models

    # Both encodings write the same table layout.
    output_table = pd.DataFrame({
        'Subject_ID': currentClinicalDataset.data.index,
        args.cryptic_phenotype: cp
    })
    output_table.set_index('Subject_ID', inplace=True, drop=True)
    output_table.to_csv(args.output_file, sep='\t')
# Round the x-axis limits outward to whole numbers and hide the right and
# top spines.
xmin = np.floor(cryptic_phenotypes.min())
xmax = np.ceil(cryptic_phenotypes.max())
ax.set_xlim(xmin, xmax)
for _spine in ('right', 'top'):
    ax.spines[_spine].set_visible(False)

# Stacked, log-scaled density histogram: carriers first, then controls.
ax.hist(
    [
        cryptic_phenotypes[has_rare_disease == 1],
        cryptic_phenotypes[has_rare_disease == 0],
    ],
    log=True,
    stacked=True,
    density=True,
    color=[color_list[0], grey_color],
    label=['Pathogenic Genotype Carriers', 'Control Population'],
)
ax.legend(loc='best', frameon=False, fontsize=20)
ax.set_xlabel('Cryptic\n' + r'Phenotype ($\mathbf{Z}$)',
              fontsize=40,
              fontweight='bold')
ax.set_ylabel('Density\n' + r'$\log_{10}$-Scale',
              fontsize=40,
              fontweight='bold')
plt.savefig('cryptic_pheno_sim.svg')
plt.close()

#
#
clinData = ClinicalDataset()

# Build an arbitrary list of disease codes: the first num_symptoms+1 codes
# of the dataset's code map.
disList = list(clinData.dxCodeToDataIndexMap.keys())[0:num_symptoms + 1]

# Load the simulated data into the clinical dataset; the target diagnosis is
# appended as the last column of the incidence data.
clinData.IncludeOnly(disList)
clinData.LoadFromArrays(
    torch.cat(
        [
            simData['incidence_data'],
            simData['target_dis_dx'].reshape(-1, 1),
        ],
        axis=1),
    simData['covariate_data'],
    [],
    catCovDicts=None,
    arrayType='Torch')
clinData.ConditionOnDx([disList[-1]])

sampler = ClinicalDatasetSampler(
    clinData,
    training_data_fraction,
    returnArrays='Torch',
    conditionSamplingOnDx=[disList[-1]])
sampler.ConvertToUnconditional()

# Reuse a stored model when one exists; otherwise fit a new one.
vlpiModel = vLPI(sampler, inf_rank)
try:
    vlpiModel.LoadModel('IllustrativeExample.pth')
except FileNotFoundError:
    inference_output = vlpiModel.FitModel(batch_size=200,
                                          errorTol=(1.0 / num_samples))
'../UCSF-EffectiveRankTopComponents/FinalModels_UCSFPerformanceResults.pth' ) # 1) Identify top component as highest R^2 between UKBB and UCSF model (training dataset) # 2) Compute and store regression model statistics # 3) Compute Severity in UCSF cases along with p-value # 4) Validate severity in UKBB if ICD10 dx codes available if dis_index.replace( ':', '_') + '_UKBBPerformanceResults.pth' not in os.listdir(output_direc): # Step 1) Identify top components from UKBB model in UCSF and UKBB datasets #load the data ucsfDataset_HPO = ClinicalDataset() ucsfDataset_HPO.ReadFromDisk( 'path/to/clinical/data/UCSF_MendelianDisease_HPO.pth') ucsfDataset_UKBB = ClinicalDataset() ucsfDataset_UKBB.ReadFromDisk( 'path/to/clinical/data/UCSF_MendelianDisease_UKBB_HPO.pth') ukbbDataset, ukbb_sampler = LoadUKBBData(training_data_fraction, dis_index) annotated_terms_ucsf = model_table.loc[dis_index]['Annotated HPO Terms'] annotated_terms_ukbb = model_table.loc[dis_index][ 'Annotated HPO Terms UKBB'] max_rank_ucsf = model_table.loc[dis_index]['UCSF Max. Model Rank'] max_rank_ukbb = model_table.loc[dis_index]['UKBB Max. Model Rank']
torch.manual_seed(1023)

## Simulation parameters
numberOfSamples = 50000
numberOfSymptoms = 20
rareDiseaseFrequency = 0.001
numLatentPhenotypes = 2

## Simulate the data
simulator = ClinicalDataSimulator(
    numberOfSymptoms, numLatentPhenotypes, rareDiseaseFrequency)
simulatedData = simulator.GenerateClinicalData(numberOfSamples)

## Load the data into a ClinicalDataset class
clinicalData = ClinicalDataset()

# Change the dataset to have only 20 symptoms (it defaults to the full
# ICD10-CM codebook), named after the first 20 letters.
allICDCodes = list(clinicalData.dxCodeToDataIndexMap.keys())
symptomConversionMap = {
    icd_code: letter
    for icd_code, letter in zip(allICDCodes[:numberOfSymptoms],
                                string.ascii_uppercase[:numberOfSymptoms])
}
clinicalData.ConstructNewDataArray(symptomConversionMap)

# Now load into the data structure. Note: when used in this manner, it is the
# user's responsibility to make sure that the input data columns match the
# data columns of the ClinicalDataset.
clinicalData.LoadFromArrays(
    simulatedData['incidence_data'],
    simulatedData['covariate_data'],
    [],
    catCovDicts=None,
    arrayType='Torch')

## Now load the ClinicalDataset into a ClinicalDatasetSampler
training_data_fraction = 0.75
sampler = ClinicalDatasetSampler(
    clinicalData, training_data_fraction, returnArrays='Torch')

## Instantiate the models
infNumberOfLatentPhenotypes = 10