Code example #1
# Imports assumed for this snippet; ClinicalDataset and ClinicalDatasetSampler
# follow the vlpi package layout.
import numpy as np
import pandas as pd
from vlpi.data.ClinicalDataset import ClinicalDataset, ClinicalDatasetSampler


def LoadUKBBData(
        training_data_fraction,
        dis_index,
        filepath='path/to/clinical/data/UKBB_HPO.pth',
        exclude_path='path/to/clinical/data/ukb_withdrawn_current.txt',
        sampler_path='path/to/clinical/samplers/'):
    """Load the UK Biobank ClinicalDataset and its stored sampler, dropping
    subjects who have withdrawn from the study. Note the trailing slash on
    sampler_path: the sampler filename is concatenated onto it below."""
    clinData = ClinicalDataset()
    clinData.ReadFromDisk(filepath)

    # Try to restore a sampler conditioned on the disease dx (then convert it
    # to unconditional); fall back to a plain sampler if conditioning fails.
    try:
        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         conditionSamplingOnDx=[dis_index],
                                         returnArrays='Torch')
        sampler.ReadFromDisk(sampler_path + 'Sampler_' +
                             dis_index.replace(':', '_'))
        sampler.ConvertToUnconditional()
    except KeyError:
        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         returnArrays='Torch')
        sampler.ReadFromDisk(sampler_path + 'Sampler_' +
                             dis_index.replace(':', '_'))

    excluded = np.array(
        pd.read_csv(exclude_path, header=None, index_col=0).index)
    sampler.DropSamples(excluded)
    return clinData, sampler
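
A minimal usage sketch (hypothetical; disease indices follow the 'PREFIX:ID'
pattern implied by the dis_index.replace(':', '_') calls above):

clinData, sampler = LoadUKBBData(training_data_fraction=0.75,
                                 dis_index='OMIM_ICD:10')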
Code example #2
# Script fragment: direcPrefix, args, input_hpo_file, dis_index, rank, and
# training_data_fraction are defined earlier in the original script.
outputFileDirec = 'MendelianDiseaseIndex_' + args.dis_index.replace(':', '_')
try:
    os.mkdir(direcPrefix + outputFileDirec)
except FileExistsError:
    pass

try:
    os.mkdir(direcPrefix + outputFileDirec + '/Models')
except FileExistsError:
    pass
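
# (equivalent one-liner for the two blocks above:
#  os.makedirs(direcPrefix + outputFileDirec + '/Models', exist_ok=True))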

# read the hpo terms from disk
dis_to_term = pd.read_pickle(input_hpo_file)

#load the dataset from disk, including only the HPO terms annotated to the disease
clinData = ClinicalDataset()
clinData.ReadFromDisk('path/to/clinical/record/dataset')
annotated_terms = dis_to_term.loc[dis_index]['HPO_ICD10_ID']
clinData.IncludeOnly(annotated_terms)

#make sure the maximum rank of the model is less than the number of annotated HPO terms
if (len(annotated_terms) - 1) < rank:
    rank = len(annotated_terms) - 1

## load the stored dataset sampler
sampler = ClinicalDatasetSampler(clinData,
                                 training_data_fraction,
                                 conditionSamplingOnDx=[dis_index],
                                 returnArrays='Torch')
sampler.ReadFromDisk('path/to/clinical/dataset/samplers/' + 'Sampler_' +
                     dis_index.replace(':', '_'))
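
The snippet ends once the sampler is restored; a plausible next step, mirroring
the other examples on this page, is to fit a vLPI model at the capped rank:

vlpi_model = vLPI(sampler, rank)
inference_output = vlpi_model.FitModel(batch_size=1000)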
Code example #3

# NOTE: the original header and the opening of this function were lost in
# extraction; the wrapper signature below is a hypothetical reconstruction.
def sort_distance_matrix(distMat, aggClustIntance):
    tree = aggClustIntance.children_
    new_order = recursive_sort(tree, distMat.shape[0], 2 * distMat.shape[0] - 2)
    sorted_distMat = distMat[new_order]
    sorted_distMat = sorted_distMat[:, new_order]
    return sorted_distMat, new_order
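
# `recursive_sort` itself is not shown in the extracted snippet. One plausible
# implementation, returning the leaves under `node` in dendrogram order
# (leaves are numbered 0..n_leaves-1; internal node k corresponds to row
# k - n_leaves of sklearn's children_ array):
def recursive_sort(tree, n_leaves, node):
    if node < n_leaves:
        return [node]
    left, right = tree[node - n_leaves]
    return (recursive_sort(tree, n_leaves, left)
            + recursive_sort(tree, n_leaves, right))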




dis_to_term = pd.read_pickle(input_hpo_file)

allowed_diseases = [x.strip() for x in open('path/to/list/of/diseases').readlines()]
dis_names = pd.read_csv('path/to/disease/names', sep='\t')
dis_names.set_index('Disease ID', drop=True, inplace=True)

clinData = ClinicalDataset()
clinData.ReadFromDisk('path/to/clinical/dataset')

results_table = {'OMIM_ICD_ID': [],
                 'Avg Component Weighted R^2': [],
                 'Component Weighted R^2 Matrix': [],
                 'Cluster Labels': [],
                 'Num Replicates, Top Model': [],
                 'Meets Criteria': []}
try:
    os.mkdir(output_file_prefix+'_Figures')
except FileExistsError:
    pass

for dis_index in set(allowed_diseases).intersection(dis_to_term.index):
    try:
        print('Computing matrix similarities for '+dis_index)

        sampler = ClinicalDatasetSampler(clinData,
                                         training_data_fraction,
                                         conditionSamplingOnDx=[dis_index],
                                         returnArrays='Torch')
Code example #4
# Imports assumed from the CrypticPhenoImpute package context; DATA_PATH and
# ICD_PATH are module-level constants defined elsewhere in the package.
import argparse
import os
import pickle
import sys

import numpy as np
import pandas as pd
import pkg_resources
import wget
from vlpi.data.ClinicalDataset import ClinicalDataset, ClinicalDatasetSampler
from vlpi.vLPI import vLPI


def main():

    #fixed data loaded into memory
    dis_table = pd.read_csv(DATA_PATH + "TargetDiseaseCodes.txt",
                            sep='\t',
                            index_col="CODE")
    ukbb_model_table = pd.read_pickle(DATA_PATH + "ICD10-UKBB_ModelTable.pth")

    parser = argparse.ArgumentParser(
        description=
        'Imputes the cryptic phenotypes analyzed in Blair et al. 2020 into arbitrary clinical datasets.'
    )

    parser.add_argument(
        "encoding",
        help="ICD encoding. Must be either 'ICD10-CM' or 'ICD10-UKBB'.",
        type=str)

    parser.add_argument(
        "datafile",
        help=
        "Path to the datafile containing the clinical information. Note, the software expects a tab-delimitted text file with two columns. The first column contains a unique ID for every subject. The second column contains a comma-separated list of diagnosed ICD10 codes. DO NOT include a header.",
        type=str)

    parser.add_argument(
        "cryptic_phenotype",
        help=
        "Disease cryptic phenotype to be imputed. Must be in the following list: {0:s}. To see a key for the cryptic phenotypes, provide the argument KEY instead."
        .format(', '.join(list(dis_table.index))),
        type=str)

    parser.add_argument("output_file",
                        help="Path to the output file.",
                        type=str)

    parser.add_argument(
        "--use_best",
        help=
        "Impute using only the single top-performing model rather than averaging over the bagged ensemble (applies to the ICD10-UKBB encoding).",
        action="store_true")

    parser.add_argument(
        "--model_path",
        help=
        "By default, the program downlads and saves models to the same directory as the software package. This might not be allowed in all settings, so you can specify an alternative path to store models using this option.",
        type=str)

    args = parser.parse_args()

    if args.cryptic_phenotype == 'KEY':
        print(dis_table)
        sys.exit()

    assert args.encoding in [
        'ICD10-CM', 'ICD10-UKBB'
    ], "Encoding not recognized. Please use 'ICD10-CM' or 'ICD10-UKBB'."
    assert args.cryptic_phenotype in dis_table.index, "Cryptic phenotype not recognized. Must be in the following list: {0:s}. To see a key for the cryptic phenotypes, provide the argument KEY instead.".format(
        ', '.join(list(dis_table.index)))
    disease_code = dis_table.loc[args.cryptic_phenotype]['OMIM_HPO_ID']

    #initialize the ClinicalDataset class
    if args.encoding == 'ICD10-CM':
        currentClinicalDataset = ClinicalDataset()
    else:
        currentClinicalDataset = ClinicalDataset(ICDFilePaths=[
            ICD_PATH + 'icd10_ukbb.txt', ICD_PATH + 'ICD10_Chapters.txt'
        ])

    #read the dataset into memory
    currentClinicalDataset.ReadDatasetFromFile(args.datafile,
                                               1,
                                               indexColumn=0,
                                               hasHeader=False,
                                               chunkSize=50000)

    #set up the model directories if they do not already exist
    if args.model_path is not None:
        MODEL_PATH = args.model_path
        if MODEL_PATH[-1] != '/':
            MODEL_PATH += '/'
    else:
        MODEL_PATH = pkg_resources.resource_filename('CrypticPhenoImpute',
                                                     'Models/')

    try:
        os.mkdir(MODEL_PATH)
    except FileExistsError:
        pass

    try:
        os.mkdir(MODEL_PATH + 'ICD10UKBB_Models')
    except FileExistsError:
        pass

    try:
        os.mkdir(MODEL_PATH + 'ICD10CM_Models')
    except FileExistsError:
        pass

    #if using ICD10-CM, use the vlpi model directly. Requires translating from ICD10-CM into HPO terms

    if args.encoding == 'ICD10-CM':
        #load the HPO term table
        hpo_table = pd.read_csv(DATA_PATH + "HPOTable.txt",
                                sep='\t',
                                index_col="HPO_ICD10_ID")
        model_table = pd.read_csv(DATA_PATH + "ModelTable.txt",
                                  sep='\t',
                                  index_col="Disease ID")

        disease_hpo = model_table.loc[disease_code][
            'Annotated HPO Terms'].split(',')
        hpo_icd10_map = {
            hpo: hpo_table.loc[hpo]['ICD10'].split(';')
            for hpo in disease_hpo
        }

        # invert the HPO -> ICD10 map into ICD10 -> HPO (an ICD code can map
        # to multiple HPO terms)
        icd10_HPO_map = {}
        for key, value in hpo_icd10_map.items():
            for icd in value:
                try:
                    icd10_HPO_map[icd] += [key]
                except KeyError:
                    icd10_HPO_map[icd] = [key]

        currentClinicalDataset.ConstructNewDataArray(icd10_HPO_map)

        sampler = ClinicalDatasetSampler(currentClinicalDataset, 0.5)

        vlpi_model = vLPI(sampler,
                          model_table.loc[disease_code]['Max. Model Rank'])

        try:
            vlpi_model.LoadModel(MODEL_PATH +
                                 'ICD10CM_Models/{0:s}.pth'.format(
                                     disease_code.replace(':', '_')))
        except FileNotFoundError:
            print("\nDownloading model files from GitHub.")
            wget.download(
                "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10CM_Models/{0:s}.pth"
                .format(disease_code.replace(':', '_')),
                out=MODEL_PATH + 'ICD10CM_Models/')
            vlpi_model.LoadModel(MODEL_PATH +
                                 'ICD10CM_Models/{0:s}.pth'.format(
                                     disease_code.replace(':', '_')))

        try:
            with open(
                    MODEL_PATH + 'ICD10CM_Models/{0:s}_Index.pth'.format(
                        disease_code.replace(':', '_')), 'rb') as f:
                model_hpo_index = pickle.load(f)
        except FileNotFoundError:
            print("\nDownloading index files from GitHub.")
            wget.download(
                "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10CM_Models/{0:s}_Index.pth"
                .format(disease_code.replace(':', '_')),
                out=MODEL_PATH + 'ICD10CM_Models/')
            with open(
                    MODEL_PATH + 'ICD10CM_Models/{0:s}_Index.pth'.format(
                        disease_code.replace(':', '_')), 'rb') as f:
                model_hpo_index = pickle.load(f)

        ######## This block corrects for a change in the order in which symptoms are stored between an earlier and the current version of the ClinicalDataset class.
        ######## Clearly less than ideal, but it wasn't worth refitting all of the models for this small change in storage.
        symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()
        new_order = [
            currentClinicalDataset.dxCodeToDataIndexMap[x]
            for x in model_hpo_index.keys()
        ]
        symptom_array = (symptom_array.tocsr()[:, new_order]).tocoo()
        ########
        ########

        cp = vlpi_model.ComputeEmbeddings(
            dataArrays=(symptom_array,
                        []))[:, model_table.loc[disease_code]['Top Component']]
        output_table = pd.DataFrame({
            'Subject_ID': currentClinicalDataset.data.index,
            args.cryptic_phenotype: cp
        })
        output_table.set_index('Subject_ID', inplace=True, drop=True)
        output_table.to_csv(args.output_file, sep='\t')

    # use the ICD10-UKBB encoding
    else:
        try:
            os.mkdir(MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                disease_code.replace(':', '_')))
        except FileExistsError:
            pass

        if args.use_best:
            try:
                os.mkdir(MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                    disease_code.replace(':', '_') + '/TopModel'))
            except FileExistsError:
                pass

            try:
                features = pd.read_csv(
                    MODEL_PATH +
                    'ICD10UKBB_Models/{0:s}/TopModelFeatures.txt'.format(
                        disease_code.replace(':', '_')),
                    sep='\t',
                    header=None)
            except FileNotFoundError:
                print("\nDownloading feature file.")
                wget.download(
                    "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10UKBB_Models/{0:s}/TopModelFeatures.txt"
                    .format(disease_code.replace(':', '_')),
                    out=MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                        disease_code.replace(':', '_')))
                features = pd.read_csv(
                    MODEL_PATH +
                    'ICD10UKBB_Models/{0:s}/TopModelFeatures.txt'.format(
                        disease_code.replace(':', '_')),
                    sep='\t',
                    header=None)

            currentClinicalDataset.IncludeOnly(features[0].values)
            symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()

            try:
                with open(
                        MODEL_PATH +
                        'ICD10UKBB_Models/{0:s}/TopModel/{1:s}'.format(
                            disease_code.replace(':', '_'),
                            ukbb_model_table.loc[disease_code]['Top_Model']),
                        'rb') as f:
                    model_dict = pickle.load(f)
            except FileNotFoundError:
                print("\nDownloading top performing imputation model.")
                wget.download(
                    "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10UKBB_Models/{0:s}/TopModel/{1:s}"
                    .format(disease_code.replace(':', '_'),
                            ukbb_model_table.loc[disease_code]['Top_Model']),
                    out=MODEL_PATH + 'ICD10UKBB_Models/{0:s}/TopModel'.format(
                        disease_code.replace(':', '_')))
                with open(
                        MODEL_PATH +
                        'ICD10UKBB_Models/{0:s}/TopModel/{1:s}'.format(
                            disease_code.replace(':', '_'),
                            ukbb_model_table.loc[disease_code]['Top_Model']),
                        'rb') as f:
                    model_dict = pickle.load(f)
            cp = model_dict['Model'].predict(symptom_array)

        else:
            try:
                os.mkdir(MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                    disease_code.replace(':', '_') + '/BaggedModels'))
            except FileExistsError:
                pass

            try:
                features = pd.read_csv(
                    MODEL_PATH +
                    'ICD10UKBB_Models/{0:s}/BaggedModelFeatures.txt'.format(
                        disease_code.replace(':', '_')),
                    sep='\t')
            except FileNotFoundError:
                print("\nDownloading feature file.")
                wget.download(
                    "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10UKBB_Models/{0:s}/BaggedModelFeatures.txt"
                    .format(disease_code.replace(':', '_')),
                    out=MODEL_PATH + 'ICD10UKBB_Models/{0:s}'.format(
                        disease_code.replace(':', '_')))
                features = pd.read_csv(
                    MODEL_PATH +
                    'ICD10UKBB_Models/{0:s}/BaggedModelFeatures.txt'.format(
                        disease_code.replace(':', '_')),
                    sep='\t')

            currentClinicalDataset.IncludeOnly(features['ICD10'].values)
            symptom_array = currentClinicalDataset.ReturnSparseDataMatrix()

            all_models = ukbb_model_table.loc[disease_code]['Bagged_Models']
            num_models = 0
            cp = np.zeros(currentClinicalDataset.numPatients)
            for model_string in all_models:
                try:
                    with open(
                            MODEL_PATH +
                            'ICD10UKBB_Models/{0:s}/BaggedModels/{1:s}'.format(
                                disease_code.replace(':', '_'), model_string),
                            'rb') as f:
                        model_dict = pickle.load(f)
                except FileNotFoundError:
                    print("\nDownloading bagged model: {0:s}.".format(
                        model_string))
                    wget.download(
                        "https://raw.githubusercontent.com/daverblair/CrypticPhenoImpute/master/CrypticPhenoImpute/Models/ICD10UKBB_Models/{0:s}/BaggedModels/{1:s}"
                        .format(disease_code.replace(':', '_'), model_string),
                        out=MODEL_PATH +
                        'ICD10UKBB_Models/{0:s}/BaggedModels'.format(
                            disease_code.replace(':', '_')))
                    with open(
                            MODEL_PATH +
                            'ICD10UKBB_Models/{0:s}/BaggedModels/{1:s}'.format(
                                disease_code.replace(':', '_'), model_string),
                            'rb') as f:
                        model_dict = pickle.load(f)
                cp += model_dict['Model'].predict(symptom_array)
                num_models += 1
            cp /= num_models

        output_table = pd.DataFrame({
            'Subject_ID': currentClinicalDataset.data.index,
            args.cryptic_phenotype: cp
        })
        output_table.set_index('Subject_ID', inplace=True, drop=True)
        output_table.to_csv(args.output_file, sep='\t')
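
A hypothetical command-line invocation (assuming the package installs main() as a
console script named CrypticPhenoImpute; <CODE> stands for one of the codes the
KEY argument prints):

CrypticPhenoImpute ICD10-CM subject_dx.txt KEY key_out.txt       # print the code key and exit
CrypticPhenoImpute ICD10-UKBB subject_dx.txt <CODE> imputed.txt --use_best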
Code example #5
# Imports assumed for this snippet; `cryptic_phenotypes`, `has_rare_disease`,
# `simData`, the matplotlib axes `ax`, and the color variables are defined
# earlier in the original script.
import numpy as np
import torch
import matplotlib.pyplot as plt
from vlpi.data.ClinicalDataset import ClinicalDataset, ClinicalDatasetSampler
from vlpi.vLPI import vLPI

xmin = np.floor(cryptic_phenotypes.min())
xmax = np.ceil(cryptic_phenotypes.max())
ax.set_xlim(xmin, xmax)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.hist([cryptic_phenotypes[has_rare_disease == 1],
         cryptic_phenotypes[has_rare_disease == 0]],
        log=True, stacked=True, density=True,
        color=[color_list[0], grey_color],
        label=['Pathogenic Genotype Carriers', 'Control Population'])
ax.legend(loc='best', frameon=False, fontsize=20)
ax.set_xlabel('Cryptic\n' + r'Phenotype ($\mathbf{Z}$)', fontsize=40, fontweight='bold')
ax.set_ylabel('Density\n' + r'$\log_{10}$-Scale', fontsize=40, fontweight='bold')
plt.savefig('cryptic_pheno_sim.svg')
plt.close()

# Build a ClinicalDataset holding the simulated symptoms plus the target disease dx
clinData = ClinicalDataset()
# build an arbitrary list of disease codes (one extra slot for the target disease dx)
disList = list(clinData.dxCodeToDataIndexMap.keys())[0:num_symptoms + 1]

# load data into the clinical dataset, conditioning on the target disease dx
clinData.IncludeOnly(disList)
clinData.LoadFromArrays(torch.cat([simData['incidence_data'],
                                   simData['target_dis_dx'].reshape(-1, 1)], axis=1),
                        simData['covariate_data'], [],
                        catCovDicts=None, arrayType='Torch')
clinData.ConditionOnDx([disList[-1]])
sampler = ClinicalDatasetSampler(clinData, training_data_fraction,
                                 returnArrays='Torch',
                                 conditionSamplingOnDx=[disList[-1]])
sampler.ConvertToUnconditional()

vlpiModel = vLPI(sampler, inf_rank)
try:
    vlpiModel.LoadModel('IllustrativeExample.pth')
except FileNotFoundError:
    inference_output = vlpiModel.FitModel(batch_size=200,errorTol=(1.0/num_samples))
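
A short follow-up sketch under the same assumptions, using the ComputeEmbeddings
call shown in Code example #4 to recover the latent phenotype estimates for the
simulated subjects:

inferred_embeddings = vlpiModel.ComputeEmbeddings(
    dataArrays=(simData['incidence_data'], []))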
Code example #6
# NOTE: this snippet begins mid-statement; the opening of the call below was
# lost in extraction. It presumably loads previously stored results, e.g.:
# results = pd.read_pickle(
#     '../UCSF-EffectiveRankTopComponents/FinalModels_UCSFPerformanceResults.pth')

# 1) Identify top component as highest R^2 between UKBB and UCSF model (training dataset)
# 2) Compute and store regression model statistics
# 3) Compute Severity in UCSF cases along with p-value
# 4) Validate severity in UKBB if ICD10 dx codes available

output_fname = dis_index.replace(':', '_') + '_UKBBPerformanceResults.pth'
if output_fname not in os.listdir(output_direc):

    # Step 1) Identify top components from UKBB model in UCSF and UKBB datasets

    #load the data
    ucsfDataset_HPO = ClinicalDataset()
    ucsfDataset_HPO.ReadFromDisk(
        'path/to/clinical/data/UCSF_MendelianDisease_HPO.pth')

    ucsfDataset_UKBB = ClinicalDataset()
    ucsfDataset_UKBB.ReadFromDisk(
        'path/to/clinical/data/UCSF_MendelianDisease_UKBB_HPO.pth')

    # LoadUKBBData is the helper defined in Code example #1
    ukbbDataset, ukbb_sampler = LoadUKBBData(training_data_fraction, dis_index)

    annotated_terms_ucsf = model_table.loc[dis_index]['Annotated HPO Terms']
    annotated_terms_ukbb = model_table.loc[dis_index][
        'Annotated HPO Terms UKBB']

    max_rank_ucsf = model_table.loc[dis_index]['UCSF Max. Model Rank']
    max_rank_ukbb = model_table.loc[dis_index]['UKBB Max. Model Rank']
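
The comment block above lists four steps, but the extracted snippet ends after
the data loading. As a minimal, illustrative sketch of the R^2 computation
behind Step 1 (the function name is hypothetical; for single-predictor OLS,
R^2 equals the squared Pearson correlation):

import numpy as np

def component_r2_matrix(embed_a, embed_b):
    # pairwise R^2 between the columns of two (n_samples x n_components) arrays
    a = (embed_a - embed_a.mean(0)) / embed_a.std(0)
    b = (embed_b - embed_b.mean(0)) / embed_b.std(0)
    return ((a.T @ b) / a.shape[0]) ** 2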
Code example #7
# Imports for this tutorial-style snippet (class paths follow the vlpi package layout)
import string
import torch
from vlpi.data.ClinicalDataset import ClinicalDataset, ClinicalDatasetSampler
from vlpi.data.ClinicalDataSimulator import ClinicalDataSimulator
from vlpi.vLPI import vLPI

torch.manual_seed(1023)

## set the simulation parameters

numberOfSamples=50000
numberOfSymptoms=20

rareDiseaseFrequency=0.001
numLatentPhenotypes=2

## simulate the data
simulator = ClinicalDataSimulator(numberOfSymptoms,
                                  numLatentPhenotypes,
                                  rareDiseaseFrequency)
simulatedData = simulator.GenerateClinicalData(numberOfSamples)

## load the data into a ClinicalDataset class
clinicalData = ClinicalDataset()

# restrict the dataset to only 20 symptoms (it defaults to the full ICD10-CM
# codebook), renaming them to the first 20 uppercase letters
allICDCodes = list(clinicalData.dxCodeToDataIndexMap.keys())
symptomConversionMap = dict(zip(allICDCodes[0:numberOfSymptoms],
                                string.ascii_uppercase[0:numberOfSymptoms]))
clinicalData.ConstructNewDataArray(symptomConversionMap)

# now load the arrays into the data structure. Note: when used this way, it is
# the user's responsibility to make sure that the input data columns match the
# data columns of the ClinicalDataset.
clinicalData.LoadFromArrays(simulatedData['incidence_data'],
                            simulatedData['covariate_data'], [],
                            catCovDicts=None, arrayType='Torch')

## Now load the ClinicalDataset into a ClinicalDatasetSampler
training_data_fraction = 0.75
sampler = ClinicalDatasetSampler(clinicalData, training_data_fraction,
                                 returnArrays='Torch')

## Instantiate the model
infNumberOfLatentPhenotypes = 10
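
The snippet is truncated here; a plausible continuation, mirroring the API
usage in Code example #5:

vlpiModel = vLPI(sampler, infNumberOfLatentPhenotypes)
inference_output = vlpiModel.FitModel(batch_size=1000,
                                      errorTol=(1.0 / numberOfSamples))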