Esempio n. 1
0
def run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(
        pheno_filename, anno_filename, prsFile, minimum_test_samples=10, relatedness_score=0.95,
        skipAutosomeFiltering=False, snps_filename=None, feature_filename=None,
        snp_feature_filename=None, selection='all', covariates_filename=None,
        kinship_filename=None, sample_mapping_filename=None,
        feature_variant_covariate_filename=None):
    """Load the PRS-QTL inputs and intersect them on features, scores and samples.

    Steps, in order:
      1. Parse ``selection`` (``'all'``, ``'<chr>'`` or ``'<chr>:<start>-<end>'``).
      2. Load the phenotype and annotation tables and restrict the annotation
         to the selected chromosome / window (window membership is decided on
         the midpoint of ``start`` and ``end``).
      3. Load the optional SNP, feature, SNP-feature and
         feature-variant-covariate filters and use them to decide which rows
         of ``prsFile`` are read.
      4. Build the sample mapping and drop samples that are missing from the
         risk-score columns, the phenotype columns, the kinship index or the
         covariate index.
      5. Subset the phenotype, kinship and risk-score tables accordingly.

    Args:
        pheno_filename: Passed to ``qtl_loader_utils.get_phenotype_df``.
        anno_filename: Passed to ``qtl_loader_utils.get_annotation_df``.
        prsFile: Risk-score file; read through
            ``qtl_loader_utils.get_grs_subset_df`` when a SNP or SNP-feature
            filter is given, otherwise through ``get_phenotype_df``.
        minimum_test_samples: Minimum number of samples that must remain.
        relatedness_score: Threshold handed to ``get_unique_genetic_samples``;
            ``None`` skips that step.
        skipAutosomeFiltering: When false, the annotation is restricted to
            chromosomes ``'1'`` .. ``'22'``.
        snps_filename: Optional SNP filter (``get_snp_df``).
        feature_filename: Optional feature filter (``get_snp_df``).
        snp_feature_filename: Optional SNP-feature filter
            (``get_snp_feature_df``; columns ``feature`` and ``snp_id`` are
            read here).
        selection: Region selector, see step 1.
        covariates_filename: Optional covariates (``get_covariate_df``).
        kinship_filename: Optional kinship matrix (``get_kinship_df``).
        sample_mapping_filename: Passed to ``get_samplemapping_df``.
        feature_variant_covariate_filename: Optional feature-variant
            covariates (``get_snp_feature_df``).

    Returns:
        list: ``[phenotype_df, kinship_df, covariate_df, sample2individual_df,
        annotation_df, snp_filter_df, snp_feature_filter_df,
        genetically_unique_individuals, minimum_test_samples, feature_list,
        risk_df, chromosome, selectionStart, selectionEnd,
        feature_variant_covariate_df]``.

    Raises:
        SystemExit: After printing a message, when the selection is
            malformed, a feature has several annotation rows, no variants
            could be read, or fewer than ``minimum_test_samples`` remain.
    """
    # ---- Parse the region selection -------------------------------------
    selectionStart = None
    selectionEnd = None
    if ":" in selection:
        parts = selection.split(":")
        if "-" not in parts[1]:
            print("No correct sub selection.")
            print("Given in: "+selection)
            print("Expected format: (chr number):(start location)-(stop location)")
            sys.exit()
        chromosome = parts[0]
        bounds = parts[1].split("-")
        selectionStart = int(bounds[0])
        selectionEnd = int(bounds[1])
    else:
        chromosome = selection

    # ---- Load phenotype and annotation ----------------------------------
    phenotype_df = qtl_loader_utils.get_phenotype_df(pheno_filename)
    annotation_df = qtl_loader_utils.get_annotation_df(anno_filename)

    # All labels are compared as strings from here on.
    phenotype_df.columns = phenotype_df.columns.astype("str")
    phenotype_df.index = phenotype_df.index.astype("str")
    annotation_df.columns = annotation_df.columns.astype("str")
    annotation_df.index = annotation_df.index.astype("str")

    if annotation_df.index.has_duplicates:
        print("Only one location per feature supported. If multiple locations are needed please look at: --extended_anno_file")
        sys.exit()

    # ---- Restrict the annotation to the requested region ----------------
    if chromosome != 'all':
        in_region = annotation_df['chromosome'] == chromosome
        if selectionStart is not None:
            lowest = min(selectionStart, selectionEnd)
            highest = max(selectionStart, selectionEnd)
            # A feature belongs to the window when its midpoint falls in
            # [lowest, highest).
            midpoint = (annotation_df["start"] + annotation_df["end"]) / 2
            in_region = in_region & (midpoint >= lowest) & (midpoint < highest)
        annotation_df = annotation_df.loc[in_region]

    # To be able to read variants from a large file, the features are
    # subset first and only the scores relevant to them are loaded.
    snp_feature_filter_df = qtl_loader_utils.get_snp_feature_df(snp_feature_filename)
    feature_filter_df = qtl_loader_utils.get_snp_df(feature_filename)
    snp_filter_df = qtl_loader_utils.get_snp_df(snps_filename)
    feature_variant_covariate_df = qtl_loader_utils.get_snp_feature_df(feature_variant_covariate_filename)

    # ---- First-stage filtering: decide which scores to read -------------
    relSnps = None
    if snp_feature_filter_df is not None:
        if feature_filter_df is not None:
            # Boolean mask instead of a set indexer (sets are not valid
            # ``.loc`` keys in recent pandas) and to keep row order stable.
            annotation_df = annotation_df.loc[annotation_df.index.isin(feature_filter_df.index)]
        toSelect = list(set(snp_feature_filter_df['feature'].values).intersection(annotation_df.index.values))
        snp_feature_filter_df = snp_feature_filter_df.loc[snp_feature_filter_df['feature'].isin(toSelect)]
        relSnps = snp_feature_filter_df['snp_id'].values

        if snp_filter_df is not None:
            # Must stay an array: numpy would wrap a Python set as a single
            # object element in union1d/unique.
            relSnps = np.intersect1d(snp_filter_df.index, relSnps)
        if feature_variant_covariate_df is not None:
            feature_variant_covariate_df = feature_variant_covariate_df.loc[feature_variant_covariate_df['feature'].isin(toSelect)]
            relSnps = np.union1d(relSnps, feature_variant_covariate_df["snp_id"].values)
    elif snp_filter_df is not None:
        relSnps = snp_filter_df.index

        if feature_variant_covariate_df is not None:
            # No SNP-feature filter here, so the features kept in the
            # annotation define which covariate rows are relevant.
            toSelect = list(annotation_df.index.values)
            feature_variant_covariate_df = feature_variant_covariate_df.loc[feature_variant_covariate_df['feature'].isin(toSelect)]
            relSnps = np.union1d(relSnps, feature_variant_covariate_df["snp_id"].values)

    if relSnps is not None:
        relSnps = np.unique(relSnps)
        risk_df = qtl_loader_utils.get_grs_subset_df(prsFile, relSnps)
        if risk_df is None:
            print("No variants selected during SNP reading.")
            sys.exit()
        # Drop rows that are duplicated in both id and values; the id is
        # added as a temporary column so that distinct ids with identical
        # values are kept.
        risk_df = risk_df.assign(SnpId=risk_df.index.values)
        risk_df = risk_df.drop_duplicates(keep='first')
        risk_df = risk_df.drop(['SnpId'], axis='columns')
        # Drop rows that are entirely missing.
        risk_df = risk_df.loc[risk_df.isnull().sum(axis=1) != risk_df.shape[1], ]
    else:
        risk_df = qtl_loader_utils.get_phenotype_df(prsFile)
    print("Intersecting data.")
    risk_df = risk_df.astype(float)

    # ---- Sample mapping --------------------------------------------------
    sample2individual_df = qtl_loader_utils.get_samplemapping_df(sample_mapping_filename, list(phenotype_df.columns), 'sample')
    sample2individual_df.index = sample2individual_df.index.astype('str')
    sample2individual_df = sample2individual_df.astype('str')
    sample2individual_df['sample'] = sample2individual_df.index
    sample2individual_df = sample2individual_df.drop_duplicates()

    # Filter the linking table first.
    # Subset linking to individuals present in the risk-score columns.
    orgSize = sample2individual_df.shape[0]
    risk_individuals = [str(col) for col in risk_df.columns]
    sample2individual_df = sample2individual_df.loc[sample2individual_df['iid'].isin(risk_individuals), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the genotype file.")

    # Subset linking to samples present in the phenotype columns.
    sample2individual_df = sample2individual_df.loc[np.intersect1d(sample2individual_df.index, phenotype_df.columns), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the phenotype file.")

    # Subset linking vs kinship.
    kinship_df = qtl_loader_utils.get_kinship_df(kinship_filename)
    if kinship_df is not None:
        # Only the linking table is filtered; the scores are left alone.
        kinship_individuals = [str(iid) for iid in kinship_df.index]
        sample2individual_df = sample2individual_df[sample2individual_df['iid'].isin(kinship_individuals)]
        diff = orgSize - sample2individual_df.shape[0]
        orgSize = sample2individual_df.shape[0]
        print("Dropped: "+str(diff)+" samples because they are not present in the kinship file.")

    # Subset linking vs covariates.
    covariate_df = qtl_loader_utils.get_covariate_df(covariates_filename)
    if covariate_df is not None:
        # Prepend a column of ones unless some column already equals 1 in
        # every row.
        if np.nansum(covariate_df == 1, 0).max() < covariate_df.shape[0]:
            covariate_df.insert(0, 'ones', np.ones(covariate_df.shape[0]))
        sample2individual_df = sample2individual_df.loc[sample2individual_df.index.isin(covariate_df.index), :]
        diff = orgSize - sample2individual_df.shape[0]
        orgSize = sample2individual_df.shape[0]
        print("Dropped: "+str(diff)+" samples because they are not present in the covariate file.")

    print("Number of samples with genotype & phenotype data: " + str(sample2individual_df.shape[0]))
    if sample2individual_df.shape[0] < minimum_test_samples:
        print("Not enough samples with both genotype & phenotype data.")
        sys.exit()

    # ---- Filter the actual data ------------------------------------------
    # Phenotypes: annotated features only, columns in linking-table order.
    phenotype_df = phenotype_df.loc[phenotype_df.index.isin(annotation_df.index), sample2individual_df.index.values]

    # Kinship: individuals still present in the linking table.
    genetically_unique_individuals = None
    if kinship_df is not None:
        kinship_ids = np.intersect1d(kinship_df.index, sample2individual_df['iid'])
        kinship_df = kinship_df.loc[kinship_ids, kinship_ids]
        if relatedness_score is not None:
            genetically_unique_individuals = get_unique_genetic_samples(kinship_df, relatedness_score)

    # NOTE(review): covariate_df itself is not subset to the remaining
    # samples in this function - confirm that callers handle this.

    # Features listed in the feature filter.
    if feature_filter_df is not None:
        phenotype_df = phenotype_df.loc[phenotype_df.index.isin(feature_filter_df.index), :]
    # Features listed in the combined SNP-feature filter.
    if snp_feature_filter_df is not None:
        toSelect = list(set(snp_feature_filter_df['feature'].values).intersection(phenotype_df.index.values))
        phenotype_df = phenotype_df.loc[phenotype_df.index.isin(toSelect), :]
        if feature_filter_df is not None:
            snp_feature_filter_df = snp_feature_filter_df.loc[snp_feature_filter_df['feature'].isin(toSelect)]

    # Scores listed in the SNP filter.
    if snp_filter_df is not None:
        risk_df = risk_df.loc[risk_df.index.isin(snp_filter_df.index), :]
    # Scores listed in the combined SNP-feature filter.
    if snp_feature_filter_df is not None:
        risk_df = risk_df.loc[risk_df.index.isin(snp_feature_filter_df['snp_id']), :]

    # Keep only features on chromosomes '1'..'22' unless told otherwise.
    if not skipAutosomeFiltering:
        autosomes = [str(number) for number in range(1, 23)]
        annotation_df = annotation_df[annotation_df['chromosome'].isin(autosomes)]

    feature_list = list(annotation_df.index[annotation_df.index.isin(phenotype_df.index)])
    print("Number of features to be tested: " + str(len(feature_list)))
    print("Total number of variants to be considered, before variante QC and feature intersection: " + str(risk_df.shape[0]))

    if phenotype_df.shape[1] < minimum_test_samples:
        print("Not enough samples with both genotype & phenotype data, for current number of covariates.")
        sys.exit()

    return [phenotype_df, kinship_df, covariate_df, sample2individual_df, annotation_df, snp_filter_df, snp_feature_filter_df, genetically_unique_individuals, minimum_test_samples, feature_list, risk_df, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]
Esempio n. 2
0
def run_structLMM_QTL_analysis_load_intersect_phenotype_environments_covariates_kinship_sample_mapping(
        pheno_filename, anno_filename, env_filename, geno_prefix, plinkGenotype,
        cis_mode=True, association_mode=True, skipAutosomeFiltering=False, minimum_test_samples=10,
        relatedness_score=0.95, snps_filename=None, feature_filename=None,
        snp_feature_filename=None, selection='all', covariates_filename=None, kinship_filename=None,
        sample_mapping_filename=None, extended_anno_filename=None, feature_variant_covariate_filename=None):
    """Load the structLMM QTL inputs and intersect them on features, variants and samples.

    Steps, in order:
      1. Parse ``selection`` (``'all'``, ``'<chr>'`` or ``'<chr>:<start>-<end>'``).
      2. Load phenotype, annotation and plink genotype metadata; chromosome
         labels ``X``/``Y``/``XY``/``MT`` are recoded to ``23``..``26``.
      3. Build the sample mapping and drop samples missing from the genotype
         ``fam`` index, the phenotype columns, the kinship index, the
         covariate index or the environment index.
      4. Subset phenotype, kinship, covariates and the ``bim`` variant table
         using the optional filters.
      5. Derive the list of features to test for the selected region.

    Args:
        pheno_filename: Passed to ``qtl_loader_utils.get_phenotype_df``.
        anno_filename: Passed to ``qtl_loader_utils.get_annotation_df``.
        env_filename: Passed to ``qtl_loader_utils.get_env_df``.
        geno_prefix: Passed to ``qtl_loader_utils.get_genotype_data``.
        plinkGenotype: Must be truthy; the non-plink path never loaded
            genotype data in this function and is rejected explicitly.
        cis_mode: When true, annotation rows on chromosomes absent from
            ``bim['chrom']`` are dropped; when false, a warning is printed if
            fewer than 22 chromosomes are present.
        association_mode: Accepted for interface compatibility; not read here.
        skipAutosomeFiltering: When false, the annotation is restricted to
            chromosomes ``'1'`` .. ``'22'``.
        minimum_test_samples: Minimum number of samples that must remain.
        relatedness_score: Threshold handed to ``get_unique_genetic_samples``;
            ``None`` skips that step.
        snps_filename: Optional SNP filter (``get_snp_df``).
        feature_filename: Optional feature filter (``get_snp_df``).
        snp_feature_filename: Optional SNP-feature filter
            (``get_snp_feature_df``; columns ``feature`` and ``snp_id`` are
            read here).
        selection: Region selector, see step 1.
        covariates_filename: Optional covariates (``get_covariate_df``).
        kinship_filename: Optional kinship matrix (``get_kinship_df``).
        sample_mapping_filename: Passed to ``get_samplemapping_df``.
        extended_anno_filename: Optional tab-separated annotation that is
            concatenated with the main annotation.
        feature_variant_covariate_filename: Optional feature-variant
            covariates (``get_snp_feature_df``).

    Returns:
        list: ``[phenotype_df, kinship_df, covariate_df, environment_df,
        sample2individual_df, complete_annotation_df, annotation_df,
        snp_filter_df, snp_feature_filter_df, genetically_unique_individuals,
        minimum_test_samples, feature_list, bim, fam, bed, chromosome,
        selectionStart, selectionEnd, feature_variant_covariate_df]``.

    Raises:
        NotImplementedError: When ``plinkGenotype`` is falsy.
        SystemExit: After printing a message, when the selection is
            malformed, a feature has several annotation rows, or fewer than
            ``minimum_test_samples`` remain.
    """
    # ---- Parse the region selection -------------------------------------
    selectionStart = None
    selectionEnd = None
    if ":" in selection:
        parts = selection.split(":")
        if "-" not in parts[1]:
            print("No correct sub selection.")
            print("Given in: "+selection)
            print("Expected format: (chr number):(start location)-(stop location)")
            sys.exit()
        chromosome = parts[0]
        bounds = parts[1].split("-")
        selectionStart = int(bounds[0])
        selectionEnd = int(bounds[1])
    else:
        chromosome = selection

    # ---- Load input data -------------------------------------------------
    phenotype_df = qtl_loader_utils.get_phenotype_df(pheno_filename)
    annotation_df = qtl_loader_utils.get_annotation_df(anno_filename)

    if plinkGenotype:
        bim, fam, bed = qtl_loader_utils.get_genotype_data(geno_prefix)
        # Recode non-numeric chromosome labels: X->23, Y->24, XY->25, MT->26.
        annotation_df.replace(['X', 'Y', 'XY', 'MT'], ['23', '24', '25', '26'], inplace=True)
        chromosome = {'X': '23', 'Y': '24', 'XY': '25', 'MT': '26'}.get(chromosome, chromosome)
    else:
        geno_prefix += '.bgen'
        print(geno_prefix)
        # bim/fam/bed are required below and were never loaded on this path;
        # fail here with a clear message instead of a later NameError.
        raise NotImplementedError(
            "Only plink genotype input is supported by this loader; "
            "no genotype data is read for: " + geno_prefix)
    print("Intersecting data.")

    # Make sure that there is only one entry per feature id.
    if annotation_df.shape[0] != annotation_df.groupby(annotation_df.index).first().shape[0]:
        print("Only one location per feature supported. If multiple locations are needed please look at: --extended_anno_file")
        sys.exit()

    # ---- Sample mapping --------------------------------------------------
    sample2individual_df = qtl_loader_utils.get_samplemapping_df(sample_mapping_filename, list(phenotype_df.columns), 'sample')
    sample2individual_df['sample'] = sample2individual_df.index
    sample2individual_df = sample2individual_df.drop_duplicates()

    # Filter the linking table first.
    # Subset linking to individuals present in the genotype data.
    orgSize = sample2individual_df.shape[0]
    genotyped_individuals = [str(iid) for iid in fam.index]
    sample2individual_df = sample2individual_df.loc[sample2individual_df['iid'].isin(genotyped_individuals), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the genotype file.")

    # Subset linking to samples present in the phenotype columns.
    sample2individual_df = sample2individual_df.loc[np.intersect1d(sample2individual_df.index, phenotype_df.columns), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the phenotype file.")

    # Subset linking vs kinship.
    kinship_df = qtl_loader_utils.get_kinship_df(kinship_filename)
    if kinship_df is not None:
        # Only the linking table is filtered; the genotypes are left alone.
        kinship_individuals = [str(iid) for iid in kinship_df.index]
        sample2individual_df = sample2individual_df[sample2individual_df['iid'].isin(kinship_individuals)]
        diff = orgSize - sample2individual_df.shape[0]
        orgSize = sample2individual_df.shape[0]
        print("Dropped: "+str(diff)+" samples because they are not present in the kinship file.")

    # Subset linking vs covariates.
    covariate_df = qtl_loader_utils.get_covariate_df(covariates_filename)
    if covariate_df is not None:
        # Prepend a column of ones unless some column already equals 1 in
        # every row.
        if np.nansum(covariate_df == 1, 0).max() < covariate_df.shape[0]:
            covariate_df.insert(0, 'ones', np.ones(covariate_df.shape[0]))
        sample2individual_df = sample2individual_df.loc[sample2individual_df.index.isin(covariate_df.index), :]
        diff = orgSize - sample2individual_df.shape[0]
        orgSize = sample2individual_df.shape[0]
        print("Dropped: "+str(diff)+" samples because they are not present in the covariate file.")

    # Subset linking vs environments (same ones-column rule as covariates).
    environment_df = qtl_loader_utils.get_env_df(env_filename)
    if np.nansum(environment_df == 1, 0).max() < environment_df.shape[0]:
        environment_df.insert(0, 'ones', np.ones(environment_df.shape[0]))
    sample2individual_df = sample2individual_df.loc[sample2individual_df.index.isin(environment_df.index), :]
    diff = orgSize - sample2individual_df.shape[0]
    orgSize = sample2individual_df.shape[0]
    print("Dropped: "+str(diff)+" samples because they are not present in the environment file.")

    print("Number of samples with genotype & phenotype data: " + str(sample2individual_df.shape[0]))
    if sample2individual_df.shape[0] < minimum_test_samples:
        print("Not enough samples with both genotype & phenotype data.")
        sys.exit()

    # ---- Filter the actual data ------------------------------------------
    # Phenotypes: annotated features only, columns in linking-table order.
    phenotype_df = phenotype_df.loc[phenotype_df.index.isin(annotation_df.index), sample2individual_df.index.values]

    # Kinship: individuals still present in the linking table.
    genetically_unique_individuals = None
    if kinship_df is not None:
        kinship_ids = np.intersect1d(kinship_df.index, sample2individual_df['iid'])
        kinship_df = kinship_df.loc[kinship_ids, kinship_ids]
        if relatedness_score is not None:
            genetically_unique_individuals = get_unique_genetic_samples(kinship_df, relatedness_score)

    # Covariates: samples still present in the linking table.
    if covariate_df is not None:
        covariate_df = covariate_df.loc[np.intersect1d(covariate_df.index, sample2individual_df.index.values), :]

    snp_feature_filter_df = qtl_loader_utils.get_snp_feature_df(snp_feature_filename)
    feature_filter_df = None
    try:
        feature_filter_df = qtl_loader_utils.get_snp_df(feature_filename)
    except Exception:
        # Fallback kept from the original code; presumably feature_filename
        # may also be a collection of feature ids rather than a path -
        # TODO confirm against callers.
        if feature_filename is not None:
            feature_filter_df = pd.DataFrame(index=feature_filename)

    # Features listed in the feature filter. Labels that are absent from the
    # phenotype table are skipped instead of raising a KeyError.
    if feature_filter_df is not None:
        wanted_features = feature_filter_df.index
        phenotype_df = phenotype_df.loc[wanted_features[wanted_features.isin(phenotype_df.index)], :]
    # Features listed in the combined SNP-feature filter.
    if snp_feature_filter_df is not None:
        wanted_features = pd.Index(np.unique(snp_feature_filter_df['feature']))
        phenotype_df = phenotype_df.loc[wanted_features[wanted_features.isin(phenotype_df.index)], :]

    if (not cis_mode) and len(set(bim['chrom'])) < 22:
        print("Warning, running a trans-analysis on snp data from less than 22 chromosomes.\nTo merge data later the permutation P-values need to be written out.")

    if cis_mode:
        # Remove features on chromosomes that have no variants anyway.
        annotation_df = annotation_df[np.in1d(annotation_df['chromosome'], list(set(bim['chrom'])))]

    # Variants listed in the SNP filter.
    snp_filter_df = qtl_loader_utils.get_snp_df(snps_filename)
    if snp_filter_df is not None:
        bim = bim.loc[bim['snp'].isin(snp_filter_df.index)]
    # Variants listed in the combined SNP-feature filter.
    if snp_feature_filter_df is not None:
        bim = bim.loc[bim['snp'].isin(snp_feature_filter_df['snp_id'])]

    # Keep only features on chromosomes '1'..'22' unless told otherwise.
    if not skipAutosomeFiltering:
        autosomes = [str(number) for number in range(1, 23)]
        annotation_df = annotation_df[annotation_df['chromosome'].isin(autosomes)]

    # ---- Determine the features to be tested -----------------------------
    # The annotation itself is not narrowed to the region here; only
    # feature_list is.
    testable = annotation_df.index.isin(phenotype_df.index)
    if chromosome != 'all':
        in_region = (annotation_df['chromosome'] == chromosome).values
        if selectionStart is not None:
            lowest = min(selectionStart, selectionEnd)
            highest = max(selectionStart, selectionEnd)
            # A feature belongs to the window when its midpoint falls in
            # [lowest, highest).
            midpoint = ((annotation_df["start"] + annotation_df["end"]) / 2).values
            in_region = in_region & (midpoint >= lowest) & (midpoint < highest)
        testable = testable & in_region
    feature_list = list(annotation_df.index[testable])

    print("Number of features to be tested: " + str(len(feature_list)))
    print("Total number of variants to be considered, before variante QC and feature intersection: " + str(bim.shape[0]))

    if phenotype_df.shape[1] < minimum_test_samples:
        print("Not enough samples with both genotype & phenotype data, for current number of covariates.")
        sys.exit()

    if extended_anno_filename is not None:
        complete_annotation_df = pd.read_csv(extended_anno_filename, sep='\t', index_col=0)
        # The feature id is copied into a column so drop_duplicates compares
        # it together with the row values.
        # NOTE(review): the helper column is removed from the combined table
        # only; annotation_df keeps an 'index' column - confirm this is
        # intended.
        annotation_df['index'] = annotation_df.index
        complete_annotation_df['index'] = complete_annotation_df.index
        complete_annotation_df = pd.concat([annotation_df, complete_annotation_df]).drop_duplicates()
        del complete_annotation_df['index']
    else:
        complete_annotation_df = annotation_df

    feature_variant_covariate_df = qtl_loader_utils.get_snp_feature_df(feature_variant_covariate_filename)

    return [phenotype_df, kinship_df, covariate_df, environment_df, sample2individual_df, complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, genetically_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]