def get_veploader_and_regions(filter_vids):
    """Build the Ensembl-VEP annotation loader and the table of gene regions.

    The columns read from ``snakemake.input.vep_tsv`` depend on
    ``snakemake.params.effect``:

    * ``'LOF'``: variant id, location and gene only.
    * ``'missense'``: additionally ``pos_standardized`` and ``impact``;
      variants with ``impact`` below ``snakemake.params.min_impact`` are
      dropped and the impact is handed to the loader as its ``data``.

    Parameters
    ----------
    filter_vids
        Variant ids to keep; combined with the ids found in the VEP table
        via ``intersect_ids`` (presumably the ids common to both -- confirm
        against that helper).

    Returns
    -------
    tuple
        ``(eveploader, regions)`` where ``regions`` is a DataFrame with the
        columns ``chrom, start, end, name, gene`` sorted by
        ``chrom, start, end``.

    Raises
    ------
    NotImplementedError
        If ``snakemake.params.effect`` is neither ``'LOF'`` nor ``'missense'``.
    """
    effect = snakemake.params.effect

    # decide which annotation columns are needed before touching the file
    if effect == 'LOF':
        vep_columns = ['Uploaded_variation', 'Location', 'Gene']
    elif effect == 'missense':
        vep_columns = ['Uploaded_variation', 'Location', 'Gene', 'pos_standardized', 'impact']
    else:
        raise NotImplementedError('effect has to be either missense or LOF!')

    vep_table = pd.read_csv(snakemake.input.vep_tsv,
                            sep='\t',
                            usecols=vep_columns,
                            index_col='Uploaded_variation')

    # restrict the annotations to the requested variants
    shared_vids = intersect_ids(filter_vids, vep_table.index.values)
    vep_table = vep_table.loc[shared_vids].reset_index()

    loader_kwargs = {}
    if effect == 'missense':
        # The variants cannot be weighted in this test, so every variant is
        # filtered on its own impact. This differs from the other tests, which
        # keep all variants at a (Gene, pos_standardized) position whose
        # maximum impact passes the threshold.
        passes_impact = vep_table.impact >= snakemake.params.min_impact
        vep_table = vep_table[passes_impact]
        loader_kwargs['data'] = vep_table[['impact']].values

    eveploader = EnsemblVEPLoader(vep_table['Uploaded_variation'],
                                  vep_table['Location'],
                                  vep_table['Gene'],
                                  **loader_kwargs)

    # first four BED columns: chromosome, start, end, name
    bed_table = pd.read_csv(snakemake.input.regions_bed,
                            sep='\t',
                            header=None,
                            usecols=[0, 1, 2, 3],
                            dtype={0: str, 1: np.int32, 2: np.int32, 3: str})
    bed_table.columns = ['chrom', 'start', 'end', 'name']

    # the gene id is the part of the region name before the first underscore
    bed_table['gene'] = bed_table.name.str.split('_', expand=True)[0]
    bed_table = bed_table.set_index('gene')

    # discard all genes for which we don't have annotations
    annotated_genes = intersect_ids(np.unique(bed_table.index.values),
                                    np.unique(eveploader.pos_df.gene))
    regions = bed_table.loc[annotated_genes].reset_index()
    regions = regions.sort_values(['chrom', 'start', 'end'])

    return eveploader, regions[['chrom', 'start', 'end', 'name', 'gene']]
def sid_filter(vids):
    """Optionally restrict variant ids to those listed in a file.

    If the snakemake config has no ``sid_include`` entry, ``vids`` is returned
    unchanged. Otherwise the file named by ``sid_include`` is read line by
    line (trailing whitespace stripped; opened with ``gzip`` when the path
    ends in ``gz``) and the result of ``intersect_ids(vids, <file ids>)`` is
    returned.

    Parameters
    ----------
    vids
        Variant ids to filter.

    Returns
    -------
    ``vids`` itself, or the output of ``intersect_ids``.
    """
    if 'sid_include' not in snakemake.config:
        return vids

    print('limiting to variants present in {}'.format(snakemake.config['sid_include']))
    include_path = snakemake.config['sid_include']

    # pick the reader by file suffix; both are opened in text mode
    if include_path.endswith('gz'):
        stream = gzip.open(include_path, 'rt')
    else:
        stream = open(include_path, 'r')

    with stream as infile:
        sid = np.array([line.rstrip() for line in infile])

    return intersect_ids(vids, sid)
if chromosome not in ['chr9', 'chr16', 'chr21']: continue # set up the ensembl vep loader for the chromosome ensemblvepdf = pd.read_csv(vep_tsv, sep='\t', usecols=['Uploaded_variation', 'Location', 'Gene', 'pos_standardized', 'impact'], index_col='Uploaded_variation') # get set of variants for the chromosome: mac_report = maf_filter(mac_report) filter_vids = mac_report.index.values filter_vids = sid_filter(filter_vids) # filter by MAF keep = intersect_ids(filter_vids, ensemblvepdf.index.values) ensemblvepdf = ensemblvepdf.loc[keep] ensemblvepdf.reset_index(inplace=True) # filter by impact: ensemblvepdf = ensemblvepdf[ensemblvepdf.groupby(['Gene','pos_standardized'])['impact'].transform(np.max) >= snakemake.params.min_impact ] # initialize the loader eveploader = EnsemblVEPLoader(ensemblvepdf['Uploaded_variation'], ensemblvepdf['Location'], ensemblvepdf['Gene'], data = ensemblvepdf[['pos_standardized','impact']].values) # set up the regions to loop over for the chromosome regions = pd.read_csv(snakemake.input.regions_bed, sep='\t', header=None, usecols=[0,1,2,3], dtype={0:str, 1: np.int32, 2:np.int32, 3:str}) regions.columns = ['chrom', 'start', 'end', 'name'] # discard all genes for which we don't have annotations regions['gene'] = regions.name.str.split('_', expand=True)[0]
spliceaidf = pd.read_csv(vep_tsv, sep='\t', usecols=[ 'name', 'chrom', 'end', 'gene', 'max_effect', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL' ], index_col='name') # get set of variants for the chromosome: mac_report = maf_filter(mac_report) filter_vids = mac_report.index.values filter_vids = sid_filter(filter_vids) # filter by MAF keep = intersect_ids(filter_vids, spliceaidf.index.values) spliceaidf = spliceaidf.loc[keep] spliceaidf.reset_index(inplace=True) # filter by impact: spliceaidf = spliceaidf[ spliceaidf.max_effect >= snakemake.params.min_impact] # set up the regions to loop over for the chromosome regions = regions_all.copy() # discard all genes for which we don't have annotations gene_ids = regions.name.str.split( '_', expand=True) # table with two columns, ensembl-id and gene-name regions['gene'] = gene_ids[1] # this is the gene name regions['ensembl_id'] = gene_ids[0]
def main():
    """Export per-gene burden indicators to HDF5 together with the gene ids.

    Steps, as performed below:

    1. Collect the variant ids passing ``maf_filter`` and ``sid_filter``.
    2. Build the VEP loader and gene regions, then restrict the genotype
       loader and the VEP loader to their shared variants and the genotype
       loader to the individuals returned by ``iid_filter``.
    3. For every region, compute a one-column ``int8`` indicator that is 1
       where any selected variant has a genotype value above 0.5 (missing
       genotypes count as 0), and write these in batches to dataset ``'G'``
       of ``snakemake.output.h5``; the region names go to
       ``snakemake.output.gene_txt``. Regions without annotated variants are
       skipped.
    4. Shrink the dataset to the number of regions actually written and
       symlink ``snakemake.output.iid_txt`` to
       ``snakemake.input.complete_cases``.

    Both output files are managed by ``with`` so they are closed even if an
    error is raised while processing a region.

    Raises
    ------
    FileExistsError
        If ``snakemake.output.gene_txt`` already exists (opened with ``'x'``).
    """
    # get variants that pass MAF and genotyping filters
    filter_vids = maf_filter()
    filter_vids = sid_filter(filter_vids)

    # get the variant effect predictions and gene regions
    eveploader, regions = get_veploader_and_regions(filter_vids)

    # get the genotype loader
    plinkloader = VariantLoaderSnpReader(Bed(snakemake.input.genotypes_bed, count_A1=True))

    # intersect variants
    common_vids = intersect_ids(plinkloader.get_vids(), eveploader.get_vids())
    plinkloader.update_variants(common_vids)
    eveploader.update_variants(common_vids)

    # drop irrelevant individuals
    iids = iid_filter()
    plinkloader.update_individuals(iids)

    # number of regions written to the HDF5 file per write
    batch_size = 100

    def load_and_proc_geno(interval):
        """Return ``(burden, vids)`` for one region, or raise ``GotNone``.

        ``GotNone`` is raised when the annotation lookup fails with a
        ``KeyError`` or yields no variants.
        """
        try:
            V1 = eveploader.anno_by_interval(interval, gene=interval['name'].split('_')[0])
        except KeyError:
            raise GotNone

        if V1.index.empty:
            raise GotNone

        vids = V1.index.get_level_values('vid')

        temp_genotypes, _ = plinkloader.genotypes_by_id(vids, return_pos=False)

        # treat missing / non-finite genotypes as 0 (no alternative allele)
        temp_genotypes = np.ma.masked_invalid(temp_genotypes).filled(0.)

        # The "weird" variants were already removed upstream, so the usual
        # preprocess_genotypes step is intentionally skipped here.
        # assumes genotypes are (individuals x variants) -- TODO confirm
        G1_burden = (np.sum(temp_genotypes > 0.5, axis=1, keepdims=True) > 0.).astype('i1')

        return G1_burden, vids

    # initialize output file
    initialize_h5(snakemake.output.h5, len(regions), len(iids))

    with h5py.File(snakemake.output.h5, 'r+') as h5, \
            open(snakemake.output.gene_txt, 'x') as idfile:
        out = h5['G']

        def flush(batch, ids, offset):
            """Write one batch at row ``offset``; return the next free row."""
            out[offset:(offset + len(ids))] = np.concatenate(batch, axis=1).T
            write_ids(ids, idfile)
            return offset + len(ids)

        n_written = 0  # how many entries have been exported to the hdf5 file
        batch, ids = [], []

        for _, region in regions.iterrows():
            try:
                G, _ = load_and_proc_geno(region)
            except GotNone:
                continue
            batch.append(G)
            ids.append(region['name'])
            if len(batch) >= batch_size:
                n_written = flush(batch, ids, n_written)
                batch, ids = [], []

        # write the last, partially filled batch
        if batch:
            n_written = flush(batch, ids, n_written)

        # skipped regions leave unused rows at the end of the dataset
        out.resize(n_written, axis=0)

    relpath_out = os.path.relpath(snakemake.input.complete_cases,
                                  os.path.dirname(snakemake.output.iid_txt))
    os.symlink(relpath_out, snakemake.output.iid_txt)
collapser = LocalCollapsing(distance_threshold=51.) for strand in ['plus', 'minus']: # set up the regions to loop over for the chromosome chromosome_id = chromosome.replace('chr', '') regions = regions_all[(regions_all.chrom == chromosome_id) & (regions_all.strand == strand)] # get variants that pass variant effect prediction threshold: vep_vids, vep_mask = vep_filter(vep_h5[chromosome][strand], vep_bed[chromosome][strand]) # combine filter_vids_chromosome = intersect_ids(vep_vids, filter_vids) # initialize the vep loader veploader = Hdf5Loader(vep_bed[chromosome][strand], vep_h5[chromosome][strand], 'diffscore', from_janggu=True) veploader.update_variants(filter_vids_chromosome) veploader.set_mask(vep_mask) # set up the variant loader (rbp variants) for the chromosome + strand plinkloader = VariantLoaderSnpReader( Bed(bed, count_A1=True, num_threads=4)) plinkloader.update_variants(veploader.get_vids()) plinkloader.update_individuals(covariatesloader.get_iids())
# storing all results here results = [] i_gene = 0 i_chrom = 0 # conditional analysis: # these genotypes will be used for the conditional analysis geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1)) geno_cond.update_individuals(covariatesloader.get_iids()) # this file contains the mapping of associations to SNPs to condition on conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0) conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates() geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids())) logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids()))) # enter the chromosome loop: timer = Timer() for i, (chromosome, bed, vep_tsv, ensembl_vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep): if chromosome.replace('chr','') not in regions_all.chrom.unique(): continue if snakemake.params.debug: # process only two chromosomes if debugging... if i_chrom > 2: break timer.reset()
def test_full_rank_continuous():
    """Regression test: score test without background kernel (``ScoretestNoK``).

    Loads the dummy data shipped with ``seak``, computes one p-value per
    region with the ``diffscore_max`` kernel and asserts that the p-values
    are close to those stored in
    ``reference_results/test_full_rank_continuous.csv``.

    The per-region results are collected in a list and turned into a
    DataFrame once, because ``DataFrame.append`` no longer exists in
    pandas >= 2.0.
    """
    import time
    import numpy as np
    import pandas as pd
    import pkg_resources

    from seak import data_loaders
    from seak import kernels
    from seak import scoretest

    data_path = pkg_resources.resource_filename('seak', 'data/')

    # Path to veps
    path_to_VEP_bed = data_path + "dummy_veps.bed"
    path_to_VEP_hdf5 = data_path + "dummy_veps.hdf5"

    # Path to genotypes
    path_to_covariates = data_path + "dummy_covariates_fixed.csv"
    path_to_plink_files_with_prefix = data_path + "full_rank_continuous"

    # Path to regions
    path_to_reference_genes_bed = data_path + "dummy_regions.bed"

    # Load data
    # VEPs
    hdf5_loader = data_loaders.Hdf5Loader(path_to_vep_bed=path_to_VEP_bed,
                                          path_to_vep_hdf5=path_to_VEP_hdf5,
                                          hdf5_key='diffscore')

    # Genotypes
    plink_loader = data_loaders.VariantLoaderSnpReader(path_to_plink_files_with_prefix + '.bed')

    # Genes
    ucsc_region_loader = data_loaders.BEDRegionLoader(path_to_regions_UCSC_BED=path_to_reference_genes_bed,
                                                      chrom_to_load=1,
                                                      drop_non_numeric_chromosomes=True)

    # Covariates
    covariate_loader_csv = data_loaders.CovariatesLoaderCSV(phenotype_of_interest='pheno_full_rank_continuous',
                                                            path_to_covariates=path_to_covariates,
                                                            covariate_column_names=['cov1', 'cov2'])

    # Overlap individuals: genotypes and covariates
    print('Overlaps')
    print('Individuals')
    genotypes_covariates_intersection = data_loaders.intersect_ids(plink_loader.get_iids(),
                                                                   covariate_loader_csv.get_iids())
    print(genotypes_covariates_intersection.shape)
    print(genotypes_covariates_intersection)

    # Overlap genotypes with VEPs
    print('Genotypes')
    print(len(hdf5_loader.veps_index_df.index))
    veps_genotypes_intersection = data_loaders.intersect_ids(hdf5_loader.get_vids(),
                                                             plink_loader.get_vids())
    print(len(veps_genotypes_intersection))

    # Update respective instances
    print('Updates')
    plink_loader.update_variants(veps_genotypes_intersection)

    print('hdf5_loader.veps_index_df.shape', hdf5_loader.veps_index_df.shape)
    hdf5_loader.update_variants(veps_genotypes_intersection)
    print('hdf5_loader.veps_index_df.shape', hdf5_loader.veps_index_df.shape)

    plink_loader.update_individuals(genotypes_covariates_intersection)

    print('covariate_loader_csv.cov.shape', covariate_loader_csv.cov.shape)
    covariate_loader_csv.update_individuals(genotypes_covariates_intersection)
    print('covariate_loader_csv.cov.shape', covariate_loader_csv.cov.shape)

    Y, X = covariate_loader_csv.get_one_hot_covariates_and_phenotype(test_type='noK')

    null_model = scoretest.ScoretestNoK(Y, X)

    result_columns = ['name', 'chrom', 'start', 'end', 'p_value', 'n_SNVs', 'time']
    rows = []

    for _, region in ucsc_region_loader.regions.iterrows():
        t_test_gene_start = time.time()

        temp_genotypes_info_dict = region.to_dict()

        temp_genotypes, temp_vids = plink_loader.genotypes_by_region(region)
        if temp_genotypes is None:
            continue

        G, temp_vids = data_loaders.VariantLoader.preprocess_genotypes(temp_genotypes,
                                                                       temp_vids,
                                                                       impute_mean=True,
                                                                       normalize=False,
                                                                       invert_encoding=True,
                                                                       recode_maf=False)
        if G is None:
            continue

        V = hdf5_loader.anno_by_id(temp_vids)

        GV = kernels.diffscore_max(G, V, False)

        temp_genotypes_info_dict['p_value'] = null_model.pv_alt_model(GV)
        temp_genotypes_info_dict['n_SNVs'] = G.shape[1]

        t_test_gene_end = time.time()
        temp_genotypes_info_dict['time'] = float(t_test_gene_end - t_test_gene_start)

        rows.append(temp_genotypes_info_dict)

    # Build the frame once; keep the declared columns first and any extra
    # region fields after them (the layout the former append-based code gave).
    results = pd.DataFrame(rows)
    extra_columns = [c for c in results.columns if c not in result_columns]
    results = results.reindex(columns=result_columns + extra_columns)

    # results.to_csv('./test_full_rank_continuous.csv')
    print(results)

    reference_result = pd.read_csv(data_path + 'reference_results/test_full_rank_continuous.csv', index_col=0)

    print(np.corrcoef(reference_result['p_value'], results['p_value']))
    print(np.all((np.isclose(reference_result['p_value'], results['p_value']))))

    assert np.all((np.isclose(reference_result['p_value'], results['p_value']))), \
        'The last change in code changes the result!!'
Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1)) geno_cond.update_individuals(covariatesloader.get_iids()) # this file contains the mapping of associations to SNPs to condition on conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0) conditional_list = conditional_list[ (conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno'])].drop_duplicates( ) conditional_list = conditional_list.loc[conditional_list.gene_name.isin( results.gene_name)] geno_cond.update_variants( intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids())) logging.info( 'considering {} variants as covariates for conditional tests.'.format( len(geno_cond.get_vids()))) # set up the null model Y, X = covariatesloader.get_one_hot_covariates_and_phenotype('NoK') null_model = ScoretestNoK(Y, X) logging.info('Phenotype: {}, Sample size: {}'.format( snakemake.params.phenotype, len(Y))) def test_gene(gene): # conditional analysis
# storing all results here results = [] i_gene = 0 i_chrom = 0 # conditional analysis: # these genotypes will be used for the conditional analysis geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1)) geno_cond.update_individuals(covariatesloader.get_iids()) # this file contains the mapping of associations to SNPs to condition on conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0) conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates() geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids())) logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids()))) # enter the chromosome loop: timer = Timer() for i, (chromosome, bed, mac_report, vep_tsv) in enumerate(geno_vep): if chromosome.replace('chr','') not in regions_all.chrom.unique(): continue if snakemake.params.debug: # process only two chromosomes if debugging... if i_chrom > 2: break timer.reset()
# storing all results here results = [] i_gene = 0 i_chrom = 0 # conditional analysis: # these genotypes will be used for the conditional analysis geno_cond = VariantLoaderSnpReader(Bed(snakemake.input.conditional_geno, count_A1=True, num_threads=1)) geno_cond.update_individuals(covariatesloader.get_iids()) # this file contains the mapping of associations to SNPs to condition on conditional_list = pd.read_csv(snakemake.input.conditional_list, sep='\t', header=0) conditional_list = conditional_list[(conditional_list.pheno == snakemake.params['phenotype']) | (conditional_list.pheno == snakemake.wildcards['pheno']) ].drop_duplicates() geno_cond.update_variants(intersect_ids(conditional_list.snp_rsid, geno_cond.get_vids())) logging.info('considering {} variants as covariates for conditional tests.'.format(len(geno_cond.get_vids()))) # enter the chromosome loop: timer = Timer() for i, (chromosome, bed, vep_tsv, mac_report, h5_lof, iid_lof, gid_lof) in enumerate(geno_vep): if chromosome.replace('chr','') not in regions_all.chrom.unique(): continue if snakemake.params.debug: # process only two chromosomes if debugging... if i_chrom > 2: break
def test_full_rank_continuous():
    """Regression test: score test with a full-rank background kernel (``Scoretest2K``).

    Loads the dummy data shipped with ``seak``, computes the background
    kernel, derives one p-value per region with the ``diffscore_max`` kernel
    and asserts that the p-values are close to those stored in
    ``reference_results/test_full_rank_continuous_2K_computed.csv``.

    The per-region results are collected in a list and turned into a
    DataFrame once, because ``DataFrame.append`` no longer exists in
    pandas >= 2.0.
    """
    # imports
    import pkg_resources
    import time
    import numpy as np
    import pandas as pd

    from seak import construct_background_kernel
    from seak import data_loaders
    from seak import kernels
    from seak import scoretest

    data_path = pkg_resources.resource_filename('seak', 'data/')

    # Path to veps
    path_to_VEP_bed = data_path + "dummy_veps.bed"
    path_to_VEP_hdf5 = data_path + "dummy_veps.hdf5"

    # Path to genotypes
    path_to_covariates = data_path + "dummy_covariates_fixed.csv"
    path_to_plink_files_with_prefix = data_path + "full_rank_continuous"

    # Path to regions
    path_to_reference_genes_bed = data_path + "dummy_regions.bed"

    # Path to background kernel
    path_to_plink_bg_kernel = data_path + "full_rank_background_kernel"

    # Load data
    # VEPs
    hdf5_loader = data_loaders.Hdf5Loader(path_to_vep_bed=path_to_VEP_bed,
                                          path_to_vep_hdf5=path_to_VEP_hdf5,
                                          hdf5_key='diffscore')

    # Genotypes
    plink_loader = data_loaders.VariantLoaderSnpReader(
        path_to_plink_files_with_prefix + '.bed')

    # Genes
    ucsc_region_loader = data_loaders.BEDRegionLoader(
        path_to_regions_UCSC_BED=path_to_reference_genes_bed,
        chrom_to_load=1,
        drop_non_numeric_chromosomes=True)

    # Covariates
    covariate_loader_csv = data_loaders.CovariatesLoaderCSV(
        phenotype_of_interest='pheno_full_rank_continuous',
        path_to_covariates=path_to_covariates,
        covariate_column_names=['cov1', 'cov2'])

    # Background kernel
    background_kernel = construct_background_kernel.GRMLoaderSnpReader(
        path_to_plink_bg_kernel + '.bed', 10, '1')

    # Overlap individuals: genotypes and covariates
    genotypes_covariates_GRM_intersection = data_loaders.intersect_ids(
        plink_loader.get_iids(), covariate_loader_csv.get_iids())

    # Overlap genotypes with VEPs
    veps_genotypes_intersection = data_loaders.intersect_ids(
        hdf5_loader.get_vids(), plink_loader.get_vids())

    # Update respective instances
    plink_loader.update_variants(veps_genotypes_intersection)
    hdf5_loader.update_variants(veps_genotypes_intersection)

    plink_loader.update_individuals(genotypes_covariates_GRM_intersection)
    covariate_loader_csv.update_individuals(
        genotypes_covariates_GRM_intersection)
    background_kernel.update_individuals(genotypes_covariates_GRM_intersection)

    background_kernel.compute_background_kernel()

    print('nb_SNVs_unf: {}, nb_SNVs_f: {}'.format(
        background_kernel.nb_SNVs_unf, background_kernel.nb_SNVs_f))
    print('sum(diag(K)): {}'.format(np.diag(background_kernel.K0).sum()))

    Y, X = covariate_loader_csv.get_one_hot_covariates_and_phenotype(
        test_type='2K')

    null_model = scoretest.Scoretest2K(Y, X, background_kernel.K0,
                                       background_kernel.G0)

    result_columns = ['name', 'chrom', 'start', 'end', 'p_value', 'n_SNVs', 'time']
    rows = []

    for region in ucsc_region_loader:
        t_test_gene_start = time.time()

        temp_genotypes, temp_vids = plink_loader.genotypes_by_region(
            region, return_pos=False)
        if temp_genotypes is None:
            continue

        G, temp_vids = data_loaders.VariantLoader.preprocess_genotypes(
            temp_genotypes,
            temp_vids,
            impute_mean=True,
            normalize=False,
            invert_encoding=True,
            recode_maf=False)
        if G is None:
            continue

        V = hdf5_loader.anno_by_id(temp_vids)

        GV = kernels.diffscore_max(G, V, False)

        # Copy the region so the loader's own object is not modified.
        # NOTE(review): assumes `region` is mapping-like (dict or Series).
        row = dict(region)
        row['p_value'] = null_model.pv_alt_model(GV)
        row['n_SNVs'] = G.shape[1]

        t_test_gene_end = time.time()
        row['time'] = float(t_test_gene_end - t_test_gene_start)

        rows.append(row)

    # Build the frame once; keep the declared columns first and any extra
    # region fields after them (the layout the former append-based code gave).
    results = pd.DataFrame(rows)
    extra_columns = [c for c in results.columns if c not in result_columns]
    results = results.reindex(columns=result_columns + extra_columns)

    # results.to_csv('./test_full_rank_continuous_2K.csv')
    reference_path = (
        data_path +
        'reference_results/test_full_rank_continuous_2K_computed.csv')
    reference_result = pd.read_csv(reference_path, index_col=0)

    # print the file that was actually loaded
    print(reference_path)

    print('expected result:')
    print(reference_result)

    print('actual result:')
    print(results)

    print('p-value corrcoef:')
    print(np.corrcoef(reference_result['p_value'], results['p_value']))

    # results.to_csv('test_full_rank_continuous_2K_computed.csv')

    assert np.all((np.isclose(
        reference_result['p_value'],
        results['p_value']))), 'The last change in code changes the result!!'