def _parse_plink_snps_freqs_(genotype_file, snp_indices):
    """Compute allele frequencies for selected SNP rows of a PLINK file.

    Args:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        snp_indices: NumPy integer array of zero-based row numbers to read.
            The rows may be given in any order; they are assumed to be
            distinct.

    Returns:
        A float32 array aligned with ``snp_indices``: entry ``j`` is the sum
        of the genotype codes of row ``snp_indices[j]`` divided by twice the
        number of samples. An empty array is returned for empty input.

    Raises:
        AssertionError: If not every requested row was parsed.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        num_individs = len(plinkf.get_samples())
        num_snps = len(snp_indices)
        freqs_arr = sp.empty(num_snps, dtype='float32')
        if num_snps == 0:
            # Nothing requested; the work list below must never be empty.
            return freqs_arr

        # The file can only be read front to back, so rows are visited in
        # ascending order and snp_order maps each visited row back to the
        # position the caller asked for.
        snp_order = sp.argsort(snp_indices)
        pending = list(snp_indices[snp_order])
        # Read the largest row number before anything is popped, so that a
        # single requested SNP does not index into an empty list.
        max_i = pending[-1]
        pending.reverse()

        print('Iterating over file to load SNPs')
        snp_i = 0
        next_i = pending.pop()
        line_i = 0
        while line_i <= max_i:
            if line_i < next_i:
                # Row not requested: consume it and move on.
                next(plinkf)
            elif line_i == next_i:
                line = next(plinkf)
                snp = sp.array(line, dtype='int8')
                bin_counts = line.allele_counts()
                if bin_counts[-1] > 0:
                    # Code 3 (presumably plinkio's missing-genotype code) is
                    # replaced by whichever of codes 0 and 1 is more common.
                    mode_v = sp.argmax(bin_counts[:2])
                    snp[snp == 3] = mode_v
                s_i = snp_order[snp_i]
                freqs_arr[s_i] = (sp.sum(snp, dtype='float32')
                                  / (2 * float(num_individs)))
                if line_i < max_i:
                    next_i = pending.pop()
                snp_i += 1
            line_i += 1
    finally:
        # Release the file handle even if parsing fails half-way.
        plinkf.close()

    assert snp_i == len(freqs_arr), 'Failed to parse SNPs?'
    return freqs_arr
def parse_indiv_genotype(genotype_file, ref_path, hdf5_file):
    """Write individual-level genotypes for the reference SNPs to HDF5.

    Args:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        ref_path: Path of a table readable by ``pandas.read_table`` with the
            columns ``CHROM``, ``cord_bim``, ``POS``, ``A1``, ``A2`` and
            ``SNP``. ``cord_bim`` holds the row numbers passed on to
            ``_parse_plink_snps_``.
        hdf5_file: Path of the HDF5 file to create (opened with mode ``'w'``).

    The output file contains the top-level datasets ``fids``, ``iids``,
    ``M`` (number of rows in the reference table) and, when more than one
    distinct phenotype value exists, ``y``. For every chromosome ``k`` a
    group ``chrom_<k>`` holds ``raw_snps``, ``snp_stds``, ``snp_means``,
    ``freqs``, ``positions``, ``alleles`` and ``SNP``.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        samples = plinkf.get_samples()
        Y = [s.phenotype for s in samples]
        fids = [s.fid for s in samples]
        iids = [s.iid for s in samples]
    finally:
        # Only the sample records are needed from this handle; the genotypes
        # are read per chromosome by _parse_plink_snps_ below.
        plinkf.close()

    unique_phens = np.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = np.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    ref = pd.read_table(ref_path)
    chrom_list = np.unique(ref['CHROM'])

    # The context manager closes the HDF5 file even if a chromosome fails.
    with h5py.File(hdf5_file, 'w') as hf:
        if has_phenotype:
            hf.create_dataset('y', data=Y)
        hf.create_dataset('fids', data=fids)
        hf.create_dataset('iids', data=iids)
        hf.create_dataset('M', data=[ref.shape[0]])

        for k in chrom_list:
            chunk = ref[ref['CHROM'] == k]
            snp_indices = np.array(chunk['cord_bim'].tolist())
            print('Extracting genotypes of chromosomes %d from genotype_file' % k)
            raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
            print('raw_snps.shape= %s' % (raw_snps.shape,))

            # Mean and standard deviation are derived from the allele
            # frequency (2f and sqrt(2f(1-f))) rather than measured on the
            # genotype matrix.
            snp_stds = np.sqrt(2 * freqs * (1 - freqs))
            snp_means = freqs * 2

            g = hf.create_group('chrom_%d' % k)
            print('Writing genotypes of chromosomes %d to hdf5_file' % k)
            g.create_dataset('raw_snps', data=raw_snps, compression='lzf')
            g.create_dataset('snp_stds', data=snp_stds)
            g.create_dataset('snp_means', data=snp_means)
            g.create_dataset('freqs', data=freqs)
            g.create_dataset('positions', data=chunk['POS'].tolist())
            # list(...) so that a concrete sequence is passed on Python 3,
            # where zip returns an iterator.
            g.create_dataset('alleles',
                             data=list(zip(chunk['A1'].tolist(),
                                           chunk['A2'].tolist())))
            g.create_dataset('SNP', data=chunk['SNP'].tolist())
            hf.flush()

    print('individual-level data hdf5 written to %s!' % hdf5_file)
def parse_plink_snps(genotype_file, snp_indices):
    """Load selected SNP rows of a PLINK file, recoded as ``2 - code``.

    Args:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        snp_indices: NumPy integer array of zero-based row numbers to read.
            The rows may be given in any order; they are assumed to be
            distinct.

    Returns:
        A tuple ``(raw_snps, freqs)``. ``raw_snps`` is an int8 array of shape
        ``(len(snp_indices), num_samples)`` whose row ``j`` holds the
        genotypes of row ``snp_indices[j]``, stored as ``2 - code`` so that
        the count refers to allele A1 instead of plinkio's default.
        ``freqs`` is the float32 row sum of ``raw_snps`` divided by twice the
        number of samples. Both are empty for empty input.

    Raises:
        AssertionError: If not every requested row was parsed.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        num_individs = len(plinkf.get_samples())
        num_snps = len(snp_indices)
        raw_snps = sp.empty((num_snps, num_individs), dtype='int8')
        if num_snps == 0:
            # Nothing requested; the work list below must never be empty.
            return raw_snps, sp.empty(0, dtype='float32')

        # The file can only be read front to back, so rows are visited in
        # ascending order and snp_order maps each visited row back to the
        # position the caller asked for.
        snp_order = sp.argsort(snp_indices)
        pending = list(snp_indices[snp_order])
        # Read the largest row number before anything is popped, so that a
        # single requested SNP does not index into an empty list.
        max_i = pending[-1]
        pending.reverse()

        print('Iterating over file to load SNPs')
        snp_i = 0
        next_i = pending.pop()
        line_i = 0
        while line_i <= max_i:
            if line_i < next_i:
                # Row not requested: consume it and move on.
                next(plinkf)
            elif line_i == next_i:
                line = next(plinkf)
                snp = sp.array(line, dtype='int8')
                bin_counts = line.allele_counts()
                if bin_counts[-1] > 0:
                    # Code 3 (presumably plinkio's missing-genotype code) is
                    # replaced by whichever of codes 0 and 1 is more common.
                    mode_v = sp.argmax(bin_counts[:2])
                    snp[snp == 3] = mode_v
                s_i = snp_order[snp_i]
                # A1 must be the counted allele, which is the opposite of
                # the plinkio default, hence the recoding.
                raw_snps[s_i] = 2 - snp
                if line_i < max_i:
                    next_i = pending.pop()
                snp_i += 1
            line_i += 1
    finally:
        # Release the file handle even if parsing fails half-way.
        plinkf.close()

    assert snp_i == len(raw_snps), 'Failed to parse SNPs?'
    freqs = sp.sum(raw_snps, 1, dtype='float32') / (2 * float(num_individs))
    return raw_snps, freqs
def parse_plink_snps(genotype_file, snp_indices):
    """Load selected SNP rows of a PLINK file as an int8 genotype matrix.

    Args:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        snp_indices: NumPy integer array of zero-based row numbers to read.
            The rows may be given in any order; they are assumed to be
            distinct.

    Returns:
        A tuple ``(raw_snps, freqs)``. ``raw_snps`` is an int8 array of shape
        ``(len(snp_indices), num_samples)`` whose row ``j`` holds the
        genotype codes of row ``snp_indices[j]`` exactly as delivered by
        plinkio. ``freqs`` is the float32 row sum of ``raw_snps`` divided by
        twice the number of samples. Both are empty for empty input.

    Raises:
        AssertionError: If not every requested row was parsed.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        num_individs = len(plinkf.get_samples())
        num_snps = len(snp_indices)
        raw_snps = sp.empty((num_snps, num_individs), dtype='int8')
        if num_snps == 0:
            # Nothing requested; the work list below must never be empty.
            return raw_snps, sp.empty(0, dtype='float32')

        # The file can only be read front to back, so rows are visited in
        # ascending order and snp_order maps each visited row back to the
        # position the caller asked for.
        snp_order = sp.argsort(snp_indices)
        pending = list(snp_indices[snp_order])
        # Read the largest row number before anything is popped, so that a
        # single requested SNP does not index into an empty list.
        max_i = pending[-1]
        pending.reverse()

        snp_i = 0
        next_i = pending.pop()
        line_i = 0
        while line_i <= max_i:
            if line_i < next_i:
                # Row not requested: consume it and move on.
                next(plinkf)
            elif line_i == next_i:
                line = next(plinkf)
                snp = sp.array(line, dtype='int8')
                bin_counts = line.allele_counts()
                if bin_counts[-1] > 0:
                    # Code 3 (presumably plinkio's missing-genotype code) is
                    # replaced by whichever of codes 0 and 1 is more common.
                    mode_v = sp.argmax(bin_counts[:2])
                    snp[snp == 3] = mode_v
                s_i = snp_order[snp_i]
                raw_snps[s_i] = snp
                if line_i < max_i:
                    next_i = pending.pop()
                snp_i += 1
            line_i += 1
    finally:
        # Release the file handle even if parsing fails half-way.
        plinkf.close()

    assert snp_i == len(raw_snps), 'Parsing SNPs from plink file failed.'
    freqs = sp.sum(raw_snps, 1, dtype='float32') / (2 * float(num_individs))
    return raw_snps, freqs
def get_prs_bins(genotype_file, rs_id_map, K_bins=1, phen_map=None,
                 lasso=False, sets=False, verbose=False):
    """Compute polygenic risk scores, optionally split into effect-size bins.

    Args:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        rs_id_map: Mapping from SNP name (``locus.name``) to a dict with the
            keys ``'nts'``, ``'raw_beta'`` and ``'upd_pval_beta'``; with
            ``lasso`` also ``'upd_pval_beta_lasso'``; with ``sets`` also
            ``'upd_pval_beta_high'``, ``'upd_pval_beta_lasso_high'``,
            ``'upd_pval_beta_low'`` and ``'upd_pval_beta_lasso_low'``. When
            ``K_bins > 1`` the extra key ``'bins_extremes'`` must hold an
            array of bin boundaries that is compared against the squared
            ``upd_pval_beta``.
        K_bins: Number of bins. Per-bin scores are only kept for values
            greater than 1.
        phen_map: Optional mapping from individual ID to a dict with the key
            ``'phen'`` and optionally ``'pcs'``, ``'sex'``, ``'covariates'``.
            When omitted, every sample whose ``affection`` differs from 2 is
            used and its ``affection`` value is the phenotype.
        lasso: Also accumulate and report the lasso-weighted score.
        sets: Also accumulate and report the high/low set scores.
        verbose: Print intermediate r2 values every 500000 used SNPs.

    Returns:
        A dict with ``'raw_effects_prs'``, ``'pval_derived_effects_prs'``,
        ``'true_phens'`` and ``'iids'``; for ``K_bins > 1`` additionally
        ``'pval_derived_effects_prs_bin_<k>'`` for k in 1..K_bins; and
        ``'pcs'``, ``'sex'``, ``'covariates'`` when ``phen_map`` supplied
        them.

    Raises:
        AssertionError: If no individual has a phenotype, phenotypes contain
            NaN, optional per-individual information is incomplete, or a
            score becomes NaN.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        samples = plinkf.get_samples()

        # 1. Figure out which individuals to keep and their phenotypes.
        indiv_filter = sp.zeros(len(samples), dtype=bool)
        true_phens = []
        iids = []
        # Created unconditionally: the result assembly at the end inspects
        # these lists even when no phen_map was supplied.
        pcs = []
        sex = []
        covariates = []
        if phen_map is not None:
            phen_iids = set(phen_map.keys())
            for samp_i, sample in enumerate(samples):
                if sample.iid not in phen_iids:
                    continue
                indiv_filter[samp_i] = True
                phen_info = phen_map[sample.iid]
                true_phens.append(phen_info['phen'])
                iids.append(sample.iid)
                if 'pcs' in phen_info:
                    pcs.append(phen_info['pcs'])
                if 'sex' in phen_info:
                    sex.append(phen_info['sex'])
                if 'covariates' in phen_info:
                    covariates.append(phen_info['covariates'])
            if len(pcs) > 0:
                assert len(pcs) == len(true_phens), \
                    'PC information missing for some individuals with phenotypes'
            if len(sex) > 0:
                assert len(sex) == len(true_phens), \
                    'Sex information missing for some individuals with phenotypes'
            if len(covariates) > 0:
                assert len(covariates) == len(true_phens), \
                    'Covariates missing for some individuals with phenotypes'
        else:
            for samp_i, sample in enumerate(samples):
                if sample.affection != 2:
                    indiv_filter[samp_i] = True
                    true_phens.append(sample.affection)
                    iids.append(sample.iid)

        num_individs = sp.sum(indiv_filter)
        assert num_individs > 0, 'Issues in parsing the phenotypes and/or PCs?'
        assert not sp.any(sp.isnan(true_phens)), \
            'Phenotypes appear to have some NaNs, or parsing failed.'
        print('%d individuals have phenotype and genotype information.'
              % num_individs)

        num_non_matching_nts = 0
        num_flipped_nts = 0
        raw_effects_prs = sp.zeros(num_individs)
        pval_derived_effects_prs = sp.zeros(num_individs)

        # Optional scores: (rs_info key, progress label, final label).
        optional_scores = []
        if lasso:
            optional_scores.append(('upd_pval_beta_lasso',
                                    'Weigted effects PRS Lasso',
                                    'Weigted effects LASSO PRS'))
        if sets:
            optional_scores.extend([
                ('upd_pval_beta_high',
                 'Weigted effects HIGH PRS',
                 'Weigted effects HIGH PRS'),
                ('upd_pval_beta_lasso_high',
                 'Weigted effects HIGH PRS Lasso',
                 'Weigted effects HIGH LASSO PRS'),
                ('upd_pval_beta_low',
                 'Weigted effects LOW PRS',
                 'Weigted effects LOW PRS'),
                ('upd_pval_beta_lasso_low',
                 'Weigted effects LOW PRS Lasso',
                 'Weigted effects LOW LASSO PRS'),
            ])
        optional_prs = [sp.zeros(num_individs) for _ in optional_scores]

        use_bins = K_bins > 1
        bins_prs_dict = {}
        bins_bounds = None
        if use_bins:
            for bk in range(1, K_bins + 1):
                bins_prs_dict["prs_bin_%d" % bk] = sp.zeros(num_individs)
            bins_bounds = rs_id_map["bins_extremes"]

        print('Iterating over BED file to calculate risk scores.')
        locus_list = plinkf.get_loci()
        snp_i = 0
        # Lazy pairing on both Python 2 (izip) and Python 3 (zip); the
        # genotype rows must not be materialised all at once.
        lazy_zip = getattr(it, 'izip', zip)
        for locus, row in lazy_zip(locus_list, plinkf):
            try:
                rs_info = rs_id_map[locus.name]
            except KeyError:
                # SNP is not part of the summary statistics.
                continue
            if rs_info['upd_pval_beta'] == 0:
                continue

            # Check whether the nucleotides are OK, and potentially flip.
            ss_nt = rs_info['nts']
            g_nt = [locus.allele1, locus.allele2]
            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            sign = 1
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                # Neither strand matches as-is; accept the SNP only when the
                # two alleles are swapped, and then negate its effects.
                flip_nts = (
                    (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1])
                    or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]))
                if not flip_nts:
                    num_non_matching_nts += 1
                    continue
                sign = -1
                num_flipped_nts += 1

            raw_beta = sign * rs_info['raw_beta']
            upd_pval_beta = sign * rs_info['upd_pval_beta']

            # Parse SNP, and fill in the blanks if necessary.
            snp = sp.array(row, dtype='int8')[indiv_filter]
            bin_counts = row.allele_counts()
            if bin_counts[-1] > 0:
                mode_v = sp.argmax(bin_counts[:2])
                snp[snp == 3] = mode_v

            # Update scores and move on.
            raw_effects_prs += snp * raw_beta
            assert not sp.any(sp.isnan(raw_effects_prs)), \
                'Raw effects PRS is corrupted'
            snpi_b = snp * upd_pval_beta
            pval_derived_effects_prs += snpi_b
            if use_bins:
                # First boundary that is >= the squared effect selects the
                # bin; without bins there is nothing to update.
                bin_number = sp.where(bins_bounds >= upd_pval_beta ** 2)[0][0]
                bins_prs_dict["prs_bin_%d" % bin_number] += snpi_b
            assert not sp.any(sp.isnan(pval_derived_effects_prs)), \
                'Weighted effects PRS is corrupted'

            # Add this SNP to the optional scores so that the r2 values
            # reported for them are not computed on all-zero vectors.
            for (beta_key, _, _), prs in zip(optional_scores, optional_prs):
                prs += snp * (sign * rs_info[beta_key])

            if verbose and snp_i > 0 and snp_i % 500000 == 0:
                print("PRS using %d SNPS" % snp_i)
                progress = [('Raw effects PRS', raw_effects_prs),
                            ('Weigted effects PRS', pval_derived_effects_prs)]
                progress.extend(
                    (labels[1], prs)
                    for labels, prs in zip(optional_scores, optional_prs))
                for label, prs in progress:
                    r2 = (sp.corrcoef(prs, true_phens)[0, 1]) ** 2
                    print('%s r2: %0.4f' % (label, r2))

            snp_i += 1
    finally:
        # Release the file handle even if scoring fails half-way.
        plinkf.close()

    print("DONE!")
    print('Number of non-matching NTs: %d' % num_non_matching_nts)
    print('Number of flipped NTs: %d' % num_flipped_nts)
    final = [('Raw effects PRS', raw_effects_prs),
             ('Weigted effects PRS', pval_derived_effects_prs)]
    final.extend((labels[2], prs)
                 for labels, prs in zip(optional_scores, optional_prs))
    for label, prs in final:
        corr = sp.corrcoef(prs, true_phens)[0, 1]
        print('%s correlation: %0.4f' % (label, corr))
        print('%s r2: %0.4f' % (label, corr ** 2))

    ret_dict = {
        'raw_effects_prs': raw_effects_prs.copy(),
        'pval_derived_effects_prs': pval_derived_effects_prs.copy(),
        'true_phens': true_phens[:],
        'iids': iids,
    }
    if use_bins:
        for bk in range(1, K_bins + 1):
            ret_dict["pval_derived_effects_prs_bin_%d" % bk] = \
                bins_prs_dict["prs_bin_%d" % bk].copy()
    if len(pcs) > 0:
        ret_dict['pcs'] = pcs
    if len(sex) > 0:
        ret_dict['sex'] = sex
    if len(covariates) > 0:
        ret_dict['covariates'] = covariates
    return ret_dict
def coordinate_datasets(reference_genotype_file, hdf5_file, summary_dict,
                        validation_genotype_file=None,
                        genetic_map_dir=None,
                        min_maf=0.01,
                        skip_coordination=False,
                        max_freq_discrep=0.15,
                        debug=False):
    """Coordinate summary statistics with LD reference (and validation) data.

    For every chromosome of the LD reference genotypes, the SNPs shared by
    all data sets are ordered by reference position, their nucleotides are
    reconciled (strand and allele swaps), SNPs failing the frequency and MAF
    filters are dropped, and the result is written with ``write_coord_data``
    into the ``cord_data`` group of ``hdf5_file``.

    Args:
        reference_genotype_file: PLINK path of the LD reference genotypes.
        hdf5_file: Open, writable HDF5 handle that already has a
            ``sum_stats`` group with one ``chrom_<n>`` subgroup per
            chromosome (datasets ``sids``, ``nts``, ``betas``, ``log_odds``,
            ``ns``, ``ps`` and optionally ``freqs``).
        summary_dict: Dict that receives numbered summary entries of the form
            ``{'name': ..., 'value': ...}``; modified in place.
        validation_genotype_file: Optional PLINK path of validation
            genotypes. When given, ``fids``/``iids`` (and ``y`` if phenotypes
            exist) are written to ``hdf5_file`` and validation genotypes are
            stored next to the reference genotypes.
        genetic_map_dir: Optional directory prefix of
            ``chr<n>.interpolated_genetic_map.gz`` files.
        min_maf: SNPs whose reference frequency is not strictly between
            ``min_maf`` and ``1 - min_maf`` are removed.
        skip_coordination: Skip the nucleotide checks and keep every common
            SNP as-is.
        max_freq_discrep: Maximum tolerated difference between summary
            statistic and reference frequency (either allele orientation);
            the filter is disabled for values >= 1 or when the summary
            statistics carry no frequencies.
        debug: Print verbose diagnostics instead of a progress percentage.

    Returns:
        None. Results are written to ``hdf5_file`` and ``summary_dict``.
    """
    summary_dict[3.9] = {'name': 'dash', 'value': 'Coordination'}
    t0 = time.time()
    has_validation = validation_genotype_file is not None
    if has_validation:
        print('Coordinating datasets (Summary statistics, LD reference genotypes, and Validation genotypes).')
    else:
        print('Coordinating datasets (Summary statistics and LD reference genotypes).')

    plinkf = plinkfile.PlinkFile(reference_genotype_file)
    try:
        # Figure out chromosomes and positions.
        if debug:
            print('Parsing plinkf_dict_val reference genotypes')
        loci = plinkf.get_loci()
    finally:
        plinkf.close()
    summary_dict[4] = {'name': 'Num individuals in LD Reference data:',
                       'value': plinkfiles.get_num_indivs(reference_genotype_file)}
    summary_dict[4.1] = {'name': 'SNPs in LD Reference data:', 'value': len(loci)}
    chromosomes = sp.unique([l.chromosome for l in loci])
    chromosomes.sort()

    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)

    if has_validation:
        if debug:
            print('Parsing LD validation bim file')
        plinkf_val = plinkfile.PlinkFile(validation_genotype_file)
        try:
            # Loads only the individuals...
            plinkf_dict_val = plinkfiles.get_phenotypes(plinkf_val)
            loci_val = plinkf_val.get_loci()
        finally:
            plinkf_val.close()
        summary_dict[5] = {'name': 'SNPs in Validation data:', 'value': len(loci_val)}

        chr_dict_val = plinkfiles.get_chrom_dict(loci_val, chromosomes)

        # Open HDF5 file and prepare out data
        assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.'
        if plinkf_dict_val['has_phenotype']:
            hdf5_file.create_dataset('y', data=plinkf_dict_val['phenotypes'])
            summary_dict[6] = {'name': 'Num validation phenotypes:',
                               'value': plinkf_dict_val['num_individs']}

        hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict_val['fids'], dtype=util.fids_dtype))
        hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict_val['iids'], dtype=util.iids_dtype))

        maf_adj_risk_scores = sp.zeros(plinkf_dict_val['num_individs'])

    # Now summary statistics
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    chromosomes_found = set()
    num_snps_common_before_filtering = 0
    num_snps_common_after_filtering = 0
    tot_num_non_matching_nts = 0
    tot_num_non_supported_nts = 0
    tot_num_ambig_nts = 0
    tot_num_freq_discrep_filtered_snps = 0
    tot_num_maf_filtered_snps = 0
    tot_g_ss_nt_concord_count = 0
    if has_validation:
        tot_g_vg_nt_concord_count = 0
        tot_vg_ss_nt_concord_count = 0

    # Now iterate over chromosomes
    for chrom_i, chrom in enumerate(chromosomes, 1):
        if not debug:
            sys.stdout.write('\r%0.2f%%' % (100.0 * (float(chrom_i) / (len(chromosomes) + 1))))
            sys.stdout.flush()
        chr_str = 'chrom_%d' % chrom
        try:
            # Summary statistics group of this chromosome.
            ssg = ssf[chr_str]
        except KeyError as err_str:
            print(err_str)
            print('Did not find chromosome %d in SS dataset.' % chrom)
            print('Continuing.')
            continue

        if debug:
            print('Coordinating data for chromosome %s' % chr_str)

        chromosomes_found.add(chrom)
        has_ss_freqs = 'freqs' in ssg

        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        if has_validation:
            chrom_d_val = chr_dict_val[chr_str]
            vg_sids = chrom_d_val['sids']
            common_sids = sp.intersect1d(ss_sids, vg_sids)
            # A map from sid to index for validation data
            vg_sid_dict = dict((sid, i) for i, sid in enumerate(vg_sids))
        else:
            common_sids = ss_sids

        # A map from sid to index for summary stats
        ss_sid_dict = dict((sid, i) for i, sid in enumerate(ss_sids))

        # The indices to retain for the LD reference genotypes
        chrom_d = chr_dict[chr_str]
        g_sids = chrom_d['sids']
        common_sids = sp.intersect1d(common_sids, g_sids)
        # A map from sid to index for LD reference data
        g_sid_dict = dict((sid, i) for i, sid in enumerate(g_sids))

        if debug:
            print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom))
            print('Ordering SNPs by genomic positions (based on LD reference genotypes).')

        g_snp_map = [g_sid_dict[sid] for sid in common_sids]

        # order by positions (based on LD reference file)
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        g_snp_map = sp.array(g_snp_map)[order].tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the ordered sum stats SNPs indices.
        ss_snp_map = [ss_sid_dict[sid] for sid in common_sids]

        # Get the ordered validation SNPs indices
        if has_validation:
            vg_snp_map = [vg_sid_dict[sid] for sid in common_sids]
            vg_nts = sp.array(chrom_d_val['nts'])
            vg_nts_ok = sp.array(vg_nts)[vg_snp_map]

        g_nts = sp.array(chrom_d['nts'])
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        # Each SNP contributes two nucleotide comparisons, hence the / 2.0.
        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        if has_validation:
            vg_ss_nt_concord_count = sp.sum(vg_nts_ok == ss_nts[ss_snp_map]) / 2.0
            g_vg_nt_concord_count = sp.sum(g_nts[g_snp_map] == vg_nts_ok) / 2.0
            if debug:
                print('Nucleotide concordance counts out of %d genotypes, vg-rg: %d ; vg-ss: %d' % (len(g_snp_map), g_vg_nt_concord_count, vg_ss_nt_concord_count))
            tot_vg_ss_nt_concord_count += vg_ss_nt_concord_count
            tot_g_vg_nt_concord_count += g_vg_nt_concord_count
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        if debug:
            print('Nucleotide concordance counts out of %d genotypes, rg-ss: %d' % (len(g_snp_map), g_ss_nt_concord_count))

        num_freq_discrep_filtered_snps = 0
        num_non_matching_nts = 0
        num_non_supported_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        ok_indices = {'g': [], 'ss': []}
        if has_validation:
            ok_indices['vg'] = []
            index_triples = zip(g_snp_map, vg_snp_map, ss_snp_map)
        else:
            index_triples = zip(g_snp_map, [None] * len(g_snp_map), ss_snp_map)

        # Now loop over SNPs to coordinate nucleotides. The same rules apply
        # with and without validation genotypes; validation only adds the
        # requirement that its nucleotides agree with the reference.
        for g_i, vg_i, ss_i in index_triples:
            # To make sure, is the SNP id the same?
            if has_validation:
                assert g_sids[g_i] == vg_sids[vg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
            else:
                assert g_sids[g_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            if not skip_coordination:
                ss_nt = ss_nts[ss_i]

                # Is the nucleotide ambiguous.
                g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                if tuple(g_nt) in util.ambig_nts:
                    num_ambig_nts += 1
                    continue

                # First check if nucleotide is sane?
                if g_nt[0] not in util.valid_nts or g_nt[1] not in util.valid_nts:
                    num_non_supported_nts += 1
                    continue

                os_g_nt = sp.array(
                    [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]])

                if has_validation:
                    # Validation nucleotides must equal the reference ones,
                    # on the same or the opposite strand.
                    vg_nt = vg_nts[vg_i]
                    if not (sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt)):
                        num_non_matching_nts += 1
                        continue

                if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                    # No direct match on either strand: accept the SNP only
                    # if the summary statistics list the alleles swapped.
                    flip_nts = (
                        (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1])
                        or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]))
                    if not flip_nts:
                        if debug:
                            print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" %
                                  (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt)))
                        num_non_matching_nts += 1
                        continue
                    # Flip the summary statistics to the reference alleles.
                    betas[ss_i] = -betas[ss_i]
                    log_odds[ss_i] = -log_odds[ss_i]
                    # Frequencies <= 0 mark "not available" (see the
                    # frequency filter below) and must stay untouched.
                    if has_ss_freqs and ss_freqs[ss_i] > 0:
                        ss_freqs[ss_i] = 1.0 - ss_freqs[ss_i]

            # everything seems ok.
            ok_indices['g'].append(g_i)
            if has_validation:
                ok_indices['vg'].append(vg_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        if debug:
            print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
            print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)

        g_ok = ok_indices['g']
        ss_ok = ok_indices['ss']

        # Now parse SNPs ..
        # Pinpoint where the SNPs are in the file.
        snp_indices = sp.array(chrom_d['snp_indices'])[g_ok]
        raw_snps, freqs = plinkfiles.parse_plink_snps(
            reference_genotype_file, snp_indices)

        # All per-SNP arrays, keyed by the name under which they are written,
        # so that both filters below can be applied to every array at once.
        per_snp = {
            'raw_snps_ref': raw_snps,
            'snp_stds_ref': sp.sqrt(2 * freqs * (1 - freqs)),
            'snp_means_ref': freqs * 2,
            'freqs_ref': freqs,
            'ps': ssg['ps'][...][ss_ok],
            'ns': ssg['ns'][...][ss_ok],
            'positions': sp.array(chrom_d['positions'])[g_ok],
            'nts': sp.array(ok_nts),
            'sids': ss_sids[ss_ok],
            'betas': betas[ss_ok],
            'log_odds': log_odds[ss_ok],
        }

        # Parse validation genotypes, if available
        if has_validation:
            # Pinpoint where the SNPs are in the file.
            snp_indices_val = sp.array(chrom_d_val['snp_indices'])[ok_indices['vg']]
            raw_snps_val, freqs_val = plinkfiles.parse_plink_snps(
                validation_genotype_file, snp_indices_val)
            per_snp['raw_snps_val'] = raw_snps_val
            per_snp['snp_stds_val'] = sp.sqrt(2 * freqs_val * (1 - freqs_val))
            per_snp['snp_means_val'] = freqs_val * 2
            per_snp['freqs_val'] = freqs_val

        # Check SNP frequencies, screen for possible problems..
        if max_freq_discrep < 1 and has_ss_freqs:
            ss_freqs = ss_freqs[ss_ok]
            ref_freqs = per_snp['freqs_ref']
            # A SNP passes if the frequencies agree in either orientation.
            ok_freq_snps = sp.logical_or(
                sp.absolute(ss_freqs - ref_freqs) < max_freq_discrep,
                sp.absolute(ss_freqs + ref_freqs - 1) < max_freq_discrep)
            # Only consider SNPs that actually have frequencies
            ok_freq_snps = sp.logical_or(ok_freq_snps, ss_freqs <= 0)
            num_freq_discrep_filtered_snps = len(ok_freq_snps) - sp.sum(ok_freq_snps)
            assert num_freq_discrep_filtered_snps >= 0, "Problems when filtering SNPs with frequency discrepencies"
            if num_freq_discrep_filtered_snps > 0:
                per_snp = dict((key, values[ok_freq_snps])
                               for key, values in per_snp.items())
            if debug:
                print('Filtered %d SNPs due to frequency discrepancies' % num_freq_discrep_filtered_snps)

        # Filter minor allele frequency SNPs.
        ref_freqs = per_snp['freqs_ref']
        maf_filter = sp.logical_and(ref_freqs > min_maf, ref_freqs < (1 - min_maf))
        num_maf_filtered_snps = len(maf_filter) - sp.sum(maf_filter)
        assert num_maf_filtered_snps >= 0, "Problems when filtering SNPs with low minor allele frequencies"
        if num_maf_filtered_snps > 0:
            per_snp = dict((key, values[maf_filter])
                           for key, values in per_snp.items())
        if debug:
            print('Filtered %d SNPs due to low MAF' % num_maf_filtered_snps)

        if genetic_map_dir is not None:
            # NOTE: the map file is read, but nothing is extracted from it
            # yet, so the stored genetic map is always an empty list.
            genetic_map = []
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for _line in f:
                    pass
        else:
            genetic_map = None

        coord_data_dict = dict(per_snp)
        coord_data_dict['chrom'] = chr_str
        coord_data_dict['genetic_map'] = genetic_map

        if has_validation:
            maf_adj_prs = sp.dot(per_snp['log_odds'], per_snp['raw_snps_val'])
            if debug and plinkf_dict_val['has_phenotype']:
                maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_prs)[0, 1]
                print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))
            coord_data_dict['log_odds_prs'] = maf_adj_prs
            maf_adj_risk_scores += maf_adj_prs

        write_coord_data(cord_data_g, coord_data_dict, debug=debug)
        num_retained = len(per_snp['sids'])
        if debug:
            print('%d SNPs were retained on chromosome %d.' % (num_retained, chrom))

        num_snps_common_before_filtering += len(common_sids)
        num_snps_common_after_filtering += num_retained
        tot_num_ambig_nts += num_ambig_nts
        tot_num_non_supported_nts += num_non_supported_nts
        tot_num_non_matching_nts += num_non_matching_nts
        tot_num_freq_discrep_filtered_snps += num_freq_discrep_filtered_snps
        tot_num_maf_filtered_snps += num_maf_filtered_snps

    if not debug:
        sys.stdout.write('\r%0.2f%%\n' % (100.0))
        sys.stdout.flush()

    # Now calculate the prediction r^2
    if debug:
        if has_validation:
            if plinkf_dict_val['has_phenotype']:
                maf_adj_corr = sp.corrcoef(
                    plinkf_dict_val['phenotypes'], maf_adj_risk_scores)[0, 1]
                print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' % (maf_adj_corr, maf_adj_corr ** 2))
            print('Overall nucleotide concordance counts: rg_vg: %d, rg_ss: %d, vg_ss: %d' % (tot_g_vg_nt_concord_count, tot_g_ss_nt_concord_count, tot_vg_ss_nt_concord_count))
        else:
            print('Overall nucleotide concordance counts, rg_ss: %d' % (tot_g_ss_nt_concord_count))

    summary_dict[7] = {'name': 'Num chromosomes used:', 'value': len(chromosomes_found)}
    summary_dict[8] = {'name': 'SNPs common across datasets:', 'value': num_snps_common_before_filtering}
    summary_dict[9] = {'name': 'SNPs retained after filtering:', 'value': num_snps_common_after_filtering}
    if tot_num_ambig_nts > 0:
        summary_dict[10] = {'name': 'SNPs w ambiguous nucleotides filtered:', 'value': tot_num_ambig_nts}
    if tot_num_non_supported_nts > 0:
        summary_dict[10.1] = {'name': 'SNPs w unknown/unsupported nucleotides filtered:', 'value': tot_num_non_supported_nts}
    if tot_num_non_matching_nts > 0:
        summary_dict[11] = {'name': 'SNPs w other nucleotide discrepancies filtered:', 'value': tot_num_non_matching_nts}
    if min_maf > 0:
        summary_dict[12] = {'name': 'SNPs w MAF<%0.3f filtered:' % min_maf, 'value': tot_num_maf_filtered_snps}
    if max_freq_discrep < 0.5:
        summary_dict[13] = {'name': 'SNPs w allele freq discrepancy > %0.3f filtered:' % max_freq_discrep, 'value': tot_num_freq_discrep_filtered_snps}

    t1 = time.time()
    t = (t1 - t0)
    summary_dict[13.9] = {'name': 'dash', 'value': 'Running times'}
    summary_dict[15] = {'name': 'Run time for coordinating datasets:', 'value': '%d min and %0.2f sec' % (t / 60, t % 60)}
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
                                     reference_genotype_file=None,
                                     hdf5_file=None,
                                     genetic_map_dir=None,
                                     check_mafs=False,
                                     min_maf=0.01):
    """
    Coordinate validation genotypes, LD-reference genotypes and summary stats.

    For every chromosome present in the validation genotype file, the SNPs
    shared by the validation genotypes, the LD-reference genotypes and the
    ``sum_stats`` group of ``hdf5_file`` are matched on SNP id, ordered by
    validation position, checked for nucleotide consistency (negating
    ``betas``/``log_odds`` when the summary-statistic alleles are swapped),
    filtered on allele frequency, and written to
    ``hdf5_file['cord_data']['chrom_<n>']``.

    Parameters:
        genotype_file: Path handed to ``plinkfile.PlinkFile`` for the
            validation genotypes.
        reference_genotype_file: Path handed to ``plinkfile.PlinkFile`` for
            the LD-reference genotypes.
        hdf5_file: Open, writable HDF5 handle that already holds a
            ``sum_stats`` group and does not yet contain ``iids``.
        genetic_map_dir: Optional prefix that is concatenated directly with
            ``'chr<n>.interpolated_genetic_map.gz'`` (so it needs its own
            trailing separator).
        check_mafs: When true and the summary statistics carry ``freqs``,
            drop SNPs whose frequency differs by more than 0.15.
        min_maf: SNPs with validation frequency outside
            ``(min_maf, 1 - min_maf)`` are dropped.

    Raises:
        AssertionError: If ``hdf5_file`` already contains ``iids``, if a
            two-valued phenotype is not coded so that ``bincount`` yields two
            bins, or if the SNP ids of the three sources disagree.
    """
    print('Coordinating things w genotype file: %s \nref. genot. file: %s' % (
        genotype_file, reference_genotype_file))
    plinkf = plinkfile.PlinkFile(genotype_file)

    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    # Figure out chromosomes and positions.
    print('Parsing validation genotype bim file')
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]
    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print('Parsing LD reference genotype bim file')
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)

    # Open HDF5 file and prepare out data
    assert 'iids' not in hdf5_file.keys(), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # NOTE(review): the genome-wide accumulators below are updated per
    # chromosome but never reported or returned in the code visible here.
    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf[chr_str]
        except Exception as err_str:
            # Best effort: a chromosome missing from the summary statistics
            # is reported and skipped.
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print('Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' % (
            len(g_sids), len(rg_sids), len(ss_sids)))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print('Found %d SNPs on chrom %d that were common across all datasets' % (
            len(common_sids), chrom))

        # SNP id -> index lookups for each of the three sources.
        ss_sid_dict = dict((sid, i) for i, sid in enumerate(ss_sids))
        g_sid_dict = dict((sid, i) for i, sid in enumerate(g_sids))
        rg_sid_dict = dict((sid, i) for i, sid in enumerate(rg_sids))

        # Order the common SNPs by their position in the validation data.
        g_snp_map = [g_sid_dict[sid] for sid in common_sids]
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        g_snp_map = sp.array(g_snp_map)[order].tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps, in the same (position) order.
        rg_snp_map = [rg_sid_dict[sid] for sid in common_sids]
        ss_snp_map = [ss_sid_dict[sid] for sid in common_sids]

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = rg_nts[rg_snp_map]
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        has_ss_freqs = 'freqs' in ssg.keys()
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        print('Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d' % (
            len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count,
            rg_ss_nt_concord_count))
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):
            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[ss_i], \
                'Some issues with coordinating the genotypes.'
            rg_nt = rg_nts[rg_i]
            ss_nt = ss_nts[ss_i]
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            # Is the nucleotide ambiguous.
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (g_nt[0] not in valid_nts) or (g_nt[1] not in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            matches_rg = sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt)
            matches_ss = sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)

            if not matches_rg:
                # Validation and LD reference disagree: nothing to salvage.
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            if not matches_ss:
                # Try flipping the SS nt (same or opposite strand).
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if not flip_nts:
                    print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" %
                          (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt)))
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                betas[ss_i] = -betas[ss_i]
                log_odds[ss_i] = -log_odds[ss_i]
                if has_ss_freqs:
                    ss_freqs[ss_i] = 1 - ss_freqs[ss_i]

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)
        print('%d SNPs were retained on chromosome %d.' % (
            len(ok_indices['g']), chrom))

        # Already in position order because the maps were sorted above.
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs: pinpoint where the SNPs are in each file.
        snp_indices = sp.array(chrom_d['snp_indices'])[ok_indices['g']]
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])[ok_indices['rg']]
        raw_ref_snps, freqs_ref = _parse_plink_snps_(
            reference_genotype_file, snp_indices_ref)

        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)
        sids = ssg['sids'][...][ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_ss_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum(
                    freq_discrepancy_snp))

                # Filter freq_discrepancy_snps.  `~` is used because
                # negative() is rejected for boolean arrays by newer NumPy.
                ok_freq_snps = ~freq_discrepancy_snp
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        if sp.sum(maf_filter) < len(maf_filter):
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (
                chrom, maf_adj_corr))

        genetic_map = []
        if genetic_map_dir is not None:
            # The set of retained SNP ids was previously referenced here
            # without ever being defined.
            sid_set = set(sids)
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    fields = line.split()
                    if fields[0] in sid_set:
                        genetic_map.append(fields[0])

        print('Now storing coordinated data to HDF5 file.')
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)

        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
def coordinate_genot_ss(genotype_file=None,
                        hdf5_file=None,
                        genetic_map_dir=None,
                        check_mafs=False,
                        min_maf=0.01):
    """
    Coordinate a PLINK BED genotype file with the summary statistics.

    For every chromosome in the genotype file, the SNPs shared with the
    ``sum_stats`` group of ``hdf5_file`` are matched on SNP id, checked for
    nucleotide consistency (negating ``betas``/``log_odds`` when the
    summary-statistic alleles are swapped), re-ordered by position, filtered
    on allele frequency, and written to
    ``hdf5_file['cord_data']['chrom_<n>']``.

    Assumes plink BED files.  Per the original note missing genotypes are
    imputed (presumably inside ``_parse_plink_snps_`` -- TODO confirm).

    Parameters:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        hdf5_file: Open, writable HDF5 handle that already holds a
            ``sum_stats`` group.
        genetic_map_dir: Optional prefix that is concatenated directly with
            ``'chr<n>.interpolated_genetic_map.gz'``.
        check_mafs: When true and the summary statistics carry ``freqs``,
            drop SNPs whose frequency differs by more than 0.15.
        min_maf: SNPs with genotype frequency outside
            ``(min_maf, 1 - min_maf)`` are dropped.

    Raises:
        AssertionError: On duplicated SNP ids, NaN/inf effect sizes, or a
            two-valued phenotype that ``bincount`` cannot split in two bins.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    # NOTE(review): these genome-wide accumulators are updated per chromosome
    # but never reported or returned in the code visible here.
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]
    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf[chr_str]
        except Exception as err_str:
            # Best effort: a chromosome missing from the summary statistics
            # is reported and skipped.
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        assert len(set(g_sids)) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        assert len(set(ss_sids)) == len(ss_sids), 'Some duplicates?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs.  With unique ids on both sides, walking the two
        # id-sorted index lists in lockstep pairs identical SNPs.
        g_indices = [g_i for g_i in sp.argsort(g_sids) if g_filter[g_i]]
        ss_indices = [ss_i for ss_i in sp.argsort(ss_sids) if ss_filter[ss_i]]

        g_nts = chrom_d['nts']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print('Found %d SNPs present in both datasets' % (len(g_indices)))

        has_ss_freqs = 'freqs' in ssg.keys()
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in it.izip(g_indices, ss_indices):
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            # Is the nucleotide ambiguous?
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            # First check if nucleotide is sane?
            if (g_nt[0] not in valid_nts) or (g_nt[1] not in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            ss_nt = ss_nts[ss_i]

            # Are the nucleotides the same (on either strand)?
            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                # Swapped alleles (on either strand) can be rescued by
                # flipping the effect direction.
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if not flip_nts:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                betas[ss_i] = -betas[ss_i]
                log_odds[ss_i] = -log_odds[ss_i]
                if has_ss_freqs:
                    ss_freqs[ss_i] = 1 - ss_freqs[ss_i]

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print('%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts)

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        # Parse SNPs: pinpoint where the SNPs are in the file.
        snp_indices = sp.array(chrom_d['snp_indices'])[ok_indices['g']]
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        print('raw_snps.shape= %s' % (raw_snps.shape,))

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_ss_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(
                    freq_discrepancy_snp))
                print(freqs[freq_discrepancy_snp])
                print(ss_freqs[freq_discrepancy_snp])

                # Filter freq_discrepancy_snps.  `~` is used because
                # negative() is rejected for boolean arrays by newer NumPy.
                ok_freq_snps = ~freq_discrepancy_snp
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        if maf_filter_sum < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

            print('%d SNPs with MAF < %0.3f were filtered' % (
                n_snps - maf_filter_sum, min_maf))

        print('%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom))

        rb_prs = sp.dot(sp.transpose(raw_snps), log_odds)
        if has_phenotype:
            print('Normalizing SNPs')
            # Column-shaped views broadcast over individuals without
            # mutating the 1-D mean/std arrays that are stored below.
            n_rows = len(raw_snps)
            snps = (raw_snps - snp_means.reshape((n_rows, 1))) / \
                snp_stds.reshape((n_rows, 1))
            assert snps.shape == raw_snps.shape, 'Aha!'
            prs = sp.dot(sp.transpose(snps), betas)
            corr = sp.corrcoef(Y, prs)[0, 1]
            corr_list.append(corr)
            print('PRS correlation for chromosome %d was %0.4f' % (chrom, corr))
            rb_corr = sp.corrcoef(Y, rb_prs)[0, 1]
            rb_corr_list.append(rb_corr)
            print('Raw effect sizes PRS correlation for chromosome %d was %0.4f' % (
                chrom, rb_corr))

        if genetic_map_dir is not None:
            sid_set = set(sids)
            genetic_map = []
            with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    fields = line.split()
                    if fields[0] in sid_set:
                        genetic_map.append(fields[0])

        print('Now storing coordinated data to HDF5 file.')
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_ref', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds)
        ofg.create_dataset('snp_means_ref', data=snp_means)
        ofg.create_dataset('freqs_ref', data=freqs)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=rb_prs)

        if has_phenotype:
            risk_scores += prs
        rb_risk_scores += rb_prs
        num_common_snps += len(betas)
def get_num_indivs(genotype_file):
    """
    Return the number of samples listed in a PLINK genotype file.

    Parameters:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.

    Returns:
        The length of the sample list reported by the PLINK file.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        return len(plinkf.get_samples())
    finally:
        # Release the handle even when reading the sample list fails.
        plinkf.close()
def bed_plink_to_hdf5(genotype_file, out_hdf5_file, indiv_filter=None):
    """
    Convert a PLINK BED file into an HDF5 file with one group per chromosome.

    Loci are read in file order; every run of consecutive loci sharing a
    chromosome is written to a ``chr_<n>`` group holding the datasets
    ``sids``, ``positions``, ``snps`` (int8) and ``nts_list``.  Missing
    genotypes (coded 3) are replaced by the more frequent of the genotype
    codes 0 and 1 for that SNP.

    Note: It may not support all PLINK files for now.

    Parameters:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        out_hdf5_file: Path of the HDF5 file to write to (opened in append
            mode, so an existing file is extended).
        indiv_filter: Optional index/mask applied to every SNP row to select
            individuals.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    phens = [sample.phenotype for sample in samples]
    num_individs = len(phens)
    if sp.any(sp.isnan(phens)):
        print(
            'Phenotypes appear to have some NaNs, or perhaps parsing failed?')
    else:
        print('%d individuals have phenotype and genotype information.' %
              num_individs)

    print('Iterating over BED file.')
    # Explicit mode: recent h5py releases default to read-only.
    oh5f = h5py.File(out_hdf5_file, 'a')

    def _store_chromosome_(chrom, sids, positions, snps, nts_list):
        # Write the buffered loci of one chromosome as a 'chr_<n>' group.
        chr_group = oh5f.create_group('chr_%d' % chrom)
        chr_group.create_dataset('sids', data=sids)
        chr_group.create_dataset('positions', data=positions)
        chr_group.create_dataset('snps', data=sp.array(snps, dtype='int8'))
        chr_group.create_dataset('nts_list', data=nts_list)
        oh5f.flush()

    try:
        locus_list = plinkf.get_loci()

        # `None` marks "no chromosome seen yet".  The previous sentinel
        # (chromosome 1) re-created the buffers on every locus of
        # chromosome 1, so only its last SNP was kept.
        curr_chromosome = None
        sids, positions, nts_list, snps = [], [], [], []
        for locus, row in izip(locus_list, plinkf):
            chromosome = locus.chromosome
            if chromosome != curr_chromosome:
                if curr_chromosome is not None:
                    # Store current data in HDF5 file, then start afresh.
                    _store_chromosome_(curr_chromosome, sids, positions,
                                       snps, nts_list)
                    sids, positions, nts_list, snps = [], [], [], []
                print("The current chromosome is Chr", chromosome)
                curr_chromosome = chromosome

            sids.append(locus.name)
            nts_list.append([locus.allele1, locus.allele2])
            positions.append(locus.position)

            # Parse SNP, and fill in the blanks if necessary.
            snp = sp.array(row, dtype='int8')
            if indiv_filter is not None:
                snp = snp[indiv_filter]
            bin_counts = row.allele_counts()
            if bin_counts[-1] > 0:
                mode_v = sp.argmax(bin_counts[:2])
                snp[snp == 3] = mode_v
            snps.append(snp)

        # Store remaining data in HDF5 file (nothing to do for an empty file).
        if curr_chromosome is not None:
            _store_chromosome_(curr_chromosome, sids, positions, snps,
                               nts_list)
    finally:
        plinkf.close()
        oh5f.close()
    print("The parsing is completed")
def coordinate_ss(genotype_file=None, ssfformat=None, hdf5_file=None, outfile=None,
                  genetic_map_dir=None, check_mafs=False, min_maf=0.01,
                  skip_coordination=False, keep_all=False, skip_ambiguous=False):
    """
    Coordinate summary statistics with a PLINK BED file, frequencies only.

    For every chromosome in the genotype file, the SNPs shared with the
    ``sum_stats`` group of ``hdf5_file`` are matched on SNP id, optionally
    checked for nucleotide consistency (negating ``betas``/``log_odds`` when
    the summary-statistic alleles are swapped), re-ordered by position and
    filtered on allele frequency.  Genotypes themselves are not loaded; only
    allele frequencies are computed via ``_parse_plink_snps_freqs_``, which
    replaces missing calls by the most common genotype.

    Parameters:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        ssfformat: Summary-statistics format; ``"LDSCORE"`` and
            ``"STANDARD_FUNCT"`` additionally carry an ``ld_score`` dataset.
        hdf5_file: Open HDF5 handle holding a ``sum_stats`` group.
        outfile: Unused in the code visible here.
        genetic_map_dir: Unused in the code visible here.
        check_mafs: When true and the summary statistics carry ``freqs``,
            drop SNPs whose frequency differs by more than 0.15.
        min_maf: SNPs with genotype frequency outside
            ``(min_maf, 1 - min_maf)`` are dropped.
        skip_coordination: Skip every nucleotide check and keep all shared
            SNPs.
        keep_all: Unused in the code visible here.
        skip_ambiguous: Skip only the ambiguous-nucleotide (strand) filter.

    Raises:
        AssertionError: On duplicated SNP ids, NaN effect sizes, or a
            two-valued phenotype that ``bincount`` cannot split in two bins.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    Y = [s.phenotype for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
    else:
        print('Found quantitative phenotype values')

    num_common_snps = 0
    ssf = hdf5_file['sum_stats']
    # NOTE(review): ssf_dict (and num_common_snps) are filled per chromosome
    # but neither returned nor written anywhere in the code visible here.
    ssf_dict = {}
    has_ld_score = ssfformat == "LDSCORE" or ssfformat == "STANDARD_FUNCT"

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]
    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Working on chromsome: %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf[chr_str]
        except Exception as err_str:
            # Best effort: a chromosome missing from the summary statistics
            # is reported and skipped.
            print(err_str)
            print('Did not find chromsome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        assert len(set(g_sids)) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        assert len(set(ss_sids)) == len(ss_sids), 'Some duplicates?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs.  With unique ids on both sides, walking the two
        # id-sorted index lists in lockstep pairs identical SNPs.
        g_indices = [g_i for g_i in sp.argsort(g_sids) if g_filter[g_i]]
        ss_indices = [ss_i for ss_i in sp.argsort(ss_sids) if ss_filter[ss_i]]

        g_nts = chrom_d['nts']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        if has_ld_score:
            ld_score = ssg['ld_score'][...]

        # Track allele flips.  Indexed with summary-statistic indices, so it
        # must be as long as the summary statistics (it used to be sized by
        # the number of shared SNPs, which could overflow).
        ss_flips = sp.ones(len(betas))
        assert not sp.any(sp.isnan(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        print('Found %d SNPs present in both datasets' % (len(g_indices)))

        has_ss_freqs = 'freqs' in ssg.keys()
        if has_ss_freqs:
            ss_freqs = ssg['freqs'][...]

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in it.izip(g_indices, ss_indices):
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            if not skip_coordination:
                # Is the nucleotide ambiguous?
                if not skip_ambiguous and tuple(g_nt) in ambig_nts:
                    num_ambig_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                # NOTE(review): the validity check is applied regardless of
                # skip_ambiguous, since the strand lookup below needs valid
                # nucleotides -- confirm against the intended nesting.
                if (g_nt[0] not in valid_nts) or (g_nt[1] not in valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                ss_nt = ss_nts[ss_i]

                # Are the nucleotides the same (on either strand)?
                os_g_nt = sp.array(
                    [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
                if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                    # Swapped alleles (on either strand) can be rescued by
                    # flipping the effect direction.
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                        os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    if not flip_nts:
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue
                    betas[ss_i] = -betas[ss_i]
                    log_odds[ss_i] = -log_odds[ss_i]
                    ss_flips[ss_i] = -1
                    if has_ss_freqs:
                        ss_freqs[ss_i] = 1 - ss_freqs[ss_i]

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)

        print('%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts)
        print('%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts)

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])

        # Parse SNPs: pinpoint where the SNPs are in the file; only the
        # allele frequencies are needed here.
        snp_indices = sp.array(chrom_d['snp_indices'])[ok_indices['g']]
        freqs = _parse_plink_snps_freqs_(genotype_file, snp_indices)

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        sids = ssg['sids'][...][ok_indices['ss']]
        if has_ld_score:
            ld_score = ld_score[ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_ss_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print('Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(
                    freq_discrepancy_snp))
                print(freqs[freq_discrepancy_snp])
                print(ss_freqs[freq_discrepancy_snp])

                # Filter freq_discrepancy_snps.  `~` is used because
                # negative() is rejected for boolean arrays by newer NumPy.
                ok_freq_snps = ~freq_discrepancy_snp
                freqs = freqs[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                if has_ld_score:
                    ld_score = ld_score[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        if maf_filter_sum < n_snps:
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            if has_ld_score:
                ld_score = ld_score[maf_filter]

            print('%d SNPs with MAF < %0.3f were filtered' % (
                n_snps - maf_filter_sum, min_maf))

        print('%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom))

        num_common_snps += len(betas)
        # Create the per-chromosome entry; assigning into a missing key
        # raised KeyError before.
        ssf_dict[chr_str] = {'betas': betas, 'log_odds': log_odds}
def get_prs(genotype_file, rs_id_map, phen_map=None):
    """
    Compute polygenic risk scores for the individuals in a PLINK BED file.

    Two scores are accumulated per individual while streaming over the BED
    file: one using ``raw_beta`` and one using ``upd_pval_beta`` of every SNP
    found in ``rs_id_map``.  Effects are negated when the genotype alleles
    are swapped relative to the summary statistics; SNPs whose alleles cannot
    be reconciled are skipped.  Missing genotypes (coded 3) are replaced by
    the more frequent of the genotype codes 0 and 1 for that SNP.

    Parameters:
        genotype_file: Path handed to ``plinkfile.PlinkFile``.
        rs_id_map: Mapping from SNP id to a record with the keys ``nts``,
            ``raw_beta`` and ``upd_pval_beta``.  SNPs that are absent, or
            whose ``upd_pval_beta`` is 0, are ignored.
        phen_map: Optional mapping from individual id to a record with the
            key ``phen`` and optionally ``pcs``, ``sex`` and ``covariates``.
            When omitted, every sample whose ``affection`` is not 2 is used
            and its ``affection`` is taken as the phenotype.

    Returns:
        A dict with ``raw_effects_prs``, ``pval_derived_effects_prs``,
        ``true_phens`` and ``iids``, plus ``pcs``/``sex``/``covariates`` when
        those were supplied through ``phen_map``.

    Raises:
        AssertionError: If no individual is selected, phenotypes or scores
            contain NaNs, or optional covariate data is incomplete.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()

    # 1. Figure out indiv filter and get true phenotypes
    indiv_filter = sp.zeros(len(samples), dtype=bool)
    true_phens = []
    iids = []
    # Defined up front so the result assembly at the end also works when no
    # phen_map is given (they were previously undefined on that path).
    pcs = []
    sex = []
    covariates = []
    if phen_map is not None:
        phen_iids = set(phen_map.keys())
        for samp_i, sample in enumerate(samples):
            if sample.iid not in phen_iids:
                continue
            record = phen_map[sample.iid]
            indiv_filter[samp_i] = True
            true_phens.append(record['phen'])
            iids.append(sample.iid)
            if 'pcs' in record.keys():
                pcs.append(record['pcs'])
            if 'sex' in record.keys():
                sex.append(record['sex'])
            if 'covariates' in record.keys():
                covariates.append(record['covariates'])
        if len(pcs) > 0:
            assert len(pcs) == len(
                true_phens
            ), 'PC information missing for some individuals with phenotypes'
        if len(sex) > 0:
            assert len(sex) == len(
                true_phens
            ), 'Sex information missing for some individuals with phenotypes'
        if len(covariates) > 0:
            assert len(covariates) == len(
                true_phens
            ), 'Covariates missing for some individuals with phenotypes'
    else:
        for samp_i, sample in enumerate(samples):
            if sample.affection != 2:
                indiv_filter[samp_i] = True
                true_phens.append(sample.affection)
                iids.append(sample.iid)

    num_individs = sp.sum(indiv_filter)
    assert num_individs > 0, 'Issues in parsing the phenotypes and/or PCs?'
    assert not sp.any(sp.isnan(
        true_phens)), 'Phenotypes appear to have some NaNs, or parsing failed.'

    print('%d individuals have phenotype and genotype information.' % num_individs)

    num_non_matching_nts = 0
    num_flipped_nts = 0

    raw_effects_prs = sp.zeros(num_individs)
    pval_derived_effects_prs = sp.zeros(num_individs)

    print('Iterating over BED file to calculate risk scores.')
    try:
        locus_list = plinkf.get_loci()
        snp_i = 0  # counts SNPs that actually contributed to the scores
        for locus, row in it.izip(locus_list, plinkf):
            try:
                # Check rs-ID
                rs_info = rs_id_map[locus.name]
            except Exception:
                # Move on if rsID not found.
                continue

            if rs_info['upd_pval_beta'] == 0:
                continue

            # Check whether the nucleotides are OK, and potentially flip it.
            ss_nt = rs_info['nts']
            g_nt = [locus.allele1, locus.allele2]
            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt):
                raw_beta = rs_info['raw_beta']
                upd_pval_beta = rs_info['upd_pval_beta']
            else:
                # Swapped alleles (on either strand) flip the effect sign.
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if not flip_nts:
                    num_non_matching_nts += 1
                    continue
                raw_beta = -rs_info['raw_beta']
                upd_pval_beta = -rs_info['upd_pval_beta']
                num_flipped_nts += 1

            # Parse SNP, and fill in the blanks if necessary.
            snp = sp.array(row, dtype='int8')[indiv_filter]
            bin_counts = row.allele_counts()
            if bin_counts[-1] > 0:
                mode_v = sp.argmax(bin_counts[:2])
                snp[snp == 3] = mode_v

            # Update scores and move on.
            raw_effects_prs += snp * raw_beta
            assert not sp.any(
                sp.isnan(raw_effects_prs)
            ), 'Some individual raw effects risk scores are NANs (not a number).  They are corrupted.'
            pval_derived_effects_prs += snp * upd_pval_beta
            assert not sp.any(
                sp.isnan(pval_derived_effects_prs)
            ), 'Some individual weighted effects risk scores are NANs (not a number).  They are corrupted.'

            # Periodic progress report.
            if snp_i > 0 and snp_i % 100000 == 0:
                print(snp_i)
                print('Number of non-matching NTs: %d' % num_non_matching_nts)
                raw_eff_r2 = (sp.corrcoef(raw_effects_prs, true_phens)[0, 1])**2
                pval_eff_r2 = (sp.corrcoef(pval_derived_effects_prs,
                                           true_phens)[0, 1])**2
                print('Raw effects PRS r2: %0.4f' % raw_eff_r2)
                print('Weigted effects PRS r2: %0.4f' % pval_eff_r2)

            snp_i += 1
    finally:
        # Release the handle even if scoring fails midway.
        plinkf.close()

    print("DONE!")
    print('Number of non-matching NTs: %d' % num_non_matching_nts)
    print('Number of flipped NTs: %d' % num_flipped_nts)
    raw_eff_corr = sp.corrcoef(raw_effects_prs, true_phens)[0, 1]
    raw_eff_r2 = raw_eff_corr**2
    pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, true_phens)[0, 1]
    pval_eff_r2 = pval_eff_corr**2

    print('Raw effects PRS correlation: %0.4f' % raw_eff_corr)
    print('Raw effects PRS r2: %0.4f' % raw_eff_r2)
    print('Weigted effects PRS correlation: %0.4f' % pval_eff_corr)
    print('Weigted effects PRS r2: %0.4f' % pval_eff_r2)

    ret_dict = {
        'raw_effects_prs': raw_effects_prs.copy(),
        'pval_derived_effects_prs': pval_derived_effects_prs.copy(),
        'true_phens': true_phens[:],
        'iids': iids
    }

    if len(pcs) > 0:
        ret_dict['pcs'] = pcs
    if len(sex) > 0:
        ret_dict['sex'] = sex
    if len(covariates) > 0:
        ret_dict['covariates'] = covariates

    return ret_dict
def bed_to_hdf5_file(bed_file, hdf5_out):
    """
    Convert a PLINK BED file into an HDF5 file with one group per chromosome.

    Note: It may not support all PLINK files for now

    Layout written to hdf5_out:
      * 'sample_informations' group with the datasets 'iids', 'fids',
        'Affections', 'Sex' and 'Phenotypes'.
      * one 'chr_<n>' group per run of consecutive loci sharing the same
        chromosome, holding 'sids', 'positions', 'snps' (int8) and
        'nts_list'.
      * a top-level 'Chromosomes' dataset with the chromosome of every locus.

    Parameters
    ----------
    bed_file : passed as-is to plinkfile.PlinkFile.
    hdf5_out : path of the HDF5 file to write.
    """
    plinkf = plinkfile.PlinkFile(bed_file)
    hf = None
    try:
        samples = plinkf.get_samples()
        print("Extracting sample information...")

        ## For each sample extract the individual identifier, the family
        ## identifier, the affection and the sex.
        iids = [sample.iid for sample in samples]
        fids = [sample.fid for sample in samples]
        affections = [sample.affection for sample in samples]
        sex = [sample.sex for sample in samples]
        # NOTE(review): the original code had the per-sample phenotype
        # extraction (sample.phenotype) commented out, so 'Phenotypes' is
        # stored empty and the NaN check below can never trigger. Kept as-is;
        # confirm whether phenotypes should be collected here.
        phenotypes = []

        ## Number of individuals
        N = len(iids)
        if sp.any(sp.isnan(phenotypes)):
            print(
                'Phenotypes appear to have some NaNs, or perhaps parsing failed?')
        else:
            print("%d individuals have phenotype and genotype information." % N)

        # h5py >= 3.0 opens files read-only unless a mode is given; 'a'
        # (read/write, create if missing) is what older versions defaulted to.
        hf = h5py.File(hdf5_out, 'a')

        ## Store sample information in HDF5 file
        sample_inf = hf.create_group('sample_informations')
        sample_inf.create_dataset('iids', data=iids)
        sample_inf.create_dataset('fids', data=fids)
        sample_inf.create_dataset('Affections', data=affections)
        sample_inf.create_dataset('Sex', data=sex)
        sample_inf.create_dataset('Phenotypes', data=phenotypes)
        hf.flush()

        print("Iterating over BED file...")
        ## Iterate through the plink file.
        locus_list = plinkf.get_loci()
        chromosomes = []

        # None (rather than 0) marks "no chromosome seen yet": 0 is itself a
        # possible chromosome code, and using it as the marker both produced
        # an empty 'chr_0' group and reset the containers on every
        # chromosome-0 row.
        current_chromosome = None
        sids = []
        positions = []
        nts_list = []
        snps = []

        for locus, row in izip(locus_list, plinkf):
            ## Get the current chromosome and store it in the chromosome vector
            chrom = locus.chromosome
            chromosomes.append(chrom)

            if chrom != current_chromosome:
                print("The current chromosome is Chr", chrom)
                if current_chromosome is not None:
                    # Store the finished chromosome, then start fresh
                    # containers for the new one.
                    _write_bed_chromosome_group(
                        hf, current_chromosome, sids, positions, nts_list, snps)
                    sids = []
                    positions = []
                    nts_list = []
                    snps = []
                current_chromosome = chrom

            ## SNP name, the two alleles and the position
            sids.append(locus.name)
            nts_list.append([locus.allele1, locus.allele2])
            positions.append(locus.position)

            ## Parse SNP and fill in the blanks if necessary: genotypes coded
            ## 3 are replaced by the more frequent of the first two genotype
            ## classes reported by allele_counts().
            snp = sp.array(row, dtype="int8")
            bin_counts = row.allele_counts()
            if bin_counts[-1] > 0:
                mode_v = sp.argmax(bin_counts[:2])
                snp[snp == 3] = mode_v
            snps.append(snp)

        ## Store remaining data in HDF5 file (nothing to store if there were
        ## no loci at all).
        if current_chromosome is not None:
            _write_bed_chromosome_group(
                hf, current_chromosome, sids, positions, nts_list, snps)
        hf.create_dataset("Chromosomes", data=chromosomes)
        hf.flush()
    finally:
        # Release both handles even when parsing or writing fails.
        plinkf.close()
        if hf is not None:
            hf.close()
    print("The parsing is completed")


def _write_bed_chromosome_group(hf, chrom, sids, positions, nts_list, snps):
    """Write one chromosome's SNP data to the group 'chr_<chrom>' of hf and flush."""
    chr_group = hf.create_group("chr_%d" % chrom)
    chr_group.create_dataset("sids", data=sids)
    chr_group.create_dataset('positions', data=positions)
    chr_group.create_dataset('snps', data=sp.array(snps, dtype='int8'))
    chr_group.create_dataset('nts_list', data=nts_list)
    hf.flush()
def get_prs(genotype_file, rs_id_map, phen_map=None):
    """
    Calculate polygenic risk scores for the individuals in a PLINK BED file.

    Two scores are accumulated per selected individual over every SNP that
    is found in rs_id_map: one weighted by 'raw_beta', the other by
    'upd_pval_beta'.

    Parameters
    ----------
    genotype_file : passed as-is to plinkfile.PlinkFile.
    rs_id_map : mapping from SNP ID (locus.name) to a record with the
        entries 'nts', 'raw_beta' and 'upd_pval_beta'. SNPs missing from
        the map, or with 'upd_pval_beta' equal to 0, are skipped.
    phen_map : optional mapping from individual ID to a record with a
        'phen' entry and, optionally, 'pcs', 'sex' and 'covariates'.
        When None, sample.affection is used as the phenotype and samples
        whose affection equals 2 are left out.

    Returns
    -------
    dict with the keys 'raw_effects_prs', 'pval_derived_effects_prs',
    'true_phens' and 'iids', plus 'pcs', 'sex' and 'covariates' whenever
    the corresponding list is non-empty.

    Raises
    ------
    AssertionError if no individual is selected, if phenotypes or scores
    contain NaNs, or if PCs/sex/covariates are present for only some of the
    selected individuals.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        samples = plinkf.get_samples()

        # 1. Figure out indiv filter and get true phenotypes
        (indiv_filter, true_phens, iids,
         pcs, sex, covariates) = _select_prs_samples(samples, phen_map)

        num_individs = sp.sum(indiv_filter)
        assert num_individs > 0, 'Issues in parsing the phenotypes and/or PCs?'
        assert not sp.any(sp.isnan(
            true_phens)), 'Phenotypes appear to have some NaNs, or parsing failed.'
        print('%d individuals have phenotype and genotype information.' %
              num_individs)

        num_non_matching_nts = 0
        num_flipped_nts = 0
        raw_effects_prs = sp.zeros(num_individs)
        pval_derived_effects_prs = sp.zeros(num_individs)

        print('Iterating over BED file to calculate risk scores.')
        locus_list = plinkf.get_loci()
        snp_i = 0  # counts the SNPs that actually contributed to the scores
        for locus, row in zip(locus_list, plinkf):
            # Move on if the rsID is not in the map.
            try:
                rs_info = rs_id_map[locus.name]
            except KeyError:
                continue

            if rs_info['upd_pval_beta'] == 0:
                continue

            # Check whether the nucleotides are OK, and potentially flip it.
            ss_nt = rs_info['nts']
            g_nt = [locus.allele1, locus.allele2]
            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt):
                # Same alleles in the same order (possibly opposite strand).
                raw_beta = rs_info['raw_beta']
                upd_pval_beta = rs_info['upd_pval_beta']
            else:
                # Same alleles in swapped order (possibly opposite strand)?
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if not flip_nts:
                    num_non_matching_nts += 1
                    continue
                raw_beta = -rs_info['raw_beta']
                upd_pval_beta = -rs_info['upd_pval_beta']
                num_flipped_nts += 1

            # Parse SNP, and fill in the blanks if necessary.
            snp = sp.array(row, dtype='int8')[indiv_filter]
            bin_counts = row.allele_counts()
            if bin_counts[-1] > 0:
                mode_v = sp.argmax(bin_counts[:2])
                snp[snp == 3] = mode_v

            # Recode the genotype so that it counts the other allele than the
            # one the file reader counts (earlier fix kept from the original
            # code: "A1 should be encoded as 1 instead of A2").
            snp = 2 - snp

            # Update scores and move on.
            raw_effects_prs += snp * raw_beta
            assert not sp.any(
                sp.isnan(raw_effects_prs)), 'Raw effects PRS is corrupted'
            pval_derived_effects_prs += snp * upd_pval_beta
            assert not sp.any(sp.isnan(
                pval_derived_effects_prs)), 'Weighted effects PRS is corrupted'

            # Periodic progress report.
            if snp_i > 0 and snp_i % 100000 == 0:
                print(snp_i)
                print('Number of non-matching NTs: %d' % num_non_matching_nts)
                raw_eff_r2 = (sp.corrcoef(raw_effects_prs, true_phens)[0, 1])**2
                pval_eff_r2 = (sp.corrcoef(pval_derived_effects_prs,
                                           true_phens)[0, 1])**2
                print('Raw effects PRS r2: %0.4f' % raw_eff_r2)
                print('Weigted effects PRS r2: %0.4f' % pval_eff_r2)

            snp_i += 1
    finally:
        # Close the PLINK handle even if parsing or an assertion fails.
        plinkf.close()

    print("DONE!")
    print('Number of non-matching NTs: %d' % num_non_matching_nts)
    print('Number of flipped NTs: %d' % num_flipped_nts)
    raw_eff_corr = sp.corrcoef(raw_effects_prs, true_phens)[0, 1]
    raw_eff_r2 = raw_eff_corr**2
    pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, true_phens)[0, 1]
    pval_eff_r2 = pval_eff_corr**2

    print('Raw effects PRS correlation: %0.4f' % raw_eff_corr)
    print('Raw effects PRS r2: %0.4f' % raw_eff_r2)
    print('Weigted effects PRS correlation: %0.4f' % pval_eff_corr)
    print('Weigted effects PRS r2: %0.4f' % pval_eff_r2)

    ret_dict = {
        'raw_effects_prs': raw_effects_prs.copy(),
        'pval_derived_effects_prs': pval_derived_effects_prs.copy(),
        'true_phens': true_phens[:],
        'iids': iids
    }

    if pcs:
        ret_dict['pcs'] = pcs
    if sex:
        ret_dict['sex'] = sex
    if covariates:
        ret_dict['covariates'] = covariates

    return ret_dict


def _select_prs_samples(samples, phen_map):
    """Return (indiv_filter, true_phens, iids, pcs, sex, covariates) for the samples to score."""
    indiv_filter = sp.zeros(len(samples), dtype=bool)
    true_phens = []
    iids = []
    # Always defined, so that callers can test them even when phen_map is
    # None (the original code left them undefined on that path).
    pcs = []
    sex = []
    covariates = []

    if phen_map is None:
        # No external phenotypes: use the affection stored with the samples
        # and leave out samples whose affection equals 2.
        for samp_i, sample in enumerate(samples):
            if sample.affection != 2:
                indiv_filter[samp_i] = True
                true_phens.append(sample.affection)
                iids.append(sample.iid)
        return indiv_filter, true_phens, iids, pcs, sex, covariates

    for samp_i, sample in enumerate(samples):
        if sample.iid not in phen_map:
            continue
        info = phen_map[sample.iid]
        indiv_filter[samp_i] = True
        true_phens.append(info['phen'])
        iids.append(sample.iid)
        if 'pcs' in info:
            pcs.append(info['pcs'])
        if 'sex' in info:
            sex.append(info['sex'])
        if 'covariates' in info:
            covariates.append(info['covariates'])

    # Optional fields must be available for all selected individuals or none.
    if len(pcs) > 0:
        assert len(pcs) == len(
            true_phens
        ), 'PC information missing for some individuals with phenotypes'
    if len(sex) > 0:
        assert len(sex) == len(
            true_phens
        ), 'Sex information missing for some individuals with phenotypes'
    if len(covariates) > 0:
        assert len(covariates) == len(
            true_phens
        ), 'Covariates missing for some individuals with phenotypes'

    return indiv_filter, true_phens, iids, pcs, sex, covariates
def coordinate_genot_ss(genotype_file=None,
                        hdf5_file=None,
                        genetic_map_dir=None,
                        check_mafs=False,
                        min_maf=0.01,
                        skip_coordination=False,
                        debug=False):
    """
    Assumes plink BED files.  Imputes missing genotypes.

    Coordinates the genotypes in genotype_file with the summary statistics
    stored under hdf5_file['sum_stats'] and writes the result, chromosome by
    chromosome, to a new 'cord_data' group via write_coord_data(). Sample
    information ('y' when phenotypes are available, 'fids', 'iids') is also
    stored in hdf5_file.

    Parameters
    ----------
    genotype_file : passed to plinkfile.PlinkFile and to
        plinkfiles.parse_plink_snps.
    hdf5_file : open, writable HDF5 file object holding a 'sum_stats' group
        with one 'chrom_<n>' group per chromosome.
    genetic_map_dir : optional prefix of the per-chromosome
        'chr<n>.interpolated_genetic_map.gz' files.
    check_mafs : when True and the summary statistics carry 'freqs', SNPs
        whose frequency differs by more than 0.15 from the genotype data are
        dropped (summary-stat frequencies outside (0, 1) are not tested).
    min_maf : SNPs with frequency outside (min_maf, 1 - min_maf) are dropped.
    skip_coordination : when True, the nucleotide checks and effect flipping
        are skipped and every shared SNP is kept.
    debug : print extra diagnostics, including per-chromosome and
        genome-wide PRS correlations when phenotypes are available.
    """
    from plinkio import plinkfile
    plinkf = plinkfile.PlinkFile(genotype_file)
    plinkf_dict = plinkfiles.get_phenotypes(plinkf)
    num_individs = plinkf_dict['num_individs']
    has_phenotype = plinkf_dict['has_phenotype']
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0

    if has_phenotype:
        hdf5_file.create_dataset('y', data=plinkf_dict['phenotypes'])

    hdf5_file.create_dataset('fids',
                             data=sp.array(plinkf_dict['fids'],
                                           dtype=util.fids_dtype))
    hdf5_file.create_dataset('iids',
                             data=sp.array(plinkf_dict['iids'],
                                           dtype=util.iids_dtype))
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)

    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Coordinating data for chromosome %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf[chr_str]
        except KeyError as err_str:
            print(err_str)
            print('Did not find chromosome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        assert len(set(g_sids)) == len(
            g_sids), 'Some SNPs appear to be duplicated?'
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        assert len(set(ss_sids)) == len(
            ss_sids), 'Some SNPs appear to be duplicated?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs, so that the shared SNPs of both datasets line up
        # pairwise.
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)

        g_indices = [g_i for g_i in g_order if g_filter[g_i]]
        ss_indices = [ss_i for ss_i in ss_order if ss_filter[ss_i]]

        g_nts = chrom_d['nts']
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(
            betas)), 'Some SNP effect estimates are NANs (not a number)'
        assert not sp.any(sp.isinf(
            betas)), 'Some SNP effect estimates are INFs (infinite numbers)'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        if debug:
            print('Found %d SNPs present in both datasets' % (len(g_indices)))

        # Looked up once per chromosome instead of once per flipped SNP.
        has_freqs = 'freqs' in ssg
        if has_freqs:
            ss_freqs = ssg['freqs'][...]

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in zip(g_indices, ss_indices):
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            if not skip_coordination:
                # Is the nucleotide ambiguous?
                if tuple(g_nt) in util.ambig_nts:
                    num_ambig_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                if (not g_nt[0] in util.valid_nts) or (not g_nt[1]
                                                       in util.valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                ss_nt = ss_nts[ss_i]

                # Are the nucleotides the same?
                os_g_nt = sp.array([
                    util.opp_strand_dict[g_nt[0]],
                    util.opp_strand_dict[g_nt[1]]
                ])
                if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                    # Same alleles in swapped order (possibly on the
                    # opposite strand)?
                    flip_nts = (g_nt[1] == ss_nt[0]
                                and g_nt[0] == ss_nt[1]) or (
                                    os_g_nt[1] == ss_nt[0]
                                    and os_g_nt[0] == ss_nt[1])
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        # Non-positive frequencies are treated as missing
                        # and left untouched.
                        if has_freqs and ss_freqs[ss_i] > 0:
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    else:
                        if debug:
                            sys.stderr.write(
                                f'non match at: {g_sids[g_i]} - ssid:{ss_sids[ss_i]}, g_nt: {g_nt[0]} - {g_nt[1]}, ss_nt: {ss_nt[0]} - {ss_nt[1]}\n'
                            )
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        if debug:
            print('%d SNPs were excluded due to ambiguous nucleotides.' %
                  num_ambig_nts)
            print('%d SNPs were excluded due to non-matching nucleotides.' %
                  num_non_matching_nts)

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        # Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = plinkfiles.parse_plink_snps(genotype_file,
                                                      snp_indices)
        if debug:
            print('Parsed a %dX%d (SNP) genotype matrix' %
                  (raw_snps.shape[0], raw_snps.shape[1]))

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = ss_sids[ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            # Assuming freq less than 0 is missing data
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            # Do not filter SNPs that lack MAF info in the summary statistics
            freq_discrepancy_snp = sp.logical_and(freq_discrepancy_snp,
                                                  ss_freqs > 0)
            freq_discrepancy_snp = sp.logical_and(freq_discrepancy_snp,
                                                  ss_freqs < 1)
            if sp.any(freq_discrepancy_snp):
                print(
                    'Warning: %d SNPs appear to have high frequency '
                    'discrepancy between summary statistics and validation sample'
                    % sp.sum(freq_discrepancy_snp))

                # Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "Problems when filtering SNPs with low minor allele frequencies"
        if maf_filter_sum < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

            print('%d SNPs with MAF < %0.3f were filtered' %
                  (n_snps - maf_filter_sum, min_maf))

        print('%d SNPs were retained on chromosome %d.' %
              (maf_filter_sum, chrom))

        rb_prs = sp.dot(sp.transpose(raw_snps), log_odds)
        if debug and has_phenotype:
            print('Normalizing SNPs')
            # Column vectors so that means/stds broadcast over individuals.
            snps = (raw_snps - snp_means.reshape(len(raw_snps), 1)
                    ) / snp_stds.reshape(len(raw_snps), 1)
            assert snps.shape == raw_snps.shape, 'Problems when normalizing SNPs (set to have variance 1 and 0 mean)'
            prs = sp.dot(sp.transpose(snps), betas)
            corr = sp.corrcoef(plinkf_dict['phenotypes'], prs)[0, 1]
            print(
                'PRS correlation for chromosome %d was %0.4f when predicting into LD ref data'
                % (chrom, corr))
            rb_corr = sp.corrcoef(plinkf_dict['phenotypes'], rb_prs)[0, 1]
            print(
                'Raw effect sizes PRS correlation for chromosome %d was %0.4f when predicting into LD ref data'
                % (chrom, rb_corr))

        sid_set = set(sids)
        if genetic_map_dir is not None:
            genetic_map = []
            # NOTE(review): gzip.open() defaults to binary mode, so on
            # Python 3 `l[0]` is bytes while sid_set holds text and the test
            # below presumably never matches. Left unchanged because it is
            # unclear what write_coord_data expects here -- confirm.
            with gzip.open(genetic_map_dir +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])
        else:
            genetic_map = None

        coord_data_dict = {
            'chrom': 'chrom_%d' % chrom,
            'raw_snps_ref': raw_snps,
            'snp_stds_ref': snp_stds,
            'snp_means_ref': snp_means,
            'freqs_ref': freqs,
            'ps': ps,
            'positions': positions,
            'nts': nts,
            'sids': sids,
            'genetic_map': genetic_map,
            'betas': betas,
            'log_odds': log_odds,
            'log_odds_prs': rb_prs
        }

        write_coord_data(cord_data_g, coord_data_dict)
        if debug and has_phenotype:
            rb_risk_scores += rb_prs
            risk_scores += prs
        num_common_snps += len(betas)

    if debug and has_phenotype:
        # Now calculate the prediction R^2
        corr = sp.corrcoef(plinkf_dict['phenotypes'], risk_scores)[0, 1]
        rb_corr = sp.corrcoef(plinkf_dict['phenotypes'], rb_risk_scores)[0, 1]
        print(
            'PRS R2 prediction accuracy for the whole genome was %0.4f (corr=%0.4f) when predicting into LD ref data'
            % (corr**2, corr))
        print(
            'Log-odds (effects) PRS R2 prediction accuracy for the whole genome was %0.4f (corr=%0.4f) when predicting into LD ref data'
            % (rb_corr**2, rb_corr))
    print('There were %d SNPs in common' % num_common_snps)
    print('In all, %d SNPs were excluded due to nucleotide issues.' %
          tot_num_non_matching_nts)
    print('Done coordinating genotypes and summary statistics datasets.')
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
                                     reference_genotype_file=None,
                                     hdf5_file=None,
                                     genetic_map_dir=None,
                                     check_mafs=False,
                                     min_maf=0.01,
                                     skip_coordination=False,
                                     debug=False):
    """
    Coordinate validation genotypes, LD reference genotypes and summary statistics.

    For every chromosome of genotype_file, the SNPs shared by the validation
    genotypes, the reference genotypes and hdf5_file['sum_stats'] are
    ordered by position, checked for nucleotide consistency (unless
    skip_coordination is set), filtered on allele frequency and handed to
    write_coord_data(), which stores them in a new 'cord_data' group.
    Sample information ('y' when phenotypes are available, 'fids', 'iids')
    is also stored in hdf5_file.

    Parameters
    ----------
    genotype_file : validation genotypes; passed to plinkfile.PlinkFile and
        plinkfiles.parse_plink_snps.
    reference_genotype_file : LD reference genotypes; used the same way.
    hdf5_file : open, writable HDF5 file object holding a 'sum_stats' group
        and not yet holding 'iids'.
    genetic_map_dir : optional prefix of the per-chromosome
        'chr<n>.interpolated_genetic_map.gz' files.
    check_mafs : when True and the summary statistics carry 'freqs', SNPs
        whose frequency differs by more than 0.15 from the validation data
        are dropped (summary-stat frequencies outside (0, 1) are not tested).
    min_maf : SNPs with validation frequency outside (min_maf, 1 - min_maf)
        are dropped.
    skip_coordination : when True, the nucleotide checks and effect flipping
        are skipped.
    debug : print extra diagnostics.

    Raises
    ------
    AssertionError if hdf5_file already holds 'iids' or if the SNP IDs of
    the three datasets do not line up.
    """
    print('Coordinating things w genotype file: %s \nref. genot. file: %s' %
          (genotype_file, reference_genotype_file))

    from plinkio import plinkfile
    plinkf = plinkfile.PlinkFile(genotype_file)

    # Loads only the individuals...
    plinkf_dict = plinkfiles.get_phenotypes(plinkf)
    has_phenotype = plinkf_dict['has_phenotype']

    # Figure out chromosomes and positions.
    if debug:
        print('Parsing validation bim file')
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()

    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)

    if debug:
        print('Parsing LD reference bim file')
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()

    chr_dict_ref = plinkfiles.get_chrom_dict(loci_ref, chromosomes)

    # Open HDF5 file and prepare out data. The sample datasets are created
    # below, so they must not exist yet.
    assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, individual IDs are already present.'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=plinkf_dict['phenotypes'])

    hdf5_file.create_dataset('fids',
                             data=sp.array(plinkf_dict['fids'],
                                           dtype=util.fids_dtype))
    hdf5_file.create_dataset('iids',
                             data=sp.array(plinkf_dict['iids'],
                                           dtype=util.iids_dtype))
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(plinkf_dict['num_individs'])
    num_common_snps = 0

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print('Coordinating data for chromosome %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf[chr_str]
        except KeyError as err_str:
            print(err_str)
            print('Did not find chromosome in SS dataset.')
            print('Continuing.')
            continue

        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        if debug:
            print(
                'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.'
                % (len(g_sids), len(rg_sids), len(ss_sids)))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        if debug:
            print(
                'Found %d SNPs on chrom %d that were common across all datasets'
                % (len(common_sids), chrom))

        # SNP ID -> index lookups for each of the three datasets.
        ss_sid_dict = {sid: i for i, sid in enumerate(ss_sids)}
        g_sid_dict = {sid: i for i, sid in enumerate(g_sids)}
        rg_sid_dict = {sid: i for i, sid in enumerate(rg_sids)}

        g_snp_map = [g_sid_dict[sid] for sid in common_sids]

        # order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        g_snp_map = sp.array(g_snp_map)[order].tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps, in the same (position) order.
        rg_snp_map = [rg_sid_dict[sid] for sid in common_sids]
        ss_snp_map = [ss_sid_dict[sid] for sid in common_sids]

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        # Looked up once per chromosome instead of once per flipped SNP.
        has_freqs = 'freqs' in ssg
        if has_freqs:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        if debug:
            print(
                'Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d'
                % (len(g_snp_map), g_rg_nt_concord_count,
                   g_ss_nt_concord_count, rg_ss_nt_concord_count))
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in zip(g_snp_map, rg_snp_map, ss_snp_map):

            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[
                ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            if not skip_coordination:

                rg_nt = rg_nts[rg_i]
                ss_nt = ss_nts[ss_i]

                # Is the nucleotide ambiguous.
                g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                if tuple(g_nt) in util.ambig_nts:
                    num_ambig_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                # First check if nucleotide is sane?
                if (not g_nt[0] in util.valid_nts) or (not g_nt[1]
                                                       in util.valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                os_g_nt = sp.array([
                    util.opp_strand_dict[g_nt[0]],
                    util.opp_strand_dict[g_nt[1]]
                ])

                matches_ss = sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)
                matches_rg = sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt)
                if not (matches_ss and matches_rg):
                    if not matches_rg:
                        # Validation and reference genotypes disagree.
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

                    # Try flipping the SS nt
                    flip_nts = (g_nt[1] == ss_nt[0]
                                and g_nt[0] == ss_nt[1]) or (
                                    os_g_nt[1] == ss_nt[0]
                                    and os_g_nt[0] == ss_nt[1])
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        # Same convention as coordinate_genot_ss:
                        # non-positive frequencies are treated as missing
                        # and left untouched.
                        if has_freqs and ss_freqs[ss_i] > 0:
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    else:
                        if debug:
                            print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt)))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        if debug:
            print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
            print('%d SNPs were excluded due to nucleotide issues.' %
                  num_non_matching_nts)
            print('%d SNPs were retained on chromosome %d.' %
                  (len(ok_indices['g']), chrom))

        # The index maps are already ordered by position.
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = plinkfiles.parse_plink_snps(genotype_file,
                                                      snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices_ref = snp_indices_ref[ok_indices['rg']]
        raw_ref_snps, freqs_ref = plinkfiles.parse_plink_snps(
            reference_genotype_file, snp_indices_ref)

        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)
        sids = ss_sids[ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and has_freqs:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            # Same convention as coordinate_genot_ss: do not filter SNPs
            # that lack frequency info in the summary statistics.
            freq_discrepancy_snp = sp.logical_and(freq_discrepancy_snp,
                                                  ss_freqs > 0)
            freq_discrepancy_snp = sp.logical_and(freq_discrepancy_snp,
                                                  ss_freqs < 1)
            if sp.any(freq_discrepancy_snp):
                print(
                    'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample'
                    % sp.sum(freq_discrepancy_snp))

                # Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "Problems when filtering SNPs with low minor allele frequencies"
        if maf_filter_sum < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if debug and has_phenotype:
            maf_adj_corr = sp.corrcoef(plinkf_dict['phenotypes'],
                                       maf_adj_prs)[0, 1]
            print(
                'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f'
                % (chrom, maf_adj_corr))

        if genetic_map_dir is not None:
            # NOTE(review): the map file is read through, but nothing is
            # collected from it (the selection code was commented out in the
            # original), so 'genetic_map' is always stored as an empty list.
            genetic_map = []
            with gzip.open(genetic_map_dir +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for _line in f:
                    pass
        else:
            genetic_map = None

        coord_data_dict = {
            'chrom': 'chrom_%d' % chrom,
            'raw_snps_ref': raw_ref_snps,
            'snp_stds_ref': snp_stds_ref,
            'snp_means_ref': snp_means_ref,
            'freqs_ref': freqs_ref,
            'ps': ps,
            'positions': positions,
            'nts': nts,
            'sids': sids,
            'genetic_map': genetic_map,
            'betas': betas,
            'log_odds': log_odds,
            'log_odds_prs': maf_adj_prs,
            'raw_snps_val': raw_snps,
            'snp_stds_val': snp_stds,
            'snp_means_val': snp_means,
            'freqs_val': freqs
        }

        write_coord_data(cord_data_g, coord_data_dict)
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)

    # Now calculate the prediction r^2
    if debug and has_phenotype:
        maf_adj_corr = sp.corrcoef(plinkf_dict['phenotypes'],
                                   maf_adj_risk_scores)[0, 1]
        print(
            'Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)'
            % (maf_adj_corr, maf_adj_corr**2))
    print(
        'Overall nucleotide concordance counts: g_rg: %d, g_ss: %d, rg_ss: %d'
        % (tot_g_rg_nt_concord_count, tot_g_ss_nt_concord_count,
           tot_rg_ss_nt_concord_count))
    print('There were %d SNPs in common' % num_common_snps)
    print('In all, %d SNPs were excluded due to nucleotide issues.' %
          tot_num_non_matching_nts)
    print('Done!')