def coordinate_datasets(reference_genotype_file, hdf5_file, summary_dict, validation_genotype_file=None, genetic_map_dir=None, min_maf=0.01, skip_coordination=False, max_freq_discrep = 0.15, debug=False): summary_dict[3.9]={'name':'dash', 'value':'Coordination'} t0 = time.time() if validation_genotype_file is not None: print('Coordinating datasets (Summary statistics, LD reference genotypes, and Validation genotypes).') else: print('Coordinating datasets (Summary statistics and LD reference genotypes).') plinkf = plinkfile.PlinkFile(reference_genotype_file) # Figure out chromosomes and positions. if debug: print('Parsing plinkf_dict_val reference genotypes') loci = plinkf.get_loci() plinkf.close() summary_dict[4]={'name':'Num individuals in LD Reference data:','value':plinkfiles.get_num_indivs(reference_genotype_file)} summary_dict[4.1]={'name':'SNPs in LD Reference data:','value':len(loci)} gf_chromosomes = [l.chromosome for l in loci] chromosomes = sp.unique(gf_chromosomes) chromosomes.sort() chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes) if validation_genotype_file is not None: if debug: print('Parsing LD validation bim file') plinkf_val = plinkfile.PlinkFile(validation_genotype_file) # Loads only the individuals... plinkf_dict_val = plinkfiles.get_phenotypes(plinkf_val) loci_val = plinkf_val.get_loci() plinkf_val.close() summary_dict[5]={'name':'SNPs in Validation data:','value':len(loci_val)} chr_dict_val = plinkfiles.get_chrom_dict(loci_val, chromosomes) # Open HDF5 file and prepare out data assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.' if plinkf_dict_val['has_phenotype']: hdf5_file.create_dataset('y', data=plinkf_dict_val['phenotypes']) summary_dict[6]={'name':'Num validation phenotypes:','value':plinkf_dict_val['num_individs']} hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict_val['fids'], dtype=util.fids_dtype)) hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict_val['iids'], dtype=util.iids_dtype)) maf_adj_risk_scores = sp.zeros(plinkf_dict_val['num_individs']) # Now summary statistics ssf = hdf5_file['sum_stats'] cord_data_g = hdf5_file.create_group('cord_data') num_common_snps = 0 # corr_list = [] chromosomes_found = set() num_snps_common_before_filtering =0 num_snps_common_after_filtering =0 tot_num_non_matching_nts = 0 tot_num_non_supported_nts = 0 tot_num_ambig_nts = 0 tot_num_freq_discrep_filtered_snps = 0 tot_num_maf_filtered_snps = 0 tot_g_ss_nt_concord_count = 0 if validation_genotype_file is not None: tot_g_vg_nt_concord_count = 0 tot_vg_ss_nt_concord_count = 0 # Now iterate over chromosomes chrom_i = 0 for chrom in chromosomes: chrom_i +=1 if not debug: sys.stdout.write('\r%0.2f%%' % (100.0 * (float(chrom_i) / (len(chromosomes)+1)))) sys.stdout.flush() try: chr_str = 'chrom_%d' % chrom ssg = ssf[chr_str] except Exception as err_str: print(err_str) print('Did not find chromosome %d in SS dataset.'%chrom) print('Continuing.') continue if debug: print('Coordinating data for chromosome %s' % chr_str) chromosomes_found.add(chrom) #Get summary statistics chromosome group ssg = ssf['chrom_%d' % chrom] ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype) if validation_genotype_file is not None: chrom_d_val = chr_dict_val[chr_str] vg_sids = chrom_d_val['sids'] common_sids = sp.intersect1d(ss_sids, vg_sids) # A map from sid to index for validation data vg_sid_dict = {} for i, sid in enumerate(vg_sids): vg_sid_dict[sid] = i else: common_sids = ss_sids # A map from sid to index for summary stats ss_sid_dict = {} for i, sid in enumerate(ss_sids): ss_sid_dict[sid] = i #The indices to retain for the LD reference genotypes chrom_d = chr_dict[chr_str] g_sids = chrom_d['sids'] common_sids = sp.intersect1d(common_sids, g_sids) # A map from sid to index for LD reference data g_sid_dict = {} for i, sid in enumerate(g_sids): g_sid_dict[sid] = i if debug: print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom)) print('Ordering SNPs by genomic positions (based on LD reference genotypes).') g_snp_map = [] for sid in common_sids: g_snp_map.append(g_sid_dict[sid]) # order by positions (based on LD reference file) g_positions = sp.array(chrom_d['positions'])[g_snp_map] order = sp.argsort(g_positions) g_snp_map = sp.array(g_snp_map)[order] g_snp_map = g_snp_map.tolist() common_sids = sp.array(common_sids)[order] # Get the ordered sum stats SNPs indices. ss_snp_map = [] for sid in common_sids: ss_snp_map.append(ss_sid_dict[sid]) # Get the ordered validation SNPs indices if validation_genotype_file is not None: vg_snp_map = [] for sid in common_sids: vg_snp_map.append(vg_sid_dict[sid]) vg_nts = sp.array(chrom_d_val['nts']) vg_nts_ok = sp.array(vg_nts)[vg_snp_map] g_nts = sp.array(chrom_d['nts']) ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype) betas = ssg['betas'][...] log_odds = ssg['log_odds'][...] if 'freqs' in ssg: ss_freqs = ssg['freqs'][...] g_ss_nt_concord_count = sp.sum( g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0 if validation_genotype_file is not None: vg_ss_nt_concord_count = sp.sum(vg_nts_ok == ss_nts[ss_snp_map]) / 2.0 g_vg_nt_concord_count = sp.sum(g_nts[g_snp_map] == vg_nts_ok) / 2.0 if debug: print('Nucleotide concordance counts out of %d genotypes, vg-rg: %d ; vg-ss: %d' % (len(g_snp_map), g_vg_nt_concord_count, vg_ss_nt_concord_count)) tot_vg_ss_nt_concord_count += vg_ss_nt_concord_count tot_g_vg_nt_concord_count += g_vg_nt_concord_count tot_g_ss_nt_concord_count += g_ss_nt_concord_count if debug: print('Nucleotide concordance counts out of %d genotypes, rg-ss: %d' % (len(g_snp_map), g_ss_nt_concord_count)) num_freq_discrep_filtered_snps = 0 num_non_matching_nts = 0 num_non_supported_nts = 0 num_ambig_nts = 0 # Identifying which SNPs have nucleotides that are ok.. ok_nts = [] ok_indices = {'g': [], 'ss': []} if validation_genotype_file is not None: ok_indices['vg']=[] #Now loop over SNPs to coordinate nucleotides. if validation_genotype_file is not None: for g_i, vg_i, ss_i in zip(g_snp_map, vg_snp_map, ss_snp_map): # To make sure, is the SNP id the same? assert g_sids[g_i] == vg_sids[vg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] if not skip_coordination: vg_nt = vg_nts[vg_i] ss_nt = ss_nts[ss_i] # Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in util.ambig_nts: num_ambig_nts += 1 continue # First check if nucleotide is sane? if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts): num_non_supported_nts += 1 continue os_g_nt = sp.array( [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]]) flip_nts = False #Coordination is a bit more complicate when validation genotypes are provided.. if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt))): if sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or ( os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) # Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg: ss_freqs[ss_i] = 1 - ss_freqs[ss_i] else: if debug: print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))) num_non_matching_nts += 1 continue else: num_non_matching_nts += 1 continue # Opposite strand nucleotides # everything seems ok. ok_indices['g'].append(g_i) ok_indices['vg'].append(vg_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) else: for g_i, ss_i in zip(g_snp_map, ss_snp_map): # To make sure, is the SNP id the same? assert g_sids[g_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] if not skip_coordination: ss_nt = ss_nts[ss_i] # Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in util.ambig_nts: num_ambig_nts += 1 continue # First check if nucleotide is sane? if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts): num_non_matching_nts += 1 continue os_g_nt = sp.array( [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]]) flip_nts = False #Coordination is a bit more complicate when validation genotypes are provided.. if not sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or ( os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) # Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg and ss_freqs[ss_i]>0: ss_freqs[ss_i] = 1.0 - ss_freqs[ss_i] else: if debug: print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))) num_non_matching_nts += 1 continue # everything seems ok. ok_indices['g'].append(g_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) if debug: print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts) print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts) # Resorting by position positions = sp.array(chrom_d['positions'])[ok_indices['g']] # Now parse SNPs .. snp_indices = sp.array(chrom_d['snp_indices']) # Pinpoint where the SNPs are in the file. snp_indices = snp_indices[ok_indices['g']] raw_snps, freqs = plinkfiles.parse_plink_snps( reference_genotype_file, snp_indices) snp_stds = sp.sqrt(2 * freqs * (1 - freqs)) snp_means = freqs * 2 betas = betas[ok_indices['ss']] log_odds = log_odds[ok_indices['ss']] ns = ssg['ns'][...][ok_indices['ss']] ps = ssg['ps'][...][ok_indices['ss']] nts = sp.array(ok_nts) sids = (ssg['sids'][...]).astype(util.sids_u_dtype) sids = sids[ok_indices['ss']] #Parse validation genotypes, if available if validation_genotype_file is not None: snp_indices_val = sp.array(chrom_d_val['snp_indices']) # Pinpoint where the SNPs are in the file. snp_indices_val = snp_indices_val[ok_indices['vg']] raw_snps_val, freqs_val = plinkfiles.parse_plink_snps( validation_genotype_file, snp_indices_val) snp_stds_val = sp.sqrt(2 * freqs_val * (1 - freqs_val)) snp_means_val = freqs_val * 2 # Check SNP frequencies, screen for possible problems.. if max_freq_discrep<1 and 'freqs' in ssg: ss_freqs = ss_freqs[ok_indices['ss']] ok_freq_snps = sp.logical_or(sp.absolute(ss_freqs - freqs) < max_freq_discrep,sp.absolute(ss_freqs + freqs-1) < max_freq_discrep) #Array of np.bool values ok_freq_snps = sp.logical_or(ok_freq_snps,ss_freqs<=0) #Only consider SNPs that actually have frequencies num_freq_discrep_filtered_snps = len(ok_freq_snps)- sp.sum(ok_freq_snps) assert num_freq_discrep_filtered_snps>=0, "Problems when filtering SNPs with frequency discrepencies" if num_freq_discrep_filtered_snps>0: # Filter freq_discrepancy_snps raw_snps = raw_snps[ok_freq_snps] snp_stds = snp_stds[ok_freq_snps] snp_means = snp_means[ok_freq_snps] freqs = freqs[ok_freq_snps] ps = ps[ok_freq_snps] ns = ns[ok_freq_snps] positions = positions[ok_freq_snps] nts = nts[ok_freq_snps] sids = sids[ok_freq_snps] betas = betas[ok_freq_snps] log_odds = log_odds[ok_freq_snps] if validation_genotype_file is not None: raw_snps_val = raw_snps_val[ok_freq_snps] snp_stds_val = snp_stds_val[ok_freq_snps] snp_means_val = snp_means_val[ok_freq_snps] freqs_val = freqs_val[ok_freq_snps] if debug: print('Filtered %d SNPs due to frequency discrepancies'%num_freq_discrep_filtered_snps) # Filter minor allele frequency SNPs. maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf)) num_maf_filtered_snps = len(maf_filter)-sp.sum(maf_filter) assert num_maf_filtered_snps>=0, "Problems when filtering SNPs with low minor allele frequencies" if num_maf_filtered_snps>0: raw_snps = raw_snps[maf_filter] snp_stds = snp_stds[maf_filter] snp_means = snp_means[maf_filter] freqs = freqs[maf_filter] ps = ps[maf_filter] ns = ns[maf_filter] positions = positions[maf_filter] nts = nts[maf_filter] sids = sids[maf_filter] betas = betas[maf_filter] log_odds = log_odds[maf_filter] if validation_genotype_file is not None: raw_snps_val = raw_snps_val[maf_filter] snp_stds_val = snp_stds_val[maf_filter] snp_means_val = snp_means_val[maf_filter] freqs_val = freqs_val[maf_filter] if debug: print('Filtered %d SNPs due to low MAF'%num_maf_filtered_snps) genetic_map = [] if genetic_map_dir is not None: with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f: for line in f: l = line.split() # if l[0] in sid_set: # genetic_map.append(l[0]) else: genetic_map = None coord_data_dict = {'chrom': 'chrom_%d' % chrom, 'raw_snps_ref': raw_snps, 'snp_stds_ref': snp_stds, 'snp_means_ref': snp_means, 'freqs_ref': freqs, 'ps': ps, 'ns': ns, 'positions': positions, 'nts': nts, 'sids': sids, 'genetic_map': genetic_map, 'betas': betas, 'log_odds': log_odds} if validation_genotype_file is not None: maf_adj_prs = sp.dot(log_odds, raw_snps_val) if debug and plinkf_dict_val['has_phenotype']: maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_prs)[0, 1] print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr)) coord_data_dict['raw_snps_val']=raw_snps_val coord_data_dict['snp_stds_val']=snp_stds_val coord_data_dict['snp_means_val']=snp_means_val coord_data_dict['freqs_val']=freqs_val coord_data_dict['log_odds_prs']=maf_adj_prs maf_adj_risk_scores += maf_adj_prs write_coord_data(cord_data_g, coord_data_dict, debug=debug) if debug: print('%d SNPs were retained on chromosome %d.' % (len(sids), chrom)) num_snps_common_before_filtering += len(common_sids) num_snps_common_after_filtering += len(sids) tot_num_ambig_nts += num_ambig_nts tot_num_non_supported_nts += num_non_supported_nts tot_num_non_matching_nts += num_non_matching_nts tot_num_freq_discrep_filtered_snps += num_freq_discrep_filtered_snps tot_num_maf_filtered_snps += num_maf_filtered_snps if not debug: sys.stdout.write('\r%0.2f%%\n' % (100.0)) sys.stdout.flush() # Now calculate the prediction r^2 if validation_genotype_file: if debug and plinkf_dict_val['has_phenotype']: maf_adj_corr = sp.corrcoef( plinkf_dict_val['phenotypes'], maf_adj_risk_scores)[0, 1] print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' % (maf_adj_corr, maf_adj_corr ** 2)) print('Overall nucleotide concordance counts: rg_vg: %d, rg_ss: %d, vg_ss: %d' % (tot_g_vg_nt_concord_count, tot_g_ss_nt_concord_count, tot_vg_ss_nt_concord_count)) else: if debug: print('Overall nucleotide concordance counts, rg_ss: %d' % (tot_g_ss_nt_concord_count)) summary_dict[7]={'name':'Num chromosomes used:','value':len(chromosomes_found)} summary_dict[8]={'name':'SNPs common across datasets:','value':num_snps_common_before_filtering} summary_dict[9]={'name':'SNPs retained after filtering:','value':num_snps_common_after_filtering} if tot_num_ambig_nts>0: summary_dict[10]={'name':'SNPs w ambiguous nucleotides filtered:','value':tot_num_ambig_nts} if tot_num_non_supported_nts>0: summary_dict[10.1]={'name':'SNPs w unknown/unsupported nucleotides filtered:','value':tot_num_non_supported_nts} if tot_num_non_matching_nts>0: summary_dict[11]={'name':'SNPs w other nucleotide discrepancies filtered:','value':tot_num_non_matching_nts} if min_maf>0: summary_dict[12]={'name':'SNPs w MAF<%0.3f filtered:'%min_maf,'value':tot_num_maf_filtered_snps} if max_freq_discrep<0.5: summary_dict[13]={'name':'SNPs w allele freq discrepancy > %0.3f filtered:'%max_freq_discrep,'value':tot_num_freq_discrep_filtered_snps} t1 = time.time() t = (t1 - t0) summary_dict[13.9]={'name':'dash', 'value':'Running times'} summary_dict[15]={'name':'Run time for coordinating datasets:','value': '%d min and %0.2f sec'%(t / 60, t % 60)}
def coordinate_datasets(reference_genotype_file, hdf5_file, summary_dict, validation_genotype_file=None, genetic_map_dir=None, min_maf=0.01, skip_coordination=False, max_freq_discrep = 0.15, debug=False): summary_dict[3.9]={'name':'dash', 'value':'Coordination'} t0 = time.time() if validation_genotype_file is not None: print('Coordinating datasets (Summary statistics, LD reference genotypes, and Validation genotypes).') else: print('Coordinating datasets (Summary statistics and LD reference genotypes).') plinkf = plinkfile.PlinkFile(reference_genotype_file) # Figure out chromosomes and positions. if debug: print('Parsing plinkf_dict_val reference genotypes') loci = plinkf.get_loci() plinkf.close() summary_dict[4]={'name':'Num individuals in LD Reference data:','value':plinkfiles.get_num_indivs(reference_genotype_file)} summary_dict[4.1]={'name':'SNPs in LD Reference data:','value':len(loci)} gf_chromosomes = [l.chromosome for l in loci] chromosomes = sp.unique(gf_chromosomes) chromosomes.sort() chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes) if validation_genotype_file is not None: if debug: print('Parsing LD validation bim file') plinkf_val = plinkfile.PlinkFile(validation_genotype_file) # Loads only the individuals... plinkf_dict_val = plinkfiles.get_phenotypes(plinkf_val) loci_val = plinkf_val.get_loci() plinkf_val.close() summary_dict[5]={'name':'SNPs in Validation data:','value':len(loci_val)} chr_dict_val = plinkfiles.get_chrom_dict(loci_val, chromosomes) # Open HDF5 file and prepare out data assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.' if plinkf_dict_val['has_phenotype']: hdf5_file.create_dataset('y', data=plinkf_dict_val['phenotypes']) summary_dict[6]={'name':'Num validation phenotypes:','value':plinkf_dict_val['num_individs']} hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict_val['fids'], dtype=util.fids_dtype)) hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict_val['iids'], dtype=util.iids_dtype)) maf_adj_risk_scores = sp.zeros(plinkf_dict_val['num_individs']) # Now summary statistics ssf = hdf5_file['sum_stats'] cord_data_g = hdf5_file.create_group('cord_data') num_common_snps = 0 # corr_list = [] chromosomes_found = set() num_snps_common_before_filtering =0 num_snps_common_after_filtering =0 tot_num_non_matching_nts = 0 tot_num_non_supported_nts = 0 tot_num_ambig_nts = 0 tot_num_freq_discrep_filtered_snps = 0 tot_num_maf_filtered_snps = 0 tot_g_ss_nt_concord_count = 0 if validation_genotype_file is not None: tot_g_vg_nt_concord_count = 0 tot_vg_ss_nt_concord_count = 0 # Now iterate over chromosomes chrom_i = 0 for chrom in chromosomes: chrom_i +=1 if not debug: sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (float(chrom_i) / (len(chromosomes)+1)))) sys.stdout.flush() try: chr_str = 'chrom_%d' % chrom ssg = ssf[chr_str] except Exception as err_str: print(err_str) print('Did not find chromosome %d in SS dataset.'%chrom) print('Continuing.') continue if debug: print('Coordinating data for chromosome %s' % chr_str) chromosomes_found.add(chrom) #Get summary statistics chromosome group ssg = ssf['chrom_%d' % chrom] ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype) if validation_genotype_file is not None: chrom_d_val = chr_dict_val[chr_str] vg_sids = chrom_d_val['sids'] common_sids = sp.intersect1d(ss_sids, vg_sids) # A map from sid to index for validation data vg_sid_dict = {} for i, sid in enumerate(vg_sids): vg_sid_dict[sid] = i else: common_sids = ss_sids # A map from sid to index for summary stats ss_sid_dict = {} for i, sid in enumerate(ss_sids): ss_sid_dict[sid] = i #The indices to retain for the LD reference genotypes chrom_d = chr_dict[chr_str] g_sids = chrom_d['sids'] common_sids = sp.intersect1d(common_sids, g_sids) # A map from sid to index for LD reference data g_sid_dict = {} for i, sid in enumerate(g_sids): g_sid_dict[sid] = i if debug: print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom)) print('Ordering SNPs by genomic positions (based on LD reference genotypes).') g_snp_map = [] for sid in common_sids: g_snp_map.append(g_sid_dict[sid]) # order by positions (based on LD reference file) g_positions = sp.array(chrom_d['positions'])[g_snp_map] order = sp.argsort(g_positions) g_snp_map = sp.array(g_snp_map)[order] g_snp_map = g_snp_map.tolist() common_sids = sp.array(common_sids)[order] # Get the ordered sum stats SNPs indices. ss_snp_map = [] for sid in common_sids: ss_snp_map.append(ss_sid_dict[sid]) # Get the ordered validation SNPs indices if validation_genotype_file is not None: vg_snp_map = [] for sid in common_sids: vg_snp_map.append(vg_sid_dict[sid]) vg_nts = sp.array(chrom_d_val['nts']) vg_nts_ok = sp.array(vg_nts)[vg_snp_map] g_nts = sp.array(chrom_d['nts']) ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype) betas = ssg['betas'][...] log_odds = ssg['log_odds'][...] if 'freqs' in ssg: ss_freqs = ssg['freqs'][...] g_ss_nt_concord_count = sp.sum( g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0 if validation_genotype_file is not None: vg_ss_nt_concord_count = sp.sum(vg_nts_ok == ss_nts[ss_snp_map]) / 2.0 g_vg_nt_concord_count = sp.sum(g_nts[g_snp_map] == vg_nts_ok) / 2.0 if debug: print('Nucleotide concordance counts out of %d genotypes, vg-rg: %d ; vg-ss: %d' % (len(g_snp_map), g_vg_nt_concord_count, vg_ss_nt_concord_count)) tot_vg_ss_nt_concord_count += vg_ss_nt_concord_count tot_g_vg_nt_concord_count += g_vg_nt_concord_count tot_g_ss_nt_concord_count += g_ss_nt_concord_count if debug: print('Nucleotide concordance counts out of %d genotypes, rg-ss: %d' % (len(g_snp_map), g_ss_nt_concord_count)) num_freq_discrep_filtered_snps = 0 num_non_matching_nts = 0 num_non_supported_nts = 0 num_ambig_nts = 0 # Identifying which SNPs have nucleotides that are ok.. ok_nts = [] ok_indices = {'g': [], 'ss': []} if validation_genotype_file is not None: ok_indices['vg']=[] #Now loop over SNPs to coordinate nucleotides. if validation_genotype_file is not None: for g_i, vg_i, ss_i in zip(g_snp_map, vg_snp_map, ss_snp_map): # To make sure, is the SNP id the same? assert g_sids[g_i] == vg_sids[vg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] if not skip_coordination: vg_nt = vg_nts[vg_i] ss_nt = ss_nts[ss_i] # Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in util.ambig_nts: num_ambig_nts += 1 continue # First check if nucleotide is sane? if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts): num_non_supported_nts += 1 continue os_g_nt = sp.array( [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]]) flip_nts = False #Coordination is a bit more complicate when validation genotypes are provided.. if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt))): if sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or ( os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) # Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg: ss_freqs[ss_i] = 1 - ss_freqs[ss_i] else: if debug: print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))) num_non_matching_nts += 1 continue else: num_non_matching_nts += 1 continue # Opposite strand nucleotides # everything seems ok. ok_indices['g'].append(g_i) ok_indices['vg'].append(vg_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) else: for g_i, ss_i in zip(g_snp_map, ss_snp_map): # To make sure, is the SNP id the same? assert g_sids[g_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.' g_nt = g_nts[g_i] if not skip_coordination: ss_nt = ss_nts[ss_i] # Is the nucleotide ambiguous. g_nt = [g_nts[g_i][0], g_nts[g_i][1]] if tuple(g_nt) in util.ambig_nts: num_ambig_nts += 1 continue # First check if nucleotide is sane? if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts): num_non_matching_nts += 1 continue os_g_nt = sp.array( [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]]) flip_nts = False #Coordination is a bit more complicate when validation genotypes are provided.. if not sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt): flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or ( os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]) # Try flipping the SS nt if flip_nts: betas[ss_i] = -betas[ss_i] log_odds[ss_i] = -log_odds[ss_i] if 'freqs' in ssg and ss_freqs[ss_i]>0: ss_freqs[ss_i] = 1.0 - ss_freqs[ss_i] else: if debug: print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \ (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))) num_non_matching_nts += 1 continue # everything seems ok. ok_indices['g'].append(g_i) ok_indices['ss'].append(ss_i) ok_nts.append(g_nt) if debug: print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts) print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts) # Resorting by position positions = sp.array(chrom_d['positions'])[ok_indices['g']] # Now parse SNPs .. snp_indices = sp.array(chrom_d['snp_indices']) # Pinpoint where the SNPs are in the file. snp_indices = snp_indices[ok_indices['g']] raw_snps, freqs = plinkfiles.parse_plink_snps( reference_genotype_file, snp_indices) snp_stds = sp.sqrt(2 * freqs * (1 - freqs)) snp_means = freqs * 2 betas = betas[ok_indices['ss']] log_odds = log_odds[ok_indices['ss']] ps = ssg['ps'][...][ok_indices['ss']] nts = sp.array(ok_nts) sids = (ssg['sids'][...]).astype(util.sids_u_dtype) sids = sids[ok_indices['ss']] #Parse validation genotypes, if available if validation_genotype_file is not None: snp_indices_val = sp.array(chrom_d_val['snp_indices']) # Pinpoint where the SNPs are in the file. snp_indices_val = snp_indices_val[ok_indices['vg']] raw_snps_val, freqs_val = plinkfiles.parse_plink_snps( validation_genotype_file, snp_indices_val) snp_stds_val = sp.sqrt(2 * freqs_val * (1 - freqs_val)) snp_means_val = freqs_val * 2 # Check SNP frequencies, screen for possible problems.. if max_freq_discrep<1 and 'freqs' in ssg: ss_freqs = ss_freqs[ok_indices['ss']] ok_freq_snps = sp.logical_or(sp.absolute(ss_freqs - freqs) < max_freq_discrep,sp.absolute(ss_freqs + freqs-1) < max_freq_discrep) #Array of np.bool values ok_freq_snps = sp.logical_or(ok_freq_snps,ss_freqs<=0) #Only consider SNPs that actually have frequencies num_freq_discrep_filtered_snps = len(ok_freq_snps)- sp.sum(ok_freq_snps) assert num_freq_discrep_filtered_snps>=0, "Problems when filtering SNPs with frequency discrepencies" if num_freq_discrep_filtered_snps>0: # Filter freq_discrepancy_snps raw_snps = raw_snps[ok_freq_snps] snp_stds = snp_stds[ok_freq_snps] snp_means = snp_means[ok_freq_snps] freqs = freqs[ok_freq_snps] ps = ps[ok_freq_snps] positions = positions[ok_freq_snps] nts = nts[ok_freq_snps] sids = sids[ok_freq_snps] betas = betas[ok_freq_snps] log_odds = log_odds[ok_freq_snps] if validation_genotype_file is not None: raw_snps_val = raw_snps_val[ok_freq_snps] snp_stds_val = snp_stds_val[ok_freq_snps] snp_means_val = snp_means_val[ok_freq_snps] freqs_val = freqs_val[ok_freq_snps] if debug: print('Filtered %d SNPs due to frequency discrepancies'%num_freq_discrep_filtered_snps) # Filter minor allele frequency SNPs. maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf)) num_maf_filtered_snps = len(maf_filter)-sp.sum(maf_filter) assert num_maf_filtered_snps>=0, "Problems when filtering SNPs with low minor allele frequencies" if num_maf_filtered_snps>0: raw_snps = raw_snps[maf_filter] snp_stds = snp_stds[maf_filter] snp_means = snp_means[maf_filter] freqs = freqs[maf_filter] ps = ps[maf_filter] positions = positions[maf_filter] nts = nts[maf_filter] sids = sids[maf_filter] betas = betas[maf_filter] log_odds = log_odds[maf_filter] if validation_genotype_file is not None: raw_snps_val = raw_snps_val[maf_filter] snp_stds_val = snp_stds_val[maf_filter] snp_means_val = snp_means_val[maf_filter] freqs_val = freqs_val[maf_filter] if debug: print('Filtered %d SNPs due to low MAF'%num_maf_filtered_snps) genetic_map = [] if genetic_map_dir is not None: with gzip.open(genetic_map_dir + 'chr%d.interpolated_genetic_map.gz' % chrom) as f: for line in f: l = line.split() # if l[0] in sid_set: # genetic_map.append(l[0]) else: genetic_map = None coord_data_dict = {'chrom': 'chrom_%d' % chrom, 'raw_snps_ref': raw_snps, 'snp_stds_ref': snp_stds, 'snp_means_ref': snp_means, 'freqs_ref': freqs, 'ps': ps, 'positions': positions, 'nts': nts, 'sids': sids, 'genetic_map': genetic_map, 'betas': betas, 'log_odds': log_odds} if validation_genotype_file is not None: maf_adj_prs = sp.dot(log_odds, raw_snps_val) if debug and plinkf_dict_val['has_phenotype']: maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_prs)[0, 1] print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr)) coord_data_dict['raw_snps_val']=raw_snps_val coord_data_dict['snp_stds_val']=snp_stds_val coord_data_dict['snp_means_val']=snp_means_val coord_data_dict['freqs_val']=freqs_val coord_data_dict['log_odds_prs']=maf_adj_prs maf_adj_risk_scores += maf_adj_prs write_coord_data(cord_data_g, coord_data_dict, debug=debug) if debug: print('%d SNPs were retained on chromosome %d.' % (len(sids), chrom)) num_snps_common_before_filtering += len(common_sids) num_snps_common_after_filtering += len(sids) tot_num_ambig_nts += num_ambig_nts tot_num_non_supported_nts += num_non_supported_nts tot_num_non_matching_nts += num_non_matching_nts tot_num_freq_discrep_filtered_snps += num_freq_discrep_filtered_snps tot_num_maf_filtered_snps += num_maf_filtered_snps if not debug: sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0)) sys.stdout.flush() # Now calculate the prediction r^2 if validation_genotype_file: if debug and plinkf_dict_val['has_phenotype']: maf_adj_corr = sp.corrcoef( plinkf_dict_val['phenotypes'], maf_adj_risk_scores)[0, 1] print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' % (maf_adj_corr, maf_adj_corr ** 2)) print('Overall nucleotide concordance counts: rg_vg: %d, rg_ss: %d, vg_ss: %d' % (tot_g_vg_nt_concord_count, tot_g_ss_nt_concord_count, tot_vg_ss_nt_concord_count)) else: if debug: print('Overall nucleotide concordance counts, rg_ss: %d' % (tot_g_ss_nt_concord_count)) summary_dict[7]={'name':'Num chromosomes used:','value':len(chromosomes_found)} summary_dict[8]={'name':'SNPs common across datasets:','value':num_snps_common_before_filtering} summary_dict[9]={'name':'SNPs retained after filtering:','value':num_snps_common_after_filtering} if tot_num_ambig_nts>0: summary_dict[10]={'name':'SNPs w ambiguous nucleotides filtered:','value':tot_num_ambig_nts} if tot_num_non_supported_nts>0: summary_dict[10.1]={'name':'SNPs w unknown/unsupported nucleotides filtered:','value':tot_num_non_supported_nts} if tot_num_non_matching_nts>0: summary_dict[11]={'name':'SNPs w other nucleotide discrepancies filtered:','value':tot_num_non_matching_nts} if min_maf>0: summary_dict[12]={'name':'SNPs w MAF<%0.3f filtered:'%min_maf,'value':tot_num_maf_filtered_snps} if max_freq_discrep<0.5: summary_dict[13]={'name':'SNPs w allele freq discrepancy > %0.3f filtered:'%max_freq_discrep,'value':tot_num_freq_discrep_filtered_snps} t1 = time.time() t = (t1 - t0) summary_dict[13.9]={'name':'dash', 'value':'Running times'} summary_dict[15]={'name':'Run time for coordinating datasets:','value': '%d min and %0.2f sec'%(t / 60, t % 60)}