def main(p_dict):
    """Run the coordination step and write its result to a new HDF5 file.

    Picks the BIM file that defines the validation SNPs, refuses to overwrite
    an existing output file, parses the summary statistics into the HDF5 file,
    coordinates them with the genotype data and prints a summary.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'vbim', 'vgf', 'gf',
            'out', 'ssf', 'max_freq_discrep', 'maf', 'skip_coordination'
            and 'debug'.

    Raises:
        Exception: If the output file ``p_dict['out']`` already exists.
    """
    # Precedence: explicit BIM file, then validation genotypes, then the LD
    # reference genotypes.
    if p_dict['vbim'] is not None:
        bimfile = p_dict['vbim']
    elif p_dict['vgf'] is not None:
        bimfile = p_dict['vgf'] + '.bim'
    elif p_dict['gf'] is not None:
        bimfile = p_dict['gf'] + '.bim'
    else:
        # Only a warning: execution continues with bimfile=None, as before.
        bimfile = None
        print('Set of validation SNPs is missing! Please specify either a validation PLINK genotype file, '
              'or a PLINK BIM file with the SNPs of interest.')

    if os.path.isfile(p_dict['out']):
        print('Output file (%s) already exists! Delete, rename it, or use a different output file.'
              % (p_dict['out']))
        raise Exception('Output file already exists!')

    summary_dict = {}
    summary_dict[0] = {'name': 'Summary statistics filename:', 'value': p_dict['ssf']}
    summary_dict[1] = {'name': 'LD reference genotypes filename:', 'value': p_dict['gf']}
    summary_dict[3] = {'name': 'Coordinated data output filename:', 'value': p_dict['out']}
    if p_dict['vgf'] is not None:
        summary_dict[2] = {'name': 'Validation genotypes filename:', 'value': p_dict['vgf']}

    h5f = h5py.File(p_dict['out'], 'w')
    try:
        sum_stats_parsers.parse_sum_stats(h5f, p_dict, bimfile, summary_dict)
        coordinate_datasets(p_dict['gf'], h5f, summary_dict,
                            validation_genotype_file=p_dict['vgf'],
                            max_freq_discrep=p_dict['max_freq_discrep'],
                            min_maf=p_dict['maf'],
                            skip_coordination=p_dict['skip_coordination'],
                            debug=p_dict['debug'])
    finally:
        # Always release the file handle, also when parsing or coordination
        # fails part-way through.
        h5f.close()

    reporting.print_summary(summary_dict, 'Summary of coordination step')
def main(p_dict):
    """Obtain LD information, run the LDpred Gibbs sampler and print a summary.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'cf', 'out', 'ldf',
            'ldr', 'debug', 'no_ld_compression', 'hickle_ld', 'f', 'N',
            'n_iter', 'n_burn_in', 'h2' and 'use_gw_h2'.
    """
    # Report entries are keyed by numbers; both steps below add to this dict.
    report = {
        0: {'name': 'Coordinated data filename', 'value': p_dict['cf']},
        0.1: {'name': 'SNP weights output file (prefix)', 'value': p_dict['out']},
        0.2: {'name': 'LD data filename (prefix)', 'value': p_dict['ldf']},
        1: {'name': 'LD radius used', 'value': str(p_dict['ldr'])},
    }

    # --- LD information ---
    ld_start = time.time()
    report[1.09] = {'name': 'dash', 'value': 'LD information'}
    ld_dict = ld.get_ld_dict(
        p_dict['cf'],
        p_dict['ldf'],
        p_dict['ldr'],
        verbose=p_dict['debug'],
        compressed=not p_dict['no_ld_compression'],
        use_hickle=p_dict['hickle_ld'],
        summary_dict=report,
    )
    ld_secs = time.time() - ld_start
    report[1.2] = {
        'name': 'Running time for calculating LD information:',
        'value': '%d min and %0.2f secs' % (ld_secs / 60, ld_secs % 60),
    }

    # --- Gibbs sampler ---
    gibbs_start = time.time()
    report[1.9] = {'name': 'dash', 'value': 'LDpred Gibbs sampler'}
    ldpred_genomewide(
        data_file=p_dict['cf'],
        out_file_prefix=p_dict['out'],
        ps=p_dict['f'],
        ld_radius=p_dict['ldr'],
        ld_dict=ld_dict,
        n=p_dict['N'],
        num_iter=p_dict['n_iter'],
        burn_in=p_dict['n_burn_in'],
        h2=p_dict['h2'],
        use_gw_h2=p_dict['use_gw_h2'],
        verbose=p_dict['debug'],
        summary_dict=report,
    )
    gibbs_secs = time.time() - gibbs_start
    report[2.2] = {
        'name': 'Running time for Gibbs sampler(s):',
        'value': '%d min and %0.2f secs' % (gibbs_secs / 60, gibbs_secs % 60),
    }

    reporting.print_summary(report, 'Summary of LDpred Gibbs')
def main(p_dict):
    """Run LDpred-fast genome-wide and print a summary of the run.

    LD information is only computed when the effect type of the coordinated
    data is not 'BLUP'.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'cf', 'out', 'ldf',
            'ldr', 'f', 'N', 'h2', 'use_gw_h2' and 'debug'. The whole dict is
            also handed to ``ld.get_ld_dict_using_p_dict``.
    """
    summary_dict = {}
    summary_dict[0] = {
        'name': 'Coordinated data filename',
        'value': p_dict['cf']
    }
    summary_dict[0.1] = {
        'name': 'SNP weights output file (prefix)',
        'value': p_dict['out']
    }

    eff_type = get_eff_type(p_dict['cf'])

    # Stays None when the LD step is skipped. Previously the name was left
    # unbound in the BLUP case, so the call below raised a NameError.
    # NOTE(review): assumes ldpred_fast_genomewide accepts ld_dict=None when
    # eff_type == 'BLUP' -- confirm against its implementation.
    ld_dict = None

    # If already BLUP betas, then skip LD calculation
    if eff_type != 'BLUP':
        summary_dict[0.2] = {
            'name': 'LD data filename (prefix)',
            'value': p_dict['ldf']
        }
        summary_dict[1.01] = {
            'name': 'LD radius used',
            'value': str(p_dict['ldr'])
        }
        summary_dict[1] = {'name': 'dash', 'value': 'LD information'}
        t0 = time.time()
        ld_dict = ld.get_ld_dict_using_p_dict(p_dict, summary_dict)
        t = time.time() - t0
        summary_dict[1.2] = {
            'name': 'Running time for calculating LD information:',
            'value': '%d min and %0.2f secs' % (t / 60, t % 60)
        }

    t0 = time.time()
    summary_dict[1.9] = {'name': 'dash', 'value': 'LDpred-fast'}
    ldpred_fast_genomewide(
        data_file=p_dict['cf'],
        out_file_prefix=p_dict['out'],
        ps=p_dict['f'],
        ld_radius=p_dict['ldr'],
        ld_dict=ld_dict,
        n=p_dict['N'],
        h2=p_dict['h2'],
        use_gw_h2=p_dict['use_gw_h2'],
        eff_type=eff_type,
        summary_dict=summary_dict,
        debug=p_dict['debug'],
    )
    t = time.time() - t0
    summary_dict[3] = {
        'name': 'Running time for LDpred-fast:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }
    reporting.print_summary(summary_dict, 'Summary of LDpred-fast')
def main(p_dict):
    """Obtain LD information, run LDpred-inf genome-wide and print a summary.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'cf', 'out', 'ldf',
            'ldr', 'N', 'h2', 'use_gw_h2' and 'debug'. The whole dict is also
            handed to ``ld.get_ld_dict_using_p_dict``.
    """
    summary_dict = {}
    summary_dict[0] = {
        'name': 'Coordinated data filename',
        'value': p_dict['cf']
    }
    summary_dict[0.1] = {
        'name': 'SNP weights output file (prefix)',
        'value': p_dict['out']
    }
    summary_dict[0.2] = {
        'name': 'LD data filename (prefix)',
        'value': p_dict['ldf']
    }
    summary_dict[1] = {'name': 'LD radius used', 'value': str(p_dict['ldr'])}

    t0 = time.time()
    summary_dict[1.09] = {'name': 'dash', 'value': 'LD information'}
    # Hand over the real summary dict (as the other LDpred entry points do)
    # so that entries added by the LD step end up in the printed summary
    # instead of a throwaway dict.
    ld_dict = ld.get_ld_dict_using_p_dict(p_dict, summary_dict=summary_dict)
    t = time.time() - t0
    summary_dict[1.2] = {
        'name': 'Running time for calculating LD information:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }

    t0 = time.time()
    summary_dict[1.9] = {'name': 'dash', 'value': 'LDpred infinitesimal model'}
    ldpred_inf_genomewide(data_file=p_dict['cf'],
                          out_file_prefix=p_dict['out'],
                          ld_radius=p_dict['ldr'],
                          ld_dict=ld_dict,
                          n=p_dict['N'],
                          h2=p_dict['h2'],
                          use_gw_h2=p_dict['use_gw_h2'],
                          verbose=p_dict['debug'],
                          summary_dict=summary_dict)
    t = time.time() - t0
    summary_dict[2.2] = {
        'name': 'Running time for LDpred-inf:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }
    reporting.print_summary(summary_dict, 'Summary of LDpred-inf')
def main(p_dict):
    """Obtain LD information, run the LDpred Gibbs sampler and print a summary.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'cf', 'out', 'ldf',
            'ldr', 'debug', 'no_ld_compression', 'hickle_ld', 'f', 'N',
            'n_iter', 'n_burn_in' and 'h2'.
    """
    # Report entries are keyed by numbers; both steps below add to this dict.
    report = {
        0: {'name': 'Coordinated data filename', 'value': p_dict['cf']},
        0.1: {'name': 'SNP weights output file (prefix)', 'value': p_dict['out']},
        0.2: {'name': 'LD data filename (prefix)', 'value': p_dict['ldf']},
        1: {'name': 'LD radius used', 'value': str(p_dict['ldr'])},
    }

    # --- LD information ---
    ld_start = time.time()
    report[1.09] = {'name': 'dash', 'value': 'LD information'}
    ld_dict = ld.get_ld_dict(
        p_dict['cf'],
        p_dict['ldf'],
        p_dict['ldr'],
        verbose=p_dict['debug'],
        compressed=not p_dict['no_ld_compression'],
        use_hickle=p_dict['hickle_ld'],
        summary_dict=report,
    )
    ld_secs = time.time() - ld_start
    report[1.2] = {
        'name': 'Running time for calculating LD information:',
        'value': '%d min and %0.2f secs' % (ld_secs / 60, ld_secs % 60),
    }

    # --- Gibbs sampler ---
    gibbs_start = time.time()
    report[1.9] = {'name': 'dash', 'value': 'LDpred Gibbs sampler'}
    ldpred_genomewide(
        data_file=p_dict['cf'],
        out_file_prefix=p_dict['out'],
        ps=p_dict['f'],
        ld_radius=p_dict['ldr'],
        ld_dict=ld_dict,
        n=p_dict['N'],
        num_iter=p_dict['n_iter'],
        burn_in=p_dict['n_burn_in'],
        h2=p_dict['h2'],
        verbose=p_dict['debug'],
        summary_dict=report,
    )
    gibbs_secs = time.time() - gibbs_start
    report[2.2] = {
        'name': 'Running time for Gibbs sampler(s):',
        'value': '%d min and %0.2f secs' % (gibbs_secs / 60, gibbs_secs % 60),
    }

    reporting.print_summary(report, 'Summary of LDpred Gibbs')
def main(p_dict):
    """Run LD-pruning + thresholding (P+T) and print a summary of the run.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'cf', 'out' and 'ldr'.
            The whole dict is also handed to ``run_pt``.
    """
    summary_dict = {}
    summary_dict[0] = {
        'name': 'Coordinated data filename',
        'value': p_dict['cf']
    }
    summary_dict[0.1] = {
        'name': 'SNP weights output file (prefix)',
        'value': p_dict['out']
    }
    summary_dict[1] = {'name': 'LD radius used', 'value': str(p_dict['ldr'])}

    t0 = time.time()
    summary_dict[1.1] = {'name': 'dash', 'value': 'LD-pruning + Thresholding'}
    run_pt(p_dict, summary_dict)
    t = time.time() - t0
    summary_dict[2] = {
        'name': 'Running time for calculating P+T:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }
    # Title spelling fixed ("Tresholding" -> "Thresholding") to match the
    # section header above.
    reporting.print_summary(summary_dict,
                            'Summary of LD-pruning + Thresholding')
def main(p_dict):
    """Score validation genotypes with LDpred / P+T SNP weights and report.

    Unless ``p_dict['only_score']`` is set, phenotypes (and optionally
    covariates and PCs) are parsed first. Risk scores are then calculated for
    every weights file found under the ``p_dict['rf']`` prefix, and a summary
    is printed.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'debug', 'only_score',
            'gf', 'rf', 'out', 'pf', 'pf_format', 'cov_file', 'pcs_file',
            'rf_format', 'split_by_chrom', 'f', 'r2' and 'p'.

    Raises:
        Exception: If neither a phenotype file nor a genotype prefix is given.
        AssertionError: If no phenotypes or no SNP weights file were found.
    """
    summary_dict = {}
    verbose = p_dict['debug']
    only_score = p_dict['only_score']
    t0 = time.time()

    summary_dict[0] = {
        'name': 'Validation genotype file (prefix):',
        'value': p_dict['gf']
    }
    summary_dict[0.1] = {
        'name': 'Input weight file(s) (prefix):',
        'value': p_dict['rf']
    }
    summary_dict[0.2] = {
        'name': 'Output scores file(s) (prefix):',
        'value': p_dict['out']
    }

    adjust_for_pcs = False
    adjust_for_covs = False
    if not only_score:
        summary_dict[0.9] = {'name': 'dash', 'value': 'Phenotypes'}
        print('Parsing phenotypes')
        if p_dict['pf'] is None:
            # Fall back to the phenotype column of the PLINK FAM file.
            if p_dict['gf'] is not None:
                phen_map = parse_phen_file(p_dict['gf'] + '.fam', 'FAM',
                                           verbose=verbose,
                                           summary_dict=summary_dict)
            else:
                raise Exception('Validation phenotypes were not found.')
        else:
            phen_map = parse_phen_file(p_dict['pf'], p_dict['pf_format'],
                                       verbose=verbose,
                                       summary_dict=summary_dict)
        t = time.time() - t0
        summary_dict[1.1] = {
            'name': 'Individuals with phenotype information:',
            'value': len(phen_map)
        }
        summary_dict[1.2] = {
            'name': 'Running time for parsing phenotypes:',
            'value': '%d min and %0.2f secs' % (t / 60, t % 60)
        }

        if p_dict['cov_file'] is not None:
            adjust_for_covs = True
            if verbose:
                print('Parsing additional covariates')
            with open(p_dict['cov_file'], 'r') as f:
                num_missing = 0
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Blank line (e.g. trailing newline): skip instead of
                        # failing with an IndexError.
                        continue
                    # Individual ID is the first column; the remaining
                    # columns are parsed as floats.
                    iid = fields[0]
                    if iid in phen_map:
                        phen_map[iid]['covariates'] = list(
                            map(float, fields[1:]))
                    else:
                        num_missing += 1
                if num_missing > 0:
                    summary_dict[2.1] = {
                        'name': 'Individuals w missing covariate information:',
                        'value': num_missing
                    }
                    if verbose:
                        print('Unable to find %d iids in phen file!'
                              % num_missing)
            summary_dict[2] = {
                'name': 'Parsed covariates file:',
                'value': p_dict['cov_file']
            }

        if p_dict['pcs_file']:
            adjust_for_pcs = True
            if verbose:
                print('Parsing PCs')
            with open(p_dict['pcs_file'], 'r') as f:
                num_missing = 0
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Blank line: skip instead of failing with IndexError.
                        continue
                    # Individual ID is the second column; the columns after
                    # it are parsed as floats.
                    iid = fields[1]
                    if iid in phen_map:
                        phen_map[iid]['pcs'] = list(map(float, fields[2:]))
                    else:
                        num_missing += 1
                if num_missing > 0:
                    summary_dict[3.1] = {
                        'name': 'Individuals w missing PCs:',
                        'value': num_missing
                    }
                    if verbose:
                        print('Unable to find %d iids in phen file!'
                              % num_missing)
            summary_dict[3] = {
                'name': 'Parsed PCs file:',
                'value': p_dict['pcs_file']
            }

        assert len(phen_map) > 0, 'No phenotypes were found!'
    else:
        phen_map = None

    t0 = time.time()
    prs_file_is_missing = True
    res_dict = {}
    # Keyword arguments shared by every calc_risk_scores call below.
    score_kwargs = dict(split_by_chrom=p_dict['split_by_chrom'],
                        adjust_for_pcs=adjust_for_pcs,
                        adjust_for_covariates=adjust_for_covs,
                        only_score=only_score,
                        verbose=verbose,
                        summary_dict=summary_dict)

    if p_dict['rf_format'] == 'LDPRED' or p_dict['rf_format'] == 'ANY':
        weights_file = '%s_LDpred-inf.txt' % (p_dict['rf'])
        if os.path.isfile(weights_file):
            print('')
            print('Calculating LDpred-inf risk scores')
            rs_id_map = parse_ldpred_res(weights_file)
            out_file = '%s_LDpred-inf.txt' % (p_dict['out'])
            inf_res = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map,
                                       out_file=out_file, **score_kwargs)
            res_dict['LDpred_inf'] = inf_res
            # Guard against an empty result, as is done for the other
            # methods below, instead of raising a KeyError.
            if not only_score and len(inf_res):
                summary_dict[5.2] = {
                    'name': 'LDpred_inf (unadjusted) Pearson R2:',
                    'value': '%0.4f' % inf_res['pred_r2']
                }
            prs_file_is_missing = False

        best_ldpred_pred_r2 = 0
        best_p = None
        for p in p_dict['f']:
            weights_file = '%s_LDpred_p%0.4e.txt' % (p_dict['rf'], p)
            if os.path.isfile(weights_file):
                print('')
                print('Calculating LDpred risk scores using f=%0.3e' % p)
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_LDpred_p%0.4e.txt' % (p_dict['out'], p)
                method_str = 'LDpred_p%0.4e' % (p)
                res = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map,
                                       out_file=out_file, **score_kwargs)
                res_dict[method_str] = res
                if len(res) and res['pred_r2'] > best_ldpred_pred_r2:
                    best_ldpred_pred_r2 = res['pred_r2']
                    best_p = p
                prs_file_is_missing = False
        if best_ldpred_pred_r2 > 0 and not only_score:
            summary_dict[5.3] = {
                'name': 'Best LDpred (f=%0.2e) (unadjusted) R2:' % (best_p),
                'value': '%0.4f' % best_ldpred_pred_r2
            }

    if p_dict['rf_format'] == 'P+T' or p_dict['rf_format'] == 'ANY':
        best_pt_pred_r2 = 0
        best_t = None
        best_r2 = None
        for max_r2 in p_dict['r2']:
            for p_thres in p_dict['p']:
                weights_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (
                    p_dict['rf'], max_r2, p_thres)
                if os.path.isfile(weights_file):
                    print(
                        'Calculating P+T risk scores using p-value threshold of %0.3e, and r2 threshold of %0.2f'
                        % (p_thres, max_r2))
                    rs_id_map, non_zero_chromosomes = parse_pt_res(
                        weights_file)
                    if len(rs_id_map) > 0:
                        # Include the r2 threshold in the output name and the
                        # result key; with the p-value alone, results for
                        # different r2 thresholds overwrote each other.
                        out_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (
                            p_dict['out'], max_r2, p_thres)
                        method_str = 'P+T_r%0.2f_p%0.4e' % (max_r2, p_thres)
                        res = calc_risk_scores(
                            p_dict['gf'], rs_id_map, phen_map,
                            out_file=out_file,
                            non_zero_chromosomes=non_zero_chromosomes,
                            **score_kwargs)
                        res_dict[method_str] = res
                        if len(res) and res['pred_r2'] > best_pt_pred_r2:
                            best_pt_pred_r2 = res['pred_r2']
                            best_t = p_thres
                            best_r2 = max_r2
                    else:
                        print(
                            'No SNPs found with p-values below the given threshold.'
                        )
                    prs_file_is_missing = False
        if best_pt_pred_r2 > 0 and not only_score:
            summary_dict[5.4] = {
                'name': 'Best P+T (r2=%0.2f, p=%0.2e) (unadjusted) R2:' %
                (best_r2, best_t),
                'value': '%0.4f' % best_pt_pred_r2
            }

    assert not prs_file_is_missing, 'No SNP weights file was found. A prefix to these should be provided via the --rf flag. Note that the prefix should exclude the _LDpred_.. extension or file ending. '

    # Identifying the best prediction
    if not only_score:
        best_pred_r2 = 0
        best_method_str = None
        for method_str, res in res_dict.items():
            if len(res) and res['pred_r2'] > best_pred_r2:
                best_pred_r2 = res['pred_r2']
                best_method_str = method_str
        if best_method_str is not None:
            print(
                'The highest (unadjusted) Pearson R2 was %0.4f, and provided by %s'
                % (best_pred_r2, best_method_str))
            summary_dict[5.99] = {
                'name': 'dash',
                'value': 'Optimal polygenic score'
            }
            summary_dict[6] = {
                'name': 'Method with highest (unadjusted) Pearson R2:',
                'value': best_method_str
            }
            summary_dict[6.1] = {
                'name': 'Best (unadjusted) Pearson R2:',
                'value': '%0.4f' % best_pred_r2
            }

    t = time.time() - t0
    summary_dict[4.9] = {'name': 'dash', 'value': 'Scoring'}
    summary_dict[5.9] = {
        'name': 'Running time for calculating scores:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }
    # Only reachable when assertions are disabled (python -O).
    if prs_file_is_missing:
        print(
            'SNP weights files were not found. This could be due to a mis-specified --rf flag, or other issues.'
        )
    reporting.print_summary(summary_dict, 'Scoring Summary')
def main(p_dict):
    """Score validation genotypes with LDpred / P+T SNP weights and report.

    Unless ``p_dict['only_score']`` is set, phenotypes (and optionally
    covariates and PCs) are parsed first. Risk scores are then calculated for
    every weights file found under the ``p_dict['rf']`` prefix, and a summary
    is printed.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'debug', 'only_score',
            'gf', 'rf', 'out', 'pf', 'pf_format', 'cov_file', 'pcs_file',
            'rf_format', 'split_by_chrom', 'f', 'r2' and 'p'.

    Raises:
        Exception: If neither a phenotype file nor a genotype prefix is given.
        AssertionError: If no phenotypes or no SNP weights file were found.
    """
    summary_dict = {}
    verbose = p_dict['debug']
    only_score = p_dict['only_score']
    t0 = time.time()

    summary_dict[0] = {'name': 'Validation genotype file (prefix):', 'value': p_dict['gf']}
    summary_dict[0.1] = {'name': 'Input weight file(s) (prefix):', 'value': p_dict['rf']}
    summary_dict[0.2] = {'name': 'Output scores file(s) (prefix):', 'value': p_dict['out']}

    adjust_for_pcs = False
    adjust_for_covs = False
    if not only_score:
        summary_dict[0.9] = {'name': 'dash', 'value': 'Phenotypes'}
        print('Parsing phenotypes')
        if p_dict['pf'] is None:
            # Fall back to the phenotype column of the PLINK FAM file.
            if p_dict['gf'] is not None:
                phen_map = parse_phen_file(p_dict['gf'] + '.fam', 'FAM',
                                           verbose=verbose, summary_dict=summary_dict)
            else:
                raise Exception('Validation phenotypes were not found.')
        else:
            phen_map = parse_phen_file(p_dict['pf'], p_dict['pf_format'],
                                       verbose=verbose, summary_dict=summary_dict)
        t = time.time() - t0
        summary_dict[1.1] = {'name': 'Individuals with phenotype information:', 'value': len(phen_map)}
        summary_dict[1.2] = {'name': 'Running time for parsing phenotypes:',
                             'value': '%d min and %0.2f secs' % (t / 60, t % 60)}

        if p_dict['cov_file'] is not None:
            adjust_for_covs = True
            if verbose:
                print('Parsing additional covariates')
            with open(p_dict['cov_file'], 'r') as f:
                num_missing = 0
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Blank line (e.g. trailing newline): skip instead of
                        # failing with an IndexError.
                        continue
                    # Individual ID is the first column; the remaining
                    # columns are parsed as floats.
                    iid = fields[0]
                    if iid in phen_map:
                        phen_map[iid]['covariates'] = list(map(float, fields[1:]))
                    else:
                        num_missing += 1
                if num_missing > 0:
                    summary_dict[2.1] = {'name': 'Individuals w missing covariate information:',
                                         'value': num_missing}
                    if verbose:
                        print('Unable to find %d iids in phen file!' % num_missing)
            summary_dict[2] = {'name': 'Parsed covariates file:', 'value': p_dict['cov_file']}

        if p_dict['pcs_file']:
            adjust_for_pcs = True
            if verbose:
                print('Parsing PCs')
            with open(p_dict['pcs_file'], 'r') as f:
                num_missing = 0
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Blank line: skip instead of failing with IndexError.
                        continue
                    # Individual ID is the second column; the columns after
                    # it are parsed as floats.
                    iid = fields[1]
                    if iid in phen_map:
                        phen_map[iid]['pcs'] = list(map(float, fields[2:]))
                    else:
                        num_missing += 1
                if num_missing > 0:
                    summary_dict[3.1] = {'name': 'Individuals w missing PCs:', 'value': num_missing}
                    if verbose:
                        print('Unable to find %d iids in phen file!' % num_missing)
            summary_dict[3] = {'name': 'Parsed PCs file:', 'value': p_dict['pcs_file']}

        assert len(phen_map) > 0, 'No phenotypes were found!'
    else:
        phen_map = None

    t0 = time.time()
    prs_file_is_missing = True
    res_dict = {}
    # Keyword arguments shared by every calc_risk_scores call below.
    score_kwargs = dict(split_by_chrom=p_dict['split_by_chrom'],
                        adjust_for_pcs=adjust_for_pcs,
                        adjust_for_covariates=adjust_for_covs,
                        only_score=only_score,
                        verbose=verbose,
                        summary_dict=summary_dict)

    if p_dict['rf_format'] == 'LDPRED' or p_dict['rf_format'] == 'ANY':
        weights_file = '%s_LDpred-inf.txt' % (p_dict['rf'])
        if os.path.isfile(weights_file):
            print('')
            print('Calculating LDpred-inf risk scores')
            rs_id_map = parse_ldpred_res(weights_file)
            out_file = '%s_LDpred-inf.txt' % (p_dict['out'])
            inf_res = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map,
                                       out_file=out_file, **score_kwargs)
            res_dict['LDpred_inf'] = inf_res
            # Guard against an empty result, as is done for the other
            # methods below, instead of raising a KeyError.
            if not only_score and len(inf_res):
                summary_dict[5.2] = {'name': 'LDpred_inf (unadjusted) Pearson R2:',
                                     'value': '%0.4f' % inf_res['pred_r2']}
            prs_file_is_missing = False

        best_ldpred_pred_r2 = 0
        best_p = None
        for p in p_dict['f']:
            weights_file = '%s_LDpred_p%0.4e.txt' % (p_dict['rf'], p)
            if os.path.isfile(weights_file):
                print('')
                print('Calculating LDpred risk scores using f=%0.3e' % p)
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_LDpred_p%0.4e.txt' % (p_dict['out'], p)
                method_str = 'LDpred_p%0.4e' % (p)
                res = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map,
                                       out_file=out_file, **score_kwargs)
                res_dict[method_str] = res
                if len(res) and res['pred_r2'] > best_ldpred_pred_r2:
                    best_ldpred_pred_r2 = res['pred_r2']
                    best_p = p
                prs_file_is_missing = False
        if best_ldpred_pred_r2 > 0 and not only_score:
            summary_dict[5.3] = {'name': 'Best LDpred (f=%0.2e) (unadjusted) R2:' % (best_p),
                                 'value': '%0.4f' % best_ldpred_pred_r2}

    if p_dict['rf_format'] == 'P+T' or p_dict['rf_format'] == 'ANY':
        best_pt_pred_r2 = 0
        best_t = None
        best_r2 = None
        for max_r2 in p_dict['r2']:
            for p_thres in p_dict['p']:
                weights_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (p_dict['rf'], max_r2, p_thres)
                if os.path.isfile(weights_file):
                    print('')
                    print('Calculating P+T risk scores using p-value threshold of %0.3e, and r2 threshold of %0.2f' % (p_thres, max_r2))
                    rs_id_map, non_zero_chromosomes = parse_pt_res(weights_file)
                    # Include the r2 threshold in the output name and the
                    # result key; with the p-value alone, results for
                    # different r2 thresholds overwrote each other.
                    out_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (p_dict['out'], max_r2, p_thres)
                    method_str = 'P+T_r%0.2f_p%0.4e' % (max_r2, p_thres)
                    res = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map,
                                           out_file=out_file,
                                           non_zero_chromosomes=non_zero_chromosomes,
                                           **score_kwargs)
                    res_dict[method_str] = res
                    if len(res) and res['pred_r2'] > best_pt_pred_r2:
                        best_pt_pred_r2 = res['pred_r2']
                        best_t = p_thres
                        best_r2 = max_r2
                    prs_file_is_missing = False
        if best_pt_pred_r2 > 0 and not only_score:
            summary_dict[5.4] = {'name': 'Best P+T (r2=%0.2f, p=%0.2e) (unadjusted) R2:' % (best_r2, best_t),
                                 'value': '%0.4f' % best_pt_pred_r2}

    assert not prs_file_is_missing, 'No SNP weights file was found. A prefix to these should be provided via the --rf flag. Note that the prefix should exclude the _LDpred_.. extension or file ending. '

    # Identifying the best prediction
    if not only_score:
        best_pred_r2 = 0
        best_method_str = None
        for method_str, res in res_dict.items():
            if len(res) and res['pred_r2'] > best_pred_r2:
                best_pred_r2 = res['pred_r2']
                best_method_str = method_str
        if best_method_str is not None:
            print('The highest (unadjusted) Pearson R2 was %0.4f, and provided by %s' % (best_pred_r2, best_method_str))
            summary_dict[5.99] = {'name': 'dash', 'value': 'Optimal polygenic score'}
            summary_dict[6] = {'name': 'Method with highest (unadjusted) Pearson R2:', 'value': best_method_str}
            summary_dict[6.1] = {'name': 'Best (unadjusted) Pearson R2:', 'value': '%0.4f' % best_pred_r2}

    t = time.time() - t0
    summary_dict[4.9] = {'name': 'dash', 'value': 'Scoring'}
    summary_dict[5.9] = {'name': 'Running time for calculating scores:',
                         'value': '%d min and %0.2f secs' % (t / 60, t % 60)}
    # Only reachable when assertions are disabled (python -O).
    if prs_file_is_missing:
        print('SNP weights files were not found. This could be due to a mis-specified --rf flag, or other issues.')
    reporting.print_summary(summary_dict, 'Scoring Summary')
def main(p_dict):
    """Score validation genotypes with LDpred / LDpred-fast / P+T weights.

    Unless ``p_dict['only_score']`` is set, phenotypes (and optionally
    covariates and PCs) are parsed first. Risk scores are then calculated for
    every weights file found under the ``p_dict['rf']`` prefix, an optional
    results summary file is written, and a summary is printed.

    Args:
        p_dict: Parameter dictionary. Keys read here: 'summary_file',
            'only_score', 'debug', 'gf', 'rf', 'out', 'pf', 'pf_format',
            'cov_file', 'pcs_file', 'rf_format', 'split_by_chrom', 'f', 'r2'
            and 'p'. The whole dict is also handed to ``parse_covariates``
            and ``parse_pcs``.

    Raises:
        Exception: If neither a phenotype file nor a genotype prefix is given.
        AssertionError: If a summary file is requested together with
            'only_score', or if no phenotypes or no SNP weights file were
            found.
    """
    assert p_dict['summary_file'] is None or not p_dict[
        'only_score'], 'Prediction summary file cannot be produced when the --only-score flag is set.'

    summary_dict = {}
    verbose = p_dict['debug']
    only_score = p_dict['only_score']
    t0 = time.time()

    summary_dict[0] = {
        'name': 'Validation genotype file (prefix):',
        'value': p_dict['gf']
    }
    summary_dict[0.1] = {
        'name': 'Input weight file(s) (prefix):',
        'value': p_dict['rf']
    }
    summary_dict[0.2] = {
        'name': 'Output scores file(s) (prefix):',
        'value': p_dict['out']
    }

    adjust_for_pcs = False
    adjust_for_covs = False
    if not only_score:
        summary_dict[0.9] = {'name': 'dash', 'value': 'Phenotypes'}
        if verbose:
            print('Parsing phenotypes')
        if p_dict['pf'] is None:
            # Fall back to the phenotype column of the PLINK FAM file.
            if p_dict['gf'] is not None:
                phen_map = parse_phen_file(p_dict['gf'] + '.fam', 'FAM',
                                           verbose=verbose,
                                           summary_dict=summary_dict)
            else:
                raise Exception('Validation phenotypes were not found.')
        else:
            phen_map = parse_phen_file(p_dict['pf'], p_dict['pf_format'],
                                       verbose=verbose,
                                       summary_dict=summary_dict)
        t = time.time() - t0
        summary_dict[1.1] = {
            'name': 'Individuals with phenotype information:',
            'value': len(phen_map)
        }
        summary_dict[1.2] = {
            'name': 'Running time for parsing phenotypes:',
            'value': '%d min and %0.2f secs' % (t / 60, t % 60)
        }

        if p_dict['cov_file'] is not None:
            adjust_for_covs = True
            if verbose:
                print('Parsing additional covariates')
            parse_covariates(p_dict, phen_map, summary_dict, verbose)

        if p_dict['pcs_file']:
            adjust_for_pcs = True
            if verbose:
                print('Parsing PCs')
            parse_pcs(p_dict, phen_map, summary_dict, verbose)

        assert len(phen_map) > 0, 'No phenotypes were found!'
    else:
        phen_map = None

    t0 = time.time()
    prs_file_is_missing = True
    res_dict = {}
    # Keyword arguments shared by every calc_risk_scores call below.
    score_kwargs = dict(split_by_chrom=p_dict['split_by_chrom'],
                        adjust_for_pcs=adjust_for_pcs,
                        adjust_for_covariates=adjust_for_covs,
                        only_score=only_score,
                        verbose=verbose,
                        summary_dict=summary_dict)

    if p_dict['rf_format'] == 'LDPRED' or p_dict['rf_format'] == 'ANY':
        weights_file = '%s_LDpred-inf.txt' % (p_dict['rf'])
        if os.path.isfile(weights_file):
            print('')
            print('Calculating LDpred-inf risk scores')
            rs_id_map = parse_ldpred_res(weights_file)
            out_file = '%s_LDpred-inf.txt' % (p_dict['out'])
            inf_res = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map,
                                       out_file=out_file, **score_kwargs)
            res_dict['LDpred_inf'] = inf_res
            # Guard against an empty result, as is done for the other
            # methods below, instead of raising a KeyError.
            if not only_score and len(inf_res):
                summary_dict[5.2] = {
                    'name': 'LDpred_inf (unadjusted) Pearson R2:',
                    'value': '%0.4f' % inf_res['pred_r2']
                }
            prs_file_is_missing = False

        best_ldpred_pred_r2 = 0
        best_p = None
        for p in p_dict['f']:
            weights_file = '%s_LDpred_p%0.4e.txt' % (p_dict['rf'], p)
            if os.path.isfile(weights_file):
                print('')
                print('Calculating LDpred risk scores using f=%0.3e' % p)
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_LDpred_p%0.4e.txt' % (p_dict['out'], p)
                method_str = 'LDpred_p%0.4e' % (p)
                res = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map,
                                       out_file=out_file, **score_kwargs)
                res_dict[method_str] = res
                if len(res) and res['pred_r2'] > best_ldpred_pred_r2:
                    best_ldpred_pred_r2 = res['pred_r2']
                    best_p = p
                prs_file_is_missing = False
        if best_ldpred_pred_r2 > 0 and not only_score:
            summary_dict[5.3] = {
                'name': 'Best LDpred (f=%0.2e) (unadjusted) R2:' % (best_p),
                'value': '%0.4f' % best_ldpred_pred_r2
            }

        best_ldpred_fast_pred_r2 = 0
        best_p = None
        for p in p_dict['f']:
            weights_file = '%s_LDpred_fast_p%0.4e.txt' % (p_dict['rf'], p)
            if os.path.isfile(weights_file):
                print('')
                print('Calculating LDpred-fast risk scores using f=%0.3e' % p)
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_LDpred_fast_p%0.4e.txt' % (p_dict['out'], p)
                method_str = 'LDpred_fast_p%0.4e' % (p)
                res = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map,
                                       out_file=out_file, **score_kwargs)
                res_dict[method_str] = res
                if len(res) and res['pred_r2'] > best_ldpred_fast_pred_r2:
                    best_ldpred_fast_pred_r2 = res['pred_r2']
                    best_p = p
                prs_file_is_missing = False
        if best_ldpred_fast_pred_r2 > 0 and not only_score:
            summary_dict[5.4] = {
                'name':
                'Best LDpred-fast (f=%0.2e) (unadjusted) R2:' % (best_p),
                'value': '%0.4f' % best_ldpred_fast_pred_r2
            }

    if p_dict['rf_format'] == 'P+T' or p_dict['rf_format'] == 'ANY':
        best_pt_pred_r2 = 0
        best_t = None
        best_r2 = None
        for max_r2 in p_dict['r2']:
            for p_thres in p_dict['p']:
                weights_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (
                    p_dict['rf'], max_r2, p_thres)
                if os.path.isfile(weights_file):
                    print(
                        'Calculating P+T risk scores using p-value threshold of %0.3e, and r2 threshold of %0.2f'
                        % (p_thres, max_r2))
                    rs_id_map, non_zero_chromosomes = parse_pt_res(
                        weights_file)
                    if len(rs_id_map) > 0:
                        out_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (
                            p_dict['out'], max_r2, p_thres)
                        method_str = 'P+T_r%0.2f_p%0.4e' % (max_r2, p_thres)
                        res = calc_risk_scores(
                            p_dict['gf'], rs_id_map, phen_map,
                            out_file=out_file,
                            non_zero_chromosomes=non_zero_chromosomes,
                            **score_kwargs)
                        res_dict[method_str] = res
                        if len(res) and res['pred_r2'] > best_pt_pred_r2:
                            best_pt_pred_r2 = res['pred_r2']
                            best_t = p_thres
                            best_r2 = max_r2
                    else:
                        print(
                            'No SNPs found with p-values below the given threshold.'
                        )
                    prs_file_is_missing = False
        if best_pt_pred_r2 > 0 and not only_score:
            summary_dict[5.5] = {
                'name': 'Best P+T (r2=%0.2f, p=%0.2e) (unadjusted) R2:' %
                (best_r2, best_t),
                'value': '%0.4f' % best_pt_pred_r2
            }

    assert not prs_file_is_missing, 'No SNP weights file was found. A prefix to these should be provided via the --rf flag. Note that the prefix should exclude the _LDpred_.. extension or file ending. '

    res_summary_file = p_dict['summary_file']
    if res_summary_file is not None and not only_score:
        with open(res_summary_file, 'w') as f:
            if verbose:
                print('Writing Results Summary to file %s' % res_summary_file)
            f.write('Pred_Method Pred_corr Pred_R2 SNPs_used\n')
            for method_str in sorted(res_dict):
                res = res_dict[method_str]
                if not len(res):
                    # An empty result has no statistics to report; skip it
                    # rather than fail with a KeyError.
                    continue
                f.write('%s %0.4f %0.4f %i\n' %
                        (method_str, res['corr_r2'], res['pred_r2'],
                         res['num_snps']))

    # Identifying the best prediction
    if not only_score:
        best_pred_r2 = 0
        best_method_str = None
        for method_str, res in res_dict.items():
            if len(res) and res['pred_r2'] > best_pred_r2:
                best_pred_r2 = res['pred_r2']
                best_method_str = method_str
        if best_method_str is not None:
            print(
                'The highest (unadjusted) Pearson R2 was %0.4f, and provided by %s'
                % (best_pred_r2, best_method_str))
            summary_dict[5.99] = {
                'name': 'dash',
                'value': 'Optimal polygenic score'
            }
            summary_dict[6] = {
                'name': 'Method with highest (unadjusted) Pearson R2:',
                'value': best_method_str
            }
            summary_dict[6.1] = {
                'name': 'Best (unadjusted) Pearson R2:',
                'value': '%0.4f' % best_pred_r2
            }
            if verbose:
                best_res = res_dict[best_method_str]
                summary_dict[6.2] = {
                    'name': 'Number of SNPs used',
                    'value': '%d' % best_res['num_snps']
                }
                summary_dict[6.3] = {
                    'name': 'Number of SNPs flipped',
                    'value': '%d' % best_res['num_flipped_nts']
                }
                summary_dict[6.4] = {
                    'name': 'Fraction of SNPs not found in validation data',
                    'value': '%0.4f' % best_res['perc_missing']
                }
                summary_dict[6.5] = {
                    'name': 'Number of duplicated SNPs',
                    'value': '%d' % best_res['duplicated_snps']
                }
                summary_dict[6.6] = {
                    'name': 'Number of non-matching nucleotides SNPs',
                    'value': '%d' % best_res['num_non_matching_nts']
                }

    t = time.time() - t0
    summary_dict[4.9] = {'name': 'dash', 'value': 'Scoring'}
    summary_dict[5.9] = {
        'name': 'Running time for calculating scores:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }
    # Only reachable when assertions are disabled (python -O).
    if prs_file_is_missing:
        print(
            'SNP weights files were not found. This could be due to a mis-specified --rf flag, or other issues.'
        )
    reporting.print_summary(summary_dict, 'Scoring Summary')