Ejemplo n.º 1
0
def get_herit_dict(df,
                   eff_type,
                   n,
                   h2,
                   use_gw_h2=False,
                   ld_dict=None,
                   summary_dict=None,
                   debug=False):
    """
    Return the heritability dictionary from ``ld.get_chromosome_herits``.

    Args:
        df: Mapping-like object (e.g. an open HDF5 file) exposing a
            ``'cord_data'`` entry.
        eff_type: Effect type; ``'BLUP'`` is not supported.
        n: Forwarded positionally to ``ld.get_chromosome_herits``.
        h2: Forwarded as ``h2``.
        use_gw_h2: Forwarded as ``use_gw_h2``.
        ld_dict: Mapping that must contain ``'ld_scores_dict'``.
        summary_dict: Forwarded as ``summary_dict``.
        debug: Forwarded as ``debug``.

    Returns:
        Whatever ``ld.get_chromosome_herits`` returns.

    Raises:
        NotImplementedError: If ``eff_type`` is ``'BLUP'``.
    """
    # Unsupported effect type: fail before touching any of the inputs.
    if eff_type == 'BLUP':
        raise NotImplementedError

    scores_by_chrom = ld_dict['ld_scores_dict']
    coord_group = df['cord_data']

    # Genome-wide heritability via LD score regression, partitioned by
    # chromosome (performed by the ``ld`` module).
    forwarded = dict(h2=h2,
                     use_gw_h2=use_gw_h2,
                     summary_dict=summary_dict,
                     debug=debug)
    return ld.get_chromosome_herits(coord_group, scores_by_chrom, n,
                                    **forwarded)
Ejemplo n.º 2
0
 def test_get_chromosome_herits(self):
     """Check heritability estimates for the golden coord file.

     Builds the LD dictionary from command-line style options, runs
     ``ld.get_chromosome_herits`` on the golden coordinated data and
     compares the chromosome 1 and genome-wide estimates with fixed values.
     """
     p_dict = make_p_dict(
         '--debug',
         'inf',
         '--cf=%s/test_data/goldens/golden.coord.hdf5' % TEST_DIR,
         '--ldr=100',
         '--ldf=' + self.tmp_file_prefix,
         '--N=10000',
         '--out=' + self.tmp_file_prefix,
     )
     summary_dict = {}
     ld_dict = ld.get_ld_dict_using_p_dict(p_dict, summary_dict)
     coord_file = os.path.join(TEST_DIR,
                               'test_data/goldens/golden.coord.hdf5')
     # The context manager closes the HDF5 handle even when an assertion
     # fails; previously the file was left open.
     with h5py.File(coord_file, 'r') as df:
         herit_dict = ld.get_chromosome_herits(df['cord_data'],
                                               ld_dict['ld_scores_dict'],
                                               n=p_dict['N'],
                                               h2=None)
         print(herit_dict)
         self.assertAlmostEqual(herit_dict['chrom_1'], 0.0741219)
         self.assertAlmostEqual(herit_dict['gw_h2_ld_score_est'], 0.0741219)
Ejemplo n.º 3
0
def ldpred_genomewide(data_file=None,
                      ld_radius=None,
                      ld_dict=None,
                      out_file_prefix=None,
                      summary_dict=None,
                      ps=None,
                      n=None,
                      h2=None,
                      num_iter=None,
                      verbose=False,
                      zero_jump_prob=0.05,
                      burn_in=5):
    """
    Calculate LDpred SNP weights for a genome, once per fraction in ``ps``.

    LDpred-inf weights are computed per chromosome first and used as the
    starting betas of the Gibbs sampler (``ldpred_gibbs``), which is then run
    for every fraction in ``ps``.

    Args:
        data_file: Path of the HDF5 file, opened read-only with ``h5py.File``.
            It must contain a ``'cord_data'`` group whose per-chromosome
            groups provide ``'snp_stds_ref'``, ``'betas'``, ``'positions'``,
            ``'sids'``, ``'log_odds'`` and ``'nts'``. An optional top-level
            ``'y'`` dataset holds validation phenotypes.
        ld_radius: LD radius, forwarded to ``ldpred_gibbs``; twice this value
            is used as the LDpred-inf window size.
        ld_dict: Mapping with keys ``'ld_scores_dict'``, ``'chrom_ld_dict'``
            and ``'chrom_ref_ld_mats'``; ``'chrom_ld_boundaries'`` is used
            when present.
        out_file_prefix: Prefix of the weight files. When falsy no file is
            written.
        summary_dict: Optional dict; it is passed to the heritability
            estimation and receives entries under keys ``2.0`` and ``2.1``.
        ps: Iterable of fractions for which the Gibbs sampler is run.
        n: Forwarded as ``n`` to the heritability, LDpred-inf and Gibbs
            steps (presumably the GWAS sample size -- confirm with callers).
        h2: Forwarded as ``h2`` to ``ld.get_chromosome_herits``.
        num_iter: Forwarded to ``ldpred_gibbs``.
        verbose: Enables per-chromosome messages and, when phenotypes are
            present, prediction-accuracy reporting.
        zero_jump_prob: Forwarded to ``ldpred_gibbs``.
        burn_in: Forwarded to ``ldpred_gibbs``.

    Returns:
        None. Results are written to ``<prefix>_LDpred_p<p>.txt`` (one per
        fraction) and ``<prefix>_LDpred-inf.txt`` and printed to stdout.
    """
    # Output accumulators always exist, so a missing output prefix or an
    # empty ``ps`` can no longer trigger a NameError further down.
    raw_effect_sizes = []
    ldpred_effect_sizes = []
    ldpred_inf_effect_sizes = []
    out_sids = []
    chromosomes = []
    out_positions = []
    out_nts = []
    convergence_report = {}

    # The context manager closes the HDF5 file on every exit path.
    with h5py.File(data_file, 'r') as df:
        has_phenotypes = 'y' in df
        if has_phenotypes:
            # Validation phenotypes found.
            y = df['y'][...]  # Phenotype
            num_individs = len(y)
        # Accuracy is only assessed in verbose mode with phenotypes.
        assess_accuracy = verbose and has_phenotypes

        ld_scores_dict = ld_dict['ld_scores_dict']
        chrom_ld_dict = ld_dict['chrom_ld_dict']
        chrom_ref_ld_mats = ld_dict['chrom_ref_ld_mats']

        print('Applying LDpred with LD radius: %d' % ld_radius)
        results_dict = {}
        cord_data_g = df['cord_data']

        # Calculating genome-wide heritability using LD score regression,
        # and partitioning heritability by chromosome.
        herit_dict = ld.get_chromosome_herits(cord_data_g,
                                              ld_scores_dict,
                                              n,
                                              h2=h2,
                                              debug=verbose,
                                              summary_dict=summary_dict)

        LDpred_inf_chrom_dict = {}
        print('Calculating LDpred-inf weights')
        for chrom_str in util.chromosomes_list:
            if chrom_str not in cord_data_g:
                continue
            print('Calculating SNP weights for Chromosome %s' %
                  ((chrom_str.split('_'))[1]))
            g = cord_data_g[chrom_str]

            # Filter monomorphic SNPs
            snp_stds = g['snp_stds_ref'][...].flatten()
            ok_snps_filter = snp_stds > 0
            pval_derived_betas = g['betas'][...][ok_snps_filter]
            LDpred_inf_chrom_dict[chrom_str] = LDpred_inf.ldpred_inf(
                pval_derived_betas,
                genotypes=None,
                reference_ld_mats=chrom_ref_ld_mats[chrom_str],
                h2=herit_dict[chrom_str],
                n=n,
                ld_window_size=2 * ld_radius,
                verbose=False)

        num_chrom = len(util.chromosomes_list)
        for p in ps:
            convergence_report[p] = False
            print('Starting LDpred gibbs with f=%0.4f' % p)
            p_str = '%0.4f' % p
            results_dict[p_str] = {}

            # Fresh output buffers for this fraction.
            raw_effect_sizes = []
            ldpred_effect_sizes = []
            ldpred_inf_effect_sizes = []
            out_sids = []
            chromosomes = []
            out_positions = []
            out_nts = []

            if has_phenotypes:
                # Reset per fraction; otherwise the scores of earlier
                # fractions leak into the accuracy of later ones.
                risk_scores_pval_derived = sp.zeros(num_individs)

            for chrom_i, chrom_str in enumerate(util.chromosomes_list,
                                                start=1):
                if chrom_str not in cord_data_g:
                    continue
                g = cord_data_g[chrom_str]
                if assess_accuracy:
                    if 'raw_snps_val' in g:
                        raw_snps = g['raw_snps_val'][...]
                    else:
                        raw_snps = g['raw_snps_ref'][...]

                # Filter monomorphic SNPs
                snp_stds = g['snp_stds_ref'][...].flatten()
                pval_derived_betas = g['betas'][...]
                positions = g['positions'][...]
                sids = (g['sids'][...]).astype(util.sids_u_dtype)
                log_odds = g['log_odds'][...]
                nts = (g['nts'][...]).astype(util.nts_u_dtype)
                ok_snps_filter = snp_stds > 0
                if not sp.all(ok_snps_filter):
                    snp_stds = snp_stds[ok_snps_filter]
                    pval_derived_betas = pval_derived_betas[ok_snps_filter]
                    positions = positions[ok_snps_filter]
                    sids = sids[ok_snps_filter]
                    log_odds = log_odds[ok_snps_filter]
                    nts = nts[ok_snps_filter]
                    if assess_accuracy:
                        raw_snps = raw_snps[ok_snps_filter]

                if out_file_prefix:
                    chromosomes.extend([chrom_str] * len(pval_derived_betas))
                    out_positions.extend(positions)
                    out_sids.extend(sids)
                    raw_effect_sizes.extend(log_odds)
                    out_nts.extend(nts)

                # One call site; LD boundaries are only passed when the LD
                # dictionary provides them.
                gibbs_kwargs = dict(
                    h2=herit_dict[chrom_str],
                    n=n,
                    p=p,
                    ld_radius=ld_radius,
                    verbose=verbose,
                    num_iter=num_iter,
                    burn_in=burn_in,
                    ld_dict=chrom_ld_dict[chrom_str],
                    start_betas=LDpred_inf_chrom_dict[chrom_str],
                    zero_jump_prob=zero_jump_prob)
                if 'chrom_ld_boundaries' in ld_dict:
                    gibbs_kwargs['ld_boundaries'] = (
                        ld_dict['chrom_ld_boundaries'][chrom_str])
                res_dict = ldpred_gibbs(pval_derived_betas, **gibbs_kwargs)

                updated_betas = res_dict['betas']
                updated_inf_betas = res_dict['inf_betas']
                sum_sqr_effects = sp.sum(updated_betas**2)
                if sum_sqr_effects > herit_dict['gw_h2_ld_score_est']:
                    print(
                        'Sum of squared updated effects estimates seems too large: %0.4f'
                        % sum_sqr_effects)
                    print(
                        'This suggests that the Gibbs sampler did not convergence.'
                    )
                    # True means "convergence issue observed".
                    convergence_report[p] = True

                if verbose:
                    print('Calculating SNP weights for Chromosome %s' %
                          ((chrom_str.split('_'))[1]))
                else:
                    # Backspaces overwrite the previous percentage in place.
                    sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' %
                                     (100.0 *
                                      (min(1,
                                           float(chrom_i + 1) / num_chrom))))
                    sys.stdout.flush()

                updated_betas = updated_betas / snp_stds
                updated_inf_betas = updated_inf_betas / snp_stds
                if out_file_prefix:
                    ldpred_effect_sizes.extend(updated_betas)
                    ldpred_inf_effect_sizes.extend(updated_inf_betas)
                if assess_accuracy:
                    prs = sp.dot(updated_betas, raw_snps)
                    risk_scores_pval_derived += prs
                    corr = sp.corrcoef(y, prs)[0, 1]
                    r2 = corr**2
                    print(
                        'The R2 prediction accuracy of PRS using %s was: %0.4f'
                        % (chrom_str, r2))

            if assess_accuracy:
                num_indivs = len(y)
                results_dict[p_str]['y'] = y
                results_dict[p_str]['risk_scores_pd'] = risk_scores_pval_derived
                print('Prediction accuracy was assessed using %d individuals.' %
                      (num_indivs))

                corr = sp.corrcoef(y, risk_scores_pval_derived)[0, 1]
                r2 = corr**2
                results_dict[p_str]['r2_pd'] = r2
                print(
                    'The  R2 prediction accuracy (observed scale) for the whole genome was: %0.4f (%0.6f)'
                    % (r2, ((1 - r2)**2) / num_indivs))

                if corr < 0:
                    risk_scores_pval_derived = -1 * risk_scores_pval_derived
                auc = util.calc_auc(y, risk_scores_pval_derived)
                print('AUC for the whole genome was: %0.4f' % auc)

                # Now calibration
                denominator = sp.dot(risk_scores_pval_derived.T,
                                     risk_scores_pval_derived)
                y_norm = (y - sp.mean(y)) / sp.std(y)
                numerator = sp.dot(risk_scores_pval_derived.T, y_norm)
                regression_slope = (numerator / denominator)
                print(
                    'The slope for predictions with P-value derived  effects is: %0.4f'
                    % regression_slope)
                results_dict[p_str]['slope_pd'] = regression_slope

            if out_file_prefix:
                weights_out_file = '%s_LDpred_p%0.4e.txt' % (out_file_prefix, p)
                with open(weights_out_file, 'w') as f:
                    f.write(
                        'chrom    pos    sid    nt1    nt2    raw_beta     ldpred_beta\n'
                    )
                    for chrom, pos, sid, nt, raw_beta, ldpred_beta in zip(
                            chromosomes, out_positions, out_sids, out_nts,
                            raw_effect_sizes, ldpred_effect_sizes):
                        nt1, nt2 = nt[0], nt[1]
                        f.write(
                            '%s    %d    %s    %s    %s    %0.4e    %0.4e\n' %
                            (chrom, pos, sid, nt1, nt2, raw_beta, ldpred_beta))

    # The LDpred-inf weights come from the buffers of the last fraction.
    if out_file_prefix:
        weights_out_file = '%s_LDpred-inf.txt' % (out_file_prefix)
        with open(weights_out_file, 'w') as f:
            f.write(
                'chrom    pos    sid    nt1    nt2    raw_beta    ldpred_inf_beta \n'
            )
            for chrom, pos, sid, nt, raw_beta, ldpred_inf_beta in zip(
                    chromosomes, out_positions, out_sids, out_nts,
                    raw_effect_sizes, ldpred_inf_effect_sizes):
                nt1, nt2 = nt[0], nt[1]
                f.write('%s    %d    %s    %s    %s    %0.4e    %0.4e\n' %
                        (chrom, pos, sid, nt1, nt2, raw_beta, ldpred_inf_beta))

    # ``summary_dict`` defaults to None, so only record when one was given.
    if summary_dict is not None:
        summary_dict[2.0] = {
            'name': 'Gibbs sampler fractions used',
            'value': str(ps)
        }
        summary_dict[2.1] = {
            'name': 'Convergence issues (for each fraction)',
            'value': str(['Yes' if convergence_report[p] else 'No' for p in ps])
        }
Ejemplo n.º 4
0
def ldpred_inf_genomewide(data_file=None, ld_radius = None, ld_dict=None, out_file_prefix=None,
                          n=None, h2=None, verbose=False):
    """
    Calculate LDpred-inf SNP weights for a genome.

    Args:
        data_file: Path of the HDF5 file, opened read-only with ``h5py.File``.
            It must contain a ``'cord_data'`` group whose per-chromosome
            groups provide ``'snp_stds_ref'`` and ``'betas'`` (plus
            ``'positions'``, ``'sids'``, ``'log_odds'`` and ``'nts'`` when
            output is written). An optional top-level ``'y'`` dataset holds
            validation phenotypes.
        ld_radius: LD radius; twice this value is the LDpred-inf window size.
        ld_dict: Mapping with keys ``'ld_scores_dict'`` and
            ``'chrom_ref_ld_mats'``.
        out_file_prefix: Prefix of the weights file ``<prefix>.txt``. When
            falsy no file is written.
        n: Forwarded as ``n`` to the heritability and LDpred-inf steps
            (presumably the GWAS sample size -- confirm with callers).
        h2: Forwarded as ``h2`` to ``ld.get_chromosome_herits``.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        None. Weights are written to ``<prefix>.txt`` and accuracy figures
        are printed when validation phenotypes are available.
    """
    # Output accumulators always exist, so running without an output prefix
    # no longer raises NameError.
    raw_effect_sizes = []
    ldpred_effect_sizes = []
    sids = []
    chromosomes = []
    positions = []
    nts = []

    # The context manager closes the HDF5 file on every exit path.
    with h5py.File(data_file, 'r') as df:
        has_phenotypes = 'y' in df
        if has_phenotypes:
            # Validation phenotypes found.
            y = df['y'][...]  # Phenotype
            num_individs = len(y)
            risk_scores_pval_derived = sp.zeros(num_individs)

        ld_scores_dict = ld_dict['ld_scores_dict']
        chrom_ref_ld_mats = ld_dict['chrom_ref_ld_mats']

        print('Applying LDpred-inf with LD radius: %d' % ld_radius)
        results_dict = {}
        cord_data_g = df['cord_data']

        # Calculating genome-wide heritability using LD score regression,
        # and partitioning heritability by chromosome.
        herit_dict = ld.get_chromosome_herits(cord_data_g, ld_scores_dict, n, h2=h2)

        for chrom_str in util.chromosomes_list:
            if chrom_str not in cord_data_g:
                continue
            g = cord_data_g[chrom_str]
            if has_phenotypes:
                if 'raw_snps_val' in g:
                    raw_snps = g['raw_snps_val'][...]
                else:
                    raw_snps = g['raw_snps_ref'][...]

            # NOTE(review): SNPs with zero standard deviation are not
            # filtered here, so the division below can yield inf/nan for
            # them -- confirm whether monomorphic SNPs can reach this point.
            snp_stds = g['snp_stds_ref'][...]
            pval_derived_betas = g['betas'][...]
            if out_file_prefix:
                chromosomes.extend([chrom_str] * len(pval_derived_betas))
                positions.extend(g['positions'][...])
                sids.extend((g['sids'][...]).astype(util.sids_u_dtype))
                raw_effect_sizes.extend(g['log_odds'][...])
                nts.extend((g['nts'][...]).astype(util.nts_u_dtype))

            h2_chrom = herit_dict[chrom_str]
            updated_betas = ldpred_inf(pval_derived_betas, genotypes=None,
                                       reference_ld_mats=chrom_ref_ld_mats[chrom_str],
                                       h2=h2_chrom, n=n, ld_window_size=2 * ld_radius,
                                       verbose=False)

            print('Calculating scores for Chromosome %s'%((chrom_str.split('_'))[1]))
            updated_betas = updated_betas / (snp_stds.flatten())
            if out_file_prefix:
                ldpred_effect_sizes.extend(updated_betas)
            if has_phenotypes:
                prs = sp.dot(updated_betas, raw_snps)
                risk_scores_pval_derived += prs
                corr = sp.corrcoef(y, prs)[0, 1]
                r2 = corr ** 2
                print('The R2 prediction accuracy of PRS using %s was: %0.4f' %(chrom_str, r2))

    if has_phenotypes:
        num_indivs = len(y)
        results_dict['y'] = y
        results_dict['risk_scores_pd'] = risk_scores_pval_derived
        print('Prediction accuracy was assessed using %d individuals.'%(num_indivs))

        corr = sp.corrcoef(y, risk_scores_pval_derived)[0, 1]
        r2 = corr ** 2
        results_dict['r2_pd'] = r2
        print('The  R2 prediction accuracy (observed scale) for the whole genome was: %0.4f (%0.6f)' % (r2, ((1-r2)**2)/num_indivs))

        if corr < 0:
            risk_scores_pval_derived = -1 * risk_scores_pval_derived
        auc = util.calc_auc(y, risk_scores_pval_derived)
        print('AUC for the whole genome was: %0.4f'%auc)

        # Now calibration
        denominator = sp.dot(risk_scores_pval_derived.T, risk_scores_pval_derived)
        y_norm = (y - sp.mean(y)) / sp.std(y)
        numerator = sp.dot(risk_scores_pval_derived.T, y_norm)
        regression_slope = (numerator / denominator)
        print('The slope for predictions with P-value derived  effects is: %0.4f'%regression_slope)
        results_dict['slope_pd'] = regression_slope

    # Only write weights when an output prefix was supplied.
    if out_file_prefix:
        weights_out_file = '%s.txt'%(out_file_prefix)
        with open(weights_out_file, 'w') as f:
            f.write('chrom    pos    sid    nt1    nt2    raw_beta    ldpred_inf_beta\n')
            for chrom, pos, sid, nt, raw_beta, ldpred_beta in zip(chromosomes, positions, sids, nts, raw_effect_sizes, ldpred_effect_sizes):
                nt1, nt2 = nt[0], nt[1]
                f.write('%s    %d    %s    %s    %s    %0.4e    %0.4e\n'%(chrom, pos, sid, nt1, nt2, raw_beta, ldpred_beta))
Ejemplo n.º 5
0
def ldpred_genomewide(data_file=None, ld_radius=None, ld_dict=None, out_file_prefix=None, 
                      summary_dict=None, ps=None,
                      n=None, h2=None, num_iter=None, 
                      verbose=False, zero_jump_prob=0.05, burn_in=5):
    """
    Calculate LDpred SNP weights for a genome, once per fraction in ``ps``.

    LDpred-inf weights are computed per chromosome first and used as the
    starting betas of the Gibbs sampler (``ldpred_gibbs``), which is then run
    for every fraction in ``ps``.

    Args:
        data_file: Path of the HDF5 file, opened read-only with ``h5py.File``.
            It must contain a ``'cord_data'`` group whose per-chromosome
            groups provide ``'snp_stds_ref'``, ``'betas'``, ``'positions'``,
            ``'sids'``, ``'log_odds'`` and ``'nts'``. An optional top-level
            ``'y'`` dataset holds validation phenotypes.
        ld_radius: LD radius, forwarded to ``ldpred_gibbs``; twice this value
            is used as the LDpred-inf window size.
        ld_dict: Mapping with keys ``'ld_scores_dict'``, ``'chrom_ld_dict'``
            and ``'chrom_ref_ld_mats'``; ``'chrom_ld_boundaries'`` is used
            when present.
        out_file_prefix: Prefix of the weight files. When falsy no file is
            written.
        summary_dict: Optional dict; it is passed to the heritability
            estimation and receives entries under keys ``2.0`` and ``2.1``.
        ps: Iterable of fractions for which the Gibbs sampler is run.
        n: Forwarded as ``n`` to the heritability, LDpred-inf and Gibbs
            steps (presumably the GWAS sample size -- confirm with callers).
        h2: Forwarded as ``h2`` to ``ld.get_chromosome_herits``.
        num_iter: Forwarded to ``ldpred_gibbs``.
        verbose: Enables per-chromosome messages and, when phenotypes are
            present, prediction-accuracy reporting; otherwise a progress
            percentage is written to stdout.
        zero_jump_prob: Forwarded to ``ldpred_gibbs``.
        burn_in: Forwarded to ``ldpred_gibbs``.

    Returns:
        None. Results are written to ``<prefix>_LDpred_p<p>.txt`` (one per
        fraction) and ``<prefix>_LDpred-inf.txt`` and printed to stdout.
    """
    # Output accumulators always exist, so a missing output prefix or an
    # empty ``ps`` can no longer trigger a NameError further down.
    raw_effect_sizes = []
    ldpred_effect_sizes = []
    ldpred_inf_effect_sizes = []
    out_sids = []
    chromosomes = []
    out_positions = []
    out_nts = []
    convergence_report = {}

    # The context manager closes the HDF5 file on every exit path.
    with h5py.File(data_file, 'r') as df:
        has_phenotypes = 'y' in df
        if has_phenotypes:
            # Validation phenotypes found.
            y = df['y'][...]  # Phenotype
            num_individs = len(y)
        # Accuracy is only assessed in verbose mode with phenotypes.
        assess_accuracy = verbose and has_phenotypes

        ld_scores_dict = ld_dict['ld_scores_dict']
        chrom_ld_dict = ld_dict['chrom_ld_dict']
        chrom_ref_ld_mats = ld_dict['chrom_ref_ld_mats']

        print('Applying LDpred with LD radius: %d' % ld_radius)
        results_dict = {}
        cord_data_g = df['cord_data']

        # Calculating genome-wide heritability using LD score regression,
        # and partitioning heritability by chromosome.
        herit_dict = ld.get_chromosome_herits(cord_data_g, ld_scores_dict, n, h2=h2,
                                              debug=verbose, summary_dict=summary_dict)

        LDpred_inf_chrom_dict = {}
        print('Calculating LDpred-inf weights')
        for chrom_str in util.chromosomes_list:
            if chrom_str not in cord_data_g:
                continue
            print('Calculating SNP weights for Chromosome %s' % ((chrom_str.split('_'))[1]))
            g = cord_data_g[chrom_str]

            # Filter monomorphic SNPs
            snp_stds = g['snp_stds_ref'][...].flatten()
            ok_snps_filter = snp_stds > 0
            pval_derived_betas = g['betas'][...][ok_snps_filter]
            LDpred_inf_chrom_dict[chrom_str] = LDpred_inf.ldpred_inf(
                pval_derived_betas, genotypes=None,
                reference_ld_mats=chrom_ref_ld_mats[chrom_str],
                h2=herit_dict[chrom_str], n=n,
                ld_window_size=2 * ld_radius, verbose=False)

        num_chrom = len(util.chromosomes_list)
        for p in ps:
            convergence_report[p] = False
            print('Starting LDpred gibbs with f=%0.4f' % p)
            p_str = '%0.4f' % p
            results_dict[p_str] = {}

            # Fresh output buffers for this fraction.
            raw_effect_sizes = []
            ldpred_effect_sizes = []
            ldpred_inf_effect_sizes = []
            out_sids = []
            chromosomes = []
            out_positions = []
            out_nts = []

            if has_phenotypes:
                # Reset per fraction; otherwise the scores of earlier
                # fractions leak into the accuracy of later ones.
                risk_scores_pval_derived = sp.zeros(num_individs)

            for chrom_i, chrom_str in enumerate(util.chromosomes_list, start=1):
                if chrom_str not in cord_data_g:
                    continue
                g = cord_data_g[chrom_str]
                if assess_accuracy:
                    if 'raw_snps_val' in g:
                        raw_snps = g['raw_snps_val'][...]
                    else:
                        raw_snps = g['raw_snps_ref'][...]

                # Filter monomorphic SNPs
                snp_stds = g['snp_stds_ref'][...].flatten()
                pval_derived_betas = g['betas'][...]
                positions = g['positions'][...]
                sids = (g['sids'][...]).astype(util.sids_u_dtype)
                log_odds = g['log_odds'][...]
                nts = (g['nts'][...]).astype(util.nts_u_dtype)
                ok_snps_filter = snp_stds > 0
                if not sp.all(ok_snps_filter):
                    snp_stds = snp_stds[ok_snps_filter]
                    pval_derived_betas = pval_derived_betas[ok_snps_filter]
                    positions = positions[ok_snps_filter]
                    sids = sids[ok_snps_filter]
                    log_odds = log_odds[ok_snps_filter]
                    nts = nts[ok_snps_filter]
                    if assess_accuracy:
                        raw_snps = raw_snps[ok_snps_filter]

                if out_file_prefix:
                    chromosomes.extend([chrom_str] * len(pval_derived_betas))
                    out_positions.extend(positions)
                    out_sids.extend(sids)
                    raw_effect_sizes.extend(log_odds)
                    out_nts.extend(nts)

                # One call site; LD boundaries are only passed when the LD
                # dictionary provides them.
                gibbs_kwargs = dict(h2=herit_dict[chrom_str], n=n, p=p, ld_radius=ld_radius,
                                    verbose=verbose, num_iter=num_iter, burn_in=burn_in,
                                    ld_dict=chrom_ld_dict[chrom_str],
                                    start_betas=LDpred_inf_chrom_dict[chrom_str],
                                    zero_jump_prob=zero_jump_prob,
                                    print_progress=False)
                if 'chrom_ld_boundaries' in ld_dict:
                    gibbs_kwargs['ld_boundaries'] = ld_dict['chrom_ld_boundaries'][chrom_str]
                res_dict = ldpred_gibbs(pval_derived_betas, **gibbs_kwargs)

                updated_betas = res_dict['betas']
                updated_inf_betas = res_dict['inf_betas']
                sum_sqr_effects = sp.sum(updated_betas ** 2)
                if sum_sqr_effects > herit_dict['gw_h2_ld_score_est']:
                    print('Sum of squared updated effects estimates seems too large: %0.4f'% sum_sqr_effects)
                    print('This suggests that the Gibbs sampler did not convergence.')
                    # True means "convergence issue observed".
                    convergence_report[p] = True

                if verbose:
                    print('Calculating SNP weights for Chromosome %s' % ((chrom_str.split('_'))[1]))
                else:
                    # Backspaces overwrite the previous percentage in place.
                    sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (min(1, float(chrom_i + 1) / num_chrom))))
                    sys.stdout.flush()

                updated_betas = updated_betas / snp_stds
                updated_inf_betas = updated_inf_betas / snp_stds
                if out_file_prefix:
                    ldpred_effect_sizes.extend(updated_betas)
                    ldpred_inf_effect_sizes.extend(updated_inf_betas)
                if assess_accuracy:
                    prs = sp.dot(updated_betas, raw_snps)
                    risk_scores_pval_derived += prs
                    corr = sp.corrcoef(y, prs)[0, 1]
                    r2 = corr ** 2
                    print('The R2 prediction accuracy of PRS using %s was: %0.4f' % (chrom_str, r2))

            if not verbose:
                # Finish the in-place progress line for this fraction.
                sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0))
                sys.stdout.flush()
            if assess_accuracy:
                num_indivs = len(y)
                results_dict[p_str]['y'] = y
                results_dict[p_str]['risk_scores_pd'] = risk_scores_pval_derived
                print('Prediction accuracy was assessed using %d individuals.' % (num_indivs))

                corr = sp.corrcoef(y, risk_scores_pval_derived)[0, 1]
                r2 = corr ** 2
                results_dict[p_str]['r2_pd'] = r2
                print('The  R2 prediction accuracy (observed scale) for the whole genome was: %0.4f (%0.6f)' % (r2, ((1 - r2) ** 2) / num_indivs))

                if corr < 0:
                    risk_scores_pval_derived = -1 * risk_scores_pval_derived
                auc = util.calc_auc(y, risk_scores_pval_derived)
                print('AUC for the whole genome was: %0.4f' % auc)

                # Now calibration
                denominator = sp.dot(risk_scores_pval_derived.T, risk_scores_pval_derived)
                y_norm = (y - sp.mean(y)) / sp.std(y)
                numerator = sp.dot(risk_scores_pval_derived.T, y_norm)
                regression_slope = (numerator / denominator)
                print('The slope for predictions with P-value derived  effects is: %0.4f' % regression_slope)
                results_dict[p_str]['slope_pd'] = regression_slope

            if out_file_prefix:
                weights_out_file = '%s_LDpred_p%0.4e.txt' % (out_file_prefix, p)
                with open(weights_out_file, 'w') as f:
                    f.write('chrom    pos    sid    nt1    nt2    raw_beta     ldpred_beta\n')
                    for chrom, pos, sid, nt, raw_beta, ldpred_beta in zip(chromosomes, out_positions, out_sids, out_nts, raw_effect_sizes, ldpred_effect_sizes):
                        nt1, nt2 = nt[0], nt[1]
                        f.write('%s    %d    %s    %s    %s    %0.4e    %0.4e\n' % (chrom, pos, sid, nt1, nt2, raw_beta, ldpred_beta))

    # The LDpred-inf weights come from the buffers of the last fraction.
    if out_file_prefix:
        weights_out_file = '%s_LDpred-inf.txt' % (out_file_prefix)
        with open(weights_out_file, 'w') as f:
            f.write('chrom    pos    sid    nt1    nt2    raw_beta    ldpred_inf_beta \n')
            for chrom, pos, sid, nt, raw_beta, ldpred_inf_beta in zip(chromosomes, out_positions, out_sids, out_nts, raw_effect_sizes, ldpred_inf_effect_sizes):
                nt1, nt2 = nt[0], nt[1]
                f.write('%s    %d    %s    %s    %s    %0.4e    %0.4e\n' % (chrom, pos, sid, nt1, nt2, raw_beta, ldpred_inf_beta))

    # ``summary_dict`` defaults to None, so only record when one was given.
    if summary_dict is not None:
        summary_dict[2.0] = {'name': 'Gibbs sampler fractions used', 'value': str(ps)}
        summary_dict[2.1] = {'name': 'Convergence issues (for each fraction)',
                             'value': str(['Yes' if convergence_report[p] else 'No' for p in ps])}
Ejemplo n.º 6
0
def ldpred_inf_genomewide(data_file=None, ld_radius = None, ld_dict=None, out_file_prefix=None,
                          n=None, h2=None, verbose=False):
    """
    Calculate LDpred-inf SNP weights for a genome.

    Args:
        data_file: Path of the HDF5 file, opened read-only with ``h5py.File``.
            It must contain a ``'cord_data'`` group whose per-chromosome
            groups provide ``'snp_stds_ref'`` and ``'betas'`` (plus
            ``'positions'``, ``'sids'``, ``'log_odds'`` and ``'nts'`` when
            output is written). An optional top-level ``'y'`` dataset holds
            validation phenotypes.
        ld_radius: LD radius; twice this value is the LDpred-inf window size.
        ld_dict: Mapping with keys ``'ld_scores_dict'`` and
            ``'chrom_ref_ld_mats'``.
        out_file_prefix: Prefix of the weights file ``<prefix>.txt``. When
            falsy no file is written.
        n: Forwarded as ``n`` to the heritability and LDpred-inf steps
            (presumably the GWAS sample size -- confirm with callers).
        h2: Forwarded as ``h2`` to ``ld.get_chromosome_herits``.
        verbose: Accepted for interface compatibility; not used here.

    Returns:
        None. Weights are written to ``<prefix>.txt`` and accuracy figures
        are printed when validation phenotypes are available.
    """
    # Output accumulators always exist, so running without an output prefix
    # no longer raises NameError.
    raw_effect_sizes = []
    ldpred_effect_sizes = []
    sids = []
    chromosomes = []
    positions = []
    nts = []

    # The context manager closes the HDF5 file on every exit path.
    with h5py.File(data_file, 'r') as df:
        has_phenotypes = 'y' in df
        if has_phenotypes:
            # Validation phenotypes found.
            y = df['y'][...]  # Phenotype
            num_individs = len(y)
            risk_scores_pval_derived = sp.zeros(num_individs)

        ld_scores_dict = ld_dict['ld_scores_dict']
        chrom_ref_ld_mats = ld_dict['chrom_ref_ld_mats']

        print('Applying LDpred-inf with LD radius: %d' % ld_radius)
        results_dict = {}
        cord_data_g = df['cord_data']

        # Calculating genome-wide heritability using LD score regression,
        # and partitioning heritability by chromosome.
        herit_dict = ld.get_chromosome_herits(cord_data_g, ld_scores_dict, n, h2=h2)

        for chrom_str in util.chromosomes_list:
            if chrom_str not in cord_data_g:
                continue
            g = cord_data_g[chrom_str]
            if has_phenotypes:
                if 'raw_snps_val' in g:
                    raw_snps = g['raw_snps_val'][...]
                else:
                    raw_snps = g['raw_snps_ref'][...]

            # NOTE(review): SNPs with zero standard deviation are not
            # filtered here, so the division below can yield inf/nan for
            # them -- confirm whether monomorphic SNPs can reach this point.
            snp_stds = g['snp_stds_ref'][...]
            pval_derived_betas = g['betas'][...]
            if out_file_prefix:
                chromosomes.extend([chrom_str] * len(pval_derived_betas))
                positions.extend(g['positions'][...])
                sids.extend((g['sids'][...]).astype(util.sids_u_dtype))
                raw_effect_sizes.extend(g['log_odds'][...])
                nts.extend((g['nts'][...]).astype(util.nts_u_dtype))

            h2_chrom = herit_dict[chrom_str]
            updated_betas = ldpred_inf(pval_derived_betas, genotypes=None,
                                       reference_ld_mats=chrom_ref_ld_mats[chrom_str],
                                       h2=h2_chrom, n=n, ld_window_size=2 * ld_radius,
                                       verbose=False)

            print('Calculating scores for Chromosome %s'%((chrom_str.split('_'))[1]))
            updated_betas = updated_betas / (snp_stds.flatten())
            if out_file_prefix:
                ldpred_effect_sizes.extend(updated_betas)
            if has_phenotypes:
                prs = sp.dot(updated_betas, raw_snps)
                risk_scores_pval_derived += prs
                corr = sp.corrcoef(y, prs)[0, 1]
                r2 = corr ** 2
                print('The R2 prediction accuracy of PRS using %s was: %0.4f' %(chrom_str, r2))

    if has_phenotypes:
        num_indivs = len(y)
        results_dict['y'] = y
        results_dict['risk_scores_pd'] = risk_scores_pval_derived
        print('Prediction accuracy was assessed using %d individuals.'%(num_indivs))

        corr = sp.corrcoef(y, risk_scores_pval_derived)[0, 1]
        r2 = corr ** 2
        results_dict['r2_pd'] = r2
        print('The  R2 prediction accuracy (observed scale) for the whole genome was: %0.4f (%0.6f)' % (r2, ((1-r2)**2)/num_indivs))

        if corr < 0:
            risk_scores_pval_derived = -1 * risk_scores_pval_derived
        auc = util.calc_auc(y, risk_scores_pval_derived)
        print('AUC for the whole genome was: %0.4f'%auc)

        # Now calibration
        denominator = sp.dot(risk_scores_pval_derived.T, risk_scores_pval_derived)
        y_norm = (y - sp.mean(y)) / sp.std(y)
        numerator = sp.dot(risk_scores_pval_derived.T, y_norm)
        regression_slope = (numerator / denominator)
        print('The slope for predictions with P-value derived  effects is: %0.4f'%regression_slope)
        results_dict['slope_pd'] = regression_slope

    # Only write weights when an output prefix was supplied.
    if out_file_prefix:
        weights_out_file = '%s.txt'%(out_file_prefix)
        with open(weights_out_file, 'w') as f:
            f.write('chrom    pos    sid    nt1    nt2    raw_beta    ldpred_inf_beta\n')
            for chrom, pos, sid, nt, raw_beta, ldpred_beta in zip(chromosomes, positions, sids, nts, raw_effect_sizes, ldpred_effect_sizes):
                nt1, nt2 = nt[0], nt[1]
                f.write('%s    %d    %s    %s    %s    %0.4e    %0.4e\n'%(chrom, pos, sid, nt1, nt2, raw_beta, ldpred_beta))