Example #1
def main():
    #sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    p_dict = parse_parameters()

    #Use the same LD file as LDpred
    local_ld_dict_file = '%s_ldradius%d.pickled.gz'%(p_dict['ld_prefix'], p_dict['ld_radius'])
    
    print """
Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set.
If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs. 
"""
    if not os.path.isfile(local_ld_dict_file):
        df = h5py.File(p_dict['coord'])
                 
        chrom_ld_scores_dict = {}
        chrom_ld_dict = {}
        chrom_ref_ld_mats = {}
        ld_score_sum = 0
        num_snps = 0
        print 'Calculating LD information w. radius %d'% p_dict['ld_radius']

        cord_data_g = df['cord_data']

        for chrom_str in cord_data_g.keys():
            print 'Working on %s'%chrom_str
            g = cord_data_g[chrom_str]
            if 'raw_snps_ref' in g.keys():
                raw_snps = g['raw_snps_ref'][...]
                snp_stds = g['snp_stds_ref'][...]
                snp_means = g['snp_means_ref'][...]
            
            n_snps = len(raw_snps)
            snp_means.shape = (n_snps,1)   
            snp_stds.shape = (n_snps,1)   
            
            # Normalize SNPs..
            snps = sp.array((raw_snps - snp_means)/snp_stds,dtype='float32')
            ret_dict = ld.get_LDpred_ld_tables(snps, ld_radius=p_dict['ld_radius'], ld_window_size=2*p_dict['ld_radius'])
            chrom_ld_dict[chrom_str] = ret_dict['ld_dict']
            chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices']
            ld_scores = ret_dict['ld_scores']
            chrom_ld_scores_dict[chrom_str] = {'ld_scores':ld_scores, 'avg_ld_score':sp.mean(ld_scores)}
            ld_score_sum += sp.sum(ld_scores)
            num_snps += n_snps
        avg_gw_ld_score = ld_score_sum / float(num_snps)
        ld_scores_dict = {'avg_gw_ld_score': avg_gw_ld_score, 'chrom_dict':chrom_ld_scores_dict}    
        
        print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file
        print 'Genome-wide average LD score was:', ld_scores_dict['avg_gw_ld_score']
        ld_dict = {'ld_scores_dict':ld_scores_dict, 'chrom_ld_dict':chrom_ld_dict, 'chrom_ref_ld_mats':chrom_ref_ld_mats}
        with gzip.open(local_ld_dict_file, 'wb') as f:
            cPickle.dump(ld_dict, f, protocol=2)
        print 'LD information is now pickled.'
    else:
        print 'Loading LD information from file: %s'%local_ld_dict_file
        with gzip.open(local_ld_dict_file, 'r') as f:
            ld_dict = cPickle.load(f)
    
    ldpred_inf_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ld_radius=p_dict['ld_radius'], 
                          ld_dict = ld_dict, n=p_dict['N'], h2=p_dict['H2'], verbose=False)
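All of these snippets are Python 2 and share the same preprocessing step: per-SNP normalization of the reference genotype matrix. Below is a short, self-contained sketch of that step on toy data (made-up values), including the monomorphic-SNP filter that Example #3 and the later variants apply before dividing by the standard deviation; like the snippets, it assumes a scipy version that still re-exports the NumPy array routines (sp.array, sp.mean, sp.std, sp.sum).

# Toy reference genotype matrix: 4 SNPs x 6 individuals (0/1/2 allele counts).
import scipy as sp

raw_snps = sp.array([[0, 1, 2, 1, 0, 2],
                     [1, 1, 1, 1, 1, 1],   # monomorphic SNP: std == 0
                     [2, 2, 1, 0, 0, 1],
                     [0, 0, 1, 2, 2, 1]], dtype='float32')

snp_means = sp.mean(raw_snps, axis=1)
snp_stds = sp.std(raw_snps, axis=1)

# Filter monomorphic SNPs before dividing, as Example #3 and later variants do.
ok_snps_filter = (snp_stds > 0).flatten()
raw_snps = raw_snps[ok_snps_filter]
snp_means = snp_means[ok_snps_filter]
snp_stds = snp_stds[ok_snps_filter]

n_snps = len(raw_snps)
snp_means.shape = (n_snps, 1)
snp_stds.shape = (n_snps, 1)

# Normalize: every remaining SNP row ends up with mean ~0 and std ~1.
snps = sp.array((raw_snps - snp_means) / snp_stds, dtype='float32')
print snps.shape
print sp.mean(snps, axis=1)
print sp.std(snps, axis=1)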
Example #3
def main():
    p_dict = parse_parameters()
    local_ld_dict_file = '%s_ldradius%d.pickled.gz' % (
        p_dict['local_ld_file_prefix'], p_dict['ld_radius'])

    print """
Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set.
If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs. 
"""
    if not os.path.isfile(local_ld_dict_file):
        df = h5py.File(p_dict['coord'])

        chrom_ld_scores_dict = {}
        chrom_ld_dict = {}
        chrom_ref_ld_mats = {}
        if p_dict['gm_ld_radius'] is not None:
            chrom_ld_boundaries = {}
        ld_score_sum = 0
        num_snps = 0
        print 'Calculating LD information w. radius %d' % p_dict['ld_radius']

        cord_data_g = df['cord_data']

        for chrom_str in cord_data_g.keys():
            print 'Working on %s' % chrom_str
            g = cord_data_g[chrom_str]
            if 'raw_snps_ref' in g.keys():
                raw_snps = g['raw_snps_ref'][...]
                snp_stds = g['snp_stds_ref'][...]
                snp_means = g['snp_means_ref'][...]

            #Filter monomorphic SNPs
            ok_snps_filter = snp_stds > 0
            ok_snps_filter = ok_snps_filter.flatten()
            raw_snps = raw_snps[ok_snps_filter]
            snp_means = snp_means[ok_snps_filter]
            snp_stds = snp_stds[ok_snps_filter]

            n_snps = len(raw_snps)
            snp_means.shape = (n_snps, 1)
            snp_stds.shape = (n_snps, 1)

            # Normalize SNPs..
            snps = sp.array((raw_snps - snp_means) / snp_stds, dtype='float32')
            assert snps.shape == raw_snps.shape, 'Array Shape mismatch'
            if p_dict['gm_ld_radius'] is not None:
                assert 'genetic_map' in g.keys(), 'Genetic map is missing.'
                gm = g['genetic_map'][...]
                ret_dict = ld.get_LDpred_ld_tables(
                    snps, gm=gm, gm_ld_radius=p_dict['gm_ld_radius'])
                chrom_ld_boundaries[chrom_str] = ret_dict['ld_boundaries']
            else:
                ret_dict = ld.get_LDpred_ld_tables(
                    snps,
                    ld_radius=p_dict['ld_radius'],
                    ld_window_size=2 * p_dict['ld_radius'])
            chrom_ld_dict[chrom_str] = ret_dict['ld_dict']
            chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices']
            ld_scores = ret_dict['ld_scores']
            chrom_ld_scores_dict[chrom_str] = {
                'ld_scores': ld_scores,
                'avg_ld_score': sp.mean(ld_scores)
            }
            ld_score_sum += sp.sum(ld_scores)
            num_snps += n_snps
        avg_gw_ld_score = ld_score_sum / float(num_snps)
        ld_scores_dict = {
            'avg_gw_ld_score': avg_gw_ld_score,
            'chrom_dict': chrom_ld_scores_dict
        }

        print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file
        print 'Genome-wide average LD score was:', ld_scores_dict[
            'avg_gw_ld_score']
        ld_dict = {
            'ld_scores_dict': ld_scores_dict,
            'chrom_ld_dict': chrom_ld_dict,
            'chrom_ref_ld_mats': chrom_ref_ld_mats
        }
        if p_dict['gm_ld_radius'] is not None:
            ld_dict['chrom_ld_boundaries'] = chrom_ld_boundaries
        f = gzip.open(local_ld_dict_file, 'wb')
        cPickle.dump(ld_dict, f, protocol=2)
        f.close()
        print 'LD information is now pickled.'
    else:
        print 'Loading LD information from file: %s' % local_ld_dict_file
        f = gzip.open(local_ld_dict_file, 'r')
        ld_dict = cPickle.load(f)
        f.close()
    ldpred_genomewide(data_file=p_dict['coord'],
                      out_file_prefix=p_dict['out'],
                      ps=p_dict['PS'],
                      ld_radius=p_dict['ld_radius'],
                      ld_dict=ld_dict,
                      n=p_dict['N'],
                      num_iter=p_dict['num_iter'],
                      h2=p_dict['H2'],
                      verbose=False)
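The cache written above is nothing more than a gzip-compressed cPickle of a nested dict. A minimal sketch of reopening such a file and inspecting its top-level structure (the file name below is a placeholder for whatever '<prefix>_ldradius<radius>.pickled.gz' the run produced):

import gzip
import cPickle

# Placeholder path; substitute the '<prefix>_ldradius<radius>.pickled.gz'
# file produced by one of the main() functions above.
local_ld_dict_file = 'my_ld_prefix_ldradius100.pickled.gz'

with gzip.open(local_ld_dict_file, 'rb') as f:
    ld_dict = cPickle.load(f)

# Keys written by the examples: 'ld_scores_dict', 'chrom_ld_dict',
# 'chrom_ref_ld_mats' and, only when a genetic map was used,
# 'chrom_ld_boundaries'.
print ld_dict.keys()
print 'Genome-wide average LD score:', ld_dict['ld_scores_dict']['avg_gw_ld_score']
for chrom_str, d in ld_dict['ld_scores_dict']['chrom_dict'].items():
    print chrom_str, 'average LD score:', d['avg_ld_score']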
Example #5
def main():
    p_dict = parse_parameters()
    # - start wallace
    # local_ld_dict_file = '%s_ldradius%d.pickled.gz'%(p_dict['local_ld_file_prefix'], p_dict['ld_radius'])
    local_ld_dict_file = p_dict['local_ld_file']
    # - end wallace

    print """
Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set.
If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs.
"""
    # wallace:
    # Generate the local_ld_file file.
    if not os.path.isfile(local_ld_dict_file):
        # - start wallace, should not run into this point in this file.
        print 'ERROR: cannot find LD file, please run "LDpred.getLocalLDFile.CHR.Wallace.V1.py" to generate it!'
        sys.exit(-1)
        # - end wallace
        df = h5py.File(p_dict['coord'])

        chrom_ld_scores_dict = {}
        chrom_ld_dict = {}
        chrom_ref_ld_mats = {}
        if p_dict['gm_ld_radius'] is not None:
            chrom_ld_boundaries = {}

        ld_score_sum = 0
        num_snps = 0
        print 'Calculating LD information w. radius %d' % p_dict['ld_radius']

        cord_data_g = df['cord_data']

        for chrom_str in cord_data_g.keys():
            print 'Working on %s' % chrom_str
            g = cord_data_g[chrom_str]
            if 'raw_snps_ref' in g.keys():
                raw_snps = g['raw_snps_ref'][...]
                snp_stds = g['snp_stds_ref'][...]
                snp_means = g['snp_means_ref'][...]

            #Filter monomorphic SNPs
            ok_snps_filter = snp_stds > 0
            ok_snps_filter = ok_snps_filter.flatten()
            raw_snps = raw_snps[ok_snps_filter]
            snp_means = snp_means[ok_snps_filter]
            snp_stds = snp_stds[ok_snps_filter]

            n_snps = len(raw_snps)
            snp_means.shape = (n_snps, 1)
            snp_stds.shape = (n_snps, 1)

            # Normalize SNPs..
            snps = sp.array((raw_snps - snp_means) / snp_stds, dtype='float32')
            assert snps.shape == raw_snps.shape, 'Array Shape mismatch'
            if p_dict['gm_ld_radius'] is not None:
                assert 'genetic_map' in g.keys(), 'Genetic map is missing.'
                gm = g['genetic_map'][...]
                ret_dict = ld.get_LDpred_ld_tables(
                    snps, gm=gm, gm_ld_radius=p_dict['gm_ld_radius'])
                chrom_ld_boundaries[chrom_str] = ret_dict['ld_boundaries']
            else:
                ret_dict = ld.get_LDpred_ld_tables(
                    snps,
                    ld_radius=p_dict['ld_radius'],
                    ld_window_size=2 * p_dict['ld_radius'])

            chrom_ld_dict[chrom_str] = ret_dict['ld_dict']
            chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices']
            ld_scores = ret_dict['ld_scores']
            chrom_ld_scores_dict[chrom_str] = {
                'ld_scores': ld_scores,
                'avg_ld_score': sp.mean(ld_scores)
            }
            ld_score_sum += sp.sum(ld_scores)
            num_snps += n_snps

            # - start Wallace ---
            # gather data for estimate heritability
            # ref ldpred_genomewide section:
            betas = g['betas'][...]
            n_betas = len(betas)
            # sum_beta2s += sp.sum(betas ** 2)

            #WRITE OUT CHROMOSOME LEVEL data.
            # Append so each chromosome keeps its line; opening with 'w' here
            # would overwrite the cache file on every loop iteration.
            with open(local_ld_dict_file + '_byFileCache' + '.txt', 'a') as f:
                f.write(
                    chrom_str +
                    ': ld_scores\t%f\tn_snps\t%d\ttotal_beta_square\t%f\tn_betas\t%d\n'
                    % (sp.sum(ld_scores), n_snps, sp.sum(betas**2), n_betas))

            # - end Wallace ---

        avg_gw_ld_score = ld_score_sum / float(num_snps)
        ld_scores_dict = {
            'avg_gw_ld_score': avg_gw_ld_score,
            'chrom_dict': chrom_ld_scores_dict
        }

        print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file
        print 'Genome-wide average LD score was:', ld_scores_dict[
            'avg_gw_ld_score']

        # This part is dumped to disk
        # Global values: ld_scores_dict
        # Chromosome wise values: chrom_ld_dict, chrom_ref_ld_mats.
        ld_dict = {
            'ld_scores_dict': ld_scores_dict,
            'chrom_ld_dict': chrom_ld_dict,
            'chrom_ref_ld_mats': chrom_ref_ld_mats
        }

        if p_dict['gm_ld_radius'] is not None:
            ld_dict['chrom_ld_boundaries'] = chrom_ld_boundaries

        f = gzip.open(local_ld_dict_file, 'wb')
        cPickle.dump(ld_dict, f, protocol=2)
        f.close()

        print 'LD information is now pickled.'
    else:
        print 'Loading LD information from file: %s' % local_ld_dict_file
        f = gzip.open(local_ld_dict_file, 'r')
        ld_dict = cPickle.load(f)
        f.close()

    # - start wallace
    # ldpred_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ps=p_dict['PS'], ld_radius=p_dict['ld_radius'],
    #                   ld_dict = ld_dict, n=p_dict['N'], num_iter=p_dict['num_iter'], h2=p_dict['H2'], verbose=False)

    ldpred_genomewide(data_file=p_dict['coord'],
                      out_file_prefix=p_dict['out'],
                      ps=p_dict['PS'],
                      ld_radius=p_dict['ld_radius'],
                      ld_dict=ld_dict,
                      n=p_dict['N'],
                      num_iter=p_dict['num_iter'],
                      h2=p_dict['H2'],
                      verbose=False,
                      local_ld_dict_file=local_ld_dict_file)
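The Wallace fork above also writes a small per-chromosome text cache next to the LD file (the '_byFileCache.txt' block inside the loop), apparently so the LD-score and beta-square sums can be reused for a heritability estimate. The sketch below writes and parses one line in that tab-separated format; the format string is copied from the code, while the file name and numbers are placeholders.

# Write one record in the Wallace per-chromosome cache format (placeholder values).
cache_file = 'local_ld_byFileCache.txt'
chrom_str, ld_score_sum, n_snps, beta2_sum, n_betas = 'chrom_1', 1234.5, 1000, 0.42, 1000

with open(cache_file, 'a') as f:  # append so every chromosome keeps its line
    f.write(chrom_str +
            ': ld_scores\t%f\tn_snps\t%d\ttotal_beta_square\t%f\tn_betas\t%d\n'
            % (ld_score_sum, n_snps, beta2_sum, n_betas))

# Read the cache back, e.g. to accumulate the sums genome-wide.
with open(cache_file) as f:
    for line in f:
        chrom, rest = line.split(':', 1)
        fields = rest.strip().split('\t')
        values = dict(zip(fields[0::2], [float(x) for x in fields[1::2]]))
        print chrom, values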
Example #6
def main():
    p_dict = parse_parameters()
    local_ld_dict_file = '%s_ldradius%d.pickled.gz'%(p_dict['local_ld_file_prefix'], p_dict['ld_radius'])
    
    print """
Note: For maximal accuracy all SNPs with LDpred weights should be included in the validation data set.
If they are a subset of the validation data set, then we suggest recalculate LDpred for the overlapping SNPs. 
"""
    if not os.path.isfile(local_ld_dict_file):
        df = h5py.File(p_dict['coord'])
                 
        chrom_ld_scores_dict = {}
        chrom_ld_dict = {}
        chrom_ref_ld_mats = {}
        if p_dict['gm_ld_radius'] is not None:
            chrom_ld_boundaries={}
        ld_score_sum = 0
        num_snps = 0
        print 'Calculating LD information w. radius %d'% p_dict['ld_radius']

        cord_data_g = df['cord_data']

        for chrom_str in cord_data_g.keys():
            print 'Working on %s'%chrom_str
            g = cord_data_g[chrom_str]
            if 'raw_snps_ref' in g.keys():
                raw_snps = g['raw_snps_ref'][...]
                snp_stds = g['snp_stds_ref'][...]
                snp_means = g['snp_means_ref'][...]
            
            
            #Filter monomorphic SNPs
            ok_snps_filter = snp_stds>0
            ok_snps_filter = ok_snps_filter.flatten()
            raw_snps = raw_snps[ok_snps_filter]
            snp_means = snp_means[ok_snps_filter]
            snp_stds = snp_stds[ok_snps_filter]

            n_snps = len(raw_snps)
            snp_means.shape = (n_snps,1)   
            snp_stds.shape = (n_snps,1)   
            
            
            # Normalize SNPs..
            snps = sp.array((raw_snps - snp_means)/snp_stds,dtype='float32')
            assert snps.shape==raw_snps.shape, 'Array Shape mismatch'
            if p_dict['gm_ld_radius'] is not None:
                assert 'genetic_map' in g.keys(), 'Genetic map is missing.'
                gm = g['genetic_map'][...]
                ret_dict = ld.get_LDpred_ld_tables(snps, gm=gm, gm_ld_radius=p_dict['gm_ld_radius'])
                chrom_ld_boundaries[chrom_str] = ret_dict['ld_boundaries']
            else:
                ret_dict = ld.get_LDpred_ld_tables(snps, ld_radius=p_dict['ld_radius'], ld_window_size=2*p_dict['ld_radius'])
            chrom_ld_dict[chrom_str] = ret_dict['ld_dict']
            chrom_ref_ld_mats[chrom_str] = ret_dict['ref_ld_matrices']
            ld_scores = ret_dict['ld_scores']
            chrom_ld_scores_dict[chrom_str] = {'ld_scores':ld_scores, 'avg_ld_score':sp.mean(ld_scores)}
            ld_score_sum += sp.sum(ld_scores)
            num_snps += n_snps
        avg_gw_ld_score = ld_score_sum / float(num_snps)
        ld_scores_dict = {'avg_gw_ld_score': avg_gw_ld_score, 'chrom_dict':chrom_ld_scores_dict}    
        
        print 'Done calculating the LD table and LD score, writing to file:', local_ld_dict_file
        print 'Genome-wide average LD score was:', ld_scores_dict['avg_gw_ld_score']
        ld_dict = {'ld_scores_dict':ld_scores_dict, 'chrom_ld_dict':chrom_ld_dict, 'chrom_ref_ld_mats':chrom_ref_ld_mats}
        if p_dict['gm_ld_radius'] is not None:
            ld_dict['chrom_ld_boundaries']=chrom_ld_boundaries 
        f = gzip.open(local_ld_dict_file, 'wb')
        cPickle.dump(ld_dict, f, protocol=2)
        f.close()
        print 'LD information is now pickled.'
    else:
        print 'Loading LD information from file: %s'%local_ld_dict_file
        f = gzip.open(local_ld_dict_file, 'r')
        ld_dict = cPickle.load(f)
        f.close()
    ldpred_genomewide(data_file=p_dict['coord'], out_file_prefix=p_dict['out'], ps=p_dict['PS'], ld_radius=p_dict['ld_radius'], 
                      ld_dict = ld_dict, n=p_dict['N'], num_iter=p_dict['num_iter'], h2=p_dict['H2'], verbose=False)
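None of the examples include parse_parameters() itself, but from the way p_dict is indexed it must return a plain dict. The hypothetical stand-in below lists the keys the snippets actually read; the values are illustrative placeholders (not recommended settings), and the comments merely restate how each key is used above. Note that the examples disagree on the LD-file key: Example #1 uses 'ld_prefix', Examples #3 and #6 use 'local_ld_file_prefix', and the Wallace fork (Example #5) expects a ready-made 'local_ld_file'.

# Hypothetical stand-in for the dict returned by parse_parameters().
p_dict = {
    'coord': 'coord_data.hdf5',        # HDF5 file containing the 'cord_data' group
    'local_ld_file_prefix': 'my_ld',   # prefix of the pickled LD cache
    'ld_radius': 100,                  # SNP radius passed to ld.get_LDpred_ld_tables
    'gm_ld_radius': None,              # set to a genetic-map radius to use g['genetic_map']
    'out': 'ldpred_out',               # out_file_prefix for ldpred_genomewide
    'N': 10000,                        # sample size passed as n
    'H2': 0.5,                         # heritability passed as h2
    'PS': [1.0, 0.3, 0.1],             # fraction(s) of causal markers passed as ps
    'num_iter': 60,                    # number of iterations for ldpred_genomewide
}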