Beispiel #1
def _parse_plink_snps_freqs_(genotype_file, snp_indices):
    """Return the allele frequency of each requested SNP row of a PLINK file.

    Parameters
    ----------
    genotype_file
        Passed unchanged to ``plinkfile.PlinkFile``.
    snp_indices
        Array of 0-based row (locus) indices to read. It is indexed with the
        result of ``sp.argsort``, so it must support array indexing. The
        indices do not have to be sorted; the result follows the given order.

    Returns
    -------
    float32 array with one entry per element of ``snp_indices``: the sum of
    the row's genotype values divided by ``2 * number_of_samples``. An empty
    request yields an empty array.

    Raises
    ------
    AssertionError
        If not every requested index was matched while reading the file
        (this happens, for instance, when an index is listed twice).
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    num_individs = len(plinkf.get_samples())
    num_snps = len(snp_indices)
    freqs_arr = sp.empty(num_snps, dtype='float32')
    if num_snps == 0:
        return freqs_arr

    # snp_order[k] is the position within snp_indices of the k-th smallest
    # index, which lets us read the file front to back and still store each
    # result at the position the caller asked for.
    snp_order = sp.argsort(snp_indices)
    ordered_snp_indices = list(snp_indices[snp_order])
    max_i = ordered_snp_indices[-1]
    # Reverse so that pop() hands out the wanted rows in ascending order.
    ordered_snp_indices.reverse()
    print('Iterating over file to load SNPs')
    snp_i = 0
    next_i = ordered_snp_indices.pop()
    line_i = 0
    while line_i <= max_i:
        # Every row has to be consumed to keep line_i aligned with the file.
        line = next(plinkf)
        if line_i == next_i:
            snp = sp.array(line, dtype='int8')
            bin_counts = line.allele_counts()
            if bin_counts[-1] > 0:
                # Genotype value 3 marks a missing call; replace it with the
                # more frequent of the candidate genotypes.
                # NOTE(review): only genotypes 0 and 1 are candidates here
                # (bin_counts[:2]); confirm whether genotype 2 was meant to
                # be included as well.
                mode_v = sp.argmax(bin_counts[:2])
                snp[snp == 3] = mode_v
            s_i = snp_order[snp_i]
            freqs_arr[s_i] = sp.sum(snp, dtype='float32') / (2 * float(num_individs))
            snp_i += 1
            if ordered_snp_indices:
                next_i = ordered_snp_indices.pop()
        line_i += 1
    assert snp_i == len(freqs_arr), 'Failed to parse SNPs?'
    return freqs_arr
Beispiel #2
def parse_indiv_genotype(genotype_file, ref_path, hdf5_file):
    """Write individual-level genotypes, grouped by chromosome, to an HDF5 file.

    Parameters
    ----------
    genotype_file
        Passed unchanged to ``plinkfile.PlinkFile`` and to
        ``_parse_plink_snps_``.
    ref_path
        Table readable by ``pd.read_table``. The columns ``CHROM``,
        ``cord_bim``, ``POS``, ``A1``, ``A2`` and ``SNP`` are read.
    hdf5_file
        Path of the HDF5 file to create (opened with mode ``'w'``).

    The file receives ``fids``, ``iids``, ``M`` (number of rows of the
    reference table), ``y`` when phenotypes are present, and one group
    ``chrom_<k>`` per chromosome holding genotypes, their frequencies,
    derived means / standard deviations, positions, alleles and SNP ids.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    # One distinct value means no usable phenotype, two means case/control,
    # anything else is treated as a quantitative trait.
    unique_phens = np.unique(Y)
    if len(unique_phens) == 1:
        print('Unable to find phenotype values.')
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = np.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print('Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1]))
        has_phenotype = True
    else:
        print('Found quantitative phenotype values')
        has_phenotype = True

    ref = pd.read_table(ref_path)
    chrom_list = np.unique(ref['CHROM'])
    # The context manager guarantees the file is flushed and closed, also
    # when parsing one of the chromosomes fails.
    with h5py.File(hdf5_file, 'w') as hf:
        if has_phenotype:
            hf.create_dataset('y', data=Y)
        hf.create_dataset('fids', data=fids)
        hf.create_dataset('iids', data=iids)
        hf.create_dataset('M', data=[ref.shape[0]])
        for k in chrom_list:
            chunk = ref[ref['CHROM'] == k]
            snp_indices = np.array(chunk['cord_bim'].tolist())
            print('Extracting genotypes of chromosomes %d from genotype_file' % k)
            raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
            print('raw_snps.shape= %s' % (raw_snps.shape,))
            # Mean and standard deviation are derived from the frequencies
            # rather than measured on raw_snps.
            snp_stds = np.sqrt(2 * freqs * (1 - freqs))
            snp_means = freqs * 2
            g = hf.create_group('chrom_%d' % k)
            # Check SNP frequencies.. or filter by MAF: to be continued...
            print('Writing genotypes of chromosomes %d to hdf5_file' % k)
            g.create_dataset('raw_snps', data=raw_snps, compression='lzf')
            g.create_dataset('snp_stds', data=snp_stds)
            g.create_dataset('snp_means', data=snp_means)
            g.create_dataset('freqs', data=freqs)
            g.create_dataset('positions', data=chunk['POS'].tolist())
            # NOTE(review): the dataset name for the (A1, A2) pairs was lost
            # in the source this was recovered from; 'nts' is assumed --
            # confirm against whatever reads this file.
            # list(...) because zip() is lazy on Python 3.
            g.create_dataset('nts',
                             data=list(zip(chunk['A1'].tolist(), chunk['A2'].tolist())))
            g.create_dataset('SNP', data=chunk['SNP'].tolist())
    print('individual-level data hdf5 written to %s!' % hdf5_file)
Beispiel #3
def parse_plink_snps(genotype_file, snp_indices):
    """Load selected SNP rows from a PLINK file, coded as ``2 - plinkio value``.

    Parameters
    ----------
    genotype_file
        Passed unchanged to ``plinkfile.PlinkFile``.
    snp_indices
        Array of 0-based row (locus) indices to read. It is indexed with the
        result of ``sp.argsort``, so it must support array indexing. The
        indices do not have to be sorted; rows are returned in the given order.

    Returns
    -------
    (raw_snps, freqs)
        ``raw_snps`` is an int8 array of shape (number of requested SNPs,
        number of samples). ``freqs`` is the float32 row sum of ``raw_snps``
        divided by ``2 * number_of_samples``.

    Raises
    ------
    AssertionError
        If not every requested index was matched while reading the file
        (this happens, for instance, when an index is listed twice).
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    num_individs = len(plinkf.get_samples())
    num_snps = len(snp_indices)
    raw_snps = sp.empty((num_snps, num_individs), dtype='int8')
    snp_i = 0
    if num_snps > 0:
        # snp_order[k] is the position within snp_indices of the k-th
        # smallest index, so rows read front to back land where the caller
        # expects them.
        snp_order = sp.argsort(snp_indices)
        ordered_snp_indices = list(snp_indices[snp_order])
        max_i = ordered_snp_indices[-1]
        # Reverse so that pop() hands out the wanted rows in ascending order.
        ordered_snp_indices.reverse()
        print('Iterating over file to load SNPs')
        next_i = ordered_snp_indices.pop()
        line_i = 0
        while line_i <= max_i:
            # Every row has to be consumed to keep line_i aligned with the file.
            line = next(plinkf)
            if line_i == next_i:
                snp = sp.array(line, dtype='int8')
                bin_counts = line.allele_counts()
                if bin_counts[-1] > 0:
                    # Genotype value 3 marks a missing call; replace it with
                    # the more frequent of the candidate genotypes.
                    # NOTE(review): only genotypes 0 and 1 are candidates
                    # (bin_counts[:2]); confirm whether 2 was meant as well.
                    mode_v = sp.argmax(bin_counts[:2])
                    snp[snp == 3] = mode_v
                s_i = snp_order[snp_i]
                # Deliberately differs from the plinkio default coding: A1
                # (not A2) is the counted allele, hence the 2 - snp flip.
                raw_snps[s_i] = 2 - snp
                snp_i += 1
                if ordered_snp_indices:
                    next_i = ordered_snp_indices.pop()
            line_i += 1
    assert snp_i == len(raw_snps), 'Failed to parse SNPs?'
    freqs = sp.sum(raw_snps, 1, dtype='float32') / (2 * float(num_individs))
    return raw_snps, freqs
Beispiel #4
def parse_plink_snps(genotype_file, snp_indices):
    """Load selected SNP rows from a PLINK file and compute their frequencies.

    Parameters
    ----------
    genotype_file
        Passed unchanged to ``plinkfile.PlinkFile``.
    snp_indices
        Array of 0-based row (locus) indices to read. It is indexed with the
        result of ``sp.argsort``, so it must support array indexing. The
        indices do not have to be sorted; rows are returned in the given order.

    Returns
    -------
    (raw_snps, freqs)
        ``raw_snps`` is an int8 array of shape (number of requested SNPs,
        number of samples) holding the genotype values as read. ``freqs`` is
        the float32 row sum of ``raw_snps`` divided by
        ``2 * number_of_samples``.

    Raises
    ------
    AssertionError
        If not every requested index was matched while reading the file
        (this happens, for instance, when an index is listed twice).
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    num_individs = len(plinkf.get_samples())
    num_snps = len(snp_indices)
    raw_snps = sp.empty((num_snps, num_individs), dtype='int8')
    snp_i = 0
    if num_snps > 0:
        # snp_order[k] is the position within snp_indices of the k-th
        # smallest index, so rows read front to back land where the caller
        # expects them.
        snp_order = sp.argsort(snp_indices)
        ordered_snp_indices = list(snp_indices[snp_order])
        max_i = ordered_snp_indices[-1]
        # Reverse so that pop() hands out the wanted rows in ascending order.
        ordered_snp_indices.reverse()
        next_i = ordered_snp_indices.pop()
        line_i = 0
        while line_i <= max_i:
            # Every row has to be consumed to keep line_i aligned with the file.
            line = next(plinkf)
            if line_i == next_i:
                snp = sp.array(line, dtype='int8')
                bin_counts = line.allele_counts()
                if bin_counts[-1] > 0:
                    # Genotype value 3 marks a missing call; replace it with
                    # the more frequent of the candidate genotypes.
                    # NOTE(review): only genotypes 0 and 1 are candidates
                    # (bin_counts[:2]); confirm whether 2 was meant as well.
                    mode_v = sp.argmax(bin_counts[:2])
                    snp[snp == 3] = mode_v
                s_i = snp_order[snp_i]
                raw_snps[s_i] = snp
                snp_i += 1
                if ordered_snp_indices:
                    next_i = ordered_snp_indices.pop()
            line_i += 1
    assert snp_i == len(raw_snps), 'Parsing SNPs from plink file failed.'
    freqs = sp.sum(raw_snps, 1, dtype='float32') / (2 * float(num_individs))
    return raw_snps, freqs
Beispiel #5
# get_prs_bins: score the individuals of a PLINK genotype file with polygenic
# risk scores (PRS), in total and split into bins of squared updated effect size.
#
# NOTE(review): this copy of the function is damaged. The parameter list is cut
# off after `genotype_file,` and a number of statements are missing (bodies of
# `if` / `except` blocks, `try:` / `else:` lines, ends of wrapped expressions).
# The damaged spots are marked with NOTE(review) below; the code does not run
# as it stands.
#
# Names that the body reads without defining them, so presumably parameters or
# module-level names: phen_map, rs_id_map, K_bins, lasso, sets, verbose,
# opp_strand_dict, plinkfile, sp, it.
#
# What the visible code does:
#   1. Builds a boolean per-sample filter (samples whose iid is a key of
#      phen_map).
#   2. Walks loci and genotype rows in lockstep, looks each locus up in
#      rs_id_map, negates the effect sizes when the alleles are swapped
#      relative to the summary statistics (also checking the opposite strand),
#      replaces missing genotypes (value 3), and accumulates `snp * beta`
#      into the raw, the updated and the per-bin score vectors.
#   3. Prints correlations / r2 between the scores and true_phens.
#   4. Returns a dict with 'raw_effects_prs', 'pval_derived_effects_prs',
#      'true_phens', 'iids', one 'pval_derived_effects_prs_bin_<k>' per bin
#      when K_bins > 1, and 'pcs' / 'sex' / 'covariates' when non-empty.
def get_prs_bins(genotype_file,
    # NOTE(review): the remaining parameters and the closing "):" are missing here.
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()

    #1. Figure out indiv filter and get true phenotypes
    indiv_filter = sp.zeros(len(samples), dtype='bool8')
    true_phens = []
    iids = []
    if phen_map is not None:
        pcs = []
        sex = []
        covariates = []
        phen_iids = set(phen_map.keys())
        for samp_i, sample in enumerate(samples):
            if sample.iid in phen_iids:
                indiv_filter[samp_i] = True
                # NOTE(review): the bodies of the three checks below are missing
                # (presumably appends to pcs / sex / covariates); nothing visible
                # fills true_phens or iids either.
                if 'pcs' in phen_map[sample.iid].keys():
                if 'sex' in phen_map[sample.iid].keys():
                if 'covariates' in phen_map[sample.iid].keys():
                    #Temp hack...
                    #                     if phen_map[sample.iid]['sex']==1:
                    #                         covariates.append([phen_map[sample.iid]['covariates'][0],0])
                    #                     else:
                    #                         covariates.append([0,phen_map[sample.iid]['covariates'][0]])
        # NOTE(review): in the three asserts below the argument of the second
        # len(...) is missing.
        if len(pcs) > 0:
            assert len(pcs) == len(
            ), 'PC information missing for some individuals with phenotypes'
        if len(sex) > 0:
            assert len(sex) == len(
            ), 'Sex information missing for some individuals with phenotypes'
        if len(covariates) > 0:
            assert len(covariates) == len(
            ), 'Covariates missing for some individuals with phenotypes'
        # NOTE(review): this loop looks like the fallback for phen_map being None
        # (an `else:` seems to be missing before it) -- confirm.
        for samp_i, sample in enumerate(samples):
            if sample.affection != 2:
                indiv_filter[samp_i] = True

    num_individs = sp.sum(indiv_filter)
    assert num_individs > 0, 'Issues in parsing the phenotypes and/or PCs?'

    assert not sp.any(sp.isnan(
        true_phens)), 'Phenotypes appear to have some NaNs, or parsing failed.'

    print '%d individuals have phenotype and genotype information.' % num_individs

    num_non_matching_nts = 0
    num_flipped_nts = 0

    # One running score per retained individual.
    raw_effects_prs = sp.zeros(num_individs)
    pval_derived_effects_prs = sp.zeros(num_individs)
    pval_derived_effects_prs_lasso = sp.zeros(num_individs)

    # Per-bin score vectors, keyed "prs_bin_1" .. "prs_bin_<K_bins>";
    # the dict stays empty when K_bins <= 1.
    bins_prs_dict = {}
    if K_bins > 1:
        bk = 1
        while bk <= K_bins:
            bins_prs_dict["prs_bin_%d" % bk] = sp.zeros(num_individs)
            bk += 1

    # NOTE(review): none of the *_lasso / *_high / *_low accumulators is updated
    # anywhere in the visible loop; they are only read when printing.
    pval_derived_effects_prs_high = sp.zeros(num_individs)
    pval_derived_effects_prs_lasso_high = sp.zeros(num_individs)
    pval_derived_effects_prs_low = sp.zeros(num_individs)
    pval_derived_effects_prs_lasso_low = sp.zeros(num_individs)
    #If these indices are not in order then we place them in the right place while parsing SNPs.
    print 'Iterating over BED file to calculate risk scores.'
    locus_list = plinkf.get_loci()
    snp_i = 0

    bins_bounds = rs_id_map["bins_extremes"]
    #print bins_bounds
    # NOTE(review): `it` is presumably itertools; izip exists only on Python 2.
    for locus, row in it.izip(locus_list, plinkf):
        upd_pval_beta = 0
        # NOTE(review): a `try:` line is missing here, the right-hand side of
        # `sid =` is missing, and the `except` below has no body.
            #Check rs-ID
            #             sid = '%d_%d'%(locus.chromosome,locus.bp_position)
            sid =
            rs_info = rs_id_map[sid]
        except Exception:  #Move on if rsID not found.

        # NOTE(review): the body of this `if` is missing.
        if rs_info['upd_pval_beta'] == 0:

        #Check whether the nucleotides are OK, and potentially flip it.
        ss_nt = rs_info['nts']
        g_nt = [locus.allele1, locus.allele2]
        flip_nts = False
        # The genotype alleles as they read on the opposite strand.
        os_g_nt = sp.array(
            [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
        if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
            # Opposite strand nucleotides
            flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
            if flip_nts:
                # Alleles are swapped relative to the summary statistics, so
                # the effect sizes change sign.
                raw_beta = -rs_info['raw_beta']
                upd_pval_beta = -rs_info['upd_pval_beta']
                num_flipped_nts += 1
                if lasso:
                    upd_pval_beta_lasso = -rs_info['upd_pval_beta_lasso']
                    if sets:
                        upd_pval_beta_high = -rs_info['upd_pval_beta_high']
                        # NOTE(review): the two `-rs_info[` subscripts below are cut off.
                        upd_pval_beta_lasso_high = -rs_info[
                        upd_pval_beta_low = -rs_info['upd_pval_beta_low']
                        upd_pval_beta_lasso_low = -rs_info[
                # NOTE(review): `else:` lines appear to be missing around here
                # (one for the non-matching case, one before the unflipped
                # assignments that follow).
                #print "Nucleotides don't match after all?: sid=%s, g_nt=%s, ss_nt=%s" % (, str(g_nt), str(ss_nt))
                num_non_matching_nts += 1
            raw_beta = rs_info['raw_beta']
            upd_pval_beta = rs_info['upd_pval_beta']
            if lasso:
                upd_pval_beta_lasso = rs_info['upd_pval_beta_lasso']
                if sets:
                    upd_pval_beta_high = rs_info['upd_pval_beta_high']
                    # NOTE(review): the two `rs_info[` subscripts below are cut off.
                    upd_pval_beta_lasso_high = rs_info[
                    upd_pval_beta_low = rs_info['upd_pval_beta_low']
                    upd_pval_beta_lasso_low = rs_info[

        #Parse SNP, and fill in the blanks if necessary.
        snp = sp.array(row, dtype='int8')[indiv_filter]
        bin_counts = row.allele_counts()
        if bin_counts[-1] > 0:
            # Genotype value 3 is replaced by whichever of genotypes 0 and 1
            # is more frequent in the row.
            mode_v = sp.argmax(bin_counts[:2])
            snp[snp == 3] = mode_v

        #Normalize SNP

#         n_snp = (snp - sp.mean(snp))/sp.std(snp)
#         print(upd_pval_beta**2)
#         print sp.where(bins_bounds>=upd_pval_beta**2)
#         print sp.where(bins_bounds>=upd_pval_beta**2)[0][0]
        # Index of the first bin bound that is >= the squared updated effect.
        # NOTE(review): bins_prs_dict only has keys prs_bin_1 .. prs_bin_<K_bins>
        # (none when K_bins <= 1), so an index of 0 here, or K_bins <= 1, would
        # raise KeyError at the `+=` on bins_prs_dict below -- confirm that
        # bins_bounds rules this out.
        bin_number = sp.where(bins_bounds >= upd_pval_beta**2)[0][0]
        #Update scores and move on.
        raw_effects_prs += snp * raw_beta
        assert not sp.any(
            sp.isnan(raw_effects_prs)), 'Raw effects PRS is corrupted'
        snpi_b = snp * upd_pval_beta
        pval_derived_effects_prs += snpi_b
        bins_prs_dict["prs_bin_%d" % bin_number] += snpi_b
        assert not sp.any(sp.isnan(
            pval_derived_effects_prs)), 'Weighted effects PRS is corrupted'

        if verbose:

            # Progress report every 500000 SNPs.
            if snp_i > 0 and snp_i % 500000 == 0:
                print("PRS using %d SNPS" % snp_i)
                #print 'Number of non-matching NTs: %d'%num_non_matching_nts
                # NOTE(review): the end of the raw_eff_r2 expression is cut off.
                raw_eff_r2 = (sp.corrcoef(raw_effects_prs, true_phens)[0,
                pval_eff_r2 = (sp.corrcoef(pval_derived_effects_prs,
                                           true_phens)[0, 1])**2
                print 'Raw effects PRS r2: %0.4f' % raw_eff_r2
                print 'Weigted effects PRS r2: %0.4f' % pval_eff_r2
                if lasso:
                    pval_eff_r2_lasso = (sp.corrcoef(
                        pval_derived_effects_prs_lasso, true_phens)[0, 1])**2
                    print 'Weigted effects PRS Lasso r2: %0.4f' % pval_eff_r2_lasso

                    if sets:
                        # NOTE(review): in this branch one expression is cut
                        # off and two corrcoef calls lack their first argument.
                        pval_eff_r2_high = (sp.corrcoef(
                            pval_derived_effects_prs_high, true_phens)[0,
                        print 'Weigted effects HIGH PRS r2: %0.4f' % pval_eff_r2_high
                        pval_eff_r2_lasso_high = (sp.corrcoef(
                            true_phens)[0, 1])**2
                        print 'Weigted effects HIGH PRS Lasso r2: %0.4f' % pval_eff_r2_lasso_high

                        pval_eff_r2_low = (sp.corrcoef(
                            pval_derived_effects_prs_low, true_phens)[0, 1])**2
                        print 'Weigted effects LOW PRS r2: %0.4f' % pval_eff_r2_low
                        pval_eff_r2_lasso_low = (sp.corrcoef(
                            true_phens)[0, 1])**2
                        print 'Weigted effects LOW PRS Lasso r2: %0.4f' % pval_eff_r2_lasso_low

        snp_i += 1


    # Final summary: correlation and r2 of every score against true_phens.
    print "DONE!"
    print 'Number of non-matching NTs: %d' % num_non_matching_nts
    print 'Number of flipped NTs: %d' % num_flipped_nts
    raw_eff_corr = sp.corrcoef(raw_effects_prs, true_phens)[0, 1]
    raw_eff_r2 = raw_eff_corr**2
    pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, true_phens)[0, 1]
    pval_eff_r2 = pval_eff_corr**2

    print 'Raw effects PRS correlation: %0.4f' % raw_eff_corr
    print 'Raw effects PRS r2: %0.4f' % raw_eff_r2
    print 'Weigted effects PRS correlation: %0.4f' % pval_eff_corr
    print 'Weigted effects PRS r2: %0.4f' % pval_eff_r2

    if lasso:
        pval_eff_corr_lasso = sp.corrcoef(pval_derived_effects_prs_lasso,
                                          true_phens)[0, 1]
        pval_eff_r2_lasso = pval_eff_corr_lasso**2
        print 'Weigted effects LASSO PRS correlation: %0.4f' % pval_eff_corr_lasso
        print 'Weigted effects LASSO PRS r2: %0.4f' % pval_eff_r2_lasso
        if sets:
            pval_eff_corr_high = sp.corrcoef(pval_derived_effects_prs_high,
                                             true_phens)[0, 1]
            pval_eff_r2_high = pval_eff_corr_high**2
            print 'Weigted effects HIGH PRS correlation: %0.4f' % pval_eff_corr_high
            print 'Weigted effects HIGH PRS r2: %0.4f' % pval_eff_r2_high
            pval_eff_corr_lasso_high = sp.corrcoef(
                pval_derived_effects_prs_lasso_high, true_phens)[0, 1]
            pval_eff_r2_lasso_high = pval_eff_corr_lasso_high**2
            print 'Weigted effects HIGH LASSO PRS correlation: %0.4f' % pval_eff_corr_lasso_high
            print 'Weigted effects HIGH LASSO PRS r2: %0.4f' % pval_eff_r2_lasso_high

            pval_eff_corr_low = sp.corrcoef(pval_derived_effects_prs_low,
                                            true_phens)[0, 1]
            pval_eff_r2_low = pval_eff_corr_low**2
            print 'Weigted effects LOW PRS correlation: %0.4f' % pval_eff_corr_low
            print 'Weigted effects LOW PRS r2: %0.4f' % pval_eff_r2_low
            pval_eff_corr_lasso_low = sp.corrcoef(
                pval_derived_effects_prs_lasso_low, true_phens)[0, 1]
            pval_eff_r2_lasso_low = pval_eff_corr_lasso_low**2
            print 'Weigted effects LOW LASSO PRS correlation: %0.4f' % pval_eff_corr_lasso_low
            print 'Weigted effects LOW LASSO PRS r2: %0.4f' % pval_eff_r2_lasso_low

    # NOTE(review): the closing "}" of this dict literal is missing.
    ret_dict = {
        'raw_effects_prs': raw_effects_prs.copy(),
        'pval_derived_effects_prs': pval_derived_effects_prs.copy(),
        'true_phens': true_phens[:],
        'iids': iids

    if K_bins > 1:
        bk = 1
        while bk <= K_bins:
            ret_dict["pval_derived_effects_prs_bin_%d" %
                     bk] = bins_prs_dict["prs_bin_%d" % bk].copy()
            bk += 1

    # NOTE(review): pcs, sex and covariates are only assigned inside the
    # `if phen_map is not None:` branch in the visible code, so these checks
    # would raise NameError when phen_map is None -- confirm.
    if len(pcs) > 0:
        ret_dict['pcs'] = pcs
    if len(sex) > 0:
        ret_dict['sex'] = sex
    if len(covariates) > 0:
        ret_dict['covariates'] = covariates

    return ret_dict
Beispiel #6
def coordinate_datasets(reference_genotype_file, hdf5_file, summary_dict,
                        max_freq_discrep = 0.15,
    summary_dict[3.9]={'name':'dash', 'value':'Coordination'}
    t0 = time.time()
    if validation_genotype_file is not None:
        print('Coordinating datasets (Summary statistics, LD reference genotypes, and Validation genotypes).')
        print('Coordinating datasets (Summary statistics and LD reference genotypes).')
    plinkf = plinkfile.PlinkFile(reference_genotype_file)

    # Figure out chromosomes and positions.
    if debug:
        print('Parsing plinkf_dict_val reference genotypes')
    loci = plinkf.get_loci()
    summary_dict[4]={'name':'Num individuals in LD Reference data:','value':plinkfiles.get_num_indivs(reference_genotype_file)}
    summary_dict[4.1]={'name':'SNPs in LD Reference data:','value':len(loci)}
    gf_chromosomes = [l.chromosome for l in loci]
    chromosomes = sp.unique(gf_chromosomes)

    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)
    if validation_genotype_file is not None:
        if debug:
            print('Parsing LD validation bim file')
        plinkf_val = plinkfile.PlinkFile(validation_genotype_file)

        # Loads only the individuals... 
        plinkf_dict_val = plinkfiles.get_phenotypes(plinkf_val)
        loci_val = plinkf_val.get_loci()
        summary_dict[5]={'name':'SNPs in Validation data:','value':len(loci_val)}

        chr_dict_val = plinkfiles.get_chrom_dict(loci_val, chromosomes)

        # Open HDF5 file and prepare out data
        assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.'
        if plinkf_dict_val['has_phenotype']:
            hdf5_file.create_dataset('y', data=plinkf_dict_val['phenotypes'])
            summary_dict[6]={'name':'Num validation phenotypes:','value':plinkf_dict_val['num_individs']}
        hdf5_file.create_dataset('fids', data=sp.array(plinkf_dict_val['fids'], dtype=util.fids_dtype))
        hdf5_file.create_dataset('iids', data=sp.array(plinkf_dict_val['iids'], dtype=util.iids_dtype))

        maf_adj_risk_scores = sp.zeros(plinkf_dict_val['num_individs'])

    # Now summary statistics
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    num_common_snps = 0
    # corr_list = []

    chromosomes_found = set()
    num_snps_common_before_filtering =0
    num_snps_common_after_filtering =0
    tot_num_non_matching_nts = 0
    tot_num_non_supported_nts = 0
    tot_num_ambig_nts = 0
    tot_num_freq_discrep_filtered_snps = 0
    tot_num_maf_filtered_snps = 0
    tot_g_ss_nt_concord_count = 0
    if validation_genotype_file is not None:
        tot_g_vg_nt_concord_count = 0
        tot_vg_ss_nt_concord_count = 0
    # Now iterate over chromosomes
    chrom_i = 0
    for chrom in chromosomes:
        chrom_i +=1
        if not debug:
            sys.stdout.write('\r%0.2f%%' % (100.0 * (float(chrom_i) / (len(chromosomes)+1))))
            chr_str = 'chrom_%d' % chrom
            ssg = ssf[chr_str]
        except Exception as err_str:
                print('Did not find chromosome %d in SS dataset.'%chrom)
        if debug:
            print('Coordinating data for chromosome %s' % chr_str)

        #Get summary statistics chromosome group
        ssg = ssf['chrom_%d' % chrom]
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        if validation_genotype_file is not None:
            chrom_d_val = chr_dict_val[chr_str]
            vg_sids = chrom_d_val['sids']
            common_sids = sp.intersect1d(ss_sids, vg_sids)
            # A map from sid to index for validation data        
            vg_sid_dict = {}
            for i, sid in enumerate(vg_sids):
                vg_sid_dict[sid] = i
            common_sids = ss_sids

        # A map from sid to index for summary stats        
        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i

        #The indices to retain for the LD reference genotypes
        chrom_d = chr_dict[chr_str]
        g_sids = chrom_d['sids']
        common_sids = sp.intersect1d(common_sids, g_sids)
        # A map from sid to index for LD reference data        
        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i

        if debug:
            print('Found %d SNPs on chrom %d that were common across all datasets' % (len(common_sids), chrom))
            print('Ordering SNPs by genomic positions (based on LD reference genotypes).')
        g_snp_map = []
        for sid in common_sids:
        # order by positions (based on LD reference file)
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)

        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the ordered sum stats SNPs indices.
        ss_snp_map = []
        for sid in common_sids:

        # Get the ordered validation SNPs indices
        if validation_genotype_file is not None:
            vg_snp_map = []
            for sid in common_sids:
            vg_nts = sp.array(chrom_d_val['nts'])
            vg_nts_ok = sp.array(vg_nts)[vg_snp_map]

        g_nts = sp.array(chrom_d['nts'])
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        if validation_genotype_file is not None:
            vg_ss_nt_concord_count = sp.sum(vg_nts_ok == ss_nts[ss_snp_map]) / 2.0
            g_vg_nt_concord_count = sp.sum(g_nts[g_snp_map] == vg_nts_ok) / 2.0
            if debug:
                print('Nucleotide concordance counts out of %d genotypes, vg-rg: %d ; vg-ss: %d' % (len(g_snp_map), g_vg_nt_concord_count, vg_ss_nt_concord_count))
            tot_vg_ss_nt_concord_count += vg_ss_nt_concord_count
            tot_g_vg_nt_concord_count += g_vg_nt_concord_count
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        if debug:
            print('Nucleotide concordance counts out of %d genotypes, rg-ss: %d' % (len(g_snp_map), g_ss_nt_concord_count))

        num_freq_discrep_filtered_snps = 0
        num_non_matching_nts = 0
        num_non_supported_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        ok_indices = {'g': [], 'ss': []}
        if validation_genotype_file is not None:

        #Now loop over SNPs to coordinate nucleotides.        
        if validation_genotype_file is not None:
            for g_i, vg_i, ss_i in zip(g_snp_map, vg_snp_map, ss_snp_map):
                # To make sure, is the SNP id the same?
                assert g_sids[g_i] == vg_sids[vg_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
                g_nt = g_nts[g_i]
                if not skip_coordination:
                    vg_nt = vg_nts[vg_i]
                    ss_nt = ss_nts[ss_i]
                    # Is the nucleotide ambiguous.
                    g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                    if tuple(g_nt) in util.ambig_nts:
                        num_ambig_nts += 1
                    # First check if nucleotide is sane?
                    if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts):
                        num_non_supported_nts += 1
                    os_g_nt = sp.array(
                        [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]])
                    flip_nts = False
                    #Coordination is a bit more complicate when validation genotypes are provided..
                    if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt))):
                        if sp.all(g_nt == vg_nt) or sp.all(os_g_nt == vg_nt):
                            flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                                os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                            # Try flipping the SS nt
                            if flip_nts:
                                betas[ss_i] = -betas[ss_i]
                                log_odds[ss_i] = -log_odds[ss_i]
                                if 'freqs' in ssg:
                                    ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                                if debug:
                                    print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                          (g_sids[g_i], ss_sids[ss_i], g_i,
                                           ss_i, str(g_nt), str(ss_nt)))
                                num_non_matching_nts += 1
                            num_non_matching_nts += 1
                            # Opposite strand nucleotides
                # everything seems ok.
            for g_i, ss_i in zip(g_snp_map, ss_snp_map):
                # To make sure, is the SNP id the same?
                assert g_sids[g_i] == ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
                g_nt = g_nts[g_i]
                if not skip_coordination:
                    ss_nt = ss_nts[ss_i]
                    # Is the nucleotide ambiguous.
                    g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                    if tuple(g_nt) in util.ambig_nts:
                        num_ambig_nts += 1
                    # First check if nucleotide is sane?
                    if (not g_nt[0] in util.valid_nts) or (not g_nt[1] in util.valid_nts):
                        num_non_matching_nts += 1
                    os_g_nt = sp.array(
                        [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]])
                    flip_nts = False
                    #Coordination is a bit more complicate when validation genotypes are provided..
                    if not sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt):
                        flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                            os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                        # Try flipping the SS nt
                        if flip_nts:
                            betas[ss_i] = -betas[ss_i]
                            log_odds[ss_i] = -log_odds[ss_i]
                            if 'freqs' in ssg and ss_freqs[ss_i]>0:
                                ss_freqs[ss_i] = 1.0 - ss_freqs[ss_i]
                            if debug:
                                print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                      (g_sids[g_i], ss_sids[ss_i], g_i,
                                       ss_i, str(g_nt), str(ss_nt)))
                            num_non_matching_nts += 1
                # everything seems ok.
        if debug:
            print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
            print('%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts)

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = plinkfiles.parse_plink_snps(
            reference_genotype_file, snp_indices)
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]  
        log_odds = log_odds[ok_indices['ss']]  

        ns = ssg['ns'][...][ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)  
        sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        sids = sids[ok_indices['ss']]

        #Parse validation genotypes, if available
        if validation_genotype_file is not None:
            snp_indices_val = sp.array(chrom_d_val['snp_indices'])
            # Pinpoint where the SNPs are in the file.
            snp_indices_val = snp_indices_val[ok_indices['vg']]
            raw_snps_val, freqs_val = plinkfiles.parse_plink_snps(
                validation_genotype_file, snp_indices_val)
            snp_stds_val = sp.sqrt(2 * freqs_val * (1 - freqs_val))
            snp_means_val = freqs_val * 2

        # Check SNP frequencies, screen for possible problems..
        if max_freq_discrep<1 and 'freqs' in ssg:
            ss_freqs = ss_freqs[ok_indices['ss']]
            ok_freq_snps = sp.logical_or(sp.absolute(ss_freqs - freqs) < max_freq_discrep,sp.absolute(ss_freqs + freqs-1) < max_freq_discrep) #Array of np.bool values
            ok_freq_snps = sp.logical_or(ok_freq_snps,ss_freqs<=0) #Only consider SNPs that actually have frequencies
            num_freq_discrep_filtered_snps = len(ok_freq_snps)- sp.sum(ok_freq_snps)
            assert num_freq_discrep_filtered_snps>=0, "Problems when filtering SNPs with frequency discrepencies"
            if num_freq_discrep_filtered_snps>0:
                # Filter freq_discrepancy_snps
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                ns = ns[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                if validation_genotype_file is not None:
                    raw_snps_val = raw_snps_val[ok_freq_snps]
                    snp_stds_val = snp_stds_val[ok_freq_snps]
                    snp_means_val = snp_means_val[ok_freq_snps]
                    freqs_val = freqs_val[ok_freq_snps]
            if debug:
                print('Filtered %d SNPs due to frequency discrepancies'%num_freq_discrep_filtered_snps)

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        num_maf_filtered_snps = len(maf_filter)-sp.sum(maf_filter)
        assert num_maf_filtered_snps>=0, "Problems when filtering SNPs with low minor allele frequencies"
        if num_maf_filtered_snps>0:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            ns = ns[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            if validation_genotype_file is not None:
                raw_snps_val = raw_snps_val[maf_filter]
                snp_stds_val = snp_stds_val[maf_filter]
                snp_means_val = snp_means_val[maf_filter]
                freqs_val = freqs_val[maf_filter]
            if debug:
                print('Filtered %d SNPs due to low MAF'%num_maf_filtered_snps)

        genetic_map = []
        if genetic_map_dir is not None:
            with + 'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
#                     if l[0] in sid_set:
#                         genetic_map.append(l[0])
            genetic_map = None

        coord_data_dict = {'chrom': 'chrom_%d' % chrom, 
                           'raw_snps_ref': raw_snps, 
                           'snp_stds_ref': snp_stds, 
                           'snp_means_ref': snp_means, 
                           'freqs_ref': freqs,
                           'ps': ps,
                           'ns': ns,
                           'positions': positions,
                           'nts': nts,
                           'sids': sids,
                           'genetic_map': genetic_map,
                           'betas': betas,
                           'log_odds': log_odds}
        if validation_genotype_file is not None:
            maf_adj_prs =, raw_snps_val)
            if debug and plinkf_dict_val['has_phenotype']:
                maf_adj_corr = sp.corrcoef(plinkf_dict_val['phenotypes'], maf_adj_prs)[0, 1]
                print('Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr))
            maf_adj_risk_scores += maf_adj_prs
        write_coord_data(cord_data_g, coord_data_dict, debug=debug)
        if debug:
            print('%d SNPs were retained on chromosome %d.' % (len(sids), chrom))
        num_snps_common_before_filtering += len(common_sids)
        num_snps_common_after_filtering += len(sids)
        tot_num_ambig_nts += num_ambig_nts
        tot_num_non_supported_nts += num_non_supported_nts
        tot_num_non_matching_nts += num_non_matching_nts
        tot_num_freq_discrep_filtered_snps += num_freq_discrep_filtered_snps
        tot_num_maf_filtered_snps += num_maf_filtered_snps

    if not debug:
        sys.stdout.write('\r%0.2f%%\n' % (100.0))

    # Now calculate the prediction r^2
    if validation_genotype_file:
        if debug and plinkf_dict_val['has_phenotype']:
            maf_adj_corr = sp.corrcoef(
                plinkf_dict_val['phenotypes'], maf_adj_risk_scores)[0, 1]
            print('Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)' % (maf_adj_corr, maf_adj_corr ** 2))
            print('Overall nucleotide concordance counts: rg_vg: %d, rg_ss: %d, vg_ss: %d' % (tot_g_vg_nt_concord_count, tot_g_ss_nt_concord_count, tot_vg_ss_nt_concord_count))
        if debug:
            print('Overall nucleotide concordance counts, rg_ss: %d' % (tot_g_ss_nt_concord_count))        
    summary_dict[7]={'name':'Num chromosomes used:','value':len(chromosomes_found)}
    summary_dict[8]={'name':'SNPs common across datasets:','value':num_snps_common_before_filtering}
    summary_dict[9]={'name':'SNPs retained after filtering:','value':num_snps_common_after_filtering}
    if tot_num_ambig_nts>0:
        summary_dict[10]={'name':'SNPs w ambiguous nucleotides filtered:','value':tot_num_ambig_nts}
    if tot_num_non_supported_nts>0:
        summary_dict[10.1]={'name':'SNPs w unknown/unsupported nucleotides filtered:','value':tot_num_non_supported_nts}
    if tot_num_non_matching_nts>0:
        summary_dict[11]={'name':'SNPs w other nucleotide discrepancies filtered:','value':tot_num_non_matching_nts}
    if min_maf>0:
        summary_dict[12]={'name':'SNPs w MAF<%0.3f filtered:'%min_maf,'value':tot_num_maf_filtered_snps}
    if max_freq_discrep<0.5:
        summary_dict[13]={'name':'SNPs w allele freq discrepancy > %0.3f filtered:'%max_freq_discrep,'value':tot_num_freq_discrep_filtered_snps}

    t1 = time.time()
    t = (t1 - t0)
    summary_dict[13.9]={'name':'dash', 'value':'Running times'}
    summary_dict[15]={'name':'Run time for coordinating datasets:','value': '%d min and %0.2f sec'%(t / 60, t % 60)}
Beispiel #7
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
    # NOTE(review): the remainder of the signature and the docstring are missing
    # from this excerpt. The body below also reads reference_genotype_file,
    # hdf5_file, genetic_map_dir, check_mafs and min_maf -- presumably these are
    # parameters; confirm against the full source.
    #
    # Visible behaviour: opens the PLINK genotype file and the LD reference PLINK
    # file, matches their SNP ids per chromosome against hdf5_file['sum_stats'],
    # filters SNPs on nucleotide, frequency-discrepancy and MAF criteria, and
    # stores the retained data per chromosome in a new 'cord_data' HDF5 group.
    #   recode_dict = {1:'A', 2:'T', 3:'C', 4:'G'} #1K genomes recoding..
    print 'Coordinating things w genotype file: %s \nref. genot. file: %s' % (
        genotype_file, reference_genotype_file)
    plinkf = plinkfile.PlinkFile(genotype_file)

    #Loads only the individuals... (I think?)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    # Classify the phenotype by its number of distinct values.
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print 'Unable to find phenotype values.'
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1])
        has_phenotype = True
        # NOTE(review): an `else:` line appears to be missing before the next
        # print in this excerpt -- confirm against the full source.
        print 'Found quantitative phenotype values'
        has_phenotype = True

    #Figure out chromosomes and positions.
    print 'Parsing validation genotype bim file'
    loci = plinkf.get_loci()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)

    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print 'Parsing LD reference genotype bim file'
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()

    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)
    #     chr_dict_ref = _get_chrom_dict_bim_(reference_genotype_file+'.bim', chromosomes)

    #Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file.keys(
    ), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)

    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    # Genome-wide accumulators, updated once per chromosome below.
    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    #corr_list = []

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    #Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print 'Working on chromsome: %s' % chr_str

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        # NOTE(review): the `try:` line opening this handler is missing from the
        # excerpt, and presumably a `continue` followed the last print -- confirm.
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromsome in SS dataset.'
            print 'Continuing.'

        ssg = ssf['chrom_%d' % chrom]
        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print 'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' % (
            len(g_sids), len(rg_sids), len(ss_sids))
        # Keep only the SNP ids present in all three datasets.
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print 'Found %d SNPs on chrom %d that were common across all datasets' % (
            len(common_sids), chrom)

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []

        # SNP id -> row index lookups for each dataset.
        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i

        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i

        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid] = i

        # NOTE(review): the body of the next loop is missing from the excerpt
        # (presumably it fills g_snp_map from g_sid_dict) -- confirm.
        for sid in common_sids:

        #order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        #order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        #Get the other two maps
        # NOTE(review): both loop bodies below are missing from the excerpt
        # (presumably they fill rg_snp_map and ss_snp_map) -- confirm.
        for sid in common_sids:

        for sid in common_sids:

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        #         rg_nts_l = []
        #         for nt in rg_nts_ok:
        #             rg_nts_l.append([recode_dict[nt[0]],recode_dict[nt[1]]])
        #         rg_nts_ok = sp.array(rg_nts_l)
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]

        # Element-wise nucleotide agreement between each pair of datasets; the
        # sum is halved (presumably because each SNP contributes two
        # nucleotides -- confirm the array layout).
        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        # NOTE(review): the print call below is truncated in the excerpt (its
        # last argument and closing parenthesis are missing).
        print 'Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d' % (
            len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count,
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        #Identifying which SNPs have nucleotides that are ok..
        # NOTE(review): ambig_nts, valid_nts and opp_strand_dict are not defined
        # in the visible code; presumably module-level constants. The `continue`
        # statements and the appends to ok_indices / ok_nts that this loop would
        # need are also not visible -- confirm against the full source.
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):

            #To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[
                ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            rg_nt = rg_nts[rg_i]
            #             rg_nt = [recode_dict[rg_nts[rg_i][0]],recode_dict[rg_nts[rg_i][1]]]
            ss_nt = ss_nts[ss_i]

            #Is the nucleotide ambiguous.
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1

            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1

            # Same allele pair as seen on the opposite strand.
            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            flip_nts = False
            if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and
                    (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0]
                                == ss_nt[1]) or (os_g_nt[1] == ss_nt[0]
                                                 and os_g_nt[0] == ss_nt[1])
                    #Try flipping the SS nt
                    # A swapped allele order negates the effect sizes and
                    # complements the summary-statistics frequency.
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                        print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                            (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1

                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    # Opposite strand nucleotides

            # everything seems ok.

#             if flip_nts:
#                 ok_nts.append([ss_nt[1],ss_nt[0]])
#             else:
#                 ok_nts.append(ss_nt)

#print '%d SNPs in LD references to be flipped.'%((len(ref_snp_directions)-sp.sum(ref_snp_directions))/2.0)
        print '%d SNPs had ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts
        print '%d SNPs were retained on chromosome %d.' % (len(
            ok_indices['g']), chrom)

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        #         order = sp.argsort(positions)
        #         sorted_positions = positions[order]
        #         assert sp.all(sorted_positions==positions), 'Perhaps something is wrong here?'
        #         ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        #         ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])

        #Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[
            ok_indices['g']]  #Pinpoint where the SNPs are in the file.
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        snp_indices_ref = snp_indices_ref[
            ok_indices['rg']]  #Pinpoint where the SNPs are in the file.
        # NOTE(review): the call below is truncated in the excerpt (second
        # argument and closing parenthesis are missing).
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file,

        # Per-SNP means and standard deviations derived from the allele
        # frequencies as 2*f and sqrt(2*f*(1-f)).
        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))
        log_odds = log_odds[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)  #[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #For debugging...
        #         g_sids = sp.array(chrom_d['sids'])[ok_indices['g']]
        #         rg_sids = sp.array(chrom_d_ref['sids'])[ok_indices['rg']]
        #         ss_sids = ssg['sids'][...][ok_indices['ss']]
        #         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            # Flags SNPs where ss_freqs and (1 - freqs) differ by more than 0.15.
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                # NOTE(review): the print call below is truncated in the excerpt.
                print 'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum(
                #                 print freqs[freq_discrepancy_snp]
                #                 print ss_freqs[freq_discrepancy_snp]

                #Filter freq_discrepancy_snps
                # NOTE(review): recent NumPy rejects negative() on boolean
                # arrays; logical_not is presumably what is intended here --
                # confirm against the NumPy version in use.
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                #For debugging...
#         if sp.any(freq_discrepancy_snp):
#             g_sids = g_sids[ok_freq_snps]
#             rg_sids = rg_sids[ok_freq_snps]
#             ss_sids = ss_sids[ok_freq_snps]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

#Filter minor allele frequency SNPs.
        # Keeps SNPs whose frequency lies strictly between min_maf and
        # 1 - min_maf; the same mask is applied to every per-SNP array.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if sp.sum(maf_filter) < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

#         if sp.sum(maf_filter)<n_snps:
#             g_sids = g_sids[maf_filter]
#             rg_sids = rg_sids[maf_filter]
#             ss_sids = ss_sids[maf_filter]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        # NOTE(review): the right-hand side of the next assignment is truncated
        # in the excerpt.
        maf_adj_prs =, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print 'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (
                chrom, maf_adj_corr)

        genetic_map = []
        if genetic_map_dir is not None:
            # NOTE(review): the `with` expression and the body of the inner `if`
            # are truncated in the excerpt, and sid_set is not defined in the
            # visible part of this function -- confirm against the full source.
            with +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:

        # One HDF5 group per chromosome holds all coordinated arrays.
        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)
        #         print 'Sum betas', sp.sum(betas ** 2)
        #ofg.create_dataset('prs', data=prs)

        #risk_scores += prs
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
Beispiel #8
def coordinate_genot_ss(genotype_file=None,
    # NOTE(review): the remainder of the signature and the docstring quotes are
    # missing from this excerpt (the next line is leftover docstring text). The
    # body also reads hdf5_file, genetic_map_dir, check_mafs and min_maf --
    # presumably parameters; confirm against the full source.
    #
    # Visible behaviour: opens the PLINK genotype file, matches its SNP ids per
    # chromosome against hdf5_file['sum_stats'], filters SNPs on nucleotide,
    # frequency-discrepancy and MAF criteria, and stores the retained data per
    # chromosome in a new 'cord_data' HDF5 group.
    Assumes plink BED files.  Imputes missing genotypes.
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    #        num_individs = len(gf['chrom_1']['snps'][:, 0])
    #     Y = sp.array(gf['indivs']['phenotype'][...] == 'Case', dtype='int8')
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    # Classify the phenotype by its number of distinct values.
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print 'Unable to find phenotype values.'
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1])
        has_phenotype = True
        # NOTE(review): an `else:` line appears to be missing before the next
        # print in this excerpt -- confirm against the full source.
        print 'Found quantitative phenotype values'
        has_phenotype = True
    # Genome-wide accumulators, updated once per chromosome below.
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)

    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    #Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print 'Working on chromsome: %s' % chr_str

        chrom_d = chr_dict[chr_str]
        # NOTE(review): the `try:` line opening this handler is missing from the
        # excerpt, and presumably a `continue` followed the last print -- confirm.
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromsome in SS dataset.'
            print 'Continuing.'

        # SNP ids must be unique within each dataset.
        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(ss_sids), 'Some duplicates?'

        #Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        #Order by SNP IDs
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)

        # NOTE(review): the bodies of the two `if` statements below are missing
        # from the excerpt (presumably they append to g_indices / ss_indices so
        # that both lists follow the same SNP-id order) -- confirm.
        g_indices = []
        for g_i in g_order:
            if g_filter[g_i]:

        ss_indices = []
        for ss_i in ss_order:
            if ss_filter[ss_i]:

        g_nts = chrom_d['nts']
        snp_indices = chrom_d['snp_indices']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print 'Found %d SNPs present in both datasets' % (len(g_indices))

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]
            ss_freqs_list = []

        # NOTE(review): ambig_nts, valid_nts and opp_strand_dict are not defined
        # in the visible code; presumably module-level constants. The `continue`
        # statements and the appends to ok_indices / ok_nts that this loop would
        # need are also not visible -- confirm against the full source.
        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in it.izip(g_indices, ss_indices):

            #Is the nucleotide ambiguous?
            #g_nt = [recode_dict[g_nts[g_i][0]],recode_dict[g_nts[g_i][1]]
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1

            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1

            ss_nt = ss_nts[ss_i]
            #Are the nucleotides the same?
            flip_nts = False
            # Same allele pair as seen on the opposite strand.
            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                # Opposite strand nucleotides
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                # A swapped allele order negates the effect sizes and
                # complements the summary-statistics frequency.
                if flip_nts:
                    betas[ss_i] = -betas[ss_i]
                    log_odds[ss_i] = -log_odds[ss_i]
                    if 'freqs' in ssg.keys():
                        ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    #                     print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                    #                         (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1


            # everything seems ok.

        print '%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        #Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[
            ok_indices['g']]  #Pinpoint where the SNPs are in the file.
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        print 'raw_snps.shape=', raw_snps.shape

        # Per-SNP means and standard deviations derived from the allele
        # frequencies rather than computed from the genotype matrix.
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))  #sp.std(raw_snps, 1)
        snp_means = freqs * 2  #sp.mean(raw_snps, 1)

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            # Flags SNPs where ss_freqs and (1 - freqs) differ by more than 0.15.
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                # NOTE(review): the print call below is truncated in the excerpt.
                print 'Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(
                print freqs[freq_discrepancy_snp]
                print ss_freqs[freq_discrepancy_snp]

                #Filter freq_discrepancy_snps
                # NOTE(review): recent NumPy rejects negative() on boolean
                # arrays; logical_not is presumably what is intended here --
                # confirm against the NumPy version in use.
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        #Filter minor allele frequency SNPs.
        # Keeps SNPs whose frequency lies strictly between min_maf and
        # 1 - min_maf; the same mask is applied to every per-SNP array.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if sp.sum(maf_filter) < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

            print '%d SNPs with MAF < %0.3f were filtered' % (
                n_snps - maf_filter_sum, min_maf)

        # NOTE(review): the print call below and the right-hand sides of the
        # `rb_prs` and `prs` assignments are truncated in the excerpt.
        print '%d SNPs were retained on chromosome %d.' % (maf_filter_sum,

        rb_prs =, log_odds)
        if has_phenotype:
            print 'Normalizing SNPs'
            # Reshape to column vectors so the subtraction and division
            # broadcast across each SNP row, then restore the flat shape.
            snp_means.shape = (len(raw_snps), 1)
            snp_stds.shape = (len(raw_snps), 1)
            snps = (raw_snps - snp_means) / snp_stds
            assert snps.shape == raw_snps.shape, 'Aha!'
            snp_stds = snp_stds.flatten()
            snp_means = snp_means.flatten()
            prs =, betas)
            corr = sp.corrcoef(Y, prs)[0, 1]
            print 'PRS correlation for chromosome %d was %0.4f' % (chrom, corr)
            rb_corr = sp.corrcoef(Y, rb_prs)[0, 1]
            print 'Raw effect sizes PRS correlation for chromosome %d was %0.4f' % (
                chrom, rb_corr)

        sid_set = set(sids)
        if genetic_map_dir is not None:
            genetic_map = []
            # NOTE(review): the `with` expression and the body of the inner `if`
            # are truncated in the excerpt.
            with +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:

        # One HDF5 group per chromosome holds all coordinated arrays.
        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_ref', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds)
        ofg.create_dataset('snp_means_ref', data=snp_means)
        ofg.create_dataset('freqs_ref', data=freqs)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)

#         print 'Sum of squared effect sizes:', sp.sum(betas ** 2)
#         print 'Sum of squared log odds:', sp.sum(log_odds ** 2)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=rb_prs)
        if has_phenotype:
            risk_scores += prs
        rb_risk_scores += rb_prs
        num_common_snps += len(betas)
Beispiel #9
def get_num_indivs(genotype_file):
    """Return the number of samples listed in a PLINK genotype file.

    Args:
        genotype_file: Path handed unchanged to ``plinkfile.PlinkFile``.

    Returns:
        The length of the sample list reported by the opened PLINK file.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    try:
        return len(plinkf.get_samples())
    finally:
        # Release the underlying file handle even if reading the samples fails;
        # the original left the file open.
        plinkf.close()
Beispiel #10
def bed_plink_to_hdf5(genotype_file, out_hdf5_file, indiv_filter=None):
    # Visible behaviour: iterates over the loci and genotype rows of a PLINK
    # file and writes 'sids', 'positions', 'snps' and 'nts_list' datasets into
    # one 'chr_<n>' group per chromosome of a newly opened HDF5 file.
    #
    # NOTE(review): several lines are missing from this excerpt -- the docstring
    # quotes (the next line is leftover docstring text), the body of the
    # `for sample in samples` loop, parts of the NaN check, an `else:` in the
    # SNP parsing step, and the appends that fill sids / positions / snps.
    # Confirm against the full source before relying on this block.
	Note: It may not support all PLINK files for now.

    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()

    affections = []
    phens = []
    iids = []
    fids = []

    for sample in samples:

    num_individs = len(iids)
    if sp.any(sp.isnan(phens)):
            'Phenotypes appear to have some NaNs, or perhaps parsing failed?')
        print('%d individuals have phenotype and genotype information.' %

    # If these indices are not in order then we place them in the right place while parsing SNPs.
    print('Iterating over BED file.')
    # NOTE(review): no mode is passed to h5py.File and no close() is visible in
    # this excerpt -- confirm how the output file is meant to be opened/closed.
    oh5f = h5py.File(out_hdf5_file)
    # First construct chromosome groups.

    # Then iterate through the plink file.
    locus_list = plinkf.get_loci()
    snp_i = 0

    curr_chromosome = 1
    print("The current chromosome is Chr", curr_chromosome)
    for locus, row in izip(locus_list, plinkf):
        chromosome = locus.chromosome

        # NOTE(review): as visible here this branch runs on every row while
        # curr_chromosome is still 1, resetting the containers each time --
        # confirm against the full source.
        if curr_chromosome == 1:
            # Initialize data containers
            sids = []
            positions = []
            nts_list = []
            snps = []
        if chromosome != curr_chromosome:
            ## Print the current chromosome
            print("The current chromosome is Chr", chromosome)
            # Store current data in HDF5 file
            chr_group = oh5f.create_group('chr_%d' % curr_chromosome)
            chr_group.create_dataset('sids', data=sids)
            chr_group.create_dataset('positions', data=positions)
            chr_group.create_dataset('snps', data=sp.array(snps, dtype='int8'))
            chr_group.create_dataset('nts_list', data=nts_list)

            # re-initialize data containers
            sids = []
            positions = []
            nts_list = []
            snps = []
            curr_chromosome = chromosome

        nts_list.append([locus.allele1, locus.allele2])

        # Parse SNP, and fill in the blanks if necessary.
        # NOTE(review): an `else:` appears to be missing between the two `snp`
        # assignments; as written the second one always overrides the first.
        if indiv_filter is not None:
            snp = sp.array(row, dtype='int8')[indiv_filter]
            snp = sp.array(row, dtype='int8')
        bin_counts = row.allele_counts()
        # When the last allele-count bin is non-zero, entries equal to 3 are
        # replaced by whichever of the first two bins is larger (presumably 3
        # encodes a missing genotype -- confirm against the plinkio docs).
        if bin_counts[-1] > 0:
            mode_v = sp.argmax(bin_counts[:2])
            snp[snp == 3] = mode_v

    # Store remaining data in HDF5 file
    chr_group = oh5f.create_group('chr_%d' % curr_chromosome)
    chr_group.create_dataset('sids', data=sids)
    chr_group.create_dataset('positions', data=positions)
    chr_group.create_dataset('snps', data=sp.array(snps, dtype='int8'))
    chr_group.create_dataset('nts_list', data=nts_list)

    print("The parsing is completed")

#bed_plink_to_hdf5("../risk_prediction/celiac_disease_data/Cel_disease_CC", "H.h5", indiv_filter=None)
Beispiel #11
def coordinate_ss(genotype_file=None,ssfformat=None,hdf5_file=None,outfile=None,
                        skip_coordination=False, keep_all=False,skip_ambiguous=False):
    # Coordinate the SNPs of a PLINK genotype file with the summary statistics
    # stored under hdf5_file['sum_stats'], one chromosome at a time: SNP IDs
    # are matched between the two datasets, nucleotides are compared (effect
    # signs are negated when the two alleles are swapped), the retained SNPs
    # are re-sorted by position, and frequency-discrepancy / MAF filters are
    # applied to the per-SNP arrays.
    #
    # NOTE(review): this listing is Python 2 (print statements,
    # `except Exception, err_str`) and appears to have lost lines: several
    # statements below have no body or no opening `try:` / `else:`.
    # NOTE(review): check_mafs, min_maf, ssf_dict, ambig_nts, valid_nts,
    # opp_strand_dict and `it` are used below but are neither parameters nor
    # locals of this function -- presumably module-level names; confirm.
    # outfile and keep_all are not referenced in the visible code.
    Assumes plink BED files.  Imputes missing genotypes.
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    #        num_individs = len(gf['chrom_1']['snps'][:, 0])
    #     Y = sp.array(gf['indivs']['phenotype'][...] == 'Case', dtype='int8')
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    # The number of distinct phenotype values decides how the phenotype is
    # classified: 1 -> none, 2 -> case/control, otherwise quantitative.
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print 'Unable to find phenotype values.'
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1])
        has_phenotype = True
        # NOTE(review): an `else:` introducing the quantitative-phenotype
        # branch appears to be missing before the next two lines.
        print 'Found quantitative phenotype values'
        has_phenotype = True
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    ssf = hdf5_file['sum_stats']

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chr_dict = _get_chrom_dict_(loci, chromosomes)
    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        chr_col = 'chr%d' % chrom
        print 'Working on chromsome: %s' % chr_str

        chrom_d = chr_dict[chr_str]
        # NOTE(review): the `try:` that opens this lookup is missing from the
        # listing, and the handler below presumably ended with `continue`
        # (it prints 'Continuing.'); confirm against the original file.
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromsome in SS dataset.'
            print 'Continuing.'

        # Both SNP-ID collections are required to be free of duplicates.
        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(ss_sids), 'Some duplicates?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)

        # NOTE(review): the bodies of the two `if` statements below are
        # missing; presumably they append g_i / ss_i to the index lists so
        # that both lists walk the shared SNPs in the same ID order.
        g_indices = []
        for g_i in g_order:
            if g_filter[g_i]:

        ss_indices = []
        for ss_i in ss_order:
            if ss_filter[ss_i]:
        g_nts = chrom_d['nts']
        snp_indices = chrom_d['snp_indices']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        # The LD-score column is only read for these two summary-stat formats.
        if ssfformat=="LDSCORE"  or ssfformat == "STANDARD_FUNCT":
            ld_score = ssg['ld_score'][...]  ### LDSCORE
        #### Track allele flips indices ####
        # ss_flips starts at +1 everywhere and is set to -1 where alleles are
        # found swapped below.
        # NOTE(review): it has len(ss_indices) entries but is indexed with
        # ss_i, an index into the full summary-stat arrays -- confirm.
        ss_flips = sp.ones(len(ss_indices))
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        # assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print 'Found %d SNPs present in both datasets' % (len(g_indices))

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]
            ss_freqs_list = []

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in it.izip(g_indices, ss_indices):

            # Is the nucleotide ambiguous?
            # g_nt = [recode_dict[g_nts[g_i][0]],recode_dict[g_nts[g_i][1]]
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            # NOTE(review): each counting branch below presumably ended with
            # a `continue` that skips the SNP; those lines are not visible.
            if not skip_coordination:
                if not skip_ambiguous:
                    if tuple(g_nt) in ambig_nts:
                        num_ambig_nts += 1
                        tot_num_non_matching_nts += 1

                if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1

                ss_nt = ss_nts[ss_i]

                # Are the nucleotides the same?
                flip_nts = False
                # Genotype alleles mapped through opp_strand_dict, so that a
                # match on the other strand is also accepted.
                os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
                if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                    # Opposite strand nucleotides
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    if flip_nts:
                        # Alleles are swapped: negate the effect estimates
                        # and complement the summary-stat frequency.
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        ss_flips[ss_i] = -1
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                        # NOTE(review): an `else:` seems to be missing before
                        # the two counters below; as listed they would count
                        # flipped SNPs as non-matching.
                        #                     print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                        #                         (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1

            # everything seems ok.
            # NOTE(review): nothing in the visible code appends to
            # ok_indices['g'] / ok_indices['ss'] (or ok_nts); those appends
            # presumably belonged here.

        print '%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        # Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']]  # Pinpoint where the SNPs are in the file.
        #raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        # Only allele frequencies are loaded here, not the genotype matrix.
        freqs = _parse_plink_snps_freqs_(genotype_file, snp_indices)

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        sids = ssg['sids'][...][ok_indices['ss']]

        if ssfformat == "LDSCORE" or ssfformat == "STANDARD_FUNCT":
            ld_score = ld_score[ok_indices['ss']]  #### LDSCORE
        # Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            # A SNP is flagged when the summary-stat frequency differs from
            # (1 - genotype frequency) by more than 0.15.
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                # NOTE(review): the print statement below is cut off in this
                # listing (the argument of sp.sum is missing).
                print 'Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(
                print freqs[freq_discrepancy_snp]
                print ss_freqs[freq_discrepancy_snp]

                # Filter freq_discrepancy_snps
                # NOTE(review): freq_discrepancy_snp is a boolean mask, and
                # recent NumPy releases reject numeric negation of boolean
                # arrays; logical_not is the documented replacement.
                # Confirm which NumPy/SciPy version this code targets.
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                freqs = freqs[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

                if ssfformat == "LDSCORE" or ssfformat == "STANDARD_FUNCT":
                    ld_score = ld_score[ok_freq_snps]  #### LDSCORE
        # Filter minor allele frequency SNPs.
        # Keeps SNPs whose frequency lies strictly between min_maf and
        # 1 - min_maf.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if sp.sum(maf_filter) < n_snps:
            # NOTE(review): `freqs` itself is not reduced with maf_filter in
            # the visible code, unlike sids/betas/log_odds/ld_score.
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

            if ssfformat == "LDSCORE" or ssfformat == "STANDARD_FUNCT":
                ld_score = ld_score[maf_filter]

            print '%d SNPs with MAF < %0.3f were filtered' % (n_snps - maf_filter_sum, min_maf)

        print '%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom)

        num_common_snps += len(betas)
        # NOTE(review): ssf_dict is not created anywhere in the visible code,
        # and the listing ends here; the rest of the function is not shown.
        ssf_dict[chr_str]['log_odds'] = log_odds
Beispiel #12
def get_prs(genotype_file, rs_id_map, phen_map=None):
    # Compute two polygenic risk scores (PRS) per individual by streaming
    # over the loci / genotype rows of a PLINK file: one weighted by
    # rs_info['raw_beta'] and one by rs_info['upd_pval_beta'].  Correlations
    # of both scores with true_phens are printed.
    #
    # genotype_file: passed to plinkfile.PlinkFile.
    # rs_id_map: indexed by SNP id; each entry is read for the keys 'nts',
    #     'raw_beta' and 'upd_pval_beta'.
    # phen_map: optional mapping indexed by sample.iid; its entries are
    #     probed for the keys 'pcs', 'sex' and 'covariates'.
    # Returns a dict with 'raw_effects_prs', 'pval_derived_effects_prs',
    # 'true_phens' and 'iids', plus 'pcs' / 'sex' / 'covariates' when those
    # lists are non-empty.
    #
    # NOTE(review): Python 2 listing (print statements, it.izip) that appears
    # to have lost lines: several `if`, `try`/`except`, `assert` and `else`
    # constructs below are incomplete.  opp_strand_dict and `it` are not
    # defined in this function -- presumably module-level; confirm.
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()

    # 1. Figure out indiv filter and get true phenotypes
    indiv_filter = sp.zeros(len(samples), dtype='bool8')
    true_phens = []
    iids = []
    if phen_map is not None:
        pcs = []
        sex = []
        covariates = []
        phen_iids = set(phen_map.keys())
        for samp_i, sample in enumerate(samples):
            if sample.iid in phen_iids:
                indiv_filter[samp_i] = True
                # NOTE(review): the bodies of the three `if` statements below
                # are missing, and nothing visible appends to true_phens or
                # iids; presumably those appends were here.
                if 'pcs' in phen_map[sample.iid].keys():
                if 'sex' in phen_map[sample.iid].keys():
                if 'covariates' in phen_map[sample.iid].keys():
        # NOTE(review): the right-hand operand of each len(...) comparison
        # below is missing from the listing.
        if len(pcs) > 0:
            assert len(pcs) == len(
            ), 'PC information missing for some individuals with phenotypes'
        if len(sex) > 0:
            assert len(sex) == len(
            ), 'Sex information missing for some individuals with phenotypes'
        if len(covariates) > 0:
            assert len(covariates) == len(
            ), 'Covariates missing for some individuals with phenotypes'
        # NOTE(review): the loop below presumably belonged to an `else:`
        # branch (phen_map is None) that is not visible; it keeps samples
        # whose affection differs from 2.
        for samp_i, sample in enumerate(samples):
            if sample.affection != 2:
                indiv_filter[samp_i] = True

    num_individs = sp.sum(indiv_filter)
    assert num_individs > 0, 'Issues in parsing the phenotypes and/or PCs?'

    assert not sp.any(sp.isnan(
        true_phens)), 'Phenotypes appear to have some NaNs, or parsing failed.'

    print '%d individuals have phenotype and genotype information.' % num_individs

    num_non_matching_nts = 0
    num_flipped_nts = 0

    # One accumulator entry per retained individual.
    raw_effects_prs = sp.zeros(num_individs)
    pval_derived_effects_prs = sp.zeros(num_individs)
    # If these indices are not in order then we place them in the right place
    # while parsing SNPs.
    print 'Iterating over BED file to calculate risk scores.'
    locus_list = plinkf.get_loci()
    snp_i = 0

    for locus, row in it.izip(locus_list, plinkf):
        upd_pval_beta = 0
        # NOTE(review): the `try:` line, the right-hand side of `sid =` and
        # the handler body (presumably `continue`) are missing below.
            # Check rs-ID
            sid =
            rs_info = rs_id_map[sid]
        except Exception:  # Move on if rsID not found.

        # NOTE(review): body missing; presumably SNPs with a zero weight are
        # skipped here.
        if rs_info['upd_pval_beta'] == 0:

        # Check whether the nucleotides are OK, and potentially flip it.
        ss_nt = rs_info['nts']
        g_nt = [locus.allele1, locus.allele2]
        flip_nts = False
        os_g_nt = sp.array(
            [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
        if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
            # Opposite strand nucleotides
            flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
            if flip_nts:
                # Alleles are swapped: use the negated weights.
                raw_beta = -rs_info['raw_beta']
                upd_pval_beta = -rs_info['upd_pval_beta']
                num_flipped_nts += 1
                # NOTE(review): `else:` lines appear to be missing around the
                # statements below; as listed, the non-negated weights would
                # overwrite the negated ones, and raw_beta would be unset
                # when the nucleotides match.
                num_non_matching_nts += 1
            raw_beta = rs_info['raw_beta']
            upd_pval_beta = rs_info['upd_pval_beta']

        # Parse SNP, and fill in the blanks if necessary.
        snp = sp.array(row, dtype='int8')[indiv_filter]
        bin_counts = row.allele_counts()
        # When the last bin count is non-zero, genotype entries equal to 3
        # (presumably the missing-genotype code -- confirm) are replaced by
        # the more frequent of the first two bins; bin 2 is never chosen.
        if bin_counts[-1] > 0:
            mode_v = sp.argmax(bin_counts[:2])
            snp[snp == 3] = mode_v

        # Update scores and move on.
        raw_effects_prs += snp * raw_beta
        # NOTE(review): the argument of sp.any(...) is missing in both
        # asserts below.
        assert not sp.any(
        ), 'Some individual raw effects risk scores are NANs (not a number).  They are corrupted.'
        pval_derived_effects_prs += snp * upd_pval_beta
        assert not sp.any(
        ), 'Some individual weighted effects risk scores are NANs (not a number).  They are corrupted.'

        # Progress report every 100000 SNPs.
        if snp_i > 0 and snp_i % 100000 == 0:
            print snp_i
            print 'Number of non-matching NTs: %d' % num_non_matching_nts
            raw_eff_r2 = (sp.corrcoef(raw_effects_prs, true_phens)[0, 1])**2
            pval_eff_r2 = (sp.corrcoef(pval_derived_effects_prs,
                                       true_phens)[0, 1])**2
            print 'Raw effects PRS r2: %0.4f' % raw_eff_r2
            print 'Weigted effects PRS r2: %0.4f' % pval_eff_r2

        snp_i += 1


    print "DONE!"
    print 'Number of non-matching NTs: %d' % num_non_matching_nts
    print 'Number of flipped NTs: %d' % num_flipped_nts
    raw_eff_corr = sp.corrcoef(raw_effects_prs, true_phens)[0, 1]
    raw_eff_r2 = raw_eff_corr**2
    pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, true_phens)[0, 1]
    pval_eff_r2 = pval_eff_corr**2

    print 'Raw effects PRS correlation: %0.4f' % raw_eff_corr
    print 'Raw effects PRS r2: %0.4f' % raw_eff_r2
    print 'Weigted effects PRS correlation: %0.4f' % pval_eff_corr
    print 'Weigted effects PRS r2: %0.4f' % pval_eff_r2

    # NOTE(review): the closing brace of this dict literal is missing from
    # the listing.
    ret_dict = {
        'raw_effects_prs': raw_effects_prs.copy(),
        'pval_derived_effects_prs': pval_derived_effects_prs.copy(),
        'true_phens': true_phens[:],
        'iids': iids

    # NOTE(review): pcs, sex and covariates are only bound when phen_map is
    # not None in the visible code; confirm they exist on the other path.
    if len(pcs) > 0:
        ret_dict['pcs'] = pcs
    if len(sex) > 0:
        ret_dict['sex'] = sex
    if len(covariates) > 0:
        ret_dict['covariates'] = covariates

    return ret_dict
Beispiel #13
def bed_to_hdf5_file(bed_file, hdf5_out):
    # Convert a PLINK file into an HDF5 file: sample information is written
    # to the group 'sample_informations', and the loci are written to one
    # group per chromosome ('chr_<n>') holding the datasets 'sids',
    # 'positions', 'snps' and 'nts_list'.
    #
    # bed_file: passed to plinkfile.PlinkFile.
    # hdf5_out: passed to h5py.File.
    #
    # NOTE(review): this listing appears to have lost lines (docstring
    # quotes, the body of the sample loop, the appends to sids / positions /
    # snps / chromosomes).  `izip` is used below, which suggests Python 2
    # itertools -- confirm the import.
	Note: It may not support all PLINK files for now

    plinkf = plinkfile.PlinkFile(bed_file)
    samples = plinkf.get_samples()
    print("Extracting sample information...")

    affections = []
    phenotypes = []
    iids = []
    fids = []
    sex = []

    ## For each sample extract the individual identifier, the family identifier,
    ## the affection, the phenotype and the sex.
    # NOTE(review): the loop body is missing; as listed, the five lists above
    # are never filled.
    for sample in samples:

    ## Number of individuals
    N = len(iids)

    # NOTE(review): the statement that consumes the message string below and
    # the `else:` before the print are missing from the listing.
    if sp.any(sp.isnan(phenotypes)):
            'Phenotypes appear to have some NaNs, or perhaps parsing failed?')
        print("%d individuals have phenotype and genotype information." % N)

    # NOTE(review): no mode is passed, so h5py's default applies (the default
    # differs between h5py releases -- confirm), and the file is not closed
    # anywhere in the visible code.
    hf = h5py.File(hdf5_out)

    ## Store sample information in HDF5 file
    sample_inf = hf.create_group('sample_informations')
    sample_inf.create_dataset('iids', data=iids)
    sample_inf.create_dataset('fids', data=fids)
    sample_inf.create_dataset('Affections', data=affections)
    sample_inf.create_dataset('Sex', data=sex)
    sample_inf.create_dataset('Phenotypes', data=phenotypes)

    print("Iterating over BED file...")

    ## Iterate through the plink file.
    locus_list = plinkf.get_loci()
    chromosomes = []
    # 0 acts as the "nothing read yet" marker for the current chromosome.
    current_chromosome = 0
    print("The current chromosome is Chr", current_chromosome)

    for locus, row in izip(locus_list, plinkf):
        ## Get the current chromosome
        chrom = locus.chromosome
        ## and store it in the chromosome vector
        # NOTE(review): the append to `chromosomes` is not visible.
        if current_chromosome == 0:
            ## Initialize data containers
            sids = []
            positions = []
            nts_list = []
            snps = []
        # A change of chromosome flushes the buffered data to the HDF5 file.
        # NOTE(review): on the first locus current_chromosome is still 0, so
        # if that locus is not on chromosome 0 a 'chr_0' group with empty
        # datasets is written -- confirm this is intended.
        if chrom != current_chromosome:
            ## Print the number of the chromosome
            print("The current chromosome is Chr", chrom)
            # Store current data in the HDF5 file
            chr_group = hf.create_group("chr_%d" % current_chromosome)
            chr_group.create_dataset("sids", data=sids)
            chr_group.create_dataset('positions', data=positions)
            chr_group.create_dataset('snps', data=sp.array(snps, dtype='int8'))
            chr_group.create_dataset('nts_list', data=nts_list)

            ## re-initialize data containers
            sids = []
            positions = []
            nts_list = []
            snps = []
            current_chromosome = chrom

        ## Get the SNP name
        # NOTE(review): the append to `sids` is not visible.
        ## Get the first and the second allele
        nts_list.append([locus.allele1, locus.allele2])
        ## Furthermore, we store the position
        # NOTE(review): the append to `positions` is not visible.

        ## Parse SNP and fill in the blanks if necessary
        snp = sp.array(row, dtype="int8")
        bin_counts = row.allele_counts()
        # When the last bin count is non-zero, entries equal to 3 (presumably
        # the missing-genotype code -- confirm) are replaced by the more
        # frequent of the first two bins.
        if bin_counts[-1] > 0:
            mode_v = sp.argmax(bin_counts[:2])
            snp[snp == 3] = mode_v

        # NOTE(review): the append of `snp` to `snps` is not visible.

    ## Store remaining data in HDF5 file
    chr_group = hf.create_group("chr_%d" % current_chromosome)
    chr_group.create_dataset("sids", data=sids)
    chr_group.create_dataset('positions', data=positions)
    chr_group.create_dataset('snps', data=sp.array(snps, dtype='int8'))
    chr_group.create_dataset('nts_list', data=nts_list)
    hf.create_dataset("Chromosomes", data=chromosomes)

    print("The parsing is completed")
Beispiel #14
def get_prs(genotype_file, rs_id_map, phen_map=None):
    # Compute two polygenic risk scores (PRS) per individual by streaming
    # over the loci / genotype rows of a PLINK file: one weighted by
    # rs_info['raw_beta'] and one by rs_info['upd_pval_beta'].  Genotypes are
    # recoded as 2 - snp before being accumulated.  Correlations of both
    # scores with true_phens are printed.
    #
    # genotype_file: passed to plinkfile.PlinkFile.
    # rs_id_map: indexed by SNP id; each entry is read for the keys 'nts',
    #     'raw_beta' and 'upd_pval_beta'.
    # phen_map: optional mapping indexed by sample.iid; its entries are
    #     probed for the keys 'pcs', 'sex' and 'covariates'.
    # Returns a dict with 'raw_effects_prs', 'pval_derived_effects_prs',
    # 'true_phens' and 'iids', plus 'pcs' / 'sex' / 'covariates' when those
    # lists are non-empty.
    #
    # NOTE(review): this listing appears to have lost lines: several `if`,
    # `try`/`except`, `print` and `else` constructs below are incomplete.
    # opp_strand_dict is not defined in this function -- presumably
    # module-level; confirm.
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()

    # 1. Figure out indiv filter and get true phenotypes
    indiv_filter = sp.zeros(len(samples), dtype='bool8')
    true_phens = []
    iids = []
    if phen_map is not None:
        pcs = []
        sex = []
        covariates = []
        phen_iids = set(phen_map.keys())
        for samp_i, sample in enumerate(samples):
            if sample.iid in phen_iids:
                indiv_filter[samp_i] = True
                # NOTE(review): the bodies of the three `if` statements below
                # are missing, and nothing visible appends to true_phens or
                # iids; presumably those appends were here.
                if 'pcs' in list(phen_map[sample.iid].keys()):
                if 'sex' in list(phen_map[sample.iid].keys()):
                if 'covariates' in list(phen_map[sample.iid].keys()):
                    # Temp hack...
                    #                     if phen_map[sample.iid]['sex']==1:
                    #                         covariates.append([phen_map[sample.iid]['covariates'][0],0])
                    #                     else:
                    #                         covariates.append([0,phen_map[sample.iid]['covariates'][0]])
        # NOTE(review): the right-hand operand of each len(...) comparison
        # below is missing from the listing.
        if len(pcs) > 0:
            assert len(pcs) == len(
            ), 'PC information missing for some individuals with phenotypes'
        if len(sex) > 0:
            assert len(sex) == len(
            ), 'Sex information missing for some individuals with phenotypes'
        if len(covariates) > 0:
            assert len(covariates) == len(
            ), 'Covariates missing for some individuals with phenotypes'
        # NOTE(review): the loop below presumably belonged to an `else:`
        # branch (phen_map is None) that is not visible; it keeps samples
        # whose affection differs from 2.
        for samp_i, sample in enumerate(samples):
            if sample.affection != 2:
                indiv_filter[samp_i] = True
                # print(sample.affection)

    num_individs = sp.sum(indiv_filter)
    assert num_individs > 0, 'Issues in parsing the phenotypes and/or PCs?'

    assert not sp.any(sp.isnan(
        true_phens)), 'Phenotypes appear to have some NaNs, or parsing failed.'

    # NOTE(review): the print call below is cut off (its format argument and
    # closing parenthesis are missing).
    print('%d individuals have phenotype and genotype information.' %

    num_non_matching_nts = 0
    num_flipped_nts = 0

    # One accumulator entry per retained individual.
    raw_effects_prs = sp.zeros(num_individs)
    pval_derived_effects_prs = sp.zeros(num_individs)
    # If these indices are not in order then we place them in the right place while parsing SNPs.
    print('Iterating over BED file to calculate risk scores.')
    locus_list = plinkf.get_loci()
    snp_i = 0

    for locus, row in zip(locus_list, plinkf):
        upd_pval_beta = 0
        # NOTE(review): the `try:` line, the right-hand side of `sid =` and
        # the handler body (presumably `continue`) are missing below.
            # Check rs-ID
            #             sid = '%d_%d'%(locus.chromosome,locus.bp_position)
            sid =
            rs_info = rs_id_map[sid]
        except Exception:  # Move on if rsID not found.

        # NOTE(review): body missing; presumably SNPs with a zero weight are
        # skipped here.
        if rs_info['upd_pval_beta'] == 0:

        # Check whether the nucleotides are OK, and potentially flip it.
        ss_nt = rs_info['nts']
        g_nt = [locus.allele1, locus.allele2]
        flip_nts = False
        os_g_nt = sp.array(
            [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
        if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
            # Opposite strand nucleotides
            flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
            if flip_nts:
                # Alleles are swapped: use the negated weights.
                raw_beta = -rs_info['raw_beta']
                upd_pval_beta = -rs_info['upd_pval_beta']
                num_flipped_nts += 1
                # NOTE(review): `else:` lines appear to be missing around the
                # statements below; as listed, the non-negated weights would
                # overwrite the negated ones, and raw_beta would be unset
                # when the nucleotides match.
                # print "Nucleotides don't match after all?: sid=%s, g_nt=%s, ss_nt=%s" % (, str(g_nt), str(ss_nt))
                num_non_matching_nts += 1
            raw_beta = rs_info['raw_beta']
            upd_pval_beta = rs_info['upd_pval_beta']

        # Parse SNP, and fill in the blanks if necessary.
        snp = sp.array(row, dtype='int8')[indiv_filter]
        bin_counts = row.allele_counts()
        # When the last bin count is non-zero, genotype entries equal to 3
        # (presumably the missing-genotype code -- confirm) are replaced by
        # the more frequent of the first two bins; bin 2 is never chosen.
        if bin_counts[-1] > 0:
            mode_v = sp.argmax(bin_counts[:2])
            snp[snp == 3] = mode_v

        ## fixed buggy code
        ## wrong encoding of genotype (A1 should be encoded as 1 instead of A2. It is different from plinkio default)
        ## original code:
        # no action
        ## new code
        # Reverses the allele count: 0 <-> 2, with 1 unchanged.
        snp = 2 - snp
        ## fix finish

        # Normalize SNP
        #         n_snp = (snp - sp.mean(snp))/sp.std(snp)

        # Update scores and move on.
        raw_effects_prs += snp * raw_beta
        assert not sp.any(
            sp.isnan(raw_effects_prs)), 'Raw effects PRS is corrupted'
        pval_derived_effects_prs += snp * upd_pval_beta
        assert not sp.any(sp.isnan(
            pval_derived_effects_prs)), 'Weighted effects PRS is corrupted'

        # Progress report every 100000 SNPs.
        if snp_i > 0 and snp_i % 100000 == 0:
            print('Number of non-matching NTs: %d' % num_non_matching_nts)
            raw_eff_r2 = (sp.corrcoef(raw_effects_prs, true_phens)[0, 1])**2
            pval_eff_r2 = (sp.corrcoef(pval_derived_effects_prs,
                                       true_phens)[0, 1])**2
            print('Raw effects PRS r2: %0.4f' % raw_eff_r2)
            print('Weigted effects PRS r2: %0.4f' % pval_eff_r2)

        snp_i += 1


    print('Number of non-matching NTs: %d' % num_non_matching_nts)
    print('Number of flipped NTs: %d' % num_flipped_nts)
    raw_eff_corr = sp.corrcoef(raw_effects_prs, true_phens)[0, 1]
    raw_eff_r2 = raw_eff_corr**2
    pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, true_phens)[0, 1]
    pval_eff_r2 = pval_eff_corr**2

    print('Raw effects PRS correlation: %0.4f' % raw_eff_corr)
    print('Raw effects PRS r2: %0.4f' % raw_eff_r2)
    print('Weigted effects PRS correlation: %0.4f' % pval_eff_corr)
    print('Weigted effects PRS r2: %0.4f' % pval_eff_r2)

    # NOTE(review): the closing brace of this dict literal is missing from
    # the listing.
    ret_dict = {
        'raw_effects_prs': raw_effects_prs.copy(),
        'pval_derived_effects_prs': pval_derived_effects_prs.copy(),
        'true_phens': true_phens[:],
        'iids': iids

    # NOTE(review): pcs, sex and covariates are only bound when phen_map is
    # not None in the visible code; confirm they exist on the other path.
    if len(pcs) > 0:
        ret_dict['pcs'] = pcs
    if len(sex) > 0:
        ret_dict['sex'] = sex
    if len(covariates) > 0:
        ret_dict['covariates'] = covariates

    return ret_dict
Beispiel #15
def coordinate_genot_ss(genotype_file=None,
    # Coordinate the SNPs of a PLINK genotype file with the summary
    # statistics stored under hdf5_file['sum_stats'], one chromosome at a
    # time, and hand the coordinated per-chromosome data to write_coord_data
    # together with the 'cord_data' HDF5 group.
    #
    # NOTE(review): the signature is cut off after the first parameter in
    # this listing.  hdf5_file, skip_coordination, debug, check_mafs, min_maf
    # and genetic_map_dir are used below and were presumably parameters;
    # util, plinkfiles and write_coord_data are presumably module-level
    # names.  Confirm against the original file.
    # NOTE(review): further lines appear to be missing throughout (`try:`,
    # loop bodies, right-hand sides, print/else lines); they are flagged
    # where they occur.
    Assumes plink BED files.  Imputes missing genotypes.
    from plinkio import plinkfile
    plinkf = plinkfile.PlinkFile(genotype_file)
    plinkf_dict = plinkfiles.get_phenotypes(plinkf)
    num_individs = plinkf_dict['num_individs']
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    # The phenotype vector is stored as dataset 'y' when one is available.
    if plinkf_dict['has_phenotype']:
        hdf5_file.create_dataset('y', data=plinkf_dict['phenotypes'])

    ssf = hdf5_file['sum_stats']

    cord_data_g = hdf5_file.create_group('cord_data')

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)

    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print('Coordinating data for chromosome %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        # NOTE(review): the `try:` that opens this lookup is missing, and the
        # handler presumably ended with `continue`; confirm.
            ssg = ssf['chrom_%d' % chrom]
        except Exception as err_str:
            print('Did not find chromosome in SS dataset.')

        # for x,y in zip(chrom_d['sids'], chrom_d['nts']):
        #     sys.stderr.write(f'{x} {y[0]} {y[1]}\n')
        # for x,y in zip(ssg['sids'], ssg['nts']):
        #     sys.stderr.write(f'{x} {y[0]} {y[1]}\n')

        # Both SNP-ID collections are required to be free of duplicates.
        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(
            g_sids), 'Some SNPs appear to be duplicated?'
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(
            ss_sids), 'Some SNPs appear to be duplicated?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)

        # NOTE(review): the bodies of the two `if` statements below are
        # missing; presumably they append g_i / ss_i to the index lists so
        # that both lists walk the shared SNPs in the same ID order.
        g_indices = []
        for g_i in g_order:
            if g_filter[g_i]:

        ss_indices = []
        for ss_i in ss_order:
            if ss_filter[ss_i]:

        g_nts = chrom_d['nts']
        snp_indices = chrom_d['snp_indices']
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(
            betas)), 'Some SNP effect estimates are NANs (not a number)'
        assert not sp.any(sp.isinf(
            betas)), 'Some SNP effect estimates are INFs (infinite numbers)'

        # Wallace: keep the genotype positions at hand; w_pos is only
        # referenced by a commented-out loop header further down.
        w_pos = chrom_d['positions']
        # -end

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        if debug:
            print('Found %d SNPs present in both datasets' % (len(g_indices)))

        if 'freqs' in ssg:
            ss_freqs = ssg['freqs'][...]

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in zip(g_indices, ss_indices):
            # for g_i, ss_i, pos_i in zip(g_indices, ss_indices, w_pos):

            # Is the nucleotide ambiguous?
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]

            # NOTE(review): each counting branch below presumably ended with
            # a `continue` that skips the SNP; those lines are not visible.
            if not skip_coordination:
                if tuple(g_nt) in util.ambig_nts:
                    num_ambig_nts += 1
                    tot_num_non_matching_nts += 1

                if (not g_nt[0] in util.valid_nts) or (not g_nt[1]
                                                       in util.valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1

                ss_nt = ss_nts[ss_i]

                # Are the nucleotides the same?
                flip_nts = False
                # NOTE(review): the array literal below is cut off; the
                # opposite-strand alleles of g_nt were presumably built here.
                os_g_nt = sp.array([
                if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                    # Opposite strand nucleotides
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0]
                                == ss_nt[1]) or (os_g_nt[1] == ss_nt[0]
                                                 and os_g_nt[0] == ss_nt[1])

                    if flip_nts:
                        # Alleles are swapped: negate the effect estimates;
                        # the summary-stat frequency is complemented only
                        # when it is positive.
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        if 'freqs' in ssg:
                            if ss_freqs[ss_i] > 0:
                                ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                        # NOTE(review): an `else:` and the call that emits
                        # the debug message appear to be missing below; as
                        # listed, flipped SNPs would be counted as
                        # non-matching.
                        # Wallace debug
                        if debug:
                                f'non match at: {g_sids[g_i]} - ssid:{ss_sids[ss_i]}, g_nt: {g_nt[0]} - {g_nt[1]}, ss_nt: {ss_nt[0]} - {ss_nt[1]}\n'
                        # End Wallace debug.

                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1


            # everything seems ok.
            # NOTE(review): nothing in the visible code appends to
            # ok_indices['g'] / ok_indices['ss'] or ok_nts; those appends
            # presumably belonged here.

        # NOTE(review): both print calls below are cut off (their format
        # arguments are missing).
        if debug:
            print('%d SNPs were excluded due to ambiguous nucleotides.' %
            print('%d SNPs were excluded due to non-matching nucleotides.' %

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        # Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])

        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        # NOTE(review): the call below is cut off after its first argument.
        raw_snps, freqs = plinkfiles.parse_plink_snps(genotype_file,
        if debug:
            print('Parsed a %dX%d (SNP) genotype matrix' %
                  (raw_snps.shape[0], raw_snps.shape[1]))

        # Mean and standard deviation of a Binomial(2, freq) allele count.
        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        sids = sids[ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and 'freqs' in ssg:
            ss_freqs = ss_freqs[ok_indices['ss']]
            # Assuming freq less than 0 is missing data
            # A SNP is flagged when the summary-stat frequency differs from
            # (1 - genotype frequency) by more than 0.15 ...
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            # Filter SNPs that doesn't have MAF info from sumstat
            # ... and only when the summary-stat frequency lies strictly
            # between 0 and 1.
            freq_discrepancy_snp = sp.logical_and(freq_discrepancy_snp,
                                                  ss_freqs > 0)
            freq_discrepancy_snp = sp.logical_and(freq_discrepancy_snp,
                                                  ss_freqs < 1)
            if sp.any(freq_discrepancy_snp):
                # NOTE(review): the call that emits this warning (presumably
                # `print(`) is missing from the listing.
                    'Warning: %d SNPs appear to have high frequency '
                    'discrepancy between summary statistics and validation sample'
                    % sp.sum(freq_discrepancy_snp))

                # Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        # Keeps SNPs whose frequency lies strictly between min_maf and
        # 1 - min_maf; every per-SNP array is reduced with the same mask.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "Problems when filtering SNPs with low minor allele frequencies"
        if sp.sum(maf_filter) < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

            print('%d SNPs with MAF < %0.3f were filtered' %
                  (n_snps - maf_filter_sum, min_maf))

        print('%d SNPs were retained on chromosome %d.' %
              (maf_filter_sum, chrom))

        # NOTE(review): the right-hand side below is garbled in this listing;
        # only its last argument (log_odds) survives.
        rb_prs =, log_odds)
        if debug and plinkf_dict['has_phenotype']:
            print('Normalizing SNPs')
            # Reshape to column vectors so that the subtraction/division
            # broadcasts across the columns of raw_snps.
            snp_means.shape = (len(raw_snps), 1)
            snp_stds.shape = (len(raw_snps), 1)
            snps = (raw_snps - snp_means) / snp_stds
            assert snps.shape == raw_snps.shape, 'Problems when normalizing SNPs (set to have variance 1 and 0 mean)'
            snp_stds = snp_stds.flatten()
            snp_means = snp_means.flatten()
            # NOTE(review): garbled right-hand side below, and the calls that
            # emit the two correlation messages are missing.
            prs =, betas)
            corr = sp.corrcoef(plinkf_dict['phenotypes'], prs)[0, 1]
                'PRS correlation for chromosome %d was %0.4f when predicting into LD ref data'
                % (chrom, corr))
            rb_corr = sp.corrcoef(plinkf_dict['phenotypes'], rb_prs)[0, 1]
                'Raw effect sizes PRS correlation for chromosome %d was %0.4f when predicting into LD ref data'
                % (chrom, rb_corr))

        sid_set = set(sids)
        # NOTE(review): the `with` statement below is garbled, the body of
        # the `if l[0] in sid_set:` test is missing, and an `else:` before
        # `genetic_map = None` appears to be missing.
        if genetic_map_dir is not None:
            genetic_map = []
            with +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
            genetic_map = None

        # NOTE(review): the closing brace of this dict literal is missing
        # from the listing.
        coord_data_dict = {
            'chrom': 'chrom_%d' % chrom,
            'raw_snps_ref': raw_snps,
            'snp_stds_ref': snp_stds,
            'snp_means_ref': snp_means,
            'freqs_ref': freqs,
            'ps': ps,
            'positions': positions,
            'nts': nts,
            'sids': sids,
            'genetic_map': genetic_map,
            'betas': betas,
            'log_odds': log_odds,
            'log_odds_prs': rb_prs

        write_coord_data(cord_data_g, coord_data_dict)

        # Genome-wide scores are accumulated across chromosomes only in
        # debug mode with a phenotype available.
        if debug and plinkf_dict['has_phenotype']:
            rb_risk_scores += rb_prs
            risk_scores += prs
        num_common_snps += len(betas)

    if debug and plinkf_dict['has_phenotype']:

        # Now calculate the prediction R^2
        corr = sp.corrcoef(plinkf_dict['phenotypes'], risk_scores)[0, 1]
        rb_corr = sp.corrcoef(plinkf_dict['phenotypes'], rb_risk_scores)[0, 1]
        # NOTE(review): the calls that emit the two messages below are
        # missing from the listing.
            'PRS R2 prediction accuracy for the whole genome was %0.4f (corr=%0.4f) when predicting into LD ref data'
            % (corr**2, corr))
            'Log-odds (effects) PRS R2 prediction accuracy for the whole genome was %0.4f (corr=%0.4f) when predicting into LD ref data'
            % (rb_corr**2, rb_corr))
    print('There were %d SNPs in common' % num_common_snps)
    # NOTE(review): the print call below is cut off (its format argument is
    # missing).
    print('In all, %d SNPs were excluded due to nucleotide issues.' %
    print('Done coordinating genotypes and summary statistics datasets.')
Beispiel #16
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
    print('Coordinating things w genotype file: %s \nref. genot. file: %s' %
          (genotype_file, reference_genotype_file))

    from plinkio import plinkfile
    plinkf = plinkfile.PlinkFile(genotype_file)

    # Loads only the individuals...
    plinkf_dict = plinkfiles.get_phenotypes(plinkf)

    # Figure out chromosomes and positions.
    if debug:
        print('Parsing validation bim file')
    loci = plinkf.get_loci()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)

    chr_dict = plinkfiles.get_chrom_dict(loci, chromosomes)

    if debug:
        print('Parsing LD reference bim file')
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()

    chr_dict_ref = plinkfiles.get_chrom_dict(loci_ref, chromosomes)

    # Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file, 'Something is wrong with the HDF5 file, no individuals IDs were found.'
    if plinkf_dict['has_phenotype']:
        hdf5_file.create_dataset('y', data=plinkf_dict['phenotypes'])

    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(plinkf_dict['num_individs'])
    num_common_snps = 0
    # corr_list = []

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    # Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print('Coordinating data for chromosome %s' % chr_str)

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
            ssg = ssf['chrom_%d' % chrom]
        except Exception as err_str:
            print('Did not find chromosome in SS dataset.')

        ssg = ssf['chrom_%d' % chrom]
        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        if debug:
                'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.'
                % (len(g_sids), len(rg_sids), len(ss_sids)))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        if debug:
                'Found %d SNPs on chrom %d that were common across all datasets'
                % (len(common_sids), chrom))

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []

        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i

        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i

        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid] = i

        for sid in common_sids:

        # order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        # order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        # Get the other two maps
        for sid in common_sids:

        for sid in common_sids:

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        ss_nts = (ssg['nts'][...]).astype(util.nts_u_dtype)
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg:
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        if debug:
                'Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d'
                % (len(g_snp_map), g_rg_nt_concord_count,
                   g_ss_nt_concord_count, rg_ss_nt_concord_count))
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        # Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in zip(g_snp_map, rg_snp_map, ss_snp_map):

            # To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[
                ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            if not skip_coordination:

                rg_nt = rg_nts[rg_i]
                ss_nt = ss_nts[ss_i]

                # Is the nucleotide ambiguous.
                g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
                if tuple(g_nt) in util.ambig_nts:
                    num_ambig_nts += 1
                    tot_num_non_matching_nts += 1

                # First check if nucleotide is sane?
                if (not g_nt[0] in util.valid_nts) or (not g_nt[1]
                                                       in util.valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1

                os_g_nt = sp.array([

                flip_nts = False
                if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and
                        (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                    if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                        flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0]
                                    == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and
                                                     os_g_nt[0] == ss_nt[1])
                        # Try flipping the SS nt
                        if flip_nts:
                            betas[ss_i] = -betas[ss_i]
                            log_odds[ss_i] = -log_odds[ss_i]
                            if 'freqs' in ssg:
                                ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                            if debug:
                                print("Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                                      (g_sids[g_i], ss_sids[ss_i], g_i,
                                       ss_i, str(g_nt), str(ss_nt)))
                            num_non_matching_nts += 1
                            tot_num_non_matching_nts += 1

                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        # Opposite strand nucleotides

            # everything seems ok.


        if debug:
            print('%d SNPs had ambiguous nucleotides.' % num_ambig_nts)
            print('%d SNPs were excluded due to nucleotide issues.' %
            print('%d SNPs were retained on chromosome %d.' %
                  (len(ok_indices['g']), chrom))

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]

        # Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices = snp_indices[ok_indices['g']]
        raw_snps, freqs = plinkfiles.parse_plink_snps(genotype_file,

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        # Pinpoint where the SNPs are in the file.
        snp_indices_ref = snp_indices_ref[ok_indices['rg']]
        raw_ref_snps, freqs_ref = plinkfiles.parse_plink_snps(
            reference_genotype_file, snp_indices_ref)

        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)
        sids = (ssg['sids'][...]).astype(util.sids_u_dtype)
        sids = sids[ok_indices['ss']]

        # Check SNP frequencies..
        if check_mafs and 'freqs' in ssg:
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(
                ss_freqs - (1 - freqs)) > 0.15  #Array of np.bool values
            if sp.any(freq_discrepancy_snp):
                    'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample'
                    % sp.sum(freq_discrepancy_snp))

                # Filter freq_discrepancy_snps
                ok_freq_snps = sp.logical_not(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "Problems when filtering SNPs with low minor allele frequencies"
        if sp.sum(maf_filter) < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

        maf_adj_prs =, raw_snps)
        if debug and plinkf_dict['has_phenotype']:
            maf_adj_corr = sp.corrcoef(plinkf_dict['phenotypes'],
                                       maf_adj_prs)[0, 1]
                'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f'
                % (chrom, maf_adj_corr))

        genetic_map = []
        if genetic_map_dir is not None:
            with +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()

#                     if l[0] in sid_set:
#                         genetic_map.append(l[0])
            genetic_map = None

        coord_data_dict = {
            'chrom': 'chrom_%d' % chrom,
            'raw_snps_ref': raw_ref_snps,
            'snp_stds_ref': snp_stds_ref,
            'snp_means_ref': snp_means_ref,
            'freqs_ref': freqs_ref,
            'ps': ps,
            'positions': positions,
            'nts': nts,
            'sids': sids,
            'genetic_map': genetic_map,
            'betas': betas,
            'log_odds': log_odds,
            'log_odds_prs': maf_adj_prs,
            'raw_snps_val': raw_snps,
            'snp_stds_val': snp_stds,
            'snp_means_val': snp_means,
            'freqs_val': freqs

        write_coord_data(cord_data_g, coord_data_dict)
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)

    # Now calculate the prediction r^2
    if debug and plinkf_dict['has_phenotype']:
        maf_adj_corr = sp.corrcoef(plinkf_dict['phenotypes'],
                                   maf_adj_risk_scores)[0, 1]
            'Log odds, per PRS correlation for the whole genome was %0.4f (r^2=%0.4f)'
            % (maf_adj_corr, maf_adj_corr**2))
        'Overall nucleotide concordance counts: g_rg: %d, g_ss: %d, rg_ss: %d'
        % (tot_g_rg_nt_concord_count, tot_g_ss_nt_concord_count,
    print('There were %d SNPs in common' % num_common_snps)
    print('In all, %d SNPs were excluded due to nucleotide issues.' %