Example #1
def save_montage(NIFTI, ANAT, ONAME, SGN):

    nifti = load_image(NIFTI)
    anat = load_image(ANAT)

    imax = nifti.get_data().max()
    imin = nifti.get_data().min()

    imshow_args = {'vmax': imax, 'vmin': imin}

    mcmap = cmaps[SGN + 1]

    num_features = nifti.shape[-1]
    y = max([1, int(round(sqrt(num_features / 3)))])
    x = int(ceil(num_features / y) + 1)

    font = {'size': 8}
    rc('font', **font)

    f = figure(figsize=[iscale * y, iscale * x / 3])
    subplots_adjust(left=0.01,
                    right=0.99,
                    bottom=0.01,
                    top=0.99,
                    wspace=0.1,
                    hspace=0)

    for i in range(0, num_features):
        data = nifti.get_data()[:, :, :, i]
        data[sign(data) == negative(SGN)] = 0
        if max(abs(data.flatten())) > thr + 0.2:
            ax = subplot(x, y, i + 1)
            max_idx = np.unravel_index(argmax(data), data.shape)
            plot_map(data,
                     xyz_affine(nifti),
                     anat=anat.get_data(),
                     anat_affine=xyz_affine(anat),
                     black_bg=True,
                     threshold=thr,
                     cut_coords=coord_transform(max_idx[0], max_idx[1],
                                                max_idx[2], xyz_affine(nifti)),
                     annotate=False,
                     axes=ax,
                     cmap=mcmap,
                     draw_cross=False,
                     **imshow_args)
            text(0.,
                 0.95,
                 str(i),
                 transform=ax.transAxes,
                 horizontalalignment='center',
                 color=(1, 1, 1))
    savefig(ONAME, facecolor=(0, 0, 0))
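A note on the masking trick above: `sign(data) == negative(SGN)` selects the voxels whose sign is opposite to SGN (for SGN = 1 the negative voxels, for SGN = -1 the positive ones), and those are zeroed before plotting. A minimal NumPy sketch of the same idiom, with made-up values:

import numpy as np

SGN = 1                                       # keep the positive component only
data = np.array([-2.0, -0.5, 0.0, 0.7, 3.0])
data[np.sign(data) == np.negative(SGN)] = 0   # zero out the opposite-signed entries
print(data)                                   # [ 0.   0.   0.   0.7  3. ]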
Example #2
def calc_ibd_kinship(snps, dtype='single', scaled=True):
    num_snps = len(snps)
    n_indivs = len(snps[0])
    k_mat = sp.zeros((n_indivs, n_indivs), dtype=dtype)
    for chunk_i, i in enumerate(range(0, num_snps, n_indivs)):
        snps_array = sp.array(snps[i:i + n_indivs])
        snps_array = snps_array.T
        norm_snps_array = (snps_array - sp.mean(snps_array, 0)) / sp.std(snps_array, 0)
        assert sp.all(sp.negative(sp.isnan(norm_snps_array))), 'WTF?'
        x = sp.mat(norm_snps_array.T)
        k_mat += x.T * x
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (min(1, ((chunk_i + 1.0) * n_indivs) / num_snps))))
        sys.stdout.flush()
    k_mat = k_mat / float(num_snps)
    if scaled:
        k_mat = scale_k(k_mat)
    return k_mat
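The `sp.negative(sp.isnan(...))` assertion above relies on old NumPy behaviour, where negating a boolean array acted as element-wise NOT; recent NumPy raises a TypeError for this. A hedged modern equivalent of the same sanity check:

import numpy as np

norm_snps = np.array([[0.5, -1.2], [0.3, 0.8]])
# np.logical_not (or the ~ operator) replaces the old negative-as-NOT idiom
assert np.all(np.logical_not(np.isnan(norm_snps))), 'NaNs after normalization'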
Example #4
def parse_1KG_snp_info(input_file='/project/TheHonestGene/faststorage/1Kgenomes/phase3/1k_genomes_hg.hdf5',
                       out_file='/project/PCMA/faststorage/1_DATA/1k_genomes/1K_SNP_INFO_EUR_MAF0.05.hdf5',
                       filter_ambiguous=True,
                       maf_thres=0.05):
    print 'Generating a SNP info file'
    ih5f = h5py.File(input_file)
    oh5f = h5py.File(out_file)
    num_indivs = len(ih5f['indivs']['continent'])
    eur_filter = ih5f['indivs']['continent'][...] == 'EUR'
    num_eur_indivs = sp.sum(eur_filter)
    print 'Number of European individuals: %d \nTotal number of individuals: %d' % (num_eur_indivs, num_indivs)
    std_thres = sp.sqrt(2.0 * (1 - maf_thres) * (maf_thres))

    for chrom in range(1, 23):
        print 'Working on Chromosome %d' % chrom
        chrom_str = 'chr%d' % chrom
        
        print 'Loading SNPs and data'
        snps = sp.array(ih5f[chrom_str]['calldata']['snps'][...], dtype='int8')
        print 'Excluding non-European individuals'
        snps = snps[:, eur_filter]

        print "Loading other SNP information"
        snp_ids = ih5f[chrom_str]['variants']['ID'][...]
        positions = ih5f[chrom_str]['variants']['POS'][...]

        print 'Loading NTs'
        ref_nts = ih5f[chrom_str]['variants']['REF'][...]
        alt_nts = ih5f[chrom_str]['variants']['ALT'][...]
        
        print 'Filtering multi-allelic SNPs'
        multi_allelic_filter = sp.negative(ih5f[chrom_str]['variants']['MULTI_ALLELIC'][...])
        snps = snps[multi_allelic_filter]
        ref_nts = ref_nts[multi_allelic_filter]
        alt_nts = alt_nts[multi_allelic_filter]
        snp_ids = snp_ids[multi_allelic_filter]
        positions = positions[multi_allelic_filter]
        
        print 'Filter SNPs with missing NT information'
        nt_filter = sp.in1d(ref_nts, ok_nts)
        nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
        if sp.sum(nt_filter) < len(nt_filter):
            snps = snps[nt_filter]
            ref_nts = ref_nts[nt_filter]
            alt_nts = alt_nts[nt_filter]
            snp_ids = snp_ids[nt_filter]
            positions = positions[nt_filter]

        print 'Filtering SNPs with MAF <', maf_thres
        afs = sp.sum(snps, axis=1) / (2.0 * num_eur_indivs)  # diploid 0/1/2 genotypes: float division by 2N
        assert sp.all(0 <= afs) and sp.all(afs <= 1), 'AF is out of range'
        mafs = sp.minimum(afs, 1 - afs)
        maf_filter = mafs >= maf_thres  # keep SNPs at or above the MAF threshold
        snps = snps[maf_filter]
        ref_nts = ref_nts[maf_filter]
        alt_nts = alt_nts[maf_filter]
        snp_ids = snp_ids[maf_filter]
        positions = positions[maf_filter]
        mafs = mafs[maf_filter]
        
    
        g = oh5f.create_group(chrom_str)
        g.create_dataset('sids', data=snp_ids)
        g.create_dataset('positions', data=positions)
        g.create_dataset('eur_mafs', data=mafs)
        g.create_dataset('ref', data=ref_nts)
        g.create_dataset('alt', data=alt_nts)
        oh5f.flush()
    oh5f.close()
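Example #4 computes `std_thres = sqrt(2.0 * (1 - maf_thres) * maf_thres)` (it is put to use in Example #9): under Hardy-Weinberg, a 0/1/2 genotype with allele frequency p has variance 2p(1 - p), so thresholding the per-SNP standard deviation is a cheap stand-in for a MAF filter. A small check of that relation:

import numpy as np

maf_thres = 0.05
std_thres = np.sqrt(2.0 * (1 - maf_thres) * maf_thres)  # std of Binomial(2, p) at p = maf_thres
p = np.array([0.01, 0.05, 0.25, 0.5])
print(np.sqrt(2 * p * (1 - p)) > std_thres)             # [False False  True  True]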
Example #5
def get_kinships(snps_file='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/new_snps.HDF5',
                 plot_figures = True, 
                 figure_dir = 'C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode',
                 fig_id = 'all',
                 min_maf = 0.1,
                 max_strain_num=200):
    """
    Calculates the kinship
    """
    h5f = h5py.File(snps_file)
    gene_groups = h5f.keys()
    all_strains = set()
    for gg in gene_groups:
        data_g = h5f[gg]
        strains = data_g['strains'][...]
        if len(strains)<max_strain_num:
            all_strains = set(strains).union(all_strains)
    num_strains = len(all_strains)
    print 'Found %d "distinct" strains'%num_strains
    
    ordered_strains = sorted(list(all_strains))
    strain_index = pd.Index(ordered_strains)
    K_snps = sp.zeros((num_strains,num_strains))
    counts_mat_snps = sp.zeros((num_strains,num_strains))
    K_codon_snps = sp.zeros((num_strains,num_strains))
    counts_mat_codon_snps = sp.zeros((num_strains,num_strains))
        
    K_nonsyn_snps = sp.zeros((num_strains,num_strains))
    counts_mat_nonsyn_snps = sp.zeros((num_strains,num_strains))
    
    K_syn_snps = sp.zeros((num_strains,num_strains))
    counts_mat_syn_snps = sp.zeros((num_strains,num_strains))

    for i, gg in enumerate(gene_groups):
        if i%100==0:
            print 'Working on gene nr. %d'%i 
        data_g = h5f[gg]
        strains = data_g['strains'][...]
        if len(strains)<max_strain_num:
            strain_mask = strain_index.get_indexer(strains)
            
            snps = data_g['norm_snps'][...]
            freqs = data_g['freqs'][...]
            mafs = sp.minimum(freqs,1-freqs)
            maf_mask = mafs>min_maf
            snps = snps[maf_mask]
            if len(snps)==0:
                continue
            K_snps_slice = K_snps[strain_mask]
            K_snps_slice[:,strain_mask] += sp.dot(snps.T,snps)
            K_snps[strain_mask] = K_snps_slice
            counts_mat_snps_slice = counts_mat_snps[strain_mask]
            counts_mat_snps_slice[:,strain_mask] += len(snps)
            counts_mat_snps[strain_mask] = counts_mat_snps_slice
    
            codon_snps = data_g['norm_codon_snps'][...]
            if len(codon_snps)==0:
                continue
            freqs = data_g['codon_snp_freqs'][...]
            mafs = sp.minimum(freqs,1-freqs)
            maf_mask = mafs>min_maf
            codon_snps = codon_snps[maf_mask]
            is_synonimous_snp = data_g['is_synonimous_snp'][...]
            is_synonimous_snp = is_synonimous_snp[maf_mask]
            if len(codon_snps)>0:
                K_codon_snps_slice = K_codon_snps[strain_mask]
                K_codon_snps_slice[:,strain_mask] += sp.dot(codon_snps.T,codon_snps)
                K_codon_snps[strain_mask] = K_codon_snps_slice
                counts_mat_codon_snps_slice = counts_mat_codon_snps[strain_mask]
                counts_mat_codon_snps_slice[:,strain_mask] += len(codon_snps)
                counts_mat_codon_snps[strain_mask] = counts_mat_codon_snps_slice
        
        
                
                if sp.sum(is_synonimous_snp)>0:
                    syn_snps = codon_snps[is_synonimous_snp]
                    K_syn_snps_slice = K_syn_snps[strain_mask]
                    K_syn_snps_slice[:,strain_mask] += sp.dot(syn_snps.T,syn_snps)
                    K_syn_snps[strain_mask] = K_syn_snps_slice
                    counts_mat_syn_snps_slice = counts_mat_syn_snps[strain_mask]
                    counts_mat_syn_snps_slice[:,strain_mask] += len(syn_snps)
                    counts_mat_syn_snps[strain_mask] = counts_mat_syn_snps_slice
            
                is_nonsynonimous_snp = sp.negative(is_synonimous_snp)
                if sp.sum(is_nonsynonimous_snp)>0:
                    nonsyn_snps = codon_snps[is_nonsynonimous_snp]                
                    K_nonsyn_snps_slice = K_nonsyn_snps[strain_mask]
                    K_nonsyn_snps_slice[:,strain_mask] += sp.dot(nonsyn_snps.T,nonsyn_snps)
                    K_nonsyn_snps[strain_mask] = K_nonsyn_snps_slice
                    counts_mat_nonsyn_snps_slice = counts_mat_nonsyn_snps[strain_mask]
                    counts_mat_nonsyn_snps_slice[:,strain_mask] += len(nonsyn_snps)
                    counts_mat_nonsyn_snps[strain_mask] = counts_mat_nonsyn_snps_slice

    
    
    K_snps  = K_snps/counts_mat_snps  #element-wise division
    K_codon_snps  = K_codon_snps/counts_mat_codon_snps  #element-wise division

    K_syn_snps  = K_syn_snps/counts_mat_syn_snps  #element-wise division
    K_nonsyn_snps  = K_nonsyn_snps/counts_mat_nonsyn_snps  #element-wise division

    if plot_figures:
        plot_dirty_PCA(K_snps,figure_fn='PCA34_all_snps_%s.pdf'%fig_id, k_figure_fn='K_all_snps_%s.png'%fig_id, 
                       figure_dir=figure_dir, strains=ordered_strains, title='All SNPs')
        plot_dirty_PCA(K_codon_snps,figure_fn='PCA34_codon_snps_%s.pdf'%fig_id, k_figure_fn='K_codon_snps_%s.png'%fig_id, 
                       figure_dir=figure_dir, strains=ordered_strains, title='Codon SNPs')
        plot_dirty_PCA(K_syn_snps,figure_fn='PCA34_syn_snps_%s.pdf'%fig_id, k_figure_fn='K_syn_snps_%s.png'%fig_id, 
                       figure_dir=figure_dir, strains=ordered_strains, title='Synonymous SNPs')
        plot_dirty_PCA(K_nonsyn_snps,figure_fn='PCA_34nonsyn_snps_%s.pdf'%fig_id, k_figure_fn='K_nonsyn_snps_%s.png'%fig_id, 
                       figure_dir=figure_dir, strains=ordered_strains, title='Non-Synonymous SNPs')

    print 'Average number of SNPs: %0.2f.'%sp.mean(counts_mat_snps)
    print 'Average number of codon SNPs: %0.2f.'%sp.mean(counts_mat_codon_snps)
    print 'Average number of synonymous SNPs: %0.2f.'%sp.mean(counts_mat_syn_snps)
    print 'Average number of non-synonymous SNPs: %0.2f.'%sp.mean(counts_mat_nonsyn_snps)
    
    return {'K_snps':K_snps, 'K_codon_snps':K_codon_snps, 'counts_mat_snps':counts_mat_snps, 'counts_mat_codon_snps':counts_mat_codon_snps,
            'K_syn_snps':K_syn_snps, 'K_nonsyn_snps':K_nonsyn_snps, 'counts_mat_syn_snps':counts_mat_syn_snps, 'counts_mat_nonsyn_snps':counts_mat_nonsyn_snps,
            'strains':ordered_strains}
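The slice-copy-assign dance above (`K_snps_slice = K_snps[strain_mask]; K_snps_slice[:, strain_mask] += ...; K_snps[strain_mask] = K_snps_slice`) is needed because chained fancy indexing such as `K[mask][:, mask] += X` operates on a copy and silently discards the update. A sketch of the same submatrix update done in one step with `np.ix_`:

import numpy as np

K = np.zeros((5, 5))
idx = np.array([0, 2, 4])     # integer positions of the strains present in this gene
X = np.ones((3, 3))
K[np.ix_(idx, idx)] += X      # open-mesh indexing updates the 3x3 submatrix in place
print(K.sum())                # 9.0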
Example #6
def leave_k_out_blup(num_repeats=20, num_cvs=5, genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/', k_thres=0.5):
    """

    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    rep_dict = {}
    for rep_i in range(num_repeats):
        res_dict = {}
        for phenotype in phenotypes:
            env_dict = {}
            for env in envs:
                print phenotype, env
                s1 = time.time()
                # Load data..
                d = hdf5_data.coordinate_cegs_genotype_phenotype(
                    phen_dict, phenotype, env, k_thres=k_thres)
                Y_means = d['Y_means']
                snps = d['snps']
                assert sp.all(sp.negative(sp.isnan(snps))), 'WTF?'
                K = kinship.calc_ibd_kinship(snps)
                print '\nKinship calculated'
                assert sp.all(sp.negative(sp.isnan(K))), 'WTF?'
                n = len(Y_means)
                # partition genotypes in k parts.
                gt_ids = d['gt_ids']
                num_ids = len(gt_ids)
                chunk_size = num_ids / num_cvs

                # Create k CV sets of prediction and validation data

                cv_chunk_size = int((n / num_cvs) + 1)
                ordering = sp.random.permutation(n)

                a = sp.arange(n)
                osb_ys = []
                pred_ys = []
                p_herits = []
                for cv_i, i in enumerate(range(0, n, cv_chunk_size)):
                    cv_str = 'cv_%d' % cv_i
                    # print 'Working on CV %d' % cv_i
                    end_i = min(n, i + cv_chunk_size)
                    validation_filter = sp.in1d(a, ordering[i:end_i])
                    training_filter = sp.negative(validation_filter)

                    train_snps = snps[:, training_filter]
                    val_snps = snps[:, validation_filter]

                    train_Y = Y_means[training_filter]
                    val_Y = Y_means[validation_filter]

                    #Calc. kinship
                    K_train = K[training_filter, :][:, training_filter]
                    K_cross = K[validation_filter, :][:, training_filter]
                    # Do gBLUP
                    lmm = lm.LinearMixedModel(train_Y)
                    lmm.add_random_effect(K_train)
                    r1 = lmm.get_REML()

                    # Now the BLUP.
                    y_mean = sp.mean(lmm.Y)
                    Y = lmm.Y - y_mean
                    p_herit = r1['pseudo_heritability']
                    p_herits.append(p_herit)
                    #delta = (1 - p_herit) / p_herit
            #        if K_inverse == None:
            #            K_inverse = K.I
            #        M = (sp.eye(K.shape[0]) + delta * K_inverse)
            #        u_blup = M.I * Y
                    M = sp.mat(p_herit * sp.mat(K_train) +
                               (1 - p_herit) * sp.eye(K_train.shape[0]))
                    u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                    osb_ys.extend(val_Y)
                    pred_ys.extend(u_mean_pred)
                corr = sp.corrcoef(osb_ys, pred_ys)[1, 0]
                print 'Correlation:', corr
                r2 = corr**2
                print 'R2:', r2
                mean_herit = sp.mean(p_herits)
                print 'Avg. heritability:', mean_herit
                env_dict[env] = {'R2': r2, 'obs_y': osb_ys,
                                 'pred_y': pred_ys, 'corr': corr, 'avg_herit': mean_herit}

            res_dict[phenotype] = env_dict
        rep_dict[rep_i] = res_dict
    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    h5f = h5py.File(res_hdf5_file)
    for rep_i in range(num_repeats):
        res_dict = rep_dict[rep_i]
        rep_g = h5f.create_group('repl_%d' % rep_i)
        for phenotype in phenotypes:
            phen_g = rep_g.create_group(phenotype)
            for env in envs:
                d = res_dict[phenotype][env]
                env_g = phen_g.create_group(env)
                env_g.create_dataset('R2',  data=[d['R2']])
                env_g.create_dataset('corr',  data=[d['corr']])
                env_g.create_dataset('obs_y',  data=d['obs_y'])
                env_g.create_dataset('pred_y',  data=d['pred_y'])
                env_g.create_dataset('avg_herit',  data=[d['avg_herit']])
    h5f.close()
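The BLUP step above implements the standard mixed-model predictor: with pseudo-heritability h2, the predictions for held-out lines are K_cross (h2 K_train + (1 - h2) I)^-1 (y - mean(y)), which is exactly what the `M` and `u_mean_pred` lines compute. A self-contained sketch with simulated data:

import numpy as np

np.random.seed(0)
n_train, n_val, h2 = 8, 3, 0.6
G = np.random.randn(n_train + n_val, 50)
K = np.dot(G, G.T) / 50.0                      # toy kinship from simulated markers
K_train = K[:n_train, :n_train]
K_cross = K[n_train:, :n_train]
y = np.random.randn(n_train)
M = h2 * K_train + (1 - h2) * np.eye(n_train)  # h2 * K + (1 - h2) * I
u_pred = np.dot(K_cross, np.linalg.solve(M, y - y.mean()))
print(u_pred.shape)                            # (3,)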
Beispiel #7
0
def coordinate_cegs_genotype_phenotype(
    phen_dict,
    phenotype='Protein',
    env='mated',
    k_thres=0.8,
    ind_missing_thres=0.5,
    snp_missing_thres=0.05,
    maf_thres=0.1,
    genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5'
):
    """
    Parse genotypes and coordinate with phenotype, and ready data for analysis.
    """
    gh5f = h5py.File(genotype_file)
    p_dict = phen_dict[phenotype][env]
    print 'Loading SNPs'
    snps = sp.array(gh5f['gt'][...], dtype='single')
    snps = snps[:, p_dict['ind_filter']]
    positions = gh5f['pos'][...]
    m, n = snps.shape
    print 'Loaded %d SNPs for %d individuals' % (m, n)
    print 'Filtering individuals with missing rates >%0.2f' % ind_missing_thres
    missing_mat = sp.isnan(snps)
    ind_missing_rates = sp.sum(missing_mat, 0) / float(m)
    ind_filter = ind_missing_rates < ind_missing_thres
    snps = snps[:, ind_filter]
    n = sp.sum(ind_filter)
    print 'Filtered %d individuals due to high missing rates' % sp.sum(
        sp.negative(ind_filter))
    gt_ids = gh5f['gt_ids'][p_dict['ind_filter']]
    gt_ids = gt_ids[ind_filter]
    Y_means = p_dict['Y_means'][p_dict['ind_filter']]
    Y_means = Y_means[ind_filter]
    Y_medians = p_dict['Y_medians'][p_dict['ind_filter']]
    Y_medians = Y_medians[ind_filter]
    rep_count = p_dict['rep_count'][p_dict['ind_filter']]
    rep_count = rep_count[ind_filter]

    print 'Now removing "bad" genotypes.'
    bad_genotypes = [
        'Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591',
        'Raleigh_398', 'Raleigh_138', 'Raleigh_208', 'Raleigh_336',
        'Raleigh_370', 'Raleigh_373', 'Raleigh_374', 'Raleigh_799',
        'Raleigh_821', 'Raleigh_822', 'Raleigh_884', 'Raleigh_335'
    ]
    ind_filter = sp.negative(sp.in1d(gt_ids, bad_genotypes))
    gt_ids = gt_ids[ind_filter]
    Y_means = Y_means[ind_filter]
    Y_medians = Y_medians[ind_filter]
    rep_count = rep_count[ind_filter]
    snps = snps[:, ind_filter]
    print 'Removed %d "bad" genotypes' % sp.sum(sp.negative(ind_filter))

    n = len(snps[0])
    print 'Filtering SNPs with missing rate >%0.2f' % snp_missing_thres
    missing_mat = sp.isnan(snps)
    snp_missing_rates = sp.sum(missing_mat, 1) / float(n)
    snps_filter = snp_missing_rates < snp_missing_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    m = sp.sum(snps_filter)
    print 'Filtered %d SNPs due to high missing rate' % sp.sum(
        sp.negative(snps_filter))

    print 'Now imputing (w mean)'
    missing_mat = sp.isnan(snps)
    ok_counts = n - sp.sum(missing_mat, 1)
    snps[missing_mat] = 0
    snp_means = sp.sum(snps, 1) / ok_counts
    #     print snp_means.shape
    #     print snp_means[:10]
    #     import pdb
    #     pdb.set_trace()
    for i in range(len(snps)):
        snps[i, missing_mat[i]] = snp_means[i]

    print 'And filtering SNPs with MAF<%0.2f' % maf_thres
    snp_means = sp.mean(snps, 1)
    snp_mafs = sp.minimum(snp_means, 1 - snp_means)
    snps_filter = snp_mafs > maf_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print 'Filtered %d SNPs with low MAFs' % sp.sum(sp.negative(snps_filter))

    print 'Filtering based on kinship w threshold:', k_thres
    import kinship
    K = kinship.calc_ibd_kinship(snps)
    print '\nKinship calculated'
    K_ind_filter = []
    for i in range(n):
        K_ind_filter.append(not sp.any(K[i, i + 1:n] > k_thres))
    if sum(K_ind_filter) == n:
        print 'No individuals were filtered based on kinship..'
    else:
        print 'Filtering %d individuals based on kinship.' % (
            n - sum(K_ind_filter))
        K_ind_filter = sp.array(K_ind_filter)
        gt_ids = gt_ids[K_ind_filter]
        Y_means = Y_means[K_ind_filter]
        Y_medians = Y_medians[K_ind_filter]
        rep_count = rep_count[K_ind_filter]
        snps = snps[:, K_ind_filter]

        print 'Again filtering SNPs with MAF<%0.2f' % maf_thres
        snp_means = sp.mean(snps, 1)
        snp_mafs = sp.minimum(snp_means, 1 - snp_means)
        snps_filter = snp_mafs > maf_thres
        snps = snps[snps_filter]
        positions = positions[snps_filter]
        print 'Filtered %d additional SNPs with low MAFs' % sp.sum(
            sp.negative(snps_filter))

    print 'All filtering done.'

    m, n = snps.shape
    print 'In all there are %d SNPs remaining, for %d individuals.' % (m, n)

    ret_dict = {
        'Y_means': Y_means,
        'Y_medians': Y_medians,
        'rep_count': rep_count,
        'gt_ids': gt_ids,
        'positions': positions,
        'snps': snps
    }

    return ret_dict
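The imputation step above first zeroes the missing calls, divides each row sum by the number of observed genotypes to get per-SNP means, and then writes those means back into the gaps. The same mean imputation in miniature:

import numpy as np

snps = np.array([[0., 1., np.nan, 2.],
                 [np.nan, np.nan, 1., 1.]])
missing_mat = np.isnan(snps)
ok_counts = snps.shape[1] - missing_mat.sum(1)  # observed calls per SNP
snps[missing_mat] = 0
snp_means = snps.sum(1) / ok_counts             # mean over observed calls only
for i in range(len(snps)):
    snps[i, missing_mat[i]] = snp_means[i]      # fill the gaps with the row mean
print(snps)                                     # former NaNs now hold 1.0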
Example #8
def obj(z):
    x_pts, y_pts = z2xy(z)  # z2xy (defined elsewhere in the source project) maps z to curve points
    area = sp.integrate.trapz(y_pts, x=x_pts)  # trapezoidal-rule area under the curve
    return sp.negative(area)  # negate so that a minimizer maximizes the area
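Returning `sp.negative(area)` turns an area-maximization problem into the minimization form SciPy's optimizers expect. A hedged sketch with a trivial stand-in for `z2xy` (a rectangle of perimeter 4 with sides z and 2 - z):

import numpy as np
from scipy.optimize import minimize_scalar

def obj(z):
    area = z * (2.0 - z)       # toy area to maximize
    return np.negative(area)   # negate so that minimizing maximizes

res = minimize_scalar(obj, bounds=(0.0, 2.0), method='bounded')
print(round(res.x, 3))         # 1.0 -- the square maximizes the area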
Example #9
def gen_unrelated_eur_1k_data(
        input_file='/home/bjarni/TheHonestGene/faststorage/1Kgenomes/phase3/1k_genomes_hg.hdf5',
        out_file='/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5',
        maf_thres=0.01,
        max_relatedness=0.05,
        K_thinning_frac=0.1,
        debug=False):
    h5f = h5py.File(input_file)
    num_indivs = len(h5f['indivs']['continent'])
    eur_filter = h5f['indivs']['continent'][...] == 'EUR'
    num_eur_indivs = sp.sum(eur_filter)
    print 'Number of European individuals: %d' % num_eur_indivs
    K = sp.zeros((num_eur_indivs, num_eur_indivs), dtype='float64')
    num_snps = 0
    std_thres = sp.sqrt(2.0 * (1 - maf_thres) * (maf_thres))

    print 'Calculating kinship'
    for chrom in range(1, 23):
        print 'Working on Chromosome %d' % chrom
        chrom_str = 'chr%d' % chrom

        print 'Loading SNPs and data'
        snps = sp.array(h5f[chrom_str]['calldata']['snps'][...], dtype='int8')

        print 'Loading NTs'
        ref_nts = h5f[chrom_str]['variants']['REF'][...]
        alt_nts = h5f[chrom_str]['variants']['ALT'][...]

        print 'Filtering multi-allelic SNPs'
        multi_allelic_filter = sp.negative(
            h5f[chrom_str]['variants']['MULTI_ALLELIC'][...])
        snps = snps[multi_allelic_filter]
        ref_nts = ref_nts[multi_allelic_filter]
        alt_nts = alt_nts[multi_allelic_filter]

        if K_thinning_frac < 1:
            print 'Thinning SNPs for kinship calculation'
            thinning_filter = sp.random.random(len(snps)) < K_thinning_frac
            snps = snps[thinning_filter]
            alt_nts = alt_nts[thinning_filter]
            ref_nts = ref_nts[thinning_filter]

        print 'Filter SNPs with missing NT information'
        nt_filter = sp.in1d(ref_nts, ok_nts)
        nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
        if sp.sum(nt_filter) < len(nt_filter):
            snps = snps[nt_filter]

        print 'Filtering non-European individuals'
        snps = snps[:, eur_filter]

        print 'Filtering SNPs with MAF <', maf_thres
        snp_stds = sp.std(snps, 1)
        maf_filter = snp_stds.flatten() > std_thres
        snps = snps[maf_filter]
        snp_stds = snp_stds[maf_filter]

        print '%d SNPs remaining after all filtering steps.' % len(snps)

        print 'Normalizing SNPs'
        snp_means = sp.mean(snps, 1)
        norm_snps = (snps - snp_means[sp.newaxis].T) / snp_stds[sp.newaxis].T

        print 'Updating kinship'
        K += sp.dot(norm_snps.T, norm_snps)
        num_snps += len(norm_snps)
        assert sp.isclose(
            sp.sum(sp.diag(K)) / (num_snps * num_eur_indivs), 1.0)

    K = K / float(num_snps)
    print 'Kinship calculation done using %d SNPs\n' % num_snps

    # Filter individuals
    print 'Filtering individuals'
    keep_indiv_set = set(range(num_eur_indivs))
    for i in range(num_eur_indivs):
        if i in keep_indiv_set:
            for j in range(i + 1, num_eur_indivs):
                if K[i, j] > max_relatedness:
                    if j in keep_indiv_set:
                        keep_indiv_set.remove(j)
    keep_indivs = list(keep_indiv_set)
    keep_indivs.sort()
    print 'Retained %d individuals\n' % len(keep_indivs)

    # Checking that everything is ok!
    K_ok = K[keep_indivs]
    K_ok = K_ok[:, keep_indivs]
    assert (K_ok - sp.tril(K_ok)).max() < max_relatedness

    indiv_filter = sp.zeros(num_indivs, dtype='bool8')
    indiv_filter[(sp.arange(num_indivs)[eur_filter])[keep_indivs]] = 1

    assert sp.sum(indiv_filter) == len(keep_indivs)

    # Store in new file
    print 'Now storing data.'
    oh5f = h5py.File(out_file, 'w')
    indiv_ids = h5f['indivs']['indiv_ids'][indiv_filter]
    oh5f.create_dataset('indiv_ids', data=indiv_ids)
    for chrom in range(1, 23):
        print 'Working on Chromosome %d' % chrom
        chrom_str = 'chr%d' % chrom

        print 'Loading SNPs and data'
        snps = sp.array(h5f[chrom_str]['calldata']['snps'][...], dtype='int8')
        snp_ids = h5f[chrom_str]['variants']['ID'][...]
        positions = h5f[chrom_str]['variants']['POS'][...]

        print 'Loading NTs'
        ref_nts = h5f[chrom_str]['variants']['REF'][...]
        alt_nts = h5f[chrom_str]['variants']['ALT'][...]

        print 'Filtering multi-allelic SNPs'
        multi_allelic_filter = sp.negative(
            h5f[chrom_str]['variants']['MULTI_ALLELIC'][...])
        snps = snps[multi_allelic_filter]
        ref_nts = ref_nts[multi_allelic_filter]
        alt_nts = alt_nts[multi_allelic_filter]
        positions = positions[multi_allelic_filter]
        snp_ids = snp_ids[multi_allelic_filter]

        print 'Filter individuals'
        snps = snps[:, indiv_filter]

        print 'Filter SNPs with missing NT information'
        nt_filter = sp.in1d(ref_nts, ok_nts)
        nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
        if sp.sum(nt_filter) < len(nt_filter):
            snps = snps[nt_filter]
            ref_nts = ref_nts[nt_filter]
            alt_nts = alt_nts[nt_filter]
            positions = positions[nt_filter]
            snp_ids = snp_ids[nt_filter]

        print 'filter monomorphic SNPs'
        snp_stds = sp.std(snps, 1)
        mono_morph_filter = snp_stds > 0
        snps = snps[mono_morph_filter]
        ref_nts = ref_nts[mono_morph_filter]
        alt_nts = alt_nts[mono_morph_filter]
        positions = positions[mono_morph_filter]
        snp_ids = snp_ids[mono_morph_filter]
        snp_stds = snp_stds[mono_morph_filter]

        snp_means = sp.mean(snps, 1)

        if debug:
            # Default to the full SNP set so k_snps is defined even without thinning
            k_snps = snps
            k_snp_stds = snp_stds
            if K_thinning_frac < 1:
                print 'Thinning SNPs for kinship calculation'
                thinning_filter = sp.random.random(len(snps)) < K_thinning_frac
                k_snps = snps[thinning_filter]
                k_snp_stds = snp_stds[thinning_filter]

            print 'Filtering SNPs with MAF <', maf_thres
            maf_filter = k_snp_stds.flatten() > std_thres
            k_snps = k_snps[maf_filter]
            k_snp_stds = k_snp_stds[maf_filter]
            k_snp_means = sp.mean(k_snps, 1)  # per-SNP means (axis=1), to match the per-SNP stds

            print 'Verifying that the Kinship makes sense'
            norm_snps = (k_snps -
                         k_snp_means[sp.newaxis].T) / k_snp_stds[sp.newaxis].T
            K = sp.dot(norm_snps.T, norm_snps) / float(len(norm_snps))
            # The diagonal of a normalized kinship should average ~1, and the
            # retained individuals should stay below the relatedness cut-off.
            if sp.isclose(sp.mean(sp.diag(K)), 1.0) and (
                    K - sp.tril(K)).max() < (max_relatedness * 1.5):
                print 'It looks OK!'
            else:
                raise Exception('Kinship looks wrong?')

        nts = sp.array([[nt1, nt2] for nt1, nt2 in izip(ref_nts, alt_nts)])

        print 'Writing to disk'
        cg = oh5f.create_group(chrom_str)
        cg.create_dataset('snps', data=snps)
        cg.create_dataset('snp_means', data=snp_means[sp.newaxis].T)
        cg.create_dataset('snp_stds', data=snp_stds[sp.newaxis].T)
        cg.create_dataset('snp_ids', data=snp_ids)
        cg.create_dataset('positions', data=positions)
        cg.create_dataset('nts', data=nts)
        oh5f.flush()
        print 'Done writing to disk'


#         centimorgans = h5f[chrom_str]['centimorgans'][...]
#         cg.create_dataset('centimorgans',data=centimorgans)
#
#         centimorgan_rates = h5f[chrom_str]['centimorgan_rates'][...]
#         cg.create_dataset('centimorgan_rates',data=centimorgan_rates)

    oh5f.close()
    h5f.close()
    print 'Done'
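The individual filter above is a greedy pruning pass: individuals are scanned in order, and whenever a kept individual i is related to a later individual j above `max_relatedness`, j is dropped. The same pass in isolation:

import numpy as np

max_relatedness = 0.05
K = np.array([[1.00, 0.20, 0.01],
              [0.20, 1.00, 0.03],
              [0.01, 0.03, 1.00]])
keep = set(range(len(K)))
for i in range(len(K)):
    if i in keep:
        for j in range(i + 1, len(K)):
            if K[i, j] > max_relatedness and j in keep:
                keep.remove(j)
print(sorted(keep))   # [0, 2] -- individual 1 is pruned for relatedness to 0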
Example #10
def leave_k_out_blup(
        num_cvs=20,
        genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/',
        k_thres=0.5):
    """

    """
    import h5py
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr
    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    res_dict = {}
    for phenotype in phenotypes:
        env_dict = {}
        for env in envs:
            print phenotype, env
            s1 = time.time()
            #Load data..
            d = hdf5_data.coordinate_cegs_genotype_phenotype(phen_dict,
                                                             phenotype,
                                                             env,
                                                             k_thres=k_thres)
            Y_means = d['Y_means']
            snps = d['snps']
            assert sp.all(sp.negative(sp.isnan(snps))), 'WTF?'
            K = kinship.calc_ibd_kinship(snps)
            print '\nKinship calculated'
            assert sp.all(sp.negative(sp.isnan(K))), 'WTF?'
            n = len(Y_means)
            #partition genotypes in k parts.
            gt_ids = d['gt_ids']
            num_ids = len(gt_ids)
            chunk_size = num_ids / num_cvs

            #Create k CV sets of prediction and validation data

            cv_chunk_size = int((n / num_cvs) + 1)
            ordering = sp.random.permutation(n)

            a = sp.arange(n)
            osb_ys = []
            pred_ys = []
            p_herits = []
            for cv_i, i in enumerate(range(0, n, cv_chunk_size)):
                cv_str = 'cv_%d' % cv_i
                #print 'Working on CV %d' % cv_i
                end_i = min(n, i + cv_chunk_size)
                validation_filter = sp.in1d(a, ordering[i:end_i])
                training_filter = sp.negative(validation_filter)

                train_snps = snps[:, training_filter]
                val_snps = snps[:, validation_filter]

                train_Y = Y_means[training_filter]
                val_Y = Y_means[validation_filter]

                #Calc. kinship
                K_train = K[training_filter, :][:, training_filter]
                K_cross = K[validation_filter, :][:, training_filter]
                #Do gBLUP
                lmm = lm.LinearMixedModel(train_Y)
                lmm.add_random_effect(K_train)
                r1 = lmm.get_REML()

                #Now the BLUP.
                y_mean = sp.mean(lmm.Y)
                Y = lmm.Y - y_mean
                p_herit = r1['pseudo_heritability']
                p_herits.append(p_herit)
                #delta = (1 - p_herit) / p_herit
                #        if K_inverse == None:
                #            K_inverse = K.I
                #        M = (sp.eye(K.shape[0]) + delta * K_inverse)
                #        u_blup = M.I * Y
                M = sp.mat(p_herit * sp.mat(K_train) +
                           (1 - p_herit) * sp.eye(K_train.shape[0]))
                u_mean_pred = sp.array(K_cross * (M.I * Y)).flatten()
                osb_ys.extend(val_Y)
                pred_ys.extend(u_mean_pred)
            corr = sp.corrcoef(osb_ys, pred_ys)[1, 0]
            print 'Correlation:', corr
            r2 = corr**2
            print 'R2:', r2
            mean_herit = sp.mean(p_herits)
            print 'Avg. heritability:', mean_herit
            env_dict[env] = {
                'R2': r2,
                'obs_y': osb_ys,
                'pred_y': pred_ys,
                'corr': corr,
                'avg_herit': mean_herit
            }

        res_dict[phenotype] = env_dict

    res_hdf5_file = '/Users/bjarnivilhjalmsson/data/tmp/leave_%d_BLUP_results_kthres_%0.1f.hdf5' % (
        num_cvs, k_thres)
    h5f = h5py.File(res_hdf5_file)
    for phenotype in phenotypes:
        phen_g = h5f.create_group(phenotype)
        for env in envs:
            d = res_dict[phenotype][env]
            env_g = phen_g.create_group(env)
            env_g.create_dataset('R2', data=[d['R2']])
            env_g.create_dataset('corr', data=[d['corr']])
            env_g.create_dataset('obs_y', data=d['obs_y'])
            env_g.create_dataset('pred_y', data=d['pred_y'])
            env_g.create_dataset('avg_herit', data=[d['avg_herit']])
    h5f.close()
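The cross-validation split above draws one random permutation and turns each contiguous chunk of it into a boolean validation mask with `sp.in1d`; the training mask is just the negation. One such split in isolation:

import numpy as np

n, num_cvs = 10, 5
cv_chunk_size = int(n / num_cvs + 1)
ordering = np.random.permutation(n)
a = np.arange(n)
for i in range(0, n, cv_chunk_size):
    validation_filter = np.in1d(a, ordering[i:i + cv_chunk_size])
    training_filter = ~validation_filter   # modern replacement for sp.negative on booleans
    assert validation_filter.sum() + training_filter.sum() == n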
Example #11
def gen_sfs_plots(
        snps_hdf5_file='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/called_snps.hdf5',
        fig_dir='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/',
        filter_pop=None):

    ### Here I will do the SFS for each genospecies based on the rhizobium xls file
    pop = parse_pop_map()
    pop_map = pop.keys()
    ct_array = pop.values()
    from itertools import izip
    h5f = h5py.File(snps_hdf5_file)
    gene_groups = sorted(h5f.keys())

    syn_mafs = []
    nonsyn_mafs = []
    all_mafs = []
    sfs_dict = {}
    for i, gg in enumerate(gene_groups):
        if i % 100 == 0:
            print '%d: Gene %s' % (i, gg)
        g = h5f[gg]
        if g['codon_snps'].size > 1:
            #print g['codon_snps'].shape

            if filter_pop is not None:
                strains = g['strains']
                indiv_filter = sp.zeros((len(strains)), dtype='bool8')
                for s_i, s in enumerate(strains):
                    if pop[s]['genospecies'] == filter_pop:
                        indiv_filter[s_i] = True
                # Build the full genospecies filter first, then slice the columns
                # once (this block previously re-ran inside the loop above).
                codon_snps = g['codon_snps'][...]
                codon_snps = codon_snps[:, indiv_filter]  # reduce the columns to the genospecies
                t_codon_snps = sp.transpose(codon_snps)
                freqs = sp.mean(t_codon_snps, 0)
                # rows are SNPs, columns are individuals
                #counts = np.sum(codon_snps, axis = 0)
                #print counts
                #for c in counts:
                #    if c in sfs_dict:
                #        sfs_dict[c] += 1
                #    else:
                #        sfs_dict[c] = 1
                #with open('dict.csv', 'wb') as csv_file:
                #    writer = csv.writer(csv_file)
                #    for key, value in sfs_dict.items():
                #        writer.writerow([key, value])

            else:
                codon_snps = g['codon_snps'][...]
                t_codon_snps = sp.transpose(codon_snps)
                freqs = sp.mean(t_codon_snps, 0)  # number of minor allele
            mafs = sp.minimum(freqs, 1 - freqs)
            is_synonimous_snp = g['is_synonimous_snp'][...]
            syn_mafs.extend(mafs[is_synonimous_snp])
            nonsyn_mafs.extend(mafs[sp.negative(is_synonimous_snp)])
            all_mafs.extend(mafs)

    if filter_pop is not None:
        output_file = "%s.csv" % (str(argv[1]))
        np.savetxt(output_file, all_mafs, delimiter=',')  # X is an array
        output_file = "%ssyn_mafs.csv" % (str(argv[1]))
        np.savetxt(output_file, syn_mafs, delimiter=',')
        output_file = "%snon_syn_mafs.csv" % (str(argv[1]))
        np.savetxt(output_file, nonsyn_mafs, delimiter=',')
    # pylab.clf()
    # pylab.hist(all_mafs, bins=50)
    # pylab.title('SFS (all binary codon SNPs)')
    # pylab.savefig('%s/sfs_all_%s.png'%(fig_dir,filter_pop))

    #pylab.clf()
    #pylab.hist(nonsyn_mafs, bins=50)
    #pylab.title('SFS (non-synonimous SNPs)')
    #pylab.savefig('%s/sfs_non_syn_%s.png'%(fig_dir,filter_pop))

    #pylab.clf()
    #pylab.hist(syn_mafs, bins=50)
    #pylab.title('SFS (synonimous SNPs)')
    #pylab.savefig('%s/sfs_syn_%s.png'%(fig_dir,filter_pop))

    else:
        output_file = "total_2.csv"
        np.savetxt(output_file, all_mafs, delimiter=',')
        pylab.clf()
        pylab.hist(all_mafs, bins=50)
        pylab.title('SFS (all binary codon SNPs)')
        pylab.savefig(fig_dir + '/sfs_all.png')

        pylab.clf()
        pylab.hist(nonsyn_mafs, bins=50)
        pylab.title('SFS (non-synonymous SNPs)')
        pylab.savefig(fig_dir + '/sfs_non_syn.png')

        pylab.clf()
        pylab.hist(syn_mafs, bins=50)
        pylab.title('SFS (synonymous SNPs)')
        pylab.savefig(fig_dir + '/sfs_syn.png')
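The folding step above maps each allele frequency f to a minor-allele frequency min(f, 1 - f), which is why the SFS histograms run over [0, 0.5]; synonymous and non-synonymous sites are then split with the boolean mask and its negation. For instance:

import numpy as np

freqs = np.array([0.1, 0.5, 0.9])
print(np.minimum(freqs, 1 - freqs))   # [0.1 0.5 0.1]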
Example #12
def coordinate_genot_ss(genotype_file=None,
                        hdf5_file=None,
                        genetic_map_dir=None,
                        check_mafs=False,
                        min_maf=0.01):
    """
    Assumes plink BED files.  Imputes missing genotypes.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    #        num_individs = len(gf['chrom_1']['snps'][:, 0])
    #     Y = sp.array(gf['indivs']['phenotype'][...] == 'Case', dtype='int8')
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print 'Unable to find phenotype values.'
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1])
        has_phenotype = True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype = True
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)

    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    #Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        print 'Working on chromosome: %s' % chr_str

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(ss_sids), 'Some duplicates?'

        #Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        #Order by SNP IDs
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)

        g_indices = []
        for g_i in g_order:
            if g_filter[g_i]:
                g_indices.append(g_i)

        ss_indices = []
        for ss_i in ss_order:
            if ss_filter[ss_i]:
                ss_indices.append(ss_i)

        g_nts = chrom_d['nts']
        snp_indices = chrom_d['snp_indices']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print 'Found %d SNPs present in both datasets' % (len(g_indices))

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]
            ss_freqs_list = []

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in it.izip(g_indices, ss_indices):

            #Is the nucleotide ambiguous?
            #g_nt = [recode_dict[g_nts[g_i][0]],recode_dict[g_nts[g_i][1]]
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            ss_nt = ss_nts[ss_i]
            #Are the nucleotides the same?
            flip_nts = False
            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                # Opposite strand nucleotides
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if flip_nts:
                    betas[ss_i] = -betas[ss_i]
                    log_odds[ss_i] = -log_odds[ss_i]
                    if 'freqs' in ssg.keys():
                        ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                else:
                    #                     print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                    #                         (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1

                    continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print '%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        #Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[
            ok_indices['g']]  #Pinpoint where the SNPs are in the file.
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        print 'raw_snps.shape=', raw_snps.shape

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))  #sp.std(raw_snps, 1)
        snp_means = freqs * 2  #sp.mean(raw_snps, 1)

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(
                    freq_discrepancy_snp)
                print freqs[freq_discrepancy_snp]
                print ss_freqs[freq_discrepancy_snp]

                #Filter freq_discrepancy_snps
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]

        #Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if sp.sum(maf_filter) < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]

            print '%d SNPs with MAF < %0.3f were filtered' % (
                n_snps - maf_filter_sum, min_maf)

        print '%d SNPs were retained on chromosome %d.' % (maf_filter_sum,
                                                           chrom)

        rb_prs = sp.dot(sp.transpose(raw_snps), log_odds)
        if has_phenotype:
            print 'Normalizing SNPs'
            snp_means.shape = (len(raw_snps), 1)
            snp_stds.shape = (len(raw_snps), 1)
            snps = (raw_snps - snp_means) / snp_stds
            assert snps.shape == raw_snps.shape, 'Aha!'
            snp_stds = snp_stds.flatten()
            snp_means = snp_means.flatten()
            prs = sp.dot(sp.transpose(snps), betas)
            corr = sp.corrcoef(Y, prs)[0, 1]
            corr_list.append(corr)
            print 'PRS correlation for chromosome %d was %0.4f' % (chrom, corr)
            rb_corr = sp.corrcoef(Y, rb_prs)[0, 1]
            rb_corr_list.append(rb_corr)
            print 'Raw effect sizes PRS correlation for chromosome %d was %0.4f' % (
                chrom, rb_corr)

        sid_set = set(sids)
        if genetic_map_dir is not None:
            genetic_map = []
            with gzip.open(genetic_map_dir +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])

        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_ref', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds)
        ofg.create_dataset('snp_means_ref', data=snp_means)
        ofg.create_dataset('freqs_ref', data=freqs)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)


#         print 'Sum of squared effect sizes:', sp.sum(betas ** 2)
#         print 'Sum of squared log odds:', sp.sum(log_odds ** 2)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=rb_prs)
        if has_phenotype:
            risk_scores += prs
        rb_risk_scores += rb_prs
        num_common_snps += len(betas)
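The nucleotide-matching loop above handles strand flips: if the genotype alleles match the summary-statistic alleles only after swapping (directly or on the opposite strand), the effect signs are negated; ambiguous pairs such as A/T and G/C are dropped because the strand cannot be resolved. A sketch of the core check, with the complement table spelled out:

opp_strand_dict = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
ambig_nts = {('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G')}

def match_alleles(g_nt, ss_nt):
    # Returns 1 (same direction), -1 (flip the effect sign) or None (drop the SNP).
    if tuple(g_nt) in ambig_nts:
        return None
    os_g_nt = [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]]
    if list(g_nt) == list(ss_nt) or os_g_nt == list(ss_nt):
        return 1
    if list(g_nt) == list(ss_nt)[::-1] or os_g_nt == list(ss_nt)[::-1]:
        return -1
    return None

print(match_alleles(['A', 'G'], ['A', 'G']))   # 1
print(match_alleles(['A', 'G'], ['G', 'A']))   # -1 (alleles swapped: negate beta)
print(match_alleles(['A', 'G'], ['T', 'C']))   # 1 (opposite strand, same order)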
Example #13
def grasp_callback(my_grasp):

    my_pose = geometry_msgs.msg.PoseStamped()
    my_pose.header.stamp = my_grasp.markers[0].header.stamp
    my_pose.header.frame_id = "/xtion_rgb_optical_frame"

    pose_target = geometry_msgs.msg.Pose()
    pose_target.position.x = my_grasp.markers[0].points[0].x
    pose_target.position.y = my_grasp.markers[0].points[0].y
    pose_target.position.z = my_grasp.markers[0].points[0].z

    ## Convert to quaternion

    u = [1, 0, 0]
    norm = linalg.norm([
        my_grasp.markers[0].points[0].x - my_grasp.markers[0].points[1].x,
        my_grasp.markers[0].points[0].y - my_grasp.markers[0].points[1].y,
        my_grasp.markers[0].points[0].z - my_grasp.markers[0].points[1].z
    ])
    v = asarray([
        my_grasp.markers[0].points[0].x - my_grasp.markers[0].points[1].x,
        my_grasp.markers[0].points[0].y - my_grasp.markers[0].points[1].y,
        my_grasp.markers[0].points[0].z - my_grasp.markers[0].points[1].z
    ]) / norm

    if (array_equal(u, v)):
        pose_target.orientation.w = 1
        pose_target.orientation.x = 0
        pose_target.orientation.y = 0
        pose_target.orientation.z = 0
    elif (array_equal(u, negative(v))):
        pose_target.orientation.w = 0
        pose_target.orientation.x = 0
        pose_target.orientation.y = 0
        pose_target.orientation.z = 1
    else:
        half = [u[0] + v[0], u[1] + v[1], u[2] + v[2]]
        pose_target.orientation.w = dot(u, half)
        temp = cross(u, half)
        pose_target.orientation.x = temp[0]
        pose_target.orientation.y = temp[1]
        pose_target.orientation.z = temp[2]
    norm = math.sqrt(pose_target.orientation.x * pose_target.orientation.x +
                     pose_target.orientation.y * pose_target.orientation.y +
                     pose_target.orientation.z * pose_target.orientation.z +
                     pose_target.orientation.w * pose_target.orientation.w)

    if norm == 0:
        norm = 1

    my_pose.pose.position = pose_target.position  # carry the grasp position over
    my_pose.pose.orientation.x = pose_target.orientation.x / norm
    my_pose.pose.orientation.y = pose_target.orientation.y / norm
    my_pose.pose.orientation.z = pose_target.orientation.z / norm
    my_pose.pose.orientation.w = pose_target.orientation.w / norm

    pose_target_trans = geometry_msgs.msg.PoseStamped()
    pose_target_trans.header.stamp = my_pose.header.stamp
    pose_target_trans.header.frame_id = "/map"
    now = rospy.Time.now()
    listener.waitForTransform("/map", "/xtion_rgb_optical_frame", now,
                              rospy.Duration(1.0))
    pose_target_trans = listener.transformPose("/map", my_pose)  # transformPose needs a PoseStamped

    my_grasp_pub.publish(pose_target_trans)
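The quaternion construction above is the standard "half-vector" shortest-arc rotation between unit vectors u and v: with h = u + v, the quaternion is (w, x, y, z) = (u . h, u x h), normalized at the end; the u = -v case is degenerate and handled separately (the axis choice there mirrors the original). A standalone sketch:

import numpy as np

def quat_between(u, v):
    # Unit quaternion (w, x, y, z) rotating unit vector u onto unit vector v.
    if np.allclose(u, v):
        return np.array([1.0, 0.0, 0.0, 0.0])
    if np.allclose(u, -v):          # 180-degree turn; assumes z is orthogonal to u
        return np.array([0.0, 0.0, 0.0, 1.0])
    half = u + v                    # unnormalized half-way vector
    q = np.concatenate(([np.dot(u, half)], np.cross(u, half)))
    return q / np.linalg.norm(q)

print(quat_between(np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 0.0])))
# [0.70710678 0.         0.         0.70710678] -- a 90-degree rotation about z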
Example #15
def get_kinships(snps_file='/project/NChain/faststorage/rhizobium/ld/new_snps.hdf5',
                 plot_figures=False,
                 figure_dir='/project/NChain/faststorage/rhizobium/ld/figures',
                 fig_id='all',
                 min_maf=0.1,
                 max_strain_num=200):
    """
    Calculates the kinship
    """
    h5f = h5py.File(snps_file)
    gene_groups = h5f.keys()
    all_strains = set()
    for gg in gene_groups:
        data_g = h5f[gg]
        strains = data_g['strains'][...]
        if len(strains) < max_strain_num:
            all_strains = set(strains).union(all_strains)
    num_strains = len(all_strains)
    print 'Found %d "distinct" strains' % num_strains
    
    ordered_strains = sorted(list(all_strains))
    strain_index = pd.Index(ordered_strains)
    K_snps = sp.zeros((num_strains, num_strains))
    counts_mat_snps = sp.zeros((num_strains, num_strains))
    K_codon_snps = sp.zeros((num_strains, num_strains))
    counts_mat_codon_snps = sp.zeros((num_strains, num_strains))
        
    K_nonsyn_snps = sp.zeros((num_strains, num_strains))
    counts_mat_nonsyn_snps = sp.zeros((num_strains, num_strains))
    
    K_syn_snps = sp.zeros((num_strains, num_strains))
    counts_mat_syn_snps = sp.zeros((num_strains, num_strains))

    for i, gg in enumerate(gene_groups):
        if i % 100 == 0:
            print 'Working on gene nr. %d' % i 
        data_g = h5f[gg]
        strains = data_g['strains'][...]
        if len(strains) < max_strain_num:
            strain_mask = strain_index.get_indexer(strains)
            
            snps = data_g['norm_snps'][...]
            freqs = data_g['freqs'][...]
            mafs = sp.minimum(freqs, 1 - freqs)
            maf_mask = mafs > min_maf
            snps = snps[maf_mask]
            if len(snps) == 0:
                continue
            K_snps_slice = K_snps[strain_mask]
            K_snps_slice[:, strain_mask] += sp.dot(snps.T, snps)
            K_snps[strain_mask] = K_snps_slice
            counts_mat_snps_slice = counts_mat_snps[strain_mask]
            counts_mat_snps_slice[:, strain_mask] += len(snps)
            counts_mat_snps[strain_mask] = counts_mat_snps_slice
    
            codon_snps = data_g['norm_codon_snps'][...]
            if len(codon_snps) == 0:
                continue
            freqs = data_g['codon_snp_freqs'][...]
            mafs = sp.minimum(freqs, 1 - freqs)
            maf_mask = mafs > min_maf
            codon_snps = codon_snps[maf_mask]
            is_synonimous_snp = data_g['is_synonimous_snp'][...]
            is_synonimous_snp = is_synonimous_snp[maf_mask]
            if len(codon_snps) > 0:
                K_codon_snps_slice = K_codon_snps[strain_mask]
                K_codon_snps_slice[:, strain_mask] += sp.dot(codon_snps.T, codon_snps)
                K_codon_snps[strain_mask] = K_codon_snps_slice
                counts_mat_codon_snps_slice = counts_mat_codon_snps[strain_mask]
                counts_mat_codon_snps_slice[:, strain_mask] += len(codon_snps)
                counts_mat_codon_snps[strain_mask] = counts_mat_codon_snps_slice
        
                if sp.sum(is_synonimous_snp) > 0:
                    syn_snps = codon_snps[is_synonimous_snp]
                    K_syn_snps_slice = K_syn_snps[strain_mask]
                    K_syn_snps_slice[:, strain_mask] += sp.dot(syn_snps.T, syn_snps)
                    K_syn_snps[strain_mask] = K_syn_snps_slice
                    counts_mat_syn_snps_slice = counts_mat_syn_snps[strain_mask]
                    counts_mat_syn_snps_slice[:, strain_mask] += len(syn_snps)
                    counts_mat_syn_snps[strain_mask] = counts_mat_syn_snps_slice
            
                is_nonsynonimous_snp = sp.negative(is_synonimous_snp)
                if sp.sum(is_nonsynonimous_snp) > 0:
                    nonsyn_snps = codon_snps[is_nonsynonimous_snp]                
                    K_nonsyn_snps_slice = K_nonsyn_snps[strain_mask]
                    K_nonsyn_snps_slice[:, strain_mask] += sp.dot(nonsyn_snps.T, nonsyn_snps)
                    K_nonsyn_snps[strain_mask] = K_nonsyn_snps_slice
                    counts_mat_nonsyn_snps_slice = counts_mat_nonsyn_snps[strain_mask]
                    counts_mat_nonsyn_snps_slice[:, strain_mask] += len(nonsyn_snps)
                    counts_mat_nonsyn_snps[strain_mask] = counts_mat_nonsyn_snps_slice

    
    
    K_snps = K_snps / counts_mat_snps  # element-wise division
    K_codon_snps = K_codon_snps / counts_mat_codon_snps  # element-wise division

    K_syn_snps = K_syn_snps / counts_mat_syn_snps  # element-wise division
    K_nonsyn_snps = K_nonsyn_snps / counts_mat_nonsyn_snps  # element-wise division

    if plot_figures:
        plot_dirty_PCA(K_snps, figure_fn='PCA_all_snps_%s.pdf' % fig_id, k_figure_fn='K_all_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains, title='All SNPs')
        plot_dirty_PCA(K_codon_snps, figure_fn='PCA_codon_snps_%s.pdf' % fig_id, k_figure_fn='K_codon_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains, title='Codon SNPs')
        plot_dirty_PCA(K_syn_snps, figure_fn='PCA_syn_snps_%s.pdf' % fig_id, k_figure_fn='K_syn_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains, title='Synonymous SNPs')
        plot_dirty_PCA(K_nonsyn_snps, figure_fn='PCA_nonsyn_snps_%s.pdf' % fig_id, k_figure_fn='K_nonsyn_snps_%s.png' % fig_id,
                       figure_dir=figure_dir, strains=ordered_strains, title='Non-Synonymous SNPs')

    print 'Average number of SNPs: %0.2f.' % sp.mean(counts_mat_snps)
    print 'Average number of codon SNPs: %0.2f.' % sp.mean(counts_mat_codon_snps)
    print 'Average number of synonymous SNPs: %0.2f.' % sp.mean(counts_mat_syn_snps)
    print 'Average number of non-synonymous SNPs: %0.2f.' % sp.mean(counts_mat_nonsyn_snps)

    return {'K_snps':K_snps, 'K_codon_snps':K_codon_snps, 'counts_mat_snps':counts_mat_snps, 'counts_mat_codon_snps':counts_mat_codon_snps,
            'K_syn_snps':K_syn_snps, 'K_nonsyn_snps':K_nonsyn_snps, 'counts_mat_syn_snps':counts_mat_syn_snps, 'counts_mat_nonsyn_snps':counts_mat_nonsyn_snps,
            'strains':ordered_strains}
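The accumulation pattern in the loop above deserves a note: each gene group covers a different subset of strains, so its X^T X contribution is added into a submatrix of the global kinship matrix. NumPy fancy indexing returns a copy, which is why the rows are pulled out, updated, and written back. A minimal sketch of the same pattern, assuming toy dimensions and values:

import numpy as np

num_strains = 5
K = np.zeros((num_strains, num_strains))
counts = np.zeros((num_strains, num_strains))

strain_mask = np.array([0, 2, 3])        # strains covered by this gene group
snps = np.ones((10, len(strain_mask)))   # 10 normalized SNPs (toy values)

K_slice = K[strain_mask]                 # fancy indexing returns a *copy* of the rows
K_slice[:, strain_mask] += snps.T.dot(snps)
K[strain_mask] = K_slice                 # so the updated rows must be written back
counts_slice = counts[strain_mask]
counts_slice[:, strain_mask] += len(snps)
counts[strain_mask] = counts_slice

# Per-pair average; np.maximum avoids division by zero in this toy sketch.
K_avg = K / np.maximum(counts, 1)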
Beispiel #16
0
def coordinate_cegs_genotype_phenotype(phen_dict, phenotype='Protein',env='mated',k_thres=0.8, ind_missing_thres=0.5, snp_missing_thres=0.05, maf_thres=0.1,
                                       genotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.Filter_imputed.hdf5'):
    """
    Parse genotypes and coordinate with phenotype, and ready data for analysis.
    """
    gh5f = h5py.File(genotype_file)
    p_dict = phen_dict[phenotype][env]
    print 'Loading SNPs'
    snps = sp.array(gh5f['gt'][...],dtype='single')
    snps = snps[:,p_dict['ind_filter']]
    positions = gh5f['pos'][...]
    m,n = snps.shape
    print 'Loaded %d SNPs for %d individuals'%(m,n)
    print 'Filtering individuals with missing rates >%0.2f'%ind_missing_thres
    missing_mat = sp.isnan(snps)
    ind_missing_rates = sp.sum(missing_mat,0)/float(m)
    ind_filter = ind_missing_rates<ind_missing_thres
    snps = snps[:,ind_filter]
    n = sp.sum(ind_filter)  
    print 'Filtered %d individuals due to high missing rates'%sp.sum(sp.negative(ind_filter))
    gt_ids = gh5f['gt_ids'][p_dict['ind_filter']]
    gt_ids = gt_ids[ind_filter]
    Y_means = p_dict['Y_means'][p_dict['ind_filter']]
    Y_means = Y_means[ind_filter]
    Y_medians = p_dict['Y_medians'][p_dict['ind_filter']]
    Y_medians = Y_medians[ind_filter]
    rep_count =  p_dict['rep_count'][p_dict['ind_filter']]
    rep_count = rep_count[ind_filter]
    
    print 'Now removing "bad" genotypes.'
    bad_genotypes = ['Raleigh_272', 'Raleigh_378', 'Raleigh_554', 'Raleigh_591', 'Raleigh_398', 'Raleigh_138', 'Raleigh_208', 
                     'Raleigh_336', 'Raleigh_370', 'Raleigh_373', 'Raleigh_374', 'Raleigh_799', 'Raleigh_821', 'Raleigh_822',
                     'Raleigh_884', 'Raleigh_335']
    ind_filter = sp.negative(sp.in1d(gt_ids,bad_genotypes))
    gt_ids = gt_ids[ind_filter]
    Y_means= Y_means[ind_filter]
    Y_medians= Y_medians[ind_filter]
    rep_count= rep_count[ind_filter]    
    snps = snps[:,ind_filter]
    print 'Removed %d "bad" genotypes'%sp.sum(sp.negative(ind_filter))
    
    n = len(snps[0])
    print 'Filtering SNPs with missing rate >%0.2f'%snp_missing_thres
    missing_mat = sp.isnan(snps)
    snp_missing_rates = sp.sum(missing_mat,1)/float(n)
    snps_filter = snp_missing_rates<snp_missing_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    m = sp.sum(snps_filter)
    print 'Filtered %d SNPs due to high missing rate'%sp.sum(sp.negative(snps_filter))
    
    print 'Now imputing (w mean)'
    missing_mat = sp.isnan(snps)
    ok_counts = n-sp.sum(missing_mat,1)
    snps[missing_mat]=0
    snp_means = sp.sum(snps,1)/ok_counts
#     print snp_means.shape
#     print snp_means[:10]
#     import pdb
#     pdb.set_trace()
    for i in range(len(snps)):
        snps[i,missing_mat[i]]=snp_means[i]

    print 'And filtering SNPs with MAF<%0.2f'%maf_thres
    snp_means = sp.mean(snps,1)
    snp_mafs = sp.minimum(snp_means,1-snp_means)
    snps_filter = snp_mafs>maf_thres
    snps = snps[snps_filter]
    positions = positions[snps_filter]
    print 'Filtered %d SNPs with low MAFs'%sp.sum(sp.negative(snps_filter))
    

    print 'Filtering based on kinship w threshold:',k_thres
    import kinship
    K = kinship.calc_ibd_kinship(snps)
    print '\nKinship calculated'
    K_ind_filter = []
    for i in range(n):
        K_ind_filter.append(not sp.any(K[i,i+1:n]>k_thres))
    if sum(K_ind_filter)==n:
        print 'No individuals were filtered based on kinship..'
    else:
        print 'Filtering %d individuals based on kinship.'%(n-sum(K_ind_filter))
        K_ind_filter = sp.array(K_ind_filter)
        gt_ids = gt_ids[K_ind_filter]
        Y_means= Y_means[K_ind_filter]
        Y_medians= Y_medians[K_ind_filter]
        rep_count= rep_count[K_ind_filter]    
        snps = snps[:,K_ind_filter]
        
        print 'Again filtering SNPs with MAF<%0.2f'%maf_thres
        snp_means = sp.mean(snps,1)
        snp_mafs = sp.minimum(snp_means,1-snp_means)
        snps_filter = snp_mafs>maf_thres
        snps = snps[snps_filter]
        positions = positions[snps_filter]
        print 'Filtered %d additional SNPs with low MAFs'%sp.sum(sp.negative(snps_filter))


    print 'All filtering done.'
    
    m,n = snps.shape
    print 'In all there are %d SNPs remaining, for %d individuals.'%(m,n)
    
    ret_dict = {'Y_means':Y_means, 'Y_medians':Y_medians, 'rep_count':rep_count, 'gt_ids':gt_ids, 
                'positions':positions, 'snps':snps}
    
    
    
    return ret_dict
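The imputation step in the middle of this function is compact enough to obscure what it does: missing entries are zeroed so a plain sum counts only observed genotypes, per-SNP means are formed against the observed counts, and those means are then written into the missing slots. A minimal sketch with hypothetical toy genotypes:

import numpy as np

snps = np.array([[0., 1., np.nan, 1.],
                 [np.nan, np.nan, 0., 1.]])
missing_mat = np.isnan(snps)
ok_counts = snps.shape[1] - missing_mat.sum(axis=1)  # observed calls per SNP
snps[missing_mat] = 0
snp_means = snps.sum(axis=1) / ok_counts             # mean over observed only
for i in range(len(snps)):
    snps[i, missing_mat[i]] = snp_means[i]           # fill missing with the mean
print(snps)  # [[0. 1. 0.6667 1.], [0.5 0.5 0. 1.]]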
Beispiel #17
0
def parse_cegs_drosophila_phenotypes(phenotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/allphenotypes_5.0_cleaned.tab.reps.hdf5',):
    """
    Parser for CEGS Drosophila phenotype data
    """
    import pylab
    #Load phenotypes...
    ph5f = h5py.File(phenotype_file)
    #Now take the median and mean of all values for all individuals.
    phen_dict = {}
    for phen in ph5f.keys():
        #First mated
        Y_mated = ph5f[phen]['Y_mated'][...]
        Z_mated = ph5f[phen]['Z_mated'][...]
        sample_filter = sp.negative(sp.isnan(Y_mated))
        Ys_sum = sp.dot(Y_mated[sample_filter], Z_mated[sample_filter])
        rep_count = sp.dot(sp.ones(sum(sample_filter)), Z_mated[sample_filter])
        Y_means = Ys_sum/rep_count
        #Now calculate medians by iteration.
        phen_vals_list = [[] for i in range(216)]
        for i in range(len(Y_mated)):
            ind_i = sp.where(1==Z_mated[i])[0][0]
            phen_vals_list[ind_i].append(Y_mated[i])
        medians = sp.zeros(216)
        for i, pl in enumerate(phen_vals_list):
            if len(pl)>0:
                medians[i] = sp.median(pl)
            else:
                medians[i] = sp.nan
        ind_filter = sp.negative(sp.isnan(Y_means))
        if phen=='Triglyceride':
            ind_filter = (Y_means>0)*ind_filter
        
        phen_dict[phen]={'mated':{'Y_means':Y_means, 'rep_count':rep_count, 'ind_filter':ind_filter, 'Y_medians':medians}}
        
        print 'Plotting phenotype histograms for %s, %s'%(phen,'mated')
        mated_filtered_means = Y_means[ind_filter]
        pylab.hist(mated_filtered_means)
        pylab.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_mated_means.png' % (phen))
        pylab.clf()
        mated_filtered_medians = medians[ind_filter]
        pylab.hist(mated_filtered_medians)
        pylab.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_mated_medians.png' % (phen))
        pylab.clf()


        #Then virgin
        Y_virgin = ph5f[phen]['Y_virgin'][...]
        Z_virgin = ph5f[phen]['Z_virgin'][...]
        sample_filter = sp.negative(sp.isnan(Y_virgin))
        Ys_sum = sp.dot(Y_virgin[sample_filter], Z_virgin[sample_filter])
        rep_count = sp.dot(sp.ones(sum(sample_filter)), Z_virgin[sample_filter])
        Y_means = Ys_sum/rep_count
        #Now calculate medians by iteration.
        phen_vals_list = [[] for i in range(216)]
        for i in range(len(Y_virgin)):
            ind_i = sp.where(1==Z_virgin[i])[0][0]
            phen_vals_list[ind_i].append(Y_virgin[i])
        medians = sp.zeros(216)
        for i, pl in enumerate(phen_vals_list):
            if len(pl)>0:
                medians[i] = sp.median(pl)
            else:
                medians[i] = sp.nan
        ind_filter = sp.negative(sp.isnan(Y_means))
        if phen=='Triglyceride':
            ind_filter = (Y_means>0)*ind_filter

        phen_dict[phen]['virgin']={'Y_means':Y_means, 'rep_count':rep_count, 'ind_filter':ind_filter, 'Y_medians':medians}
    
        print 'Plotting phenotype histograms for %s, %s'%(phen,'virgin')
        virgin_filtered_means = Y_means[ind_filter]
        pylab.hist(virgin_filtered_means)
        pylab.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_virgin_means.png' % (phen))
        pylab.clf()
        virgin_filtered_medians = medians[ind_filter]
        pylab.hist(virgin_filtered_medians)
        pylab.savefig('/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_virgin_medians.png' % (phen))
        pylab.clf()

        means_corr = sp.corrcoef(mated_filtered_means, virgin_filtered_means)[0,1]
        medians_corr = sp.corrcoef(mated_filtered_medians, virgin_filtered_medians)[0,1]
        print 'Correlation between mated and virgin flies, means: %0.2f, medians: %0.2f'%(means_corr,medians_corr)
        phen_dict[phen]['corrs'] = {'means':means_corr, 'medians':medians_corr}
    return phen_dict
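The replicate averaging above leans on a linear-algebra trick: Z_mated is a (replicates x lines) 0/1 indicator matrix, so Y.dot(Z) sums the phenotype values per line while ones.dot(Z) counts the replicates, and their ratio is the per-line mean. A minimal sketch with hypothetical toy values:

import numpy as np

Y = np.array([1.0, 3.0, 2.0, 4.0])           # hypothetical replicate phenotypes
Z = np.array([[1, 0],                        # replicates 0,1 belong to line 0
              [1, 0],
              [0, 1],                        # replicates 2,3 belong to line 1
              [0, 1]])
Ys_sum = Y.dot(Z)                            # per-line sums  -> [4., 6.]
rep_count = np.ones(len(Y)).dot(Z)           # per-line counts -> [2., 2.]
Y_means = Ys_sum / rep_count                 # per-line means -> [2., 3.]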
Beispiel #18
0
def grasp_callback(my_grasp):

    ## Planning to a Pose goal
    ## ^^^^^^^^^^^^^^^^^^^^^^^
    ## We can plan a motion for this group to a desired pose for the
    ## end-effector

    pose_target = geometry_msgs.msg.PoseStamped()
    pose_target.header.stamp = my_grasp.markers[0].header.stamp.secs
    pose_target.header.frame_id = "/camera_rgb_optical_frame"
    pose_target.pose.position.x = my_grasp.markers[0].points[1].x
    pose_target.pose.position.y = my_grasp.markers[0].points[1].y
    pose_target.pose.position.z = my_grasp.markers[0].points[1].z

    ## Convert to quaternion

    u = [1, 0, 0]
    norm = linalg.norm([
        my_grasp.markers[0].points[0].x - my_grasp.markers[0].points[1].x,
        my_grasp.markers[0].points[0].y - my_grasp.markers[0].points[1].y,
        my_grasp.markers[0].points[0].z - my_grasp.markers[0].points[1].z
    ])
    v = asarray([
        my_grasp.markers[0].points[0].x - my_grasp.markers[0].points[1].x,
        my_grasp.markers[0].points[0].y - my_grasp.markers[0].points[1].y,
        my_grasp.markers[0].points[0].z - my_grasp.markers[0].points[1].z
    ]) / norm

    if (array_equal(u, v)):
        pose_target.pose.orientation.w = 1
        pose_target.pose.orientation.x = 0
        pose_target.pose.orientation.y = 0
        pose_target.pose.orientation.z = 0
    elif (array_equal(u, negative(v))):
        pose_target.pose.orientation.w = 0
        pose_target.pose.orientation.x = 0
        pose_target.pose.orientation.y = 0
        pose_target.pose.orientation.z = 1
    else:
        half = [u[0] + v[0], u[1] + v[1], u[2] + v[2]]
        pose_target.pose.orientation.w = dot(u, half)
        temp = cross(u, half)
        pose_target.pose.orientation.x = temp[0]
        pose_target.pose.orientation.y = temp[1]
        pose_target.pose.orientation.z = temp[2]
    norm = math.sqrt(
        pose_target.pose.orientation.x * pose_target.pose.orientation.x +
        pose_target.pose.orientation.y * pose_target.pose.orientation.y +
        pose_target.pose.orientation.z * pose_target.pose.orientation.z +
        pose_target.pose.orientation.w * pose_target.pose.orientation.w)
    if norm == 0:
        norm = 1
    pose_target.pose.orientation.x = pose_target.pose.orientation.x / norm
    pose_target.pose.orientation.y = pose_target.pose.orientation.y / norm
    pose_target.pose.orientation.z = pose_target.pose.orientation.z / norm
    pose_target.pose.orientation.w = pose_target.pose.orientation.w / norm

    print "Timestamp: %d." % pose_target.header.stamp
    print "Position X: %f." % pose_target.pose.position.x
    print "Position Y: %f." % pose_target.pose.position.y
    print "Position Z: %f." % pose_target.pose.position.z
    print "Orientation X: %f." % pose_target.pose.orientation.x
    print "Orientation Y: %f." % pose_target.pose.orientation.y
    print "Orientation Z: %f." % pose_target.pose.orientation.z
    print "Orientation W: %f." % pose_target.pose.orientation.w

    ## Broadcast transform
    br = tf.TransformBroadcaster()
    br.sendTransform(
        (pose_target.pose.position.x, pose_target.pose.position.y,
         pose_target.pose.position.z),
        (pose_target.pose.orientation.x, pose_target.pose.orientation.y,
         pose_target.pose.orientation.z, pose_target.pose.orientation.w),
        my_grasp.markers[0].header.stamp, "/grasping_target",
        "/camera_rgb_optical_frame")
    #br.sendTransform((pose_target.pose.position.x, pose_target.pose.position.y, pose_target.pose.position.z), tf.transformations.quaternion_from_euler(my_grasp.grasps[0].axis.x, my_grasp.grasps[0].axis.y, my_grasp.grasps[0].axis.z), my_grasp.header.stamp, "/grasping_target", "/camera_rgb_optical_frame")

    ## Listening to transform
    (trans, rot) = listener.lookupTransform('/world', '/grasping_target',
                                            rospy.Time(0))

    ## Build new pose
    pose_target_trans = geometry_msgs.msg.PoseStamped()
    pose_target_trans.header.frame_id = "/world"
    pose_target_trans.header.stamp = my_grasp.markers[0].header.stamp
    pose_target_trans.pose.position.x = trans[0]
    pose_target_trans.pose.position.y = trans[1]
    pose_target_trans.pose.position.z = trans[2]
    pose_target_trans.pose.orientation.x = rot[0]
    pose_target_trans.pose.orientation.y = rot[1]
    pose_target_trans.pose.orientation.z = rot[2]
    pose_target_trans.pose.orientation.w = rot[3]

    print "NEW POSITION."
    print "Position X: %f." % pose_target_trans.pose.position.x
    print "Position Y: %f." % pose_target_trans.pose.position.y
    print "Position Z: %f." % pose_target_trans.pose.position.z
    print "Orientation X: %f." % pose_target_trans.pose.orientation.x
    print "Orientation Y: %f." % pose_target_trans.pose.orientation.y
    print "Orientation Z: %f." % pose_target_trans.pose.orientation.z
    print "Orientation W: %f." % pose_target_trans.pose.orientation.w

    my_grasp_pub.publish(pose_target_trans)

    group.set_pose_target(pose_target_trans.pose, end_effector_link="my_eef")

    ## Now, we call the planner to compute the plan
    ## and visualize it if successful
    ## Note that we are just planning, not asking move_group
    ## to actually move the robot

    #  group.set_planner_id("RRTstarkConfigDefault")
    #  group.allow_replanning(True)

    ## Planning with collision detection can be slow.  Lets set the planning time
    ## to be sure the planner has enough time to plan around the box.  10 seconds
    ## should be plenty.
    #  group.set_planning_time(5.0)

    plan1 = group.plan()

    ## Moving to a pose goal
    ## ^^^^^^^^^^^^^^^^^^^^^
    ##
    ## Moving to a pose goal is similar to the step above
    ## except we now use the go() function. Note that
    ## the pose goal we had set earlier is still active
    ## and so the robot will try to move to that goal. We will
    ## not use that function in this tutorial since it is
    ## a blocking function and requires a controller to be active
    ## and report success on execution of a trajectory.

    # Uncomment below line when working with a real robot
    #  group.go(wait=True)

    print "============ DONE PLANNING ============"
    sys.exit("DONE PLANNING")
    ## Sleep to give Rviz time to visualize the plan. */
    rospy.sleep(5)
    group.clear_pose_targets()
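The orientation computed in the callback above is the classic half-vector construction of the quaternion rotating one unit vector onto another: with h = u + v, the scalar part is u . h and the vector part is u x h, normalized at the end, with parallel and antiparallel vectors special-cased. A minimal standalone sketch (the 180-degree fallback about z matches the callback, which fixes u = [1, 0, 0]):

import numpy as np

def quat_from_vectors(u, v):
    # u is assumed to be a unit vector; v is normalized here.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    v = v / np.linalg.norm(v)
    if np.allclose(u, v):
        return np.array([0., 0., 0., 1.])   # identity, (x, y, z, w) order
    if np.allclose(u, -v):
        return np.array([0., 0., 1., 0.])   # 180 degrees about z, as above
    half = u + v                             # no need to normalize h itself
    q = np.concatenate([np.cross(u, half), [np.dot(u, half)]])
    return q / np.linalg.norm(q)             # final normalization fixes scale

print(quat_from_vectors([1, 0, 0], [0, 1, 0]))  # 90 deg about z: [0 0 0.707 0.707]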
Beispiel #19
0
def coordinate_genot_ss(genotype_file=None,
                        hdf5_file=None,
                        genetic_map_dir=None,
                        check_mafs=False,
                        min_maf =0.01):
    """
    Assumes plink BED files.  Imputes missing genotypes.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
#        num_individs = len(gf['chrom_1']['snps'][:, 0])
#     Y = sp.array(gf['indivs']['phenotype'][...] == 'Case', dtype='int8')
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens)==1:
        print 'Unable to find phenotype values.'
        has_phenotype=False
    elif len(unique_phens)==2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins)==2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases'%(cc_bins[0], cc_bins[1])
        has_phenotype=True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype=True
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    #Figure out chromosomes and positions by looking at SNPs.  
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci] 

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)
    
    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d'%chrom
        print 'Working on chromosome: %s'%chr_str
        
        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(ss_sids), 'Some duplicates?'

        #Figure out filters:
        g_filter = sp.in1d(g_sids,ss_sids)
        ss_filter = sp.in1d(ss_sids,g_sids)

        #Order by SNP IDs
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)

        g_indices = []
        for g_i in g_order:
            if g_filter[g_i]:
                g_indices.append(g_i)

        ss_indices = []
        for ss_i in ss_order:
            if ss_filter[ss_i]:
                ss_indices.append(ss_i)

        g_nts = chrom_d['nts']
        snp_indices = chrom_d['snp_indices']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print 'Found %d SNPs present in both datasets'%(len(g_indices))

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]
            ss_freqs_list=[]
        
        ok_indices = {'g':[], 'ss':[]}
        for g_i, ss_i in it.izip(g_indices, ss_indices):
            
            #Is the nucleotide ambiguous?
            #g_nt = [recode_dict[g_nts[g_i][0]],recode_dict[g_nts[g_i][1]]
            g_nt = [g_nts[g_i][0],g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts +=1
                tot_num_non_matching_nts += 1
                continue
            
            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1                
                continue

            ss_nt = ss_nts[ss_i]
            #Are the nucleotides the same?
            flip_nts = False
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
            if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                # Opposite strand nucleotides
                flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                if flip_nts:
                    betas[ss_i] = -betas[ss_i]
                    log_odds[ss_i] = -log_odds[ss_i]
                    if 'freqs' in ssg.keys():
                        ss_freqs[ss_i] = 1-ss_freqs[ss_i]
                else:
#                     print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
#                         (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                        
                    continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print '%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]
        
        #Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']] #Pinpoint where the SNPs are in the file.
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        print 'raw_snps.shape=', raw_snps.shape

        snp_stds = sp.sqrt(2*freqs*(1-freqs)) #sp.std(raw_snps, 1) 
        snp_means = freqs*2 #sp.mean(raw_snps, 1)

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs-(1-freqs))>0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample'%sp.sum(freq_discrepancy_snp)
                print freqs[freq_discrepancy_snp]
                print ss_freqs[freq_discrepancy_snp]
                
                #Filter freq_discrepancy_snps
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                 
        
        #Filter minor allele frequency SNPs.
        maf_filter = (freqs>min_maf)*(freqs<(1-min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum<=n_snps, "WTF?"
        if sp.sum(maf_filter)<n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            freqs = freqs[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
            
            
            print '%d SNPs with MAF < %0.3f were filtered'%(n_snps-maf_filter_sum,min_maf)

        print '%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom)
        
        rb_prs = sp.dot(sp.transpose(raw_snps), log_odds)
        if has_phenotype:
            print 'Normalizing SNPs'
            snp_means.shape = (len(raw_snps),1)
            snp_stds.shape = (len(raw_snps),1)
            snps = (raw_snps - snp_means) / snp_stds
            assert snps.shape==raw_snps.shape, 'Aha!'
            snp_stds = snp_stds.flatten()
            snp_means = snp_means.flatten()
            prs = sp.dot(sp.transpose(snps), betas)
            corr = sp.corrcoef(Y, prs)[0, 1]
            corr_list.append(corr)
            print 'PRS correlation for chromosome %d was %0.4f' % (chrom, corr)
            rb_corr = sp.corrcoef(Y, rb_prs)[0, 1]
            rb_corr_list.append(rb_corr)
            print 'Raw effect sizes PRS correlation for chromosome %d was %0.4f' % (chrom, rb_corr)
        
        sid_set = set(sids)
        if genetic_map_dir is not None:
            genetic_map = [] 
            with gzip.open(genetic_map_dir+'chr%d.interpolated_genetic_map.gz'%chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])
        
        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_ref', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds)
        ofg.create_dataset('snp_means_ref', data=snp_means)
        ofg.create_dataset('freqs_ref', data=freqs)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
#         print 'Sum of squared effect sizes:', sp.sum(betas ** 2)
#         print 'Sum of squared log odds:', sp.sum(log_odds ** 2)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=rb_prs)
        if has_phenotype:
            risk_scores += prs
        rb_risk_scores += rb_prs
        num_common_snps += len(betas)
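A note on the nucleotide matching above: the genotype and summary-statistics alleles can agree directly, agree on the opposite strand, or agree with the two alleles in swapped order, in which case the effect signs (betas and log odds) are flipped. A minimal sketch of that decision with hypothetical inputs:

import numpy as np

opp_strand = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

def match_alleles(g_nt, ss_nt, beta):
    os_g_nt = [opp_strand[g_nt[0]], opp_strand[g_nt[1]]]
    if list(g_nt) == list(ss_nt) or os_g_nt == list(ss_nt):
        return beta        # same reference allele, keep the sign
    if list(g_nt) == list(ss_nt)[::-1] or os_g_nt == list(ss_nt)[::-1]:
        return -beta       # alleles swapped, flip the effect sign
    return None            # non-matching SNP: exclude it

print(match_alleles(('A', 'C'), ('C', 'A'), 0.3))  # -> -0.3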
Beispiel #20
0
def coordinate_genotypes_ss_w_ld_ref(genotype_file=None,
                                     reference_genotype_file=None,
                                     hdf5_file=None,
                                     genetic_map_dir=None,
                                     check_mafs=False,
                                     min_maf=0.01):
    #   recode_dict = {1:'A', 2:'T', 3:'C', 4:'G'} #1K genomes recoding..
    print 'Coordinating things w genotype file: %s \nref. genot. file: %s' % (
        genotype_file, reference_genotype_file)
    plinkf = plinkfile.PlinkFile(genotype_file)

    #Loads only the individuals... (I think?)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]

    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print 'Unable to find phenotype values.'
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1])
        has_phenotype = True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype = True

    #Figure out chromosomes and positions.
    print 'Parsing validation genotype bim file'
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()

    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print 'Parsing LD reference genotype bim file'
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()

    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)
    #     chr_dict_ref = _get_chrom_dict_bim_(reference_genotype_file+'.bim', chromosomes)

    #Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file.keys(
    ), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)

    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    #corr_list = []

    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0
    tot_num_non_matching_nts = 0

    #Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g': [], 'rg': [], 'ss': []}

        chr_str = 'chrom_%d' % chrom
        print 'Working on chromosome: %s' % chr_str

        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        ssg = ssf['chrom_%d' % chrom]
        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print 'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.' % (
            len(g_sids), len(rg_sids), len(ss_sids))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print 'Found %d SNPs on chrom %d that were common across all datasets' % (
            len(common_sids), chrom)

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []

        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid] = i

        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid] = i

        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid] = i

        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])

        #order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        #order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        #Get the other two maps
        for sid in common_sids:
            rg_snp_map.append(rg_sid_dict[sid])

        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])

        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
        #         rg_nts_l = []
        #         for nt in rg_nts_ok:
        #             rg_nts_l.append([recode_dict[nt[0]],recode_dict[nt[1]]])
        #         rg_nts_ok = sp.array(rg_nts_l)
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(
            g_nts[g_snp_map] == ss_nts[ss_snp_map]) / 2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map]) / 2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok) / 2.0
        print 'Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d' % (
            len(g_snp_map), g_rg_nt_concord_count, g_ss_nt_concord_count,
            rg_ss_nt_concord_count)
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count

        num_non_matching_nts = 0
        num_ambig_nts = 0

        #Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):

            #To make sure, is the SNP id the same?
            assert g_sids[g_i] == rg_sids[rg_i] == ss_sids[
                ss_i], 'Some issues with coordinating the genotypes.'

            g_nt = g_nts[g_i]
            rg_nt = rg_nts[rg_i]
            #             rg_nt = [recode_dict[rg_nts[rg_i][0]],recode_dict[rg_nts[rg_i][1]]]
            ss_nt = ss_nts[ss_i]

            #Is the nucleotide ambiguous.
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts += 1
                tot_num_non_matching_nts += 1
                continue

            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1
                continue

            os_g_nt = sp.array(
                [opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            flip_nts = False
            if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and
                    (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0]
                                == ss_nt[1]) or (os_g_nt[1] == ss_nt[0]
                                                 and os_g_nt[0] == ss_nt[1])
                    #Try flipping the SS nt
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    else:
                        print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                            (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

                else:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                    # Opposite strand nucleotides

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)

            ok_nts.append(g_nt)
#             if flip_nts:
#                 ok_nts.append([ss_nt[1],ss_nt[0]])
#             else:
#                 ok_nts.append(ss_nt)

#print '%d SNPs in LD references to be flipped.'%((len(ref_snp_directions)-sp.sum(ref_snp_directions))/2.0)
        print '%d SNPs had ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts
        print '%d SNPs were retained on chromosome %d.' % (len(
            ok_indices['g']), chrom)

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        #         order = sp.argsort(positions)
        #         sorted_positions = positions[order]
        #         assert sp.all(sorted_positions==positions), 'Perhaps something is wrong here?'
        #         ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        #         ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])

        #Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[
            ok_indices['g']]  #Pinpoint where the SNPs are in the file.
        raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)

        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        snp_indices_ref = snp_indices_ref[
            ok_indices['rg']]  #Pinpoint where the SNPs are in the file.
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file,
                                                     snp_indices_ref)

        snp_stds_ref = sp.sqrt(2 * freqs_ref * (1 - freqs_ref))
        snp_means_ref = freqs_ref * 2

        snp_stds = sp.sqrt(2 * freqs * (1 - freqs))
        snp_means = freqs * 2

        betas = betas[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))
        log_odds = log_odds[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)  #[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #For debugging...
        #         g_sids = sp.array(chrom_d['sids'])[ok_indices['g']]
        #         rg_sids = sp.array(chrom_d_ref['sids'])[ok_indices['rg']]
        #         ss_sids = ssg['sids'][...][ok_indices['ss']]
        #         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample' % sp.sum(
                    freq_discrepancy_snp)
                #                 print freqs[freq_discrepancy_snp]
                #                 print ss_freqs[freq_discrepancy_snp]

                #Filter freq_discrepancy_snps
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                #For debugging...
#         if sp.any(freq_discrepancy_snp):
#             g_sids = g_sids[ok_freq_snps]
#             rg_sids = rg_sids[ok_freq_snps]
#             ss_sids = ss_sids[ok_freq_snps]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

#Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if sp.sum(maf_filter) < n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]


#         if sp.sum(maf_filter)<n_snps:
#             g_sids = g_sids[maf_filter]
#             rg_sids = rg_sids[maf_filter]
#             ss_sids = ss_sids[maf_filter]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        maf_adj_prs = sp.dot(log_odds, raw_snps)
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print 'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (
                chrom, maf_adj_corr)

        sid_set = set(sids)  # needed below when filtering the genetic map
        genetic_map = []
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir +
                           'chr%d.interpolated_genetic_map.gz' % chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])

        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref',
                           data=raw_ref_snps,
                           compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)
        #         print 'Sum betas', sp.sum(betas ** 2)
        #ofg.create_dataset('prs', data=prs)

        #risk_scores += prs
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
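The maf_adj_prs line above is the whole polygenic-score computation: with raw_snps an (m SNPs x n individuals) allele-count matrix and log_odds the per-SNP effect sizes, the matrix product yields one weighted allele-count sum per individual. A minimal sketch with hypothetical toy numbers:

import numpy as np

raw_snps = np.array([[0, 1, 2],    # hypothetical genotypes, 2 SNPs x 3 people
                     [2, 1, 0]])
log_odds = np.array([0.2, -0.1])   # hypothetical per-SNP effect sizes
prs = log_odds.dot(raw_snps)       # one score per individual
print(prs)                         # -> [-0.2  0.1  0.4]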
Beispiel #21
0
def coordinate_genotypes_ss_w_ld_ref(genotype_file = None,
                                    reference_genotype_file = None,
                                    hdf5_file = None,
                                    genetic_map_dir=None,
                                    check_mafs=False,
                                    min_maf=0.01):
#   recode_dict = {1:'A', 2:'T', 3:'C', 4:'G'} #1K genomes recoding..
    print 'Coordinating things w genotype file: %s \nref. genot. file: %s'%(genotype_file, reference_genotype_file) 
    plinkf = plinkfile.PlinkFile(genotype_file)
    
    #Loads only the individuals... (I think?)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    
    unique_phens = sp.unique(Y)
    if len(unique_phens)==1:
        print 'Unable to find phenotype values.'
        has_phenotype=False
    elif len(unique_phens)==2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins)==2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases'%(cc_bins[0], cc_bins[1])
        has_phenotype=True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype=True

    #Figure out chromosomes and positions.  
    print 'Parsing validation genotype bim file'
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci] 

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    
    chr_dict = _get_chrom_dict_(loci, chromosomes)

    print 'Parsing LD reference genotype bim file'
    plinkf_ref = plinkfile.PlinkFile(reference_genotype_file)
    loci_ref = plinkf_ref.get_loci()
    plinkf_ref.close()
    
    chr_dict_ref = _get_chrom_dict_(loci_ref, chromosomes)
#     chr_dict_ref = _get_chrom_dict_bim_(reference_genotype_file+'.bim', chromosomes)
    
    #Open HDF5 file and prepare out data
    assert not 'iids' in hdf5_file.keys(), 'Something is wrong with the HDF5 file?'
    if has_phenotype:
        hdf5_file.create_dataset('y', data=Y)
    
    hdf5_file.create_dataset('fids', data=fids)
    hdf5_file.create_dataset('iids', data=iids)
    ssf = hdf5_file['sum_stats']
    cord_data_g = hdf5_file.create_group('cord_data')

    maf_adj_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    #corr_list = []
    
    tot_g_ss_nt_concord_count = 0
    tot_rg_ss_nt_concord_count = 0
    tot_g_rg_nt_concord_count = 0    
    tot_num_non_matching_nts = 0
   
    #Now iterate over chromosomes
    for chrom in chromosomes:
        ok_indices = {'g':[], 'rg':[], 'ss':[]}
        
        chr_str = 'chrom_%d'%chrom
        print 'Working on chromosome: %s'%chr_str
        
        chrom_d = chr_dict[chr_str]
        chrom_d_ref = chr_dict_ref[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        ssg = ssf['chrom_%d' % chrom]
        g_sids = chrom_d['sids']
        rg_sids = chrom_d_ref['sids']
        ss_sids = ssg['sids'][...]
        print 'Found %d SNPs in validation data, %d SNPs in LD reference data, and %d SNPs in summary statistics.'%(len(g_sids), len(rg_sids), len(ss_sids))
        common_sids = sp.intersect1d(ss_sids, g_sids)
        common_sids = sp.intersect1d(common_sids, rg_sids)
        print 'Found %d SNPs on chrom %d that were common across all datasets'%(len(common_sids), chrom)

        ss_snp_map = []
        g_snp_map = []
        rg_snp_map = []
        
        ss_sid_dict = {}
        for i, sid in enumerate(ss_sids):
            ss_sid_dict[sid]=i

        g_sid_dict = {}
        for i, sid in enumerate(g_sids):
            g_sid_dict[sid]=i

        rg_sid_dict = {}
        for i, sid in enumerate(rg_sids):
            rg_sid_dict[sid]=i
            
        for sid in common_sids:
            g_snp_map.append(g_sid_dict[sid])
        
        #order by positions
        g_positions = sp.array(chrom_d['positions'])[g_snp_map]
        order = sp.argsort(g_positions)
        #order = order.tolist()
        g_snp_map = sp.array(g_snp_map)[order]
        g_snp_map = g_snp_map.tolist()
        common_sids = sp.array(common_sids)[order]

        #Get the other two maps
        for sid in common_sids:
            rg_snp_map.append(rg_sid_dict[sid])
        
        for sid in common_sids:
            ss_snp_map.append(ss_sid_dict[sid])
            
        
        g_nts = sp.array(chrom_d['nts'])
        rg_nts = sp.array(chrom_d_ref['nts'])
        rg_nts_ok = sp.array(rg_nts)[rg_snp_map]
#         rg_nts_l = []
#         for nt in rg_nts_ok:
#             rg_nts_l.append([recode_dict[nt[0]],recode_dict[nt[1]]])
#         rg_nts_ok = sp.array(rg_nts_l)
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]

        g_ss_nt_concord_count = sp.sum(g_nts[g_snp_map] == ss_nts[ss_snp_map])/2.0
        rg_ss_nt_concord_count = sp.sum(rg_nts_ok == ss_nts[ss_snp_map])/2.0
        g_rg_nt_concord_count = sp.sum(g_nts[g_snp_map] == rg_nts_ok)/2.0
        print 'Nucleotide concordance counts out of %d genotypes: vg-g: %d, vg-ss: %d, g-ss: %d'%(len(g_snp_map),g_rg_nt_concord_count, g_ss_nt_concord_count, rg_ss_nt_concord_count)
        tot_g_ss_nt_concord_count += g_ss_nt_concord_count
        tot_rg_ss_nt_concord_count += rg_ss_nt_concord_count
        tot_g_rg_nt_concord_count += g_rg_nt_concord_count


        num_non_matching_nts = 0
        num_ambig_nts = 0


        #Identifying which SNPs have nucleotides that are ok..
        ok_nts = []
        for g_i, rg_i, ss_i in it.izip(g_snp_map, rg_snp_map, ss_snp_map):
            
            #To make sure, is the SNP id the same?
            assert g_sids[g_i]==rg_sids[rg_i]==ss_sids[ss_i], 'Some issues with coordinating the genotypes.'
            
            g_nt = g_nts[g_i]
            rg_nt = rg_nts[rg_i]
#             rg_nt = [recode_dict[rg_nts[rg_i][0]],recode_dict[rg_nts[rg_i][1]]]
            ss_nt = ss_nts[ss_i]

            #Is the nucleotide ambiguous.
            g_nt = [g_nts[g_i][0],g_nts[g_i][1]]
            if tuple(g_nt) in ambig_nts:
                num_ambig_nts +=1
                tot_num_non_matching_nts += 1                
                continue
            
            #First check if nucleotide is sane?
            if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                num_non_matching_nts += 1
                tot_num_non_matching_nts += 1                
                continue
            
            os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])

            flip_nts = False
            if not ((sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)) and (sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt))):
                if sp.all(g_nt == rg_nt) or sp.all(os_g_nt == rg_nt):
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    #Try flipping the SS nt
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]                        
                        log_odds[ss_i] = -log_odds[ss_i]    
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1-ss_freqs[ss_i]
                    else:
                        print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                            (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

                    
                else:
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue
                    # Opposite strand nucleotides
            
           
            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['rg'].append(rg_i)
            ok_indices['ss'].append(ss_i)

            ok_nts.append(g_nt)
#             if flip_nts:
#                 ok_nts.append([ss_nt[1],ss_nt[0]])
#             else:
#                 ok_nts.append(ss_nt)                

                        
        #print '%d SNPs in LD references to be flipped.'%((len(ref_snp_directions)-sp.sum(ref_snp_directions))/2.0)
        print '%d SNPs had ambiguous nucleotides.' % num_ambig_nts 
        print '%d SNPs were excluded due to nucleotide issues.' % num_non_matching_nts 
        print '%d SNPs were retained on chromosome %d.' % (len(ok_indices['g']), chrom)

        #Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
#         order = sp.argsort(positions)
#         sorted_positions = positions[order]
#         assert sp.all(sorted_positions==positions), 'Perhaps something is wrong here?'
#         ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
#         ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])

        
        #Now parse SNPs ..
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']] #Pinpoint where the SNPs are in the file.
        raw_snps,freqs = _parse_plink_snps_(genotype_file, snp_indices)
        
        snp_indices_ref = sp.array(chrom_d_ref['snp_indices'])
        snp_indices_ref = snp_indices_ref[ok_indices['rg']] #Pinpoint where the SNPs are in the file.
        raw_ref_snps, freqs_ref = _parse_plink_snps_(reference_genotype_file, snp_indices_ref)
        
        
        snp_stds_ref = sp.sqrt(2*freqs_ref*(1-freqs_ref)) 
        snp_means_ref = freqs_ref*2

        snp_stds = sp.sqrt(2*freqs*(1-freqs)) 
        snp_means = freqs*2
        
        betas = betas[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))
        log_odds = log_odds[ok_indices['ss']]  # * sp.sqrt(freqs * (1 - freqs))

        ps = ssg['ps'][...][ok_indices['ss']]
        nts = sp.array(ok_nts)#[order]
        sids = ssg['sids'][...][ok_indices['ss']]

        #For debugging...
#         g_sids = sp.array(chrom_d['sids'])[ok_indices['g']]
#         rg_sids = sp.array(chrom_d_ref['sids'])[ok_indices['rg']]
#         ss_sids = ssg['sids'][...][ok_indices['ss']]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'
        
        #Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
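            # Flag SNPs whose summary-statistic frequency is more than 0.15 away
            # from the complement of the validation frequency (the two sources
            # appear to report frequencies for opposite allele codings).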
            freq_discrepancy_snp = sp.absolute(ss_freqs-(1-freqs))>0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs were filtered due to high allele frequency discrepancy between summary statistics and validation sample'%sp.sum(freq_discrepancy_snp)
#                 print freqs[freq_discrepancy_snp]
#                 print ss_freqs[freq_discrepancy_snp]
                 
                #Filter freq_discrepancy_snps
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
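                # sp.negative above acts as element-wise logical NOT on the boolean
                # mask (pre-1.13 numpy behaviour); sp.logical_not is the unambiguous
                # modern spelling.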
                raw_snps = raw_snps[ok_freq_snps]
                snp_stds = snp_stds[ok_freq_snps]
                snp_means = snp_means[ok_freq_snps]
                raw_ref_snps = raw_ref_snps[ok_freq_snps]
                snp_stds_ref = snp_stds_ref[ok_freq_snps]
                snp_means_ref = snp_means_ref[ok_freq_snps]
                freqs = freqs[ok_freq_snps]
                freqs_ref = freqs_ref[ok_freq_snps]
                ps = ps[ok_freq_snps]
                positions = positions[ok_freq_snps]
                nts = nts[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]
                #For debugging...
#         if sp.any(freq_discrepancy_snp):
#             g_sids = g_sids[ok_freq_snps]
#             rg_sids = rg_sids[ok_freq_snps]
#             ss_sids = ss_sids[ok_freq_snps]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'

        
        
        #Filter minor allele frequency SNPs.
        maf_filter = (freqs>min_maf)*(freqs<(1-min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum<=n_snps, "WTF?"
        if sp.sum(maf_filter)<n_snps:
            raw_snps = raw_snps[maf_filter]
            snp_stds = snp_stds[maf_filter]
            snp_means = snp_means[maf_filter]
            raw_ref_snps = raw_ref_snps[maf_filter]
            snp_stds_ref = snp_stds_ref[maf_filter]
            snp_means_ref = snp_means_ref[maf_filter]
            freqs = freqs[maf_filter]
            freqs_ref = freqs_ref[maf_filter]
            ps = ps[maf_filter]
            positions = positions[maf_filter]
            nts = nts[maf_filter]
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]
#         if sp.sum(maf_filter)<n_snps:
#             g_sids = g_sids[maf_filter]
#             rg_sids = rg_sids[maf_filter]
#             ss_sids = ss_sids[maf_filter]
#         assert sp.all(g_sids==rg_sids) and sp.all(rg_sids==ss_sids), 'WTF?'
        
        
        
        maf_adj_prs = sp.dot(log_odds, raw_snps)
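        # raw_snps has shape (n_snps, n_indivs), so this dot product yields one
        # log-odds-weighted polygenic score per individual.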
        if has_phenotype:
            maf_adj_corr = sp.corrcoef(Y, maf_adj_prs)[0, 1]
            print 'Log odds, per genotype PRS correlation w phenotypes for chromosome %d was %0.4f' % (chrom, maf_adj_corr)

        genetic_map = [] 
        if genetic_map_dir is not None:
            with gzip.open(genetic_map_dir+'chr%d.interpolated_genetic_map.gz'%chrom) as f:
                for line in f:
                    l = line.split()
                    if l[0] in sid_set:
                        genetic_map.append(l[0])
        
        
        print 'Now storing coordinated data to HDF5 file.'
        ofg = cord_data_g.create_group('chrom_%d' % chrom)
        ofg.create_dataset('raw_snps_val', data=raw_snps, compression='lzf')
        ofg.create_dataset('snp_stds_val', data=snp_stds)
        ofg.create_dataset('snp_means_val', data=snp_means)
        ofg.create_dataset('freqs_val', data=freqs)
        ofg.create_dataset('raw_snps_ref', data=raw_ref_snps, compression='lzf')
        ofg.create_dataset('snp_stds_ref', data=snp_stds_ref)
        ofg.create_dataset('snp_means_ref', data=snp_means_ref)
        ofg.create_dataset('freqs_ref', data=freqs_ref)
        ofg.create_dataset('nts', data=nts)
        ofg.create_dataset('ps', data=ps)
        ofg.create_dataset('positions', data=positions)
        ofg.create_dataset('sids', data=sids)
        if genetic_map_dir is not None:
            ofg.create_dataset('genetic_map', data=genetic_map)
        ofg.create_dataset('betas', data=betas)
        ofg.create_dataset('log_odds', data=log_odds)
        ofg.create_dataset('log_odds_prs', data=maf_adj_prs)
#         print 'Sum betas', sp.sum(betas ** 2)
        #ofg.create_dataset('prs', data=prs)
        
        
        #risk_scores += prs
        maf_adj_risk_scores += maf_adj_prs
        num_common_snps += len(betas)
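
The nucleotide coordination above (and again in coordinate_ss below) repeats one decision: do the genotype and summary-statistic alleles match as given, match on the opposite strand, or match only after swapping the effect allele? A standalone sketch of that logic, simplified to two datasets (hypothetical helper name; the dictionaries stand in for the opp_strand_dict and ambig_nts these snippets assume to be defined elsewhere):

OPP_STRAND = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}         # stand-in for opp_strand_dict
AMBIG_NTS = {('A', 'T'), ('T', 'A'), ('C', 'G'), ('G', 'C')}  # stand-in for ambig_nts

def coordinate_alleles(g_nt, ss_nt):
    """Classify a genotype/summary-statistic nucleotide pair as 'ok'
    (same alleles, possibly on the opposite strand), 'flip' (effect
    allele reversed, so betas and log odds must be negated), or
    'mismatch' (ambiguous or irreconcilable)."""
    if tuple(g_nt) in AMBIG_NTS:
        return 'mismatch'  # strand-ambiguous pair; cannot be resolved
    os_g_nt = [OPP_STRAND[g_nt[0]], OPP_STRAND[g_nt[1]]]
    if list(g_nt) == list(ss_nt) or os_g_nt == list(ss_nt):
        return 'ok'
    if (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or \
       (os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1]):
        return 'flip'
    return 'mismatch'

assert coordinate_alleles(['A', 'G'], ['A', 'G']) == 'ok'
assert coordinate_alleles(['A', 'G'], ['G', 'A']) == 'flip'
assert coordinate_alleles(['A', 'G'], ['A', 'C']) == 'mismatch'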
Beispiel #22
def parse_1KG_snp_info(
        input_file='/project/TheHonestGene/faststorage/1Kgenomes/phase3/1k_genomes_hg.hdf5',
        out_file='/project/PCMA/faststorage/1_DATA/1k_genomes/1K_SNP_INFO_EUR_MAF0.05.hdf5',
        filter_ambiguous=True,
        maf_thres=0.05):
    print 'Generating a SNP info file'
    ih5f = h5py.File(input_file)
    oh5f = h5py.File(out_file)
    num_indivs = len(ih5f['indivs']['continent'])
    eur_filter = ih5f['indivs']['continent'][...] == 'EUR'
    num_eur_indivs = sp.sum(eur_filter)
    print 'Number of European individuals: %d \nTotal number of individuals: %d' % (
        num_eur_indivs, num_indivs)
    std_thres = sp.sqrt(2.0 * (1 - maf_thres) * (maf_thres))
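    # std_thres is the genotype-std equivalent of the MAF cutoff, sqrt(2p(1-p));
    # it is unused below, where the filter is applied to the MAFs directly.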

    for chrom in range(1, 23):
        print 'Working on Chromosome %d' % chrom
        chrom_str = 'chr%d' % chrom

        print 'Loading SNPs and data'
        snps = sp.array(ih5f[chrom_str]['calldata']['snps'][...], dtype='int8')
        print 'Excluding non-European individuals'
        snps = snps[:, eur_filter]

        print "Loading other SNP information"
        snp_ids = ih5f[chrom_str]['variants']['ID'][...]
        positions = ih5f[chrom_str]['variants']['POS'][...]

        print 'Loading NTs'
        ref_nts = ih5f[chrom_str]['variants']['REF'][...]
        alt_nts = ih5f[chrom_str]['variants']['ALT'][...]

        print 'Filtering multi-allelic SNPs'
        multi_allelic_filter = sp.negative(
            ih5f[chrom_str]['variants']['MULTI_ALLELIC'][...])
        snps = snps[multi_allelic_filter]
        ref_nts = ref_nts[multi_allelic_filter]
        alt_nts = alt_nts[multi_allelic_filter]
        snp_ids = snp_ids[multi_allelic_filter]
        positions = positions[multi_allelic_filter]

        print 'Filtering SNPs with missing NT information'
        nt_filter = sp.in1d(ref_nts, ok_nts)
        nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
        if sp.sum(nt_filter) < len(nt_filter):
            snps = snps[nt_filter]
            ref_nts = ref_nts[nt_filter]
            alt_nts = alt_nts[nt_filter]
            snp_ids = snp_ids[nt_filter]
            positions = positions[nt_filter]

        print 'Filtering SNPs with MAF <', maf_thres
        # Allele frequency is half the mean 0/1/2 genotype.
        afs = sp.sum(snps, axis=1) / (2.0 * num_eur_indivs)
        assert sp.all(0.0 <= afs) and sp.all(afs <= 1.0), 'AF is out of range'
        mafs = sp.minimum(afs, 1.0 - afs)
        maf_filter = mafs >= maf_thres
        snps = snps[maf_filter]
        ref_nts = ref_nts[maf_filter]
        alt_nts = alt_nts[maf_filter]
        snp_ids = snp_ids[maf_filter]
        positions = positions[maf_filter]
        mafs = mafs[maf_filter]

        g = oh5f.create_group(chrom_str)
        g.create_dataset('sids', data=snp_ids)
        g.create_dataset('positions', data=positions)
        g.create_dataset('eur_mafs', data=mafs)
        g.create_dataset('ref', data=ref_nts)
        g.create_dataset('alt', data=alt_nts)
        oh5f.flush()
    oh5f.close()
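
The MAF cutoff applied here and the standard-deviation cutoff used in gen_unrelated_eur_1k_data below are the same Hardy-Weinberg quantity in two guises: a 0/1/2-coded genotype with allele frequency p has mean 2p and standard deviation sqrt(2p(1-p)). A self-contained check on simulated toy genotypes (numpy used directly instead of the scipy alias; no real data involved):

import numpy as np

maf_thres = 0.05
rng = np.random.RandomState(0)
p_true = rng.uniform(0.01, 0.5, size=200)                  # per-SNP allele frequencies
snps = rng.binomial(2, p_true[:, None], size=(200, 500))   # 0/1/2 genotypes

afs = snps.sum(axis=1) / (2.0 * snps.shape[1])             # estimated allele frequencies
mafs = np.minimum(afs, 1.0 - afs)
maf_filter = mafs >= maf_thres

# The equivalent filter on the genotype standard deviation, sqrt(2*p*(1-p)):
std_thres = np.sqrt(2.0 * maf_thres * (1.0 - maf_thres))
std_filter = snps.std(axis=1) > std_thres

# The two filters agree except for sampling noise right at the cutoff.
assert np.mean(maf_filter == std_filter) > 0.8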
Beispiel #23
def coordinate_ss(genotype_file=None, ssfformat=None, hdf5_file=None, outfile=None,
                  genetic_map_dir=None,
                  check_mafs=False,
                  min_maf=0.01,
                  skip_coordination=False, keep_all=False, skip_ambiguous=False):
    """
    Assumes plink BED files.  Imputes missing genotypes.
    """
    plinkf = plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    num_individs = len(samples)
    #        num_individs = len(gf['chrom_1']['snps'][:, 0])
    #     Y = sp.array(gf['indivs']['phenotype'][...] == 'Case', dtype='int8')
    Y = [s.phenotype for s in samples]
    fids = [s.fid for s in samples]
    iids = [s.iid for s in samples]
    unique_phens = sp.unique(Y)
    if len(unique_phens) == 1:
        print 'Unable to find phenotype values.'
        has_phenotype = False
    elif len(unique_phens) == 2:
        cc_bins = sp.bincount(Y)
        assert len(cc_bins) == 2, 'Problems with loading phenotype'
        print 'Loaded %d controls and %d cases' % (cc_bins[0], cc_bins[1])
        has_phenotype = True
    else:
        print 'Found quantitative phenotype values'
        has_phenotype = True
    risk_scores = sp.zeros(num_individs)
    rb_risk_scores = sp.zeros(num_individs)
    num_common_snps = 0
    corr_list = []
    rb_corr_list = []

    ssf = hdf5_file['sum_stats']
    ssf_dict={}

    # Figure out chromosomes and positions by looking at SNPs.
    loci = plinkf.get_loci()
    plinkf.close()
    gf_chromosomes = [l.chromosome for l in loci]

    chromosomes = sp.unique(gf_chromosomes)
    chromosomes.sort()
    chr_dict = _get_chrom_dict_(loci, chromosomes)
    tot_num_non_matching_nts = 0
    for chrom in chromosomes:
        chr_str = 'chrom_%d' % chrom
        chr_col = 'chr%d' % chrom
        print 'Working on chromosome: %s' % chr_str

        chrom_d = chr_dict[chr_str]
        try:
            ssg = ssf['chrom_%d' % chrom]
        except Exception, err_str:
            print err_str
            print 'Did not find chromosome in SS dataset.'
            print 'Continuing.'
            continue

        g_sids = chrom_d['sids']
        g_sid_set = set(g_sids)
        assert len(g_sid_set) == len(g_sids), 'Some duplicates?'
        ss_sids = ssg['sids'][...]
        ss_sid_set = set(ss_sids)
        assert len(ss_sid_set) == len(ss_sids), 'Some duplicates?'

        # Figure out filters:
        g_filter = sp.in1d(g_sids, ss_sids)
        ss_filter = sp.in1d(ss_sids, g_sids)

        # Order by SNP IDs
        g_order = sp.argsort(g_sids)
        ss_order = sp.argsort(ss_sids)

        g_indices = []
        for g_i in g_order:
            if g_filter[g_i]:
                g_indices.append(g_i)

        ss_indices = []
        for ss_i in ss_order:
            if ss_filter[ss_i]:
                ss_indices.append(ss_i)
        g_ntA1=[]
        g_ntA2=[]
        g_nts = chrom_d['nts']
        snp_indices = chrom_d['snp_indices']
        ss_nts = ssg['nts'][...]
        betas = ssg['betas'][...]
        log_odds = ssg['log_odds'][...]

        if ssfformat=="LDSCORE"  or ssfformat == "STANDARD_FUNCT":
            ld_score = ssg['ld_score'][...]  ### LDSCORE
        #### Track allele flips indices ####
        ss_flips = sp.ones(len(ss_indices))
        assert not sp.any(sp.isnan(betas)), 'WTF?'
        # assert not sp.any(sp.isinf(betas)), 'WTF?'

        num_non_matching_nts = 0
        num_ambig_nts = 0
        ok_nts = []
        print 'Found %d SNPs present in both datasets' % (len(g_indices))

        if 'freqs' in ssg.keys():
            ss_freqs = ssg['freqs'][...]
            ss_freqs_list = []

        ok_indices = {'g': [], 'ss': []}
        for g_i, ss_i in it.izip(g_indices, ss_indices):

            # Is the nucleotide ambiguous?
            # g_nt = [recode_dict[g_nts[g_i][0]],recode_dict[g_nts[g_i][1]]
            g_nt = [g_nts[g_i][0], g_nts[g_i][1]]
            g_ntA1.append(g_nt[0])
            g_ntA2.append(g_nt[1])

            if not skip_coordination:
                if not skip_ambiguous:
                    if tuple(g_nt) in ambig_nts:
                        num_ambig_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

                if (not g_nt[0] in valid_nts) or (not g_nt[1] in valid_nts):
                    num_non_matching_nts += 1
                    tot_num_non_matching_nts += 1
                    continue

                ss_nt = ss_nts[ss_i]

                # Are the nucleotides the same?
                flip_nts = False
                os_g_nt = sp.array([opp_strand_dict[g_nt[0]], opp_strand_dict[g_nt[1]]])
                if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
                    # Opposite strand nucleotides
                    flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                    os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
                    if flip_nts:
                        betas[ss_i] = -betas[ss_i]
                        log_odds[ss_i] = -log_odds[ss_i]
                        ss_flips[ss_i] = -1
                        if 'freqs' in ssg.keys():
                            ss_freqs[ss_i] = 1 - ss_freqs[ss_i]
                    else:
                        #                     print "Nucleotides don't match after all?: g_sid=%s, ss_sid=%s, g_i=%d, ss_i=%d, g_nt=%s, ss_nt=%s" % \
                        #                         (g_sids[g_i], ss_sids[ss_i], g_i, ss_i, str(g_nt), str(ss_nt))
                        num_non_matching_nts += 1
                        tot_num_non_matching_nts += 1
                        continue

            # everything seems ok.
            ok_indices['g'].append(g_i)
            ok_indices['ss'].append(ss_i)
            ok_nts.append(g_nt)

        print '%d SNPs were excluded due to ambiguous nucleotides.' % num_ambig_nts
        print '%d SNPs were excluded due to non-matching nucleotides.' % num_non_matching_nts

        # Resorting by position
        positions = sp.array(chrom_d['positions'])[ok_indices['g']]
        order = sp.argsort(positions)
        ok_indices['g'] = list(sp.array(ok_indices['g'])[order])
        ok_indices['ss'] = list(sp.array(ok_indices['ss'])[order])
        positions = positions[order]

        # Parse SNPs
        snp_indices = sp.array(chrom_d['snp_indices'])
        snp_indices = snp_indices[ok_indices['g']]  # Pinpoint where the SNPs are in the file.
        #raw_snps, freqs = _parse_plink_snps_(genotype_file, snp_indices)
        freqs = _parse_plink_snps_freqs_(genotype_file, snp_indices)

        betas = betas[ok_indices['ss']]
        log_odds = log_odds[ok_indices['ss']]
        sids = ssg['sids'][...][ok_indices['ss']]


        if ssfformat == "LDSCORE" or ssfformat == "STANDARD_FUNCT":
            ld_score = ld_score[ok_indices['ss']]  #### LDSCORE
        # Check SNP frequencies..
        if check_mafs and 'freqs' in ssg.keys():
            ss_freqs = ss_freqs[ok_indices['ss']]
            freq_discrepancy_snp = sp.absolute(ss_freqs - (1 - freqs)) > 0.15
            if sp.any(freq_discrepancy_snp):
                print 'Warning: %d SNPs appear to have high frequency discrepancy between summary statistics and validation sample' % sp.sum(
                    freq_discrepancy_snp)
                print freqs[freq_discrepancy_snp]
                print ss_freqs[freq_discrepancy_snp]

                # Filter freq_discrepancy_snps
                ok_freq_snps = sp.negative(freq_discrepancy_snp)
                freqs = freqs[ok_freq_snps]
                sids = sids[ok_freq_snps]
                betas = betas[ok_freq_snps]
                log_odds = log_odds[ok_freq_snps]



                if ssfformat == "LDSCORE" or ssfformat == "STANDARD_FUNCT":
                    ld_score = ld_score[ok_freq_snps]  #### LDSCORE
        # Filter minor allele frequency SNPs.
        maf_filter = (freqs > min_maf) * (freqs < (1 - min_maf))
        maf_filter_sum = sp.sum(maf_filter)
        n_snps = len(maf_filter)
        assert maf_filter_sum <= n_snps, "WTF?"
        if sp.sum(maf_filter) < n_snps:
            sids = sids[maf_filter]
            betas = betas[maf_filter]
            log_odds = log_odds[maf_filter]


            if ssfformat == "LDSCORE" or ssfformat == "STANDARD_FUNCT":
                ld_score = ld_score[maf_filter]

            print '%d SNPs with MAF < %0.3f were filtered' % (n_snps - maf_filter_sum, min_maf)

        print '%d SNPs were retained on chromosome %d.' % (maf_filter_sum, chrom)


        num_common_snps += len(betas)
        ssf_dict[chr_str] = {'betas': betas, 'log_odds': log_odds}
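
The intersection-and-sort alignment at the top of coordinate_ss is the workhorse of all these coordination functions: both ID arrays are masked to their overlap with sp.in1d, and visiting the surviving indices in ID-sorted order makes the two index lists line up pairwise. A toy sketch (made-up SNP IDs; assumes IDs are unique within each array, as the asserts in the function require):

import numpy as np

g_sids = np.array(['rs5', 'rs1', 'rs3', 'rs2'])   # genotype SNP IDs
ss_sids = np.array(['rs2', 'rs3', 'rs9', 'rs1'])  # summary-statistic SNP IDs

g_filter = np.in1d(g_sids, ss_sids)
ss_filter = np.in1d(ss_sids, g_sids)

g_indices = [g_i for g_i in np.argsort(g_sids) if g_filter[g_i]]
ss_indices = [ss_i for ss_i in np.argsort(ss_sids) if ss_filter[ss_i]]

# Paired traversal now visits the same SNP in both datasets.
for g_i, ss_i in zip(g_indices, ss_indices):
    assert g_sids[g_i] == ss_sids[ss_i]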
Beispiel #24
def gen_unrelated_eur_1k_data(input_file='/home/bjarni/TheHonestGene/faststorage/1Kgenomes/phase3/1k_genomes_hg.hdf5' ,
                              out_file='/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5',
                              maf_thres=0.01, max_relatedness=0.05, K_thinning_frac=0.1, debug=False):
    h5f = h5py.File(input_file)
    num_indivs = len(h5f['indivs']['continent'])
    eur_filter = h5f['indivs']['continent'][...] == 'EUR'
    num_eur_indivs = sp.sum(eur_filter)
    print 'Number of European individuals: %d' % num_eur_indivs
    K = sp.zeros((num_eur_indivs, num_eur_indivs), dtype='single')
    num_snps = 0
    std_thres = sp.sqrt(2.0 * (1 - maf_thres) * (maf_thres))

    print 'Calculating kinship'
    for chrom in range(1, 23):
        print 'Working on Chromosome %d' % chrom
        chrom_str = 'chr%d' % chrom
        
        print 'Loading SNPs and data'
        snps = sp.array(h5f[chrom_str]['calldata']['snps'][...], dtype='int8')

        print 'Loading NTs'
        ref_nts = h5f[chrom_str]['variants']['REF'][...]
        alt_nts = h5f[chrom_str]['variants']['ALT'][...]
        
        print 'Filtering multi-allelic SNPs'
        multi_allelic_filter = sp.negative(h5f[chrom_str]['variants']['MULTI_ALLELIC'][...])
        snps = snps[multi_allelic_filter]
        ref_nts = ref_nts[multi_allelic_filter]
        alt_nts = alt_nts[multi_allelic_filter]


        if K_thinning_frac < 1:
            print 'Thinning SNPs for kinship calculation'
            thinning_filter = sp.random.random(len(snps)) < K_thinning_frac
            snps = snps[thinning_filter]
            alt_nts = alt_nts[thinning_filter]
            ref_nts = ref_nts[thinning_filter]

        print 'Filtering SNPs with missing NT information'
        nt_filter = sp.in1d(ref_nts, ok_nts)
        nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
        if sp.sum(nt_filter) < len(nt_filter):
            snps = snps[nt_filter]

        print 'Filtering non-European individuals'
        snps = snps[:, eur_filter]

        print 'Filtering SNPs with MAF <', maf_thres
        snp_stds = sp.std(snps, 1)
        maf_filter = snp_stds.flatten() > std_thres
        snps = snps[maf_filter]
        snp_stds = snp_stds[maf_filter]
        
        print '%d SNPs remaining after all filtering steps.' % len(snps)

        print 'Normalizing SNPs'
        snp_means = sp.mean(snps, 1)
        norm_snps = (snps - snp_means[sp.newaxis].T) / snp_stds[sp.newaxis].T
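        # snp_means[sp.newaxis].T turns a length-n vector into an (n, 1) column so
        # the subtraction and division broadcast across individuals row-wise.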
        
        print 'Updating kinship'        
        K += sp.dot(norm_snps.T, norm_snps)
        num_snps += len(norm_snps)
        assert sp.isclose(sp.sum(sp.diag(K)) / (num_snps * num_eur_indivs), 1.0)

    K = K / float(num_snps)
    print 'Kinship calculation done using %d SNPs\n' % num_snps
    
    # Filter individuals
    print 'Filtering individuals'
    keep_indiv_set = set(range(num_eur_indivs))
    for i in range(num_eur_indivs):
        if i in keep_indiv_set:
            for j in range(i + 1, num_eur_indivs):
                if K[i, j] > max_relatedness:
                    if j in keep_indiv_set:
                        keep_indiv_set.remove(j)
    keep_indivs = list(keep_indiv_set)
    keep_indivs.sort()
    print 'Retained %d individuals\n' % len(keep_indivs)
    
    # Checking that everything is ok!
    K_ok = K[keep_indivs]
    K_ok = K_ok[:, keep_indivs]
    assert (K_ok - sp.tril(K_ok)).max() < max_relatedness

    indiv_filter = sp.zeros(num_indivs, dtype='bool8')
    indiv_filter[(sp.arange(num_indivs)[eur_filter])[keep_indivs]] = 1
    
    assert sp.sum(indiv_filter) == len(keep_indivs)
    
    # Store in new file
    print 'Now storing data.'
    oh5f = h5py.File(out_file, 'w')
    indiv_ids = h5f['indivs']['indiv_ids'][indiv_filter]
    oh5f.create_dataset('indiv_ids', data=indiv_ids)    
    for chrom in range(1, 23):
        print 'Working on Chromosome %d' % chrom
        chrom_str = 'chr%d' % chrom
        
        print 'Loading SNPs and data'
        snps = sp.array(h5f[chrom_str]['calldata']['snps'][...], dtype='int8')
        snp_ids = h5f[chrom_str]['variants']['ID'][...]
        positions = h5f[chrom_str]['variants']['POS'][...]

        print 'Loading NTs'
        ref_nts = h5f[chrom_str]['variants']['REF'][...]
        alt_nts = h5f[chrom_str]['variants']['ALT'][...]
        
        print 'Filtering multi-allelic SNPs'
        multi_allelic_filter = sp.negative(h5f[chrom_str]['variants']['MULTI_ALLELIC'][...])
        snps = snps[multi_allelic_filter]
        ref_nts = ref_nts[multi_allelic_filter]
        alt_nts = alt_nts[multi_allelic_filter]
        positions = positions[multi_allelic_filter]
        snp_ids = snp_ids[multi_allelic_filter]

        print 'Filtering individuals'
        snps = snps[:, indiv_filter]
        
        print 'Filtering SNPs with missing NT information'
        nt_filter = sp.in1d(ref_nts, ok_nts)
        nt_filter = nt_filter * sp.in1d(alt_nts, ok_nts)
        if sp.sum(nt_filter) < len(nt_filter):
            snps = snps[nt_filter]
            ref_nts = ref_nts[nt_filter]
            alt_nts = alt_nts[nt_filter]
            positions = positions[nt_filter]
            snp_ids = snp_ids[nt_filter]
        
        print 'Filtering monomorphic SNPs'
        snp_stds = sp.std(snps, 1)
        mono_morph_filter = snp_stds > 0
        snps = snps[mono_morph_filter]
        ref_nts = ref_nts[mono_morph_filter]
        alt_nts = alt_nts[mono_morph_filter]
        positions = positions[mono_morph_filter]
        snp_ids = snp_ids[mono_morph_filter]
        snp_stds = snp_stds[mono_morph_filter]

        snp_means = sp.mean(snps, 1)

        if debug:
            # Fall back to the full SNP set when no thinning is requested, so the
            # variables below are always defined.
            k_snps = snps
            k_snp_stds = snp_stds
            if K_thinning_frac < 1:
                print 'Thinning SNPs for kinship calculation'
                thinning_filter = sp.random.random(len(snps)) < K_thinning_frac
                k_snps = snps[thinning_filter]
                k_snp_stds = snp_stds[thinning_filter]

            print 'Filtering SNPs with MAF <', maf_thres
            maf_filter = k_snp_stds.flatten() > std_thres
            k_snps = k_snps[maf_filter]
            k_snp_stds = k_snp_stds[maf_filter]
            k_snp_means = sp.mean(k_snps, 1)  # per-SNP means (axis 1 = individuals)

            print 'Verifying that the Kinship makes sense'
            norm_snps = (k_snps - k_snp_means[sp.newaxis].T) / k_snp_stds[sp.newaxis].T
            K = sp.dot(norm_snps.T, norm_snps)
            num_snps += len(norm_snps)
            if sp.isclose(sp.sum(sp.diag(K)) / (num_snps * num_eur_indivs), 1.0) and (K - sp.tril(K)).max() < (max_relatedness * 1.5):
                print 'It looks OK!'
            else:
                raise Exception('Kinship looks wrong?')
        

        nts = sp.array([[nt1, nt2] for nt1, nt2 in izip(ref_nts, alt_nts)])

        print 'Writing to disk'
        cg = oh5f.create_group(chrom_str)
        cg.create_dataset('snps', data=snps)
        cg.create_dataset('snp_means', data=snp_means[sp.newaxis].T)
        cg.create_dataset('snp_stds', data=snp_stds[sp.newaxis].T)
        cg.create_dataset('snp_ids', data=snp_ids)
        cg.create_dataset('positions', data=positions)
        cg.create_dataset('nts', data=nts)
        oh5f.flush()
        print 'Done writing to disk'
        
#         centimorgans = h5f[chrom_str]['centimorgans'][...]
#         cg.create_dataset('centimorgans',data=centimorgans)
#         
#         centimorgan_rates = h5f[chrom_str]['centimorgan_rates'][...]
#         cg.create_dataset('centimorgan_rates',data=centimorgan_rates)
        
    oh5f.close()
    h5f.close()
    print 'Done'
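
The individual filter inside gen_unrelated_eur_1k_data is a greedy pass over the kinship matrix: scan pairs in index order and drop the later member of any retained pair whose kinship exceeds the cutoff. A minimal sketch with a hypothetical 3x3 kinship matrix:

import numpy as np

def greedy_unrelated(K, max_relatedness):
    """Greedily keep individuals so that no retained pair exceeds
    max_relatedness; earlier indices win ties, as in the loop above."""
    n = K.shape[0]
    keep = set(range(n))
    for i in range(n):
        if i in keep:
            for j in range(i + 1, n):
                if j in keep and K[i, j] > max_relatedness:
                    keep.remove(j)
    return sorted(keep)

K = np.array([[1.0, 0.3, 0.0],
              [0.3, 1.0, 0.01],
              [0.0, 0.01, 1.0]])
assert greedy_unrelated(K, 0.05) == [0, 2]  # individual 1 dropped: K[0, 1] > 0.05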
Beispiel #25
def parse_cegs_drosophila_phenotypes(
    phenotype_file='/Users/bjarnivilhjalmsson/data/cegs_lehmann/allphenotypes_5.0_cleaned.tab.reps.hdf5',
):
    """
    Parser for CEGS Drosophila phenotype data
    """
    import pylab
    #Load phenotypes...
    ph5f = h5py.File(phenotype_file)
    #Now take the median and mean of all values for all individuals.
    phen_dict = {}
    for phen in ph5f.keys():
        #First mated
        Y_mated = ph5f[phen]['Y_mated'][...]
        Z_mated = ph5f[phen]['Z_mated'][...]
        sample_filter = sp.negative(sp.isnan(Y_mated))
        Ys_sum = sp.dot(Y_mated[sample_filter], Z_mated[sample_filter])
        rep_count = sp.dot(sp.ones(sum(sample_filter)), Z_mated[sample_filter])
        Y_means = Ys_sum / rep_count
        #Now calculate medians by iteration.
        phen_vals_list = [[] for i in range(216)]
        for i in range(len(Y_mated)):
            ind_i = sp.where(1 == Z_mated[i])[0][0]
            phen_vals_list[ind_i].append(Y_mated[i])
        medians = sp.zeros(216)
        for i, pl in enumerate(phen_vals_list):
            if len(pl) > 0:
                medians[i] = sp.median(pl)
            else:
                medians[i] = sp.nan
        ind_filter = sp.negative(sp.isnan(Y_means))
        if phen == 'Triglyceride':
            ind_filter = (Y_means > 0) * ind_filter

        phen_dict[phen] = {
            'mated': {
                'Y_means': Y_means,
                'rep_count': rep_count,
                'ind_filter': ind_filter,
                'Y_medians': medians
            }
        }

        print 'Plotting phenotype histograms for %s, %s' % (phen, 'mated')
        mated_filtered_means = Y_means[ind_filter]
        pylab.hist(mated_filtered_means)
        pylab.savefig(
            '/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_mated_means.png' %
            (phen))
        pylab.clf()
        mated_filtered_medians = medians[ind_filter]
        pylab.hist(mated_filtered_medians)
        pylab.savefig(
            '/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_mated_medians.png'
            % (phen))
        pylab.clf()

        #Then virgin
        Y_virgin = ph5f[phen]['Y_virgin'][...]
        Z_virgin = ph5f[phen]['Z_virgin'][...]
        sample_filter = sp.negative(sp.isnan(Y_virgin))
        Ys_sum = sp.dot(Y_virgin[sample_filter], Z_virgin[sample_filter])
        rep_count = sp.dot(sp.ones(sum(sample_filter)),
                           Z_virgin[sample_filter])
        Y_means = Ys_sum / rep_count
        #Now calculate medians by iteration.
        phen_vals_list = [[] for i in range(216)]
        for i in range(len(Y_virgin)):
            ind_i = sp.where(1 == Z_virgin[i])[0][0]
            phen_vals_list[ind_i].append(Y_virgin[i])
        medians = sp.zeros(216)
        for i, pl in enumerate(phen_vals_list):
            if len(pl) > 0:
                medians[i] = sp.median(pl)
            else:
                medians[i] = sp.nan
        ind_filter = sp.negative(sp.isnan(Y_means))
        if phen == 'Triglyceride':
            ind_filter = (Y_means > 0) * ind_filter

        phen_dict[phen]['virgin'] = {
            'Y_means': Y_means,
            'rep_count': rep_count,
            'ind_filter': ind_filter,
            'Y_medians': medians
        }

        print 'Plotting phenotype histograms for %s, %s' % (phen, 'virgin')
        virgin_filtered_means = Y_means[ind_filter]
        pylab.hist(virgin_filtered_means)
        pylab.savefig(
            '/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_virgin_means.png'
            % (phen))
        pylab.clf()
        virgin_filtered_medians = medians[ind_filter]
        pylab.hist(virgin_filtered_medians)
        pylab.savefig(
            '/Users/bjarnivilhjalmsson/data/tmp/cegs_hist_%s_virgin_medians.png'
            % (phen))
        pylab.clf()

        means_corr = sp.corrcoef(mated_filtered_means,
                                 virgin_filtered_means)[0, 1]
        medians_corr = sp.corrcoef(mated_filtered_medians,
                                   virgin_filtered_medians)[0, 1]
        print 'Correlation between mated and virgin flies, means: %0.2f, medians: %0.2f' % (
            means_corr, medians_corr)
        phen_dict[phen]['corrs'] = {
            'means': means_corr,
            'medians': medians_corr
        }
    return phen_dict
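
The replicate-averaging in this parser leans on the indicator matrix Z (observations x individuals): Y.dot(Z) gives per-individual sums and ones.dot(Z) gives replicate counts, so the means fall out of two dot products, while medians still need the explicit grouping loop. A toy sketch with hypothetical data (three observations on two individuals; numpy used directly):

import numpy as np

Y = np.array([1.0, 3.0, 10.0])      # replicate measurements
Z = np.array([[1, 0],               # observation 0 -> individual 0
              [1, 0],               # observation 1 -> individual 0
              [0, 1]])              # observation 2 -> individual 1

ys_sum = Y.dot(Z)                   # per-individual sums:  [ 4. 10.]
rep_count = np.ones(len(Y)).dot(Z)  # replicate counts:     [ 2.  1.]
y_means = ys_sum / rep_count        # per-individual means: [ 2. 10.]

# Medians are not linear, so group the values explicitly, as the parser does.
vals = [[] for _ in range(Z.shape[1])]
for obs_i in range(len(Y)):
    ind_i = np.where(Z[obs_i] == 1)[0][0]
    vals[ind_i].append(Y[obs_i])
y_medians = np.array([np.median(v) if v else np.nan for v in vals])

assert np.allclose(y_means, [2.0, 10.0])
assert np.allclose(y_medians, [2.0, 10.0])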