forked from AlexTISYoung/snipar
/
fPGS_obs.py
158 lines (150 loc) · 8.11 KB
/
fPGS_obs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from sibreg.sibreg import *
import argparse, code
from pysnptools.snpreader import Pheno
######### Command line arguments #########
if __name__ == '__main__':
    # Script entry point.
    #
    # Reads a polygenic score (PGS) file, a phenotype file and a pedigree,
    # fits a family random-effect model of the phenotype on the PGS, and then
    # (depending on the flags) fits:
    #   --trios   : proband + paternal + maternal PGS for individuals with both
    #               parents genotyped; written to <outprefix>.bpg.*
    #   --sibdiff : deviation-from-family-mean + family-mean PGS; written to
    #               <outprefix>.sibdiff.pgs_effects.txt
    #
    # Raises ValueError for missing analysis flags, a malformed PGS header,
    # an out-of-range --phen_index, or when no individuals qualify for a
    # requested analysis.
    parser = argparse.ArgumentParser()
    parser.add_argument('pgs', type=str, help='Location of the PGS file')
    parser.add_argument('phenofile', type=str, help='Location of the phenotype file')
    parser.add_argument('ped', type=str, help='Location of pedigree file with FID giving sibships')
    parser.add_argument('outprefix', type=str, help='Location to output association statistic hdf5 file')
    parser.add_argument('--scale_phen', action='store_true', help='Scale phenotype to have variance 1', default=False)
    parser.add_argument('--sibdiff', action='store_true', default=False, help='Fit sibling difference in PGS model')
    parser.add_argument('--trios', action='store_true', default=False,
                        help='Fit model with individuals with both parents genotyped')
    parser.add_argument('--phen_index', type=int,
                        help='If the phenotype file contains multiple phenotypes, which phenotype should be analysed (default 1, first)',
                        default=1)
    # The help text previously described the inverse ratio. Below, the family
    # variance is reported as sigma2/tau and sigma2 is initialised as
    # var(y)*tau/(1+tau), i.e. tau = residual variance / family variance.
    parser.add_argument('--tau_init', type=float,
                        help='Initial value for ratio between residual variance and shared family environmental variance',
                        default=1)
    parser.add_argument('--missing_char', type=str, help='Missing value string in phenotype file (default NA)', default='NA')
    args = parser.parse_args()

    if not (args.trios or args.sibdiff):
        raise ValueError('Must do at least one analysis from --trios or --sibdiff')

    # ---- Read PGS ----
    print('Reading '+str(args.pgs))
    # Use a context manager so the handle is released even when the header
    # check below raises; strip the line terminator so the last column name
    # does not carry a trailing newline.
    with open(args.pgs, 'r') as pgs_file:
        header = pgs_file.readline().rstrip('\r\n')
    # Pick whichever of tab / single space splits the header into more fields.
    if len(header.split('\t')) > len(header.split(' ')):
        delim = '\t'
    else:
        delim = ' '
    cols = np.array(header.split(delim))
    if cols.shape[0] < 2 or cols[0] != 'FID' or cols[1] != 'IID':
        raise ValueError('First two columns of PGS must be FID, IID')
    ids = np.loadtxt(args.pgs, dtype='U', usecols=(0, 1), delimiter=delim, skiprows=1)
    pgs_vals = np.loadtxt(args.pgs, usecols=tuple(range(2, cols.shape[0])), delimiter=delim, skiprows=1)
    # The reshape to a single column only succeeds when the file holds exactly
    # one score column after FID and IID.
    pg = gtarray(pgs_vals.reshape((pgs_vals.shape[0], 1)), ids[:, 1], sid=cols[2:cols.shape[0]], fams=ids[:, 0])
    print('Normalising PGS to have mean zero and variance 1')
    pg.mean_normalise()
    pg.scale()

    # ---- Read phenotype ----
    print('Reading '+str(args.phenofile))
    pheno = Pheno(args.phenofile, missing=args.missing_char).read()
    y = np.array(pheno.val)
    pheno_ids = np.array(pheno.iid)[:, 1]
    if y.ndim == 2:
        # --phen_index is 1-based; reject values that would wrap around via
        # negative indexing or run past the last phenotype column.
        if not 1 <= args.phen_index <= y.shape[1]:
            raise ValueError('phen_index must be between 1 and '+str(y.shape[1]))
        y = y[:, args.phen_index - 1]
    elif y.ndim != 1:
        raise ValueError('Incorrect dimensions of phenotype array')
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan]
        pheno_ids = pheno_ids[y_not_nan]
    # Centre the phenotype; optionally scale it to unit variance.
    y = y-np.mean(y)
    if args.scale_phen:
        y = y/np.std(y)
    print('Number of non-missing phenotype observations: ' + str(y.shape[0]))

    # ---- Read pedigree ----
    print('Reading '+str(args.ped))
    ped = np.loadtxt(args.ped, dtype='U')
    # Drop pedigree rows whose family ID (first column) starts with '_'.
    controls = np.array([x[0] == '_' for x in ped[:, 0]])
    ped = ped[np.logical_not(controls), :]
    ped_dict = make_id_dict(ped, 1)

    # ---- Estimate associative effect ----
    print('Estimating associative effect')
    in_pg = np.array([x in pg.id_dict for x in pheno_ids])
    pg_indices = np.array([pg.id_dict[x] for x in pheno_ids[in_pg]])
    # Overwrite the PGS file's family labels with the pedigree's for every
    # individual found in the pedigree.
    pg_in_ped = np.array([x in ped_dict for x in pg.ids])
    pg.fams[pg_in_ped] = np.array([ped[ped_dict[x], 0] for x in pg.ids[pg_in_ped]])
    pg_model = model(y[in_pg], pg.gts[pg_indices, 0], pg.fams[pg_indices], add_intercept=True)
    sigma_2_init = np.var(y) * args.tau_init / (1 + args.tau_init)
    pg_optim = pg_model.optimize_model(np.array([sigma_2_init, args.tau_init]))
    # pg_alpha[0] is used as the effect estimates and pg_alpha[1] as their
    # covariance; element 1 is reported as the PGS effect (element 0 is
    # presumably the intercept added by add_intercept=True).
    pg_alpha = pg_model.alpha_mle(pg_optim['tau'], pg_optim['sigma2'], compute_cov=True)
    print('Family variance estimate :'+str(round(pg_optim['sigma2']/pg_optim['tau'], 4)))
    print('Residual variance estimate :' + str(round(pg_optim['sigma2'], 4)))
    sibcor = 1/(1+pg_optim['tau'])
    print('Sibling correlation estimate: '+str(round(sibcor, 4)))
    print('Associative effect: '+str(round(pg_alpha[0][1], 6)) + ' (S.E. '+str(round(np.sqrt(pg_alpha[1][1, 1]), 7))+')')

    if args.trios:
        print('Analysing individuals with both parents genotyped')
        par_status, gt_indices, fam_labels = find_par_gts(pg.ids, ped, pg.id_dict)
        # Identify individuals with all observed
        # NOTE(review): '> 0' also excludes any row whose smallest index is 0,
        # which is a valid array position. If find_par_gts marks missing
        # entries with a negative value, '>= 0' may be what is intended --
        # confirm against sibreg before changing.
        all_obs = np.min(gt_indices, axis=1) > 0
        n = np.sum(all_obs)
        if n == 0:
            raise ValueError('No individuals with both parents genotyped')
        print(str(n) + ' individuals with both parents genotyped')
        gt_indices = gt_indices[all_obs, :]
        bpg_ids = pg.ids[all_obs]
        fam_labels = fam_labels[all_obs]
        # Design matrix; columns follow the output labels below
        # (proband, paternal, maternal).
        G = np.zeros((n, 3), dtype=np.float32)
        for col in range(3):
            G[:, col] = pg.gts[gt_indices[:, col], 0]
        # Match to phenotyped individuals and estimate, reusing the variance
        # parameters fitted in the associative model above.
        bpg_id_dict = make_id_dict(bpg_ids)
        in_bpg = np.array([x in bpg_id_dict for x in pheno_ids])
        bpg_indices = np.array([bpg_id_dict[x] for x in pheno_ids[in_bpg]])
        print('Estimate model for individuals with both parents genotyped')
        bpg_model = model(y[in_bpg], G[bpg_indices, :], fam_labels[bpg_indices], add_intercept=True)
        alpha_bpg = bpg_model.alpha_mle(pg_optim['tau'], pg_optim['sigma2'], compute_cov=True)
        outcols = np.array(['proband', 'paternal', 'maternal']).reshape((3, 1))
        # Save output: column 0 = estimate, column 1 = standard error.
        alpha_bpg_out = np.zeros((3, 2))
        alpha_bpg_out[:, 0] = alpha_bpg[0][1:4]
        alpha_bpg_out[:, 1] = np.sqrt(np.diag(alpha_bpg[1])[1:4])
        np.savetxt(args.outprefix + '.bpg.pgs_effects.txt',
                   np.hstack((outcols, np.array(alpha_bpg_out, dtype='S20'))),
                   delimiter='\t', fmt='%s')
        np.savetxt(args.outprefix + '.bpg.pgs_vcov.txt', alpha_bpg[1][1:4, 1:4])

    if args.sibdiff:
        print('Analysing with sibling difference method')
        fam_means = get_fam_means(pg.ids, ped, pg.gts, pg.ids, remove_proband=False)
        # Remove overlap with trios (bpg_id_dict only exists when --trios ran).
        if args.trios:
            in_bpg = np.array([x in bpg_id_dict for x in fam_means.ids])
            n_overlap = np.sum(in_bpg)
            if n_overlap == fam_means.ids.shape[0]:
                raise ValueError('No sibships without both parents genotyped')
            print('Removing '+str(n_overlap)+' individuals with both parents genotyped from sib difference analysis')
            not_in_bpg = np.logical_not(in_bpg)
            fam_means = gtarray(fam_means.gts[not_in_bpg, :], fam_means.ids[not_in_bpg])
        print('Found '+str(fam_means.ids.shape[0])+' individuals with genotyped siblings')
        # Column 0: individual's PGS minus the family mean; column 1: the
        # family mean itself.
        G = np.zeros((fam_means.gts.shape[0], 2), dtype=np.float32)
        pg_indices = np.array([pg.id_dict[x] for x in fam_means.ids])
        G[:, 0] = pg.gts[pg_indices, 0]
        G[:, 1] = fam_means.gts[:, 0]
        G[:, 0] = G[:, 0] - G[:, 1]
        fam_labels = np.array([ped[ped_dict[x], 0] for x in fam_means.ids])
        # Match with phenotype
        in_fam_means = np.array([x in fam_means.id_dict for x in pheno_ids])
        fam_means_indices = np.array([fam_means.id_dict[x] for x in pheno_ids[in_fam_means]])
        print('Estimating model using sibling differences')
        sdiff_model = model(y[in_fam_means], G[fam_means_indices, :], fam_labels[fam_means_indices], add_intercept=True)
        alpha_sdiff = sdiff_model.alpha_mle(pg_optim['tau'], pg_optim['sigma2'], compute_cov=True)
        # Save output: column 0 = estimate, column 1 = standard error.
        alpha_sdiff_out = np.zeros((2, 2))
        alpha_sdiff_out[:, 0] = alpha_sdiff[0][1:3]
        alpha_sdiff_out[:, 1] = np.sqrt(np.diag(alpha_sdiff[1])[1:3])
        np.savetxt(args.outprefix + '.sibdiff.pgs_effects.txt',
                   np.hstack((np.array(['direct', 'between-family']).reshape((2, 1)), np.array(alpha_sdiff_out, dtype='S20'))),
                   delimiter='\t', fmt='%s')