-
Notifications
You must be signed in to change notification settings - Fork 0
/
fix_it_fam.py
321 lines (291 loc) · 11.1 KB
/
fix_it_fam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
#! /usr/bin/python2.7
#! ./
'''
Created Jan 25, 2012
@author: Jessica Bonnie
'''
from collections import namedtuple
import os
import sys
import getopt
import shutil
import pc_toolbox
import fix_it
# One annotation record.  Records are written tab-separated to the .dict file
# in this field order (write_annot_dict) and read back with AnnotInfo._make
# (build_annot_dict).
AnnotInfo = namedtuple(
    "AnnotInfo",
    ["name", "lz", "rs", "hg19_chr", "hg19_pos", "hg18_chr", "hg18_pos", "band"],
)

# Default annotation file; locate_annot_dict places the .dict file in the
# same directory unless another annotation path is given.
ANNOT_LOC = '/home/jkb4y/work/data/quinlan-immunochip-snps-annotated-2011-Dec-15_edit.txt'
#ANNOT_DICT = '/home/jkb4y/work/data/annot.dict'

# File read line by line by write_annot_dict to build the im_keep list.
KEEP_IM_LOC = '/home/jkb4y/work/data/keep_im_list.txt'

# Build used when --build is not given on the command line.
BUILD = 'hg18'

# Column titles that read_annot_titles looks up in the annotation header.
NAME_TITLE = 'name'
RS_ID_TITLE = 'rsID'
BAND_TITLE = 'band'
HG18_CHR_TITLE = 'hg18_chrom'
HG18_POS_TITLE = 'hg18_end'
HG19_CHR_TITLE = 'hg19_chrom'
HG19_POS_TITLE = 'hg19_end'

# Names that keep their own name as the lz identifier in read_annot's hg19
# branch instead of being rewritten as chr<N>:<pos>.
RS_KEEP = ['rs10127859']

# Names whose lz identifier is replaced by a fixed rs id in read_annot's
# hg18 branch.
DUPLICATE_FIX_DICT = {
    'chr6_106659585': 'rs17066588',
    'imm_2_230785920': 'rs11556887',
    'imm_2_230857352': 'rs4972946',
}
def read_annot(annot_indices, line, im_keep, build):
    '''Parse one annotation line into an AnnotInfo record.

    Args:
        annot_indices -- dict mapping 'hg18_chr', 'hg19_chr', 'rs',
                         'hg18_pos', 'hg19_pos', 'band' and 'name' to the
                         column index of that field in the line
        line -- one whitespace-separated data line of the annotation file
        im_keep -- container of names that get a chr<N>:<pos> identifier
                   for hg19 even when an rs id is available
        build -- genome build, 'hg18' or 'hg19'
    Returns:
        annot_info -- AnnotInfo whose lz field is the identifier chosen
                      for the requested build
    Raises:
        ValueError -- if build is neither 'hg18' nor 'hg19'
    '''
    alist = line.strip().split()
    # Chromosomes X and Y are recoded as 23 and 24.
    hg18_chr = alist[annot_indices['hg18_chr']].replace('X', '23').replace('Y', '24')
    hg19_chr = alist[annot_indices['hg19_chr']].replace('X', '23').replace('Y', '24')
    rs = alist[annot_indices['rs']]
    hg18_pos = alist[annot_indices['hg18_pos']]
    hg19_pos = alist[annot_indices['hg19_pos']]
    band = alist[annot_indices['band']]
    name = alist[annot_indices['name']]

    if build == 'hg19':
        if rs == '0' or name in im_keep:
            # No usable rs id (or the name is explicitly kept): use the
            # name itself for RS_KEEP entries, a positional id otherwise.
            if name in RS_KEEP:
                lz = name
            else:
                lz = "chr" + hg19_chr + ":" + hg19_pos
        else:
            lz = rs
    elif build == 'hg18':
        if name.startswith('rs'):
            lz = name
        elif name in DUPLICATE_FIX_DICT:
            lz = DUPLICATE_FIX_DICT[name]
        else:
            lz = "chr" + hg18_chr + ":" + hg18_pos
    else:
        # Previously lz stayed unbound here and the constructor call below
        # failed with an UnboundLocalError.
        raise ValueError(
            "unsupported build {0!r}; expected 'hg18' or 'hg19'".format(build))

    annot_info = AnnotInfo(hg18_chr=hg18_chr, hg18_pos=hg18_pos,
                           hg19_chr=hg19_chr, hg19_pos=hg19_pos,
                           name=name, rs=rs,
                           lz=lz, band=band)
    return annot_info
def write_annot_dict(annot_loc, build):
    '''Convert the annotation file into the tab-separated .dict file.

    The first line of the annotation file is handed to
    fix_it.read_annot_titles; every later line is converted with
    fix_it.read_annot and written as one tab-joined record.

    Args:
        annot_loc -- path of the annotation file to read
        build -- genome build, passed on to locate_annot_dict and
                 fix_it.read_annot
    '''
    dict_loc = locate_annot_dict(build, annot_loc)
    # Read the keep list before opening the output, so an unreadable input
    # does not leave a truncated .dict file behind.
    with open(KEEP_IM_LOC, mode="r") as im:
        im_keep = [im_line.strip() for im_line in im]
    # Both handles are context-managed so they are closed even when a line
    # fails to parse.
    with open(annot_loc, mode="r") as annot, \
            open(dict_loc, mode="w") as annot_dict:
        a_indices = None
        for line_number, aline in enumerate(annot):
            if line_number == 0:
                a_indices = fix_it.read_annot_titles(aline)
            else:
                a_info = fix_it.read_annot(a_indices, aline, im_keep, build)
                annot_dict.write('\t'.join(a_info) + '\n')
def locate_fixed_table(table_loc, build):
    '''Return the output path for a fixed copy of table_loc.

    The result is the input path with '_lz_<build>' inserted in front of
    the file extension.
    Args:
        table_loc -- path of the input table
        build -- genome build label to embed in the name
    Returns:
        output_loc -- path of the fixed table
    '''
    stem, suffix = os.path.splitext(table_loc)
    return "".join((stem, "_lz_", build, suffix))
def fix_table(table_loc, annot_dict, purpose, build):
    '''Rewrite the chromosome, position and SNP columns of a table.

    For purpose 'HAPMAP' or 'MAP' the input is first copied to a backup
    named '<basepath>~[N]<ext>' (see rename_as_necessary) and the original
    path is overwritten with the fixed table.  For any other purpose the
    fixed table is written to the path from locate_fixed_table.

    The first line is always copied through unchanged; for 'META' and
    'FAMILY' it is also parsed by pc_toolbox to find the column indices.
    Otherwise columns 0, 1 and 3 are used for chr, snp and pos.
    NOTE(review): this means the first line of a MAP/HAPMAP file is never
    remapped -- confirm those inputs really carry a header line.

    Lines whose SNP is missing from annot_dict are written with their
    original values, reported on stdout, and (for hg18/hg19) collected in
    '<basepath of the file read>_NAMEERRORS.txt'.

    Args:
        table_loc -- path of the table to fix
        annot_dict -- dict mapping SNP identifiers to AnnotInfo records
        purpose -- 'META', 'FAMILY', 'HAPMAP' or 'MAP'
        build -- genome build; chr/pos are only rewritten for 'hg18' or
                 'hg19'
    '''
    if purpose in ["HAPMAP", "MAP"]:
        (basepath, ext) = os.path.splitext(table_loc)
        orig_rename = basepath + '~'
        new_loc = str(rename_as_necessary(orig_rename, ext)) + ext
        shutil.copy(table_loc, new_loc)
        # Read from the backup and overwrite the original path.
        output_loc = table_loc
        table_loc = new_loc
    else:
        output_loc = locate_fixed_table(table_loc, build)

    index_dict = {'chr': 0, 'pos': 3, 'snp': 1}
    error_list = []
    # Both handles are context-managed so the output is flushed and closed
    # even if a malformed line raises.
    with open(table_loc, mode="r") as table, \
            open(output_loc, mode="w") as output:
        for line_number, line in enumerate(table):
            if line_number == 0:
                if purpose == "META":
                    index_dict = pc_toolbox.read_meta_titles(line)
                elif purpose == "FAMILY":
                    index_dict = pc_toolbox.read_fam_titles(line)
                output.write(line)
                continue

            line_list = line.strip().split()
            snp = line_list[index_dict['snp']]
            if build in ('hg18', 'hg19'):
                try:
                    info = annot_dict[snp]
                    line_list[index_dict['chr']] = getattr(info, build + '_chr')
                    line_list[index_dict['pos']] = getattr(info, build + '_pos')
                except KeyError:
                    # Remember the untouched line for the error report.
                    error_list.append(list(line_list))
            try:
                line_list[index_dict['snp']] = annot_dict[snp].lz
            except KeyError:
                print("{0} is missing from the dictionary!".format(line_list[index_dict['snp']]))
            output.write('\t'.join(line_list) + '\n')

    base, ext = os.path.splitext(table_loc)
    error_loc = base + '_NAMEERRORS.txt'
    # NOTE(review): the header is the map-file layout even for META/FAMILY
    # tables, whose columns may differ.
    with open(error_loc, mode="w") as efile:
        efile.write('\t'.join(['CHR', 'RS', 'cM', 'POS']) + '\n')
        for error in error_list:
            efile.write('\t'.join(error) + '\n')
def rename_as_necessary(new_orig,ext):
    '''
    Finds a backup name for the input map file that is not already taken.
    Starting from new_orig, a name that already exists on disk (with ext
    appended) is replaced by the next candidate: a trailing '1' is added to
    a name that does not end in a digit, otherwise the number after the last
    '~' is increased by one.  Each collision is reported on stdout.
    Args:
        new_orig -- desired target name, without extension
        ext -- extension of the map file
    Returns:
        next_rename -- first candidate (without extension) for which
                       candidate + ext does not exist
    '''
    notice = '''There is already a file named {0}, which means that map_adapt has
already been run on this map file. To avoid overwriting the contents of {0},
the contents of the input file will be written to: '''
    candidate = new_orig
    while os.path.exists(candidate + ext):
        print(notice.format(candidate + ext))
        if candidate[-1].isdigit():
            stem, _, number = candidate.rpartition('~')
            candidate = stem + '~' + str(int(number) + 1)
        else:
            candidate = candidate + '1'
        print(candidate + ext)
    return candidate
def read_annot_titles(line):
    '''Locate the annotation columns in the header line.

    Args:
        line -- header line of the annotation file, whitespace separated
    Returns:
        annot_indices -- dict mapping 'hg18_chr', 'name', 'rs', 'hg18_pos',
                         'band', 'hg19_pos' and 'hg19_chr' to the index of
                         the matching title in the header
    Raises:
        ValueError -- if one of the expected titles is absent
    '''
    columns = line.strip().split()
    # Titles are searched in this order so that the first missing title
    # is the one reported.
    wanted = (
        ('hg18_chr', HG18_CHR_TITLE),
        ('rs', RS_ID_TITLE),
        ('name', NAME_TITLE),
        ('hg18_pos', HG18_POS_TITLE),
        ('band', BAND_TITLE),
        ('hg19_pos', HG19_POS_TITLE),
        ('hg19_chr', HG19_CHR_TITLE),
    )
    found = {}
    for key, title in wanted:
        found[key] = columns.index(title)
    key_order = ('hg18_chr', 'name', 'rs', 'hg18_pos',
                 'band', 'hg19_pos', 'hg19_chr')
    return {key: found[key] for key in key_order}
def locate_annot_dict(build, annot_loc=ANNOT_LOC):
    '''Return the path of the .dict file for a build.

    The file is named 'annot_<build>.dict' and sits in the same directory
    as the annotation file.
    Args:
        build -- genome build label used in the file name
        annot_loc -- path of the annotation file (defaults to ANNOT_LOC)
    Returns:
        dict_loc -- path of the .dict file
    '''
    directory = os.path.split(annot_loc)[0]
    file_name = 'annot_' + build + '.dict'
    return os.path.join(directory, file_name)
def build_annot_dict(purpose, annot_dict_loc):
    '''Load the .dict file into a lookup keyed for the given purpose.

    Keys per purpose:
        'META', 'FAMILY' -- the rs id (or the name when the rs id is '0')
                            and also the name
        'HAPMAP', 'MAP'  -- the name
        'LOG'            -- the lz identifier
    Any other purpose yields an empty dict.
    Args:
        purpose -- one of the purposes listed above
        annot_dict_loc -- path of the .dict file, one record per line
    Returns:
        annot_dict -- dict mapping the chosen keys to AnnotInfo records
    '''
    lookup = {}
    with open(annot_dict_loc, mode="r") as handle:
        for record in handle:
            entry = AnnotInfo._make(record.strip().split())
            if purpose in ('META', 'FAMILY'):
                rs_or_name = entry.name if entry.rs == '0' else entry.rs
                lookup[rs_or_name] = entry
                lookup[entry.name] = entry
            elif purpose in ('HAPMAP', 'MAP'):
                lookup[entry.name] = entry
            elif purpose == 'LOG':
                lookup[entry.lz] = entry
    return lookup
def cl_arguments(argv):
    '''Read the command-line arguments and store them in module globals.

    Sets the globals annot_loc, table_loc, meta, build, hapmap and family.
    --map, --meta and --family all set table_loc; --meta and --family also
    set their flag.  Prints the usage text and exits with status 2 on an
    unrecognised option, and with status 0 for -h/--help.

    Keyword arguments:
    argv -- list of command-line arguments, without the program name
    '''
    global annot_loc, table_loc, meta, build, hapmap, family
    table_loc = None
    meta = False
    annot_loc = None
    build = BUILD
    hapmap = False
    family = False
    try:
        opts, args = getopt.getopt(argv, "h",
                                   ["help", "annot=", "map=", "meta=", "build=",
                                    "hapmap", "family="])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        # Compare with == : `opt in ("--map")` was a substring test on a
        # plain string, not membership in a tuple.
        elif opt == "--map":
            table_loc = arg
        elif opt == "--meta":
            table_loc = arg
            meta = True
        elif opt == "--family":
            table_loc = arg
            family = True
        elif opt == "--hapmap":
            hapmap = True
        elif opt == "--annot":
            # Previously assigned None, which silently discarded the path.
            annot_loc = arg
        elif opt == "--build":
            build = arg
def usage():
    '''Print the command-line help text with the current defaults.'''
    # {0} sits on the --annot line and {1} on the --build line, so the
    # arguments are passed as (ANNOT_LOC, BUILD).
    print('''
USAGE: fix_it.py [FLAG] OBJECT
FLAG DESCRIPTION CURRENT DEFAULT
--meta filepath of meta table
--family filepath of family table
--map filepath of plink map file
--hapmap flag (takes no value): run with purpose HAPMAP
--annot filepath of annotation file {0}
--build genome build ( hg18 or hg19 ) {1}
-h, --help display this usage string
'''.format(ANNOT_LOC, BUILD))
def main(argv):
    '''Run the table fix-up from the command line.

    Parses argv into the module globals, rewrites the .dict file when an
    annotation path was supplied, loads the .dict file for the selected
    purpose and fixes the table.  Purpose precedence is META, FAMILY,
    HAPMAP, then MAP as the fallback.

    Args:
        argv -- list of command-line arguments, without the program name
    '''
    global annot_loc, table_loc, meta, build, hapmap, family
    cl_arguments(argv)
    # Function-call form matches the other prints in this file and parses
    # on both Python 2 and Python 3.
    print(annot_loc)
    if annot_loc is not None:
        print("WRITING ANNOT!")
        write_annot_dict(annot_loc, build)
        # Read the dict from where write_annot_dict just wrote it.
        dict_loc = locate_annot_dict(build, annot_loc)
    else:
        dict_loc = locate_annot_dict(build)
    if table_loc is None:
        # No --map/--meta/--family path: nothing to fix.
        usage()
        sys.exit(2)
    if meta:
        purpose = 'META'
    elif family:
        purpose = 'FAMILY'
    elif hapmap:
        purpose = 'HAPMAP'
    else:
        purpose = 'MAP'
    # NOTE(review): this calls fix_it.build_annot_dict, not the
    # build_annot_dict defined in this file -- confirm that fix_it's
    # version handles the 'FAMILY' purpose.
    annot_dict = fix_it.build_annot_dict(purpose, dict_loc)
    fix_table(table_loc, annot_dict, purpose, build)


if __name__ == '__main__':
    main(sys.argv[1:])