-
Notifications
You must be signed in to change notification settings - Fork 0
/
SNPparser.py
executable file
·87 lines (80 loc) · 2.64 KB
/
SNPparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
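'''
Script to convert a VCF file into a Structure input file.

Usage: SNPparser.py Infile CombinedOutfile MissingRowData MissingColumnData

Infile is the VCF file from which to call genotypes and CombinedOutfile is
the Structure file to write. MissingRowData is the fraction of missing
genotypes a locus (VCF row) must stay below to be kept; MissingColumnData is
the fraction of missing alleles an individual's allele line must stay below
to be written to the output.
'''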
import sys
from os import system as bash
def _split_alleles(sample_field):
    """Return the (first, second) allele calls of one VCF sample field.

    Only the GT sub-field (the text before the first ':') is used, and both
    the unphased ('/') and phased ('|') separators are accepted, so
    multi-digit allele indices survive intact. A call that holds a single
    allele yields '' for the second one.
    """
    calls = sample_field.split(':', 1)[0].replace('|', '/').split('/')
    first = calls[0]
    second = calls[1] if len(calls) > 1 else ''
    return first, second


def _read_haplotypes(vcf_path, missing_row_data):
    """Collect the alleles of every retained locus, per sample.

    A locus is retained when its number of fully missing genotypes is
    strictly below ``missing_row_data`` times the number of samples.

    Returns a tuple ``(samples, first_alleles, second_alleles)`` where the
    two allele lists hold one list of allele strings per sample. All
    retained alleles are held in memory.

    Raises ValueError when a data line does not carry exactly one genotype
    per sample named in the header.
    """
    samples = None
    first_alleles = []
    second_alleles = []
    with open(vcf_path) as vcf:
        for line_number, raw_line in enumerate(vcf, 1):
            # Strip the line ending up front so the last genotype on the
            # line compares equal to a missing call like the others do.
            line = raw_line.rstrip('\r\n')
            if line.startswith('#'):
                # '##' meta lines carry no sample columns; the first
                # single-'#' line names the samples from column 10 onwards.
                # Any later '#' line is skipped.
                if samples is None and not line.startswith('##'):
                    samples = line.split('\t')[9:]
                    first_alleles = [[] for _ in samples]
                    second_alleles = [[] for _ in samples]
                continue
            fields = line.split('\t')[9:]
            if not fields:
                # Blank or truncated line without genotype columns.
                continue
            if samples is None or len(fields) != len(samples):
                raise ValueError(
                    'line %d of %s has %d genotype column(s) but the header '
                    'names %d sample(s)'
                    % (line_number, vcf_path, len(fields), len(samples or []))
                )
            calls = [_split_alleles(field) for field in fields]
            missing = sum(1 for call in calls if call == ('.', '.'))
            if not missing < missing_row_data * float(len(calls)):
                continue
            for index, (first, second) in enumerate(calls):
                first_alleles[index].append(first)
                second_alleles[index].append(second)
    return samples or [], first_alleles, second_alleles


def _write_structure(out_path, samples, first_alleles, second_alleles,
                     missing_column_data):
    """Write one tab-separated line per sample haplotype to ``out_path``.

    A haplotype line is written only when its number of missing alleles
    ('.') is strictly below ``missing_column_data`` times its number of
    loci; missing alleles are written as '-9'. Haplotypes without any locus
    are therefore never written.
    """
    rows = []
    for name, firsts, seconds in zip(samples, first_alleles, second_alleles):
        rows.append((name + '_A', name, firsts))
        rows.append((name + '_B', name, seconds))
    # Order lines by "<sample>_A" / "<sample>_B" so the output keeps the
    # ordering produced by sorting per-haplotype names.
    rows.sort(key=lambda row: row[0])
    with open(out_path, 'w') as out:
        for _, name, alleles in rows:
            if alleles.count('.') < missing_column_data * float(len(alleles)):
                # Only allele fields are recoded; the sample name is written
                # verbatim even when it contains a '.'.
                recoded = ['-9' if allele == '.' else allele
                           for allele in alleles]
                out.write('\t'.join([name] + recoded) + '\n')


def convert_vcf_to_structure(infile, outfile, missing_row_data,
                             missing_column_data):
    """Convert the VCF file ``infile`` into the Structure file ``outfile``.

    ``missing_row_data`` is the per-locus missing-genotype fraction and
    ``missing_column_data`` the per-haplotype missing-allele fraction; both
    are exclusive upper bounds. No intermediate files are created.
    """
    samples, first_alleles, second_alleles = _read_haplotypes(
        infile, missing_row_data)
    _write_structure(outfile, samples, first_alleles, second_alleles,
                     missing_column_data)


def main(argv=None):
    """Run the conversion from command-line arguments.

    Expects, in order: Infile, CombinedOutfile, MissingRowData and
    MissingColumnData. ``argv`` defaults to ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 4:
        sys.exit('usage: SNPparser.py Infile CombinedOutfile '
                 'MissingRowData MissingColumnData')
    convert_vcf_to_structure(args[0], args[1], float(args[2]), float(args[3]))


if __name__ == '__main__':
    main()