-
Notifications
You must be signed in to change notification settings - Fork 4
/
ISGDataPuller.py
100 lines (84 loc) · 3.26 KB
/
ISGDataPuller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
####################################################
#
#ISGDataPuller opens an ISG data file and converts the information
#into a list of SNPs. Each SNP contains:
# chrom - the reference id for the reference genome
# pos - the position within the genome that the SNP is at
# entropy - the entropy value of the SNP, used to calculate mutual information
# aGenomes - a list of all genomes that have the value A in the SNP
# tGenomes - a list of all genomes that have the value T in the SNP
# cGenomes - a list of all genomes that have the value C in the SNP
# gGenomes - a list of all genomes that have the value G in the SNP
#
#The entropy value for each SNP is calculated when it is created.
#
####################################################
import EntropyCalculator
def main(inputFile, treeTable):
    """Parse an ISG data file into a list of SNP objects.

    The file is read as follows: the genome count is taken from character
    offset 12 of the first line, the next line is a tab-separated header
    whose columns from index 2 onward are genome names, and every following
    line is a tab-separated SNP row (chrom, position, then one base call
    per genome).

    Args:
        inputFile: path of the ISG data file to read.
        treeTable: passed through unchanged to EntropyCalculator.main.

    Returns:
        A list of SNP objects, one per non-blank data row, in file order.

    Raises:
        ValueError: if the genome count or a position is not an integer.
        IndexError: if a data row has fewer columns than the genome count
            requires.
    """
    ISGData = []
    # The context manager closes the file even when parsing raises.
    with open(inputFile, "r") as fo:
        # The genome count starts at offset 12 of the first line
        # (presumably after a fixed label -- confirm against the ISG format).
        fo.seek(12)
        numGenomes = int(fo.readline().strip())
        numGenomes += 1  # add one to include reference genome
        # Columns 0 and 1 hold chrom and position; calls follow them.
        lastColumn = numGenomes + 2

        # Header row: genome names sit in columns 2 .. lastColumn - 1.
        arrGenomeName = fo.readline().strip().split("\t")[2:lastColumn]

        for line in fo:
            arrLine = line.strip().split("\t")
            if arrLine == [""]:
                # Blank line (e.g. trailing newline at end of file).
                continue
            strChrom = arrLine[0]
            strPos = arrLine[1]

            # Sort genomes into groups by SNP call; any call other than
            # A/T/C/G is left out of every group.
            genomesByCall = {"A": [], "T": [], "C": [], "G": []}
            for column in range(2, lastColumn):
                group = genomesByCall.get(arrLine[column])
                if group is not None:
                    group.append(arrGenomeName[column - 2])
            arrA = genomesByCall["A"]
            arrT = genomesByCall["T"]
            arrC = genomesByCall["C"]
            arrG = genomesByCall["G"]

            # calculate entropy values
            entropy = EntropyCalculator.main(
                treeTable,
                frozenset(arrA),
                frozenset(arrT),
                frozenset(arrC),
                frozenset(arrG),
            )
            ISGData.append(
                SNP(strChrom, int(strPos), entropy, arrA, arrT, arrC, arrG)
            )
    return ISGData
# class containing information about a specific SNP
class SNP(object):
    """Record describing one SNP and the genomes grouped by base call.

    Attributes are stored exactly as passed to the constructor:
    chrom, pos, entropy, aGenomes, tGenomes, cGenomes, gGenomes.
    """

    def __init__(self, chrom, pos, entropy, aGenomes, tGenomes, cGenomes, gGenomes):
        """Store the location, entropy value and per-base genome groups."""
        # Where the SNP sits.
        self.chrom, self.pos = chrom, pos
        self.entropy = entropy
        # One genome collection per base call.
        self.aGenomes, self.tGenomes = aGenomes, tGenomes
        self.cGenomes, self.gGenomes = cGenomes, gGenomes

    def printSNP(self):
        """Print position, entropy and the A/T/C/G genome groups (chrom is not printed)."""
        print(
            self.pos,
            self.entropy,
            self.aGenomes,
            self.tGenomes,
            self.cGenomes,
            self.gGenomes,
        )