-
Notifications
You must be signed in to change notification settings - Fork 0
/
compare.py
243 lines (208 loc) · 11.4 KB
/
compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import sys
import getopt
import Helper
from Helper import OrthologPair
#from classes import GeneLevelProtein
#from classes import DomainLevelProtein
from classes.GeneLevelProtein import GeneLevelProtein
from classes.DomainLevelProtein import DomainLevelProtein
import ConfigParser
import os
# Usage text; main() prints it for -h/--help and when option handling fails.
__doc__ = (
    "usage: python compare.py -f <filename> -d <filename> -a <taxid organism 1> -b <taxid organism 2> -o <outputfilename> -w\n"
    "f: result file of a gene level inparanoid run of pattern Output.<OrganismA>-<OrganismB>\n"
    "d: result file of a domain level inparanoid run of pattern Output.<OrganismA>-<OrganismB>\n"
    "a, b: taxids for organisms A and B\n"
    "o: path where compare results should be saved.\n"
    "w: optional if id lists should be written"
)

# Shared settings ("Filepaths" and "Options" sections), loaded once at import
# time. The path is relative, so "orthology.cfg" is looked up in the current
# working directory. RawConfigParser.read() silently skips files it cannot
# open, so a missing file only shows up later as a missing-section error.
rcp = ConfigParser.RawConfigParser()
rcp.read("orthology.cfg")
#OrthologPair = namedtuple("OrthologPair", ["first", "second"])
def mapDomainOrthologsToProteins(pairwiseDomains, domainsA, domainsB):
    """Count, per protein pair, the domain ortholog pairs that map onto it.

    Parameters:
        pairwiseDomains: mapping whose keys are domain pairs (objects with
            ``first`` and ``second`` attributes, each carrying a ``header``).
            The mapped value is used as a grouping key
            (presumably an ortholog group id -- TODO confirm against
            Helper.pairwiseOrthologs).
        domainsA, domainsB: mappings from a domain header to an object with
            an ``accession`` attribute, for organism A and B respectively.

    Returns:
        dict mapping OrthologPair(first=<accession A>, second=<accession B>)
        to the number of domain pairs counted for that protein pair.

    A domain pair is counted when its protein pair has not been seen before,
    or when no domain pair already counted for the same protein pair *within
    the same group* shares its first or its second domain.
    """
    def sharesDomain(accepted, candidate):
        # True if an already accepted domain pair reuses either domain.
        return any(
            candidate.first == seen.first or candidate.second == seen.second
            for seen in accepted
        )

    proteinCounts = {}
    # group key -> protein pair -> domain pairs counted so far
    acceptedByGroup = {}
    for domainPair in pairwiseDomains:
        group = pairwiseDomains[domainPair]
        perGroup = acceptedByGroup.setdefault(group, {})
        proteinPair = OrthologPair(
            first=domainsA[domainPair.first.header].accession,
            second=domainsB[domainPair.second.header].accession)
        accepted = perGroup.setdefault(proteinPair, [])

        # Skip only when the protein pair is already known AND this domain
        # pair overlaps one that was counted before in the same group.
        if proteinPair in proteinCounts and sharesDomain(accepted, domainPair):
            continue
        proteinCounts[proteinPair] = proteinCounts.get(proteinPair, 0) + 1
        accepted.append(domainPair)
    return proteinCounts
def findProteinsWhereAllDomainsInferOrthology(pairsFull, pairsDomains):
    """Split full-sequence ortholog pairs by how well their domains support them.

    Parameters:
        pairsFull: mapping of protein pair -> expected domain count.
        pairsDomains: mapping of protein pair -> number of domain ortholog
            pairs found for that protein pair.

    Returns:
        A tuple ``(allDomainsInfer, notAll, notAny)`` of lists of pairs from
        ``pairsFull``:
          * allDomainsInfer -- both counts are equal,
          * notAll          -- fewer domain pairs than expected,
          * notAny          -- the pair does not occur in ``pairsDomains``.

    A pair with *more* domain pairs than expected is reported on stdout and
    put into none of the three lists, so the lists do not necessarily add up
    to ``len(pairsFull)``.
    """
    allDomainsInfer = []
    notAll = []
    notAny = []
    for pair in pairsFull:
        if pair not in pairsDomains:
            notAny.append(pair)
            continue

        expected = pairsFull[pair]
        supported = pairsDomains[pair]
        if expected == supported:
            allDomainsInfer.append(pair)
        elif expected > supported:
            notAll.append(pair)
        elif expected < supported:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Take a closer look at %s in "
                  "findProteinsWhereAllDomainsInferOrthology(pairsFull, pairsDomains)"
                  % (pair,))
    return allDomainsInfer, notAll, notAny
# add length condition
def findProteinsOrthologyOnlyByDomains(proteinsA, proteinsB, pairsDomains, pairsFull, taxidA, taxidB, cutoff):
    """Find protein pairs that are orthologous by their domains only.

    Parameters:
        proteinsA, proteinsB: unused; kept so existing callers keep working.
        pairsDomains: mapping of protein pair (``first``/``second`` accessions)
            -> number of domain ortholog pairs found for it.
        pairsFull: container of protein pairs from the full-sequence
            orthology; pairs contained in it are skipped.
        taxidA, taxidB: organism ids passed to Helper.initTsvForOrganism.
        cutoff: unused; kept so existing callers keep working.

    Returns:
        List of pairs from ``pairsDomains`` that are absent from ``pairsFull``
        and whose domain-pair count is at least the smaller of the two
        proteins' entry counts in the organism tables.
    """
    # Presumably accession -> list of annotated domains -- TODO confirm
    # against Helper.initTsvForOrganism.
    tsvA = Helper.initTsvForOrganism(taxidA)
    tsvB = Helper.initTsvForOrganism(taxidB)

    only = []
    for pair in pairsDomains:
        if pair in pairsFull:
            continue
        amount = min(len(tsvA[pair.first]), len(tsvB[pair.second]))
        if pairsDomains[pair] >= amount:
            only.append(pair)
    return only
# for pair in pairsDomains:
def addMinimumDomainCountToFullOrthologs(pairwise):
    """Attach the smaller domain count of both partners to every ortholog pair.

    Parameters:
        pairwise: iterable of pairs whose ``first`` and ``second`` members
            each expose ``accession`` and ``domains``.

    Returns:
        dict mapping OrthologPair(first=<accession>, second=<accession>) to
        ``min(len(first.domains), len(second.domains))``.

    Also prints two summary lines (count, mean and median of the domain
    counts) for pairs with equal and with differing domain counts.
    """
    minCounts = {}
    equalCounts = []
    unequalA = []
    unequalB = []
    for pair in pairwise:
        key = OrthologPair(first=pair.first.accession, second=pair.second.accession)
        countA = len(pair.first.domains)
        countB = len(pair.second.domains)
        if countA != countB:
            unequalA.append(countA)
            unequalB.append(countB)
        else:
            equalCounts.append(countA)
        minCounts[key] = min(countA, countB)

    # Items are joined with single spaces, exactly as a comma-separated
    # print statement would emit them.
    print(" ".join([
        str(len(equalCounts)),
        "sequence pairs have the same amount of domains with a mean of",
        str(Helper.mean(equalCounts)),
        "and a median of",
        str(Helper.median(equalCounts)),
    ]))
    print(" ".join([
        str(len(unequalA)),
        "sequence pairs do not have the same amount of domains with a mean of",
        str(Helper.mean(unequalA)),
        "/",
        str(Helper.mean(unequalB)),
        "and a median of",
        str(Helper.median(unequalA)),
        "/",
        str(Helper.median(unequalB)),
    ]))
    return minCounts
def main():
    """Compare gene-level and domain-level orthology results from the command line.

    Reads the options described in the module usage text, loads both result
    sets, classifies the full-sequence ortholog pairs by how many of their
    domains support them, and writes a summary report to
    ``<comparisonpath>/<o>/Comparison<o>``. With ``-w`` the individual pair
    lists are written into the same directory as well.

    File locations and the domain length cutoff come from the module-level
    configuration (``Filepaths`` and ``Options`` sections).

    Exits with status 2 when the command line cannot be parsed and with
    status 0 after printing the help text.
    """
    # parse command line options
    try:
        # Long options that take a value need a trailing "=", otherwise
        # getopt treats them as flags and rejects "--ffile=<name>".
        opts, args = getopt.getopt(
            sys.argv[1:],
            "hwf:d:o:a:b:",
            ["help", "write", "ffile=", "dfile=", "ofile=", "taxidA=", "taxidB="])
    except getopt.error as msg:
        print(msg)
        print("for help use --help")
        sys.exit(2)

    # Handle help first so that it never triggers side effects (directory
    # creation) of the other options.
    for o, a in opts:
        if o in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)

    fullfile = ''
    domainfile = ''
    outname = ''
    outputfile = ''
    tsvA = ''
    tsvB = ''
    taxA = ''
    taxB = ''
    write = False
    resultpath = ''
    ok = True
    for opt, arg in opts:
        if opt in ("-f", "--ffile"):
            fullfile = rcp.get("Filepaths", "resultpath") + arg
        elif opt in ("-d", "--dfile"):
            domainfile = rcp.get("Filepaths", "resultpath") + arg
        elif opt in ("-o", "--ofile"):
            outname = arg
            resultpath = rcp.get("Filepaths", "comparisonpath") + outname + "/"
            outputfile = resultpath + "Comparison" + outname
            # Create the directory without going through a shell: the name
            # comes from the command line, and an existing directory is fine.
            if not os.path.isdir(resultpath):
                os.makedirs(resultpath)
        elif opt in ("-a", "--taxidA"):
            tsvA = rcp.get("Filepaths", "tsvpath") + arg + ".tsv"
            taxA = arg
        elif opt in ("-b", "--taxidB"):
            tsvB = rcp.get("Filepaths", "tsvpath") + arg + ".tsv"
            taxB = arg
        elif opt in ("-w", "--write"):
            write = True
        else:
            ok = False

    if not ok:
        print(opts)
        print(__doc__)
        sys.exit(0)

    if not write:
        print("Option -w was not set.")
    cutoff = rcp.getint("Options", "domainlengthcutoff")

    # initialising
    proteinsA, proteinsB, orthologs, shortA, shortB = GeneLevelProtein.initGeneLevelProteins(fullfile, tsvA, tsvB, True)
    domainsA, domainsB, orthologsD = DomainLevelProtein.initDomainLevelProteins(domainfile)

    # calc pairwise ortholog mappings
    print("pairwise orthology mappings ...")
    pairwise = Helper.pairwiseOrthologs(orthologs, proteinsA, proteinsB)
    print("pairwise domain orthology mappings ...")
    pairwiseDomains = Helper.pairwiseOrthologs(orthologsD, domainsA, domainsB)

    # analysing stuff
    print("mapping domains to proteins ...")
    mapping = mapDomainOrthologsToProteins(pairwiseDomains, domainsA, domainsB)
    print("add counters to full sequence orthologs ...")
    counters = addMinimumDomainCountToFullOrthologs(pairwise)
    print("find proteins where all / not all / not any domains infer orthology ...")
    allDomains, notAll, notAny = findProteinsWhereAllDomainsInferOrthology(counters, mapping)
    print("find protein orthology only by domains ...")
    onlyByDomains = findProteinsOrthologyOnlyByDomains(proteinsA, proteinsB, mapping, counters, taxA, taxB, cutoff)

    print("start filtering ...")
    # run length filter with both thresholds reported below (50 % and 30 %)
    filteredOnlyByDomains = Helper.filterDomainOrthologyByLength(onlyByDomains, taxA, taxB, 0.5)
    filteredSome = Helper.filterDomainOrthologyByLength(notAll, taxA, taxB, 0.5)
    filteredAllDomains = Helper.filterDomainOrthologyByLength(allDomains, taxA, taxB, 0.5)
    filteredOnlyByDomains30 = Helper.filterDomainOrthologyByLength(onlyByDomains, taxA, taxB, 0.3)
    filteredSome30 = Helper.filterDomainOrthologyByLength(notAll, taxA, taxB, 0.3)
    filteredAllDomains30 = Helper.filterDomainOrthologyByLength(allDomains, taxA, taxB, 0.3)

    fullCount = len(counters)
    domainCount = len(mapping)

    def percentage(pairs):
        # Share of the full-sequence ortholog set, as a string; an empty
        # set has no meaningful share and must not crash the report.
        if fullCount == 0:
            return "n/a"
        return str(float(len(pairs)) / float(fullCount) * 100)

    # print information; the with-block closes the report even if the
    # consistency check below fails.
    with open(outputfile, 'w') as outhandle:
        outhandle.write("Basic information:\n")
        outhandle.write(str(shortA) + " domains from organism A were shorter than " + str(cutoff) + "\n")
        outhandle.write(str(shortB) + " domains from organism B were shorter than " + str(cutoff) + "\n")
        outhandle.write(str(fullCount) + " ortholog pairs in the full sequence orthology set\n")
        outhandle.write(str(domainCount) + " ortholog pairs in the domain orthology set\n\n")

        outhandle.write("\nOrthology support information:\n")
        outhandle.write("50 / 30 % filter cutoff\n")
        outhandle.write(str(len(allDomains)) + " ortholog pairs that are also supported by all their domains - " + percentage(allDomains) + "%\n")
        outhandle.write(str(len(filteredAllDomains30)) + " when filtered (30) - " + percentage(filteredAllDomains30) + "%\n")
        outhandle.write(str(len(filteredAllDomains)) + " when filtered (50) - " + percentage(filteredAllDomains) + "%\n")
        outhandle.write(str(len(notAll)) + " ortholog pairs that are not supported by all their domains - " + percentage(notAll) + "%\n")
        outhandle.write(str(len(filteredSome30)) + " when filtered (30) - " + percentage(filteredSome30) + "%\n")
        # Same " - " separator as the sibling lines.
        outhandle.write(str(len(filteredSome)) + " when filtered (50) - " + percentage(filteredSome) + "%\n")
        # Every full-sequence pair must fall into exactly one category.
        assert len(counters) - len(allDomains) - len(notAll) == len(notAny)
        outhandle.write(str(len(notAny)) + " ortholog pairs that were not supported by any domains - " + percentage(notAny) + "%\n")
        outhandle.write(str(len(onlyByDomains)) + " ortholog pairs that are supported by all their constituent domains but not by the full sequence - " + percentage(onlyByDomains) + "%\n")
        outhandle.write(str(len(filteredOnlyByDomains30)) + " when filtered (30) \n")
        outhandle.write(str(len(filteredOnlyByDomains)) + " when filtered (50) \n")

    if write:
        # writing orthology groups to different files, unfiltered sets
        # first, then the length-filtered ones
        idLists = [
            (counters, "FullSequencesOrthologs"),
            (pairwiseDomains, "PairwiseDomains"),
            (mapping, "DomainsMappedToProteins"),
            (allDomains, "AllDomains"),
            (onlyByDomains, "OnlyByDomains"),
            (notAny, "NotAnyDomains"),
            (notAll, "NotAllDomains"),
            (filteredOnlyByDomains, "FilteredOnlyByDomains"),
            (filteredAllDomains, "FilteredAllDomains"),
            (filteredSome, "FilteredNotAllDomains"),
            (filteredOnlyByDomains30, "FilteredOnlyByDomains30"),
            (filteredAllDomains30, "FilteredAllDomains30"),
            (filteredSome30, "FilteredNotAllDomains30"),
        ]
        for pairs, label in idLists:
            Helper.printPairsToFile(pairs, resultpath + label + outname)
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()