forked from nsalomonis/altanalyze
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SubGeneViewerExport.py
211 lines (191 loc) · 10.1 KB
/
SubGeneViewerExport.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
###SubGeneViewerExport
#Copyright 2005-2008 J. David Gladstone Institutes, San Francisco California
#Author Nathan Salomonis - nsalomonis@gmail.com
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is furnished
#to do so, subject to the following conditions:
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import sys, string
import os.path
import unique
import export
### Alias for the unique module; returnDirectories() reads dirfile.__file__ to find the directory that sub-directory paths are appended to
dirfile = unique
############ File Import Functions #############
def filepath(filename):
    """Return whatever ``unique.filepath`` resolves *filename* to."""
    return unique.filepath(filename)
def read_directory(sub_dir):
    """Return the entries of *sub_dir* that end in a recognised data-file suffix.

    The raw listing comes from ``unique.read_directory``.  Only names ending
    in ``.txt``, ``.all``, ``.data`` or ``.fa`` are kept, which is meant to
    keep folder names out of the result.
    """
    data_suffixes = (".txt", ".all", ".data", ".fa")
    return [name for name in unique.read_directory(sub_dir) if name.endswith(data_suffixes)]
def returnDirectories(sub_dir):
    """Return the entries of *sub_dir* whose names contain no '.'.

    *sub_dir* is appended directly to the directory that holds the module
    bound to ``dirfile``.  Names with a '.' are treated as files and dropped.
    """
    base_dir = os.path.dirname(dirfile.__file__)
    return [name for name in os.listdir(base_dir + sub_dir) if "." not in name]
class GrabFiles:
    """Remembers one directory and looks up the data files inside it."""

    def setdirectory(self, value):
        """Store *value* as the directory later searches operate on."""
        self.data = value

    def display(self):
        """Print the stored directory."""
        print(self.data)

    def searchdirectory(self, search_term):
        """Return the file paths in the stored directory that contain *search_term*.

        When nothing matches, 'files not found' is printed and the empty
        list is still returned.
        """
        matches = getDirectoryFiles(self.data, search_term)
        if not matches:
            print('files not found')
        return matches

    def returndirectory(self):
        """Return the paths of all data files in the stored directory."""
        return getAllDirectoryFiles(self.data)
def getAllDirectoryFiles(import_dir):
    """Return one path per data file that ``read_directory`` finds in *import_dir*.

    Each path is *import_dir* without its first character, then '/', then
    the file name.
    """
    return [import_dir[1:] + '/' + file_name for file_name in read_directory(import_dir)]
def getDirectoryFiles(import_dir, search_term):
    """Return the data-file paths under *import_dir* that contain *search_term*.

    Paths are built as *import_dir* minus its first character, '/', and the
    file name.  The substring test is applied to that whole path, not just
    to the file name.
    """
    candidate_paths = (import_dir[1:] + '/' + file_name for file_name in read_directory(import_dir))
    return [path for path in candidate_paths if search_term in path]
def cleanUpLine(line):
    """Return *line* with line-ending and quoting characters removed.

    Every newline, every backslash-followed-by-'c' pair, every carriage
    return and every double quote is deleted, in that order.  Tabs and all
    other characters are left alone so the caller can still split on tabs.

    Uses ``str.replace`` rather than the ``string`` module functions, which
    no longer exist in Python 3.  The backslash-c pattern is written as
    ``'\\c'`` so it does not rely on an unrecognised escape sequence.
    """
    for unwanted in ('\n', '\\c', '\r', '"'):
        line = line.replace(unwanted, '')
    return line
############### Main Program ###############
def importAnnotationData(filename):
    """Load a gene-to-symbol table from *filename* into the global ``gene_symbol_db``.

    The file is read as tab-delimited text.  Column 0 is the gene ID and
    column 1 (when present) is the symbol.  Rows with a missing or empty
    symbol are skipped.  ``gene_symbol_db`` is replaced with a fresh dict on
    every call.

    No header row is skipped: every line, including the first, is treated
    as data.
    """
    global gene_symbol_db
    fn = filepath(filename)
    gene_symbol_db = {}
    ### The with-block closes the handle even if a row fails to parse
    with open(fn, 'rU') as annotation_file:
        for line in annotation_file:
            t = cleanUpLine(line).split('\t')
            gene = t[0]
            symbol = t[1] if len(t) > 1 else ''
            if symbol:
                gene_symbol_db[gene] = symbol
### importGeneData: parse one tab-delimited table and return a dict mapping gene -> list of value entries.
### data_type selects the column layout each row is unpacked into:
###   'junction'  -> gene, region5, region3 (3 columns)
###   'feature'   -> probeset, gene, feature, region (4 columns)
###   'structure' -> gene, exon, type, block, region, const, start, annot (8 columns)
### A row with a different number of columns makes the tuple unpacking raise ValueError.
### The first line of the file is skipped (x flips from 0 to 1 on it).
### Uses the module globals region_db, region_count_db, structure_region_db and feature_db (declared global in
### reorganizeData) and reads gene_symbol_db (filled by importAnnotationData).
### NOTE(review): indentation was lost in this copy of the file; the nesting described in the comments below is
### inferred from statement order and variable use -- confirm against the upstream source.
def importGeneData(filename,data_type):
fn=filepath(filename); x=0; gene_db={}
for line in open(fn,'rU').xreadlines():
data = cleanUpLine(line)
t = string.split(data,'\t')
if x==0:x=1
else:
### proceed stays 'yes' unless the structure branch below has already stored this row's values itself
proceed = 'yes'
### junction rows: the stored value is 'region5:region3'
if data_type == 'junction': gene, region5, region3 = t; value_str = region5+':'+region3
if data_type == 'feature':
### For feature rows value_str is a 2-tuple (region, 'feature:region:probeset'), not a plain string
probeset, gene, feature, region = t; value_str = region,feature+':'+region+':'+probeset ###access region data later
#if (gene,region) not in region_db: region_db[gene,region] = feature,probeset ### Needed for processed structure table (see two lines down)
### Collect every (feature, probeset) pair seen for this (gene, region)
try: region_db[gene,region].append((feature,probeset)) ### Needed for processed structure table (see two lines down)
except KeyError: region_db[gene,region] = [(feature,probeset)]
### Count how many feature rows were seen for this (gene, region)
try: region_count_db[(gene,region)]+=1
except KeyError: region_count_db[(gene,region)]=1
###have to add in when parsing structure probeset values for nulls (equal to 0)
if data_type == 'structure':
### Note: 'type' shadows the builtin here; region_id and region are assigned but not read again in this function
gene, exon, type, block, region, const, start, annot = t; region_id = exon
if len(annot)<1: annot = '---'
### region_db is keyed by (gene, region) from the feature table; here the exon column is used as the region key
if (gene,exon) in region_db:
probeset_data = region_db[(gene,exon)]
### One value string is built per (feature, probeset) pair recorded for this exon
for (feature,probeset) in probeset_data:
count = str(region_count_db[(gene,exon)]) ###here, feature is the label (reversed below)
value_str = feature+':'+exon+':'+probeset+':'+type+':'+count+':'+const+':'+start+':'+annot
if gene in gene_symbol_db: ###Only incorporate gene data with a gene symbol, since Cytoscape currently requires this
try: gene_db[gene].append(value_str)
except KeyError: gene_db[gene] = [value_str]
### Presumably stops the shared append at the end of the row from storing the last value_str a second time
proceed = 'no'
else: ### Occurs when no probeset is present: E.g. the imaginary first and last UTR region if it doesn't exist
feature = exon ###feature contains the region information, exon is the label used in Cytoscape
### Splitting on '.' must yield exactly two parts; exon is rebound to the part before the '.'
exon,null = string.split(exon,'.')
probeset = '0'
count = '1'
null_value_str = exon,exon+':'+feature+':'+probeset ###This is how Alex has it... to display the label without the '.1' first
### Adds a placeholder (label, value) tuple to the global feature_db for this gene
try: feature_db[gene].append(null_value_str)
except KeyError: feature_db[gene] = [null_value_str]
value_str = exon+':'+feature+':'+probeset+':'+type+':'+count+':'+const+':'+start+':'+annot
### Record the block value for this exon label in structure_region_db[gene]; reorganizeData looks these up by region.
### Presumably this runs for every structure row, with exon already shortened when the else-branch above ran -- confirm nesting
if gene in structure_region_db:
order_db = structure_region_db[gene]
order_db[exon] = block
else:
order_db = {}
order_db[exon] = block
structure_region_db[gene] = order_db
### Shared append for all data types: store value_str under the gene unless it was already stored above
if gene in gene_symbol_db and proceed == 'yes': ###Only incorporate gene data with a gene symbol, since Cytoscape currently requires this
try: gene_db[gene].append(value_str)
except KeyError: gene_db[gene] = [value_str]
return gene_db
def exportData(gene_db, data_type, species):
    """Write one SubGeneViewer table for *species* and print where it went.

    The output path is
    ``AltDatabase/ensembl/SubGeneViewer/<species>/Xport_sgv_<data_type>.csv``
    and the handle comes from ``export.createExportFile``.  After a header
    line, each gene in *gene_db* that has an entry in the global
    ``gene_symbol_db`` gets one tab-delimited row: gene, symbol, and the
    gene's values joined with commas.  Genes that raise ``KeyError`` are
    skipped silently.

    *data_type* must be 'feature', 'structure' or 'splice'.  Any other value
    leaves ``title`` unbound, so the header write fails.
    """
    export_dir = 'AltDatabase/ensembl/SubGeneViewer/' + species
    export_file = export_dir + '/Xport_sgv_' + data_type + '.csv'
    headers = {
        'feature': 'gene\tsymbol\tsgv_feature\n',
        'structure': 'gene\tsymbol\tsgv_structure\n',
        'splice': 'gene\tsymbol\tsgv_splice\n',
    }
    if data_type in headers:
        title = headers[data_type]
    data = export.createExportFile(export_file, export_dir)
    data.write(title)
    for gene in gene_db:
        try:
            symbol = gene_symbol_db[gene]
            joined_values = ','.join(gene_db[gene])
            data.write('\t'.join([gene, symbol, joined_values]) + '\n')
        except KeyError:
            pass
    data.close()
    print('exported to ' + export_file)
def customLSDeepCopy(ls):
    """Return a new list holding the same items as *ls*, in the same order.

    Despite the name, only the outer container is copied.  The items
    themselves are shared with *ls*.
    """
    return list(ls)
def reorganizeData(species):
    """Build and export the splice, structure and feature tables for *species*.

    Steps:
      1. Reset the globals ``region_db``, ``region_count_db`` and
         ``structure_region_db``.
      2. Find the exon-structure, feature-data, junction-data and
         Ensembl-annotations files under ``/AltDatabase/ensembl/<species>``
         and use the first match of each.
      3. Load the annotations, then the junction, feature and structure
         tables, in that order.  The feature table is stored in the global
         ``feature_db`` before the structure table is parsed.
      4. Re-order each gene's feature entries by the block number recorded
         in ``structure_region_db``, keeping the original order among
         entries that share a block.
      5. Export all three tables with ``exportData``.
    """
    global region_db, region_count_db, structure_region_db, feature_db
    region_db = {}
    region_count_db = {}
    structure_region_db = {}

    finder = GrabFiles()
    finder.setdirectory('/AltDatabase/ensembl/' + species)
    exon_struct_file = finder.searchdirectory('exon-structure')
    feature_file = finder.searchdirectory('feature-data')
    junction_file = finder.searchdirectory('junction-data')
    annot_file = finder.searchdirectory('Ensembl-annotations.')
    importAnnotationData(annot_file[0])

    ### All three tables go through the same parser; the data_type argument selects the column layout
    splice_db = importGeneData(junction_file[0], 'junction')
    feature_db = importGeneData(feature_file[0], 'feature')
    structure_db = importGeneData(exon_struct_file[0], 'structure')

    for gene in feature_db:
        order_db = structure_region_db[gene]
        ### Number the entries first so the sort below cannot disturb their original relative order
        numbered = [(rank, region, value_str)
                    for rank, (region, value_str) in enumerate(feature_db[gene], 1)]
        by_block = []
        for (rank, region, value_str) in numbered:
            try:
                block_number = order_db[region]
            except KeyError:
                print('%s %s %s' % (gene, region, order_db))
                kill  ### not defined in this file: evaluating it raises NameError and aborts the run
            by_block.append((int(block_number), rank, value_str))
        by_block.sort()
        feature_db[gene] = [entry[2] for entry in by_block]

    exportData(splice_db, 'splice', species)
    exportData(structure_db, 'structure', species)
    exportData(feature_db, 'feature', species)
### Script entry point: when the file is run directly, export the SubGeneViewer tables for species code 'Hs'
if __name__ == '__main__':
### Rebinds dirfile to the unique module (the same value it is given near the top of the file)
dirfile = unique
species = 'Hs'
reorganizeData(species)