forked from Pex2892/PETAL
/
analysis.py
258 lines (196 loc) · 11.5 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
import gzip
import os
from xml.dom.minidom import parseString

import pandas as pd
from joblib import Parallel, delayed

import globals as gl
from utility import download_file, download_read_html, set_progress_bar, export_data_for_deep
def run_analysis(starting_depth):
    """Breadth-first expansion of the KEGG gene/pathway tree, one depth at a time.

    Iterates depth levels from ``starting_depth`` up to ``gl.deep_input``
    (inclusive).  Depth 1 bootstraps the tree from the configured pathway and
    gene; deeper levels expand every gene discovered at the previous level.
    After each level: duplicate genes are merged, the level is exported to a
    per-depth CSV, and only the current level's rows are kept in the global
    ``gl.DF_TREE`` dataframe for the next pass.

    Side effects: downloads KGML files, mutates ``gl.DF_TREE``,
    ``gl.gene_input_hsa`` and ``gl.gene_input_url``, writes CSV files.
    """
    for deep in range(starting_depth, gl.deep_input + 1):
        if deep == 1:
            # Download the KGML of the pathway set in the config file.
            download_file('http://rest.kegg.jp/get/' + gl.pathway_input + '/kgml',
                          os.path.join(os.getcwd(), 'database', 'pathways', 'xml'), gl.pathway_input + '.xml.gz')
            # Resolve the input gene name to its hsa id(s) and KEGG URL.
            hsa_finded, url_finded = get_info_gene_initial(gl.pathway_input, gl.gene_input)
            # Cache them in globals so later depths / callers can reuse them.
            gl.gene_input_hsa = hsa_finded
            gl.gene_input_url = url_finded
            # Parse the initial pathway; returns the rows for directly
            # connected genes (occurrences start at 1).
            list_rows_df_returned = read_kgml(deep, gl.pathway_input, gl.gene_input, hsa_finded, gl.gene_input, 1)
            # add n genes found to the dataframe
            unified([list_rows_df_returned])
            # Other pathways that also contain the input gene.
            list_pathways_this_gene = download_read_html(url_finded)
            # The pathway set as input from the config file is removed
            # (it was already processed above).
            if gl.pathway_input in list_pathways_this_gene:
                list_pathways_this_gene.remove(gl.pathway_input)
            if len(list_pathways_this_gene) > 0:
                # process single gene on each CPUs available
                list_rows_df_returned = Parallel(n_jobs=gl.num_cores_input)(delayed(analysis_deep_n)(
                    deep, gl.gene_input, hsa_finded, pathway_this_gene, gl.gene_input, 1)
                    for pathway_this_gene in set_progress_bar(
                        '[Deep: %d]' % deep, str(len(list_pathways_this_gene)))(list_pathways_this_gene))
                unified(list_rows_df_returned)
            else:
                print('[Deep: 1] Only directly connected genes were found')
        else:
            # Retrieve the genes found at depth-1, avoiding the input gene
            # (expanding it again would loop back to the root).
            df_genes_resulted = (
                gl.DF_TREE[(gl.DF_TREE['deep'] == deep - 1) & (gl.DF_TREE['name_son'] != gl.gene_input)])
            for index, row in set_progress_bar(
                    '[Deep: %d]' % deep, str(df_genes_resulted.shape[0]))(df_genes_resulted.iterrows()):
                # Return a list of pathways about the gene passed in input
                list_pathways_this_gene = download_read_html(row['url_kegg_son'])
                # The pathway set as input from the config file is removed, so as to avoid an endless loop
                # if gl.pathway_input in list_pathways_this_gene:
                #     list_pathways_this_gene.remove(gl.pathway_input)
                # process single gene on each CPUs available
                list_rows_df_returned = Parallel(n_jobs=gl.num_cores_input)(
                    delayed(analysis_deep_n)(
                        deep, row['name_son'], row['hsa_son'], pathway_this_gene,
                        row['fullpath'], row['occurrences']) for pathway_this_gene in list_pathways_this_gene
                )
                unified(list_rows_df_returned)
        # ----- START DROP DUPLICATES -----
        # Duplicates of the same level are extracted and sorted in alphabetical order
        df_genes_this_level = (gl.DF_TREE[gl.DF_TREE['deep'] == deep])
        df_duplicated_filtered = df_genes_this_level[df_genes_this_level.duplicated(
            subset=['name_son'], keep=False)].sort_values('name_son')
        # The names of the genes that are duplicated are recovered
        list_name_genes_duplicated = df_duplicated_filtered.name_son.unique()
        # process single gene on each CPUs available
        list_rows_to_do_df_returned = Parallel(n_jobs=gl.num_cores_input)(
            delayed(get_info_row_duplicated)(df_duplicated_filtered, gene_duplicate)
            for gene_duplicate in list_name_genes_duplicated
        )
        # The number of occurrences of the found links is updated and the duplicates will be deleted
        clean_update_row_duplicates(list_rows_to_do_df_returned)
        # Only the current level is kept in memory; the next iteration reads
        # it back as "deep - 1".  NOTE(review): this discards earlier levels
        # from DF_TREE — presumably they live on only in the exported CSVs;
        # confirm downstream consumers rely on the CSVs, not on DF_TREE.
        gl.DF_TREE = gl.DF_TREE[(gl.DF_TREE['deep'] == deep)]
        # export in csv per deep
        export_data_for_deep(deep)
        # Row indexes are reset, because they are no longer sequential due to the elimination of duplicates
        gl.DF_TREE = gl.DF_TREE.reset_index(drop=True)
        # ----- END DROP DUPLICATES -----
def get_info_gene_initial(pathway_hsa, name_gene):
    """Look up *name_gene* inside the locally cached KGML of *pathway_hsa*.

    Reads ``database/pathways/xml/<pathway_hsa>.xml.gz`` under the current
    working directory and scans every ``<entry>`` of type ``gene``, matching
    the gene name against the comma-separated ``<graphics>`` label.

    Args:
        pathway_hsa: KEGG pathway identifier (e.g. ``'hsa04110'``).
        name_gene: gene symbol to locate (e.g. ``'TP53'``).

    Returns:
        tuple[str, str]: the matching entry's ``name`` (hsa id(s)) and its
        KEGG ``link`` URL.

    Raises:
        SystemExit: with status 1 when the gene is not found in the pathway.
    """
    filename = os.path.join(os.getcwd(), 'database', 'pathways', 'xml', pathway_hsa + '.xml.gz')
    with gzip.open(filename, "rb") as f:
        content = f.read().decode('utf-8')
        mydoc = parseString(content)

    # The trailing comma avoids prefix false-positives (e.g. 'TP5' inside
    # 'TP53,').  NOTE(review): this misses a gene that appears *last* in the
    # graphics label with no trailing comma — confirm KEGG labels always list
    # aliases after the symbol.
    string_check = name_gene + ','
    for elem in mydoc.getElementsByTagName('entry'):
        if elem.attributes['type'].value == 'gene' and string_check in \
                elem.getElementsByTagName('graphics')[0].attributes['name'].value:
            return elem.attributes['name'].value, elem.attributes['link'].value

    # Fixed the garbled original message ("The gene entered not exit into
    # pathway selected!") and replaced the site-injected exit() helper with
    # an explicit SystemExit, which is always available.
    print('The gene entered does not exist in the selected pathway!')
    raise SystemExit(1)
def search_id_to_hsa(list_genes_this_pathway, hsa_gene):
    """Return every gene tuple whose hsa field (index 1) equals *hsa_gene*."""
    matches = []
    for gene_tuple in list_genes_this_pathway:
        if gene_tuple[1] == hsa_gene:
            matches.append(gene_tuple)
    return matches
def search_gene_to_id(list_genes_this_pathway, id_gene):
    """Return every gene tuple that contains *id_gene* among its fields."""
    return list(filter(lambda gene_tuple: id_gene in gene_tuple, list_genes_this_pathway))
def concat_multiple_subtype(list_subtype):
    """Join the ``name`` attribute of each subtype node with ``//``.

    Returns the string ``'None'`` when the node list is empty.
    """
    if not list_subtype:
        return 'None'
    return '//'.join(item.attributes['name'].value for item in list_subtype)
def read_kgml(deep, pathway_hsa, name_gene_start, hsa_gene_start, path, occu):
    """Parse a cached KGML file and build relation rows rooted at one gene.

    Args:
        deep: current exploration depth (stored on each row).
        pathway_hsa: KEGG pathway id whose cached ``.xml.gz`` is parsed.
        name_gene_start: parent gene's display name.
        hsa_gene_start: parent gene's hsa identifier(s).
        path: slash-separated chain of gene names from the root to the parent.
        occu: occurrence count inherited from the parent row.

    Returns:
        list[dict]: one row per relation whose source (``entry1``) is the
        parent gene and whose target (``entry2``) resolves to a gene-type
        entry in this pathway.  Empty when the parent gene is absent.
    """
    filename = os.path.join(os.getcwd(), 'database', 'pathways', 'xml', pathway_hsa + '.xml.gz')
    with gzip.open(filename, "rb") as f:
        content = f.read().decode('utf-8')
        mydoc = parseString(content)

    # GENES and RELATIONS sections of the KGML document.
    entry = mydoc.getElementsByTagName('entry')
    graphics = mydoc.getElementsByTagName('graphics')
    relation = mydoc.getElementsByTagName('relation')

    # NOTE(review): zip(entry, graphics) assumes exactly one <graphics> child
    # per <entry>, in document order — confirm this holds for all KGML files.
    list_genes_this_pathway = []
    for elem, elem2 in zip(entry, graphics):
        if 'hsa:' in elem.attributes['name'].value and elem.attributes['type'].value == 'gene':
            # (entry id, hsa id(s), display name, KEGG link) per gene entry.
            list_genes_this_pathway.append((elem.attributes['id'].value, elem.attributes['name'].value,
                                            elem2.attributes['name'].value.split(',')[0],
                                            elem.attributes['link'].value))

    # A gene can appear under several entry ids in the same pathway, so every
    # id matching the parent's hsa identifier is collected.
    list_ids_gene_input = search_id_to_hsa(list_genes_this_pathway, hsa_gene_start)
    list_rows = list()
    if len(list_ids_gene_input) > 0:
        for elem in relation:
            for id_gene in list_ids_gene_input:
                # Keep only relations whose source ("entry1") is the parent
                # gene.  (The original nested an identical, redundant second
                # test of this same condition — removed.)
                if elem.attributes['entry1'].value == id_gene[0]:
                    # Resolve the relation target; an empty result means the
                    # target is a group/compound entry rather than a gene.
                    list_gene_relation = search_gene_to_id(list_genes_this_pathway, elem.attributes['entry2'].value)
                    if len(list_gene_relation) > 0:
                        row = {
                            'deep': deep,
                            'name_father': name_gene_start,
                            'hsa_father': hsa_gene_start,
                            'name_son': list_gene_relation[0][2],
                            'hsa_son': list_gene_relation[0][1],
                            'url_kegg_son': list_gene_relation[0][3],
                            'relation': elem.attributes['type'].value,
                            'type_rel': concat_multiple_subtype(elem.getElementsByTagName('subtype')),
                            'pathway_of_origin': pathway_hsa,
                            'fullpath': path + '/' + list_gene_relation[0][2],
                            'occurrences': occu
                        }
                        list_rows.append(row)
    return list_rows
def analysis_deep_n(deep, gene, gene_hsa, pathway_this_gene, path, occu):
    """Download one pathway's KGML, then extract the relation rows for *gene*.

    Thin per-pathway worker used by the joblib Parallel calls; returns the
    list of row dicts produced by ``read_kgml``.
    """
    target_dir = os.path.join(os.getcwd(), 'database', 'pathways', 'xml')
    download_file('http://rest.kegg.jp/get/' + pathway_this_gene + '/kgml',
                  target_dir, pathway_this_gene + '.xml.gz')
    return read_kgml(deep, pathway_this_gene, gene, gene_hsa, path, occu)
def unified(list_rows_returned):
    """Append the rows produced by the parallel workers to ``gl.DF_TREE``.

    Args:
        list_rows_returned: list of per-worker results; each item is either
            ``None`` or a list of row dicts as produced by ``read_kgml``.
    """
    # Flatten, skipping workers that produced nothing.
    rows = [cell for row in list_rows_returned if row is not None for cell in row]
    if rows:
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # a single concat of all rows also avoids re-copying the whole frame
        # once per appended row (quadratic in the original).
        gl.DF_TREE = pd.concat([gl.DF_TREE, pd.DataFrame(rows)], ignore_index=True)
def get_info_row_duplicated(df_filtered, gene):
    """Compute, per father gene, the merge instructions for one duplicated gene.

    For each ``name_father`` group of rows whose ``name_son`` equals *gene*,
    produces a tuple:
        (index_to_keep, [indexes_to_drop], merged_hsa, merged_url,
         joined_relations, joined_type_rels, joined_pathways, occurrences)
    where the string fields of the group are joined with '§§' and occurrences
    is the first row's count multiplied by the group size.
    """
    result = []
    same_gene = df_filtered[df_filtered['name_son'] == gene]
    for father, group in same_gene.groupby('name_father'):
        # Merge all hsa ids into a sorted, de-duplicated, space-separated string.
        all_hsa = ' '.join(group['hsa_son'].tolist()).split(' ')
        merged_hsa = ' '.join(sorted(set(all_hsa)))
        # Rebuild the KEGG multi-gene URL from the merged id list.
        merged_url = 'https://www.kegg.jp/dbget-bin/www_bget?' + merged_hsa.replace(' ', '+')
        # One surviving row stands for the whole group of duplicates.
        total_occurrences = group.iloc[0]['occurrences'] * group.shape[0]
        keep_index = group.index[0]
        drop_indexes = [idx for idx in group.index.tolist() if idx != keep_index]
        result.append((
            keep_index,
            drop_indexes,
            merged_hsa,
            merged_url,
            '§§'.join(group['relation'].tolist()),
            '§§'.join(group['type_rel'].tolist()),
            '§§'.join(group['pathway_of_origin'].tolist()),
            total_occurrences
        ))
    return result
def clean_update_row_duplicates(list_to_do_df):
    """Apply the merge info from ``get_info_row_duplicated`` to ``gl.DF_TREE``.

    Each inner tuple describes one duplicate group: the surviving row is
    overwritten with the merged fields, and the remaining duplicate rows are
    dropped from the global dataframe.
    """
    cols = ['hsa_son', 'url_kegg_son', 'relation', 'type_rel', 'pathway_of_origin', 'occurrences']
    for per_gene_rows in list_to_do_df:
        for info in per_gene_rows:
            keep_idx, drop_idxs = info[0], info[1]
            # Overwrite the surviving row with the merged information.
            gl.DF_TREE.loc[keep_idx, cols] = [info[2], info[3], info[4], info[5], info[6], info[7]]
            # Remove the now-redundant duplicate rows.
            if drop_idxs:
                gl.DF_TREE = gl.DF_TREE.drop(drop_idxs)