forked from OpenBEL/resource-generator
/
gp_baseline.py
executable file
·292 lines (251 loc) · 9.59 KB
/
gp_baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/env python3
# coding: utf-8
'''
gp_baseline.py
The entrance point to the program. gp_baseline calls out to
namespace.py, equiv.py, and annotate.py to construct various
.bel files.
inputs:
-b resource-generator phase to begin at [1,2,3,4,5]
-e resource-generator phase to end at [1,2,3,4,5] (>= begin phase)
-n the directory to store the new equivalence data
-p pickle file name suffix for parsed data
-v enables verbose mode
phases:
1. data download
2. data parser with pickler
3. namespace builder
4. annotation builder
5. equivalence builder
'''
from configuration import *
import argparse
import os
import parsed
import pickle
import time
import shutil
import annotate
from common import download
from datasets import NamespaceDataSet, DataSet
from constants import PARSER_TYPE, RES_LOCATION
# collect paths needed for proper resource file location
import sys
# script source path: derive the directory this script lives in from argv[0]
# (no '/' in argv[0] means it was launched from its own directory)
if sys.argv[0].find('/') < 0:
    src_dir = '.'
else:
    src_dir = sys.argv[0][:sys.argv[0].rfind('/')]
# script execution path (directory the user launched from)
cwd = os.getcwd()
# allow for successful import of equiv module
# - equiv.py attempts to load data from [cwd]/datasets/meshcs_to_gocc.csv
#   using os.getcwd() for value of [cwd]
# - if gp_baseline is not launched from its source directory, the import fails
# so: temporarily chdir into the source directory just for the import
os.chdir(src_dir)
# assure full path is saved (src_dir may have been relative up to here)
src_dir = os.getcwd()
import equiv
os.chdir(cwd)
# command-line interface -- note that -n (output directory) is the only
# required option; phases default to the full 1..5 pipeline
parser = argparse.ArgumentParser(description="""Generate namespace and equivalence files
for gene/protein datasets.""")
parser.add_argument("-v", "--verbose", required=False, action="store_true",
                    help="enable verbose program output")
parser.add_argument("-n", required=True, nargs=1, metavar="DIRECTORY",
                    help="directory to store the new namespace equivalence data")
parser.add_argument("-b", "--begin_phase", type=int, choices=[1, 2, 3, 4, 5], default = 1,
                    help="resource-generator phase to begin at")
parser.add_argument("-e", "--end_phase", type=int, choices=[1, 2, 3, 4, 5], default = 5,
                    help="resource-generator phase to end at")
# suffix only -- actual pickle files are named '<dataset>.<suffix>'
parser.add_argument("-p", "--parsed_pickle", type=str, default = 'parsed_data.pickle',
                    help="pickle file name suffix for parsed data")
args = parser.parse_args()
# Validate/normalize the parsed arguments.
verbose = args.verbose
if verbose:
    print('\nRunning gp_baseline in verbose mode.\n')
# A run cannot end before it begins: clamp end_phase up to begin_phase
# rather than aborting, and tell the user.
if args.begin_phase > args.end_phase:
    args.end_phase = args.begin_phase
    # fixed typo in user-facing message: "Reseting" -> "Resetting"
    print('Resetting end phase to match begin phase: %d.' % (args.end_phase))
# root output directory for all generated resource files (-n option)
resource_dir = args.n[0]
if not os.path.exists(resource_dir):
    os.mkdir(resource_dir)
    if verbose:
        print('Created resource destination directory:', resource_dir)
# change to resource directory -- all later relative paths (datasets/,
# templates/, pickle files) resolve against it
os.chdir(resource_dir)
if verbose:
    print('Changing to directory:', resource_dir)
# make dataset directory (holds downloaded and copied source data)
if not os.path.exists('datasets'):
    os.mkdir('datasets')
    if verbose:
        print('Created datasets directory')
# bring in some dependancies
dep_files = []
dep_files.append('selventa-legacy-diseases.txt')
dep_files.append('selventa-legacy-chemical-names.txt')
dep_files.append('selventa-protein-families.txt')
dep_files.append('selventa-named-complexes.txt')
if args.begin_phase <=1:
for df in dep_files:
if not os.path.exists(src_dir+'/datasets/'+df):
print('WARNING !!! Dependency file %s not found in %s/datasets/' % (df, src_dir))
else:
shutil.copy(src_dir+'/datasets/'+df, os.getcwd()+'/datasets')
if verbose:
print('Copying dependency file %s to %s/datasets/' % (df, os.getcwd()))
# make templates directory and copy every template file from the source tree
if not os.path.exists('templates'):
    os.mkdir('templates')
    if verbose:
        print('Created templates directory')
for df in os.listdir(src_dir+'/templates'):
    shutil.copy(src_dir+'/templates/'+df, os.getcwd()+'/templates')
    if verbose:
        print('Copying template file %s to %s/templates/' % (df, os.getcwd()))
# NOTE: 'cwd' is rebound here -- from this point on it refers to the
# resource output directory, NOT the directory the user launched from
cwd = os.getcwd()
# ======= Phase I: download every dataset listed in configuration.baseline_data
# baseline_data maps a local file name to a tuple whose RES_LOCATION entry is
# the source URL (see constants.py for tuple indices).
start_time = time.time()
if args.begin_phase <= 1:
    print('\n======= Phase I, downloading data =======')
    for name, url_tuple in baseline_data.items():
        if verbose:
            print('Downloading ' + str(name))
            sys.stdout.flush()
        path = os.path.join('datasets/', name)
        loc = url_tuple[RES_LOCATION]
        # only fetch resolvable URL schemes (file://, ftp://, http(s)://);
        # str.startswith with a tuple replaces the old any([...]) list-comp
        if loc.startswith(('file', 'ftp', 'http')):
            download(loc, path)
        print(loc)
    print('Phase 1 ran in %.3f minutes' % ((time.time() - start_time) / 60))
    if args.end_phase == 1:
        print('\nTerminating process after phase 1 as specified by user.')
        print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
        sys.exit()
else:
    print('\nSkipping phase 1.')
sys.stdout.flush()
# ======= Phase II: run each dataset's configured parser and pickle the result
if args.begin_phase <= 2:
    print('\n======= Phase II, parsing data =======')
    # For now, download and store the data in the parsed.py module. This module
    # could be replaced or re-implemented using something like DBM to help with
    # memory usage.
    interval_time = time.time()
    working_dir = os.getcwd()
    # object_dict is dictionary with keys = prefix + '_data' and value = data object
    # use object_dict to access data objects by name
    object_dict = {}
    for root, dirs, filenames in os.walk(working_dir):
        for fn in filenames:
            # guard clause (was a nested 'if fn in baseline_data:')
            if fn not in baseline_data:
                continue
            try:
                data_tuple = baseline_data.get(fn)
                data_object = data_tuple[2]
                parser = data_tuple[PARSER_TYPE]('datasets/'+fn)
                if verbose:
                    parser.is_verbose()
                    print('Running {0} on file {1}'.format(str(parser), fn))
            except Exception:
                # narrowed from a bare 'except:' so KeyboardInterrupt and
                # SystemExit still propagate; misconfigured files are skipped
                print('WARNING - skipping {0}; file not properly configured'.format(fn))
                continue
            for x in parser.parse():
                parsed.build_data(x, str(parser), data_object)
            # if data_tuple[2] is a list of objects, pickle each one separately
            if isinstance(data_object, list):
                for o in data_object:
                    o.source_file = fn
                    with open(str(o) + '.' + args.parsed_pickle, 'wb') as f:
                        pickle.dump(o, f, pickle.HIGHEST_PROTOCOL)
                    object_dict[str(o) + '_data'] = o
                continue
            # if data_tuple[2] is a single object
            elif isinstance(data_object, DataSet):
                data_object.source_file = fn
                with open(str(data_object) + '.' + args.parsed_pickle, 'wb') as f:
                    pickle.dump(data_object, f, pickle.HIGHEST_PROTOCOL)
                object_dict[str(data_object) + '_data'] = data_object
    print('Phase II ran in %.3f minutes' % ((time.time() - interval_time) / 60))
    if args.end_phase == 2:
        print('\nTerminating process after phase 2 as specified by user.')
        print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
        sys.exit()
else:
    print('\nSkipping phase 2.')
sys.stdout.flush()
# if beginning after parsing phase (phase 2), load pickled objects and store in object_dict
if args.begin_phase >= 3:
    object_dict = {}
    # BUG FIX: match the configurable pickle suffix (-p/--parsed_pickle)
    # instead of the hard-coded default 'parsed_data.pickle'. Previously,
    # resuming at phase >= 3 after a phase-2 run that used a custom suffix
    # found no files and silently produced empty output.
    for file_name in os.listdir("."):
        if file_name.endswith(args.parsed_pickle):
            with open(file_name, 'rb') as f:
                # NOTE(review): pickle.load is only safe because these files
                # were produced by phase 2 of this same tool
                d = pickle.load(f)
            # '<dataset>.<suffix>' -> '<dataset>_data', the object_dict key
            file_name = file_name.replace("." + args.parsed_pickle, "") + '_data'
            if isinstance(d, DataSet):
                object_dict[file_name] = d
                if verbose:
                    print('Loading {0} from pickle file'.format(str(d)))
# ======= Phase III: write .belns namespace files for each namespace dataset
if args.begin_phase <= 3:
    print('\n======= Phase III, building namespaces =======')
    interval_time = time.time()
    for dataset in object_dict.values():
        if not isinstance(dataset, NamespaceDataSet):
            continue
        if 'ns' not in dataset.scheme_type:
            continue  # skip annotation 'anno' data sets
        if verbose:
            print('Generating namespace file for ' + str(dataset))
        try:
            dataset.write_ns_values(cwd)
        except Exception:
            # narrowed from a bare 'except:' so Ctrl-C / SystemExit still
            # propagate; one failed namespace does not abort the others
            print("Unexpected error:", sys.exc_info()[1])
    print('Phase III ran in %.3f minutes' % ((time.time() - interval_time) / 60))
    if args.end_phase == 3:
        print('\nTerminating process after phase 3 as specified by user.')
        print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
        sys.exit()
else:
    print('\nSkipping phase 3.')
sys.stdout.flush()
if args.begin_phase <= 4:
    print('\n======= Phase IV, building annotations =======')
    interval_time = time.time()
    # NOTE - Phase IV not implemented!  (the 'annotate' module is imported at
    # the top of this file but is never invoked in the visible code)
    print('Phase IV ran in %.3f minutes' % ((time.time() - interval_time) / 60))
    if args.end_phase == 4:
        print('\nTerminating process after phase 4 as specified by user.')
        print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
        sys.exit()
else:
    print('\nSkipping phase 4.')
sys.stdout.flush()
# ======= Phase V: always runs (no begin/end-phase guard, unlike phases 1-4)
print('\n======= Phase V, building equivalences =======')
# Any datasets producing a .beleq file should be added to equiv_data
interval_time = time.time()
if args.begin_phase > 2:
    # need to reload some data into parsed objects since they are needed by equiv:
    # - meshd ...needs... do
    parsed.do_data = object_dict.get('do_data')
    # - affy ...needs... g2
    parsed.gene2acc_data = object_dict.get('gene2acc_data')
# equiv_root_data should include string names for each namespace dataset used
# as a 'root' for equivalence; roots are equivalenced first so that non-root
# datasets can resolve against them in the second loop below
equiv_root_data = ['egid_data','hgnc_data', 'mgi_data', 'rgd_data', 'gobp_data', 'chebi_data', 'gocc_data', 'do_data', 'meshc_data']
for data_name in equiv_root_data:
    data = object_dict.get(data_name)
    if data:
        if verbose:
            print('Generating equivalence file for ' + str(data))
        equiv.equiv(data, verbose)
# now make equivalences for namespace datasets that are not root
for data_name, data in object_dict.items():
    # skip equiv root datasets handled above
    if data_name in equiv_root_data:
        continue
    elif isinstance(data, NamespaceDataSet) and 'ns' in data.scheme_type:
        if verbose:
            print('Generating equivalence file for ' + str(data))
        equiv.equiv(data, verbose)
print('Phase V ran in %.3f minutes' % ((time.time() - interval_time) / 60))
print('\n======= Phase VI, finished! =======')
print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
# vim: ts=4 sts=4 sw=4 noexpandtab