-
Notifications
You must be signed in to change notification settings - Fork 2
/
import_3dsynthetichela.py
executable file
·480 lines (442 loc) · 15.8 KB
/
import_3dsynthetichela.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
#!/usr/bin/env python2
"""
This script takes a path to a folder with the 3D Hela Synthetic image files,
merges them appropriately into a single file, and uploads this to the specified
OMERO server.
Each cell and nucleus image file corresponds to several images with each
different channel type (e.g. endosome, lysosome, ...).
Cell and nucleus files are named like:
3Dhela<cell_number>cell.tif
(e.g. 3Dhela1cell.tif)
and:
3Dhela<cell_number>nucleus.tif
(e.g. 3Dhela1nucleus.tif)
respectively. The other channel image files are named like:
3Dhela<cell_number><channel_type><image_number>.tif
(e.g. 3Dhela1lysosome1.tif)
so one image would, for example, have channels corresponding to files:
3Dhela1cell.tif
3Dhela1nucleus.tif
3Dhela1endosome1.tif
and another image would correspond to:
3Dhela1cell.tif
3Dhela1nucleus.tif
3Dhela1endosome2.tif
and another corresponding to:
3Dhela1cell.tif
3Dhela1nucleus.tif
3Dhela1lysosome1.tif
"""
from __future__ import print_function
import argparse
import getpass
import glob
import os
import re
import string
import sys
import bioformats
import bioformats.log4j
import javabridge
import numpy
try:
import omero
except ImportError as e:
print("ImportError: {}".format(e))
print("Don't forget to set your PYTHONPATH to include the OMERO.py library!")
sys.exit(1)
import omero.gateway
import omero.rtypes
from PIL import Image
# Defaults offered at the interactive OMERO connection prompts.
DEFAULT_HOST = 'localhost'
DEFAULT_PORT = '4064'
DEFAULT_USERNAME = 'root'
DEFAULT_PASSWORD = 'omero'

# One entry per protein dataset in the collection:
#   name          - label used for the OMERO dataset and the protein channel
#   infix         - substring of the tif file name identifying the dataset
#   protein_class - stored as the 'channel2_class' key-value annotation
#   protein_type  - stored as the 'channel2_type' key-value annotation
DATASETS = [
    {
        'name': 'Lysosome',
        'infix': 'lysosome',
        'protein_class': 'vesicles',
        'protein_type': 'GMM',
    }, {
        'name': 'Mitochondrion',
        'infix': 'mitochondrion',
        'protein_class': 'vesicles',
        'protein_type': 'GMM',
    }, {
        'name': 'Nucleolus',
        'infix': 'nucleoli',
        'protein_class': 'vesicles',
        'protein_type': 'GMM',
    }, {
        'name': 'Endosome',
        'infix': 'endosome',
        'protein_class': 'vesicles',
        'protein_type': 'GMM',
    }, {
        'name': 'Centrosome',
        'infix': 'centrosome',
        # Was misspelled 'vesciles', which ended up in the image annotations.
        'protein_class': 'vesicles',
        'protein_type': 'GMM',
    }, {
        'name': 'Microtubule',
        'infix': 'microtubule',
        'protein_class': 'microtubule',
        'protein_type': 'network',
    },
]

# Channel order of every merged image; 'protein' is a placeholder that is
# replaced by the dataset name when the channels are labelled.
CHANNELS = [
    'Cell membrane',
    'DNA',
    'protein',
]

# Cache of tag name -> tag annotation, filled lazily while uploading.
TAG_ANNOTATIONS = dict()
def get_args():
    """
    Build the command line interface and parse sys.argv with it.

    Returns the argparse namespace with two attributes: `path` (the folder
    holding the collection) and `test` (True when -t/--test was given).
    argparse itself exits the process on malformed arguments.
    """
    cli = argparse.ArgumentParser(
        description="3D Synthetic HeLa collection uploader"
    )
    cli.add_argument(
        "-t", "--test",
        action="store_true",
        help="upload a single image as a test",
    )
    cli.add_argument(
        "path",
        help="path to the 3D synthetic HeLa collection",
    )
    parsed = cli.parse_args()
    return parsed
def error_exit(msg):
    """
    Report msg on standard output, then terminate with exit status 1.
    """
    print(msg)
    # sys.exit(1) is implemented by raising SystemExit(1).
    raise SystemExit(1)
def get_cell_i_from_other_tif_name(tif_name):
    """
    Given a tif name like
      /users/foo/3Dhela5lysosome7.tif
    get the 5, which is the index of the cell this tif corresponds to.

    The name is peeled from the right: the 4-character extension, then the
    image number, then the channel name; whatever follows the last "3Dhela"
    is the cell index. Raises ValueError if the name does not contain
    "3Dhela" or has no cell number.
    """
    without_extension = tif_name[:-4]  # drop ".tif"
    without_image_i = without_extension.rstrip(string.digits)
    # ascii_letters rather than string.letters: the latter is locale-dependent
    # in Python 2 and does not exist in Python 3.
    without_channel = without_image_i.rstrip(string.ascii_letters)
    cell_i_start = without_channel.rindex("3Dhela") + len("3Dhela")
    return int(without_channel[cell_i_start:])
def check_structure(data_path):
    """
    Make sure the directory exists and that each cell file has a nucleus file
    (and vice versa). Also check that all the extra channel images have a cell
    file. On the first violation an explanatory message is printed and the
    process exits with a non-zero status.

    Note that we don't check number of images or continuity of the indices.
    """
    if not os.path.isdir(data_path):
        # The original mixed a '%s' placeholder with str.format, so the path
        # never appeared in the message.
        error_exit("'{}' is not a valid directory.".format(data_path))

    all_tifs = set(glob.glob(os.path.join(data_path, "3Dhela*.tif")))
    cell_tifs = set(glob.glob(os.path.join(data_path, "3Dhela*cell.tif")))
    nucleus_tifs = set(
        glob.glob(os.path.join(data_path, "3Dhela*nucleus.tif"))
    )
    other_tifs = all_tifs.difference(cell_tifs.union(nucleus_tifs))

    # Check each cell.tif has a nucleus.tif and vice versa
    for cell_tif in cell_tifs:
        expected_nucleus_tif = cell_tif[:-len("cell.tif")] + "nucleus.tif"
        if expected_nucleus_tif not in nucleus_tifs:
            err_template = "Found cell tif '{}' without matching nucleus tif."
            error_exit(err_template.format(cell_tif))
    for nucleus_tif in nucleus_tifs:
        expected_cell_tif = nucleus_tif[:-len("nucleus.tif")] + "cell.tif"
        if expected_cell_tif not in cell_tifs:
            err_template = "Found nucleus tif '{}' without matching cell tif."
            error_exit(err_template.format(nucleus_tif))

    # Check each other channel tif has a cell.tif
    for other_tif in other_tifs:
        cell_i = get_cell_i_from_other_tif_name(other_tif)
        # Index just past the last "3Dhela", i.e. where the cell number starts.
        cell_i_i = other_tif.rindex("3Dhela") + len("3Dhela")
        expected_cell_tif = other_tif[:cell_i_i] + str(cell_i) + "cell.tif"
        if expected_cell_tif not in cell_tifs:
            err_template = "Found other tif '{}' without matching cell tif."
            error_exit(err_template.format(other_tif))
def prompt_for(item, default, secret=False):
    """
    Prompt the user for `item`, showing `default` in square brackets.

    With secret=True the answer is read through getpass so it is not echoed.
    An empty answer yields `default`; anything else is returned as typed.
    """
    question = "{:s} [{:s}]: ".format(item, default)
    read_answer = getpass.getpass if secret else raw_input
    answer = read_answer(question)
    return default if answer == "" else answer
def get_omero_connection():
    """
    Prompt the user for OMERO connection information, defaulting to the
    parameters defined in the constants at the top, and return a connected
    BlitzGateway.

    If the server rejects the connection, an error is printed and the process
    exits with a non-zero status instead of failing later on the first call
    made through an unconnected gateway.
    """
    host = prompt_for("OMERO host", DEFAULT_HOST)
    port = prompt_for("OMERO port", DEFAULT_PORT)
    username = prompt_for("OMERO username", DEFAULT_USERNAME)
    password = prompt_for("OMERO password", DEFAULT_PASSWORD, secret=True)
    conn = omero.gateway.BlitzGateway(username, password, host=host, port=port)
    # connect() reports failure through its boolean return value.
    if not conn.connect():
        error_exit(
            "Could not connect to OMERO at {}:{} as '{}'.".format(
                host, port, username
            )
        )
    return conn
def print_inline(s):
    """
    Write s at the start of the current terminal line (carriage return first,
    no trailing newline) and flush so it shows up immediately.
    """
    out = sys.stdout
    out.write('\r' + s)
    out.flush()
def organize_by_cell_i(tifs):
    """
    Group tif paths by the cell index encoded in their file names.

    Returns a dict mapping each cell index (int) to the list of tifs for that
    cell, in the order they appeared in `tifs`.
    """
    grouped = {}
    for tif_path in tifs:
        key = get_cell_i_from_other_tif_name(tif_path)
        grouped.setdefault(key, []).append(tif_path)
    return grouped
def get_i_from_other_tif(tif):
    """
    Return the image number that ends a tif name, e.g. 7 for
      /users/foo/3Dhela5lysosome7.tif
    """
    stem = tif[:-4]  # drop the 4-character extension
    head = stem.rstrip(string.digits)
    trailing_digits = stem[len(head):]
    return int(trailing_digits)
def merge_image_channel_files(tif_name, infix):
    """
    Combine a protein tif with its cell and nucleus tifs into one image.

    tif_name is the path of the protein channel file and infix is the dataset
    infix contained in its name; everything before the last occurrence of
    infix is the prefix shared with the "<prefix>cell.tif" and
    "<prefix>nucleus.tif" files.

    Returns (plane_generator, z_count). Calling plane_generator() gives a
    generator that yields one plane per z index and channel, iterating z in
    the outer loop and CHANNELS in the inner loop. z_count is the image count
    reported by the reader for the protein file.
    """
    with bioformats.ImageReader(tif_name) as protein_reader:
        z_count = protein_reader.rdr.getImageCount()

    prefix = tif_name[:tif_name.rindex(infix)]
    # Source file for each entry of CHANNELS.
    path_by_channel = {
        "Cell membrane": prefix + "cell.tif",
        "DNA": prefix + "nucleus.tif",
        "protein": tif_name,
    }

    def plane_generator():
        for z_i in xrange(z_count):
            for channel in CHANNELS:
                source = path_by_channel[channel]
                with bioformats.ImageReader(source) as channel_reader:
                    yield channel_reader.read(index=z_i)

    return plane_generator, z_count
def get_omero_name_from_tif_name(tif_name):
    """
    Turn a tif path such as '/foo/blah/3Dhela1endosome1.tif' into the image
    name used on OMERO: zero-padded cell index, protein name, zero-padded
    image index (here '01endosome01').
    """
    start = tif_name.rindex("3Dhela") + 6
    unique_part = tif_name[start:-4]  # e.g. '1endosome1'
    numbers = [int(digits) for digits in re.findall(r"\d+", unique_part)]
    cell_i, image_i = numbers  # exactly two numbers are expected
    protein = unique_part.strip(string.digits)
    return "{:02d}{}{:02d}".format(cell_i, protein, image_i)
def upload_image(plane_generator, name, conn, c_count, z_count):
    """
    Create an image called `name` on the server behind `conn` from the planes
    produced by plane_generator(), with z_count z-sections, c_count channels
    and a single time point. Returns whatever createImageFromNumpySeq returns.
    """
    planes = plane_generator()
    return conn.createImageFromNumpySeq(planes, name, z_count, c_count, 1)
def add_channel_labels(image, protein_name):
    """
    Name every channel of the image after the matching CHANNELS entry, using
    protein_name in place of the generic 'protein' placeholder.
    """
    for index, channel in enumerate(image.getChannels()):
        generic_label = CHANNELS[index]
        label = protein_name if generic_label == 'protein' else generic_label
        logical = channel.getLogicalChannel()
        logical.setName(label)
        logical.save()
def add_key_value_pairs(conn, image, protein_class, protein_type):
    """
    Attach a client map annotation to the image. It records the owner
    (Murphy Lab), the data url, and the model class/type used to generate each
    of the three channels; protein_class and protein_type describe channel 2.
    """
    annotation = omero.gateway.MapAnnotationWrapper(conn)
    annotation.setNs(omero.constants.metadata.NSCLIENTMAPANNOTATION)
    annotation.setValue([
        ['owner', 'Murphy Lab'],
        ['data-url', 'http://murphylab.web.cmu.edu/data/'],
        ['channel0_class', 'framework'],
        ['channel0_type', 'diffeomorphic'],
        ['channel1_class', 'framework'],
        ['channel1_type', 'diffeomorphic'],
        ['channel2_class', protein_class],
        ['channel2_type', protein_type],
    ])
    annotation.save()
    image.linkAnnotation(annotation)
def add_pixel_size(image, conn):
    """
    Store the physical pixel size on the image: 0.049 micrometers in x and y,
    0.2 micrometers in z.
    """
    micrometer = omero.model.enums.UnitsLength.MICROMETER
    # Fetch the image again by id so its pixels are loaded.
    reloaded = conn.getObject("Image", image.getId())
    pixels = reloaded.getPrimaryPixels()._obj
    xy_size = omero.model.LengthI(0.049, micrometer)
    z_size = omero.model.LengthI(0.2, micrometer)
    pixels.setPhysicalSizeX(xy_size)
    pixels.setPhysicalSizeY(xy_size)
    pixels.setPhysicalSizeZ(z_size)
    conn.getUpdateService().saveObject(pixels)
def get_tag_annotation(name, conn):
    """
    Return the tag annotation whose text value is `name`.

    Results are memoized in TAG_ANNOTATIONS. On a cache miss the server is
    queried: no match creates and saves a new tag, a single match is used
    as-is, and several matches make the user pick one by ID (re-asking until a
    listed ID is entered).
    """
    try:
        return TAG_ANNOTATIONS[name]
    except KeyError:
        pass

    candidates = list(
        conn.getObjects("TagAnnotation", attributes={"textValue": name})
    )
    if not candidates:
        chosen = omero.gateway.TagAnnotationWrapper(conn)
        chosen.setValue(name)
        chosen.save()
    elif len(candidates) == 1:
        chosen = candidates[0]
    else:
        print()
        print(" Found multiple tags with name '{}':".format(name))
        candidate_ids = [int(candidate.getId()) for candidate in candidates]
        print(" {}".format(candidate_ids))
        chosen = None
        while chosen is None:
            reply = prompt_for(
                " Enter preferred tag ID:", str(candidate_ids[0])
            )
            if reply.isdigit() and int(reply) in candidate_ids:
                chosen = candidates[candidate_ids.index(int(reply))]
            else:
                print(" IDs: {}".format(candidate_ids))

    TAG_ANNOTATIONS[name] = chosen
    return chosen
def add_tags(image, conn):
    """
    Link the '3D', 'HeLa' and 'synthetic' tag annotations to the image.
    """
    for label in ('3D', 'HeLa', 'synthetic'):
        image.linkAnnotation(get_tag_annotation(label, conn))
def add_metadata(image, dataset, conn):
    """
    Apply all per-image metadata: channel labels, the key-value annotation
    describing the generative models, the physical pixel size, and the tags.
    `dataset` is one of the DATASETS entries.
    """
    protein_name = dataset['name']
    protein_class = dataset['protein_class']
    protein_type = dataset['protein_type']
    add_channel_labels(image, protein_name)
    add_key_value_pairs(conn, image, protein_class, protein_type)
    add_pixel_size(image, conn)
    add_tags(image, conn)
def upload_dataset(conn, dataset, data_path, is_test=False):
    """
    Go through the channels for each image in this dataset and merge them into
    a plane generator before uploading and adding proper metadata. Return the
    list of image ids uploaded so that they can be linked to the dataset later
    on.

    Images are uploaded in order of cell index, then image index. If is_test
    is true, only upload one image. A dataset with no matching files uploads
    nothing and returns an empty list.
    """
    name, infix = dataset['name'], dataset['infix']
    print('Uploading {} dataset...'.format(name))

    all_tifs = glob.glob(os.path.join(data_path, "3Dhela*.tif"))
    # Sorted so the image picked in test mode does not depend on glob order.
    dataset_tifs = sorted(
        t for t in all_tifs if infix in os.path.basename(t)
    )
    if is_test:
        dataset_tifs = dataset_tifs[:1]
    count = len(dataset_tifs)

    cell_i_to_tifs = organize_by_cell_i(dataset_tifs)
    im_ids = []
    # Pre-set so the final padding still works when nothing was uploaded;
    # previously an empty dataset raised UnboundLocalError here.
    status = ""
    i = 0
    for cell_i in sorted(cell_i_to_tifs):
        tifs = sorted(cell_i_to_tifs[cell_i], key=get_i_from_other_tif)
        for tif in tifs:
            i += 1
            plane_generator, z_count = merge_image_channel_files(tif, infix)
            omero_name = get_omero_name_from_tif_name(tif)
            status = "[{:4d}/{:d}] {:12.12} |".format(i, count, omero_name)
            print_inline(" {} Uploading image... ".format(status))
            im = upload_image(
                plane_generator, omero_name, conn, len(CHANNELS), z_count
            )
            im_ids.append(im.getId())
            print_inline(" {} Adding metadata... ".format(status))
            add_metadata(im, dataset, conn)

    pad = ' ' * (len(status) + 8)  # cover up last in-line printing
    print("\rUploaded {:d} images.{}".format(count, pad))
    return im_ids
def get_hela_project(conn):
    """
    Return the project named "HeLa" among those listed for the connected
    user's id. When several match, the last one listed wins. When none does, a
    new "HeLa" project is saved and returned.
    """
    owner_id = conn.getUser().getId()
    named_hela = [
        candidate for candidate in conn.listProjects(owner_id)
        if candidate.getName() == "HeLa"
    ]
    if named_hela:
        return named_hela[-1]

    new_project = omero.model.ProjectI()
    new_project.setName(omero.rtypes.rstring("HeLa"))
    return conn.getUpdateService().saveAndReturnObject(new_project)
def organize_uploaded_images(conn, dataset_to_ids):
    """
    Create one OMERO dataset per entry of dataset_to_ids (dataset name -> list
    of image ids), named "3D Synthetic HeLa - <name>", link it under the HeLa
    project (created if missing), and link each listed image to it.
    """
    print_inline('Organizing images...')
    updater = conn.getUpdateService()
    project = get_hela_project(conn)

    for dataset_name, im_ids in dataset_to_ids.items():
        new_dataset = omero.model.DatasetI()
        new_dataset.setName(
            omero.rtypes.rstring("3D Synthetic HeLa - " + dataset_name)
        )
        saved_dataset = updater.saveAndReturnObject(new_dataset)

        # Links are built from id-only objects (second argument False).
        project_link = omero.model.ProjectDatasetLinkI()
        project_link.setParent(omero.model.ProjectI(project.getId(), False))
        project_link.setChild(
            omero.model.DatasetI(saved_dataset.getId(), False)
        )
        updater.saveObject(project_link)

        for im_id in im_ids:
            image_link = omero.model.DatasetImageLinkI()
            image_link.setParent(
                omero.model.DatasetI(saved_dataset.getId(), False)
            )
            image_link.setChild(omero.model.ImageI(im_id, False))
            updater.saveObject(image_link)

    print('\rOrganized images. ')
def upload_3d_synthetic_hela_collection(data_path, is_test=False):
    """
    Given the path to the 3d synthetic hela images, they are in a
    one-image-per-channel format. For each dataset, merge corresponding
    channels of the same image into one and upload them to an OMERO instance.
    Look for a HeLa project and create it if it doesn't exist. Otherwise,
    upload the datasets to the existing HeLa project.

    If is_test is true, only upload one image (from the first dataset).

    The Java VM and the OMERO session are released even when an upload fails;
    the exception still propagates to the caller.
    """
    check_structure(data_path)
    conn = get_omero_connection()
    try:
        javabridge.start_vm(class_path=bioformats.JARS)
        try:
            bioformats.log4j.basic_config()
            dataset_to_ids = dict()
            for dataset in DATASETS:
                im_ids = upload_dataset(conn, dataset, data_path, is_test)
                dataset_to_ids[dataset['name']] = im_ids
                if is_test:
                    break
            organize_uploaded_images(conn, dataset_to_ids)
        finally:
            # Always stop the JVM once it has been started.
            javabridge.kill_vm()
    finally:
        conn._closeSession()
def main():
    """
    Script entry point: read the collection directory and the test flag from
    the command line and start the upload.
    """
    options = get_args()
    upload_3d_synthetic_hela_collection(options.path, options.test)


if __name__ == '__main__':
    main()