Ejemplo n.º 1
0
    def run_job(self, batch, assume_clean_state=False):
        '''Extracts OMEXML from microscope image or metadata files.

        For every microscope image file referenced by `batch`, the ``showinf``
        command line tool is run on the location of the file. The part of its
        standard output that spans from the ``<OME`` opening tag to the last
        ``</OME>`` closing tag is stored in the ``omexml`` attribute of the
        file's database record and committed.

        Parameters
        ----------
        batch: dict
            description of the *run* job; the IDs of the files to process are
            read from key ``"microscope_image_file_ids"``
        assume_clean_state: bool, optional
            assume that output of previous runs has already been cleaned up
            (not used by this method)

        Note
        ----
        The actual processing is delegated to the
        `showinf <http://www.openmicroscopy.org/site/support/bio-formats5.1/users/comlinetools/display.html>`_
        Bioformats command line tool.

        Raises
        ------
        MetadataError
            when ``showinf`` exits with a non-zero status or does not write
            anything to standard output
        ValueError
            when the opening or closing OME-XML tag cannot be found in the
            output of ``showinf``
        '''
        # NOTE: Ideally, we would use the BFOmeXmlReader together with JavaBridge
        # but this approach has several shortcomings and requires too much
        # memory to run efficiently on individual cores.
        closing_tag = '</OME>'
        with tm.utils.ExperimentSession(self.experiment_id) as session:
            for fid in batch['microscope_image_file_ids']:
                img_file = session.query(tm.MicroscopeImageFile).get(fid)
                logger.info('process image %d', img_file.id)
                # The "showinf" command line tool writes the extracted OMEXML
                # to standard output.
                command = [
                    'showinf', '-omexml-only', '-nopix', '-novalid', '-nocore',
                    '-no-upgrade', '-no-sas', img_file.location
                ]
                p = subprocess.Popen(command,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
                stdout, stderr = p.communicate()
                logger.debug("showinf STDOUT: \n```%s```\n", stdout)
                logger.debug("showinf STDERR: \n```%s```\n", stderr)
                if p.returncode != 0 or not stdout:
                    raise MetadataError(
                        'Extraction of OMEXML failed! Error message:\n%s' %
                        stderr)
                # The OME-XML data is contained within the XML tags
                # `<OME ...>` and `</OME>`; anything before or after
                # (e.g. warnings printed by the tool) is discarded.
                start = stdout.find("<OME")
                if start == -1:
                    raise ValueError(
                        "Cannot find OME-XML start tag in `showinf` output.")
                # Search from the right so that the outermost closing tag
                # is used.
                end = stdout.rfind(closing_tag, start)
                if end == -1:
                    raise ValueError(
                        "Cannot find OME-XML closing tag in `showinf` output.")
                img_file.omexml = unicode(stdout[start:end + len(closing_tag)])
                session.add(img_file)
                session.commit()
                # Detach the record from the session once it is persisted.
                session.expunge(img_file)
Ejemplo n.º 2
0
    def run_job(self, batch, assume_clean_state=False):
        '''Extracts OMEXML from microscope image or metadata files.

        For every microscope image file referenced by `batch`, the ``showinf``
        command line tool is run on the location of the file. The XML element
        found in its standard output is stored in the ``omexml`` attribute of
        the file's database record and committed.

        Parameters
        ----------
        batch: dict
            description of the *run* job; the IDs of the files to process are
            read from key ``"microscope_image_file_ids"``
        assume_clean_state: bool, optional
            assume that output of previous runs has already been cleaned up
            (not used by this method)

        Note
        ----
        The actual processing is delegated to the
        `showinf <http://www.openmicroscopy.org/site/support/bio-formats5.1/users/comlinetools/display.html>`_
        Bioformats command line tool.

        Raises
        ------
        MetadataError
            when ``showinf`` exits with a non-zero status or does not write
            anything to standard output
        RegexError
            when no XML element can be found in the output of ``showinf``
        '''
        # NOTE: Ideally, we would use the BFOmeXmlReader together with JavaBridge
        # but this approach has several shortcomings and requires too much
        # memory to run efficiently on individual cores.
        with tm.utils.ExperimentSession(self.experiment_id) as session:
            for fid in batch['microscope_image_file_ids']:
                img_file = session.query(tm.MicroscopeImageFile).get(fid)
                logger.info('process image %d', img_file.id)
                # The "showinf" command line tool writes the extracted OMEXML
                # to standard output.
                command = [
                    'showinf', '-omexml-only', '-nopix', '-novalid', '-nocore',
                    '-no-upgrade', '-no-sas', img_file.location
                ]
                p = subprocess.Popen(command,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)
                stdout, stderr = p.communicate()
                if p.returncode != 0 or not stdout:
                    raise MetadataError(
                        'Extraction of OMEXML failed! Error message:\n%s' %
                        stderr)
                # We only want the XML. This will remove potential
                # warnings and other stuff we don't want.
                # The pattern spans from the first opening tag to the last
                # closing tag carrying the same name.
                match = re.search(r'<(\w+).*</\1>', stdout, flags=re.DOTALL)
                if match is None:
                    # Test the match explicitly rather than catching every
                    # exception, which would also hide unrelated errors and
                    # interrupts.
                    raise RegexError('OMEXML metadata could not be extracted.')
                img_file.omexml = unicode(match.group())
                session.add(img_file)
                session.commit()
                # Detach the record from the session once it is persisted.
                session.expunge(img_file)
Ejemplo n.º 3
0
    def determine_grid_coordinates_from_layout(self, stitch_layout,
                                               stitch_dimensions):
        '''Determines the coordinates of each image acquisition site within the
        continuous acquisition grid (slide or well in a plate)
        based on a provided layout.

        The metadata is grouped by well, channel, z-plane and time point and
        the grid coordinates calculated for the layout are assigned, in order,
        to the rows of each group (columns ``"well_position_y"`` and
        ``"well_position_x"``).

        Parameters
        ----------
        stitch_layout: str
            layout of the acquisition grid
            (options: ``"horizontal"``, ``"zigzag_horizontal"``, ``"vertical"``,
            or ``"zigzag_vertical"``)
        stitch_dimensions: Tuple[int]
            dimensions of the acquisition grid, i.e. number of images
            along the vertical and horizontal axis of the acquired area

        Returns
        -------
        pandas.DataFrame
            metadata for each 2D *Plane* element

        Raises
        ------
        MetadataError
            when the groups don't all have the same number of acquisitions
        ValueError
            when the number of acquisitions in a group doesn't match the
            number of grid coordinates obtained for `stitch_dimensions`

        See also
        --------
        :func:`illuminati.stitch.calc_grid_coordinates_from_layout`
        '''
        md = self.metadata

        logger.info('determine acquisition grid coordinates based on layout')
        # Determine the number of unique positions per well
        acquisitions_per_well = md.groupby(
            ['well_name', 'channel_name', 'zplane', 'tpoint'])
        n_acquisitions_per_well = acquisitions_per_well.count().name
        if len(np.unique(n_acquisitions_per_well)) > 1:
            raise MetadataError(
                'Each well must have the same number of acquisition sites.')
        # Row labels of the metadata table for each group
        sites = acquisitions_per_well.groups.values()

        logger.debug('stitch layout: {0}; stitch dimensions: {1}'.format(
            stitch_layout, stitch_dimensions))
        coordinates = stitch.calc_grid_coordinates_from_layout(
            stitch_dimensions, stitch_layout)
        # Each coordinate is a (y, x) pair
        y_coordinates = [c[0] for c in coordinates]
        x_coordinates = [c[1] for c in coordinates]
        for indices in sites:
            if len(indices) != len(coordinates):
                raise ValueError('Incorrect stitch dimensions provided.')
            md.loc[indices, 'well_position_y'] = y_coordinates
            md.loc[indices, 'well_position_x'] = x_coordinates

        return self.metadata
Ejemplo n.º 4
0
    def configure_from_filenames(self, plate_dimensions, regex):
        '''Fills in metadata with information encoded in the image filenames.

        The filenames are parsed with a named regular expression that
        provides the following fields:

            - *w*: well
            - *t*: time point
            - *s*: acquisition site
            - *z*: focal plane (z dimension)
            - *c*: channel

        The naturally sorted, unique filenames are matched one-to-one
        against the rows of the metadata table (row label ``i`` receives the
        values of the ``i``-th filename).

        Parameters
        ----------
        plate_dimensions: Tuple[int]
            number of rows and columns in the well plate
            (not used by this method)
        regex: str
            named regular expression

        Raises
        ------
        tmlib.errors.MetadataError
            when the number of image files differs from the number of
            planes, i.e. when image files contain more than one plane, since
            this case wouldn't allow a 1-to-1 mapping of information from
            filename to image plane

        Returns
        -------
        pandas.DataFrame
            metadata for each 2D *Plane* element
        '''
        logger.info('update image metadata with filename information')
        metadata = self.metadata

        # Collect each referenced file exactly once.
        unique_files = set()
        for mapper in self._file_mapper_list:
            unique_files.update(mapper.files)
        filenames = natsorted(list(unique_files))

        if len(filenames) != metadata.shape[0]:
            raise MetadataError(
                'Configuration of metadata based on filenames '
                'works only when each image file contains only a single plane.'
            )

        logger.info('retrieve metadata from filenames via regular expression')
        self.check_regular_expression(regex)

        # (metadata column, type conversion, regular expression field)
        converters = (
            ('channel_name', str, 'c'),
            ('site', int, 's'),
            ('zplane', int, 'z'),
            ('tpoint', int, 't'),
            ('well_name', str, 'w'),
        )
        for row, filename in enumerate(filenames):
            # Not every microscope provides all the information in the filename.
            fields = self.extract_fields_from_filename(regex, filename)
            for column, convert, field in converters:
                metadata.at[row, column] = convert(getattr(fields, field))

        return self.metadata
Ejemplo n.º 5
0
    def determine_grid_coordinates_from_stage_positions(self):
        '''Determines the coordinates of each image acquisition site within the
        continuous acquisition grid (slide or well in a plate)
        based on the absolute microscope stage positions.

        For each well, the (y, x) stage positions of all planes are passed to
        :meth:`_calculate_coordinates` together with the number of positions
        divided by the number of unique time points, channels and z-planes.
        The resulting coordinates are written to the ``"well_position_y"``
        and ``"well_position_x"`` columns.

        Returns
        -------
        pandas.DataFrame
            metadata for each 2D *Plane* element

        Raises
        ------
        MetadataError
            when stage position information is not available from `metadata`

        See also
        --------
        :func:`illuminati.stitch.calc_grid_coordinates_from_positions`
        '''
        md = self.metadata
        if (md.stage_position_y.isnull().any()
                or md.stage_position_x.isnull().any()):
            raise MetadataError('Stage position information is not available.')

        logger.info('translate absolute microscope stage positions into '
                    'relative acquisition grid coordinates')

        planes_per_well = md.groupby('well_name')
        # Number of planes that share one position; the same for every well,
        # so it is computed once.
        n_planes_per_position = (
            len(np.unique(md.tpoint)) *
            len(np.unique(md.channel_name)) *
            len(np.unique(md.zplane))
        )
        for well_name in np.unique(md.well_name):
            ix = planes_per_well.groups[well_name]
            # Materialize the pairs, because their number is required below.
            positions = list(zip(md.loc[ix, 'stage_position_y'],
                                 md.loc[ix, 'stage_position_x']))
            # Explicit floor division: the result is a whole number
            # regardless of the interpreter's division semantics.
            n = len(positions) // n_planes_per_position
            coordinates = self._calculate_coordinates(positions, n)
            md.loc[ix, 'well_position_y'] = [c[0] for c in coordinates]
            md.loc[ix, 'well_position_x'] = [c[1] for c in coordinates]

        return self.metadata
Ejemplo n.º 6
0
    def _combine_omexml_elements(self, omexml_images, omexml_metadata):
        '''Combines the OMEXML extracted from the individual image files with
        the OMEXML obtained from additional metadata files.

        Selected attributes of the *Image*, *Pixels*, *Plane* and *Channel*
        elements of each entry of `omexml_images` are copied into the
        corresponding elements of `omexml_metadata`. In addition, an
        ``ImageFileMapping`` is created for every plane and registered in
        ``self._file_mapper_list`` and ``self._file_mapper_lut``.

        Parameters
        ----------
        omexml_images: dict
            mapping of image filename to the OMEXML extracted from that file
            (presumably ``bioformats.omexml.OMEXML`` objects; verify against
            caller)
        omexml_metadata: bioformats.omexml.OMEXML or None
            OMEXML obtained from additional metadata files; when ``None``,
            a new OMEXML object is created

        Returns
        -------
        bioformats.omexml.OMEXML
            `omexml_metadata` (or the newly created object) after it was
            updated

        Raises
        ------
        TypeError
            when `omexml_metadata` is neither ``None`` nor an instance of
            ``bioformats.omexml.OMEXML``
        MetadataError
            when the number of images in `omexml_metadata` doesn't match the
            expected total number of images
        '''
        logger.info('combine OMEXML elements')
        # We assume here that each image file contains the same number of
        # images.
        # NOTE(review): indexing the result of ``values()`` only works when it
        # returns a list (Python 2 ``dict`` semantics).
        n_images = omexml_images.values()[0].image_count * len(omexml_images)
        if omexml_metadata is not None:
            extra_omexml_available = True
            if not isinstance(omexml_metadata, bioformats.omexml.OMEXML):
                raise TypeError('Argument "omexml_metadata" must have type '
                                'bioformats.omexml.OMEXML.')
            if omexml_metadata.image_count != n_images:
                raise MetadataError(
                    'Number of images in "omexml_metadata" must match '
                    'the total number of Image elements in "omexml_images".')
        else:
            extra_omexml_available = False
            omexml_metadata = bioformats.OMEXML(XML_DECLARATION)
            omexml_metadata.image_count = n_images

        # Names of the attributes that get copied for each element type.
        image_element_attributes = {'AcquisitionDate', 'Name'}
        channel_element_attributes = {'Name'}
        pixel_element_attributes = {
            'PixelType', 'SizeC', 'SizeT', 'SizeX', 'SizeY', 'SizeZ'
        }
        plane_element_attributes = {
            'PositionX', 'PositionY', 'PositionZ', 'TheC', 'TheT', 'TheZ'
        }
        # Files are processed in natural sort order of their names.
        filenames = natsorted(omexml_images)
        # Running index of the Image element in "omexml_metadata"; incremented
        # once per series over all files.
        count = 0
        for i, f in enumerate(filenames):
            omexml_img = omexml_images[f]
            n_series = omexml_img.image_count
            for s in xrange(n_series):
                extracted_image = omexml_img.image(s)
                md_image = omexml_metadata.image(count)
                # Copy only values that are actually set in the extracted
                # OMEXML.
                for attr in image_element_attributes:
                    extracted_value = getattr(extracted_image, attr)
                    if extracted_value is not None:
                        setattr(md_image, attr, extracted_value)

                extracted_pixels = extracted_image.Pixels
                n_planes = extracted_pixels.plane_count
                if n_planes == 0:
                    # Sometimes an image doesn't have any plane elements.
                    # Let's create them for consistency.
                    extracted_pixels = self._create_channel_planes(
                        extracted_pixels)
                    n_planes = extracted_pixels.plane_count

                md_pixels = md_image.Pixels
                md_pixels.plane_count = n_planes
                # NOTE(review): "plane_count" was assigned "n_planes" on the
                # previous line, so this comparison looks like it can never be
                # true. The check was presumably meant to run before the
                # assignment - confirm the intended behavior before changing.
                # NOTE(review): the message reports the file index "i", not
                # the running image index "count".
                if extra_omexml_available and (md_pixels.plane_count !=
                                               n_planes):
                    raise MetadataError(
                        'Image element #%d in OMEXML obtained from additional '
                        'metdata files must have the same number of Plane  '
                        'elements as the corresponding Image elements in the '
                        'OMEXML element obtained from image file "%s".' %
                        (i, f))

                for attr in pixel_element_attributes:
                    extracted_value = getattr(extracted_pixels, attr)
                    if extracted_value is not None:
                        # The extracted value always overwrites the existing
                        # one; according to the original author,
                        # python-bioformats fills these attributes with
                        # arbitrary default values.
                        setattr(md_pixels, attr, extracted_value)

                for p in xrange(n_planes):
                    extracted_plane = extracted_pixels.Plane(p)
                    md_plane = md_pixels.Plane(p)
                    # In contrast to the other elements, plane attributes that
                    # are already set in "omexml_metadata" are kept.
                    for attr in plane_element_attributes:
                        extracted_value = getattr(extracted_plane, attr)
                        md_value = getattr(md_plane, attr)
                        if md_value is None and extracted_value is not None:
                            setattr(md_plane, attr, extracted_value)

                    # Keep track of the file, series and plane that each
                    # plane originates from.
                    # NOTE(review): "count" advances by one per series, so
                    # "count + p" is not unique across series when a series
                    # has more than one plane - confirm whether this is
                    # intended.
                    fm = ImageFileMapping()
                    fm.ref_index = count + p
                    fm.files = [f]
                    fm.series = [s]
                    fm.planes = [p]
                    self._file_mapper_list.append(fm)
                    # presumably "_file_mapper_lut" is a mapping that creates
                    # list values on demand (e.g. a defaultdict) - TODO confirm
                    self._file_mapper_lut[f].append(fm)

                n_channels = extracted_pixels.channel_count
                # NOTE(review): the channel count is read from the Pixels
                # element but assigned to the Image element here - verify that
                # this has the intended effect on "md_pixels".
                md_image.channel_count = n_channels
                for c in xrange(n_channels):
                    extracted_channel = extracted_pixels.Channel(c)
                    md_channel = md_pixels.Channel(c)
                    for attr in channel_element_attributes:
                        extracted_value = getattr(extracted_channel, attr)
                        if extracted_value is not None:
                            setattr(md_channel, attr, extracted_value)

                count += 1

        return omexml_metadata
Ejemplo n.º 7
0
    def collect_job_output(self, batch):
        '''Assigns registered image files from different acquisitions to
        separate *cycles*. If an acquisition includes multiple time points,
        a separate *cycle* is created for each time point.
        The mapping from *acquisitions* to *cycles* is consequently
        1 -> n, where n is the number of time points per acquisition (n >= 1).

        Whether acquisition time points will be interpreted as actual
        time points in a time series depends on the value of
        :attr:`tm.Experiment.plate_acquisition_mode`: in mode ``"basic"`` the
        time point index is incremented for each processed time point,
        otherwise the cycle index is incremented. Both indices start at zero
        for each plate.

        Parameters
        ----------
        batch: dict
            description of the *collect* job (not used by this method)

        Raises
        ------
        MetadataError
            when no channels exist or when channels differ in bit depth
        '''
        with tm.utils.ExperimentSession(self.experiment_id) as session:
            # We need to do this per plate to ensure correct indices
            # TODO: check plates have similar channels, etc
            experiment = session.query(tm.Experiment).one()
            acquisition_mode = experiment.plate_acquisition_mode
            logger.info('plates were acquired in mode "%s"', acquisition_mode)
            is_time_series = acquisition_mode == 'basic'
            if is_time_series:
                logger.info('time points are interpreted as time series')
            is_multiplexing = acquisition_mode == 'multiplexing'
            if is_multiplexing:
                logger.info('time points are interpreted as multiplexing cycles')

        with tm.utils.ExperimentSession(self.experiment_id) as session:

            # Lookup table that maps channel names to channel IDs
            channels = session.query(tm.Channel.name, tm.Channel.id).all()
            channel_lut = dict(channels)

            # Fetch all distinct values, so that differing bit depths can be
            # detected and reported here instead of surfacing as a generic
            # database error.
            bit_depths = session.query(tm.Channel.bit_depth).distinct().all()
            if not bit_depths:
                raise MetadataError('No channels found.')
            if len(bit_depths) > 1:
                raise MetadataError('All channels must have the same bit depth.')
            bit_depth = bit_depths[0][0]
            wavelengths = session.query(tm.Channel.wavelength).\
                distinct().\
                all()
            wavelengths = [w[0] for w in wavelengths]

            # We order acquisitions by the time they got created. This will
            # determine the order of multiplexing cycles.
            plates = session.query(tm.Plate.id).\
                order_by(tm.Plate.created_at).\
                all()
            plate_ids = [p.id for p in plates]
            for p in plate_ids:
                acquisitions = session.query(tm.Acquisition.id).\
                    filter_by(plate_id=p).\
                    order_by(tm.Acquisition.created_at).\
                    all()
                acquisition_ids = [a.id for a in acquisitions]
                t_index = 0
                c_index = 0
                for a in acquisition_ids:
                    logger.debug('acquisition %d', a)
                    tpoints = session.query(tm.ChannelImageFile.tpoint).\
                        filter_by(acquisition_id=a).\
                        distinct().\
                        all()
                    tpoints = [t[0] for t in tpoints]
                    for t in tpoints:
                        logger.debug('time point #%d', t)
                        cycle = session.get_or_create(
                            tm.Cycle,
                            index=c_index, experiment_id=self.experiment_id
                        )

                        for w in wavelengths:
                            # Get all channel_image_files for the currently
                            # processed acquisition that match the old values
                            # of the "tpoint" and "channel_id" attributes.
                            image_files = session.query(tm.ChannelImageFile.id).\
                                filter_by(
                                    tpoint=t, acquisition_id=a,
                                    channel_id=channel_lut[w]
                                ).\
                                all()

                            if len(image_files) == 0:
                                # A wavelength might not have been used at
                                # every time point.
                                continue

                            logger.debug('wavelength "%s"', w)
                            if is_multiplexing:
                                # In case of a multiplexing experiment
                                # we create a separate channel for each
                                # combination of wavelength and tpoint.
                                new_channel_name = '{c}_{w}'.format(
                                    c=c_index, w=w
                                )
                            else:
                                # In case of a time series experiment
                                # the name of the channel remains unchanged.
                                new_channel_name = w

                            # Check whether the channel already exists and
                            # update the name accordingly (upon creation, the
                            # "name" attribute should have been set to the
                            # value of the "wavelength" attribute).
                            channel = session.query(tm.Channel).\
                                filter_by(name=w, wavelength=w).\
                                one_or_none()
                            if channel is None:
                                channel = tm.Channel(
                                    name=new_channel_name, wavelength=w,
                                    bit_depth=bit_depth,
                                    experiment_id=self.experiment_id
                                )
                            else:
                                channel.name = new_channel_name
                            session.add(channel)
                            # Commit, so that the ID of the channel is
                            # available for the update below.
                            session.commit()

                            logger.info(
                                'update time point and channel id '
                                'of channel image files: tpoint=%d, channel=%s',
                                t_index, channel.name
                            )
                            # Update the attributes of channel_image_files with
                            # the new values for tpoint and channel_id and also
                            # add the cycle_id.
                            session.bulk_update_mappings(
                                tm.ChannelImageFile, [
                                  {
                                    'id': f.id,
                                    'tpoint': t_index,
                                    'cycle_id': cycle.id,
                                    'channel_id': channel.id
                                  } for f in image_files
                                ]
                            )

                            # Update lookup table
                            channel_lut[new_channel_name] = channel.id

                        if is_time_series:
                            t_index += 1
                        else:
                            c_index += 1
Ejemplo n.º 8
0
    def run_job(self, batch, assume_clean_state=False):
        '''Configures OMEXML metadata extracted from microscope image files and
        complements it with metadata retrieved from additional microscope
        metadata files and/or user input.

        The actual processing is delegated to a format-specific implementation of
        :class:`MetadataHandler <tmlib.workflow.metaconfig.base.MetadataHandler>`.

        Once the metadata is complete, database entries are created for the
        channels, wells, sites and channel image files of the acquisition.

        Parameters
        ----------
        batch: dict
            job description; the keys ``"microscope_type"``,
            ``"acquisition_id"``, ``"perform_mip"`` and optionally ``"regex"``
            are read, as well as ``"n_vertical"``, ``"n_horizontal"`` and
            ``"stitch_layout"`` in case grid coordinates cannot be determined
            from stage positions
        assume_clean_state: bool, optional
            assume that output of previous runs has already been cleaned up
            (not used by this method)

        Raises
        ------
        MetadataError
            when required metadata is still missing after filenames were
            consulted
        TypeError
            when grid coordinates have to be determined from the stitch
            layout, but ``"n_vertical"`` or ``"n_horizontal"`` is not an
            integer

        See also
        --------
        :mod:`tmlib.workflow.metaconfig.cellvoyager`
        '''
        # Fall back to the default regular expression of the microscope type
        # in case none was provided.
        regexp = batch.get('regex', '')
        if not regexp:
            regexp = get_microscope_type_regex(
                batch['microscope_type'], as_string=True
            )[0]
        with tm.utils.ExperimentSession(self.experiment_id) as session:
            experiment = session.query(tm.Experiment).one()
            plate_dimensions = experiment.plates[0].dimensions
            metadata_files = session.query(tm.MicroscopeMetadataFile.location).\
                filter_by(acquisition_id=batch['acquisition_id']).\
                all()
            metadata_filenames = [f.location for f in metadata_files]
            image_files = session.query(
                    tm.MicroscopeImageFile.name, tm.MicroscopeImageFile.omexml
                ).\
                filter_by(acquisition_id=batch['acquisition_id']).\
                all()
            omexml_images = {
                f.name: bioformats.OMEXML(f.omexml) for f in image_files
            }

        # Not every microscope type has a reader for additional metadata files.
        MetadataReader = metadata_reader_factory(batch['microscope_type'])
        if MetadataReader is not None:
            with MetadataReader() as mdreader:
                omexml_metadata = mdreader.read(
                    metadata_filenames, omexml_images.keys()
                )
        else:
            omexml_metadata = None

        MetadataHandler = metadata_handler_factory(batch['microscope_type'])
        mdhandler = MetadataHandler(omexml_images, omexml_metadata)
        mdhandler.configure_from_omexml()
        missing = mdhandler.determine_missing_metadata()
        if missing:
            logger.warning(
                'required metadata information is missing: "%s"',
                '", "'.join(missing)
            )
            logger.info(
                'try to retrieve missing metadata from filenames '
                'using regular expression'
            )
            if regexp is None:
                logger.warning('no regular expression provided')
            mdhandler.configure_from_filenames(
                plate_dimensions=plate_dimensions, regex=regexp
            )
        missing = mdhandler.determine_missing_metadata()
        if missing:
            raise MetadataError(
                'The following metadata information is missing:\n"%s"\n'
                % '", "'.join(missing)
            )
        # Once we have collected basic metadata such as information about
        # channels and focal planes, we try to determine the relative position
        # of images within the acquisition grid
        try:
            logger.info(
                'try to determine grid coordinates from microscope '
                'stage positions'
            )
            mdhandler.determine_grid_coordinates_from_stage_positions()
        except MetadataError as error:
            logger.warning(
                'microscope stage positions are not available: "%s"',
                str(error)
            )
            logger.info(
                'try to determine grid coordinates from provided stitch layout'
            )
            # In general, the values of these arguments can be ``None``, because
            # they are not required and may not be used.
            # However, in case the grid coordinates should be determined based
            # on user input, these arguments are required.
            if not isinstance(batch['n_vertical'], int):
                raise TypeError(
                    'Value of argument "n_vertical" must be an integer.'
                )
            if not isinstance(batch['n_horizontal'], int):
                raise TypeError(
                    'Value of argument "n_horizontal" must be an integer.'
                )
            mdhandler.determine_grid_coordinates_from_layout(
                stitch_layout=batch['stitch_layout'],
                stitch_dimensions=(batch['n_vertical'], batch['n_horizontal'])
            )

        if batch['perform_mip']:
            mdhandler.group_metadata_per_zstack()

        # Create consistent zero-based ids
        mdhandler.update_indices()
        mdhandler.assign_acquisition_site_indices()
        md = mdhandler.remove_redundant_columns()
        fmaps = mdhandler.create_image_file_mappings()

        logger.info('create database entries')

        with tm.utils.ExperimentSession(self.experiment_id) as session:
            # Lookup table that maps channel names to channel IDs
            channels = dict()
            bit_depth = md['bit_depth'][0]
            for ch_name in np.unique(md['channel_name']):
                logger.info('create channel "%s"', ch_name)
                ch = session.get_or_create(
                    tm.Channel, experiment_id=self.experiment_id,
                    name=ch_name, wavelength=ch_name, bit_depth=bit_depth,
                )
                channels[ch_name] = ch.id

        # Each well is handled in a separate session.
        for w in np.unique(md.well_name):

            with tm.utils.ExperimentSession(self.experiment_id) as session:
                acquisition = session.query(tm.Acquisition).\
                    get(batch['acquisition_id'])

                logger.info('create well "%s"', w)
                w_index = (md.well_name == w)
                well = session.get_or_create(
                    tm.Well, plate_id=acquisition.plate.id, name=w
                )

                channel_image_files = []
                for s in np.unique(md.loc[w_index, 'site']):
                    logger.debug('create site #%d', s)
                    # Restrict the selection to the current well, so that
                    # the result is also correct in case site indices are
                    # not unique across wells.
                    s_index = w_index & (md.site == s)
                    y = md.loc[s_index, 'well_position_y'].values[0]
                    x = md.loc[s_index, 'well_position_x'].values[0]
                    height = md.loc[s_index, 'height'].values[0]
                    width = md.loc[s_index, 'width'].values[0]
                    site = session.get_or_create(
                        tm.Site, y=y, x=x, height=height, width=width,
                        well_id=well.id
                    )

                    # One channel image file per row (plane) of the site
                    for index, i in md.loc[s_index].iterrows():
                        channel_image_files.append(
                            tm.ChannelImageFile(
                                tpoint=i.tpoint, zplane=i.zplane,
                                channel_id=channels[i.channel_name],
                                site_id=site.id, acquisition_id=acquisition.id,
                                file_map=fmaps[index],
                            )
                        )

                session.bulk_save_objects(channel_image_files)