Example #1
0
 def one(self):
     """Return the single row matched by this query.

     Executes the query and returns its only result. Raises
     NoResultFound when nothing matched and MultipleResultsFound
     when more than one row matched.
     """
     rows = self.execute()
     if len(rows) == 0:
         raise NoResultFound()
     if len(rows) > 1:
         raise MultipleResultsFound()
     return rows[0]
    def size(self, iterator, recurse=False, verbose=False):
        """ Yields (path, size-in-bytes) tuples for the selected data
        objects and collections.

        Examples:

        >>> session.bulk.size('~/data/out*.txt')
        >>> session.bulk.size('./data', recurse=True)

        Arguments:

        iterator: iterator or str
            Defines which items are subject to the bulk operation.
            Can be an iterator (e.g. using search_manager.find())
            or a string (which will be used to construct a
            search_manager.iglob() iterator). Data sizes will be returned
            for matching data objects and, if used recursively, collections.

        recurse: bool (default: False)
            Whether to use recursion, meaning that the data size of
            matching collections will be calculated as the sum of
            their data objects and subcollection sizes.

        verbose: bool (default: False)
            Whether to print more output.

        Raises:

        MultipleResultsFound
            If replicas of a data object report different sizes.
        """
        # A glob-style string is shorthand for an iglob iterator.
        if isinstance(iterator, str):
            iterator = self.session.search.iglob(iterator)

        for item in iterator:
            path = self.session.path.get_absolute_irods_path(item)

            if self.session.collections.exists(path):
                if recurse:
                    # Collection size = sum of the sizes of everything below it.
                    new_iterator = self.size(item + '/*',
                                             recurse=True,
                                             verbose=verbose)
                    size = sum(result[1] for result in new_iterator)
                else:
                    self.log('Skipping collection %s (no recursion)' % item,
                             verbose)
                    continue
            else:
                dirname = os.path.dirname(path)
                basename = os.path.basename(path)
                criteria = [
                    Criterion('=', Collection.name, dirname),
                    Criterion('=', DataObject.name, basename)
                ]
                fields = [DataObject.size]
                q = self.session.query(*fields).filter(*criteria)

                results = list(q.get_results())

                # The query yields one row per replica of the data object.
                # Multiple rows are only a problem when the replicas actually
                # disagree on the size, so collapse the sizes into a set first
                # (previously any replicated object raised here even though the
                # error message promised "different sizes").
                sizes = {result[DataObject.size] for result in results}
                if len(sizes) > 1:
                    raise MultipleResultsFound('Different replicas of data ' + \
                                        'object %s have different sizes' % path)

                size = results[0][DataObject.size]

            yield (item, size)
Example #3
0
 def one(self):
     """Return the single row matched by this query.

     Executes the query, closes any pending continuation cursor,
     and returns the only result. Raises NoResultFound when nothing
     matched and MultipleResultsFound when more than one row matched.
     """
     rows = self.execute()
     if rows.continue_index > 0:
         # Close the server-side continuation before inspecting results.
         self.continue_index(rows.continue_index).close()
     if len(rows) == 0:
         raise NoResultFound()
     if len(rows) > 1:
         raise MultipleResultsFound()
     return rows[0]
Example #4
0
    def federated_exists(self, sds_file, root_collection):
        """Check whether a data object is present in a federated iRODS zone with the same checksum.

        Parameters
        ----------
        sds_file : `SDSFile`
            File to search.
        root_collection : `str`
            The archive's root collection.

        Returns
        -------
        `bool`
            True when the object exists remotely with an identical checksum,
            False when it is absent or its checksum differs.

        Raises
        ------
        MultipleResultsFound
            Raised if more than one different version of the file exists in the remote location.
        """
        # Query iRODS for every (collection, name, checksum) row matching the file.
        collection_name = sds_file.custom_directory(root_collection)
        query = (irods_session.session
                 .query(Collection.name, DataObject.name, DataObject.checksum)
                 .filter(Collection.name == collection_name)
                 .filter(DataObject.name == sds_file.filename))
        rows = query.all()

        # Nothing matched: the object is not archived remotely.
        if not rows:
            self.logger.debug("File %s does not exist in root collection %s." %
                              (sds_file.filename, root_collection))
            return False

        # Collapse duplicate rows; distinct checksums mean conflicting versions.
        checksums = {row[DataObject.checksum] for row in rows}
        if len(checksums) > 1:
            raise MultipleResultsFound(
                "File %s has more than one different version." %
                sds_file.custom_path(root_collection))
        remote_checksum = checksums.pop()

        # Identical checksum: the remote copy matches the local file.
        if sds_file.checksum == remote_checksum:
            self.logger.debug(
                "File %s does exist in iRODS, with same checksum (%s)." %
                (sds_file.filename, sds_file.checksum))
            return True

        self.logger.debug(
            "File %s does exist in iRODS, but with a different checksum (%s vs %s)."
            % (sds_file.filename, remote_checksum, sds_file.checksum))
        return False
Example #5
0
    def get_federated_pid(self, sds_file, root_collection):
        """Get the PID of a data object in a federated iRODS.

        Parameters
        ----------
        sds_file : `SDSFile`
            File to search.
        root_collection : `str`
            The archive's root collection.

        Returns
        -------
        pid : `str` or None
            The PID if the file has one, or None if the file does not exist
            or does not have a PID.

        Raises
        ------
        MultipleResultsFound
            Raised if the file has more than one different PID assigned to it.
        """
        # Query iRODS for the "PID" metadata attached to the data object.
        collection_name = sds_file.custom_directory(root_collection)
        query = (irods_session.session
                 .query(Collection.name, DataObject.name, DataObjectMeta.value)
                 .filter(Collection.name == collection_name)
                 .filter(DataObject.name == sds_file.filename)
                 .filter(DataObjectMeta.name == "PID"))
        rows = query.all()

        # No matching object, or the object carries no PID metadata.
        if not rows:
            self.logger.debug(
                "File %s does not exist or does not have a PID registered." %
                sds_file.filename)
            return None

        # Collapse duplicate rows; distinct values mean conflicting PIDs.
        pids = {row[DataObjectMeta.value] for row in rows}
        if len(pids) > 1:
            raise MultipleResultsFound("File %s has more than one PID." %
                                       sds_file.custom_path(root_collection))

        pid = pids.pop()
        self.logger.debug("File %s has PID %s." % (sds_file.filename, pid))
        return pid