Example #1
0
class FileObject(DigitalObject):
    """Digital object tagged with the islandora ``sp_pdf`` content model.

    Declares two file datastreams, ``OBJ`` and ``MODS``; both are
    versionable by default.
    """
    PDF_CONTENT_MODEL = 'info:fedora/islandora:sp_pdf'
    CONTENT_MODELS = [PDF_CONTENT_MODEL]

    # binary payload, stored under datastream id "OBJ"
    file = FileDatastream(
        "OBJ",
        "Binary datastream",
        defaults={'versionable': True})

    # MODS record, stored under datastream id "MODS"
    mods = FileDatastream(
        "MODS",
        "Mods record for this object.",
        defaults={'versionable': True})
Example #2
0
class TestPdfObject(DigitalObject):
    """Digital object with a single, non-versionable ``PDF`` datastream."""

    # datastream defaults: application/pdf content, versioning switched off
    pdf = FileDatastream(
        "PDF",
        "PDF document",
        defaults={'versionable': False, 'mimetype': 'application/pdf'})
Example #3
0
def get_fedora_proxy_class(dsid):
    """Build a ``DigitalObject`` subclass exposing one file datastream.

    :param dsid: datastream id handed to ``FileDatastream``
    :returns: a new class named ``FedoraProxyObject`` whose ``DATASTREAM``
        attribute is a ``FileDatastream`` that is versionable by default
    """
    content_model = "info:fedora/genrepo:File-1.0"
    datastream = FileDatastream(dsid,
                                "Binary datastream",
                                defaults={"versionable": True})
    class_attrs = {
        "FILE_CONTENT_MODEL": content_model,
        "CONTENT_MODELS": [content_model],
        "DATASTREAM": datastream,
    }
    return type("FedoraProxyObject", (DigitalObject, ), class_attrs)
Example #4
0
class FileObject(DigitalObject):
    """An opaque file reposited on behalf of a user.

    On top of what :class:`~eulcore.fedora.models.DigitalObject` provides,
    this class declares a ``master`` datastream holding the user's file, a
    content model list used to identify these objects, an ``oai_id``
    property stored in RELS-EXT, and a ``collection`` accessor.
    """
    CONTENT_MODELS = [AccessibleObject.PUBLIC_ACCESS_CMODEL]
    view_template = 'file/view.html'

    @property
    def default_pidspace(self):
        """Pidspace from ``settings.FEDORA_PIDSPACE``, or ``None`` if unset.

        Looked up on every access rather than cached, so a setting that is
        changed at runtime (e.g. for tests) is picked up when minting pids.
        """
        return getattr(settings, 'FEDORA_PIDSPACE', None)

    master = FileDatastream(
        "master",
        "reposited master file",
        defaults={'versionable': True})
    "reposited master :class:`~eulcore.fedora.models.FileDatastream`"

    def _get_oai_id(self):
        """Return the OAI item id recorded in RELS-EXT for this object."""
        relations = self.rels_ext.content
        return relations.value(subject=self.uriref,
                               predicate=rdfns.oai.itemID)

    def _set_oai_id(self, value):
        """Store ``value`` as the OAI item id; ``None`` removes the id."""
        if value is None:
            self._del_oai_id()
            return
        # update/replace any existing oai item id (only one is allowed)
        relations = self.rels_ext.content
        relations.set((self.uriref, rdfns.oai.itemID, Literal(value)))

    def _del_oai_id(self):
        """Remove the current OAI item id statement from RELS-EXT."""
        relations = self.rels_ext.content
        relations.remove((self.uriref, rdfns.oai.itemID, self.oai_id))

    oai_id = property(_get_oai_id, _set_oai_id, _del_oai_id)

    @property
    def collection(self):
        """:class:`CollectionObject` this object is a member of, if any.

        Resolved from the ``isMemberOfCollection`` statement in RELS-EXT;
        evaluates to ``None`` when no such statement is present.
        """
        collection_uri = self.rels_ext.content.value(
            subject=self.uriref, predicate=rdfns.relsext.isMemberOfCollection)
        if not collection_uri:
            return None
        # strip the info:fedora/ prefix to get the bare pid
        collection_pid = str(collection_uri).replace('info:fedora/', '')
        return CollectionObject(self.api, collection_pid)
Example #5
0
class SimpleDigitalObject(DigitalObject):
    """Digital object extended with text and image datastreams for testing."""

    # NOTE: content model name is distinct from the SimpleCModel used in the
    # non-django fedora unit tests, and lives in the configured test pidspace
    # so that it gets cleaned up automatically
    CONTENT_MODELS = ['info:fedora/%s:SimpleDjangoCModel' % TEST_PIDSPACE]

    # plain-text datastream
    text = Datastream(
        "TEXT",
        "Text datastream",
        defaults={'mimetype': 'text/plain'})

    # managed binary image datastream
    image = FileDatastream(
        'IMAGE',
        'managed binary image datastream',
        defaults={'mimetype': 'image/png'})
Example #6
0
class AudioObject(FileObject):
    """:class:`FileObject` variant for audio content.

    Overrides ``master`` with a ``source-audio`` datastream whose default
    mimetype is ``audio/mpeg``.
    """
    CONTENT_MODELS = [
        'info:fedora/genrepo-demo:Audio-1.0',
        AccessibleObject.PUBLIC_ACCESS_CMODEL,
    ]
    content_types = ('audio/mpeg', )
    view_template = 'file/audio.html'

    # FIXME: versioned? checksum?
    master = FileDatastream("source-audio",
                            "Master audio",
                            defaults={'mimetype': 'audio/mpeg'})
Example #7
0
class FileObject(DigitalObject):
    """Digital object using the ``genrepo:File-1.0`` content model.

    Declares one file datastream (``MYDS``) plus two metadata datastreams
    (``BRILMETA`` and ``PREMIS``); all three are versionable by default.
    """
    FILE_CONTENT_MODEL = 'info:fedora/genrepo:File-1.0'
    CONTENT_MODELS = [FILE_CONTENT_MODEL]

    # binary content
    file = FileDatastream(
        "MYDS",
        "Binary datastream",
        defaults={'versionable': True})

    # BRIL metadata
    brilmeta = Datastream(
        "BRILMETA",
        "BRIL Metadata",
        defaults={'versionable': True})

    # PREMIS object metadata
    PREMISmeta = Datastream(
        "PREMIS",
        "PREMIS Object Metadata",
        defaults={'versionable': True})
Example #8
0
    def __init__(self, *args, **kwargs):
        """Set up datastream names, the repository connection and the model.

        Required keyword arguments (a missing one raises ``KeyError``):
        ``namespace``, ``image_name``, ``transcript_name``, ``root``,
        ``username`` and ``password``.  All positional and keyword
        arguments are also forwarded to the parent initializer.
        """
        super(FedoraStorage, self).__init__(*args, **kwargs)

        # datastream ids: some come from the caller, the rest are fixed
        self.namespace = kwargs["namespace"]
        self.image_name = kwargs["image_name"]
        self.thumbnail_name = "THUMBNAIL"
        self.binary_name = "BINARY"
        self.script_name = "OCR_SCRIPT"
        self.transcript_name = kwargs["transcript_name"]

        self.repo = Repository(root=kwargs["root"],
                               username=kwargs["username"],
                               password=kwargs["password"])

        def file_stream(dsid, label, versionable=True):
            # every datastream on the model differs only in id, label and
            # whether it is versionable
            return FileDatastream(dsid, label,
                                  defaults={"versionable": versionable})

        # DigitalObject subclass generated at runtime, because several of
        # the datastream ids are only known once the kwargs have been read
        model_attrs = {
            "default_pidspace": kwargs["namespace"],
            "FILE_CONTENT_MODEL": "info:fedora/genrepo:File-1.0",
            "CONTENT_MODELS": ["info:fedora/genrepo:File-1.0"],
            "image": file_stream(self.image_name, "Document image"),
            "binary": file_stream(self.binary_name, "Document image binary"),
            "thumbnail": file_stream(self.thumbnail_name,
                                     "Document image thumbnail"),
            "script": file_stream(self.script_name, "OCR Script"),
            "transcript": file_stream(self.transcript_name,
                                      "Document transcript"),
            "meta": file_stream("meta", "Document metadata",
                                versionable=False),
        }
        self.model = type("Document", (DigitalObject,), model_attrs)
Example #9
0
class AudioObject(DigitalObject):
    '''Fedora Audio Object.  Extends :class:`~eulfedora.models.DigitalObject`.

    Declares the master and access-copy audio datastreams along with MODS,
    digital/source technical, rights and JHOVE metadata datastreams, and
    keeps the Dublin Core datastream in sync with that metadata on save.
    '''
    AUDIO_CONTENT_MODEL = 'info:fedora/emory-control:EuterpeAudio-1.0'
    CONTENT_MODELS = [AUDIO_CONTENT_MODEL]
    NEW_OBJECT_VIEW = 'audio:view'

    allowed_mimetypes = ['audio/x-wav', 'audio/wav']

    mods = XmlDatastream("MODS",
                         "MODS Metadata",
                         AudioMods,
                         defaults={
                             'control_group': 'M',
                             'format': mods.MODS_NAMESPACE,
                             'versionable': True,
                         })
    'MODS :class:`~eulfedora.models.XmlDatastream` with content as :class:`AudioMods`'

    audio = FileDatastream("AUDIO",
                           "Audio datastream",
                           defaults={
                               'mimetype': 'audio/x-wav',
                               'versionable': True,
                           })
    'master audio :class:`~eulfedora.models.FileDatastream`'

    compressed_audio = FileDatastream("CompressedAudio",
                                      "Compressed audio datastream",
                                      defaults={
                                          'mimetype': 'audio/mpeg',
                                          'versionable': True,
                                      })
    'access copy of audio :class:`~eulfedora.models.FileDatastream`'

    digitaltech = XmlDatastream("DigitalTech",
                                "Technical Metadata - Digital",
                                DigitalTech,
                                defaults={
                                    'control_group': 'M',
                                    'versionable': True,
                                })
    '''digital technical metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`DigitalTech`'''

    sourcetech = XmlDatastream("SourceTech",
                               "Technical Metadata - Source",
                               SourceTech,
                               defaults={
                                   'control_group': 'M',
                                   'versionable': True,
                               })
    '''source technical metadata :class:`~eulfedora.models.XmlDatastream` with content as
    :class:`SourceTech`'''

    rights = XmlDatastream("Rights",
                           "Usage rights and access control metadata",
                           Rights,
                           defaults={
                               'control_group': 'M',
                               'versionable': True,
                           })
    '''access control metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`Rights`'''

    jhove = FileDatastream(
        "JHOVE",
        "JHOVE datastream",
        defaults={
            'mimetype': 'application/xml',
            'control_group': 'M',
            'versionable': True,
            'format': 'http://hul.harvard.edu/ois/xml/xsd/jhove/jhove.xsd',
        })
    'JHOVE technical metadata for the master audio :class:`~eulfedora.models.FileDatastream`'
    # JHOVE is xml, but treat it as a file for now since we're just storing it,
    # not doing any processing, updating, etc.

    # map datastream IDs to human-readable names for inherited history_events method
    component_key = {
        'AUDIO': 'audio (master)',
        'CompressedAudio': 'audio (access version)',
        'SourceTech': 'source technical metadata',
        'DigitalTech': 'digital technical metadata',
        'JHOVE': 'technical metadata',
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT':
        'collection membership',  # TODO: revise when/if we add more relations
    }

    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject` that this object is a member of,
    via `isMemberOfCollection` relation.
    '''

    @property
    def content_md5(self):
        'Checksum of the master audio datastream.'
        return self.audio.checksum

    def save(self, logMessage=None):
        '''Save the object.  If the object is new, or the content of any of
        the :class:`~AudioObject.mods`, :class:`AudioObject.rels_ext`,
        :class:`AudioObject.digitaltech` or :class:`AudioObject.rights`
        datastreams has been changed, the DC will be updated and saved as
        well.  When the MODS has been modified and has a title, the object
        label is set to that title.

        :param logMessage: optional log message
        :returns: whatever the parent class ``save`` returns
        '''
        if not self.exists or self.mods.isModified() or self.rels_ext.isModified() or \
            self.digitaltech.isModified() or self.rights.isModified():
            # DC is derivative metadata based on MODS/RELS-EXT/Digital Tech/Rights
            # If this is a new item (does not yet exist in Fedora)
            # OR if any of the relevant datastreams have changed, update DC
            self._update_dc()

        # for now, keep object label in sync with MODS title
        if self.mods.isModified() and self.mods.content.title:
            self.label = self.mods.content.title

        return super(AudioObject, self).save(logMessage)

    @models.permalink
    def get_absolute_url(self):
        'Absolute url to view this object within the site'
        return ('audio:view', [str(self.pid)])

    def get_access_url(self):
        '''Absolute url to hear this object's access version, or ``None``
        when the object has no compressed audio datastream.'''
        if self.compressed_audio.exists:
            return reverse('audio:download-compressed-audio',
                           args=[str(self.pid),
                                 self.access_file_extension()])
        # as of file migration (1.2), legacy DM access path is no longer needed

    def access_file_extension(self):
        '''Return the expected file extension for whatever type of
        compressed audio datastream the current object has (if it has
        one), based on the datastream mimetype.  Currently, compressed
        audio could be MP3 or M4A/MP4.  Returns ``None`` when there is no
        compressed audio or its mimetype is not one of those two.'''
        if self.compressed_audio.exists:
            if self.compressed_audio.mimetype == 'audio/mpeg':
                return 'mp3'
            if self.compressed_audio.mimetype == 'audio/mp4':
                return 'm4a'

    @property
    def conversion_result(self):
        '''Return the :class:`~eulcommon.djangoextras.taskresult.models.TaskResult`
        for the most recently requested access copy conversion (if any).
        '''
        conversions = TaskResult.objects.filter(
            object_id=self.pid).order_by('-created')
        if conversions:
            return conversions[0]

    @property
    def researcher_access(self):
        'Result of ``allow_researcher_access`` for this object\'s rights metadata.'
        return allow_researcher_access(self.rights.content)

    def _update_dc(self):
        '''Update Dublin Core (derivative metadata) based on master metadata
        from MODS, RELS-EXT, and digital tech metadata in order to keep data
        synchronized and make fields that need to be searchable accessible to
        Fedora findObjects API method.
        '''
        # identifiers
        del (self.dc.content.identifier_list)  # clear out any existing identifiers

        # title
        if self.mods.content.title:
            self.label = self.mods.content.title
            self.dc.content.title = self.mods.content.title
        if self.mods.content.resource_type:
            self.dc.content.type = self.mods.content.resource_type

        # creator names
        del (self.dc.content.creator_list)  # clear out any existing names
        for name in self.mods.content.names:
            # for now, use unicode conversion as defined in mods.Name
            self.dc.content.creator_list.append(unicode(name))

        # clear out any dates previously in DC
        del (self.dc.content.date_list)
        if self.mods.content.origin_info and \
           len(self.mods.content.origin_info.created) and \
           self.mods.content.origin_info.created[0].date:
            self.dc.content.date_list.append(
                self.mods.content.origin_info.created[0].date)
        if self.mods.content.origin_info and \
           len(self.mods.content.origin_info.issued) and \
           self.mods.content.origin_info.issued[0].date:
            self.dc.content.date_list.append(
                self.mods.content.origin_info.issued[0].date)

        # clear out any descriptions previously in DC and set from MODS general note
        del (self.dc.content.description_list)
        if self.mods.content.general_note and \
           self.mods.content.general_note.text:
            self.dc.content.description_list.append(
                self.mods.content.general_note.text)

        # clear out any rights previously in DC and set contents from Rights accessStatus
        del (self.dc.content.rights_list)
        if self.rights.content.access_status:
            # access code no longer needs to be included, since we will not be searching
            self.dc.content.rights_list.append(
                self.rights.content.access_status.text)

    def index_data(self):
        '''Extend the default
        :meth:`eulfedora.models.DigitalObject.index_data`
        method to include additional fields specific to Keep
        Audio objects.

        :returns: the index data from the parent class, with the
            audio-specific fields added
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        # FIXME: is it worth splitting out descriptive index data here?
        data = super(AudioObject, self).index_data()
        data['object_type'] = 'audio'
        if self.collection and self.collection.exists:

            # collection_source_id  (0 is an allowable id, so check not None)
            if self.collection.mods.content.source_id is not None:
                data[
                    'collection_source_id'] = self.collection.mods.content.source_id

            # FIXME: previously indexing URI; is this needed for any reason or can we
            # use pid?  (needs to match collection index pid field for solr join)
            # data['collection_id'] = self.collection.uri
            data['collection_id'] = self.collection.pid
            try:
                # pull parent & archive collection objects directly from fedora
                parent = CollectionObject(self.api, self.collection.uri)
                data['collection_label'] = parent.label
                # NB: as of 2011-08-23, eulindexer doesn't support automatic
                # reindexing of audio objects when their collection changes.
                # as a result, archive_id and archive_label may be stale.
                # disable indexing them until eulindexer supports those
                # chained updates.
                #data['archive_id'] = parent.collection_id
                #archive = CollectionObject(self.api, parent.collection_id)
                #data['archive_label'] = archive.label
            except RequestFailed as rf:
                # best effort: a failed lookup only costs the collection label
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s',
                    rf)

        # include resolvable ARK if available
        if self.mods.content.ark_uri:
            data['ark_uri'] = self.mods.content.ark_uri

        # old identifiers from previous digital masters
        dm1_ids = []
        if self.mods.content.dm1_id:
            dm1_ids.append(self.mods.content.dm1_id)
        if self.mods.content.dm1_other_id:
            dm1_ids.append(self.mods.content.dm1_other_id)
        if dm1_ids:
            data['dm1_id'] = dm1_ids

        # digitization purpose, if not empty
        if self.digitaltech.content.digitization_purpose_list:
            # convert nodelist to a normal list that can be serialized as json
            data['digitization_purpose'] = list(
                self.digitaltech.content.digitization_purpose_list)

        # related files (same nodelist -> plain list conversion)
        if self.sourcetech.content.related_files_list:
            data['related_files'] = list(
                self.sourcetech.content.related_files_list)

        # part note
        if self.mods.content.part_note and self.mods.content.part_note.text:
            data['part'] = self.mods.content.part_note.text

        # sublocation
        if self.sourcetech.content.sublocation:
            data['sublocation'] = self.sourcetech.content.sublocation

        # rights access status code
        if self.rights.content.access_status:
            data['access_code'] = self.rights.content.access_status.code
        # copyright date from rights metadata
        if self.rights.content.copyright_date:
            data['copyright_date'] = self.rights.content.copyright_date
        # ip note from rights metadata
        if self.rights.content.ip_note:
            data['ip_note'] = self.rights.content.ip_note

        # boolean values that should always be available
        data.update({
            # should this item be accessible to researchers?
            'researcher_access':
            bool(self.researcher_access),  # if None, we want False
            # flags to indicate which datastreams are available
            'has_access_copy': self.compressed_audio.exists,
            'has_original': self.audio.exists,
        })

        if self.compressed_audio.exists:
            data.update({
                'access_copy_size': self.compressed_audio.size,
                'access_copy_mimetype': self.compressed_audio.mimetype,
            })
        if self.digitaltech.content.duration:
            data['duration'] = self.digitaltech.content.duration

        if self.mods.content.origin_info and \
           self.mods.content.origin_info.issued \
                and not self.mods.content.origin_info.issued.is_empty():
            data['date_issued'] = [
                unicode(di) for di in self.mods.content.origin_info.issued
            ]
        if self.mods.content.origin_info and \
           self.mods.content.origin_info.created \
                and not self.mods.content.origin_info.created.is_empty():
            data['date_created'] = [
                unicode(di) for di in self.mods.content.origin_info.created
            ]

        if self.audio.exists:
            data['content_md5'] = self.audio.checksum

        return data

    @staticmethod
    def init_from_file(filename,
                       initial_label=None,
                       request=None,
                       checksum=None,
                       mimetype=None):
        '''Static method to create a new :class:`AudioObject` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.  Calculates and stores the duration
        based on the file. Also sets the following default metadata values:

            * mods:typeOfResource = "sound recording"
            * dt:codecQuality = "lossless"

        :param filename: full path to the audio file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param checksum: the checksum of the file being sent to fedora.
        :param mimetype: optional mimetype for the master audio datastream;
            when not specified, the datastream mimetype is left untouched
        :returns: :class:`AudioObject` initialized from the file
        '''
        if initial_label is None:
            initial_label = os.path.basename(filename)
        repo = Repository(request=request)
        obj = repo.get_object(type=AudioObject)
        # set initial object label from the base filename
        obj.label = initial_label
        obj.dc.content.title = obj.mods.content.title = obj.label
        # audio is binary data, so open in binary mode to avoid any newline
        # translation or text decoding of the content.
        # FIXME: at what point does/should this get closed?  The handle is
        # handed to the datastream here and cannot be closed in this method.
        obj.audio.content = open(filename, 'rb')
        # Store the checksum as passed in (None when not supplied).
        obj.audio.checksum = checksum
        # set content datastream mimetype if passed in
        if mimetype is not None:
            obj.audio.mimetype = mimetype
        # Datastream label is the initial label minus a trailing ".wav"
        # (mimetype indicates that).  Only strip when the suffix is actually
        # present, so a caller-supplied label is never truncated.
        if initial_label.lower().endswith('.wav'):
            obj.audio.label = initial_label[:-4]
        else:
            obj.audio.label = initial_label
        # set initial mods:typeOfResource - all AudioObjects default to sound recording
        obj.mods.content.resource_type = 'sound recording'
        # set codec quality to lossless in digital tech metadata
        # - default for AudioObjects, should only accept lossless audio for master file
        obj.digitaltech.content.codec_quality = 'lossless'
        # get wav duration and store in digital tech metadata, rounded to
        # a whole number
        obj.digitaltech.content.duration = '%d' % round(wav_duration(filename))

        return obj

    @staticmethod
    def all():
        'Find all Audio objects by content model within the configured pidspace.'
        search_opts = {
            'type': AudioObject,
            # restrict to objects in configured pidspace
            'pid__contains': '%s:*' % settings.FEDORA_PIDSPACE,
            # restrict by cmodel in dc:format
            'format__contains': AudioObject.AUDIO_CONTENT_MODEL,
        }
        repo = Repository()
        return repo.find_objects(**search_opts)
Example #10
0
class ImageObject(FileObject):
    """:class:`FileObject` variant for image content.

    Overrides ``master`` with a ``source-image`` datastream and exposes
    preview, region, metadata and deep-zoom helpers that go through the
    ``IMAGE_SERVICE`` dissemination.
    """
    CONTENT_MODELS = [
        'info:fedora/genrepo-demo:Image-1.0',
        AccessibleObject.PUBLIC_ACCESS_CMODEL,
    ]
    IMAGE_SERVICE = 'genrepo-demo:DjatokaImageService'

    content_types = (
        'image/jpeg',
        'image/jp2',
        'image/gif',
        'image/bmp',
        'image/png',
        'image/tiff',
    )
    view_template = 'file/image.html'

    # DC & RELS-EXT inherited; override master
    # FIXME: versioned? checksum?
    master = FileDatastream("source-image",
                            "Master TIFF image",
                            defaults={'mimetype': 'image/tiff'})

    has_preview = True

    # image metadata cache; stays None until image_metadata is first read
    _image_metadata = None

    def get_preview_image(self):
        """Return the ``getRegion`` dissemination requested at level 1."""
        preview_params = {'level': 1}
        return self.getDissemination(self.IMAGE_SERVICE,
                                     'getRegion',
                                     params=preview_params)

    def get_region(self, params):
        """Return the ``getRegion`` dissemination for the given ``params``.

        Exposes the djatoka getRegion method for djatoka seadragon deep zoom.
        """
        return self.getDissemination(self.IMAGE_SERVICE,
                                     'getRegion',
                                     params=params)

    @property
    def image_metadata(self):
        'Image metadata as returned by Djatoka getMetadata method (width, height, etc.)'
        if self._image_metadata is not None:
            return self._image_metadata
        # getDissemination returns a tuple of (result, url); the result is
        # the json metadata document produced by djatoka
        dissemination = self.getDissemination(self.IMAGE_SERVICE,
                                              'getMetadata')
        self._image_metadata = json.loads(dissemination[0])
        return self._image_metadata

    # expose width & height from image metadata as properties
    @property
    def width(self):
        """``width`` entry of :attr:`image_metadata`."""
        return self.image_metadata['width']

    @property
    def height(self):
        """``height`` entry of :attr:`image_metadata`."""
        return self.image_metadata['height']

    def deepzoom_info(self):
        """Build the deepzoom image info xmlobject from width & height."""
        dzi_options = {
            'tilesize': 256,
            'overlap': 1,
            'format': 'jpg',
            'width': self.width,
            'height': self.height,
        }
        return DziImage(**dzi_options)
Example #11
0
class DiskImage(DigitalObject):
    '''Fedora object for Disk Images.  Extends :class:`~keep.common.fedora.DigitalObject`.

    Declares the disk image content model, the mimetypes recognized as
    disk images, relations to the owning collection and to
    original/migrated disk images, and the MODS, content, Rights, and
    PREMIS provenance datastreams.
    '''

    # NOTE about datastream naming conventions
    # Where a corresponding datastream id already exists within the Keep
    # (i.e. MODS for mods metadata), the same datastream id will be used
    # Where a Keep datastream id does not already exist (e.g., Premis), following
    # Hydra content model conventions, based on generic simple Hydra content model
    # For documentation on Hydra content models, see:
    #   https://wiki.duraspace.org/display/hydra/Hydra+objects%2C+content+models+%28cModels%29+and+disseminators
    #   https://wiki.duraspace.org/display/hydra/Hydra+objects%2C+content+models+%28cModels%29+and+disseminators#Hydraobjects%2Ccontentmodels%28cModels%29anddisseminators-genericContent

    DISKIMAGE_CONTENT_MODEL = 'info:fedora/emory-control:DiskImage-1.0'
    CONTENT_MODELS = [DISKIMAGE_CONTENT_MODEL]
    # view name returned by get_absolute_url
    NEW_OBJECT_VIEW = 'file:view'

    # mimetypes treated as disk image content; init_from_bagit uses this
    # list to tell the disk image apart from supplemental files
    diskimage_mimetypes = [
        'application/x-aff',  # AFF, advanced forensic format
        'application/x-ad1',  # AD1, proprietary disk image format
        'application/x-iso9660-image',  # ISO
        'application/x-ewf',  # E01 Expert Witness Format
        'application/x-tar',  # tar file
        'application/mbox'  # mbox (? may require extra magic file entries)
    ]

    # mapping of mimetype to format label to insert in Premis
    # (init_from_file falls back to the file extension for other mimetypes)
    mimetype_format = {
        'application/x-aff': 'AFF',
        'application/x-ad1': 'AD1',
        'application/x-iso9660-image': 'ISO',
        'application/x-ewf': 'E01',
        'application/x-tar': 'TAR',
        'application/mbox': 'MBOX'
    }

    allowed_mimetypes = ['', 'application/octet-stream'] + diskimage_mimetypes
    # NOTE: empty type and application/octet-stream are required for javascript upload,
    # because browser does not detect any mimetype at all for AFF and AD1 files
    # and detects ISO as the generic application/octet-stream
    # NOTE: Mimetypes for AD1 and AFF are custom mimetypes and must be configured
    # in your local magic files.  See the deploy notes for more information.

    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject that this object belongs to,
    via `isMemberOfCollection` relation.
    '''

    #: original DiskImage object that this DiskImage is related to, if
    #: this is a migrated object; related via fedora-rels-ext isDerivationOf
    original = Relation(relsext.isDerivationOf, type='self')
    #: migrated DiskImage object that supersedes this object, if a
    #: migration has occurred; related via fedora-rels-ext hasDerivation
    migrated = Relation(relsext.hasDerivation, type='self')

    mods = XmlDatastream("MODS",
                         "MODS Metadata",
                         DiskImageMods,
                         defaults={
                             'control_group': 'M',
                             'format': mods.MODS_NAMESPACE,
                             'versionable': True,
                         })
    '''descriptive metadata as MODS - :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`LocalMods`'''
    # NOTE(review): the datastream is declared with DiskImageMods, while the
    # text above still refers to LocalMods -- confirm which is intended

    content = FileDatastream("content",
                             "Master disk image file",
                             defaults={
                                 'versionable': False,
                             })
    'master disk image binary content as :class:`~eulfedora.models.FileDatastream`'
    # NOTE: could be one of a few allowed mimetypes

    rights = XmlDatastream("Rights",
                           "Usage rights and access control metadata",
                           Rights,
                           defaults={
                               'control_group': 'M',
                               'versionable': True,
                           })
    '''access control metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`Rights`'''

    provenance = XmlDatastream('provenanceMetadata',
                               'Provenance metadata',
                               DiskImagePremis,
                               defaults={'versionable': False})
    '''``provenanceMetadata`` datastream for PREMIS object metadata; datastream
    XML content will be an instance of :class:`eulxml.xmlmap.premis.Premis`.'''

    # map datastream IDs to human-readable names for inherited history_events method
    component_key = {
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT':
        'collection membership or last fixity check',  # TODO: revise as we add more relations
        'provenanceMetadata': 'provenance metadata',
    }

    def get_default_pid(self):
        '''Extend the inherited default pid logic to also record the ARK
        from the MODS in the PREMIS object identifier.

        :returns: the pid returned by the parent ``get_default_pid``
        '''
        pid = super(DiskImage, self).get_default_pid()

        ark = self.mods.content.ark
        if ark:
            # copy the ARK into the premis object identifier
            premis_object = self.provenance.content.object
            premis_object.id = ark
            premis_object.id_type = 'ark'

        return pid

    @property
    def has_supplemental_content(self):
        '''Boolean to indicate if this disk image object has any supplemental
        file datastreams, i.e. any datastream whose id starts with
        ``supplement``.

        .. Note:: only works on saved objects
        '''
        for dsid in self.ds_list.keys():
            if dsid.startswith('supplement'):
                return True
        return False

    @property
    def supplemental_content(self):
        '''Generator for supplemental content datastreams: yields a
        datastream object for every datastream id that starts with
        ``supplement``.'''
        prefix = 'supplement'
        for dsid in self.ds_list.keys():
            if not dsid.startswith(prefix):
                continue
            yield self.getDatastreamObject(dsid)

    _content_checksum = None
    '''Used as a workaround for Fedora 3.4 issue with file URIs and checksums
    and to support duplicate detection based on checksums, store
    content checksum without sending it to Fedora.'''

    @property
    def content_md5(self):
        '''Checksum for the content: the locally stored
        ``_content_checksum`` when it is set, otherwise the checksum of
        the ``content`` datastream.'''
        local_checksum = self._content_checksum
        if local_checksum:
            return local_checksum
        return self.content.checksum

    # NOTE: auto-calculated information such as checksums stored in premis
    # will need to be updated anytime the master disk image datastream is updated
    # (will probably need to extend the save method for this)

    def save(self, logMessage=None):
        '''Save the object.  If this object does not yet exist in Fedora,
        or if the content of the :attr:`mods`, ``rels_ext``, or
        :attr:`rights` datastreams has been changed, the DC is updated
        before saving.

        :param logMessage: optional log message
        :returns: the result of the parent ``save`` method
        '''
        # DC is derivative metadata: refresh it for new items (not yet in
        # Fedora) or when any datastream it is derived from has changed
        dc_is_stale = not self.exists
        if not dc_is_stale:
            dc_is_stale = (self.mods.isModified()
                           or self.rels_ext.isModified()
                           or self.rights.isModified())
        if dc_is_stale:
            self._update_dc()

        return super(DiskImage, self).save(logMessage)

    def _update_dc(self):
        '''Update Dublin Core (derivative metadata) based on master metadata
        from MODS, RELS-EXT, and rights metadata in order to keep data
        synchronized and make fields that need to be searchable accessible to
        Fedora findObjects API method.
         '''
        # NOTE: borrowed almost completely from audio, with minor modifications
        # TODO: move to common code somewhere?

        # identifiers
        del (self.dc.content.identifier_list)  # clear out any existing names

        # title
        if self.mods.content.title:
            # not strictly DC, but also keep object label in sync with MODS title
            self.label = self.mods.content.title
            self.dc.content.title = self.mods.content.title
        if self.mods.content.resource_type:
            self.dc.content.type = self.mods.content.resource_type

        # clear out any dates previously in DC
        del (self.dc.content.coverage_list)
        if self.mods.content.coveringdate_start and \
           self.mods.content.coveringdate_end:
            # FIXME: not sure the best way to indicate date range here
            self.dc.content.coverage_list.append(
                '%s:%s' % (self.mods.content.coveringdate_start,
                           self.mods.content.coveringdate_end))

        # clear out any descriptions previously in DC and set from MODS abstract
        del (self.dc.content.description_list)
        if self.mods.content.abstract and \
           self.mods.content.abstract.text:
            self.dc.content.description_list.append(
                self.mods.content.abstract.text)

        # clear out any rights previously in DC and set contents from Rights accessStatus
        del (self.dc.content.rights_list)
        if self.rights.content.access_status:
            # set dc:rights to text of access status
            self.dc.content.rights_list.append(
                self.rights.content.access_status.text)

    @staticmethod
    def init_from_file(filename,
                       initial_label=None,
                       request=None,
                       checksum=None,
                       mimetype=None,
                       content_location=None,
                       sha1_checksum=None):
        '''Static method to create a new :class:`DiskImage` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.

        :param filename: full path to the disk image file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param checksum: the MD5 checksum of the file being sent to fedora;
            calculated from the file if not passed in
        :param mimetype: the mimetype for the main disk image content;
            detected from the file if not passed in
        :param content_location: optional file URI for file-based Fedora ingest
        :param sha1_checksum: the SHA1 checksum of the file being sent to fedora,
            for storage in the PREMIS technical metadata. Note that SHA-1 will
            be calculated if not passed in (slow for large files).
        :returns: :class:`DiskImage` initialized from the file

        .. Note:: when no ``content_location`` is given, the file is opened
           (in binary mode) and the open file object is set as the
           datastream content; it is not closed by this method.
        '''

        # if no checksum was passed in, calculate one
        if checksum is None:
            checksum = md5sum(filename)

        basename, ext = os.path.splitext(os.path.basename(filename))

        if initial_label is None:
            initial_label = basename
        elif initial_label.lower().endswith(('.aff', '.ad1', '.iso')):
            # ajax upload passes original filename as initial label; if it
            # looks like a file, strip off the extension for the object
            # name/title
            # NOTE: also using extension from original filename
            # here because in some cases (under apache?) uploaded file
            # names do not have the original extension
            initial_label, ext = os.path.splitext(initial_label)

        repo = Repository(request=request)
        obj = repo.get_object(type=DiskImage)
        # set initial object label from the base filename
        obj.label = initial_label
        obj.mods.content.title = obj.label
        obj.dc.content.title = obj.label
        # set initial mods:typeOfResource - same for all Disk Images
        obj.mods.content.resource_type = 'software, multimedia'
        # set genre as born digital
        obj.mods.content.genres.append(
            mods.Genre(authority='aat', text='born digital'))

        # Set the file checksum
        obj.content.checksum = checksum
        # set mimetype
        if mimetype is None:
            # if no mimetype was passed in, determine from file; discard
            # any options that follow the mimetype (e.g. "; charset=...")
            detected = magic.Magic(mime=True).from_file(filename)
            mimetype = detected.partition(';')[0]
        obj.content.mimetype = mimetype

        # Set disk image datastream label to filename
        obj.content.label = initial_label

        # premis data
        obj.provenance.content.create_object()
        # NOTE: premis object id will be same as short-form ARK stored in MODS
        # It cannot be set until pid is minted, which will happen in get_default_pid,
        # but premis is order dependent so add a place-holder here
        obj.provenance.content.object.id_type = 'ark'
        obj.provenance.content.object.id = ''

        # object type required to be schema valid, must be in premis namespace
        obj.provenance.content.object.type = 'p:file'

        # composition level required for object characteristics; probably should be 0 (?)
        obj.provenance.content.object.composition_level = 0
        # store checksums in premis: MD5 (already calculated) and SHA-1
        # picky about order here too: force algorithm to be added first
        obj.provenance.content.object.checksums.append(
            PremisFixity(algorithm='MD5'))
        obj.provenance.content.object.checksums[0].digest = checksum
        # add sha-1 to checksums in premis; calculate if not passed in
        if sha1_checksum is None:
            sha1_checksum = sha1sum(filename)
        obj.provenance.content.object.checksums.append(
            PremisFixity(algorithm='SHA-1'))
        obj.provenance.content.object.checksums[1].digest = sha1_checksum

        obj.provenance.content.object.create_format()
        # set format based on mimetype; as a fallback, use the file
        # extension for format
        obj_format = DiskImage.mimetype_format.get(mimetype)
        if obj_format is None:
            obj_format = ext.upper().strip('.')
        obj.provenance.content.object.format.name = obj_format

        # if a content URI is specified (e.g. for large files), use that
        if content_location is not None:
            obj.content.ds_location = content_location
        # otherwise set the file as content to be posted
        else:
            # disk images are binary data: open in binary mode so the
            # content is never subject to newline or text translation
            obj.content.content = open(filename, 'rb')
            # FIXME: at what point does/should this file get closed?

        # descriptive/technical metadata todo

        return obj

    @staticmethod
    def init_from_bagit(path, request=None, file_uri=True):
        '''Static method to create a new :class:`DiskImage` instance from
        a BagIt.  Sets the object label and metadata title based on the
        name of the bag, and looks for a supported disk image file type
        (e.g. AFF or AD1) to use as the content datastream for the object.
        Content checksum is pulled from the BagIt metadata, and repository
        ingest will be done via file URIs based on configured
        **LARGE_FILE_STAGING_DIR** and **LARGE_FILE_STAGING_FEDORA_DIR**
        to better support ingesting large files (unless file_uri
        is False).

        Every payload file that is not a supported disk image is added as
        a ``supplementN`` file datastream, with the file name as label and
        the MD5 checksum from the bag.

        Raises an exception if BagIt is not valid or if it does not
        contain a supported disk image data file.  (Note: using fast validation
        without checksum calculation, to minimize the time required to ingest
        large files.)

        :param path: full path to the BagIt directory that contains
            a disk image file
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param file_uri: ingest BagIt data via file uris based on
            configured staging directories (default behavior)
            instead of uploading the content to Fedora

        :returns: :class:`DiskImage` initialized from the BagIt contents
        :raises Exception: if no disk image is found in the bag, or if the
            bag has no MD5 or SHA-1 checksum for the disk image
        :raises KeyError: if the bag has no MD5 checksum for a
            supplemental file
        '''

        def _staged_file_uri(local_path):
            # Build the file URI Fedora should ingest from.  If the Fedora
            # base path is different from the locally mounted staging
            # directory, convert from local path to fedora server path.
            location = 'file://%s' % urllib.quote(local_path)
            if getattr(settings, 'LARGE_FILE_STAGING_FEDORA_DIR',
                       None) is not None:
                location = location.replace(
                    settings.LARGE_FILE_STAGING_DIR,
                    settings.LARGE_FILE_STAGING_FEDORA_DIR)
            return location

        bag = bagit.Bag(path)
        # NOTE: using fast validation here to avoid recalculating checksums
        # for very large files; only checksum compare will be done by fedora
        bag.validate(fast=True)  # raises bagit.BagValidationError if not valid

        # use the base name of the BagIt as initial object label
        initial_label = os.path.basename(path)

        # identify disk image content file within the bag
        content_file = None
        diskimage_mimetype = None
        # supplemental files as (full path, path within the bag, mimetype);
        # the path within the bag is kept because it is the key used by
        # the bag checksum entries
        supplements = []
        m = magic.Magic(mime=True)
        for data_path in bag.payload_files():
            # path is relative to bag root dir
            filename = os.path.join(path, data_path)
            mimetype = m.from_file(filename).partition(';')[0]

            # any data file that is not a disk image should be assumed
            # to be a supplemental file
            if mimetype not in DiskImage.diskimage_mimetypes:
                supplements.append((filename, data_path, mimetype))
                continue

            # require both MD5 and SHA-1 for disk image to ingest
            image_name = os.path.basename(data_path)
            try:
                md5_checksum = bag.entries[data_path]['md5']
            except KeyError:
                raise Exception('%s checksum not found for disk image %s' %
                                ('MD5', image_name))
            try:
                sha1_checksum = bag.entries[data_path]['sha1']
            except KeyError:
                raise Exception('%s checksum not found for disk image %s' %
                                ('SHA-1', image_name))

            # this is the disk image content file
            # store file and mimetype for further initialization
            content_file = filename
            diskimage_mimetype = mimetype

        # no disk image data found
        if content_file is None:
            raise Exception('No disk image content found in %s' %
                            os.path.basename(path))

        optional_args = {}
        if file_uri:
            optional_args['content_location'] = _staged_file_uri(content_file)

        img = DiskImage.init_from_file(content_file,
                                       initial_label=initial_label,
                                       checksum=md5_checksum,
                                       mimetype=diskimage_mimetype,
                                       request=request,
                                       sha1_checksum=sha1_checksum,
                                       **optional_args)

        for index, (sfile, data_path, sfile_mimetype) in enumerate(supplements):
            dsid = 'supplement%d' % index
            dsobj = img.getDatastreamObject(dsid,
                                            dsobj_type=FileDatastreamObject)
            dsobj.label = os.path.basename(sfile)
            dsobj.mimetype = sfile_mimetype
            # look up the checksum by the path *within* the bag
            dsobj.checksum = bag.entries[data_path]['md5']
            logger.debug(
                'Adding supplemental dastream %s label=%s mimetype=%s checksum=%s',
                dsid, dsobj.label, dsobj.mimetype, dsobj.checksum)

            if file_uri:
                dsobj.ds_location = _staged_file_uri(sfile)
            else:
                # reads the whole file into memory, so this will probably
                # only work for small/test content
                with open(sfile, 'rb') as supplement:
                    dsobj.content = supplement.read()

        return img

    @models.permalink
    def get_absolute_url(self):
        '''Absolute url to view this object within the site: the view name
        and pid argument handed to ``models.permalink``.'''
        view_args = [str(self.pid)]
        return (DiskImage.NEW_OBJECT_VIEW, view_args)

    def index_data(self):
        '''Extend the default
        :meth:`eulfedora.models.DigitalObject.index_data`
        method to include additional fields specific to Keep and for
        disk images: object type, collection information, ARK URI,
        content checksum, format and size, latest fixity check, and the
        pid of the original object (for migrated disk images).

        :returns: dictionary of index data
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        data = super(DiskImage, self).index_data()
        # FIXME: is born-digital type still needed for anything? perms?
        # data['object_type'] = 'born-digital'
        data['object_type'] = 'disk image'

        collection = self.collection
        if collection and collection.exists:
            # collection_source_id  (0 is an allowable id, so check not None)
            source_id = collection.mods.content.source_id
            if source_id is not None:
                data['collection_source_id'] = source_id

            data['collection_id'] = collection.pid
            data['collection_label'] = collection.label

        # include resolvable ARK if available
        ark_uri = self.mods.content.ark_uri
        if ark_uri:
            data['ark_uri'] = ark_uri

        checksum = self.content.checksum
        if checksum:
            data['content_md5'] = checksum

        # copied from audio; enable once we have rights editing
        # # rights access status code
        # if self.rights.content.access_status:
        #     data['access_code'] = self.rights.content.access_status.code
        # # copyright date from rights metadata
        # if self.rights.content.copyright_date:
        #     data['copyright_date'] = self.rights.content.copyright_date
        # # ip note from rights metadata
        # if self.rights.content.ip_note:
        #     data['ip_note'] = self.rights.content.ip_note

        # the most recent fixity check is the last one listed
        fixity_checks = self.provenance.content.fixity_checks
        if fixity_checks:
            latest_check = fixity_checks[-1]
            data['last_fixity_check'] = latest_check.date
            data['last_fixity_result'] = latest_check.outcome

        # store disk image format and size
        # - some disk images (i.e., objects migrated from AD1/AFF)
        # will have two sets of object characteristics; we want the
        # format from the last one listed
        premis_object = self.provenance.content.object
        if premis_object and premis_object.latest_format:
            data['content_format'] = premis_object.latest_format.name

        data['content_size'] = self.content.size

        original = self.original
        if original:
            data['original_pid'] = original.pid

        return data
Example #12
0
class Video(DigitalObject):
    '''Fedora Video Object.  Extends :class:`~eulfedora.models.DigitalObject`.'''
    VIDEO_CONTENT_MODEL = 'info:fedora/emory-control:Video-1.0'
    CONTENT_MODELS = [VIDEO_CONTENT_MODEL]
    NEW_OBJECT_VIEW = 'video:view'

    # There are several mimetypes for MPEG files
    # mapping of master video mimetype to file extension
    allowed_master_mimetypes = {
        'video/quicktime': 'mov',
        'video/x-dv': 'dv',
        'video/mpeg': 'mpg',
        'video/x-m4v': 'm4v',
        'video/x-msvideo': 'avi'
    }
    # mapping of access copy mimetype to file extension; used by
    # access_file_extension
    allowed_access_mimetypes = {'video/mp4': 'mp4'}

    mods = XmlDatastream("MODS",
                         "MODS Metadata",
                         VideoMods,
                         defaults={
                             'control_group': 'M',
                             'format': mods.MODS_NAMESPACE,
                             'versionable': True,
                         })

    digitaltech = XmlDatastream("DigitalTech",
                                "Technical Metadata - Digital",
                                VideoDigitalTech,
                                defaults={
                                    'control_group': 'M',
                                    'versionable': True,
                                })
    '''digital technical metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`DigitalTech`'''

    # NOTE(review): the string below describes the ``mods`` datastream but
    # sits after ``digitaltech``; it looks misplaced -- confirm and move
    'MODS :class:`~eulfedora.models.XmlDatastream` with content as :class:`VideoMods`'

    content = FileDatastream("VIDEO",
                             "Video datastream",
                             defaults={
                                 'versionable': True,
                             })
    'master video :class:`~eulfedora.models.FileDatastream`'

    provenance = XmlDatastream('provenanceMetadata',
                               'Provenance metadata',
                               VideoPremis,
                               defaults={'versionable': False})
    '''``provenanceMetadata`` datastream for PREMIS object metadata; datastream
    XML content will be an instance of :class:`eulxml.xmlmap.premis.Premis`.'''

    access_copy = FileDatastream("CompressedVideo",
                                 "Compressed video datastream",
                                 defaults={
                                     'mimetype': 'video/mp4',
                                     'versionable': True,
                                 })
    'access copy of video :class:`~eulfedora.models.FileDatastream`'

    sourcetech = XmlDatastream("SourceTech",
                               "Technical Metadata - Source",
                               VideoSourceTech,
                               defaults={
                                   'control_group': 'M',
                                   'versionable': True,
                               })
    '''source technical metadata :class:`~eulfedora.models.XmlDatastream` with content as
    :class:`SourceTech`'''

    rights = XmlDatastream("Rights",
                           "Usage rights and access control metadata",
                           Rights,
                           defaults={
                               'control_group': 'M',
                               'versionable': True,
                           })
    '''access control metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`Rights`'''

    # map datastream IDs to human-readable names for inherited history_events method
    # NOTE(review): the master datastream is declared above with id "VIDEO",
    # but the key here is 'Video'; if the lookup is case-sensitive this
    # entry never matches -- confirm against history_events.  There is also
    # no entry for 'provenanceMetadata'.
    component_key = {
        'Video': 'video (master)',
        'CompressedVideo': 'video (access version)',
        'SourceTech': 'source technical metadata',
        'DigitalTech': 'digital technical metadata',
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT': 'collection membership',
    }

    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject that this object is a member of,
    via `isMemberOfCollection` relation.
    '''
    _content_checksum = None
    '''Used as a workaround for Fedora 3.4 issue with file URIs and checksums
    and to support duplicate detection based on checksums, store
    content checksum without sending it to Fedora.'''

    @property
    def content_md5(self):
        '''Checksum for the master video: the locally stored
        ``_content_checksum`` when it is set, otherwise the checksum of
        the ``content`` datastream.'''
        local_checksum = self._content_checksum
        if local_checksum:
            return local_checksum
        return self.content.checksum

    def get_default_pid(self):
        '''Extend the inherited default pid logic to also record the ARK
        from the MODS in the PREMIS object identifier, creating the PREMIS
        object first.

        :returns: the pid returned by the parent ``get_default_pid``
        '''
        pid = super(Video, self).get_default_pid()

        ark = self.mods.content.ark
        if ark:
            premis = self.provenance.content
            premis.create_object()
            premis.object.id = ark
            premis.object.id_type = 'ark'

        return pid

    def save(self, logMessage=None):
        '''Save the object.  If this object does not yet exist in Fedora,
        or if the content of the :attr:`mods`, ``rels_ext``,
        :attr:`digitaltech`, or :attr:`rights` datastreams has been
        changed, the DC is updated before saving.  The object label is
        also kept in sync with the MODS title.

        :param logMessage: optional log message
        :returns: the result of the parent ``save`` method
        '''
        # DC is derivative metadata: refresh it for new items (not yet in
        # Fedora) or when any datastream it is derived from has changed
        dc_is_stale = not self.exists
        if not dc_is_stale:
            dc_is_stale = (self.mods.isModified()
                           or self.rels_ext.isModified()
                           or self.digitaltech.isModified()
                           or self.rights.isModified())
        if dc_is_stale:
            self._update_dc()

        # for now, keep object label in sync with MODS title
        if self.mods.isModified():
            title = self.mods.content.title
            if title:
                self.label = title

        return super(Video, self).save(logMessage)

    #
    @models.permalink
    def get_absolute_url(self):
        '''Absolute url to view this object within the site: the view name
        and pid argument handed to ``models.permalink``.'''
        view_args = [str(self.pid)]
        return ('video:view', view_args)

    def get_access_url(self):
        '''Url for downloading this object's compressed (access) video.
        Returns None when the object has no access copy.'''
        if not self.access_copy.exists:
            return None
        url_args = [str(self.pid)]
        return reverse('video:download-compressed-video', args=url_args)

    def access_file_extension(self):
        '''Return the expected file extension for whatever type of
        compressed video datastream the current object has, based on the
        datastream mimetype; unrecognized mimetypes fall back to ``mp4``.
        Returns None when the object has no access copy.'''
        access_copy = self.access_copy
        if not access_copy.exists:
            return None
        return self.allowed_access_mimetypes.get(access_copy.mimetype, 'mp4')

    @property
    def researcher_access(self):
        '''Result of ``allow_researcher_access`` for this object's rights
        metadata.'''
        rights_xml = self.rights.content
        return allow_researcher_access(rights_xml)

    def _update_dc(self):
        '''Update Dublin Core (derivative metadata) based on master metadata
        from MODS and rights metadata in order to keep data synchronized
        and make fields that need to be searchable accessible to the
        Fedora findObjects API method.

        Sets the DC title and type (and the object label) from MODS, and
        rebuilds the DC identifier, creator, date, description, and rights
        lists from scratch.
        '''
        dc_xml = self.dc.content

        # identifiers: clear out any existing values
        del dc_xml.identifier_list

        mods_xml = self.mods.content

        # title (the object label is kept in sync as well)
        title = mods_xml.title
        if title:
            self.label = title
            dc_xml.title = title
        resource_type = mods_xml.resource_type
        if resource_type:
            dc_xml.type = resource_type

        # creator names: clear out any existing names, then add one per
        # MODS name
        del dc_xml.creator_list
        for name in mods_xml.names:
            # for now, use unicode conversion as defined in mods.Name
            dc_xml.creator_list.append(unicode(name))

        # dates: clear out anything previously in DC, then add the first
        # created date followed by the first issued date, when present
        del dc_xml.date_list
        origin_info = mods_xml.origin_info
        if origin_info:
            for dates in (origin_info.created, origin_info.issued):
                if len(dates) and dates[0].date:
                    dc_xml.date_list.append(dates[0].date)

        # description: clear out previous values and set from the MODS
        # general note
        del dc_xml.description_list
        general_note = mods_xml.general_note
        if general_note and general_note.text:
            dc_xml.description_list.append(general_note.text)

        # rights: clear out previous values and set from Rights accessStatus
        del dc_xml.rights_list
        access_status = self.rights.content.access_status
        if access_status:
            # access code no longer needs to be included, since we will not be searching
            dc_xml.rights_list.append(access_status.text)

    def index_data(self):
        '''Build the index data for this object: the result of the
        default :meth:`eulfedora.models.DigitalObject.index_data`
        extended with the fields specific to Keep Video objects.

        :returns: dictionary of index field names and values
        '''
        # NOTE: other objects may not be indexed in Solr yet, so nothing
        # here should query Solr to find related object information

        index = super(Video, self).index_data()
        index['object_type'] = 'video'

        if self.collection and self.collection.exists:
            # 0 is an allowable source id, so compare against None
            source_id = self.collection.mods.content.source_id
            if source_id is not None:
                index['collection_source_id'] = source_id
            index['collection_id'] = self.collection.pid
            try:
                # load the collection object directly from fedora to get
                # its label
                collection = CollectionObject(self.api, self.collection.uri)
                index['collection_label'] = collection.label
            except RequestFailed as err:
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s'
                    % err)

        mods = self.mods.content

        # resolvable ARK, when there is one
        ark_uri = mods.ark_uri
        if ark_uri:
            index['ark_uri'] = ark_uri

        # TODO: further sections may be needed if more metadata is added
        # old identifiers from previous digital masters
        old_ids = [old_id for old_id in (mods.dm1_id, mods.dm1_other_id)
                   if old_id]
        if old_ids:
            index['dm1_id'] = old_ids

        # digitization purpose, if not empty; the node list is copied
        # into a plain list so that it can be serialized as json
        digitaltech = self.digitaltech.content
        purposes = digitaltech.digitization_purpose_list
        if purposes:
            index['digitization_purpose'] = list(purposes)

        # sublocation
        sublocation = self.sourcetech.content.sublocation
        if sublocation:
            index['sublocation'] = sublocation

        # rights metadata: access status code, copyright date, ip note
        rights = self.rights.content
        access_status = rights.access_status
        if access_status:
            index['access_code'] = access_status.code
        copyright_date = rights.copyright_date
        if copyright_date:
            index['copyright_date'] = copyright_date
        ip_note = rights.ip_note
        if ip_note:
            index['ip_note'] = ip_note

        # boolean values that are always included:
        # whether researchers may access the item, and which
        # datastreams are available
        index['researcher_access'] = bool(self.researcher_access)
        index['has_access_copy'] = self.access_copy.exists
        index['has_original'] = self.content.exists

        if self.access_copy.exists:
            index['access_copy_size'] = self.access_copy.info.size
            index['access_copy_mimetype'] = self.access_copy.mimetype

        duration = digitaltech.duration
        if duration:
            index['duration'] = duration

        # issued and created dates from MODS origin info, when the
        # corresponding list is present and not empty
        for index_key, date_field in (('date_issued', 'issued'),
                                      ('date_created', 'created')):
            origin = mods.origin_info
            dates = getattr(origin, date_field) if origin else None
            if dates and not dates.is_empty():
                index[index_key] = [unicode(d) for d in dates]

        # master video format and size
        premis_object = self.provenance.content.object
        if premis_object and premis_object.format:
            index['content_format'] = premis_object.format.name
        index['content_size'] = self.content.size

        return index

    @staticmethod
    def init_from_file(master_filename,
                       initial_label=None,
                       request=None,
                       master_md5_checksum=None,
                       master_sha1_checksum=None,
                       master_location=None,
                       master_mimetype=None,
                       access_filename=None,
                       access_location=None,
                       access_md5_checksum=None,
                       access_mimetype=None):
        '''Static method to create a new :class:`Video` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.  Calculates and stores the duration
        based on the file. Also sets the following default metadata values:

            * mods:typeOfResource = "moving image"

        The access copy datastream is only populated when an access
        location or an access filename is supplied.

        :param master_filename: full path to the master file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param master_md5_checksum: the MD5 checksum of the master file being sent to fedora.
        :param master_sha1_checksum: the sha-1 checksum of the master file being
            sent to fedora; calculated from the file when not specified
        :param master_location: optional file URI for file-based Fedora ingest of master file
        :param master_mimetype: the mimetype of the master file being sent to fedora
        :param access_filename: full path to the access file, as a string
        :param access_location: optional file URI for file-based Fedora ingest
            of the access file
        :param access_md5_checksum: the MD5 checksum of the access file being sent to fedora.
        :param access_mimetype: the mimetype of the access file being sent to fedora
        :returns: :class:`Video` initialized from the file
        :raises Exception: if the duration cannot be determined from the
            master file
        '''

        if initial_label is None:
            initial_label = os.path.basename(master_filename)
        repo = Repository(request=request)
        obj = repo.get_object(type=Video)
        # set initial object label from the base master_filename
        obj.label = initial_label
        obj.dc.content.title = obj.mods.content.title = obj.label
        # Set the file checksum, if set.
        obj.content.checksum = master_md5_checksum
        # set content datastream mimetype if passed in
        if master_mimetype is not None:
            obj.content.mimetype = master_mimetype
        # datastream label is the initial label minus the extension
        # (the mimetype already indicates the file type)
        obj.content.label = initial_label.rsplit('.', 1)[0]
        # set initial mods:typeOfResource - all videos default to moving image
        obj.mods.content.resource_type = 'moving image'
        # get duration and store in digital tech metadata
        try:
            info = MediaInfo.parse(master_filename)
            # divide by a float: the value is rounded below, and integer
            # division would truncate it first
            duration = info.tracks[0].duration / 1000.0
        except Exception:
            # deliberately not a bare except, so that KeyboardInterrupt
            # and SystemExit are not converted into an ingest error
            raise Exception('Error getting video duration')

        obj.digitaltech.content.duration = '%d' % round(duration)

        # premis data
        obj.provenance.content.create_object()
        obj.provenance.content.object.id_type = 'ark'
        obj.provenance.content.object.id = ''

        obj.provenance.content.object.type = 'p:file'
        obj.provenance.content.object.checksums.append(
            PremisFixity(algorithm='MD5'))
        obj.provenance.content.object.checksums[0].digest = master_md5_checksum

        # calculate the SHA-1 from the master file when it was not supplied
        if master_sha1_checksum is None:
            master_sha1_checksum = sha1sum(master_filename)
        obj.provenance.content.object.checksums.append(
            PremisFixity(algorithm='SHA-1'))
        obj.provenance.content.object.checksums[
            1].digest = master_sha1_checksum

        obj.provenance.content.object.create_format()
        # format name will be upper-cased version of file extension
        obj.provenance.content.object.format.name = master_filename.rsplit(
            '.', 1)[1].upper()

        # if a content URI is specified (e.g. for large files), use that
        if master_location is not None:
            obj.content.ds_location = master_location
        # otherwise set the file as content to be posted
        else:
            # binary mode, since this is video data; the file object is
            # handed to the datastream and intentionally left open here
            obj.content.content = open(master_filename, 'rb')

        # Access copy data: skipped entirely when neither an access URI
        # nor an access file was supplied
        if access_location is not None or access_filename is not None:
            # if an access URI is specified (e.g. for large files), use that
            if access_location is not None:
                obj.access_copy.ds_location = access_location
            # otherwise set the access file as content to be posted
            else:
                obj.access_copy.content = open(access_filename, 'rb')

            obj.access_copy.mimetype = access_mimetype
            obj.access_copy.checksum = access_md5_checksum
            obj.access_copy.label = initial_label

        return obj

    @staticmethod
    def init_from_bagit(path, request=None, file_uri=True):
        '''Static method to create a new :class:`Video` instance from
        a BagIt.  Sets the object label and metadata title based on the
        name of the bag, and looks for supported video file types
        to use as the master and access content for the object.
        Content checksums are pulled from the BagIt metadata, and repository
        ingest will be done via file URIs based on configured
        **LARGE_FILE_STAGING_DIR** and **LARGE_FILE_STAGING_FEDORA_DIR**
        to better support ingesting large files (unless file_uri
        is False).

        Raises an exception if BagIt is not valid or if it does not
        contain a supported master video data file.  (Note: using fast
        validation without checksum calculation, to minimize the time
        required to ingest large files.)

        :param path: full path to the BagIt directory that contains
            a video file
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param file_uri: ingest BagIt data via file uris based on
            configured staging directories (default behavior)
            instead of uploading the content to Fedora

        :returns: :class:`Video` initialized from the BagIt contents
        :raises bagit.BagValidationError: if the bag is not valid
        :raises Exception: if a payload file has no MD5 or SHA-1 entry in
            the bag, or if no supported master video file is found
        '''

        def staged_file_uri(local_path):
            # Build the file:// URI to be used for ingest.  If the Fedora
            # base path is different from the locally mounted staging
            # directory, convert from the local path to the fedora server
            # path.  The substitution is done before url-quoting so that
            # it still matches when the staging directory contains
            # characters that quoting would escape.
            fedora_dir = getattr(settings, 'LARGE_FILE_STAGING_FEDORA_DIR',
                                 None)
            if fedora_dir is not None:
                local_path = local_path.replace(
                    settings.LARGE_FILE_STAGING_DIR, fedora_dir)
            return 'file://%s' % urllib.quote(local_path)

        bag = bagit.Bag(path)
        # NOTE: using fast validation here to avoid recalculating checksums
        # for very large files; only checksum compare will be done by fedora
        bag.validate(fast=True)  # raises bagit.BagValidationError if not valid

        # use the base name of the BagIt as initial object label;
        # normalize first so a trailing slash does not give an empty name
        bag_name = os.path.basename(os.path.normpath(path))

        # used to identify video content files within the bag
        m = magic.Magic(mime=True)

        opts = {'request': request, 'initial_label': bag_name}

        # loop through bag content looking for supported video files
        for data_path in bag.payload_files():
            # path is relative to bag root dir
            filename = os.path.join(path, data_path)
            # keep only the part of the detected type before any ';'
            mimetype = m.from_file(filename).partition(';')[0]

            # require both MD5 and SHA-1 for video to ingest
            # NOTE(review): these errors are raised when the bag has no
            # checksum entry for the file, although the messages say
            # "mismatch"; wording left unchanged in case it is relied on.
            try:
                md5_checksum = bag.entries[data_path]['md5']
            except KeyError:
                raise Exception('MD5 checksum mismatch on file %s' % data_path)
            try:
                sha1_checksum = bag.entries[data_path]['sha1']
            except KeyError:
                raise Exception('SHA-1 checksum mismatch on file %s' %
                                data_path)

            if mimetype in Video.allowed_master_mimetypes:
                opts['master_filename'] = filename
                opts['master_md5_checksum'] = md5_checksum
                opts['master_sha1_checksum'] = sha1_checksum
                opts['master_mimetype'] = mimetype
                if file_uri:
                    opts['master_location'] = staged_file_uri(filename)

            elif mimetype in Video.allowed_access_mimetypes:
                opts['access_filename'] = filename
                opts['access_md5_checksum'] = md5_checksum
                opts['access_mimetype'] = mimetype
                if file_uri:
                    opts['access_location'] = staged_file_uri(filename)

        # no Video found
        if 'master_filename' not in opts:
            raise Exception('No Video content found in %s' % bag_name)

        return Video.init_from_file(**opts)

    def old_dm_media_path(self):
        '''Return the old digital masters media path for this object.

        The path is built from the collection's own
        ``old_dm_media_path()`` and this object's old identifier
        (``dm1_other_id`` is preferred over ``dm1_id``).  Returns
        ``None`` when the object has no old identifier, no collection
        object, or the collection has no old media path.'''
        mods = self.mods.content
        legacy_id = mods.dm1_other_id or mods.dm1_id
        if not legacy_id:
            return None

        collection = self._collection_object()
        if not collection:
            return None

        base_path = collection.old_dm_media_path()
        if not base_path:
            return None

        # NOTE(review): '.m4a' on a video path looks carried over from
        # audio - confirm this is the intended extension
        return '%svideo/%s.m4a' % (base_path, legacy_id)

    def _collection_object(self):
        '''Return ``self.collection``; used by :meth:`old_dm_media_path`
        to look up the collection's old media path.'''
        parent_collection = self.collection
        return parent_collection