class FileObject(DigitalObject):
    """Digital object using the ``islandora:sp_pdf`` content model.

    Declares two versioned file datastreams: ``OBJ`` (binary content)
    and ``MODS`` (MODS record).
    """

    PDF_CONTENT_MODEL = 'info:fedora/islandora:sp_pdf'
    CONTENT_MODELS = [PDF_CONTENT_MODEL]

    # binary payload datastream
    file = FileDatastream(
        "OBJ",
        "Binary datastream",
        defaults=dict(versionable=True))

    # MODS record, stored as a plain file datastream
    mods = FileDatastream(
        "MODS",
        "Mods record for this object.",
        defaults=dict(versionable=True))
class TestPdfObject(DigitalObject):
    """Digital object with a single, unversioned ``PDF`` file datastream."""

    pdf = FileDatastream(
        "PDF",
        "PDF document",
        defaults=dict(versionable=False, mimetype='application/pdf'))
def get_fedora_proxy_class(dsid):
    """Build a ``FedoraProxyObject`` class exposing one file datastream.

    :param dsid: datastream id used for the generated ``DATASTREAM``
        attribute
    :returns: a newly created :class:`DigitalObject` subclass named
        ``FedoraProxyObject``
    """
    content_model = "info:fedora/genrepo:File-1.0"
    datastream = FileDatastream(
        dsid, "Binary datastream", defaults={"versionable": True})
    class_attrs = {
        "FILE_CONTENT_MODEL": content_model,
        "CONTENT_MODELS": [content_model],
        "DATASTREAM": datastream,
    }
    return type("FedoraProxyObject", (DigitalObject,), class_attrs)
class FileObject(DigitalObject):
    """An opaque file for repositing on behalf of a user.

    Inherits the standard Dublin Core and RELS-EXT datastreams from
    :class:`~eulcore.fedora.models.DigitalObject`, and adds both a
    ``master`` datastream to contain the user's file as well as a content
    model for identifying these objects.
    """

    CONTENT_MODELS = [AccessibleObject.PUBLIC_ACCESS_CMODEL]
    view_template = 'file/view.html'

    @property
    def default_pidspace(self):
        # Read from settings on every access (rather than once at import)
        # so pids are minted in whatever pidspace is configured right now,
        # e.g. when the setting is changed for tests.
        return getattr(settings, 'FEDORA_PIDSPACE', None)

    #: reposited master :class:`~eulcore.fedora.models.FileDatastream`
    master = FileDatastream(
        "master",
        "reposited master file",
        defaults=dict(versionable=True))

    def _get_oai_id(self):
        rels = self.rels_ext.content
        return rels.value(subject=self.uriref, predicate=rdfns.oai.itemID)

    def _set_oai_id(self, value):
        if value is None:
            # assigning None removes the value
            self._del_oai_id()
            return
        # set() updates/replaces any oai item id (only one allowed)
        rels = self.rels_ext.content
        rels.set((self.uriref, rdfns.oai.itemID, Literal(value)))

    def _del_oai_id(self):
        rels = self.rels_ext.content
        rels.remove((self.uriref, rdfns.oai.itemID, self.oai_id))

    oai_id = property(_get_oai_id, _set_oai_id, _del_oai_id)

    @property
    def collection(self):
        """:class:`CollectionObject` named by this object's
        ``isMemberOfCollection`` relation, or None when there is none."""
        collection_uri = self.rels_ext.content.value(
            subject=self.uriref,
            predicate=rdfns.relsext.isMemberOfCollection)
        if not collection_uri:
            return None
        # strip the URI prefix to get the bare pid
        pid = str(collection_uri).replace('info:fedora/', '')
        return CollectionObject(self.api, pid)
class SimpleDigitalObject(DigitalObject):
    """Digital object extended with datastreams for testing."""

    # NOTE: distinguish from SimpleCModel in non-django fedora unit tests
    # and use configured pidspace for automatic clean-up
    CONTENT_MODELS = [
        'info:fedora/%s:SimpleDjangoCModel' % TEST_PIDSPACE,
    ]

    # plain-text datastream
    text = Datastream(
        "TEXT",
        "Text datastream",
        defaults=dict(mimetype='text/plain'))

    # binary image datastream
    image = FileDatastream(
        'IMAGE',
        'managed binary image datastream',
        defaults=dict(mimetype='image/png'))
class AudioObject(FileObject):
    """:class:`FileObject` variant for ``audio/mpeg`` content.

    Overrides the content models, accepted content types, view template
    and ``master`` datastream.
    """

    CONTENT_MODELS = [
        'info:fedora/genrepo-demo:Audio-1.0',
        AccessibleObject.PUBLIC_ACCESS_CMODEL,
    ]
    content_types = ('audio/mpeg',)
    view_template = 'file/audio.html'

    # FIXME: versioned? checksum?
    master = FileDatastream(
        "source-audio",
        "Master audio",
        defaults=dict(mimetype='audio/mpeg'))
class FileObject(DigitalObject):
    """Digital object using the ``genrepo:File-1.0`` content model.

    Declares three versioned datastreams: ``MYDS`` (binary file),
    ``BRILMETA`` and ``PREMIS`` (metadata).
    """

    FILE_CONTENT_MODEL = 'info:fedora/genrepo:File-1.0'
    CONTENT_MODELS = [FILE_CONTENT_MODEL]

    # binary content
    file = FileDatastream(
        "MYDS",
        "Binary datastream",
        defaults=dict(versionable=True))

    # BRIL metadata
    brilmeta = Datastream(
        "BRILMETA",
        "BRIL Metadata",
        defaults=dict(versionable=True))

    # PREMIS object metadata
    PREMISmeta = Datastream(
        "PREMIS",
        "PREMIS Object Metadata",
        defaults=dict(versionable=True))
def __init__(self, *args, **kwargs):
    """Set up datastream names, the repository connection and the
    dynamically built ``Document`` object model.

    Required keyword arguments (a missing one raises :class:`KeyError`):
    ``namespace``, ``image_name``, ``transcript_name``, ``root``,
    ``username`` and ``password``.  All arguments are also forwarded to
    the parent initialiser.
    """
    super(FedoraStorage, self).__init__(*args, **kwargs)

    # datastream ids: two are configurable, the rest are fixed
    self.namespace = kwargs["namespace"]
    self.image_name = kwargs["image_name"]
    self.thumbnail_name = "THUMBNAIL"
    self.binary_name = "BINARY"
    self.script_name = "OCR_SCRIPT"
    self.transcript_name = kwargs["transcript_name"]

    self.repo = Repository(
        root=kwargs["root"],
        username=kwargs["username"],
        password=kwargs["password"])

    def file_ds(dsid, label, versionable=True):
        # every datastream on the model differs only in id, label and
        # whether it is versionable
        return FileDatastream(
            dsid, label, defaults={"versionable": versionable})

    content_model = "info:fedora/genrepo:File-1.0"
    model_attrs = {
        "default_pidspace": kwargs["namespace"],
        "FILE_CONTENT_MODEL": content_model,
        "CONTENT_MODELS": [content_model],
    }
    # datastreams are created in the same order as they were declared
    model_attrs["image"] = file_ds(self.image_name, "Document image")
    model_attrs["binary"] = file_ds(
        self.binary_name, "Document image binary")
    model_attrs["thumbnail"] = file_ds(
        self.thumbnail_name, "Document image thumbnail")
    model_attrs["script"] = file_ds(self.script_name, "OCR Script")
    model_attrs["transcript"] = file_ds(
        self.transcript_name, "Document transcript")
    model_attrs["meta"] = file_ds(
        "meta", "Document metadata", versionable=False)

    self.model = type("Document", (DigitalObject,), model_attrs)
class AudioObject(DigitalObject):
    '''Fedora Audio Object.  Extends :class:`~eulfedora.models.DigitalObject`.'''
    AUDIO_CONTENT_MODEL = 'info:fedora/emory-control:EuterpeAudio-1.0'
    CONTENT_MODELS = [AUDIO_CONTENT_MODEL]
    NEW_OBJECT_VIEW = 'audio:view'

    allowed_mimetypes = ['audio/x-wav', 'audio/wav']

    mods = XmlDatastream("MODS", "MODS Metadata", AudioMods, defaults={
        'control_group': 'M',
        'format': mods.MODS_NAMESPACE,
        'versionable': True,
    })
    'MODS :class:`~eulfedora.models.XmlDatastream` with content as :class:`AudioMods`'

    audio = FileDatastream("AUDIO", "Audio datastream", defaults={
        'mimetype': 'audio/x-wav',
        'versionable': True,
    })
    'master audio :class:`~eulfedora.models.FileDatastream`'

    compressed_audio = FileDatastream("CompressedAudio", "Compressed audio datastream", defaults={
        'mimetype': 'audio/mpeg',
        'versionable': True,
    })
    'access copy of audio :class:`~eulfedora.models.FileDatastream`'

    digitaltech = XmlDatastream("DigitalTech", "Technical Metadata - Digital", DigitalTech,
        defaults={
            'control_group': 'M',
            'versionable': True,
        })
    '''digital technical metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`DigitalTech`'''

    sourcetech = XmlDatastream("SourceTech", "Technical Metadata - Source", SourceTech,
        defaults={
            'control_group': 'M',
            'versionable': True,
        })
    '''source technical metadata :class:`~eulfedora.models.XmlDatastream` with content as
    :class:`SourceTech`'''

    rights = XmlDatastream("Rights", "Usage rights and access control metadata", Rights,
        defaults={
            'control_group': 'M',
            'versionable': True,
        })
    '''access control metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`Rights`'''

    jhove = FileDatastream("JHOVE", "JHOVE datastream", defaults={
        'mimetype': 'application/xml',
        'control_group': 'M',
        'versionable': True,
        'format': 'http://hul.harvard.edu/ois/xml/xsd/jhove/jhove.xsd',
    })
    'JHOVE technical metadata for the master audio :class:`~eulfedora.models.FileDatastream`'
    # JHOVE is xml, but treat it as a file for now since we're just
    # storing it, not doing any processing, updating, etc.

    # map datastream IDs to human-readable names for inherited history_events method
    component_key = {
        'AUDIO': 'audio (master)',
        'CompressedAudio': 'audio (access version)',
        'SourceTech': 'source technical metadata',
        'DigitalTech': 'digital technical metadata',
        'JHOVE': 'technical metadata',
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT': 'collection membership',  # TODO: revise when/if we add more relations
    }

    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject` that this object is a member of,
    via `isMemberOfCollection` relation.
    '''

    @property
    def content_md5(self):
        'Checksum of the master audio datastream.'
        return self.audio.checksum

    def save(self, logMessage=None):
        '''Save the object.  If the object is new, or the content of any of
        the :attr:`mods`, :attr:`rels_ext`, :attr:`digitaltech` or
        :attr:`rights` datastreams has been changed, the DC will be updated
        and saved as well.  The object label is also kept in sync with a
        modified, non-empty MODS title.

        :param logMessage: optional log message
        :returns: result of the inherited ``save`` method
        '''
        if not self.exists or self.mods.isModified() or self.rels_ext.isModified() or \
                self.digitaltech.isModified() or self.rights.isModified():
            # DC is derivative metadata based on MODS/RELS-EXT/Digital Tech
            # If this is a new item (does not yet exist in Fedora)
            # OR if any of the relevant datastreams have changed, update DC
            self._update_dc()

        # for now, keep object label in sync with MODS title
        if self.mods.isModified() and self.mods.content.title:
            self.label = self.mods.content.title

        return super(AudioObject, self).save(logMessage)

    @models.permalink
    def get_absolute_url(self):
        'Absolute url to view this object within the site'
        # use the class constant so the view name is defined in one place
        return (AudioObject.NEW_OBJECT_VIEW, [str(self.pid)])

    def get_access_url(self):
        '''Absolute url to hear this object's access version, or None when
        the object has no compressed audio datastream.'''
        if self.compressed_audio.exists:
            return reverse('audio:download-compressed-audio',
                           args=[str(self.pid), self.access_file_extension()])
        # as of file migration (1.2), legacy DM access path is no longer needed

    def access_file_extension(self):
        '''Return the expected file extension for whatever type of
        compressed audio datastream the current object has (if it has
        one), based on the datastream mimetype.  Currently, compressed
        audio could be MP3 or M4A/MP4.  Returns None when there is no
        compressed audio or its mimetype is not one of those two.'''
        if self.compressed_audio.exists:
            if self.compressed_audio.mimetype == 'audio/mpeg':
                return 'mp3'
            if self.compressed_audio.mimetype == 'audio/mp4':
                return 'm4a'

    @property
    def conversion_result(self):
        '''Return the :class:`~eulcommon.djangoextras.taskresult.models.TaskResult`
        for the most recently requested access copy conversion (if any).
        '''
        conversions = TaskResult.objects.filter(
            object_id=self.pid).order_by('-created')
        if conversions:
            return conversions[0]

    @property
    def researcher_access(self):
        'Result of :meth:`allow_researcher_access` for the rights metadata.'
        return allow_researcher_access(self.rights.content)

    def _update_dc(self):
        '''Update Dublin Core (derivative metadata) based on master metadata
        from MODS, RELS-EXT, and digital tech metadata in order to keep data
        synchronized and make fields that need to be searchable accessible to
        Fedora findObjects API method.
         '''
        # identifiers
        del(self.dc.content.identifier_list)        # clear out any existing identifiers

        # title
        if self.mods.content.title:
            self.label = self.mods.content.title
            self.dc.content.title = self.mods.content.title
        if self.mods.content.resource_type:
            self.dc.content.type = self.mods.content.resource_type

        # creator names
        del(self.dc.content.creator_list)        # clear out any existing names
        for name in self.mods.content.names:
            # for now, use unicode conversion as defined in mods.Name
            self.dc.content.creator_list.append(unicode(name))

        # clear out any dates previously in DC
        del(self.dc.content.date_list)
        if self.mods.content.origin_info and \
           len(self.mods.content.origin_info.created) and \
           self.mods.content.origin_info.created[0].date:
            self.dc.content.date_list.append(
                self.mods.content.origin_info.created[0].date)
        if self.mods.content.origin_info and \
           len(self.mods.content.origin_info.issued) and \
           self.mods.content.origin_info.issued[0].date:
            self.dc.content.date_list.append(
                self.mods.content.origin_info.issued[0].date)

        # clear out any descriptions previously in DC and set from MODS/digitaltech
        del(self.dc.content.description_list)
        if self.mods.content.general_note and \
           self.mods.content.general_note.text:
            self.dc.content.description_list.append(
                self.mods.content.general_note.text)

        # clear out any rights previously in DC and set contents from Rights accessStatus
        del(self.dc.content.rights_list)
        if self.rights.content.access_status:
            # access code no longer needs to be included, since we will not be searching
            self.dc.content.rights_list.append(
                self.rights.content.access_status.text)

    def index_data(self):
        '''Extend the default
        :meth:`eulfedora.models.DigitalObject.index_data`
        method to include additional fields specific to Keep
        Audio objects.

        :returns: dictionary of index data
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info

        # FIXME: is it worth splitting out descriptive index data here?
        data = super(AudioObject, self).index_data()
        data['object_type'] = 'audio'
        if self.collection and self.collection.exists:

            # collection_source_id  (0 is an allowable id, so check not None)
            if self.collection.mods.content.source_id is not None:
                data['collection_source_id'] = self.collection.mods.content.source_id

            # FIXME: previously indexing URI; is this needed for any reason or can we
            # use pid?  (needs to match collection index pid field for solr join)
            # data['collection_id'] = self.collection.uri
            data['collection_id'] = self.collection.pid
            try:
                # pull parent & archive collection objects directly from fedora
                parent = CollectionObject(self.api, self.collection.uri)
                data['collection_label'] = parent.label
                # NB: as of 2011-08-23, eulindexer doesn't support automatic
                # reindexing of audio objects when their collection changes.
                # as a result, archive_id and archive_label may be stale.
                # disable indexing them until eulindexer supports those
                # chained updates.
                #data['archive_id'] = parent.collection_id
                #archive = CollectionObject(self.api, parent.collection_id)
                #data['archive_label'] = archive.label
            except RequestFailed as rf:
                # best effort: index without the collection label
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s',
                    rf)

        # include resolvable ARK if available
        if self.mods.content.ark_uri:
            data['ark_uri'] = self.mods.content.ark_uri

        # old identifiers from previous digital masters
        dm1_ids = []
        if self.mods.content.dm1_id:
            dm1_ids.append(self.mods.content.dm1_id)
        if self.mods.content.dm1_other_id:
            dm1_ids.append(self.mods.content.dm1_other_id)
        if dm1_ids:
            data['dm1_id'] = dm1_ids

        # digitization purpose, if not empty
        if self.digitaltech.content.digitization_purpose_list:
            # convert nodelist to a normal list that can be serialized as json
            data['digitization_purpose'] = list(
                self.digitaltech.content.digitization_purpose_list)

        # related files
        if self.sourcetech.content.related_files_list:
            data['related_files'] = list(
                self.sourcetech.content.related_files_list)

        # part note
        if self.mods.content.part_note and self.mods.content.part_note.text:
            data['part'] = self.mods.content.part_note.text

        # sublocation
        if self.sourcetech.content.sublocation:
            data['sublocation'] = self.sourcetech.content.sublocation

        # rights access status code
        if self.rights.content.access_status:
            data['access_code'] = self.rights.content.access_status.code
        # copyright date from rights metadata
        if self.rights.content.copyright_date:
            data['copyright_date'] = self.rights.content.copyright_date
        # ip note from rights metadata
        if self.rights.content.ip_note:
            data['ip_note'] = self.rights.content.ip_note

        # boolean values that should always be available
        data.update({
            # should this item be accessible to researchers?
            'researcher_access': bool(self.researcher_access),  # if None, we want False
            # flags to indicate which datastreams are available
            'has_access_copy': self.compressed_audio.exists,
            'has_original': self.audio.exists,
        })

        if self.compressed_audio.exists:
            data.update({
                'access_copy_size': self.compressed_audio.size,
                'access_copy_mimetype': self.compressed_audio.mimetype,
            })

        if self.digitaltech.content.duration:
            data['duration'] = self.digitaltech.content.duration

        if self.mods.content.origin_info and \
           self.mods.content.origin_info.issued \
           and not self.mods.content.origin_info.issued.is_empty():
            data['date_issued'] = [
                unicode(di) for di in self.mods.content.origin_info.issued]
        if self.mods.content.origin_info and \
           self.mods.content.origin_info.created \
           and not self.mods.content.origin_info.created.is_empty():
            data['date_created'] = [
                unicode(di) for di in self.mods.content.origin_info.created]

        if self.audio.exists:
            data['content_md5'] = self.audio.checksum

        return data

    @staticmethod
    def init_from_file(filename, initial_label=None, request=None, checksum=None,
                       mimetype=None):
        '''Static method to create a new :class:`AudioObject` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.  Calculates and stores the duration
        based on the file. Also sets the following default metadata values:

            * mods:typeOfResource = "sound recording"
            * dt:codecQuality = "lossless"

        :param filename: full path to the audio file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used
        :param request: :class:`django.http.HttpRequest` passed into a view method;
            must be passed in order to connect to Fedora as the currently-logged
            in user
        :param checksum: the checksum of the file being sent to fedora.
        :param mimetype: optional mimetype for the master audio datastream;
            the datastream default is kept when not specified
        :returns: :class:`AudioObject` initialized from the file
        '''
        if initial_label is None:
            initial_label = os.path.basename(filename)
        repo = Repository(request=request)
        obj = repo.get_object(type=AudioObject)
        # set initial object label from the base filename
        obj.label = initial_label
        obj.dc.content.title = obj.mods.content.title = obj.label
        # audio is binary data, so open in binary mode to avoid any
        # newline translation or text decoding of the content
        obj.audio.content = open(filename, 'rb')  # FIXME: at what point does/should this get closed?
        # Set the file checksum, if set.
        obj.audio.checksum = checksum
        # set content datastream mimetype if passed in
        if mimetype is not None:
            obj.audio.mimetype = mimetype
        # Datastream label is the initial label minus the ".wav" (mimetype
        # indicates that); only strip when the suffix is actually present
        # so labels that are not filenames are not truncated
        if initial_label.lower().endswith('.wav'):
            obj.audio.label = initial_label[:-4]
        else:
            obj.audio.label = initial_label

        # set initial mods:typeOfResource - all AudioObjects default to sound recording
        obj.mods.content.resource_type = 'sound recording'
        # set codec quality to lossless in digital tech metadata
        # - default for AudioObjects, should only accept lossless audio for master file
        obj.digitaltech.content.codec_quality = 'lossless'
        # get wav duration and store in digital tech metadata
        obj.digitaltech.content.duration = '%d' % round(wav_duration(filename))

        return obj

    @staticmethod
    def all():
        'Find all Audio objects by content model within the configured pidspace.'
        search_opts = {
            'type': AudioObject,
            # restrict to objects in configured pidspace
            'pid__contains': '%s:*' % settings.FEDORA_PIDSPACE,
            # restrict by cmodel in dc:format
            'format__contains': AudioObject.AUDIO_CONTENT_MODEL,
        }
        repo = Repository()
        return repo.find_objects(**search_opts)
class ImageObject(FileObject):
    """:class:`FileObject` variant for images, served through the
    ``genrepo-demo:DjatokaImageService`` disseminator."""

    CONTENT_MODELS = [
        'info:fedora/genrepo-demo:Image-1.0',
        AccessibleObject.PUBLIC_ACCESS_CMODEL,
    ]
    IMAGE_SERVICE = 'genrepo-demo:DjatokaImageService'

    content_types = (
        'image/jpeg',
        'image/jp2',
        'image/gif',
        'image/bmp',
        'image/png',
        'image/tiff',
    )
    view_template = 'file/image.html'

    # DC & RELS-EXT inherited; override master
    # FIXME: versioned? checksum?
    master = FileDatastream(
        "source-image",
        "Master TIFF image",
        defaults=dict(mimetype='image/tiff'))

    has_preview = True

    def get_preview_image(self):
        """Return the ``getRegion`` dissemination at level 1."""
        preview_params = {'level': 1}
        return self.getDissemination(
            self.IMAGE_SERVICE, 'getRegion', params=preview_params)

    def get_region(self, params):
        """Expose the djatoka ``getRegion`` method (used for djatoka
        seadragon deep zoom)."""
        return self.getDissemination(
            self.IMAGE_SERVICE, 'getRegion', params=params)

    # per-instance cache for image_metadata; None until first loaded
    _image_metadata = None

    @property
    def image_metadata(self):
        'Image metadata as returned by Djatoka getMetadata method (width, height, etc.)'
        if self._image_metadata is not None:
            return self._image_metadata
        # getDissemination returns a tuple of result, url; the result is
        # the JSON document produced by djatoka
        response = self.getDissemination(self.IMAGE_SERVICE, 'getMetadata')
        self._image_metadata = json.loads(response[0])
        return self._image_metadata

    # expose width & height from image metadata as properties
    @property
    def width(self):
        metadata = self.image_metadata
        return metadata['width']

    @property
    def height(self):
        metadata = self.image_metadata
        return metadata['height']

    def deepzoom_info(self):
        """Return a :class:`DziImage` built from this image's width and
        height."""
        img_width = self.width
        img_height = self.height
        return DziImage(
            tilesize=256,
            overlap=1,
            format='jpg',
            width=img_width,
            height=img_height)
class DiskImage(DigitalObject): '''Fedora object for Disk Images. Extends :class:`~keep.common.fedora.DigitalObject`. ''' # NOTE about datastream naming conventions # Where a corresponding datastream id already exists within the Keep # (i.e. MODS for mods metadata), the same datastream id will be used # Where a Keep datastream id does not already exist (e.g., Premis), following # Hydra content model conventions, based on generic simple Hydra content model # For documentation on Hydra content models, see: # https://wiki.duraspace.org/display/hydra/Hydra+objects%2C+content+models+%28cModels%29+and+disseminators # https://wiki.duraspace.org/display/hydra/Hydra+objects%2C+content+models+%28cModels%29+and+disseminators#Hydraobjects%2Ccontentmodels%28cModels%29anddisseminators-genericContent DISKIMAGE_CONTENT_MODEL = 'info:fedora/emory-control:DiskImage-1.0' CONTENT_MODELS = [DISKIMAGE_CONTENT_MODEL] NEW_OBJECT_VIEW = 'file:view' diskimage_mimetypes = [ 'application/x-aff', # AFF, advanced forensic format 'application/x-ad1', # AD1, proprietary disk image format 'application/x-iso9660-image', # ISO 'application/x-ewf', # E01 Expert Witness Format 'application/x-tar', # tar file 'application/mbox' # mbox (? may require extra magic file entries) ] # mapping of mimetype to format label to insert in Premis mimetype_format = { 'application/x-aff': 'AFF', 'application/x-ad1': 'AD1', 'application/x-iso9660-image': 'ISO', 'application/x-ewf': 'E01', 'application/x-tar': 'TAR', 'application/mbox': 'MBOX' } allowed_mimetypes = ['', 'application/octet-stream'] + diskimage_mimetypes # NOTE: empty type and application/octet-stream are required for javascript upload, # because browser does not detect any mimetype at all for AFF and AD1 files # and detects ISO as the generic application/octet-stream # NOTE: Mimetypes for AD1 and AFF are custom mimetypes and must be configured # in your local magic files. See the deploy notes for more information. 
collection = Relation(relsext.isMemberOfCollection, type=CollectionObject) ''':class:`~keep.collection.models.CollectionObject that this object belongs to, via `isMemberOfCollection` relation. ''' #: original DiskImage object that this DiskImage is related to, if #: this is a migrated object; related via fedora-rels-ext isDerivationOf original = Relation(relsext.isDerivationOf, type='self') #: migrated DiskImage object that supercedes this object, if a #: migration has occurred; related via fedora-rels-ext hasDerivation migrated = Relation(relsext.hasDerivation, type='self') mods = XmlDatastream("MODS", "MODS Metadata", DiskImageMods, defaults={ 'control_group': 'M', 'format': mods.MODS_NAMESPACE, 'versionable': True, }) '''descriptive metadata as MODS - :class:`~eulfedora.models.XmlDatastream` with content as :class:`LocalMods`''' # note: using base local mods for now; may need to extend for disk images content = FileDatastream("content", "Master disk image file", defaults={ 'versionable': False, }) 'master disk image binary content as :class:`~eulfedora.models.FileDatastream`' # NOTE: could be one of a few allowed mimetypes rights = XmlDatastream("Rights", "Usage rights and access control metadata", Rights, defaults={ 'control_group': 'M', 'versionable': True, }) '''access control metadata :class:`~eulfedora.models.XmlDatastream` with content as :class:`Rights`''' provenance = XmlDatastream('provenanceMetadata', 'Provenance metadata', DiskImagePremis, defaults={'versionable': False}) '''``provenanceMetadata`` datastream for PREMIS object metadata; datastream XML content will be an instance of :class:`eulxml.xmlmap.premis.Premis`.''' # map datastream IDs to human-readable names for inherited history_events method component_key = { 'MODS': 'descriptive metadata', 'DC': 'descriptive metadata', 'Rights': 'rights metadata', 'RELS-EXT': 'collection membership or last fixity check', # TODO: revise as we add more relations 'provenanceMetadata': 'provenance metadata', } 
def get_default_pid(self): # extend common default pid logic in to also set ARK identifier # in the premis object pid = super(DiskImage, self).get_default_pid() if self.mods.content.ark: self.provenance.content.object.id = self.mods.content.ark self.provenance.content.object.id_type = 'ark' return pid @property def has_supplemental_content(self): '''Boolean to indicate if this disk image object has any supplemental file datastreams. .. Note:: only works on saved objects ''' return any( dsid.startswith('supplement') for dsid in self.ds_list.keys()) @property def supplemental_content(self): '''Generator for supplemental content datastreams''' for dsid in self.ds_list.keys(): if dsid.startswith('supplement'): yield self.getDatastreamObject(dsid) _content_checksum = None '''Used as a workaround for Fedora 3.4 issue with file URIs and checksums and to support duplicate detection based on checksums, store content checksum without sending it to Fedora.''' @property def content_md5(self): return self._content_checksum or self.content.checksum # NOTE: auto-calculated information such as checksums stored in premis # will need to be updated anytime the master disk image datastream is updated # (will probably need to extend the save method for this) def save(self, logMessage=None): '''Save the object. If the content of any :class:`~AudioObject.mods`, :class:`AudioObject.rels_ext`, or :class:`AudioObject.digitaltech` datastreams have been changed, the DC will be updated and saved as well. :param logMessage: optional log message ''' if not self.exists or self.mods.isModified() or \ self.rels_ext.isModified() or self.rights.isModified(): # DC is derivative metadata. # If this is a new item (does not yet exist in Fedora) # OR if any of the relevant datastreams have changed, update it. 
self._update_dc() return super(DiskImage, self).save(logMessage) def _update_dc(self): '''Update Dublin Core (derivative metadata) based on master metadata from MODS, RELS-EXT, and rights metadata in order to keep data synchronized and make fields that need to be searchable accessible to Fedora findObjects API method. ''' # NOTE: borrowed almost completely from audio, with minor modifications # TODO: move to common code somewhere? # identifiers del (self.dc.content.identifier_list) # clear out any existing names # title if self.mods.content.title: # not strictly DC, but also keep object label in sync with MODS title self.label = self.mods.content.title self.dc.content.title = self.mods.content.title if self.mods.content.resource_type: self.dc.content.type = self.mods.content.resource_type # clear out any dates previously in DC del (self.dc.content.coverage_list) if self.mods.content.coveringdate_start and \ self.mods.content.coveringdate_end: # FIXME: not sure the best way to indicate date range here self.dc.content.coverage_list.append( '%s:%s' % (self.mods.content.coveringdate_start, self.mods.content.coveringdate_end)) # clear out any descriptions previously in DC and set from MODS abstract del (self.dc.content.description_list) if self.mods.content.abstract and \ self.mods.content.abstract.text: self.dc.content.description_list.append( self.mods.content.abstract.text) # clear out any rights previously in DC and set contents from Rights accessStatus del (self.dc.content.rights_list) if self.rights.content.access_status: # set dc:rights to text of access status self.dc.content.rights_list.append( self.rights.content.access_status.text) @staticmethod def init_from_file(filename, initial_label=None, request=None, checksum=None, mimetype=None, content_location=None, sha1_checksum=None): '''Static method to create a new :class:`DiskImage` instance from a file. Sets the object label and metadata title based on the initial label specified, or file basename. 
:param filename: full path to the disk image file, as a string :param initial_label: optional initial label to use; if not specified, the base name of the specified file will be used :param request: :class:`django.http.HttpRequest` passed into a view method; must be passed in order to connect to Fedora as the currently-logged in user :param checksum: the MD5 checksum of the file being sent to fedora. :param mimetype: the mimetype for the main disk image content. :param content_location: optional file URI for file-based Fedora ingest :param sha1_checksum: the SHA1 checksum of the file being sent to fedora, for storage in the PREMIS technical metadata. Note that SHA-1 will be calculated if not passed in (slow for large files). :returns: :class:`DiskImage` initialized from the file ''' # if no checksum was passed in, calculate one if checksum is None: checksum = md5sum(filename) basename, ext = os.path.splitext(os.path.basename(filename)) # ajax upload passes original filename as initial label if initial_label is not None: # if initial label looks like a file, strip off the extension # for the object name/title if initial_label.lower().endswith('.aff') or \ initial_label.lower().endswith('.ad1') or \ initial_label.lower().endswith('.iso'): basename, ext = os.path.splitext(initial_label) # NOTE: also using extension from original filename # here because in some cases (under apache?) 
uploaded file # names do not have the original extension initial_label = basename else: initial_label = basename repo = Repository(request=request) obj = repo.get_object(type=DiskImage) # set initial object label from the base filename obj.label = initial_label obj.mods.content.title = obj.label obj.dc.content.title = obj.label # set initial mods:typeOfResource - same for all Disk Images obj.mods.content.resource_type = 'software, multimedia' # set genre as born digital obj.mods.content.genres.append( mods.Genre(authority='aat', text='born digital')) # Set the file checksum obj.content.checksum = checksum # set mimetype if mimetype is None: # if no mimetype was passed in, determine from file m = magic.Magic(mime=True) mtype = m.from_file(filename) mimetype, separator, options = mtype.partition(';') obj.content.mimetype = mimetype # Set disk image datastream label to filename obj.content.label = initial_label # premis data obj.provenance.content.create_object() # NOTE: premis object id will be same as short-form ARK stored in MODS # It cannot be set until pid is minted, which will happen in get_default_pid, # but premis is order dependent so add a place-holder here obj.provenance.content.object.id_type = 'ark' obj.provenance.content.object.id = '' # object type required to be schema valid, must be in premis namespace obj.provenance.content.object.type = 'p:file' # composition level required for object characteristics; probably should be 0 (?) 
obj.provenance.content.object.composition_level = 0 # store checksums in premis: MD5 (already calculated) and SHA-1 # picky about order here too: force algorithm to be added first obj.provenance.content.object.checksums.append( PremisFixity(algorithm='MD5')) obj.provenance.content.object.checksums[0].digest = checksum # add sha-1 to checksums in premis; calculate if not passed in if sha1_checksum is None: sha1_checksum = sha1sum(filename) obj.provenance.content.object.checksums.append( PremisFixity(algorithm='SHA-1')) obj.provenance.content.object.checksums[1].digest = sha1_checksum obj.provenance.content.object.create_format() # set format based on mimetype if mimetype in DiskImage.mimetype_format: obj_format = DiskImage.mimetype_format[mimetype] else: # as a fallback, use the file extension for format obj_format = ext.upper().strip('.') obj.provenance.content.object.format.name = obj_format # if a content URI is specified (e.g. for large files), use that if content_location is not None: obj.content.ds_location = content_location # otherwise set the file as content to be posted else: obj.content.content = open(filename) # FIXME: at what point does/should this file get closed? # descriptive/technical metadata todo return obj @staticmethod def init_from_bagit(path, request=None, file_uri=True): '''Static method to create a new :class:`DiskImage` instance from a BagIt. Sets the object label and metadata title based on the name of the bag, and looks for a supported disk image file type (e.g. AFF or AD1) to use as the content datastream for the object. Content checksum is pulled from the BagIt metadata, and repository ingest will be done via file URIs based on configured **LARGE_FILE_STAGING_DIR** and **LARGE_FILE_STAGING_FEDORA_DIR** to better support ingesting large files (unless file_uri is False). Raises an exception if BagIt is not valid or if it does not contain a supported disk image data file. 
(Note: using fast validation without checksum calculation, to minimize the time required to ingest large files.) :param path: full path to the BagIt directory that contains a disk image file :param request: :class:`django.http.HttpRequest` passed into a view method; must be passed in order to connect to Fedora as the currently-logged in user :param file_uri: ingest BagIt data via file uris based on configured staging directories (default behavior) instead of uploading the content to Fedora :returns: :class:`DiskImage` initialized from the BagIt contents ''' # TODO: add optional file uri ingest flag, default to false # (mostly to allow testing) # - for all data files other than disk image, add # supplementN datastream with mimetype/filename as label/checksum # see if eulfedora getDatastreamObject can be used to init # a new/unmapped ds? bag = bagit.Bag(path) # NOTE: using fast validation here to avoid recalculating checksums # for very large files; only checksum compare will be done by fedora bag.validate(fast=True) # raises bagit.BagValidationError if not valid # use the base name of the BagIt as initial object label initial_label = os.path.basename(path) # identify disk image content file within the bag content_file = None m = magic.Magic(mime=True) supplemental_files = [] supplement_mimetypes = {} diskimage_mimetype = None # loop through bag content until we find a supported disk image file for data_path in bag.payload_files(): # path is relative to bag root dir filename = os.path.join(path, data_path) mtype = m.from_file(filename) mimetype, separator, options = mtype.partition(';') if mimetype in DiskImage.diskimage_mimetypes: checksum_err_msg = '%%s checksum not found for disk image %s' \ % os.path.basename(data_path) # require both MD5 and SHA-1 for disk image to ingest try: md5_checksum = bag.entries[data_path]['md5'] except KeyError: raise Exception(checksum_err_msg % 'MD5') try: sha1_checksum = bag.entries[data_path]['sha1'] except KeyError: raise 
Exception(checksum_err_msg % 'SHA-1') # this is the disk image content file # store file and mimetype for further initialization content_file = filename diskimage_mimetype = mimetype # any data file that is not a disk image should be assumed # to be a supplemental file else: supplemental_files.append(filename) # store the mimetype so we don't have to recalculate supplement_mimetypes[filename] = mimetype # no disk image data found if content_file is None: raise Exception('No disk image content found in %s' % os.path.basename(path)) optional_args = {} if file_uri: ingest_location = 'file://%s' % urllib.quote(content_file) # if Fedora base path is different from locally mounted staging directory, # convert from local path to fedora server path if getattr(settings, 'LARGE_FILE_STAGING_FEDORA_DIR', None) is not None: ingest_location = ingest_location.replace( settings.LARGE_FILE_STAGING_DIR, settings.LARGE_FILE_STAGING_FEDORA_DIR) optional_args['content_location'] = ingest_location img = DiskImage.init_from_file(content_file, initial_label=initial_label, checksum=md5_checksum, mimetype=diskimage_mimetype, request=request, sha1_checksum=sha1_checksum, **optional_args) i = 0 for i in range(len(supplemental_files)): sfile = supplemental_files[i] dsid = 'supplement%d' % i dsobj = img.getDatastreamObject(dsid, dsobj_type=FileDatastreamObject) dsobj.label = os.path.basename(sfile) dsobj.mimetype = supplement_mimetypes[sfile] # convert to relative path *within* the bag for BagIt metadata lookup data_path = sfile.replace(path, '').lstrip('/') dsobj.checksum = bag.entries[data_path]['md5'] logger.debug('Adding supplemental dastream %s label=%s mimetype=%s checksum=%s' % \ (dsid, dsobj.label, dsobj.mimetype, dsobj.checksum)) if file_uri: ingest_location = 'file://%s' % urllib.quote(sfile) # if Fedora base path is different from locally mounted staging directory, # convert from local path to fedora server path if getattr(settings, 'LARGE_FILE_STAGING_FEDORA_DIR', None) is not 
None: ingest_location = ingest_location.replace( settings.LARGE_FILE_STAGING_DIR, settings.LARGE_FILE_STAGING_FEDORA_DIR) dsobj.ds_location = ingest_location else: # will probably only work for small/test content dsobj.content = open(sfile).read() return img @models.permalink def get_absolute_url(self): 'Absolute url to view this object within the site' return (DiskImage.NEW_OBJECT_VIEW, [str(self.pid)]) def index_data(self): '''Extend the default :meth:`eulfedora.models.DigitalObject.index_data` method to include additional fields specific to Keep and for disk images.''' # NOTE: we don't want to rely on other objects being indexed in Solr, # so index data should not use Solr to find any related object info data = super(DiskImage, self).index_data() # FIXME: is born-digital type still needed for anything? perms? # data['object_type'] = 'born-digital' data['object_type'] = 'disk image' # set as born digital for now; eventually, we'll need to distinguish # between kinds of born digital content if self.collection and self.collection.exists: # collection_source_id (0 is an allowable id, so check not None) if self.collection.mods.content.source_id is not None: data[ 'collection_source_id'] = self.collection.mods.content.source_id data['collection_id'] = self.collection.pid data['collection_label'] = self.collection.label # include resolvable ARK if available if self.mods.content.ark_uri: data['ark_uri'] = self.mods.content.ark_uri if self.content.checksum: data['content_md5'] = self.content.checksum # copied from audio; enable once we have rights editing # # rights access status code # if self.rights.content.access_status: # data['access_code'] = self.rights.content.access_status.code # # copyright date from rights metadata # if self.rights.content.copyright_date: # data['copyright_date'] = self.rights.content.copyright_date # # ip note from rights metadata # if self.rights.content.ip_note: # data['ip_note'] = self.rights.content.ip_note if 
self.provenance.content.fixity_checks: last_fixity_check = self.provenance.content.fixity_checks[-1] data['last_fixity_check'] = last_fixity_check.date data['last_fixity_result'] = last_fixity_check.outcome # store disk image format and size # - some disk images (i.e., objects migrated from AD1/AFF) # will have two sets of object characteristics; we want the # format from the last one listed if self.provenance.content.object and \ self.provenance.content.object.latest_format: data[ 'content_format'] = self.provenance.content.object.latest_format.name data['content_size'] = self.content.size if self.original: data['original_pid'] = self.original.pid return data
class Video(DigitalObject):
    '''Fedora Video Object.  Extends :class:`~eulfedora.models.DigitalObject`.'''
    VIDEO_CONTENT_MODEL = 'info:fedora/emory-control:Video-1.0'
    CONTENT_MODELS = [VIDEO_CONTENT_MODEL]
    NEW_OBJECT_VIEW = 'video:view'

    # There are several mimetypes for MPEG files
    # (maps accepted master mimetype -> file extension)
    allowed_master_mimetypes = {
        'video/quicktime': 'mov',
        'video/x-dv': 'dv',
        'video/mpeg': 'mpg',
        'video/x-m4v': 'm4v',
        'video/x-msvideo': 'avi',
    }
    # maps accepted access-copy mimetype -> file extension
    allowed_access_mimetypes = {'video/mp4': 'mp4'}

    mods = XmlDatastream("MODS", "MODS Metadata", VideoMods, defaults={
        'control_group': 'M',
        'format': mods.MODS_NAMESPACE,
        'versionable': True,
    })
    'MODS :class:`~eulfedora.models.XmlDatastream` with content as :class:`VideoMods`'

    digitaltech = XmlDatastream("DigitalTech", "Technical Metadata - Digital",
                                VideoDigitalTech, defaults={
                                    'control_group': 'M',
                                    'versionable': True,
                                })
    '''digital technical metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`DigitalTech`'''

    content = FileDatastream("VIDEO", "Video datastream", defaults={
        'versionable': True,
    })
    'master video :class:`~eulfedora.models.FileDatastream`'

    provenance = XmlDatastream('provenanceMetadata', 'Provenance metadata',
                               VideoPremis, defaults={'versionable': False})
    '''``provenanceMetadata`` datastream for PREMIS object metadata; datastream
    XML content will be an instance of :class:`eulxml.xmlmap.premis.Premis`.'''

    access_copy = FileDatastream("CompressedVideo",
                                 "Compressed video datastream", defaults={
                                     'mimetype': 'video/mp4',
                                     'versionable': True,
                                 })
    'access copy of video :class:`~eulfedora.models.FileDatastream`'

    sourcetech = XmlDatastream("SourceTech", "Technical Metadata - Source",
                               VideoSourceTech, defaults={
                                   'control_group': 'M',
                                   'versionable': True,
                               })
    '''source technical metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`SourceTech`'''

    rights = XmlDatastream("Rights", "Usage rights and access control metadata",
                           Rights, defaults={
                               'control_group': 'M',
                               'versionable': True,
                           })
    '''access control metadata :class:`~eulfedora.models.XmlDatastream`
    with content as :class:`Rights`'''

    #
    # map datastream IDs to human-readable names for inherited history_events method
    component_key = {
        # the master datastream id declared above is "VIDEO"; the older
        # 'Video' spelling is kept so existing lookups keep working
        'VIDEO': 'video (master)',
        'Video': 'video (master)',
        'CompressedVideo': 'video (access version)',
        'SourceTech': 'source technical metadata',
        'DigitalTech': 'digital technical metadata',
        'MODS': 'descriptive metadata',
        'DC': 'descriptive metadata',
        'Rights': 'rights metadata',
        'RELS-EXT': 'collection membership',
    }

    #
    collection = Relation(relsext.isMemberOfCollection, type=CollectionObject)
    ''':class:`~keep.collection.models.CollectionObject` that this object is
    a member of, via `isMemberOfCollection` relation.
    '''

    _content_checksum = None
    '''Used as a workaround for Fedora 3.4 issue with file URIs and checksums
    and to support duplicate detection based on checksums, store
    content checksum without sending it to Fedora.'''

    @property
    def content_md5(self):
        'MD5 of the master content: locally stored checksum if set, otherwise the datastream checksum'
        return self._content_checksum or self.content.checksum

    def get_default_pid(self):
        '''Extend the inherited default pid logic to also set the ARK
        identifier in the PREMIS object, when MODS has an ARK.

        :returns: the pid returned by the parent implementation
        '''
        pid = super(Video, self).get_default_pid()

        if self.mods.content.ark:
            self.provenance.content.create_object()
            self.provenance.content.object.id = self.mods.content.ark
            self.provenance.content.object.id_type = 'ark'

        return pid

    def save(self, logMessage=None):
        '''Save the object.  If this is a new object, or the content of any
        of the :class:`~Video.mods`, :class:`Video.rels_ext`,
        :class:`Video.digitaltech` or :class:`Video.rights` datastreams
        has been changed, the DC will be updated and saved as well.

        :param logMessage: optional log message
        :returns: result of the parent ``save`` call
        '''
        if not self.exists or self.mods.isModified() or \
                self.rels_ext.isModified() or \
                self.digitaltech.isModified() or self.rights.isModified():
            # DC is derivative metadata based on MODS/RELS-EXT/Digital Tech
            # If this is a new item (does not yet exist in Fedora)
            # OR if any of the relevant datastreams have changed, update DC
            self._update_dc()

        # for now, keep object label in sync with MODS title
        if self.mods.isModified() and self.mods.content.title:
            self.label = self.mods.content.title

        return super(Video, self).save(logMessage)

    #
    @models.permalink
    def get_absolute_url(self):
        'Absolute url to view this object within the site'
        # (view name, args) pair; resolved to a URL by the permalink decorator
        return (Video.NEW_OBJECT_VIEW, [str(self.pid)])

    def get_access_url(self):
        "Absolute url to download this object's access version (None if there is no access copy)"
        if self.access_copy.exists:
            return reverse('video:download-compressed-video',
                           args=[str(self.pid)])

    def access_file_extension(self):
        '''Return the expected file extension for whatever type of
        compressed video datastream the current object has (if it has
        one), based on the datastream mimetype.  Currently, compressed
        video is MP4; unrecognized mimetypes fall back to ``mp4``.
        Returns None if there is no access copy.'''
        if self.access_copy.exists:
            return self.allowed_access_mimetypes.get(
                self.access_copy.mimetype, 'mp4')

    @property
    def researcher_access(self):
        'Result of :func:`allow_researcher_access` for this object\'s rights metadata'
        return allow_researcher_access(self.rights.content)

    def _update_dc(self):
        '''Update Dublin Core (derivative metadata) based on master metadata
        from MODS, RELS-EXT, and digital tech metadata in order to keep data
        synchronized and make fields that need to be searchable accessible to
        Fedora findObjects API method.
        '''
        mods_content = self.mods.content
        dc = self.dc.content

        # identifiers
        del dc.identifier_list  # clear out any existing identifiers

        # title
        if mods_content.title:
            self.label = mods_content.title
            dc.title = mods_content.title
        if mods_content.resource_type:
            dc.type = mods_content.resource_type

        # creator names
        del dc.creator_list  # clear out any existing names
        for name in mods_content.names:
            # for now, use unicode conversion as defined in mods.Name
            dc.creator_list.append(unicode(name))

        # clear out any dates previously in DC
        del dc.date_list
        origin_info = mods_content.origin_info
        # first created date, then first issued date, when present
        if origin_info and len(origin_info.created) and \
                origin_info.created[0].date:
            dc.date_list.append(origin_info.created[0].date)
        if origin_info and len(origin_info.issued) and \
                origin_info.issued[0].date:
            dc.date_list.append(origin_info.issued[0].date)

        # clear out any descriptions previously in DC and set from MODS/digitaltech
        del dc.description_list
        if mods_content.general_note and mods_content.general_note.text:
            dc.description_list.append(mods_content.general_note.text)

        # clear out any rights previously in DC and set contents from Rights accessStatus
        del dc.rights_list
        if self.rights.content.access_status:
            # access code no longer needs to be included, since we will not be searching
            dc.rights_list.append(self.rights.content.access_status.text)

    def index_data(self):
        '''Extend the default
        :meth:`eulfedora.models.DigitalObject.index_data`
        method to include additional fields specific to Keep
        Video objects.

        :returns: the inherited index data dictionary, extended with
            video-specific fields
        '''
        # NOTE: we don't want to rely on other objects being indexed in Solr,
        # so index data should not use Solr to find any related object info
        data = super(Video, self).index_data()
        data['object_type'] = 'video'

        if self.collection and self.collection.exists:
            # collection_source_id (0 is an allowable id, so check not None)
            if self.collection.mods.content.source_id is not None:
                data['collection_source_id'] = \
                    self.collection.mods.content.source_id
            data['collection_id'] = self.collection.pid
            try:
                # pull parent & archive collection objects directly from fedora
                parent = CollectionObject(self.api, self.collection.uri)
                data['collection_label'] = parent.label
            except RequestFailed as rf:
                # best effort: index the object without a collection label
                logger.error(
                    'Error accessing collection or archive object in Fedora: %s'
                    % rf)

        # include resolvable ARK if available
        if self.mods.content.ark_uri:
            data['ark_uri'] = self.mods.content.ark_uri

        # TODO May have to add these sections if more metadata is added
        #
        # old identifiers from previous digital masters
        dm1_ids = [
            old_id for old_id in (self.mods.content.dm1_id,
                                  self.mods.content.dm1_other_id)
            if old_id
        ]
        if dm1_ids:
            data['dm1_id'] = dm1_ids

        # digitization purpose, if not empty
        if self.digitaltech.content.digitization_purpose_list:
            # convert nodelist to a normal list that can be serialized as json
            data['digitization_purpose'] = list(
                self.digitaltech.content.digitization_purpose_list)

        # sublocation
        if self.sourcetech.content.sublocation:
            data['sublocation'] = self.sourcetech.content.sublocation

        # rights access status code
        if self.rights.content.access_status:
            data['access_code'] = self.rights.content.access_status.code
        # copyright date from rights metadata
        if self.rights.content.copyright_date:
            data['copyright_date'] = self.rights.content.copyright_date
        # ip note from rights metadata
        if self.rights.content.ip_note:
            data['ip_note'] = self.rights.content.ip_note

        # boolean values that should always be available
        data.update({
            # should this item be accessible to researchers?
            'researcher_access': bool(self.researcher_access),
            # flags to indicate which datastreams are available
            'has_access_copy': self.access_copy.exists,
            'has_original': self.content.exists,
        })

        if self.access_copy.exists:
            data.update({
                'access_copy_size': self.access_copy.info.size,
                'access_copy_mimetype': self.access_copy.mimetype,
            })

        if self.digitaltech.content.duration:
            data['duration'] = self.digitaltech.content.duration

        origin_info = self.mods.content.origin_info
        if origin_info and origin_info.issued \
                and not origin_info.issued.is_empty():
            data['date_issued'] = [unicode(di) for di in origin_info.issued]
        if origin_info and origin_info.created \
                and not origin_info.created.is_empty():
            data['date_created'] = [unicode(di) for di in origin_info.created]

        # store master video format and size
        if self.provenance.content.object and \
                self.provenance.content.object.format:
            data['content_format'] = \
                self.provenance.content.object.format.name
        data['content_size'] = self.content.size

        return data

    @staticmethod
    def _staged_file_uri(local_path):
        '''Build the ``file://`` URI used for file-based Fedora ingest of
        ``local_path``.  If **LARGE_FILE_STAGING_FEDORA_DIR** is configured
        (i.e. the Fedora base path is different from the locally mounted
        staging directory), convert from local path to fedora server path.

        :param local_path: full local path to the staged file
        :returns: file URI as a string
        '''
        location = 'file://%s' % urllib.quote(local_path)
        if getattr(settings, 'LARGE_FILE_STAGING_FEDORA_DIR', None) is not None:
            location = location.replace(
                settings.LARGE_FILE_STAGING_DIR,
                settings.LARGE_FILE_STAGING_FEDORA_DIR)
        return location

    @staticmethod
    def init_from_file(master_filename, initial_label=None, request=None,
                       master_md5_checksum=None, master_sha1_checksum=None,
                       master_location=None, master_mimetype=None,
                       access_filename=None, access_location=None,
                       access_md5_checksum=None, access_mimetype=None):
        '''Static method to create a new :class:`Video` instance from
        a file.  Sets the object label and metadata title based on the initial
        label specified, or file basename.  Calculates and stores the duration
        based on the file. Also sets the following default metadata values:

            * mods:typeOfResource = "moving image"

        The access copy datastream is only initialized when either
        ``access_location`` or ``access_filename`` is supplied.

        :param master_filename: full path to the master file, as a string
        :param initial_label: optional initial label to use; if not specified,
            the base name of the specified file will be used
        :param request: :class:`django.http.HttpRequest` passed into a view
            method; must be passed in order to connect to Fedora as the
            currently-logged in user
        :param master_md5_checksum: the MD5 checksum of the master file
            being sent to fedora.
        :param master_sha1_checksum: the sha-1 checksum of the master file
            being sent to fedora; calculated from the file if not given.
        :param master_location: optional file URI for file-based Fedora
            ingest of master file
        :param master_mimetype: the mimetype of the master file being sent
            to fedora
        :param access_filename: full path to the access file, as a string
        :param access_location: optional file URI for file-based Fedora
            ingest of access file
        :param access_md5_checksum: the MD5 checksum of the access file
            being sent to fedora.
        :param access_mimetype: the mimetype of the access file being sent
            to fedora
        :returns: :class:`Video` initialized from the file
        :raises Exception: if the duration cannot be read from the master file
        '''
        if initial_label is None:
            initial_label = os.path.basename(master_filename)

        repo = Repository(request=request)
        obj = repo.get_object(type=Video)
        # set initial object label from the base master_filename
        obj.label = initial_label
        obj.dc.content.title = obj.mods.content.title = obj.label

        # Set the file checksum, if set.
        obj.content.checksum = master_md5_checksum
        # set content datastream master_mimetype if passed in
        if master_mimetype is not None:
            obj.content.mimetype = master_mimetype
        # Get the label, minus the extension (master_mimetype indicates that)
        obj.content.label = initial_label.rsplit('.', 1)[0]

        # set initial mods:typeOfResource - all Video default to moving image
        obj.mods.content.resource_type = 'moving image'

        # get duration and store in digital tech metadata
        try:
            info = MediaInfo.parse(master_filename)
            # track duration is divided by 1000 before being stored
            # (presumably milliseconds -> seconds; confirm against MediaInfo)
            duration = info.tracks[0].duration / 1000
        except Exception:
            # deliberately not a bare except, so KeyboardInterrupt and
            # SystemExit are not converted into an ingest error
            raise Exception('Error getting video duration')
        obj.digitaltech.content.duration = '%d' % round(duration)

        # premis data
        obj.provenance.content.create_object()
        obj.provenance.content.object.id_type = 'ark'
        obj.provenance.content.object.id = ''
        obj.provenance.content.object.type = 'p:file'
        # order matters: fixity element is appended first, then its digest set
        obj.provenance.content.object.checksums.append(
            PremisFixity(algorithm='MD5'))
        obj.provenance.content.object.checksums[0].digest = master_md5_checksum
        if master_sha1_checksum is None:
            master_sha1_checksum = sha1sum(master_filename)
        obj.provenance.content.object.checksums.append(
            PremisFixity(algorithm='SHA-1'))
        obj.provenance.content.object.checksums[1].digest = master_sha1_checksum

        obj.provenance.content.object.create_format()
        # format name will be upper-cased version of file extension
        # (empty when the file has no extension, instead of an IndexError)
        extension = os.path.splitext(master_filename)[1]
        obj.provenance.content.object.format.name = \
            extension.lstrip('.').upper()

        # if a content URI is specified (e.g. for large files), use that
        if master_location is not None:
            obj.content.ds_location = master_location
        # otherwise set the file as content to be posted
        else:
            # handle is intentionally left open: the datastream holds the
            # file object as its content
            obj.content.content = open(master_filename, 'rb')

        # Access copy data -- only when an access copy was actually supplied
        if access_location is not None or access_filename is not None:
            # if an access URI is specified (e.g. for large files), use that
            if access_location is not None:
                obj.access_copy.ds_location = access_location
            # otherwise set the access file as content to be posted
            else:
                obj.access_copy.content = open(access_filename, 'rb')

            obj.access_copy.mimetype = access_mimetype
            obj.access_copy.checksum = access_md5_checksum
            obj.access_copy.label = initial_label

        return obj

    @staticmethod
    def init_from_bagit(path, request=None, file_uri=True):
        '''Static method to create a new :class:`Video` instance from
        a BagIt.  Sets the object label and metadata title based on the
        name of the bag, and looks for a supported video file type
        to use as the content datastream for the object; a file with a
        supported access mimetype is used as the access copy.  Payload
        files of any other type are ignored.

        Content checksum is pulled from the BagIt metadata, and repository
        ingest will be done via file URIs based on configured
        **LARGE_FILE_STAGING_DIR** and **LARGE_FILE_STAGING_FEDORA_DIR**
        to better support ingesting large files (unless file_uri is False).

        (Note: using fast validation without checksum calculation, to
        minimize the time required to ingest large files.)

        :param path: full path to the BagIt directory that contains
            a video file
        :param request: :class:`django.http.HttpRequest` passed into a view
            method; must be passed in order to connect to Fedora as the
            currently-logged in user
        :param file_uri: ingest BagIt data via file uris based on
            configured staging directories (default behavior)
            instead of uploading the content to Fedora
        :returns: :class:`Video` initialized from the BagIt contents
        :raises bagit.BagValidationError: if the BagIt is not valid
        :raises Exception: if a master or access file has no MD5 or SHA-1
            entry in the bag, or if no supported master video is found
        '''
        bag = bagit.Bag(path)
        # NOTE: using fast validation here to avoid recalculating checksums
        # for very large files; only checksum compare will be done by fedora
        bag.validate(fast=True)  # raises bagit.BagValidationError if not valid

        # use the base name of the BagIt as initial object label
        initial_label = os.path.basename(path)

        # identify video content file within the bag
        m = magic.Magic(mime=True)
        # keyword arguments for init_from_file, filled in as files are found
        opts = {'request': request, 'initial_label': initial_label}

        for data_path in bag.payload_files():
            # path is relative to bag root dir
            filename = os.path.join(path, data_path)
            # magic may return "type; charset=..." -- keep only the type
            mimetype = m.from_file(filename).partition(';')[0]

            is_master = mimetype in Video.allowed_master_mimetypes
            is_access = not is_master and \
                mimetype in Video.allowed_access_mimetypes
            if not (is_master or is_access):
                # not a video file we ingest; nothing to look up
                continue

            # require both MD5 and SHA-1 for video to ingest
            # (a KeyError means the bag has no such entry for this file)
            try:
                md5_checksum = bag.entries[data_path]['md5']
            except KeyError:
                raise Exception('MD5 checksum not found for file %s' %
                                data_path)
            try:
                sha1_checksum = bag.entries[data_path]['sha1']
            except KeyError:
                raise Exception('SHA-1 checksum not found for file %s' %
                                data_path)

            if is_master:
                opts['master_filename'] = filename
                opts['master_md5_checksum'] = md5_checksum
                opts['master_sha1_checksum'] = sha1_checksum
                opts['master_mimetype'] = mimetype
                if file_uri:
                    opts['master_location'] = Video._staged_file_uri(filename)
            else:
                opts['access_filename'] = filename
                opts['access_md5_checksum'] = md5_checksum
                opts['access_mimetype'] = mimetype
                if file_uri:
                    opts['access_location'] = Video._staged_file_uri(filename)

        # no Video found
        if 'master_filename' not in opts:
            raise Exception('No Video content found in %s' %
                            os.path.basename(path))

        return Video.init_from_file(**opts)

    def old_dm_media_path(self):
        '''Path built from the collection's old media path and this object's
        old identifier (``dm1_other_id`` preferred over ``dm1_id``).
        Returns None when there is no old id, no collection object, or no
        collection media path.'''
        old_id = self.mods.content.dm1_other_id or self.mods.content.dm1_id
        if not old_id:
            return
        coll_obj = self._collection_object()
        if not coll_obj:
            return
        coll_path = coll_obj.old_dm_media_path()
        if not coll_path:
            return
        return '%svideo/%s.m4a' % (coll_path, old_id)

    def _collection_object(self):
        'Collection this object belongs to (the ``collection`` relation)'
        return self.collection