def createMetadata(self, form):
    """Build a SWORD2 Atom entry describing ``self.paper``.

    Adds title, authors (with ORCID URIs when available), abstract,
    issue date, DOI/ISSN identifiers, OAI record source URLs and the
    document type. Returns the populated :class:`sword2.Entry`.
    """
    paper = self.paper
    entry = sword2.Entry()

    entry.add_field('title', paper.title)

    # Authors: attach an ORCID URI when the author has one.
    for author in paper.authors:
        if author.orcid:
            orcid_uri = 'http://{}/{}'.format(
                settings.ORCID_BASE_DOMAIN, author.orcid)
            entry.add_author(unicode(author), uri=orcid_uri)
        else:
            entry.add_author(unicode(author))

    if paper.abstract:
        entry.add_field('dcterms_abstract', paper.abstract)
    entry.add_field('dcterms_issued', paper.pubdate.isoformat())

    # One identifier per publication; ISSN only when the journal has one.
    for publication in paper.publications:
        entry.add_field('dcterms_identifier', 'doi:' + publication.doi)
        if publication.journal and publication.journal.issn:
            entry.add_field('dcterms_isPartOf',
                            'issn:' + publication.journal.issn)

    for record in paper.oairecords:
        entry.add_field('dcterms_source', record.splash_url)

    entry.add_field('dcterms_type', paper.doctype)
    return entry
def __init__(self, entry=None, title=None, dataverse=None, edit_uri=None,
             edit_media_uri=None, statement_uri=None, **kwargs):
    """Initialize from an optional Atom entry plus metadata keywords.

    If the provided entry has no ``dcterms:title`` element, *title* must
    be a string and is appended; otherwise DataverseException is raised.
    Any extra keyword arguments become additional metadata fields (list
    values produce one field per item).
    """
    # Generate sword entry
    sword_entry = sword2.Entry(entry)

    # Ensure the entry carries a title, adding one if necessary.
    has_title = get_elements(
        sword_entry.pretty_print(), namespace='dcterms', tag='title')
    if not has_title:
        if not isinstance(title, basestring):
            raise DataverseException('Study needs a single, valid title.')
        sword_entry.add_field(format_term('title'), title)

    # Fold remaining keyword arguments into metadata fields.
    for key, value in kwargs.items():
        items = value if isinstance(value, list) else [value]
        for item in items:
            sword_entry.add_field(format_term(key), item)

    self.entry = sword_entry.pretty_print()
    self.dataverse = dataverse
    self.edit_uri = edit_uri
    self.edit_media_uri = edit_media_uri
    self.statement_uri = statement_uri
def replace_deposit_metadata(self, receipt, **metadata_kwargs):
    """
    Replace a deposit's metadata with that defined by **metadata_kwargs.

    Return a Receipt object for this replacement action.
    """
    entry = sword2.Entry()
    entry.add_fields(**metadata_kwargs)
    # in_progress=True keeps the deposit open for further edits.
    return self.connection.update(
        metadata_entry=entry, dr=receipt, in_progress=True)
def create_deposit_from_metadata(self, collection, in_progress=True,
                                 **metadata_kwargs):
    """
    Create a deposit in a specified collection by providing metadata
    in **metadata_kwargs.

    Return a Receipt object for this transaction.
    """
    entry = sword2.Entry()
    entry.add_fields(**metadata_kwargs)
    return self.connection.create(
        col_iri=collection.href,
        in_progress=in_progress,
        metadata_entry=entry)
def _delete_update_lom(self, package, delete_lom_ids):
    """ Notifys LOM that AUs with `delete_lom_ids` will be deleted.

    Helper to update_package_status. Returns None on success (HTTP 200),
    or a translatable error-message string otherwise.
    """
    # Update LOM that local copies will be deleted
    entry = sword2.Entry(id="urn:uuid:{}".format(package.uuid))
    entry.register_namespace("lom", utils.NSMAP["lom"])
    # recrawl="false" tells LOCKSS-o-matic to stop harvesting these AUs.
    for lom_id in delete_lom_ids:
        if lom_id:
            etree.SubElement(
                entry.entry,
                utils.PREFIX_NS["lom"] + "content",
                recrawl="false",
            ).text = lom_id
    LOGGER.debug("edit entry: %s", entry)

    # SWORD2 client doesn't handle 202 respose correctly - implementing here
    # Correct function is self.sword_connection.update_metadata_for_resource
    headers = {
        "Content-Type": "application/atom+xml;type=entry",
        "Content-Length": str(len(str(entry))),
        "On-Behalf-Of": str(self.content_provider_id),
    }
    response, content = self.sword_connection.h.request(
        uri=package.misc_attributes["edit_iri"],
        method="PUT",
        headers=headers,
        payload=str(entry),
    )

    # httplib2's Response dict interface stores the status code as a
    # *string* (the int is response.status), so normalize before any
    # comparison — comparing response["status"] to int literals never
    # matches, which made every request report the generic error.
    status = int(response["status"])
    LOGGER.debug("response code: %s", status)
    # Return with error message if response not 200
    if status != 200:
        if status == 202:  # Accepted - pushing new config
            return _(
                "Lockss-o-matic is updating the config to stop harvesting. Please try again to delete local files."
            )
        if status == 204:  # No Content - no matching AIP
            return _("Package %(uuid)s is not found in LOCKSS") % {
                "uuid": package.uuid
            }
        if status == 409:  # Conflict - Files in AU with recrawl
            return _(
                "There are files in the LOCKSS Archival Unit (AU) that do not have 'recrawl=false'."
            )
        return _(
            "Error %(error)s when requesting LOCKSS stop harvesting deleted files."
        ) % {
            "error": status
        }
    return None
def _delete_update_lom(self, package, delete_lom_ids):
    """ Notifys LOM that AUs with `delete_lom_ids` will be deleted.

    Helper to update_package_status. Returns None on success (HTTP 200),
    or an error-message string otherwise.
    """
    # Update LOM that local copies will be deleted
    entry = sword2.Entry(id='urn:uuid:{}'.format(package.uuid))
    entry.register_namespace('lom', utils.NSMAP['lom'])
    # recrawl='false' tells LOCKSS-o-matic to stop harvesting these AUs.
    for lom_id in delete_lom_ids:
        if lom_id:
            etree.SubElement(
                entry.entry,
                utils.PREFIX_NS['lom'] + 'content',
                recrawl='false').text = lom_id
    LOGGER.debug('edit entry: %s', entry)

    # SWORD2 client doesn't handle 202 respose correctly - implementing here
    # Correct function is self.sword_connection.update_metadata_for_resource
    headers = {
        'Content-Type': "application/atom+xml;type=entry",
        'Content-Length': str(len(str(entry))),
        'On-Behalf-Of': str(self.content_provider_id),
    }
    response, content = self.sword_connection.h.request(
        uri=package.misc_attributes['edit_iri'],
        method='PUT',
        headers=headers,
        payload=str(entry))

    # httplib2's Response dict interface stores the status code as a
    # *string* (the int is response.status), so normalize before any
    # comparison — comparing response['status'] to int literals never
    # matches, which made every request report the generic error.
    status = int(response['status'])
    LOGGER.debug('response code: %s', status)
    # Return with error message if response not 200
    if status != 200:
        if status == 202:  # Accepted - pushing new config
            return 'Lockss-o-matic is updating the config to stop harvesting. Please try again to delete local files.'
        if status == 204:  # No Content - no matching AIP
            return 'Package {} is not found in LOCKSS'.format(package.uuid)
        if status == 409:  # Conflict - Files in AU with recrawl
            return "There are files in the LOCKSS Archival Unit (AU) that do not have 'recrawl=false'."
        return 'Error {} when requesting LOCKSS stop harvesting deleted files.'.format(
            status)
    return None
def _create_resource(self, package, output_files):
    """ Given a package, create an Atom resource entry to send to LOCKSS.

    Parses metadata for the Atom entry from the METS file, uses
    LOCKSS-o-matic-specific tags to describe size and checksums.

    :param package: Package whose METS and pointer file are read.
    :param output_files: Paths of the (possibly split) AIP chunks.
    :return: Tuple of (sword2.Entry, slug string).
    """
    # Parse METS to get information for atom entry
    relative_mets_path = os.path.join(
        os.path.splitext(os.path.basename(package.current_path))[0],
        "data",
        'METS.{}.xml'.format(package.uuid))
    (mets_path, temp_dir) = package.extract_file(relative_mets_path)
    mets = etree.parse(mets_path)
    # Delete temp dir if created
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    # Parse out name and description if found; defaults derived from the
    # package itself are used when the METS has no Dublin Core section.
    slug = str(package.uuid)
    title = os.path.basename(package.current_path)
    summary = 'AIP generated by Archivematica with uuid {}'.format(package.uuid)
    dublincore = mets.find('mets:dmdSec/mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore', namespaces=utils.NSMAP)
    if dublincore is not None:
        title = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=title)
        slug = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=slug)
        summary = dublincore.findtext('dcterms:description', namespaces=utils.NSMAP, default=summary)

    # Parse out Agent for author: first organization-typed PREMIS agent's
    # identifier value, or None when the METS has no such agent.
    authors = mets.xpath(".//mets:mdWrap[@MDTYPE='PREMIS:AGENT']//mets:agentType[text()='organization']/ancestor::mets:agent/*/mets:agentIdentifierValue", namespaces=utils.NSMAP)
    author = authors[0].text if authors else None

    # Create atom entry
    entry = sword2.Entry(
        title=title,
        id='urn:uuid:' + package.uuid,
        author={'name': author},
        summary=summary)

    # Add each chunk to the atom entry.
    # Pointer file is parsed lazily and cached on self for reuse.
    if not self.pointer_root:
        self.pointer_root = etree.parse(package.full_pointer_file_path)
    entry.register_namespace('lom', utils.NSMAP['lom'])
    for index, file_path in enumerate(output_files):
        # Get external URL; chunk numbering is 1-based and only used when
        # the package was split into multiple files.
        if len(output_files) == 1:
            external_url = self._download_url(package.uuid)
        else:
            external_url = self._download_url(package.uuid, index + 1)

        # Get checksum and size from pointer file (or generate if not found)
        file_e = self.pointer_root.find(".//mets:fileGrp[@USE='LOCKSS chunk']/mets:file[@ID='{}']".format(os.path.basename(file_path)), namespaces=utils.NSMAP)
        if file_e is not None:
            checksum_name = file_e.get('CHECKSUMTYPE')
            checksum_value = file_e.get('CHECKSUM')
            size = int(file_e.get('SIZE'))
        else:
            # Not split, generate
            try:
                checksum = utils.generate_checksum(file_path, self.checksum_type)
            except ValueError:
                # Invalid checksum type
                checksum = utils.generate_checksum(file_path, 'md5')
            # e.g. 'sha256' -> 'SHA-256' to match METS CHECKSUMTYPE spelling
            checksum_name = checksum.name.upper().replace('SHA', 'SHA-')
            checksum_value = checksum.hexdigest()
            size = os.path.getsize(file_path)

        # Convert size to kB
        # NOTE(review): under Python 2, size / 1000 is integer floor
        # division, so math.ceil here returns the floored value — confirm
        # whether rounding up was intended.
        size = str(math.ceil(size / 1000))

        # Add new content entry and values; entry.entry[-1] is the
        # lom:content element just appended by add_field.
        entry.add_field('lom_content', external_url)
        content_entry = entry.entry[-1]
        content_entry.set('size', size)
        content_entry.set('checksumType', checksum_name)
        content_entry.set('checksumValue', checksum_value)

    LOGGER.debug('LOCKSS atom entry: %s', entry)
    return entry, slug
def move_from_storage_service(self, source_path, destination_path, package=None):
    """Deposit a compressed AIP into DSpace via SWORD2.

    Creates an item from the package metadata, uploads each (possibly
    split) file, completes the deposit, extracts the DSpace handle from
    the deposit statement and stores it in package.misc_attributes,
    then sets permissions on the metadata bitstreams.

    :param source_path: Local path of the compressed AIP.
    :param destination_path: Ignored; the collection IRI comes from
        package.current_location.relative_path.
    :param package: Package being stored; required.
    :raises NotImplementedError: if source_path is not a file
        (uncompressed AIPs are unsupported).
    """
    LOGGER.info('source_path: %s, destination_path: %s, package: %s',
                source_path, destination_path, package)
    if package is None:
        LOGGER.warning('DSpace requires package param')
        return
    # This only handles compressed AIPs
    if not os.path.isfile(source_path):
        raise NotImplementedError(
            _('Storing in DSpace does not support uncompressed AIPs'))
    self._get_sword_connection()

    # Create item by depositing AtoM doc
    LOGGER.debug('Create SWORD2 entry')
    kwargs = self._get_metadata(source_path, package.uuid)
    entry = sword2.Entry(title=kwargs.get('dcterms_title'), **kwargs)

    destination_path = package.current_location.relative_path
    LOGGER.debug('POST SWORD2 entry %s %s', destination_path, entry)
    entry_receipt = self.sword_connection.create(
        col_iri=destination_path,
        in_progress=True,
        metadata_entry=entry,
    )
    # TODO store these in Package.misc_attributes
    LOGGER.info('Edit IRI: %s', entry_receipt.edit)
    LOGGER.info('Edit Media IRI: %s', entry_receipt.edit_media)
    LOGGER.info('Statement IRI: %s', entry_receipt.atom_statement_iri)

    # Split package and upload each chunk to the DSpace item.
    upload_paths = self._split_package(source_path)
    for upload_path in upload_paths:
        LOGGER.info('Add file %s to %s', upload_path, entry_receipt.edit_media)
        # Read in binary mode: AIP chunks are binary data. (sword2 would
        # iterate over the content twice, hence the full read.)
        with open(upload_path, 'rb') as f:
            content = f.read()
        # Not using self.sword_connection.add_file_to_resource here:
        # httplib2 first tries every request without auth and retries
        # after a 401, which breaks with files over 2097152 bytes, and
        # alternative http_impls return incorrect URIs in the deposit
        # receipt. Replicate the sword2 behaviour with requests instead.
        LOGGER.debug('Using requests')
        # mimetypes.guess_type returns a (type, encoding) tuple; only the
        # type belongs in Content-Type (previously the whole tuple was
        # stringified into the header).
        content_type = mimetypes.guess_type(upload_path)[0] or 'application/octet-stream'
        headers = {
            'Content-Type': content_type,
            # 'Content-MD5': str(md5sum),
            'Content-Length': str(os.path.getsize(upload_path)),
            'Content-Disposition': "attachment; filename=%s" % urllib.quote(
                os.path.basename(upload_path)),
        }
        requests.post(entry_receipt.edit_media, headers=headers,
                      data=content, auth=(self.user, self.password))

    # Finalize deposit
    LOGGER.info('Complete deposit for %s', entry_receipt.edit)
    try:
        complete_receipt = self.sword_connection.complete_deposit(
            dr=entry_receipt)
    except Exception:
        LOGGER.error(
            'Error creating item: Status: %s, response: %s',
            self.sword_connection.history[-1]['payload']
            ['response'].status,
            self.sword_connection.history[-1]['payload']['response'].resp)
        LOGGER.error(self.sword_connection.history[-1])
        raise
    LOGGER.info('Complete receipt: %s', complete_receipt)

    package.current_path = entry_receipt.atom_statement_iri
    package.save()

    # Fetch statement
    LOGGER.info(
        'Request Atom serialisation of the deposit statement from %s',
        entry_receipt.atom_statement_iri)
    try:
        statement = self.sword_connection.get_atom_sword_statement(
            entry_receipt.atom_statement_iri)
    except Exception:
        LOGGER.error(
            'Error creating item: Status: %s, response: %s',
            self.sword_connection.history[-1]['payload']
            ['response'].status,
            self.sword_connection.history[-1]['payload']['response'].resp)
        LOGGER.error(self.sword_connection.history[-1])
        raise
    LOGGER.info('Statement: %s', statement.xml_document)

    # Get DSpace handle from the first original deposit's bitstream URL.
    regex = r'bitstream/(?P<handle>\d+/\d+)/'
    match = re.search(regex, statement.original_deposits[0].id)
    if match:
        LOGGER.info('Handle: %s', match.group('handle'))
        handle = match.group('handle')
    else:
        LOGGER.warning('No match found in %s', statement.original_deposits[0].id)
        return
    package.misc_attributes.update({'handle': handle})
    package.save()

    # Set permissions on metadata bitstreams
    self._set_permissions(package)