def _delete_files(self):
    """
    Remove the AIP's local copies from disk and drop their FLocat
    entries from the pointer file, once the AIP is stored in LOCKSS.

    Helper to update_package_status.
    """
    # Pick which FLocat elements describe local files to delete.
    if self.keep_local:
        # Only the LOCKSS chunk copies go; the local AIP copy stays.
        xpath = (".//mets:fileGrp[@USE='LOCKSS chunk']/*/"
                 "mets:FLocat[@LOCTYPE='OTHER' and @OTHERLOCTYPE='SYSTEM']")
    else:
        # Everything with a local (SYSTEM) path goes.
        xpath = ".//mets:FLocat[@LOCTYPE='OTHER' and @OTHERLOCTYPE='SYSTEM']"
    delete_elements = self.pointer_root.xpath(xpath, namespaces=utils.NSMAP)
    LOGGER.debug('delete_elements: %s', delete_elements)

    # Remove each referenced file from disk, then its FLocat from the METS.
    for flocat in delete_elements:
        path = flocat.get(utils.PREFIX_NS['xlink'] + 'href')
        LOGGER.debug('path to delete: %s', path)
        try:
            os.remove(path)
        except os.error as err:
            # A missing file is fine (already gone); log anything else.
            if err.errno != errno.ENOENT:
                LOGGER.exception('Could not delete %s', path)
        flocat.getparent().remove(flocat)

    # Update pointer file.
    # If delete_elements is false, then this function has probably already
    # been run, and we don't want to add another delete event.
    if not self.keep_local and delete_elements:
        amdsec = self.pointer_root.find('mets:amdSec',
                                        namespaces=utils.NSMAP)
        # Record a 'deletion' PREMIS:EVENT for the removed local copy.
        utils.mets_add_event(
            amdsec,
            event_type='deletion',
            event_outcome_detail_note='AIP deleted from local storage',
        )
        # If the AIP was split into LOCKSS chunks, also drop the now-stale
        # local-copy fileGrp and structMap div.
        was_split = self.pointer_root.find(
            ".//mets:fileGrp[@USE='LOCKSS chunk']",
            namespaces=utils.NSMAP)
        if was_split is not None:
            stale = self.pointer_root.find(
                ".//mets:fileGrp[@USE='Archival Information Package']",
                namespaces=utils.NSMAP)
            stale.getparent().remove(stale)
            stale = self.pointer_root.find(
                ".//mets:structMap/*/mets:div[@TYPE='Local copy']",
                namespaces=utils.NSMAP)
            stale.getparent().remove(stale)
    return None
def _split_package(self, package): """ Splits the package into chunks of size self.au_size. Returns list of paths to the chunks. If the package has already been split (and an event is in the pointer file), returns the list if file paths from the pointer file. Updates the pointer file with the new LOCKSS chunks, and adds 'division' event. """ # Parse pointer file if not self.pointer_root: self.pointer_root = etree.parse(package.full_pointer_file_path) # Check if file is already split, and if so just return split files if self.pointer_root.xpath('.//premis:eventType[text()="division"]', namespaces=utils.NSMAP): chunks = self.pointer_root.findall(".//mets:div[@TYPE='Archival Information Package']/mets:div[@TYPE='LOCKSS chunk']", namespaces=utils.NSMAP) output_files = [c.find('mets:fptr', namespaces=utils.NSMAP).get('FILEID') for c in chunks] return output_files file_path = package.full_path expected_num_files = math.ceil(os.path.getsize(file_path) / self.au_size) LOGGER.debug('expected_num_files: %s', expected_num_files) # No split needed - just return the file path if expected_num_files <= 1: LOGGER.debug('Only one file expected, not splitting') output_files = [file_path] # No events or structMap changes needed LOGGER.info('LOCKSS: after splitting: %s', output_files) return output_files # Split file # Strip extension, add .tar-1 ('-1' to make rename script happy) output_path = os.path.splitext(file_path)[0] + '.tar-1' command = ['tar', '--create', '--multi-volume', '--tape-length', str(self.au_size), '--new-volume-script', 'common/tar_new_volume.sh', '-f', output_path, file_path] # TODO reserve space in quota for extra files LOGGER.info('LOCKSS split command: %s', command) try: subprocess.check_call(command) except Exception: LOGGER.exception("Split of %s failed with command %s", file_path, command) raise output_path = output_path[:-2] # Remove '-1' dirname, basename = os.path.split(output_path) output_files = sorted([os.path.join(dirname, entry) for entry in 
os.listdir(dirname) if entry.startswith(basename)]) # Update pointer file amdsec = self.pointer_root.find('mets:amdSec', namespaces=utils.NSMAP) # Add 'division' PREMIS:EVENT try: event_detail = subprocess.check_output(['tar', '--version']) except subprocess.CalledProcessError as e: event_detail = e.output or _('Error: getting tool info; probably GNU tar') utils.mets_add_event( amdsec, event_type='division', event_detail=event_detail, event_outcome_detail_note='{} LOCKSS chunks created'.format(len(output_files)), ) # Update structMap & fileSec self.pointer_root.find('mets:structMap', namespaces=utils.NSMAP).set('TYPE', 'logical') aip_div = self.pointer_root.find("mets:structMap/mets:div[@TYPE='Archival Information Package']", namespaces=utils.NSMAP) filesec = self.pointer_root.find('mets:fileSec', namespaces=utils.NSMAP) filegrp = etree.SubElement(filesec, utils.PREFIX_NS['mets'] + 'fileGrp', USE='LOCKSS chunk') # Move ftpr to Local copy div local_ftpr = aip_div.find('mets:fptr', namespaces=utils.NSMAP) if local_ftpr is not None: div = etree.SubElement(aip_div, utils.PREFIX_NS['mets'] + 'div', TYPE='Local copy') div.append(local_ftpr) # This moves local_fptr # Add each split chunk to structMap & fileSec for idx, out_path in enumerate(output_files): # Add div to structMap div = etree.SubElement(aip_div, utils.PREFIX_NS['mets'] + 'div', TYPE='LOCKSS chunk', ORDER=str(idx + 1)) etree.SubElement(div, utils.PREFIX_NS['mets'] + 'fptr', FILEID=os.path.basename(out_path)) # Get checksum and size for fileSec try: checksum = utils.generate_checksum(out_path, self.checksum_type) except ValueError: # Invalid checksum type checksum = utils.generate_checksum(out_path, 'md5') checksum_name = checksum.name.upper().replace('SHA', 'SHA-') size = os.path.getsize(out_path) # Add file & FLocat to fileSec file_e = etree.SubElement(filegrp, utils.PREFIX_NS['mets'] + 'file', ID=os.path.basename(out_path), SIZE=str(size), CHECKSUM=checksum.hexdigest(), CHECKSUMTYPE=checksum_name) flocat = 
etree.SubElement(file_e, utils.PREFIX_NS['mets'] + 'FLocat', OTHERLOCTYPE="SYSTEM", LOCTYPE="OTHER") flocat.set(utils.NSMAP['xlink'] + 'href', out_path) # Write out pointer file again with open(package.full_pointer_file_path, 'w') as f: f.write(etree.tostring(self.pointer_root, pretty_print=True, xml_declaration=True, encoding='utf-8')) return output_files