def test_delete_aip(self):
    """Deleting an AIP removes its document from the 'aips' index."""
    query = {'query': {'term': {'uuid': self.aip_uuid}}}

    # Precondition: exactly one AIP document with our UUID is indexed.
    before = self.client.search(index='aips', body=query, _source='uuid')
    assert before['hits']['total'] == 1
    assert before['hits']['hits'][0]['_source']['uuid'] == self.aip_uuid

    # Delete the AIP, then confirm no matching document remains.
    elasticSearchFunctions.delete_aip(self.client, self.aip_uuid)
    after = self.client.search(index='aips', body=query, _source='uuid')
    assert after['hits']['total'] == 0
def aip_delete(request, uuid):
    """Delete an AIP's stored file and remove it from Elasticsearch.

    Looks up the AIP in ES to find its file path, removes the file from
    disk, then deletes the AIP document and its file documents from ES.
    Redirects to the archival storage overview on success; any failure
    surfaces as a 404.
    """
    try:
        aip = elasticSearchFunctions.connect_and_get_aip_data(uuid)
        aip_filepath = aip['filePath']
        os.remove(aip_filepath)
        elasticSearchFunctions.delete_aip(uuid)
        elasticSearchFunctions.connect_and_delete_aip_files(uuid)
        return HttpResponseRedirect(
            reverse('components.archival_storage.views.overview'))
    # Fix: the original bare ``except:`` also swallowed SystemExit and
    # KeyboardInterrupt; catch Exception so only real errors map to 404.
    except Exception:
        raise Http404
def index_aip(job):
    """Write AIP information to ElasticSearch.

    Reads the MCP job arguments (SIP UUID, name, staging directory, type),
    then indexes the AIP record and its files. Returns 0 when indexing is
    disabled or succeeds; otherwise returns the non-zero code from
    ``index_aip_and_files``.
    """
    sip_uuid = job.args[1]  # %SIPUUID%
    sip_name = job.args[2]  # %SIPName%
    sip_staging_path = job.args[3]  # %SIPDirectory%
    sip_type = job.args[4]  # %SIPType%

    # Honour the MCPClient setting that disables AIP indexing entirely.
    if "aips" not in mcpclient_settings.SEARCH_ENABLED:
        logger.info("Skipping indexing: AIPs indexing is currently disabled.")
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()

    # Storage Service returns a list of matches; the first is used.
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    job.pyprint("AIP info:", aip_info)
    aip_info = aip_info[0]

    mets_staging_path = os.path.join(sip_staging_path, "METS.{}.xml".format(sip_uuid))

    identifiers = get_identifiers(job, sip_staging_path)

    # If this is an AIC, find the number of AIP stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(
                unittype="SIP", unituuid=sip_uuid, variable="AIPsinAIC"
            )
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            # No stored count; index without it.
            pass

    # Delete ES index before creating new one if reingesting
    if "REIN" in sip_type:
        job.pyprint(
            "Deleting outdated entry for AIP and AIP files with UUID",
            sip_uuid,
            "from archival storage",
        )
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)

    job.pyprint("Indexing AIP and AIP files")
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    ret = elasticSearchFunctions.index_aip_and_files(
        client=client,
        uuid=sip_uuid,
        aip_stored_path=aip_info["current_full_path"],
        mets_staging_path=mets_staging_path,
        name=sip_name,
        aip_size=aip_info["size"],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=aip_info["encrypted"],
        printfn=job.pyprint,
    )
    if ret == 1:
        job.pyprint("Error indexing AIP and AIP files", file=sys.stderr)
    return ret
def aip_delete(request, uuid):
    """Delete an AIP from disk and from the Elasticsearch indices.

    :param request: Django request object.
    :param uuid: UUID of the AIP to delete.
    :returns: Redirect to the archival storage overview on success.
    :raises Http404: on any error during lookup, file removal or ES delete.
    """
    try:
        aip = elasticSearchFunctions.connect_and_get_aip_data(uuid)
        aip_filepath = aip['filePath']
        os.remove(aip_filepath)
        elasticSearchFunctions.delete_aip(uuid)
        elasticSearchFunctions.connect_and_delete_aip_files(uuid)
        return HttpResponseRedirect(
            reverse('components.archival_storage.views.overview'))
    # Fix: replaced a bare ``except:`` (which also caught SystemExit and
    # KeyboardInterrupt) with ``except Exception:``.
    except Exception:
        raise Http404
def sync_es_aip_status_with_storage_service(uuid, es_status):
    """Update AIP's status in ES indices to match Storage Service.

    This is a bit of a kludge that is made necessary by the fact that the
    Storage Service does not update ElasticSearch directly when a
    package's status has changed.

    Updates to ES are visible in Archival Storage after running a new
    search or refreshing the page.

    :param uuid: AIP UUID.
    :param es_status: Current package status in ES.

    :returns: Boolean indicating whether AIP should be kept in search
    results (i.e. has not been deleted from Storage Service).
    """
    keep_in_results = True

    amclient = setup_amclient()
    amclient.package_uuid = uuid
    api_results = amclient.get_package_details()
    if api_results in AMCLIENT_ERROR_CODES:
        logger.warning(
            "Package {} not found in Storage Service. AMClient error code: {}".format(
                uuid, api_results
            )
        )
        return keep_in_results

    aip_status = api_results.get("status")
    if not aip_status:
        # Fix: the original message left the "{}" placeholder unfilled
        # (missing .format(uuid)) and misspelled "retrieved".
        logger.warning(
            "Status for package {} could not be retrieved from Storage Service.".format(
                uuid
            )
        )
        return keep_in_results

    # Reconcile the two systems only when they disagree.
    if (
        aip_status == es.STATUS_DELETE_REQUESTED
        and es_status != es.STATUS_DELETE_REQUESTED
    ):
        es_client = es.get_client()
        es.mark_aip_deletion_requested(es_client, uuid)
    elif aip_status == es.STATUS_UPLOADED and es_status != es.STATUS_UPLOADED:
        es_client = es.get_client()
        es.revert_aip_deletion_request(es_client, uuid)
    elif aip_status == es.STATUS_DELETED:
        # Deleted upstream: purge from ES and drop from search results.
        keep_in_results = False
        es_client = es.get_client()
        es.delete_aip(es_client, uuid)
        es.delete_aip_files(es_client, uuid)

    return keep_in_results
def test_delete_aip(self):
    """delete_aip removes the AIP document and reports success."""
    query = {'query': {'term': {'uuid': self.aip_uuid}}}

    # The AIP document must exist before we try to delete it.
    found = self.client.search(
        index='aips', doc_type='aip', body=query, fields='uuid')
    assert found['hits']['total'] == 1
    assert found['hits']['hits'][0]['fields']['uuid'] == [self.aip_uuid]

    # Delete the AIP; the helper reports whether it succeeded.
    outcome = elasticSearchFunctions.delete_aip(self.client, self.aip_uuid)
    assert outcome is True

    # The index must no longer contain the document.
    gone = self.client.search(
        index='aips', doc_type='aip', body=query, fields='uuid')
    assert gone['hits']['total'] == 0
def test_delete_aip(self):
    """Verify that delete_aip removes the AIP from the 'aips' index."""
    term_query = {"query": {"term": {"uuid": self.aip_uuid}}}

    def lookup():
        # Search for our AIP's document, fetching only its uuid field.
        return self.client.search(
            index="aips", body=term_query, _source="uuid")

    # Precondition: the AIP is indexed exactly once.
    hits = lookup()
    assert hits["hits"]["total"] == 1
    assert hits["hits"]["hits"][0]["_source"]["uuid"] == self.aip_uuid

    # Delete it, then confirm the search comes back empty.
    elasticSearchFunctions.delete_aip(self.client, self.aip_uuid)
    assert lookup()["hits"]["total"] == 0
def index_aip():
    """Write AIP information to ElasticSearch.

    Reads SIP UUID/name/path/type from ``sys.argv``, honours the
    MCPClient config switch that disables indexing, then indexes the AIP
    record and its files. Returns 0 on success or skip, 1 when file
    indexing fails.
    """
    sip_uuid = sys.argv[1]  # %SIPUUID%
    sip_name = sys.argv[2]  # %SIPName%
    sip_path = sys.argv[3]  # %SIPDirectory%
    sip_type = sys.argv[4]  # %SIPType%

    # Check if ElasticSearch is enabled
    client_config_path = '/etc/archivematica/MCPClient/clientConfig.conf'
    config = ConfigParser.SafeConfigParser()
    config.read(client_config_path)
    elastic_search_disabled = False
    try:
        elastic_search_disabled = config.getboolean(
            'MCPClient', "disableElasticsearchIndexing")
    except ConfigParser.NoOptionError:
        # Option absent means indexing stays enabled.
        pass
    if elastic_search_disabled:
        print('Skipping indexing: indexing is currently disabled in',
              client_config_path)
        return 0

    print('SIP UUID:', sip_uuid)
    # Storage Service returns a list of matches; the first is used.
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    print('AIP info:', aip_info)
    aip_info = aip_info[0]

    mets_name = 'METS.{}.xml'.format(sip_uuid)
    mets_path = os.path.join(sip_path, mets_name)

    # Collect identifiers from every MODS document in the SIP.
    mods_paths = list_mods(sip_path)
    identifiers = []
    for mods in mods_paths:
        identifiers.extend(extract_identifiers_from_mods(mods))

    # If this is an AIC, find the number of AIP stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass

    print('Indexing AIP info')
    # Delete ES index before creating new one if reingesting
    if 'REIN' in sip_type:
        print('Deleting outdated entry for AIP and AIP files with UUID',
              sip_uuid, 'from archival storage')
        elasticSearchFunctions.delete_aip(sip_uuid)
        elasticSearchFunctions.connect_and_delete_aip_files(sip_uuid)

    # Index AIP
    elasticSearchFunctions.connect_and_index_aip(
        sip_uuid,
        sip_name,
        aip_info['current_full_path'],
        mets_path,
        size=aip_info['size'],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers)

    # Index AIP files
    print('Indexing AIP files')
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    exitCode = elasticSearchFunctions.connect_and_index_files(
        index='aips',
        type='aipfile',
        uuid=sip_uuid,
        pathToArchive=sip_path,
        identifiers=identifiers,
        sipName=sip_name,
    )
    if exitCode == 1:
        print('Error indexing AIP files', file=sys.stderr)
        return 1
    return 0
def processAIPThenDeleteMETSFile(path, temp_dir, es_client, delete_existing_data=False): archive_file = os.path.basename(path) # Regex match the UUID - AIP might end with .7z, .tar.bz2, or # something else. match = re.search( r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", archive_file) if match is not None: aip_uuid = match.group() else: return -1 print("Processing AIP", aip_uuid) if delete_existing_data is True: print("Deleting AIP", aip_uuid, "from aips/aip and aips/aipfile.") elasticSearchFunctions.delete_aip(es_client, aip_uuid) elasticSearchFunctions.delete_aip_files(es_client, aip_uuid) # AIP filenames are <name>-<uuid><extension> # Index of match end is right before the extension subdir = archive_file[:match.end()] aip_name = subdir[:-37] mets_file = "METS." + aip_uuid + ".xml" mets_file_relative_path = os.path.join("data", mets_file) if os.path.isfile(path): mets_file_relative_path = os.path.join(subdir, mets_file_relative_path) path_to_mets = extract_file( archive_path=path, destination_dir=temp_dir, relative_path=mets_file_relative_path, ) # If AIC, need to extract number of AIPs in AIC to index as well aips_in_aic = None root = etree.parse(path_to_mets) try: aip_type = ns.xml_find_premis( root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:type" ).text except AttributeError: pass else: if aip_type == "Archival Information Collection": aips_in_aic = get_aips_in_aic(root, path, temp_dir) aip_info = storage_service.get_file_info(uuid=aip_uuid) if not aip_info: print("Information not found in Storage Service for AIP UUID: ", aip_uuid) return 1 return elasticSearchFunctions.index_aip_and_files( client=es_client, uuid=aip_uuid, aip_stored_path=path, mets_staging_path=path_to_mets, name=aip_name, aip_size=aip_info[0]["size"], aips_in_aic=aips_in_aic, identifiers=[], # TODO get these )
def list_display(request):
    """Render the archival storage AIP list (legacy ES `fields` API).

    Pages AIP records out of Elasticsearch, reconciles any AIPs that are
    deleted or pending deletion against the Storage Service, formats
    sizes/links for display, and renders 'archival_storage/list.html'.
    """
    if 'aips' not in settings.SEARCH_ENABLED:
        return render(request, 'archival_storage/list.html')
    current_page_number = int(request.GET.get('page', 1))
    logger.debug('Current page: %s', current_page_number)

    # get count of AIP files
    es_client = elasticSearchFunctions.get_client()
    aip_indexed_file_count = aip_file_count(es_client)

    # get AIPs
    order_by = request.GET.get('order_by', 'name_unanalyzed')
    sort_by = request.GET.get('sort_by', 'up')
    if sort_by == 'down':
        sort_direction = 'desc'
    else:
        sort_direction = 'asc'
    sort_specification = order_by + ':' + sort_direction
    sort_params = 'order_by=' + order_by + '&sort_by=' + sort_by

    # get list of UUIDs of AIPs that are deleted or pending deletion
    aips_deleted_or_pending_deletion = []
    should_haves = [
        {'match': {'status': 'DEL_REQ'}},
        {'match': {'status': 'DELETED'}},
    ]
    query = {
        "query": {
            "bool": {
                "should": should_haves
            }
        }
    }
    deleted_aip_results = es_client.search(
        body=query,
        index='aips',
        doc_type='aip',
        fields='uuid,status'
    )
    for deleted_aip in deleted_aip_results['hits']['hits']:
        aips_deleted_or_pending_deletion.append(deleted_aip['fields']['uuid'][0])

    # Fetch results and paginate
    def es_pager(page, page_size):
        """
        Fetch one page of normalized entries from Elasticsearch.

        :param page: 1-indexed page to fetch
        :param page_size: Number of entries on a page
        :return: List of dicts for each entry, where keys and values have been cleaned up
        """
        start = (page - 1) * page_size
        results = es_client.search(
            index='aips',
            doc_type='aip',
            body=elasticSearchFunctions.MATCH_ALL_QUERY,
            fields='origin,uuid,filePath,created,name,size,encrypted',
            sort=sort_specification,
            size=page_size,
            from_=start,
        )
        # normalize results - each of the fields contains a single value,
        # but is returned from the ES API as a single-length array
        # e.g. {"fields": {"uuid": ["abcd"], "name": ["aip"] ...}}
        return [elasticSearchFunctions.normalize_results_dict(d) for d in results['hits']['hits']]

    items_per_page = 10
    count = es_client.count(index='aips', doc_type='aip', body=elasticSearchFunctions.MATCH_ALL_QUERY)['count']
    # LazyPagedSequence fetches pages on demand via es_pager.
    results = LazyPagedSequence(es_pager, page_size=items_per_page, length=count)

    # Paginate
    page = helpers.pager(
        results,
        items_per_page,
        current_page_number
    )

    # process deletion, etc., and format results
    aips = []
    for aip in page.object_list:
        # If an AIP was deleted or is pending deletion, react if status changed
        if aip['uuid'] in aips_deleted_or_pending_deletion:
            # check with storage server to see current status
            api_results = storage_service.get_file_info(uuid=aip['uuid'])
            try:
                aip_status = api_results[0]['status']
            except IndexError:
                # Storage service does not know about this AIP
                # TODO what should happen here?
                logger.info("AIP not found in storage service: {}".format(aip))
                continue
            # delete AIP metadata in ElasticSearch if AIP has been deleted from the
            # storage server
            # TODO: handle this asynchronously
            if aip_status == 'DELETED':
                elasticSearchFunctions.delete_aip(es_client, aip['uuid'])
                elasticSearchFunctions.delete_aip_files(es_client, aip['uuid'])
            elif aip_status != 'DEL_REQ':
                # update the status in ElasticSearch for this AIP
                elasticSearchFunctions.mark_aip_stored(es_client, aip['uuid'])
        else:
            aip_status = 'UPLOADED'

        # Tweak AIP presentation and add to display array
        if aip_status != 'DELETED':
            aip['status'] = AIP_STATUS_DESCRIPTIONS[aip_status]
            try:
                size = '{0:.2f} MB'.format(float(aip['size']))
            except (TypeError, ValueError):
                # Size missing or non-numeric (e.g. file removed).
                size = 'Removed'
            aip['size'] = size
            aip['href'] = aip['filePath'].replace(AIPSTOREPATH + '/', "AIPsStore/")
            aip['date'] = aip['created']
            aips.append(aip)

    total_size = total_size_of_aips(es_client)
    # Find out which AIPs are encrypted
    return render(request, 'archival_storage/list.html',
        {
            'total_size': total_size,
            'aip_indexed_file_count': aip_indexed_file_count,
            'aips': aips,
            'page': page,
            'search_params': sort_params,
        }
    )
def list_display(request):
    """Render the archival storage AIP list (modern ES `_source` API).

    Pages AIP records out of Elasticsearch, reconciles AIPs that are
    deleted or pending deletion against the Storage Service, formats
    sizes/links for display, and renders 'archival_storage/list.html'.
    """
    if "aips" not in settings.SEARCH_ENABLED:
        return render(request, "archival_storage/list.html")
    current_page_number = int(request.GET.get("page", 1))
    logger.debug("Current page: %s", current_page_number)

    # get count of AIP files
    es_client = elasticSearchFunctions.get_client()
    aip_indexed_file_count = aip_file_count(es_client)

    # get AIPs
    order_by = request.GET.get("order_by", "name")
    sort_by = request.GET.get("sort_by", "up")
    sort_params = "order_by=" + order_by + "&sort_by=" + sort_by

    # use raw subfield to sort by name
    if order_by == "name":
        order_by = order_by + ".raw"

    # change sort_by param to ES sort directions
    if sort_by == "down":
        sort_by = "desc"
    else:
        sort_by = "asc"
    sort_specification = order_by + ":" + sort_by

    # get list of UUIDs of AIPs that are deleted or pending deletion
    aips_deleted_or_pending_deletion = []
    should_haves = [{
        "match": {
            "status": "DEL_REQ"
        }
    }, {
        "match": {
            "status": "DELETED"
        }
    }]
    query = {"query": {"bool": {"should": should_haves}}}
    deleted_aip_results = es_client.search(body=query, index="aips", _source="uuid,status")
    for deleted_aip in deleted_aip_results["hits"]["hits"]:
        aips_deleted_or_pending_deletion.append(deleted_aip["_source"]["uuid"])

    # Fetch results and paginate
    def es_pager(page, page_size):
        """
        Fetch one page of normalized entries from Elasticsearch.

        :param page: 1-indexed page to fetch
        :param page_size: Number of entries on a page
        :return: List of dicts for each entry, where keys and values have been cleaned up
        """
        start = (page - 1) * page_size
        results = es_client.search(
            index="aips",
            body={"query": {
                "match_all": {}
            }},
            _source="origin,uuid,filePath,created,name,size,encrypted",
            sort=sort_specification,
            size=page_size,
            from_=start,
        )
        return [d["_source"] for d in results["hits"]["hits"]]

    items_per_page = 10
    count = es_client.count(index="aips", body={"query": {
        "match_all": {}
    }})["count"]
    # LazyPagedSequence fetches pages on demand via es_pager.
    results = LazyPagedSequence(es_pager, page_size=items_per_page, length=count)

    # Paginate
    page = helpers.pager(results, items_per_page, current_page_number)

    # process deletion, etc., and format results
    aips = []
    for aip in page.object_list:
        # If an AIP was deleted or is pending deletion, react if status changed
        if aip["uuid"] in aips_deleted_or_pending_deletion:
            # check with storage server to see current status
            api_results = storage_service.get_file_info(uuid=aip["uuid"])
            try:
                aip_status = api_results[0]["status"]
            except IndexError:
                # Storage service does not know about this AIP
                # TODO what should happen here?
                logger.info("AIP not found in storage service: {}".format(aip))
                continue
            # delete AIP metadata in ElasticSearch if AIP has been deleted from the
            # storage server
            # TODO: handle this asynchronously
            if aip_status == "DELETED":
                elasticSearchFunctions.delete_aip(es_client, aip["uuid"])
                elasticSearchFunctions.delete_aip_files(es_client, aip["uuid"])
            elif aip_status != "DEL_REQ":
                # update the status in ElasticSearch for this AIP
                elasticSearchFunctions.mark_aip_stored(es_client, aip["uuid"])
        else:
            aip_status = "UPLOADED"

        # Tweak AIP presentation and add to display array
        if aip_status != "DELETED":
            aip["status"] = AIP_STATUS_DESCRIPTIONS[aip_status]
            try:
                size = "{0:.2f} MB".format(float(aip["size"]))
            except (TypeError, ValueError):
                # Size missing or non-numeric (e.g. file removed).
                size = "Removed"
            aip["size"] = size
            aip["href"] = aip["filePath"].replace(AIPSTOREPATH + "/", "AIPsStore/")
            aip["date"] = aip["created"]
            aips.append(aip)

    total_size = total_size_of_aips(es_client)
    # Find out which AIPs are encrypted
    return render(
        request,
        "archival_storage/list.html",
        {
            "total_size": total_size,
            "aip_indexed_file_count": aip_indexed_file_count,
            "aips": aips,
            "page": page,
            "search_params": sort_params,
        },
    )
def index_aip():
    """Write AIP information to ElasticSearch.

    Reads SIP UUID/name/path/type from ``sys.argv`` and indexes the AIP
    record and its files against the configured ES cluster. Returns 0 on
    success or when indexing is disabled, 1 when file indexing fails.
    """
    sip_uuid = sys.argv[1]  # %SIPUUID%
    sip_name = sys.argv[2]  # %SIPName%
    sip_path = sys.argv[3]  # %SIPDirectory%
    sip_type = sys.argv[4]  # %SIPType%

    # Honour the MCPClient setting that disables search indexing.
    if not mcpclient_settings.SEARCH_ENABLED:
        logger.info('Skipping indexing: indexing is currently disabled.')
        return 0

    elasticSearchFunctions.setup_reading_from_conf(mcpclient_settings)
    client = elasticSearchFunctions.get_client()

    print('SIP UUID:', sip_uuid)
    # Storage Service returns a list of matches; the first is used.
    aip_info = storage_service.get_file_info(uuid=sip_uuid)
    print('AIP info:', aip_info)
    aip_info = aip_info[0]

    mets_name = 'METS.{}.xml'.format(sip_uuid)
    mets_path = os.path.join(sip_path, mets_name)

    identifiers = get_identifiers(sip_path)

    # If this is an AIC, find the number of AIP stored in it and index that
    aips_in_aic = None
    if sip_type == "AIC":
        try:
            uv = UnitVariable.objects.get(unittype="SIP",
                                          unituuid=sip_uuid,
                                          variable="AIPsinAIC")
            aips_in_aic = uv.variablevalue
        except UnitVariable.DoesNotExist:
            pass

    print('Indexing AIP info')
    # Delete ES index before creating new one if reingesting
    if 'REIN' in sip_type:
        print('Deleting outdated entry for AIP and AIP files with UUID',
              sip_uuid, 'from archival storage')
        elasticSearchFunctions.delete_aip(client, sip_uuid)
        elasticSearchFunctions.delete_aip_files(client, sip_uuid)

    # Index AIP
    elasticSearchFunctions.index_aip(
        client,
        sip_uuid,
        sip_name,
        aip_info['current_full_path'],
        mets_path,
        size=aip_info['size'],
        aips_in_aic=aips_in_aic,
        identifiers=identifiers,
        encrypted=aip_info['encrypted'])

    # Index AIP files
    print('Indexing AIP files')
    # Even though we treat MODS identifiers as SIP-level, we need to index them
    # here because the archival storage tab actually searches on the
    # aips/aipfile index.
    exitCode = elasticSearchFunctions.index_files(
        client,
        index='aips',
        type_='aipfile',
        uuid=sip_uuid,
        pathToArchive=sip_path,
        identifiers=identifiers,
        sipName=sip_name,
    )
    if exitCode == 1:
        print('Error indexing AIP files', file=sys.stderr)
        return 1
    return 0
def process_package(
    self, es_client, package_info, temp_dir, delete_before_reindexing, is_aic=False
):
    """Index package in 'aips' and 'aipfiles' indices.

    :param es_client: Elasticsearch client.
    :param package_info: Package info dict returned by Storage Service.
    :param temp_dir: Path to tempdir for downloaded METS files.
    :param delete_before_reindexing: Boolean of whether to delete package
        from indices prior to reindexing.
    :is_aic: Optional boolean to indicate if package being indexed is an AIC.

    :returns: Boolean indicating success.
    """
    uuid = package_info["uuid"]

    # Download the AIP METS file to a temporary directory.
    mets_relative_path = am.relative_path_to_aip_mets_file(
        package_info["uuid"], package_info["current_path"]
    )
    mets_filename = os.path.basename(mets_relative_path)
    mets_download_path = os.path.join(temp_dir, mets_filename)
    storageService.extract_file(uuid, mets_relative_path, mets_download_path)
    if not os.path.isfile(mets_download_path):
        # extract_file did not produce the file; report and bail out.
        error_message = "Unable to download AIP METS file from Storage Service"
        self.error(
            "Error indexing package {0}. Details: {1}".format(uuid, error_message)
        )
        return False

    # AICs additionally record how many AIPs they contain.
    aips_in_aic = None
    if is_aic:
        mets_root = etree.parse(mets_download_path)
        aips_in_aic = get_aips_in_aic(mets_root, temp_dir, uuid)

    package_name = am.package_name_from_path(
        package_info["current_path"], remove_uuid_suffix=True
    )

    aip_location = package_info.get("current_location", "")
    location_description = storageService.retrieve_storage_location_description(
        aip_location
    )

    if delete_before_reindexing:
        self.info(
            "Deleting package {} from 'aips' and 'aipfiles' indices.".format(uuid)
        )
        es.delete_aip(es_client, uuid)
        es.delete_aip_files(es_client, uuid)

    # Index the AIP and then immediately delete the METS file.
    try:
        es.index_aip_and_files(
            client=es_client,
            uuid=uuid,
            aip_stored_path=package_info["current_full_path"],
            mets_staging_path=mets_download_path,
            name=package_name,
            aip_size=package_info["size"],
            aips_in_aic=aips_in_aic,
            encrypted=package_info.get("encrypted", False),
            location=location_description,
        )
        self.info("Successfully indexed package {}".format(uuid))
        os.remove(mets_download_path)
        return True
    except (ElasticsearchException, etree.XMLSyntaxError) as err:
        # The temp METS is removed on the failure path too.
        self.error("Error indexing package {0}. Details: {1}".format(uuid, err))
        os.remove(mets_download_path)
        return False
def processAIPThenDeleteMETSFile(path, temp_dir, es_client, delete_existing_data=False): archive_file = os.path.basename(path) # Regex match the UUID - AIP might end with .7z, .tar.bz2, or # something else. match = re.search( r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", archive_file) if match is not None: aip_uuid = match.group() else: return -1 print('Processing AIP', aip_uuid) if delete_existing_data is True: print('Deleting AIP', aip_uuid, 'from aips/aip and aips/aipfile.') elasticSearchFunctions.delete_aip(es_client, aip_uuid) elasticSearchFunctions.delete_aip_files(es_client, aip_uuid) # AIP filenames are <name>-<uuid><extension> # Index of match end is right before the extension subdir = archive_file[:match.end()] aip_name = subdir[:-37] mets_file = "METS." + aip_uuid + ".xml" mets_file_relative_path = os.path.join("data", mets_file) if os.path.isfile(path): mets_file_relative_path = os.path.join(subdir, mets_file_relative_path) path_to_mets = extract_file( archive_path=path, destination_dir=temp_dir, relative_path=mets_file_relative_path) # If AIC, need to extract number of AIPs in AIC to index as well aips_in_aic = None root = etree.parse(path_to_mets) try: aip_type = root.find( "m:dmdSec/m:mdWrap/m:xmlData/dc:dublincore/dc:type", namespaces=NSMAP).text except AttributeError: pass else: if aip_type == "Archival Information Collection": aips_in_aic = get_aips_in_aic(root, path, temp_dir) aip_info = storage_service.get_file_info(uuid=aip_uuid) if aip_info: elasticSearchFunctions.index_aip( client=es_client, uuid=aip_uuid, name=aip_name, filePath=path, pathToMETS=path_to_mets, aips_in_aic=aips_in_aic, identifiers=[], # TODO get these size=aip_info[0]['size'], ) elasticSearchFunctions.index_mets_file_metadata( client=es_client, uuid=aip_uuid, metsFilePath=path_to_mets, index='aips', type_='aipfile', sipName=aip_name, identifiers=[], # TODO get these )