def create_capabilitylist(self) -> SitemapData:
    """
    ``build step 5`` :samp:`Create a new capabilitylist over sitemaps found in metadata directory`

    :return: :class:`SitemapData` over the newly created capabilitylist
    """
    capabilitylist_path = self.para.abs_metadata_path("capabilitylist.xml")
    if os.path.exists(capabilitylist_path) and self.para.is_saving_sitemaps:
        os.remove(capabilitylist_path)

    doc_types = ["resourcelist", "changelist", "resourcedump", "changedump"]
    capabilitylist = CapabilityList()
    for doc_type in doc_types:
        index_path = self.para.abs_metadata_path(doc_type + "-index.xml")
        if os.path.exists(index_path):
            capabilitylist.add(
                Resource(uri=self.para.uri_from_path(index_path),
                         capability=doc_type))
        else:
            doc_list_files = sorted(
                glob(self.para.abs_metadata_path(doc_type + "_*.xml")))
            for doc_list in doc_list_files:
                capabilitylist.add(
                    Resource(uri=self.para.uri_from_path(doc_list),
                             capability=doc_type))

    return self.finish_sitemap(-1, capabilitylist)
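# A minimal standalone sketch of the capability-list pattern used in
# create_capabilitylist above. It assumes CapabilityList and Resource are
# importable from the resync package (as Resource and ResourceList are in the
# README snippet at the end of this section); the example.com URIs are
# hypothetical placeholders.
from resync import CapabilityList, Resource

caps = CapabilityList()
caps.add(Resource(uri='http://example.com/metadata/resourcelist-index.xml',
                  capability='resourcelist'))
caps.add(Resource(uri='http://example.com/metadata/changelist_0001.xml',
                  capability='changelist'))
print(caps.as_xml())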
def get_change_dump_xml(self, from_date):
    """Get change dump xml.

    :return: (xml) change dump content
    """
    if not self._validation():
        return None
    change_dump = ChangeDump()
    change_dump.up = '{}resync/capability.xml'.format(request.url_root)
    change_dump.index = '{}resync/{}/changedump.xml'.format(
        request.url_root, self.repository_id)
    record_changes = self._get_record_changes_with_interval(from_date)
    for data in record_changes:
        try:
            next_ch = self._next_change(data, record_changes)
            if data.get('status') == 'deleted':
                continue
            loc = '{}resync/{}/{}/change_dump_content.zip'.format(
                request.url_root, self.repository_id,
                '{}.{}'.format(data.get('record_id'),
                               data.get('record_version')))
            rc = Resource(loc,
                          lastmod=data.get("updated"),
                          mime_type='application/zip',
                          md_from=data.get('updated'),
                          md_until=datetime.datetime.utcnow().replace(
                              tzinfo=datetime.timezone.utc).isoformat(),
                          ln=[])
            # close the validity window at the next change, if there is one
            if next_ch and next_ch.get('updated'):
                rc.md_until = next_ch.get('updated')
            if self.change_dump_manifest:
                ln = {
                    'rel': 'contents',
                    'href': '{}resync/{}/{}/changedump_manifest.xml'.format(
                        request.url_root, self.repository_id,
                        '{}.{}'.format(data.get('record_id'),
                                       data.get('record_version'))),
                    'type': 'application/xml'
                }
                rc.ln.append(ln)
            change_dump.add(rc)
        except Exception:
            current_app.logger.error('-' * 60)
            traceback.print_exc(file=sys.stdout)
            current_app.logger.error('-' * 60)
            continue
    return change_dump.as_xml()
def generate(self) -> [Resource]:
    elastic_page_generator = self.elastic_page_generator()
    erased_changes = False
    for e_page in elastic_page_generator():
        if not erased_changes:
            # this will happen at the first scroll
            self.erase_changes()
            logger.info("Erasing changes")
            erased_changes = True
        for e_hit in e_page:
            e_source = e_hit['_source']
            if self.elastic_params.strategy == Strategy.resourcelist.value:
                e_doc = ResourceDoc.as_resource_doc(e_source)
            else:
                e_doc = ChangeDoc.as_change_doc(e_source)
            uri = e_doc.location.uri_from_path(
                param_url_prefix=self.elastic_params.url_prefix,
                param_resource_root_dir=self.elastic_params.resource_root_dir)
            if self.elastic_params.strategy == Strategy.resourcelist.value:
                ln = []
                if e_doc.ln:
                    for link in e_doc.ln:
                        link_uri = link.href.uri_from_path(
                            param_url_prefix=self.elastic_params.url_prefix,
                            param_resource_root_dir=self.elastic_params.resource_root_dir)
                        ln.append({
                            'href': link_uri,
                            'rel': link.rel,
                            'mime': link.mime
                        })
                resource = Resource(uri=uri,
                                    length=e_doc.length,
                                    lastmod=e_doc.lastmod,
                                    md5=e_doc.md5,
                                    mime_type=e_doc.mime,
                                    ln=ln)
            else:
                resource = Resource(uri=uri,
                                    lastmod=e_doc.lastmod,
                                    change=e_doc.change)
            yield resource
def get_resource_dump_manifest(self, record_id):
    """Get resource dump manifest.

    :param record_id: Identifier of record.
    :return: (xml) content of resourcedumpmanifest
    """
    _validation = self._validation(record_id)
    if self.resource_dump_manifest and _validation:
        rdm = ResourceDumpManifest()
        rdm.up = '{}resync/{}/resourcedump.xml'.format(
            request.url_root, self.repository_id)
        record = WekoRecord.get_record_by_pid(record_id)
        if record:
            for file in record.files:
                current_app.logger.debug(file.info())
                file_info = file.info()
                path = 'recid_{}/{}'.format(record.get('recid'),
                                            file_info.get('key'))
                lastmod = str(datetime.datetime.utcnow().replace(
                    tzinfo=datetime.timezone.utc).isoformat())
                rdm.add(
                    Resource(
                        '{}record/{}/files/{}'.format(
                            request.url_root, record.get('recid'),
                            file_info.get('key')),
                        lastmod=lastmod,
                        sha256=file_info.get('checksum').split(':')[1],
                        length=str(file_info.get('size')),
                        path=path))
        return rdm.as_xml()
    return None
def get_resource_list_xml(self, from_date=None, to_date=None):
    """Get content of resource list.

    :return: (xml) resource list content
    """
    if not self._validation():
        return None
    r = get_items_by_index_tree(self.repository_id)
    rl = ResourceList()
    rl.up = INVENIO_CAPABILITY_URL.format(request.url_root)
    for item in r:
        if item:
            resource_date = str_to_datetime(
                item.get('_source').get('_updated'))
            if from_date and str_to_datetime(from_date) > resource_date:
                continue
            if to_date and str_to_datetime(to_date) < resource_date:
                continue
            id_item = item.get('_source').get('control_number')
            url = '{}resync/{}/records/{}'.format(request.url_root,
                                                  str(self.repository_id),
                                                  str(id_item))
            rl.add(
                Resource(url,
                         lastmod=item.get('_source').get('_updated')))
    return rl.as_xml()
def oaipmh_header_to_resourcesync_resource(self, header):
    """Map an OAI-PMH record header to a ResourceSync Resource.

    header: an instance of `sickle.models.Header`
    https://sickle.readthedocs.io/en/latest/api.html#sickle.models.Header
    """
    soup = BeautifulSoup(header.raw.encode('utf-8'), 'xml')
    lastmod = soup.header.datestamp.text
    identifier = soup.identifier.text
    query_string = 'verb=GetRecord&identifier={}&metadataPrefix={}'.format(
        urllib.parse.quote(identifier, safe=''),
        urllib.parse.quote(self.params['oaipmh_metadataprefix'], safe=''))
    parts = urllib.parse.urlparse(
        self.params['oaipmh_base_url'])[:4] + (query_string, '')
    uri = urllib.parse.urlunparse(parts)
    # do a GET request for each record to retrieve the 'content-length'
    r = get(uri)
    length = len(r.content)
    # compute md5 of the GetRecord element (OAI-PMH responses include
    # responseDate tags, so the md5 of the entire response is different for
    # subsequent requests for the same record)
    m = md5()
    element = str(BeautifulSoup(r.content, 'xml').GetRecord).encode('utf-8')
    m.update(element)
    return Resource(uri=uri,
                    lastmod=lastmod,
                    md5=m.hexdigest(),
                    length=length,
                    mime_type="text/xml")
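# A sketch of the hashing idea above: digest only the GetRecord element so
# that the volatile responseDate header does not change the md5 between
# fetches of the same record. The XML below is a hypothetical OAI-PMH
# response fragment; BeautifulSoup's 'xml' parser needs lxml installed.
from hashlib import md5
from bs4 import BeautifulSoup

response = (b'<OAI-PMH>'
            b'<responseDate>2023-01-01T00:00:00Z</responseDate>'
            b'<GetRecord><record><header>'
            b'<identifier>oai:example:1</identifier>'
            b'</header></record></GetRecord>'
            b'</OAI-PMH>')

# hash the GetRecord element only, not the whole response
element = str(BeautifulSoup(response, 'xml').GetRecord).encode('utf-8')
print(md5(element).hexdigest())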
def update_resource_sync(self, capabilitylist_data):
    """
    ``build step 6`` :samp:`Update description with newly created capabilitylist`

    :param capabilitylist_data: :class:`SitemapData` over the newly created capabilitylist
    :return: :class:`SitemapData` over updated description
    """
    src_desc_path = self.para.abs_description_path()
    well_known_dir = os.path.dirname(src_desc_path)
    os.makedirs(well_known_dir, exist_ok=True)

    src_description = SourceDescription()
    if os.path.exists(src_desc_path):
        src_description = self.read_sitemap(src_desc_path, src_description)

    src_description.add(
        Resource(uri=capabilitylist_data.uri,
                 capability=Capability.capabilitylist.name),
        replace=True)

    sitemap_data = SitemapData(len(src_description), -1,
                               self.para.description_url(), src_desc_path,
                               Capability.description.name)

    if self.para.is_saving_sitemaps:
        self.save_sitemap(src_description, src_desc_path)
        sitemap_data.document_saved = True

    self.observers_inform(self, ExecutorEvent.completed_document,
                          document=src_description,
                          sitemap_data=sitemap_data)
    return sitemap_data
def create_index(self, sitemap_data_iter: iter) -> SitemapData:
    changelist_index_path = self.param.abs_metadata_path(
        "changelist-index.xml")
    changelist_index_uri = self.param.uri_from_path(changelist_index_path)
    if os.path.exists(changelist_index_path):
        os.remove(changelist_index_path)
    changelist_files = sorted(
        glob(self.param.abs_metadata_path("changelist_*.xml")))
    if len(changelist_files) > 1:
        changelist_index = ChangeList()
        changelist_index.sitemapindex = True
        changelist_index.md_from = self.date_resourcelist_completed
        for cl_file in changelist_files:
            changelist = self.read_sitemap(cl_file, ChangeList())
            uri = self.param.uri_from_path(cl_file)
            changelist_index.resources.append(
                Resource(uri=uri,
                         md_from=changelist.md_from,
                         md_until=changelist.md_until))
            if self.param.is_saving_sitemaps:
                index_link = changelist.link("index")
                if index_link is None:
                    changelist.link_set(rel="index",
                                        href=changelist_index_uri)
                    self.save_sitemap(changelist, cl_file)
        # the annotation promises a SitemapData, so return what
        # finish_sitemap builds (cf. create_capabilitylist above)
        return self.finish_sitemap(-1, changelist_index)
def get_change_dump_index(self):
    """Get change dump index.

    :return: (xml) change dump index content
    """
    if not self._validation():
        return None
    changedump = ListBaseWithIndex(capability_name='changedump')
    changedump.up = INVENIO_CAPABILITY_URL.format(request.url_root)
    published_date = self.publish_date or datetime.datetime.utcnow()
    change_date = published_date
    day_now = datetime.datetime.now()
    while change_date < day_now:
        until = change_date + timedelta(days=self.interval_by_date)
        if until > day_now:
            until = day_now
        change = Resource(
            '{}/{}/changedump.xml'.format(self.url_path,
                                          change_date.strftime(r"%Y%m%d")),
            capability='changedump',
            md_from=str(change_date.replace(
                tzinfo=datetime.timezone.utc).isoformat()),
            md_until=str(until.replace(
                tzinfo=datetime.timezone.utc).isoformat()))
        changedump.add(change)
        change_date = until
    return changedump.as_xml()
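# A self-contained sketch of the windowing loop in get_change_dump_index:
# the span from the publish date to now is cut into fixed-size
# [md_from, md_until) intervals, with the last window clamped to the current
# time. interval_by_date mirrors the attribute used above; the start date is
# illustrative.
import datetime
from datetime import timedelta

interval_by_date = 3  # days per changedump window (assumed setting)
change_date = datetime.datetime(2023, 1, 1)
day_now = datetime.datetime.now()
while change_date < day_now:
    until = min(change_date + timedelta(days=interval_by_date), day_now)
    print(change_date.isoformat(), '->', until.isoformat())
    change_date = until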
def generator(filenames: iter, count=0) -> [int, Resource]:
    passes_gate = self.resource_gate()
    for filename in filenames:
        if not isinstance(filename, str):
            LOG.warning("Not a string: %s" % filename)
            filename = str(filename)
        file = os.path.abspath(filename)
        if not os.path.exists(file):
            LOG.warning("File does not exist: %s" % file)
        elif os.path.isdir(file):
            # recurse into directories, threading the running count through
            for cr, rsc in generator(self.walk_directories(file), count=count):
                yield cr, rsc
                count = cr
        elif os.path.isfile(file):
            if passes_gate(file):
                count += 1
                path = os.path.relpath(file, self.para.resource_dir)
                uri = self.para.url_prefix + defaults.sanitize_url_path(path)
                stat = os.stat(file)
                resource = Resource(uri=uri,
                                    length=stat.st_size,
                                    lastmod=defaults.w3c_datetime(stat.st_ctime),
                                    md5=defaults.md5_for_file(file),
                                    mime_type=defaults.mime_type(file))
                yield count, resource
                self.observers_inform(self, ExecutorEvent.created_resource,
                                      resource=resource,
                                      count=count,
                                      file=file)
            else:
                self.observers_inform(self, ExecutorEvent.rejected_file,
                                      file=file)
        else:
            LOG.warning("Not a regular file: %s" % file)
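# Sketch of the per-file Resource construction above, with plain stdlib
# stand-ins for the rspub-core defaults helpers (w3c_datetime, md5_for_file,
# mime_type), whose exact behavior is assumed from their names.
import hashlib
import mimetypes
import os
from datetime import datetime, timezone

from resync import Resource

def file_resource(path: str, uri: str) -> Resource:
    stat = os.stat(path)
    with open(path, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    return Resource(uri=uri,
                    length=stat.st_size,
                    lastmod=datetime.fromtimestamp(
                        stat.st_mtime, timezone.utc).isoformat(),
                    md5=digest,
                    mime_type=mimetypes.guess_type(path)[0])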
def get_capability_content(cls):
    """Get capability list content.

    :return: (Resource Obj) list resource dump and resource list
    """
    list_resource = cls.get_list_resource()
    caplist = []
    for resource in list_resource:
        if resource._validation():
            caplist.append(
                Resource('{}/resourcelist.xml'.format(resource.url_path),
                         capability='resourcelist'))
            caplist.append(
                Resource('{}/resourcedump.xml'.format(resource.url_path),
                         capability='resourcedump'))
    return caplist
def get_capability_content(cls):
    """Get capability list content.

    :return: (Resource Obj) list change dump and change list
    """
    list_change = cls.get_all()
    caplist = []
    for change in list_change:
        if change._validation():
            caplist.append(
                Resource('{}/changelist.xml'.format(change.url_path),
                         capability='changelist'))
            caplist.append(
                Resource('{}/changedump.xml'.format(change.url_path),
                         capability='changedump'))
    return caplist
def generate(self):
    url = "http://www.resourcesync.org"
    m = md5()
    m.update(url.encode("utf8"))
    rm = Resource(uri=url,
                  lastmod="2016-10-01",
                  md5=m.hexdigest(),
                  length=20,
                  mime_type="application/xml")
    return [rm]
def render_well_know_resourcesync():
    """Generate source description xml."""
    cap = ListBaseWithIndex(
        capability_name='description',
        ln=[{
            'href': request.url_root,
            'rel': 'describedby'
        }])
    cap.add(
        Resource('{}resync/capability.xml'.format(request.url_root),
                 capability='capability'))
    return cap.as_xml()
def generate(self):
    url = "http://www.resourcesync.org"
    m = md5()
    body = url + "new changes"
    m.update(body.encode("utf8"))
    rm = Resource(uri=url,
                  lastmod="2017-06-14",
                  md5=m.hexdigest(),
                  length=len(body),
                  mime_type="application/xml")
    return [rm]
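# How a generate() hook like the two test fixtures above is typically
# consumed: the returned (or yielded) Resource objects are collected into a
# list document and serialized, using only the resync API shown in the README
# snippet at the end of this section.
from hashlib import md5
from resync import Resource, ResourceList

def generate():
    # stand-in for the generate() methods above
    url = "http://www.resourcesync.org"
    return [Resource(uri=url,
                     lastmod="2017-06-14",
                     md5=md5(url.encode("utf8")).hexdigest())]

rl = ResourceList()
for resource in generate():
    rl.add(resource)
print(rl.as_xml())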
def create_index(self, sitemap_data_iter: iter) -> SitemapData:
    changedump_index_path = self.param.abs_metadata_path(
        "changedump-index.xml")
    changedump_index_uri = self.param.uri_from_path(changedump_index_path)
    if os.path.exists(changedump_index_path):
        os.remove(changedump_index_path)
    changelist_files = sorted(
        glob(self.param.abs_metadata_path("changedump_*.xml")))
    changedump_files = sorted(
        glob(self.param.abs_metadata_path("cd_*.zip")))
    if len(changelist_files) > 1:
        changedump_index = ChangeDump()
        changedump_index.modified = defaults.w3c_now()
        for cl_file, cd_file in zip(changelist_files, changedump_files):
            changelist = self.read_sitemap(cl_file, ChangeDump())
            uri = self.param.uri_from_path(cd_file)
            lastmod = str(
                defaults.reformat_datetime(
                    defaults.file_modification_date(cd_file)))
            md5 = defaults.md5_for_file(cd_file)
            mime_type = defaults.mime_type(cd_file)
            cd_length = os.path.getsize(cd_file)
            cd = Resource(uri=uri,
                          length=cd_length,
                          lastmod=lastmod,
                          md5=md5,
                          mime_type=mime_type,
                          ln=[{
                              'rel': 'contents',
                              'href': cl_file
                          }])
            changedump_index.add(cd)
            if self.param.is_saving_sitemaps:
                index_link = changelist.link("index")
                if index_link is None:
                    changelist.link_set(rel="index",
                                        href=changedump_index_uri)
                    self.save_sitemap(changelist, cl_file)
        return self.finish_sitemap(-1, changedump_index)
def get_resource_dump_xml(self, from_date=None, to_date=None):
    """Get content of resource dump.

    :return: (xml) resource dump content
    """
    if not self._validation():
        return None
    from .utils import parse_date
    if from_date:
        from_date = parse_date(from_date)
    if to_date:
        to_date = parse_date(to_date)
    r = get_items_by_index_tree(self.repository_id)
    rd = ResourceDump()
    rd.up = INVENIO_CAPABILITY_URL.format(request.url_root)
    for item in r:
        if item:
            resource_date = parse_date(item.get('_source').get('_updated'))
            if from_date and from_date > resource_date:
                continue
            if to_date and to_date < resource_date:
                continue
            id_item = item.get('_source').get('control_number')
            url = '{}resync/{}/{}/file_content.zip'.format(
                request.url_root, self.repository_id, str(id_item))
            rs = Resource(url,
                          lastmod=item.get('_source').get('_updated'),
                          ln=[])
            if self.resource_dump_manifest:
                href = '{}resync/{}/{}/resourcedump_manifest.xml'.format(
                    request.url_root, self.repository_id, str(id_item))
                rs.ln.append({
                    'rel': 'contents',
                    'href': href,
                    'type': 'application/xml'
                })
            rd.add(rs)
    return rd.as_xml()
def create_index(self, sitemap_data_iter: iter):
    if len(sitemap_data_iter) > 1:
        resourcelist_index = ResourceList()
        resourcelist_index.sitemapindex = True
        resourcelist_index.md_at = self.date_start_processing
        resourcelist_index.md_completed = self.date_end_processing
        index_path = self.param.abs_metadata_path("resourcelist-index.xml")
        rel_index_path = os.path.relpath(index_path, self.param.resource_dir)
        index_url = self.param.url_prefix + defaults.sanitize_url_path(
            rel_index_path)
        resourcelist_index.link_set(rel="up",
                                    href=self.param.capabilitylist_url())
        for sitemap_data in sitemap_data_iter:
            resourcelist_index.add(
                Resource(uri=sitemap_data.uri,
                         md_at=sitemap_data.doc_start,
                         md_completed=sitemap_data.doc_end))
            if sitemap_data.document_saved:
                self.update_rel_index(index_url, sitemap_data.path)
        self.finish_sitemap(-1, resourcelist_index)
def solr_results_to_resourcesync_resource(self, a_result):
    if self.params['metadata_disseminator'] != '':
        uri = self.params['metadata_disseminator'].replace(
            '_ID_', a_result['id'])
    else:
        uri = a_result['id']
    lastmod = a_result['timestamp']
    # do a GET request for each record to retrieve the 'content-length'
    r = get(uri)
    length = len(r.content)
    # compute md5 of the metadata record
    m = md5()
    element = str(r.content).encode('utf-8')
    m.update(element)
    return Resource(uri=uri,
                    lastmod=lastmod,
                    md5=m.hexdigest(),
                    length=length,
                    mime_type="text/xml")
def get_change_list_content_xml(self, from_date,
                                from_date_args=None, to_date_args=None):
    """Get change list xml.

    :return: (xml) change list content
    """
    if not self._validation():
        return None
    from .utils import parse_date
    if from_date_args:
        from_date_args = parse_date(from_date_args)
    if to_date_args:
        to_date_args = parse_date(to_date_args)
    change_list = ChangeList()
    change_list.up = INVENIO_CAPABILITY_URL.format(request.url_root)
    change_list.index = '{}resync/{}/changelist.xml'.format(
        request.url_root, self.repository_id)
    record_changes = self._get_record_changes_with_interval(from_date)
    for data in record_changes:
        try:
            if from_date_args and from_date_args > parse_date(
                    data.get("updated")):
                continue
            if to_date_args and to_date_args < parse_date(
                    data.get("updated")):
                continue
            pid_object = PersistentIdentifier.get('recid',
                                                  data.get('record_id'))
            latest_pid = PIDVersioning(child=pid_object).last_child
            is_latest = str(latest_pid.pid_value) == "{}.{}".format(
                data.get('record_id'), data.get('record_version'))
            # old versions point at their versioned URI; the latest version
            # and deleted records point at the unversioned record URI
            if not is_latest and data.get('status') != 'deleted':
                loc = '{}resync/{}/records/{}'.format(
                    request.url_root, self.repository_id,
                    '{}.{}'.format(data.get('record_id'),
                                   data.get('record_version')))
            else:
                loc = '{}resync/{}/records/{}'.format(
                    request.url_root, self.repository_id,
                    data.get('record_id'))
            rc = Resource(loc,
                          lastmod=data.get("updated"),
                          change=data.get('status'),
                          md_at=data.get("updated"))
            change_list.add(rc)
        except Exception:
            current_app.logger.error('-' * 60)
            traceback.print_exc(file=sys.stdout)
            current_app.logger.error('-' * 60)
            continue
    return change_list.as_xml()
def get_change_dump_manifest_xml(self, record_id):
    """Get change dump manifest xml.

    :param record_id: Identifier of record
    :return: (xml) change dump manifest content
    """
    if not self._is_record_in_index(record_id) or not self._validation():
        return None
    cdm = ChangeDumpManifest()
    cdm.up = '{}resync/{}/changedump.xml'.format(request.url_root,
                                                 self.repository_id)
    if self.change_dump_manifest:
        prev_id, prev_ver_id = record_id.split(".")
        current_record = WekoRecord.get_record_by_pid(record_id)
        from .utils import get_pid
        prev_record_pid = get_pid('{}.{}'.format(
            prev_id, str(int(prev_ver_id) - 1)))
        if prev_record_pid:
            prev_record = WekoRecord.get_record(
                id_=prev_record_pid.object_uuid)
        else:
            prev_record = None
        if current_record:
            list_file = [file for file in current_record.files]
            current_checksum = [
                file.info().get('checksum')
                for file in current_record.files
            ]
            prev_checksum = []
            if prev_record:
                list_file.extend([file for file in prev_record.files])
                prev_checksum = [
                    file.info().get('checksum')
                    for file in prev_record.files
                ]
            for file in list_file:
                file_info = file.info()
                # diff the two versions by checksum: present in both ->
                # unchanged; only in previous -> deleted; only in current
                # -> created
                change = None
                if file_info.get('checksum') in prev_checksum:
                    if file_info.get('checksum') not in current_checksum:
                        change = 'deleted'
                elif file_info.get('checksum') in current_checksum:
                    change = 'created'
                path = 'recid_{}/{}'.format(current_record.get('recid'),
                                            file_info.get('key'))
                lastmod = str(datetime.datetime.utcnow().replace(
                    tzinfo=datetime.timezone.utc).isoformat())
                if change:
                    re = Resource(
                        '{}record/{}/files/{}'.format(
                            request.url_root, current_record.get('recid'),
                            file_info.get('key')),
                        lastmod=lastmod,
                        sha256=file_info.get('checksum').split(':')[1],
                        length=str(file_info.get('size')),
                        path=path if change != 'deleted' else '',
                        change=change)
                    cdm.add(re)
    return cdm.as_xml()
#!/usr/bin/env python
if (True):  # keep indentation of README
    from resync import Resource, ResourceList

    rl = ResourceList()
    rl.add(Resource('http://example.com/res1', lastmod='2013-01-01'))
    rl.add(Resource('http://example.com/res2', lastmod='2013-01-02'))
    print(rl.as_xml())