def crawl_archives(self):
    """Synchronise the index with the site's archive storage.

    Every entry of ``archival.Storage(self.site).archives`` is mapped to a
    URL (its ``view_url`` when set, otherwise its ``url``) which is passed
    through ``aws.swap_url`` together with the ``castle.aws_s3_base_url``
    registry value. Those URLs are compared with the ids returned by
    ``self.get_all_from_es`` for the ``sitemap: archives`` term filter:

    * URLs that are not indexed yet are crawled; a failure on one URL is
      logged and does not stop the remaining ones.
    * indexed ids that no longer correspond to an archive are deleted.
    """
    registry = getUtility(IRegistry)
    base_url = registry.get('castle.aws_s3_base_url', None)

    storage = archival.Storage(self.site)
    archive_urls = set()
    for _key, archive_data in storage.archives.items():
        url = archive_data.get('view_url', None) or archive_data['url']
        archive_urls.add(aws.swap_url(url, base_url=base_url))

    query = {"bool": {"filter": {"term": {"sitemap": "archives"}}}}
    # Build each set exactly once; both difference operations below reuse
    # them instead of re-hashing the full URL lists a second time.
    indexed_urls = set(self.get_all_from_es(query))

    # archives do not need to be re-indexed ever, so only pages that have
    # not yet been crawled are visited
    for _id in archive_urls - indexed_urls:
        try:
            self.crawl_archive_url(_id)
        except Exception:
            # lazy %-formatting: the handler itself cannot fail on a
            # non-string id the way string concatenation could
            logger.error(
                'Error indexing archive url: %s', _id, exc_info=True)

    # pages that have been removed from the archive
    for _id in indexed_urls - archive_urls:
        self.delete_from_index(_id)
def test_swap_url(self):
    # An S3 resource URL must come back re-rooted on the supplied base URL,
    # keeping the path from ``archives/`` onwards.
    base = 'http://foo.com/'
    s3_location = 'https://s3-us-gov-west-1.amazonaws.com/bucketname/archives/path/to/resource'
    expected = 'http://foo.com/archives/path/to/resource'

    self.assertEqual(expected, aws.swap_url(s3_location, base_url=base))
def __call__(self):
    """Answer a request for missing content.

    The request is first passed through ``shield.protect``. The original
    context is kept on ``self.notfound`` and ``self.context`` is replaced
    by the portal object.

    * URLs containing ``++`` get a 404 status and the rendered template
      straight away; if rendering fails the plain string ``"not found"``
      is returned instead.
    * Otherwise the path relative to the site URL is looked up in the
      archive storage (by uid for ``/resolveuid`` paths, by path for
      everything else; a trailing ``/view`` is stripped and passed on as
      ``wants_view``). When an archive URL is found, ``Redirect`` is
      raised with that URL, the original query string appended, run
      through ``aws.swap_url``.
    * If nothing is found, ``self.attempt_redirect()`` is called, the
      status is set to 404 and the template is rendered.

    Raises:
        Redirect: when an archived copy of the requested URL exists.
    """
    shield.protect(self.request, recheck=True)
    self.notfound = self.context
    self.context = api.portal.get()
    if '++' in self.request.URL:
        self.request.response.setStatus(404)
        try:
            return self.index()
        except Exception:
            # ``warning`` replaces the deprecated ``warn`` alias; the
            # traceback is included so the rendering failure can be found
            logger.warning(
                "Failed to render 404 template, had to return simple response",
                exc_info=True
            )
            return "not found"

    archive_storage = archival.Storage(self.context)
    site_url = self.context.absolute_url()
    path = self.request.ACTUAL_URL[len(site_url):].rstrip('/')

    wants_view = False
    if path.endswith('/view'):
        wants_view = True
        path = path.rsplit('/view', 1)[0]

    new_url = None
    if path.startswith('/resolveuid'):
        uid = path.replace('/resolveuid/', '')
        try:
            new_url = archive_storage.get_archive_url_by_uid(uid)
        except Exception:
            # best effort: a failed lookup falls through to the 404 page,
            # but is no longer completely invisible
            logger.debug(
                'Archive lookup by uid failed for %r', uid, exc_info=True)
    else:
        try:
            new_url = archive_storage.get_archive_url_by_path(
                path, wants_view)
        except Exception:
            logger.debug(
                'Archive lookup by path failed for %r', path, exc_info=True)

    if new_url:
        # XXX need to force redirect this way since normal redirect
        # gets overridden with 404
        if self.request.environ.get('QUERY_STRING'):
            new_url += '?' + self.request.environ['QUERY_STRING']
        raise Redirect(aws.swap_url(new_url))

    self.attempt_redirect()

    self.request.response.setStatus(404)
    return self.index()
def fix_urls(storage, dom):
    """Rewrite every S3 URL found in ``dom`` through ``aws.swap_url``.

    Each class in ``storage.Movers`` is instantiated with ``dom`` and asked
    for its elements. An element is rewritten only when its URL is not
    ``None`` and ``is_s3_url`` accepts it against the parsed endpoint URL
    of the storage's S3 client. A non-empty ``original-url`` attribute
    present before the rewrite is written back afterwards.
    """
    endpoint = urlparse(storage.s3_conn.meta.client.meta.endpoint_url)

    for mover_factory in storage.Movers:
        mover = mover_factory(dom)
        for element in mover.get_elements():
            link = mover.get_url(element)
            # only urls that exist and point at s3 are of interest
            if link is None or not is_s3_url(link, endpoint):
                continue

            # need to maintain the original original
            if 'original-url' in element.attrib:
                preserved = element.attrib['original-url']
            else:
                preserved = None

            mover.modify(element, aws.swap_url(link))

            if preserved:
                element.attrib['original-url'] = preserved
def fix_urls(storage, dom):
    """Rewrite every S3 URL found in ``dom`` through ``aws.swap_url``.

    Each class in ``storage.Movers`` is instantiated with ``dom`` and asked
    for its elements. An element is rewritten only when its URL is not
    ``None`` and the URL's network location equals
    ``storage.s3_conn.server_name()``. A non-empty ``original-url``
    attribute present before the rewrite is written back afterwards.
    """
    # The S3 host name does not depend on the element being inspected, so
    # it is computed once up front rather than once per element.
    s3_host = storage.s3_conn.server_name()

    for Mover in storage.Movers:
        mover = Mover(dom)
        for el in mover.get_elements():
            url = mover.get_url(el)
            if url is None:
                continue
            # check that the url is an s3 url
            if urlparse(url).netloc != s3_host:
                continue
            original = None
            if 'original-url' in el.attrib:
                # need to maintain the original original
                original = el.attrib['original-url']
            mover.modify(el, aws.swap_url(url))
            if original:
                el.attrib['original-url'] = original
def __call__(self):
    """Answer a request for missing content.

    The request is first passed through ``shield.protect``. The original
    context is kept on ``self.notfound`` and ``self.context`` is replaced
    by the portal object. The path relative to the site URL is looked up
    in the archive storage (by uid for ``/resolveuid`` paths, by path for
    everything else; a trailing ``/view`` is stripped and passed on as
    ``wants_view``). When an archive URL is found, ``Redirect`` is raised
    with that URL, the original query string appended, run through
    ``aws.swap_url``. Otherwise the ``plone_redirector_view`` adapter, if
    available, is asked to attempt a redirect and the template is rendered.

    Raises:
        Redirect: when an archived copy of the requested URL exists.
    """
    shield.protect(self.request)
    self.notfound = self.context
    self.context = api.portal.get()
    archive_storage = archival.Storage(self.context)
    site_url = self.context.absolute_url()
    path = self.request.ACTUAL_URL[len(site_url):].rstrip('/')

    wants_view = False
    if path.endswith('/view'):
        wants_view = True
        path = path.rsplit('/view', 1)[0]

    new_url = None
    if path.startswith('/resolveuid'):
        uid = path.replace('/resolveuid/', '')
        try:
            new_url = archive_storage.get_archive_url_by_uid(uid)
        except Exception:
            # Deliberately best effort: a failed lookup falls through to
            # the not-found page. ``Exception`` (not a bare ``except``) so
            # SystemExit / KeyboardInterrupt still propagate.
            pass
    else:
        try:
            new_url = archive_storage.get_archive_url_by_path(path, wants_view)
        except Exception:
            # same best-effort handling as the uid lookup
            pass
    if new_url:
        # XXX need to force redirect this way since normal redirect
        # gets overridden with 404
        if self.request.environ.get('QUERY_STRING'):
            new_url += '?' + self.request.environ['QUERY_STRING']
        raise Redirect(aws.swap_url(new_url))

    # seems this overrides plone.app.redirector handler
    redirector = queryMultiAdapter((self.context, self.request),
                                   name=u'plone_redirector_view')
    if redirector:
        redirector.attempt_redirect()

    return self.index()
def transform_content(self, content, from_url):
    """Rewrite same-site resource URLs in ``content`` to their moved copies.

    ``content`` is parsed with ``fromstring``. For every element reported
    by each class in ``self.Movers`` the URL is made absolute relative to
    ``from_url``. URLs on another domain are left alone. A URL not yet in
    ``self.resources`` is handed to ``self.move_resource`` and, when that
    returns a value, recorded there. Elements whose URL is in
    ``self.resources`` are modified to point at the recorded URL run
    through ``aws.swap_url``.

    The serialised document is then passed through every utility
    registered for ``IArchiveContentTransformer``; a failing utility is
    logged and skipped.

    Args:
        content: markup to transform.
        from_url: URL the markup was fetched from; used to resolve
            relative URLs and to decide what counts as "on the site".

    Returns:
        The transformed, serialised content.
    """
    parsed_url = urlparse(from_url)
    domain = parsed_url.netloc
    dom = fromstring(content)
    for Mover in self.Movers:
        mover = Mover(dom)
        for el in mover.get_elements():
            url = mover.get_url(el)
            if not url:
                # None or an empty attribute value: nothing to move, and
                # indexing into '' would raise IndexError
                continue
            if url.startswith('//'):
                # protocol-relative (network-path) reference: it already
                # names its own host, so only the scheme is inherited
                url = '{}:{}'.format(parsed_url.scheme, url)
            elif url.startswith('/'):
                url = '{}://{}{}'.format(parsed_url.scheme, domain, url)
            elif 'https://' not in url and 'http://' not in url:
                url = urljoin(from_url, url)

            # check that the url is on the site...
            rdomain = urlparse(url).netloc
            if rdomain and domain != rdomain:
                continue

            if url not in self.resources:
                # need to move resource
                resource_url = url
                if not url.startswith('http'):
                    resource_url = urljoin(from_url, url)
                moved_url = self.move_resource(resource_url, mover.keep_ext)
                if moved_url:
                    self.resources[url] = moved_url

            if url in self.resources:
                mover.modify(el, aws.swap_url(self.resources[url]))

    content = tostring(dom)
    for Util in getAllUtilitiesRegisteredFor(IArchiveContentTransformer):
        try:
            util = Util(self)
            content = util(content)
        except Exception:
            # one broken transformer must not prevent archiving
            logger.info('Error with archive utility', exc_info=True)
    return content