def test_archive_transformers(self):
    """Transformed archive markup should contain the replacement text."""
    storage = archival.Storage(self.portal)
    markup = '<html><body><div class="foo">foo</div></body></html>'
    transformed = storage.transform_content(markup, 'http://foobar.com')
    self.assertIn('>bar<', transformed)
def __init__(self, site, request):
    """Bind *site*/*request* and set up the S3 bucket + archive storage.

    The bucket name comes from the ``castle.aws_s3_bucket_name`` registry
    record and is kept on the instance for later reference.
    """
    self.site = site
    self.request = request
    self.configured_bucket_name = api.portal.get_registry_record(
        'castle.aws_s3_bucket_name')
    self.s3, self.bucket = aws.get_bucket(self.configured_bucket_name)
    self.archive_storage = archival.Storage(site)
def crawl_archives(self):
    """Index archive pages missing from ES and drop ones no longer archived.

    Archives never change once written, so they are never re-indexed:
    only the set difference in both directions is processed.
    """
    registry = getUtility(IRegistry)
    base_url = registry.get('castle.aws_s3_base_url', None)
    storage = archival.Storage(self.site)
    # urls of everything currently held in the archive storage
    archived_urls = [
        aws.swap_url(
            archive_data.get('view_url', None) or archive_data['url'],
            base_url=base_url)
        for archive_data in storage.archives.values()
    ]
    query = {"bool": {"filter": {"term": {"sitemap": "archives"}}}}
    indexed_urls = self.get_all_from_es(query)
    # pages that have not yet been crawled
    for url in set(archived_urls) - set(indexed_urls):
        try:
            self.crawl_archive_url(url)
        except Exception:
            logger.error('Error indexing archive url: ' + url, exc_info=True)
    # pages that have been removed from the archive
    for url in set(indexed_urls) - set(archived_urls):
        self.delete_from_index(url)
def test_archive_replacement_selector(self):
    """A CSS-selector replacement key should fill the matched element."""
    storage = archival.Storage(self.portal)
    storage.replacements = {'.foobar': 'barfoo'}
    original = ('<html><body><div class="foobar"></div>'
                '</body></html>')
    expected = '<html><body><div class="foobar">barfoo</div></body></html>'
    self.assertEqual(expected, storage.apply_replacements(original))
def test_archive_transformers(self):
    """Transform still works (and substitutes text) for a logged-in manager."""
    login(self.portal, TEST_USER_NAME)
    setRoles(self.portal, TEST_USER_ID, ('Member', 'Manager'))
    storage = archival.Storage(self.portal)
    markup = '<html><body><div class="foo">foo</div></body></html>'
    transformed = storage.transform_content(markup, 'http://foobar.com')
    self.assertIn('>bar<', transformed)
def test_archive_replacement_text(self):
    """A plain-text replacement key should rewrite matching text nodes."""
    storage = archival.Storage(self.portal)
    storage.replacements = {'foobar': 'barfoo'}
    original = ('<html><body><div>foobar</div>'
                '</body></html>')
    expected = '<html><body><div>barfoo</div></body></html>'
    self.assertEqual(expected, storage.apply_replacements(original))
def test_archive_replacement_text(self):
    """Text replacement also works while logged in as a manager."""
    login(self.portal, TEST_USER_NAME)
    setRoles(self.portal, TEST_USER_ID, ('Member', 'Manager'))
    storage = archival.Storage(self.portal)
    storage.replacements = {'foobar': 'barfoo'}
    original = ('<html><body><div>foobar</div>'
                '</body></html>')
    expected = '<html><body><div>barfoo</div></body></html>'
    self.assertEqual(expected, storage.apply_replacements(original))
def test_is_s3_url(self):
    """is_s3_url should reject local endpoints and accept amazonaws hosts."""
    site = api.portal.get()
    storage = archival.Storage(site)
    parsed_endpoint = urlparse(
        storage.s3_conn.meta.client.meta.endpoint_url)
    local_url = "http://localhost:9000/{}/{}".format(
        self.test_bucket_name, self.test_access_key)
    self.assertFalse(is_s3_url(local_url, parsed_endpoint))
    amazon_url = "https://s3.amazonaws.com/{}/{}".format(
        self.test_bucket_name, self.test_access_key)
    self.assertTrue(is_s3_url(amazon_url, parsed_endpoint))
def __call__(self):
    # 404 handler: before rendering the not-found page, try to redirect
    # the visitor to an archived (S3) copy of the requested content.
    shield.protect(self.request, recheck=True)
    # remember the original not-found context; render from the site root
    self.notfound = self.context
    self.context = api.portal.get()
    if '++' in self.request.URL:
        # resource-style url (++plone++, ++resource++, ...): do not try the
        # archive, just answer 404 with the template (or a plain string if
        # the template itself fails to render)
        self.request.response.setStatus(404)
        try:
            return self.index()
        except Exception:
            logger.warn(
                "Failed to render 404 template, had to return simple response"
            )
        return "not found"
    archive_storage = archival.Storage(self.context)
    site_url = self.context.absolute_url()
    # path of the request relative to the site root
    path = self.request.ACTUAL_URL[len(site_url):].rstrip('/')
    wants_view = False
    if path.endswith('/view'):
        # strip the trailing /view but remember it for the archive lookup
        wants_view = True
        path = path.rsplit('/view', 1)[0]
    new_url = None
    if path.startswith('/resolveuid'):
        uid = path.replace('/resolveuid/', '')
        try:
            new_url = archive_storage.get_archive_url_by_uid(uid)
        except Exception:
            # no archived copy for this uid -- fall through to plain 404
            pass
    else:
        try:
            new_url = archive_storage.get_archive_url_by_path(
                path, wants_view)
        except Exception:
            # no archived copy for this path -- fall through to plain 404
            pass
    if new_url:
        # XXX need to force redirect this way since normal redirect
        # gets overridden with 404
        if self.request.environ.get('QUERY_STRING'):
            new_url += '?' + self.request.environ['QUERY_STRING']
        raise Redirect(aws.swap_url(new_url))
    # no archive hit: give the redirector machinery a chance, then 404
    self.attempt_redirect()
    self.request.response.setStatus(404)
    return self.index()
def test_move_resource(self):
    """Moving a resource to S3 is idempotent and leaves the object stored."""
    # this is creating a bucket in the moto/mock s3 service
    s3conn = boto3.resource('s3', endpoint_url=self.test_bucket_endpoint)
    s3conn.create_bucket(Bucket='castletest')
    s3, bucket = aws.get_bucket("castletest")
    storage = archival.Storage(self.portal)

    moveresource = api.content.create(type='Document', id='moveresource',
                                      container=self.portal)
    moveresource.content = 'this is some content'
    api.content.transition(moveresource, 'publish')
    url = self.portal.absolute_url() + "/moveresource"

    def assert_object_stored(resource_url):
        # trim off, e.g., 'https://s3.amazonaws.com/bucketname'
        # and then convert the path back from the url escaped version
        droppart = "{}/{}/".format(self.test_bucket_endpoint,
                                   self.test_bucket_name)
        content_path = unquote_plus(resource_url[len(droppart):])
        try:
            bucket.Object(content_path).load()
        except botocore.exceptions.ClientError:
            self.fail("object does not exist after move")

    new_url = storage.move_resource(url, use_vhm=False)
    self.assertIsNotNone(new_url)
    assert_object_stored(new_url)

    # move by url of content again
    new_url2 = storage.move_resource(url, use_vhm=False)
    self.assertEqual(new_url, new_url2)
    # test for existence of content in aws still
    assert_object_stored(new_url)
def test_move_to_aws(self):
    """move_to_aws stores content under CONTENT_KEY_PREFIX in the bucket."""
    # this is creating a bucket in the moto/mock s3 service
    s3conn = boto3.resource('s3', endpoint_url=self.test_bucket_endpoint)
    s3conn.create_bucket(Bucket='castletest')
    s3, bucket = aws.get_bucket("castletest")
    storage = archival.Storage(self.portal)
    content = "this is a test"
    content_path = "a/test/path/for/this/test.html"
    content_type = "text/html; charset=utf-8"
    # the key should not be there before we run through this
    # NOTE: check the *prefixed* key -- that is where move_to_aws writes;
    # the previous check of the bare path was vacuous (never written)
    self.assertRaises(
        botocore.exceptions.ClientError,
        lambda: bucket.Object(
            archival.CONTENT_KEY_PREFIX + content_path).load())
    storage.move_to_aws(content, content_path, content_type)
    try:
        bucket.Object(archival.CONTENT_KEY_PREFIX + content_path).load()
    except botocore.exceptions.ClientError:
        self.fail("object does not exist after move")
def __call__(self):
    """404 view: redirect to an archived (S3) copy of the content if any.

    Falls back to plone.app.redirector's handling and finally renders
    the 404 template.
    """
    shield.protect(self.request)
    # remember the original not-found context; render from the site root
    self.notfound = self.context
    self.context = api.portal.get()
    archive_storage = archival.Storage(self.context)
    site_url = self.context.absolute_url()
    # path of the request relative to the site root
    path = self.request.ACTUAL_URL[len(site_url):].rstrip('/')
    wants_view = False
    if path.endswith('/view'):
        wants_view = True
        path = path.rsplit('/view', 1)[0]
    new_url = None
    if path.startswith('/resolveuid'):
        uid = path.replace('/resolveuid/', '')
        try:
            new_url = archive_storage.get_archive_url_by_uid(uid)
        except Exception:
            # was a bare ``except:`` -- that would also swallow
            # SystemExit/KeyboardInterrupt; lookup failure just means
            # there is no archived copy for this uid
            pass
    else:
        try:
            new_url = archive_storage.get_archive_url_by_path(path,
                                                              wants_view)
        except Exception:
            # no archived copy for this path -- fall through to 404
            pass
    if new_url:
        # XXX need to force redirect this way since normal redirect
        # gets overridden with 404
        if self.request.environ.get('QUERY_STRING'):
            new_url += '?' + self.request.environ['QUERY_STRING']
        raise Redirect(aws.swap_url(new_url))
    # seems this overrides plone.app.redirector handler
    redirector = queryMultiAdapter((self.context, self.request),
                                   name=u'plone_redirector_view')
    if redirector:
        redirector.attempt_redirect()
    return self.index()
return '/'.join(parsed.path.split('/')[2:]) if __name__ == '__main__': login_as_admin(app) # noqa site = app[args.site_id] # noqa setSite(site) toremove = {} # uid: path catalog = api.portal.get_tool('portal_catalog') registry = getUtility(IRegistry) crawler_settings = registry.forInterface(ICrawlerConfiguration, prefix='castle') es = ElasticSearchCatalog(catalog) crawler = Crawler(site, crawler_settings, es) storage = archival.Storage(site) for key, archive_data in storage.archives.items(): for url in (archive_data.get('view_url'), archive_data['url']): if not url: continue resp = requests.get(url) if 'html' not in resp.headers.get('content-type'): continue print('processing ' + url) dom = fromstring(resp.content) prop = dom.cssselect('meta[property="og:url"]') fix_urls(storage, dom) html = tostring(dom) if html == resp.content:
# Import previously-exported archive urls (json file of path/uid records)
# into the site's archival storage.
parser = argparse.ArgumentParser(description='...')
parser.add_argument('--file', dest='file', default=False)
parser.add_argument('--site-id', dest='site_id', default='Plone')
parser.add_argument('--site-url', dest='site_url', default='')
args, _ = parser.parse_known_args()

user = app.acl_users.getUser('admin')  # noqa
newSecurityManager(None, user.__of__(app.acl_users))  # noqa
site = app[args.site_id]  # noqa
setSite(site)

# context manager ensures the export file is closed even if parsing fails
# (original opened/closed it by hand with no exception safety)
with open(args.file) as fi:
    items = json.loads(fi.read())

storage = archival.Storage(site, UrlOpener=archival.RequestsUrlOpener)

count = 0
for item in items:
    count += 1
    content_path = '/' + '/'.join(item['path'].split('/')[2:])
    url = args.site_url.rstrip('/') + content_path
    # need to export UID also
    new_url = storage.add_url(url, content_path, item['uid'])
    if new_url:
        print('imported %s -> %s' % (url, new_url))
    else:
        print('error importing %s' % (url, ))
    if count % 100 == 0:
        # commit in batches so a long import does not build one huge txn
        print('done with %i' % count)
        transaction.commit()
def _archive_content(obj):
    """Archive *obj* into the site's archival storage and sync the result."""
    site = getSite()
    store = archival.Storage(site)
    store.add_content(obj)
    _sync_and_store(store, IUUID(obj))
def archive(site):
    """Archive eligible published content from *site* out to S3.

    Skips content that is not anonymously viewable, deletes objects that
    were successfully archived, then emails site admins a warning listing
    content due to be archived within the next 7 days.
    """
    setup_site(site)
    # all of the S3 configuration must be present before anything is moved
    if (not api.portal.get_registry_record('castle.archival_enabled') or
            not api.portal.get_registry_record('castle.aws_s3_bucket_name') or
            not api.portal.get_registry_record('castle.aws_s3_key') or
            not api.portal.get_registry_record('castle.aws_s3_secret') or
            not api.portal.get_registry_record('plone.public_url')):
        logger.error(
            'Can not archive content. Either not enabled, S3 API not set or no public '
            'url set')
        return
    storage = archival.Storage(site)
    for brain in archival.getContentToArchive():
        try:
            ob = brain.getObject()
            container = aq_parent(ob)
            # never archive a site-root default page
            if (IPloneSiteRoot.providedBy(container) and
                    getDefaultPage(container) == ob.getId()):
                continue
            allowed = set(rolesForPermissionOn('View', ob))
            if 'Anonymous' not in allowed:
                # we can *not* archive unpublished content
                continue
            new_url = storage.add_content(ob)
            # resets login creds..
            login_as_admin(app)  # noqa
            if new_url:
                logger.warn('imported %s -> %s' % (ob.absolute_url(), new_url))
                # XXX might need to re-architect... might get conflict errors with how slow
                # archiving takes...
                api.content.delete(ob)
                transaction.commit()
            else:
                logger.error('error importing %s' % ob.absolute_url())
        except Exception:
            # was a bare ``except:`` -- that would also swallow
            # SystemExit/KeyboardInterrupt; keep going with the next brain
            logger.error('Error archiving %s' % brain.getPath(), exc_info=True)

    content_to_archive = archival.getContentToArchive(7)
    if len(content_to_archive) == 0:
        return

    backend_url = get_backend_url()
    # send out email warning of content about to be archived
    email_text = """
<p>Warning, this content will be archived in 7 days.
Login to <a href="{site_url}">{site_title}</a> to extend this content.
</p>
<ul>""".format(site_title=api.portal.get_registry_record('plone.site_title'),
               site_url=backend_url)
    site_url = api.portal.get().absolute_url()
    for brain in content_to_archive:
        url = brain.getURL()
        # link to the backend so editors land somewhere they can act
        url = url.replace(site_url, backend_url)
        email_text += """<li>
<a href="{url}">{title}</a></li>""".format(url=url, title=brain.Title)
    email_text += '</ul>'
    for user in api.user.get_users():
        roles = api.user.get_roles(user=user)
        # only warn users who can actually extend the content
        if ('Site Administrator' not in roles and 'Manager' not in roles):
            continue
        email = user.getProperty('email')
        if not email:
            continue
        name = user.getProperty('fullname') or user.getId()
        html = '<p>Hi {name},</p>'.format(name=name) + email_text
        send_email(
            recipients=email,
            subject="Content will be archived(Site: %s)" % (
                api.portal.get_registry_record('plone.site_title')),
            html=html)
def _archive_url(url, path, uid):
    """Fetch *url* via requests, store it in the archive and sync *uid*."""
    storage = archival.Storage(
        getSite(), UrlOpener=archival.RequestsUrlOpener)
    storage.add_url(url, path, uid)
    _sync_and_store(storage, uid)