def run(app):
    # ensure only one crawler process runs at a time
    singleton.SingleInstance('crawler')

    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa

    count = 0
    while True:
        try:
            if 'site-id' in sys.argv:
                # the site id value is expected to follow the 'site-id' flag
                siteid = sys.argv[sys.argv.index('site-id') + 1]
                setup_site(app[siteid])
                crawl_site(app[siteid])  # noqa
            else:
                # no site specified: crawl every Plone site in the Zope root
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        try:
                            setup_site(obj)
                            obj._p_jar.sync()
                            # do a full crawl on every 10th pass
                            crawl_site(obj, count % 10 == 0)
                        except Exception:
                            logger.error('Error crawling site %s' % oid, exc_info=True)
        except KeyError:
            pass
        except Exception:
            logger.error('Error setting up crawling', exc_info=True)

        logger.info('Waiting to crawl again')
        time.sleep(10 * 60)
        count += 1
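
# --- Hedged sketch (illustration only; not part of the original script) ---
# A standalone version of the argv handling assumed above. The helper name and
# the example arguments are hypothetical; it only shows the intent of the
# 'site-id' branch in run(): use the token following the flag, otherwise fall
# back to crawling every site.
def _example_get_site_id(argv):
    # ['crawler.py', 'site-id', 'Castle'] -> 'Castle'; no flag -> None
    if 'site-id' in argv:
        try:
            return argv[argv.index('site-id') + 1]
        except IndexError:
            return None
    return None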
    # `el`, `original`, `mover`, `aws` and `url` come from the enclosing loop (not shown)
    if 'original-url' in el.attrib:
        # keep the original 'original-url' value so it can be restored after the swap
        original = el.attrib['original-url']
    mover.modify(el, aws.swap_url(url))
    if original:
        el.attrib['original-url'] = original


def get_key_from_url(url):
    parsed = urlparse(url)
    # the parsed path includes the bucket name, so strip the first two
    # segments ('' and the bucket) to get the actual object key
    return '/'.join(parsed.path.split('/')[2:])


if __name__ == '__main__':
    login_as_admin(app)  # noqa
    site = app[args.site_id]  # noqa
    setSite(site)

    toremove = {}  # uid: path

    catalog = api.portal.get_tool('portal_catalog')
    registry = getUtility(IRegistry)
    crawler_settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    es = ElasticSearchCatalog(catalog)
    crawler = Crawler(site, crawler_settings, es)
    storage = archival.Storage(site)
    for key, archive_data in storage.archives.items():
        # check both the view url and the canonical url stored for the archive
        for url in (archive_data.get('view_url'), archive_data['url']):
            if not url:
                continue
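
# --- Hedged usage sketch (illustration only; the bucket name and URL below are
# hypothetical). For a path-style S3 URL, the parsed path is '/<bucket>/<key...>',
# so get_key_from_url() drops the first two segments ('' and the bucket) and
# returns the object key.
def _example_get_key_from_url_usage():
    url = 'https://s3.amazonaws.com/example-bucket/archives/page/index.html'
    key = get_key_from_url(url)
    assert key == 'archives/page/index.html'
    return key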
def archive(site):
    setup_site(site)

    if (not api.portal.get_registry_record('castle.archival_enabled') or
            not api.portal.get_registry_record('castle.aws_s3_bucket_name') or
            not api.portal.get_registry_record('castle.aws_s3_key') or
            not api.portal.get_registry_record('castle.aws_s3_secret') or
            not api.portal.get_registry_record('plone.public_url')):
        logger.error(
            'Can not archive content. Either archival is not enabled, the S3 API is '
            'not configured or no public url is set')
        return

    storage = archival.Storage(site)
    for brain in archival.getContentToArchive():
        try:
            ob = brain.getObject()
            container = aq_parent(ob)
            # never archive the default page of the site root
            if (IPloneSiteRoot.providedBy(container) and
                    getDefaultPage(container) == ob.getId()):
                continue

            allowed = set(rolesForPermissionOn('View', ob))
            if 'Anonymous' not in allowed:
                # we can *not* archive unpublished content
                continue

            new_url = storage.add_content(ob)

            # resets login creds..
            login_as_admin(app)  # noqa

            if new_url:
                logger.warning('archived %s -> %s' % (ob.absolute_url(), new_url))
                # XXX might need to re-architect... might get conflict errors with how
                # slow archiving takes...
                api.content.delete(ob)
                transaction.commit()
            else:
                logger.error('error archiving %s' % ob.absolute_url())
        except Exception:
            logger.error('Error archiving %s' % brain.getPath(), exc_info=True)

    # warn about content that will be archived within the next 7 days
    content_to_archive = archival.getContentToArchive(7)
    if len(content_to_archive) == 0:
        return

    backend_url = get_backend_url()
    # send out email warning of content about to be archived
    email_text = """
<p>Warning, this content will be archived in 7 days.
   Login to <a href="{site_url}">{site_title}</a> to extend this content.
</p>
<ul>""".format(site_title=api.portal.get_registry_record('plone.site_title'),
               site_url=backend_url)

    site_url = api.portal.get().absolute_url()
    for brain in content_to_archive:
        # rewrite public urls to point at the backend so editors can log in
        url = brain.getURL().replace(site_url, backend_url)
        email_text += """<li>
<a href="{url}">{title}</a></li>""".format(url=url, title=brain.Title)
    email_text += '</ul>'

    # only notify users who can actually extend the content
    for user in api.user.get_users():
        roles = api.user.get_roles(user=user)
        if 'Site Administrator' not in roles and 'Manager' not in roles:
            continue
        email = user.getProperty('email')
        if not email:
            continue

        name = user.getProperty('fullname') or user.getId()
        html = '<p>Hi {name},</p>'.format(name=name) + email_text
        send_email(
            recipients=email,
            subject="Content will be archived (Site: %s)" % (
                api.portal.get_registry_record('plone.site_title')),
            html=html)
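
# --- Hedged sketch (illustration only; not part of the cron job) ---
# A standalone rendering of how archive() assembles the 7-day warning email:
# a header paragraph, one <li> per item with its URL rewritten to the backend,
# then a closing </ul>. The helper name, titles and urls are hypothetical.
def _example_build_warning_email(items, site_title, site_url, backend_url):
    body = """
<p>Warning, this content will be archived in 7 days.
   Login to <a href="{site_url}">{site_title}</a> to extend this content.
</p>
<ul>""".format(site_title=site_title, site_url=backend_url)
    for title, url in items:
        body += """<li>
<a href="{url}">{title}</a></li>""".format(
            url=url.replace(site_url, backend_url), title=title)
    return body + '</ul>'


# Example (hypothetical data): one page due for archiving.
# print(_example_build_warning_email(
#     [('Old report', 'http://example.com/reports/old-report')],
#     'Example Site', 'http://example.com', 'http://backend.example.com'))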