def run(app):
    singleton.SingleInstance('crawler')

    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa

    count = 0

    while True:
        try:
            if 'site-id' in sys.argv:
                # sys.argv is a plain list, so the site id is the token
                # following the flag, not a keyed lookup
                siteid = sys.argv[sys.argv.index('site-id') + 1]
                setup_site(app[siteid])
                crawl_site(app[siteid])  # noqa
            else:
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        try:
                            setup_site(obj)
                            obj._p_jar.sync()
                            crawl_site(obj, count % 10 == 0)
                        except Exception:
                            logger.error(
                                'Error crawling site %s' % oid,
                                exc_info=True)
        except KeyError:
            pass
        except Exception:
            logger.error('Error setting up crawling', exc_info=True)

        logger.info('Waiting to crawl again')
        time.sleep(10 * 60)
        count += 1
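
# A note on the 'site-id' handling above: because sys.argv is a list, the
# value must be read positionally. A minimal standalone sketch of that lookup
# (the helper name and example invocation are illustrative, not part of the
# script):
#
# import sys
#
# def get_flag_value(flag, default=None):
#     # return the token following `flag` on the command line, if present
#     if flag in sys.argv:
#         idx = sys.argv.index(flag)
#         if idx + 1 < len(sys.argv):
#             return sys.argv[idx + 1]
#     return default
#
# e.g. running `... crawler.py site-id Plone` makes
# get_flag_value('site-id') return 'Plone'.
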
def find_broken(site):
    setup_site(site)
    catalog = site.portal_catalog
    broken = []
    good_urls = []
    req = getRequest()
    for brain in catalog(object_provides=ILayoutAware.__identifier__):
        ob = brain.getObject()
        layout = getLayout(ob)
        dom = getHTMLSerializer(layout)
        tiles.renderTiles(req, dom.tree, ob.absolute_url() + '/layout_view')
        root = dom.tree.getroot()
        for anchor in root.cssselect('a'):
            if not anchor.attrib.get('href'):
                continue
            url = anchor.attrib['href']
            if url[0] == '#' or url.startswith('data:') or url.startswith(
                    'mailto:'):
                continue
            if url in good_urls:
                continue
            if find_url(ob, url):
                good_urls.append(url)
            else:
                try:
                    text = unidecode(anchor.text_content())
                except Exception:
                    text = ''
                result = '{} linking to broken -> {}({})'.format(
                    brain.getPath(), url, text)
                broken.append(result)
                print(result)
        for img in root.cssselect('img'):
            if not img.attrib.get('src'):
                continue
            url = img.attrib['src']
            if url[0] == '#' or url.startswith('data:'):
                continue
            if find_url(ob, url):
                good_urls.append(url)
            else:
                result = '{} linking to broken image -> {}'.format(
                    brain.getPath(), url)
                broken.append(result)
                print(result)

    # site id assumed as the fill-in for the '{}' placeholder
    filename = 'broken-links-{}.txt'.format(site.getId())
    with open(filename, 'w') as fi:
        fi.write('\n'.join(broken))
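
# The scan above is plain lxml + cssselect over the rendered layout. A
# minimal standalone sketch of the same extraction (the HTML snippet is only
# an example):
from lxml import html

doc = html.fromstring('<p><a href="/foo">foo</a><img src="/bar.png"/></p>')
hrefs = [a.attrib['href'] for a in doc.cssselect('a') if a.attrib.get('href')]
srcs = [i.attrib['src'] for i in doc.cssselect('img') if i.attrib.get('src')]
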
def upgrade(site):
    setup_site(site)

    # attempt to upgrade plone first
    pm = site.portal_migration
    report = pm.upgrade(dry_run=False)
    print(report)

    ps = site.portal_setup
    # go through all profiles that need upgrading
    for profile_id in ps.listProfilesWithUpgrades():
        # do our best to detect good upgrades
        if profile_id.split(':')[0] in (
                'Products.CMFPlacefulWorkflow',
                'plone.app.iterate',
                'plone.app.multilingual',
                'Products.PloneKeywordManager',
                'collective.easyform',
                'plone.session'):
            continue
        if not profile_id.endswith(':default'):
            continue
        steps_to_run = ps.listUpgrades(profile_id)
        if steps_to_run:
            print('Running profile upgrades for {}'.format(profile_id))
            ps.upgradeProfile(profile_id)
            remaining = ps.listUpgrades(profile_id)
            if remaining:
                msg = ('[{}] Running upgrades did not finish all '
                       'upgrade steps: {}'.format(profile_id, remaining))
                if args.skip_incomplete:
                    print(msg)
                else:
                    raise Exception(msg)

    transaction.commit()
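
# For reference, a single profile runs through the same portal_setup calls
# used above; a minimal sketch (the function and its profile_id argument are
# illustrative):
def upgrade_one_profile(site, profile_id):
    ps = site.portal_setup
    if ps.listUpgrades(profile_id):
        ps.upgradeProfile(profile_id)
    # anything still listed here did not complete
    return ps.listUpgrades(profile_id)
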
indent=4, separators=(',', ': '))


def add_quotes(ss):
    return '"' + ss + '"'


SCRIPT_DIR = os.path.join(get_module_dir(castle.cms), '_scripts')
CMFPlone_DIR = get_module_dir(Products.CMFPlone)

webpack_aliases = {}
bundles_config = {}

for site in get_sites(app):  # noqa
    setup_site(site)
    os.environ['SITE_ID'] = site.getId()
    break

registry = getUtility(IRegistry)
bundles = registry.collectionOfInterface(
    IBundleRegistry, prefix="plone.bundles", check=False)
resources = registry.collectionOfInterface(
    IResourceRegistry, prefix="plone.resources", check=False)

with open(os.path.join(SCRIPT_DIR, 'templates/watchable-grunt.js')) as fi:
    tmpl = fi.read()

with open('watchable-grunt.js', 'w') as output:
    output.write(tmpl % {
def index_site(site):
    setup_site(site)
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if not es.enabled:
        return

    req = getRequest()
    assert req is not None
    alsoProvides(req, IReindexActive)

    # first we want to get all document ids from elastic
    page_size = 700
    ids = []
    result = es.connection.search(
        index=es.index_name,
        doc_type=es.doc_type,
        scroll='30s',
        size=page_size,
        fields=[],
        body={
            "query": {
                "match_all": {}
            }
        })
    ids.extend([r['_id'] for r in result['hits']['hits']])
    scroll_id = result['_scroll_id']
    while scroll_id:
        result = es.connection.scroll(
            scroll_id=scroll_id,
            scroll='30s'
        )
        if len(result['hits']['hits']) == 0:
            break
        ids.extend([r['_id'] for r in result['hits']['hits']])
        scroll_id = result['_scroll_id']

    index = {}
    count = 0
    for brain in catalog():
        count += 1
        # go through each object and reindex using bulk setting
        try:
            ob = brain.getObject()
        except Exception:
            print('Could not get object of %s' % brain.getPath())
            continue
        try:
            uid = IUUID(ob)
            index[uid] = ob
        except TypeError:
            print('Could not get UID of %s' % brain.getPath())
            continue
        if uid in ids:
            # remove from ids... when all is said and done,
            # we'll make sure the ids left are in fact no longer on the
            # system and remove them from es
            ids.remove(uid)
        if len(index) > 300:
            print('finished indexing %i' % count)
            index_batch([], index, [], es)
            site._p_jar.invalidateCache()  # noqa
            transaction.begin()
            site._p_jar.sync()  # noqa
            index = {}
    index_batch([], index, [], es)

    remove = []
    for uid in ids:
        brains = catalog(UID=uid)
        if len(brains) == 0:
            remove.append(uid)
    index_batch(remove, {}, [], es)
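
# For reference, the id harvest above is Elasticsearch's scroll pattern. A
# minimal standalone sketch with the elasticsearch-py client (the 'castle'
# index name and localhost connection are assumptions):
from elasticsearch import Elasticsearch


def all_ids(es, index_name, page_size=700):
    # page through every document id using the scroll API
    ids = []
    result = es.search(index=index_name, scroll='30s', size=page_size,
                       body={'query': {'match_all': {}}})
    while result['hits']['hits']:
        ids.extend(hit['_id'] for hit in result['hits']['hits'])
        result = es.scroll(scroll_id=result['_scroll_id'], scroll='30s')
    return ids

# e.g. all_ids(Elasticsearch(['localhost:9200']), 'castle')
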
def archive(site):
    setup_site(site)
    if (not api.portal.get_registry_record('castle.archival_enabled') or
            not api.portal.get_registry_record('castle.aws_s3_bucket_name') or
            not api.portal.get_registry_record('castle.aws_s3_key') or
            not api.portal.get_registry_record('castle.aws_s3_secret') or
            not api.portal.get_registry_record('plone.public_url')):
        logger.error(
            'Can not archive content. Archiving is not enabled, the S3 API '
            'is not configured, or no public url is set')
        return

    storage = archival.Storage(site)
    for brain in archival.getContentToArchive():
        try:
            ob = brain.getObject()
            container = aq_parent(ob)
            if (IPloneSiteRoot.providedBy(container) and
                    getDefaultPage(container) == ob.getId()):
                continue

            allowed = set(rolesForPermissionOn('View', ob))
            if 'Anonymous' not in allowed:
                # we can *not* archive unpublished content
                continue

            new_url = storage.add_content(ob)
            # resets login creds..
            login_as_admin(app)  # noqa

            if new_url:
                logger.warn('imported %s -> %s' % (ob.absolute_url(), new_url))
                # XXX might need to re-architect... might get conflict errors
                # with how slow archiving is...
                api.content.delete(ob)
                transaction.commit()
            else:
                logger.error('error importing %s' % ob.absolute_url())
        except Exception:
            logger.error('Error archiving %s' % brain.getPath(),
                         exc_info=True)

    content_to_archive = archival.getContentToArchive(7)
    if len(content_to_archive) == 0:
        return

    backend_url = get_backend_url()
    # send out email warning of content about to be archived
    email_text = """
<p>Warning, this content will be archived in 7 days.
   Login to <a href="{site_url}">{site_title}</a> to extend this content.
</p>
<ul>""".format(
        site_title=api.portal.get_registry_record('plone.site_title'),
        site_url=backend_url)
    site_url = api.portal.get().absolute_url()
    for brain in content_to_archive:
        url = brain.getURL()
        url = url.replace(site_url, backend_url)
        email_text += """<li>
<a href="{url}">{title}</a></li>""".format(url=url, title=brain.Title)
    email_text += '</ul>'

    for user in api.user.get_users():
        roles = api.user.get_roles(user=user)
        if ('Site Administrator' not in roles and
                'Manager' not in roles):
            continue
        email = user.getProperty('email')
        if not email:
            continue
        name = user.getProperty('fullname') or user.getId()
        html = '<p>Hi {name},</p>'.format(name=name) + email_text
        send_email(
            recipients=email,
            subject="Content will be archived (Site: %s)" % (
                api.portal.get_registry_record('plone.site_title')),
            html=html)
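
# The publish check above leans on Zope's security machinery: content is only
# archivable when the Anonymous role holds View on it. The same test in
# isolation (the helper is illustrative):
from AccessControl.PermissionRole import rolesForPermissionOn


def is_public(ob):
    # True when anonymous visitors can View the object
    return 'Anonymous' in set(rolesForPermissionOn('View', ob))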