Example #1
0
def run(app):
    """Crawl every Plone site under the Zope app root, forever.

    Every 10 minutes this either crawls the single site named on the
    command line (``... site-id <id>``) or walks all Plone site roots
    found under *app*.  ``singleton.SingleInstance`` aborts if another
    crawler process is already running.

    app -- Zope application root; replaced in place by a
           spoofed-request wrapper so site code can run headless.
    """
    singleton.SingleInstance('crawler')

    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa

    count = 0

    while True:
        try:
            if 'site-id' in sys.argv:
                # BUG FIX: sys.argv is a list -- indexing it with the
                # string 'site-id' always raised TypeError.  The site id
                # is the argument following the 'site-id' flag (an
                # IndexError/KeyError here falls through to the handlers
                # below).
                siteid = sys.argv[sys.argv.index('site-id') + 1]
                setup_site(app[siteid])
                crawl_site(app[siteid])  # noqa
            else:
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        try:
                            setup_site(obj)
                            obj._p_jar.sync()
                            # every 10th pass requests a full crawl
                            crawl_site(obj, count % 10 == 0)
                        except Exception:
                            logger.error('Error crawling site %s' % oid,
                                         exc_info=True)
        except KeyError:
            # named site not present in the app root -- try again later
            pass
        except Exception:
            logger.error('Error setting up crawling', exc_info=True)

        logger.info('Waiting to crawl again')
        time.sleep(10 * 60)
        count += 1
Example #2
0
def find_broken(site):
    """Scan layout-aware content of *site* for broken links and images.

    Renders each object's layout with its tiles, checks every ``<a href>``
    and ``<img src>`` with ``find_url``, prints each broken reference and
    writes the full report to ``broken-links-<siteid>.txt``.
    """
    setup_site(site)
    catalog = site.portal_catalog

    broken = []
    # set, not list: membership is tested once per link on every page
    good_urls = set()

    req = getRequest()
    for brain in catalog(object_provides=ILayoutAware.__identifier__):
        ob = brain.getObject()
        layout = getLayout(ob)
        dom = getHTMLSerializer(layout)
        tiles.renderTiles(req, dom.tree, ob.absolute_url() + '/layout_view')
        root = dom.tree.getroot()
        for anchor in root.cssselect('a'):
            if not anchor.attrib.get('href'):
                continue
            url = anchor.attrib['href']
            # skip fragments, inline data and mail links
            if url[0] == '#' or url.startswith('data:') or url.startswith(
                    'mailto:'):
                continue
            if url in good_urls:
                continue

            if find_url(ob, url):
                good_urls.add(url)
            else:
                try:
                    text = unidecode(anchor.text_content())
                except Exception:  # narrowed from bare except
                    text = ''
                result = '{} linking to broken -> {}({})'.format(
                    brain.getPath(), url, text)
                broken.append(result)
                print(result)

        for img in root.cssselect('img'):
            if not img.attrib.get('src'):
                continue
            url = img.attrib['src']
            if url[0] == '#' or url.startswith('data:'):
                continue
            if find_url(ob, url):
                good_urls.add(url)
            else:
                result = '{} linking to broken image -> {}'.format(
                    brain.getPath(), url)
                broken.append(result)
                print(result)

    # BUG FIX: the '{}' placeholder was never filled, so the report was
    # literally written to 'broken-links-{}.txt'; use the site id.
    filename = 'broken-links-{}.txt'.format(site.getId())
    with open(filename, 'w') as fi:
        fi.write('\n'.join(broken))
Example #3
0
def upgrade(site):
    """Upgrade Plone itself, then every upgradeable ``:default`` profile.

    Profiles belonging to a known set of packages are skipped entirely.
    If a profile still reports pending upgrade steps after running, the
    function either warns (when ``args.skip_incomplete`` is set) or
    raises.  Commits the transaction at the end.
    """
    setup_site(site)

    # attempt to upgrade plone first
    print(site.portal_migration.upgrade(dry_run=False))

    # packages whose upgrade profiles we deliberately leave alone
    skipped_packages = {
        'Products.CMFPlacefulWorkflow',
        'plone.app.iterate',
        'plone.app.multilingual',
        'Products.PloneKeywordManager',
        'collective.easyform',
        'plone.session',
    }

    setup_tool = site.portal_setup
    for profile_id in setup_tool.listProfilesWithUpgrades():
        # do our best to detect good upgrades
        package = profile_id.split(':')[0]
        if package in skipped_packages:
            continue
        if not profile_id.endswith(':default'):
            continue

        if setup_tool.listUpgrades(profile_id):
            print('Running profile upgrades for {}'.format(profile_id))
            setup_tool.upgradeProfile(profile_id)

        remaining = setup_tool.listUpgrades(profile_id)
        if not remaining:
            continue
        message = (
            '[{}] Running upgrades did not finish all upgrade steps: {}'
            .format(profile_id, remaining))
        if args.skip_incomplete:
            print(message)
        else:
            raise Exception(message)

    transaction.commit()
Example #4
0
                      indent=4, separators=(',', ': '))


def add_quotes(ss):
    """Return the string *ss* wrapped in double quotes."""
    return ''.join(('"', ss, '"'))


# Directory holding castle.cms helper scripts and templates.
SCRIPT_DIR = os.path.join(get_module_dir(castle.cms), '_scripts')
# Installed location of Products.CMFPlone (used for resource lookups).
CMFPlone_DIR = get_module_dir(Products.CMFPlone)

webpack_aliases = {}
bundles_config = {}


# Activate the first site found and record its id in the environment;
# only one site is needed, hence the immediate break.
for site in get_sites(app):  # noqa
    setup_site(site)
    os.environ['SITE_ID'] = site.getId()
    break


# Bundle and resource registry records for the activated site
# (check=False: accept records even if they don't fully conform).
registry = getUtility(IRegistry)
bundles = registry.collectionOfInterface(
    IBundleRegistry, prefix="plone.bundles", check=False)
resources = registry.collectionOfInterface(
    IResourceRegistry, prefix="plone.resources", check=False)
with open(os.path.join(SCRIPT_DIR, 'templates/watchable-grunt.js')) as fi:
    tmpl = fi.read()
    with open('watchable-grunt.js', 'w') as output:
        output.write(tmpl % {
Example #5
0
def index_site(site):
    """Bulk re-index every cataloged object of *site* into Elasticsearch.

    First collects all document ids currently in the ES index (via the
    scroll API), then walks the portal catalog re-indexing objects in
    batches of ~300, and finally deletes from ES any document whose
    object no longer exists in the catalog.  No-op when ES integration
    is disabled.
    """
    setup_site(site)
    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if not es.enabled:
        return

    req = getRequest()
    assert req is not None
    alsoProvides(req, IReindexActive)

    # first we want to get all document ids from elastic.
    # Use a set: the catalog loop below tests membership and removes one
    # id per object, which was O(n) each with the old list (quadratic
    # overall on large sites).
    page_size = 700
    ids = set()
    result = es.connection.search(
        index=es.index_name, doc_type=es.doc_type,
        scroll='30s',
        size=page_size,
        fields=[],
        body={
            "query": {
                "match_all": {}
            }
        })
    ids.update(r['_id'] for r in result['hits']['hits'])
    scroll_id = result['_scroll_id']
    while scroll_id:
        result = es.connection.scroll(
            scroll_id=scroll_id,
            scroll='30s'
        )
        if len(result['hits']['hits']) == 0:
            break
        ids.update(r['_id'] for r in result['hits']['hits'])
        scroll_id = result['_scroll_id']

    index = {}
    count = 0
    for brain in catalog():
        count += 1
        # go through each object and reindex using bulk setting
        try:
            ob = brain.getObject()
        except Exception:
            print('Could not get object of %s' % brain.getPath())
            continue
        try:
            uid = IUUID(ob)
            index[uid] = ob
        except TypeError:
            print('Could not get UID of %s' % brain.getPath())
            continue
        # remove from the pending ids... When all said and done,
        # we'll make sure the uids left are in fact no longer on the
        # system and remove them from es
        ids.discard(uid)
        if len(index) > 300:
            print('finished indexing %i' % count)
            index_batch([], index, [], es)
            # drop the ZODB cache between batches to bound memory use
            site._p_jar.invalidateCache()  # noqa
            transaction.begin()
            site._p_jar.sync()  # noqa
            index = {}
    index_batch([], index, [], es)

    # anything left in *ids* is in ES but was not seen in the catalog
    # walk; double-check against the catalog before removing from ES
    remove = [uid for uid in ids if len(catalog(UID=uid)) == 0]
    index_batch(remove, {}, [], es)
Example #6
0
def archive(site):
    """Archive eligible content of *site* to S3 and warn about upcoming ones.

    Moves publicly-viewable content returned by
    ``archival.getContentToArchive()`` into S3-backed storage, deleting
    the original object on success.  Afterwards, emails every Manager /
    Site Administrator a list of content due to be archived in 7 days.
    Bails out early unless archiving, the S3 credentials and the public
    url are all configured in the registry.
    """
    setup_site(site)

    if (not api.portal.get_registry_record('castle.archival_enabled')
            or not api.portal.get_registry_record('castle.aws_s3_bucket_name')
            or not api.portal.get_registry_record('castle.aws_s3_key')
            or not api.portal.get_registry_record('castle.aws_s3_secret')
            or not api.portal.get_registry_record('plone.public_url')):
        logger.error(
            'Can not archive content. Either not enabled, S3 API not set or no public '
            'url set')
        return

    storage = archival.Storage(site)
    for brain in archival.getContentToArchive():
        try:
            ob = brain.getObject()

            # never archive an object serving as the site's default page
            container = aq_parent(ob)
            if (IPloneSiteRoot.providedBy(container)
                    and getDefaultPage(container) == ob.getId()):
                continue

            allowed = set(rolesForPermissionOn('View', ob))
            if 'Anonymous' not in allowed:
                # we can *not* archive unpublished content
                continue
            new_url = storage.add_content(ob)

            # resets login creds..
            login_as_admin(app)  # noqa

            if new_url:
                # logger.warn is deprecated; warning() is the real method
                logger.warning(
                    'imported %s -> %s' % (ob.absolute_url(), new_url))
                # XXX might need to re-architect... might get conflict errors with how slow
                # archiving takes...
                api.content.delete(ob)
                transaction.commit()
            else:
                logger.error('error importing %s' % ob.absolute_url())
        except Exception:
            # narrowed from bare except: keep best-effort per item, but
            # let KeyboardInterrupt/SystemExit propagate
            logger.error('Error archiving %s' % brain.getPath(), exc_info=True)

    content_to_archive = archival.getContentToArchive(7)
    if len(content_to_archive) == 0:
        return

    backend_url = get_backend_url()
    # send out email warning of content about to be archived
    email_text = """
<p>Warning, this content will be archived in 7 days.
Login to
<a href="{site_url}">{site_title}</a> to extend this content.
</p>
<ul>""".format(site_title=api.portal.get_registry_record('plone.site_title'),
               site_url=backend_url)

    # links in the email should point at the backend, not the public url
    site_url = api.portal.get().absolute_url()
    for brain in content_to_archive:
        url = brain.getURL()
        url = url.replace(site_url, backend_url)
        email_text += """<li>
<a href="{url}">{title}</a></li>""".format(url=url, title=brain.Title)

    email_text += '</ul>'

    for user in api.user.get_users():
        roles = api.user.get_roles(user=user)
        if ('Site Administrator' not in roles and 'Manager' not in roles):
            continue
        email = user.getProperty('email')
        if not email:
            continue

        name = user.getProperty('fullname') or user.getId()
        html = '<p>Hi {name},</p>'.format(name=name) + email_text
        send_email(recipients=email,
                   subject="Content will be archived(Site: %s)" %
                   (api.portal.get_registry_record('plone.site_title')),
                   html=html)