Code Example #1
File: _crawler.py  Project: cleanclothes/castle.cms
def run(app):
    # ensure only one crawler process runs at a time
    singleton.SingleInstance('crawler')

    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa

    count = 0

    while True:
        try:
            if 'site-id' in sys.argv:
                # sys.argv is a list; take the value that follows the flag
                siteid = sys.argv[sys.argv.index('site-id') + 1]
                setup_site(app[siteid])
                crawl_site(app[siteid])  # noqa
            else:
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        try:
                            setup_site(obj)
                            obj._p_jar.sync()
                            # every tenth pass the second argument is True
                            # (presumably forcing a full re-crawl)
                            crawl_site(obj, count % 10 == 0)
                        except Exception:
                            logger.error('Error crawling site %s' % oid,
                                         exc_info=True)
        except KeyError:
            # the requested site id does not exist in this instance
            pass
        except Exception:
            logger.error('Error setting up crawling', exc_info=True)

        logger.info('Waiting to crawl again')
        time.sleep(10 * 60)  # wait ten minutes between crawl passes
        count += 1
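
The `site-id` handling above takes the value that follows the flag on the command line. A standalone sketch of that parsing (this helper is for illustration only and is not part of castle.cms):

def parse_flag_value(argv, flag, default=None):
    """Return the argument following ``flag`` in ``argv``, else ``default``."""
    if flag in argv:
        idx = argv.index(flag)
        if idx + 1 < len(argv):
            return argv[idx + 1]
    return default


# e.g. invoked as something like: bin/instance run _crawler.py site-id Plone
# parse_flag_value(['_crawler.py', 'site-id', 'Plone'], 'site-id') -> 'Plone'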
Code Example #2
            original = None
            if 'original-url' in el.attrib:
                # preserve the existing original-url value across the swap
                original = el.attrib['original-url']
            mover.modify(el, aws.swap_url(url))
            if original:
                el.attrib['original-url'] = original


def get_key_from_url(url):
    parsed = urlparse(url)
    # the parsed path includes the bucket, so strip it off to get the actual key
    return '/'.join(parsed.path.split('/')[2:])


if __name__ == '__main__':
    login_as_admin(app)  # noqa
    site = app[args.site_id]  # noqa
    setSite(site)

    toremove = {}  # uid: path
    catalog = api.portal.get_tool('portal_catalog')
    registry = getUtility(IRegistry)
    crawler_settings = registry.forInterface(ICrawlerConfiguration,
                                             prefix='castle')
    es = ElasticSearchCatalog(catalog)
    crawler = Crawler(site, crawler_settings, es)
    storage = archival.Storage(site)
    for key, archive_data in storage.archives.items():
        for url in (archive_data.get('view_url'), archive_data['url']):
            if not url:
                continue
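
Example #2's `get_key_from_url` assumes the bucket name is the first segment of an S3-style path, so dropping it yields the object key. A self-contained check (the bucket and key below are made up for illustration):

from urllib.parse import urlparse  # Python 3; the original imports urlparse directly


def get_key_from_url(url):
    # the parsed path includes the bucket, so strip it to get the actual key
    parsed = urlparse(url)
    return '/'.join(parsed.path.split('/')[2:])


print(get_key_from_url('https://s3.amazonaws.com/my-bucket/archives/page/index.html'))
# prints: archives/page/index.html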
Code Example #3
def archive(site):
    setup_site(site)

    if (not api.portal.get_registry_record('castle.archival_enabled')
            or not api.portal.get_registry_record('castle.aws_s3_bucket_name')
            or not api.portal.get_registry_record('castle.aws_s3_key')
            or not api.portal.get_registry_record('castle.aws_s3_secret')
            or not api.portal.get_registry_record('plone.public_url')):
        logger.error(
            'Cannot archive content: archival is not enabled, the S3 API is '
            'not configured, or no public url is set')
        return

    storage = archival.Storage(site)
    for brain in archival.getContentToArchive():
        try:
            ob = brain.getObject()

            container = aq_parent(ob)
            if (IPloneSiteRoot.providedBy(container)
                    and getDefaultPage(container) == ob.getId()):
                continue

            allowed = set(rolesForPermissionOn('View', ob))
            if 'Anonymous' not in allowed:
                # we can *not* archive unpublished content
                continue
            new_url = storage.add_content(ob)

            # adding content to storage can reset login credentials,
            # so log back in as admin
            login_as_admin(app)  # noqa

            if new_url:
                logger.warning('archived %s -> %s' % (ob.absolute_url(), new_url))
                # XXX may need to re-architect: archiving is slow enough that
                # committing per item can run into conflict errors
                api.content.delete(ob)
                transaction.commit()
            else:
                logger.error('error archiving %s' % ob.absolute_url())
        except Exception:
            logger.error('Error archiving %s' % brain.getPath(), exc_info=True)

    # warn about content that will fall due for archiving within 7 days
    content_to_archive = archival.getContentToArchive(7)
    if len(content_to_archive) == 0:
        return

    backend_url = get_backend_url()
    # send out email warning of content about to be archived
    email_text = """
<p>Warning: this content will be archived in 7 days.
Log in to
<a href="{site_url}">{site_title}</a> to extend this content.
</p>
<ul>""".format(site_title=api.portal.get_registry_record('plone.site_title'),
               site_url=backend_url)

    site_url = api.portal.get().absolute_url()
    for brain in content_to_archive:
        url = brain.getURL()
        url = url.replace(site_url, backend_url)
        email_text += """<li>
<a href="{url}">{title}</a></li>""".format(url=url, title=brain.Title)

    email_text += '</ul>'

    for user in api.user.get_users():
        roles = api.user.get_roles(user=user)
        if ('Site Administrator' not in roles and 'Manager' not in roles):
            continue
        email = user.getProperty('email')
        if not email:
            continue

        name = user.getProperty('fullname') or user.getId()
        html = '<p>Hi {name},</p>'.format(name=name) + email_text
        send_email(recipients=email,
                   subject="Content will be archived(Site: %s)" %
                   (api.portal.get_registry_record('plone.site_title')),
                   html=html)
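
`archive(site)` expects to be driven once per Plone site, much like the crawler loop in example #1. A hedged sketch of such a driver, reusing the helpers shown above (the wiring is an assumption, not the project's documented entry point):

def run(app):
    # walk the Zope root and archive each Plone site, mirroring example #1;
    # spoof_request/login_as_admin/logger come from the surrounding module
    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa
    for oid in app.objectIds():  # noqa
        obj = app[oid]  # noqa
        if IPloneSiteRoot.providedBy(obj):
            try:
                archive(obj)
            except Exception:
                logger.error('Error archiving site %s' % oid, exc_info=True)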