Example #1
def run(app):
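    # acquire a single-instance lock so only one crawler process runs at a time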
    singleton.SingleInstance('crawler')

    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa

    count = 0

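    # crawl forever: each pass handles the requested site (or every Plone site root), then sleeps ten minutes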
    while True:
        try:
            if 'site-id' in sys.argv:
                # sys.argv is a list; assume the site id is passed as the argument after 'site-id'
                siteid = sys.argv[sys.argv.index('site-id') + 1]
                setup_site(app[siteid])
                crawl_site(app[siteid])  # noqa
            else:
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        try:
                            setup_site(obj)
                            obj._p_jar.sync()
                            crawl_site(obj, count % 10 == 0)
                        except Exception:
                            logger.error('Error crawling site %s' % oid,
                                         exc_info=True)
        except KeyError:
            pass
        except Exception:
            logger.error('Error setting up crawling', exc_info=True)

        logger.info('Waiting to crawl again')
        time.sleep(10 * 60)
        count += 1
Example #2
    def crawl_archives(self):
        registry = getUtility(IRegistry)
        base_url = registry.get('castle.aws_s3_base_url', None)

        storage = archival.Storage(self.site)
        urls = []
        for key, archive_data in storage.archives.items():
            # archives do not need to be re-indexed ever.
            # see if the key is in ES, if it is move on
            url = archive_data.get('view_url', None) or archive_data['url']
            urls.append(aws.swap_url(url, base_url=base_url))

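        # anything already indexed under the 'archives' sitemap does not need to be crawled again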
        query = {"bool": {"filter": {"term": {"sitemap": "archives"}}}}
        existing_urls = self.get_all_from_es(query)
        for _id in set(urls) - set(existing_urls):
            # pages that have not yet been crawled
            try:
                self.crawl_archive_url(_id)
            except Exception:
                logger.error('Error indexing archive url: ' + _id,
                             exc_info=True)

        for _id in set(existing_urls) - set(urls):
            # pages that have been removed from the archive
            self.delete_from_index(_id)
Example #3
    def crawl_site_map(self, sitemap, full=False):
        resp = requests.get(
            sitemap, headers={'User-Agent': self.settings.crawler_user_agent})
        if resp.status_code != 200:
            logger.error('Not a valid sitemap response for %s' % sitemap)
            return

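        # sync the ZODB connection so the crawl tracking data is current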
        self.site._p_jar.sync()
        if sitemap in self.data['tracking']:
            last_crawled = DateTime(self.data['tracking'][sitemap])
        else:
            last_crawled = DateTime('1999/01/01')

        self.data['tracking'][sitemap] = DateTime().ISO8601().decode('utf8')
        transaction.commit()

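        # sitemap namespaces vary, so match elements by local name only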
        dom = etree.fromstring(resp.content)
        crawled_urls = []
        for url_node in dom.xpath("//*[local-name() = 'url']"):
            loc = url_node.xpath("*[local-name() = 'loc']")
            if loc:
                loc = loc[0].text.strip()
            else:
                loc = None
            url = loc
            crawled_urls.append(url)

            lastmod = url_node.xpath("*[local-name() = 'lastmod']")
            if lastmod:
                lastmod = lastmod[0].text.strip()
            else:
                lastmod = None
            if lastmod:
                lastmod = DateTime(lastmod)
                if not full and lastmod < last_crawled:
                    continue

            if not url:
                continue

            data = self.crawl_page(url)
            if data is False:
                crawled_urls.remove(url)
                try:
                    self.es.connection.delete(
                        index=self.es.index_name,
                        doc_type=CRAWLED_SITE_ES_DOC_TYPE,
                        id=url)
                except NotFoundError:
                    pass
            else:
                data['sitemap'] = sitemap
                self.es.connection.index(index=self.es.index_name,
                                         doc_type=CRAWLED_SITE_ES_DOC_TYPE,
                                         id=url,
                                         body=data)
                crawled_urls.append(url)

        self.clean_removed_pages(sitemap, crawled_urls)
Example #4
def crawl_site(site, full=False):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    if not settings.crawler_active or not settings.crawler_site_maps:
        logger.info("Crawler must first be enabled in Site Setup")
        return False

    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
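    # crawled pages live in a dedicated '<site index>_crawler' Elasticsearch index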
    index_name = '{site_index_name}_crawler'.format(
        site_index_name=es.index_name)
    if not es.enabled:
        logger.info(
            "Elasticsearch must be enabled in Site Setup to use crawler")
        return False

    # check index type is mapped, create if not
    try:
        es.connection.indices.get_mapping(index=index_name)
    except NotFoundError:
        # need to add it
        adapter = getMultiAdapter((getRequest(), es), IMappingProvider)
        mapping = adapter()
        mapping['properties'].update(CRAWLER_ES_MAPPING)
        if not es.connection.indices.exists(index_name):
            es.connection.indices.create(index_name)
        es.connection.indices.put_mapping(body=mapping, index=index_name)

    crawler = Crawler(site, settings, es)

    if settings.crawler_index_archive:
        crawler.crawl_archives()

    for sitemap in settings.crawler_site_maps:
        try:
            crawler.crawl_site_map(sitemap, full)
        except Exception:
            logger.error('Error crawling site map: %s' % sitemap,
                         exc_info=True)
    return True
Example #5
def crawl_site(site, full=False):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    if not settings.crawler_active or not settings.crawler_site_maps:
        return False

    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if not es.enabled:
        return False

    # check index type is mapped, create if not
    try:
        es.connection.indices.get_mapping(index=es.index_name,
                                          doc_type=CRAWLED_SITE_ES_DOC_TYPE)
    except NotFoundError:
        # need to add it
        adapter = getMultiAdapter((getRequest(), es), IMappingProvider)
        mapping = adapter()
        mapping['properties'].update(CRAWLER_ES_MAPPING)
        es.connection.indices.put_mapping(doc_type=CRAWLED_SITE_ES_DOC_TYPE,
                                          body=mapping,
                                          index=es.index_name)

    crawler = Crawler(site, settings, es)

    if settings.crawler_index_archive:
        crawler.crawl_archives()

    for sitemap in settings.crawler_site_maps:
        try:
            crawler.crawl_site_map(sitemap, full)
        except Exception:
            logger.error('Error crawling site map: %s' % sitemap,
                         exc_info=True)
    return True
Example #6
    def __call__(self):
        self.errors = []
        self.protect()
        context = aq_inner(self.context)

        catalog = getToolByName(context, 'portal_catalog')
        mtool = getToolByName(context, 'portal_membership')

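        # the form posts UID_<n>, newid_<n> and newtitle_<n> fields, one set per object to rename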
        missing = []
        for key in self.request.form.keys():
            if not key.startswith('UID_'):
                continue
            index = key.split('_')[-1]
            uid = self.request.form[key]
            brains = catalog(UID=uid)
            if len(brains) == 0:
                missing.append(uid)
                continue
            obj = brains[0].getObject()
            title = self.objectTitle(obj)
            if not mtool.checkPermission('Copy or Move', obj):
                self.errors.append(
                    _(u'Permission denied to rename ${title}.',
                      mapping={u'title': title}))
                continue

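            # a savepoint lets us roll back just this object's changes if renaming fails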
            sp = transaction.savepoint(optimistic=True)

            newid = self.request.form['newid_' + index].encode('utf8')
            newtitle = self.request.form['newtitle_' + index]

            lockable = ILockable(obj, None)
            if lockable:
                lockable.clear_locks()

            try:
                obid = obj.getId()
                title = obj.Title()
                change_title = newtitle and title != newtitle
                if change_title:
                    getSecurityManager().validate(obj, obj, 'setTitle',
                                                  obj.setTitle)
                    obj.setTitle(newtitle)
                    notify(ObjectModifiedEvent(obj))
                if newid and obid != newid:
                    parent = aq_parent(aq_inner(obj))
                    # Make sure newid is safe
                    newid = INameChooser(parent).chooseName(newid, obj)
                    # Update the default_page on the parent.
                    context_state = getMultiAdapter((obj, self.request),
                                                    name='plone_context_state')
                    if context_state.is_default_page():
                        parent.setDefaultPage(newid)
                    parent.manage_renameObjects((obid, ), (newid, ))
                elif change_title:
                    # only the title changed; a rename would already have triggered a reindex
                    obj.reindexObject()
            except ConflictError:
                raise
            except Exception as e:
                sp.rollback()
                logger.error(u'Error renaming "{title}": "{exception}"'.format(
                    title=title.decode('utf8'), exception=e))
                self.errors.append(
                    _(u'Error renaming ${title}',
                      mapping={'title': title.decode('utf8')}))

        return self.message(missing)
Example #7
    def crawl_site_map(self, sitemap, full=False):
        resp = requests.get(
            sitemap, headers={'User-Agent': self.settings.crawler_user_agent})
        if resp.status_code != 200:
            logger.error('Not a valid sitemap response for %s' % sitemap)
            return

        self.site._p_jar.sync()
        if sitemap in self.data['tracking']:
            last_crawled = DateTime(self.data['tracking'][sitemap])
        else:
            last_crawled = DateTime('1999/01/01')

        self.data['tracking'][sitemap] = DateTime().ISO8601().decode('utf8')
        transaction.commit()
        clear_object_cache(self.site)

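        # gzipped sitemaps are decompressed in memory before parsing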
        if sitemap.lower().endswith('.gz'):
            sitemap_content = gzip.GzipFile(
                fileobj=StringIO(resp.content)).read()
        else:
            sitemap_content = resp.content

        dom = etree.fromstring(sitemap_content)
        crawled_urls = []
        for url_node in dom.xpath("//*[local-name() = 'url']"):
            loc = url_node.xpath("*[local-name() = 'loc']")
            if loc:
                loc = loc[0].text.strip()
            else:
                loc = None
            url = loc
            crawled_urls.append(url)

            lastmod = url_node.xpath("*[local-name() = 'lastmod']")
            if lastmod:
                lastmod = lastmod[0].text.strip()
            else:
                lastmod = None
            if lastmod:
                lastmod = DateTime(lastmod)
                if not full and lastmod < last_crawled:
                    continue

            if not url:
                continue
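            # throttle between page fetches using the configured crawler_interval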
            try:
                interval = self.settings.crawler_interval
            except Exception:
                interval = 0
            time.sleep(interval)
            data = self.crawl_page(url)
            if data is False:
                crawled_urls.remove(url)
                try:
                    self.es.connection.delete(index=self.index_name, id=url)
                except NotFoundError:
                    pass
            else:
                data['sitemap'] = sitemap
                self.es.connection.index(index=self.index_name,
                                         id=url,
                                         body=data)
                crawled_urls.append(url)

        self.clean_removed_pages(sitemap, crawled_urls)