def run(app):
    # Make sure only one crawler process runs at a time.
    singleton.SingleInstance('crawler')

    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa

    count = 0
    while True:
        try:
            if 'site-id' in sys.argv:
                # NOTE: sys.argv is a list and cannot be indexed by name;
                # assume the site id is passed as the argument after 'site-id'.
                siteid = sys.argv[sys.argv.index('site-id') + 1]
                setup_site(app[siteid])
                crawl_site(app[siteid])  # noqa
            else:
                # Crawl every Plone site found in the Zope application root.
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        try:
                            setup_site(obj)
                            obj._p_jar.sync()
                            # Do a full crawl every tenth pass.
                            crawl_site(obj, count % 10 == 0)
                        except Exception:
                            logger.error('Error crawling site %s' % oid,
                                         exc_info=True)
        except KeyError:
            pass
        except Exception:
            logger.error('Error setting up crawling', exc_info=True)

        logger.info('Waiting to crawl again')
        time.sleep(10 * 60)
        count += 1


def crawl_archives(self):
    registry = getUtility(IRegistry)
    base_url = registry.get('castle.aws_s3_base_url', None)
    storage = archival.Storage(self.site)

    urls = []
    for key, archive_data in storage.archives.items():
        # archives do not need to be re-indexed ever.
        # see if the key is in ES, if it is move on
        url = archive_data.get('view_url', None) or archive_data['url']
        urls.append(aws.swap_url(url, base_url=base_url))

    query = {"bool": {"filter": {"term": {"sitemap": "archives"}}}}
    existing_urls = self.get_all_from_es(query)

    for _id in set(urls) - set(existing_urls):
        # pages that have not yet been crawled
        try:
            self.crawl_archive_url(_id)
        except Exception:
            logger.error('Error indexing archive url: ' + _id, exc_info=True)

    for _id in set(existing_urls) - set(urls):
        # pages that have been removed from the archive
        self.delete_from_index(_id)


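# Illustrative sketch (not part of the module above): the archive sync reduces
# to a set difference between what the archive storage contains and what is
# already indexed in Elasticsearch. The function name is hypothetical.
def _split_archive_urls(archive_urls, indexed_urls):
    to_crawl = set(archive_urls) - set(indexed_urls)   # in storage, missing from ES
    to_delete = set(indexed_urls) - set(archive_urls)  # still in ES, removed from storage
    return to_crawl, to_delete

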
def crawl_site_map(self, sitemap, full=False):
    resp = requests.get(
        sitemap,
        headers={'User-Agent': self.settings.crawler_user_agent})
    if resp.status_code != 200:
        logger.error('Not a valid sitemap response for %s' % sitemap)
        return

    self.site._p_jar.sync()
    if sitemap in self.data['tracking']:
        last_crawled = DateTime(self.data['tracking'][sitemap])
    else:
        last_crawled = DateTime('1999/01/01')

    self.data['tracking'][sitemap] = DateTime().ISO8601().decode('utf8')
    transaction.commit()

    dom = etree.fromstring(resp.content)

    crawled_urls = []
    for url_node in dom.xpath("//*[local-name() = 'url']"):
        loc = url_node.xpath("*[local-name() = 'loc']")
        if loc:
            loc = loc[0].text.strip()
        else:
            loc = None
        url = loc
        crawled_urls.append(url)

        lastmod = url_node.xpath("*[local-name() = 'lastmod']")
        if lastmod:
            lastmod = lastmod[0].text.strip()
        else:
            lastmod = None
        if lastmod:
            lastmod = DateTime(lastmod)
            if not full and lastmod < last_crawled:
                continue

        if not url:
            continue

        data = self.crawl_page(url)
        if data is False:
            crawled_urls.remove(url)
            try:
                self.es.connection.delete(
                    index=self.es.index_name,
                    doc_type=CRAWLED_SITE_ES_DOC_TYPE,
                    id=url)
            except NotFoundError:
                pass
        else:
            data['sitemap'] = sitemap
            self.es.connection.index(
                index=self.es.index_name,
                doc_type=CRAWLED_SITE_ES_DOC_TYPE,
                id=url,
                body=data)
            crawled_urls.append(url)

    self.clean_removed_pages(sitemap, crawled_urls)


def crawl_site(site, full=False):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    if not settings.crawler_active or not settings.crawler_site_maps:
        logger.info("Crawler must first be enabled in Site Setup")
        return False

    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    index_name = '{site_index_name}_crawler'.format(
        site_index_name=es.index_name)
    if not es.enabled:
        logger.info(
            "Elasticsearch must be enabled in Site Setup to use crawler")
        return False

    # check index type is mapped, create if not
    try:
        es.connection.indices.get_mapping(index=index_name)
    except NotFoundError:
        # need to add it
        adapter = getMultiAdapter((getRequest(), es), IMappingProvider)
        mapping = adapter()
        mapping['properties'].update(CRAWLER_ES_MAPPING)
        if not es.connection.indices.exists(index_name):
            es.connection.indices.create(index_name)
        es.connection.indices.put_mapping(body=mapping, index=index_name)

    crawler = Crawler(site, settings, es)

    if settings.crawler_index_archive:
        crawler.crawl_archives()

    for sitemap in settings.crawler_site_maps:
        try:
            crawler.crawl_site_map(sitemap, full)
        except Exception:
            logger.error('Error crawling site map: %s' % sitemap,
                         exc_info=True)
    return True


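# Standalone sketch of the "check index type is mapped, create if not" step
# above, assuming a 7.x-style elasticsearch-py client. The function name and
# arguments (es_client, index_name, mapping) are placeholders; in the real
# code the mapping comes from the IMappingProvider adapter plus
# CRAWLER_ES_MAPPING.
def _ensure_crawler_index(es_client, index_name, mapping):
    from elasticsearch.exceptions import NotFoundError
    try:
        es_client.indices.get_mapping(index=index_name)
    except NotFoundError:
        # index or mapping missing: create the index, then install the mapping
        if not es_client.indices.exists(index=index_name):
            es_client.indices.create(index=index_name)
        es_client.indices.put_mapping(index=index_name, body=mapping)

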
def crawl_site(site, full=False):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    if not settings.crawler_active or not settings.crawler_site_maps:
        return False

    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    if not es.enabled:
        return False

    # check index type is mapped, create if not
    try:
        es.connection.indices.get_mapping(
            index=es.index_name,
            doc_type=CRAWLED_SITE_ES_DOC_TYPE)
    except NotFoundError:
        # need to add it
        adapter = getMultiAdapter((getRequest(), es), IMappingProvider)
        mapping = adapter()
        mapping['properties'].update(CRAWLER_ES_MAPPING)
        es.connection.indices.put_mapping(
            doc_type=CRAWLED_SITE_ES_DOC_TYPE,
            body=mapping,
            index=es.index_name)

    crawler = Crawler(site, settings, es)

    if settings.crawler_index_archive:
        crawler.crawl_archives()

    for sitemap in settings.crawler_site_maps:
        try:
            crawler.crawl_site_map(sitemap, full)
        except Exception:
            logger.error('Error crawling site map: %s' % sitemap,
                         exc_info=True)
    return True


def __call__(self):
    self.errors = []
    self.protect()
    context = aq_inner(self.context)

    catalog = getToolByName(context, 'portal_catalog')
    mtool = getToolByName(context, 'portal_membership')

    missing = []
    for key in self.request.form.keys():
        if not key.startswith('UID_'):
            continue
        index = key.split('_')[-1]
        uid = self.request.form[key]
        brains = catalog(UID=uid)
        if len(brains) == 0:
            missing.append(uid)
            continue
        obj = brains[0].getObject()
        title = self.objectTitle(obj)
        if not mtool.checkPermission('Copy or Move', obj):
            self.errors.append(
                _(u'Permission denied to rename ${title}.',
                  mapping={u'title': title}))
            continue

        sp = transaction.savepoint(optimistic=True)

        newid = self.request.form['newid_' + index].encode('utf8')
        newtitle = self.request.form['newtitle_' + index]
        lockable = ILockable(obj, None)
        if lockable:
            lockable.clear_locks()
        try:
            obid = obj.getId()
            title = obj.Title()
            change_title = newtitle and title != newtitle
            if change_title:
                getSecurityManager().validate(obj, obj, 'setTitle',
                                              obj.setTitle)
                obj.setTitle(newtitle)
                notify(ObjectModifiedEvent(obj))
            if newid and obid != newid:
                parent = aq_parent(aq_inner(obj))
                # Make sure newid is safe
                newid = INameChooser(parent).chooseName(newid, obj)
                # Update the default_page on the parent.
                context_state = getMultiAdapter((obj, self.request),
                                                name='plone_context_state')
                if context_state.is_default_page():
                    parent.setDefaultPage(newid)
                parent.manage_renameObjects((obid, ), (newid, ))
            elif change_title:
                # the rename will have already triggered a reindex
                obj.reindexObject()
        except ConflictError:
            raise
        except Exception as e:
            sp.rollback()
            logger.error(u'Error renaming "{title}": "{exception}"'.format(
                title=title.decode('utf8'), exception=e))
            self.errors.append(
                _(u'Error renaming ${title}',
                  mapping={'title': title.decode('utf8')}))

    return self.message(missing)


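# Illustrative sketch (not part of the view above): the per-item error handling
# relies on savepoints from the `transaction` package, so a failed rename only
# rolls back that item's changes instead of aborting the whole request.
# `do_rename` is a hypothetical stand-in for the retitle/rename work.
import transaction


def _rename_with_savepoint(do_rename):
    sp = transaction.savepoint(optimistic=True)
    try:
        do_rename()
    except Exception:
        # undo everything done since the savepoint, keep earlier work intact
        sp.rollback()
        raise

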
def crawl_site_map(self, sitemap, full=False):
    resp = requests.get(
        sitemap,
        headers={'User-Agent': self.settings.crawler_user_agent})
    if resp.status_code != 200:
        logger.error('Not a valid sitemap response for %s' % sitemap)
        return

    self.site._p_jar.sync()
    if sitemap in self.data['tracking']:
        last_crawled = DateTime(self.data['tracking'][sitemap])
    else:
        last_crawled = DateTime('1999/01/01')

    self.data['tracking'][sitemap] = DateTime().ISO8601().decode('utf8')
    transaction.commit()
    clear_object_cache(self.site)

    # Sitemaps may be served gzip-compressed.
    if sitemap.lower().endswith('.gz'):
        sitemap_content = gzip.GzipFile(fileobj=StringIO(resp.content)).read()
    else:
        sitemap_content = resp.content

    dom = etree.fromstring(sitemap_content)

    crawled_urls = []
    for url_node in dom.xpath("//*[local-name() = 'url']"):
        loc = url_node.xpath("*[local-name() = 'loc']")
        if loc:
            loc = loc[0].text.strip()
        else:
            loc = None
        url = loc
        crawled_urls.append(url)

        lastmod = url_node.xpath("*[local-name() = 'lastmod']")
        if lastmod:
            lastmod = lastmod[0].text.strip()
        else:
            lastmod = None
        if lastmod:
            lastmod = DateTime(lastmod)
            if not full and lastmod < last_crawled:
                continue

        if not url:
            continue

        # Throttle requests between pages if an interval is configured.
        try:
            interval = self.settings.crawler_interval
        except Exception:
            interval = 0
        time.sleep(interval)

        data = self.crawl_page(url)
        if data is False:
            crawled_urls.remove(url)
            try:
                self.es.connection.delete(index=self.index_name, id=url)
            except NotFoundError:
                pass
        else:
            data['sitemap'] = sitemap
            self.es.connection.index(
                index=self.index_name, id=url, body=data)
            crawled_urls.append(url)

    self.clean_removed_pages(sitemap, crawled_urls)


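# Standalone sketch of the sitemap parsing used above: a made-up sitemap
# snippet parsed with the same lxml `etree` the module imports. The
# local-name() XPath predicate matches <url>/<loc>/<lastmod> regardless of
# the sitemap XML namespace. The function name and data are illustrative only.
def _parse_sitemap_example():
    example_sitemap = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/front-page</loc>
    <lastmod>2021-01-15</lastmod>
  </url>
</urlset>"""
    dom = etree.fromstring(example_sitemap)
    results = []
    for url_node in dom.xpath("//*[local-name() = 'url']"):
        loc = url_node.xpath("*[local-name() = 'loc']")[0].text.strip()
        lastmod = url_node.xpath("*[local-name() = 'lastmod']")
        lastmod = lastmod[0].text.strip() if lastmod else None
        results.append((loc, lastmod))
    # -> [('https://example.com/front-page', '2021-01-15')]
    return results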