def run(app):
    singleton.SingleInstance('crawler')

    app = spoof_request(app)  # noqa
    login_as_admin(app)  # noqa

    count = 0
    while True:
        try:
            if 'site-id' in sys.argv:
                siteid = sys.argv['site-id']
                setup_site(app[siteid])
                crawl_site(app[siteid])  # noqa
            else:
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        try:
                            setup_site(obj)
                            obj._p_jar.sync()
                            crawl_site(obj, count % 10 == 0)
                        except Exception:
                            logger.error(
                                'Error crawling site %s' % oid,
                                exc_info=True)
        except KeyError:
            pass
        except Exception:
            logger.error('Error setting up crawling', exc_info=True)

        logger.info('Waiting to crawl again')
        time.sleep(10 * 60)
        count += 1
def crawl_page(self, url):
    logger.info('Indexing ' + url)
    resp = requests.get(
        url,
        stream=True,
        headers={'User-Agent': self.settings.crawler_user_agent})
    if resp.status_code == 404 or \
            'html' not in resp.headers.get('content-type', '') or \
            int(resp.headers.get('content-length', 0)) >= MAX_PAGE_SIZE:
        # remove from index
        return False
    dom = html.fromstring(resp.content)
    parsed = urlparse(url)
    data = {
        'url': url,
        'domain': parsed.netloc
    }
    for name, selectors in self._meta_properties.items():
        for selector in selectors:
            result = dom.cssselect(selector)
            if len(result) > 0:
                result = result[0]
                if result.attrib.get('content'):
                    data[name] = result.attrib['content']
                    break
                elif result.text_content().strip():
                    data[name] = result.text.strip()
                    break
    for date_field in self.date_fields:
        val = data.get(date_field)
        if val:
            try:
                data[date_field] = DateTime(val).ISO8601()
            except Exception:
                pass
    searchable_text = [
        data.get('Title', ''),
        data.get('Description', '')
    ]
    for el in dom.cssselect(self.searchable_text_selector):
        searchable_text.append(el.text_content())
    data['SearchableText'] = ' '.join(searchable_text)
    return data
def crawl_page(self, url):
    logger.info('Indexing ' + url)
    try:
        resp = requests.get(
            url,
            headers={'User-Agent': self.settings.crawler_user_agent})
    except Exception:
        # unable to access the page, remove for now
        return False
    if resp.status_code == 404 or 'html' not in resp.headers.get(
            'content-type', ''):
        # remove from index
        return False
    try:
        dom = html.fromstring(resp.content)
    except etree.XMLSyntaxError:
        # unable to parse html, remove for now
        # (lxml has been known to throw this as a bug, maybe use BeautifulSoup)
        return False
    parsed = urlparse(url)
    data = {'url': url, 'domain': parsed.netloc}
    for name, selectors in self._meta_properties.items():
        for selector in selectors:
            result = dom.cssselect(selector)
            if len(result) > 0:
                result = result[0]
                if result.attrib.get('content'):
                    data[name] = result.attrib['content']
                    break
                elif result.text_content().strip():
                    data[name] = result.text.strip()
                    break
    for date_field in self.date_fields:
        val = data.get(date_field)
        if val:
            try:
                data[date_field] = DateTime(val).ISO8601()
            except Exception:
                pass
    searchable_text = [data.get('Title', ''), data.get('Description', '')]
    for el in dom.cssselect(self.searchable_text_selector):
        searchable_text.append(el.text_content())
    data['SearchableText'] = ' '.join(searchable_text)
    return data
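# Illustrative, self-contained sketch (not part of the CastleCMS source):
# how the _meta_properties loop in crawl_page pulls metadata out of a page.
# The selector mapping below is a hypothetical example, not the real crawler
# defaults. Requires lxml plus the cssselect package.
from lxml import html

_meta_properties = {
    'Title': ['meta[property="og:title"]', 'title'],
    'Description': ['meta[name="description"]'],
}

page = html.fromstring('''
<html>
  <head>
    <title>Fallback title</title>
    <meta property="og:title" content="Example page"/>
    <meta name="description" content="A short description"/>
  </head>
  <body><p>Body text</p></body>
</html>
''')

data = {}
for name, selectors in _meta_properties.items():
    for selector in selectors:
        result = page.cssselect(selector)
        if result:
            result = result[0]
            # prefer the content attribute (meta tags), fall back to the text
            if result.attrib.get('content'):
                data[name] = result.attrib['content']
                break
            elif result.text_content().strip():
                data[name] = result.text_content().strip()
                break

print(data)  # {'Title': 'Example page', 'Description': 'A short description'}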
def run(app):
    singleton.SingleInstance('twittermonitor')

    user = app.acl_users.getUser('admin')  # noqa
    newSecurityManager(None, user.__of__(app.acl_users))  # noqa

    while True:
        try:
            if 'site-id' in sys.argv:
                siteid = sys.argv['site-id']
                attempt_twitter_on_site(app[siteid])  # noqa
            else:
                for oid in app.objectIds():  # noqa
                    obj = app[oid]  # noqa
                    if IPloneSiteRoot.providedBy(obj):
                        attempt_twitter_on_site(obj)
        except KeyError:
            pass

        logger.info('Could not find valid site to monitor')
        time.sleep(10 * 60)
def crawl_site(site, full=False):
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ICrawlerConfiguration, prefix='castle')
    if not settings.crawler_active or not settings.crawler_site_maps:
        logger.info("Crawler must first be enabled in Site Setup")
        return False

    catalog = api.portal.get_tool('portal_catalog')
    es = ElasticSearchCatalog(catalog)
    index_name = '{site_index_name}_crawler'.format(
        site_index_name=es.index_name)

    if not es.enabled:
        logger.info(
            "Elasticsearch must be enabled in Site Setup to use crawler")
        return False

    # check index type is mapped, create if not
    try:
        es.connection.indices.get_mapping(index=index_name)
    except NotFoundError:
        # need to add it
        adapter = getMultiAdapter((getRequest(), es), IMappingProvider)
        mapping = adapter()
        mapping['properties'].update(CRAWLER_ES_MAPPING)
        if not es.connection.indices.exists(index_name):
            es.connection.indices.create(index_name)
        es.connection.indices.put_mapping(body=mapping, index=index_name)

    crawler = Crawler(site, settings, es)

    if settings.crawler_index_archive:
        crawler.crawl_archives()

    for sitemap in settings.crawler_site_maps:
        try:
            crawler.crawl_site_map(sitemap, full)
        except Exception:
            logger.error('Error crawling site map: %s' % sitemap,
                         exc_info=True)
    return True
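# Illustrative standalone sketch of the "create the index and put the crawler
# mapping only if it is missing" pattern used in crawl_site. Assumptions: a
# local Elasticsearch on localhost:9200, the elasticsearch-py 7.x client, and
# a made-up index name and mapping; the real mapping comes from the
# IMappingProvider adapter plus CRAWLER_ES_MAPPING.
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError

es = Elasticsearch(['http://localhost:9200'])
index_name = 'mysite_crawler'

try:
    es.indices.get_mapping(index=index_name)
except NotFoundError:
    # the index (or its mapping) does not exist yet, so set it up
    mapping = {
        'properties': {
            'url': {'type': 'keyword'},
            'domain': {'type': 'keyword'},
            'Title': {'type': 'text'},
            'SearchableText': {'type': 'text'},
        }
    }
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name)
    es.indices.put_mapping(body=mapping, index=index_name)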