def get_social_counts(site, obj, site_url, count=0):
    counts = {}
    site_path = '/'.join(site.getPhysicalPath())
    obj_path = '/'.join(obj.getPhysicalPath())
    rel_path = obj_path[len(site_path):].strip('/')
    print('Looking up ' + rel_path)

    # Canonical public URL for the object.
    urls = [site_url.rstrip('/') + '/' + rel_path]

    # Types rendered with /view in listings are shared under that URL too.
    registry = getUtility(IRegistry)
    if obj.portal_type in registry.get(
            'plone.types_use_view_action_in_listings', []):
        urls.append(urls[0] + '/view')

    # If the object is its container's default page, the container URL
    # also resolves to it.
    container = aq_parent(obj)
    if defaultpage.is_default_page(container, obj):
        container_path = '/'.join(container.getPhysicalPath())
        rel_path = container_path[len(site_path):].strip('/')
        urls.append(site_url.rstrip('/') + '/' + rel_path)

    # Include any old paths recorded by the redirection storage.
    redirector = getUtility(IRedirectionStorage)
    for redirect in redirector.redirects(obj_path):
        rel_path = redirect[len(site_path):].strip('/')
        urls.append(site_url.rstrip('/') + '/' + rel_path)

    urls = _get_urls(urls)
    counts = _get_counts(urls)
    if not _has_data(counts):
        return

    obj._p_jar.sync()
    annotations = IAnnotations(obj)
    existing = annotations.get(COUNT_ANNOTATION_KEY, OOBTree())
    if not _count_diff(existing, counts):
        return
    # XXX check if value different first before transaction!
    existing.update(counts)
    annotations[COUNT_ANNOTATION_KEY] = existing
    transaction.commit()
    index_in_es(obj)

    # Every 200 objects, clear the site's object cache.
    if count % 200 == 0:
        clear_object_cache(site)
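
# --- Hedged usage sketch (not part of the original code) -------------------
# Shows how get_social_counts() might be driven over a site's content, with a
# running counter so the object cache gets cleared every 200 items. The name
# ``update_all_social_counts`` and the bare catalog() query are assumptions
# made here for illustration only.
def update_all_social_counts(site, site_url):
    catalog = site.portal_catalog
    for count, brain in enumerate(catalog(), start=1):
        try:
            obj = brain.getObject()
        except (AttributeError, KeyError):
            # Stale catalog entry; nothing to look up.
            continue
        get_social_counts(site, obj, site_url, count=count)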
def crawl_site_map(self, sitemap, full=False):
    resp = requests.get(
        sitemap,
        headers={'User-Agent': self.settings.crawler_user_agent})
    if resp.status_code != 200:
        logger.error('Not a valid sitemap response for %s' % sitemap)
        return

    # Make sure we see the latest ZODB state before reading tracking data.
    self.site._p_jar.sync()
    if sitemap in self.data['tracking']:
        last_crawled = DateTime(self.data['tracking'][sitemap])
    else:
        last_crawled = DateTime('1999/01/01')

    # Record the time of this crawl for the next incremental run.
    self.data['tracking'][sitemap] = DateTime().ISO8601().decode('utf8')
    transaction.commit()
    clear_object_cache(self.site)

    # Sitemaps may be gzip-compressed.
    if sitemap.lower().endswith('.gz'):
        sitemap_content = gzip.GzipFile(
            fileobj=StringIO(resp.content)).read()
    else:
        sitemap_content = resp.content

    dom = etree.fromstring(sitemap_content)
    crawled_urls = []
    for url_node in dom.xpath("//*[local-name() = 'url']"):
        loc = url_node.xpath("*[local-name() = 'loc']")
        if loc:
            loc = loc[0].text.strip()
        else:
            loc = None
        url = loc
        crawled_urls.append(url)

        lastmod = url_node.xpath("*[local-name() = 'lastmod']")
        if lastmod:
            lastmod = lastmod[0].text.strip()
        else:
            lastmod = None
        if lastmod:
            lastmod = DateTime(lastmod)
            # Skip pages that have not changed since the last crawl.
            if not full and lastmod < last_crawled:
                continue
        if not url:
            continue

        # Throttle requests between pages.
        try:
            interval = self.settings.crawler_interval
        except Exception:
            interval = 0
        time.sleep(interval)

        data = self.crawl_page(url)
        if data is False:
            # Page could not be crawled; drop it from the index.
            crawled_urls.remove(url)
            try:
                self.es.connection.delete(index=self.index_name, id=url)
            except NotFoundError:
                pass
        else:
            data['sitemap'] = sitemap
            self.es.connection.index(
                index=self.index_name, id=url, body=data)
            crawled_urls.append(url)

    self.clean_removed_pages(sitemap, crawled_urls)
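
# --- Hedged example (not part of the original code) ------------------------
# Standalone illustration of the namespace-agnostic XPath used above:
# local-name() matches <url>, <loc> and <lastmod> regardless of the namespace
# prefix the sitemap declares. The sample document is made up.
from lxml import etree

SAMPLE_SITEMAP = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page</loc>
    <lastmod>2020-01-01</lastmod>
  </url>
</urlset>"""

dom = etree.fromstring(SAMPLE_SITEMAP)
for url_node in dom.xpath("//*[local-name() = 'url']"):
    loc = url_node.xpath("*[local-name() = 'loc']")[0].text.strip()
    lastmod = url_node.xpath("*[local-name() = 'lastmod']")
    print(loc, lastmod[0].text.strip() if lastmod else None)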
def find_broken(site):
    setup_site(site)
    catalog = site.portal_catalog
    broken = []
    good_urls = []
    checked_urls = []
    req = getRequest()

    for brain in catalog(object_provides=ILayoutAware.__identifier__):
        ob = brain.getObject()
        clear_object_cache(ob)

        # Render the object's layout with its tiles so every link that would
        # appear on the page is present in the DOM.
        layout = getLayout(ob)
        dom = getHTMLSerializer(layout)
        tiles.renderTiles(req, dom.tree, ob.absolute_url() + '/layout_view')
        root = dom.tree.getroot()

        for anchor in root.cssselect('a'):
            if not anchor.attrib.get('href'):
                continue
            url = anchor.attrib['href']
            if (url[0] == '#' or url.startswith('data:') or
                    url.startswith('mailto:')):
                continue
            if url in good_urls:
                continue
            if url in checked_urls:
                print('skipping already checked {}'.format(url))
                continue
            checked_urls.append(url)
            if find_url(ob, url):
                good_urls.append(url)
            else:
                try:
                    text = unidecode(anchor.text_content())
                except Exception:
                    text = ''
                result = '{} linking to broken -> {}({})'.format(
                    brain.getPath(), url, text)
                broken.append(result)
                print(result)

        for img in root.cssselect('img'):
            if not img.attrib.get('src'):
                continue
            url = img.attrib['src']
            if url[0] == '#' or url.startswith('data:'):
                continue
            if url in checked_urls:
                print('skipping already checked {}'.format(url))
                continue
            checked_urls.append(url)
            if find_url(ob, url):
                good_urls.append(url)
            else:
                result = '{} linking to broken image -> {}'.format(
                    brain.getPath(), url)
                broken.append(result)
                print(result)

    # Write the report of broken links to a timestamped file.
    now = datetime.datetime.now()
    filename = 'broken-links-{}.txt'.format(now.isoformat())
    fi = open(filename, 'w')
    fi.write('\n'.join(broken))
    fi.close()
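
# --- Hedged example (not part of the original code) ------------------------
# One way the checker might be driven: a small script passed to
# ``bin/instance run``, where the Zope startup code binds ``app`` (the
# application root) into the script's globals. The site id 'Castle' is an
# assumption made for illustration.
site = app['Castle']  # noqa: F821 -- ``app`` is provided by bin/instance run
find_broken(site)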