Example #1
# Assumed imports for this excerpt; helpers such as _get_urls, _get_counts,
# _has_data, _count_diff, COUNT_ANNOTATION_KEY, index_in_es and
# clear_object_cache are defined elsewhere in the source module.
import transaction
from Acquisition import aq_parent
from BTrees.OOBTree import OOBTree
from plone.app.redirector.interfaces import IRedirectionStorage
from plone.registry.interfaces import IRegistry
from Products.CMFPlone import defaultpage
from zope.annotation.interfaces import IAnnotations
from zope.component import getUtility


def get_social_counts(site, obj, site_url, count=0):
    site_path = '/'.join(site.getPhysicalPath())
    obj_path = '/'.join(obj.getPhysicalPath())
    rel_path = obj_path[len(site_path):].strip('/')
    print('Looking up ' + rel_path)

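    # Candidate public URLs for this object: the canonical URL, a '/view'
    # variant for types configured to use the view action in listings, the
    # parent URL when the object is its container's default page, and any
    # stored redirect paths.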
    urls = [site_url.rstrip('/') + '/' + rel_path]
    registry = getUtility(IRegistry)
    if obj.portal_type in registry.get('plone.types_use_view_action_in_listings', []):
        urls.append(urls[0] + '/view')

    container = aq_parent(obj)
    if defaultpage.is_default_page(container, obj):
        container_path = '/'.join(container.getPhysicalPath())
        rel_path = container_path[len(site_path):].strip('/')
        urls.append(site_url.rstrip('/') + '/' + rel_path)

    redirector = getUtility(IRedirectionStorage)
    for redirect in redirector.redirects(obj_path):
        rel_path = redirect[len(site_path):].strip('/')
        urls.append(site_url.rstrip('/') + '/' + rel_path)

    urls = _get_urls(urls)
    counts = _get_counts(urls)

    if not _has_data(counts):
        return

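    # Sync the ZODB connection so this long-running job sees fresh state
    # before updating the persistent count annotations.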
    obj._p_jar.sync()
    annotations = IAnnotations(obj)
    existing = annotations.get(COUNT_ANNOTATION_KEY, OOBTree())

    if not _count_diff(existing, counts):
        return

    # XXX check if value different first before transaction!
    existing.update(counts)
    annotations[COUNT_ANNOTATION_KEY] = existing

    transaction.commit()

    index_in_es(obj)
    if count % 200 == 0:
        clear_object_cache(site)
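
A hypothetical driver for the function above, not part of the original module: it assumes a plain portal_catalog query and passes an incrementing count, which is what triggers the periodic clear_object_cache(site) call every 200 objects.

def update_all_social_counts(site, site_url):
    # Sketch only; the portal_type filter is purely illustrative.
    catalog = site.portal_catalog
    for count, brain in enumerate(catalog(portal_type=['Document', 'News Item'])):
        obj = brain.getObject()
        get_social_counts(site, obj, site_url, count=count)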
Example #2
    def crawl_site_map(self, sitemap, full=False):
        # Excerpt from a crawler class: requests, gzip, io, time, transaction,
        # DateTime, lxml.etree and elasticsearch's NotFoundError are assumed to
        # be imported at module level, and crawl_page, clean_removed_pages and
        # clear_object_cache are defined elsewhere in the source.
        resp = requests.get(
            sitemap, headers={'User-Agent': self.settings.crawler_user_agent})
        if resp.status_code != 200:
            logger.error('Not a valid sitemap response for %s', sitemap)
            return

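        # Sync the ZODB connection, then record this crawl's timestamp so the
        # next run can skip sitemap entries that have not changed since.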
        self.site._p_jar.sync()
        if sitemap in self.data['tracking']:
            last_crawled = DateTime(self.data['tracking'][sitemap])
        else:
            last_crawled = DateTime('1999/01/01')

        # ISO8601() already returns text, so no decode() is needed.
        self.data['tracking'][sitemap] = DateTime().ISO8601()
        transaction.commit()
        clear_object_cache(self.site)

        if sitemap.lower().endswith('.gz'):
            # resp.content is bytes, so wrap it in io.BytesIO for GzipFile.
            sitemap_content = gzip.GzipFile(
                fileobj=io.BytesIO(resp.content)).read()
        else:
            sitemap_content = resp.content

        dom = etree.fromstring(sitemap_content)
        crawled_urls = []
        for url_node in dom.xpath("//*[local-name() = 'url']"):
            loc = url_node.xpath("*[local-name() = 'loc']")
            url = loc[0].text.strip() if loc else None
            if not url:
                continue
            # Record every URL still present in the sitemap, including ones
            # skipped below, so clean_removed_pages() only deletes pages that
            # are truly gone.
            crawled_urls.append(url)

            lastmod = url_node.xpath("*[local-name() = 'lastmod']")
            lastmod = lastmod[0].text.strip() if lastmod else None
            if lastmod:
                lastmod = DateTime(lastmod)
                if not full and lastmod < last_crawled:
                    # Unchanged since the last crawl; keep it indexed but skip
                    # re-crawling unless a full crawl was requested.
                    continue

            try:
                interval = self.settings.crawler_interval
            except Exception:
                interval = 0
            time.sleep(interval)
            data = self.crawl_page(url)
            if data is False:
                # Page could not be crawled: drop it from the tracked URLs and
                # from the Elasticsearch index.
                crawled_urls.remove(url)
                try:
                    self.es.connection.delete(index=self.index_name, id=url)
                except NotFoundError:
                    pass
            else:
                data['sitemap'] = sitemap
                self.es.connection.index(index=self.index_name,
                                         id=url,
                                         body=data)

        self.clean_removed_pages(sitemap, crawled_urls)
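
The XPath expressions above use local-name() so they match sitemap nodes with or without the usual namespace declaration. Below is a standalone sketch of the same loc/lastmod extraction against a minimal inline sitemap, assuming only lxml:

from lxml import etree

SITEMAP_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/page-one</loc>
    <lastmod>2021-06-01</lastmod>
  </url>
</urlset>"""

dom = etree.fromstring(SITEMAP_XML)
for url_node in dom.xpath("//*[local-name() = 'url']"):
    loc = url_node.xpath("*[local-name() = 'loc']")
    lastmod = url_node.xpath("*[local-name() = 'lastmod']")
    print(loc[0].text.strip() if loc else None,
          lastmod[0].text.strip() if lastmod else None)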
Example #3
def find_broken(site):
    # Excerpt: setup_site, getLayout, getHTMLSerializer, find_url and
    # clear_object_cache come from the surrounding module; getRequest,
    # ILayoutAware, tiles, unidecode and datetime are assumed module-level
    # imports.
    setup_site(site)
    catalog = site.portal_catalog

    broken = []
    good_urls = []
    checked_urls = []

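    # Render the stored layout of every layout-aware object and check each
    # anchor and image URL it references.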
    req = getRequest()
    for brain in catalog(object_provides=ILayoutAware.__identifier__):
        ob = brain.getObject()
        clear_object_cache(ob)
        layout = getLayout(ob)
        dom = getHTMLSerializer(layout)
        tiles.renderTiles(req, dom.tree, ob.absolute_url() + '/layout_view')
        root = dom.tree.getroot()
        for anchor in root.cssselect('a'):
            if not anchor.attrib.get('href'):
                continue
            url = anchor.attrib['href']
            if (url[0] == '#' or url.startswith('data:')
                    or url.startswith('mailto:')):
                continue
            if url in good_urls:
                continue
            if url in checked_urls:
                print('skipping already checked {}'.format(url))
                continue
            checked_urls.append(url)
            if find_url(ob, url):
                good_urls.append(url)
            else:
                try:
                    text = unidecode(anchor.text_content())
                except Exception:
                    text = ''
                result = '{} linking to broken -> {}({})'.format(
                    brain.getPath(), url, text)
                broken.append(result)
                print(result)

        for img in root.cssselect('img'):
            if not img.attrib.get('src'):
                continue
            url = img.attrib['src']
            if url[0] == '#' or url.startswith('data:'):
                continue
            if url in checked_urls:
                print('skipping already checked {}'.format(url))
                continue
            checked_urls.append(url)
            if find_url(ob, url):
                good_urls.append(url)
            else:
                result = '{} linking to broken image -> {}'.format(
                    brain.getPath(), url)
                broken.append(result)
                print(result)

    now = datetime.datetime.now()
    filename = 'broken-links-{}.txt'.format(now.isoformat())
    with open(filename, 'w') as fi:
        fi.write('\n'.join(broken))
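
The anchor and image walk above relies on lxml's cssselect support; here is a small standalone sketch of the same extraction and skip rules, assuming the lxml and cssselect packages:

from lxml import html

PAGE = """<html><body>
<a href="/ok">regular link</a>
<a href="#top">in-page anchor</a>
<a href="mailto:someone@example.com">mail link</a>
<img src="data:image/png;base64,AAAA"/>
<img src="/images/logo.png"/>
</body></html>"""

root = html.fromstring(PAGE)
for anchor in root.cssselect('a'):
    url = anchor.attrib.get('href')
    if not url or url[0] == '#' or url.startswith(('data:', 'mailto:')):
        continue
    print('would check link:', url, repr(anchor.text_content()))
for img in root.cssselect('img'):
    url = img.attrib.get('src')
    if not url or url[0] == '#' or url.startswith('data:'):
        continue
    print('would check image:', url)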