Example #1
import json
import logging
import re
import traceback
from copy import deepcopy

import urlclustering

# sitemaps_from_robots, read_sitemaps and improve_patterns are
# project-level helpers assumed to be in scope.


def cluster(url):
    """
    Read URLs from sitemaps and return clusters as a JSON string.
    `url` is either a website (and we detect its sitemaps) or a sitemap URL.
    """
    data = {}
    # default to http:// when no scheme is given
    if not url.startswith("http"):
        url = "http://" + url

    if re.search(r"https?://[^/?#]+[/?#].+", url):
        # URL has a path/query/fragment: treat it as a sitemap URL
        sitemaps = [url]
    else:
        # website root given: look for sitemaps declared in robots.txt
        robots = url.rstrip("/") + "/robots.txt"
        sitemaps = sitemaps_from_robots(robots)
        if not sitemaps:
            # fall back to the conventional /sitemap.xml location
            sitemaps = [url.rstrip("/") + "/sitemap.xml"]

    if sitemaps:
        try:
            urls = read_sitemaps(sitemaps)
            if not urls:
                data["error"] = "No URLs found in sitemap"
            else:
                data["count"] = len(urls)
                urls = [x.strip() for x in urls]
                # cluster URLs
                c = urlclustering.cluster(urls)
                # keep a copy so we can roll back if pattern improvement fails
                tmp = deepcopy(c["clusters"])
                try:
                    improve_patterns(c["clusters"])
                except Exception:
                    c["clusters"] = tmp
                # prepare HTML report, largest clusters first
                html = "<pre>CLUSTERS:"
                keys = sorted(c["clusters"], key=lambda k: len(c["clusters"][k]), reverse=True)
                for key in keys:
                    members = c["clusters"][key]
                    # key is a (regex, human-readable pattern) tuple
                    html += "\n" + key[1] + " [%s URLs]<br/>" % len(members)
                    html += "\t" + "\n\t".join(members[:5])
                    if len(members) > 5:
                        html += "\n\t...%s more" % (len(members) - 5)
                html += "\n\nUNCLUSTERED:\n"
                html += "\t" + "\n\t".join(c["unclustered"])
                html += "</pre>"
                data["html"] = html
        except Exception:
            logging.debug(traceback.format_exc())
            data["error"] = "An error happened while fetching sitemaps"
    else:
        data["error"] = "No sitemaps found"

    return json.dumps(data)
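
A minimal usage sketch for the handler above, assuming the code is importable; the module name sitemap_clustering is a placeholder and the actual output depends on the target site's sitemaps:

import json

from sitemap_clustering import cluster  # hypothetical module name

result = json.loads(cluster("example.com"))
if "error" in result:
    print("Failed:", result["error"])
else:
    print("URLs found:", result["count"])
    print(result["html"])  # <pre>-wrapped cluster report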

# test_other belongs to a unittest.TestCase subclass; cluster_urls and
# improve_patterns are assumed to be importable from the clustering module.
def test_other(self):
    x_urls = ['http://s.com/blah/%d' % x for x in range(1, 20)]
    y_urls = ['http://s.com/a/b/aa%dbb' % x for x in range(1, 20)]
    z_urls = ['http://b.com/ab/aa%dbb' % x for x in range(1, 50)]
    c = cluster_urls(x_urls + y_urls + z_urls, 10)
    improve_patterns(c['clusters'])
    # every URL should land in a cluster, keyed by a
    # (regex, human-readable pattern) tuple
    self.assertEqual(c['unclustered'], [])
    self.assertEqual(
        sorted(c['clusters'].keys()),
        sorted([
            ('http://b.com/ab/aa([^/]+)bb', 'http://b.com/ab/aa[...]bb'),
            ('http://s.com/blah/(\\d+)', 'http://s.com/blah/[NUMBER]'),
            ('http://s.com/a/b/aa([^/]+)bb', 'http://s.com/a/b/aa[...]bb')
        ]))
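
For reference, a standalone sketch of the data shape the code above works with, calling the urlclustering package directly; the exact patterns produced are illustrative and may differ by library version:

import urlclustering

urls = ['http://s.com/blah/%d' % x for x in range(1, 20)]
result = urlclustering.cluster(urls)

# result is a dict with 'clusters' (mapping a (regex, human-readable
# pattern) tuple to the matching URLs) and 'unclustered' (URLs that
# did not fit any pattern)
for key, members in result['clusters'].items():
    print(key[1], len(members))
print('unclustered:', result['unclustered'])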