import json
import logging
import re
import traceback
from copy import deepcopy

import urlclustering

# sitemaps_from_robots, read_sitemaps and improve_patterns are project helpers
# assumed to be importable from the surrounding package.


def cluster(url):
    """
    Read URLs from sitemaps and return clusters as a JSON string.

    `url` is either a website (sitemaps are then auto-detected) or a
    sitemap URL.
    """
    data = {}
    if not url.startswith("http"):
        url = "http://" + url
    if re.search(r"https?://[^/?#]+[/?#].+", url):
        # the URL has a path/query/fragment: treat it as a sitemap URL
        sitemaps = [url]
    else:
        # bare domain: look for sitemaps declared in robots.txt
        robots = url.strip("/") + "/robots.txt"
        sitemaps = sitemaps_from_robots(robots)
        if not sitemaps:
            # fall back to the conventional location
            sitemaps = [url.strip("/") + "/sitemap.xml"]
    if sitemaps:
        try:
            urls = read_sitemaps(sitemaps)
            if not urls:
                data["error"] = "No URLs found in sitemap"
            else:
                data["count"] = len(urls)
                urls = [x.strip() for x in urls]
                # cluster URLs
                c = urlclustering.cluster(urls)
                tmp = deepcopy(c["clusters"])
                try:
                    improve_patterns(c["clusters"])
                except Exception:
                    # keep the original clusters if pattern improvement fails
                    c["clusters"] = tmp
                # prepare HTML, largest clusters first
                html = "<pre>CLUSTERS:"
                keys = sorted(c["clusters"],
                              key=lambda k: len(c["clusters"][k]),
                              reverse=True)
                for key in keys:
                    urls = c["clusters"][key]
                    html += "\n" + key[1] + " [%s URLs]<br/>" % len(urls)
                    html += "\t" + "\n\t".join(urls[:5])
                    if len(urls) > 5:
                        html += "\n\t...%s more" % (len(urls) - 5)
                html += "\n\nUNCLUSTERED:\n"
                html += "\t" + "\n\t".join(c["unclustered"])
                html += "</pre>"
                data["html"] = html
        except Exception:
            logging.debug(traceback.format_exc())
            data["error"] = "An error happened while fetching sitemaps"
    else:
        data["error"] = "No sitemaps found"
    return json.dumps(data)
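# Illustrative usage (an assumption for demonstration, not part of the module):
# call cluster() with a site or sitemap URL and decode the JSON payload it
# returns; the "count", "html" and "error" keys mirror the ones set above.
if __name__ == "__main__":
    result = json.loads(cluster("https://example.com/sitemap.xml"))
    if "error" in result:
        print(result["error"])
    else:
        print("%d URLs read from the sitemap" % result["count"])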
def test_other(self):
    x_urls = ['http://s.com/blah/%d' % x for x in range(1, 20)]
    y_urls = ['http://s.com/a/b/aa%dbb' % x for x in range(1, 20)]
    z_urls = ['http://b.com/ab/aa%dbb' % x for x in range(1, 50)]
    c = cluster_urls(x_urls + y_urls + z_urls, 10)
    improve_patterns(c['clusters'])
    self.assertEqual(c['unclustered'], [])
    self.assertEqual(
        sorted(c['clusters'].keys()),
        sorted([
            ('http://b.com/ab/aa([^/]+)bb', 'http://b.com/ab/aa[...]bb'),
            ('http://s.com/blah/(\\d+)', 'http://s.com/blah/[NUMBER]'),
            ('http://s.com/a/b/aa([^/]+)bb', 'http://s.com/a/b/aa[...]bb'),
        ]))