def cluster(url):
    """
    Read URLs from sitemaps for *url* and return clusters as a JSON string.

    url is either a website root (sitemaps are discovered via robots.txt,
    falling back to /sitemap.xml) or a direct sitemap URL (detected by the
    presence of a path/query/fragment component).

    Returns a JSON object containing either:
      - "count" (number of URLs) and "html" (a <pre> block listing the
        clusters, largest first, plus unclustered URLs), or
      - "error" with a human-readable message.
    """
    data = {}
    # Normalise scheme-less input like "example.com".
    if url[:4] != "http":
        url = "http://" + url
    if re.search(r"https?://[^/?#]+[/?#].+", url):
        # URL has a path/query/fragment: treat it as a sitemap URL itself.
        sitemaps = [url]  # sitemap URL given
    else:
        # Site root given: discover sitemaps via robots.txt.
        robots = url.strip("/") + "/robots.txt"
        sitemaps = sitemaps_from_robots(robots)
        if not sitemaps:
            # Nothing declared in robots.txt: assume the conventional path.
            sitemaps = [url.strip("/") + "/sitemap.xml"]
    if sitemaps:
        try:
            urls = read_sitemaps(sitemaps)
            if not urls:
                data["error"] = "No URLs found in sitemap"
            else:
                data["count"] = len(urls)
                urls = [x.strip() for x in urls]
                # cluster URLs
                c = urlclustering.cluster(urls)
                # Keep a copy so a failed refinement can't corrupt results.
                tmp = deepcopy(c["clusters"])
                try:
                    improve_patterns(c["clusters"])
                except Exception:
                    # Refinement is best-effort: restore the raw clusters.
                    # (was a bare `except:` followed by a dead `pass`)
                    c["clusters"] = tmp
                # prepare HTML, largest clusters first
                html = "<pre>CLUSTERS:"
                keys = sorted(c["clusters"],
                              key=lambda k: len(c["clusters"][k]),
                              reverse=True)
                for key in keys:
                    urls = c["clusters"][key]
                    # key is (regex, human pattern); display the human form.
                    html += "\n" + key[1] + " [%s URLs]<br/>" % len(urls)
                    html += "\t" + "\n\t".join(urls[:5])
                    if len(urls) > 5:
                        # Only advertise a remainder when one exists; the
                        # original printed "...0 more" / negative counts.
                        html += "\n\t...%s more" % (len(urls) - 5)
                html += "\n\nUNCLUSTERED:\n"
                html += "\t" + "\n\t".join(c["unclustered"])
                html += "</pre>"
                data["html"] = html
        except Exception:
            # Narrowed from a bare `except:`; log the traceback for debugging
            # but report a generic message to the caller.
            logging.debug(traceback.format_exc())
            data["error"] = "An error happened while fetching sitemaps"
    else:
        data["error"] = "No sitemaps found"
    return json.dumps(data)
def cluster_urls(
        urls: Iterable[str],
        min_cluster_size: int = 10) -> pd.DataFrame:
    """
    Cluster URLs by regex rules defined in this package:
    https://pypi.org/project/urlclustering/

    Parameters
    ----------
    urls : Iterable[str]
        List of URLs.
    min_cluster_size : int
        Minimum cluster size.
        (Docstring previously misspelled this as ``min_clustre_size``.)

    Returns
    -------
    pd.DataFrame
        Indexed by URL, with columns ``cluster`` (the cluster's
        human-readable pattern, or the URL itself when unclustered) and
        ``unclustered`` (0 for clustered URLs, 1 for unclustered ones).
    """
    import urlclustering
    clusters = urlclustering.cluster(urls, min_cluster_size)
    # Each cluster key is (regex, human pattern); map every member URL to
    # the human pattern with flag 0, then overlay unclustered URLs (flag 1).
    tmp = {v0: [k[1], 0] for k, v in clusters['clusters'].items() for v0 in v}
    tmp.update({k: [k, 1] for k in clusters['unclustered']})
    clusters = pd.DataFrame.from_dict(
        tmp, orient='index', columns=['cluster', 'unclustered'])
    return clusters
#!/usr/bin/env python
from __future__ import print_function
import random
import urlclustering


def pprint(clusters):
    """Print each cluster's regex, human-readable pattern and member URLs."""
    for pattern, members in clusters.items():
        print('REGEX:', pattern[0])
        print('HUMAN:', pattern[1])
        print('URLS:')
        print('\t' + '\n\t'.join(members) + '\n')


# Synthetic data: two standalone pages plus three URL families that are
# large enough (>= 5 members) to form clusters.
urls = [
    u'http://example.com',
    u'http://example.com/about',
]
cats = [u'http://example.com/cat/%s' % name
        for name in ('sports', 'tech', 'life', 'politics', 'world')]
tags = [u'http://example.com/tag/%s/tag%s' % (random.randint(100, 999), n)
        for n in range(10)]
arts = [u'http://example.com/article/?id=%s' % n for n in range(10)]

c = urlclustering.cluster(urls + cats + tags + arts, 5)
pprint(c['clusters'])
print('UNCLUSTERED:')
print('\t' + '\n\t'.join(c['unclustered']))
# Fix: the original used random.randint below without importing random,
# raising NameError at runtime.
import random

import urlclustering


def pprint(clusters):
    """Print each cluster's regex, human-readable pattern and member URLs."""
    for key, urls in clusters.items():
        print('REGEX:', key[0])
        print('HUMAN:', key[1])
        print('URLS:')
        print('\t' + '\n\t'.join(urls) + '\n')


# Synthetic data: two standalone pages plus three URL families that are
# large enough (>= 5 members) to form clusters.
urls = [
    u'http://example.com',
    u'http://example.com/about',
]
cats = [
    u'http://example.com/cat/%s' % x
    for x in ('sports', 'tech', 'life', 'politics', 'world')
]
tags = [
    u'http://example.com/tag/%s/tag%s' % (random.randint(100, 999), x)
    for x in range(10)
]
arts = [u'http://example.com/article/?id=%s' % x for x in range(10)]

c = urlclustering.cluster(urls + cats + tags + arts, 5)
pprint(c['clusters'])
print('UNCLUSTERED:')
print('\t' + '\n\t'.join(c['unclustered']))