# sitemap.py -- fetch a site's sitemaps and cluster the URLs they contain
import json
import logging
import re
import sys
import traceback
import urllib2
from copy import deepcopy
from gzip import GzipFile
from StringIO import StringIO

from lxml import etree

import urlclustering
from urlclustering.reimprover import improve_patterns

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

def _fetch_url(url):
    """Fetch a URL and return its body ('' on any failure), gunzipping if needed."""
    logging.debug('Fetching: ' + url)
    webpage = ''
    try:
        request = urllib2.Request(url)
        request.add_header('Accept-encoding', 'gzip')
        response = urllib2.urlopen(request, timeout=10)
        if response.getcode() != 200:
            return ''
        webpage = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            webpage = GzipFile(fileobj=StringIO(webpage)).read()
    except Exception:
        logging.debug(traceback.format_exc())
        webpage = ''
    return webpage

def _read_sitemap(xml, urls, sitemaps):
    """Parse a sitemap (xml), adding found URLs to `urls` and nested sitemaps to `sitemaps`."""
    tree = etree.fromstring(xml)
    # lxml expects a prefix -> URI dict here, not a list of tuples
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # match both namespaced and non-namespaced elements
    for node in tree.xpath('//sm:sitemap | //sitemap', namespaces=ns):
        for loc in node.xpath('sm:loc | loc', namespaces=ns):
            if loc.text.strip() not in sitemaps:
                sitemaps.append(loc.text.strip())
    for node in tree.xpath('//sm:url | //url', namespaces=ns):
        for loc in node.xpath('sm:loc | loc', namespaces=ns):
            urls.add(loc.text)

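# For reference, the two sitemaps.org document shapes handled above
# (trimmed; the example.com URLs are illustrative):
#
#   <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <sitemap><loc>http://example.com/sitemap-1.xml</loc></sitemap>
#   </sitemapindex>
#
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <url><loc>http://example.com/some-page</loc></url>
#   </urlset>
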
def read_sitemaps(sitemaps, max_urls=10000):
    """
    Read one or more sitemaps and return all URLs.
    sitemaps: a list of sitemap URLs; consumed, and extended with any
              nested sitemaps found along the way
    max_urls: stop processing more sitemaps once max_urls are already found
    """
    urls = set()
    while len(sitemaps) > 0:
        url = sitemaps.pop(0)
        webpage = _fetch_url(url)
        if len(webpage) == 0:
            continue
        # not every server returns a correct Content-Encoding header, so
        # sniff: a real sitemap mentions the sitemaps.org namespace early on
        if 'sitemaps' not in webpage[:1000]:
            try:
                webpage = GzipFile(fileobj=StringIO(webpage)).read()
                if 'sitemaps' not in webpage[:1000]:
                    continue
            except Exception:
                logging.debug(traceback.format_exc())
                continue
        # read sitemap
        logging.debug('Reading sitemap: ' + url)
        if isinstance(webpage, unicode):
            webpage = webpage.encode('utf-8')
        _read_sitemap(webpage, urls, sitemaps)
        logging.debug('URLs so far: %s' % len(urls))
        if len(urls) > max_urls:
            break
    return list(urls)[:max_urls]

def sitemaps_from_robots(url):
    """Return the list of sitemap URLs declared in robots.txt"""
    sitemaps = []
    webpage = _fetch_url(url)
    if len(webpage) > 0:
        matches = re.findall(ur'^\s*Sitemap\s*:\s*(.*?)$',
                             webpage, re.I | re.M)
        for match in matches:
            match = match.strip()  # drop a trailing \r from CRLF robots.txt
            if match[:4] != 'http':
                # resolve a relative path against the scheme and host of `url`
                if match[:1] == '/':
                    match = match[1:]
                match = url[0:url.find('/', 8)] + '/' + match
            sitemaps.append(match)
    return sitemaps

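# Illustrative example: for a robots.txt at http://example.com/robots.txt with
#
#   Sitemap: http://example.com/sitemap-products.xml
#   Sitemap: /sitemap-news.xml
#
# sitemaps_from_robots() returns both URLs, with the relative second entry
# resolved to http://example.com/sitemap-news.xml.
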
def cluster(url):
    """
    Read URLs from sitemaps and return clusters as JSON.
    url is either a website (whose sitemaps we detect) or a sitemap URL
    """
    data = {}
    if url[:4] != 'http':
        url = 'http://' + url
    if re.search(r'https?://[^/?#]+[/?#].+', url):
        sitemaps = [url]  # a sitemap URL was given directly
    else:
        robots = url.strip('/') + '/robots.txt'
        sitemaps = sitemaps_from_robots(robots)
        if not sitemaps:
            # fall back to the conventional location
            sitemaps = [url.strip('/') + '/sitemap.xml']
    if sitemaps:
        try:
            urls = read_sitemaps(sitemaps)
            if not urls:
                data['error'] = 'No URLs found in sitemap'
            else:
                data['count'] = len(urls)
                urls = [x.strip() for x in urls]
                # cluster URLs
                c = urlclustering.cluster(urls)
                tmp = deepcopy(c['clusters'])
                try:
                    improve_patterns(c['clusters'])
                except Exception:
                    # if refining the patterns fails, keep the original clusters
                    c['clusters'] = tmp
                # prepare HTML, largest clusters first
                html = '<pre>CLUSTERS:'
                keys = sorted(c['clusters'],
                              key=lambda k: len(c['clusters'][k]),
                              reverse=True)
                for key in keys:
                    cluster_urls = c['clusters'][key]
                    # key[1] is the human-readable pattern of the cluster
                    html += '\n' + key[1] + ' [%s URLs]<br/>' % len(cluster_urls)
                    html += '\t' + '\n\t'.join(cluster_urls[:5])
                    if len(cluster_urls) > 5:
                        html += '\n\t...%s more' % (len(cluster_urls) - 5)
                html += '\n\nUNCLUSTERED:\n'
                html += '\t' + '\n\t'.join(c['unclustered'])
                html += '</pre>'
                data['html'] = html
        except Exception:
            logging.debug(traceback.format_exc())
            data['error'] = 'An error happened while fetching sitemaps'
    else:
        data['error'] = 'No sitemaps found'
    return json.dumps(data)
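
# A minimal command-line sketch (an illustrative addition, not part of the
# original module): pass a website or sitemap URL and print the JSON result.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('usage: python sitemap.py <website-or-sitemap-url>')
        sys.exit(1)
    print(cluster(sys.argv[1]))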