#!/usr/bin/env python3
# scraper.py
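"""Crawl one or more seed URLs and collect email addresses.

Each fetched page is scanned for anything that looks like an address, and
links that stay on the same host (or one of its subdomains) are queued for
crawling in turn. Run it with one or more starting URLs, for example (the
URL below is purely illustrative):

    ./scraper.py https://example.com
"""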
import re
import sys
from pprint import pprint
from urllib.parse import SplitResult, urljoin, urlsplit, urlunsplit

import requests
import urltools
from bs4 import BeautifulSoup


def _find_addys(text):
    # Loose email pattern; an optional "mailto:" prefix is matched but not captured.
    addy_regex = re.compile(r"(?:mailto:)?([-+.\w]+@[-+.\w]+\.[a-z]+)", re.I)
    return re.findall(addy_regex, text)


def _urlsplit(arg):
    parts = urlsplit(arg, scheme="http", allow_fragments=True)
    # dump the query and the fragment
    return SplitResult(parts.scheme, parts.netloc, parts.path, "", "")


def _sanitize(url):
    ret = url
    ret = _urlsplit(ret)
    ret = urlunsplit(ret)
    ret = urltools.normalize(ret)
    return ret


def _links(url, content):
    # Collect absolute links that stay on (a subdomain of) the page's host,
    # normalised the same way as the seed URLs.
    soup = BeautifulSoup(content, "lxml")
    ret = set(a.attrs["href"] for a in soup.find_all('a') if "href" in a.attrs)
    ret = set(filter(lambda e: e.startswith("http"), ret))
    ret = set(map(_urlsplit, ret))

    def pred(e):
        return e.hostname and e.hostname.endswith(_urlsplit(url).hostname)

    ret = set(filter(pred, ret))
    ret = set(map(urlunsplit, ret))
    ret = set(map(urltools.normalize, ret))
    ret = set(urljoin(url, link) for link in ret)
    return ret


def _scrape(urls):
    # Pop an unvisited URL, harvest addresses from it, queue any new
    # same-host links, and repeat until the frontier is empty.
    emails = set()
    urls_new = set(map(_sanitize, urls))
    urls_old = set()
    while urls_new:
        print("len(emails) == ", len(emails))
        print("len(urls_new) == ", len(urls_new))
        print("len(urls_old) == ", len(urls_old))
        # print("urls_new == ", urls_new)
        url = urls_new.pop()
        urls_old.add(url)
        # print("URLS_OLD:")
        # pprint(urls_old)
        # print("urls_old == ", urls_old)
        print("Processing \"%s\"" % url)
        try:
            response = requests.get(url, headers={'Accept': 'text/html'})
        except (requests.exceptions.MissingSchema,
                requests.exceptions.TooManyRedirects,
                requests.exceptions.ConnectionError):
            print("Error on %s" % url)
            continue
        if not response.ok:
            continue
        if not response.headers.get('content-type', '').startswith('text'):
            continue
        emails.update(_find_addys(response.text))
        urls_new.update(_links(url, response.text) - urls_old)
    return emails


def main():
    emails = _scrape(sys.argv[1:])
    print(emails)


if __name__ == '__main__':
    main()