main.py (forked from rcarmo/python-webarchive)
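"""Crawl TARGET_URL (plus any ADDITIONAL_URLS from config.py) and save the
responses into a Safari-style .webarchive plist at OUTPUT_FILENAME, while a
helper process serves the current directory over HTTP on port 8000."""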
from asyncio import Queue, ensure_future, get_event_loop, wait_for
from cgi import parse_header
from urllib.parse import unquote, urldefrag, urljoin, urlparse
import asyncio
import http.server
import multiprocessing
import socketserver

from aiohttp import ClientSession
from biplist import writePlist
from cssutils import getUrls, parseString
from lxml import html

from config import (ACCEPT_HEADERS, ADDITIONAL_URLS, CHANGE_DOMAIN_FROM,
                    CHANGE_DOMAIN_TO, OUTPUT_FILENAME, TARGET_URL,
                    TIMEOUT, log)

async def crawler(client, url, archive):
    """Fetch a single URL and append the response to the archive's item list."""
    log.debug("Crawling url: {}".format(url))
    headers = dict(ACCEPT_HEADERS)  # copy so the shared config dict is not mutated
    headers['Referer'] = archive['top']
    response = await client.get(url, headers=headers)
    if response.status != 200:
        raise Exception("got response code other than 200 for url: {} - Response code: {}".format(url, response.status))
    data = await response.read()
    content_type, params = parse_header(response.headers['content-type'])
    # Optionally rewrite the stored URL onto a different domain before archiving.
    if CHANGE_DOMAIN_FROM and CHANGE_DOMAIN_TO:
        wrUrl = url.replace(CHANGE_DOMAIN_FROM, CHANGE_DOMAIN_TO)
    else:
        wrUrl = url
    item = {
        "WebResourceData": data,
        "WebResourceMIMEType": content_type,
        "WebResourceURL": wrUrl
    }
    if 'charset' in params:
        item['WebResourceTextEncodingName'] = params['charset']
    archive['items'].append(item)

async def scrape(url, additionalUrls=[]):
    """Fetch the main URL plus any additional URLs, then write the .webarchive plist."""
    print("scrape")
    client = ClientSession()
    url_queue = []
    archive = {
        'top': url,
        'items': []
    }
    url_queue.append(url)
    for aUrl in additionalUrls:
        url_queue.append(aUrl)
    for url in url_queue:
        try:
            await crawler(client, url, archive)
        except Exception as exc:
            log.warning('Exception {}'.format(exc), exc_info=False)
    await client.close()
    # The first fetched item is the main resource; everything else becomes a subresource.
    webarchive = {
        'WebMainResource': archive['items'].pop(0),
        'WebSubresources': archive['items']
    }
    writePlist(webarchive, OUTPUT_FILENAME)
    print("finished", OUTPUT_FILENAME)

def serve_directory():
    """Serve the current working directory over HTTP on port 8000."""
    PORT = 8000
    Handler = http.server.SimpleHTTPRequestHandler
    with socketserver.TCPServer(("", PORT), Handler) as httpd:
        print("Serving at port", PORT)
        httpd.serve_forever()

if __name__ == '__main__':
    loop = asyncio.new_event_loop()
    # Run the local HTTP server in a separate process so it does not block the crawl.
    proc = multiprocessing.Process(target=serve_directory, args=())
    proc.start()
    additionalUrls = list(filter(None, ADDITIONAL_URLS.split(";")))  # drop empty entries
    loop.run_until_complete(scrape(TARGET_URL, additionalUrls))
    print("done - joining now")
    proc.terminate()  # sends a SIGTERM to the server process
    print("done")