/
wmscrape.py
executable file
·114 lines (92 loc) · 2.53 KB
/
wmscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/python
from urlparse import urlparse
from Cookie import BaseCookie as Cookie
import httplib
import pymayhem
def findlogin(fa):
    """Return the first form in *fa* that holds a 'trylogin' field.

    Each element of *fa* is iterated as a sequence of fields; a form
    matches when any of its fields has a ``name`` attribute equal to
    'trylogin'.  Returns None when no form matches.
    """
    def has_login_field(form):
        # True as soon as one field of the form carries the marker name.
        return any(field.name == 'trylogin' for field in form)

    return next((form for form in fa if has_login_field(form)), None)
def do_login(u, login, pwd):
    """Log in through the form found on page *u*.

    Fetches the page at *u*, picks the form that contains a field named
    'trylogin' (see findlogin), fills in its 'sausr' and 'sapwd' fields and
    POSTs the form data to the form's action URL.  The server is expected
    to answer with a 302 redirect.

    Args:
        u: parsed URL (as returned by ``urlparse``) of the page that holds
            the login form.
        login: value stored in the 'sausr' form field.
        pwd: value stored in the 'sapwd' form field.

    Returns:
        A ``(cookie, query)`` tuple: the cookies from the redirect's
        Set-Cookie header rendered as one ``name=value, ...`` string, and
        the query string of the redirect's Location header.

    Raises:
        Exception: the login page was not served with status 200; the
            arguments are the status and the reason.
        ValueError: no login form was found, the POST was not answered
            with a 302, or the redirect carried no Location header.
    """
    # An empty path would produce a malformed request line, and the query
    # string is part of the resource being requested.
    page_path = u.path or '/'
    if u.query:
        page_path = page_path + '?' + u.query

    conn = httplib.HTTPConnection(u.hostname, u.port)
    try:
        conn.request('GET', page_path)
        r = conn.getresponse()
        if r.status != 200:
            # Call form: the three-expression raise needs a traceback as
            # its third item and would fail with TypeError here.
            raise Exception(r.status, r.reason)
        body = r.read()
    finally:
        conn.close()

    h = pymayhem.FormRipper()
    h.feed(body)
    f = findlogin(h.forms)
    if f is None:
        raise ValueError('login form not found')

    f['sausr'] = login
    f['sapwd'] = pwd
    params = f.get_form_data()
    headers = {
        'Content-type': 'application/x-www-form-urlencoded',
        'Referer': u.geturl(),
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
    }

    # NOTE(review): assumes f.action is an absolute URL; a relative action
    # would leave lu.hostname empty -- confirm against pymayhem.
    lu = urlparse(f.action)
    action_path = lu.path or '/'
    if lu.query:
        action_path = action_path + '?' + lu.query

    conn = httplib.HTTPConnection(lu.hostname, lu.port)
    try:
        conn.request('POST', action_path, params, headers)
        r = conn.getresponse()
        if r.status != 302:
            raise ValueError('bad response', r.status, r.reason)
        # Header lookup is case-insensitive; read both before closing.
        cookie = r.getheader('set-cookie')
        location = r.getheader('location')
    finally:
        conn.close()

    if location is None:
        raise ValueError('login redirect has no Location header')

    c = Cookie(cookie)
    # Passing the cookie names as the attribute filter drops attributes
    # such as path/expires, leaving only the name=value pairs.
    # NOTE(review): pairs are joined with ', '; a Cookie request header
    # normally separates pairs with '; ' -- confirm what the server expects.
    c = c.output(c.keys(), '', ', ').strip()
    q = urlparse(location).query
    return (c, q)
def do_url(u, login=None, pwd=None):
    """Fetch page *u*, parse it and print the parsed result.

    When both *login* and *pwd* are given, do_login is called first; the
    cookie it returns is sent with the request and the query string it
    returns is used for the request.  Otherwise the URL's own query string
    is used.  The response body is fed to ``pymayhem.WebParser`` and the
    output is a '$PageUrl:' line, one 'key: value' line per item of the
    parser's ``result``, and a blank line.

    Args:
        u: parsed URL (as returned by ``urlparse``) of the page to fetch.
        login: optional login name handed to do_login.
        pwd: optional password handed to do_login.

    Raises:
        Exception: the page was not served with status 200; the arguments
            are the status and the reason.
    """
    headers = {
        'Referer': u.geturl(),
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
    }

    query = u.query
    if login is not None and pwd is not None:
        (c, q) = do_login(u, login, pwd)
        headers['Cookie'] = c
        # After a login the query from the login redirect replaces the
        # URL's own query.
        query = q

    # An empty path would produce a malformed request line; a '?' is only
    # appended when there is a query to send.
    path = u.path or '/'
    if query:
        path = path + '?' + query

    conn = httplib.HTTPConnection(u.hostname, u.port)
    try:
        conn.request('GET', path, None, headers)
        r = conn.getresponse()
        if r.status != 200:
            # Call form: the three-expression raise needs a traceback as
            # its third item and would fail with TypeError here.
            raise Exception(r.status, r.reason)
        body = r.read()
    finally:
        conn.close()

    h = pymayhem.WebParser()
    h.feed(body)

    # Single-argument print calls produce the same output as the print
    # statement on Python 2.
    print('$PageUrl: %s' % u.geturl())
    for (k, v) in h.result.items():
        print('%s: %s' % (k, v))
    print('')
def main(argv):
    """Process every URL named on the command line.

    Usage: ``wmscrape.py [--login USER [--password PWD]] URL...``

    Args:
        argv: full argument vector; ``argv[0]`` is the program name.

    Returns:
        True once every URL has been handed to do_url (also when no URL
        was given).

    Raises:
        Exception: an option is missing its value, or a URL does not use
            the 'http' scheme.
    """
    args = list(argv[1:])
    login = None
    pwd = None

    # Options are positional: --login must come first, optionally followed
    # by --password.  Lengths are checked so a short command line gives a
    # clear error instead of an IndexError.
    if args and args[0] == '--login':
        if len(args) < 2:
            raise Exception('--login requires a value')
        login = args[1]
        args = args[2:]
        if args and args[0] == '--password':
            if len(args) < 2:
                raise Exception('--password requires a value')
            pwd = args[1]
            args = args[2:]

    for url in args:
        u = urlparse(url)
        if u.scheme != 'http':
            raise Exception('HTTP URLs only')
        do_url(u, login, pwd)
    return True
if __name__ == '__main__':
    # Script entry point: main() returns True on success, so negating it
    # hands SystemExit a false value, i.e. exit status 0.
    import sys
    raise SystemExit(not main(sys.argv))