/
parseUtils.py
83 lines (72 loc) · 2.18 KB
/
parseUtils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import urllib2
import time
import re
import os
from BeautifulSoup import BeautifulSoup
import os.path
import htmllib
import formatter
from urlparse import urlparse
def loadURL(url, cookiefile=None):
    """Fetch *url* and return the response body, or None on failure.

    Cookies are kept between calls in an LWP-format cookie file whenever a
    cookie library can be imported (``cookielib`` first, ``ClientCookie`` as
    a fallback); otherwise the request is made without cookie support.
    Cookie handling is adapted from
    http://www.voidspace.org.uk/python/articles/cookielib.shtml

    Args:
        url: Address to request. No request body is sent, and a fixed
            browser-like ``User-agent`` header is attached.
        cookiefile: Optional path of the cookie file to load from and save
            to. When omitted, the path this function has always used is
            kept, so existing callers are unaffected.

    Returns:
        Whatever ``handle.read()`` yields for the opened URL, or None when
        the request could not be built or opened.

    Raises:
        Errors raised while *reading* an already-opened response are not
        caught here and propagate to the caller.

    Side effects:
        Installs the cookie-aware opener as the library-wide default,
        prints diagnostics to stdout, and rewrites the cookie file after a
        successful open.
    """
    if cookiefile is None:
        cookiefile = '/var/www/vhosts/davesblogbot/cookies.lwp'

    cj = None
    ClientCookie = None
    cookielib = None

    # Prefer cookielib; fall back to ClientCookie; failing both, use plain
    # urllib2 with no cookie jar at all.
    try:
        import cookielib
    except ImportError:
        try:
            import ClientCookie
        except ImportError:
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.LWPCookieJar()
    else:
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.LWPCookieJar()

    if cj is not None:
        if os.path.isfile(cookiefile):
            try:
                cj.load(cookiefile)
            except EnvironmentError:
                # An unreadable or corrupt cookie file must not prevent the
                # fetch; carry on with whatever the jar holds.  The file is
                # rewritten below once the request succeeds.
                pass
        # Install the opener globally so the urlopen() picked above sends
        # and records cookies through this jar.
        if cookielib is not None:
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            opener = ClientCookie.build_opener(
                ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # No request body.  For a POST, an encoded dictionary of values
    # (urllib.urlencode(somedict)) would go here instead.
    txdata = None
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT)'}

    try:
        req = Request(url, txdata, txheaders)
        handle = urlopen(req)
    except IOError as e:
        print('Failed to open "%s".' % url)
        if hasattr(e, 'code'):
            print('Failed with error code - %s.' % e.code)
        elif hasattr(e, 'reason'):
            print("Reason: %s" % e.reason)
        return None
    except Exception as e:
        # Anything else (for example a malformed URL) is still reported as
        # "no result", but KeyboardInterrupt/SystemExit are no longer
        # swallowed and the cause is no longer hidden.
        print('Failed to open "%s".' % url)
        print("Reason: %s" % e)
        return None

    try:
        print('')
        if cj is None:
            print("No cookies available.")
        else:
            try:
                cj.save(cookiefile)
            except Exception:
                # Persisting cookies is deliberately best-effort: a
                # read-only or missing directory must not lose the page
                # that was just fetched.
                pass
        return handle.read()
    finally:
        # Always release the underlying connection/file object.
        handle.close()