# Current state of this script
# This script logs in to the CMS, grabs all links matching certain criteria
# from div#contentHolder and div#rightPortlets, tests them, and writes the
# results to a CSV
###########################################################################################
# Initialize GUI, get values from user
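# Note: this script targets Python 2 (it relies on urllib2, cookielib and
# mechanize) and assumes easygui, mechanize and BeautifulSoup 4 are
# installed, e.g.: pip install easygui mechanize beautifulsoup4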
# imports
import easygui as eg
import re
import sys
msg = "This program will grab links from the body and sidebars, test them, and store the information in a CSV in the C:\\scraper directory."
title = "Link Scraper/Tester v. 0.5"
fieldNames = ["Google Docs Spreadsheet URL","U-M email address (ex. bob@umich.edu)","Name of output file (must end in .csv)","CMS Password"]
fieldValues = eg.multpasswordbox(msg, title, fieldNames)
# make sure that none of the fields was left blank
while True:  # loop until we find acceptable values and break out
    if fieldValues is None:
        break
    errmsg = ""
    # look for errors in the returned values
    for i in range(len(fieldNames)):
        if fieldValues[i].strip() == "":
            errmsg = errmsg + ('"%s" is a required field.\n\n' % fieldNames[i])
    if errmsg == "":
        break  # no problems found
    # show the box again, with the errmsg as the message (multpasswordbox so
    # the password field stays masked)
    fieldValues = eg.multpasswordbox(errmsg, title, fieldNames, fieldValues)
if fieldValues is None:
    sys.exit("Cancelled by user.")
# process these inputs for later use
CMS_login = fieldValues[1].replace("@umich.edu", "")  # removes @umich.edu to get username for CMS login
# get spreadsheet key
key_ex = re.compile(r'key=([^&#]*)')
spreadsheet_key = re.findall(key_ex, fieldValues[0])[0]
# get gid
gid_ex = re.compile(r'gid=([^&#]*)')
spreadsheet_gid = int(re.findall(gid_ex, fieldValues[0])[0])  # make integer
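# For example (hypothetical key/gid), a URL like
#   https://spreadsheets.google.com/ccc?key=0AbCdEfG&hl=en#gid=3
# yields spreadsheet_key = "0AbCdEfG" and spreadsheet_gid = 3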
#testing:
#print ("Reply was:", fieldValues)
#note: fieldValues is a list, can access with fieldValues[0] etc.
###############################################################################
# Get urls to scrape from a Google spreadsheet
# imports
import urllib
import urllib2
# Download Google spreadsheet
class Spreadsheet(object):
    def __init__(self, key):
        super(Spreadsheet, self).__init__()
        self.key = key

class Client(object):
    def __init__(self, email, password):
        super(Client, self).__init__()
        self.email = email
        self.password = password

    def _get_auth_token(self, email, password, source, service):
        url = "https://www.google.com/accounts/ClientLogin"
        params = {
            "Email": email, "Passwd": password,
            "service": service,
            "accountType": "HOSTED_OR_GOOGLE",
            "source": source
        }
        req = urllib2.Request(url, urllib.urlencode(params))
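        # ClientLogin replies with "SID=...", "LSID=..." and "Auth=..." lines;
        # only the Auth token is needed for the Authorization header below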
        return re.findall(r"Auth=(.*)", urllib2.urlopen(req).read())[0]

    def get_auth_token(self):
        source = type(self).__name__
        return self._get_auth_token(self.email, self.password, source, service="wise")

    def download(self, spreadsheet, gid=spreadsheet_gid, format="csv"):
        url_format = "https://spreadsheets.google.com/feeds/download/spreadsheets/Export?key=%s&exportFormat=%s&gid=%i"
        headers = {
            "Authorization": "GoogleLogin auth=" + self.get_auth_token(),
            "GData-Version": "3.0"
        }
        req = urllib2.Request(url_format % (spreadsheet.key, format, gid), headers=headers)
        return urllib2.urlopen(req)
if __name__ == "__main__":
    email = fieldValues[1]
    password = fieldValues[3]
    spreadsheet_id = spreadsheet_key
    # Create client and spreadsheet objects
    gs = Client(email, password)
    ss = Spreadsheet(spreadsheet_id)
    # Request a file-like object containing the spreadsheet's contents
    csv_file = gs.download(ss)
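    # Note: if the Google credentials are wrong, ClientLogin typically fails
    # inside download() with an HTTP 403 from urlopen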
##############################################################################
# Now login to CMS in case some of the pages are unpublished
# imports
import mechanize
import cookielib
from bs4 import BeautifulSoup
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follow refresh 0 but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
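# (servers sometimes refuse the default Python/mechanize User-Agent, which is
# presumably why a browser-like string is spoofed here)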
# The site we will navigate into, handling its session
br.open('https://weblogin.umich.edu/')
# Select the first (index zero) form
br.select_form(nr=0)
# User credentials
br.form['login'] = CMS_login
br.form['password'] = fieldValues[3]
# Submit login
br.submit()
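# Note: a failed login is not detected here; if weblogin rejects the
# credentials, every protected page below will simply come back as the login
# form. One could sanity-check br.response().read() for a known post-login
# marker before continuing.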
###########################################################################################
# Now specify output location
# imports
import os
path_to_folder = "c:\\scraper"
# check for c:\scraper; if it doesn't exist, create it
if not os.path.exists(path_to_folder):
os.makedirs(path_to_folder)
filename = fieldValues[2]
# open a file to write results in (append mode, binary so csv.writer does not
# insert blank lines on Windows under Python 2)
outputFile = open(os.path.join(path_to_folder, filename), 'ab')
###########################################################################################
# Now scrape for links
# imports (urllib2 and re were already imported above)
import csv
# Get urls from the csv object we just downloaded and start the loop
urls = csv.reader(csv_file)
csvWriter = csv.writer(outputFile)
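# The spreadsheet is expected to hold one page URL per row in its first
# column (extra columns are ignored), e.g. a row like:
#   http://www.engin.umich.edu/somedept/somepage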
def check_links(div, source_url):
    # Follow every link in the given div and write one CSV row per link
    if div is None:
        return
    for link in div.findAll('a'):
        h = link.get('href')
        # skip empty hrefs, in-page anchors, and mailto: links
        if h is None or h.startswith('#') or 'mailto' in h:
            continue
        # resolve site-relative links against the live site
        if h.startswith('/'):
            fullURL = 'http://www.engin.umich.edu' + h
        else:
            fullURL = h
        # follow the link and check for 404s or server errors
        try:
            br.open(fullURL)
            error_code = 'ok'
            error_args = 'ok'
        except (mechanize.HTTPError, mechanize.URLError) as e:
            if isinstance(e, mechanize.HTTPError):
                error_code = str(e.code)
                error_args = 'ok'
            else:
                error_code = 'error'
                error_args = str(e.reason.args)
        # write source url, link url, error code, error args
        csvWriter.writerow([source_url, h, error_code, error_args])

# Open each page from the Google Docs csv, read contents
for url in urls:
    response = br.open(url[0])
    fullHTML = response.read()
    # put the HTML into BeautifulSoup
    soup = BeautifulSoup(fullHTML, 'html.parser')
    # check all the links in div#contentHolder and div#rightPortlets
    check_links(soup.find('div', attrs={'id': 'contentHolder'}), url[0])
    check_links(soup.find('div', attrs={'id': 'rightPortlets'}), url[0])

outputFile.close()  # all pages processed