robot.py
from bs4 import BeautifulSoup
import csv
import requests
import time
from reppy.cache import RobotsCache
import sqlite3
import os


class Site(object):
    def __init__(self, homepage):
        self.homepage = homepage          # a Page object
        self.pages_to_track = [homepage]  # list of Page objects
        self.pages_tracked = []           # list of URL strings
        # SETTING UP THE SQL DATABASE
        # Change directory to dbase (inside the app directory)
        abspath = os.path.abspath(__file__)
        dname = os.path.dirname(abspath)
        os.chdir(os.path.join(dname, 'dbase'))
        # Strip the protocol, subdomain, dots and slashes from the homepage URL
        # and append '.db' to build the database file name
        homepage = homepage.get_url()
        dbase = ''.join(homepage.split('.')[1:]) + '.db'
        dbase = dbase.replace('/', '')
        # Create the connection and the cursor (used to execute commands)
        self.conn = sqlite3.connect(dbase)
        self.c = self.conn.cursor()
        # Clear the table if it already exists
        try:
            self.c.execute("DROP TABLE site")
        except sqlite3.OperationalError:
            pass
        # Create the table
        self.c.execute("CREATE TABLE site (url TEXT, count INTEGER);")
    def update(self):
        """
        Pops a page from the pages_to_track [list]
        Finds all the links in that page
        If the link is not in pages_tracked
            Adds the new link to the database (with the value 1)
            Adds the link to pages_tracked
        Else
            Updates the link's count in the database by 1
        """
        page = self.pages_to_track.pop()
        while not self.robot_pass(page):
            print("Robot blocked: " + page.get_url())
            page = self.pages_to_track.pop()
        print("Now tracking: " + page.get_url())
        internal_links = page.get_internal_links()
        pages_tracked = self.get_pages_tracked()
        while len(internal_links) > 0:
            link = internal_links.pop()
            if page.is_valid(link):
                link = self.expand_link(link)  # convert relative links to absolute URLs
                if link not in pages_tracked:
                    self.set_page_tracked(link)
                    self.set_pages_to_track(link)
                    self.c.execute("INSERT INTO site VALUES (?, 1)", (link,))
                    self.conn.commit()
                else:
                    self.c.execute("SELECT * FROM site WHERE url=?", (link,))
                    data = self.c.fetchone()
                    value = data[1] + 1
                    self.c.execute("UPDATE site SET count=? WHERE url=?", (value, link))
                    self.conn.commit()
    def get_homepage(self):
        return self.homepage.get_url()

    def get_pages_to_track(self):
        return self.pages_to_track

    def get_pages_tracked(self):
        return self.pages_tracked

    def get_all_links(self):
        return self.all_links

    def set_page_tracked(self, link):
        self.pages_tracked.append(link)

    def set_pages_to_track(self, link):
        self.pages_to_track.append(Page(link))
    def expand_link(self, link):
        """
        Accepts link [string]
        If link starts with a '/' (relative link), prepends the homepage to create an absolute path
        If the homepage ends in '/', strips that '/' first to prevent a duplicate '//' in the path
        Returns new_link
        """
        homepage = self.get_homepage()
        if link == "":
            return homepage
        if link[0] == '/':
            if homepage[-1:] == '/':
                new_link = homepage[:-1] + link  # homepage ends in '/' - remove it so you don't get '//' in new_link
            else:
                new_link = homepage + link
        else:
            new_link = link
        return new_link
    def robot_pass(self, page):
        """
        Accepts page [object]
        Creates instance of RobotsCache (from reppy)
        Passes URL of page as string into robots.allowed method
        Returns True or False
        """
        robots = RobotsCache()
        return robots.allowed(page.get_url(), '*')


class Page(object):
    def __init__(self, url):
        self.url = url

    def get_url(self):
        return self.url

    def get_links(self):
        """
        Builds a soup [Beautiful Soup object] with get_soup
        Finds all URLs in the soup and adds them to links [list]
        Returns links
        """
        soup = self.get_soup()
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is not None:  # skip <a> tags with no href attribute
                links.append(str(href))
        return links
    def get_internal_links(self):
        """
        Uses the get_links method to get all links
        Uses the get_domain method to find the page's domain
        If the link is_valid and is either a relative link or matches the domain
            adds the link to internal_links [list]
        Returns internal_links
        """
        internal_links = []
        for link in self.get_links():
            try:
                if self.is_valid(link) and (link[0] == '/' or link.split('/')[2] == self.get_domain()):
                    internal_links.append(link)
            except IndexError:
                pass  # link has no domain part (e.g. 'mailto:' or a bare fragment)
        return internal_links
    def get_domain(self):
        """
        Removes the protocol and path and returns the host as a string
        Example: if the webpage is "http://news.google.com/world" then the domain is "news.google.com"
        """
        return self.get_url().split('/')[2]
    def same_domain(self, other):
        """
        Accepts other [Page object]
        Checks to see if self and other belong to the same domain
        Returns True or False
        """
        page1 = self.get_url()
        page2 = other.get_url()
        try:
            page1 = page1.split('.', 1)[1]  # split off the protocol and subdomain
            page2 = page2.split('.', 1)[1]  # split off the protocol and subdomain
            return page1 in page2 or page2 in page1
            # Uses 'or' to see if either domain fits inside the other.
            # This ensures that google.com and www.google.com match regardless of order.
        except IndexError:
            return False  # a URL with no '.' is not a valid link
    def get_soup(self):
        """
        Creates a soup [Beautiful Soup instance] from the response using html.parser
        Returns soup
        """
        request_headers = {
            "Accept-Language": "en-US,en;q=0.5",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": "http://thewebsite.com",
            "Connection": "keep-alive"
        }
        response = requests.get(self.get_url(), headers=request_headers)
        soup = BeautifulSoup(response.text, "html.parser")
        return soup
    def is_valid(self, link):
        """
        Accepts link [string]
        If link is empty, or a fragment/query-only link (# or ?), returns False
        If link starts with a protocol in skip_protocols [list], returns False
        If the end of link matches an extension in skip_extensions [list], returns False
        Else returns True
        """
        skip_extensions = ['jpg', 'jpeg', 'png', 'tiff', 'gif', 'apng', 'mng', 'svg', 'pdf', 'bmp', 'ico', 'xbm']
        skip_protocols = ['feed', 'ftp', 'rss']
        if link == '' or link[0] == '#' or link[0] == '?':
            return False
        for protocol in skip_protocols:
            if link[:len(protocol)] == protocol:
                return False
        for extension in skip_extensions:
            if link[-len(extension):] == extension:
                return False
        return True
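

# A minimal usage sketch of the classes above, assuming the 'dbase/' directory
# that Site.__init__ expects already exists next to this file. The seed URL and
# the number of update() calls are illustrative assumptions, not values taken
# from this project.
if __name__ == '__main__':
    seed = Page("http://www.example.com/")  # hypothetical starting page
    site = Site(seed)
    for _ in range(5):                      # crawl a handful of pages as a demo
        if not site.get_pages_to_track():
            break                           # nothing left to crawl
        site.update()
    # Print the link counts collected so far, most-linked first.
    for url, count in site.c.execute("SELECT url, count FROM site ORDER BY count DESC"):
        print(url, count)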