-
Notifications
You must be signed in to change notification settings - Fork 0
/
event_scraper.py
251 lines (210 loc) · 8.84 KB
/
event_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/python3
#this is a generic events scraper that uses user-submitted data on a site to scrape events
from lxml import html
import requests
from enumerators import SiteTypes
from enumerators import PaginationTypes
from event import Event
from sitetext import SiteText
#The EventScraper class has the following instance variables
####### self.url --> This is the url to the main document
####### self.domain --> This is the prefix that urlify() prepends to relative links
####### self.siteType --> This is an enumerated description of what a site is. Are all the event details in a list? Or is this a list that points to individual event pages, which house the event details?
####### self.pagination_xpath --> The string "none" means the site has no pagination
####### self.paginationType
####### self.eventDetailsPageLink_xpath
####### self.eventName_xpath
####### self.eventLocation_xpath
####### self.eventDateTime_xpath
####### self.eventDescription_xpath
####### self.eventPrice_xpath
####### self.eventImagePath_xpath
####### self.allEvents
####### self.pages
class EventScraper:
    """Generic event scraper driven by caller-supplied XPath expressions.

    The caller constructs the scraper, assigns the ``event*_xpath``
    attributes (each may be a single XPath string or a list of them), sets
    ``domain`` to the prefix used for relative links, and calls
    ``ParsePage()``.  Scraped ``Event`` objects accumulate in ``allEvents``.

    Written for Python 3, which is what the file's shebang declares: all
    text is kept as ``str`` and never encoded to bytes.

    Attributes:
        url: URL of the first listing page.
        domain: Prefix prepended to relative links by ``urlify``.
        siteType: ``SiteTypes`` member describing the listing layout.
        pagination_xpath: XPath for pagination links; ``"none"`` disables
            pagination.
        paginationType: ``PaginationTypes`` member selecting the strategy
            used by ``Pagination``.
        eventDetailsPageLink_xpath, eventName_xpath, eventLocation_xpath,
        eventDateTime_xpath, eventDescription_xpath, eventPrice_xpath,
        eventImagePath_xpath: XPaths used to extract the matching field.
        allEvents: ``Event`` objects collected by ``ParsePage``.
        pages: Listing page URLs collected by ``Pagination``.
        page: The most recent ``requests`` response.
    """

    # Seconds to wait on any single HTTP request; without a timeout
    # requests.get() can block forever on an unresponsive server.
    REQUEST_TIMEOUT = 30

    def __init__(self, sent_url, sent_type=SiteTypes.ListLinks, sent_pagination="none", paginationType=PaginationTypes.NextPage):
        """Store the site description; no network access happens here."""
        self.url = sent_url
        self.domain = ""
        self.siteType = sent_type
        self.pagination_xpath = sent_pagination
        self.paginationType = paginationType
        self.eventDetailsPageLink_xpath = ''
        self.eventName_xpath = ''
        self.eventLocation_xpath = ''
        self.eventDateTime_xpath = ''
        self.eventDescription_xpath = ''
        self.eventPrice_xpath = ''
        self.eventImagePath_xpath = ''
        self.allEvents = []
        self.pages = []
        self.page = None

    def SetObjects(self):
        """Placeholder kept for interface compatibility; does nothing."""
        pass

    def ParsePage(self):
        """Scrape the whole site: collect listing pages, then the events.

        Step 1 fills ``self.pages`` (via ``Pagination`` unless
        ``pagination_xpath`` is ``"none"``).  Step 2 branches on
        ``siteType``; only ``SiteTypes.ListLinks`` extracts events, by
        following every event link and appending the parsed ``Event`` to
        ``self.allEvents``.
        """
        # Step 1: get all the listing pages.
        if self.pagination_xpath != "none":
            self.Pagination()
        else:
            self.pages = [self.url]
        # Confirm that links are grabbed.
        print(self.pages)

        # Step 2: fork on the style of the events list.
        if self.siteType == SiteTypes.ListEvents:
            print("This is a List of Events")
        elif self.siteType == SiteTypes.ListLinks:
            print("This in a List of Links to Event Landing Pages")
            eventLinks = self.GetLinkToEventDetails()
            total = len(eventLinks)
            print("Num Event Links: ", total)
            for count, eventLink in enumerate(eventLinks, 1):
                self.allEvents.append(self.GetSingleEvent(eventLink))
                # Progress report every 25 events and after the last one.
                if count % 25 == 0 or count == total:
                    print("Printed Page: ", count)
        else:
            print("Something went wrong getting the SiteType")

    def GetLinkToEventDetails(self):
        """Return the set of event-detail links found on every listing page.

        Each URL in ``self.pages`` is fetched and queried with
        ``eventDetailsPageLink_xpath``; the results are de-duplicated.
        """
        links = set()
        for index, pageURL in enumerate(self.pages):
            print("Getting Event Links page: " + str(index))
            thisTree = self._fetch_tree(self.urlify(pageURL))
            links.update(thisTree.xpath(self.eventDetailsPageLink_xpath))
            print("finished event page links: " + str(index))
        return links

    def GetSingleEvent(self, eventLandingPage):
        """Fetch one event landing page and return it as an ``Event``.

        Args:
            eventLandingPage: Absolute or domain-relative URL of the page.
        """
        eventLandingPage = self.urlify(eventLandingPage)
        eventTree = self._fetch_tree(eventLandingPage)

        new_event = Event()
        new_event.source = eventLandingPage
        new_event.name = self.multipleXPaths(self.eventName_xpath, eventTree, 'name')
        # NOTE(review): location is the only field that is appended to rather
        # than assigned; kept as-is because Event's initial value is not
        # visible from this file -- confirm this is intentional.
        new_event.location += self.multipleXPaths(self.eventLocation_xpath, eventTree, 'location')
        new_event.description = self.multipleXPaths(self.eventDescription_xpath, eventTree, 'description')
        new_event.datetime = self.multipleXPaths(self.eventDateTime_xpath, eventTree, 'datetime')
        new_event.price = self.multipleXPaths(self.eventPrice_xpath, eventTree, 'price')
        new_event.imagePath = self.multipleXPaths(self._image_xpath(), eventTree, 'imagePath')
        # Only resolve a real path: urlify("") would return the bare domain,
        # which is not an image URL.
        if new_event.imagePath:
            new_event.imagePath = self.urlify(new_event.imagePath)
        new_event.dump = SiteText(eventLandingPage).gettext()
        new_event.cleanContent()
        return new_event

    def multipleXPaths(self, xpath_array, pagetree, action):
        """Evaluate one XPath or a list of XPaths and return text.

        Args:
            xpath_array: A single XPath string, or a list of them.
            pagetree: Parsed lxml tree to query.
            action: Field label, forwarded to ``returnString``.

        Returns:
            The extracted text.  For a list, the per-XPath results are
            joined with ``","``.  Empty XPath strings yield ``""``.
        """
        if isinstance(xpath_array, list):
            return ",".join(
                self._evaluate(pagetree, apath, action) for apath in xpath_array
            )
        return self._evaluate(pagetree, xpath_array, action)

    def urlify(self, sent_url):
        """Return ``sent_url`` as an absolute URL.

        URLs that already start with ``http://`` or ``https://`` are
        returned unchanged.  Anything else has ``self.domain`` prepended,
        after dropping the leading ``.`` or ``~`` of ``./`` and ``~/``.
        """
        if sent_url.startswith(("http://", "https://")):
            return sent_url
        if sent_url.startswith(("./", "~/")):
            sent_url = sent_url[1:]
        return self.domain + sent_url

    def Pagination(self):
        """Fill ``self.pages`` with listing-page URLs per ``paginationType``.

        ``AllPages``: the first page carries links to every page; they are
        read with ``pagination_xpath`` and the first page is put in front.

        ``NextPage``: ``pagination_xpath`` yields the link to the next page;
        links are followed until none is found, a request does not return
        HTTP 200, or a page repeats.

        Duplicates are removed while keeping first-seen order.
        """
        if self.paginationType == PaginationTypes.AllPages:
            home = self.url
            tree = self._fetch_tree(home).getroottree()
            print(tree)
            # Grab all the pagination links in the page.
            print(self.pagination_xpath)
            self.pages = tree.xpath(self.pagination_xpath)
            self.pages.insert(0, home)
            print("Num Pages: " + str(len(self.pages)))
        elif self.paginationType == PaginationTypes.NextPage:
            visited = set()
            nextPage = self.url.strip()
            while nextPage:
                nextPage = self.urlify(nextPage)
                if nextPage in visited:
                    # The "next" link points back to a page already seen.
                    break
                visited.add(nextPage)
                print("next page: " + nextPage)
                self._fetch(nextPage)
                if self.page.status_code != 200:
                    print("fuck! that didn't work:")
                    print(nextPage)
                    print(self.page.status_code)
                    # Stop: retrying the same URL would loop forever.
                    break
                self.pages.append(nextPage)
                tree = html.fromstring(self.page.text)
                # xpath() returns a list; follow the first usable link.
                nextPage = self._first_link(tree.xpath(self.pagination_xpath))
        # Dedupe pages without scrambling their order.
        self.pages = self._unique_in_order(self.pages)

    def returnString(self, sentArray, action):
        """Collapse an XPath result into one stripped string.

        Args:
            sentArray: Result of ``tree.xpath(...)``: usually a list, but
                possibly a scalar.
            action: Field label; currently unused.

        Returns:
            List items are converted to text, stripped and joined with
            newlines; an empty list gives ``""``.  A non-list value is
            converted with ``str`` and stripped.
        """
        if isinstance(sentArray, list):
            pieces = [self._to_text(item).strip() for item in sentArray]
            return "\n".join(pieces).strip()
        return str(sentArray).strip()

    def cleanParameters(self):
        """Normalise every configuration attribute through ``stringify``."""
        names = [
            "url",
            "domain",
            "siteType",
            "pagination_xpath",
            "paginationType",
            "eventDetailsPageLink_xpath",
            "eventName_xpath",
            "eventLocation_xpath",
            "eventDateTime_xpath",
            "eventDescription_xpath",
            "eventPrice_xpath",
            "eventImagePath_xpath",
        ]
        # Also clean the legacy image attribute when a caller has set it.
        if hasattr(self, "eventImage_xpath"):
            names.append("eventImage_xpath")
        for name in names:
            setattr(self, name, self.stringify(getattr(self, name)))

    def stringify(self, sentObj):
        """Return a cleaned copy of a configuration value.

        ``None`` becomes ``""``; bytes are decoded as UTF-8 and stripped;
        strings are stripped; lists are cleaned element-wise into a new
        list; any other object (for example an enum member) is returned
        unchanged.
        """
        if sentObj is None:
            return ""
        if isinstance(sentObj, bytes):
            return sentObj.decode("utf-8").strip()
        if isinstance(sentObj, str):
            return sentObj.strip()
        if isinstance(sentObj, list):
            return [self.stringify(obj) for obj in sentObj]
        return sentObj

    def _fetch(self, url):
        """GET ``url``, remember the response in ``self.page``, return it."""
        self.page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return self.page

    def _fetch_tree(self, url):
        """GET ``url`` and return its body parsed as an lxml HTML tree."""
        return html.fromstring(self._fetch(url).text)

    def _evaluate(self, pagetree, xpath, action):
        """Run one XPath against ``pagetree``; an unset XPath yields ""."""
        if not xpath:
            return ""
        return self.returnString(pagetree.xpath(xpath), action)

    def _image_xpath(self):
        """Return the image XPath under either of its two historic names."""
        # GetSingleEvent used to read `eventImage_xpath` while __init__ and
        # cleanParameters used `eventImagePath_xpath`; honour whichever the
        # caller has set.
        return getattr(self, "eventImage_xpath", "") or self.eventImagePath_xpath

    def _first_link(self, xpath_result):
        """Return the first non-empty string in an XPath result, or ""."""
        if isinstance(xpath_result, list):
            candidates = xpath_result
        else:
            candidates = [xpath_result]
        for candidate in candidates:
            if isinstance(candidate, (str, bytes)):
                link = self._to_text(candidate).strip()
                if link:
                    return link
        return ""

    @staticmethod
    def _to_text(item):
        """Convert one XPath result item to ``str``."""
        if isinstance(item, bytes):
            return item.decode("utf-8")
        if isinstance(item, str):
            return item
        # Element results (XPath that selects nodes rather than text or
        # attributes) expose their text through text_content().
        if hasattr(item, "text_content"):
            return item.text_content()
        return str(item)

    @staticmethod
    def _unique_in_order(items):
        """Return ``items`` without duplicates, keeping first-seen order."""
        seen = set()
        unique = []
        for item in items:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique
# def returnString(self, sentArray):
# sendable = ""
# if isinstance(sentArray, list):
# if len(sentArray)>0 :
# for item in sentArray:
# hold = item.encode('utf-8').strip()
# if hold is not "": sendable = sendable + hold + ","
# return sendable[:-2]
# else: return ""
# else:
# return sentArray.strip()