def _parseRestaurantPage(self, pool, region_name, area_name, restaurant_name, href):
    """Download and parse a single urbanspoon restaurant page, emitting an Entity.

    Parameters mirror the crawl hierarchy (region -> area -> restaurant);
    href is the URL of the restaurant's detail page. The parsed entity is
    placed on self._output.
    """
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, area_name, restaurant_name, href))

    try:
        soup = utils.getSoup(href)
    except Exception:
        # best-effort crawl: log the failure (with traceback) and skip this page.
        # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
        # and discarded the traceback.
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return

    # parse the address for the current restaurant
    addr     = soup.find('span', {'class' : 'adr'})
    street   = addr.find('span', {'class' : 'street-address'}).getText().strip()
    locality = addr.find('span', {'class' : 'locality'}).getText().strip()
    region   = addr.find('span', {'class' : 'region'}).getText().strip()
    zipcode  = addr.find('a', {'class' : re.compile('postal-code')}).getText().strip()

    address = "%s, %s, %s %s" % (street, locality, region, zipcode)

    # add the current restaurant to the output for this crawler
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title   = restaurant_name
    entity.address = address
    entity.sources.urbanspoon = {
        'uurl' : href,
    }

    self._output.put(entity)
def _parseEntity(self, sheet, index, numEntities):
    """Parse one OpenTable spreadsheet row into a restaurant Entity and enqueue it.

    sheet       -- xlrd-style worksheet exposing row_values()
    index       -- row index within the sheet
    numEntities -- total number of rows, used for progress logging
    """
    # log progress roughly every 1% of rows (only worthwhile for larger sheets)
    if numEntities > 100 and ((index - 1) % (numEntities / 100)) == 0:
        utils.log("[%s] done parsing %s" % \
            (self.NAME, utils.getStatusStr(index - 1 - Globals.options.offset, numEntities)))
        time.sleep(0.1)

    row = sheet.row_values(index)

    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title   = row[1]
    # columns 3-6 presumably hold street, city, state, zip -- TODO confirm sheet layout
    entity.address = row[3] + ', ' + \
                     row[4] + ', ' + \
                     row[5] + ' ' + \
                     row[6]
    entity.openTable = {
        'rid'              : int(row[8]),
        'reserveURL'       : row[9],
        'countryID'        : row[10],
        'metroName'        : row[0],
        'neighborhoodName' : row[2],
    }

    # don't make external calls to opentable in test mode
    if not Globals.options.test:
        result = OpenTableParser.parseEntity(entity)
        if result is None:
            return

    # (removed always-true `if entity is not None` guard and commented-out
    # debug prints -- entity is unconditionally assigned above)
    self._output.put(entity)
def _parseEntity(self, result):
    """Convert a Google Local search result dict into an Entity.

    Only fields present in `result` are copied; subcategory defaults to
    'other'. Returns the populated Entity.
    """
    entity = Entity()
    entity.subcategory = 'other'

    if 'titleNoFormatting' in result:
        entity.title = result['titleNoFormatting']

    if 'addressLines' in result:
        # ', '.join replaces the deprecated string.joinfields alias (same result)
        entity.address  = ', '.join(result['addressLines'])
        entity.subtitle = entity.address

    if 'lat' in result and 'lng' in result:
        entity.lat = float(result['lat'])
        entity.lng = float(result['lng'])

    if 'region' in result:
        entity.vicinity = result['region']

    if 'phoneNumbers' in result:
        phoneNumbers = result['phoneNumbers']

        if len(phoneNumbers) > 0:
            entity.phone = phoneNumbers[0]['number']

    entity.googleLocal = {}

    # NOTE(review): assumes entity.title was set above; a result missing
    # 'titleNoFormatting' would make .lower() fail -- confirm upstream guarantees it
    entity.titlel = entity.title.lower()

    return entity
def _parseResultsPage(self, pool, href):
    """Scrape one sfweekly directory page for bars, then queue the next page."""
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return

    directory = soup.find('div', { 'name' : 'LocationDirectory' })
    for item in directory.findAll('h3'):
        # venue name comes from the anchor inside the heading
        try:
            title = item.find('a').getText().strip()
        except Exception:
            continue

        # the raw address reads "<street> (cross) ... CA ..."; slice out the
        # street (before '(') and the locale (after ')' up through 'CA')
        try:
            raw_address = item.findNext('span', { 'class' : 'address'}).getText()
            street      = raw_address[0:raw_address.find('(')].strip()
            locale      = raw_address[raw_address.find(')')+1:raw_address.find('CA')+2].strip()
            full_addr   = '{0}, {1}'.format(street, locale)
        except Exception:
            full_addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, full_addr, href))
            continue

        if full_addr == '' or title == '':
            continue

        # de-dupe on (name, address)
        key = (title, full_addr)
        if key in self._seen:
            continue
        self._seen.add(key)

        entity = Entity()
        entity.subcategory = "bar"
        entity.title       = title
        entity.address     = full_addr
        entity.sources.sfweekly = { }

        self._output.put(entity)

    # queue the next results page, if pagination advertises one
    next_page = ''
    try:
        pagination = soup.find('span', { 'class' : 'Pagination' }).getText()
        if 'Next' in pagination:
            pagination = soup.find('span', { 'class' : 'Pagination' })
            href_get   = pagination.find('span', { 'class' : 'PaginationSelected' }).findNext('a').get('href')
            next_page  = '{0}{1}'.format('http://www.sfweekly.com', href_get)
    except Exception:
        next_page = ''

    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseResultsPage(self, pool, href):
    """Scrape one Time Out LA results page for bars/restaurants, then queue the next page."""
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return

    column = soup.find('div', { 'class' : 'split-right-column' })
    for result in column.findAll('div', { 'class' : 'clear' }):
        try:
            title = result.findNext('div').find('h2').find('a').getText().strip()
        except Exception:
            continue

        # listings only carry a street address; city/state are appended manually
        try:
            street    = result.findNext('div').find('address').getText()
            locale    = '{0}, {1}'.format('Los Angeles', 'CA')
            full_addr = '{0}, {1}'.format(street, locale)
        except Exception:
            full_addr = ''
            continue

        if full_addr == '' or title == '':
            continue

        # de-dupe on (name, address)
        key = (title, full_addr)
        if key in self._seen:
            continue
        self._seen.add(key)

        entity = Entity()
        # the nearest category span distinguishes bars from restaurants
        if 'Bars' in result.findNext('span').getText():
            entity.subcategory = "bar"
        else:
            entity.subcategory = "restaurant"
        entity.title   = title
        entity.address = full_addr
        entity.sources.timeout_la = { }

        self._output.put(entity)

    # try the next page
    try:
        href_get  = soup.find('div', { 'class' : 'next' }).find('a').get('href')
        next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
    except Exception:
        next_page = ''

    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseResultsPage(self, pool, href):
    """Scrape one SF Magazine results page for restaurants and recurse to the next page.

    Fixes from review: both except handlers formerly logged locals (`name`,
    `addr`) that could be unbound at that point (raising NameError inside the
    handler) and used `return`, aborting the whole page on a single bad
    result. They now log safely and skip only the bad result.
    """
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return

    results = soup.findAll('h3')
    for result in results:
        try:
            name = result.find('span', { 'style' : 'cursor:pointer;' }).getText().strip()
        except AttributeError:
            # name was never bound, so only the page URL can be logged
            utils.log("[%s] error parsing result (%s)" % (self, href))
            continue

        try:
            address1 = result.findNext('span', { 'class' : 'addresslinecaps' }).getText().strip()

            if '(' in address1:
                # sf mag does not provide any city, state or zip information,
                # so inserting basic universal info manually.
                addr = '{0}, {1}'.format(address1.split('(')[0].strip(), 'San Francisco, CA')
            else:
                addr = '{0}, {1}'.format(address1, 'San Francisco, CA')
        except AttributeError:
            utils.log("[%s] error parsing %s (%s)" % (self, name, href))
            continue

        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.sfmag = { }

        self._output.put(entity)

    # locate total pages and compare against current page num to determine
    # if we should iterate again
    try:
        total_pages = soup.find('span', { 'class' : 'last' }).findPrevious('span').getText().strip()
    except AttributeError:
        # crawling of pages is done
        return

    # extract the current page number from the '&page=N&keyword' query segment
    index = href.find('&page=')
    end   = href.find('&keyword')
    page  = href[index+6:end]

    if int(page) <= int(total_pages)-1:
        next_page = href.replace('&page=' + str(page), '&page=' + str(int(page)+1))
        pool.spawn(self._parseResultsPage, pool, next_page)
    else:
        return

    # yield briefly so other greenlets can run
    time.sleep(0.01)
def _parseResultsPage(self, pool, href):
    """Scrape one SFGate search results page for restaurants, then queue the next page."""
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return

    listing = soup.find('div', { 'class' : 'search_results' })
    for result in listing.findAll('div', { 'class' : 'restaurant'}):
        try:
            title = result.find('h3').find('a').getText().strip()
        except Exception:
            continue

        # street sits before the <br>, locale after; state is appended manually
        try:
            br        = result.find('br')
            street    = br.previousSibling.strip()
            locale    = '{0}, {1}'.format(br.nextSibling.strip(), 'CA')
            full_addr = '{0}, {1}'.format(street, locale)
        except Exception:
            full_addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, full_addr, href))
            continue

        if full_addr == '' or title == '':
            continue

        # de-dupe on (name, address)
        key = (title, full_addr)
        if key in self._seen:
            continue
        self._seen.add(key)

        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title       = title
        entity.address     = full_addr
        entity.sources.sfgate = { }

        self._output.put(entity)

    # try the next page
    try:
        href_get  = soup.find('li', { 'class' : 'next' }).find('a').get('href')
        next_page = '{0}{1}'.format('http://www.sfgate.com', href_get)
    except Exception:
        next_page = ''

    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
    """Fetch a single Zagat restaurant page and emit an Entity, including any
    optional cuisine / website / photo details present on the page."""
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, city_name, restaurant_name, href))

    try:
        soup = utils.getSoup(href)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return

    # required: street + geo form the address
    addr_div   = soup.find('div', {'class' : 'address'})
    street_txt = addr_div.find('span', {'class' : 'street'}).getText().strip()
    geo_txt    = addr_div.find('span', {'class' : 'geo'}).getText().strip()

    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title   = restaurant_name
    entity.address = "%s, %s" % (street_txt, geo_txt)
    entity.sources.zagat = {
        'zurl' : self.base + href,
    }

    # optional: cuisine, taken from the first item of the sidebar block
    block = soup.find('div', {'id' : "block-zagat_restaurants-14"})
    if block is not None:
        block = block.find('ul').find('li', {'class' : 'first'})
        if block is not None:
            entity.cuisine = block.getText()

    # optional: website link
    website = soup.find('span', {'class' : 'website'})
    if website is not None:
        website = website.find('a')
        if website is not None:
            entity.site = website.get('href')

    # optional: preview image
    photo = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
    if photo is not None:
        photo = photo.find('img')
        if photo is not None:
            entity.image = photo.get('src')

    self._output.put(entity)
def _parseRestaurantPage(self, pool, queue, url, name, base=False):
    """Fetch and parse one Seattle Times restaurant page into an Entity.

    Only url and name are used here; pool/queue/base appear to exist for
    crawler-interface parity. Duplicate (title, address) pairs and titles
    containing '(closed)' are dropped.
    """
    utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    content = soup.find('div', { 'id' : 'content'})

    if content is None:
        return

    entity = Entity()
    entity.title = content.find('h1').getText()
    entity.subcategory = "restaurant"
    # NOTE(review): sibling parsers in this file use entity.sources.<name>;
    # this sets a top-level attribute instead -- confirm which form downstream expects
    entity.seattletimes = {}

    details = content.find('div', {'id' : 'edbtext'})
    desc = details.find('p').getText()

    if desc is not None:
        entity.desc = desc

    details = details.findAll('p', {'class' : 'list'})

    # first 'list' paragraph holds the address; strip <br /> markup and
    # collapse runs of whitespace into single spaces
    address = details[0].renderContents().strip().replace('<br />', '')
    address = re.sub('[ \n\t]+', ' ', address)

    entity.address = address

    if len(details) > 1:
        # NOTE(review): reads href off the <p> tag itself -- presumably the
        # nested <a> was intended; verify against the live page markup
        site = details[1].get('href')

        if site is not None:
            entity.site = site

    if len(details) > 2:
        hoursOfOperation = details[2].getText()

        if hoursOfOperation is not None:
            entity.hoursOfOperation = hoursOfOperation

    # de-dupe on (title, address) and skip venues marked closed
    key = (entity.title, entity.address)
    if key in self.seen or '(closed)' in entity.title.lower():
        return

    self.seen.add(key)
    self._output.put(entity)
def _parseDetailPage(self, name, href, subcategory):
    """Fetch an NYMag venue detail page, extract its address, and emit an Entity.

    name and subcategory are passed through from the listing page; href is
    the detail-page URL.
    """
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing %s (%s)" % (self, name, href))
        return

    summary = soup.find('div', {'class' : 'summary-address'})

    try:
        # preferred path: structured microformat address
        adr    = summary.find('p', {'class' : 'adr'})
        street = adr.find('span', {'class' : 'street-address'}).getText().strip()
        city   = adr.find('span', {'class' : 'locality'}).getText().strip()
        state  = adr.find('span', {'class' : 'region'}).getText().strip()

        try:
            postal = adr.find('span', {'class' : 'postal-code'}).getText().strip()
        except AttributeError:
            postal = ""

        addr = "%s, %s, %s %s" % (street, city, state, postal)
    except AttributeError:
        # fallback: free-text paragraph, trimmed at an 'nr. ' or 'at' marker
        try:
            text    = summary.find('p').getText()
            pattern = re.compile('(.*)nr\. ', re.DOTALL)
            match   = pattern.match(text)

            if match is None:
                pattern = re.compile('(.*)at[. ]', re.DOTALL)
                match   = pattern.match(text)

            addr = match.groups()[0].replace('\n', ' ').strip()
        except AttributeError:
            utils.log("[%s] error parsing %s (%s)" % (self, name, href))
            return

    entity = Entity()
    entity.subcategory = subcategory
    entity.title   = name
    entity.address = addr
    entity.nymag = { }

    self._output.put(entity)
def _parseEntity(self, row, count):
    """Parse one Factual CSV row into a restaurant Entity and enqueue it.

    Rows whose lowercased name appears in the module-level to_collapse map
    are collapsed: the first occurrence is emitted without an address, and
    every later occurrence is dropped (tallied in self.numCollapsed).
    """
    venue = row["name"].lower().strip()

    collapsed = False
    if venue in to_collapse:
        if to_collapse[venue]:
            # a representative for this name was already emitted; drop duplicates
            self.numCollapsed += 1
            return
        to_collapse[venue] = True
        collapsed = True

    entity = Entity()
    entity.subcategory = "restaurant"
    entity.factual = {"table": "US_Restaurants_V2.csv"}

    # collapsed entries intentionally omit the address
    if not collapsed:
        address = FactualUtils.parseAddress(row)
        if address is not None:
            entity.address = address

    # copy the remaining mapped columns straight across
    for srcKey, destKey in self._map.iteritems():
        if srcKey in row and row[srcKey]:
            entity[destKey] = row[srcKey]

    self._output.put(entity)
def _parseResultsPage(self, pool, href):
    """Scrape one Boston Magazine results page for restaurants and follow 'Next' links.

    Fixes from review: except handlers formerly logged locals (`name`,
    `addr`) that could be unbound at that point (raising NameError inside the
    handler) and used `return`, aborting the rest of the page on one bad
    result; they now log safely and `continue`.
    """
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return

    results = soup.find('div', { 'id' : 'searchResults' }).findAll('td', { 'class' : 'start' })
    for result in results:
        try:
            name = result.find('a').getText().strip()
        except AttributeError:
            # name was never bound, so only the page URL can be logged
            utils.log("[%s] error parsing result (%s)" % (self, href))
            continue

        # the number of <br> tags distinguishes the two listing layouts
        num_brs = len(result.findAll('br'))

        if num_brs == 3:
            try:
                addr = '{0}, {1}'.format(result.find('a').nextSibling.strip(),
                                         result.find('br').nextSibling.strip())
            except Exception:
                utils.log("[%s] error parsing %s (%s)" % (self, name, href))
                continue
        elif num_brs == 4:
            try:
                addr = '{0}, {1}'.format(result.contents[3].strip(),
                                         result.contents[5].strip())
            except Exception:
                utils.log("[%s] error parsing %s (%s)" % (self, name, href))
                continue
        else:
            addr = ''

        if addr == '':
            continue
        if 'CLOSED' in name:
            continue
        if (name, addr) in self._seen:
            continue
        self._seen.add((name, addr))

        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.bostonmag = { }

        self._output.put(entity)

    # try the next page: look for a pager link whose markup contains 'Next'
    next_page = ''
    for n in soup.find('div', { 'class' : 'right_align' }).findAll('a'):
        if 'Next' in str(n):
            next_page = href.replace(href[href.find('?'):], n.get('href'))

    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseResultsPage(self, pool, href):
    """Scrape one LA Times findlocal results page for restaurants and recurse.

    Fixes from review: except handlers formerly logged locals (`name`,
    `addr`) that were unbound at that point (raising NameError inside the
    handler) and used `return`, aborting the whole page on one bad result;
    they now log safely and skip only the bad result.
    """
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return

    results = soup.find('ul', { 'id' : 'search_pagination' }).findAll('div', { 'class' : 'listing_item' })
    for result in results:
        try:
            name = result.find('h2').getText().strip()
        except AttributeError:
            # name was never bound, so only the page URL can be logged
            utils.log("[%s] error parsing result (%s)" % (self, href))
            continue

        try:
            addr = result.find('span', { 'class' : 'address' }).getText().strip()
        except AttributeError:
            utils.log("[%s] error parsing %s (%s)" % (self, name, href))
            continue

        if addr == '':
            continue
        if 'CLOSED' in name:
            continue

        # de-dupe on address alone
        if addr in self._seen:
            continue
        self._seen.add(addr)

        # cap each name at 3 emissions -- presumably to limit chain listings
        if name in self._count:
            if self._count[name] < 3:
                self._count[name] = self._count[name] + 1
            else:
                continue
        else:
            self._count[name] = 1

        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.latimes = { }

        self._output.put(entity)

    # try the next page
    try:
        next_page = soup.find('a', {'class': 'next_page'}).get("href")
        if next_page != '':
            next_page_url = "{0}{1}".format('http://findlocal.latimes.com', next_page)
            pool.spawn(self._parseResultsPage, pool, next_page_url)
    except AttributeError:
        # crawling of pages is done
        pass
def _parseResultsPage(self, pool, url, offset=0, base=False):
    """Parse one Yelp search results page, emitting restaurant Entities.

    offset counts speculative look-ahead pages (capped at max_offset); when
    base is True, category links found on the page are also spawned. Uses
    instance regexes (start_re, category_re, title_re, address_re,
    rating_reviews_re) and self.results_per_page, all defined elsewhere.
    """
    utils.log('[%s] parsing page %s' % (self, url))

    # cap on speculative look-ahead depth before falling back to the pager link
    max_offset = 8

    if offset < max_offset:
        # optimistically process the next results page before processing this one
        if 'start=' in url:
            start = self.start_re.match(url).groups()[0]
            nexti = int(start) + self.results_per_page
            url2 = url.replace('start=%s' % start, 'start=%d' % nexti)
        else:
            url2 = "%s&start=%d" % (url, self.results_per_page)

        pool.spawn(self._parseResultsPage, pool, url2, offset + 1)

    try:
        soup = utils.getSoup(url)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, url))
        return

    if offset >= max_offset:
        # look-ahead exhausted: follow the explicit pager link with a reset offset
        next_pagel = soup.find('a', {'id' : 'pager_page_next'})
        if next_pagel is not None:
            href = self.base + next_pagel.get('href')
            pool.spawn(self._parseResultsPage, pool, href, 0)
            time.sleep(0.01)

    if base:
        # seed a crawl for every category link on the base page
        categories = soup.findAll('a', {'id' : self.category_re})
        if categories is not None:
            for category in categories:
                href = self.base + category.get('href')
                pool.spawn(self._parseResultsPage, pool, href, 0)
                # yield so other threads have a chance to start working
                time.sleep(0.01)

    # separator keeps the address segments distinguishable after getText
    separator = '___'
    results = soup.findAll('div', {'class' : re.compile('businessresult')})

    if results is None:
        return

    for result in results:
        entity = Entity()
        entity.subcategory = 'restaurant'
        entity.sources.yelp = { }

        titlel = result.find('a')
        title = titlel.getText()
        entity.title = self.title_re.match(title).groups()[0]
        entity.yurl = self.base + titlel.get('href')

        addr = result.find('address').getText(separator)
        match = self.address_re.match(addr).groups()
        entity.address = "%s, %s" % (match[0], match[1])
        entity.phone = match[2]

        # optional star rating, parsed out of the rating image's title text
        rating = result.find('img')
        if rating is not None:
            entity.yrating = float(self.rating_reviews_re.match(rating.get('title')).groups()[0])

        # optional review count
        reviews = result.find('span', {'class' : 'reviews'})
        if reviews is not None:
            entity.yreviews = int(self.rating_reviews_re.match(reviews.getText()).groups()[0])

        # de-dupe on (title, address)
        key = (entity.title, entity.address)
        if key not in self.seen:
            self.seen.add(key)
            self._output.put(entity)
def _parseResultsPage(self, pool, href):
    """Scrape a Washingtonian results page for restaurants (single page, no pagination)."""
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return

    container = soup.find('div', { 'class' : 'searchresults' })
    for result in container.findAll('div', { 'class' : 'fs1-sans' }):
        # skip the metadata blocks interleaved with the actual listings
        text = result.getText()
        if ('Price' in text or 'Kid' in text or 'Other' in text or
            'Wheelchair' in text or 'Cuisines' in text or 'Rating' in text or
            'Latest' in text):
            continue

        try:
            title = result.find('strong').getText().strip()
        except Exception:
            continue

        # the first four consecutive <span>s make up the address pieces
        try:
            span1 = result.find('span')
            span2 = span1.findNext('span')
            span3 = span2.findNext('span')
            span4 = span3.findNext('span')
            full_addr = '{0} {1}, {2}, {3}'.format(span1.getText(), span2.getText(),
                                                   span3.getText(), span4.getText())
        except Exception:
            full_addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, full_addr, href))
            continue

        if full_addr == '' or title == '':
            continue

        # de-dupe on (name, address)
        key = (title, full_addr)
        if key in self._seen:
            continue
        self._seen.add(key)

        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title       = title
        entity.address     = full_addr
        entity.sources.washmag = { }

        self._output.put(entity)
def _parseResultsPage(self, pool, href):
    """Scrape one Chicago Magazine results page for restaurants and follow 'Next' links.

    Fix from review: the address try-block evaluated the same
    previousSibling/nextSibling expressions twice (two statements whose
    results were discarded); the redundant pair is removed and the shared
    <br> lookup is hoisted. The exception path is unchanged.
    """
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return

    results = soup.find('td', { 'id' : 'search-results' }).findAll('tr')
    for result in results:
        try:
            name = result.find('td', { 'class' : 'business-name' }).find('a').getText().strip()
        except Exception:
            continue

        # street sits before the contact cell's <br>, locale after it
        try:
            contact_br = result.find('td', { 'class' : 'contact' }).find('br')
            addr = '{0}, {1}'.format(contact_br.previousSibling.strip(),
                                     contact_br.nextSibling.strip())
        except Exception:
            addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
            continue

        if 'OPENING SOON' in result.find('td', { 'class' : 'categories' }).getText():
            continue
        if addr == '':
            continue
        if name == '':
            continue
        if 'CLOSED' in name:
            continue
        if (name, addr) in self._seen:
            continue
        self._seen.add((name, addr))

        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title   = name
        entity.address = addr
        entity.sources.chicagomag = { }

        self._output.put(entity)

    # try the next page: scan the pager links for one labelled 'Next'
    next_page = ''
    for n in soup.find('div', { 'id' : 'pager' }).findAll('a'):
        if 'Next' in n.getText():
            next_page = n.get('href')

    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)