def _parseResultsPage(self, pool, href, subcategory):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find("table", {"id" : "resultsFound"}).findAll("dl", {"class" : "result"})
    
    try:
        next_page = soup.find("ul", {"class" : re.compile("nextpages|morepages")}).find("li", {"class" : "next"}).find("a").get("href")
        
        if next_page != '':
            pool.spawn(self._parseResultsPage, pool, next_page, subcategory)
    except AttributeError:
        # crawling of pages is done
        #utils.log("Done crawling: %s" % href)
        pass
    
    time.sleep(0.01)
    
    for result in results:
        link = result.find("dt").find("a")
        href = link.get("href")
        name = link.getText().strip()
        
        detail = pool.spawn(self._parseDetailPage, name, href, subcategory)
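# Every crawler below leans on utils.getSoup(). Its implementation is not part of
# this section; the following is only a hypothetical sketch, inferred from how it
# is called (URLs in most crawlers, raw markup in scanEntry(), urllib2.HTTPError
# propagating to callers). Names, the User-Agent header, and the URL-vs-markup
# heuristic are assumptions, not the project's actual helper.

import urllib2

try:
    from bs4 import BeautifulSoup            # BeautifulSoup 4, if installed
except ImportError:
    from BeautifulSoup import BeautifulSoup  # fall back to BeautifulSoup 3

def getSoup(source):
    """Return a parsed BeautifulSoup tree for a URL or a raw HTML string."""
    if source.lstrip().startswith('<'):
        # scanEntry() passes markup read from disk straight through
        html = source
    else:
        # the crawlers pass URLs; HTTPError is allowed to propagate so callers can log it
        request = urllib2.Request(source, headers={'User-Agent' : 'Mozilla/5.0'})
        html = urllib2.urlopen(request).read()
    
    return BeautifulSoup(html)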
def addressToLatLng(self, address):
    params = {
        'address' : address, 
    }
    
    url = self.BASE_URL + '?' + urllib.urlencode(params)
    
    try:
        # GET the data and parse the HTML response with BeautifulSoup
        soup = utils.getSoup(url)
        rows = soup.find("table").findAll("tr")
        
        # extract the latitude
        latRow = rows[1]
        latStr = latRow.findAll("td")[1].renderContents()
        lat = float(re.search("([0-9.-]+)", latStr).group(0))
        
        # extract the longitude
        lngRow = rows[2]
        lngStr = lngRow.findAll("td")[1].renderContents()
        lng = float(re.search("([0-9.-]+)", lngStr).group(0))
        
        return self.getValidatedLatLng((lat, lng))
    except:
        #utils.log('[USGeocoderService] error converting "' + url + '"\n')
        pass
    
    return None
def _parseRestaurantPage(self, pool, region_name, area_name, restaurant_name, href):
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, area_name, restaurant_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    # parse the address for the current restaurant
    addr = soup.find('span', {'class' : 'adr'})
    street = addr.find('span', {'class' : 'street-address'}).getText().strip()
    locality = addr.find('span', {'class' : 'locality'}).getText().strip()
    region = addr.find('span', {'class' : 'region'}).getText().strip()
    zipcode = addr.find('a', {'class' : re.compile('postal-code')}).getText().strip()
    
    address = "%s, %s, %s %s" % (street, locality, region, zipcode)
    
    # add the current restaurant to the output for this crawler
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title = restaurant_name
    entity.address = address
    entity.sources.urbanspoon = {
        'uurl' : href, 
    }
    
    self._output.put(entity)
def _initPages(self):
    """
    if self._crawler.options.test or not self._crawler.options.crawl:
        # hardcoded page of ~30 new york restaurants for testing purposes
        self.s_pages.add("http://www.opentable.com/opentables.aspx?t=reg&n=11,18,66,2987,2999,3032,3044,3047,3068,3101,3113,3128,3131,3161,7376,7382,7394,7397,7616,7628,7682&m=8&p=2&d=6/14/2011%207:00:00%20PM&scpref=108")
        return
    """
    
    self._crawler.log("\n")
    self._crawler.log("Initializing crawl index for " + self._name + " (" + self.BASE_URL + ")\n")
    self._crawler.log("\n")
    
    url = self.BASE_URL + "state.aspx"
    soup = utils.getSoup(url)
    links = soup.find("div", {"id" : "Global"}).findAll("a", {"href" : re.compile("(city)|(country).*")})
    
    pages = set()
    pages.add(url)
    
    for link in links:
        href = link.get("href")
        linkURL = self.BASE_URL + href
        
        pages.add(linkURL)
        #self._crawler.log(str(i) + ") " + str(rid))
        
        self._pool.add_task(self._parsePage, linkURL, pages)
    
    self._pool.wait_completion()
    
    self._crawler.log("\n")
    self._crawler.log("Done initializing crawl index for " + self._name + " (" + self.BASE_URL + ")\n")
    self._crawler.log("\n")
def _parseLocationPage(self, pool, region_name, href):
    utils.log("[%s] parsing region '%s' (%s)" % (self, region_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    try:
        # find all cities within this state
        # note: could be none if zagat has not rated any cities within a given state (such as Alaska)
        cityLists = soup.find("div", {"id" : "loc_allCities"}).findAll("div", {"class" : "letterBlock"})
    except AttributeError:
        # no cities found within this region; return gracefully
        return
    
    # asynchronously parse each city within this region
    for cityList in cityLists:
        cityList = cityList.find('ul')
        cities = cityList.findAll('a')
        
        for city in cities:
            city_name = city.getText().strip()
            city_href = self.base + city.get("href")
            
            pool.spawn(self._parseCityPage, pool, region_name, city_name, city_href)
def _parseAllRestaurantsInCityPage(self, pool, region_name, city_name, href):
    utils.log("[%s] parsing all restaurants in city '%s.%s' (%s)" % (self, region_name, city_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    # parse all zagat-rated restaurants on this page
    restaurants = soup.findAll("li", {"class" : "zr"})
    
    if restaurants is not None:
        for restaurant in restaurants:
            a = restaurant.find('a')
            restaurant_name = a.getText().strip()
            restaurant_href = self.base + a.get("href")
            
            # asynchronously parse the current restaurant
            pool.spawn(self._parseRestaurantPage, pool, region_name, city_name, restaurant_name, restaurant_href)
    
    try:
        # parse next page
        next_page = soup.find("li", {"class" : re.compile("pager-next")}).find("a", {"class" : "active"})
        
        if next_page is not None:
            next_page_href = self.base + next_page.get("href")
            self._parseAllRestaurantsInCityPage(pool, region_name, city_name, next_page_href)
    except AttributeError:
        # no next paginated page for restaurants within this city
        pass
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('div', { 'name' : 'LocationDirectory' }).findAll('h3')
    
    for result in results:
        try:
            name = result.find('a').getText().strip()
        except Exception:
            continue
        
        try:
            raw_address = result.findNext('span', { 'class' : 'address' }).getText()
            street = raw_address[0:raw_address.find('(')].strip()
            locale = raw_address[raw_address.find(')')+1:raw_address.find('CA')+2].strip()
            addr = '{0}, {1}'.format(street, locale)
        except Exception:
            addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
            continue
        
        if addr == '':
            continue
        if name == '':
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        entity.subcategory = "bar"
        entity.title = name
        entity.address = addr
        entity.sources.sfweekly = { }
        
        self._output.put(entity)
    
    # try the next page
    try:
        pagination = soup.find('span', { 'class' : 'Pagination' }).getText()
        
        if 'Next' in pagination:
            pagination = soup.find('span', { 'class' : 'Pagination' })
            href_get = pagination.find('span', { 'class' : 'PaginationSelected' }).findNext('a').get('href')
            next_page = '{0}{1}'.format('http://www.sfweekly.com', href_get)
        else:
            next_page = ''
    except Exception:
        next_page = ''
    
    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def parse_work(title, genre, link, output):
    utils.log("parsing work %s) %s (%s)" % (title, genre, link))
    
    try:
        soup = utils.getSoup(link)
    except Exception, e:
        utils.log("error parsing work %s) %s (%s) - %s" % (title, genre, link, e))
        utils.printException()
def _getEntityDetails(self, entity):
    baseURL = "http://www.opentable.com/httphandlers/RestaurantinfoLiteNew.ashx"
    url = baseURL + "?" + urllib.urlencode({ 'rid' : entity['rid'] })
    
    detailsSoup = utils.getSoup(url)
    
    entity['address'] = detailsSoup.find("div", {"class" : re.compile(".*address")}).renderContents().strip()
    self._crawler.log(entity)
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('div', { 'class' : 'split-right-column' }).findAll('div', { 'class' : 'clear' })
    
    for result in results:
        try:
            name = result.findNext('div').find('h2').find('a').getText().strip()
        except Exception:
            continue
        
        try:
            street = result.findNext('div').find('address').getText()
            locale = '{0}, {1}'.format('Los Angeles', 'CA')
            addr = '{0}, {1}'.format(street, locale)
        except Exception:
            addr = ''
            continue
        
        if addr == '':
            continue
        if name == '':
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        
        if 'Bars' in result.findNext('span').getText():
            entity.subcategory = "bar"
        else:
            entity.subcategory = "restaurant"
        
        entity.title = name
        entity.address = addr
        entity.sources.timeout_la = { }
        
        self._output.put(entity)
    
    # try the next page
    try:
        href_get = soup.find('div', { 'class' : 'next' }).find('a').get('href')
        next_page = '{0}{1}'.format('http://www.timeout.com', href_get)
    except Exception:
        next_page = ''
    
    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.findAll('h3')
    
    for result in results:
        try:
            name = result.find('span', { 'style' : 'cursor:pointer;' }).getText().strip()
        except AttributeError:
            # note: don't reference 'name' here; it may be unbound if the lookup failed
            utils.log("[%s] error parsing name (%s)" % (self, href))
            return
        
        try:
            address1 = result.findNext('span', { 'class' : 'addresslinecaps' }).getText().strip()
            
            if '(' in address1:
                # sf mag does not provide any city, state or zip information, 
                # so inserting basic universal info manually.
                addr = '{0}, {1}'.format(address1.split('(')[0].strip(), 'San Francisco, CA')
            else:
                addr = '{0}, {1}'.format(address1, 'San Francisco, CA')
        except AttributeError:
            utils.log("[%s] error parsing address for %s (%s)" % (self, name, href))
            return
        
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title = name
        entity.address = addr
        entity.sources.sfmag = { }
        
        self._output.put(entity)
    
    # locate total pages and compare against current page num to determine if we should iterate again
    try:
        total_pages = soup.find('span', { 'class' : 'last' }).findPrevious('span').getText().strip()
    except AttributeError:
        # crawling of pages is done
        return
    
    index = href.find('&page=')
    end = href.find('&keyword')
    page = href[index+6:end]
    
    if int(page) <= int(total_pages) - 1:
        next_page = href.replace('&page=' + str(page), '&page=' + str(int(page)+1))
        pool.spawn(self._parseResultsPage, pool, next_page)
    else:
        return
    
    time.sleep(0.01)
def scanEntry(self):
    # note: guard the file handle so the finally block doesn't raise a NameError
    # if open() itself fails
    file = None
    title = None
    
    try:
        file = open(self.filename, 'r')
        soup = utils.getSoup(file.read())
        title = getTitle(soup, self.filename)
    except:
        print sys.exc_info()
        title = None
        print 'Cannot determine title for entry ' + self.filename
    finally:
        if file is not None:
            file.close()
    
    return title
def _parseResultsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    results = soup.find('div', { 'class' : 'search_results' }).findAll('div', { 'class' : 'restaurant' })
    
    for result in results:
        try:
            name = result.find('h3').find('a').getText().strip()
        except Exception:
            continue
        
        try:
            street = result.find('br').previousSibling.strip()
            locale = '{0}, {1}'.format(result.find('br').nextSibling.strip(), 'CA')
            addr = '{0}, {1}'.format(street, locale)
        except Exception:
            addr = ''
            utils.log("[%s] error parsing %s (%s)" % (self, addr, href))
            continue
        
        if addr == '':
            continue
        if name == '':
            continue
        if (name, addr) in self._seen:
            continue
        
        self._seen.add((name, addr))
        
        entity = Entity()
        entity.subcategory = "restaurant"
        entity.title = name
        entity.address = addr
        entity.sources.sfgate = { }
        
        self._output.put(entity)
    
    # try the next page
    try:
        href_get = soup.find('li', { 'class' : 'next' }).find('a').get('href')
        next_page = '{0}{1}'.format('http://www.sfgate.com', href_get)
    except Exception:
        next_page = ''
    
    if next_page != '':
        pool.spawn(self._parseResultsPage, pool, next_page)
def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, city_name, restaurant_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    # parse the address for the current restaurant
    addr = soup.find('div', {'class' : 'address'})
    street = addr.find('span', {'class' : 'street'}).getText().strip()
    geo = addr.find('span', {'class' : 'geo'}).getText().strip()
    
    address = "%s, %s" % (street, geo)
    
    # add the current restaurant to the output for this crawler
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title = restaurant_name
    entity.address = address
    entity.sources.zagat = {
        'zurl' : self.base + href, 
    }
    
    #self._globals['soup'] = soup
    
    # parse cuisine
    header = soup.find('div', {'id' : "block-zagat_restaurants-14"})
    if header is not None:
        header = header.find('ul').find('li', {'class' : 'first'})
        if header is not None:
            entity.cuisine = header.getText()
    
    # parse website
    site = soup.find('span', {'class' : 'website'})
    if site is not None:
        site = site.find('a')
        if site is not None:
            entity.site = site.get('href')
    
    # parse preview image
    img = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
    if img is not None:
        img = img.find('img')
        if img is not None:
            entity.image = img.get('src')
    
    self._output.put(entity)
def _parseAreaPage(self, pool, region_name, area_name, href):
    utils.log("[%s] parsing area '%s.%s' (%s)" % (self, region_name, area_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    region_list_link = soup.find('table', {'style' : 'width:100%'}).find('a')
    region_list_href = region_list_link.get('href')
    
    try:
        soup2 = utils.getSoup(region_list_href)
    except:
        utils.log("[%s] error downloading page %s" % (self, region_list_href))
        return
    
    restaurant_list_link = soup2.find('div', {'id' : 'center'}).findAll('p')[1].find('a')
    restaurant_list_href = restaurant_list_link.get('href')
    
    self._parseAllRestaurantsInArea(pool, region_name, area_name, restaurant_list_href, 'A', True)
def _parsePage(self, url, pages):
    #http://www.opentable.com/start.aspx?m=74&mn=1309
    self._crawler.log("Crawling " + url)
    
    soup = utils.getSoup(url)
    links = soup.findAll("a", {"href" : re.compile(".*m=[0-9]*.*mn=[0-9]*")})
    
    for link in links:
        #name = link.renderContents().strip()
        href = link.get("href")
        linkURL = self.BASE_URL + href
        
        if not linkURL in pages:
            pages.add(linkURL)
            self._pool.add_task(self._parseSubPage, linkURL)
def _parseLocationsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except:
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    # find all links to domestic urbanspoon regions (states)
    locations = soup.findAll("table")[3].findAll('a')
    
    # parse each individual location page (state)
    for location in locations:
        name = location.getText().strip()
        href = location.get("href")
        
        pool.spawn(self._parseLocationPage, pool, name, href)
def _parseLocationsPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    # find all links to domestic zagat regions (states)
    locations = soup.find("div", {"id" : "loc_domestic"}).findAll("a")
    
    # parse each individual location page (state)
    for location in locations:
        name = location.getText().strip()
        href = self.base + location.get("href")
        
        pool.spawn(self._parseLocationPage, pool, name, href)
def _parseCityPage(self, pool, region_name, city_name, href):
    utils.log("[%s] parsing city '%s.%s' (%s)" % (self, region_name, city_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    # use the 'all' link on the zagat search homepage for this city to parse all 
    # restaurants within this city
    restaurant_list_link = soup.find("div", {"class" : "upper-links"}).find("a")
    restaurant_list_href = self.base + restaurant_list_link.get("href")
    
    self._parseAllRestaurantsInCityPage(pool, region_name, city_name, restaurant_list_href)
def _parseIndexPage(self, pool, queue, url, name):
    utils.log('[%s] parsing page %s (%s)' % (self, name, url))
    
    try:
        soup = utils.getSoup(url)
    except:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    categories = soup.find('div', {'id' : 'bookgenremenu'}).findAll('a')
    
    for category in categories:
        href = self.base + category.get('href')
        name = category.getText().strip()
        
        pool.spawn(self._parseResultsPage, pool, queue, href, name, base=True)
def _parseResultsPage(self, queue, url, name, depth):
    try:
        soup = utils.getSoup(url)
    except:
        utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    if depth < self.max_depth:
        # extract and parse subcategory pages
        category_ul = soup.find('ul', {'id' : 'zg_browseRoot'})
        
        if category_ul is not None:
            # descend to the innermost nested category list
            while True:
                temp_ul = category_ul.find('ul')
                
                if temp_ul is None:
                    break
                else:
                    category_ul = temp_ul
            
            categories = category_ul.findAll('a')
            
            for category in categories:
                href = category.get('href')
                name = utils.normalize(category.getText())
                
                queue.put_nowait((href, name, depth + 1))
    
    self._globals['books'] = soup
    
    rss_link = soup.find('div', {'id' : 'zg_rssLinks'})
    if rss_link is None:
        return
    
    rss_link = rss_link.findAll('a')[1].get('href')
    
    if rss_link in self.seen:
        return
    
    self.seen.add(rss_link)
    
    entity = Entity()
    entity.title = rss_link
    entity.subcategory = 'book'
    
    self._output.put(entity)
def _parseRestaurantPage(self, pool, queue, url, name, base=False):
    utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))
    
    try:
        soup = utils.getSoup(url)
    except:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    content = soup.find('div', { 'id' : 'content' })
    if content is None:
        return
    
    entity = Entity()
    entity.title = content.find('h1').getText()
    entity.subcategory = "restaurant"
    entity.seattletimes = {}
    
    details = content.find('div', {'id' : 'edbtext'})
    
    desc = details.find('p').getText()
    if desc is not None:
        entity.desc = desc
    
    details = details.findAll('p', {'class' : 'list'})
    
    address = details[0].renderContents().strip().replace('<br />', '')
    address = re.sub('[ \n\t]+', ' ', address)
    entity.address = address
    
    if len(details) > 1:
        site = details[1].get('href')
        if site is not None:
            entity.site = site
    
    if len(details) > 2:
        hoursOfOperation = details[2].getText()
        if hoursOfOperation is not None:
            entity.hoursOfOperation = hoursOfOperation
    
    key = (entity.title, entity.address)
    if key in self.seen or '(closed)' in entity.title.lower():
        return
    
    self.seen.add(key)
    self._output.put(entity)
def _parseDetailPage(self, name, href, subcategory):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing %s (%s)" % (self, name, href))
        return
    
    summ = soup.find('div', {'class' : 'summary-address'})
    
    try:
        addrp = summ.find('p', {'class' : 'adr'})
        street_addr = addrp.find('span', {'class' : 'street-address'}).getText().strip()
        locality = addrp.find('span', {'class' : 'locality'}).getText().strip()
        region = addrp.find('span', {'class' : 'region'}).getText().strip()
        
        try:
            postal_code = addrp.find('span', {'class' : 'postal-code'}).getText().strip()
        except AttributeError:
            postal_code = ""
        
        addr = "%s, %s, %s %s" % (street_addr, locality, region, postal_code)
    except AttributeError:
        try:
            p = summ.find('p').getText()
            
            r = re.compile('(.*)nr\. ', re.DOTALL)
            m = r.match(p)
            
            if m is None:
                r = re.compile('(.*)at[. ]', re.DOTALL)
                m = r.match(p)
            
            addr = m.groups()[0].replace('\n', ' ').strip()
        except AttributeError:
            utils.log("[%s] error parsing %s (%s)" % (self, name, href))
            return
    
    entity = Entity()
    entity.subcategory = subcategory
    entity.title = name
    entity.address = addr
    entity.nymag = { }
    
    self._output.put(entity)
def process_shakespeare_works():
    """
        Downloads, parses, and returns a list of lines aggregated across all of 
        Shakespeare's plays, where each line is represented by a simple dict format.
    """
    
    # initialize environment, download and parse shakespeare index page
    # -----------------------------------------------------------------
    pool = Pool(16)
    seed = "http://shakespeare.mit.edu"
    soup = utils.getSoup(seed)
    output = [ ]
    
    table = soup.find('table', {'cellpadding' : '5'})
    rows = table.findAll('tr')
    cols = rows[1].findAll('td')
    genres = map(lambda td: td.getText(), rows[0].findAll('td'))
    
    assert len(cols) == len(genres)
    assert len(rows) == 2
    
    # find and process each work's full text in parallel
    # --------------------------------------------------
    # note: we're only interested in plays so we're skipping the last genre, poetry
    for i in xrange(len(genres) - 1):
        genre = genres[i]
        col = cols[i]
        works = col.findAll('a')
        
        for work in works:
            href = work.get('href')
            href = href.replace('index.html', 'full.html')
            link = "%s/%s" % (seed, href)
            title = work.getText().replace('\n', ' ')
            
            if title and href:
                pool.spawn(parse_work, title, genre, link, output)
    
    pool.join()
    return output
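# The Pool used above (and passed around the crawlers as `pool`) exposes
# spawn()/join(), which matches gevent.pool.Pool's interface. This file doesn't
# confirm that library, so the following is only a hedged usage sketch under
# that assumption; fetch() is a stand-in for parse_work().

from gevent.pool import Pool

def fetch(title, link, results):
    # placeholder task; in the crawler, parse_work() plays this role
    results.append((title, link))

results = []
pool = Pool(16)                 # at most 16 greenlets run concurrently
pool.spawn(fetch, 'Hamlet', 'http://shakespeare.mit.edu/hamlet/full.html', results)
pool.join()                     # block until every spawned task finishes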
def getEntitiesFromURL(self, url, limit=None):
    soup = utils.getSoup(url)
    resultList = soup.findAll("tr", {"class" : re.compile("ResultRow.*")})
    results = []
    
    resultsCount = len(resultList)
    resultsCount = min(resultsCount, limit or resultsCount)
    
    for i in xrange(resultsCount):
        result = resultList[i]
        
        # note: some pages have <div class="rinfo" rid=###> and some have <div rid=###>...
        row = result.find("div", {"rid" : re.compile(".*")})
        entity = self._parseEntity(row)
        
        results.append(entity)
        self._pool.add_task(self._getEntityDetails, entity)
    
    self._pool.wait_completion()
    return results
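# The opentable methods above assume self._pool exposes add_task() and
# wait_completion(). That pool class isn't shown in this section; the following
# is only a minimal sketch of such a worker pool built on Queue and threading
# (class and method names other than add_task/wait_completion are assumptions).

from Queue import Queue
from threading import Thread

class ThreadPool(object):
    """Pool of daemon worker threads consuming tasks from a shared queue."""
    
    def __init__(self, num_threads):
        self._tasks = Queue()
        
        for _ in xrange(num_threads):
            worker = Thread(target=self._work)
            worker.daemon = True
            worker.start()
    
    def _work(self):
        # each worker loops forever, pulling and running queued tasks
        while True:
            func, args, kwargs = self._tasks.get()
            try:
                func(*args, **kwargs)
            except Exception:
                pass  # a real pool would log the failure
            finally:
                self._tasks.task_done()
    
    def add_task(self, func, *args, **kwargs):
        self._tasks.put((func, args, kwargs))
    
    def wait_completion(self):
        # block until every queued task has been marked done
        self._tasks.join()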
def _parseDirectoryPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    try:
        results = soup.find('span', { 'class' : 'header' }).findNext('div').findAll('a')
    except AttributeError:
        # directory listing not found; nothing to return for this page
        utils.log("[%s] error parsing directory page %s" % (self, href))
        return
    
    root = 'http://www.bostonmagazine.com'
    href_list = []
    
    for r in results:
        link = "{0}{1}".format(root, r.get('href'))
        href_list.append(link)
    
    return href_list
def _parseResultsPage(self, pool, queue, url, name, base=False):
    utils.log('[%s] parsing results page %s (%s)' % (self, name, url))
    
    try:
        soup = utils.getSoup(url)
    except:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    #self._globals['books'] = soup
    
    # extract and parse more past results
    if base:
        prev = soup.find('div', {'class' : 'stepperDynamicPrevSm'})
        
        if prev:
            prev = prev.find('a')
            href = prev.get('href')
            
            year, month, day = map(int, self.date_re.match(href).groups())
            date = datetime(year=year, month=month, day=day)
            delta = timedelta(days=7)
            count = 10
            
            for i in xrange(count):
                repl = date.date().isoformat()
                href2 = re.sub('\d\d\d\d-\d\d-\d\d', repl, href)
                
                queue.put_nowait((self._parseResultsPage, href2, repl, i == count - 1))
                date = date - delta
    
    categories = soup.findAll('div', {'class' : re.compile('bookCategory')})
    
    for category in categories:
        link = category.find('a')
        href = link.get('href')
        name2 = "%s (%s)" % (name, link.getText().strip().lower())
        
        queue.put_nowait((self._parseListPage, href, name2, False))
def _parseLocationPage(self, pool, region_name, href):
    utils.log("[%s] parsing region '%s' (%s)" % (self, region_name, href))
    
    try:
        soup = utils.getSoup(href)
    except:
        utils.log("[%s] error downloading page %s" % (self, href))
        return
    
    try:
        # find all metropolitan areas within this state
        areas = soup.find('table', {"style" : "width:100%"}).find('td').findAll('a')
    except AttributeError:
        # no cities found within this region; return gracefully
        return
    
    # asynchronously parse each metropolitan area within this region
    for area in areas:
        area_name = area.getText().strip()
        area_href = area.get("href")
        
        pool.spawn(self._parseAreaPage, pool, region_name, area_name, area_href)
def _parseListPage(self, pool, queue, url, name, base=False):
    utils.log('[%s] parsing list page %s (%s)' % (self, name, url))
    
    try:
        soup = utils.getSoup(url)
    except:
        #utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return
    
    results = soup.findAll('td', {'class' : 'summary'})
    
    for result in results:
        entity = Entity()
        entity.subcategory = "book"
        entity.nytimes = {}
        
        title = result.find('span', {'class' : 'bookName'}).getText().strip().title()
        if title.endswith(','):
            title = title[0:-1]
        
        entity.title = title
        
        details = result.getText(separator='___')
        details_match = self.details_re.match(details)
        
        if details_match:
            details_match = details_match.groups()
            
            entity.author = details_match[0]
            entity.publisher = details_match[1]
            entity.desc = details_match[2]
        
        key = (entity.title, entity.author)
        if key in self.seen:
            continue
        
        self.seen.add(key)
        self._output.put(entity)
def _parseDirectoryPage(self, pool, href):
    try:
        soup = utils.getSoup(href)
    except urllib2.HTTPError:
        utils.log("[%s] error parsing page %s" % (self, href))
        return
    
    try:
        results = soup.find('div', { 'class' : 'module_content clearfix' }).findAll('a')
    except AttributeError:
        # directory listing not found; nothing to return for this page
        utils.log("[%s] error parsing directory page %s" % (self, href))
        return
    
    root = 'http://findlocal.latimes.com'
    href_list = []
    
    for r in results:
        link = "{0}{1}".format(root, r.get('href'))
        href_list.append(link)
    
    return href_list
                                          print_info=True)

tournament_info = utils.getTournamentNamesAndURLS(TOURNAMENT_INFO_PATH, 
                                                  TOURNAMENT_URLS_FILE, 
                                                  TOURNAMENT_NAMES_FILE)

# Get all the gamedates and associated gamelink URLs for each tournament
GameLinks = []
TOURNAMENT_NAMES = []

for i, (name, link) in enumerate(tournament_info):
    print("Extracting info from link {}/{}: {} ...".\
          format(i+1, len(tournament_info), link))
    TOURNAMENT_NAMES.append(name)
    tourney_date_and_games_tuples = []
    
    # Get gamedates, check if > last_dates_hash
    # If new date, create list of (date, [gameURLs]) for each tournament
    Soup = utils.getSoup(link)
    GameDays = Soup.findAll(class_='day_content')
    
    firstdate = None
    for day in GameDays:
        d = day.find(class_='section_header').text.strip()
        
        # some tournaments have duplicates
        if firstdate == None:
            firstdate = d
        elif firstdate == d:
            break
        
        if len(d.split(' ')) == 3:
            # make sure date is complete
            gamedate = utils.formatdate(d[:d.find('\n')], MONTHS)
        else:
            tourneyyear = re.search('20[12][0-9]', link).group(0)