import random
import re

import bs4

import spotonit_sec


def select(event_url):
    '''
    Selects a "parent" url to search for events, given an event.

    Arguments:
    event_url -- url of one event

    Returns:
    list of strings, urls of pages that contain links to multiple events
    '''
    # validate url, get page from url
    url_html = spotonit_sec.get_url(event_url)

    # use beautifulsoup on webpage
    soup = bs4.BeautifulSoup(url_html)
    links = soup.find_all('a')

    # search for links whose text mentions an event or calendar page
    parent_url_list = []
    all_url_list = []
    for a in links:
        href = a.get('href')
        if href is None:
            # skip anchors without an href
            continue
        all_url_list.append(href)
        if a.get_text().lower().find("event") != -1:
            parent_url_list.append(href)
        if a.get_text().lower().find("calendar") != -1:
            parent_url_list.append(href)

    # if we have links to pages that contain events, return them now
    events_pages_urls = []
    if len(parent_url_list) >= 1:
        for parent_url in parent_url_list:
            absolute_url = spotonit_sec.generate_page_url(event_url, parent_url)
            events_pages_urls.append(absolute_url)
        return events_pages_urls

    # no event page found, now check for links on the same site
    print "warning, this section is slow"
    for individual_url in all_url_list:
        if spotonit_sec.check_same_domain(individual_url, event_url):
            # check individual page titles to look for events
            absolute_url = spotonit_sec.generate_page_url(event_url, individual_url)
            individual_html = spotonit_sec.get_url(absolute_url)
            individual_title = bs4.BeautifulSoup(individual_html).title.string.lower()
            if individual_title.find("event") != -1:
                events_pages_urls.append(absolute_url)
            if individual_title.find("calendar") != -1:
                events_pages_urls.append(absolute_url)
    return events_pages_urls
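
# A minimal usage sketch for select() (the url is hypothetical and assumes
# spotonit_sec.get_url can fetch it):
#
#     parent_pages = select("http://example.com/some-event")
#     # e.g. ["http://example.com/events", "http://example.com/calendar"]
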
def lister(events_page_url_list, num_events=10):
    '''
    Returns a list of possible events, given a list of "parent" urls.

    Keyword arguments:
    events_page_url_list -- list of pages containing related events
    num_events -- number of events in list

    Returns:
    list of strings, the urls of events
    '''
    # keep adding events until we reach num_events
    event_url_list = []
    seen_urls = []
    while len(event_url_list) < num_events:
        # if no more event pages, stop
        if len(events_page_url_list) == 0:
            break

        # skip event pages we have already seen
        events_page = events_page_url_list.pop(0)
        if events_page in seen_urls:
            continue
        seen_urls.append(events_page)

        # download event page, and run beautifulsoup
        events_html = spotonit_sec.get_url(events_page)
        events_soup = bs4.BeautifulSoup(events_html)

        # go through main content nodes and grab urls, these are usually events
        big_url_list = []
        for node in events_soup.find_all(attrs={'class': re.compile(r".*content.*")}):
            big_url_list.extend(spotonit_sec.extract_links(node))
        # for node in events_soup.find_all(attrs={'id': re.compile(r".*content.*")}):
        #     big_url_list.extend(spotonit_sec.extract_links(node))
        for node in events_soup.find_all(attrs={'class': re.compile(r".*main.*")}):
            big_url_list.extend(spotonit_sec.extract_links(node))
        for node in events_soup.find_all(attrs={'id': re.compile(r".*main.*")}):
            big_url_list.extend(spotonit_sec.extract_links(node))
        for node in events_soup.find_all(attrs={'id': re.compile(r".*calendar.*")}):
            big_url_list.extend(spotonit_sec.extract_links(node))
        for node in events_soup.find_all('tr'):
            big_url_list.extend(spotonit_sec.extract_links(node))

        # remove dupes
        big_url_list = list(set(big_url_list))

        # convert relative links to absolute links
        absolute_url_list = []
        for link in big_url_list:
            absolute_url_list.append(spotonit_sec.generate_page_url(events_page, link))

        # remove non-urls (javascript:, etc.)
        big_url_list = filter(spotonit_sec.check_url, absolute_url_list)
        event_url_list.extend(big_url_list)

        # remove dupes
        event_url_list = list(set(event_url_list))

    random.shuffle(event_url_list, random.random)

    # not enough events, pad the list with empty strings
    while len(event_url_list) < num_events:
        event_url_list.append("")
    return event_url_list[:num_events]
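

if __name__ == '__main__':
    # A minimal demonstration sketch chaining select() and lister(); not part of
    # the pipeline. The url below is hypothetical and assumes spotonit_sec.get_url
    # can reach it.
    demo_event_url = "http://example.com/some-event"
    parent_pages = select(demo_event_url)
    candidate_events = lister(parent_pages, num_events=5)
    # candidate_events holds up to 5 urls, padded with "" when fewer were found
    print(candidate_events)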