Code example #1
File: sec_select.py  Project: jamesyc/spotonit-sec
import bs4

import spotonit_sec


def select(event_url):
    '''
    Selects a "parent" url to search for events, given the url of one event.

    Arguments:
    event_url -- url of one event

    Returns: list of strings, urls of pages that contain links to multiple events
    '''

    # validate url, get page from url
    url_html = spotonit_sec.get_url(event_url)

    # parse the page with BeautifulSoup
    soup = bs4.BeautifulSoup(url_html, 'html.parser')
    links = soup.find_all('a')

    # search for an "events" or "calendar" link
    parent_url_list = []
    all_url_list = []
    for a in links:
        href = a.get('href')
        if href is None:
            continue
        all_url_list.append(href)
        link_text = a.get_text().lower()
        if "event" in link_text or "calendar" in link_text:
            parent_url_list.append(href)

    # if we found links to pages that contain events, return them now
    events_pages_urls = []
    if parent_url_list:
        for parent_url in parent_url_list:
            absolute_url = spotonit_sec.generate_page_url(event_url, parent_url)
            events_pages_urls.append(absolute_url)
        return events_pages_urls

    # no event page found; fall back to scanning links on the same site
    print("warning: this section is slow")
    for individual_url in all_url_list:
        if spotonit_sec.check_same_domain(individual_url, event_url):
            # check each page's title for mentions of events
            absolute_url = spotonit_sec.generate_page_url(event_url, individual_url)
            individual_html = spotonit_sec.get_url(absolute_url)
            individual_soup = bs4.BeautifulSoup(individual_html, 'html.parser')
            # pages without a <title> cannot be classified; skip them
            if individual_soup.title is None or individual_soup.title.string is None:
                continue
            individual_title = individual_soup.title.string.lower()
            if "event" in individual_title or "calendar" in individual_title:
                events_pages_urls.append(absolute_url)
    return events_pages_urls
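
Both examples call into helpers from the project's own spotonit_sec module that are not shown on this page. As a rough orientation, here is a minimal sketch of what get_url, generate_page_url, and check_same_domain might look like; the signatures are inferred from the call sites above, and the bodies (including the use of the requests package) are assumptions, not the project's actual code.

# Hypothetical reconstructions of spotonit_sec helpers, inferred from the
# call sites in select() above; the real project code may differ.
import urllib.parse

import requests


def get_url(url):
    # download a page and return its html (assumes the requests package)
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.text


def generate_page_url(base_url, link):
    # resolve a possibly-relative link against the page it was found on
    return urllib.parse.urljoin(base_url, link)


def check_same_domain(url, other_url):
    # resolve url against other_url first so relative links count as same-domain
    resolved = urllib.parse.urljoin(other_url, url)
    return (urllib.parse.urlparse(resolved).netloc
            == urllib.parse.urlparse(other_url).netloc)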
Code example #2
File: sec_lister.py  Project: jamesyc/spotonit-sec
import random
import re

import bs4

import spotonit_sec


def lister(events_page_url_list, num_events=10):
    '''
    Returns a list of possible event urls, given a list of "parent" urls.

    Arguments:
    events_page_url_list -- list of urls of pages containing related events
    num_events -- maximum number of events to return (default 10)

    Returns: list of strings, the urls of events
    '''

    # keep adding events until we reach num_events
    event_url_list = []
    seen_urls = set()
    while len(event_url_list) < num_events:
        # if no more event pages remain, stop
        if not events_page_url_list:
            break
        # skip event pages we have already seen
        events_page = events_page_url_list.pop(0)
        if events_page in seen_urls:
            continue
        seen_urls.add(events_page)
        # download the event page and parse it with BeautifulSoup
        events_html = spotonit_sec.get_url(events_page)
        events_soup = bs4.BeautifulSoup(events_html, 'html.parser')
        # go through the main content nodes and grab urls; these are usually events
        big_url_list = []
        selectors = [
            {'class': re.compile(r".*content.*")},
            # {'id': re.compile(r".*content.*")},
            {'class': re.compile(r".*main.*")},
            {'id': re.compile(r".*main.*")},
            {'id': re.compile(r".*calendar.*")},
        ]
        for attrs in selectors:
            for node in events_soup.find_all(attrs=attrs):
                big_url_list.extend(spotonit_sec.extract_links(node))
        # table rows often hold one event per row
        for node in events_soup.find_all('tr'):
            big_url_list.extend(spotonit_sec.extract_links(node))
        # remove dupes
        big_url_list = list(set(big_url_list))
        # convert relative links to absolute links
        absolute_url_list = []
        for link in big_url_list:
            absolute_url_list.append(spotonit_sec.generate_page_url(events_page, link))
        # drop non-urls (javascript:, mailto:, etc.)
        big_url_list = list(filter(spotonit_sec.check_url, absolute_url_list))
        event_url_list.extend(big_url_list)

    # remove dupes
    event_url_list = list(set(event_url_list))
    random.shuffle(event_url_list)
    # not enough events found, pad the list with empty strings
    while len(event_url_list) < num_events:
        event_url_list.append("")
    return event_url_list[:num_events]
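
lister() additionally depends on spotonit_sec.extract_links and spotonit_sec.check_url. Again purely as an assumption inferred from how they are called, a minimal sketch of the two helpers, followed by how the two examples might be chained end to end:

# Hypothetical sketches of the remaining spotonit_sec helpers; the real
# implementations in jamesyc/spotonit-sec may differ.

def extract_links(node):
    # collect the href of every anchor tag inside a BeautifulSoup node
    return [a.get('href') for a in node.find_all('a') if a.get('href')]


def check_url(url):
    # keep only http(s) links, filtering out javascript:, mailto:, and the like
    return url.startswith('http://') or url.startswith('https://')


# Possible end-to-end usage, chaining the two examples (assumed workflow):
#     parent_pages = select("http://example.com/some-event")
#     event_urls = lister(parent_pages, num_events=5)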