import time
import logging

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

logger = logging.getLogger(__name__)


def peatix_selenium_run():
    ''' Spider for peatix based on selenium '''
    # starting url for crawling
    base_url = "http://peatix.com/search/?country=SG&p="
    search_range = 10
    event_list = []
    # load existing urls for peatix
    existing_urls = load_existing_urls('peatix')
    # start web driver
    browser = webdriver.Firefox()
    for i in range(search_range):
        url = base_url + str(i + 1)
        logger.info('Start to parse url %s', url)
        browser.get(url)
        # wait for the event list to appear to ensure page rendering has finished
        # (this selector must match the xpath used below to extract the events)
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "li.event-summary-block.ng-scope")))
        browser.implicitly_wait(2)
        # extract all event elements on the current result page
        events = browser.find_elements_by_xpath('//li[@class="event-summary-block ng-scope"]')
        logger.info('Found %d events in url %s', len(events), url)
        if len(events) > 0:
            # open another web driver for event detail page parsing
            sub_browser = webdriver.Firefox()
            for event in events:
                try:
                    title_element = event.find_element_by_xpath(
                        './/div[@class="detail-block"]//h3[@class="event-name ng-binding"]')
                    title = title_element.text
                    # parse event detail page url
                    url_element = event.find_element_by_xpath('.//a[@class="event-summary"]')
                    event_url = url_element.get_attribute('href')
                    # filter out already parsed urls
                    if event_url not in existing_urls:
                        logger.info('Start to parse event %s with url %s', title, event_url)
                        event_info = parse_one_event(event_url, sub_browser)
                        # save parsed event to event list
                        if event_info:
                            event_list.append(event_info)
                    else:
                        logger.info('Event %s with url %s has already been parsed.', title, event_url)
                except Exception as e:
                    logger.error(str(e), exc_info=True)
            sub_browser.quit()
    browser.quit()
    # save all parsed events to database
    if len(event_list) > 0:
        save_events_to_db(event_list)
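
# The helpers used above (load_existing_urls, parse_one_event, save_events_to_db)
# live elsewhere in this project. As a rough sketch of what load_existing_urls
# is expected to return, assuming events are kept in a SQLite table named
# "events" with "source" and "source_url" columns (the file name and schema
# here are illustrative, not the project's actual ones):
def _load_existing_urls_sketch(source):
    import sqlite3
    conn = sqlite3.connect('events.db')  # hypothetical database file
    try:
        rows = conn.execute(
            'SELECT source_url FROM events WHERE source = ?', (source,))
        # return a set so the "url not in existing_urls" checks above stay O(1)
        return set(row[0] for row in rows)
    finally:
        conn.close()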
def facebook_selenium_run():
    '''
    Spider for facebook. The logic is to get all event ids of
    https://www.facebook.com/events/upcoming through selenium, then
    retrieve each event's detail through the facebook graph api.
    '''
    event_info_list = []
    # extract the facebook user's going event list
    event_info_list.extend(_locate_event_list())
    time.sleep(5)
    # extract the facebook user's nearby suggested event list
    event_info_list.extend(_locate_event_list(suggest=True))
    logger.info('Aggregated %d events for detail parsing through facebook api', len(event_info_list))
    valid_event_list = []
    # load existing urls for facebook
    existing_urls = load_existing_urls('facebook')
    # parse each event detail through facebook graph api
    for event in event_info_list:
        # filter out already parsed urls
        if event['source_url'] not in existing_urls:
            try:
                json_postdata = _render_to_json(_create_post_url(event['id'], APP_ID, APP_SECRET))
                # title is required; a missing title raises and skips this event
                event['title'] = json_postdata['name']
                # start time is required; a missing start time raises and skips this event
                event['start_time'] = _process_start_time(json_postdata['start_time'])
                event['detail'] = json_postdata.get('description', None)
                event['location'] = _process_location(json_postdata.get('place', None))
                event['category'] = json_postdata.get('category', None)
                event['w0'] = json_postdata.get('attending_count', None)
                event['w1'] = json_postdata.get('interested_count', None)
                event['w2'] = json_postdata.get('maybe_count', None)
                valid_event_list.append(event)
            except Exception:
                logger.error('Error for event %s', event['id'], exc_info=True)
        else:
            logger.info('Event with url %s has already been parsed.', event['source_url'])
    logger.info('Successfully got %d event details through facebook graph api', len(valid_event_list))
    # save all valid events to database
    if len(valid_event_list) > 0:
        save_events_to_db(valid_event_list)
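
# A minimal sketch of the two graph-api helpers used above, assuming an app
# access token of the form "APP_ID|APP_SECRET" and the standard
# graph.facebook.com event endpoint. The field list mirrors the keys read
# above; the exact url layout in the real helpers may differ, and requests is
# an assumed dependency (any http client works here):
def _create_post_url_sketch(event_id, app_id, app_secret):
    fields = ('name,start_time,description,place,category,'
              'attending_count,interested_count,maybe_count')
    return ('https://graph.facebook.com/v2.8/%s?fields=%s&access_token=%s|%s'
            % (event_id, fields, app_id, app_secret))


def _render_to_json_sketch(url):
    import requests
    response = requests.get(url)
    response.raise_for_status()
    return response.json()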
def meetup_selenium_run():
    """Spider for meetup. The logic is as follows:

    1. Find all groups on the start page
    2. For each group, extract its upcoming events
    3. For each upcoming event, go into its detail page
    """
    # start url
    base_url = "http://www.meetup.com/"
    group_list = []
    try:
        # start web driver
        browser = webdriver.Firefox()
        # load existing urls for meetup
        existing_urls = load_existing_urls("meetup")
        # load base url
        browser.get(base_url)
        # find all the group cards on the home page
        groups = browser.find_elements_by_xpath('//li[@class="groupCard tileGrid-tile"]')
        logger.info("Found %d groups on meetup home page.", len(groups))
        # extract the url of each group
        for group in groups:
            group_url = group.find_element_by_xpath('.//a[@class="display-none"]')
            if group_url:
                # add valid group url into list
                group_list.append(group_url.get_attribute("href"))
        browser.quit()
        logger.info("Found %d groups with url on meetup home page.", len(group_list))
        # start to parse each individual group
        if len(group_list) > 0:
            for group_url in group_list:
                logger.info("Start to parse group with url %s", group_url)
                meetup_parse_one_group(group_url, existing_urls)
                # wait for 2 seconds between groups to avoid hammering the site
                time.sleep(2)
    except Exception as e:
        logger.error(str(e), exc_info=True)
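
# A rough sketch of meetup_parse_one_group, assuming it follows the same
# pattern as the peatix spider above: open the group page, collect the
# upcoming-event links, skip urls already in the database, and hand each new
# detail page to a parser. The xpath selector and the meetup_parse_one_event
# helper are hypothetical placeholders, not meetup's actual markup or a
# function from this module:
def _meetup_parse_one_group_sketch(group_url, existing_urls):
    browser = webdriver.Firefox()
    new_events = []
    try:
        browser.get(group_url)
        # collect the detail-page urls first so later navigation cannot
        # invalidate (stale) the elements
        links = browser.find_elements_by_xpath('//a[@class="event-title"]')  # hypothetical selector
        event_urls = [link.get_attribute('href') for link in links]
        for event_url in event_urls:
            if event_url in existing_urls:
                logger.info('Event with url %s has already been parsed.', event_url)
                continue
            event_info = meetup_parse_one_event(event_url, browser)  # hypothetical detail parser
            if event_info:
                new_events.append(event_info)
        if len(new_events) > 0:
            save_events_to_db(new_events)
    finally:
        browser.quit()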