def facebook_selenium_run():
    '''
    Spider for facebook.

    Collect event ids from https://www.facebook.com/events/upcoming
    through selenium (both the user's "going" list and the nearby
    suggested list), then retrieve each event's detail through the
    facebook graph api and save the new events to the database.
    Events whose source_url has already been stored are skipped.
    '''
    event_info_list = []
    # extract facebook user going event list
    event_info_list.extend(_locate_event_list())
    # brief pause between the two selenium scrapes to avoid hammering
    # facebook with back-to-back page loads
    time.sleep(5)
    # extract facebook user nearby suggest event list
    event_info_list.extend(_locate_event_list(suggest=True))
    logger.info('Aggregate %d for detail parsing through facebook api',
                len(event_info_list))

    valid_event_list = []
    # load existing urls for facebook so already-parsed events are skipped
    existing_urls = load_existing_urls('facebook')

    # parse each event detail through facebook graph api
    for event in event_info_list:
        # filtering out already existing urls (guard clause keeps the
        # happy path flat)
        if event['source_url'] in existing_urls:
            logger.info('Event with url %s has already been parsed.',
                        event['source_url'])
            continue
        try:
            json_postdata = _render_to_json(
                _create_post_url(event['id'], APP_ID, APP_SECRET))
            # title is necessary, raise exception if no title
            event['title'] = json_postdata['name']
            # start time is necessary, raise exception if no start time
            event['start_time'] = _process_start_time(
                json_postdata['start_time'])
            # the remaining fields are optional and default to None
            event['detail'] = json_postdata.get('description', None)
            event['location'] = _process_location(
                json_postdata.get('place', None))
            event['category'] = json_postdata.get('category', None)
            event['w0'] = json_postdata.get('attending_count', None)
            event['w1'] = json_postdata.get('interested_count', None)
            event['w2'] = json_postdata.get('maybe_count', None)
            valid_event_list.append(event)
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are not swallowed; %s instead of int(...) so a
            # malformed id cannot raise a second exception that masks
            # the original one being logged
            logger.error('Error for event %s', event['id'], exc_info=True)

    logger.info('Successfully get %d events detail through facebook graph api',
                len(valid_event_list))
    # save all valid events to database
    if valid_event_list:
        save_events_to_db(valid_event_list)
logger.info('Start to parse event %s with url %s', title, url) event_info = parse_one_event(url, sub_browser) # save parsed event to event list if event_info: event_list.append(event_info) else: logger.info('Event %s with url %s has already been parsed.', title, url) except Exception, e: logger.error(e.message, exc_info=True) sub_browser.quit() browser.quit() # save aggregate new events into database if len(event_list) > 0: save_events_to_db(event_list) def parse_one_event(url, sub_browser): ''' Parse event detail page, the input parameters are url and web driver ''' try: sub_browser.get(url) title_element = sub_browser.find_element_by_xpath('//div[@id="field-event-name"]') title = title_element.text try: cover_element = sub_browser.find_element_by_xpath('//div[@id="visual-banner-preview"]/img') cover_url = cover_element.get_attribute('src')
def close_spider(self, spider):
    '''
    Spider close hook.

    Persist any events accumulated in ``self.event_list`` to the
    database before the spider shuts down; skip the database call
    entirely when nothing was parsed.

    :param spider: the spider instance being closed (unused here,
        required by the hook signature).
    '''
    # truthiness check is the idiomatic form of len(...) > 0
    if self.event_list:
        save_events_to_db(self.event_list)