def __places(self, nav_places: WebElement, info_dict: dict):
    """Open the "places" tab of the about page and record its text.

    Clicks ``nav_places``, calls ``sleep.random_sleep``, stores the trimmed
    text of ``pagelet_timeline_medley_about`` under ``info_dict['nav_places']``
    and then adds the text of every optional section that exists on the page.

    :param nav_places: tab element to click (``__collect`` passes ``_5pwrs[2]``)
    :param info_dict: dict updated in place with the scraped text
    :return: None
    """
    nav_places.click()
    sleep.random_sleep(self.__sleep)
    browser = self.__browser

    about_text = browser.find_element_by_id(
        'pagelet_timeline_medley_about').text
    # Keep everything after the first 11 lines of the pagelet text
    # (presumably header / tab labels -- TODO confirm what they contain).
    info_dict['nav_places'] = '\n'.join(about_text.split('\n')[11:])

    # Each entry pairs an element id (also used as the info_dict key) with
    # the debug message logged when that element is absent.
    optional_sections = (
        ('pagelet_hometown', 'exception: pagelet_hometown not find'),
        ('current_city', 'exception: current_city not find'),
        ('hometown', 'exception: hometown not find'),
    )
    for element_id, missing_message in optional_sections:
        try:
            info_dict[element_id] = browser.find_element_by_id(
                element_id).text
        except NoSuchElementException:
            self.__log.debug(missing_message)
def __about(self, nav_about: WebElement, info_dict: dict):
    """Open the "about" (details) tab of the about page and record its text.

    Clicks ``nav_about``, calls ``sleep.random_sleep``, stores the trimmed
    text of ``pagelet_timeline_medley_about`` under ``info_dict['nav_about']``
    and then adds the text of every optional pagelet that exists on the page.

    :param nav_about: tab element to click (``__collect`` passes ``_5pwrs[5]``)
    :param info_dict: dict updated in place with the scraped text
    :return: None
    """
    nav_about.click()
    sleep.random_sleep(self.__sleep)
    browser = self.__browser

    about_text = browser.find_element_by_id(
        'pagelet_timeline_medley_about').text
    # Keep everything after the first 11 lines of the pagelet text
    # (presumably header / tab labels -- TODO confirm what they contain).
    info_dict['nav_about'] = '\n'.join(about_text.split('\n')[11:])

    # Each entry pairs an element id (also used as the info_dict key) with
    # the debug message logged when that element is absent.
    optional_pagelets = (
        ('pagelet_bio', 'exception: pagelet_bio not find'),
        ('pagelet_pronounce', 'exception: pagelet_pronounce not find'),
        ('pagelet_nicknames', 'exception: pagelet_nicknames not find'),
        ('pagelet_quotes', 'exception: pagelet_quotes not find'),
    )
    for element_id, missing_message in optional_pagelets:
        try:
            info_dict[element_id] = browser.find_element_by_id(
                element_id).text
        except NoSuchElementException:
            self.__log.debug(missing_message)
def __overview(self, nav_overview: WebElement, info_dict: dict):
    """Record the text of the "overview" tab of the about page.

    The overview tab is the page shown by default, so ``nav_overview`` is
    deliberately not clicked; the parameter is accepted but unused.

    :param nav_overview: overview tab element (``__collect`` passes
        ``_5pwrs[0]``); not used by this method
    :param info_dict: dict updated in place; receives ``'nav_overview'``
    :return: None
    """
    sleep.random_sleep(self.__sleep)

    # Collect the overview information.
    about_text = self.__browser.find_element_by_id(
        'pagelet_timeline_medley_about').text
    # Keep everything after the first 11 lines of the pagelet text
    # (presumably header / tab labels -- TODO confirm what they contain).
    info_dict['nav_overview'] = '\n'.join(about_text.split('\n')[11:])
def open_homepage_by_param(self, name: str, city: str, country_code: str):
    """Search facebook people by name and scrape the single matching profile.

    The search results are narrowed with ``__find_user_element``: first by
    ``name``; if more than one result remains, by the country looked up from
    ``country_code`` (when the code is known to ``countrycode.country_code``)
    and finally by ``city``. Only when exactly one candidate is left is its
    profile opened and scraped.

    :param name: person name; URL-quoted into the search query
    :param city: city text used as the last narrowing filter
    :param country_code: key into ``countrycode.country_code``
    :return: dict with ``'homepage'`` plus whatever ``__collect`` adds, or an
        empty dict when no unique match was found
    """
    url = 'https://www.facebook.com/search/people/?' + 'q=' + quote(name)

    # Count this request and restart the browser once the limit is reached.
    self.__request_num += 1
    if self.__request_num >= self.__request_max:
        self.restart()
    info_dict = {}

    # Open the search page.
    self.__log.debug("search url:" + url)
    self.__browser.get(url)
    sleep.random_sleep(self.__sleep)

    results_root = self.__browser.find_element_by_id('BrowseResultsContainer')
    result_items = results_root.find_elements_by_class_name("_4p2o")
    self.__log.debug(
        "len(browse_results_container)=" + str(len(result_items)))
    candidates = self.__find_user_element(
        browse_results_container=result_items, filter=name)
    self.__log.debug("len(containers)=" + str(len(candidates)))

    # ``chosen`` stays None until exactly one candidate has been isolated.
    chosen = None
    if len(candidates) == 1:
        chosen = candidates[0]
    elif len(candidates) > 1:
        if country_code in countrycode.country_code:
            by_country = self.__find_user_element(
                browse_results_container=candidates,
                filter=countrycode.country_code[country_code])
            if len(by_country) == 1:
                chosen = by_country[0]
            elif len(by_country) > 1:
                # Several people in that country: keep only those for the
                # city filter below.
                candidates = by_country
        if chosen is None:
            by_city = self.__find_user_element(
                browse_results_container=candidates, filter=city)
            if len(by_city) == 1:
                chosen = by_city[0]

    if chosen is None:
        return info_dict

    # Click the '_32mo' element inside the chosen result to open the profile.
    chosen.find_element_by_class_name('_32mo').click()
    sleep.random_sleep(self.__sleep)
    info_dict['homepage'] = self.__browser.current_url
    self.__log.debug('homepage=' + self.__browser.current_url)
    return self.__collect(info_dict)
def open_homepage_by_url(self, user_homepage: str):
    """Scrape the intro/about information of a user's homepage.

    Counts the request (restarting the browser via ``restart`` once the
    request limit is reached), loads ``user_homepage`` and hands a fresh
    dict to ``__collect``.

    :param user_homepage: URL of the user's homepage
    :return: dict filled by ``__collect``
    """
    self.__request_num += 1
    if self.__request_num >= self.__request_max:
        self.restart()

    # Open the homepage.
    self.__browser.get(user_homepage)
    sleep.random_sleep(self.__sleep)
    return self.__collect({})
def __edu_work(self, nav_edu_work: WebElement, info_dict: dict):
    """Open the "education and work" tab of the about page and record it.

    Clicks ``nav_edu_work``, calls ``sleep.random_sleep``, stores the trimmed
    text of ``pagelet_timeline_medley_about`` under
    ``info_dict['nav_edu_work']`` and, when present, the text of
    ``pagelet_eduwork``.

    :param nav_edu_work: tab element to click (``__collect`` passes
        ``_5pwrs[1]``)
    :param info_dict: dict updated in place with the scraped text
    :return: None
    """
    nav_edu_work.click()
    sleep.random_sleep(self.__sleep)
    browser = self.__browser

    about_text = browser.find_element_by_id(
        'pagelet_timeline_medley_about').text
    # Keep everything after the first 11 lines of the pagelet text
    # (presumably header / tab labels -- TODO confirm what they contain).
    info_dict['nav_edu_work'] = '\n'.join(about_text.split('\n')[11:])

    # The dedicated pagelet is optional; its absence is only logged.
    try:
        info_dict['pagelet_eduwork'] = browser.find_element_by_id(
            'pagelet_eduwork').text
    except NoSuchElementException:
        self.__log.debug('exception: pagelet_eduwork not find')
def __all_relationships(self, nav_all_relationships: WebElement,
                        info_dict: dict):
    """Open the "relationships" tab of the about page and record its text.

    Clicks ``nav_all_relationships``, calls ``sleep.random_sleep``, stores
    the trimmed text of ``pagelet_timeline_medley_about`` under
    ``info_dict['nav_all_relationships']`` and, when present, the text of
    ``pagelet_relationships``.

    :param nav_all_relationships: tab element to click (``__collect`` passes
        ``_5pwrs[4]``)
    :param info_dict: dict updated in place with the scraped text
    :return: None
    """
    nav_all_relationships.click()
    sleep.random_sleep(self.__sleep)
    browser = self.__browser

    about_text = browser.find_element_by_id(
        'pagelet_timeline_medley_about').text
    # Keep everything after the first 11 lines of the pagelet text
    # (presumably header / tab labels -- TODO confirm what they contain).
    info_dict['nav_all_relationships'] = '\n'.join(
        about_text.split('\n')[11:])

    # The dedicated pagelet is optional; its absence is only logged.
    try:
        info_dict['pagelet_relationships'] = browser.find_element_by_id(
            'pagelet_relationships').text
    except NoSuchElementException:
        self.__log.debug('exception: pagelet_relationships not find')
def __year_overviews(self, nav_year_overviews: WebElement, info_dict: dict):
    """Open the "life events" (year overviews) tab and record its text.

    Clicks ``nav_year_overviews``, calls ``sleep.random_sleep``, stores the
    trimmed text of ``pagelet_timeline_medley_about`` under
    ``info_dict['nav_year_overviews']`` and, when present, the text of the
    element with class ``fbProfileEditExperiences``.

    :param nav_year_overviews: tab element to click (``__collect`` passes
        ``_5pwrs[6]``)
    :param info_dict: dict updated in place with the scraped text
    :return: None
    """
    nav_year_overviews.click()
    sleep.random_sleep(self.__sleep)

    pagelet_timeline_medley_about = self.__browser.find_element_by_id(
        'pagelet_timeline_medley_about')
    lines = pagelet_timeline_medley_about.text.split('\n')
    # Keep everything after the first 11 lines of the pagelet text
    # (presumably header / tab labels -- TODO confirm what they contain).
    info_dict['nav_year_overviews'] = '\n'.join(lines[11:])

    try:
        # The lookup is the call that raises NoSuchElementException, so it
        # has to be inside the try. Previously it ran before the try and a
        # missing element aborted the scrape instead of being logged.
        fb_profile_edit_experiences = \
            self.__browser.find_element_by_class_name(
                'fbProfileEditExperiences')
        info_dict['fbProfileEditExperiences'] = \
            fb_profile_edit_experiences.text
    except NoSuchElementException:
        self.__log.debug('exception: fbProfileEditExperiences not find')
def login_facebook(self):
    """Log in to facebook with the stored user name and password.

    Loads ``self.__facebook_homepage``, clears and fills the ``email`` and
    ``pass`` inputs, clicks ``loginbutton`` and sets ``self.__login`` to
    ``True``. No check is made that the login actually succeeded.

    :return: None
    """
    browser = self.__browser
    pause = self.__sleep * self.__double

    browser.get(self.__facebook_homepage)
    sleep.random_sleep(pause)

    # Clear both inputs first, then type the credentials. Each element is
    # looked up again right before it is used.
    for field_id in ('email', 'pass'):
        browser.find_element_by_id(field_id).clear()
    credentials = (('email', self.__user_name), ('pass', self.__password))
    for field_id, value in credentials:
        browser.find_element_by_id(field_id).send_keys(value)

    browser.find_element_by_id('loginbutton').click()
    sleep.random_sleep(pause)
    self.__login = True
def __collect(self, info_dict: dict):
    """Scrape the profile currently loaded in the browser into ``info_dict``.

    Records the optional intro box, opens the about page through the second
    ``_6-6`` entry of ``fbTimelineHeadline`` and then runs one handler per
    about-page tab, in tab order.

    :param info_dict: dict updated in place with the scraped text
    :return: the same ``info_dict``
    :raises IndexError: when the page has fewer headline entries or tabs
        than the code indexes
    """
    browser = self.__browser

    # Collect the homepage intro; not every profile has one.
    try:
        info_dict['intro_container_id'] = browser.find_element_by_id(
            'intro_container_id').text
    except NoSuchElementException:
        self.__log.debug('this user may not have intro_container_id')

    # Locate the about entry in the timeline headline and open it.
    headline_entries = browser.find_element_by_id(
        'fbTimelineHeadline').find_elements_by_class_name('_6-6')
    headline_entries[1].click()
    sleep.random_sleep(self.__sleep * self.__double)

    _5pwrs = browser.find_elements_by_class_name('_5pwr')

    # Handler i processes tab i. Tabs are indexed one at a time so a missing
    # tab raises only after the earlier handlers have run.
    tab_handlers = (
        self.__overview,           # _5pwrs[0]
        self.__edu_work,           # _5pwrs[1]
        self.__places,             # _5pwrs[2]
        self.__contact_basic,      # _5pwrs[3]
        self.__all_relationships,  # _5pwrs[4]
        self.__about,              # _5pwrs[5]
        self.__year_overviews,     # _5pwrs[6]
    )
    for position, handler in enumerate(tab_handlers):
        handler(_5pwrs[position], info_dict)
    return info_dict
def __contact_basic(self, nav_contact_basic: WebElement, info_dict: dict):
    """Open the "contact and basic info" tab and record its text.

    Clicks ``nav_contact_basic``, calls ``sleep.random_sleep``, stores the
    trimmed text of ``pagelet_timeline_medley_about`` under
    ``info_dict['nav_contact_basic']`` and then adds the text of every
    optional pagelet that exists on the page.

    :param nav_contact_basic: tab element to click (``__collect`` passes
        ``_5pwrs[3]``)
    :param info_dict: dict updated in place with the scraped text
    :return: None
    """
    nav_contact_basic.click()
    sleep.random_sleep(self.__sleep)
    browser = self.__browser

    about_text = browser.find_element_by_id(
        'pagelet_timeline_medley_about').text
    # Keep everything after the first 11 lines of the pagelet text
    # (presumably header / tab labels -- TODO confirm what they contain).
    info_dict['nav_contact_basic'] = '\n'.join(about_text.split('\n')[11:])

    # Each entry pairs an element id (also used as the info_dict key) with
    # the debug message logged when that element is absent.
    optional_pagelets = (
        ('pagelet_contact', 'exception: pagelet_contact not find'),
        ('pagelet_basic', 'exception: pagelet_basic not find'),
    )
    for element_id, missing_message in optional_pagelets:
        try:
            info_dict[element_id] = browser.find_element_by_id(
                element_id).text
        except NoSuchElementException:
            self.__log.debug(missing_message)
def restart(self, cause='request max, try to restart facebook crawler'):
    """Quit the current PhantomJS browser, start a new one and log in again.

    Original author's note: this keeps PhantomJS from exhausting memory by
    caching many pages after a large number of requests; a restart is said
    to release all page memory.

    Resets the request counter to 0 and clears the login flag before the new
    browser is created.

    :param cause: message written to the debug log before restarting
    :return: None
    """
    self.__log.debug(cause)
    self.__request_num = 0
    self.__browser.quit()
    self.__login = False

    # Choose the PhantomJS constructor arguments for the current platform.
    system_name = platform.system()
    if system_name == 'Linux':
        driver_kwargs = {
            'executable_path':
                '/home/lvguangli/project/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
            'desired_capabilities': self.__desired_capabilities,
            'service_args': self.__service_args,
        }
    elif system_name == 'Darwin':
        driver_kwargs = {
            'desired_capabilities': self.__desired_capabilities,
        }
    else:
        driver_kwargs = {}
    self.__browser = webdriver.PhantomJS(**driver_kwargs)

    pause = self.__sleep * self.__double
    sleep.random_sleep(pause)
    self.login_facebook()
    sleep.random_sleep(pause)