コード例 #1
0
 def __places(self, nav_places: WebElement, info_dict: dict):
     """Open the "places" tab of the about section and record its text.

     Clicks ``nav_places``, pauses via ``sleep.random_sleep`` and stores the
     text of the ``pagelet_timeline_medley_about`` element, minus its first
     11 lines, under ``'nav_places'``. Each of ``pagelet_hometown``,
     ``current_city`` and ``hometown`` is then looked up by id; when found,
     its text is stored under a key equal to the id, otherwise a debug
     message is logged.

     :param nav_places: tab element to click (``_5pwrs[2]`` in the caller)
     :param info_dict: result dict, filled in place
     """
     nav_places.click()
     sleep.random_sleep(self.__sleep)
     about_panel = self.__browser.find_element_by_id(
         'pagelet_timeline_medley_about')
     # NOTE(review): the first 11 lines are presumably page/navigation
     # header text that is not part of the tab content -- confirm.
     info_dict['nav_places'] = '\n'.join(about_panel.text.split('\n')[11:])
     for element_id in ('pagelet_hometown', 'current_city', 'hometown'):
         try:
             found = self.__browser.find_element_by_id(element_id)
             info_dict[element_id] = found.text
         except NoSuchElementException:
             # A missing section is not an error; the key is just left out.
             self.__log.debug('exception: ' + element_id + ' not find')
コード例 #2
0
 def __about(self, nav_about: WebElement, info_dict: dict):
     """Open the "about" (details) tab and record its text.

     Clicks ``nav_about``, pauses via ``sleep.random_sleep`` and stores the
     text of the ``pagelet_timeline_medley_about`` element, minus its first
     11 lines, under ``'nav_about'``. The elements ``pagelet_bio``,
     ``pagelet_pronounce``, ``pagelet_nicknames`` and ``pagelet_quotes`` are
     then looked up by id; each one that exists has its text stored under
     a key equal to its id, each missing one is logged at debug level.

     :param nav_about: tab element to click (``_5pwrs[5]`` in the caller)
     :param info_dict: result dict, filled in place
     """
     nav_about.click()
     sleep.random_sleep(self.__sleep)
     about_panel = self.__browser.find_element_by_id(
         'pagelet_timeline_medley_about')
     # NOTE(review): the first 11 lines are presumably page/navigation
     # header text that is not part of the tab content -- confirm.
     info_dict['nav_about'] = '\n'.join(about_panel.text.split('\n')[11:])
     section_ids = (
         'pagelet_bio',
         'pagelet_pronounce',
         'pagelet_nicknames',
         'pagelet_quotes',
     )
     for element_id in section_ids:
         try:
             found = self.__browser.find_element_by_id(element_id)
             info_dict[element_id] = found.text
         except NoSuchElementException:
             # A missing section is not an error; the key is just left out.
             self.__log.debug('exception: ' + element_id + ' not find')
コード例 #3
0
 def __overview(self, nav_overview: WebElement, info_dict: dict):
     """Record the text of the "overview" tab of the about section.

     ``nav_overview`` is accepted for symmetry with the other tab handlers
     but is not clicked: the overview tab is the page shown by default.
     After a pause via ``sleep.random_sleep`` the text of the
     ``pagelet_timeline_medley_about`` element, minus its first 11 lines,
     is stored under ``'nav_overview'``.

     :param nav_overview: tab element (``_5pwrs[0]`` in the caller), unused
     :param info_dict: result dict, filled in place
     """
     # Overview is the default page, so no click event is needed here.
     sleep.random_sleep(self.__sleep)
     # Collect the overview information.
     about_panel = self.__browser.find_element_by_id(
         'pagelet_timeline_medley_about')
     panel_lines = about_panel.text.split('\n')
     # NOTE(review): the first 11 lines are presumably page/navigation
     # header text that is not part of the tab content -- confirm.
     info_dict['nav_overview'] = '\n'.join(panel_lines[11:])
コード例 #4
0
 def open_homepage_by_param(self, name: str, city: str, country_code: str):
     """Search for a person by name and scrape the unique match's profile.

     The people-search page is opened for ``name`` and the results are
     filtered with ``__find_user_element``: first by ``name``; if more than
     one result remains, by the country mapped from ``country_code`` (when
     that code is known to ``countrycode.country_code``); and finally by
     ``city``. Only when a filter step leaves exactly one result is that
     result clicked and its profile collected.

     The request counter is incremented on every call and the crawler is
     restarted once it reaches ``self.__request_max``.

     :param name: person name, URL-quoted into the search query
     :param city: city string used as the last filter
     :param country_code: key into ``countrycode.country_code``
     :return: dict with ``'homepage'`` plus the collected fields, or an
         empty dict when no unique match was found
     """
     url = 'https://www.facebook.com/search/people/?' + ('q=' + quote(name))
     self.__request_num += 1
     if self.__request_num >= self.__request_max:
         self.restart()
     info_dict = dict()
     # Open the search page.
     self.__log.debug("search url:" + url)
     self.__browser.get(url)
     sleep.random_sleep(self.__sleep)
     results_root = self.__browser.find_element_by_id(
         'BrowseResultsContainer')
     browse_results_container = results_root.find_elements_by_class_name(
         "_4p2o")
     self.__log.debug("len(browse_results_container)=" +
                      str(len(browse_results_container)))
     containers = self.__find_user_element(
         browse_results_container=browse_results_container, filter=name)
     self.__log.debug("len(containers)=" + str(len(containers)))

     # ``matches`` ends up holding the single accepted result container,
     # or stays empty when no filter step produced a unique hit.
     matches = []
     if len(containers) == 1:
         matches = containers
     elif len(containers) > 1:
         if country_code in countrycode.country_code:
             by_country = self.__find_user_element(
                 browse_results_container=containers,
                 filter=countrycode.country_code[country_code])
             if len(by_country) == 1:
                 matches = by_country
             elif len(by_country) > 1:
                 # Still ambiguous: let the city filter work on the
                 # country-narrowed list.
                 containers = by_country
         if len(matches) != 1:
             by_city = self.__find_user_element(
                 browse_results_container=containers, filter=city)
             if len(by_city) == 1:
                 matches = by_city
     if len(matches) != 1:
         return info_dict

     may_user_element = matches[0].find_element_by_class_name('_32mo')
     may_user_element.click()
     sleep.random_sleep(self.__sleep)
     info_dict['homepage'] = self.__browser.current_url
     self.__log.debug('homepage=' + self.__browser.current_url)
     return self.__collect(info_dict)
コード例 #5
0
 def open_homepage_by_url(self, user_homepage: str):
     """
     Scrape the profile ("about") information of a user's home page.

     The request counter is incremented first and the crawler is restarted
     once it reaches ``self.__request_max``; then the page is loaded and
     handed to ``__collect``.

     :param user_homepage: URL of the user's home page
     :return: dict with the collected fields
     """
     self.__request_num += 1
     if self.__request_num >= self.__request_max:
         self.restart()
     # Open the home page.
     self.__browser.get(user_homepage)
     sleep.random_sleep(self.__sleep)
     return self.__collect(dict())
コード例 #6
0
 def __edu_work(self, nav_edu_work: WebElement, info_dict: dict):
     """Open the "work and education" tab and record its text.

     Clicks ``nav_edu_work``, pauses via ``sleep.random_sleep`` and stores
     the text of the ``pagelet_timeline_medley_about`` element, minus its
     first 11 lines, under ``'nav_edu_work'``. If an element with id
     ``pagelet_eduwork`` exists its text is stored under that key as well;
     otherwise a debug message is logged.

     :param nav_edu_work: tab element to click (``_5pwrs[1]`` in the caller)
     :param info_dict: result dict, filled in place
     """
     nav_edu_work.click()
     sleep.random_sleep(self.__sleep)
     about_panel = self.__browser.find_element_by_id(
         'pagelet_timeline_medley_about')
     # NOTE(review): the first 11 lines are presumably page/navigation
     # header text that is not part of the tab content -- confirm.
     tab_lines = about_panel.text.split('\n')[11:]
     info_dict['nav_edu_work'] = '\n'.join(tab_lines)
     try:
         info_dict['pagelet_eduwork'] = self.__browser.find_element_by_id(
             'pagelet_eduwork').text
     except NoSuchElementException:
         # A missing section is not an error; the key is just left out.
         self.__log.debug('exception: pagelet_eduwork not find')
コード例 #7
0
 def __all_relationships(self, nav_all_relationships: WebElement,
                         info_dict: dict):
     """Open the "relationships" tab and record its text.

     Clicks ``nav_all_relationships``, pauses via ``sleep.random_sleep`` and
     stores the text of the ``pagelet_timeline_medley_about`` element,
     minus its first 11 lines, under ``'nav_all_relationships'``. If an
     element with id ``pagelet_relationships`` exists its text is stored
     under that key as well; otherwise a debug message is logged.

     :param nav_all_relationships: tab element to click (``_5pwrs[4]`` in
         the caller)
     :param info_dict: result dict, filled in place
     """
     nav_all_relationships.click()
     sleep.random_sleep(self.__sleep)
     about_panel = self.__browser.find_element_by_id(
         'pagelet_timeline_medley_about')
     # NOTE(review): the first 11 lines are presumably page/navigation
     # header text that is not part of the tab content -- confirm.
     tab_lines = about_panel.text.split('\n')[11:]
     info_dict['nav_all_relationships'] = '\n'.join(tab_lines)
     try:
         info_dict['pagelet_relationships'] = \
             self.__browser.find_element_by_id('pagelet_relationships').text
     except NoSuchElementException:
         # A missing section is not an error; the key is just left out.
         self.__log.debug('exception: pagelet_relationships not find')
コード例 #8
0
 def __year_overviews(self, nav_year_overviews: WebElement,
                      info_dict: dict):
     """Open the "life events" (year overviews) tab and record its text.

     Clicks ``nav_year_overviews``, pauses via ``sleep.random_sleep`` and
     stores the text of the ``pagelet_timeline_medley_about`` element,
     minus its first 11 lines, under ``'nav_year_overviews'``. If an
     element with class ``fbProfileEditExperiences`` exists its text is
     stored under that key as well; otherwise a debug message is logged
     and the key is left out.

     :param nav_year_overviews: tab element to click (``_5pwrs[6]`` in the
         caller)
     :param info_dict: result dict, filled in place
     """
     nav_year_overviews.click()
     sleep.random_sleep(self.__sleep)
     pagelet_timeline_medley_about = self.__browser.find_element_by_id(
         'pagelet_timeline_medley_about')
     lines = pagelet_timeline_medley_about.text.split('\n')
     # NOTE(review): the first 11 lines are presumably page/navigation
     # header text that is not part of the tab content -- confirm.
     info_dict['nav_year_overviews'] = '\n'.join(lines[11:])
     try:
         # The lookup itself is what raises NoSuchElementException, so it
         # has to sit inside the try block for the handler to be reachable.
         fb_profile_edit_experiences = \
             self.__browser.find_element_by_class_name(
                 'fbProfileEditExperiences')
         info_dict[
             'fbProfileEditExperiences'] = fb_profile_edit_experiences.text
     except NoSuchElementException:
         self.__log.debug('exception: fbProfileEditExperiences not find')
コード例 #9
0
 def login_facebook(self):
     """
     Log in to Facebook through the login form.

     Loads ``self.__facebook_homepage``, clears the ``email`` and ``pass``
     inputs, types the stored user name and password into them, clicks
     ``loginbutton`` and finally sets the logged-in flag. The flag is set
     unconditionally; the outcome of the login is not checked here.

     :return: None
     """
     self.__browser.get(self.__facebook_homepage)
     sleep.random_sleep(self.__sleep * self.__double)
     credentials = (
         ('email', self.__user_name),
         ('pass', self.__password),
     )
     # Clear both inputs before typing into either of them.
     for field_id, _ in credentials:
         self.__browser.find_element_by_id(field_id).clear()
     for field_id, value in credentials:
         self.__browser.find_element_by_id(field_id).send_keys(value)
     self.__browser.find_element_by_id('loginbutton').click()
     sleep.random_sleep(self.__sleep * self.__double)
     self.__login = True
コード例 #10
0
 def __collect(self, info_dict: dict):
     """Scrape the currently loaded profile page into ``info_dict``.

     First stores the text of the ``intro_container_id`` element when it
     exists. Then opens the about section by clicking the second ``_6-6``
     element inside ``fbTimelineHeadline`` and walks the ``_5pwr`` tab
     elements in index order 0..6, handing each one to its tab handler.

     :param info_dict: result dict, filled in place
     :return: the same ``info_dict``
     :raises IndexError: if fewer ``_6-6`` or ``_5pwr`` elements are found
         than are indexed here
     """
     # Collect the intro block of the home page.
     try:
         info_dict['intro_container_id'] = self.__browser.find_element_by_id(
             'intro_container_id').text
     except NoSuchElementException:
         self.__log.debug('this user may not have intro_container_id')
     # Locate the about entry in the timeline headline and open it.
     headline = self.__browser.find_element_by_id('fbTimelineHeadline')
     about = headline.find_elements_by_class_name('_6-6')[1]
     about.click()
     sleep.random_sleep(self.__sleep * self.__double)
     tabs = self.__browser.find_elements_by_class_name('_5pwr')
     # One (handler, keyword name) pair per tab; the position in this
     # tuple is the index of the tab element in ``tabs``.
     tab_handlers = (
         (self.__overview, 'nav_overview'),
         (self.__edu_work, 'nav_edu_work'),
         (self.__places, 'nav_places'),
         (self.__contact_basic, 'nav_contact_basic'),
         (self.__all_relationships, 'nav_all_relationships'),
         (self.__about, 'nav_about'),
         (self.__year_overviews, 'nav_year_overviews'),
     )
     for position, (handler, keyword) in enumerate(tab_handlers):
         handler(**{keyword: tabs[position], 'info_dict': info_dict})
     return info_dict
コード例 #11
0
 def __contact_basic(self, nav_contact_basic: WebElement, info_dict: dict):
     """Open the "contact and basic info" tab and record its text.

     Clicks ``nav_contact_basic``, pauses via ``sleep.random_sleep`` and
     stores the text of the ``pagelet_timeline_medley_about`` element,
     minus its first 11 lines, under ``'nav_contact_basic'``. The elements
     ``pagelet_contact`` and ``pagelet_basic`` are then looked up by id;
     each one that exists has its text stored under a key equal to its id,
     each missing one is logged at debug level.

     :param nav_contact_basic: tab element to click (``_5pwrs[3]`` in the
         caller)
     :param info_dict: result dict, filled in place
     """
     nav_contact_basic.click()
     sleep.random_sleep(self.__sleep)
     about_panel = self.__browser.find_element_by_id(
         'pagelet_timeline_medley_about')
     # NOTE(review): the first 11 lines are presumably page/navigation
     # header text that is not part of the tab content -- confirm.
     info_dict['nav_contact_basic'] = '\n'.join(
         about_panel.text.split('\n')[11:])
     for element_id in ('pagelet_contact', 'pagelet_basic'):
         try:
             found = self.__browser.find_element_by_id(element_id)
             info_dict[element_id] = found.text
         except NoSuchElementException:
             # A missing section is not an error; the key is just left out.
             self.__log.debug('exception: ' + element_id + ' not find')
コード例 #12
0
 def restart(self, cause='request max, try to restart facebook crawler'):
     """
     Quit the current browser, start a fresh PhantomJS and log in again.

     Purpose (translated from the original note): keeps PhantomJS from
     exhausting memory by caching a large number of pages after too many
     requests; every restart releases the memory of all pages.

     The request counter is reset to 0 and the logged-in flag is cleared
     before the new driver is created. The driver arguments depend on
     ``platform.system()``: a fixed executable path plus capabilities and
     service args on Linux, capabilities only on Darwin, defaults
     elsewhere.

     :param cause: message written to the debug log
     :return: None
     """
     self.__log.debug(cause)
     self.__request_num = 0
     self.__browser.quit()
     self.__login = False
     system_name = platform.system()
     if system_name == 'Linux':
         driver_kwargs = {
             'executable_path':
             '/home/lvguangli/project/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
             'desired_capabilities': self.__desired_capabilities,
             'service_args': self.__service_args,
         }
     elif system_name == 'Darwin':
         driver_kwargs = {
             'desired_capabilities': self.__desired_capabilities,
         }
     else:
         driver_kwargs = {}
     self.__browser = webdriver.PhantomJS(**driver_kwargs)
     sleep.random_sleep(self.__sleep * self.__double)
     self.login_facebook()
     sleep.random_sleep(self.__sleep * self.__double)