コード例 #1
0
    def _fetch_data(self, entry_name, url):
        """Download ``url`` and scrape the entry's display name and counters.

        The page is fetched with ``urllib2``, its encoding is detected with
        ``UnicodeDammit`` and it is parsed with ``lxml.html``.  For the name,
        the visit count and the post count a primary XPath query is tried
        first and a second query is used when the first matches nothing.

        :param entry_name: name to report when the page does not provide one.
        :param url: address of the page to fetch (text or byte string).
        :returns: ``(entry_name, data)`` on success, where ``data`` is a dict
            with the keys ``num_visits``, ``num_posts``, ``num_groups``,
            ``entry_name``, ``original_entry_name`` and ``url``;
            ``(None, None)`` when the download raises ``urllib2.HTTPError``
            or ``urllib2.URLError``.
        """
        original_entry_name = entry_name

        # The URL is handed to urlopen as UTF-8 bytes.  Only text is encoded:
        # under Python 2, calling .encode() on a byte string first decodes it
        # as ASCII and raises UnicodeDecodeError for any non-ASCII URL.
        request_url = url if isinstance(url, bytes) else url.encode('utf-8')

        # Keep the try body limited to the network calls that can raise the
        # exceptions handled below.  HTTPError is a subclass of URLError, so
        # it has to be listed first to get its own log message.
        try:
            with contextlib.closing(urllib2.urlopen(request_url)) as page_source:
                page_content = page_source.read()
        except urllib2.HTTPError:
            logging.info('urllib2.HTTPError. Skip.')
            return None, None
        except urllib2.URLError:
            logging.info('urllib2.URLError. Skip.')
            return None, None

        detected = UnicodeDammit(page_content, is_html=True)
        parser = lxml.html.HTMLParser(encoding=detected.original_encoding)
        doc = lxml.html.document_fromstring(page_content, parser=parser)

        def first_text(*queries):
            # Text of the first node matched by the first query that matches
            # anything; None when every query comes back empty.
            for query in queries:
                nodes = doc.xpath(query)
                if nodes:
                    return nodes[0].text_content()
            return None

        def first_count(*queries):
            # Number parsed from the first matching node, or 0 without a match.
            text = first_text(*queries)
            if text is None:
                return 0
            return int(cogtu_misc.get_first_number_from_text(text))

        title = first_text('//a[contains(@class, "star_title_h3")]',
                           '//a[contains(@class, "card_title_fname")]')
        if title is not None:
            # Prefer the name shown on the page over the one passed in.
            entry_name = title.strip()

        data = {
            'num_visits': first_count('//span[contains(@class, "j_visit_num")]',
                                      '//span[contains(@class, "card_menNum")]'),
            'num_posts': first_count('//span[contains(@class, "j_post_num")]',
                                     '//span[contains(@class, "card_infoNum")]'),
            'num_groups': first_count("//a[contains(@class, 'star_nav_ico_group')]/span"),
            'entry_name': entry_name,
            'original_entry_name': original_entry_name,
            'url': url,
        }
        return entry_name, data
コード例 #2
0
 def fetch_info(self, keyword):
     """Collect the result counts reported for ``keyword`` by each engine.

     Stores ``keyword`` in ``self.gscraper_config['SCRAPING']['keyword']``
     and runs ``scrape_with_config`` up to ``RETRY`` times, stopping as soon
     as a non-zero count has been recorded for baidu or google.

     :param keyword: search term written into the scraper configuration.
     :returns: a ``defaultdict`` whose ``'num_results_for_query'`` entry maps
         ``'baidu'`` and ``'google'`` to integer counts (0 when no count was
         obtained), or ``None`` when ``scrape_with_config`` raises
         ``GoogleSearchError``.
     """
     self.gscraper_config['SCRAPING']['keyword'] = keyword
     info = defaultdict(dict)
     counts = info['num_results_for_query']
     counts['baidu'] = 0
     counts['google'] = 0
     for _ in range(RETRY):
         try:
             search = scrape_with_config(self.gscraper_config)
         except GoogleSearchError as e:
             logging.info(e)
             # A scraper error aborts the lookup instead of being retried;
             # callers receive None rather than a partially filled result.
             return None
         for serp in search.serps:
             text = serp.num_results_for_query
             if 'baidu' in serp.search_engine_name:
                 counts['baidu'] = int(cogtu_misc.get_first_number_from_text(text))
             elif 'google' in serp.search_engine_name:
                 counts['google'] = int(cogtu_misc.get_first_number_from_text(text))
         # Compare by value: an identity test against the literal 0 only
         # works by accident of CPython's small-integer caching.
         if counts['baidu'] != 0 or counts['google'] != 0:
             break
         logging.info('RETRYING...')
     return info