Code Example #1
    def parse(self, response):
        # article_id is a module-level counter shared across all requests.
        global article_id
        # Extract the category tags at the bottom of the page; only pages
        # tagged as actor (演员) or movie (电影) are scraped in detail.
        page_category = response.xpath(
            "//dd[@id='open-tag-item']/span[@class='taglist']/text()"
        ).extract()
        page_category = [tag.strip() for tag in page_category]
        item = BaiduBaikeItem()

        item['article_id'] = article_id
        item['articles'] = ''

        if u'演员' in page_category or u'电影' in page_category:
            print("Get a actor/movie page")
            soup = BeautifulSoup(response.text, 'lxml')
            root_node = soup.find("div",
                                  class_="main_tab main_tab-defaultTab curTab")

            # Join the text of every paragraph node into one article string.
            para_nodes = soup.find_all("div", class_="para")
            basic_item = self._get_from_findall(para_nodes)
            article_content = ' '.join(basic_item)
            article_content = article_content.replace("\n", " ")
            item['articles'] = str(article_content)
            article_id += 1
            yield item
            if article_id % 50 == 0:
                print(
                    "Total articles collected so far: {}".format(article_id))

        # Follow every in-site /item/ link so the crawl keeps expanding.
        soup = BeautifulSoup(response.text, 'lxml')
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link["href"]
            new_full_url = urlparse.urljoin('https://baike.baidu.com/',
                                            new_url)
            yield scrapy.Request(new_full_url, callback=self.parse)
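
Every example on this page calls a helper method self._get_from_findall() that is not shown here. A minimal sketch of what such a helper could look like, assuming it only needs to collect the visible text of each tag returned by find_all (the name matches the calls above; the body is an assumption, not the original project's code):

    def _get_from_findall(self, tag_list):
        # Hypothetical helper, not part of the original listing: return the
        # visible text of every bs4 tag in the result set as a list of strings.
        return [tag.get_text() for tag in tag_list]
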
Code Example #2
    def parse(self, response):
        # Ugly, but scrapy.Item fields cannot be given defaultdict-style
        # defaults, so every field is reset to None explicitly.
        item = BaiduBaikeItem()
        for sub_item in [
                'title', 'title_id', 'abstract', 'infobox', 'subject',
                'disambi', 'redirect', 'curLink', 'interPic', 'interLink',
                'exterLink', 'relateLemma'
        ]:
            item[sub_item] = None

        mainTitle = response.xpath(
            "//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract()
        subTitle = response.xpath(
            "//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract()
        redirect_name = response.xpath(
            "//span[@class='viewTip-fromTitle']/text()").extract()
        try:
            item['title'] = ' '.join(mainTitle)
        except:
            item['title'] = None
        try:
            item['disambi'] = ' '.join(mainTitle + subTitle)
        except:
            item['disambi'] = None
        try:
            item['redirect'] = ' '.join(redirect_name)
        except:
            item['redirect'] = None
        try:
            item['curLink'] = str(response.url)
        except:
            item['curLink'] = None

        soup = BeautifulSoup(response.text, 'lxml')
        summary_node = soup.find("div", class_="lemma-summary")
        try:
            item['abstract'] = summary_node.get_text().replace("\n", " ")
        except:
            item['abstract'] = None

        page_category = response.xpath(
            "//dd[@id='open-tag-item']/span[@class='taglist']/text()"
        ).extract()
        page_category = [tag.strip() for tag in page_category]
        try:
            item['subject'] = ','.join(page_category)
        except:
            item['subject'] = None

        # Get the infobox: dt nodes hold the field names, dd nodes the values.
        all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name")
        basic_item = self._get_from_findall(all_basicInfo_Item)
        basic_item = [s.strip().replace('\n', ' ') for s in basic_item]
        all_basicInfo_value = soup.find_all("dd",
                                            class_="basicInfo-item value")
        basic_value = self._get_from_findall(all_basicInfo_value)
        # Drop the '收起' (collapse) widget text that bs4 picks up.
        basic_value = [s.strip().replace(u'收起', '') for s in basic_value]
        # zip() stops at the shorter list, so mismatched dt/dd counts no
        # longer raise an IndexError.
        info_dict = dict(zip(basic_item, basic_value))
        try:
            item['infobox'] = json.dumps(info_dict)
        except:
            item['infobox'] = None

        # Get in-page picture URLs; response.xpath already wraps a Selector.
        img_path = response.xpath("//img[@class='picture']/@src").extract()
        try:
            item['interPic'] = ','.join(img_path)
        except:
            item['interPic'] = None

        # Internal links: every in-site /item/ anchor, keyed by its anchor text.
        inter_links_dict = {}
        inter_links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in inter_links:
            new_url = link["href"]
            url_name = link.get_text()
            new_full_url = urlparse.urljoin('https://baike.baidu.com/',
                                            new_url)
            inter_links_dict[url_name] = new_full_url
        try:
            item['interLink'] = json.dumps(inter_links_dict)
        except:
            item['interLink'] = None

        # External links: anchors that go through Baidu's /redirect/ endpoint.
        exter_links_dict = {}
        exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/"))
        for link in exterLink_links:
            new_url = link["href"]
            url_name = link.get_text()
            new_full_url = urlparse.urljoin('https://baike.baidu.com/',
                                            new_url)
            exter_links_dict[url_name] = new_full_url
        try:
            item['exterLink'] = json.dumps(exter_links_dict)
        except:
            item['exterLink'] = None

        all_para = soup.find_all('div', class_="para")
        all_text = [para.get_text() for para in all_para]
        try:
            item['all_text'] = ' '.join(all_text)
        except:
            item['all_text'] = None

        yield item

        # Follow every in-site /item/ link so the crawl keeps expanding.
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link["href"]
            new_full_url = urlparse.urljoin('https://baike.baidu.com/',
                                            new_url)
            yield scrapy.Request(new_full_url, callback=self.parse)
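
The BaiduBaikeItem class these examples fill in is defined in the project's items.py and is not reproduced on this page. A minimal, hypothetical sketch that would cover the fields assigned in Code Example #2 (field names are taken from the parse() method above; the real definition may differ):

import scrapy


class BaiduBaikeItem(scrapy.Item):
    # Hypothetical items.py sketch. scrapy.Item raises KeyError for any
    # undeclared field, so every key assigned in parse() must be declared here.
    title = scrapy.Field()
    title_id = scrapy.Field()
    abstract = scrapy.Field()
    infobox = scrapy.Field()
    subject = scrapy.Field()
    disambi = scrapy.Field()
    redirect = scrapy.Field()
    curLink = scrapy.Field()
    interPic = scrapy.Field()
    interLink = scrapy.Field()
    exterLink = scrapy.Field()
    relateLemma = scrapy.Field()
    all_text = scrapy.Field()
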
Code Example #3
File: baidu_baike.py  Project: JimyFengqi/baidubaike
    def parse(self, response):
        print(response.url)
        title = response.xpath('//head/title/text()').extract_first()
        # format() avoids a TypeError when the page has no <title> and
        # extract_first() returns None.
        print('title is {}'.format(title))
        item = BaiduBaikeItem()
        return item
Code Example #4
    def parse(self, response):
        # Parse the category tags at the bottom of the page; only pages tagged
        # 演员 (actor) or 电影 (movie) are scraped, everything else is skipped.
        page_category = response.xpath(
            "//dd[@id='open-tag-item']/span[@class='taglist']/text()"
        ).extract()
        page_category = [tag.strip() for tag in page_category]
        item = BaiduBaikeItem()

        # Ugly, but scrapy.Item fields cannot be given defaultdict-style
        # defaults, so every field is reset to None explicitly.
        for sub_item in [
                'actor_bio', 'actor_chName', 'actor_foreName',
                'actor_nationality', 'actor_constellation', 'actor_birthPlace',
                'actor_birthDay', 'actor_repWorks', 'actor_achiem',
                'actor_brokerage', 'movie_bio', 'movie_chName',
                'movie_foreName', 'movie_prodTime', 'movie_prodCompany',
                'movie_director', 'movie_screenwriter', 'movie_genre',
                'movie_star', 'movie_length', 'movie_rekeaseTime',
                'movie_language', 'movie_achiem'
        ]:
            item[sub_item] = None

        # A page tagged 演员 (actor) is treated as an actor page.
        if u'演员' in page_category:
            print("Get an actor page")
            soup = BeautifulSoup(response.text, 'lxml')
            summary_node = soup.find("div", class_="lemma-summary")
            item['actor_bio'] = summary_node.get_text().replace("\n", " ")

            # Use bs4 to extract the basic-info box fields and store them
            # in the corresponding item fields.
            all_basicInfo_Item = soup.find_all("dt",
                                               class_="basicInfo-item name")
            basic_item = self._get_from_findall(all_basicInfo_Item)
            basic_item = [s.strip() for s in basic_item]
            all_basicInfo_value = soup.find_all("dd",
                                                class_="basicInfo-item value")
            basic_value = self._get_from_findall(all_basicInfo_value)
            basic_value = [s.strip() for s in basic_value]
            for i, info in enumerate(basic_item):
                info = info.replace(u"\xa0", "")
                if info == u'中文名':
                    item['actor_chName'] = basic_value[i]
                elif info == u'外文名':
                    item['actor_foreName'] = basic_value[i]
                elif info == u'国籍':
                    item['actor_nationality'] = basic_value[i]
                elif info == u'星座':
                    item['actor_constellation'] = basic_value[i]
                elif info == u'出生地':
                    item['actor_birthPlace'] = basic_value[i]
                elif info == u'出生日期':
                    item['actor_birthDay'] = basic_value[i]
                elif info == u'代表作品':
                    item['actor_repWorks'] = basic_value[i]
                elif info == u'主要成就':
                    item['actor_achiem'] = basic_value[i]
                elif info == u'经纪公司':
                    item['actor_brokerage'] = basic_value[i]
            yield item
        elif u'电影' in page_category:
            print("Get a movie page!!")

            # Use bs4 to extract the movie summary and basic-info box fields.
            soup = BeautifulSoup(response.text, 'lxml')
            summary_node = soup.find("div", class_="lemma-summary")
            item['movie_bio'] = summary_node.get_text().replace("\n", " ")
            all_basicInfo_Item = soup.find_all("dt",
                                               class_="basicInfo-item name")
            basic_item = self._get_from_findall(all_basicInfo_Item)
            basic_item = [s.strip() for s in basic_item]
            all_basicInfo_value = soup.find_all("dd",
                                                class_="basicInfo-item value")
            basic_value = self._get_from_findall(all_basicInfo_value)
            basic_value = [s.strip() for s in basic_value]
            for i, info in enumerate(basic_item):
                info = info.replace(u"\xa0", "")
                if info == u'中文名':
                    item['movie_chName'] = basic_value[i]
                elif info == u'外文名':
                    item['movie_foreName'] = basic_value[i]
                elif info == u'出品时间':
                    item['movie_prodTime'] = basic_value[i]
                elif info == u'出品公司':
                    item['movie_prodCompany'] = basic_value[i]
                elif info == u'导演':
                    item['movie_director'] = basic_value[i]
                elif info == u'编剧':
                    item['movie_screenwriter'] = basic_value[i]
                elif info == u'类型':
                    item['movie_genre'] = basic_value[i]
                elif info == u'主演':
                    item['movie_star'] = basic_value[i]
                elif info == u'片长':
                    item['movie_length'] = basic_value[i]
                elif info == u'上映时间':
                    item['movie_rekeaseTime'] = basic_value[i]
                elif info == u'对白语言':
                    item['movie_language'] = basic_value[i]
                elif info == u'主要成就':
                    item['movie_achiem'] = basic_value[i]
            yield item

        # Extract every in-site /item/ link with bs4 and crawl it recursively.
        soup = BeautifulSoup(response.text, 'lxml')
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link["href"]
            new_full_url = urllib.parse.urljoin('https://baike.baidu.com/',
                                                new_url)
            yield scrapy.Request(new_full_url, callback=self.parse)
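
These parse() methods all live inside a Scrapy spider class that the page does not show. A minimal, hypothetical skeleton of how such a spider could be wired up (the class name, seed URL, and project layout below are assumptions for illustration, not taken from the original projects):

# Imports needed by the parse() bodies shown above.
import json
import re
import urllib.parse

import scrapy
from bs4 import BeautifulSoup

from ..items import BaiduBaikeItem  # assumed project layout


class BaiduBaikeSpider(scrapy.Spider):
    name = 'baidu_baike'
    allowed_domains = ['baike.baidu.com']
    # Any encyclopedia entry page works as a seed; this URL is only an example.
    start_urls = ['https://baike.baidu.com/item/%E5%91%A8%E6%98%9F%E9%A9%B0']

    def _get_from_findall(self, tag_list):
        # See the helper sketched after Code Example #1.
        return [tag.get_text() for tag in tag_list]

    def parse(self, response):
        # One of the parse() implementations shown above goes here.
        ...

The crawl is driven entirely by parse() yielding new scrapy.Request objects for every /item/ link it finds, so no other callbacks are needed.
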