def parse(self, response):
    global article_id
    # Extract the tag list at the bottom of the page to decide whether this lemma is worth scraping
    page_category = response.xpath(
        "//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract()
    page_category = [l.strip() for l in page_category]
    item = BaiduBaikeItem()
    item['article_id'] = article_id
    item['articles'] = ''

    if u'演员' in page_category or u'电影' in page_category:
        print("Get an actor/movie page")
        soup = BeautifulSoup(response.text, 'lxml')
        # Main content container (currently unused) and every paragraph node of the article
        root_node = soup.find("div", class_="main_tab main_tab-defaultTab curTab")
        para_nodes = soup.find_all("div", class_="para")
        basic_item = self._get_from_findall(para_nodes)
        article_content = ' '.join(basic_item)
        article_content = article_content.replace("\n", " ")
        item['articles'] = str(article_content)
        article_id += 1
        yield item
        if article_id % 50 == 0:
            print("The number of articles crawled so far: {}".format(article_id))

    # Follow every in-site /item/ link and crawl it recursively
    soup = BeautifulSoup(response.text, 'lxml')
    links = soup.find_all('a', href=re.compile(r"/item/"))
    for link in links:
        new_url = link["href"]
        new_full_url = urlparse.urljoin('https://baike.baidu.com/', new_url)
        yield scrapy.Request(new_full_url, callback=self.parse)
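# The parse methods in this file call a helper self._get_from_findall that is not
# shown here. A minimal sketch, assuming it only collects the visible text of each
# bs4 node returned by find_all (name and behaviour inferred from how it is called):
def _get_from_findall(self, tag_list):
    # Collect the text content of every matched tag into a list of strings
    result = []
    for tag in tag_list:
        result.append(tag.get_text())
    return result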
def parse(self, response):
    # Initialise every field explicitly; scrapy.Item does not behave like a defaultdict
    item = BaiduBaikeItem()
    for sub_item in [
            'title', 'title_id', 'abstract', 'infobox', 'subject', 'disambi',
            'redirect', 'curLink', 'interPic', 'interLink', 'exterLink',
            'relateLemma'
    ]:
        item[sub_item] = None

    # Lemma title, subtitle and redirect name
    mainTitle = response.xpath(
        "//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract()
    subTitle = response.xpath(
        "//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract()
    redirect_name = response.xpath(
        "//span[@class='viewTip-fromTitle']/text()").extract()
    try:
        item['title'] = ' '.join(mainTitle)
    except:
        item['title'] = None
    try:
        item['disambi'] = ' '.join(mainTitle + subTitle)
    except:
        item['disambi'] = None
    try:
        item['redirect'] = ' '.join(redirect_name)
    except:
        item['redirect'] = None
    try:
        item['curLink'] = str(response.url)
    except:
        item['curLink'] = None

    # Abstract (lemma summary)
    soup = BeautifulSoup(response.text, 'lxml')
    summary_node = soup.find("div", class_="lemma-summary")
    try:
        item['abstract'] = summary_node.get_text().replace("\n", " ")
    except:
        item['abstract'] = None

    # Page tags (subject)
    page_category = response.xpath(
        "//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract()
    page_category = [l.strip() for l in page_category]
    try:
        item['subject'] = ','.join(page_category)
    except:
        item['subject'] = None

    # Get infobox: pair each attribute name with its value
    all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name")
    basic_item = self._get_from_findall(all_basicInfo_Item)
    basic_item = [s.strip().replace('\n', ' ') for s in basic_item]
    all_basicInfo_value = soup.find_all("dd", class_="basicInfo-item value")
    basic_value = self._get_from_findall(all_basicInfo_value)
    basic_value = [s.strip().replace(u'收起', '') for s in basic_value]
    info_dict = {}
    # zip avoids an IndexError when the name and value lists differ in length
    for info, value in zip(basic_item, basic_value):
        info_dict[info] = value
    try:
        item['infobox'] = json.dumps(info_dict)
    except:
        item['infobox'] = None

    # Get in-page pictures
    selector = scrapy.Selector(response)
    img_path = selector.xpath("//img[@class='picture']/@src").extract()
    try:
        item['interPic'] = ','.join(img_path)
    except:
        item['interPic'] = None

    # Internal links (/item/) as a name -> url dict
    inter_links_dict = {}
    inter_links = soup.find_all('a', href=re.compile(r"/item/"))
    for link in inter_links:
        new_url = link["href"]
        url_name = link.get_text()
        new_full_url = urlparse.urljoin('https://baike.baidu.com/', new_url)
        inter_links_dict[url_name] = new_full_url
    try:
        item['interLink'] = json.dumps(inter_links_dict)
    except:
        item['interLink'] = None

    # External links (/redirect/) as a name -> url dict
    exter_links_dict = {}
    exterLink_links = soup.find_all('a', href=re.compile(r"/redirect/"))
    for link in exterLink_links:
        new_url = link["href"]
        url_name = link.get_text()
        new_full_url = urlparse.urljoin('https://baike.baidu.com/', new_url)
        exter_links_dict[url_name] = new_full_url
    try:
        item['exterLink'] = json.dumps(exter_links_dict)
    except:
        item['exterLink'] = None

    # Full article text from every paragraph node
    all_para = soup.find_all('div', class_="para")
    all_text = [para.get_text() for para in all_para]
    try:
        item['all_text'] = ' '.join(all_text)
    except:
        item['all_text'] = None

    yield item

    # Follow every in-site /item/ link and crawl it recursively
    links = soup.find_all('a', href=re.compile(r"/item/"))
    for link in links:
        new_url = link["href"]
        new_full_url = urlparse.urljoin('https://baike.baidu.com/', new_url)
        yield scrapy.Request(new_full_url, callback=self.parse)
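# A sketch of the BaiduBaikeItem definition this parse method assumes, built only
# from the keys it assigns (note that 'all_text' must also be declared even though
# the initialisation loop above does not list it); the field names come from the
# code, the declaration itself is an assumption:
import scrapy

class BaiduBaikeItem(scrapy.Item):
    title = scrapy.Field()
    title_id = scrapy.Field()
    abstract = scrapy.Field()
    infobox = scrapy.Field()
    subject = scrapy.Field()
    disambi = scrapy.Field()
    redirect = scrapy.Field()
    curLink = scrapy.Field()
    interPic = scrapy.Field()
    interLink = scrapy.Field()
    exterLink = scrapy.Field()
    relateLemma = scrapy.Field()
    all_text = scrapy.Field()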
def parse(self, response):
    # Debug version: print the URL and page title, then emit an empty item
    print(response.url)
    title = response.xpath('//head/title/text()').extract_first()
    print('title is {}'.format(title))
    item = BaiduBaikeItem()
    return item
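# A minimal spider skeleton the debug parse above could be dropped into; the spider
# name and seed URL below are hypothetical placeholders, only the allowed domain and
# the callback wiring are taken from the code in this file:
import scrapy

class BaikeSpider(scrapy.Spider):
    name = 'baidu_baike'                     # hypothetical spider name
    allowed_domains = ['baike.baidu.com']
    start_urls = ['https://baike.baidu.com/item/%E7%94%B5%E5%BD%B1']  # hypothetical seed lemma

    def parse(self, response):
        # body: the debug parse method shown above
        pass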
def parse(self, response):
    # Parse the tag list at the bottom of the page: only pages tagged as
    # actor or movie are scraped, everything else is skipped
    page_category = response.xpath(
        "//dd[@id='open-tag-item']/span[@class='taglist']/text()").extract()
    page_category = [l.strip() for l in page_category]

    # Initialise every field explicitly; scrapy.Item does not behave like a defaultdict
    item = BaiduBaikeItem()
    for sub_item in [
            'actor_bio', 'actor_chName', 'actor_foreName', 'actor_nationality',
            'actor_constellation', 'actor_birthPlace', 'actor_birthDay',
            'actor_repWorks', 'actor_achiem', 'actor_brokerage',
            'movie_bio', 'movie_chName', 'movie_foreName', 'movie_prodTime',
            'movie_prodCompany', 'movie_director', 'movie_screenwriter',
            'movie_genre', 'movie_star', 'movie_length', 'movie_rekeaseTime',
            'movie_language', 'movie_achiem'
    ]:
        item[sub_item] = None

    # A page tagged "演员" (actor) is treated as an actor page
    if u'演员' in page_category:
        print("Get an actor page")
        soup = BeautifulSoup(response.text, 'lxml')
        summary_node = soup.find("div", class_="lemma-summary")
        item['actor_bio'] = summary_node.get_text().replace("\n", " ")

        # Use bs4 to extract the infobox and store each field in the item
        all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name")
        basic_item = self._get_from_findall(all_basicInfo_Item)
        basic_item = [s.strip() for s in basic_item]
        all_basicInfo_value = soup.find_all("dd", class_="basicInfo-item value")
        basic_value = self._get_from_findall(all_basicInfo_value)
        basic_value = [s.strip() for s in basic_value]
        for i, info in enumerate(basic_item):
            info = info.replace(u"\xa0", "")
            if info == u'中文名':
                item['actor_chName'] = basic_value[i]
            elif info == u'外文名':
                item['actor_foreName'] = basic_value[i]
            elif info == u'国籍':
                item['actor_nationality'] = basic_value[i]
            elif info == u'星座':
                item['actor_constellation'] = basic_value[i]
            elif info == u'出生地':
                item['actor_birthPlace'] = basic_value[i]
            elif info == u'出生日期':
                item['actor_birthDay'] = basic_value[i]
            elif info == u'代表作品':
                item['actor_repWorks'] = basic_value[i]
            elif info == u'主要成就':
                item['actor_achiem'] = basic_value[i]
            elif info == u'经纪公司':
                item['actor_brokerage'] = basic_value[i]
        yield item

    # A page tagged "电影" (movie) is treated as a movie page
    elif u'电影' in page_category:
        print("Get a movie page!!")
        # Extract the movie summary and infobox with bs4, same as the actor branch
        soup = BeautifulSoup(response.text, 'lxml')
        summary_node = soup.find("div", class_="lemma-summary")
        item['movie_bio'] = summary_node.get_text().replace("\n", " ")
        all_basicInfo_Item = soup.find_all("dt", class_="basicInfo-item name")
        basic_item = self._get_from_findall(all_basicInfo_Item)
        basic_item = [s.strip() for s in basic_item]
        all_basicInfo_value = soup.find_all("dd", class_="basicInfo-item value")
        basic_value = self._get_from_findall(all_basicInfo_value)
        basic_value = [s.strip() for s in basic_value]
        for i, info in enumerate(basic_item):
            info = info.replace(u"\xa0", "")
            if info == u'中文名':
                item['movie_chName'] = basic_value[i]
            elif info == u'外文名':
                item['movie_foreName'] = basic_value[i]
            elif info == u'出品时间':
                item['movie_prodTime'] = basic_value[i]
            elif info == u'出品公司':
                item['movie_prodCompany'] = basic_value[i]
            elif info == u'导演':
                item['movie_director'] = basic_value[i]
            elif info == u'编剧':
                item['movie_screenwriter'] = basic_value[i]
            elif info == u'类型':
                item['movie_genre'] = basic_value[i]
            elif info == u'主演':
                item['movie_star'] = basic_value[i]
            elif info == u'片长':
                item['movie_length'] = basic_value[i]
            elif info == u'上映时间':
                item['movie_rekeaseTime'] = basic_value[i]
            elif info == u'对白语言':
                item['movie_language'] = basic_value[i]
            elif info == u'主要成就':
                item['movie_achiem'] = basic_value[i]
        yield item

    # Follow every in-site /item/ link and crawl it recursively
    soup = BeautifulSoup(response.text, 'lxml')
    links = soup.find_all('a', href=re.compile(r"/item/"))
    for link in links:
        new_url = link["href"]
        new_full_url = urllib.parse.urljoin('https://baike.baidu.com/', new_url)
        yield scrapy.Request(new_full_url, callback=self.parse)
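# The long if/elif chains above can be collapsed into lookup tables; a sketch of the
# same mapping as an alternative (field names unchanged, including the original
# 'movie_rekeaseTime' spelling; infobox keys not listed in a map are simply skipped):
ACTOR_FIELD_MAP = {
    u'中文名': 'actor_chName',
    u'外文名': 'actor_foreName',
    u'国籍': 'actor_nationality',
    u'星座': 'actor_constellation',
    u'出生地': 'actor_birthPlace',
    u'出生日期': 'actor_birthDay',
    u'代表作品': 'actor_repWorks',
    u'主要成就': 'actor_achiem',
    u'经纪公司': 'actor_brokerage',
}

MOVIE_FIELD_MAP = {
    u'中文名': 'movie_chName',
    u'外文名': 'movie_foreName',
    u'出品时间': 'movie_prodTime',
    u'出品公司': 'movie_prodCompany',
    u'导演': 'movie_director',
    u'编剧': 'movie_screenwriter',
    u'类型': 'movie_genre',
    u'主演': 'movie_star',
    u'片长': 'movie_length',
    u'上映时间': 'movie_rekeaseTime',
    u'对白语言': 'movie_language',
    u'主要成就': 'movie_achiem',
}

def _fill_item(item, field_map, basic_item, basic_value):
    # Map each infobox attribute name to its item field; unknown keys are ignored
    for info, value in zip(basic_item, basic_value):
        field = field_map.get(info.replace(u'\xa0', ''))
        if field is not None:
            item[field] = value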