Example #1
 def parse_result_page(self, result):
     print("正在解析360页面......................")
     res_list = result.find_all("li", attrs={'class': 'res-list'})
     print("找到解析网站词条数目: ", len(res_list))
     for res_list_item in res_list:
         res_list_h3 = res_list_item.find("h3")
         if res_list_h3 is not None:
             self.page_index += 1
             craw_item = CrawlerItem()
             setattr(craw_item, 'search', "PC 360")
             setattr(craw_item, 'keyword', self.keyword)
             setattr(craw_item, 'index', self.page_index)
             setattr(craw_item, 'page', str(self.cur_parse_page))
             setattr(craw_item, 'relate_search', 0)
             res_list_h3_a = res_list_h3.find("a")
             if res_list_h3_a is not None:
                 setattr(craw_item, 'title',
                         res_list_h3_a.get_text().replace("\n", ""))
                 setattr(craw_item, 'page_url', res_list_h3_a.get("href"))
             desc_content_p = res_list_item.find(
                 attrs={'class': re.compile(".*(res-desc).*")})
             if desc_content_p is not None:
                 page_content = desc_content_p.get_text()
             else:
                 page_content_div = res_list_item.find("div")
                 # guard: the fallback div may be missing
                 page_content = page_content_div.get_text() if page_content_div is not None else ''
             setattr(craw_item, 'content', page_content.replace("\n", ""))
             res_linkinfo_p = res_list_item.find(
                 attrs={"class": "res-linkinfo"})
             linkinfo = ''
             if res_linkinfo_p is not None:
                 linkinfo = res_linkinfo_p.find("cite").get_text()
             else:
                 temp_url = res_list_item.find(
                     attrs={"class": "mh-showurl"})
                 if temp_url is not None:
                     linkinfo = temp_url.find("cite").get_text()
             setattr(craw_item, 'domain', linkinfo)
             self.content_parse_list.append(craw_item)
             more_answer = res_list_item.find("ul",
                                              attrs={"class": "more-ans"})
             print(craw_item)
             if more_answer is not None:
                 more_answer_alist = more_answer.find_all("a")
                 if len(more_answer_alist) > 0:
                     for more_answer_item in more_answer_alist:
                         craw_son_item = CrawlerItem()
                         setattr(craw_son_item, 'title',
                                 more_answer_item.get_text())
                         setattr(craw_son_item, 'page_url',
                                 more_answer_item.get("href"))
                         self.content_parse_list.append(craw_son_item)
                         print(craw_son_item)
     print("解析360页面结束......................")
Example #2
 def parse_result_page(self, result):
     content_div_list = result.find_all("div",
                                        attrs={'class': 'article ali_row'})
     if content_div_list is not None and len(content_div_list) > 0:
         for content_div_item in content_div_list:
             title_h2 = content_div_item.find("h2")
             if title_h2 is not None:
                 self.page_index += 1
                 craw_item = CrawlerItem()
                 setattr(craw_item, 'search', "Mobile Shenma")
                 setattr(craw_item, 'relate_search', self.cur_parse_page)
                 setattr(craw_item, 'keyword', self.keyword)
                 setattr(craw_item, 'index', self.page_index)
                 setattr(craw_item, 'page', str(self.cur_parse_page))
                 setattr(craw_item, 'title',
                         title_h2.get_text().replace("\n", ""))
                 setattr(craw_item, 'page_url',
                         title_h2.find("a").get("href"))
                 content_desc_p = content_div_item.find("p")
                 if content_desc_p is not None:
                     setattr(craw_item, 'content',
                             content_desc_p.get_text())
                 else:
                     setattr(craw_item, 'content',
                             content_div_item.get_text())
                 down_link_div = content_div_item.find(
                     "div", attrs={'class': 'other'})
                 if down_link_div is not None:
                     setattr(craw_item, 'domain', down_link_div.get_text())
                 # append inside the title check: craw_item is only defined
                 # when an h2 title was found for this entry
                 self.content_parse_list.append(craw_item)
                 print(craw_item)
Example #3
 def parse_result_page(self, result):
     content_div_list = result.find_all(
         "div", attrs={'class': re.compile("^(vrResult|result)$")})
     print("搜索到网站词条数目: ", len(content_div_list))
     if content_div_list is not None:
         for content_div_item in content_div_list:
             # Create an item object for this result entry to hold the crawled data
             data_extquery = content_div_item.get("data-extquery")
             # Entries carrying a data-extquery attribute are related-search blocks; parse only those without it
             if data_extquery is None:
                 content_title = content_div_item.find(
                     "h3", attrs={'class': 'vr-tit'})
                 if content_title is None:
                     content_title = content_div_item.find("h3")
                 if content_title is not None:
                     craw_item = CrawlerItem()
                     self.page_index += 1
                     setattr(craw_item, 'search', "Mobile Sogou")
                     setattr(craw_item, 'relate_search', 0)
                     setattr(craw_item, 'keyword', self.keyword)
                     setattr(
                         craw_item, 'title',
                         re.sub('[\r\n\t\b ]', '',
                                content_title.get_text()))
                     content_title_a = content_title.find("a")
                     if content_title_a is None:
                         content_title_a = content_div_item.find("a")
                     if content_title_a is not None:
                         setattr(craw_item, 'page_url',
                                 content_title_a.get("href"))
                     setattr(craw_item, 'index', self.page_index)
                     setattr(craw_item, 'page', str(self.cur_parse_page))
                     content_desc_div = content_div_item.find(
                         "div",
                         attrs={'class': re.compile("^(info|text-layout)$")})
                     if content_desc_div is None:
                         content_desc_div = content_div_item.find("div")
                     if content_desc_div is not None:
                         setattr(
                             craw_item, 'content',
                             re.sub('[\r\n\t\b ]', '',
                                    content_desc_div.get_text()))
                     website_domain_span = content_div_item.find(
                         "div",
                         attrs={'class': re.compile(".*(citeurl).*")})
                     if website_domain_span is not None:
                         setattr(craw_item, 'domain',
                                 website_domain_span.get_text())
                     else:
                         setattr(craw_item, 'domain', 'wenwen.sogou.com')
                     self.content_parse_list.append(craw_item)
                     print(craw_item)
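Several of these parsers select containers by matching CSS classes against a compiled regular expression rather than an exact string, as in re.compile("^(vrResult|result)$") above. Below is a self-contained sketch of how BeautifulSoup applies such a pattern; the HTML is invented for illustration.

    import re
    from bs4 import BeautifulSoup

    html = ('<div class="vrResult">organic</div>'
            '<div class="result">also organic</div>'
            '<div class="result-op">skipped: the ^...$ anchors reject it</div>')
    soup = BeautifulSoup(html, "html.parser")
    for div in soup.find_all("div", attrs={'class': re.compile("^(vrResult|result)$")}):
        print(div.get_text())  # prints the first two divs only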
Example #4
 def parse_result_page(self, result):
     content_div_list = result.find_all("div", attrs={'class': 'c-container'})
     print("搜索到网站词条数目: ", len(content_div_list))
     if content_div_list is not None:
         for content_div_item in content_div_list:
             # Create an item object for this result entry to hold the crawled data
             craw_item = CrawlerItem()
             content_a = content_div_item.find("a", attrs={'class': 'c-blocka'})
             if content_a is not None:
                 self.page_index += 1
                 setattr(craw_item, 'search', "Mobile Baidu")
                 setattr(craw_item, 'keyword', self.keyword)
                 setattr(craw_item, 'title', content_a.get_text())
                 setattr(craw_item, 'page_url', content_a.get("href"))
                 setattr(craw_item, 'index', self.page_index)
                 setattr(craw_item, 'page', str(self.cur_parse_page))
                 setattr(craw_item, 'relate_search', 0)
                 content_desc_p = content_div_item.find("p", attrs={'class': re.compile(".*(c-line).*")})
                 if content_desc_p is not None:
                     setattr(craw_item, 'content', content_desc_p.get_text())
                 else:
                     setattr(craw_item, 'content', content_div_item.get_text())
                 website_domain_span = content_div_item.find("span", attrs={'class': 'c-showurl'})
                 if website_domain_span is not None:
                     setattr(craw_item, 'domain', website_domain_span.get_text())
                 else:
                     showurl_div = content_div_item.find("div", attrs={'class': re.compile(".*(c-line-clamp1).*")})
                     if showurl_div is not None:
                         setattr(craw_item, 'domain', showurl_div.get_text())
                 offset_div = content_div_item.find("div", attrs={'class': "c-offset"})
                 # Parse drop-down links similar to Baidu Zhidao
                 # if offset_div is not None:
                 #     craw_other_item = CrawlerItem()
                 #     down_list_tr = offset_div.find_all("tr")
                 #     print("解析百度知道,找到下拉连接个数: ", len(down_list_tr))
                 #     for down_item in down_list_tr:
                 #         setattr(craw_other_item, 'title', down_item.find("a").get_text())
                 #         setattr(craw_other_item, 'page', down_item.find("a").get("href"))
                 #         print(craw_other_item)
                 # append inside the link check so empty items are not stored
                 self.content_parse_list.append(craw_item)
                 print(craw_item)
Example #5
 def parse_result_page(self, result):
     print("正在解析移动端360页面......................")
     res_list = result.find_all("div", attrs={'class': re.compile(".*(g-card).*")})
     print("找到解析网站词条数目: ", len(res_list))
     for res_list_item in res_list:
         res_list_h3 = res_list_item.find("h3", attrs={'class': 'res-title'})
         if res_list_h3 is not None:
             self.page_index += 1
             craw_item = CrawlerItem()
             setattr(craw_item, 'search', "Mobile 360")
             setattr(craw_item, 'keyword', self.keyword)
             setattr(craw_item, 'index', self.page_index)
             setattr(craw_item, 'page', str(self.cur_parse_page))
             setattr(craw_item, 'relate_search', 0)
             setattr(craw_item, 'other_search', self.cur_parse_page)
             setattr(craw_item, 'title', res_list_h3.get_text().replace("\n", ""))
             res_list_alink = res_list_item.find("a", attrs={'class': 'alink'})
             if res_list_alink is not None:
                 setattr(craw_item, 'page_url', res_list_alink.get("href"))
             desc_content_p = res_list_item.find(attrs={'class': re.compile(".*(summary).*")})
             if desc_content_p is not None:
                 page_content = desc_content_p.get_text()
             else:
                 page_content_div = res_list_item.find("div")
                 # guard: the fallback div may be missing
                 page_content = page_content_div.get_text() if page_content_div is not None else ''
             setattr(craw_item, 'content', page_content.replace("\n", ""))
             res_linkinfo_p = res_list_item.find(attrs={"class": "res-site-url"})
             linkinfo = ''
             if res_linkinfo_p is not None:
                 linkinfo = res_linkinfo_p.get_text()
             else:
                 temp_url = res_list_item.find(attrs={"class": "res-site-name"})
                 if temp_url is not None:
                     linkinfo = temp_url.get_text()
             setattr(craw_item, 'domain', linkinfo)
             self.content_parse_list.append(craw_item)
             print(craw_item)
     print("解析移动端360页面结束......................")
Example #6
 def parse_result_page(self, result):
     print("正在解析搜狗网站词条内容.....................")
     content_div_list = result.find_all("div", recursive=False)
     for content_div_item in content_div_list:
         # Look for an h3 under this div; if present, treat the div as a result entry
         title_h3 = content_div_item.find("h3")
         if title_h3 is not None:
             self.page_index += 1
             craw_item = CrawlerItem()
             setattr(craw_item, 'search', "PC Sogou")
             setattr(craw_item, 'keyword', self.keyword)
             setattr(craw_item, 'index', self.page_index)
             setattr(craw_item, 'page', str(self.cur_parse_page))
             # On PC Sogou, only the first page's related-search suggestions differ
             if self.cur_parse_page == 1:
                 setattr(craw_item, 'relate_search', 1)
             else:
                 setattr(craw_item, 'relate_search', 2)
             title_h3_a = title_h3.find("a")
             if title_h3_a is not None:
                 setattr(craw_item, 'title', title_h3_a.get_text().replace('\n', ''))
                 href_str = title_h3_a.get("href")
                 if href_str is not None:
                     setattr(craw_item, 'page_url', href_str.replace('\n', ''))
                 str_info_div = content_div_item.find("div", attrs={'class': 'str_info_div'})
                 if str_info_div is not None:
                     p_text = str_info_div.find("p")
                     if p_text is not None:
                         setattr(craw_item, 'content', p_text.get_text().replace('\n', ''))
                     else:
                         content_ul = str_info_div.find("ul")
                         if content_ul is not None:
                             setattr(craw_item, 'content', content_ul.get_text().replace('\n', ''))
                 else:
                     ft_content_div = content_div_item.find("div")
                     if ft_content_div is not None:
                         setattr(craw_item, 'content', re.sub('[\r\n\t\b ]', '', ft_content_div.get_text()))
                 fb_link_div = content_div_item.find("div", attrs={'class': 'fb'})
                 if fb_link_div is not None:
                     website_domain = fb_link_div.find("cite").get_text().replace('\n', '')
                     setattr(craw_item, 'domain', website_domain)
             # Parse the Sogou knowledge box, a module similar to Baidu Zhidao
             str_pd_box = content_div_item.find("div", attrs={'class': 'str-pd-box'})
             self.content_parse_list.append(craw_item)
             print(craw_item)
             if str_pd_box is not None:
                 start_box_item_start = str_pd_box.find("p", attrs={'class': 'str_time'})
                 if start_box_item_start is not None:
                     craw_box_item = CrawlerItem()
                     setattr(craw_box_item, 'title', start_box_item_start.get_text().replace('\n', ''))
                     setattr(craw_box_item, 'page_url', start_box_item_start.find("a").get("href").replace('\n', ''))
                     self.content_parse_list.append(craw_box_item)
                     print(craw_box_item)
                 start_box_item_list = str_pd_box.find("ul")
                 if start_box_item_list is not None:
                     li_list = start_box_item_list.find_all("li")
                     for li_list_item in li_list:
                         craw_box_item = CrawlerItem()
                         setattr(craw_box_item, 'title', li_list_item.find("a").get_text().replace('\n', ''))
                         setattr(craw_box_item, 'page_url', li_list_item.find("a").get("href").replace('\n', ''))
                         self.content_parse_list.append(craw_box_item)
                         print(craw_box_item)
     print("搜索到网站词条数目为:", len(content_div_list))
     print("解析搜狗网站词条内容结束.....................")
Example #7
 def parse_result_page(self, result):
     content_div_list = result.find_all(
         "div", attrs={'class': re.compile(".*(c-container).*")})
     print("搜索到网站词条数目: ", len(content_div_list))
     if content_div_list is not None:
         for content_div_item in content_div_list:
             # Create an item object for this result entry to hold the crawled data
             craw_item = CrawlerItem()
             content_h3 = content_div_item.find(
                 "h3", attrs={'class': re.compile("t.*")})
             if content_h3 is not None:
                 self.page_index += 1
                 content_titlea = content_h3.find("a")
                 setattr(craw_item, 'search', "PC Baidu")
                 setattr(craw_item, 'keyword', self.keyword)
                 setattr(craw_item, 'title', content_titlea.get_text())
                 setattr(craw_item, 'page_url', content_titlea.get("href"))
                 setattr(craw_item, 'index',
                         int(content_div_item.get("id")))
                 setattr(craw_item, 'page', str(self.cur_parse_page))
                 # Baidu's related searches are identical on every page, so use the default value 0
                 setattr(craw_item, 'relate_search', 0)
             content_desc_div = content_div_item.find(
                 "div", attrs={'class': re.compile(".*(c-abstract).*")})
             if content_desc_div is not None:
                 setattr(craw_item, 'content', content_desc_div.get_text())
             else:
                 content_desc_div = content_div_item.find(
                     attrs={'class': re.compile(".*(c-row).*")})
                 if content_desc_div is not None:
                     setattr(
                         craw_item, 'content',
                         re.sub('[\r\n\t\b ]', '',
                                content_desc_div.get_text()))
                 else:
                     setattr(
                         craw_item, 'content',
                         re.sub('[\r\n\t\b ]', '',
                                content_div_item.get_text()))
             website_domain_div = content_div_item.find(
                 "div", attrs={'class': re.compile(".*f13.*")})
             if website_domain_div is not None:
                 showurl_a = website_domain_div.find(
                     attrs={'class': "c-showurl"})
                 if showurl_a is None:
                     showurl_a = website_domain_div.find("a")
                 if showurl_a is not None:
                     setattr(craw_item, 'domain', showurl_a.get_text())
             else:
                 showurl_a = content_div_item.find_all(
                     "span", attrs={'class': "c-showurl"})
                 if len(showurl_a) > 0:
                     setattr(craw_item, 'domain', showurl_a[0].get_text())
             offset_div = content_div_item.find("div",
                                                attrs={'class': "c-offset"})
             # Append the parsed entry to the result list
             self.content_parse_list.append(craw_item)
             # Parse drop-down links similar to Baidu Zhidao
             if offset_div is not None:
                 down_list_tr = offset_div.find_all("a")
                 print("解析百度知道,找到下拉连接个数: ", len(down_list_tr))
                 for down_item in down_list_tr:
                     craw_other_item = CrawlerItem()
                     setattr(craw_other_item, 'title', down_item.get_text())
                     setattr(craw_other_item, 'page_url',
                             down_item.get("href"))
                     print(craw_other_item)
                     self.content_parse_list.append(craw_other_item)
             print(craw_item)