Code example #1
 def parse(self, response):
     soup = BeautifulSoup(response.body, "lxml")
     data = response.meta["data"]
     flag_list = []
     # every post on a weibo.cn list page is a <div class="c">; the first
     # and last couple of divs are page chrome rather than posts
     for i in soup.find_all("div", class_="c")[1:-2]:
         strTime = i.find("span",
                          class_="ct").get_text(strip=True).split(u" 来自")[0]
         pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
         flag_list.append(flag)
         if flag == 1:
             # the div id looks like "M_<post id>"; split rather than
             # strip("M_"), which would also eat trailing M/_ characters
             content_id = i["id"].split("M_")[1]
             # sadd returns 1 only for ids not seen before (deduplication)
             redis_flag = self.rconn.sadd("Sina:content_id", content_id)
             if redis_flag == 1:
                 detail = {}
                 detail["key"] = data["key"]
                 comment_url = "https://weibo.cn/comment/%s" % content_id
                 detail["contentId"] = content_id
                 detail["pushTime"] = pushTime
                 yield Request(url=comment_url,
                               callback=self.parse_comment,
                               meta={"data": detail})
     # flag 2 presumably marks posts older than self.limitTime; stop
     # requesting further pages once one has been seen
     if 2 not in flag_list:
         hxs = Selector(response)
         url_next = hxs.xpath(
             u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
         ).extract()
         if url_next:
             req_url = "https://weibo.cn%s" % url_next[0]
             yield Request(url=req_url,
                           callback=self.parse,
                           meta={"data": data})
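Examples #1 and #3 both hinge on the same Redis idiom: SADD reports how many members it actually added, so a return value of 1 means the id is new and 0 means it is a duplicate. A minimal standalone sketch of that pattern, assuming a Redis server on localhost and the redis-py client (the sample id is made up):

    import redis

    # assumes a local Redis instance; the key name mirrors the example above
    rconn = redis.StrictRedis(host="localhost", port=6379)

    def is_new(content_id):
        # SADD returns 1 if the member was added (first sighting), 0 otherwise
        return rconn.sadd("Sina:content_id", content_id) == 1

    print(is_new("4123456789"))  # True on the first call
    print(is_new("4123456789"))  # False afterwards: deduplicated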
Code example #2
 def parse(self, response):
     hxs = Selector(response)
     data = response.meta["data"]
     c = hxs.xpath('body/div[@class="c" and @id]')
     for div in c:
         try:
             like = re.findall(u'赞\[(\d+)\]',
                               div.extract())[0]  # like count
             transfer = re.findall(u'转发\[(\d+)\]',
                                   div.extract())[0]  # repost count
             commentNum = re.findall(u'评论\[(\d+)\]',
                                     div.extract())[0]  # comment count
             contentId = div.xpath("@id").extract()[0].split('M_')[1]
             # <span class="ct"> holds the post time and the client it
             # was posted from ("来自" means "from")
             others = div.xpath('div/span[@class="ct"]/text()').extract()
             strs = others[0].split(u"来自")
             pushTime, flag = Time_stamp().time_handle(strs[0].strip())
             tool = strs[1]
             detail = {}
             detail["key"] = data["key"]
             comment_url = "https://weibo.cn/comment/%s" % contentId
             detail["contentId"] = contentId
             detail["pushTime"] = pushTime
             detail["commentNum"] = commentNum
             detail["transfer"] = transfer
             detail["like"] = like
             detail["tool"] = tool
             yield Request(url=comment_url,
                           callback=self.parse_comment,
                           meta={"data": detail},
                           dont_filter=True)
         except IndexError:
             # divs that are not posts (page chrome, ads) lack these fields
             continue
     url_next = hxs.xpath(
         u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
     ).extract()
     # follow the "next page" link, but crawl at most five pages per keyword
     if url_next and data["page"] < 5:
         req_url = "https://weibo.cn%s" % url_next[0]
         data["page"] += 1
         yield Request(url=req_url,
                       callback=self.parse,
                       meta={"data": data},
                       dont_filter=True)
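The three re.findall calls above rely on weibo.cn rendering the counters inline as 赞[n], 转发[n], 评论[n]. A quick sketch of just that extraction against a literal sample string (the sample markup is made up):

    # -*- coding: utf-8 -*-
    import re

    # made-up excerpt of a weibo.cn post with its inline counters
    html = u'... 赞[12] 转发[3] 评论[7] ...'

    like = re.findall(u'赞\[(\d+)\]', html)[0]          # u'12'
    transfer = re.findall(u'转发\[(\d+)\]', html)[0]    # u'3'
    commentNum = re.findall(u'评论\[(\d+)\]', html)[0]  # u'7'
    print(like, transfer, commentNum)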
Code example #3
File: SoGouSpider.py  Project: chenxy0761/NewsSpider2
 def parse_result(self, response):
     soup = BeautifulSoup(response.body, "lxml")
     data = response.meta
     try:
         # the article list is embedded as JSON in the second-to-last
         # <script> tag, assigned to the JS variable msgList
         scr = soup.find_all("script")[-2].get_text(strip=True)
         data_list = scr.split("var msgList = ")[1].split(
             'seajs.use("sougou/p')[0].strip().strip(";")
         j_data = json.loads(data_list)
         art_list = []
         for li in j_data["list"]:
             dic = {}
             dic["id"] = li["comm_msg_info"]["id"]
             dic["pushtime"] = li["comm_msg_info"]["datetime"]
             dic["title"] = li["app_msg_ext_info"]["title"]
             dic["fileid"] = li["app_msg_ext_info"]["fileid"]
             # content_url arrives HTML-escaped; turn "&amp;" back into "&"
             dic["url"] = self.host + li["app_msg_ext_info"][
                 "content_url"].replace("amp;", "")
             art_list.append(dic)
             # secondary articles bundled in the same push
             for ls in li["app_msg_ext_info"]["multi_app_msg_item_list"]:
                 dict_ls = {}
                 dict_ls["id"] = dic["id"]
                 dict_ls["pushtime"] = dic["pushtime"]
                 dict_ls["url"] = self.host + ls["content_url"].replace(
                     "amp;", "")
                 dict_ls["title"] = ls["title"]
                 dict_ls["fileid"] = ls["fileid"]
                 art_list.append(dict_ls)
         for ks in art_list:
             timeFormat, flag = Time_stamp().time_handle(ks["pushtime"])
             if flag == 1:
                 line = data["key"].split("--")[-1] + "--" + str(
                     ks["pushtime"])
                 # sadd returns 1 only for unseen account/time pairs; the
                 # rconn.delete that used to run here on every iteration
                 # emptied the set and defeated the deduplication
                 redis_flag = self.rconn.sadd("SoGou:Account", line)
                 if redis_flag == 1:
                     ks["pushtime"] = timeFormat
                     ks["keyword"] = data["key"]
                     data["ks"] = ks
                     yield Request(url=ks["url"],
                                   callback=self.parse_article,
                                   meta=data)
     except Exception as e:
         logger.error("parse_result error <<%s>>" % e)
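The brittle step in this example is carving the "var msgList = {...};" assignment out of an inline script before json.loads can parse it. A standalone sketch of just that step, run against a hand-written script body (the sample JS is made up):

    import json

    # made-up excerpt of the inline script on a Sogou WeChat account page
    scr = 'var msgList = {"list": [{"comm_msg_info": {"id": 1}}]};seajs.use("sougou/p ...'

    data_list = scr.split("var msgList = ")[1].split(
        'seajs.use("sougou/p')[0].strip().strip(";")
    j_data = json.loads(data_list)
    print(j_data["list"][0]["comm_msg_info"]["id"])  # prints 1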
Code example #4
 def parse_comment(self, response):
     data = response.meta["data"]
     hxs = Selector(response)
     # first visit to a comment page: parse the post itself before the comments
     if "page" not in data:
         detail = {}
         detail["contentId"] = data["contentId"]
         detail["pushTime"] = data["pushTime"]
         keys = data["key"].split("--")
         detail["SinaName"] = keys[0]
         detail["Vermicelli"] = keys[1]
         detail["SinaID"] = keys[2]
         detail["SinaOthID"] = keys[2]
         contentStr = hxs.xpath(
             '//div/span[@class="ctt"]//text()').extract()  # post text
         reprintStr = hxs.xpath(
             '//div/span[@class="pms"]/preceding-sibling::span/a//text()'
         ).extract()
         commontStr = hxs.xpath(
             '//div/span[@class="pms"]//text()').extract()
         thumbs_upStr = hxs.xpath(
             '//div/span[@class="pms"]/following-sibling::span/a//text()'
         ).extract()
         content = ""
         reprint = "0"
         commont = "0"
         thumbs_up = "0"
         # the counters look like u"转发[12]"; take the number between the
         # brackets (no str() here -- it would choke on the Chinese text)
         if '[' in reprintStr[0]:
             reprint = reprintStr[0][reprintStr[0].index('[') +
                                     1:reprintStr[0].index(']')]
         if '[' in commontStr[0]:
             commont = commontStr[0][commontStr[0].index('[') +
                                     1:commontStr[0].index(']')]
         if '[' in thumbs_upStr[0]:
             thumbs_up = thumbs_upStr[0][thumbs_upStr[0].index('[') +
                                         1:thumbs_upStr[0].index(']')]
         for cd in contentStr:
             if len(cd) >= 3:
                 content += cd.replace(" ", "")
         detail["content"] = content
         detail["reprint"] = int(reprint)
         detail["commont"] = int(commont)
         detail["thumbs_up"] = int(thumbs_up)
         # ask the classifier first; if it says no, fall back to a keyword
         # rule: two or more hits from the watch list also keep the post
         flag = int(self.wm.predict(detail["content"])[0])
         if flag != 1:
             total = 0
             for word in self.words:
                 if word.strip() in detail["content"]:
                     total += 1
                     if total >= 2:
                         flag = 1
                         break
         if flag == 1:
             detail["flag"] = 1
             contentItem = SinaContentItem()
             for key, val in detail.items():
                 contentItem[key] = val
             yield contentItem
             # skip the first div (the post itself); the rest are comments
             c = hxs.xpath('body/div[@class="c" and @id]')[1:]
         else:
             c = []
     else:
         # follow-up pages carry comments only
         c = hxs.xpath('body/div[@class="c" and @id]')
     for div in c:
         comme = {}
         comme["contentId"] = data["contentId"]
         ID = div.xpath("a/@href").extract_first()
         userName = div.xpath("a//text()").extract_first()
         # comment div ids look like "C_<comment id>"
         commentId = div.xpath("@id").extract()[0].split('C_')[1]
         try:
             userId = ID.split("u/")[1]
         except IndexError:
             userId = ID.split('/')[1]
         commentStr = div.xpath(
             'span[@class="ctt"]//text()').extract()  # comment text
         comment = ""
         for co in commentStr:
             if len(co) >= 3:
                 comment += co.replace(" ", "")
         strTime = div.xpath(
             'span[@class="ct"]//text()').extract()[0].split(u" 来自")[0]
         pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
         comme['pushTime'] = pushTime
         comme["userName"] = userName
         comme["commentId"] = commentId
         comme["userId"] = userId
         comme["comment"] = comment
         commentItem = SinaCommentItem()
         for key, val in comme.items():
             commentItem[key] = val
         yield commentItem
     url_next = hxs.xpath(
         u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
     ).extract()
     if c and url_next:
         data["page"] = True
         next_url = self.rootUrl + url_next[0]
         yield Request(url=next_url,
                       callback=self.parse_comment,
                       meta={"data": data},
                       dont_filter=True)
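The three near-identical index('[') / index(']') slices above are a natural candidate for a helper. A small sketch, assuming the same u'转发[12]'-style counter strings (the function name is ours, not the project's):

    # -*- coding: utf-8 -*-

    def bracket_count(text, default="0"):
        # pull the number out of strings like u"转发[12]"; fall back to default
        if "[" in text and "]" in text:
            return text[text.index("[") + 1:text.index("]")]
        return default

    print(bracket_count(u"转发[12]"))  # 12
    print(bracket_count(u"赞"))        # 0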