def parse(self, response):
    """Parse one page of a post list and schedule the comment pages.

    Every ``div.c`` post on the page is timestamped through
    ``Time_stamp().time_handle``. Posts whose flag is 1 and whose id is
    new to the Redis set ``Sina:content_id`` produce a ``Request`` handled
    by ``self.parse_comment``. The next list page is followed only when no
    post on this page returned flag 2.

    :param response: Scrapy response; ``response.meta["data"]`` must be a
        dict carrying at least ``"key"``.
    :return: generator of ``Request`` objects.
    """
    soup = BeautifulSoup(response.body, "lxml")
    data = response.meta["data"]
    flags = []

    # The first div and the last two divs are skipped, exactly as before
    # (presumably page chrome rather than posts -- confirm against the page).
    for post in soup.find_all("div", class_="c")[1:-2]:
        time_span = post.find("span", class_="ct")
        if time_span is None:
            # Not a post block; previously this crashed with AttributeError.
            continue

        # The span text is "<time> 来自<client>"; only the time part is used.
        str_time = time_span.get_text(strip=True).split(u" 来自")[0]
        push_time, flag = Time_stamp().time_handle(str_time, self.limitTime)
        flags.append(flag)
        if flag != 1:
            continue

        raw_id = post.get("id")
        if not raw_id:
            continue
        # Drop the literal "M_" prefix only. str.strip("M_") removes any
        # leading/trailing 'M' or '_' characters and therefore corrupted
        # ids that themselves start or end with 'M'.
        content_id = raw_id[2:] if raw_id.startswith("M_") else raw_id

        # sadd reports 1 only when the member was not already in the set,
        # so posts seen before are skipped.
        if self.rconn.sadd("Sina:content_id", content_id) != 1:
            continue

        detail = {
            "key": data["key"],
            "contentId": content_id,
            "pushTime": push_time,
        }
        # NOTE(review): the URL template is an empty string in this
        # revision, so this "%" formatting raises TypeError. Restore the
        # real template (one %s placeholder) before running.
        comment_url = "" % content_id
        yield Request(url=comment_url, callback=self.parse_comment,
                      meta={"data": detail})

    # Flag 2 stops pagination (presumably "older than self.limitTime" --
    # confirm against Time_stamp.time_handle).
    if 2 not in flags:
        next_links = Selector(response).xpath(
            u'body/div[@class="pa" and @id="pagelist"]/form/div'
            u'/a[text()="下页"]/@href').extract()
        # The last page has no "下页" link; indexing the empty list used to
        # raise IndexError before the emptiness check could run.
        if next_links and next_links[0]:
            # NOTE(review): empty URL template here as well, see above.
            req_url = "" % next_links[0]
            yield Request(url=req_url, callback=self.parse,
                          meta={"data": data})
def parse(self, response):
    """Parse one page of a post list and schedule the comment pages.

    For each ``div.c`` carrying an ``id``, the like / repost / comment
    counters, the post id, the publish time and the client tool are
    extracted and attached to a ``Request`` handled by
    ``self.parse_comment``. Posts that cannot be parsed are skipped.
    Pagination continues while ``data["page"] < 5``.

    :param response: Scrapy response; ``response.meta["data"]`` must be a
        dict carrying at least ``"key"`` and an integer ``"page"``.
    :return: generator of ``Request`` objects.
    """
    hxs = Selector(response)
    data = response.meta["data"]

    # Compiled once per page instead of once per post.
    like_re = re.compile(u"赞\\[(\\d+)\\]")          # like count
    transfer_re = re.compile(u"转发\\[(\\d+)\\]")    # repost count
    comment_re = re.compile(u"评论\\[(\\d+)\\]")     # comment count

    for div in hxs.xpath('body/div[@class="c" and @id]'):
        try:
            html = div.extract()  # serialised once, reused by all regexes
            like = like_re.findall(html)[0]
            transfer = transfer_re.findall(html)[0]
            comment_num = comment_re.findall(html)[0]
            content_id = div.xpath("@id").extract()[0].split('M_')[1]

            # Publish time and client tool (phone or platform), separated
            # by the literal "来自".
            others = div.xpath('div/span[@class="ct"]/text()').extract()
            parts = others[0].split(u"来自")
            push_time, _ = Time_stamp().time_handle(parts[0].strip())
            tool = parts[1]

            detail = {
                "key": data["key"],
                "contentId": content_id,
                "pushTime": push_time,
                "commentNum": comment_num,
                "transfer": transfer,
                "like": like,
                "tool": tool,
            }
            # NOTE(review): the URL template is an empty string in this
            # revision, so this formatting raises TypeError and every post
            # ends up skipped by the handler below. Restore the template.
            comment_url = "" % content_id
            request = Request(url=comment_url, callback=self.parse_comment,
                              meta={"data": detail}, dont_filter=True)
        except Exception:
            # Best effort: a post that does not match the expected layout
            # is skipped. This is narrowed from a bare ``except`` so that
            # GeneratorExit / KeyboardInterrupt are no longer swallowed.
            continue
        # Yield outside the try block so exceptions thrown into the
        # generator at the yield point are never intercepted.
        yield request

    next_links = hxs.xpath(
        u'body/div[@class="pa" and @id="pagelist"]/form/div'
        u'/a[text()="下页"]/@href').extract()
    if next_links and data['page'] < 5:
        # NOTE(review): empty URL template here as well, see above.
        req_url = "" % next_links[0]
        data["page"] += 1
        yield Request(url=req_url, callback=self.parse,
                      meta={"data": data}, dont_filter=True)
def parse_result(self, response):
    """Extract the article list embedded in an account page and follow it.

    The second-to-last ``<script>`` element is expected to contain
    ``var msgList = {...};``. That JSON is decoded, every main article and
    its ``multi_app_msg_item_list`` entries are flattened into one list,
    and each entry accepted by ``Time_stamp().time_handle`` (flag 1) and
    by the Redis set ``SoGou:Account`` yields a ``Request`` handled by
    ``self.parse_article``.

    Any exception is logged through ``logger.error`` and ends the callback.

    :param response: Scrapy response; ``response.meta`` must carry ``"key"``.
    :return: generator of ``Request`` objects.
    """
    soup = BeautifulSoup(response.body, "lxml")
    data = response.meta
    try:
        # Same element as the former ``[-2:-1][0]``; both raise IndexError
        # when fewer than two script tags exist.
        script_text = soup.find_all("script")[-2].get_text(strip=True)
        raw_json = script_text.split("var msgList = ")[1].split(
            'seajs.use("sougou/p')[0].strip().strip(";")
        j_data = json.loads(raw_json)

        art_list = []
        for li in j_data["list"]:
            info = li["comm_msg_info"]
            ext = li["app_msg_ext_info"]
            dic = {}
            dic["id"] = info["id"]
            dic["pushtime"] = info["datetime"]
            dic["title"] = ext["title"]
            dic["fileid"] = ext["fileid"]
            # NOTE(review): the string that used to precede "+" (presumably
            # the article host prefix) is missing in this revision. The
            # remaining unary plus on a str raises TypeError, which the
            # handler below logs. Restore the prefix.
            dic["url"] = + ext["content_url"].replace("amp;", "")
            art_list.append(dic)

            # Secondary articles share the id and push time of the main one.
            for sub in ext["multi_app_msg_item_list"]:
                sub_dic = {}
                sub_dic["id"] = dic["id"]
                sub_dic["pushtime"] = dic["pushtime"]
                # NOTE(review): same missing prefix as above.
                sub_dic["url"] = + sub["content_url"].replace("amp;", "")
                sub_dic["title"] = sub["title"]
                sub_dic["fileid"] = sub["fileid"]
                art_list.append(sub_dic)

        for ks in art_list:
            time_format, time_flag = Time_stamp().time_handle(ks["pushtime"])
            if time_flag != 1:
                continue
            line = data["key"].split("--")[-1] + "--" + str(ks["pushtime"])
            # NOTE(review): deleting the whole set right before sadd makes
            # sadd report 1 for every entry, so the dedupe check below can
            # never reject anything. Kept to preserve current behaviour;
            # looks like leftover debugging -- confirm and remove.
            self.rconn.delete("SoGou:Account")
            added = self.rconn.sadd("SoGou:Account", line)
            if added == 1:
                ks["pushtime"] = time_format
                ks["keyword"] = data["key"]
                data["ks"] = ks
                yield Request(url=ks["url"], callback=self.parse_article,
                              meta=data)
    except Exception as e:  # "as" form is valid on Python 2.6+ and 3
        logger.error("parse_result error <<%s>>" % e)
def parse_comment(self, response):
    """Parse a post detail page: the post itself, then its comments.

    On the first page of a post (``"page"`` not yet in the meta data) the
    post text and its repost / comment / like counters are collected. The
    post is kept when ``self.wm.predict`` returns 1 for the text, or when
    at least two entries of ``self.words`` occur in it. A kept post is
    emitted as ``SinaContentItem`` and its comments are parsed; a rejected
    post produces nothing and is not paginated.

    On follow-up pages only comments are parsed. Every comment is emitted
    as ``SinaCommentItem``; the "下页" link, when present, is followed with
    ``data["page"] = True``.

    :param response: Scrapy response; ``response.meta["data"]`` must carry
        ``"contentId"``, ``"pushTime"`` and a ``"--"``-separated ``"key"``.
    :return: generator of items and ``Request`` objects.
    """

    def bracket_count(texts):
        """Return the integer inside the first ``[...]`` of texts[0], else 0."""
        if not texts:
            return 0
        text = texts[0]
        start = text.find("[")
        end = text.find("]", start + 1)
        if start == -1 or end == -1:
            return 0
        digits = text[start + 1:end]
        return int(digits) if digits.isdigit() else 0

    data = response.meta["data"]
    hxs = Selector(response)

    if "page" not in data:
        keys = data["key"].split("--")
        detail = {
            "contentId": data["contentId"],
            "pushTime": data["pushTime"],
            "SinaName": keys[0],
            "Vermicelli": keys[1],  # presumably the follower count -- confirm
            "SinaID": keys[2],
            "SinaOthID": keys[2],
        }

        # Post text.
        content_parts = hxs.xpath(
            '//div/span[@class="ctt"]//text()').extract()
        # The counters are located relative to the span with class "pms":
        # the link before it, the span itself, and the link after it.
        reprint_texts = hxs.xpath(
            '//div/span[@class="pms"]/preceding-sibling::span/a//text()'
        ).extract()
        comment_texts = hxs.xpath(
            '//div/span[@class="pms"]//text()').extract()
        thumbs_up_texts = hxs.xpath(
            '//div/span[@class="pms"]/following-sibling::span/a//text()'
        ).extract()

        # NOTE(review): the text starts with a literal "0", like the
        # counter defaults. It is kept because the classifier and the
        # stored items currently see it -- confirm whether it is intended.
        detail["content"] = "0" + "".join(
            part.replace(" ", "") for part in content_parts
            if len(part) >= 3)
        # All three counters now share one tolerant parser. The like
        # counter previously had no '[' guard, and every counter crashed on
        # an empty xpath result or went through str() on unicode text.
        detail["reprint"] = bracket_count(reprint_texts)
        detail["commont"] = bracket_count(comment_texts)
        detail["thumbs_up"] = bracket_count(thumbs_up_texts)

        flag = int(self.wm.predict(detail["content"])[0])
        if flag != 1:
            # Keyword fallback: two hits are enough to keep the post.
            hits = 0
            for word in self.words:
                if word.strip() in detail["content"]:
                    hits += 1
                    if hits >= 2:
                        flag = 1
                        break

        if flag == 1:
            detail["flag"] = 1
            content_item = SinaContentItem()
            for key, val in detail.items():
                content_item[key] = val
            yield content_item
            # The first matching div is skipped on the first page
            # (presumably the post itself -- confirm).
            c = hxs.xpath('body/div[@class="c" and @id]')[1:]
        else:
            c = []
    else:
        c = hxs.xpath('body/div[@class="c" and @id]')

    for div in c:
        div_id = div.xpath("@id").extract_first()
        href = div.xpath("a/@href").extract_first()
        time_texts = div.xpath('span[@class="ct"]//text()').extract()
        # Skip anything that is not a complete comment node. Previously a
        # single such div raised and dropped the rest of the page,
        # including the next-page request.
        if not div_id or "C_" not in div_id or not href or not time_texts:
            continue

        # Profile links look like ".../u/<id>" or "/<name>".
        if "u/" in href:
            user_id = href.split("u/")[1]
        else:
            segments = href.split("/")
            if len(segments) < 2:
                continue
            user_id = segments[1]

        # Comment text.
        comment_parts = div.xpath('span[@class="ctt"]//text()').extract()
        comment = "".join(
            part.replace(" ", "") for part in comment_parts
            if len(part) >= 3)

        str_time = time_texts[0].split(u" 来自")[0]
        push_time, _ = Time_stamp().time_handle(str_time, self.limitTime)

        comme = {
            "contentId": data["contentId"],
            "pushTime": push_time,
            "userName": div.xpath("a//text()").extract_first(),
            "commentId": div_id.split('C_')[1],
            "userId": user_id,
            "comment": comment,
        }
        comment_item = SinaCommentItem()
        for key, val in comme.items():
            comment_item[key] = val
        yield comment_item

    next_links = hxs.xpath(
        u'body/div[@class="pa" and @id="pagelist"]/form/div'
        u'/a[text()="下页"]/@href').extract()
    if c and next_links:
        # Mark the meta data so the next call takes the comments-only path.
        data["page"] = True
        next_url = self.rootUrl + next_links[0]
        yield Request(url=next_url, callback=self.parse_comment,
                      meta={"data": data}, dont_filter=True)