def getpagecomments(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//*[@class="sosResult"]/strong/a/@href')
    titles = xpath.getlist('//*[@class="sosResult"]/strong/a')
    pubtimes = xpath.xpath('//*[@class="sosResult"]/span/cite[3]')
    today = datetime.datetime.strptime(
        TimeUtility.getcurrentdate(),
        TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(0, len(titles), 1):
        # Keep only results whose title contains the query keyword.
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformtime(
                pubtimes[index].text).split(' ')[0]
            pubtime = datetime.datetime.strptime(
                pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # Keep only results published within the configured query period.
            if interval.days <= int(self.querylastdays):
                newurl = self.preprocess(hrefs[index])
                if newurl is not None:
                    urllist.append(newurl)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)

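# Illustrative sketch (not part of the spider): the publish-date window filter
# used in getpagecomments above (and getsearchresult below), reduced to the
# standard library. The helper name and the 'YYYY-MM-DD' date format are
# assumptions; in the spider the format comes from TimeUtility.DATE_FORMAT_DEFAULT
# and the timestamps are normalized by TimeUtility.getuniformtime first.
import datetime

def _is_within_query_window(pubtimestr, querylastdays, date_format='%Y-%m-%d'):
    """Return True if pubtimestr falls within the last `querylastdays` days."""
    today = datetime.date.today()
    pubtime = datetime.datetime.strptime(pubtimestr, date_format).date()
    return (today - pubtime).days <= int(querylastdays)

# Example: with querylastdays=7, only results published in the past week pass,
# e.g. _is_within_query_window('2016-08-01', 7) is True only near that date.
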
def step2bbs(self, params):
    Logger.getlogging().info("Dm5Commnets.STEP_2")
    # docurl passed down from STEP_1
    docurl = params.customized['docurl']
    comments_count = int(self.r.parse(ur'(\d+)个回复', params.content)[0])
    # Incremental check: skip if no new comments since the last crawl.
    cmtnum = URLStorage.getcmtnum(params.originalurl)
    if cmtnum >= comments_count:
        return
    URLStorage.setcmtnum(params.originalurl, comments_count)
    # Read the total number of comment pages from the pager links.
    pagenum = 0
    xparser = XPathUtility(params.content)
    if not xparser.xpath('//*[@class="inkk ma5"]'):
        Logger.getlogging().warning('{0}:30001'.format(params.originalurl))
        return
    pageList = xparser.xpath('//*[@id="search_fy"]/a/text()')
    if not pageList:
        pagenum = 1
    else:
        pagenum = int(pageList[-2])
    for page in range(1, pagenum + 1, 1):
        comment_url = Dm5Commnets.COMMENT_URL.format(docurl=docurl, page=page)
        self.storeurl(comment_url, params.originalurl, Dm5Commnets.STEP_3_BBS)

def getcomments_step2(self, params):
    bookId = params.customized['bookId']
    xhtml = XPathUtility(html=params.content)
    page_counts = int(xhtml.xpath('//div[@class="page"]/@pagenum')[0])
    comments_count = int(xhtml.xpath('//div[@class="page"]/@total')[0])
    Logger.getlogging().debug(comments_count)
    if page_counts == 0:
        return
    # Incremental check: only fetch pages for comments not yet stored.
    cmtnum = CMTStorage.getcount(params.originalurl, True)
    if cmtnum >= comments_count:
        return
    page_num = int(
        math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
    if page_num >= self.maxpages:
        page_num = self.maxpages
    NewsStorage.setcmtnum(params.originalurl, comments_count)
    for page in range(1, page_num + 1, 1):
        comment_url = PubComments.COMMENTS_URL
        self.storeposturl(comment_url, params.originalurl, PubComments.STEP_3, {
            'bookId': bookId,
            'pageNum': page
        })

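# Illustrative sketch (not part of the spider): the incremental paging
# arithmetic used in getcomments_step2 (and again in FunComments.STEP_2),
# isolated as a pure function. The parameter names stand in for the
# PAGE_SIZE and maxpages attributes of the spider classes.
import math

def _pages_to_fetch(total_comments, stored_comments, page_size, maxpages):
    """Number of comment pages still worth fetching, capped at maxpages."""
    if stored_comments >= total_comments:
        return 0
    pages = int(math.ceil(float(total_comments - stored_comments) / page_size))
    return min(pages, maxpages)

# Example: 95 comments on the site, 20 already stored, 20 per page -> 4 pages.
assert _pages_to_fetch(95, 20, 20, 10) == 4
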
def baidutiebasearch_step2(self, params):
    # Step 2: from the response content, the xpath //*[@class="nums"] gives the
    # maximum total result count.
    # Process the first page of search results directly.
    self.baidutiebasearch_step3(params)
    # Read the last page number from the pager.
    xparser = XPathUtility(html=params.content)
    pager_search = xparser.xpath('//*[@class="pager pager-search"]')
    queryurl = ''
    if pager_search:
        tailpageurl = xparser.xpath('//*[@class="pager pager-search"]/a[last()]/@href')
        try:
            if tailpageurl:
                lists = tailpageurl[0].split('pn=')
                queryurl = 'http://tieba.baidu.com' + lists[0]
                tailpage = int(lists[1])
                if tailpage > BaiduTiebaS2Query2.DEFAULT_MAX_PAGESIZE:
                    tailpage = BaiduTiebaS2Query2.DEFAULT_MAX_PAGESIZE
                if tailpage > self.maxpages:
                    tailpage = self.maxpages
            else:
                tailpage = 1
        except:
            tailpage = 1
    else:
        # No search results: log and return.
        Logger.log(params.url, constant.ERRORCODE_EXCEPTTION_JSON)
        return
    if not queryurl:
        return
    # Based on tailpage, build the search-result URLs for every page except the first.
    querylist = []
    for page in range(2, tailpage + 1, 1):
        url = queryurl + 'pn={page}'.format(page=page)
        querylist.append(url)
    self.__storeqeuryurllist__(querylist, BaiduTiebaS2Query2.BAIDU_TIEBA_SEARCH_EACH_PAGE)

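# Illustrative sketch (not part of the spider): how baidutiebasearch_step2
# derives the base query URL and tail page number from the last pager link and
# expands pages 2..tailpage. The href shape ('...&pn=<N>') is an assumption
# inferred from the split('pn=') call above; the helper name is hypothetical.
def _expand_search_pages(tail_href, max_page, host='http://tieba.baidu.com'):
    """Split a pager href on 'pn=' and build the URLs of pages 2..tailpage."""
    base, _, tail = tail_href.partition('pn=')
    tailpage = min(int(tail), max_page)
    queryurl = host + base
    return [queryurl + 'pn={page}'.format(page=page) for page in range(2, tailpage + 1)]

# Example (hypothetical href):
# _expand_search_pages('/f/search/res?ie=utf-8&qw=kw&pn=4', 10)
# -> ['http://tieba.baidu.com/f/search/res?ie=utf-8&qw=kw&pn=2',
#     'http://tieba.baidu.com/f/search/res?ie=utf-8&qw=kw&pn=3',
#     'http://tieba.baidu.com/f/search/res?ie=utf-8&qw=kw&pn=4']
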
def getsearchresult(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//li/h3/a/@href')
    titles = xpath.getlist('//li/h3/a')
    pubtimes = xpath.xpath('//li/p')
    today = datetime.datetime.strptime(
        TimeUtility.getcurrentdate(),
        TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(0, len(titles), 1):
        # Keep only results whose title contains the query keyword.
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
            pubtime = datetime.datetime.strptime(
                pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # Keep only results published within the configured query period.
            if interval.days <= self.querylastdays:
                urllist.append(hrefs[index])
            else:
                # Results are sorted by time: once one result falls outside the
                # period, all later ones do too.
                break
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)

def pageprocess(self, params):
    # Parse the page content.
    xparser = XPathUtility(params.content)
    # Collect the hyperlinks on this page.
    hreflist = xparser.xpath('//h3/a/@href')
    hrefs = []
    for mid_url in hreflist:
        mid = self.preprocess(mid_url)
        if mid is not None:
            hrefs.append(mid)
    # Collect every publish time on this page.
    publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
    publicTimes = []
    for timeindex in publictime:
        middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
        publicTimes.append(
            str(str(middle).split(' ')[0]) + ' ' + str(str(middle).split(' ')[1]))
    # Collect every title on this page.
    titles = []
    titles_list = xparser.getlist('//h3')
    for title in titles_list:
        mid_title = str(title).replace('\n', '').replace('\t', '').strip()
        titles.append(mid_title)
    # Decode the query keyword.
    KEY_mid = params.customized['KEY']
    KEY = Common.urldec(KEY_mid)
    # The keyword doubles as the title-matching pattern.
    titlePatten = KEY
    # Compute the cutoff date, self.inputtime days ago.
    today = datetime.datetime.now()
    before_days = today + datetime.timedelta(-self.inputtime)
    before_arr = str(before_days).split('.')
    before_time = before_arr[0]
    urllist = []
    len_hrefs = len(hrefs)
    number = 0
    for index in publicTimes[:len_hrefs]:
        # Check whether the title matches the keyword.
        flg = Common.checktitle(titlePatten, str(titles[number]))
        # Keep the video if it was published within the window and the title matched.
        if index > before_time and flg:
            url = hrefs[number]
            urllist.append(url)
        number = number + 1
    # Store the final URL list.
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)

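# Illustrative sketch (not part of the spider): pageprocess compares publish
# times as plain strings, which works because 'YYYY-MM-DD HH:MM:SS' strings sort
# the same way as the datetimes they represent. The helper name is hypothetical;
# the cutoff is derived exactly as above, now() minus the window in days, with
# microseconds stripped.
import datetime

def _published_after_cutoff(pubtime_str, window_days):
    """True if a 'YYYY-MM-DD HH:MM:SS' timestamp lies within the last window_days."""
    cutoff = datetime.datetime.now() + datetime.timedelta(-window_days)
    cutoff_str = str(cutoff).split('.')[0]  # drop microseconds
    return pubtime_str > cutoff_str
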
def step1(self, params):
    # Read the query parameter for the first page.
    info = params.customized['query']
    xparser = XPathUtility(params.content)
    if not xparser.xpath('//*[@class="mytopic topiclisttr"]'):
        Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
        return
    pageList = xparser.getcomments('//span[@class="right"]/a')
    if len(pageList) == 1:
        pageTotal = 1
    else:
        pageTotal = pageList[len(pageList) - 2]
    if int(pageTotal) >= self.maxpages:
        pageTotal = self.maxpages
    # Build the query URL for every result page.
    querylist = []
    for page in range(1, int(pageTotal) + 1, 1):
        if page == 1:
            self.step2(params)
            continue
        url = hupuS2Query.HUPU_QUERY_TEMPLATE.format(q=info, pn=page)
        querylist.append(url)
    self.__storeqeuryurllist__(querylist, hupuS2Query.HUPU_S2QUERY_EACH_PAGE, {'query': info})

def geturlcomments(self, params):
    # Extract the individual comments.
    xparser = XPathUtility(params.content)
    comments_xpath = xparser.xpath('//*[@id="short_comment_content"]')
    if not comments_xpath:
        return
    # Extract the publish times.
    ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]')
    if len(comments_xpath) == len(ip_pubtimes_xpath):
        comments = []
        # Collect the comments.
        for index in range(0, len(comments_xpath), 1):
            cmti = CommentInfo()
            publicTime = ip_pubtimes_xpath[index]
            if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime):
                publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime)[0]
            if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime):
                publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime)[0]
            if URLStorage.storeupdatetime(params.originalurl, getuniformtime(publicTime)):
                # Keep only comments newer than the stored update time.
                cmti.content = comments_xpath[index].text
                comments.append(cmti)

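# Illustrative sketch (not part of the spider): the two timestamp fix-ups in
# geturlcomments above, using the standard re module instead of the self.r
# helper. Two-digit years ('16-08-01 12:30') get a '20' century prefix;
# slash-separated timestamps are extracted as-is and left to getuniformtime to
# normalize. The helper name is hypothetical and the early return is a
# simplification of the two sequential if-blocks.
import re

def _normalize_pubtime(raw):
    m = re.search(r'\d{2}-\d+-\d+ \d+:\d+', raw)
    if m:
        return '20' + m.group(0)
    m = re.search(r'\d+/\d+/\d+ \d+:\d+:\d+', raw)
    if m:
        return m.group(0)
    return raw

# Example: _normalize_pubtime('16-08-01 12:30') -> '2016-08-01 12:30'
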
def step1(self, params):
    Logger.getlogging().info("DmOneTwoThreeNewsComments.STEP_1")
    id = self.r.parse(r'^http://www.dm123.cn/.*/(\d+).html', params.originalurl)[0]
    xparser = XPathUtility(params.content)
    classid = xparser.xpath("//input[@id='classid']/@value")[0]
    # 1. Build the first comment-page URL from the original URL.
    commentinfo_url = Dm123NewsComments.COMMENT_URL.format(page=0, classid=classid, id=id)
    # Forum: queue the first comment page.
    self.storeurl(commentinfo_url, params.originalurl, Dm123NewsComments.STEP_2,
                  {'classid': classid, 'id': id})

def getkurlcomments(self, params):
    xparser = XPathUtility(params.content)
    # Extract the comment bodies.
    comments_xpath = xparser.xpath('//*[@class="page-pl-list-text"]')
    # Extract the comment publish times.
    pubtime_xpath = xparser.xpath('//*[@class="page-pl-user-timer"]')
    if len(comments_xpath) >= len(pubtime_xpath):
        start = len(comments_xpath) - len(pubtime_xpath)
        comments = []
        for index in range(start, len(comments_xpath), 1):
            if URLStorage.storeupdatetime(
                    params.originalurl,
                    getuniformtime(pubtime_xpath[index].text)):
                cmti = CommentInfo()
                cmti.content = comments_xpath[index].text
                comments.append(cmti)
        # Store the collected comments.
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)

def geturlcomments(self, params):
    # Extract the individual comments.
    xparser = XPathUtility(params.content)
    comments_xpath = xparser.xpath('//*[contains(@id, "cm_")]')
    if not comments_xpath:
        return
    # Extract the publish times.
    ip_pubtimes_xpath = xparser.getlist('//*[contains(@id,"CList___CommentList_UserLink_")]/..')
    if len(comments_xpath) == len(ip_pubtimes_xpath):
        comments = []
        # Collect the comments.
        for index in range(0, len(comments_xpath), 1):
            cmti = CommentInfo()
            if URLStorage.storeupdatetime(params.originalurl, getuniformtime(ip_pubtimes_xpath[index])):
                # Keep only comments newer than the stored update time.
                cmti.content = comments_xpath[index].text
                comments.append(cmti)
        # Store the collected comments.
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)

def process(self, params):
    try:
        if params.step is None:
            # Extract the parameter needed to build the comment URL from the original URL.
            if not self.r.search(
                    r'^http[s]{0,1}://www\.fun\.tv/vplay/\w-(\d+)(\.\w-\d+)?/$',
                    params.originalurl):
                return
            galleryid = self.r.parse(
                r'^http[s]{0,1}://www\.fun\.tv/vplay/\w-(\d+)(\.\w-\d+)?/$',
                params.originalurl)[0][0]
            # Build the URL of the first comment page.
            comments_url = FunComments.COMMENTS_URL % (galleryid, 1)
            # Ask the download platform to fetch the first page of comments.
            self.storeurl(comments_url, params.originalurl, FunComments.STEP_2,
                          {'galleryid': galleryid})
            # The click count is read straight from the page; only TV-series
            # collection pages need averaging over their episode list.
            xhtml = XPathUtility(params.content)
            torrent_panel = xhtml.xpath('//*[@class="torrent-panel"]')
            if torrent_panel:
                lis = xhtml.xpath('//*[@class="torrent-panel"]/ul/li')
                if len(lis) == 0:
                    return
                numobj = xhtml.xpath(
                    '//*[@class="playInfo crumbs"]/div/a[@class="exp-num"]')
                if numobj:
                    clicknum = self.str2num(numobj[0].text)
                    new_clicknum = int(clicknum) / len(lis)
                    NewsStorage.setclicknum(params.originalurl, new_clicknum)
        # Parse the first page of comments and queue the remaining comment pages.
        elif params.step == FunComments.STEP_2:
            galleryid = params.customized['galleryid']
            # Parse the JSON response.
            comments = json.loads(params.content)
            # Compare the comment count from the last crawl with the current total.
            curcmtnum = int(comments['data']['total_num'])
            NewsStorage.setcmtnum(params.originalurl, curcmtnum)
            dbcmtnum = CMTStorage.getcount(params.originalurl, True)
            if dbcmtnum >= curcmtnum:
                return
            # Queue one URL per remaining comment page.
            pages = int(
                math.ceil(float(curcmtnum - dbcmtnum) / self.PAGE_SIZE))
            if pages >= self.maxpages:
                pages = self.maxpages
            for page in range(1, pages + 1, 1):
                if page == 1:
                    self.step3(params)
                    continue
                commentUrl = FunComments.COMMENTS_URL % (galleryid, page)
                self.storeurl(commentUrl, params.originalurl, FunComments.STEP_3,
                              {'galleryid': galleryid})
        # Parse the comment data.
        elif params.step == FunComments.STEP_3:
            self.step3(params)
    except:
        Logger.printexception()

def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step is Xie17NewsComments.STEP_1:
            # Step 1: from the document URL, derive the parameter for the first comment-page URL.
            articleId = self.r.parse(r'^http://xiaoshuo\.17xie\.com/book/(\d+)/', params.originalurl)[0]
            # Queue the first comment page.
            comments_url = Xie17NewsComments.COMMENT_URL % (articleId, 1)
            self.storeurl(comments_url, params.originalurl, Xie17NewsComments.STEP_2,
                          {'articleId': articleId})
        elif params.step == Xie17NewsComments.STEP_2:
            # Step 2: read the total comment count and queue every comment page.
            articleId = params.customized['articleId']
            # Total number of comments.
            comment_count = float(self.r.parse(ur'共(\d+)人说过', params.content)[0])
            if comment_count == 0:
                return
            # Incremental check: skip if no new comments since the last crawl.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= comment_count:
                return
            URLStorage.setcmtnum(params.originalurl, comment_count)
            # Number of comment pages.
            page_count = int(math.ceil(comment_count / Xie17NewsComments.PAGE_SIZE))
            # Queue the URL of every comment page.
            for page in range(1, page_count + 1, 1):
                url = Xie17NewsComments.COMMENT_URL % (articleId, page)
                self.storeurl(url, params.originalurl, Xie17NewsComments.STEP_3)
        elif params.step == Xie17NewsComments.STEP_3:
            # Step 3: extract the comments from the pages queued in Step 2.
            Logger.getlogging().info("params.step == 3")
            xparser = XPathUtility(params.content)
            # All comment bodies (each comment spans three dd nodes).
            comments = xparser.getcomments('/html/body/ul/li[2]/dl/dd')
            # All comment times.
            commenttimes = xparser.xpath('/html/body/ul/li[2]/dl/dt/text()')
            commentsInfo = []
            for index in range(0, int(len(commenttimes)), 1):
                # Normalize the comment time.
                if self.r.search(ur'\d+年\d+月', commenttimes[index].strip()):
                    tm = TimeUtility.getuniformtime(str(commenttimes[index]).strip(), '%Y年%m月')
                else:
                    tm = getuniformtime(commenttimes[index].strip())
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti = CommentInfo()
                    comment = comments[index * 3] + comments[index * 3 + 1] + comments[index * 3 + 2]
                    cmti.content = comment
                    commentsInfo.append(cmti)
            # Store the collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            pass
    except:
        Logger.printexception()
