def step2(self, params):
    """Filter search results by time window and title match, then store the url list."""
    q = params.customized['query']
    soup = BeautifulSoup(params.content, 'html5lib')
    divs = soup.select('.videobox')
    if not divs:
        Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_COMMNETS)
        return
    urllist = []
    for div in divs:
        title = div.select_one('.title').get_text()
        tm = getuniformtime(div.select_one('.date').get_text())
        url = div.select_one('.title > a').get('href')
        Logger.getlogging().debug(title)
        if not compareNow(tm, self.querylastdays):
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
            continue
        if not Common.checktitle(Common.urldec(q), title):
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            continue
        urllist.append(url)
    # Store the final url list.
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step2(self, params):
    """Parse one search result page and store the urls that match the query."""
    try:
        key = params.customized['key']
        soup = BeautifulSoup(params.content, 'html5lib')
        searchListOne = soup.select('.searchListOne > ul > li > div')
        if not searchListOne:
            Logger.getlogging().warning('{}:40000 No urllist'.format(
                params.originalurl))
            return
        # The last <li id="search_msg" style="display:none"> is a hidden
        # placeholder; drop it.
        lis = soup.select('.searchListOne > ul > li')[:-1]
        urllist = []
        for li in lis:
            url = li.select_one('h3 > a').get('href')
            tm = getuniformtime(li.select('.source > span')[0].get_text())
            title = li.select_one('h3').get_text()
            if Common.checktitle(Common.urldec(key), title):
                if compareNow(tm, self.querylastdays):
                    urllist.append(url)
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
    except:
        Logger.printexception()
        Logger.getlogging().error(
            'extract comment error from {site}'.format(site=params.url))
def pageprocess(self, params):
    # Step 3: from the returned html, take each result title via
    # xpath //*[@class="scout_anim_titletext"] and each result url via
    # //*[@class="scout_anim_title"]/div/a/@href.
    # The response is a JSONP payload, so strip the callback wrapper first.
    indexstart = params.content.find('(')
    indexstop = params.content.rfind(')')
    if indexstart > -1 and indexstop > -1:
        jsonvalue = params.content[indexstart + 1:indexstop]
        jsondata = json.loads(jsonvalue)
        info = params.customized['query']
        soup = BeautifulSoup(jsondata['content'], 'html5lib')
        uls = soup.select('.scout_anim_odd > .scout_anim_odd_ul')
        if uls:
            for ul in uls:
                titles = ul.select_one('.scout_anim_titletext').get_text()
                Logger.getlogging().debug(titles)
                if not Common.checktitle(info, titles):
                    return
                content = ul.select('.scout_anim_content > div > ul > li')
                if content:
                    # Keep only the three most recent episodes.
                    if len(content) > 3:
                        content = content[-3:]
                    urllist = [
                        'https://donghua.dmzj.com' + item.find('a').get('href')
                        for item in content
                    ]
                    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
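# pageprocess above unwraps a JSONP response by slicing between the first
# '(' and the last ')'. A minimal standalone sketch of that idiom (the
# callback name 'cb' and the sample payload are hypothetical):
import json

def unwrap_jsonp(text):
    # Strip the callback wrapper, e.g. cb({...}), and parse the JSON body.
    start = text.find('(')
    stop = text.rfind(')')
    if start == -1 or stop == -1 or stop <= start:
        raise ValueError('not a JSONP payload')
    return json.loads(text[start + 1:stop])

print(unwrap_jsonp('cb({"content": "<ul></ul>"})')['content'])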
def getsearchresult(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//li/h3/a/@href')
    titles = xpath.getlist('//li/h3/a')
    pubtimes = xpath.xpath('//li/p')
    today = datetime.datetime.strptime(
        TimeUtility.getcurrentdate(),
        TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(len(titles)):
        # Keep results whose title contains the query keyword.
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
            pubtime = datetime.datetime.strptime(
                pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # Keep results published within the query window.
            if interval.days <= self.querylastdays:
                urllist.append(hrefs[index])
            else:
                # Results are sorted by time: once one entry falls outside
                # the window, all later entries will too.
                break
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step2(self, params):
    info = Common.urldec(params.customized['info'])
    soup = BeautifulSoup(params.content, 'html5lib')
    text_divs = soup.select('.s_r_txt')
    urllist = []
    if text_divs:
        for item in text_divs:
            title = item.select_one('h3 > a').get_text()
            url = item.select_one('h3 > a').get('href')
            curtime = item.select('p')[-1].get_text().strip()
            try:
                if TimeUtility.compareNow(
                        TimeUtility.getuniformtime(curtime),
                        self.querylastdays):
                    if Common.checktitle(info, title):
                        urllist.append(url)
                    else:
                        Logger.log(url,
                                   constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                else:
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
            except:
                # If the publish time cannot be parsed, keep the url by default.
                urllist.append(url)
    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def getpagecomments(self, params):
    info = params.customized['query']
    xpath = XPathUtility(html=params.content)
    hrefs = xpath.xpath('//*[@class="sosResult"]/strong/a/@href')
    titles = xpath.getlist('//*[@class="sosResult"]/strong/a')
    pubtimes = xpath.xpath('//*[@class="sosResult"]/span/cite[3]')
    today = datetime.datetime.strptime(
        TimeUtility.getcurrentdate(),
        TimeUtility.DATE_FORMAT_DEFAULT).date()
    urllist = []
    for index in range(len(titles)):
        # Keep results whose title contains the query keyword.
        if Common.checktitle(info, titles[index]):
            pubtimestr = TimeUtility.getuniformtime(
                pubtimes[index].text).split(' ')[0]
            pubtime = datetime.datetime.strptime(
                pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
            interval = today - pubtime
            # Keep results published within the query window.
            if interval.days <= int(self.querylastdays):
                newurl = self.preprocess(hrefs[index])
                if newurl is not None:
                    urllist.append(newurl)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
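# getsearchresult and getpagecomments above implement the same recency
# check: parse the publish date and keep the result if it is at most
# querylastdays old. A self-contained equivalent using only the standard
# library (the '%Y-%m-%d' format is an assumption; TimeUtility's actual
# formats may differ):
import datetime

def within_last_days(pubdate_str, lastdays, fmt='%Y-%m-%d'):
    # True when the publish date falls inside the query window.
    pubdate = datetime.datetime.strptime(pubdate_str, fmt).date()
    return (datetime.date.today() - pubdate).days <= lastdays

print(within_last_days('2017-01-01', 7))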
def process(self, params):
    if params.step == S2Query.STEP_1:
        html = etree.HTML(params.content)
        results = html.xpath('//*[@id="results"]')
        if not results:
            return
        totalpage = html.xpath('//*[@id="div_3"]/*[@class="page"]/span/text()')
        if totalpage:
            totalpage = int(self.r.parse('(\d+)', totalpage[0].split('/')[-1])[0])
        else:
            Logger.getlogging().info('there are no results you want!')
            return
        if totalpage >= self.maxpages:
            totalpage = self.maxpages
        if totalpage != 0:
            urllist = []
            for page in range(0, totalpage):
                searchurl = S2Query.S2_URL % (page + 1, params.customized['key'])
                urllist.append(searchurl)
            self.__storeqeuryurllist__(urllist, S2Query.STEP_2,
                                       {'key': params.customized['key']})
    elif params.step == S2Query.STEP_2:
        comquerkey = Common.urldec(
            params.customized['key']).decode('gbk').encode('utf-8')
        soup = BeautifulSoup(params.content, 'html5lib')
        urllist = []
        divs = soup.find_all(attrs={'class': 'result f s0'})
        if not divs:
            return
        for div in divs:
            title = div.select_one('h3.c-title').get_text()
            title = ''.join(title.strip().split())
            url_tm = div.select_one('.c-showurl').get_text()
            tm = getuniformtime(url_tm.split('/')[-1])
            url = 'http://' + '/'.join(url_tm.split('/')[0:-1])
            Logger.getlogging().debug(title)
            if not Common.checktitle(comquerkey, title):
                Logger.getlogging().warning(
                    '{url}:40000 out of range, the title!'.format(
                        url=params.originalurl))
                continue
            if not compareNow(tm, self.querylastdays):
                Logger.getlogging().warning(
                    '{url}:40000 out of range, the time!'.format(
                        url=params.originalurl))
                continue
            urllist.append(url)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def pageprocess(self, params):
    xparser = XPathUtility(params.content)
    # Collect the hyperlinks on this page.
    hreflist = xparser.xpath('//h3/a/@href')
    hrefs = []
    for mid_url in hreflist:
        mid = self.preprocess(mid_url)
        if mid is not None:
            hrefs.append(mid)
    # Collect every publish time on this page.
    publicTimes = []
    for timeindex in xparser.xpath('//*[@class="scontent"]/text()[1]'):
        middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
        parts = middle.split(' ')
        publicTimes.append(parts[0] + ' ' + parts[1])
    # Collect every title on this page.
    titles = []
    for title in xparser.getlist('//h3'):
        titles.append(str(title).replace('\n', '').replace('\t', '').strip())
    # Decode the query keyword.
    KEY = Common.urldec(params.customized['KEY'])
    # Compute the cutoff timestamp (self.inputtime days ago).
    today = datetime.datetime.now()
    before_days = today + datetime.timedelta(-self.inputtime)
    before_time = str(before_days).split('.')[0]
    urllist = []
    len_hrefs = len(hrefs)
    number = 0
    for index in publicTimes[:len_hrefs]:
        # Keep the url when the title matches the query and the video was
        # published within the window. The string comparison is valid
        # because both timestamps are in ISO 'YYYY-MM-DD HH:MM:SS' order.
        flg = Common.checktitle(KEY, str(titles[number]))
        if index > before_time and flg:
            urllist.append(hrefs[number])
        number = number + 1
    # Store the final url list.
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def process(self, params):
    # From the first search result page, collect the urls whose title
    # matches the query.
    if params.step == kumiS2Query.KUMI_S2QUERY_FIRST_PAGE:
        # The query keyword passed in with the first-page url.
        titlePatten = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        boxs = soup.find_all(attrs={'class': re.compile('seaResultBox')})
        if boxs:
            urllist = []
            for box in boxs:
                title = box.select_one('.seaResultA > a').get_text()
                if not Common.checktitle(titlePatten, title):
                    continue
                url = box.select_one('.seaResultA > a').get('href')
                urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step3(self, params):
    """Build the url list for news-type results."""
    key = Common.urldec(params.customized['query'])
    soup = BeautifulSoup(params.content, 'html5lib')
    lis = soup.select('.wzlist > ul > li.wztitle')
    if lis:
        urllist = []
        for li in lis:
            title = li.select_one('a').get_text()
            if not Common.checktitle(key, title):
                continue
            pubtime = li.select_one('span').get_text()
            url = 'http://www.52tian.net' + li.select_one('a').get('href')
            if compareNow(getuniformtime(pubtime), self.querylastdays):
                urllist.append(url)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step2(self, params):
    """Collect result urls whose title matches the query."""
    try:
        key = params.customized['key']
        xhtml = etree.HTML(params.content)
        urls = xhtml.xpath('//*[@class="mod_book_cover db"]/@href')
        urls = ['http://ac.qq.com' + u for u in urls]
        titles = xhtml.xpath('//*[@class="mod_book_cover db"]/@title')
        urllist = []
        for t, u in zip(titles, urls):
            if Common.checktitle(Common.urldec(key), t):
                urllist.append(u)
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
    except:
        Logger.printexception()
def getpagecontents(self, params):
    info = params.customized['query']
    # Build the BeautifulSoup object.
    soup = BeautifulSoup(params.content, 'html.parser')
    # Find every <h3 class="c-title"> tag.
    results = soup.find_all('h3', 'c-title')
    urllist = []
    for result in results:
        # Take the anchor text. The search url already carries the sti time
        # parameter, so time is not checked here.
        title = result.find('a').get_text().strip()
        if Common.checktitle(info, title):
            # Take the href attribute of the anchor.
            href = result.find('a').get('href')
            urllist.append(href)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step2(self, params):
    key = params.customized['query']
    query = Common.urldec(key)
    soup = BeautifulSoup(params.content, 'html5lib')
    tbody = soup.select('.search_topic_list > form > table > tbody')
    lis = tbody[-1].select('tr')
    urllist = []
    for li in lis:
        url = li.select_one('.p_title > a').get('href')
        title = li.select_one('.p_title > a').get_text()
        curtime = li.select('td')[3].get_text()
        if TimeUtility.compareNow(TimeUtility.getuniformtime(curtime),
                                  self.querylastdays):
            if Common.checktitle(query, title):
                urllist.append(url)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
        else:
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step_last(self, params):
    urllist = []
    info = params.customized['query']
    soup = BeautifulSoup(params.content, 'html5lib')
    divs = soup.select('#results > .result')
    for div in divs:
        publish = div.select_one('.c-summary-1').get_text()
        title = div.select_one('h3 > a').get_text().strip()
        url = div.select_one('h3 > a').get('href').strip()
        if TimeUtility.compareNow(TimeUtility.getuniformtime(publish),
                                  self.querylastdays):
            if Common.checktitle(Common.trydecode(info),
                                 Common.trydecode(title)):
                urllist.append(url)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
        else:
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def getsearchresult(self, params):
    info = params.customized['query']
    soup = BeautifulSoup(params.content, 'html5lib')
    lis = soup.select('ul.ckl_cktpp > li.cfix')
    urllist = []
    if lis:
        for li in lis:
            title = li.select_one('h3').get_text()
            if not Common.checktitle(info, title):
                continue
            times = getuniformtime(li.select('p')[-2].get_text())
            url = li.select_one('h3 > a').get('href')
            if compareNow(times, self.querylastdays):
                urllist.append(url)
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
        return len(urllist)
    else:
        return -1
def step2(self, params):
    key = params.customized['key']
    query = Common.urldec(key)
    soup = BeautifulSoup(params.content, 'html5lib')
    lis = soup.select('.sresult > ul > li')
    urllist = []
    for li in lis:
        url = 'http://bbs.tgbus.com/' + li.select_one('.stitle > a').get('href')
        title = li.select_one('.stitle').get_text()
        curtime = li.select_one('.scontent').get_text()
        if TimeUtility.compareNow(TimeUtility.getuniformtime(curtime),
                                  self.querylastdays):
            if Common.checktitle(query, title):
                urllist.append(url)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
        else:
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def process(self, params):
    if params.step == KanKanS2Query.S2QUERY_FIRST_PAGE:
        # Step 2: read the pager (//p[@class="list-pager-v2"], second-to-last
        # link) from the returned html to get the max page number, then build
        # one search url per page.
        info = params.customized['query']
        limit = params.customized['limit']
        html = etree.HTML(params.content)
        if not html.xpath('//*[@id="video_box"]'):
            return
        nodes = html.xpath('//p[@class="list-pager-v2"]/a[last()-1]')
        # No pager means a single result page.
        if len(nodes) == 0:
            page_count = 1
        else:
            page_count = int(nodes[0].text)
        # Build every search result url (latest week) from page_count.
        querylist = []
        if page_count > 0:
            for page in range(1, page_count + 1):
                url = KanKanS2Query.KANKAN_QUERY_TEMPLATE.format(
                    keyword=info, page=page, limit=limit)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist,
                                       KanKanS2Query.S2QUERY_EACH_PAGE,
                                       {'query': info})
    elif params.step == KanKanS2Query.S2QUERY_EACH_PAGE:
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'lxml')
        results = soup.find_all('p', 'title')
        urllist = []
        for result in results:
            title = result.find('a').get('title')
            if Common.checktitle(info, title):
                # The search url already carries limit=7 (one week), so time
                # is not checked here.
                href = result.find('a').get('href')
                urllist.append(href)
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step2(self, params):
    """Filter video results by publish time and title match."""
    info = Common.urldec(params.customized['query'])
    soup = BeautifulSoup(params.content, 'html5lib')
    videos = soup.select('.uiVideo > .uiVideo__item')
    if videos:
        urllist = []
        for video in videos:
            title = video.select_one('h3 > a').get('title')
            pubtime = video.select('.result__data > span')[-1].get_text()
            url = video.select_one('h3 > a').get('href')
            if compareNow(getuniformtime(pubtime), self.querylastdays):
                if Common.checktitle(info, title):
                    urllist.append(url)
                else:
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def step2(self, params):
    """Parse one search result page."""
    info = params.customized['info']
    soup = BeautifulSoup(params.content, 'html5lib')
    divs = soup.select('.pbw')
    if not divs:
        return
    urllist = []
    for div in divs:
        tm = TimeUtility.getuniformtime(div.select('p > span')[0].get_text())
        geturl = div.select_one('h3.xs3 > a').get('href')
        title = div.select_one('h3.xs3 > a').get_text()
        # Relative links: prepend the site root taken from params.url.
        if not re.search('http://.*com.*', geturl):
            if re.search('(http://.*com).*', params.url):
                urltemp = re.findall('(http://.*com).*', params.url)[0]
            elif re.search('(http://.*cn).*', params.url):
                urltemp = re.findall('(http://.*cn).*', params.url)[0]
            elif re.search('(http://.*net).*', params.url):
                urltemp = re.findall('(http://.*net).*', params.url)[0]
            geturl = urltemp + '/' + geturl
        # Strip the &highlight search marker from the url.
        if re.search('(http.*)&highlight', geturl):
            geturl = re.findall('(http.*)&highlight', geturl)[0]
        Logger.getlogging().info(Common.trydecode(title))
        # Compare the publish time and match the title.
        if not TimeUtility.compareNow(tm, self.querylastdays):
            Logger.log(geturl, constant.ERRORCODE_WARNNING_NOMATCHTIME)
            continue
        if not Common.checktitle(Common.trydecode(info),
                                 Common.trydecode(title)):
            Logger.log(geturl, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            continue
        urllist.append(geturl)
    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
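# The domain extraction in step2 above only recognizes .com/.cn/.net hosts,
# and urltemp stays unbound for any other TLD. The standard library handles
# the same relative-to-absolute conversion for every host; a sketch (the
# urls below are hypothetical):
from urlparse import urljoin  # urllib.parse.urljoin on Python 3

base = 'http://bbs.example.com/search.php?mod=forum'
relative = 'forum.php?mod=viewthread&tid=123'
print(urljoin(base, relative))
# -> http://bbs.example.com/forum.php?mod=viewthread&tid=123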
def process(self, params):
    # From the result count on the first search page, build the paged
    # search urls.
    if params.step == SinaS2Query.SINA_S2QUERY_FIRST_PAGE:
        jsdata = json.loads(params.content)
        count = int(jsdata['total'])
        querylist = []
        if count > 0:
            pages = int(math.ceil(float(count) / SinaS2Query.DEFAULT_PAGE_SIZE))
            for page in range(1, pages + 1):
                url = SinaS2Query.SINA_QUERY_TEMPLATE.format(
                    q=Common.urlenc(params.customized['query']),
                    ps=page,
                    pn=SinaS2Query.DEFAULT_PAGE_SIZE,
                    pf=self.querylastdays)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist,
                                       SinaS2Query.SINA_S2QUERY_EACH_PAGE,
                                       {'query': params.customized['query']})
    # From each result page, collect the video urls.
    elif params.step == SinaS2Query.SINA_S2QUERY_EACH_PAGE:
        jsdata = json.loads(params.content)
        searchlist = jsdata['list']
        urllist = []
        try:
            for search in searchlist:
                title = search['videoname']
                # Strip any embedded html markup from the title before matching.
                if re.search('<.*>[\s\S]*?</.*>', title):
                    soup = BeautifulSoup(title, 'html5lib')
                    title = soup.get_text()
                title = ''.join(title.strip().split())
                Logger.getlogging().debug(title)
                if Common.checktitle(params.customized['query'], title):
                    urllist.append(search['url'])
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
        except:
            Logger.printexception()
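# SinaS2Query and several classes below turn a total result count into a
# page count with math.ceil(float(total) / PAGE_SIZE). The float() cast
# matters on Python 2, where 41 / 20 would truncate before the ceil. A
# worked sketch of the idiom:
import math

def page_count(total, page_size=20):
    return int(math.ceil(float(total) / page_size))

assert page_count(41) == 3   # 41 results -> pages 1..3
assert page_count(40) == 2   # exact multiple -> no extra page
assert page_count(0) == 0    # no results -> no pages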
def step2(self, params):
    """Parse the forum search result page."""
    query = params.customized['query']
    soup = BeautifulSoup(params.content, 'html.parser')
    trs = soup.select('#schend')
    if not trs:
        return
    urllist = []
    for tr in trs:
        title = tr.select_one('.sb14b').get_text()
        content = etree.HTML(str(tr))
        publicTimes = content.xpath(
            '//*[@id="schend"]/table[1]/tr/td[3]/text()')[-1].strip()
        href = tr.select_one('.sb14b').get('href')
        id = re.findall('id=(\d+)&', href)[0]
        url = 'http://forum.home.news.cn/detail/' + id + '/1.html'
        if not compareNow(getuniformtime(publicTimes), self.querylastdays):
            continue
        if not Common.checktitle(Common.trydecode(query),
                                 Common.trydecode(title)):
            continue
        urllist.append(url)
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
def step2(self, params):
    """Collect video urls whose title matches the query."""
    info = params.customized['query']
    soup = BeautifulSoup(params.content, 'html5lib')
    if soup.select_one('.sk_null > .sorry'):
        Logger.getlogging().warning('{0}:40000 No urllist!'.format(params.url))
        return
    html = etree.HTML(params.content)
    hrefs = html.xpath('//*[@class="v-link"]/a/@href')
    titles = html.xpath('//*[@class="v-link"]/a/@title')
    urllist = []
    for index in range(len(titles)):
        if titles[index]:
            Logger.getlogging().debug(titles[index])
            # Match the title against the query.
            if not Common.checktitle(Common.urldec(info), titles[index]):
                Logger.getlogging().debug(
                    '{url}:40000 checktitle, title out of range'.format(
                        url=hrefs[index]))
                continue
            urllist.append('http:' + hrefs[index])
    if len(urllist) > 0:
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def pageprocess(self, params):
    # Step 3: from the json returned by step 2, read for each entry
    #   title:        jsondata['data'][i]['title']
    #   link:         jsondata['data'][i]['url']
    #   publish time: jsondata['data'][i]['modifydate'] (only the first 10
    #   characters, i.e. the date part, are comparable)
    info = params.customized['query']
    jsondata = json.loads(params.content)
    searchresult = jsondata['data']
    urllist = []
    for result in searchresult:
        title = result['title']
        url = result['url']
        pubtime = result['modifydate']
        if compareNow(getuniformtime(pubtime), self.querylastdays):
            if Common.checktitle(info, title):
                urllist.append(self.MAIN_DOMAIN + url)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
        else:
            Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def processVideo(self, params):
    if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
        # Step 2: the returned json carries the total video count in
        # jsondata['totalnums']; one response holds 20 entries, so the page
        # count is the total divided by 20, rounded up.
        info = params.customized['query']
        keyvalue = Common.urlenc(info)
        try:
            jsondata = json.loads(params.content)
            comments_count = jsondata['totalnums']
        except:
            Logger.getlogging().warning('{}:40000'.format(params.url))
            return
        # No results: nothing to do.
        if int(comments_count) == 0:
            return
        page_count = int(
            math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
        # Build every search result url (latest week) from page_count.
        querylist = []
        if page_count > 0:
            for page in range(1, page_count + 1):
                url = MofangS2Query.QUERY_TEMPLATE.format(
                    key=keyvalue, pageno=page, pagesize=self.DEFAULT_PAGE_SIZE)
                Logger.getlogging().debug(url)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist,
                                       MofangS2Query.S2QUERY_EACH_PAGE,
                                       {'query': info})
    elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
        # Step 3: from the json returned by step 2, read for each entry
        #   title:        jsondata['data'][i]['title']
        #   link:         jsondata['data'][i]['url']
        #   publish time: jsondata['data'][i]['inputtime'] (only the first
        #   10 characters, i.e. the date part, are comparable)
        info = params.customized['query']
        try:
            jsondata = json.loads(params.content)
            searchresult = jsondata['data']
        except:
            Logger.getlogging().warning('{}:40000'.format(params.url))
            return
        urllist = []
        for result in searchresult:
            if result['title'] is not None:
                # Keep the url when the title contains the query keyword.
                if Common.checktitle(info, result['title']):
                    if result['inputtime'] is not None:
                        pubtime = getuniformtime(str(result['inputtime']))
                        if compareNow(pubtime, int(self.querylastdays)):
                            urllist.append(result['url'])
                    else:
                        # No publish time available: assume it falls inside
                        # the query window.
                        urllist.append(result['url'])
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def checktitle(self, query, title):
    return Common.checktitle(query, title)
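# Common.checktitle is project-internal; every call site above uses it as
# "does the query match this result title". A minimal stand-in under that
# assumption (hypothetical, not the project's implementation, which may do
# fuzzier matching):
def checktitle_sketch(query, title):
    # Assumed semantics: substring match after whitespace normalization
    # and case folding; the real Common.checktitle may differ.
    q = ''.join(query.split()).lower()
    t = ''.join(title.split()).lower()
    return q in t

print(checktitle_sketch('deep learning', 'Deep  Learning weekly digest'))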
def step3(self, params):
    """Parse each result page."""
    try:
        key = Common.urldec(params.customized['key'])
        soup = BeautifulSoup(params.content, 'html.parser')
        urllist = []
        # 1. Decide whether a result is a collection or a single video.
        # 2. Decide its kind from <span class="type">: TV series, variety
        #    show, or movie.
        # 3. Decide how many episodes to take.
        # Collections use class "result_episode_list cf" (TV series) or
        # "result_link_list cf" (variety shows).
        items_v = soup.find_all(attrs={'class': re.compile('result_item_v')})
        if items_v:
            for item in items_v:
                flag = None
                title = item.select_one('h2.result_title > a')
                if not Common.checktitle(key, title.get_text()):
                    continue
                # TV series (flag 'tv'): the episode list must be reversed so
                # episodes run from newest to oldest before trailers are removed.
                videoObj = None
                tvObj = item.select_one('.result_episode_list')
                if tvObj:
                    flag = 'tv'
                    videoObj = tvObj
                # Variety show (flag 'vv').
                vvObj = item.select_one('.result_link_list')
                if vvObj:
                    flag = 'vv'
                    videoObj = vvObj
                # Handle flag 'tv' and flag 'vv'.
                if videoObj:
                    # Order the episode urls and keep only absolute links.
                    tvs = videoObj.select('div.item > a')
                    if flag == 'tv':
                        tvs = reversed(tvs)
                    tvs = [tv for tv in tvs
                           if re.findall('^http[s]{0,1}://', tv.get('href'))]
                    # Count the trailer entries (marked '预告').
                    imgmarkObjs = videoObj.select('.item > .mark_v > img')
                    altlen = 0
                    if imgmarkObjs:
                        for i in range(len(imgmarkObjs)):
                            alt = imgmarkObjs[i].get('alt')
                            if re.search('预告', alt) or re.search(u'预告', alt):
                                altlen += 1
                    # Skip the trailers and keep the next three episodes.
                    urlitems = tvs[altlen:altlen + 3]
                    for urlitem in urlitems:
                        url = urlitem.get('href')
                        if re.search('http://v.qq.com/x/page/\w+\.html', url):
                            url = url.replace('http://', 'https://')
                            urllist.append(url)
                    continue
                # Otherwise check for a movie (flag 'mv'): trailers and clips
                # are skipped, only the main link is kept.
                otherObj = item.select_one('h2.result_title > a')
                if otherObj:
                    if re.search('电影', otherObj.get_text()) or re.search(
                            u'电影', otherObj.get_text()):
                        flag = 'mv'
                        url = otherObj.get('href')
                        if re.search('http://v.qq.com/x/page/\w+\.html', url):
                            url = url.replace('http://', 'https://')
                            urllist.append(url)
                        continue
        # Single videos.
        items_h = soup.find_all(attrs={'class': re.compile('result_item_h')})
        if items_h:
            for item in items_h:
                title = item.select_one('h2.result_title > a')
                if not Common.checktitle(key, title.get_text()):
                    continue
                url = item.select_one('h2.result_title > a').get('href')
                if re.search('^http[s]{0,1}://', url):
                    if re.search('http://v.qq.com/x/page/\w+\.html', url):
                        url = url.replace('http://', 'https://')
                    urllist.append(url)
        # Series.
        items_series = soup.find_all(attrs={'class': re.compile('result_series')})
        if items_series:
            for item in items_series:
                title = item.select_one('.figure_title > a')
                if not Common.checktitle(key, title.get_text()):
                    continue
                for urlitem in item.select('.list_item > a'):
                    url = urlitem.get('href')
                    if re.search('http://v.qq.com/x/page/\w+\.html', url):
                        url = url.replace('http://', 'https://')
                        urllist.append(url)
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    except:
        Logger.printexception()
def process(self, params):
    if params.step == Cine107S2Query.S2QUERY_FIRST_PAGE:
        # Step 2: read the total result count from the returned page.
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        results = soup.select('#results')[0]
        # The page shows '抱歉' ("sorry") when there are no results.
        if results.get_text().find('抱歉') > -1:
            return
        resultStr = results.select('span.support-text-top')[0].get_text().strip()
        # Strip the leading text and the trailing '个' to isolate the number.
        resultStr = resultStr[8:resultStr.index('个')]
        result_counts = int(resultStr.replace(',', ''))
        Logger.getlogging().debug(result_counts)
        # Only pages 0..74 of the results are viewable; cap at 75 pages.
        if result_counts > 750:
            result_counts = 750
        # Compute the number of result pages to fetch.
        if result_counts < 10:
            page_count = 0
        else:
            page_count = int(math.ceil(float(result_counts) /
                                       Cine107S2Query.DEFAULT_PAGE_SIZE))
        # Build every search result url from page_count.
        querylist = []
        for page in range(0, page_count):
            url = Cine107S2Query.QUERY_TEMPLATE.format(key=info, pageno=page)
            querylist.append(url)
        self.__storeqeuryurllist__(querylist,
                                   Cine107S2Query.S2QUERY_EACH_PAGE,
                                   {'query': info})
    elif params.step == Cine107S2Query.S2QUERY_EACH_PAGE:
        # Step 3: collect the matching result urls from each page.
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html.parser')
        titles = soup.select('h3.c-title')
        times = soup.select('span.c-showurl')
        urllist = []
        index = 0
        for result in titles:
            title = result.get_text().strip()
            nodeUrl = result.select('a')[0].get('href')
            timeStr = times[index].get_text().strip()
            if timeStr.find('html') > -1:
                timeStr = timeStr[timeStr.index('html') + 5:]
            elif timeStr.find('...') > -1:
                timeStr = timeStr[timeStr.index('...') + 4:]
            # Keep posts whose title contains the query keyword and whose
            # date falls within the window.
            if self.r.search(ur'(\d+-\d+-\d+)', timeStr):
                timeStr = self.r.parse(ur'(\d+-\d+-\d+)', timeStr)[0]
                if compareNow(getuniformtime(timeStr), self.querylastdays):
                    if Common.checktitle(Common.urldec(info), title):
                        urllist.append(nodeUrl)
            index += 1
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
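# Cine107S2Query above and HuyaS2Query below both parse a human-formatted
# result count such as '1,234' out of surrounding text. A standalone sketch
# of that parsing (the sample string is hypothetical):
import re

def parse_count(s):
    # Take the first run of digits-and-commas and drop the commas.
    return int(re.findall(r'[\d,]+', s)[0].replace(',', ''))

assert parse_count('found 1,234 results') == 1234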
def process(self, params):
    if params.step == HuyaS2Query.S2QUERY_FIRST_PAGE:
        Logger.getlogging().debug('HuyaS2Query.S2QUERY_FIRST_PAGE')
        # Step 2: read the total result count from the returned html, compute
        # the page count (total / 20, rounded up), then build the search
        # result urls.
        soup = BeautifulSoup(params.content, 'html5lib')
        if soup.select('.search-no-data-wrap'):
            return
        totalstr = soup.select_one('.search-list > .mod-tab-hd > .act')
        if not totalstr:
            return
        # Total result count, e.g. 160.
        totalstr = totalstr.get_text().replace(',', '')
        count = int(re.findall('\d+', totalstr)[0])
        # Build every search result url from the count, capped at maxpages.
        info = params.customized['query']
        keyvalue = Common.urlenc(info)
        querylist = []
        pages = int(math.ceil(float(count) / self.DEFAULT_PAGE_SIZE))
        if pages >= self.maxpages:
            pages = self.maxpages
        for page in range(1, pages + 1):
            url = HuyaS2Query.QUERY_TEMPLATE.format(pageno=page, key=keyvalue)
            Logger.getlogging().debug(url)
            querylist.append(url)
        self.__storeqeuryurllist__(querylist, HuyaS2Query.S2QUERY_EACH_PAGE,
                                   {'query': info})
    elif params.step == HuyaS2Query.S2QUERY_EACH_PAGE:
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        if soup.select('.search-no-data-wrap'):
            return
        divs = soup.select('ul.video-list')
        if divs:
            divs = divs[-1].select('li')
        if not divs:
            return
        urllist = []
        for div in divs:
            video = div.select_one('.video-title > .video-wrap')
            times = getuniformtime(div.select_one('.result-data').get_text())
            titles = video.get('title')
            url = video.get('href')
            if compareNow(times, self.querylastdays) and Common.checktitle(
                    info, titles):
                Logger.getlogging().debug(titles)
                Logger.getlogging().debug(url)
                urllist.append(url)
            else:
                Logger.getlogging().debug(
                    titles + ' not match title or out of time')
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
def process(self, params):
    urllist = []
    if params.step == ZolBbsS2Query.S2QUERY_FIRST_PAGE:
        # Step 2: read the total result count from the returned html and
        # build one search url per page (latest week).
        info = params.customized['info']
        keyvalue = params.customized['query']
        cdate = params.customized['cdate']
        html = etree.HTML(params.content)
        totalstr = html.xpath('//span[@class="search-title"]/text()')
        totalcount = int(re.sub('\D', '', totalstr[1]))
        # No results: nothing to do.
        if totalcount == 0:
            return
        querylist = []
        pages = int(math.ceil(float(totalcount) / self.DEFAULT_PAGE_SIZE))
        for page in range(1, pages + 1):
            url = ZolBbsS2Query.ZOLBBS_QUERY_TEMPLATE.format(
                kword=keyvalue, cdate=cdate, page=page)
            querylist.append(url)
        self.__storeqeuryurllist__(querylist, ZolBbsS2Query.S2QUERY_EACH_PAGE,
                                   {'query': keyvalue, 'info': info})
    elif params.step == ZolBbsS2Query.S2QUERY_EACH_PAGE:
        info = params.customized['info']
        soup = BeautifulSoup(params.content, 'lxml')
        results = soup.find_all('ul', 'results-list')
        for result in results:
            restr = str(result.find('a', ''))
            value = self.r.parse(
                '<a href="(.*)" target="_blank" title="(.*)">.*</a>', restr)[0]
            if value and value[1] is not None:
                title = value[1].strip()
                if Common.checktitle(Common.trydecode(info),
                                     Common.trydecode(title)):
                    href = value[0]
                    # Absolute url, e.g. http://bbs.zol.com.cn/sjbbs/d34130_156664.htm
                    if self.r.parse(r'^http://.*', href):
                        urllist.append(href)
                    # Relative url such as /quanzi/d643_841594.html: prepend
                    # http://bbs.zol.com.cn.
                    elif self.r.parse(r'^/.*', href):
                        urllist.append(self.MAIN_URL + href)
            else:
                return
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)