def stripy_article_list(self, section_name, page_num):
    # Assumes module-level imports: re, urllib, chardet, requests.
    try:
        self.cur_page = page_num
        article_list = []
        if page_num == 0:
            url = self.section_url_map[section_name]
        else:
            # Rewrite the URL tail as '<page>.html' for paginated listings.
            url = self.section_url_map[section_name][0:-6] + str(self.cur_page) + '.html'
        contentHtml = self.session.get(url, stream=True)
        encoding = chardet.detect(contentHtml.content)['encoding']
        if contentHtml.status_code == requests.codes.ok:
            pattern = r'<td width="80%" class="align_L"><a ><A href=\'(.*?)\'.*?>(.*?)</A></a></td>\s.*?<td.*?>(.*?)</td>'
            for mtFind in re.finditer(pattern, contentHtml.content, re.S):
                if mtFind.groups()[0][0:4] == "http":
                    article_url = mtFind.groups()[0]
                else:
                    # Rebuild an absolute URL from the section URL's scheme and host.
                    proto, rest = urllib.splittype(self.section_url_map[section_name])
                    article_url = proto + "://" + urllib.splithost(rest)[0] + "/" + mtFind.groups()[0][1:]
                public_time = self.strip_tags(mtFind.groups()[2])
                title = mtFind.groups()[1].decode(encoding)
                item = article_item(article_url, title, public_time)
                item.set_section_name(section_name)
                article_list.append(item)
        else:
            self.logger.error(u'Failed to fetch article list, page ' + str(page_num))
        return article_list
    except BaseException, e:
        self.logger.error(str(e))
        return []  # keep the return type consistent on failure
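# A minimal sketch of a safer way to absolutise the relative hrefs matched
# above, assuming the Python 2 stdlib urlparse module is available; urljoin
# handles './', '../' and missing-slash cases that the manual
# splittype/splithost concatenation does not. _resolve_article_url is a
# hypothetical helper, not part of the original spider.
def _resolve_article_url(base_url, href):
    import urlparse  # renamed urllib.parse in Python 3
    if href[0:4] == "http":
        return href  # already absolute
    return urlparse.urljoin(base_url, href)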
def stripy_article_list(self, section_name, page_num):
    self.cur_page = page_num
    article_list = []
    if page_num == 0:
        url = self.section_url_map[section_name]
    else:
        # Rewrite the URL tail as '_<page>.html' for paginated listings.
        url = self.section_url_map[section_name][0:-7] + '_' + str(self.cur_page) + '.html'
    contentHtml = self.session.get(url, stream=True)
    if contentHtml.status_code == requests.codes.ok:
        pattern = r'[^>]<a href="(.*?)" class="title" target="_blank">(.*?)</a>\s.*?<span\s.*?\[(.*?)\]'
        for mtFind in re.finditer(pattern, contentHtml.content, re.S):
            if mtFind.groups()[0][0:4] == "http":
                article_url = mtFind.groups()[0]
            else:
                # The first 29 characters of the section URL are taken as the
                # site root when resolving relative links.
                article_url = '%s%s' % (self.section_url_map[section_name][0:29], mtFind.groups()[0][1:])
            public_time = mtFind.groups()[2]
            title = mtFind.groups()[1]
            title = title.decode('gbk').strip()
            title = self.strip_tags(title)
            item = article_item(article_url, title, public_time)
            item.set_section_name(section_name)
            article_list.append(item)
    else:
        self.logger.error(u'Failed to fetch article list, page ' + str(page_num))
    return article_list
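# A minimal sketch of converting the captured public_time string into a
# time.struct_time, e.g. to filter articles by year as the spider's
# commented-out code once did; the '%Y-%m-%d' format is an assumption about
# the site's date markup. _parse_public_time is a hypothetical helper, not
# part of the original spider.
def _parse_public_time(public_time):
    import time
    return time.strptime(public_time, '%Y-%m-%d')  # e.g. '2015-03-08'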