Example #1
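This variant fetches one page of a section's article list: page 0 uses the section URL as-is, later pages rewrite the trailing page segment of that URL, and each table row matched by the regex becomes an article_item.
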
    def stripy_article_list(self, section_name, page_num):
        try:
            self.cur_page = page_num
            article_list = []
            if page_num == 0:
                url = self.section_url_map[section_name]
            else:
                # Rebuild the page URL by swapping the trailing 'N.html' for the requested page.
                url = self.section_url_map[section_name][0:-6] + str(self.cur_page) + '.html'
            self.logger.debug(url)

            contentHtml = self.session.get(url, stream=True)
            # Guess the page encoding so the matched title bytes can be decoded below.
            encoding = chardet.detect(contentHtml.content)['encoding']

            if contentHtml.status_code == requests.codes.ok:
                pattern = r'<td width="80%"  class="align_L"><a ><A href=\'(.*?)\'.*?>(.*?)</A></a></td>\s.*?<td.*?>(.*?)</td>'
                for mtFind in re.finditer(pattern, contentHtml.content, re.S):
                    if mtFind.groups()[0].startswith("http"):
                        article_url = mtFind.groups()[0]
                    else:
                        # Relative link: resolve it against the section URL's scheme and host.
                        proto, rest = urllib.splittype(self.section_url_map[section_name])
                        article_url = proto + "://" + urllib.splithost(rest)[0] + "/" + mtFind.groups()[0][1:]


                    public_time = self.strip_tags(mtFind.groups()[2])

                    title = mtFind.groups()[1].decode(encoding)


                    item = article_item(article_url, title, public_time)
                    item.set_section_name(section_name)
                    article_list.append(item)
            else:
                self.logger.error(u'Failed to fetch the article list for page ' + str(page_num))
            return article_list
        except Exception as e:
            self.logger.error(str(e))
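
Both examples lean on an article_item class and a strip_tags helper that are not shown. A minimal, self-contained sketch of what those might look like (the names match the calls above; the real implementations may well differ):

    import re

    class article_item(object):
        # Minimal stand-in: just the fields and setter the examples use.
        def __init__(self, url, title, public_time):
            self.url = url
            self.title = title
            self.public_time = public_time
            self.section_name = None

        def set_section_name(self, name):
            self.section_name = name

    def strip_tags(html):
        # Naive tag stripper; the real helper may be HTMLParser-based.
        return re.sub(r'<[^>]+>', '', html).strip()
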
Example #2
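This variant targets a different site layout: pages after the first are addressed as base_N.html, titles are decoded as GBK, and the publication date is captured from the bracketed span after each link.
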
    def stripy_article_list(self, section_name, page_num):
        self.cur_page = page_num
        article_list = []
        if page_num == 0:
            url = self.section_url_map[section_name]
        else:
            # Rebuild the page URL: drop the trailing filename and append '_N.html'.
            url = self.section_url_map[section_name][0:-7] + '_' + str(self.cur_page) + '.html'

        contentHtml = self.session.get(url, stream=True)

        if contentHtml.status_code == requests.codes.ok:
            pattern = r'[^>]<a href="(.*?)" class="title" target="_blank">(.*?)</a>\s.*?<span\s.*?\[(.*?)\]'
            for mtFind in re.finditer(pattern, contentHtml.content, re.S):
                if mtFind.groups()[0].startswith("http"):
                    article_url = mtFind.groups()[0]
                else:
                    # Relative link: prepend the fixed-length prefix (presumably the site root) of the section URL.
                    article_url = '%s%s' % (self.section_url_map[section_name][0:29], mtFind.groups()[0][1:])


                public_time = mtFind.groups()[2]

                # The page is GBK-encoded; decode before stripping markup.
                title = self.strip_tags(mtFind.groups()[1].decode('gbk').strip())

                item = article_item(article_url, title, public_time)
                item.set_section_name(section_name)
                article_list.append(item)
        else:
            self.logger.error(u'Failed to fetch the article list for page ' + str(page_num))
        return article_list
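
The only real difference between the two variants is the pagination scheme baked into the URL rewrite. A standalone sketch of both schemes, with made-up URLs for illustration:

    def page_url_v1(section_url, page_num):
        # Example #1's scheme: swap the trailing 'N.html' (6 chars) for the new page.
        if page_num == 0:
            return section_url
        return section_url[0:-6] + str(page_num) + '.html'

    def page_url_v2(section_url, page_num):
        # Example #2's scheme: drop the trailing filename (7 chars) and append '_N.html'.
        if page_num == 0:
            return section_url
        return section_url[0:-7] + '_' + str(page_num) + '.html'

    print(page_url_v1('http://example.com/news/page1.html', 3))   # .../page3.html
    print(page_url_v2('http://example.com/news/list01.html', 3))  # .../list_3.html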