Python URLOpener.URLOpenerの例、lib.url_req.URLOpener.URLOpener Pythonの例

コード例 #1

0

ファイルを表示

	def fetcharticle2(self, url, decoder):
		"""Fetch an article that sits behind an intermediate landing page.

		The page at ``url`` is downloaded first. The ``href`` of the first
		anchor inside its first ``align="right"`` element is resolved through
		``self.trueURL_zzh`` and that target is then downloaded as the article.

		url     -- address of the landing page
		decoder -- object whose ``decode(content, url)`` is used when
		           ``self.page_encoding`` is not set

		Returns the decoded article text, or None when either request fails
		or the landing page does not contain the expected link.
		"""
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		status_code, content = result.code, result.content
		if status_code != 200 or not content:
			self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
			return None

		soup = BeautifulSoup(content,'lxml')
		cont = soup.findAll(attrs={"align":"right"})
		# The landing page may lack the expected element or its anchor;
		# indexing blindly would raise IndexError/TypeError instead of
		# reporting a failed fetch.
		link = cont[0].a if cont else None
		if link is None or not link.get('href'):
			self.log.warn('article link not found:%s.' % url)
			return None

		# URL of the actual article
		url = self.trueURL_zzh(link['href'])
		result = opener.open(url)
		status_code, content = result.code, result.content
		if status_code != 200 or not content:
			self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
			return None

		if self.page_encoding:
			return content.decode(self.page_encoding)
		else:
			return decoder.decode(content,url)

コード例 #2

0

ファイルを表示

ファイル: Shuwu.py プロジェクト: wyntung/RedKindle

	def ParseFeedUrls(self):
		"""Collect article links from the housebook.com.cn index page.

		Downloads the page of the first configured feed, follows the second
		anchor found on it and returns one entry for every anchor of that
		second page (the '../main.htm' back link is skipped, as are anchors
		without an href and links that were already collected).

		Returns a list of (section, title, url, None) tuples; the list is
		empty when either page cannot be fetched.
		"""
		urls = []
		urladded = set()
		url = self.feeds[0][1]
		section = self.feeds[0][0]
		opener = URLOpener(self.host, timeout=self.timeout)
		result = opener.open(url)
		if result.code != 200 or not result.content:
			self.log.warn('fetch rss failed(%d):%s'%(result.code,url))
			return urls

		content = result.content.decode(self.feed_encoding)
		soup = BeautifulSoup(content, "lxml")
		tag_a = soup.find_all('a')
		# The second anchor is the one that gets followed; bail out cleanly
		# when the page does not have it.
		if len(tag_a) < 2 or not tag_a[1].get('href'):
			self.log.warn('issue link not found:%s'%url)
			return urls
		href = tag_a[1]['href']
		# The first six characters of that link are reused as the directory
		# part of every article URL below.
		temp_url = href[0:6]
		url = 'http://www.housebook.com.cn/'+ href
		result = opener.open(url)
		if result.code != 200:
			# The original referenced an undefined name here (NameError).
			self.log.warn('fetch rss failed:%s'%url)
			return []

		content = result.content.decode(self.feed_encoding)
		soup = BeautifulSoup(content, "lxml")
		for art in soup.find_all('a'):
			href = art.get('href')
			if not href or href == '../main.htm':
				continue
			urlfeed = 'http://www.housebook.com.cn/' + temp_url +'/' +href
			if urlfeed in urladded:
				continue
			urls.append((section, art.text, urlfeed, None))
			urladded.add(urlfeed)
		return urls

コード例 #3

0

ファイルを表示

    def ParseFeedUrls(self):
        """Download every configured feed and list the articles to fetch.

        Returns a list of (section, title, url, desc) tuples. ``desc`` holds
        the entry body for feeds flagged as full-text and is None otherwise.
        """
        collected = []
        now_utc = datetime.datetime.utcnow()
        seen_links = set()

        for feed in self.feeds:
            section = feed[0]
            url = feed[1]
            # An optional third field flags the feed as full-text
            isfulltext = False
            if len(feed) > 2:
                isfulltext = feed[2]
            # Full-text feeds are given ten extra seconds
            timeout = self.timeout
            if isfulltext:
                timeout = self.timeout + 10
            result = URLOpener(self.host, timeout=timeout).open(url)

            if result.code != 200 or not result.content:
                self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
                continue

            if self.feed_encoding:
                content = result.content.decode(self.feed_encoding)
            else:
                content = AutoDecoder(True).decode(result.content, url)
            parsed = feedparser.parse(content)

            # Only the first max_articles_per_feed entries are considered
            for e in parsed['entries'][:self.max_articles_per_feed]:
                if self.oldest_article > 0 and hasattr(e, 'updated_parsed'):
                    updated = e.updated_parsed
                    if updated:
                        age = now_utc - datetime.datetime(*(updated[0:6]))
                        # Skip entries older than oldest_article days
                        if age.days * 86400 + age.seconds > 86400 * self.oldest_article:
                            self.log.info("Skip old article: %s" % e.link)
                            continue

                # Entries of an HTTPS feed are fetched over HTTPS as well
                if url.startswith('https://'):
                    link = e.link.replace('http://', 'https://')
                else:
                    link = e.link
                if link in seen_links:
                    continue

                desc = None
                if isfulltext:
                    if hasattr(e, 'content') and e.content[0]['value']:
                        desc = e.content[0]['value']
                    elif hasattr(e, 'description'):
                        desc = e.description
                    else:
                        self.log.warn(
                            'fulltext feed item no has desc,link to webpage for article.(%s)'
                            % e.title)
                collected.append((section, e.title, link, desc))
                seen_links.add(link)

        return collected

コード例 #4

0

ファイルを表示

ファイル: ZhihuDaily.py プロジェクト: wyntung/RedKindle

    def fetcharticle(self, url, decoder):
        """Download ``url`` and return its decoded text.

        Returns None (after logging a warning) when the response code is not
        200 or the body is empty. The body is decoded with
        ``self.page_encoding`` when that is set, otherwise by ``decoder``.
        """
        response = URLOpener(self.host, timeout=self.timeout).open(url)
        status, body = response.code, response.content
        if status != 200 or not body:
            self.log.warn('fetch article failed(%d):%s.' % (status, url))
            return None

        if not self.page_encoding:
            return decoder.decode(body, url)
        return body.decode(self.page_encoding)

コード例 #5

0

ファイルを表示

    def ParseFeedUrls(self):
        """Scrape the Economist print-edition index for article links.

        Returns a list of (section, title, url, desc) tuples with ``desc``
        always None, or an empty list when the index page cannot be fetched.
        """
        mainurl = 'http://www.economist.com/printedition'
        found = []
        seen = set()
        result = URLOpener(self.host, timeout=30).open(mainurl)
        if result.code != 200:
            self.log.warn('fetch rss failed:%s' % mainurl)
            return []

        page = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(page, "lxml")
        for section in soup.find_all(id=re.compile("section-")):
            h4 = section.find('h4')
            if h4 is None:
                self.log.warn('h4 is empty')
                continue
            sectitle = string_of_tag(h4).strip()
            if not sectitle:
                self.log.warn('h4 string is empty')
                continue

            # The latest <h5> seen stays in effect for the following
            # articles of this section and is prepended to their titles.
            subsection = ''
            for node in section.find_all(class_='article'):
                subsec = node.find('h5')
                if subsec is not None:
                    subsection = string_of_tag(subsec)

                a = node.find('a', attrs={"href": True}, recursive=False)
                if a is None:
                    continue

                url = a['href']
                if url.startswith(r'/'):
                    url = 'http://www.economist.com' + url
                url += '/print'

                title = string_of_tag(a)
                if not title:
                    continue
                if subsection:
                    title = subsection + ': ' + title
                if url in seen:
                    continue
                found.append((sectitle, title, url, None))
                seen.add(url)

        if not found:
            self.log.warn('len of urls is zero.')
        return found

コード例 #6

0

ファイルを表示

	def ParseFeedUrls(self):
		"""Build the article list from the <item> elements of the first feed.

		Returns a list of (section, title, url, None) tuples; the list is
		empty when the feed cannot be downloaded.
		"""
		feed_url = self.feeds[0][1]
		response = URLOpener(self.host, timeout=self.timeout).open(feed_url)
		section = self.feeds[0][0]
		if response.code != 200 or not response.content:
			self.log.warn('fetch rss failed(%d):%s'%(response.code,feed_url))
			return []

		soup = BeautifulSoup(response.content,'lxml')
		entries = []
		for item in soup.findAll('item'):
			# The third child node of each item is taken as its link
			entries.append((section, item.title.get_text(), item.contents[2], None))
		return entries

コード例 #7

0

ファイルを表示

    def fetcharticle(self, url, decoder):
        """Download one article page and return its decoded text.

        When Instapaper extraction is enabled and readability extraction is
        not, the request is routed through Instapaper's mobile view.
        Returns None (after logging a warning) when the response code is not
        200 or the body is empty.
        """
        via_instapaper = (self.fulltext_by_instapaper
                          and not self.fulltext_by_readability)
        if via_instapaper:
            # Let Instapaper do a first pass of content extraction
            url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

        response = URLOpener(self.host, timeout=self.timeout).open(url)
        status, body = response.code, response.content
        if status != 200 or not body:
            self.log.warn('fetch article failed(%d):%s.' % (status, url))
            return None

        if not self.page_encoding:
            return decoder.decode(body, url)
        return body.decode(self.page_encoding)

コード例 #8

0

ファイルを表示

ファイル: Chuansm.py プロジェクト: bluesky4485/RedKindle

	def ParseFeedUrls(self):
		"""List the articles linked from the first feed's chuansongme page.

		Every element with class "feed_item_question" contributes its first
		anchor. Returns a list of (section, title, url, None) tuples; the
		list is empty when the page cannot be downloaded.
		"""
		page_url = self.feeds[0][1]
		response = URLOpener(self.host, timeout=self.timeout).open(page_url)
		section = self.feeds[0][0]
		if response.code != 200 or not response.content:
			self.log.warn('fetch rss failed(%d):%s'%(response.code,page_url))
			return []

		soup = BeautifulSoup(response.content,'lxml')
		entries = []
		for block in soup.findAll(attrs={"class":"feed_item_question"}):
			title = block.a.get_text()
			# hrefs on the page are site-relative
			link = "http://chuansongme.com%s" % block.a['href']
			entries.append((section, title, link, None))
		return entries

コード例 #9

0

ファイルを表示

    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        url = self.feeds[0][1]
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.code, url))
            return []

        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content,
                                                    opener.realurl)
        else:
            content = AutoDecoder(False).decode(result.content, opener.realurl)

        soup = BeautifulSoup(content, 'lxml')
        for article in soup.find_all('div', attrs={'class': 'post'}):
            title = article.find('a', attrs={'class': 'title'})
            if not title or not title.string.startswith(u'安邦'):
                continue

            #获取发布时间
            pubdate = article.find('span', attrs={'class': 'date'})
            if not pubdate:
                continue
            mt = re.match(ur'(\d{4})年(\d{1,2})月(\d{1,2})日', pubdate.string)
            if not mt:
                continue
            pubdate = datetime.datetime(int(mt.group(1)), int(mt.group(2)),
                                        int(mt.group(3)))

            #确定文章是否需要推送，时区固定为北京时间
            tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
            delta = tnow - pubdate
            if self.oldest_article > 0 and delta.days > self.oldest_article:
                continue

            urls.append((u'安邦咨询', title.string, title['href'], None))

        return urls

コード例 #10

0

ファイルを表示

ファイル: ZhihuDaily.py プロジェクト: wyntung/RedKindle

    def ParseFeedUrls(self):
        """List articles from the JSON document served by the first feed.

        For every (partition, section) pair in ``self.partitions`` the items
        under that partition key are added once each, keyed by 'share_url'.
        Returns a list of (section, title, url, None) tuples; the list is
        empty when the document cannot be downloaded.
        """
        url = self.feeds[0][1]
        result = URLOpener(self.host, timeout=self.timeout).open(url)
        if not (result.code == 200 and result.content):
            self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
            return []

        feed = json.loads(result.content.decode(self.feed_encoding))
        urls = []
        seen = set()
        for partition, section in self.partitions:
            for item in feed[partition]:
                link = item['share_url']
                if link not in seen:
                    urls.append((section, item['title'], link, None))
                    seen.add(link)
                else:
                    self.log.info('skipped %s' % link)
        return urls

コード例 #11

0

ファイルを表示

	def ParseFeedUrls(self):
		"""Pair the <title> and <guid> elements of the first feed.

		Each guid is resolved through ``self.trueURL_zzh`` to obtain the
		article URL. Returns a list of (section, title, url, None) tuples;
		the list is empty when the feed cannot be downloaded.
		"""
		feed_url = self.feeds[0][1]
		response = URLOpener(self.host, timeout=self.timeout).open(feed_url)
		section = self.feeds[0][0]
		if response.code != 200 or not response.content:
			self.log.warn('fetch rss failed(%d):%s'%(response.code,feed_url))
			return []

		soup = BeautifulSoup(response.content,'lxml')
		titles = soup.findAll("title")
		guids = soup.findAll("guid")
		entries = []
		for idx, guid in enumerate(guids):
			# The title belonging to guid number idx sits two positions
			# further along in the list of <title> elements.
			title = titles[idx+2].string
			target = self.trueURL_zzh(guid.string)
			entries.append((section, title, target, None))
		return entries

コード例 #12

0

ファイルを表示

ファイル: PaoPao.py プロジェクト: wyntung/RedKindle

 def ParseFeedUrls(self):
     """List the articles linked from the first feed's page.

     Every element carrying the Drupal title-field class string contributes
     its first anchor, whose href is appended to the S3 bucket root.
     Returns a list of (section, title, url, None) tuples; the list is
     empty when the page cannot be downloaded.
     """
     page_url = self.feeds[0][1]
     response = URLOpener(self.host, timeout=self.timeout).open(page_url)
     section = self.feeds[0][0]
     if response.code != 200 or not response.content:
         self.log.warn('fetch rss failed(%d):%s' % (response.code, page_url))
         return []

     soup = BeautifulSoup(response.content, 'lxml')
     title_class = "field field-name-title field-type-ds field-label-hidden"
     root_url = 'https://s3.amazonaws.com/pao-pao/%s'
     entries = []
     for block in soup.findAll(attrs={"class": title_class}):
         anchor_text = block.a.get_text()
         entries.append((section, anchor_text, root_url % block.a['href'], None))
     return entries

コード例 #13

0

ファイルを表示

    def GET(self):
        """Handle the QQ OAuth2 login round trip.

        Without a ``code`` query parameter the visitor is sent to QQ's
        authorization URL. With one, the code is exchanged for an access
        token, the openid is looked up, and the visitor is logged in as an
        existing user or registered as a new one. Every failure path
        redirects to '/'.

        Returns the rendered register.html template only when registering a
        new user did not succeed; all other paths end in a redirect.
        """
        code = web.input().get('code')
        client = Client(
            KEY_Q,
            SECRET_Q,
            site='https://graph.qq.com',
            authorize_url='https://graph.qq.com/oauth2.0/authorize',
            token_url='https://graph.qq.com/oauth2.0/token')

        if not code:
            # First leg: no authorization code yet, go ask QQ for one
            try:
                authorize_url = client.auth_code.authorize_url(
                    redirect_uri=CALLBACK_Q, scope='get_user_info')
                # NOTE(review): unlike every other redirect in this method
                # this one is not raised - confirm that is intended.
                web.seeother(authorize_url)
            except:
                # NOTE(review): bare except hides the actual failure
                raise web.seeother(r'/')
        else:
            # Second leg: exchange the code for a token and ask who the
            # token belongs to
            try:
                access_token = client.auth_code.get_token(
                    code, redirect_uri=CALLBACK_Q, parse='query')
                url = "https://graph.qq.com/oauth2.0/me?access_token=%s" % access_token.token
                opener = URLOpener()
                result = opener.open(url)
                r_code, content = result.code, result.content
            except:
                raise web.seeother(r'/')
            # Only a body that *starts* with 'error' is treated as a failure
            if content.find('error') == 0:
                raise web.seeother(r'/')

            # A successful reply starts with 'callback' and wraps its
            # payload in parentheses
            if content.find("callback") == 0:
                lp = content.find('(')
                rp = content.find(')')
                # NOTE(review): the slice stops one character before the
                # one preceding ')', which presumably relies on a space in
                # front of ')' - verify against the real response.
                con = content[lp + 1:rp - 1]

                try:
                    data = json.loads(con)

                    openid = data['openid']
                    clientid = data['client_id']

                    url2 = "https://graph.qq.com/user/get_user_info?oauth_consumer_key=%s&access_token=%s&openid=%s&format=json" % (
                        KEY_Q, access_token.token, openid)

                    r2 = opener.open(url2)
                    content2 = r2.content
                    data2 = json.loads(content2)
                    ret = data2['ret']
                except:
                    raise web.seeother(r'/')
                # ret == 0 is the only value treated as success
                if ret == 0:
                    #name = data2['nickname']+'('+openid[2:6]+')'
                    # The local user name is four characters of the openid
                    name = openid[2:6]
                    # User already exists: log in
                    if model.isuser(name, 'qq') == 1:
                        session.login = 1
                        session.username = name
                        model.update_logintime(local_time(), name)
                        raise web.seeother(r'/')
                    else:
                        # User does not exist: register, then log in
                        model.input_user(name, 'qq')
                        # Check again to see whether registration worked
                        if model.isuser(name, 'qq') == 1:
                            session.login = 1
                            session.username = name
                            raise web.seeother(r'/my')
                        else:
                            return jjenv.get_template("register.html").render(
                                nickname='', title='Register', tips="")
                else:
                    raise web.seeother(r'/')
            else:
                raise web.seeother(r'/')

コード例 #14

0

ファイルを表示

encoding = chardet.detect(content)['encoding']
print encoding
result = content.decode(encoding)

netloc = urlparse.urlsplit(url)[1]

print netloc
r.set(netloc,encoding)

print r.get(netloc)
'''
#url='http://tech.sina.com.cn/internet/'
#url='http://tech.sina.com.cn/i/2014-01-08/08039077686.shtml'
#url='http://blog.knownsec.com/2012/04/about-content-encoding-gzip/'
url = 'http://book.douban.com/review/6549990/'
zzh = URLOpener()
re = zzh.open(url)
#print re.info()
#print re.content.decode('GBK').encode('utf-8')
#print re.content
fout = open('zhang_test', 'wb')
fout.write(re.content)
fout.close()
'''
encoding = chardet.detect(re.content)['encoding']
print encoding
print re.headers
print isinstance(re.content,unicode)
print re.content.decode(encoding,'ignore').encode('utf-8')
'''
doc = readability.Document(re.content)

コード例 #15

0

ファイルを表示

ファイル: test.py プロジェクト: wyntung/RedKindle

from lib.img import rescale_image
from lib.url_req import URLOpener
import os

#rescale_image(data, maxsizeb=4000000, dimen=None, png2jpg=False,     graying=True, reduceto=(600,800)):


# Download a sample image, shrink it and write the result to zzh.jpg.
test = URLOpener().open('http://img.xinjunshi.com/uploads/allimg/140224/11-140224101225.jpg')
#test=URLOpener().open('http://www.sucaitianxia.com/d/file/20131222/28caa29d1ddad3c085035e024a9f0b02.png')
con = test.content

con = rescale_image(con,reduceto=(400,600),graying=False)
# The with-block closes the file even when the write fails
with open('zzh.jpg', "wb") as fout:
    fout.write(con)

コード例 #16

0

ファイルを表示

    def Items(self, opts=None):
        """Download each page in ``self.feeds``, clean its HTML and yield it.

        This is a generator. For every page it first yields one
        (mime, url, filename, content, None) tuple per image that was kept,
        then one (section, url, title, content, brief) tuple for the page
        itself. Pages that cannot be fetched are logged and skipped.

        opts -- passed through unchanged to ``self.process_image``
        """
        decoder = AutoDecoder(False)
        timeout = self.timeout
        for section, url in self.feeds:
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            code, content = result.code, result.content
            if code != 200 or not content:
                self.log.warn('fetch article failed(%d):%s.' % (code, url))
                continue

            # Prefer the configured encoding, fall back to detection
            if self.page_encoding:
                try:
                    content = content.decode(self.page_encoding)
                except UnicodeDecodeError:
                    content = decoder.decode(content, opener.realurl)
            else:
                content = decoder.decode(content, opener.realurl)

            content = self.preprocess(content)
            soup = BeautifulSoup(content, "lxml")

            # Make sure there is a <head>; a new one gets the section name
            # as its title
            h = soup.find('head')
            if not h:
                h = soup.new_tag('head')
                t = soup.new_tag('title')
                t.string = section
                h.append(t)
                soup.html.insert(0, h)
            try:
                title = soup.html.head.title.string
            except AttributeError:
                title = section

            title = self.processtitle(title)

            # Rebuild <body> from only the tags matching keep_only_tags
            if self.keep_only_tags:
                body = soup.new_tag('body')
                try:
                    # A single spec dict is accepted as well as a list
                    if isinstance(self.keep_only_tags, dict):
                        keep_only_tags = [self.keep_only_tags]
                    else:
                        keep_only_tags = self.keep_only_tags
                    for spec in keep_only_tags:
                        for tag in soup.find('body').find_all(**spec):
                            body.insert(len(body.contents), tag)
                    soup.find('body').replace_with(body)
                except AttributeError:  # soup has no body element
                    pass

            for spec in self.remove_tags_after:
                tag = soup.find(**spec)
                remove_beyond(tag, 'next_sibling')

            for spec in self.remove_tags_before:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previous_sibling')

            # Built-in (insta_*) removal lists are combined with the
            # configured ones
            remove_tags = self.insta_remove_tags + self.remove_tags
            remove_ids = self.insta_remove_ids + self.remove_ids
            remove_classes = self.insta_remove_classes + self.remove_classes
            remove_attrs = self.insta_remove_attrs + self.remove_attrs
            for tag in soup.find_all(remove_tags):
                tag.decompose()
            for id in remove_ids:
                for tag in soup.find_all(attrs={"id": id}):
                    tag.decompose()
            for cls in remove_classes:
                for tag in soup.find_all(attrs={"class": cls}):
                    tag.decompose()
            for attr in remove_attrs:
                for tag in soup.find_all(attrs={attr: True}):
                    del tag[attr]
            # Strip HTML comments
            for cmt in soup.find_all(
                    text=lambda text: isinstance(text, Comment)):
                cmt.extract()

            if self.extra_css:
                sty = soup.new_tag('style', type="text/css")
                sty.string = self.extra_css
                soup.html.head.append(sty)

            if self.keep_image:
                self.soupbeforeimage(soup)
                for img in soup.find_all('img', attrs={'src': True}):
                    imgurl = img['src']
                    # Images declared 1-5 pixels high or wide are dropped
                    if img.get('height') in ('1','2','3','4','5') \
                     or img.get('width') in ('1','2','3','4','5'):
                        self.log.warn('img size too small,take away it:%s' %
                                      imgurl)
                        img.decompose()
                        continue
                    # Resolve relative image URLs against the page URL
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered:%s' % imgurl)
                        img.decompose()
                        continue

                    imgresult = opener.open(imgurl)
                    imgcontent = self.process_image(
                        imgresult.content,
                        opts) if imgresult.code == 200 else None
                    if imgcontent:
                        # Sniff the image type from the bytes themselves
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            # presumably self.imgindex yields a fresh number
                            # on each read - verify against its definition
                            fnimg = "img%d.%s" % (self.imgindex,
                                                  'jpg' if imgtype == 'jpeg'
                                                  else imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent, None)
                        else:
                            img.decompose()
                    else:
                        self.log.warn('fetch img failed(err:%d):%s' %
                                      (imgresult.code, imgurl))
                        img.decompose()

                # Replace a link that wraps an image with the image itself
                for img in soup.find_all('img'):
                    if img.parent and img.parent.parent and img.parent.name == 'a':
                        img.parent.replace_with(img)
            else:
                for img in soup.find_all('img'):
                    img.decompose()

            self.soupprocessex(soup)
            content = unicode(soup)

            # The summary is built after the page text has been captured,
            # so removing headings below does not affect ``content``
            brief = u''
            if GENERATE_TOC_DESC:
                body = soup.find('body')
                for h in body.find_all(['h1', 'h2']):  # drop h1/h2 so the summary does not repeat the title
                    h.decompose()
                for s in body.stripped_strings:
                    brief += unicode(s) + u' '
                    if len(brief) >= TOC_DESC_WORD_LIMIT:
                        brief = brief[:TOC_DESC_WORD_LIMIT]
                        break

            soup = None
            content = self.postprocess(content)
            yield (section, url, title, content, brief)

コード例 #17

0

ファイルを表示

    def readability_by_soup(self, article, url, opts=None):
        """Clean an article's HTML with BeautifulSoup rules and yield it.

        This is a generator. It first yields one
        (mime, url, filename, content, None) tuple per image that was kept,
        then a final (title, None, None, content, brief) tuple for the
        article. Nothing is yielded when the document has no <title>.

        article -- the article's HTML text
        url     -- the article's address, used to resolve relative image URLs
        opts    -- passed through unchanged to ``self.process_image``
        """
        content = self.preprocess(article)
        soup = BeautifulSoup(content, "lxml")

        try:
            title = soup.html.head.title.string
        except AttributeError:
            self.log.warn('object soup invalid!(%s)' % url)
            return

        title = self.processtitle(title)
        soup.html.head.title.string = title

        # Rebuild <body> from only the tags matching keep_only_tags
        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                # A single spec dict is accepted as well as a list
                if isinstance(self.keep_only_tags, dict):
                    keep_only_tags = [self.keep_only_tags]
                else:
                    keep_only_tags = self.keep_only_tags
                for spec in keep_only_tags:
                    for tag in soup.find('body').find_all(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replace_with(body)
            except AttributeError:
                # raised when soup.find('body') returned None
                pass

        for spec in self.remove_tags_after:
            tag = soup.find(**spec)
            remove_beyond(tag, 'next_sibling')

        for spec in self.remove_tags_before:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previous_sibling')

        # Built-in (insta_*) removal lists are combined with the configured
        # ones
        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs

        for tag in soup.find_all(remove_tags):
            tag.decompose()
        for id in remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        # Strip HTML comments
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            self.soupbeforeimage(soup)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                # Images declared 1-5 pixels high or wide are dropped
                if img.get('height') in ('1','2','3','4','5') \
                 or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small,take away it:%s' %
                                  imgurl)
                    img.decompose()
                    continue
                # Resolve relative image URLs against the article URL
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    # Sniff the image type from the bytes themselves
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        # presumably self.imgindex yields a fresh number on
                        # each read - verify against its definition
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.code, imgurl))
                    img.decompose()

            for img in soup.find_all('img'):  # replace a link wrapping an image with the image
                if img.parent and img.parent.parent and \
                 img.parent.name == 'a':
                    img.parent.replace_with(img)

        else:
            for img in soup.find_all('img'):
                img.decompose()

        # Add a heading when the content has none
        t = soup.html.body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h1')
            t.string = title
            soup.html.body.insert(0, t)
        else:
            # Measure how much text precedes the first heading
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  # this H1/H2 sits mid-article, so it is not the article title
                    t = soup.new_tag('h1')
                    t.string = title
                    soup.html.body.insert(0, t)
                    break

        self.soupprocessex(soup)
        content = unicode(soup)

        # Use the leading part of the article text as the summary
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 so the summary does not repeat the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief)

コード例 #18

0

ファイルを表示

    def readability(self, article, url, opts=None):
        """Extract the full article text with readability-lxml and yield it.

        This is a generator. It first yields one
        (mime, url, filename, content, None) tuple per image that was kept,
        then a final (title, None, None, content, brief) tuple for the
        article. Nothing is yielded when readability cannot process the
        input.

        article -- the article's HTML text
        url     -- the article's address, used to resolve relative image URLs
        opts    -- passed through unchanged to ``self.process_image``
        """
        content = self.preprocess(article)
        #		print '--------------'
        #		print content
        #		print '---------------'
        # Extract the main content
        try:
            doc = readability.Document(content)
            summary = doc.summary(html_partial=True)
        except:
            # NOTE(review): bare except hides the actual failure
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        title = self.processtitle(title)
        #		print '=================='
        #		print summary
        #		print '==================='

        soup = BeautifulSoup(summary, 'lxml')
        #	soup = BeautifulSoup(content,'lxml')
        # The string literal below is disabled code kept as a bare string
        # statement; it is never executed.
        '''
		#没有head
		h = soup.find('head')
		if not h:
			h = soup.new_tag('head')
			t = soup.new_tag('title')
			t.string = title
			h.append(t)
			soup.html.insert(0,h)

		#没有h
		t = soup.html.body.find(['h1','h2'])
		if not t:
			t = soup.new_tag('h1')
			t.string = title
			soup.html.body.insert(0,t)
		else:
			totallen = 0
			for ps in t.previous_siblings:
				totallen += len(string_of_tag(ps))
				if totallen > 40:
					t = soup.new_tag('h1')
					t.string = title
					soup.html.body.insert(0,t)
					break
		'''
        self.soupbeforeimage(soup)

        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        # Strip HTML comments
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            # NOTE(review): the head-creating code above is disabled, so
            # this relies on the parsed summary having a <head> - confirm.
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                # Images declared 1-5 pixels high or wide are dropped
                if img.get('height') in ('1','2','3','4','5') \
                 or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small,take it away : %s' %
                                  imgurl)
                    img.decompose()
                    continue
                # Resolve relative image URLs against the article URL
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    # Sniff the image type from the bytes themselves
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        # presumably self.imgindex yields a fresh number on
                        # each read - verify against its definition
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.code, imgurl))
                    img.decompose()
            # Replace a link that wraps an image with the image itself
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                 img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        #		print '====-=-=-=-=-=-=-='
        #		print soup
        #		print '-=-=-=-=-=-=-=-=-=-=-'
        # Only the first child of <body> is emitted as the article content
        cc = soup.body.contents[0]
        #		cc.name = "articleblock"
        #		print cc
        #		print soup.body.renderContents()
        #content = unicode(soup)
        content = unicode(cc)

        #print soup.find('body').contents
        #print soup.body.contents

        # Use the leading part of the article text as the summary
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 so the summary does not repeat the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break

        soup = None
        yield (title, None, None, content, brief)